Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
RC Data Science
createAndParseSACCT
Commits
93307071
Commit
93307071
authored
Jul 16, 2020
by
Ryan Randles Jones
Browse files
added elbow graph and facet grid
parent
6a1740ce
Changes
1
Hide whitespace changes
Inline
Side-by-side
Runtime-and-CoreCount-ReqMemCPU.ipynb
View file @
93307071
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
# must run
# Core imports: SQLite access, Slurm-to-SQL loader, dataframes, and plotting.
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython magic (notebook-only); not valid in a plain .py file
import seaborn as sns
import seaborn as sb  # NOTE(review): duplicate of sns; later cells use both aliases, so both are kept
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
# must run
# RC_styles is a local helper module providing the group's shared matplotlib styling.
from RC_styles import rc_styles as style

# KMeans drives the clustering and elbow analyses below.
from sklearn.cluster import KMeans
# must run
# Opens the SQLite database of Slurm accounting data collected since March 2020.
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')

# must run
# df is the starting dataframe: every row of the slurm table.
df = pd.read_sql('SELECT * FROM slurm', db)
# voluntary
# Show every column when previewing the dataframe (pandas truncates by default),
# then peek at the first five rows.
pd.set_option('display.max_columns', None)
df.head(5)
# must run
# converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)

# must run
# converts Elapsed time to hours (from seconds)
df['Elapsed'] = df['Elapsed'].div(3600)
# must run
# df_completed is dataframe of all completed jobs
df_completed = df[df.State.str.contains('COMPLETED')]
#df_completed.head(5)

# must run
# df_batch is df with only batch jobs
df_batch = df[df.JobName.str.contains('batch')]
#df_batch.head(5)
# Average RAM per CPU Requested by User  /  ReqMemCPU, Corecount, Runtime

# must run
# df_2 is database of completed jobs with only User and ReqMemCPU;
# it is used for the per-user dataframes.
df_2 = df_completed.loc[:, ['User', 'ReqMemCPU']]
#df_2.head(5)

# must run
# upper cutoff (in gigs) applied when filtering ReqMemCPU below
UpperlimitGB = 50
# round requested memory up to whole gigs
df_2['ReqMemCPU'] = df_2['ReqMemCPU'].apply(np.ceil)
#df_2.head(5)

# df_1 holds the three numeric columns used for the runtime/core-count analysis
df_1 = df_completed.loc[:, ['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
df_1.head(5)

# must run
# fills empty strings in User column with NaN and then filters them out to give
# a dataset of users with no empty strings
nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset=["User"], inplace=True)
#df_2.head(5)
ReqMemCPU Elapsed AllocCPUS
0 8.000 144.090278 6
1 8.000 144.090278 6
2 8.000 144.090278 6
7 6.000 50.001389 4
9 78.125 150.001389 2
# must run
# count = count of jobs per user
# mean,std,min,25%,50%,75%, and max refer to the gigs of memory per cpu
# requested by that user for all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
#df_user.head(5)

# round requested memory in df_1 up to whole gigs
df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil)

# voluntary
# description of number of jobs run per user - can be used to choose the Upper Limit Job Count
df_user['count'].describe()

# round elapsed hours in df_1 up to whole hours
df_1['Elapsed'] = df_1['Elapsed'].apply(np.ceil)
# must run
# variable used in plot names to describe the max job count per user
# max = 367257
UpperlimitJobCount = 100

# sort the runtime frame by allocated CPU count (ascending)
df_1_sorted = df_1.sort_values(by='AllocCPUS', ascending=True)
df_1_sorted.head(5)

# must run
# all jobs per user up to the UpperlimitJobCount defined above
jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)]
#jobscount_cutoff.head(5)
ReqMemCPU Elapsed AllocCPUS
824465 10.0 1.0 1
1101525 10.0 1.0 1
1101524 10.0 1.0 1
1101523 10.0 1.0 1
1101522 10.0 1.0 1
# must run
# df_user_graph_full is df_user sorted ascending by count for easy readability of the graph
df_user_graph_full = jobscount_cutoff.sort_values(by='count', ascending=True)
df_user_graph_full.head(5)

# jobs whose requested memory per CPU is at or below the UpperlimitGB cutoff
df_runtime = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB)]
df_runtime.head(5)

# keep just the columns that get plotted below
df_user_graph = df_user_graph_full.loc[:, ['User', 'count', 'mean']]
df_user_graph.head(5)
ReqMemCPU Elapsed AllocCPUS
824465 10.0 1.0 1
1101525 10.0 1.0 1
1101524 10.0 1.0 1
1101523 10.0 1.0 1
1101522 10.0 1.0 1
style.default_axes_and_ticks()
style.figsize()

# mean requested RAM per CPU vs. job count per user
user_graph1 = sns.scatterplot(x="count", y="mean", data=df_user_graph)
# requested RAM per CPU vs. allocated CPUs
runtime_graph = sns.scatterplot(x="ReqMemCPU", y="AllocCPUS", data=df_runtime)
#hue="AllocCPUS")
#, size="AllocCPUS")
plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less' % UpperlimitJobCount)
plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
# NOTE(review): this cell appears to merge two plots from a diff; the later
# axis labels overwrite the ones set above, so the second labelling wins.
plt.xlabel('ReqMemCPU')
plt.ylabel('AllocCPUS')
#plt.yscale("log")
plt.show()
# fit a 3-cluster KMeans on each user's (job count, mean requested RAM)
kmeans = KMeans(n_clusters=3)
model = kmeans.fit(df_user_graph[['count', 'mean']])

# Now we can get the predicted model labels, or Centroids, in the form of an array:
model.cluster_centers_

# attach predicted cluster to original points
df_user_graph['predicted'] = model.labels_
df_user_graph.head(5)
# Create a dataframe for cluster_centers (centroids)
centroids = pd.DataFrame(model.cluster_centers_, columns=["count", "mean"])

style.default_axes_and_ticks()
style.figsize()

## Plot scatter by cluster / color, and centroids
colors = ["red", "green", "blue"]
df_user_graph['color'] = df_user_graph['predicted'].map(lambda p: colors[p])
ax = df_user_graph.plot(
    kind="scatter",
    x="count", y="mean",
    c=df_user_graph['color']
)
centroids.plot(
    kind="scatter",
    x="count", y="mean",
    marker="*", c=["r", "g", "b"], s=550,
    ax=ax
)
%% Cell type:markdown id: tags:
# trying the same above graph using different syntax
# keep only the two numeric columns used for clustering
df_user_graph_cluster = df_user_graph_full.loc[:, ['count', 'mean']]
#df_user_graph_cluster.head(5)

# facet grid: Elapsed vs. ReqMemCPU and vs. AllocCPUS, with regression fits
g = sns.PairGrid(df_runtime, y_vars=["Elapsed"], x_vars=["ReqMemCPU", "AllocCPUS"], height=4)
g.map(sns.regplot, color="blue")
#g.set(ylim=(-1, 11), yticks=[0, 5, 10]);

# re-cluster the per-user data, with a fixed random_state for reproducibility
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_user_graph_cluster)
print(kmeans.cluster_centers_)

# colour points by cluster label; grey markers show the centroids
plt.scatter(df_user_graph_cluster['count'], df_user_graph_cluster['mean'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], color='grey')
#plt.yscale("log")
%% Cell type:markdown id: tags:
# ReqMemCPU,Corecount,Runtime
# upper cutoff (gigs) for the ReqMemCPU / Corecount / Runtime section
UpperlimitGB1 = 50

# df_4 repeats the three-column completed-jobs frame for this section
df_4 = df_completed.loc[:, ['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
df_4.head(5)

# round memory up to whole gigs and elapsed up to whole hours
df_4['ReqMemCPU'] = df_4['ReqMemCPU'].apply(np.ceil)
df_4['Elapsed'] = df_4['Elapsed'].apply(np.ceil)

# BUG(fixed): the original called df_4.sort_values(...) without assigning the
# result, so the sort had no effect (compare df_1_sorted above, which assigns).
df_4 = df_4.sort_values(by='AllocCPUS', ascending=True)

# restrict to jobs requesting at most UpperlimitGB1 gigs per CPU
df_4_cutoff = df_4[(df_4['ReqMemCPU'] <= UpperlimitGB1)]
df_4_cutoff
style.default_axes_and_ticks()
style.figsize()

# runtime vs. requested RAM per CPU for jobs under the gig cutoff
user_graph5 = sns.scatterplot(x="ReqMemCPU", y="Elapsed", data=df_4_cutoff)
#hue="AllocCPUS")
#, size="AllocCPUS")
#plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('ReqMemCPU')
plt.ylabel('Runtime')
#plt.yscale("log")

# facet grid of every pairwise scatter in df_runtime
g = sb.PairGrid(df_runtime)
g.map(plt.scatter);
plt.show()

# cluster input: the same gig cutoff applied to df_4
df_runtime_graph_cluster = df_4[(df_4['ReqMemCPU'] <= UpperlimitGB1)]
#df_runtime_graph_cluster.head(5)
# cluster jobs on (ReqMemCPU, Elapsed, AllocCPUS) into 4 groups
kmeans = KMeans(n_clusters=4, random_state=111)
kmeans.fit(df_runtime_graph_cluster)
print(kmeans.cluster_centers_)

# plot the clusters in the ReqMemCPU/Elapsed plane, centroids in grey
plt.scatter(df_runtime_graph_cluster['ReqMemCPU'], df_runtime_graph_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], color='grey')
#plt.yscale("log")
plt.xlabel('ReqMemCPU')
plt.ylabel('Runtime')
%% Cell type:markdown id: tags:
# Average RAM per CPU by Job
# must run
# df_3 keeps only ReqMemCPU and JobID from the batch jobs;
# it is used to pull out needed information and create separate datasets to compare
df_3 = df_batch.loc[:, ['ReqMemCPU', 'JobID']]
#df_3.head(5)

# round requested memory up to whole gigs
df_3['ReqMemCPU'] = df_3['ReqMemCPU'].apply(np.ceil)
#df_3.head(5)

# must run
# variable used in plot names to describe the max gigs measured
# NOTE(review): re-binds the UpperlimitGB defined earlier (same value, 50)
UpperlimitGB = 50
# must run
# all RAM-per-CPU requests up to the UpperlimitGB defined above
gig_cutoff = df_3[(df_3.ReqMemCPU <= UpperlimitGB)]
#gig_cutoff.head(5)

# cutoff frame used for the elbow analysis below
df_runtime_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB)]
#df_runtime_cluster.head(5)

# renames JobID column to JobCount since that's what it is now
df_cpu_per_job = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
#df_cpu_per_job.head(5)

# elbow method: record KMeans inertia for k = 1..9 clusters
Sum_of_squared_distances = []
K = range(1, 10)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(df_runtime_cluster)
    Sum_of_squared_distances.append(km.inertia_)
# voluntary: distribution of the binned ReqMemCPU values
df_cpu_per_job['ReqMemCPU'].describe()

# elbow graph: choose k where the inertia curve bends
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

style.default_axes_and_ticks()
style.figsize()

# job count per requested-gig bucket, log-scaled y axis
cpu_per_job = sns.scatterplot(x="ReqMemCPU", y="JobCount", data=df_cpu_per_job)
cpu_per_job.set_yscale('log')
#cpu_per_job.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#cpu_per_job.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested RAM per CPU (Gigs) per Job')
plt.ylabel('Job Count')
plt.show()
# same grouping as df_cpu_per_job, kept separately for clustering
df_cpu_per_job_cluster = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
df_cpu_per_job_cluster.head(30)

kmeans = KMeans(n_clusters=3, random_state=111)
# NOTE(review): the first fit is immediately overwritten — only the fit on
# df_runtime_cluster determines the printed centers/labels. This looks like a
# leftover from the diffed versions; confirm which frame was meant to be used.
kmeans.fit(df_cpu_per_job_cluster)
kmeans.fit(df_runtime_cluster)
print(kmeans.cluster_centers_)

print(kmeans.labels_)
[[31.01418718 1.76035076 2.01066712]
[ 9.07246984 1.16428982 1.31854662]