Commit 93307071 authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added elbow graph and facet grid

parent 6a1740ce
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
```
``` python
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import seaborn as sb
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
```
%% Cell type:code id: tags:
```
``` python
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
``` python
from sklearn.cluster import KMeans
```
%% Cell type:code id: tags:
```
``` python
# must run
# connects to the sqlite3 database file holding slurm info from March 2020
slurm_db_path = '/data/rc/rc-team/slurm-since-March.sqlite3'
db = sqlite3.connect(slurm_db_path)
```
%% Cell type:code id: tags:
```
``` python
# must run
# df is the starting dataframe: every column of every row in the slurm table
slurm_query = 'SELECT * FROM slurm'
df = pd.read_sql(slurm_query, db)
```
%% Cell type:code id: tags:
```
``` python
# voluntary
# removes the column display limit so every available column is shown
pd.options.display.max_columns = None
df.head()
```
%% Cell type:code id: tags:
```
``` python
# must run
# rescales the ReqMemCPU column from bytes to gigs (1 gig = 1024**3 bytes)
bytes_per_gig = 1024 ** 3
df['ReqMemCPU'] = df['ReqMemCPU'] / bytes_per_gig
```
%% Cell type:code id: tags:
```
``` python
# must run
# rescales the Elapsed column from seconds to hours
seconds_per_hour = 3600
df['Elapsed'] = df['Elapsed'] / seconds_per_hour
```
%% Cell type:code id: tags:
```
``` python
# must run
# df_completed is dataframe of all jobs whose State contains 'COMPLETED'
# na=False counts a missing State as "not completed", so the boolean mask
# never holds NaN (pandas refuses to index with a mask containing NaN)
completed_mask = df['State'].str.contains('COMPLETED', na=False)
df_completed = df[completed_mask]
#df_completed.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_batch is df with only the jobs whose JobName contains 'batch'
# na=False counts a missing JobName as "not a batch job", so the boolean mask
# never holds NaN (pandas refuses to index with a mask containing NaN)
batch_mask = df['JobName'].str.contains('batch', na=False)
df_batch = df[batch_mask]
#df_batch.head(5)
``` python
```
%% Cell type:markdown id: tags:
# Average RAM per CPU Requested by User
# ReqMemCPU,Corecount,Runtime
%% Cell type:code id: tags:
```
# must run
# df_2 keeps only the User and ReqMemCPU columns of the completed jobs
# it is the starting point for the per-user dataframes
user_mem_columns = ['User', 'ReqMemCPU']
df_2 = df_completed.loc[:, user_mem_columns]
#df_2.head(5)
``` python
# upper limit (in gigs) applied to ReqMemCPU when filtering jobs for the runtime dataframes
UpperlimitGB = 50
```
%% Cell type:code id: tags:
```
# rounds every ReqMemCPU value up to the next whole gig
df_2['ReqMemCPU'] = np.ceil(df_2['ReqMemCPU'])
#df_2.head(5)
``` python
# df_1 keeps only the memory, runtime and core-count columns of the completed jobs
runtime_columns = ['ReqMemCPU', 'Elapsed', 'AllocCPUS']
df_1 = df_completed.loc[:, runtime_columns]
df_1.head()
```
%% Cell type:code id: tags:
```
# must run
# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings
%%%% Output: execute_result
nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset = ["User"], inplace=True)
#df_2.head(5)
```
ReqMemCPU Elapsed AllocCPUS
0 8.000 144.090278 6
1 8.000 144.090278 6
2 8.000 144.090278 6
7 6.000 50.001389 4
9 78.125 150.001389 2
%% Cell type:code id: tags:
```
# must run
# one row per user summarising ReqMemCPU over all of that user's jobs:
# count = count of jobs per user
# mean, std, min, 25%, 50%, 75% and max describe the gigs of memory per cpu requested by that user
mem_by_user = df_2.groupby('User')['ReqMemCPU']
df_user = mem_by_user.describe().reset_index()
#df_user.head(5)
``` python
# rounds every ReqMemCPU value up to the next whole gig
df_1['ReqMemCPU'] = np.ceil(df_1['ReqMemCPU'])
```
%% Cell type:code id: tags:
```
# voluntary
# description of number of jobs run per user - can be used to choose the Upper Limit Job Count
df_user['count'].describe()
``` python
# rounds every Elapsed value up to the next whole hour
df_1['Elapsed'] = np.ceil(df_1['Elapsed'])
```
%% Cell type:code id: tags:
```
# must run
# maximum job count per user: used as the filter cutoff on df_user['count'] and in plot titles
# max = 367257
UpperlimitJobCount = 100
``` python
# df_1_sorted is df_1 ordered by AllocCPUS, smallest first
df_1_sorted = df_1.sort_values('AllocCPUS')
df_1_sorted.head()
```
%% Cell type:code id: tags:
%%%% Output: execute_result
```
# must run
# jobscount_cutoff keeps the rows of df_user whose job count is at most UpperlimitJobCount
within_job_limit = df_user['count'] <= UpperlimitJobCount
jobscount_cutoff = df_user[within_job_limit]
#jobscount_cutoff.head(5)
```
ReqMemCPU Elapsed AllocCPUS
824465 10.0 1.0 1
1101525 10.0 1.0 1
1101524 10.0 1.0 1
1101523 10.0 1.0 1
1101522 10.0 1.0 1
%% Cell type:code id: tags:
```
# must run
# df_user_graph_full is jobscount_cutoff sorted in ascending order by count for easy readability of the graph
df_user_graph_full = jobscount_cutoff.sort_values('count')
df_user_graph_full.head()
``` python
# df_runtime keeps the rows of df_1_sorted whose ReqMemCPU is at most UpperlimitGB
runtime_within_limit = df_1_sorted['ReqMemCPU'] <= UpperlimitGB
df_runtime = df_1_sorted[runtime_within_limit]
df_runtime.head()
```
%% Cell type:code id: tags:
%%%% Output: execute_result
```
# df_user_graph keeps only the columns needed for the per-user scatter plot
user_graph_columns = ['User', 'count', 'mean']
df_user_graph = df_user_graph_full.loc[:, user_graph_columns]
df_user_graph.head()
```
ReqMemCPU Elapsed AllocCPUS
824465 10.0 1.0 1
1101525 10.0 1.0 1
1101524 10.0 1.0 1
1101523 10.0 1.0 1
1101522 10.0 1.0 1
%% Cell type:code id: tags:
```
``` python
style.default_axes_and_ticks()
style.figsize()
user_graph1 = sns.scatterplot(x="count", y="mean",data=df_user_graph)
runtime_graph = sns.scatterplot(x="ReqMemCPU", y="AllocCPUS",data=df_runtime)
#hue="AllocCPUS")
#, size="AllocCPUS")
plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
#plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.xlabel('ReqMemCPU')
plt.ylabel('AllocCPUS')
#plt.yscale("log")
plt.show()
```
%% Cell type:code id: tags:
```
# fits a 3-cluster KMeans on the (count, mean) columns
# random_state is fixed, as in the other KMeans cells, so the fit is reproducible between runs
kmeans = KMeans(n_clusters=3, random_state=111)
model = kmeans.fit(df_user_graph[['count', 'mean']])
# the cluster centers (centroids) of the fitted model, in the form of an array:
model.cluster_centers_
```
%% Cell type:code id: tags:
```
# stores each point's predicted cluster label next to the original values
df_user_graph['predicted'] = model.labels_
df_user_graph.head()
```
%% Cell type:code id: tags:
%%%% Output: display_data
```
# centroids holds the cluster centers as a dataframe with the same column names as the fitted data
centroid_columns = ["count", "mean"]
centroids = pd.DataFrame(model.cluster_centers_, columns=centroid_columns)
```
![]()
%% Cell type:code id: tags:
```
``` python
style.default_axes_and_ticks()
style.figsize()
# scatter of the users coloured by predicted cluster, with the centroids
# drawn as stars on the same axes
colors = ["red", "green", "blue"]
cluster_color = df_user_graph['predicted'].map(lambda label: colors[label])
df_user_graph['color'] = cluster_color
ax = df_user_graph.plot.scatter(x="count", y="mean", c=df_user_graph['color'])
centroids.plot.scatter(x="count", y="mean", marker="*", c=["r", "g", "b"], s=550, ax=ax)
```
%% Cell type:markdown id: tags:
# Trying the same graph as above using different syntax
%% Cell type:code id: tags:
```
# df_user_graph_cluster keeps only the two numeric columns that are clustered
cluster_columns = ['count', 'mean']
df_user_graph_cluster = df_user_graph_full.loc[:, cluster_columns]
#df_user_graph_cluster.head(5)
# one regression panel per x variable (ReqMemCPU, AllocCPUS), each plotted against Elapsed
g = sns.PairGrid(df_runtime, y_vars=["Elapsed"], x_vars=["ReqMemCPU", "AllocCPUS"], height=4)
g.map(sns.regplot, color="blue")
#g.set(ylim=(-1, 11), yticks=[0, 5, 10]);
```
%% Cell type:code id: tags:
%%%% Output: execute_result
```
# fits a seeded 3-cluster KMeans on df_user_graph_cluster and prints the cluster centers
kmeans = KMeans(n_clusters=3, random_state=111).fit(df_user_graph_cluster)
print(kmeans.cluster_centers_)
```
<seaborn.axisgrid.PairGrid at 0x2aab37653310>
%% Cell type:code id: tags:
%%%% Output: display_data
```
# points coloured by their KMeans label, then the cluster centers overlaid in grey
plt.scatter(df_user_graph_cluster['count'],df_user_graph_cluster['mean'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
#plt.yscale("log")
```
%% Cell type:markdown id: tags:
# ReqMemCPU,Corecount,Runtime
%% Cell type:code id: tags:
```
UpperlimitGB1 = 50
```
%% Cell type:code id: tags:
```
# df_4 keeps only the memory, runtime and core-count columns of the completed jobs
df_4_columns = ['ReqMemCPU', 'Elapsed', 'AllocCPUS']
df_4 = df_completed.loc[:, df_4_columns]
df_4.head()
```
![]()
%% Cell type:code id: tags:
```
# rounds every ReqMemCPU value up to the next whole gig
df_4['ReqMemCPU'] = np.ceil(df_4['ReqMemCPU'])
```
%% Cell type:code id: tags:
```
# rounds every Elapsed value up to the next whole hour
df_4['Elapsed'] = np.ceil(df_4['Elapsed'])
```
%% Cell type:code id: tags:
```
# shows df_4 ordered by AllocCPUS; the result is not assigned, so df_4 itself keeps its original order
df_4.sort_values(by='AllocCPUS', ascending=True)
```
%% Cell type:code id: tags:
```
# df_4_cutoff keeps the rows of df_4 whose ReqMemCPU is at most UpperlimitGB1
df_4_within_limit = df_4['ReqMemCPU'] <= UpperlimitGB1
df_4_cutoff = df_4[df_4_within_limit]
df_4_cutoff
```
%% Cell type:code id: tags:
```
``` python
style.default_axes_and_ticks()
style.figsize()
user_graph5 = sns.scatterplot(x="ReqMemCPU", y="Elapsed",data=df_4_cutoff)
#hue="AllocCPUS")
#, size="AllocCPUS")
#plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('ReqMemCPU')
plt.ylabel('Runtime')
#plt.yscale("log")
g = sb.PairGrid(df_runtime)
g.map(plt.scatter);
plt.show()
```
%% Cell type:code id: tags:
%%%% Output: display_data
```
# df_runtime_graph_cluster keeps the rows of df_4 whose ReqMemCPU is at most UpperlimitGB1
cluster_within_limit = df_4['ReqMemCPU'] <= UpperlimitGB1
df_runtime_graph_cluster = df_4[cluster_within_limit]
#df_runtime_graph_cluster.head(5)
```
%% Cell type:code id: tags:
```
# fits a seeded 4-cluster KMeans on df_runtime_graph_cluster and prints the cluster centers
kmeans = KMeans(n_clusters=4, random_state=111).fit(df_runtime_graph_cluster)
print(kmeans.cluster_centers_)
```
![]()
%% Cell type:code id: tags:
```
# ReqMemCPU against Elapsed, coloured by KMeans label, with the first two
# coordinates of each cluster center overlaid in grey
plt.scatter(df_runtime_graph_cluster['ReqMemCPU'],df_runtime_graph_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
#plt.yscale("log")
plt.xlabel('ReqMemCPU')
plt.ylabel('Runtime')
```
%% Cell type:markdown id: tags:
# Average RAM per CPU by Job
%% Cell type:code id: tags:
```
# must run
# df_3 keeps only the ReqMemCPU and JobID columns of the batch jobs
# it is the starting point for the RAM-per-CPU-by-job datasets
df_3 = df_batch.loc[:,['ReqMemCPU','JobID']]
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# rounds every ReqMemCPU value up to the next whole gig
df_3['ReqMemCPU'] = np.ceil(df_3['ReqMemCPU'])
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# must run
# maximum gigs of RAM per CPU: used as the ReqMemCPU filter cutoff and in plot titles
UpperlimitGB = 50
```
%% Cell type:code id: tags:
```
# must run
# gig_cutoff keeps the rows of df_3 whose ReqMemCPU is at most UpperlimitGB
ram_within_limit = df_3['ReqMemCPU'] <= UpperlimitGB
gig_cutoff = df_3[ram_within_limit]
#gig_cutoff.head(5)
``` python
# df_runtime_cluster keeps the rows of df_1_sorted whose ReqMemCPU is at most UpperlimitGB
runtime_cluster_mask = df_1_sorted['ReqMemCPU'] <= UpperlimitGB
df_runtime_cluster = df_1_sorted[runtime_cluster_mask]
#df_runtime_cluster.head(5)
```
%% Cell type:code id: tags:
```
# after the grouped count() the JobID column holds the number of non-null
# JobIDs per ReqMemCPU value, so it is renamed to JobCount
jobs_per_ram = gig_cutoff.groupby('ReqMemCPU').count()
jobs_per_ram = jobs_per_ram.rename(columns={'JobID': 'JobCount'})
df_cpu_per_job = jobs_per_ram.reset_index()
#df_cpu_per_job.head(5)
``` python
# elbow method: fit KMeans for every k in K and record the fitted model's
# inertia_ (sum of squared distances of the samples to their closest center)
# random_state is fixed, as in the other KMeans cells, so the curve is reproducible
Sum_of_squared_distances = []
K = range(1, 10)
for k in K:
    km = KMeans(n_clusters=k, random_state=111)
    km = km.fit(df_runtime_cluster)
    Sum_of_squared_distances.append(km.inertia_)
```
%% Cell type:code id: tags:
```
df_cpu_per_job['ReqMemCPU'].describe()
``` python
# elbow plot: the recorded sum of squared distances for each k in K
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
cpu_per_job = sns.scatterplot(x="ReqMemCPU", y="JobCount",data=df_cpu_per_job)
cpu_per_job.set_yscale('log')
#cpu_per_job.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#cpu_per_job.yaxis.set_major_formatter(ticker.ScalarFormatter())
%%%% Output: display_data
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
![]()
plt.xlabel('Requested RAM per CPU (Gigs) per Job')
plt.ylabel('Job Count')
plt.show()
```
%% Cell type:code id: tags:
%% Cell type:markdown id: tags:
```
# number of non-null JobIDs per ReqMemCPU value, with the count column renamed to JobCount
jobs_per_ram_cluster = gig_cutoff.groupby('ReqMemCPU').count()
jobs_per_ram_cluster = jobs_per_ram_cluster.rename(columns={'JobID': 'JobCount'})
df_cpu_per_job_cluster = jobs_per_ram_cluster.reset_index()
df_cpu_per_job_cluster.head(30)
```
#
%% Cell type:code id: tags:
```
``` python
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_cpu_per_job_cluster)
kmeans.fit(df_runtime_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
%%%% Output: stream
```
print(kmeans.labels_)
```
[[31.01418718 1.76035076 2.01066712]
[ 9.07246984 1.16428982 1.31854662]