Commit 25c3a602 authored by Ryan Randles Jones
Browse files

added runtime/corecount graph

parent 45bd87aa
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
```
# must run
# core imports for the whole notebook: sqlite access, slurm ingestion, and plotting
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
```
%% Cell type:code id: tags:
```
# local plotting-style helpers used by the graph cells below
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
from sklearn.cluster import KMeans
```
%% Cell type:code id: tags:
```
# must run
# creates database of info from March 2020 using sqlite 3
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# df is starting database
df = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
```
# voluntary
# for displaying all available column options
pd.set_option('display.max_columns', None)
df.head(5)
```
%% Cell type:code id: tags:
```
# must run
# converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
```
%% Cell type:code id: tags:
```
# must run
# converts Elapsed — presumably from seconds to hours; confirm source units
df['Elapsed'] = df['Elapsed'].div(3600)
```
%% Cell type:code id: tags:
```
# must run
# df_completed is dataframe of all completed jobs
df_completed = df[df.State.str.contains('COMPLETED')]
#df_completed.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_batch is df with only batch jobs
df_batch = df[df.JobName.str.contains('batch')]
#df_batch.head(5)
```
%% Cell type:markdown id: tags:
# Average RAM per CPU Requested by User
%% Cell type:code id: tags:
```
# must run
# df_2 is database of completed jobs with only User and ReqMemCpu
# it is used for the user dataframes
df_2 = df_completed.loc[:,['User','ReqMemCPU']]
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# rounds each requested-memory value up to the next whole gig
df_2['ReqMemCPU'] = df_2['ReqMemCPU'].apply(np.ceil)
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings
nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset = ["User"], inplace=True)
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# count = count of jobs per user
# mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
#df_user.head(5)
```
%% Cell type:code id: tags:
```
# voluntary
# description of number of jobs run per user - can be used to choose the Upper Limit Job Count
df_user['count'].describe()
```
%% Cell type:code id: tags:
```
# must run
# variable to be used in names of plots to describe the max job count per user
# max = 367257
UpperlimitJobCount = 100
```
%% Cell type:code id: tags:
```
# must run
# creates database from df_user that returns all jobs per user up to the UpperlimitJobCount defined above
jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)]
#jobscount_cutoff.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_user_graph is df_user sorted in ascending order by count for easy readability of graph
df_user_graph_full = jobscount_cutoff.sort_values(by='count', ascending=True)
df_user_graph_full.head(5)
```
%% Cell type:code id: tags:
```
# keep only the columns needed for the scatter plot and clustering
df_user_graph = df_user_graph_full.loc[:,['User','count','mean']]
df_user_graph.head(5)
```
%% Cell type:code id: tags:
```
# scatter of average requested RAM/CPU vs. job count, one point per user
style.default_axes_and_ticks()
style.figsize()
user_graph1 = sns.scatterplot(x="count", y="mean",data=df_user_graph)
plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show()
```
%% Cell type:code id: tags:
```
# NOTE(review): no random_state here, so cluster labels/centroids may differ run to run
kmeans = KMeans(n_clusters=3)
model = kmeans.fit(df_user_graph[['count', 'mean']])
# Now, we can get the predicted model labels, or Centroids, in the form of an array:
model.cluster_centers_
```
%% Cell type:code id: tags:
```
# attach predicted cluster to original points
df_user_graph['predicted'] = model.labels_
df_user_graph.head(5)
```
%% Cell type:code id: tags:
```
# Create a dataframe for cluster_centers (centroids)
centroids = pd.DataFrame(model.cluster_centers_, columns=["count", "mean"])
```
%% Cell type:code id: tags:
```
style.default_axes_and_ticks()
style.figsize()
## Plot scatter by cluster / color, and centroids
colors = ["red", "green", "blue"]
df_user_graph['color'] = df_user_graph['predicted'].map(lambda p: colors[p])
ax = df_user_graph.plot(
    kind="scatter",
    x="count", y="mean",
    c = df_user_graph['color']
)
centroids.plot(
    kind="scatter",
    x="count", y="mean",
    marker="*", c=["r", "g", "b"], s=550,
    ax=ax
)
```
%% Cell type:markdown id: tags:
# trying the same above graph using different syntax
%% Cell type:code id: tags:
```
df_user_graph_cluster = df_user_graph_full.loc[:,['count','mean']]
#df_user_graph_cluster.head(5)
```
%% Cell type:code id: tags:
```
# fixed random_state makes this clustering reproducible, unlike the version above
kmeans = KMeans(n_clusters=3, random_state=111)
kmeans.fit(df_user_graph_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
# points colored by cluster label; grey dots mark the cluster centroids
plt.scatter(df_user_graph_cluster['count'],df_user_graph_cluster['mean'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
#plt.yscale("log")
```
%% Cell type:markdown id: tags:
# ReqMemCPU,Corecount,Runtime
%% Cell type:code id: tags:
```
# cutoff (in gigs of requested RAM per CPU) for this section's plots
UpperlimitGB1 = 50
```
%% Cell type:code id: tags:
```
# df_4: completed jobs, keeping only memory, runtime, and core-count columns
df_4 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]
df_4.head(5)
```
%% Cell type:code id: tags:
```
# round requested memory up to whole gigs
df_4['ReqMemCPU'] = df_4['ReqMemCPU'].apply(np.ceil)
```
%% Cell type:code id: tags:
```
# round runtime up to whole units (Elapsed was divided by 3600 earlier)
df_4['Elapsed'] = df_4['Elapsed'].apply(np.ceil)
```
%% Cell type:code id: tags:
```
# NOTE(review): result is neither assigned nor inplace — this only displays the sorted frame
df_4.sort_values(by='AllocCPUS', ascending=True)
```
%% Cell type:code id: tags:
```
# keep only jobs requesting at most UpperlimitGB1 gigs per CPU
df_4_cutoff = df_4[(df_4['ReqMemCPU'] <= UpperlimitGB1)]
df_4_cutoff
```
%% Cell type:code id: tags:
```
# scatter of runtime vs. requested RAM per CPU for the cutoff set
style.default_axes_and_ticks()
style.figsize()
user_graph5 = sns.scatterplot(x="ReqMemCPU", y="Elapsed",data=df_4_cutoff)
#hue="AllocCPUS")
#, size="AllocCPUS")
#plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('ReqMemCPU')
plt.ylabel('Runtime')
#plt.yscale("log")
plt.show()
```
%% Cell type:code id: tags:
```
# same memory cutoff as above, but keeping all three columns for clustering
df_runtime_graph_cluster = df_4[(df_4['ReqMemCPU'] <= UpperlimitGB1)]
#df_runtime_graph_cluster.head(5)
```
%% Cell type:code id: tags:
```
# NOTE(review): fit uses all three columns (ReqMemCPU, Elapsed, AllocCPUS),
# so centroid columns 0 and 1 below correspond to ReqMemCPU and Elapsed
kmeans = KMeans(n_clusters=4, random_state=111)
kmeans.fit(df_runtime_graph_cluster)
print(kmeans.cluster_centers_)
```
%% Cell type:code id: tags:
```
# points colored by cluster label; grey dots mark the centroids
plt.scatter(df_runtime_graph_cluster['ReqMemCPU'],df_runtime_graph_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')
plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='grey')
#plt.yscale("log")
plt.xlabel('ReqMemCPU')
plt.ylabel('Runtime')
```
%% Cell type:markdown id: tags:
# Average RAM per CPU by Job
%% Cell type:code id: tags:
```
# must run
# df_3 is database with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID
# it is used to pull out needed information and create separate datasets to compare
df_3 = df_batch.loc[:,['ReqMemCPU','JobID']]
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# round requested memory up to whole gigs
df_3['ReqMemCPU'] = df_3['ReqMemCPU'].apply(np.ceil)
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# must run
# variable to be used in names of plots to describe the max gigs measured
UpperlimitGB = 50
```
%% Cell type:code id: tags:
```
# must run
# creates database from df_3 that returns all RAM per CPU requested up to the UpperRAMlimit defined above
gig_cutoff = df_3[(df_3.ReqMemCPU <= UpperlimitGB)]
#gig_cutoff.head(5)
```
%% Cell type:code id: tags:
```
# renames JobID column to JobCount since that's what it is now
df_cpu_per_job = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
#df_cpu_per_job.head(5)
```
%% Cell type:code id: tags: