Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
RC Data Science
createAndParseSACCT
Commits
d4a3139b
Commit
d4a3139b
authored
Jun 18, 2020
by
Ryan Randles Jones
Browse files
update
parent
7d55d10a
Changes
2
Hide whitespace changes
Inline
Side-by-side
RC_styles**
@
7a2fd654
Compare
7a2fd654
...
7a2fd654
Subproject commit 7a2fd65482b64345b1cd923f38846433f7c33399
slurm-2sql.ipynb
View file @
d4a3139b
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Notebook Setup
# Notebook Setup
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
import sqlite3
import sqlite3
import slurm2sql
import slurm2sql
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
%matplotlib inline
%matplotlib inline
import seaborn as sns
import seaborn as sns
import plotly.express as px
import plotly.express as px
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
# must run
# must run
# creates database of info from March 2020 using sqlite 3
# creates database of info from March 2020 using sqlite 3
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# creates database of allocation info from March 2020 using sqlite 3
# creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option
# not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# variable to be used in the names of plots to describe the max gigs measured
# variable to be used in the names of plots to describe the max gigs measured
#UpperlimitGB = 5
#UpperlimitGB = 5
# variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs
# variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs
#upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs
#upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# df is the starting dataframe, read from the slurm table
# df is the starting dataframe, read from the slurm table
df = pd.read_sql('SELECT * FROM slurm', db)
df = pd.read_sql('SELECT * FROM slurm', db)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# voluntary
# voluntary
# for displaying all available column options
# for displaying all available column options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_columns', None)
df.head(5)
df.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# converts units in ReqMemCPU column from bytes to gigs
# converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# df_1 is dataframe of all completed jobs
# df_1 is dataframe of all completed jobs
df_1 = df[df.State.str.contains('COMPLETED')]
df_1 = df[df.State.str.contains('COMPLETED')]
df_1.head(5)
df_1.head(5)
```
```
%% Cell type:markdown id: tags:
%% Cell type:markdown id: tags:
# Average RAM per CPU Requested by User
# Average RAM per CPU Requested by User
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# df_2 is a dataframe of completed jobs with only the User and ReqMemCPU columns
# df_2 is a dataframe of completed jobs with only the User and ReqMemCPU columns
# it is used for the user dataframes
# it is used for the user dataframes
df_2 = df_1.loc[:,['User','ReqMemCPU']]
df_2 = df_1.loc[:,['User','ReqMemCPU']]
df_2.head(5)
df_2.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings
# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings
nan_value = float("NaN")
nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True)
df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset = ["User"], inplace=True)
df_2.dropna(subset = ["User"], inplace=True)
df_2.head(5)
df_2.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# count = count of jobs per user
# count = count of jobs per user
# mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs
# mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
df_user.head(5)
df_user.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# creates a user number column of strings of numbers from 0 to the total number of users minus 1
# creates a user number column of strings of numbers from 0 to the total number of users minus 1
# used in graphs in place of usernames
# used in graphs in place of usernames
usernames = df_user['User']
usernames = df_user['User']
user_numbers = [str(i) for i in range(len(usernames))]
user_numbers = [str(i) for i in range(len(usernames))]
df_user['User Number'] = user_numbers
df_user['User Number'] = user_numbers
df_user
df_user
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# df_user_graph is df_user sorted in descending order by mean for easy readability of graph
# df_user_graph is df_user sorted in descending order by mean for easy readability of graph
df_user_graph = df_user.sort_values(by='mean', ascending=False)
df_user_graph = df_user.sort_values(by='mean', ascending=False)
df_user_graph.head(5)
df_user_graph.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# bar graph for jobs run per user - shows average requested RAM per CPU for all jobs by user
# bar graph for jobs run per user - shows average requested RAM per CPU for all jobs by user
user_graph = px.bar(df_user_graph, x='User Number', y='mean', color = 'mean',
user_graph = px.bar(df_user_graph, x='User Number', y='mean', color = 'mean',
hover_data=['max','count'],
hover_data=['max','count'],
labels={'mean':'Average Requested RAM per CPU (Gigs)'},
labels={'mean':'Average Requested RAM per CPU (Gigs)'},
height=400)
height=400)
user_graph.update_xaxes(showticklabels=False)
user_graph.update_xaxes(showticklabels=False)
user_graph.update_layout(
user_graph.update_layout(
xaxis_type = 'category',
xaxis_type = 'category',
title={
title={
'text': "Average Requested RAM per CPU by User for all Users",
'text': "Average Requested RAM per CPU by User for all Users",
'y':0.9,
'y':0.9,
'x':0.5,
'x':0.5,
'xanchor': 'center',
'xanchor': 'center',
'yanchor': 'top'})
'yanchor': 'top'})
user_graph.show()
user_graph.show()
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# dataset of all users whose number of jobs is 1060 or less.
# dataset of all users whose number of jobs is 1060 or less.
df_without_outlier = df_user[df_user['count'] <= 1060]
df_without_outlier = df_user[df_user['count'] <= 1060]
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# df_without_outlier_graph is df_without_outlier sorted in descending order by mean for easy readability of graph
# df_without_outlier_graph is df_without_outlier sorted in descending order by mean for easy readability of graph
df_without_outlier_graph = df_without_outlier.sort_values(by='mean', ascending=False)
df_without_outlier_graph = df_without_outlier.sort_values(by='mean', ascending=False)
df_without_outlier_graph.head(5)
df_without_outlier_graph.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# bar graph of jobs run per user for all users whose number of jobs is 1060 or less.
# bar graph of jobs run per user for all users whose number of jobs is 1060 or less.
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# represents 21 out of 162 users
# represents 21 out of 162 users
without_outlier_graph = px.bar(df_without_outlier_graph, x='User Number', y='mean',
without_outlier_graph = px.bar(df_without_outlier_graph, x='User Number', y='mean',
hover_data=['count', 'max'], color='mean',
hover_data=['count', 'max'], color='mean',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
without_outlier_graph.update_xaxes(showticklabels=False)
without_outlier_graph.update_xaxes(showticklabels=False)
without_outlier_graph.update_layout(
without_outlier_graph.update_layout(
xaxis_type = 'category',
xaxis_type = 'category',
title={
title={
'text': "Average Requested RAM per CPU by User for all Users Running 1060 Jobs or Less",
'text': "Average Requested RAM per CPU by User for all Users Running 1060 Jobs or Less",
'y':0.9,
'y':0.9,
'x':0.5,
'x':0.5,
'xanchor': 'center',
'xanchor': 'center',
'yanchor': 'top'})
'yanchor': 'top'})
without_outlier_graph.show()
without_outlier_graph.show()
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
#voluntary
#voluntary
# gives description of the counts (number of jobs ran) for all the users
# gives description of the counts (number of jobs ran) for all the users
df_count = df_user['count'].describe()
df_count = df_user['count'].describe()
df_count
df_count
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# dataset of users whose number of jobs ran falls in the lower 25% of jobs ran (at or below the 25th percentile)
# dataset of users whose number of jobs ran falls in the lower 25% of jobs ran (at or below the 25th percentile)
df_25percent = df_user[df_user['count'] <= 4.25]
df_25percent = df_user[df_user['count'] <= 4.25]
print(df_25percent.User.count(),'users in the lower 25% out of', df_user.User.count(),'users total')
print(df_25percent.User.count(),'users in the lower 25% out of', df_user.User.count(),'users total')
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# df_25percent_graph is df_25percent sorted in descending order by mean for easy readability of graph
# df_25percent_graph is df_25percent sorted in descending order by mean for easy readability of graph
df_25percent_graph = df_25percent.sort_values(by='mean', ascending=False)
df_25percent_graph = df_25percent.sort_values(by='mean', ascending=False)
df_25percent_graph.head(5)
df_25percent_graph.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# bar graph jobs run per user for all users whose number of jobs fall in the lower 25th percentile.
# bar graph jobs run per user for all users whose number of jobs fall in the lower 25th percentile.
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# represents 41 out of 162 users
# represents 41 out of 162 users
lower25percent = px.bar(df_25percent_graph, x='User Number', y='mean',
lower25percent = px.bar(df_25percent_graph, x='User Number', y='mean',
hover_data=['count', 'max'], color='mean',
hover_data=['count', 'max'], color='mean',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
lower25percent.update_xaxes(showticklabels=False)
lower25percent.update_xaxes(showticklabels=False)
lower25percent.update_layout(
lower25percent.update_layout(
xaxis_type = 'category',
xaxis_type = 'category',
title={
title={
'text': "Average Requested RAM per CPU by User for all Users Running 4.25 Jobs or Less",
'text': "Average Requested RAM per CPU by User for all Users Running 4.25 Jobs or Less",
'y':0.9,
'y':0.9,
'x':0.5,
'x':0.5,
'xanchor': 'center',
'xanchor': 'center',
'yanchor': 'top'})
'yanchor': 'top'})
lower25percent.show()
lower25percent.show()
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# dataset of users whose number of jobs ran falls between the 25th and 75th percentile of jobs ran
# dataset of users whose number of jobs ran falls between the 25th and 75th percentile of jobs ran
df_mid = df_user[(df_user['count'] > 4.25) & (df_user['count'] < 145)]
df_mid = df_user[(df_user['count'] > 4.25) & (df_user['count'] < 145)]
print(df_mid.User.count(),'users in the middle range out of', df_user.User.count(),'users total')
print(df_mid.User.count(),'users in the middle range out of', df_user.User.count(),'users total')
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# df_mid_graph is df_mid sorted in descending order by mean for easy readability of graph
# df_mid_graph is df_mid sorted in descending order by mean for easy readability of graph
df_mid_graph = df_mid.sort_values(by='mean', ascending=False)
df_mid_graph = df_mid.sort_values(by='mean', ascending=False)
df_mid_graph.head(5)
df_mid_graph.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# bar graph jobs run per user for all users whose number of jobs fall in the middle range between the lower and upper 25th percentile.
# bar graph jobs run per user for all users whose number of jobs fall in the middle range between the lower and upper 25th percentile.
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# represents 80 out of 162 users
# represents 80 out of 162 users
mid_graph = px.bar(df_mid_graph, x='User Number', y='mean',
mid_graph = px.bar(df_mid_graph, x='User Number', y='mean',
hover_data=['count', 'max'], color='mean',
hover_data=['count', 'max'], color='mean',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
mid_graph.update_xaxes(showticklabels=False)
mid_graph.update_xaxes(showticklabels=False)
mid_graph.update_layout(
mid_graph.update_layout(
xaxis_type = 'category',
xaxis_type = 'category',
title={
title={
'text': "Average Requested RAM per CPU by User for all Users Running between 4.25 and 145 Jobs",
'text': "Average Requested RAM per CPU by User for all Users Running between 4.25 and 145 Jobs",
'y':0.9,
'y':0.9,
'x':0.5,
'x':0.5,
'xanchor': 'center',
'xanchor': 'center',
'yanchor': 'top'})
'yanchor': 'top'})
mid_graph.show()
mid_graph.show()
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# dataset of users whose number of jobs ran falls in the upper 25% of jobs ran (at or above the 75th percentile)
# dataset of users whose number of jobs ran falls in the upper 25% of jobs ran (at or above the 75th percentile)
df_75percent = df_user[df_user['count'] >= 145]
df_75percent = df_user[df_user['count'] >= 145]
print(df_75percent.User.count(),'users in the upper 75% out of', df_user.User.count(),'users total')
print(df_75percent.User.count(),'users in the upper 75% out of', df_user.User.count(),'users total')
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# df_75percent_graph is df_75percent sorted in descending order by mean for easy readability of graph
# df_75percent_graph is df_75percent sorted in descending order by mean for easy readability of graph
df_75percent_graph = df_75percent.sort_values(by='mean', ascending=False)
df_75percent_graph = df_75percent.sort_values(by='mean', ascending=False)
df_75percent_graph.head(5)
df_75percent_graph.head(5)
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# bar graph jobs run per user for all users whose number of jobs fall in the upper 25th percentile.
# bar graph jobs run per user for all users whose number of jobs fall in the upper 25th percentile.
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# represents 41 out of 162 users
# represents 41 out of 162 users
upper25percent = px.bar(df_75percent_graph, x='User Number', y='mean',
upper25percent = px.bar(df_75percent_graph, x='User Number', y='mean',
hover_data=['count', 'max'], color='mean',
hover_data=['count', 'max'], color='mean',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
upper25percent.update_xaxes(showticklabels=False)
upper25percent.update_xaxes(showticklabels=False)
upper25percent.update_layout(
upper25percent.update_layout(
xaxis_type = 'category',
xaxis_type = 'category',
title={
title={
'text': "Average Requested RAM per CPU by User for all Users Running 145 Jobs or Less",
'text': "Average Requested RAM per CPU by User for all Users Running 145 Jobs or Less",
'y':0.9,
'y':0.9,
'x':0.5,
'x':0.5,
'xanchor': 'center',
'xanchor': 'center',
'yanchor': 'top'})
'yanchor': 'top'})
upper25percent.show()
upper25percent.show()
```
```
%% Cell type:code id: tags:
%% Cell type:code id: tags:
```
```
# must run
# must run
# dataset of users whose number of jobs ran falls in the lower half of the upper 25th percentile of jobs ran (or the 75th percentile)
# dataset of users whose number of jobs ran falls in the lower half of the upper 25th percentile of jobs ran (or the 75th percentile)
df_lower75percent = df_user[(df_user['count'] >= 145) & (df_user['count'] <= 1060)]
df_lower75percent = df_user[(df_user['count'] &