Commit d9cf8863 authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

putting users and jobs graphs together

parent 428e2440
This diff is collapsed.
%% Cell type:markdown id: tags:
# Notebook Setup
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
```
%% Cell type:code id: tags:
```
from RC_styles import rc_styles as style
```
%% Cell type:code id: tags:
```
# must run
# opens a sqlite3 connection to the file holding the slurm job info since March 2020
# every later query in this notebook reads through this connection
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# UpperlimitGB is the max gigs of RAM per CPU measured; it is also used in the names of plots
UpperlimitGB = 5
# upperRAMlimit is the cutoff applied to the ReqMemCPU column - charts range from 0 to upperRAMlimit
# ReqMemCPU is converted from bytes to gigs before it is filtered, so the cutoff is kept in gigs too
upperRAMlimit = UpperlimitGB
```
%% Cell type:code id: tags:
```
# must run
# df is the starting dataframe: every column of every row in the slurm table
df = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
```
# voluntary
# removes the limit on displayed columns so all available column options show,
# then previews the first 5 rows of df
pd.set_option('display.max_columns', None)
df.head(5)
```
%% Cell type:code id: tags:
```
# must run
# converts units in ReqMemCPU column from bytes to gigs (1 gig = 1024**3 bytes)
# NOTE: run once per load of df - running this cell again divides the column a second time
df['ReqMemCPU'] = df['ReqMemCPU'] / (1024 ** 3)
```
%% Cell type:code id: tags:
```
# must run
# df_1 is dataframe of all jobs whose State contains 'COMPLETED'
df_1 = df.loc[df['State'].str.contains('COMPLETED')]
df_1.head(5)
```
%% Cell type:markdown id: tags:
# Average RAM per CPU Requested by User
%% Cell type:code id: tags:
```
# must run
# df_2 is dataframe of completed jobs with only the User and ReqMemCPU columns
# it is used for the user dataframes
df_2 = df_1.loc[:,['User','ReqMemCPU']]
df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# turns empty strings into NaN and then drops the rows whose User is NaN,
# leaving a dataset where every job has a user
# the result is reassigned instead of edited in place: df_2 is a selection taken from df_1,
# and in-place edits on a selection can raise pandas' SettingWithCopyWarning
nan_value = float("NaN")
df_2 = df_2.replace("", nan_value).dropna(subset=["User"])
df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_user has one row per user holding the describe() statistics of that user's ReqMemCPU values
# count = count of jobs per user
# mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
df_user.head(5)
```
%% Cell type:code id: tags:
```
# must run
# adds a 'User Number' column: the strings '0', '1', ... up to the total number of users,
# one per row of df_user in its current order
# meant to be used in graphs in place of usernames
usernames = df_user['User']
user_numbers = list(map(str, range(len(usernames))))
df_user['User Number'] = user_numbers
df_user
```
%% Cell type:code id: tags:
```
# must run
# df_user_graph is df_user sorted in descending order by count (number of jobs per user)
# the previous sort by mean was overwritten by this one before anything used it, so it is removed
df_user_graph = df_user.sort_values(by='count', ascending=False)
# tail shows the users with the fewest jobs
df_user_graph.tail(5)
```
%% Cell type:code id: tags:
```
# gives description (mean, std, min, percentiles, max) of the number of jobs per user
df_user_graph['count'].describe()
```
%% Cell type:code id: tags:
```
# scatter plot of the average requested RAM per CPU for each user
# both the size and the color of a point follow that user's job count
style.default_axes_and_ticks()
style.figsize()
user_graph1 = sns.scatterplot(
    data=df_user_graph,
    x="User",
    y="mean",
    hue="count",
    size="count",
    sizes=(20, 100),
    # NOTE(review): 162 looks like the number of users rather than the largest job count - confirm
    hue_norm=(1, 162),
)
# hides the usernames on the x axis; the axis title itself is set again below
user_graph1.set(xticklabels=[], xlabel=None)
plt.legend(scatterpoints=1, title='Job Count')
plt.title('Average Requested RAM per CPU by User for all Users')
plt.xlabel('User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show()
```
%% Cell type:code id: tags:
```
# bar graph of the average requested RAM per CPU for each user, all bars in blue
# style.figsize() and a log y scale were tried here and are left off
style.default_axes_and_ticks()
user_graph = sns.barplot(
    data=df_user_graph,
    x="User",
    y="mean",
    color='blue',
)
# hides the usernames on the x axis; the axis title itself is set again below
user_graph.set(xticklabels=[], xlabel=None)
plt.title('Average Requested RAM per CPU by User for all Users')
plt.xlabel('User')
plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show()
```
%% Cell type:code id: tags:
```
# bar graph of the average requested RAM per CPU for each user, over all of that user's jobs
# bars are colored by that average; hovering also shows the user's max request and job count
# the px.bar( opening line appeared twice, which left the first call unclosed - only one call is kept
# x is 'User Number', as in the other plotly graphs of this notebook, so usernames stay out of the hover text
user_graph = px.bar(df_user_graph, x='User Number', y='mean', color='mean',
                    hover_data=['max', 'count'],
                    labels={'mean': 'Average Requested RAM per CPU (Gigs)'},
                    height=400)
user_graph.update_xaxes(showticklabels=False)
# category axis: the user numbers are treated as labels, not as numeric positions
user_graph.update_layout(
    xaxis_type='category',
    title={
        'text': "Average Requested RAM per CPU by User for all Users",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
user_graph.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of all users whose number of jobs equal 1060 or less.
df_without_outlier = df_user[df_user['count'] <= 1060]
```
%% Cell type:code id: tags:
```
# must run
# df_without_outlier_graph is df_without_outlier sorted in descending order by mean for easy readability of graph
df_without_outlier_graph = df_without_outlier.sort_values(by='mean', ascending=False)
df_without_outlier_graph.head(5)
```
%% Cell type:code id: tags:
```
# bar graph of average requested RAM per CPU for all users whose number of jobs equal 1060 or less
# bars are colored by that average; hovering also shows the user's job count and max request
# NOTE(review): an earlier note said this covers 21 out of 162 users, which is the same count
# noted for the over-1060 group - confirm against the data
without_outlier_graph = px.bar(
    df_without_outlier_graph,
    x='User Number',
    y='mean',
    color='mean',
    hover_data=['count', 'max'],
    labels={'mean': 'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
without_outlier_graph.update_xaxes(showticklabels=False)
# category axis: the user numbers are treated as labels, not as numeric positions
without_outlier_graph.update_layout(
    xaxis_type='category',
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running 1060 Jobs or Less",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)
without_outlier_graph.show()
```
%% Cell type:code id: tags:
```
# voluntary
# gives description of the counts (number of jobs ran) for all the users
# the 25% and 75% values shown here are what the hard-coded cutoffs in the following cells are compared against
df_count = df_user['count'].describe()
df_count
```
%% Cell type:code id: tags:
```
# must run
# dataset of users whose number of jobs ran falls in the lower 25 percentile of jobs ran
# NOTE(review): 4.25 is hard-coded - presumably the 25% value from df_user['count'].describe(); confirm when the data changes
df_25percent = df_user[df_user['count'] <= 4.25]
print(df_25percent.User.count(),'users in the lower 25% out of', df_user.User.count(),'users total')
```
%% Cell type:code id: tags:
```
# must run
# df_25percent_graph is df_25percent sorted in descending order by mean for easy readability of graph
df_25percent_graph = df_25percent.sort_values(by='mean', ascending=False)
df_25percent_graph.head(5)
```
%% Cell type:code id: tags:
```
# bar graph of average requested RAM per CPU for all users whose number of jobs fall in the lower 25th percentile
# bars are colored by that average; hovering also shows the user's job count and max request
# originally noted as representing 41 out of 162 users
lower25percent = px.bar(
    df_25percent_graph,
    x='User Number',
    y='mean',
    color='mean',
    hover_data=['count', 'max'],
    labels={'mean': 'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
lower25percent.update_xaxes(showticklabels=False)
# category axis: the user numbers are treated as labels, not as numeric positions
lower25percent.update_layout(
    xaxis_type='category',
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running 4.25 Jobs or Less",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)
lower25percent.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of users whose number of jobs ran falls between the lower and upper cutoffs (more than 4.25 and fewer than 145 jobs)
# NOTE(review): 4.25 and 145 are hard-coded - presumably the 25% and 75% values from df_user['count'].describe(); confirm when the data changes
df_mid = df_user[(df_user['count'] > 4.25) & (df_user['count'] < 145)]
print(df_mid.User.count(),'users in the middle range out of', df_user.User.count(),'users total')
```
%% Cell type:code id: tags:
```
# must run
# df_mid_graph is df_mid sorted in descending order by mean for easy readability of graph
df_mid_graph = df_mid.sort_values(by='mean', ascending=False)
df_mid_graph.head(5)
```
%% Cell type:code id: tags:
```
# bar graph of average requested RAM per CPU for all users whose number of jobs fall in the
# middle range between the lower and upper 25th percentile
# bars are colored by that average; hovering also shows the user's job count and max request
# originally noted as representing 80 out of 162 users
mid_graph = px.bar(
    df_mid_graph,
    x='User Number',
    y='mean',
    color='mean',
    hover_data=['count', 'max'],
    labels={'mean': 'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
mid_graph.update_xaxes(showticklabels=False)
# category axis: the user numbers are treated as labels, not as numeric positions
mid_graph.update_layout(
    xaxis_type='category',
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running between 4.25 and 145 Jobs",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)
mid_graph.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of users whose number of jobs ran falls in the upper 25th percentile of jobs ran (at or above the 75th percentile)
# NOTE(review): 145 is hard-coded - presumably the 75% value from df_user['count'].describe(); confirm when the data changes
df_75percent = df_user[df_user['count'] >= 145]
# the printed label now says upper 25%, matching the group that is selected
print(df_75percent.User.count(),'users in the upper 25% out of', df_user.User.count(),'users total')
```
%% Cell type:code id: tags:
```
# must run
# df_75percent_graph is df_75percent sorted in descending order by mean for easy readability of graph
df_75percent_graph = df_75percent.sort_values(by='mean', ascending=False)
df_75percent_graph.head(5)
```
%% Cell type:code id: tags:
```
# bar graph of average requested RAM per CPU for all users whose number of jobs fall in the upper 25th percentile
# bars are colored by that average; hovering also shows the user's job count and max request
# originally noted as representing 41 out of 162 users
upper25percent = px.bar(df_75percent_graph, x='User Number', y='mean',
                        hover_data=['count', 'max'], color='mean',
                        labels={'mean': 'Average Requested RAM per CPU (Gigs)'}, height=400)
upper25percent.update_xaxes(showticklabels=False)
# the graphed users ran 145 jobs or more, so the title says "or More" (it used to say "or Less")
upper25percent.update_layout(
    xaxis_type='category',
    title={
        'text': "Average Requested RAM per CPU by User for all Users Running 145 Jobs or More",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
upper25percent.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of users whose number of jobs ran falls in the lower half of the upper 25th percentile
# of jobs ran: from 145 up to and including 1060 jobs
df_lower75percent = df_user[(df_user['count'] >= 145) & (df_user['count'] <= 1060)]
# the printed label names the group that is actually selected
print(df_lower75percent.User.count(),'users in the lower half of the upper 25% out of', df_user.User.count(),'users total')
```
%% Cell type:code id: tags:
```
# must run
# df_lower75percent_graph is df_lower75percent sorted in descending order by mean for easy readability of graph
df_lower75percent_graph = df_lower75percent.sort_values(by='mean', ascending=False)
df_lower75percent_graph.head(5)
```
%% Cell type:code id: tags:
```
# bar graph of average requested RAM per CPU for all users whose number of jobs fall in the
# lower half of the upper 25th percentile of jobs ran
# bars are colored by that average; hovering also shows the user's max request and job count
# originally noted as representing 20 out of 162 users
lower75percent_graph = px.bar(
    df_lower75percent_graph,
    x='User Number',
    y='mean',
    color='mean',
    hover_data=['max', 'count'],
    labels={'mean': 'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
lower75percent_graph.update_xaxes(showticklabels=False)
# category axis: the user numbers are treated as labels, not as numeric positions
lower75percent_graph.update_layout(
    xaxis_type='category',
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running between 145 and 1060 Jobs",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)
lower75percent_graph.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of users whose number of jobs ran falls in the upper half of the upper 25th percentile
# of jobs ran: more than 1060 jobs
df_upper75percent = df_user[df_user['count'] > 1060]
# the printed label names the group that is actually selected
print(df_upper75percent.User.count(),'users in the upper half of the upper 25% out of', df_user.User.count(),'users total')
```
%% Cell type:code id: tags:
```
# must run
# df_upper75percent_graph is df_upper75percent sorted in descending order by mean for easy readability of graph
df_upper75percent_graph = df_upper75percent.sort_values(by='mean', ascending=False)
df_upper75percent_graph.head(5)
```
%% Cell type:code id: tags:
```
# bar graph of average requested RAM per CPU for all users whose number of jobs fall in the
# upper half of the upper 25th percentile of jobs ran
# bars are colored by that average; hovering also shows the user's max request and job count
# originally noted as representing 21 out of 162 users
upper75percent_graph = px.bar(
    df_upper75percent_graph,
    x='User Number',
    y='mean',
    color='mean',
    hover_data=['max', 'count'],
    labels={'mean': 'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
upper75percent_graph.update_xaxes(showticklabels=False)
# category axis: the user numbers are treated as labels, not as numeric positions
upper75percent_graph.update_layout(
    xaxis_type='category',
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running over 1060 Jobs",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
)
upper75percent_graph.show()
```
%% Cell type:markdown id: tags:
# Number of Jobs Requested per User
%% Cell type:code id: tags:
```
# must run
# df_3 is dataframe of completed jobs with only the User and JobIDSlurm columns
df_3 = df_1.loc[:,['User','JobIDSlurm']]
df_3.head(5)
```
%% Cell type:code id: tags:
```
# must run
# turns empty strings into NaN and then drops the rows whose User is NaN,
# leaving a dataset where every job has a user
# the result is reassigned instead of edited in place: df_3 is a selection taken from df_1,
# and in-place edits on a selection can raise pandas' SettingWithCopyWarning
nan_value = float("NaN")
df_3 = df_3.replace("", nan_value).dropna(subset=["User"])
df_3.head(20)
```
%% Cell type:code id: tags:
```
# must run
# df_user_count has one row per user
# its JobIDSlurm column holds the count of that user's jobs (non-empty JobIDSlurm values), not a job id
df_user_count = df_3.groupby('User')['JobIDSlurm'].count().reset_index()
df_user_count.head(5)
```
%% Cell type:code id: tags:
```
# dataset of users who ran fewer than 3000 jobs; users at or above that count are left out as outliers
df_count_without_outlier = df_user_count[df_user_count['JobIDSlurm'] < 3000]
```
%% Cell type:code id: tags:
```
# df_count_without_outlier_graph is df_count_without_outlier sorted in descending order by job count for easy readability of graph
df_count_without_outlier_graph = df_count_without_outlier.sort_values(by='JobIDSlurm', ascending=False)
df_count_without_outlier_graph
```
%% Cell type:code id: tags:
```
# bar graph of the number of jobs run per user, for users running under 3000 jobs
# the JobIDSlurm column holds each user's job count here, so it is relabeled for the axis and color bar
# bars are colored by that job count
count_without_outlier_graph = px.bar(df_count_without_outlier_graph, x='User', y='JobIDSlurm', color='JobIDSlurm',
                                     labels={'JobIDSlurm': 'Number of Jobs'},
                                     height=400)
count_without_outlier_graph.update_xaxes(showticklabels=False)
# the title describes what is graphed (job counts); it used to repeat the RAM per CPU title
count_without_outlier_graph.update_layout(
    xaxis_type='category',
    title={
        'text': "Number of Jobs per User for all Users Running under 3000 Jobs",
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
count_without_outlier_graph.show()
```
%% Cell type:markdown id: tags:
# Average RAM per CPU by Job
%% Cell type:code id: tags:
```
# must run
# df_3 is dataframe of completed jobs with only JobStep, User, JobName, ReqMemCPU, ArrayJobID, and ArrayTaskID
# NOTE: this reuses the name df_3, replacing the User/JobIDSlurm dataframe from the previous section
# it is used to pull out needed information and create separate datasets to compare
df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]
#df_3.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_batch is a True/False mask over the rows of df_3: True where JobName contains 'batch'
# na=False counts a missing JobName as "not batch"; without it the mask would carry NaN
# for those rows and could not be used to filter df_3
df_batch = df_3.JobName.str.contains('batch', na=False)
#df_3[df_batch].head(5)
```
%% Cell type:code id: tags:
```
# must run
# batch_cutoff is the batch jobs of df_3 whose requested RAM per CPU is at or under upperRAMlimit
# upperRAMlimit has to be defined before this cell runs, in the same units as ReqMemCPU
# both conditions are combined into one mask so df_3 is filtered once instead of twice
batch_cutoff = df_3[df_batch & (df_3.ReqMemCPU <= upperRAMlimit)]
#print(batch_cutoff.head(5))
```
%% Cell type:code id: tags:
```
# voluntary
# gives mean, min, max, std, and 3 percentiles for cutoff data
# describe() takes include= and exclude= arguments to change what is covered; the defaults are used here
batch_cutoff.describe()
```
%% Cell type:code id: tags:
```
# histogram of requested RAM per CPU for all jobs (array and non array jobs) within the cutoff
# the job counts are drawn on a log scale
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm the installed version still provides it
Jobs_fig = sns.distplot(
    batch_cutoff['ReqMemCPU'],
    kde=False,
    color="green",
    label='Number of Jobs Requesting RAM per CPU for all Jobs',
)
Jobs_fig.set_yscale('log')
plt.legend(
    loc='upper right',
    bbox_to_anchor=(2.25, 1.0),
    ncol=1,
    prop={'size': 12},
)
# UpperlimitGB must already be defined; it fills in the cutoff shown in the title
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting')
```
%% Cell type:code id: tags:
```
```
%% Cell type:code id: tags:
```
```
%% Cell type:code id: tags:
```
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment