Commit ef334e0d authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added titles to graphs

parent 5ec2814a
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
```
%% Cell type:code id: tags:
```
# must run
# opens a sqlite3 connection to the database file of Slurm job info (per the original note: data from March 2020)
# sqlite3.connect() creates a new empty database when no file exists at this path, so a mistyped path does not raise here
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# number of gigs used in the names of plots to describe the max memory measured
UpperlimitGB = 5
# variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit
# NOTE(review): 10e+10 is 1e11, so this evaluates to 5e11 rather than 5 gigs expressed in bytes
# (that would be about 5e9, or 5 * 1024**3). ReqMemCPU is also divided by 1024**3 elsewhere in this
# notebook, so confirm which unit this limit is meant to be in before relying on it.
upperRAMlimit = UpperlimitGB * 10e+10 # intended: 5 gigs (see note above)
```
%% Cell type:code id: tags:
```
# must run
# df is the starting dataframe: every row and column of the slurm table, read through the db connection
df = pd.read_sql(sql='SELECT * FROM slurm', con=db)
```
%% Cell type:code id: tags:
```
# voluntary
# lifts the cap on displayed columns so every available column shows, then previews the first 5 rows of df
pd.options.display.max_columns = None
df.head()
```
%% Cell type:code id: tags:
```
# must run
# rescales the ReqMemCPU column from bytes to gigs (divides by 1024**3)
# the column is overwritten, so running this cell a second time divides the values again
df['ReqMemCPU'] = df['ReqMemCPU'] / 1024**3
```
%% Cell type:code id: tags:
```
# must run
# df_1 is dataframe of all jobs whose State contains 'COMPLETED'
# na=False counts a job with a missing State as not completed; without it a missing State
# puts NaN into the mask and the boolean indexing raises a ValueError
df_1 = df[df.State.str.contains('COMPLETED', na=False)]
#df_1.head(20)
```
%% Cell type:code id: tags:
```
# must run
# df_2 keeps only the User and ReqMemCPU columns of the completed jobs in df_1
# it is used for the user dataframes
df_2 = df_1.loc[:,['User','ReqMemCPU']]
#df_2
```
%% Cell type:code id: tags:
```
# must run
# turns empty strings into NaN (the replace applies to every column of df_2),
# then drops each row whose User is NaN, leaving only jobs that carry a user name
nan_value = float("nan")
df_2.replace(to_replace="", value=nan_value, inplace=True)
df_2.dropna(subset=["User"], inplace=True)
#df_2.head(5)
```
%% Cell type:markdown id: tags:
# User Data
%% Cell type:code id: tags:
```
# must run
# per-user summary of ReqMemCPU over all of that user's jobs:
#   count = number of jobs for the user
#   mean, std, min, 25%, 50%, 75%, max = gigs of memory per cpu requested by the user
df_user = (
    df_2.groupby('User')['ReqMemCPU']
    .describe()
    .reset_index()
)
# shows the users ordered from fewest to most jobs; df_user itself is not reordered
df_user.sort_values('count')
```
%% Cell type:code id: tags:
```
# bar graph with one bar per user - bar height is the average requested RAM per CPU over all jobs by that user
user = px.bar(
    df_user,
    x='User',
    y='mean',
    color='User',
    hover_data=['mean', 'max'],
    labels={'mean':'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
# horizontally centered title, anchored near the top of the figure
user.update_layout(
    title=dict(
        text="Average Requested RAM per CPU by User for all Users",
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
    )
)
user.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of all users whose number of jobs is 1060 or less
df_without_outlier = df_user[df_user['count'].le(1060)]
# preview: the 5 users with the fewest jobs
df_without_outlier.sort_values('count').head()
```
%% Cell type:code id: tags:
```
# bar graph of jobs run per user for all users whose number of jobs is 1060 or less.
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# NOTE(review): the original comment said this represents 21 out of 162 users, the same figure quoted
# for the users with more than 1060 jobs - confirm the real number against df_without_outlier
without_outlier = px.bar(df_without_outlier, x='User', y='mean',
                         hover_data=['mean', 'max'], color='count',
                         labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
without_outlier.update_layout(
    title={
        'text': "Average Requested RAM per CPU by User for all Users Running 1060 Jobs or Less",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
# only this cell's figure is shown: the previous extra user.show() re-rendered the all-users chart
# and raised a NameError whenever the optional all-users chart cell had not been run first
without_outlier.show()
```
%% Cell type:code id: tags:
```
#voluntary
# summary statistics (count, mean, std, min, quartiles, max) of the number of jobs ran per user
# NOTE(review): the 4.25 and 145 cut-offs hard-coded in later cells presumably come from the 25% and 75%
# rows of this output - confirm, and re-check them whenever the underlying data changes
df_count = df_user['count'].describe()
df_count
```
%% Cell type:code id: tags:
```
# must run
# users whose number of jobs ran is 4.25 or less (the lower 25 percent of users by jobs ran)
df_25percent = df_user[df_user['count'].le(4.25)]
# reports how many users fall in this group compared with all users
print(
    df_25percent['User'].count(),
    'users in the lower 25% out of',
    df_user['User'].count(),
    'users total',
)
# preview: the 5 users with the fewest jobs in this group
df_25percent.sort_values('count').head()
```
%% Cell type:code id: tags:
```
# bar graph for the users in df_25percent (number of jobs in the lower 25th percentile).
# one bar per user: height is the average requested RAM per CPU, bar color is the count of jobs run for that user
# represents 41 out of 162 users
lower25percent = px.bar(
    df_25percent,
    x='User',
    y='mean',
    color='count',
    hover_data=['mean', 'max'],
    labels={'mean':'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
# horizontally centered title, anchored near the top of the figure
lower25percent.update_layout(
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running 4.25 Jobs or Less",
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
    )
)
lower25percent.show()
```
%% Cell type:code id: tags:
```
# must run
# users whose number of jobs ran lies strictly between 4.25 and 145
# (the middle range between the lower 25 percent and the upper 25 percent of users)
df_mid = df_user[df_user['count'].gt(4.25) & df_user['count'].lt(145)]
# reports how many users fall in this group compared with all users
print(
    df_mid['User'].count(),
    'users in the middle range out of',
    df_user['User'].count(),
    'users total',
)
# preview: the 5 users with the fewest jobs in this group
df_mid.sort_values('count').head()
```
%% Cell type:code id: tags:
```
# bar graph for the users in df_mid (number of jobs in the middle range between the lower and upper 25th percentile).
# one bar per user: height is the average requested RAM per CPU, bar color is the count of jobs run for that user
# represents 80 out of 162 users
mid = px.bar(
    df_mid,
    x='User',
    y='mean',
    color='count',
    hover_data=['mean', 'max'],
    labels={'mean':'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
# horizontally centered title, anchored near the top of the figure
mid.update_layout(
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running between 4.25 and 145 Jobs",
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
    )
)
mid.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of users whose number of jobs ran is 145 or more (the upper 25 percent of users, i.e. at or above the 75th percentile)
df_75percent = df_user[df_user['count'] >= 145]
# the message names the group as the upper 25%, matching the 'lower 25%' wording used for the bottom group
print(df_75percent.User.count(),'users in the upper 25% out of', df_user.User.count(),'users total')
# preview: the 20 users with the fewest jobs in this group
df_75percent.sort_values(by='count', ascending=True).head(20)
```
%% Cell type:code id: tags:
```
# bar graph jobs run per user for all users whose number of jobs fall in the upper 25th percentile (145 jobs or more).
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# represents 41 out of 162 users
upper25percent = px.bar(df_75percent, x='User', y='mean',
                        hover_data=['mean', 'max', 'count'], color='count',
                        labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
# the title says "or More" because df_75percent is filtered with count >= 145
upper25percent.update_layout(
    title={
        'text': "Average Requested RAM per CPU by User for all Users Running 145 Jobs or More",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
upper25percent.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of users whose number of jobs ran is between 145 and 1060 inclusive
# (the lower part of the upper 25 percent of users by jobs ran)
df_lower75percent = df_user[(df_user['count'] >= 145) & (df_user['count'] <= 1060)]
# the message states the actual bounds of this group instead of repeating the label of the whole upper group
print(df_lower75percent.User.count(),'users running between 145 and 1060 jobs out of', df_user.User.count(),'users total')
# preview: the 5 users with the fewest jobs in this group
df_lower75percent.sort_values(by='count', ascending=True).head(5)
```
%% Cell type:code id: tags:
```
# bar graph for the users in df_lower75percent (the lower half of the upper 25th percentile of jobs ran)
# one bar per user: height is the average requested RAM per CPU, bar color is the count of jobs run for that user
# represents 20 out of 162 users
lower75percent = px.bar(
    df_lower75percent,
    x='User',
    y='mean',
    color='count',
    hover_data=['mean', 'max', 'count'],
    labels={'mean':'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
# horizontally centered title, anchored near the top of the figure
lower75percent.update_layout(
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running between 145 and 1060 Jobs",
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
    )
)
lower75percent.show()
```
%% Cell type:code id: tags:
```
# must run
# dataset of users whose number of jobs ran is more than 1060
# (the upper part of the upper 25 percent of users by jobs ran)
df_upper75percent = df_user[df_user['count'] > 1060]
# the message states the actual bound of this group instead of repeating the label of the whole upper group
print(df_upper75percent.User.count(),'users running more than 1060 jobs out of', df_user.User.count(),'users total')
# shows every user in this group, ordered from fewest to most jobs
df_upper75percent.sort_values(by='count', ascending=True)
```
%% Cell type:code id: tags:
```
# bar graph for the users in df_upper75percent (the upper half of the upper 25th percentile of jobs ran)
# one bar per user: height is the average requested RAM per CPU, bar color is the count of jobs run for that user
# represents 21 out of 162 users
upper75percent = px.bar(
    df_upper75percent,
    x='User',
    y='mean',
    color='count',
    hover_data=['mean', 'max', 'count'],
    labels={'mean':'Average Requested RAM per CPU (Gigs)'},
    height=400,
)
# horizontally centered title, anchored near the top of the figure
upper75percent.update_layout(
    title=dict(
        text="Average Requested RAM per CPU by User for all Users Running over 1060 Jobs",
        x=0.5,
        y=0.9,
        xanchor='center',
        yanchor='top',
    )
)
upper75percent.show()
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment