Commit 5ec2814a authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added user info graphs and documentation

parent f5dc940e
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
import sqlite3 import sqlite3
import slurm2sql import slurm2sql
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
%matplotlib inline %matplotlib inline
import seaborn as sns import seaborn as sns
import plotly.express as px import plotly.express as px
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database of info from March 2020 using sqlite 3 # creates database of info from March 2020 using sqlite 3
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3') db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# creates database of allocation info from March 2020 using sqlite 3 # creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option # not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3') #db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# variable to be used in names of plots to describe the max gigs measured # variable to be used in names of plots to describe the max gigs measured
UpperlimitGB = 50 UpperlimitGB = 5
# variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs # variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs
#upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df is starting database # df is starting database
df = pd.read_sql('SELECT * FROM slurm', db) df = pd.read_sql('SELECT * FROM slurm', db)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # voluntary
# for displaying all available column options # for displaying all available column options
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
df.head(5) df.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run
# converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3) df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run
# df_1 is dataframe of all completed jobs
df_1 = df[df.State.str.contains('COMPLETED')] df_1 = df[df.State.str.contains('COMPLETED')]
#df_1 #df_1.head(20)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_2 is database of completed jobs with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID # df_2 is database of completed jobs with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID
# it is used for the user dataframes # it is used for the user dataframes
df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID', 'State']] df_2 = df_1.loc[:,['User','ReqMemCPU']]
#df_2 #df_2
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings # fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings
nan_value = float("NaN") nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True) df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset = ["User"], inplace=True) df_2.dropna(subset = ["User"], inplace=True)
#df_2 #df_2.head(5)
```
%% Cell type:code id: tags:
```
# number of jobs completed per user
df_user_job_count = df_2.groupby(['User'])['User'].count()
df_user_job_count
``` ```
%% Cell type:code id: tags: %% Cell type:markdown id: tags:
``` # User Data
df_user_job_count.describe()
```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# average requested RAM per CPU per user # must run
df_user_job_ave = df_2.groupby(['User'])['ReqMemCPU'].mean() # count = count of jobs per user
df_user_job_ave # mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
df_user.sort_values(by='count', ascending=True)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_user_job_ave.describe() # bar graph for jobs run per user - shows average requested RAM per CPU for all jobs by user
user = px.bar(df_user, x='User', y='mean',
hover_data=['mean', 'max'], color='User',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
user.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# total requested RAM per CPU per user # must run
df_user_job_RAMcount = df_2.groupby(['User'])['ReqMemCPU'].sum() # dataset of all users whose number of jobs equal 1000 or less.
df_user_job_RAMcount df_without_outlier = df_user[df_user['count'] <= 1060]
df_without_outlier.sort_values(by='count', ascending=True).head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# mean, min, max, etc for # bar graph jobs run per user for all users whose number of jobs equal 1000 or less.
df_user_job_RAMcount.describe() # shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# represents 21 out of 162 users
without_outlier = px.bar(df_without_outlier, x='User', y='mean',
hover_data=['mean', 'max'], color='count',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
without_outlier.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_user = df_2.groupby(['User']).mean().reset_index() #voluntary
df_user
```
%% Cell type:code id: tags: # gives description of the counts (number of jobs ran) for all the users
df_count = df_user['count'].describe()
``` df_count
# creates database from df_batch that returns all RAM per CPU requested up to the UpperRAMlimit defined above
user_cutoff = df_user[(df_user.ReqMemCPU <= UpperlimitGB)]
user_cutoff
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
total_users = df_user_job_count.count() # total users # must run
total_users
```
%% Cell type:code id: tags:
``` # dataset of users whose number of jobs ran falls in the lower 25 percentile of jobs ran
# number of users whose requested ram per cpu average is equal to or less than the upper limit df_25percent = df_user[df_user['count'] <= 4.25]
users_lessthan_cutoff = user_cutoff.User.count() print(df_25percent.User.count(),'users in the lower 25% out of', df_user.User.count(),'users total')
users_lessthan_cutoff df_25percent.sort_values(by='count', ascending=True).head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # bar graph jobs run per user for all users whose number of jobs fall in the lower 25th percentile.
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# df_3 is database of completed jobs with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID # represents 41 out of 162 users
# it is used for the user dataframes lower25percent = px.bar(df_25percent, x='User', y='mean',
hover_data=['mean', 'max'], color='count',
df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID', 'State']] labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
#df_3 lower25percent.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_batch is df_2 with only batch jobs
df_batch = df_3[df_3.JobName.str.contains('batch')] # dataset of users whose number of jobs ran falls between the median and 75th percentile of jobs ran
#df_batch df_mid = df_user[(df_user['count'] > 4.25) & (df_user['count'] < 145)]
print(df_mid.User.count(),'users in the middle range out of', df_user.User.count(),'users total')
df_mid.sort_values(by='count', ascending=True).head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
total_jobs = df_batch.JobStep.count() # total number of jobs in the month # bar graph jobs run per user for all users whose number of jobs fall in the middle range between the lower and upper 25th percentile.
total_jobs # shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# represents 80 out of 162 users
mid = px.bar(df_mid, x='User', y='mean',
hover_data=['mean', 'max'], color='count',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
mid.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database from df_batch that returns all RAM per CPU requested up to the UpperRAMlimit defined above # dataset of users whose number of jobs ran falls in the upper 25th percentile of jobs ran (or the 75th percentile)
batch_cutoff = df_batch[(df_batch.ReqMemCPU <= UpperlimitGB)] df_75percent = df_user[df_user['count'] >= 145]
batch_cutoff print(df_75percent.User.count(),'users in the upper 75% out of', df_user.User.count(),'users total')
df_75percent.sort_values(by='count', ascending=True).head(20)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# number of jobs that requested ram per cpu equal to or less than the upper limit # bar graph jobs run per user for all users whose number of jobs fall in the upper 25th percentile.
jobs_lessthan_cutoff = batch_cutoff.JobStep.count() # shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
jobs_lessthan_cutoff # represents 41 out of 162 users
upper25percent = px.bar(df_75percent, x='User', y='mean',
hover_data=['mean', 'max', 'count'], color='count',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
upper25percent.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
job_percentage = (jobs_lessthan_cutoff/total_jobs)*100 # must run
job_percentage
```
%% Cell type:markdown id: tags:
# Number of Jobs vs Users Requesting RAM per CPU
%% Cell type:markdown id: tags:
Graphs: <br>
Number of Jobs Requesting RAM per CPU for all Jobs
<br>
Number of Users Requesting RAM per CPU for all Jobs
<br>
Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs
<br>
Detailed look at Users Requesting RAM per CPU for All Jobs
<br>
These graphs create histograms using the data for the month of March 2020.
The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs.
The y axis measures how many jobs/users requested that amount of RAM per CPU.
%% Cell type:code id: tags:
```
# shows number of users requesting cpu memory for all jobs (array and non array jobs) without a cutoff
Users_fig = sns.distplot(df_user_job_count, kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = "green")
#Users_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) # dataset of users whose number of jobs ran falls in the lower half of the upper 25th percentile of jobs ran (or the 75th percentile)
plt.title('Number of Users Requesting RAM per CPU for all Jobs') df_lower75percent = df_user[(df_user['count'] >= 145) & (df_user['count'] <= 1060)]
plt.xlabel('Requested Gigs of RAM') print(df_lower75percent.User.count(),'users in the upper 75% out of', df_user.User.count(),'users total')
plt.ylabel('Number of Users Requesting') df_lower75percent.sort_values(by='count', ascending=True).head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows number of users requesting cpu memory for all jobs (array and non array jobs) # bar graph jobs run per user for all users whose number of jobs that fall in the lower half of the upper 25th percentile of jobs ran (or the 75th percentile)
Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = "green") # shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
#Users_fig.set_yscale('log') # represents 20 out of 162 users
lower75percent = px.bar(df_lower75percent, x='User', y='mean',
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) hover_data=['mean', 'max', 'count'], color='count',
plt.title('Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB) labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
plt.xlabel('Requested Gigs of RAM') lower75percent.show()
plt.ylabel('Number of Users Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows a more detailed, interactive view of the number of users requesting cpu memory for all jobs (array and non array jobs) # must run
Users_fig = px.histogram(df_user_job_count, x="User", # dataset of users whose number of jobs ran falls in the lower half of the upper 25th percentile of jobs ran (or the 75th percentile)
title='Detailed look at Users Requesting RAM per CPU for All Jobs %i gigs or less'%UpperlimitGB, df_upper75percent = df_user[df_user['count'] > 1060]
#labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column print(df_upper75percent.User.count(),'users in the upper 75% out of', df_user.User.count(),'users total')
opacity=0.8, df_upper75percent.sort_values(by='count', ascending=True)
#log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
#hover_data=df_user_job_count.columns,
nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars
)
Users_fig.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# bar graph of jobs run per user for all users whose number of jobs falls in the upper half of the upper 25th percentile of jobs ran (or the 75th percentile)
# shows average requested RAM per CPU for jobs by user, with the color of the bars being the count of jobs run for that user
# represents 21 out of 162 users
upper75percent = px.bar(df_upper75percent, x='User', y='mean',
hover_data=['mean', 'max', 'count'], color='count',
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, height=400)
upper75percent.show()
``` ```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment