Commit e8ae73a2 authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added graphs and updated doc strings

parent 6efc4f62
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
import sqlite3 import sqlite3
import slurm2sql import slurm2sql
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
%matplotlib inline %matplotlib inline
import seaborn as sns import seaborn as sns
import plotly.express as px import plotly.express as px
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database of info from March 2020 using sqlite 3 # creates database of info from March 2020 using sqlite 3
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3') db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# creates database of allocation info from March 2020 using sqlite 3 # creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option # not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3') #db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# variable to be used in names of plots to describe the max gigs measured # variable to be used in names of plots to describe the max gigs measured
UpperlimitGB = 5 UpperlimitGB = 5
# variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs # variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs
upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_1 is starting database # df_1 is starting database
df_1 = pd.read_sql('SELECT * FROM slurm', db) df_1 = pd.read_sql('SELECT * FROM slurm', db)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # voluntary
# for displaying all available column options # for displaying all available column options
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
df_1.head(5) df_1.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_2 is database with only ReqMemCpu and ReqMemNode, and ArrayTaskID # df_2 is database with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID
df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']] # it is used to pull out needed information and create separate datasets to compare
df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]
#df_2.head(5) #df_2.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_user is df_2 with only user defined jobs # fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings
df_3 = df_2[df_2['JobStep'].isnull()] # jobs where jobstep is None
df_3
```
%% Cell type:code id: tags: nan_value = float("NaN")
``` df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]
df_user = df_3.loc[:,['User', 'JobName', 'ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]
df_user df_3.replace("", nan_value, inplace=True)
df_3.dropna(subset = ["User"], inplace=True)
# df_user is a dataset consisting of each user and the total amount of RAM per CPU they have requested over all jobs they have run
df_user = df_3.groupby(['User']).sum().reset_index()
#df_user.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_batch is df_2 with only batch jobs # df_batch is df_2 with only batch jobs
df_batch = df_2.JobName.str.contains('batch') df_batch = df_2.JobName.str.contains('batch')
#df_batch #df_2[df_batch].head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database from df_batch of ReqMemCPU batch jobs that are < or = a given point # creates database from df_batch that returns all RAM per CPU requested up to the UpperRAMlimit defined above
JobsCPU_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)] batch_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]
#JobsCPU_cutoff #print(batch_cutoff.head(5))
JobsNode_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemNode <= upperRAMlimit)]
UsersCPU_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]
#UsersCPU_cutoff
UsersNode_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]
# creates database from df_user that returns all RAM per CPU requested up to the upperRAMlimit defined above
user_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]
#user_cutoff.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary #voluntary
# gives mean, min, max, std, and 3 percentiles for cutoff data
# can change what to include or exclude
JobsCPU_cutoff.describe(include=None, exclude=None)
```
%% Cell type:code id: tags:
``` # all the users who run array jobs before the 5 gig cutoff - ArrayJobID and ArrayTaskID > 0 - 16 users out of 230
# voluntary arrayjobs = df_user[(df_user != 0).all(1)]
print(arrayjobs.head(5))
# gives mean, min, max, std, and 3 percentiles for cutoff data # all the users who run array jobs after the 5 gig cutoff - ArrayJobID and ArrayTaskID > 0 - 1 to 2 users out of 230
# can change what to include or exclude arrayjobs_after_cutoff = user_cutoff[(user_cutoff != 0).all(1)]
JobsNode_cutoff.describe(include=None, exclude=None) arrayjobs_after_cutoff
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # voluntary
# gives mean, min, max, std, and 3 percentiles for cutoff data # gives mean, min, max, std, and 3 percentiles for cutoff data
# can change what to include or exclude # can change what to include or exclude
UsersCPU_cutoff.describe(include=None, exclude=None) batch_cutoff.describe(include=None, exclude=None)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # voluntary
# gives mean, min, max, std, and 3 percentiles for cutoff data # gives mean, min, max, std, and 3 percentiles for cutoff data
# can change what to include or exclude # can change what to include or exclude
UsersNode_cutoff.describe(include=None, exclude=None) user_cutoff.describe(include=None, exclude=None)
```
%% Cell type:code id: tags:
```
# must run
# creates databases of Requested Ram per CPU and per Node that have an array task id using the upper RAM limit cutoff
JobsCPU_arraytask = JobsCPU_cutoff.dropna(subset=['ArrayTaskID'])
JobsNode_arraytask = JobsNode_cutoff.dropna(subset=['ArrayTaskID'])
UsersCPU_arraytask = UsersCPU_cutoff.dropna(subset=['ArrayTaskID'])
UsersNode_arraytask = UsersNode_cutoff.dropna(subset=['ArrayTaskID'])
```
%% Cell type:code id: tags:
```
# must run
# creates databases of Requested Ram per CPU and per Node that do not have an array task id using the upper RAM limit cutoff
JobsCPU_nonarraytask = JobsCPU_cutoff[JobsCPU_cutoff['ArrayTaskID'].isnull()]
JobsNode_nonarraytask = JobsNode_cutoff[JobsNode_cutoff['ArrayTaskID'].isnull()]
#JobsCPU_nonarraytask.head(5)
UsersCPU_nonarraytask = UsersCPU_cutoff[UsersCPU_cutoff['ArrayTaskID'].isnull()]
UsersNode_nonarraytask = UsersNode_cutoff[UsersNode_cutoff['ArrayTaskID'].isnull()]
#UsersCPU_nonarraytask.head(5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Number of Jobs vs Users Requesting RAM per CPU # Number of Jobs vs Users Requesting RAM per CPU
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Graphs: <br> Graphs: <br>
Jobs Requesting RAM per CPU for all Jobs Number of Jobs Requesting RAM per CPU for all Jobs
<br> <br>
Users Requesting RAM per CPU for all Jobs Number of Users Requesting RAM per CPU for all Jobs
<br> <br>
Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs
<br> <br>
Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs Detailed look at Users Requesting RAM per CPU for All Jobs
<br> <br>
These graphs create histograms using the data for the month of March 2020. These graphs create histograms using the data for the month of March 2020.
The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs. The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs.
The y axis measures how many jobs requested that amount of RAM per CPU. The y axis measures how many jobs/users requested that amount of RAM per CPU.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows all user requested cpu memory for array and non array jobs # shows the number of jobs requesting cpu memory for all jobs (array and non array jobs)
Jobs_fig = sns.distplot(JobsCPU_cutoff['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array and Non Array Jobs', color = "green") Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = "green")
Jobs_fig.set_yscale('log') Jobs_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB) plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM') plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting') plt.ylabel('Number of Jobs Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows all user requested cpu memory for array and non array jobs # shows number of users requesting cpu memory for all jobs (array and non array jobs)
Users_fig = sns.distplot(UsersCPU_cutoff['ReqMemCPU'], kde=False, label='Users Requesting RAM per CPU for Array and Non Array Jobs', color = "green") Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = "green")
Users_fig.set_yscale('log') Users_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB) plt.title('Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM') plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Users Requesting') plt.ylabel('Number of Users Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison. # shows the number of jobs vs users requesting cpu memory for all jobs (array and non array jobs)
Jobs_arraytask_fig = sns.distplot(JobsCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = "green") Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = "green")
Jobs_arraytask_fig.set_yscale('log') Jobs_fig.set_yscale('log')
Jobs_nonarraytask_fig = sns.distplot(JobsCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs') Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for for all Jobs')
Jobs_nonarraytask_fig.set_yscale('log') Users_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1) plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB) plt.title('Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM') plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting') plt.ylabel('Number of Jobs Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison. # shows a more detailed, interactive view of the number of users requesting cpu memory for all jobs (array and non array jobs)
Users_arraytask_fig = sns.distplot(UsersCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = "green")
Users_arraytask_fig.set_yscale('log') Users_fig = px.histogram(user_cutoff, x="ReqMemCPU",
title='Detailed look at Users Requesting RAM per CPU for All Jobs %i gigs or less'%UpperlimitGB,
labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=user_cutoff.columns,
nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars
)
Users_fig.show()
```
Users_nonarraytask_fig = sns.distplot(UsersCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs') %% Cell type:code id: tags:
Users_nonarraytask_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1) ```
plt.title('Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting')
``` ```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment