Commit 6efc4f62 authored by Ryan Randles Jones
Browse files

added jobs vs users graphs

parent 9bfd57c5
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
import sqlite3 import sqlite3
import slurm2sql import slurm2sql
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
%matplotlib inline %matplotlib inline
import seaborn as sns import seaborn as sns
import plotly.express as px import plotly.express as px
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database of info from March 2020 using sqlite 3 # creates database of info from March 2020 using sqlite 3
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3') db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# creates database of allocation info from March 2020 using sqlite 3 # creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option # not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3') #db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# variable used in the names of plots to describe the max gigs measured # variable used in the names of plots to describe the max gigs measured
UpperlimitGB = 5 UpperlimitGB = 5
# variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs # variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs
upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_1 is starting database # df_1 is starting database
df_1 = pd.read_sql('SELECT * FROM slurm', db) df_1 = pd.read_sql('SELECT * FROM slurm', db)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # voluntary
# for displaying all available column options # for displaying all available column options
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
df_1.head(5) df_1.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_2 is database with only ReqMemCpu and ReqMemNode, and ArrayTaskID # df_2 is database with only ReqMemCpu and ReqMemNode, and ArrayTaskID
df_2 = df_1.loc[:,['JobName','ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']] df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]
#df_2.head(5) #df_2.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_batch is df_2 with only batch jobs # df_user is df_2 with only user defined jobs
df_batch = df_1.JobName.str.contains('batch') df_3 = df_2[df_2['JobStep'].isnull()] # jobs where jobstep is None
#df_2[df_batch] df_3
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run df_user = df_3.loc[:,['User', 'JobName', 'ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]
df_user
# creates database from df_batch of ReqMemCPU batch jobs that are < or = a given point
CPU_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]
#CPU_cutoff
Node_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemNode <= upperRAMlimit)]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # must run
# gives mean, min, max, std, and 3 percentiles for cutoff data # df_batch is df_2 with only batch jobs
# can change what to include or exclude df_batch = df_2.JobName.str.contains('batch')
CPU_cutoff.describe(include=None, exclude=None) #df_batch
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # must run
# gives mean, min, max, std, and 3 percentiles for cutoff data
# can change what to include or exclude
Node_cutoff.describe(include=None, exclude=None)
```
%% Cell type:code id: tags: # creates database from df_batch of ReqMemCPU batch jobs that are < or = a given point
JobsCPU_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]
#JobsCPU_cutoff
JobsNode_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemNode <= upperRAMlimit)]
UsersCPU_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]
#UsersCPU_cutoff
UsersNode_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]
```
# must run
# creates databases of Requested Ram per CPU and per Node that have an array task id using the upper RAM limit cutoff
CPU_arraytask = CPU_cutoff.dropna(subset=['ArrayTaskID'])
Node_arraytask = Node_cutoff.dropna(subset=['ArrayTaskID'])
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # voluntary
# creates databases of Requested Ram per CPU and per Node that do not have an array task id using the upper RAM limit cutoff # gives mean, min, max, std, and 3 percentiles for cutoff data
CPU_nonarraytask = CPU_cutoff[CPU_cutoff['ArrayTaskID'].isnull()] # can change what to include or exclude
Node_nonarraytask = Node_cutoff[Node_cutoff['ArrayTaskID'].isnull()] JobsCPU_cutoff.describe(include=None, exclude=None)
#CPU_nonarraytask.head(5)
``` ```
%% Cell type:markdown id: tags:
Graphs: <br>
User Requested RAM per CPU for all Jobs
<br>
User Requested RAM per Node for all Jobs
<br>
User Requested RAM per CPU and per Node together for all Jobs
<br>
User Requested RAM per CPU for Array Jobs vs Not Array Jobs
<br>
User Requested RAM per Node for Array Jobs vs Not Array Jobs
<br>
These graphs create histograms using the data for the month of March 2020.
The x axis measures the amount of requested RAM in gigs per CPU/Node, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs.
The y axis measures how many users requested that amount of RAM per CPU or Node.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows all user requested cpu memory for array and non array jobs # voluntary
CPU_fig = sns.distplot(CPU_cutoff['ReqMemCPU'], kde=False, label='User Requested RAM per CPU for Array and Non Array Jobs', color = "green")
CPU_fig.set_yscale('log') # gives mean, min, max, std, and 3 percentiles for cutoff data
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) # can change what to include or exclude
plt.title('User Requested RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB) JobsNode_cutoff.describe(include=None, exclude=None)
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows all user requested node memory for array and non array jobs # voluntary
Node_fig = sns.distplot(Node_cutoff['ReqMemNode'], kde=False, label='User Requested RAM per Node for Array and Non Array Jobs')
Node_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) # gives mean, min, max, std, and 3 percentiles for cutoff data
plt.title('User Requested RAM per Node for all Jobs %i gigs or less'%UpperlimitGB) # can change what to include or exclude
plt.xlabel('Requested Gigs of RAM') UsersCPU_cutoff.describe(include=None, exclude=None)
plt.ylabel('Number of Jobs Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
#shows requested cpu and node for all job types (array and non array jobs) side by side for easy comparison. # voluntary
CPU_fig = sns.distplot(CPU_cutoff['ReqMemCPU'], kde=False, label='User Requested RAM per CPU for Array and Non Array Jobs', color = "green")
CPU_fig.set_yscale('log')
Node_fig = sns.distplot(Node_cutoff['ReqMemNode'], kde=False, label='User Requested RAM per Node for Array and Non Array Jobs') #color = 'darkblue')
Node_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) # gives mean, min, max, std, and 3 percentiles for cutoff data
plt.title('User Requested RAM per CPU and per Node together for all Jobs %i gigs or less'%UpperlimitGB) # can change what to include or exclude
plt.xlabel('Requested Gigs of RAM') UsersNode_cutoff.describe(include=None, exclude=None)
plt.ylabel('Number of Jobs Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison. # must run
CPU_arraytask_fig = sns.distplot(CPU_arraytask['ReqMemCPU'], kde=False, label='User Requested RAM per CPU for Array Jobs', color = "green")
CPU_arraytask_fig.set_yscale('log')
CPU_nonarraytask_fig = sns.distplot(CPU_nonarraytask['ReqMemCPU'], kde=False, label='User Requested RAM per CPU for Non Array Jobs') # creates databases of Requested Ram per CPU and per Node that have an array task id using the upper RAM limit cutoff
CPU_nonarraytask_fig.set_yscale('log') JobsCPU_arraytask = JobsCPU_cutoff.dropna(subset=['ArrayTaskID'])
JobsNode_arraytask = JobsNode_cutoff.dropna(subset=['ArrayTaskID'])
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1) UsersCPU_arraytask = UsersCPU_cutoff.dropna(subset=['ArrayTaskID'])
plt.title('User Requested RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB) UsersNode_arraytask = UsersNode_cutoff.dropna(subset=['ArrayTaskID'])
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
#shows requested node memory for array jobs alongside requested node memory for non array jobs for easy comparison. # must run
Node_arraytask_fig = sns.distplot(Node_arraytask['ReqMemCPU'], kde=False, label='User Requested RAM per Node for Array Jobs', color = "green")
Node_arraytask_fig.set_yscale('log')
Node_nonarraytask_fig = sns.distplot(Node_nonarraytask['ReqMemNode'], kde=False, label='User Requested RAM per Node for Non Array Jobs')
Node_nonarraytask_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.10, 1.0),ncol=1) # creates databases of Requested Ram per CPU and per Node that do not have an array task id using the upper RAM limit cutoff
plt.title('User Requested RAM per Node for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB) JobsCPU_nonarraytask = JobsCPU_cutoff[JobsCPU_cutoff['ArrayTaskID'].isnull()]
plt.xlabel('Requested Gigs of RAM') JobsNode_nonarraytask = JobsNode_cutoff[JobsNode_cutoff['ArrayTaskID'].isnull()]
plt.ylabel('Number of Jobs Requesting') #JobsCPU_nonarraytask.head(5)
UsersCPU_nonarraytask = UsersCPU_cutoff[UsersCPU_cutoff['ArrayTaskID'].isnull()]
UsersNode_nonarraytask = UsersNode_cutoff[UsersNode_cutoff['ArrayTaskID'].isnull()]
#UsersCPU_nonarraytask.head(5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# These are Plotly Express Graphs of some of the above Seaborn graphs. Run them only if you need more details about the data in the graph. They will make your notebook run slower. # Number of Jobs vs Users Requesting RAM per CPU
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Graphs: <br> Graphs: <br>
User Requested RAM per CPU for all Jobs Jobs Requesting RAM per CPU for all Jobs
<br>
User Requested RAM per CPU for Non Array Jobs
<br> <br>
User Requested RAM per CPU for Array Jobs Users Requesting RAM per CPU for all Jobs
<br> <br>
User Requested RAM per Node for all Jobs Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs
<br> <br>
User Requested RAM per Node for Non Array Jobs Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs
<br>
User Requested RAM per Node for Array Jobs
<br> <br>
These graphs create histograms using the data for the month of March 2020. These graphs create histograms using the data for the month of March 2020.
The x axis measures the amount of requested RAM in gigs per CPU/Node, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs. The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs.
The y axis measures how many users requested that amount of RAM per CPU or Node. The y axis measures how many jobs requested that amount of RAM per CPU.
Can also show box or violin graph above to show where min, max, median, and 3rd quartile are.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
CPU_fig = px.histogram(CPU_cutoff, x="ReqMemCPU", # shows all user requested cpu memory for array and non array jobs
title='User Requested RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB, Jobs_fig = sns.distplot(JobsCPU_cutoff['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array and Non Array Jobs', color = "green")
labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column Jobs_fig.set_yscale('log')
opacity=0.8, plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
log_y=True, # represent bars with log scale plt.title('Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
marginal="box", # can be `box`, `violin` plt.xlabel('Requested Gigs of RAM')
hover_data=CPU_cutoff.columns, plt.ylabel('Number of Jobs Requesting')
nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars
)
CPU_fig.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
CPU_nonarraytask_fig = px.histogram(CPU_nonarraytask, x="ReqMemCPU", # shows all user requested cpu memory for array and non array jobs
title='User Requested RAM per CPU for Non Array Jobs %i gigs or less'%UpperlimitGB, Users_fig = sns.distplot(UsersCPU_cutoff['ReqMemCPU'], kde=False, label='Users Requesting RAM per CPU for Array and Non Array Jobs', color = "green")
labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column Users_fig.set_yscale('log')
opacity=0.8, plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
log_y=True, # represent bars with log scale plt.title('Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
marginal="box", # can be `box`, `violin` plt.xlabel('Requested Gigs of RAM')
hover_data=CPU_nonarraytask.columns, plt.ylabel('Number of Users Requesting')
nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars
)
CPU_nonarraytask_fig.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
CPU_arraytask_fig = px.histogram(CPU_arraytask, x="ReqMemCPU", #shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.
title='User Requested RAM per CPU for Array Jobs %i gigs or less'%UpperlimitGB, Jobs_arraytask_fig = sns.distplot(JobsCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = "green")
labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column Jobs_arraytask_fig.set_yscale('log')
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=CPU_arraytask.columns,
nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars
)
CPU_arraytask_fig.show()
```
%% Cell type:code id: tags: Jobs_nonarraytask_fig = sns.distplot(JobsCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs')
Jobs_nonarraytask_fig.set_yscale('log')
``` plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1)
Node_fig = px.histogram(Node_cutoff, x="ReqMemNode", plt.title('Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)
title='User Requested RAM per Node for all Jobs %i gigs or less'%UpperlimitGB, plt.xlabel('Requested Gigs of RAM')
labels={'ReqMemNode':'ReqMemNode'}, # can specify one label per df column plt.ylabel('Number of Jobs Requesting')
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=Node_cutoff.columns,
nbins=30,
color_discrete_sequence=['darkblue'] # color of histogram bars
)
Node_fig.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
Node_nonarraytask_fig = px.histogram(Node_nonarraytask, x="ReqMemNode", #shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.
title='User Requested RAM per Node for Non Array Jobs %i gigs or less'%UpperlimitGB, Users_arraytask_fig = sns.distplot(UsersCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = "green")
labels={'ReqMemNode':'ReqMemNode'}, # can specify one label per df column Users_arraytask_fig.set_yscale('log')
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=Node_nonarraytask.columns,
nbins=30,
color_discrete_sequence=['darkblue'] # color of histogram bars
)
Node_nonarraytask_fig.show()
```
%% Cell type:code id: tags: Users_nonarraytask_fig = sns.distplot(UsersCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs')
Users_nonarraytask_fig.set_yscale('log')
``` plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1)
Node_arraytask_fig = px.histogram(Node_arraytask, x="ReqMemNode", plt.title('Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)
title='User Requested RAM per Node for Array Jobs %i gigs or less'%UpperlimitGB, plt.xlabel('Requested Gigs of RAM')
labels={'ReqMemNode':'ReqMemNode'}, # can specify one label per df column plt.ylabel('Number of Jobs Requesting')
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=Node_arraytask.columns,
nbins=30,
color_discrete_sequence=['darkblue'] # color of histogram bars
)
Node_arraytask_fig.show()
``` ```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment