Commit f5dc940e authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added user ave, and count data. Started some rough graphs

parent ed12bb09
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
import sqlite3 import sqlite3
import slurm2sql import slurm2sql
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
%matplotlib inline %matplotlib inline
import seaborn as sns import seaborn as sns
import plotly.express as px import plotly.express as px
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database of info from March 2020 using sqlite 3 # creates database of info from March 2020 using sqlite 3
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3') db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# creates database of allocation info from March 2020 using sqlite 3 # creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option # not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3') #db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# variable to be used in names of plots to describe the max gigs measured # variable to be used in names of plots to describe the max gigs measured
UpperlimitGB = 50 UpperlimitGB = 50
# variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs # variable for max gigs of RAM requested - Charts range from 0 to upperRAMlimit gigs
upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs #upperRAMlimit = UpperlimitGB * 10e+10 # 5 gigs
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df is starting database # df is starting database
df = pd.read_sql('SELECT * FROM slurm', db) df = pd.read_sql('SELECT * FROM slurm', db)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # voluntary
# for displaying all available column options # for displaying all available column options
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
df.head(5) df.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3) df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_1 = df[df.State.str.contains('COMPLETED')] df_1 = df[df.State.str.contains('COMPLETED')]
#df_1 #df_1
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_2 is database of completed jobs with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID # df_2 is database of completed jobs with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID
# it is used for the user dataframes # it is used for the user dataframes
df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID', 'State']] df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID', 'State']]
df_2 #df_2
```
%% Cell type:code id: tags:
```
# must run
# df_3 is database of completed jobs with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID
# it is used for the jobs dataframes
df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID', 'State']]
df_3
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings # fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings
nan_value = float("NaN") nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True) df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset = ["User"], inplace=True) df_2.dropna(subset = ["User"], inplace=True)
df_2 #df_2
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# number of jobs completed per user # number of jobs completed per user
df_user_job_count = df_2.groupby(['User'])['User'].count() df_user_job_count = df_2.groupby(['User'])['User'].count()
df_user_job_count df_user_job_count
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_user_job_count.describe() df_user_job_count.describe()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# average requested RAM per CPU per user # average requested RAM per CPU per user
df_user_job_ave = df_2.groupby(['User'])['ReqMemCPU'].mean() df_user_job_ave = df_2.groupby(['User'])['ReqMemCPU'].mean()
df_user_job_ave df_user_job_ave
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_user_job_ave.describe() df_user_job_ave.describe()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# total requested RAM per CPU per user # total requested RAM per CPU per user
df_user_job_RAMcount = df_2.groupby(['User'])['ReqMemCPU'].sum() df_user_job_RAMcount = df_2.groupby(['User'])['ReqMemCPU'].sum()
df_user_job_RAMcount df_user_job_RAMcount
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# mean, min, max, etc for total requested RAM per CPU per user # mean, min, max, etc for total requested RAM per CPU per user
df_user_job_RAMcount.describe() df_user_job_RAMcount.describe()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_2 df_user = df_2.groupby(['User']).mean().reset_index()
df_user
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# df_user is a dataset consisting of each user and the average amount of RAM per CPU they have requested over all jobs they have run # creates database from df_user that returns all RAM per CPU requested up to the UpperRAMlimit defined above
df_user_ave = df_2.groupby(['User']).mean().reset_index() user_cutoff = df_user[(df_user.ReqMemCPU <= UpperlimitGB)]
df_user_ave user_cutoff
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_user_ave['ReqMemCPU'].describe() total_users = df_user_job_count.count() # total users
total_users
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
total_users = df_user.User.count() # total users # number of users whose requested ram per cpu average is equal to or less than the upper limit
total_users users_lessthan_cutoff = user_cutoff.User.count()
users_lessthan_cutoff
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# creates database from df_batch that returns all RAM per CPU requested up to the UpperRAMlimit defined above # must run
user_cutoff = df_user_ave[(df_user_ave.ReqMemCPU <= UpperlimitGB)]
user_cutoff
```
%% Cell type:code id: tags: # df_3 is database of completed jobs with only JobStep, User, JobName, ReqMemCpu, ArrayJob, and ArrayTaskID
# it is used for the jobs dataframes
``` df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID', 'State']]
# number of users whose requested ram per cpu average is equal to or less than the upper limit #df_3
users_lessthan_cutoff = user_cutoff.User.count()
users_lessthan_cutoff
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_batch is df_3 with only batch jobs # df_batch is df_3 with only batch jobs
df_batch = df_3[df_3.JobName.str.contains('batch')] df_batch = df_3[df_3.JobName.str.contains('batch')]
df_batch #df_batch
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
total_jobs = df_batch.JobStep.count() # total number of jobs in the month total_jobs = df_batch.JobStep.count() # total number of jobs in the month
total_jobs total_jobs
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database from df_batch that returns all RAM per CPU requested up to the UpperRAMlimit defined above # creates database from df_batch that returns all RAM per CPU requested up to the UpperRAMlimit defined above
batch_cutoff = df_batch[(df_batch.ReqMemCPU <= UpperlimitGB)] batch_cutoff = df_batch[(df_batch.ReqMemCPU <= UpperlimitGB)]
batch_cutoff batch_cutoff
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# number of jobs that requested ram per cpu equal to or less than the upper limit # number of jobs that requested ram per cpu equal to or less than the upper limit
jobs_lessthan_cutoff = batch_cutoff.JobStep.count() jobs_lessthan_cutoff = batch_cutoff.JobStep.count()
jobs_lessthan_cutoff jobs_lessthan_cutoff
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
job_percentage = (jobs_lessthan_cutoff/total_jobs)*100 job_percentage = (jobs_lessthan_cutoff/total_jobs)*100
job_percentage job_percentage
``` ```
%% Cell type:code id: tags:
```
#voluntary
# all the users who run array jobs before the 5 gig cutoff - ArrayJobID and ArrayTaskID > 0 - 16 users out of 230
arrayjobs = df_user[(df_user != 0).all(1)]
arrayjobs.count()
```
%% Cell type:code id: tags:
```
#voluntary
# all the users who run array jobs after the 5 gig cutoff - ArrayJobID and ArrayTaskID > 0 - 1 to 2 users out of 230
arrayjobs_after_cutoff = user_cutoff[(user_cutoff != 0).all(1)]
arrayjobs_after_cutoff.count()
```
%% Cell type:code id: tags:
```
# voluntary
# gives mean, min, max, std, and 3 percentiles for cutoff data
# can change what to include or exclude
batch_cutoff.describe(include=None, exclude=None)
```
%% Cell type:code id: tags:
```
# voluntary
# gives mean, min, max, std, and 3 percentiles for cutoff data
# can change what to include or exclude
user_cutoff.describe(include=None, exclude=None)
```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Number of Jobs vs Users Requesting RAM per CPU # Number of Jobs vs Users Requesting RAM per CPU
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Graphs: <br> Graphs: <br>
Number of Jobs Requesting RAM per CPU for all Jobs Number of Jobs Requesting RAM per CPU for all Jobs
<br> <br>
Number of Users Requesting RAM per CPU for all Jobs Number of Users Requesting RAM per CPU for all Jobs
<br> <br>
Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs
<br> <br>
Detailed look at Users Requesting RAM per CPU for All Jobs Detailed look at Users Requesting RAM per CPU for All Jobs
<br> <br>
These graphs create histograms using the data for the month of March 2020. These graphs create histograms using the data for the month of March 2020.
The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the UpperlimitGB variable above - 50 gigs. The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the UpperlimitGB variable above - 50 gigs.
The y axis measures how many jobs/users requested that amount RAM per CPU. The y axis measures how many jobs/users requested that amount RAM per CPU.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows the number of jobs requesting cpu memory for all jobs (array and non array jobs) # shows number of users requesting cpu memory for all jobs (array and non array jobs) without a cutoff
Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = "green") Users_fig = sns.distplot(df_user_job_count, kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = "green")
Jobs_fig.set_yscale('log') #Users_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB) plt.title('Number of Users Requesting RAM per CPU for all Jobs')
plt.xlabel('Requested Gigs of RAM') plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting') plt.ylabel('Number of Users Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows number of users requesting cpu memory for all jobs (array and non array jobs) # shows number of users requesting cpu memory for all jobs (array and non array jobs)
Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = "green") Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = "green")
#Users_fig.set_yscale('log') #Users_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB) plt.title('Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM') plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Users Requesting') plt.ylabel('Number of Users Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# shows the number of jobs vs users requesting cpu memory for all jobs (array and non array jobs)
Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = "green")
Jobs_fig.set_yscale('log')
Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for for all Jobs')
#Users_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting')
```
%% Cell type:code id: tags:
```
# shows a more detailed, interactive view of the number of users requesting cpu memory for all jobs (array and non array jobs) # shows a more detailed, interactive view of the number of users requesting cpu memory for all jobs (array and non array jobs)
Users_fig = px.histogram(user_cutoff, x="ReqMemCPU", Users_fig = px.histogram(df_user_job_count, x="User",
title='Detailed look at Users Requesting RAM per CPU for All Jobs %i gigs or less'%UpperlimitGB, title='Detailed look at Users Requesting RAM per CPU for All Jobs %i gigs or less'%UpperlimitGB,
labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column #labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column
opacity=0.8, opacity=0.8,
#og_y=True, # represent bars with log scale #log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin` marginal="box", # can be `box`, `violin`
hover_data=user_cutoff.columns, #hover_data=df_user_job_count.columns,
nbins=30, nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars color_discrete_sequence=['goldenrod'] # color of histogram bars
) )
Users_fig.show() Users_fig.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
``` ```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment