Commit f5dc940e authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added user ave, and count data. Started some rough graphs

parent ed12bb09
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
```
%% Cell type:code id: tags:
```
# must run
# opens a connection to the SQLite file that holds the slurm job info since March 2020
# (sqlite3.connect opens the file at this path, creating an empty database if it does not exist)
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# upper bound, in gigs, of requested RAM per CPU shown in the charts; also used in the plot titles
UpperlimitGB = 50
# the same upper bound expressed in bytes, using the 1024**3 bytes-per-gig factor
# that is applied to ReqMemCPU when it is converted to gigs
upperRAMlimit = UpperlimitGB * 1024**3
```
%% Cell type:code id: tags:
```
# must run
# df is the starting dataframe: every row and column of the slurm table read through the db connection
df = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
```
# voluntary
# for displaying all available column options (None removes the limit on displayed columns)
pd.set_option('display.max_columns', None)
# first 5 rows of df
df.head(5)
```
%% Cell type:code id: tags:
```
# rescales ReqMemCPU by 1024**3 so the column is expressed in gigs
# (assumes the stored values are bytes - TODO confirm)
# run once only: every run of this cell divides the column again
df['ReqMemCPU'] = df['ReqMemCPU'] / 1024**3
```
%% Cell type:code id: tags:
```
# df_1 keeps only the rows of df whose State text contains 'COMPLETED'
df_1 = df[df['State'].str.contains('COMPLETED')]
```
%% Cell type:code id: tags:
```
# must run
# df_2 is the completed-jobs dataframe (df_1) narrowed to the columns
# JobStep, User, JobName, ReqMemCPU, ArrayJobID, ArrayTaskID and State
# it is used for the user dataframes
df_2 = df_1.loc[
    :,
    ['JobStep', 'User', 'JobName', 'ReqMemCPU', 'ArrayJobID', 'ArrayTaskID', 'State'],
]
df_2
```
%% Cell type:code id: tags:
```
# must run
# df_3 is the completed-jobs dataframe (df_1) narrowed to the columns
# JobStep, User, JobName, ReqMemCPU, ArrayJobID, ArrayTaskID and State
# it is used for the jobs dataframes
df_3 = df_1.loc[
    :,
    ['JobStep', 'User', 'JobName', 'ReqMemCPU', 'ArrayJobID', 'ArrayTaskID', 'State'],
]
df_3
```
%% Cell type:code id: tags:
```
# must run
# turns empty strings (in any column) into NaN and then drops the rows whose User is NaN,
# giving a dataset of jobs with no empty user names
# df_2 is rebound to the result instead of using inplace=True: df_2 was sliced out of df_1,
# and modifying such a slice in place can trigger pandas' SettingWithCopyWarning
nan_value = float("NaN")
df_2 = df_2.replace("", nan_value).dropna(subset=["User"])
df_2
```
%% Cell type:code id: tags:
```
# number of rows of df_2 (completed jobs) per user
df_user_job_count = df_2.groupby('User')['User'].count()
df_user_job_count
```
%% Cell type:code id: tags:
```
# summary statistics of the per-user job counts
df_user_job_count.describe()
```
%% Cell type:code id: tags:
```
# mean of ReqMemCPU (requested RAM per CPU) over each user's jobs
df_user_job_ave = df_2.groupby('User')['ReqMemCPU'].mean()
df_user_job_ave
```
%% Cell type:code id: tags:
```
# summary statistics of the per-user average requested RAM per CPU
df_user_job_ave.describe()
```
%% Cell type:code id: tags:
```
# sum of ReqMemCPU (requested RAM per CPU) over each user's jobs
df_user_job_RAMcount = df_2.groupby('User')['ReqMemCPU'].sum()
df_user_job_RAMcount
```
%% Cell type:code id: tags:
```
# mean, min, max, etc for the per-user total of requested RAM per CPU
df_user_job_RAMcount.describe()
```
%% Cell type:code id: tags:
```
# df_user has one row per user holding the mean of each numeric column of df_2
# numeric_only=True leaves the text columns out of the mean; without it recent
# pandas versions raise a TypeError instead of silently dropping those columns
df_user = df_2.groupby(['User']).mean(numeric_only=True).reset_index()
df_user
```
%% Cell type:code id: tags:
```
# df_user_ave is a dataset consisting of each user and the average amount of RAM per CPU they have requested over all jobs they have run
# numeric_only=True leaves the text columns out of the mean; without it recent
# pandas versions raise a TypeError instead of silently dropping those columns
df_user_ave = df_2.groupby(['User']).mean(numeric_only=True).reset_index()
df_user_ave
# user_cutoff keeps the rows of df_user whose average requested RAM per CPU is at most UpperlimitGB gigs
user_cutoff = df_user[(df_user.ReqMemCPU <= UpperlimitGB)]
user_cutoff
```
%% Cell type:code id: tags:
```
# summary statistics of the per-user average requested RAM per CPU
# (not the last expression of the cell, so by default the notebook does not display it)
df_user_ave['ReqMemCPU'].describe()
total_users = df_user_job_count.count() # total users
total_users
```
%% Cell type:code id: tags:
```
# total users: number of non-null User entries in df_user
total_users = df_user['User'].count()
total_users
# number of users whose requested ram per cpu average is equal to or less than the upper limit
users_lessthan_cutoff = user_cutoff['User'].count()
users_lessthan_cutoff
```
%% Cell type:code id: tags:
```
# user_cutoff keeps the rows of df_user_ave whose average requested RAM per CPU is at most UpperlimitGB gigs
user_cutoff = df_user_ave[df_user_ave['ReqMemCPU'].le(UpperlimitGB)]
user_cutoff
```
# must run
%% Cell type:code id: tags:
# df_3 is database of completed jobs with only JobStep, User, JobName, ReqMemCPU, ArrayJobID, ArrayTaskID, and State
# it is used for the jobs dataframes
```
# number of users whose requested ram per cpu average is equal to or less than the upper limit
users_lessthan_cutoff = user_cutoff.User.count()
users_lessthan_cutoff
# df_3: the JobStep, User, JobName, ReqMemCPU, ArrayJobID, ArrayTaskID and State columns of df_1
df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID', 'State']]
#df_3
```
%% Cell type:code id: tags:
```
# must run
# df_batch is df_3 with only the rows whose JobName contains 'batch'
df_batch = df_3[df_3.JobName.str.contains('batch')]
df_batch
#df_batch
```
%% Cell type:code id: tags:
```
# total number of jobs in the month: count of non-null JobStep entries in df_batch
total_jobs = df_batch['JobStep'].count()
total_jobs
```
%% Cell type:code id: tags:
```
# must run
# batch_cutoff keeps the rows of df_batch whose requested RAM per CPU is at most UpperlimitGB gigs
batch_cutoff = df_batch[df_batch['ReqMemCPU'].le(UpperlimitGB)]
batch_cutoff
```
%% Cell type:code id: tags:
```
# number of jobs that requested ram per cpu equal to or less than the upper limit
# (count of non-null JobStep entries in batch_cutoff)
jobs_lessthan_cutoff = batch_cutoff['JobStep'].count()
jobs_lessthan_cutoff
```
%% Cell type:code id: tags:
```
# percentage of all batch jobs that requested ram per cpu equal to or less than the upper limit
job_percentage = (jobs_lessthan_cutoff/total_jobs)*100
job_percentage
```
%% Cell type:code id: tags:
```
#voluntary
# rows of df_user in which every column is non-zero
# per the original note: the users who run array jobs (ArrayJobID and ArrayTaskID > 0),
# 16 users out of 230 when this was first run
arrayjobs = df_user[df_user.ne(0).all(axis=1)]
arrayjobs.count()
```
%% Cell type:code id: tags:
```
#voluntary
# rows of user_cutoff in which every column is non-zero
# per the original note: the users who run array jobs (ArrayJobID and ArrayTaskID > 0)
# once the cutoff is applied, 1 to 2 users out of 230 when this was first run
arrayjobs_after_cutoff = user_cutoff[user_cutoff.ne(0).all(axis=1)]
arrayjobs_after_cutoff.count()
```
%% Cell type:code id: tags:
```
# voluntary
# gives mean, min, max, std, and 3 percentiles for the job cutoff data (batch_cutoff)
# include/exclude are passed explicitly so the summarized column types can be changed here
batch_cutoff.describe(include=None, exclude=None)
```
%% Cell type:code id: tags:
```
# voluntary
# gives mean, min, max, std, and 3 percentiles for the user cutoff data (user_cutoff)
# include/exclude are passed explicitly so the summarized column types can be changed here
user_cutoff.describe(include=None, exclude=None)
```
%% Cell type:markdown id: tags:
# Number of Jobs vs Users Requesting RAM per CPU
%% Cell type:markdown id: tags:
Graphs: <br>
Number of Jobs Requesting RAM per CPU for all Jobs
<br>
Number of Users Requesting RAM per CPU for all Jobs
<br>
Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs
<br>
Detailed look at Users Requesting RAM per CPU for All Jobs
<br>
These graphs create histograms using the data for the month of March 2020.
The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the UpperlimitGB variable above - 50 gigs.
The y axis measures how many jobs/users requested that amount of RAM per CPU.
%% Cell type:code id: tags:
```
# shows the number of jobs requesting cpu memory for all jobs (array and non array jobs)
Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = "green")
Jobs_fig.set_yscale('log')
# shows number of users requesting cpu memory for all jobs (array and non array jobs) without a cutoff
# NOTE(review): this histogram is built from df_user_job_count (jobs per user), not from a RAM column,
# and uses the same green color as the jobs histogram - confirm both are intended
Users_fig = sns.distplot(df_user_job_count, kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = "green")
#Users_fig.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
# NOTE(review): title and ylabel are each set twice; the second call replaces the first,
# so the figure ends up with the "Users" title and y label
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.title('Number of Users Requesting RAM per CPU for all Jobs')
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting')
plt.ylabel('Number of Users Requesting')
```
%% Cell type:code id: tags:
```
# shows number of users requesting cpu memory for all jobs (array and non array jobs)
# histogram of the per-user average ReqMemCPU from user_cutoff; kde=False draws bars only
Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = "green")
#Users_fig.set_yscale('log')
# bbox_to_anchor=(2.25, 1.0) anchors the legend at 2.25 axes-widths, i.e. to the right of the plot
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Users Requesting')
```
%% Cell type:code id: tags:
```
# shows the number of jobs vs users requesting cpu memory for all jobs (array and non array jobs)
# both histograms are drawn in the same figure; the jobs histogram switches the y axis to a log scale
Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = "green")
Jobs_fig.set_yscale('log')
Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs')
#Users_fig.set_yscale('log')
# bbox_to_anchor=(2.25, 1.0) anchors the legend at 2.25 axes-widths, i.e. to the right of the plot
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting')
```
%% Cell type:code id: tags:
```
# shows a more detailed, interactive view of the number of users requesting cpu memory for all jobs (array and non array jobs)
# plots user_cutoff / ReqMemCPU, which is what the title, labels and hover_data below describe
# alternative (jobs per user): px.histogram(df_user_job_count, x="User", ...) with
# labels and hover_data adjusted to match
Users_fig = px.histogram(user_cutoff, x="ReqMemCPU",
                   title='Detailed look at Users Requesting RAM per CPU for All Jobs %i gigs or less'%UpperlimitGB,
                   labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column
                   opacity=0.8,
                   #log_y=True, # represent bars with log scale
                   marginal="box", # can be `box`, `violin`
                   hover_data=user_cutoff.columns,
                   nbins=30,
                   color_discrete_sequence=['goldenrod'] # color of histogram bars
                  )
Users_fig.show()
```
%% Cell type:code id: tags:
```
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment