Commit 482bb191 authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

memcpu rounded to ceiling

parent 94bdfa12
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Notebook Setup # Notebook Setup
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
import sqlite3 import sqlite3
import slurm2sql import slurm2sql
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
%matplotlib inline %matplotlib inline
import seaborn as sns import seaborn as sns
import plotly.express as px import plotly.express as px
import matplotlib.ticker as ticker import matplotlib.ticker as ticker
import numpy as np
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
from RC_styles import rc_styles as style from RC_styles import rc_styles as style
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# connects to the sqlite3 database of info from March 2020 # connects to the sqlite3 database of info from March 2020
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3') db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df is starting database # df is starting database
df = pd.read_sql('SELECT * FROM slurm', db) df = pd.read_sql('SELECT * FROM slurm', db)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# voluntary # voluntary
# for displaying all available column options # for displaying all available column options
pd.set_option('display.max_columns', None) pd.set_option('display.max_columns', None)
df.head(5) df.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# converts units in ReqMemCPU column from bytes to gigs # converts units in ReqMemCPU column from bytes to gigs
df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3) df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_completed is dataframe of all completed jobs # df_completed is dataframe of all completed jobs
df_completed = df[df.State.str.contains('COMPLETED')] df_completed = df[df.State.str.contains('COMPLETED')]
#df_completed.head(5) df_completed.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_batch is df with only batch jobs # df_batch is df with only batch jobs
df_batch = df[df.JobName.str.contains('batch')] df_batch = df[df.JobName.str.contains('batch')]
#df_batch.head(5) df_batch.head(5)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Average RAM per CPU Requested by User # Average RAM per CPU Requested by User
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_2 is database of completed jobs with only User and ReqMemCpu # df_2 is database of completed jobs with only User and ReqMemCpu
# it is used for the user dataframes # it is used for the user dataframes
df_2 = df_completed.loc[:,['User','ReqMemCPU']] df_2 = df_completed.loc[:,['User','ReqMemCPU']]
#df_2.head(5) df_2.head(5)
```
%% Cell type:code id: tags:
```
# rounds every ReqMemCPU value in df_2 up to the next whole number
df_2['ReqMemCPU'] = np.ceil(df_2['ReqMemCPU'])
df_2.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings # fills empty strings in User column with NaN and then filters them out to give a dataset of users with no empty strings
nan_value = float("NaN") nan_value = float("NaN")
df_2.replace("", nan_value, inplace=True) df_2.replace("", nan_value, inplace=True)
df_2.dropna(subset = ["User"], inplace=True) df_2.dropna(subset = ["User"], inplace=True)
#df_2.head(5) df_2.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# count = count of jobs per user # count = count of jobs per user
# mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs # mean,std,min,25%,50%,75%, and max refers to the gigs of memory per cpu requested by that user for all their jobs
df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index() df_user = df_2.groupby('User')['ReqMemCPU'].describe().reset_index()
df_user.head(5) df_user.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run
# adds a 'User Number' column to df_user holding the strings '0', '1', ... (one per row)
# intended for use in graphs in place of usernames
usernames = df_user['User']
user_numbers = list(map(str, range(len(usernames))))
df_user['User Number'] = user_numbers
df_user.head(5)
```
%% Cell type:code id: tags:
```
# voluntary # voluntary
# description of number of jobs run per user - can be used to choose the Upper Limit Job Count # description of number of jobs run per user - can be used to choose the Upper Limit Job Count
df_user['count'].describe() df_user['count'].describe()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# variable to be used in names of plots to describe the max job count per user # variable to be used in names of plots to describe the max job count per user
# max = 367257 # max = 367257
UpperlimitJobCount = 50 UpperlimitJobCount = 50
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database from df_user that returns all jobs per user up to the UpperlimitJobCount defined above # creates database from df_user that returns all jobs per user up to the UpperlimitJobCount defined above
jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)] jobscount_cutoff = df_user[(df_user['count'] <= UpperlimitJobCount)]
jobscount_cutoff.head(5) jobscount_cutoff.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_user_graph is df_user sorted in ascending order by count for easy readability of graph # df_user_graph is df_user sorted in ascending order by count for easy readability of graph
df_user_graph = jobscount_cutoff.sort_values(by='count', ascending=True) df_user_graph = jobscount_cutoff.sort_values(by='count', ascending=True)
df_user_graph.head(5) df_user_graph.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
user_graph1 = sns.scatterplot(x="count", y="mean",data=df_user_graph) user_graph1 = sns.scatterplot(x="count", y="mean",data=df_user_graph)
plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount) plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('Job Count Per User') plt.xlabel('Job Count Per User')
plt.ylabel('Average Requested RAM per CPU (Gigs)') plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
user_graph = sns.barplot(x="count", y="mean", data= df_user_graph, color = 'blue', ci=None) user_graph = sns.barplot(x="count", y="mean", data= df_user_graph, color = 'blue', ci=None)
#user_graph.set_xscale('log') #user_graph.set_xscale('log')
user_graph.xaxis.set_major_locator(ticker.MultipleLocator(2)) user_graph.xaxis.set_major_locator(ticker.MultipleLocator(2))
user_graph.xaxis.set_major_formatter(ticker.ScalarFormatter()) user_graph.xaxis.set_major_formatter(ticker.ScalarFormatter())
user_graph.yaxis.set_major_locator(ticker.MultipleLocator(10)) user_graph.yaxis.set_major_locator(ticker.MultipleLocator(10))
user_graph.yaxis.set_major_formatter(ticker.ScalarFormatter()) user_graph.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount) plt.title('Average Requested RAM per CPU by User for all Users Running %i Jobs or less'%UpperlimitJobCount)
plt.xlabel('Job Count') plt.xlabel('Job Count')
plt.ylabel('Average Requested RAM per CPU (Gigs)') plt.ylabel('Average Requested RAM per CPU (Gigs)')
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# bar graph for jobs run per user - shows average requested RAM per CPU for all jobs by user # bar graph for jobs run per user - shows average requested RAM per CPU for all jobs by user
user_graph2 = px.bar(df_user_graph, x='count', y='mean', color = 'count', user_graph2 = px.bar(df_user_graph, x='count', y='mean', color = 'count',
hover_data=['max','count'], hover_data=['max','count'],
labels={'mean':'Average Requested RAM per CPU (Gigs)'}, labels={'mean':'Average Requested RAM per CPU (Gigs)'},
height=400) height=400)
user_graph2.update_layout( user_graph2.update_layout(
xaxis_type = 'category', xaxis_type = 'category',
title={ title={
'text': "Average Requested RAM per CPU by User for all Users", 'text': "Average Requested RAM per CPU by User for all Users",
'y':0.9, 'y':0.9,
'x':0.5, 'x':0.5,
'xanchor': 'center', 'xanchor': 'center',
'yanchor': 'top'}) 'yanchor': 'top'})
user_graph2.show() user_graph2.show()
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Average RAM per CPU by Job # Average RAM per CPU by Job
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# df_4 is database with only JobStep, ReqMemCPU, and ArrayJobID # df_3 is database with only ReqMemCPU and JobID
# it is used to pull out needed information and create separate datasets to compare # it is used to pull out needed information and create separate datasets to compare
df_4 = df_batch.loc[:,['JobStep','ReqMemCPU','ArrayJobID']] df_3 = df_batch.loc[:,['ReqMemCPU','JobID']]
#df_4.head(5) df_3.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run df_3['ReqMemCPU'] = df_3['ReqMemCPU'].apply(np.ceil)
df_3.head(5)
# variable to be used in names of plots to describe the max gigs measured
UpperlimitGB = 40
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database from df_4 that returns all RAM per CPU requested up to the UpperlimitGB defined above # variable to be used in names of plots to describe the max gigs measured
gig_cutoff = df_4[(df_4.ReqMemCPU <= UpperlimitGB)] UpperlimitGB = 40
gig_cutoff.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_4['ReqMemCPU'].describe() # must run
# creates database from df_3 that returns all RAM per CPU requested up to the UpperlimitGB defined above
gig_cutoff = df_3[(df_3.ReqMemCPU <= UpperlimitGB)]
gig_cutoff.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
# shows the number of jobs requesting cpu memory for all jobs (array and non array jobs) # shows the number of jobs requesting cpu memory for all jobs (array and non array jobs)
jobs_graph_hist = sns.distplot(gig_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = "green") jobs_graph_hist = sns.distplot(gig_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = "green")
jobs_graph_hist.set_yscale('log') jobs_graph_hist.set_yscale('log')
plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1) plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB) plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested Gigs of RAM') plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Jobs Requesting') plt.ylabel('Number of Jobs Requesting')
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_jobs_graph1 = gig_cutoff.groupby('ReqMemCPU')['ArrayJobID'].describe().reset_index() # renames JobID column to JobCount since that's what it is now
df_jobs_graph1.tail(5) df_cpu_per_job = gig_cutoff.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
df_cpu_per_job.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_jobs_graph1['ReqMemCPU'].describe() df_cpu_per_job['ReqMemCPU'].describe()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
jobs_graph1 = sns.scatterplot(x="ReqMemCPU", y="count",data=df_jobs_graph1) cpu_per_job = sns.scatterplot(x="ReqMemCPU", y="JobCount",data=df_cpu_per_job)
jobs_graph1.set_yscale('log') cpu_per_job.set_yscale('log')
#jobs_graph1.yaxis.set_major_locator(ticker.MultipleLocator(100000)) #cpu_per_job.yaxis.set_major_locator(ticker.MultipleLocator(100000))
#jobs_graph1.yaxis.set_major_formatter(ticker.ScalarFormatter()) #cpu_per_job.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB) plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)
plt.xlabel('Requested RAM per CPU (Gigs) per Job') plt.xlabel('Requested RAM per CPU (Gigs) per Job')
plt.ylabel('Job Count') plt.ylabel('Job Count')
plt.show() plt.show()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
df_count = df_4.groupby('ReqMemCPU')['ArrayJobID'].describe().reset_index() # renames JobID column to JobCount since that's what it is now
df_count.tail(5) job_count = df_3.groupby('ReqMemCPU').count().rename(columns={'JobID': 'JobCount'}).reset_index()
``` job_count.head(5)
%% Cell type:code id: tags:
```
df_count['ReqMemCPU'].describe()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
UpperlimitJobCount2 = 50 UpperlimitJobCount2 = 10
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
# must run # must run
# creates database from df_4 that returns all RAM per CPU requested up to the UpperRAMlimit defined above # creates database from df_3 that returns all Jobs up to the UpperlimitJobCount2 defined above
df_jobs_graph2 = df_count[(df_count['count'] <= UpperlimitJobCount2)] df_job_count = job_count[(job_count.JobCount <= UpperlimitJobCount2)]
df_jobs_graph2.tail(5) df_job_count.head(5)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` ```
style.default_axes_and_ticks() style.default_axes_and_ticks()
style.figsize() style.figsize()
jobs_graph2 = sns.scatterplot(x="count", y="ReqMemCPU",data=df_jobs_graph2)