Commit e8ae73a2 authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added graphs and updated doc strings

parent 6efc4f62
......@@ -88,8 +88,9 @@
"source": [
"# must run\n",
"\n",
"# df_2 is database with only ReqMemCpu and ReqMemNode, and ArrayTaskID\n",
"df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]\n",
"# df_2 is a dataset with only the JobStep, User, JobName, ReqMemCPU, ArrayJobID, and ArrayTaskID columns\n",
"# it is used to pull out needed information and create separate datasets to compare\n",
"df_2 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]\n",
"#df_2.head(5)"
]
},
......@@ -101,19 +102,19 @@
"source": [
"# must run\n",
"\n",
"# df_user is df_2 with only user defined jobs\n",
"df_3 = df_2[df_2['JobStep'].isnull()] # jobs where jobstep is None\n",
"df_3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_user = df_3.loc[:,['User', 'JobName', 'ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]\n",
"df_user"
"# replaces empty strings with NaN, then drops rows whose User is NaN, leaving a dataset with no empty users\n",
"\n",
"nan_value = float(\"NaN\")\n",
"\n",
"df_3 = df_1.loc[:,['JobStep','User', 'JobName','ReqMemCPU', 'ArrayJobID','ArrayTaskID']]\n",
"\n",
"df_3.replace(\"\", nan_value, inplace=True)\n",
"\n",
"df_3.dropna(subset = [\"User\"], inplace=True)\n",
" \n",
"# df_user is a dataset consisting of each user and the total amount of RAM per CPU they have requested over all jobs they have run\n",
"df_user = df_3.groupby(['User']).sum().reset_index()\n",
"#df_user.head(5)"
]
},
{
......@@ -126,7 +127,7 @@
"\n",
"# df_batch is a mask over df_2 that marks the rows whose JobName contains 'batch'\n",
"df_batch = df_2.JobName.str.contains('batch')\n",
"#df_batch"
"#df_2[df_batch].head(5)"
]
},
{
......@@ -137,16 +138,13 @@
"source": [
"# must run\n",
"\n",
"# creates database from df_batch of ReqMemCPU batch jobs that are < or = a given point\n",
"JobsCPU_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]\n",
"#JobsCPU_cutoff\n",
"JobsNode_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemNode <= upperRAMlimit)]\n",
"# creates dataset from the batch jobs in df_2, keeping only jobs whose requested RAM per CPU is <= the upperRAMlimit defined above\n",
"batch_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]\n",
"#print(batch_cutoff.head(5))\n",
"\n",
"UsersCPU_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n",
"#UsersCPU_cutoff\n",
"UsersNode_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n",
"\n",
"\n"
"# creates dataset from df_user, keeping only users whose total requested RAM per CPU is <= the upperRAMlimit defined above\n",
"user_cutoff = df_user[(df_user.ReqMemCPU <= upperRAMlimit)]\n",
"#user_cutoff.head(5)"
]
},
{
......@@ -155,24 +153,15 @@
"metadata": {},
"outputs": [],
"source": [
"# voluntary\n",
"#voluntary\n",
"\n",
"# gives mean, min, max, std, and 3 percentiles for cutoff data\n",
"# can change what to include or exclude\n",
"JobsCPU_cutoff.describe(include=None, exclude=None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# voluntary\n",
"# all the users who run array jobs, before the 5 gig cutoff is applied - ArrayJobID and ArrayTaskID > 0 - 16 users out of 230\n",
"arrayjobs = df_user[(df_user != 0).all(1)]\n",
"print(arrayjobs.head(5))\n",
"\n",
"# gives mean, min, max, std, and 3 percentiles for cutoff data\n",
"# can change what to include or exclude\n",
"JobsNode_cutoff.describe(include=None, exclude=None)"
"# all the users who run array jobs, after the 5 gig cutoff is applied - ArrayJobID and ArrayTaskID > 0 - 1 to 2 users out of 230\n",
"arrayjobs_after_cutoff = user_cutoff[(user_cutoff != 0).all(1)]\n",
"arrayjobs_after_cutoff"
]
},
{
......@@ -185,7 +174,7 @@
"\n",
"# gives mean, min, max, std, and 3 percentiles for cutoff data\n",
"# can change what to include or exclude\n",
"UsersCPU_cutoff.describe(include=None, exclude=None)"
"batch_cutoff.describe(include=None, exclude=None)"
]
},
{
......@@ -198,41 +187,7 @@
"\n",
"# gives mean, min, max, std, and 3 percentiles for cutoff data\n",
"# can change what to include or exclude\n",
"UsersNode_cutoff.describe(include=None, exclude=None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# msut run\n",
"\n",
"# creates databases of Requested Ram per CPU and per Node that have an array task id using the upper RAM limit cutoff\n",
"JobsCPU_arraytask = JobsCPU_cutoff.dropna(subset=['ArrayTaskID'])\n",
"JobsNode_arraytask = JobsNode_cutoff.dropna(subset=['ArrayTaskID'])\n",
"\n",
"UsersCPU_arraytask = UsersCPU_cutoff.dropna(subset=['ArrayTaskID'])\n",
"UsersNode_arraytask = UsersNode_cutoff.dropna(subset=['ArrayTaskID'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates databases of Requested Ram per CPU and per Node that do not have an array task id using the upper RAM limit cutoff\n",
"JobsCPU_nonarraytask = JobsCPU_cutoff[JobsCPU_cutoff['ArrayTaskID'].isnull()]\n",
"JobsNode_nonarraytask = JobsNode_cutoff[JobsNode_cutoff['ArrayTaskID'].isnull()]\n",
"#JobsCPU_nonarraytask.head(5)\n",
"\n",
"UsersCPU_nonarraytask = UsersCPU_cutoff[UsersCPU_cutoff['ArrayTaskID'].isnull()]\n",
"UsersNode_nonarraytask = UsersNode_cutoff[UsersNode_cutoff['ArrayTaskID'].isnull()]\n",
"#UsersCPU_nonarraytask.head(5)"
"user_cutoff.describe(include=None, exclude=None)"
]
},
{
......@@ -247,18 +202,18 @@
"metadata": {},
"source": [
"Graphs: <br>\n",
" Jobs Requesting RAM per CPU for all Jobs\n",
" Number of Jobs Requesting RAM per CPU for all Jobs\n",
" <br>\n",
" Users Requesting RAM per CPU for all Jobs\n",
" Number of Users Requesting RAM per CPU for all Jobs\n",
" <br>\n",
" Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs\n",
" Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs\n",
" <br>\n",
" Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs\n",
" Detailed look at Users Requesting RAM per CPU for All Jobs\n",
" <br>\n",
"\n",
"These graphs create histograms using the data for the month of March 2020.\n",
"The x axis measures the amount of requested RAM in gigs per CPU, from 0 to the max declared in the upperRAMlimit variable above - 5 gigs.\n",
"The y axis measures how many jobs requested that amount RAM per CPU."
"The y axis measures how many jobs/users requested that amount of RAM per CPU."
]
},
{
......@@ -267,11 +222,12 @@
"metadata": {},
"outputs": [],
"source": [
"# shows all user requested cpu memory for array and non array jobs\n",
"Jobs_fig = sns.distplot(JobsCPU_cutoff['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array and Non Array Jobs', color = \"green\")\n",
"# shows the number of jobs requesting cpu memory for all jobs (array and non array jobs)\n",
"Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = \"green\")\n",
"Jobs_fig.set_yscale('log')\n",
"\n",
"plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n",
"plt.title('Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.title('Number of Jobs Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.xlabel('Requested Gigs of RAM')\n",
"plt.ylabel('Number of Jobs Requesting')"
]
......@@ -282,11 +238,12 @@
"metadata": {},
"outputs": [],
"source": [
"# shows all user requested cpu memory for array and non array jobs\n",
"Users_fig = sns.distplot(UsersCPU_cutoff['ReqMemCPU'], kde=False, label='Users Requesting RAM per CPU for Array and Non Array Jobs', color = \"green\")\n",
"# shows number of users requesting cpu memory for all jobs (array and non array jobs)\n",
"Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for all Jobs', color = \"green\")\n",
"Users_fig.set_yscale('log')\n",
"\n",
"plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n",
"plt.title('Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.title('Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.xlabel('Requested Gigs of RAM')\n",
"plt.ylabel('Number of Users Requesting')"
]
......@@ -297,15 +254,15 @@
"metadata": {},
"outputs": [],
"source": [
"#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.\n",
"Jobs_arraytask_fig = sns.distplot(JobsCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = \"green\")\n",
"Jobs_arraytask_fig.set_yscale('log')\n",
"# shows the number of jobs vs users requesting cpu memory for all jobs (array and non array jobs)\n",
"Jobs_fig = sns.distplot(batch_cutoff['ReqMemCPU'], kde=False, label='Number of Jobs Requesting RAM per CPU for all Jobs', color = \"green\")\n",
"Jobs_fig.set_yscale('log')\n",
"\n",
"Jobs_nonarraytask_fig = sns.distplot(JobsCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs')\n",
"Jobs_nonarraytask_fig.set_yscale('log')\n",
"Users_fig = sns.distplot(user_cutoff['ReqMemCPU'], kde=False, label='Number of Users Requesting RAM per CPU for for all Jobs')\n",
"Users_fig.set_yscale('log')\n",
"\n",
"plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1)\n",
"plt.title('Jobs Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.25, 1.0),ncol=1)\n",
"plt.title('Number of Jobs vs Number of Users Requesting RAM per CPU for all Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.xlabel('Requested Gigs of RAM')\n",
"plt.ylabel('Number of Jobs Requesting')"
]
......@@ -316,18 +273,27 @@
"metadata": {},
"outputs": [],
"source": [
"#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.\n",
"Users_arraytask_fig = sns.distplot(UsersCPU_arraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Array Jobs', color = \"green\")\n",
"Users_arraytask_fig.set_yscale('log')\n",
"# shows a more detailed, interactive view of the number of users requesting cpu memory for all jobs (array and non array jobs)\n",
"\n",
"Users_nonarraytask_fig = sns.distplot(UsersCPU_nonarraytask['ReqMemCPU'], kde=False, label='Jobs Requesting RAM per CPU for Non Array Jobs')\n",
"Users_nonarraytask_fig.set_yscale('log')\n",
"\n",
"plt.legend(prop={'size': 12},loc='upper right',bbox_to_anchor=(2.05, 1.0),ncol=1)\n",
"plt.title('Users Requesting RAM per CPU for Array Jobs vs Not Array Jobs %i gigs or less'%UpperlimitGB)\n",
"plt.xlabel('Requested Gigs of RAM')\n",
"plt.ylabel('Number of Jobs Requesting')"
"Users_fig = px.histogram(user_cutoff, x=\"ReqMemCPU\",\n",
" title='Detailed look at Users Requesting RAM per CPU for All Jobs %i gigs or less'%UpperlimitGB,\n",
" labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column\n",
" opacity=0.8,\n",
" log_y=True, # represent bars with log scale\n",
" marginal=\"box\", # can be `box`, `violin`\n",
" hover_data=user_cutoff.columns,\n",
" nbins=30,\n",
" color_discrete_sequence=['goldenrod'] # color of histogram bars\n",
" )\n",
"Users_fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment