Commit 862dab3a authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

Merge branch 'Elapsed-AllocCPUs-ReqMem_ClusterAnalysis' into 'master'

Cluster Analysis(ReqMemCPU, AllocCPUS, Elapsed)

See merge request rc-data-science/createandparsesacct!19
parents 486c2eae 50fc90d4
This diff is collapsed.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Notebook Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"import sqlite3\n",
"import slurm2sql\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"# NOTE(review): seaborn is imported twice under two aliases (sns and sb); later cells use\n",
"# both aliases, so neither import can be dropped without also updating those cells\n",
"import seaborn as sb\n",
"import plotly.express as px\n",
"import matplotlib.ticker as ticker\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"from RC_styles import rc_styles as style"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates database of info from March 2020 using sqlite 3\n",
"db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df is starting database\n",
"df = pd.read_sql('SELECT * FROM slurm', db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# voluntary\n",
"\n",
"# for displaying all available column options\n",
"pd.set_option('display.max_columns', None)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# converts units in ReqMemCPU column from bytes to gigs\n",
"df['ReqMemCPU'] = df['ReqMemCPU'].div(1024**3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# converts Elapsed time to hours (from seconds)\n",
"df['Elapsed'] = df['Elapsed'].div(3600)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df_completed is dataframe of all completed jobs\n",
"df_completed = df[df.State.str.contains('COMPLETED')]\n",
"#df_completed.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ReqMemCPU,Corecount,Runtime Clustering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# dataset of needed columns for all graphs below\n",
"df_1 = df_completed.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n",
"df_1.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# rounds ReqMemCPU up to nearest whole number\n",
"df_1['ReqMemCPU'] = df_1['ReqMemCPU'].apply(np.ceil)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# rounds Elapsed up to nearest 2 decimal places\n",
"df_1['Elapsed'] = df_1['Elapsed'].round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sorts dataset by AllocCPUS for easy visualization\n",
"df_1_sorted = df_1.sort_values(by='AllocCPUS', ascending=True)\n",
"df_1_sorted.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets min and max parameters for ReqMemCPU\n",
"UpperlimitGB = 50\n",
"LowerlimitGB = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets min and max parameters for AllocCPUS\n",
"UpperlimitAllocCPU = 20\n",
"LowerlimitAllocCPU = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n",
"df_facet = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU)]\n",
"df_facet.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates a pair-wise facet grid (scatter of every column pair) from the df_facet dataset\n",
"# Elapsed time in hours and ReqMemCPU in gigs\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"full_facet = sb.PairGrid(df_facet)\n",
"full_facet.map(plt.scatter);\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Detailed Look at Elapsed Time - In terms of Requested RAM and Cores"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets min and max parameters for ReqMemCPU for clustered Elapsed Time Graphs\n",
"UpperlimitGB_elapsed = 50\n",
"LowerlimitGB_elapsed = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets min and max parameters for AllocCPUS for clustered Elapsed Time Graphs\n",
"UpperlimitAllocCPU_elapsed = 20\n",
"LowerlimitAllocCPU_elapsed = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n",
"df_runtime_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB_elapsed) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB_elapsed) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU_elapsed) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU_elapsed)]\n",
"df_runtime_cluster.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets up info for plotting the optimal number of clusters - uses the df_runtime_cluster dataset\n",
"# (fits KMeans for k = 1..9 and records each fit's inertia for the elbow plot below)\n",
"Sum_of_squared_distances = []\n",
"K = range(1,10)\n",
"for k in K:\n",
" km = KMeans(n_clusters=k)\n",
" km = km.fit(df_runtime_cluster)\n",
" Sum_of_squared_distances.append(km.inertia_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# the bend in the graph is the optimal number of clusters for graphs using the df_runtime_cluster dataset\n",
"plt.plot(K, Sum_of_squared_distances, 'bx-')\n",
"plt.xlabel('k')\n",
"plt.ylabel('Sum_of_squared_distances')\n",
"plt.title('Elbow Method For Optimal k')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# fits KMeans with 3 clusters (chosen from the elbow plot above) on df_runtime_cluster\n",
"# and prints the cluster centers; columns are [ReqMemCPU, Elapsed, AllocCPUS]\n",
"kmeans = KMeans(n_clusters=3, random_state=111)\n",
"kmeans.fit(df_runtime_cluster)\n",
"print(kmeans.cluster_centers_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# facet grid of the two graphs being clustered using df_runtime_cluster dataset\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"elapsed_reqmem_alloc = sns.PairGrid(df_runtime_cluster, y_vars=[\"Elapsed\"], x_vars=[\"ReqMemCPU\", \"AllocCPUS\"], height=4)\n",
"elapsed_reqmem_alloc.map(sns.regplot, color=\"blue\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# clustered graph\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"elapsed_runtime_cluster_graph = plt.scatter(df_runtime_cluster['ReqMemCPU'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n",
"plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,1], color='black')\n",
"\n",
"plt.xlabel('ReqMemCPU(gigs)')\n",
"plt.ylabel('Elapsed(hours)')\n",
"plt.title('Runtime per Requested gigs of RAM %i gigs or less'%UpperlimitGB_elapsed)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# clustered graph of Elapsed (hours) vs AllocCPUS, colored by the 3-cluster KMeans labels\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"elapsed_alloc_cluster_graph = plt.scatter(df_runtime_cluster['AllocCPUS'],df_runtime_cluster['Elapsed'], c=kmeans.labels_, cmap='rainbow')\n",
"# fix: kmeans was fit on [ReqMemCPU, Elapsed, AllocCPUS], so AllocCPUS is column 2 of\n",
"# cluster_centers_ and Elapsed is column 1; the original plotted column 0 (ReqMemCPU)\n",
"# on the x-axis, which put the centroid markers at the wrong AllocCPUS positions\n",
"plt.scatter(kmeans.cluster_centers_[:,2] ,kmeans.cluster_centers_[:,1], color='black')\n",
"\n",
"plt.xlabel('AllocCPUS')\n",
"plt.ylabel('Elapsed(hours)')\n",
"plt.title('Runtime per Core %i cores or less'%UpperlimitAllocCPU_elapsed)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Detailed Look at Core Usage (AllocCPUS) - In terms of Requested RAM"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# second set of min and max parameters for ReqMemCPU to use for AllocCPU/ReqMemCPU cluster graph \n",
"UpperlimitGB_alloc = 50\n",
"LowerlimitGB_alloc = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets min and max parameters for AllocCPUS\n",
"UpperlimitAllocCPU_alloc = 60\n",
"LowerlimitAllocCPU_alloc = 0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# creates dataset of ReqMemCPU, Elapsed, and AllocCPUS using the min and max parameters created above\n",
"df_allocCPUS_cluster = df_1_sorted[(df_1_sorted['ReqMemCPU'] <= UpperlimitGB_alloc) & (df_1_sorted['ReqMemCPU'] >= LowerlimitGB_alloc) & (df_1_sorted['AllocCPUS'] <= UpperlimitAllocCPU_alloc) & (df_1_sorted['AllocCPUS'] >= LowerlimitAllocCPU_alloc)]\n",
"# fix: the original displayed df_allocCPUS.head(5), but no df_allocCPUS is ever defined,\n",
"# so this cell raised a NameError on a fresh Restart-and-Run-All\n",
"df_allocCPUS_cluster.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# sets up info for plotting the optimal number of clusters - uses the df_allocCPUS_cluster dataset\n",
"# (NOTE: this reuses/overwrites Sum_of_squared_distances and K from the Elapsed section above)\n",
"Sum_of_squared_distances = []\n",
"K = range(1,10)\n",
"for k in K:\n",
" km = KMeans(n_clusters=k)\n",
" km = km.fit(df_allocCPUS_cluster)\n",
" Sum_of_squared_distances.append(km.inertia_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# the bend in the graph is the optimal number of clusters for graphs using the df_allocCPUS_cluster dataset\n",
"plt.plot(K, Sum_of_squared_distances, 'bx-')\n",
"plt.xlabel('k')\n",
"plt.ylabel('Sum_of_squared_distances')\n",
"plt.title('Elbow Method For Optimal k')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# fits KMeans with 3 clusters on df_allocCPUS_cluster and prints the cluster centers;\n",
"# NOTE: this rebinds the notebook-global name kmeans used by the Elapsed-section graphs above\n",
"kmeans = KMeans(n_clusters=3, random_state=111)\n",
"kmeans.fit(df_allocCPUS_cluster)\n",
"print(kmeans.cluster_centers_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# unclustered scatter of AllocCPUS vs ReqMemCPU for the df_allocCPUS_cluster dataset,\n",
"# for comparison against the clustered version in the next cell\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"alloc_reqmem_graph = sns.scatterplot(x=\"ReqMemCPU\", y=\"AllocCPUS\",data=df_allocCPUS_cluster)\n",
"\n",
"plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_alloc)\n",
"\n",
"plt.xlabel('ReqMemCPU(gigs)')\n",
"plt.ylabel('AllocCPUS')\n",
"#plt.yscale(\"log\")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# clustered graph of AllocCPUS vs ReqMemCPU (gigs), colored by the 3-cluster KMeans labels\n",
"style.default_axes_and_ticks()\n",
"style.figsize()\n",
"\n",
"alloc_reqmem_cluster_graph = plt.scatter(df_allocCPUS_cluster['ReqMemCPU'],df_allocCPUS_cluster['AllocCPUS'], c=kmeans.labels_, cmap='rainbow')\n",
"# fix: kmeans was fit on [ReqMemCPU, Elapsed, AllocCPUS], so AllocCPUS is column 2 of\n",
"# cluster_centers_; the original plotted column 1 (Elapsed) on the y-axis, which put the\n",
"# centroid markers at the wrong AllocCPUS positions\n",
"plt.scatter(kmeans.cluster_centers_[:,0] ,kmeans.cluster_centers_[:,2], color='black')\n",
"\n",
"plt.xlabel('ReqMemCPU(gigs)')\n",
"plt.ylabel('AllocCPUS')\n",
"plt.title('Number of Cores used by Requested RAM %i gigs or less'%UpperlimitGB_alloc)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
*.ipynb filter=nbstrip_full
[core]
attributesfile = ~/.gitattributes
[filter "nbstrip_full"]
clean = "jq --indent 1 \
'(.cells[] | select(has(\"outputs\")) | .outputs) = [] \
| (.cells[] | select(has(\"execution_count\")) | .execution_count) = null \
| .metadata = {\"language_info\": {\"name\": \"python\", \"pygments_lexer\": \"ipython3\"}} \
| .cells[].metadata = {} \
'"
smudge = cat
required = true
......@@ -48,7 +48,7 @@ parso==0.6.2
pexpect==4.8.0
phik==0.9.9
pickleshare==0.7.5
plotly==4.5.2
plotly==4.8.2
pluggy==0.13.1
prometheus-client==0.7.1
prompt-toolkit==3.0.3
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment