Commit af17830d authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added cluster analysis graphs

parent 657abaaa
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Setup Options"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# year-date-month\n",
"#start_date = '2020-10-09'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# inclusive (min, max) range of ReqMemCPU, in gigs, kept for clustering\n",
"LowerlimitGB, UpperlimitGB = 0, 50"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# inclusive (min, max) range of AllocCPUS kept for clustering\n",
"LowerlimitAllocCPU, UpperlimitAllocCPU = 0, 50"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# inclusive (min, max) range of Elapsed, in hours, kept for clustering\n",
"LowerlimitElapsed, UpperlimitElapsed = 0, 150.02"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Enter 'none', '0-1', or 'log' as a choice for data normalization\n",
"Data_Normalization_Choice = 'none'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"import sqlite3\n",
"import slurm2sql\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"import seaborn as sb\n",
"import plotly.express as px\n",
"import matplotlib.ticker as ticker\n",
"import numpy as np\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"import os\n",
"from RC_styles import rc_styles as style\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Database Creation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# opens a sqlite3 connection to the database file of slurm info since March 2020\n",
"slurm_db_path = '/data/rc/rc-team/slurm-since-March.sqlite3'\n",
"db = sqlite3.connect(slurm_db_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df is the starting dataframe: every column and row of the slurm table\n",
"df = pd.read_sql(sql='SELECT * FROM slurm', con=db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# df_1 is dataframe of all jobs whose State contains 'COMPLETED'\n",
"# na=False: a job with a missing State is left out instead of breaking the boolean mask\n",
"# regex=False: 'COMPLETED' is matched as plain text\n",
"df_1 = df[df.State.str.contains('COMPLETED', na=False, regex=False)]\n",
"#df_1.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# dataset of needed columns for all graphs below\n",
"# .copy() makes df_completed its own dataframe, so the unit conversions in the\n",
"# following cells modify it directly instead of a slice of df_1\n",
"# (avoids pandas' SettingWithCopyWarning)\n",
"df_completed = df_1.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']].copy()\n",
"#df_completed.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# converts units in ReqMemCPU column from bytes to gigs and rounds up to nearest whole number\n",
"# np.ceil and astype work on the whole column at once instead of one value at a time\n",
"df_completed['ReqMemCPU'] = np.ceil(df_completed['ReqMemCPU'].div(1024**3)).astype('int64')\n",
"#df_completed.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# converts Elapsed time from seconds to hours, rounded to 2 decimal places\n",
"elapsed_hours = df_completed['Elapsed'] / 3600\n",
"df_completed['Elapsed'] = elapsed_hours.round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# keeps only the completed jobs whose ReqMemCPU, AllocCPUS, and Elapsed all fall inside\n",
"# the min and max parameters created above (between() includes both limits)\n",
"reqmem_in_range = df_completed['ReqMemCPU'].between(LowerlimitGB, UpperlimitGB)\n",
"alloc_in_range = df_completed['AllocCPUS'].between(LowerlimitAllocCPU, UpperlimitAllocCPU)\n",
"elapsed_in_range = df_completed['Elapsed'].between(LowerlimitElapsed, UpperlimitElapsed)\n",
"df_clustering = df_completed[reqmem_in_range & alloc_in_range & elapsed_in_range]\n",
"df_clustering.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Normalizing the Data for ReqMem/Elapsed/AllocCPUS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if Data_Normalization_Choice == '0-1':\n",
"    # column_max is each column's maximum; df_clustering_max is the largest of those,\n",
"    # so every column is divided by the same single value\n",
"    column_max = df_clustering.max()\n",
"    df_clustering_max = column_max.max()\n",
"    fit = df_clustering.div(df_clustering_max)\n",
"    normalization_used = '0-1'\n",
"\n",
"elif Data_Normalization_Choice == 'log':\n",
"    # 1 is added to every value before the log is taken\n",
"    fit = np.log10(df_clustering.add(1))\n",
"    normalization_used = 'log'\n",
"\n",
"else:\n",
"    # any other choice leaves the data as it is\n",
"    fit = df_clustering\n",
"    normalization_used = 'none'\n",
"\n",
"print(normalization_used)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# kmeans Clustering"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# fits 3 kmeans clusters to the data prepared above and prints the cluster centers\n",
"kmeans_cluster = KMeans(n_clusters=3, random_state=111).fit(fit)\n",
"print(kmeans_cluster.cluster_centers_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reverting Cluster Points Back to align with UnNormalized data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if Data_Normalization_Choice == '0-1':\n",
"    # undoes the division by df_clustering_max\n",
"    clusterpoints = kmeans_cluster.cluster_centers_ * df_clustering_max\n",
"    print(\"0-1\")\n",
"\n",
"elif Data_Normalization_Choice == 'log':\n",
"    # undoes log10(value + 1)\n",
"    clusterpoints = 10 ** (kmeans_cluster.cluster_centers_) - 1\n",
"    print(\"log\")\n",
"\n",
"else:\n",
"    clusterpoints = kmeans_cluster.cluster_centers_\n",
"    print(\"none\")\n",
"\n",
"# shows the reverted points for every choice, not only for 'none'\n",
"print(clusterpoints[:,0],clusterpoints[:,1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run\n",
"\n",
"# clusterpoints has one column per df_clustering column, in the same order,\n",
"# so each center coordinate is looked up by column name\n",
"column_position = {name: i for i, name in enumerate(df_clustering.columns)}\n",
"axis_labels = {'ReqMemCPU': 'ReqMemCPU(gigs)',\n",
"               'Elapsed': 'Elapsed(hours)',\n",
"               'AllocCPUS': 'AllocCPUS'}\n",
"\n",
"\n",
"def cluster_centers_for(*columns):\n",
"    \"\"\"Return the cluster center coordinates of the given columns, in that order.\"\"\"\n",
"    return [clusterpoints[:, column_position[name]] for name in columns]\n",
"\n",
"\n",
"def cluster_graph_2d(fig, position, x_col, y_col):\n",
"    \"\"\"Add a 2d scatter of x_col against y_col to the 3x3 grid and return its axes.\n",
"\n",
"    Jobs are colored by kmeans label; cluster centers are drawn in black.\n",
"    \"\"\"\n",
"    graph = fig.add_subplot(3, 3, position)\n",
"    graph.scatter(df_clustering[x_col], df_clustering[y_col],\n",
"                  c=kmeans_cluster.labels_, cmap='rainbow')\n",
"    graph.scatter(*cluster_centers_for(x_col, y_col), color='black')\n",
"    graph.set_xlabel(axis_labels[x_col])\n",
"    graph.set_ylabel(axis_labels[y_col])\n",
"    return graph\n",
"\n",
"\n",
"def cluster_graph_3d(fig, position, x_col, y_col, z_col, alpha=None):\n",
"    \"\"\"Add a 3d scatter of the three columns to the 3x3 grid and return its axes.\n",
"\n",
"    Jobs are colored by kmeans label; cluster centers are drawn in black.\n",
"    alpha is the transparency of the job points (None keeps matplotlib's default).\n",
"    \"\"\"\n",
"    graph = fig.add_subplot(3, 3, position, projection='3d')\n",
"    graph.scatter(df_clustering[x_col], df_clustering[y_col], df_clustering[z_col],\n",
"                  c=kmeans_cluster.labels_, cmap='rainbow', alpha=alpha)\n",
"    # centers need all three coordinates: with only x and y, scatter places them at z=0\n",
"    graph.scatter(*cluster_centers_for(x_col, y_col, z_col), color='black')\n",
"    graph.set_xlabel(axis_labels[x_col])\n",
"    graph.set_ylabel(axis_labels[y_col])\n",
"    graph.set_zlabel(axis_labels[z_col])\n",
"    # sets size and color for gridlines by axis\n",
"    for axis in (graph.xaxis, graph.yaxis, graph.zaxis):\n",
"        axis._axinfo['grid'].update({'linewidth': .5, 'color': 'black'})\n",
"    return graph\n",
"\n",
"\n",
"figure = plt.figure()\n",
"figure.set_size_inches(20,20)\n",
"\n",
"# 2d graphs (top row)\n",
"elapsed_rqmem_clustergraph = cluster_graph_2d(figure, 1, 'ReqMemCPU', 'Elapsed')\n",
"elapsed_alloc_clustergraph = cluster_graph_2d(figure, 2, 'AllocCPUS', 'Elapsed')\n",
"alloc_rqmem_clustergraph = cluster_graph_2d(figure, 3, 'ReqMemCPU', 'AllocCPUS')\n",
"\n",
"# 3d graphs: middle row with default opacity, bottom row repeated with alpha .08\n",
"for first_position, alpha in ((4, None), (7, .08)):\n",
"    alloc_reqmem_clustergraph_3d = cluster_graph_3d(\n",
"        figure, first_position, 'ReqMemCPU', 'AllocCPUS', 'Elapsed', alpha)\n",
"    elapsed_alloc_clustergraph_3d = cluster_graph_3d(\n",
"        figure, first_position + 1, 'AllocCPUS', 'ReqMemCPU', 'Elapsed', alpha)\n",
"    elapsed_rqmem_clustergraph_3d = cluster_graph_3d(\n",
"        figure, first_position + 2, 'ReqMemCPU', 'Elapsed', 'AllocCPUS', alpha)\n",
"\n",
"# sets the spacing\n",
"# top = space between title and graphs - increase number to bring title down and decrease to bring title up\n",
"# left = space to the left\n",
"# wspace = padding on both sides of graphs\n",
"# hspace = padding on top and bottom of graphs\n",
"figure.subplots_adjust(left=0.0, wspace=0.2, top=.92, hspace=0.3)\n",
"figure.suptitle('Clusters', fontsize=20)\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment