Commit 8ca3de4d authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

finalized graphs with normalization options

parent 3a0c54db
......@@ -7,6 +7,16 @@
"# Notebook Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# year-date-month\n",
"#start_date = '2020-10-09'"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -25,7 +35,8 @@
"import plotly.express as px\n",
"import matplotlib.ticker as ticker\n",
"import numpy as np\n",
"from mpl_toolkits.mplot3d import Axes3D"
"from mpl_toolkits.mplot3d import Axes3D\n",
"import os"
]
},
{
......@@ -50,6 +61,17 @@
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#connecting to database\n",
"#db = sqlite3.connect('runtime_and_core_count.db')\n",
"#print(db)"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -59,7 +81,29 @@
"# must run\n",
"\n",
"# creates database of info from March 2020 using sqlite 3\n",
"db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')"
"db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')\n",
"#print(db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#slurm2sql.slurm2sql(db, ['-S 2020-09-08 -E 2020-09-15 -a --allocations -o Job,Submit,Start,End'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" #creating a database based on the start date\n",
"#slurm2sql.slurm2sql(db, ['-S', '2020-01-09', '-a'])\n",
"#print(db)\n",
"#print(start_date)"
]
},
{
......@@ -71,7 +115,21 @@
"# must run\n",
"\n",
"# df is starting database\n",
"df = pd.read_sql('SELECT * FROM slurm', db)"
"df = pd.read_sql('SELECT * FROM slurm', db)\n",
"#df = pd.read_sql('SELECT JobID,Submit,Start,End FROM slurm', db)\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" #Deleting the database\n",
"#os.remove('runtime_and_core_count.db')\n",
"#os.remove('runtime_and_core_count.db-shm')\n",
"#os.remove('runtime_and_core_count.db-wal') "
]
},
{
......@@ -84,7 +142,7 @@
"\n",
"# for displaying all available column options\n",
"pd.set_option('display.max_columns', None)\n",
"df.head(5)"
"df.count()"
]
},
{
......@@ -316,16 +374,49 @@
"metadata": {},
"outputs": [],
"source": [
"#must run\n",
"#must run if dataset will not be normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n",
"\n",
"#ReqMemCPU = 0 - 50 gigs\n",
"#AllocCPUS = 0 - 50 cores\n",
"#Elapsed = 0 - 150.02 hours\n",
"\n",
"# data set without normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs\n",
"df_runtime_cluster = df_facet.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n",
"df_runtime_cluster.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run if dataset will be 0-1 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n",
"\n",
"# 0-1 normalized dataset\n",
"# used for 0-1 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n",
"column_maxes_runtime = df_runtime_cluster.max()\n",
"df_runtime_cluster_max = column_maxes_runtime.max()\n",
"normalized_runtime_df = df_runtime_cluster / df_runtime_cluster_max\n",
"\n",
"print(normalized_runtime_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run if dataset will be log10 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n",
"\n",
"# log10 normalized dataset\n",
"# used for log10 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n",
"\n",
"log_runtime_df = np.log10(df_runtime_cluster+1)\n",
"log_runtime_df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -360,31 +451,41 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"column_maxes_runtime = df_runtime_cluster.max()\n",
"df_runtime_cluster_max = column_maxes_runtime.max()\n",
"normalized_runtime_df = df_runtime_cluster / df_runtime_cluster_max\n",
"\n",
"print(normalized_runtime_df)"
"# Elapsed/ReqMemCPU clustering"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Elapsed/ReqMemCPU clustering"
"The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n",
"All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n",
"All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made."
"# In the cell below, set the fit based on the normalization type by uncommenting the line to run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# uncomment for no normalization\n",
"#elapsed_reqmem_fit = df_runtime_cluster\n",
"\n",
"# uncomment for 0-1 normalization\n",
"#elapsed_reqmem_fit = normalized_runtime_df\n",
"\n",
"# uncomment for log10 normalization\n",
"elapsed_reqmem_fit = log_runtime_df"
]
},
{
......@@ -397,8 +498,30 @@
"\n",
"# sets to clusters and returns the cluster points\n",
"kmeans_elapsed_reqmem = KMeans(n_clusters=3, random_state=111)\n",
"kmeans_elapsed_reqmem.fit(normalized_runtime_df)\n",
"clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_ * df_runtime_cluster_max"
"kmeans_elapsed_reqmem.fit(elapsed_reqmem_fit)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# uncomment if no normalization\n",
"#clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_\n",
"\n",
"# uncomment if 0-1 normalization\n",
"#clusterpoints_elapsed_reqmem = kmeans_elapsed_reqmem.cluster_centers_ * df_runtime_cluster_max\n",
"\n",
"# uncomment if log10 normalization\n",
"clusterpoints_elapsed_reqmem = 10 ** (kmeans_elapsed_reqmem.cluster_centers_) - 1"
]
},
{
......@@ -612,6 +735,29 @@
"All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# In the cell below, set the fit based on the normalization type by uncommenting the line to run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# uncomment for no normalization\n",
"#elapsed_alloc_fit = df_runtime_cluster\n",
"\n",
"# uncomment for 0-1 normalization\n",
"#elapsed_alloc_fit = normalized_runtime_df\n",
"\n",
"# uncomment for log10 normalization\n",
"elapsed_alloc_fit = log_runtime_df"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -622,8 +768,30 @@
"\n",
"# sets to clusters and returns the cluster points\n",
"kmeans_elapsed_alloc = KMeans(n_clusters=3, random_state=111)\n",
"kmeans_elapsed_alloc.fit(normalized_runtime_df)\n",
"clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_ * df_runtime_cluster_max"
"kmeans_elapsed_alloc.fit(elapsed_alloc_fit)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# uncomment if no normalization\n",
"#clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_\n",
"\n",
"# uncomment if 0-1 normalization\n",
"#clusterpoints_elapsed_alloc = kmeans_elapsed_alloc.cluster_centers_ * df_runtime_cluster_max\n",
"\n",
"# uncomment if log10 normalization\n",
"# NOTE: fixed copy-paste bug - this previously inverted kmeans_elapsed_reqmem's centers,\n",
"# so the Elapsed/Alloc graphs showed the Elapsed/ReqMem clustering's centers instead.\n",
"clusterpoints_elapsed_alloc = 10 ** (kmeans_elapsed_alloc.cluster_centers_) - 1"
]
},
{
......@@ -851,15 +1019,50 @@
"metadata": {},
"outputs": [],
"source": [
"# must run if dataset will not be normalized\n",
"\n",
"#ReqMemCPU = 0 - 50 gigs\n",
"#AllocCPUS = 0 - 50 cores\n",
"#Elapsed = 0 - 150.02 hours\n",
"\n",
"# non normalized dataset\n",
"# used for fitting for the Alloc/ReqMem graph without normalization\n",
"df_alloc_cluster = df_facet.loc[:,['ReqMemCPU', 'Elapsed', 'AllocCPUS']]\n",
"df_alloc_cluster.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run if dataset will be 0-1 normalized\n",
"\n",
"# 0-1 normalized dataset\n",
"# used for 0-1 normalization fitting for the Alloc/ReqMem graph\n",
"column_maxes_alloc = df_alloc_cluster.max()\n",
"df_alloc_cluster_max = column_maxes_alloc.max()\n",
"normalized_alloc_df = df_alloc_cluster / df_alloc_cluster_max\n",
"\n",
"print(normalized_alloc_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# must run if dataset will be log10 normalized for both Elapsed/ReqMem and Elapsed/Alloc graphs\n",
"\n",
"# log10 normalized dataset\n",
"# used for log10 normalization fitting for both the Elapsed/ReqMem and Elapsed/Alloc graphs \n",
"\n",
"log_alloc_df = np.log10(df_alloc_cluster+1)\n",
"log_alloc_df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -893,17 +1096,27 @@
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# In the cell below, set the fit based on the normalization type by uncommenting the line to run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"column_maxes_alloc = df_alloc_cluster.max()\n",
"df_alloc_cluster_max = column_maxes_alloc.max()\n",
"normalized_alloc_df = df_alloc_cluster / df_alloc_cluster_max\n",
"# uncomment for no normalization\n",
"#alloc_reqmem_fit = df_alloc_cluster\n",
"\n",
"print(normalized_alloc_df)"
"# uncomment for 0-1 normalization\n",
"#alloc_reqmem_fit = normalized_alloc_df\n",
"\n",
"# uncomment for log10 normalization\n",
"alloc_reqmem_fit = log_alloc_df"
]
},
{
......@@ -916,15 +1129,56 @@
"\n",
"# sets to clusters and returns the cluster points\n",
"kmeans_alloc_reqmem = KMeans(n_clusters=3, random_state=111)\n",
"kmeans_alloc_reqmem.fit(normalized_alloc_df)\n",
"clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_ * df_alloc_cluster_max"
"kmeans_alloc_reqmem.fit(alloc_reqmem_fit)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next 5 cells create the clusters, find each cluster label, and create datasets of data in each cluster.\n",
"# In the cell below, choose which cluster center to use - uncomment the line that goes with the normalization type"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# uncomment if no normalization\n",
"#clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_\n",
"\n",
"# uncomment if 0-1 normalization\n",
"#clusterpoints_alloc_reqmem = kmeans_alloc_reqmem.cluster_centers_ * df_alloc_cluster_max\n",
"\n",
"# uncomment if log10 normalization\n",
"clusterpoints_alloc_reqmem = (10 ** (kmeans_alloc_reqmem.cluster_centers_)) - 1\n",
"print(clusterpoints_alloc_reqmem)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clusterpoints_alloc_reqmem[:,0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clusterpoints_alloc_reqmem[:,2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next 5 cells find each cluster label, and create datasets of data in each cluster.\n",
"All the datasets are created for both the cluster graphs and plots of each cluster before those graphs are made."
]
},
......@@ -1029,14 +1283,14 @@
"\n",
"alloc_reqmem_cluster_graph = figure.add_subplot(121)\n",
"alloc_reqmem_cluster_graph.scatter(df_alloc_cluster['ReqMemCPU'],df_alloc_cluster['AllocCPUS'], c=kmeans_alloc_reqmem.labels_, cmap='rainbow')\n",
"alloc_reqmem_cluster_graph.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,1], color='black')\n",
"alloc_reqmem_cluster_graph.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,2], color='black')\n",
"plt.xlabel('ReqMemCPU(gigs)')\n",
"plt.ylabel('AllocCPUS')\n",
"\n",
"# 3d veiw of the scatterplot for better understanding of the data\n",
"alloc_reqmem_clustergraph_3d = figure.add_subplot(122, projection='3d')\n",
"alloc_reqmem_clustergraph_3d.scatter(df_alloc_cluster['ReqMemCPU'], df_alloc_cluster['AllocCPUS'], df_alloc_cluster['Elapsed'], c=kmeans_alloc_reqmem.labels_ ,cmap='rainbow')\n",
"alloc_reqmem_clustergraph_3d.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,1], color='black')\n",
"alloc_reqmem_clustergraph_3d.scatter(clusterpoints_alloc_reqmem[:,0] ,clusterpoints_alloc_reqmem[:,2], color='black')\n",
"alloc_reqmem_clustergraph_3d.set_xlabel('ReqMemCPU(gigs)')\n",
"alloc_reqmem_clustergraph_3d.set_ylabel('AllocCPUS')\n",
"alloc_reqmem_clustergraph_3d.set_zlabel('Elapsed(hours)')\n",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment