Skip to content
Snippets Groups Projects
Commit af17830d authored by Ryan Randles Jones's avatar Ryan Randles Jones
Browse files

added cluster analysis graphs

parent 657abaaa
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# Data Setup Options
%% Cell type:code id: tags:
```
# year-date-month
#start_date = '2020-10-09'
```
%% Cell type:code id: tags:
```
# must run
# inclusive (min, max) bounds on ReqMemCPU, in gigs, for the jobs that get clustered
LowerlimitGB, UpperlimitGB = 0, 50
```
%% Cell type:code id: tags:
```
# must run
# inclusive (min, max) bounds on AllocCPUS for the jobs that get clustered
LowerlimitAllocCPU, UpperlimitAllocCPU = 0, 50
```
%% Cell type:code id: tags:
```
# must run
# inclusive (min, max) bounds on Elapsed, in hours, for the jobs that get clustered
LowerlimitElapsed, UpperlimitElapsed = 0, 150.02
```
%% Cell type:code id: tags:
```
# Enter 'none', '0-1', or 'log' as a choice for data normalization
# (any other value is handled the same way as 'none' by the normalization cell)
Data_Normalization_Choice = 'none'
```
%% Cell type:markdown id: tags:
# Imports
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import seaborn as sb
import plotly.express as px
import matplotlib.ticker as ticker
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import os
from RC_styles import rc_styles as style
from sklearn.cluster import KMeans
```
%% Cell type:markdown id: tags:
# Database Creation
%% Cell type:code id: tags:
```
# must run
# opens a connection to the sqlite3 database file of slurm info from March 2020
# (sqlite3.connect does not load any rows; it creates an empty database if the file is missing)
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# df is the starting dataframe: every row and column of the `slurm` table
df = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
```
# must run
# df_1 holds the rows of df whose State text contains 'COMPLETED'
df_1 = df.loc[df['State'].str.contains('COMPLETED')]
```
%% Cell type:code id: tags:
```
# must run
# dataset of needed columns for all graphs below
# Column order matters: the cluster centers are later indexed by position
# (0 = ReqMemCPU, 1 = Elapsed, 2 = AllocCPUS).
# .copy() gives df_completed its own data, so the unit conversions applied to its
# columns afterwards neither touch df_1 nor raise SettingWithCopyWarning.
df_completed = df_1.loc[:, ['ReqMemCPU', 'Elapsed', 'AllocCPUS']].copy()
```
%% Cell type:code id: tags:
```
# must run
# ReqMemCPU: bytes -> gigs (1024**3 bytes each), rounded up to the next whole number and stored as int
df_completed['ReqMemCPU'] = df_completed['ReqMemCPU'].map(
    lambda n_bytes: int(np.ceil(n_bytes / 1024**3))
)
```
%% Cell type:code id: tags:
```
# must run
# converts Elapsed time to hours (from seconds) and rounds to 2 decimal places
# (.round rounds to the nearest value, it does not round up)
df_completed['Elapsed'] = df_completed['Elapsed'].div(3600).round(2)
```
%% Cell type:code id: tags:
```
# must run
# df_clustering keeps the completed jobs whose ReqMemCPU, AllocCPUS and Elapsed each fall
# inside the min and max parameters created above (both ends of every range are included)
df_clustering = df_completed[
    df_completed['ReqMemCPU'].between(LowerlimitGB, UpperlimitGB)
    & df_completed['AllocCPUS'].between(LowerlimitAllocCPU, UpperlimitAllocCPU)
    & df_completed['Elapsed'].between(LowerlimitElapsed, UpperlimitElapsed)
]
df_clustering.head(5)
```
%% Cell type:markdown id: tags:
# Normalizing the Data (ReqMemCPU, Elapsed, AllocCPUS)
%% Cell type:code id: tags:
```
# builds `fit`, the data handed to KMeans, according to Data_Normalization_Choice
if Data_Normalization_Choice == '0-1':
    # NOTE(review): every column is divided by the single largest value found anywhere
    # in df_clustering, not by its own maximum, so the columns keep their relative
    # scales - confirm this is the intended '0-1' scaling.
    column_max = df_clustering.max()
    df_clustering_max = column_max.max()
    fit = df_clustering / df_clustering_max
    print("0-1")
elif Data_Normalization_Choice == 'log':
    # + 1 keeps zero values finite: log10(0 + 1) = 0
    fit = np.log10(df_clustering+1)
    print("log")
else:
    # 'none' (or any unrecognized choice): cluster on the data as-is
    fit = df_clustering
    print("none")
```
%% Cell type:markdown id: tags:
# kmeans Clustering
%% Cell type:code id: tags:
```
# must run
# fits a 3-cluster KMeans model (fixed random_state) to `fit` and prints the cluster centers
kmeans_cluster = KMeans(n_clusters=3, random_state=111).fit(fit)
print(kmeans_cluster.cluster_centers_)
```
%% Cell type:markdown id: tags:
# Reverting Cluster Points Back to align with UnNormalized data
%% Cell type:code id: tags:
```
# builds `clusterpoints`: the KMeans cluster centers mapped back to the units of
# df_clustering by undoing whichever normalization was chosen
if Data_Normalization_Choice == '0-1':
    # undo the division by df_clustering_max
    clusterpoints = kmeans_cluster.cluster_centers_ * df_clustering_max
    print("0-1")
elif Data_Normalization_Choice == 'log':
    # undo log10(x + 1)
    clusterpoints = 10 ** (kmeans_cluster.cluster_centers_) - 1
    print("log")
else:
    # no normalization was applied, so the centers are already in the original units
    clusterpoints = kmeans_cluster.cluster_centers_
    print("none")
# first two columns of the centers (ReqMemCPU and Elapsed)
print(clusterpoints[:,0],clusterpoints[:,1])
```
%% Cell type:code id: tags:
```
# must run
# Draws a 3x3 grid of cluster graphs:
#   row 1 - 2d scatter plots for each pair of variables
#   row 2 - 3d scatter plots with three different axis arrangements
#   row 3 - the same 3d plots with nearly transparent points (alpha = .08)
# Job points are colored by their KMeans label; cluster centers are drawn in black.

# axis label shown for each clustered column
_axis_labels = {
    'ReqMemCPU': 'ReqMemCPU(gigs)',
    'Elapsed': 'Elapsed(hours)',
    'AllocCPUS': 'AllocCPUS',
}


def _center_coords(column):
    """Return every cluster center's coordinate along the named column."""
    # clusterpoints is a plain array; its columns are in df_clustering's column order
    return clusterpoints[:, df_clustering.columns.get_loc(column)]


def _cluster_scatter_2d(position, x_column, y_column):
    """Add a 2d cluster scatter plot at `position` of the 3x3 grid and return its axes."""
    axes = figure.add_subplot(3, 3, position)
    axes.scatter(df_clustering[x_column], df_clustering[y_column],
                 c=kmeans_cluster.labels_, cmap='rainbow')
    # centers go on the same axes and use the same pair of columns as the points
    axes.scatter(_center_coords(x_column), _center_coords(y_column), color='black')
    axes.set_xlabel(_axis_labels[x_column])
    axes.set_ylabel(_axis_labels[y_column])
    return axes


def _cluster_scatter_3d(position, x_column, y_column, z_column, alpha=None):
    """Add a 3d cluster scatter plot at `position` of the 3x3 grid and return its axes.

    `alpha` is the transparency of the job points (None keeps matplotlib's default);
    the cluster centers are always drawn with the default opacity.
    """
    axes = figure.add_subplot(3, 3, position, projection='3d')
    axes.scatter(df_clustering[x_column], df_clustering[y_column], df_clustering[z_column],
                 c=kmeans_cluster.labels_, cmap='rainbow', alpha=alpha)
    # all three coordinates are required: a 3d scatter given only x and y places
    # the points at z = 0 instead of at the center's real position
    axes.scatter(_center_coords(x_column), _center_coords(y_column), _center_coords(z_column),
                 color='black')
    axes.set_xlabel(_axis_labels[x_column])
    axes.set_ylabel(_axis_labels[y_column])
    axes.set_zlabel(_axis_labels[z_column])
    # sets size and color for gridlines by axis
    for axis in (axes.xaxis, axes.yaxis, axes.zaxis):
        axis._axinfo["grid"].update({"linewidth":.5, "color" : "black"})
    return axes


figure = plt.figure()
figure.set_size_inches(20,20)

# row 1: 2d graphs
# Elapsed/ReqMem 2d Graph
elapsed_rqmem_clustergraph = _cluster_scatter_2d(1, 'ReqMemCPU', 'Elapsed')
# Elapsed/Alloc 2d Graph
elapsed_alloc_clustergraph = _cluster_scatter_2d(2, 'AllocCPUS', 'Elapsed')
# Alloc/ReqMem 2d Graph
alloc_rqmem_clustergraph = _cluster_scatter_2d(3, 'ReqMemCPU', 'AllocCPUS')

# rows 2 and 3: the same three 3d graphs, first opaque and then with alpha = .08
for first_position, point_alpha in ((4, None), (7, .08)):
    # Alloc/ReqMem 3d Graph
    alloc_reqmem_clustergraph_3d = _cluster_scatter_3d(
        first_position, 'ReqMemCPU', 'AllocCPUS', 'Elapsed', alpha=point_alpha)
    # Elapsed/Alloc 3d Graph
    elapsed_alloc_clustergraph_3d = _cluster_scatter_3d(
        first_position + 1, 'AllocCPUS', 'ReqMemCPU', 'Elapsed', alpha=point_alpha)
    # Elapsed/ReqMem 3d Graph
    elapsed_rqmem_clustergraph_3d = _cluster_scatter_3d(
        first_position + 2, 'ReqMemCPU', 'Elapsed', 'AllocCPUS', alpha=point_alpha)

# sets the spacing
# top = space between title and graphs - increase number to bring title down and decrease to bring title up
# left = space to the left
# wspace = padding on both sides of graphs
# hspace = padding on top and bottom of graphs
figure.subplots_adjust(left=0.0, wspace=0.2, top=.92, hspace=0.3)
figure.suptitle('Clusters', fontsize=20)
plt.show()
```
%% Cell type:code id: tags:
```
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment