[submodule "RC_styles"]
path = RC_styles
url = https://gitlab.rc.uab.edu/bkomal96/rc_styles.git
Subproject commit 7a2fd65482b64345b1cd923f38846433f7c33399
# Release notes version - 1.0 (12/08/2020)
# Cluster Analysis
This initial version creates the dataset, k-means clustering, and resulting graphs to analyze how our users are utilizing the cluster.
Features included:
- User input to choose date range of data to analyze
- User input to choose min and max values for ReqMemCPU, AllocCPUS, and Elapsed
- User input to choose how data is normalized: 0-1, log, or no normalization
- User input to choose min and max x and y axes for 2D histogram graphs
# Next Release Planned Features (coming December 2020)
- data on job counts for each density spot in 2d histograms
- summary statistics for each cluster in the form of the count of jobs and the count of users per cluster
# Release Notes version - 1.1 Bug Fix (12/15/2020)
The dataset for completed jobs originally had every job and each of its job steps. This skewed the clustering graphs, since there were more data points than individual jobs run. The data is now pulled into the dataset using only job allocations (done with -X in the slurm2sql.slurm2sql command), so each row of the dataset is a distinct job.
# Release Notes version - 2.0 (12/22/2020)
Added summary stats for each cluster: the count of jobs run and of users running those jobs, for each of the four clusters.
- summary statistics in the form of a table showing the job and user count for each cluster
* Data on stats for each density spot in the 2d histograms will come in another notebook. This notebook will be a deeper analysis of each 2d histogram for each cluster. This notebook should be released by end of January 2021.
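The v1.1 dedup fix above (allocation rows only, via sacct's `-X` flag passed through slurm2sql) can be illustrated with plain pandas on hypothetical data: job-step rows carry a `.step` suffix in `JobID`, so dropping them leaves one row per job.

```python
import pandas as pd

# Hypothetical sacct-style output: each job plus its job steps
df = pd.DataFrame({
    "JobID": ["100", "100.batch", "100.0", "101", "101.batch"],
    "ReqMemCPU": [4e9, 4e9, 4e9, 8e9, 8e9],
})

# Keep only allocation rows (no ".step" suffix), mirroring sacct -X
jobs = df[~df["JobID"].str.contains(r"\.")]
print(len(jobs))  # one row per job -> 2
```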
*.ipynb filter=nbstrip_full
[core]
attributesfile = ~/.gitattributes
[filter "nbstrip_full"]
clean = "jq --indent 1 \
'(.cells[] | select(has(\"outputs\")) | .outputs) = [] \
| (.cells[] | select(has(\"execution_count\")) | .execution_count) = null \
| .metadata = {\"language_info\": {\"name\": \"python\", \"pygments_lexer\": \"ipython3\"}} \
| .cells[].metadata = {} \
'"
smudge = cat
required = true
[submodule "RC_styles"]
path = RC_styles
url = https://gitlab.rc.uab.edu/bkomal96/rc_styles.git
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import pandas_profiling
```
%% Cell type:code id: tags:
``` python
df = pd.read_csv('userusage.txt',delimiter='|')
```
%% Cell type:code id: tags:
``` python
df.head()
```
%% Cell type:code id: tags:
``` python
df[['jid','step']] = df.JobID.str.split(".",expand=True)
df.Partition.values
```
%% Cell type:code id: tags:
``` python
batchDF=df.dropna(subset=["MaxRSS"])
userDF=df.dropna(subset=["User"])
for jid in df.jid.unique():
    # copy the batch step's MaxRSS onto the user row; .loc avoids chained
    # assignment, .values avoids index misalignment between the two frames
    userDF.loc[userDF['jid'] == jid, 'MaxRSS'] = batchDF.loc[batchDF['jid'] == jid, 'MaxRSS'].values
    #print(userDF[userDF['jid'] == jid])
userDF.head()
```
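%% Cell type:markdown id: tags:
The loop above fills each job's `MaxRSS` one `jid` at a time. A vectorized sketch of the same fill, building a jid-to-MaxRSS lookup and applying it in one pass (hypothetical miniature frames):
%% Cell type:code id: tags:
``` python
import pandas as pd

# Hypothetical frames: batch-step rows carry MaxRSS, user rows carry User
batchDF = pd.DataFrame({"jid": ["1", "2"], "MaxRSS": ["1000K", "2000K"]})
userDF = pd.DataFrame({"jid": ["1", "2"], "User": ["alice", "bob"],
                       "MaxRSS": [None, None]})

# jid -> MaxRSS lookup from the batch rows, then fill every user row at once
rss_by_jid = batchDF.set_index("jid")["MaxRSS"]
userDF["MaxRSS"] = userDF["jid"].map(rss_by_jid)
print(userDF["MaxRSS"].tolist())  # ['1000K', '2000K']
```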
%% Cell type:markdown id: tags:
# add more graphs here
%% Cell type:code id: tags:
``` python
```
@@ -48,7 +48,7 @@ parso==0.6.2
 pexpect==4.8.0
 phik==0.9.9
 pickleshare==0.7.5
-plotly==4.5.2
+plotly==4.8.2
 pluggy==0.13.1
 prometheus-client==0.7.1
 prompt-toolkit==3.0.3
%% Cell type:code id: tags:
```
# must run
import sqlite3
import slurm2sql
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
```
%% Cell type:code id: tags:
```
# must run
# opens the existing database of Slurm job info since March 2020 using sqlite3
db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')
```
%% Cell type:code id: tags:
```
# creates database of allocation info from March 2020 using sqlite 3
# not using this right now, but is here as an option
#db_allocation = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')
```
%% Cell type:code id: tags:
```
# must run
# variable for max bytes of RAM requested - charts range from 0 to upperRAMlimit
upperRAMlimit = 5e9 # 5 gigs
```
%% Cell type:code id: tags:
```
# must run
# df_1 is starting database
df_1 = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
```
# voluntary
# for displaying all available column options
pd.set_option('display.max_columns', None)
df_1.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_2 keeps only the JobName, ReqMemCPU, ReqMemNode, ArrayJobID, and ArrayTaskID columns
df_2 = df_1.loc[:,['JobName','ReqMemCPU', 'ReqMemNode', 'ArrayJobID','ArrayTaskID']]
#df_2.head(5)
```
%% Cell type:code id: tags:
```
# must run
# df_batch is a boolean mask marking batch-step rows; df_2[df_batch] selects them
df_batch = df_1.JobName.str.contains('batch')
#df_2[df_batch]
```
%% Cell type:code id: tags:
```
# must run
# creates DataFrames of batch jobs whose ReqMemCPU / ReqMemNode is <= the upper RAM limit
CPU_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemCPU <= upperRAMlimit)]
#CPU_cutoff
Node_cutoff = df_2[df_batch][(df_2[df_batch].ReqMemNode <= upperRAMlimit)]
```
%% Cell type:code id: tags:
```
# voluntary
# gives mean, min, max, std, and 3 percentiles for cutoff data
# can change what to include or exclude
CPU_cutoff.describe(include=None, exclude=None)
```
%% Cell type:code id: tags:
```
# voluntary
# gives mean, min, max, std, and 3 percentiles for cutoff data
# can change what to include or exclude
Node_cutoff.describe(include=None, exclude=None)
```
%% Cell type:code id: tags:
```
# must run
# creates DataFrames of Requested RAM per CPU and per Node for jobs that have an ArrayTaskID, using the upper RAM limit cutoff
CPU_arraytask = CPU_cutoff.dropna(subset=['ArrayTaskID'])
Node_arraytask = Node_cutoff.dropna(subset=['ArrayTaskID'])
```
%% Cell type:code id: tags:
```
# must run
# creates DataFrames of Requested RAM per CPU and per Node for jobs that do not have an ArrayTaskID, using the upper RAM limit cutoff
CPU_nonarraytask = CPU_cutoff[CPU_cutoff['ArrayTaskID'].isnull()]
Node_nonarraytask = Node_cutoff[Node_cutoff['ArrayTaskID'].isnull()]
#CPU_nonarraytask.head(5)
```
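%% Cell type:markdown id: tags:
The two cells above partition each cutoff frame on `ArrayTaskID`: `dropna` keeps the array-job rows, `isnull` keeps the rest, and together they cover every row exactly once. A small check on hypothetical data:
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd

CPU_demo = pd.DataFrame({"ReqMemCPU": [1e9, 2e9, 3e9],
                         "ArrayTaskID": [0.0, np.nan, 5.0]})

demo_arraytask = CPU_demo.dropna(subset=["ArrayTaskID"])
demo_nonarraytask = CPU_demo[CPU_demo["ArrayTaskID"].isnull()]

# the partition is exhaustive and disjoint
print(len(demo_arraytask) + len(demo_nonarraytask) == len(CPU_demo))  # True
```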
%% Cell type:markdown id: tags:
Graphs: <br>
User Requested RAM per CPU for all Jobs
<br>
User Requested RAM per Node for all Jobs
<br>
User Requested RAM per CPU and per Node together for all Jobs
<br>
User Requested RAM per CPU for Array Jobs vs Not Array Jobs
<br>
User Requested RAM per Node for Array Jobs vs Not Array Jobs
<br>
These graphs are histograms of the data for the month of March 2020.
The x axis measures requested RAM in gigs per CPU/Node, from 0 to the maximum declared in the upperRAMlimit variable above (5 gigs).
The y axis measures how many users requested that amount of RAM per CPU or Node.
%% Cell type:code id: tags:
```
# shows all user requested cpu memory for array and non array jobs
CPU_fig = sns.distplot(CPU_cutoff['ReqMemCPU'], kde=False, label='All CPU', color = "green")
CPU_fig.set_yscale('log')
plt.legend(prop={'size': 12})
plt.title('User Requested RAM per CPU for all Jobs')
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Users Requesting')
```
%% Cell type:code id: tags:
```
# shows all user requested node memory for array and non array jobs
Node_fig = sns.distplot(Node_cutoff['ReqMemNode'], kde=False, label='All Node')
Node_fig.set_yscale('log')
plt.legend(prop={'size': 12})
plt.title('User Requested RAM per Node for all Jobs')
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Users Requesting')
```
%% Cell type:code id: tags:
```
#shows requested cpu and node for all job types (array and non array jobs) side by side for easy comparison.
CPU_fig = sns.distplot(CPU_cutoff['ReqMemCPU'], kde=False, label='All CPU', color = "green")
CPU_fig.set_yscale('log')
Node_fig = sns.distplot(Node_cutoff['ReqMemNode'], kde=False, label='All Node') #color = 'darkblue')
Node_fig.set_yscale('log')
plt.legend(prop={'size': 12})
plt.title('User Requested RAM per CPU and per Node together for all Jobs')
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Users Requesting')
```
%% Cell type:code id: tags:
```
#shows requested cpu memory for array jobs alongside requested cpu memory for non array jobs for easy comparison.
CPU_arraytask_fig = sns.distplot(CPU_arraytask['ReqMemCPU'], kde=False, label='CPU Array Task', color = "green")
CPU_arraytask_fig.set_yscale('log')
CPU_nonarraytask_fig = sns.distplot(CPU_nonarraytask['ReqMemCPU'], kde=False, label='CPU Non Array Task')
CPU_nonarraytask_fig.set_yscale('log')
plt.legend(prop={'size': 12})
plt.title('User Requested RAM per CPU for Array Jobs vs Not Array Jobs')
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Users Requesting')
```
%% Cell type:code id: tags:
```
#shows requested node memory for array jobs alongside requested node memory for non array jobs for easy comparison.
Node_arraytask_fig = sns.distplot(Node_arraytask['ReqMemNode'], kde=False, label='Node Array Task', color = "green")
Node_arraytask_fig.set_yscale('log')
Node_nonarraytask_fig = sns.distplot(Node_nonarraytask['ReqMemNode'], kde=False, label='Node Non Array Task')
Node_nonarraytask_fig.set_yscale('log')
plt.legend(prop={'size': 12})
plt.title('User Requested RAM per Node for Array Jobs vs Not Array Jobs')
plt.xlabel('Requested Gigs of RAM')
plt.ylabel('Number of Users Requesting')
```
%% Cell type:markdown id: tags:
# These are Plotly Express versions of some of the Seaborn graphs above. Run them only if you need more detail about the data in the graph; they will make your notebook run slower.
%% Cell type:markdown id: tags:
Graphs: <br>
User Requested RAM per CPU for all Jobs
<br>
User Requested RAM per CPU for Non Array Jobs
<br>
User Requested RAM per CPU for Array Jobs
<br>
User Requested RAM per Node for all Jobs
<br>
User Requested RAM per Node for Non Array Jobs
<br>
User Requested RAM per Node for Array Jobs
<br>
These graphs are histograms of the data for the month of March 2020.
The x axis measures requested RAM in gigs per CPU/Node, from 0 to the maximum declared in the upperRAMlimit variable above (5 gigs).
The y axis measures how many users requested that amount of RAM per CPU or Node.
Each graph can also show a box or violin plot above the histogram marking the min, max, median, and quartiles.
%% Cell type:code id: tags:
```
CPU_fig = px.histogram(CPU_cutoff, x="ReqMemCPU",
title='User Requested RAM per CPU for all Jobs',
labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=CPU_cutoff.columns,
nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars
)
CPU_fig.show()
```
%% Cell type:code id: tags:
```
CPU_nonarraytask_fig = px.histogram(CPU_nonarraytask, x="ReqMemCPU",
title='User Requested RAM per CPU for Non Array Jobs',
labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=CPU_nonarraytask.columns,
nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars
)
CPU_nonarraytask_fig.show()
```
%% Cell type:code id: tags:
```
CPU_arraytask_fig = px.histogram(CPU_arraytask, x="ReqMemCPU",
title='User Requested RAM per CPU for Array Jobs',
labels={'ReqMemCPU':'ReqMemCPU'}, # can specify one label per df column
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=CPU_arraytask.columns,
nbins=30,
color_discrete_sequence=['goldenrod'] # color of histogram bars
)
CPU_arraytask_fig.show()
```
%% Cell type:code id: tags:
```
Node_fig = px.histogram(Node_cutoff, x="ReqMemNode",
title='User Requested RAM per Node for all Jobs',
labels={'ReqMemNode':'ReqMemNode'}, # can specify one label per df column
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=Node_cutoff.columns,
nbins=30,
color_discrete_sequence=['darkblue'] # color of histogram bars
)
Node_fig.show()
```
%% Cell type:code id: tags:
```
Node_nonarraytask_fig = px.histogram(Node_nonarraytask, x="ReqMemNode",
title='User Requested RAM per Node for Non Array Jobs',
labels={'ReqMemNode':'ReqMemNode'}, # can specify one label per df column
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=Node_nonarraytask.columns,
nbins=30,
color_discrete_sequence=['darkblue'] # color of histogram bars
)
Node_nonarraytask_fig.show()
```
%% Cell type:code id: tags:
```
Node_arraytask_fig = px.histogram(Node_arraytask, x="ReqMemNode",
title='User Requested RAM per Node for Array Jobs',
labels={'ReqMemNode':'ReqMemNode'}, # can specify one label per df column
opacity=0.8,
log_y=True, # represent bars with log scale
marginal="box", # can be `box`, `violin`
hover_data=Node_arraytask.columns,
nbins=30,
color_discrete_sequence=['darkblue'] # color of histogram bars
)
Node_arraytask_fig.show()
```