[core]
attributesfile = ~/.gitattributes
[filter "nbstrip_full"]
clean = "jq --indent 1 \
'(.cells[] | select(has(\"outputs\")) | .outputs) = [] \
| (.cells[] | select(has(\"execution_count\")) | .execution_count) = null \
| .metadata = {\"language_info\": {\"name\": \"python\", \"pygments_lexer\": \"ipython3\"}} \
| .cells[].metadata = {} \
'"
smudge = cat
required = true
[submodule "RC_styles"]
path = RC_styles
url = https://gitlab.rc.uab.edu/bkomal96/rc_styles.git
Subproject commit 7a2fd65482b64345b1cd923f38846433f7c33399
Copy and paste the following job script into a job composer job on rc.uab.edu
```
#!/bin/bash
#SBATCH --partition=express
#SBATCH --mem-per-cpu=4000
module load Anaconda3
...
else
conda env update -f /data/user/$USER/slurm-ds/environment-slurm-ds.yml
fi
```
# Configuring the environment
After the environment is created, you can start up an interactive Jupyter notebook session through rc.uab.edu to check if the environment works.
Under environment setup, specify
```
# Load required modules
module load Anaconda3/2019.10
```
Under Extra jupyter arguments, specify
```
--notebook-dir=/data/user/$USER/slurm-ds
```
For the partition field, specify
```
express
```
for sessions of up to 2 hours; make sure the number of hours field matches. For sessions of up to 12 hours, the
```
short
```
partition can be used.
After the Jupyter notebook is started, click on the blue "Connect to Jupyter" button.
Once the Jupyter session is active, select the `slurm-2sql` notebook. Then change the kernel via `Kernel->Change kernel->Python [conda env:.conda-slurm-ds]`.
Verify that the environment loaded correctly by running the first cell of the `slurm-2sql` notebook (the cell with the library imports).
# Creating a text version of sacct output
To create a text version of the sacct output:
```
directoryToUse="/data/user/$USER/group"
sacct -P -u $USER --starttime=2019-01-01 --format user,start,jobid,jobname,state,partition,maxrss,reqmem,reqcpus,node,nnodes,elapsed >> "$directoryToUse/group.txt"
```
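Because `-P` makes sacct emit pipe-delimited records, the resulting text file can be loaded straight into pandas. A minimal sketch, assuming the `group.txt` path created above:
```
import os
import pandas as pd

# expand $USER so the path matches the one used in the shell snippet above
path = os.path.expandvars('/data/user/$USER/group/group.txt')

# sacct -P writes one '|'-separated record per line, with a header row
df = pd.read_csv(path, delimiter='|')
print(df.head())
```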
# Release Notes version - 1.0 (12/08/2020)
# Cluster Analysis
This initial version creates the dataset, kmeans clustering, and resulting graphs to analyze how our users are utilizing the cluster.
Features included:
- User input to choose date range of data to analyze
- User input to choose min and max values for ReqMemCPU, AllocCPUS, and Elapsed
- User input to choose how data is normalized: 0-1, log, or no normalization
- User input to choose min and max x and y axes for 2D histogram graphs
# Next Release Planned Features (coming December 2020)
- data on job counts for each density spot in 2d histograms
- summary statistics for each cluster in the form of the count of jobs and the count of users per cluster
# Release Notes version - 1.1 Bug Fix (12/15/2020)
The dataset for completed jobs originally had all jobs and each of their job steps. This skewed the clustering graphs, as there were more data points than individual jobs run. The data is now pulled into the dataset using only allocation records (done with `-X` in the `slurm2sql.slurm2sql` command), so each row of the dataset is a single job; a sketch of the corrected call follows below.
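A minimal sketch of that loading step, reusing the `test.db` connection and date range from the `slurm-2sql` notebook shown later in this diff:
```
import sqlite3
import slurm2sql

db = sqlite3.connect('test.db')
# -X restricts sacct to allocation records, excluding per-step rows,
# so each row in the resulting 'slurm' table is one job
slurm2sql.slurm2sql(db, ['-X', '-S', '2020-03-18', '-a'])
```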
# Release Notes version - 2.0 (12/22/2020)
Added summary stats for each cluster. This includes the count of both jobs run and users running those jobs for each of the four clusters.
- summary statistics in the form of a table showing the job and user count for each cluster (a sketch follows below)
- Data on stats for each density spot in the 2d histograms will come in a separate notebook providing a deeper analysis of each 2d histogram for each cluster; it should be released by the end of January 2021.
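A minimal sketch of how such a per-cluster summary table can be built with pandas; the `cluster` column holding the kmeans labels and the toy data are assumptions for illustration:
```
import pandas as pd

# hypothetical stand-in for the allocation-level dataset;
# 'JobID' and 'User' come from sacct, 'cluster' holds kmeans labels
df = pd.DataFrame({
    'JobID':   [101, 102, 103, 104, 105],
    'User':    ['alice', 'alice', 'bob', 'carol', 'carol'],
    'cluster': [0, 0, 1, 1, 1],
})

# count of jobs run and of distinct users in each cluster
summary = df.groupby('cluster').agg(
    job_count=('JobID', 'count'),
    user_count=('User', 'nunique'),
)
print(summary)
```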
*.ipynb filter=nbstrip_full
[core]
attributesfile = ~/.gitattributes
[filter "nbstrip_full"]
clean = "jq --indent 1 \
'(.cells[] | select(has(\"outputs\")) | .outputs) = [] \
| (.cells[] | select(has(\"execution_count\")) | .execution_count) = null \
| .metadata = {\"language_info\": {\"name\": \"python\", \"pygments_lexer\": \"ipython3\"}} \
| .cells[].metadata = {} \
'"
smudge = cat
required = true
[submodule "RC_styles"]
path = RC_styles
url = https://gitlab.rc.uab.edu/bkomal96/rc_styles.git
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import pandas_profiling
```
%% Cell type:code id: tags:
``` python
df = pd.read_csv('userusage.txt',delimiter='|')
```
%% Cell type:code id: tags:
``` python
df.head()
```
%% Output
User Start JobID JobName State \
0 user 2019-01-06T22:00:21 2040834 _interactive COMPLETED
1 NaN 2019-01-06T22:00:21 2040834.batch batch COMPLETED
2 user 2019-01-07T16:15:21 2043373 Pipe_trim_galore COMPLETED
3 NaN 2019-01-07T16:15:21 2043373.batch batch COMPLETED
4 user 2019-01-07T16:15:21 2043374 Pipe_trim_galore COMPLETED
Partition MaxRSS ReqMem ReqCPUS NodeList NNodes Elapsed
0 medium NaN 10000Mc 1 c0088 1 16:04:23
1 NaN 1394528K 10000Mc 1 c0088 1 16:04:23
2 medium NaN 2000Mc 1 c0038 1 00:18:41
3 NaN 58592K 2000Mc 1 c0038 1 00:18:41
4 medium NaN 2000Mc 1 c0063 1 00:15:48
%% Cell type:code id: tags:
``` python
# split JobID values like "2040834.batch" into the job id and the step name
df[['jid','step']] = df.JobID.str.split(".",expand=True)
df.Partition.values
```
%% Output
array(['medium', nan, 'medium', ..., 'medium', nan, nan], dtype=object)
%% Cell type:code id: tags:
``` python
# rows with MaxRSS set are the batch steps; rows with User set are the job allocations
batchDF=df.dropna(subset=["MaxRSS"])
userDF=df.dropna(subset=["User"])
# copy each job's batch-step MaxRSS onto the corresponding allocation row
for jid in df.jid.unique():
    userDF['MaxRSS'][userDF['jid'] == jid]=batchDF['MaxRSS'][batchDF['jid'] == jid]
    #print(userDF[userDF['jid'] == jid])
userDF.head()
```
%% Output
/home/wsmonroe/.conda/envs/wsmplayground/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
after removing the cwd from sys.path.
/home/wsmonroe/.conda/envs/wsmplayground/lib/python3.6/site-packages/pandas/core/generic.py:7626: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
self._update_inplace(new_data)
/home/wsmonroe/.conda/envs/wsmplayground/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2961: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
exec(code_obj, self.user_global_ns, self.user_ns)
%% Cell type:markdown id: tags:
# add more graphs here
%% Cell type:code id: tags:
``` python
```
parso==0.6.2
pexpect==4.8.0
phik==0.9.9
pickleshare==0.7.5
plotly==4.8.2
pluggy==0.13.1
prometheus-client==0.7.1
prompt-toolkit==3.0.3
%% Cell type:code id: tags:
``` python
import sqlite3
import slurm2sql
import pandas as pd
```
%% Cell type:code id: tags:
``` python
db = sqlite3.connect('test.db')
# load sacct records since 2020-03-18 for all users into the 'slurm' table
slurm2sql.slurm2sql(db, ['-S', '2020-03-18', '-a'])
```
%% Output
0
%% Cell type:code id: tags:
``` python
# For example, you can then convert to a dataframe:
df1 = pd.read_sql('SELECT * FROM slurm', db)
```
%% Cell type:code id: tags:
``` python
df1.head(5)
```
%% Output
JobID ArrayJobID ArrayTaskID JobStep JobIDSlurm \
0 3319116 3319116 NaN None 3319116_[43-45,47%5]
1 3927198 3887451 30.0 None 3887451_30
2 3927198 3887451 30.0 batch 3887451_30.batch
3 3927198 3887451 30.0 extern 3887451_30.extern
4 3927199 3887451 31.0 None 3887451_31
JobName User Group Account State ... \
0 1mUD1MPa user user user PENDING ...
1 100kCrC20MPa user user user COMPLETED ...
2 batch user COMPLETED ...
3 extern user COMPLETED ...
4 100kCrC20MPa user user user COMPLETED ...
MaxDiskReadNode MaxDiskReadTask MaxDiskWrite MaxDiskWriteNode \
0 NaN
1 NaN
2 c0088 0 1.222336e+10 c0088
3 c0088 0 0.000000e+00 c0088
4 NaN
MaxDiskWriteTask ReqGPUS Comment GPUMem GPUEff NGPU
0 NaN None None None None
1 NaN None None None None
2 0 NaN None None None None
3 0 NaN None None None None
4 NaN None None None None
[5 rows x 63 columns]
%% Cell type:code id: tags:
``` python
print("more plots to come")
```
%% Cell type:code id: tags:
``` python
```