Commit 26fd9d19 authored by KOMAL BADI's avatar KOMAL BADI
Browse files

Throughput analysis for User account jobs.

parent 0195818e
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Mandatory\n",
"import sqlite3\n",
"import slurm2sql\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"import warnings\n",
"from RC_STYLES import rc_styles as s\n",
"warnings.filterwarnings(\"ignore\")\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Mandatory\n",
"db = sqlite3.connect('/data/rc/rc-team/slurm-since-March-allocation.sqlite3')\n",
"#db = sqlite3.connect('/data/rc/rc-team/slurm-since-March.sqlite3')\n",
"df = pd.read_sql('SELECT * FROM slurm', db)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Voluntary\n",
"#Start/End/Time are stored as Unix epoch seconds; convert to pandas datetimes.\n",
"df['start_time'] = pd.to_datetime(df['Start'],unit='s')\n",
"df['end_time'] = pd.to_datetime(df['End'],unit='s')\n",
"df['time'] = pd.to_datetime(df['Time'],unit='s')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Voluntary\n",
"#Memory columns are converted with a 1024^3 divisor, i.e. bytes -> GiB.\n",
"#NOTE(review): the original comments said 'in MB', but the 1024^3 divisor\n",
"#implies the raw values are bytes - confirm against the slurm2sql schema.\n",
"GIB = 1024 * 1024 * 1024\n",
"\n",
"#ReqMemNode: requested memory per node -> GiB\n",
"df['ReqMemNode'] = df['ReqMemNode'] / GIB\n",
"\n",
"#AveRSS: average resident set size of all tasks in the job -> GiB\n",
"df['AveRSS'] = df['AveRSS'] / GIB\n",
"\n",
"#ReqMemCPU: requested memory per CPU -> GiB\n",
"df['ReqMemCPU'] = df['ReqMemCPU'] / GIB\n",
"\n",
"#ReqTotalRAM: total requested RAM = memory per CPU * number of CPUs\n",
"df['ReqTotalRAM'] = df['NCPUS'] * df['ReqMemCPU']\n",
"\n",
"#Collapse all 'CANCELLED by <uid>' variants into a single CANCELLED state.\n",
"#na=False keeps rows with a missing State out of the boolean mask instead\n",
"#of propagating NaN into it.\n",
"df.loc[df['State'].str.contains('CANCELLED', na=False), 'State'] = 'CANCELLED'\n",
"\n",
"#Waiting time = seconds between job submission and job start.\n",
"df['Waiting'] = df['Start'] - df['Submit']\n",
"#.copy() so the in-place conversions below operate on an independent frame\n",
"#and do not raise SettingWithCopyWarning.\n",
"df1 = df.dropna(subset=['Waiting']).copy()\n",
"\n",
"#Convert seconds -> hours\n",
"df1['Waiting'] = df1['Waiting'] / 3600\n",
"df1['Elapsed'] = df1['Elapsed'] / 3600\n",
"df1['CPUTime'] = df1['CPUTime'] / 3600"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_User_jobs=df1.dropna(subset=['User'])\n",
"df_User_jobs.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Mandatory\n",
"#Count the number of jobs submitted by each user.\n",
"#(Comment fixed: the previous text was stale copy-paste about array jobs;\n",
"#this cell groups by User, not by array tasks.)\n",
"User_jobs = df_User_jobs.groupby(\"User\")[\"JobID\"].count().reset_index()\n",
"User_jobs.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Drop users with a zero job count.\n",
"#Filtering on the JobID column explicitly replaces the original element-wise\n",
"#DataFrame comparison, which also compared the string 'User' column to 0.\n",
"User_jobs = User_jobs[User_jobs['JobID'] != 0].dropna()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Voluntary\n",
"#Sorting the previous pandas data frame in descending order to see \n",
"#highest no. of array tasks for a single array job and pull out that specific array job.\n",
"sample_data=User_jobs.sort_values(by='JobID', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#The user with the highest job count (taken from sample_data above).\n",
"#NOTE(review): hard-coded; consider sample_data.iloc[0]['User'] instead.\n",
"SAMPLE_USER = 'abgvg9'\n",
"#.copy() so the datetime columns assigned in later cells do not raise\n",
"#SettingWithCopyWarning on a view of df.\n",
"sample_User_job = df.loc[df['User'] == SAMPLE_USER].copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_User_job.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Mandatory\n",
"sample_User_job['submit_time'] = pd.to_datetime(sample_User_job['Submit'],unit='s')\n",
"sample_User_job['start_time'] = pd.to_datetime(sample_User_job['Start'],unit='s')\n",
"sample_User_job['end_time'] = pd.to_datetime(sample_User_job['End'],unit='s')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Mandatory\n",
"#Creating 3 different dataframes in which each data frame \n",
"#is grouped by submitted,started and end time of array job.\n",
"# Job count of each time is calculated.\n",
"count_jobs_submit_time= sample_User_job.groupby([\"submit_time\"] , as_index=False)[\"JobID\"].count()\n",
"count_jobs_start_time= sample_User_job.groupby([\"start_time\"] , as_index=False)[\"JobID\"].count()\n",
"count_jobs_end_time= sample_User_job.groupby([\"end_time\"] , as_index=False)[\"JobID\"].count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"count_jobs_submit_time.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_submit_time = count_jobs_submit_time.rename(columns={'JobID': 'submitted_Job_count'})\n",
"print(df_submit_time)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Voluntary\n",
"#Submit_time as date-time is set as index \n",
"df_submit_time=df_submit_time.set_index('submit_time')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_submit_time.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Mandatory\n",
"###Creating dataframe in which data frame \n",
"#is grouped by started time of array job.\n",
"# Job count of each time is calculated.\n",
"df_start_time = count_jobs_start_time.rename(columns={'JobID': 'Started_Job_count'})\n",
"print(df_start_time)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1_start_time=df_start_time.set_index('start_time')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1_start_time.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1_start_time.plot(figsize=(15,5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Voluntary\n",
"#Time-based resampling: bin the started-job counts into 1-minute buckets\n",
"#('T' = minute frequency) and sum the counts within each bucket.\n",
"#(Comment fixed: this is pandas time-series resampling, not the statistical\n",
"#bootstrap-resampling technique the previous comment described.)\n",
"Running_df=df1_start_time.resample('T').sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Mandatory\n",
"#'Running' = cumulative number of jobs started up to each minute.\n",
"#Take the cumsum of the count column explicitly: the original assigned\n",
"#DataFrame.cumsum() to a column, which only works while the frame has a\n",
"#single column and breaks if this cell is re-run after 'Running' exists.\n",
"Running_df['Running'] = Running_df['Started_Job_count'].cumsum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Running_df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Running_df.plot(figsize=(15,5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Mandatory\n",
"#count_jobs_end_time was already computed above; just rename the count\n",
"#column. (The original cell redundantly recomputed end_time and the\n",
"#groupby.) Ending with .head() uses the rich DataFrame display instead\n",
"#of print().\n",
"df_end_time = count_jobs_end_time.rename(columns={'JobID': 'End_Job_count'})\n",
"df_end_time.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1_end_time=df_end_time.set_index('end_time')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1_end_time.index"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_df_end_time=df1_end_time.resample('T').sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_df_end_time.tail(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#'Completed' = cumulative number of jobs finished up to each minute.\n",
"#Cumsum the count column explicitly; assigning DataFrame.cumsum() to a\n",
"#column only works while the frame has one column and breaks on re-run.\n",
"sample_df_end_time['Completed'] = sample_df_end_time['End_Job_count'].cumsum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_df_end_time.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_df_end_time['Completed'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_df_end_time.plot(figsize=(15,5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"axes = sample_df_end_time.plot( marker='.',alpha=0.5, figsize=(11, 4), subplots=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_df_end_time.plot(marker='.', alpha=0.5, linestyle='None')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mergedDf = Running_df.merge(sample_df_end_time, left_index=True, right_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mergedDf.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mergedDf['Currently_Running']=mergedDf['Running']-mergedDf['Completed']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mergedDf['Currently_Running'].plot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"t=pd.DataFrame(mergedDf[['Running','Currently_Running']])\n",
"t.plot(figsize=(15,5))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment