Commit 46d84d25 authored by Mitchell A Moore
Browse files

Remove old cells

parent 2265f2eb
%% Cell type:code id: tags:
``` python
import globus_sdk
import matplotlib.pyplot as plt
from matplotlib import figure
import numpy as np
import csv
import pandas as pd
from datetime import datetime, timedelta
from mpl_toolkits.mplot3d import Axes3D
```
%% Cell type:code id: tags:
``` python
""" Read Files into DataFrame objects """
# Load the comma-delimited transfer log into a DataFrame object
CSV_PATH = "test_big.csv"
data = pd.read_csv(CSV_PATH)
# data.head(85)  # uncomment to preview the first 85 rows of the DataFrame
```
%% Cell type:code id: tags:
``` python
"""Replace Source EP ID with Endpoint name"""
# Swap each source endpoint UUID for its human-readable endpoint name
endpoint_names = {
    '924a32b0-6a2a-11e6-83a8-22000b97daec': "Pamela Hill Data Share",
    'e261ffb8-6d04-11e5-ba46-22000b92c6ec': "DME PerfTest - Argonne",
    '606579ae-5b03-11e9-bf32-0edbf3a4e7ee': "cac_dtn_test",
    '9c8c88c2-ea4a-11e6-b9ba-22000b9a448b': "Cheaha On-Campus",
    '7167cb38-9f78-11e6-b0dd-22000b92c261': "Cheaha Off-Campus",
}
# Apply the replacements one at a time, in the same order as listed above
for endpoint_id, display_name in endpoint_names.items():
    data = data.replace(to_replace=endpoint_id, value=display_name)
data.head(85)
```
%% Cell type:code id: tags:
``` python
# Convert the 'Elapsed' strings to datetime objects and get the total time elapsed
data['Elapsed'] = pd.to_datetime(data['Elapsed'], format='%H:%M:%S.%f')
# Sum the time-of-day part of every parsed value. Microseconds are included so
# the fractional seconds captured by %f are not silently dropped from the total.
total = sum(
    (
        timedelta(
            hours=item.hour,
            minutes=item.minute,
            seconds=item.second,
            microseconds=item.microsecond,
        ).total_seconds()
        for item in data['Elapsed']
    ),
    0.0,  # keep the total a float even when the column is empty
)
print(round(total/60/60, 2), "Hours")
```
%% Cell type:code id: tags:
``` python
# Group data by dataset name: keep every row in which any column equals the name
ds01, ds04, ds06, ds08, ds10, ds12, ds14, ds16 = (
    data[(data == dataset_name).any(axis=1)]
    for dataset_name in ('ds01', 'ds04', 'ds06', 'ds08', 'ds10', 'ds12', 'ds14', 'ds16')
)
# Group data by endpoint, using the same "any column matches" filter
cac, cheaha_off, cheaha_on, pamela, argonne = (
    data[(data == endpoint_name).any(axis=1)]
    for endpoint_name in (
        'cac_dtn_test',
        'Cheaha Off-Campus',
        'Cheaha On-Campus',
        'Pamela Hill Data Share',
        'DME PerfTest - Argonne',
    )
)
# Examples
# argonne.head(50)
# ds10.head(15)
```
%% Cell type:code id: tags:
``` python
""" Builds bar graphs to represent transfer speeds for different datasets """
# Builds bar graphs to represent transfer speeds for different datasets
#plot
# bg1 = data.plot.bar(x = 'Dataset', y = 'Speed', rot = 100,) # graph shows the speed for each ds
```
%% Cell type:code id: tags:
``` python
""" Builds bar graphs to represent data for different endpoints """
"""
#time = (pd.to_datetime(data['End'], infer_datetime_format=True) - pd.to_datetime(data['Start']
bg2 = data.plot.bar(x = "Dataset",
y = (pd.to_datetime(data['End'],
infer_datetime_format=True) - pd.to_datetime(data['Start'])),
rot=100)
"""
# Builds bar graphs to represent data for different endpoints
# time = (pd.to_datetime(data['End'], infer_datetime_format=True) - pd.to_datetime(data['Start']))
# print(time)
# bg2 = data.plot.bar(x = data["Dataset"],
# y = (pd.to_datetime(data['End'],
# infer_datetime_format=True) - pd.to_datetime(data['Start'])),
# rot=100)
```
%% Cell type:code id: tags:
``` python
""" Builds scatter plots to represent transfer speeds for different datasets """
# Builds scatter plots to represent transfer speeds for different datasets
# plt.scatter((data['Dataset']), data['Speed'])
# plt.title('Dataset Speed')
# plt.xlabel('Dataset')
# plt.ylabel('Speed')
# plt.show()
```
%% Cell type:code id: tags:
``` python
# Preview the leading rows of the ds01 subset
ds01.head()
```
%% Cell type:code id: tags:
``` python
# Show how much the reading varied across endpoints
# One figure per dataset: speed histogram on the left, elapsed-time histogram on the right.
# NOTE(review): ds14 has no figure here, exactly as in the original cell - confirm that is intended.
histogram_specs = [
    (ds01, 'ds01 (100MB, 10,000 x 10KB, 1-dir)'),
    (ds04, 'ds04 (10GB, 10,000 x 1MB files, 100-dirs)'),
    (ds06, 'ds06 (100GB, 100,000 x 1MB, 1-dir)'),
    (ds08, 'ds08 (50 x 10GB; 350 x 1GB; 1,000 x 100MB; 5,500 x 10MB; 23,176 x 1MB, 1-dir)'),
    (ds10, 'ds10 (1TB, 100 x 10GB, 1-dir)'),
    (ds12, 'ds12 (100GB, 1 x 100GB, 1-dir)'),
    (ds16, 'ds16 (1TB, 4 x 250GB, 1-dir)'),
]
for subset, heading in histogram_specs:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_figwidth(15)
    fig.suptitle(heading, fontsize=18)
    # (axes, column, panel title, x label, bar colour, bar outline colour)
    panels = (
        (ax1, 'Speed', 'Effective Speed', 'Speed (Mb/s)', 'gold', 'green'),
        (ax2, 'Elapsed', 'Elapsed Time', 'Time (hh:mm:ss)', 'green', 'gold'),
    )
    for panel, _column, panel_title, x_label, _fill, _outline in panels:
        panel.set_title(panel_title)
        panel.set_ylabel('Frequency')
        panel.set_xlabel(x_label)
    for panel, column, _panel_title, _x_label, fill, outline in panels:
        subset[column].hist(grid=True, ax=panel, color=fill, edgecolor=outline)  # histogram using pandas
```
%% Cell type:code id: tags:
``` python
# See stats for each endpoint: display up to the first 30 rows of the ds01 subset
ds01.head(30)
```
%% Cell type:code id: tags:
``` python
# Histogram using Matplotlib
# The axis label ('Mb/s') and the title describe transfer speed, so the
# 'Speed' column is plotted here rather than the 'Elapsed' timestamps.
plt.hist(data['Speed'], align='mid', color='red', edgecolor='black')
# align: accepts mid, right, left to assign where the bars should align in relation to their markers
plt.xlabel('Mb/s')
plt.ylabel('# of Datasets')
plt.title('Network Speed Distribution')
```
%% Cell type:code id: tags:
``` python
# File parse
# Dataset sizes, given in gigabytes, keyed by dataset name
ds_sizes = dict(
    ds01=0.1,
    ds04=10,
    ds06=100,
    ds08=1000,
    ds10=1000,
    ds12=100,
    ds14=5000,
    ds16=1000,
)
# Dataset names in plotting order (same order as the keys above)
ds_names = list(ds_sizes)

# Accumulators filled by parse_dtn_data: x lists hold sizes, y lists hold parsed values
src_to_oncampx, src_to_oncampy = [], []
oncampx_to_src, oncampy_to_src = [], []
src_to_offcampx, src_to_offcampy = [], []
offcampx_to_src, offcampy_to_src = [], []
def parse_dtn_data(filename, x_in, x_out, y_in, y_out, rows_per_direction=8):
    """
    Read a DTN transfer CSV and append one (size, value) pair per dataset row.

    Only rows whose first column starts with "ds" are treated as dataset rows;
    header lines, blank lines and rows with fewer than four columns are skipped
    without touching any list, so each x list stays the same length as its y list.

    filename: string, path of the CSV file to read
    x_in, x_out: list, receive ds_sizes.get(<first column>) (None for unknown names)
    y_in, y_out: list, receive the fourth column converted to float
    rows_per_direction: int, number of leading dataset rows that go to the
        *_in lists; every later dataset row goes to the *_out lists

    Raises ValueError when the fourth column of a dataset row is not a number.
    """
    num_rows = 0
    # newline='' is what the csv module recommends for files handed to csv.reader
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            # blank lines come back as [] and headers do not start with "ds"
            if not row or not row[0].startswith('ds'):
                continue
            if len(row) < 4:
                # incomplete record: skip it before appending anything
                continue
            # Resolve both values first so a bad row can never leave x and y out of step
            size = ds_sizes.get(row[0])
            value = float(row[3])
            if num_rows < rows_per_direction:  # distinguishes transfers from source to UAB.
                x_in.append(size)
                y_in.append(value)
            else:  # distinguishes transfers from UAB to source.
                x_out.append(size)
                y_out.append(value)
            num_rows += 1
def autolabel(rects, num):
    """Write each bar's height as a text label just above the bar, on subplot *num*."""
    axis = ax[num]
    for bar in rects:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        axis.annotate(
            '{}'.format(bar_height),
            xy=(bar_center, bar_height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )
# Shared layout used by build_graph and autolabel
width = 0.35  # the width of the bars
x = np.arange(len(ds_names))  # the label locations
# Two stacked subplots on a 9 x 7 inch figure
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(9, 7)
def build_graph(num, endpoint, campus, data1, data2):
    """Draw a grouped bar chart on subplot *num*.

    data1 is drawn to the left of each tick and labelled "endpoint > campus";
    data2 is drawn to the right and labelled "campus > endpoint".
    """
    axis = ax[num]
    half_bar = width / 2
    rects1 = axis.bar(x - half_bar, data1, width, label='{} > {}'.format(endpoint, campus))
    rects2 = axis.bar(x + half_bar, data2, width, label='{} > {}'.format(campus, endpoint))
    # Axis label, title and one tick per dataset name
    axis.set_ylabel('Speed (MB/s)')
    axis.set_title('DTN Comparison to and from {} Endpoint'.format(campus))
    axis.set_xticks(x)
    axis.set_xticklabels(ds_names)
    axis.legend()
    # Print each bar's height above it
    for bar_group in (rects1, rects2):
        autolabel(bar_group, num)
def show_graphs():
    """Draw the on-campus and off-campus comparison charts and display the figure."""
    panels = (
        ("OnCampus", src_to_oncampy, oncampy_to_src),
        ("OffCampus", src_to_offcampy, offcampy_to_src),
    )
    for subplot_index, (campus, to_campus, from_campus) in enumerate(panels):
        build_graph(subplot_index, "cac_dtn_test", campus, to_campus, from_campus)
    fig.tight_layout()
    plt.show()
def main():
    """Parse both campus CSV files into the module-level lists, then show the charts."""
    parse_jobs = (
        ("oncampus_data.csv", src_to_oncampx, oncampx_to_src, src_to_oncampy, oncampy_to_src),
        ("offcampus_data.csv", src_to_offcampx, offcampx_to_src, src_to_offcampy, offcampy_to_src),
    )
    for job in parse_jobs:
        parse_dtn_data(*job)
    show_graphs()


if __name__ == "__main__":
    main()
```
%% Cell type:code id: tags:
``` python
# Bar graph to compare network speed (mb/s) between each data set per on-campus endpoint
#ax = data.plot.bar(stacked=True)
```
%% Cell type:code id: tags:
``` python
# Bar graph to compare network speed (mb/s) between each data set per off-campus endpoint
```
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment