Commit 39ead444 authored by Mitchell Moore's avatar Mitchell Moore
Browse files

Clean Notebook and remove checkpoints

parent 959704b4
%% Cell type:code id: tags:
``` python
import globus_sdk
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd
from datetime import datetime, timedelta
from mpl_toolkits.mplot3d import Axes3D
```
%% Cell type:code id: tags:
``` python
""" Read Files into DataFrame objects """
# Load the three CSV files, in the same order as before, into DataFrames.
data, off_campus, on_campus = [
    pd.read_csv(csv_name)
    for csv_name in ("test.csv", "offcampus_data.csv", "oncampus_data.csv")
]
data.head(16)  # preview the first 16 rows of `data`
```
%% Cell type:code id: tags:
``` python
""" Builds a 3D plot based on the elapsed time values using pandas and matplotlib """
# Elapsed time of every row in minutes, computed from its Start/End timestamps.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (format
# inference is the default behaviour), so supplying it only produced a warning.
start_times = pd.to_datetime(data['Start'])
end_times = pd.to_datetime(data['End'])
elapsed_minutes = (end_times - start_times).dt.total_seconds() / 60

# One x position per row; the tick at each position shows that row's Dataset.
positions = range(len(data['Dataset']))

# 3D scatter: x = row position, y = Speed, z = elapsed minutes.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(positions, data['Speed'], elapsed_minutes, c='navy', s=60)
ax.set(xticks=positions, xticklabels=data['Dataset'])
plt.xticks(rotation=45)
plt.show()
```
%% Cell type:code id: tags:
``` python
""" Builds bar graphs to represent transfer speeds for different datasets """
# One bar per row of `data`: Dataset on the x axis, Speed as the bar height.
# Tick labels are rotated by the `rot` value (100).
bg1 = data.plot(kind='bar', x='Dataset', y='Speed', rot=100)
```
%% Cell type:code id: tags:
``` python
""" Builds bar graphs to represent data for different endpoints """
"""
#time = (pd.to_datetime(data['End'], infer_datetime_format=True) - pd.to_datetime(data['Start']
bg2 = data.plot.bar(x = "Dataset",
y = (pd.to_datetime(data['End'],
infer_datetime_format=True) - pd.to_datetime(data['Start'])),
rot=100)
"""
```
%% Cell type:code id: tags:
``` python
""" Builds scatter plots to represent transfer speeds for different datasets """
# First figure: Speed of every row plotted against its Dataset label.
plt.scatter((data['Dataset']), data['Speed'])
plt.title('Dataset Speed')
plt.xlabel('Dataset')
plt.ylabel('Speed')
plt.show()
# Second figure: a colour-coded scatter of the "frequency" values.
# One matplotlib colour code per value, paired by position.
color = ["r","r","g","r","r","g","r","r","r","g"]
# NOTE(review): `frequency` is not assigned in any cell visible in this
# notebook excerpt -- confirm it is defined before this cell runs, otherwise
# the next line raises NameError.
df = pd.DataFrame( {"frequency" : frequency, "color" : color})
# Sort rows in place by value; each colour stays attached to its value.
df.sort_values("frequency", inplace=True)
# x is simply the rank after sorting (0 .. len(df) - 1).
plt.scatter(x=range(len(df)), y= df["frequency"], c = df["color"])
plt.show()
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import globus_sdk
import matplotlib.pyplot as plt
from matplotlib import figure
import numpy as np
import csv
import pandas as pd
from datetime import datetime, timedelta
from mpl_toolkits.mplot3d import Axes3D
import matplotlib
matplotlib.style.use('ggplot')
import matplotlib.ticker as ticker
```
%% Cell type:code id: tags:
``` python
# Load the comma-delimited results file; `data` refers to this DataFrame from here on.
csv_path = "test_big.csv"
data = pd.read_csv(csv_path)
# data.head(85)  # uncomment to preview the first 85 rows
```
%% Cell type:code id: tags:
``` python
# Swap endpoint ID strings for readable endpoint names.
# The replacement is applied to every column of `data`, so IDs are renamed
# wherever they occur (not only in the source-endpoint column).
endpoint_names = {
    '924a32b0-6a2a-11e6-83a8-22000b97daec': "Pamela Hill Data Share",
    'e261ffb8-6d04-11e5-ba46-22000b92c6ec': "DME PerfTest - Argonne",
    '606579ae-5b03-11e9-bf32-0edbf3a4e7ee': "cac_dtn_test",
    '9c8c88c2-ea4a-11e6-b9ba-22000b9a448b': "Cheaha On-Campus",
    '7167cb38-9f78-11e6-b0dd-22000b92c261': "Cheaha Off-Campus",
}
data = data.replace(endpoint_names)
data.head(85)
```
%% Output
Dataset Start End \
0 ds01 2020-12-29T17:39:21+00:00 2020-12-29T17:39:39+00:00
1 ds04 2020-12-29T17:39:40+00:00 2020-12-29T17:40:03+00:00
2 ds06 2020-12-29T17:40:03+00:00 2020-12-29T17:43:10+00:00
3 ds08 2020-12-29T17:43:10+00:00 2020-12-29T18:10:01+00:00
4 ds10 2020-12-29T18:10:01+00:00 2020-12-29T18:35:55+00:00
.. ... ... ...
79 ds06 2020-12-30T12:18:49+00:00 2020-12-30T12:32:26+00:00
80 ds08 2020-12-30T12:32:26+00:00 2020-12-30T12:43:45+00:00
81 ds10 2020-12-30T12:43:46+00:00 2020-12-30T12:50:34+00:00
82 ds12 2020-12-30T12:50:35+00:00 2020-12-30T12:52:46+00:00
83 ds16 2020-12-30T12:52:47+00:00 2020-12-30T12:58:35+00:00
Elapsed Speed Source EP ID Dest. EP ID \
0 0:00:18.231796 5.26 cac_dtn_test Cheaha On-Campus
1 0:00:23.210824 412.26 cac_dtn_test Cheaha On-Campus
2 0:03:06.726332 510.74 cac_dtn_test Cheaha On-Campus
3 0:26:50.712815 608.86 cac_dtn_test Cheaha On-Campus
4 0:25:53.995720 613.69 cac_dtn_test Cheaha On-Campus
.. ... ... ... ...
79 0:13:37.127220 116.71 Cheaha Off-Campus Pamela Hill Data Share
80 0:11:19.083500 1444.57 Cheaha Off-Campus Pamela Hill Data Share
81 0:06:48.986279 2332.71 Cheaha Off-Campus Pamela Hill Data Share
82 0:02:11.667164 726.12 Cheaha Off-Campus Pamela Hill Data Share
83 0:05:48.506915 2737.70 Cheaha Off-Campus Pamela Hill Data Share
Task ID
0 c89e9dc2-49fc-11eb-8ffb-0a34088e79f9
1 d39f4bae-49fc-11eb-8ffb-0a34088e79f9
2 e1a46982-49fc-11eb-8ffb-0a34088e79f9
3 5110c13a-49fd-11eb-8ffb-0a34088e79f9
4 113d2658-4a01-11eb-b18d-0ee0d5d9299f
.. ...
79 2b9096aa-4a99-11eb-8ffb-0a34088e79f9
80 12ba1104-4a9b-11eb-b565-02d9497ca481
81 a7aa0e4e-4a9c-11eb-b18e-0ee0d5d9299f
82 9b9aee74-4a9d-11eb-b565-02d9497ca481
83 ea42f904-4a9d-11eb-b565-02d9497ca481
[84 rows x 8 columns]
%% Cell type:code id: tags:
``` python
# Parse the 'Elapsed' strings (H:MM:SS.ffffff) into datetime objects.
data['Elapsed'] = pd.to_datetime(data['Elapsed'], format='%H:%M:%S.%f')

# Add up the elapsed time of all rows, in seconds.
# Only the hour, minute and second fields of each value are used, so the
# fractional-second part is not included in the total.
total = sum(
    (
        timedelta(hours=stamp.hour, minutes=stamp.minute, seconds=stamp.second).total_seconds()
        for stamp in data['Elapsed']
    ),
    0.0,
)
print(round(total/60/60, 2), "Hours")

# Alternative kept for reference: per-row elapsed time from the End/Start columns.
# time = (pd.to_datetime(data['End'], infer_datetime_format=True) - pd.to_datetime(data['Start']))
# print(time)
```
%% Output
19.3 Hours
%% Cell type:code id: tags:
``` python
def _rows_containing(value):
    """Return the rows of `data` in which at least one column equals *value*."""
    matches_anywhere = (data == value).any(axis=1)
    return data[matches_anywhere]


# One sub-frame per dataset name.
ds01 = _rows_containing('ds01')
ds04 = _rows_containing('ds04')
ds06 = _rows_containing('ds06')
ds08 = _rows_containing('ds08')
ds10 = _rows_containing('ds10')
ds12 = _rows_containing('ds12')
ds14 = _rows_containing('ds14')
ds16 = _rows_containing('ds16')

# One sub-frame per endpoint name; a row is included when the name appears
# in any of its columns.
cac = _rows_containing('cac_dtn_test')
cheaha_off = _rows_containing('Cheaha Off-Campus')
cheaha_on = _rows_containing('Cheaha On-Campus')
pamela = _rows_containing('Pamela Hill Data Share')
argonne = _rows_containing('DME PerfTest - Argonne')

# Quick ways to inspect a group:
# argonne.head(50)
# ds10.head(15)
```
%% Cell type:code id: tags:
``` python
# Builds scatter plots to represent transfer speeds for different datasets
# plt.scatter((data['Dataset']), data['Speed'])
# plt.title('Dataset Speed')
# plt.xlabel('Dataset')
# plt.ylabel('Speed')
# plt.show()
```
%% Cell type:code id: tags:
``` python
# Show how much the readings varied across endpoints.
# For each dataset a two-panel figure is drawn: a histogram of its Speed
# values on the left and a histogram of its Elapsed values on the right.
histogram_panels = [
    (ds01, 'ds01 (100MB, 10,000 x 10KB, 1-dir)'),
    (ds04, 'ds04 (10GB, 10,000 x 1MB files, 100-dirs)'),
    (ds06, 'ds06 (100GB, 100,000 x 1MB, 1-dir)'),
    (ds08, 'ds08 (50 x 10GB; 350 x 1GB; 1,000 x 100MB; 5,500 x 10MB; 23,176 x 1MB, 1-dir)'),
    (ds10, 'ds10 (1TB, 100 x 10GB, 1-dir)'),
    (ds12, 'ds12 (100GB, 1 x 100GB, 1-dir)'),
    (ds16, 'ds16 (1TB, 4 x 250GB, 1-dir)'),
]
for subset, heading in histogram_panels:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_figwidth(15)
    fig.suptitle(heading, fontsize=18)
    # Titles and axis labels are set before the histograms are drawn.
    ax1.set(title='Effective Speed', ylabel='Frequency', xlabel='Speed (Mb/s)')
    ax2.set(title='Elapsed Time', ylabel='Frequency', xlabel='Time (hh:mm:ss)')
    # pandas draws each histogram onto the axes it is handed.
    subset['Speed'].hist(grid=True, ax=ax1, color='gold', edgecolor='green')
    subset['Elapsed'].hist(grid=True, ax=ax2, color='green', edgecolor='gold')
```
%% Cell type:code id: tags:
``` python
# Tick labels for the bar charts, one per dataset.
labels = [
    'ds01: 100MB, 10,000 x 10KB, 1-dir',
    'ds04: 10GB, 10,000 x 1MB files, 100-dirs',
    'ds06: 100GB, 100,000 x 1MB, 1-dir',
    'ds08: 50 x 10GB; 350 x 1GB; 1,000 x 100MB; 5,500 x 10MB; 23,176 x 1MB, 1-dir',
    'ds10: 1TB, 100 x 10GB, 1-dir',
    'ds12: 100GB, 1 x 100GB, 1-dir',
    'ds16: 1TB, 4 x 250GB, 1-dir',
]


def _speeds_for(frame, column, endpoint):
    """Return the Speed values of the rows of *frame* whose *column* equals *endpoint*."""
    # where() turns non-matching rows into NaN; dropna() then removes every
    # row that contains a NaN.
    return frame.where(frame[column] == endpoint).dropna()['Speed']


# "out": the endpoint is the transfer source; "in": it is the destination.
cac_out = _speeds_for(cac, 'Source EP ID', 'cac_dtn_test')
cac_in = _speeds_for(cac, 'Dest. EP ID', 'cac_dtn_test')
pamela_out = _speeds_for(pamela, 'Source EP ID', 'Pamela Hill Data Share')
pamela_in = _speeds_for(pamela, 'Dest. EP ID', 'Pamela Hill Data Share')
argonne_out = _speeds_for(argonne, 'Source EP ID', 'DME PerfTest - Argonne')
argonne_in = _speeds_for(argonne, 'Dest. EP ID', 'DME PerfTest - Argonne')

# The first seven entries (positions 0-6) of each series feed the first chart.
first_seven = range(7)
cac_to = [cac_out.iloc[pos] for pos in first_seven]
cac_from = [cac_in.iloc[pos] for pos in first_seven]
pamela_to = [pamela_out.iloc[pos] for pos in first_seven]
pamela_from = [pamela_in.iloc[pos] for pos in first_seven]
argonne_to = [argonne_out.iloc[pos] for pos in first_seven]
argonne_from = [argonne_in.iloc[pos] for pos in first_seven]
```
%% Cell type:code id: tags:
``` python
# Grouped, stacked bar chart: three bars per dataset label (cac, pamela,
# argonne), each with its `*_to` values at the bottom and its `*_from`
# values stacked on top.
x = np.arange(len(labels))  # one tick position per label
width = 0.27  # width of a single bar
fig, ax = plt.subplots()
fig.set_size_inches(10, 8)  # `fig` is the figure that owns `ax`

left_of_tick = x - width
right_of_tick = x + width

rects1 = ax.bar(left_of_tick, cac_to, width, color='#1F618D', label='cac -->')
stack1 = ax.bar(left_of_tick, cac_from, width, color='#7FB3D5', label='cac <--', bottom=cac_to)
rects2 = ax.bar(x, pamela_to, width, color='#B9770E', label='pamela -->')
stack2 = ax.bar(x, pamela_from, width, color='#F8C471', label='pamela <--', bottom=pamela_to)
rects3 = ax.bar(right_of_tick, argonne_to, width, color='#196F3D', label='argonne -->')
stack3 = ax.bar(right_of_tick, argonne_from, width, color='#7DCEA0', label='argonne <--', bottom=argonne_to)

# Title, axis label, tick labels and legend.
ax.set_title('Cheaha On-Campus')
ax.set_ylabel('Speed (Mb/s)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
def autolabel(rects, stacks, lift):
    """Write the combined height above each stacked bar.

    rects  -- the lower bar segments
    stacks -- the segments stacked on top, paired with *rects* by position
    lift   -- when truthy the text sits 12 points above the bar, otherwise 3

    Annotations are drawn on the module-level `ax`.
    """
    offset = 12 if lift else 3
    for idx, rect in enumerate(rects):
        # Label value: lower segment plus its stacked segment, rounded to 2 decimals.
        height = round((rect.get_height() + stacks[idx].get_height()), 2)
        ax.annotate(f'{height}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, offset),  # vertical offset in points
                    textcoords="offset points",
                    ha='center', va='bottom')
# Annotate every bar pair; only the middle (pamela) group gets the raised label.
for _lower, _upper, _raised in (
    (rects1, stack1, False),
    (rects2, stack2, True),
    (rects3, stack3, False),
):
    autolabel(_lower, _upper, _raised)

fig.tight_layout()
# Rotate the x tick labels by 45 degrees and right-align them.
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha="right")
plt.show()
```
%% Cell type:code id: tags:
``` python
# Same grouped, stacked bar chart as the linear-scale version; the y axis is
# switched to a log scale further down in this cell.
x = np.arange(len(labels))  # one tick position per label
width = 0.27  # width of a single bar
fig, ax = plt.subplots()
fig.set_size_inches(10, 8)  # `fig` is the figure that owns `ax`

left_of_tick = x - width
right_of_tick = x + width

rects1 = ax.bar(left_of_tick, cac_to, width, color='#1F618D', label='cac -->')
stack1 = ax.bar(left_of_tick, cac_from, width, color='#7FB3D5', label='cac <--', bottom=cac_to)
rects2 = ax.bar(x, pamela_to, width, color='#B9770E', label='pamela -->')
stack2 = ax.bar(x, pamela_from, width, color='#F8C471', label='pamela <--', bottom=pamela_to)
rects3 = ax.bar(right_of_tick, argonne_to, width, color='#196F3D', label='argonne -->')
stack3 = ax.bar(right_of_tick, argonne_from, width, color='#7DCEA0', label='argonne <--', bottom=argonne_to)

# Title, axis label, tick labels and legend.
ax.set_title('Cheaha On-Campus')
ax.set_ylabel('Speed (Mb/s)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
def autolabel(rects, stacks, lift):
    """Write the combined height above each stacked bar.

    rects  -- the lower bar segments
    stacks -- the segments stacked on top, paired with *rects* by position
    lift   -- when truthy the text sits 12 points above the bar, otherwise 3

    Annotations are drawn on the module-level `ax`.
    """
    offset = 12 if lift else 3
    for idx, rect in enumerate(rects):
        # Label value: lower segment plus its stacked segment, rounded to 2 decimals.
        height = round((rect.get_height() + stacks[idx].get_height()), 2)
        ax.annotate(f'{height}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, offset),  # vertical offset in points
                    textcoords="offset points",
                    ha='center', va='bottom')
# Annotate every bar pair; only the middle (pamela) group gets the raised label.
for _lower, _upper, _raised in (
    (rects1, stack1, False),
    (rects2, stack2, True),
    (rects3, stack3, False),
):
    autolabel(_lower, _upper, _raised)

fig.tight_layout()
# Rotate the x tick labels by 45 degrees and right-align them.
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha="right")
# Log-scale y axis for the current axes.
plt.yscale('log')
plt.show()
```
%% Cell type:code id: tags:
``` python
# Indices substituted for Off-Campus Data
cac_to = [cac_out.iloc[7], cac_out.iloc[8], cac_out.iloc[9], cac_out.iloc[10], cac_out.iloc[11], cac_out.iloc[12], cac_out.iloc[13]]
cac_from = [cac_in.iloc[7], cac_in.iloc[8], cac_in.iloc[9], cac_in.iloc[10], cac_in.iloc[11], cac_in.iloc[12], cac_in.iloc[13]]
pamela_to = [pamela_out.iloc[7], pamela_out.iloc[8], pamela_out.iloc[9], pamela_out.iloc[10], pamela_out.iloc[11], pamela_out.iloc[12], pamela_out.iloc[13]]
pamela_from = [pamela_in.iloc[7], pamela_in.iloc[8], pamela_in.iloc[9], pamela_in.iloc[10], pamela_in.iloc[11], pamela_in.iloc[12], pamela_in.iloc[13]]
argonne_to = [argonne_out.iloc[7], argonne_out.iloc[8], argonne_out.iloc[9], argonne_out.iloc[10], argonne_out.iloc[11], argonne_out.iloc[12], argonne_out.iloc[13]]
argonne_from = [argonne_in.iloc[7], argonne_in.iloc[8], argonne_in.iloc[9], argonne_in.iloc[10], argonne_in.iloc[11], argonne_in.iloc[12], argonne_in.iloc[13]]
```
%% Cell type:code id: tags:
``` python
# Grouped, stacked bar chart for the off-campus values: three bars per
# dataset label (cac, pamela, argonne), each with its `*_to` values at the
# bottom and its `*_from` values stacked on top.
x = np.arange(len(labels))  # one tick position per label
width = 0.27  # width of a single bar
fig, ax = plt.subplots()
fig.set_size_inches(10, 8)  # `fig` is the figure that owns `ax`

left_of_tick = x - width
right_of_tick = x + width

rects1 = ax.bar(left_of_tick, cac_to, width, color='#1F618D', label='cac -->')
stack1 = ax.bar(left_of_tick, cac_from, width, color='#7FB3D5', label='cac <--', bottom=cac_to)
rects2 = ax.bar(x, pamela_to, width, color='#B9770E', label='pamela -->')
stack2 = ax.bar(x, pamela_from, width, color='#F8C471', label='pamela <--', bottom=pamela_to)
rects3 = ax.bar(right_of_tick, argonne_to, width, color='#196F3D', label='argonne -->')
stack3 = ax.bar(right_of_tick, argonne_from, width, color='#7DCEA0', label='argonne <--', bottom=argonne_to)

# Title, axis label, tick labels and legend.
ax.set_title('Cheaha off-Campus')
ax.set_ylabel('Speed (Mb/s)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Annotate every bar pair (uses `autolabel` from an earlier cell); only the
# middle (pamela) group gets the raised label.
for _lower, _upper, _raised in (
    (rects1, stack1, False),
    (rects2, stack2, True),
    (rects3, stack3, False),
):
    autolabel(_lower, _upper, _raised)

fig.tight_layout()
# Rotate the x tick labels by 45 degrees and right-align them.
plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha="right")
plt.show()
```
%% Cell type:code id: tags:
``` python
x = np.arange(len(labels)) # the label locations
width = 0.27 # the width of the bars
fig, ax = plt.subplots()
# fig.set_figwidth(15)
ax.figure.set_size_inches(10,8)
rects1 = ax.bar(x - width, cac_to, width, label='cac -->', color='#1F618D')
stack1 = ax.bar(x - width,