{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5fb66d11",
   "metadata": {},
   "source": [
    "# run report on pickled list policy data\n",
    "\n",
    "The script reads pickled files that match the `glob_pattern` from the `pickledir` derived from `dirname` and runs the report saving it as a csv to the subdir \"`dirname`/reports\" dir by default.\n",
    "\n",
    "Some progress info is available via the `verbose` flag.\n",
    "\n",
    "The current report aggrates storage stats by top-level-dir and age (year) of data's last access. The goal of this report is to understand the distribution of lesser used data."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51c07f66",
   "metadata": {},
   "source": [
    "!conda info --envs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15997b7d",
   "metadata": {},
   "source": [
    "!conda list"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c740ad5f",
   "metadata": {},
   "source": [
    "!pip list -freeze"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5059337b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from urllib.parse import unquote\n",
    "import sys\n",
    "import os\n",
    "import pathlib\n",
    "import re\n",
    "import dask.dataframe as dd\n",
    "import dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2beaec9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.diagnostics import ProgressBar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d8afdae",
   "metadata": {},
   "outputs": [],
   "source": [
    "from dask.distributed import Client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81b2e176",
   "metadata": {},
   "source": [
    "Client(scheduler_file='scheduler.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "514ecfc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "client = Client(scheduler_file='scheduler.json')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b17e817d",
   "metadata": {},
   "source": [
    "\n",
    "client = Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a2cdaa6",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "5f4c10d1",
   "metadata": {},
   "source": [
    "## input vars"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9533a4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "dirname=\"data/list-policy_projects_2023-08-31\"  # directory to fine files to pickle\n",
    "glob_pattern = \"*.parquet\"  # file name glob pattern to match, can be file name for individual file\n",
    "line_regex_filter = \".*\"   # regex to match lines of interest in file\n",
    "pickledir=f\"{dirname}/parquet\"\n",
    "reportdir=f\"{dirname}/reports\"\n",
    "tldpath=\"/data/project/ccts/galaxy\"\n",
    "\n",
    "verbose = True\n",
    "limit = 0"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a28d0f15",
   "metadata": {},
   "source": [
    "## Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed367712",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get top level dir on which to aggregate\n",
    "\n",
    "def get_tld(df, dirname):\n",
    "    '''\n",
    "    df: dataframe with path column (e.g. from policy run)\n",
    "    dirname: top level dir (TLD) that contains dirs for report\n",
    "    \n",
    "    The function uses the length of dirname to locate the TLD column in the split path.\n",
    "    '''\n",
    "    dirpaths = dirname.split(\"/\")\n",
    "    new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n",
    "    #df=df.assign(tld=new[len(dirpaths)])\n",
    "    #df[\"tld\"] = new[len(dirpaths)]\n",
    "   \n",
    "    return new[len(dirpaths)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a057a9ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get top level dir on which to aggregate\n",
    "\n",
    "def get_year(df, column):\n",
    "    '''\n",
    "    df: dataframe with path column (e.g. from policy run)\n",
    "    dirname: top level dir (TLD) that contains dirs for report\n",
    "    \n",
    "    The function uses the length of dirname to locate the TLD column in the split path.\n",
    "    '''\n",
    "    new = df[column].dt.year\n",
    "    #dirpaths = dirname.split(\"/\")\n",
    "    #new=df[\"path\"].str.split(\"/\", n=len(dirpaths)+1, expand=True)\n",
    "    #df=df.assign(tld=new[len(dirpaths)])\n",
    "    #df[\"tld\"] = new[len(dirpaths)]\n",
    "   \n",
    "    return new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7bc11b96",
   "metadata": {},
   "outputs": [],
   "source": [
    "def report_tld_year(df):\n",
    "    '''\n",
    "    Aggregate the sum and count of files by year in the top level dir (TLD)\n",
    "    \n",
    "    Uses dict parameter to pandas agg to apply sum and count function to size column\n",
    "    '''\n",
    "    report = df.groupby(['tld', df.access.dt.year]).agg({\"size\": [\"sum\", \"count\"]})\n",
    "    return report"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd92dd03",
   "metadata": {},
   "source": [
    "## Read and parse the files according to glob_pattern"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cdc4558d",
   "metadata": {},
   "source": [
    "dask.config.set(scheduler='threads')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b970eea",
   "metadata": {},
   "source": [
    "dask.config.set(scheduler='processes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f95ccab4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_policy_parquet(file, columns=['size', 'access', 'modify', 'uid', 'path'], engine=\"pyarrow\"):\n",
    "    \n",
    "    df = dd.read_parquet(file, columns=columns, engine=engine)\n",
    "    \n",
    "    df = client.persist(df)\n",
    "    \n",
    "    df=df.repartition(partition_size=\"64MB\")\n",
    "\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bfe3f88",
   "metadata": {},
   "source": [
    "df = dd.read_parquet(f'{pickledir}/list-*.parquet', columns=['size', 'access', 'modify', 'uid', 'path'], engine=\"pyarrow\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "060e04f9",
   "metadata": {},
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd5bf01b",
   "metadata": {},
   "source": [
    "df = client.persist(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "480ed8e1",
   "metadata": {},
   "source": [
    "df.dask"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0f072d6",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "df=df.repartition(partition_size=\"64MB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a5f7d5b",
   "metadata": {},
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6b85535",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "df.map_partitions(len).compute()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "afe68a9c",
   "metadata": {},
   "source": [
    "df.dask"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "97e29f86",
   "metadata": {},
   "source": [
    "df = df[df.path.str.startswith(\"/data/project/ccts/galaxy/\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5588ef0",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "maydf = read_policy_parquet(\"data/list-policy_projects_2024-05-03/parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f039d3d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "maydf = maydf[maydf.path.str.startswith(\"/data/project/ccts/galaxy/\")]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74c4102b",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "maydf = maydf.set_index(maydf.path, npartitions=\"auto\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "082b6af9",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "augdf = read_policy_parquet(\"data/list-policy_projects_2023-08-31/parquet\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e90c800",
   "metadata": {},
   "outputs": [],
   "source": [
    "augdf = augdf[augdf.path.str.startswith(\"/data/project/ccts/galaxy/\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a00b148a",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "augdf=augdf.repartition(partition_size=\"64MB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0f9c5c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "maydf=maydf.repartition(partition_size=\"64MB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00a50ff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(augdf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0196721",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(maydf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49511507",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "augdf = augdf.set_index(augdf.path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "173b6b00",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "maydf = maydf.set_index(maydf.path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9da71216",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "joindf = maydf.join(augdf, how=\"outer\", lsuffix=\"_may\", rsuffix=\"_aug\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30e7d44a",
   "metadata": {},
   "outputs": [],
   "source": [
    "joindf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29880891",
   "metadata": {},
   "outputs": [],
   "source": [
    "joindf.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8274f65b",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ba51037",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf[joindf.modify_aug.isna()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "250824be",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf[joindf.modify_may.isna()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1890c403",
   "metadata": {},
   "outputs": [],
   "source": [
    "modify_comp = joindf.modify_may != joindf.modify_aug"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76d79094",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf[modify_comp])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "803a863c",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf[joindf.size_may != joindf.size_aug])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ef39703",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf[joindf.size_may == joindf.size_aug])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b9953a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf[joindf.uid_may != joindf.uid_aug])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cda237a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf[joindf.access_may != joindf.access_aug])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cc7f6c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "len(joindf[joindf.access_may == joindf.access_aug])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0467d4df",
   "metadata": {},
   "outputs": [],
   "source": [
    "stop"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4352f00c",
   "metadata": {},
   "source": [
    "## Aggregate stats into running totals"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1a345c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1=get_tld(df, tldpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1da65bcc",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1.dask"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4f4ac4d1",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "with ProgressBar():\n",
    "    display(df1.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7706bcf8",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.assign(tld=df1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11e5c92a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dask"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10590450",
   "metadata": {},
   "source": [
    "df = df.drop(columns=\"path\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70213716",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = get_year(df, \"access\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d777440",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.assign(year=df1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddf01b3d",
   "metadata": {},
   "source": [
    "df = df.drop(columns=[\"uid\",\"access\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84d184c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d86e507e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db76d5f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = client.persist(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "471d5b18",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d414a9ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a859189",
   "metadata": {},
   "outputs": [],
   "source": [
    "def ls_path(df, path):\n",
    "    tmp = df[df.path.str.match(path)]\n",
    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
    "    \n",
    "    return tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1e5fecb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def du_by_year(df, path, time=\"access\"):\n",
    "    tmp = df[df.path.str.match(path)]\n",
    "\n",
    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
    "    \n",
    "    tmp = tmp.assign(year=get_year(tmp, time))\n",
    "    \n",
    "    tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\"])\n",
    "    tmp = client.persist(tmp)\n",
    "    \n",
    "    tmp = tmp.groupby(['tld', 'year']).sum()\n",
    "    \n",
    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
    "    \n",
    "    return tmp\n",
    "                    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16ef8689",
   "metadata": {},
   "outputs": [],
   "source": [
    "def du_by_year(df, path, time=\"access\"):\n",
    "    tmp = df[df.path.str.match(path)]\n",
    "\n",
    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
    "    \n",
    "    tmp = tmp.assign(year=get_year(tmp, time))\n",
    "    \n",
    "    tmp = tmp.drop(columns=[\"uid\", time, \"path\"])\n",
    "    tmp = client.persist(tmp)\n",
    "    \n",
    "    tmp = tmp.groupby(['tld', 'year']).sum()\n",
    "    \n",
    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
    "    \n",
    "    return tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3de3cb54",
   "metadata": {},
   "outputs": [],
   "source": [
    "def du_by_tld(df, path, time=\"access\"):\n",
    "    tmp = df[df.path.str.match(path)]\n",
    "\n",
    "    \n",
    "    tmp = tmp.assign(tld=get_tld(tmp, path))\n",
    "    \n",
    "    #tmp = tmp.assign(year=get_year(tmp, time))\n",
    "    \n",
    "    tmp = tmp.drop(columns=[\"uid\", \"access\", \"path\", \"year\"])\n",
    "    tmp = client.persist(tmp)\n",
    "    tmp = tmp.groupby(['tld']).sum()\n",
    "    \n",
    "    tmp = tmp.assign(terabytes=tmp[\"size\"]/(10**12))\n",
    "    \n",
    "    return tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0ca1d03",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e7151f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1c75727",
   "metadata": {},
   "outputs": [],
   "source": [
    "%time\n",
    "\n",
    "dudf = du_by_year(df, '/data/project/ccts', \"modify\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f9639f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "dudf.dask"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "489989ef",
   "metadata": {},
   "source": [
    "%time\n",
    "\n",
    "dudf = du_by_tld(df, '/data/project/ccts')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "312cfb3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "dudf = client.persist(dudf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08783886",
   "metadata": {},
   "outputs": [],
   "source": [
    "dudf.dask"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b8686b92",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "dudf=dudf.repartition(partition_size=\"64MB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "932cb3e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "dudf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55e73440",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "dudf = client.compute(dudf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a095bd71",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "dudf = dudf.result()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79a0378d",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "dudf.sort_values([\"tld\", \"year\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89abcc33",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "tmp=dudf.reset_index()\n",
    "#tmp[(tmp['tld']==\"galaxy\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5642e1d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp[tmp.tld=='galaxy'].sort_values('year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "134dcdb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "dudf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a801a5ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "dudf.groupby(\"tld\").sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9379b396",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "lsdf = ls_path(df, '/data/project/ccts/galaxy')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87b04252",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "lsdf = client.persist(lsdf.tld.unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5599d717",
   "metadata": {},
   "source": [
    "lsdf.dask"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8d8e827",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "lsdf = client.compute(lsdf)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6acbccb",
   "metadata": {},
   "source": [
    "lsdf.result()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce4ba931",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a7ed7af",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "dfccts = df[df.path.str.match('/data/project/ccts')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf656e76",
   "metadata": {},
   "outputs": [],
   "source": [
    "dfccts.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffbd1abc",
   "metadata": {},
   "outputs": [],
   "source": [
    "dfccts = dfccts.assign(tld=get_tld(dfccts, '/data/project/ccts/galaxy'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3748708",
   "metadata": {},
   "outputs": [],
   "source": [
    "dfccts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c741dfdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "dfccts.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7045b7e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "dfccts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20beeb75",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eebf3e8f",
   "metadata": {},
   "source": [
    "lru_projects = ['ICOS', 'boldlab', 'hartmanlab', 'sdtrlab', 'kinglab', 'kobielab', 'MRIPhantom', 'NCRlab', 'bridgeslab', 'hsight', 'kutschlab', 'lcdl', 'metalsgroup', 'rowelab', 'szaflarski_mirman']\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b5d64a0",
   "metadata": {},
   "source": [
    "condition=df1[\"tld\"].isin(lru_projects)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ccca62ff",
   "metadata": {},
   "source": [
    "condition=df1[\"tld\"].isin([\"ccts\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55aa52c5",
   "metadata": {},
   "source": [
    "lru=df1[condition]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b9e1df01",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "with ProgressBar():\n",
    "    display(lru.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31205ddd",
   "metadata": {},
   "source": [
    "df.groupby(['tld', 'year']).size.sum.visualize(node_attr={'penwidth': '6'})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b49da8a3",
   "metadata": {},
   "source": [
    "df2 = df.groupby(['tld', 'year']).agg({\"size\": [\"sum\", \"count\"]})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dca64098",
   "metadata": {},
   "source": [
    "df.groupby('name').x.mean().visualize(node_attr={'penwidth': '6'})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9de95bf7",
   "metadata": {},
   "source": [
    "%%time\n",
    "\n",
    "df2 = report_tld_year(lru)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c492742",
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ded14edd",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time \n",
    "\n",
    "df2 = df.groupby(['tld', 'year']).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "113ba27b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e87713f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e65585b",
   "metadata": {},
   "outputs": [],
   "source": [
    "tbsize = df2[\"size\"]/(10**12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4dbce6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df2 = df2.assign(terrabytes=tbsize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60557552",
   "metadata": {},
   "outputs": [],
   "source": [
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5056dd46",
   "metadata": {},
   "outputs": [],
   "source": [
    "df2.dask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a57c5033",
   "metadata": {},
   "outputs": [],
   "source": [
    "report=df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7f78f66",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "report = client.compute(report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79a424a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "report"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04a25511",
   "metadata": {},
   "source": [
    "## Create final report\n",
    "\n",
    "Create summary format for gigabyte and terabyte columns https://stackoverflow.com/a/20937592/8928529"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ccfac4b",
   "metadata": {},
   "source": [
    "report[\"average_size\"] = report[\"sum\"]/report[\"count\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "708a62bf",
   "metadata": {},
   "source": [
    "report[\"terabytes\"] = report[\"sum\"]/(10**12)\n",
    "report[\"terabytes\"] = report[\"terabytes\"].map('{:,.2f}'.format)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88915adb",
   "metadata": {},
   "source": [
    "report[\"gigabytes\"] = report[\"sum\"]/(10**9)\n",
    "report[\"gigabytes\"] = report[\"gigabytes\"].map('{:,.2f}'.format)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5472320",
   "metadata": {},
   "source": [
    "## Save report as CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23092d7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "report = report.result()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc748cf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffc99a54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# only create dir if there is data to pickle\n",
    "if (len(report) and not os.path.isdir(reportdir)):\n",
    "    os.mkdir(reportdir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d0ec8cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "reportdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4e836a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "if (verbose): print(f\"report: groupby-tld\")\n",
    "report.to_csv(f\"{reportdir}/groupby-tld-dask3.csv.gz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12d02352",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "report.to_parquet(f\"{reportdir}/groupby-tld-year-dask4.parquet\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cbec7b4",
   "metadata": {},
   "source": [
    "## Summarize high-level stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "600650db",
   "metadata": {},
   "outputs": [],
   "source": [
    "report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06f17bb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "report.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab4090bd",
   "metadata": {},
   "source": [
    "report[report[\"sum\"] == report[\"sum\"].max()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d2f464a",
   "metadata": {},
   "outputs": [],
   "source": [
    "report[(report[\"size\"] > 5*10**13)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c7358c1",
   "metadata": {},
   "source": [
    "report=report.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f42f3b99",
   "metadata": {},
   "source": [
    "summer = report.groupby(\"tld\").agg(\"sum\", \"sum\") #[report[\"sum\"] > 10**13"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9f1f801b",
   "metadata": {},
   "source": [
    "summer[\"terabytes\"] = summer[\"sum\"]/(10**12)\n",
    "summer[\"terabytes\"] = summer[\"terabytes\"].map('{:,.2f}'.format)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f27fc339",
   "metadata": {},
   "source": [
    "print(summer[summer[\"sum\"] > 10**13].sort_values(\"sum\", ascending=False)[['count', 'terabytes']])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b45d06d",
   "metadata": {},
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ef5dea0",
   "metadata": {},
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2d3afd3",
   "metadata": {},
   "source": [
    "report[(report[\"sum\"] <= 10**13) & (report[\"access\"] <= 2021)][\"sum\"].sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc80d326",
   "metadata": {},
   "source": [
    "report[(report[\"sum\"] > 10**13) & (report[\"access\"] < 2023)][\"sum\"].sum()/10**12"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}