Merge branch 'master' of...

Merge branch 'master' of gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/pipelines/quac into qc_under_one_umbrella

Merge branch 'master' of...
Merge branch 'master' of gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/pipelines/quac into qc_under_one_umbrella
5670cf16 · Manavalan Gajapathy · f184a1ec · 00cd5438 · 5670cf16 · 5670cf16
Commit 5670cf16 authored 3 years ago by Manavalan Gajapathy
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
  - [How to run QuaC](#how-to-run-quac)
    - [Example usage](#example-usage)
  - [Output](#output)
+    - [Dummy pedigree file creator](#dummy-pedigree-file-creator)
  - [Contributing](#contributing)
  - [Changelog](#changelog)

@@ -193,6 +194,15 @@ python src/run_quac.py \
 QuaC results are stored at path specified via option `--outdir` (default: `$USER_SCRATCH/tmp/quac/results`). This
 includes aggregated QC results produced by [multiqc](https://multiqc.info/).

+### Dummy pedigree file creator
+
+Script `src/create_dummy_ped.py` creates a "dummy" pedigree file given a project path as input. Its purpose is just to
+create a basic pedigree file, which will lack sex (unless a project tracking sheet is provided), relatedness and
+affected info. See the header of the script for usage instructions.
+
+Note that we plan to use PhenoTips in the future to produce a fully capable pedigree file. One may also create such
+files manually, but this could be error-prone.
+
 ## Contributing

 If you would like to make changes to the source code, please see the [contribution guidelines](./CONTRIBUTING.md).

--- a/src/create_dummy_ped.py
+++ b/src/create_dummy_ped.py
@@ -2,17 +2,24 @@
 Create dummy ped file by project

 Usage:
+# setup environment
 ml reset
 ml Anaconda3
 conda activate quac_common
-python src/create_dummy_ped.py
+
+# Example
+python src/create_dummy_ped.py --project_path "/data/project/worthey_lab/projects/CF_CFF_PFarrell/" --outfile test.ped
 """

 from pathlib import Path
 import pandas as pd
+import fire


 def read_project_tracker(project_tracker_f):
+    """
+    Reads project tracking excel file. Expects certain columns to be present.
+    """

    df = pd.read_excel(project_tracker_f, usecols=["CGDS ID", "Sex"])

@@ -24,70 +31,51 @@ def read_project_tracker(project_tracker_f):
    return sample_sex_dict


-def nbbbb():
+def main(project_path, outfile, tracking_sheet=False):
+    """
+    Creates dummy pedigree file for the project requested
+
+    Args:
+        project_path (str): Project path. Script will look for samples under its subdirectory "analysis".
+        outfile (str): Output pedigree file path
+        tracking_sheet (str, optional): Project tracking sheet in excel format. Uses this for sex info. Defaults to False.
+    """

-    project_path = Path("/data/project/worthey_lab/projects") / project_name / "analysis"
+    # get sample's sex info from project tracking sheet, if supplied
+    if tracking_sheet:
+        sample_sex_dict = read_project_tracker(tracking_sheet)
+
+    # get samples from cheaha for the project
+    project_path = Path(project_path) / "analysis"
    samples = (
        f.name for f in project_path.iterdir() if f.is_dir() and f.name.startswith(("LW", "UDN"))
    )

    header = ["#family_id", "sample_id", "paternal_id", "maternal_id", "sex", "phenotype"]
-    with open(Path(outpath) / f"{project_name}.ped", "w") as out_handle:
+    with open(outfile, "w") as out_handle:
        out_handle.write("\t".join(header) + "\n")

        for sample in sorted(samples):
-            data = ["unknown", sample, "-9", "-9", "-9", "-9"]
+            data = [
+                "unknown",
+                sample,
+                "-9",  # father
+                "-9",  # mother
+                sample_sex_dict[sample] if tracking_sheet else "-9",  # sample sex
+                "-9",  # affected
+            ]
            out_handle.write("\t".join(data) + "\n")

    return None


-def main(outpath):
-
-    project_dict = {
-        "CF_CFF_PFarrell": {
-            "tracking_sheet": "data/external/project_tracker/PROJECT TRACKING -CF.xlsx",
-            "affected": "all",
-        },
-        "CF_TLOAF_PFarrell": {
-            "tracking_sheet": "data/external/project_tracker/PROJECT TRACKING -CF.xlsx",
-            "affected": "all",
-        },
-        # "EDS3_unkn_DGreenspan",
-        # "MuscDyst_SU_MAlexander",
-        # "UDN_Phase1_EAWorthey",
-    }
-
-    for project_name in project_dict:
-        # get sample's sex info from project tracking sheet
-        sample_sex_dict = read_project_tracker(project_dict[project_name]["tracking_sheet"])
-
-        # get samples from cheaha for the project
-        project_path = Path("/data/project/worthey_lab/projects") / project_name / "analysis"
-        samples = (
-            f.name
-            for f in project_path.iterdir()
-            if f.is_dir() and f.name.startswith(("LW", "UDN"))
-        )
-
-        header = ["#family_id", "sample_id", "paternal_id", "maternal_id", "sex", "phenotype"]
-        with open(Path(outpath) / f"{project_name}.ped", "w") as out_handle:
-            out_handle.write("\t".join(header) + "\n")
-
-            for sample in sorted(samples):
-                data = [
-                    "unknown",
-                    sample,
-                    "-9",  # father
-                    "-9",  # mother
-                    sample_sex_dict[sample],  # sample sex
-                    "1" if project_dict[project_name]["affected"] == "all" else "-9",  # affected
-                ]
-                out_handle.write("\t".join(data) + "\n")
-
-    return None
-
-
 if __name__ == "__main__":
-    OUT_PATH = "data/raw/ped"  # not so raw, is it?
-    main(OUT_PATH)
+    FIRE_MODE = True
+    # FIRE_MODE = False
+
+    if FIRE_MODE:
+        fire.Fire(main)
+    else:
+        PROJECT_PATH = "/data/project/worthey_lab/projects/CF_CFF_PFarrell/"
+        OUTFILE = "out.ped"
+        main(PROJECT_PATH, OUTFILE)