Skip to content
Snippets Groups Projects
Commit 5670cf16 authored by Manavalan Gajapathy's avatar Manavalan Gajapathy
Browse files

Merge branch 'master' of...

Merge branch 'master' of gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/pipelines/quac into qc_under_one_umbrella
parents f184a1ec 00cd5438
No related branches found
No related tags found
1 merge request!2QC under one umbrella (well mostly) under QuaC
......@@ -10,6 +10,7 @@
- [How to run QuaC](#how-to-run-quac)
- [Example usage](#example-usage)
- [Output](#output)
- [Dummy pedigree file creator](#dummy-pedigree-file-creator)
- [Contributing](#contributing)
- [Changelog](#changelog)
......@@ -193,6 +194,15 @@ python src/run_quac.py \
QuaC results are stored at path specified via option `--outdir` (default: `$USER_SCRATCH/tmp/quac/results`). This
includes aggregated QC results produced by [multiqc](https://multiqc.info/).
### Dummy pedigree file creator
Script `src/create_dummy_ped.py` creates a "dummy" pedigree file given a project path as input. Its purpose is just to
create a basic pedigree file, which will lack sex (unless a project tracking sheet is provided), relatedness, and
affected-status info. See the header of the script for usage instructions.
Note that we plan to use PhenoTips in the future to produce a fully capable pedigree file. One may also create pedigree
files manually, but this could be error-prone.
## Contributing
If you would like to make changes to the source code, please see the [contribution guidelines](./CONTRIBUTING.md).
......
......@@ -2,17 +2,24 @@
Create dummy ped file by project
Usage:
# setup environment
ml reset
ml Anaconda3
conda activate quac_common
python src/create_dummy_ped.py
# Example
python src/create_dummy_ped.py --project_path "/data/project/worthey_lab/projects/CF_CFF_PFarrell/" --outfile test.ped
"""
from pathlib import Path
import pandas as pd
import fire
def read_project_tracker(project_tracker_f):
"""
Reads project tracking excel file. Expects certain columns to be present.
"""
df = pd.read_excel(project_tracker_f, usecols=["CGDS ID", "Sex"])
......@@ -24,70 +31,51 @@ def read_project_tracker(project_tracker_f):
return sample_sex_dict
def nbbbb():
def main(project_path, outfile, tracking_sheet=False):
"""
Creates dummy pedigree file for the project requested
Args:
project_path (str): Project path. Script will look for samples under its subdirectory "analysis".
outfile (str): Output pedigree file path
tracking_sheet (str, optional): Path to the project tracking sheet in Excel format; used to look up each sample's sex. Defaults to False.
"""
project_path = Path("/data/project/worthey_lab/projects") / project_name / "analysis"
# get sample's sex info from project tracking sheet, if supplied
if tracking_sheet:
sample_sex_dict = read_project_tracker(tracking_sheet)
# get samples from cheaha for the project
project_path = Path(project_path) / "analysis"
samples = (
f.name for f in project_path.iterdir() if f.is_dir() and f.name.startswith(("LW", "UDN"))
)
header = ["#family_id", "sample_id", "paternal_id", "maternal_id", "sex", "phenotype"]
with open(Path(outpath) / f"{project_name}.ped", "w") as out_handle:
with open(outfile, "w") as out_handle:
out_handle.write("\t".join(header) + "\n")
for sample in sorted(samples):
data = ["unknown", sample, "-9", "-9", "-9", "-9"]
data = [
"unknown",
sample,
"-9", # father
"-9", # mother
sample_sex_dict[sample] if tracking_sheet else "-9", # sample sex
"-9", # affected
]
out_handle.write("\t".join(data) + "\n")
return None
def main(outpath):
project_dict = {
"CF_CFF_PFarrell": {
"tracking_sheet": "data/external/project_tracker/PROJECT TRACKING -CF.xlsx",
"affected": "all",
},
"CF_TLOAF_PFarrell": {
"tracking_sheet": "data/external/project_tracker/PROJECT TRACKING -CF.xlsx",
"affected": "all",
},
# "EDS3_unkn_DGreenspan",
# "MuscDyst_SU_MAlexander",
# "UDN_Phase1_EAWorthey",
}
for project_name in project_dict:
# get sample's sex info from project tracking sheet
sample_sex_dict = read_project_tracker(project_dict[project_name]["tracking_sheet"])
# get samples from cheaha for the project
project_path = Path("/data/project/worthey_lab/projects") / project_name / "analysis"
samples = (
f.name
for f in project_path.iterdir()
if f.is_dir() and f.name.startswith(("LW", "UDN"))
)
header = ["#family_id", "sample_id", "paternal_id", "maternal_id", "sex", "phenotype"]
with open(Path(outpath) / f"{project_name}.ped", "w") as out_handle:
out_handle.write("\t".join(header) + "\n")
for sample in sorted(samples):
data = [
"unknown",
sample,
"-9", # father
"-9", # mother
sample_sex_dict[sample], # sample sex
"1" if project_dict[project_name]["affected"] == "all" else "-9", # affected
]
out_handle.write("\t".join(data) + "\n")
return None
if __name__ == "__main__":
OUT_PATH = "data/raw/ped" # not so raw, is it?
main(OUT_PATH)
FIRE_MODE = True
# FIRE_MODE = False
if FIRE_MODE:
fire.Fire(main)
else:
PROJECT_PATH = "/data/project/worthey_lab/projects/CF_CFF_PFarrell/"
OUTFILE = "out.ped"
main(PROJECT_PATH, OUTFILE)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment