Skip to content
Snippets Groups Projects
Commit 614a5931 authored by Manavalan Gajapathy's avatar Manavalan Gajapathy
Browse files

User configurable hardware resources

parent fdbbd797
No related branches found
No related tags found
1 merge request!4User configurable hardware resources
......@@ -35,3 +35,8 @@ YYYY-MM-DD John Doe
* Bugfix: Fixes error when there is only one sample in input ped file (#34)
* Adds system-testing for such only-one-sample-in-input setup (#35).
2022-04-07 Manavalan Gajapathy
* Previously hardcoded hardware resources for snakemake rules can now be supplied via `configs/workflow.yaml` (closes #48)
* Modified multiqc conda env config to use explicit dependencies to get around installation issues (closes #47)
\ No newline at end of file
......@@ -185,18 +185,24 @@ snakemake rules.
### Set up workflow config file
QuaC requires a workflow config file in yaml format (`configs/workflow.yaml`), which provides filepaths to necessary
dependencies required by certain QC tools. Their format should look like:
QuaC requires a workflow config file in yaml format ([`configs/workflow.yaml`](./configs/workflow.yaml)), which provides filepaths to necessary
dataset dependencies required by certain QC tools. In addition, hardware resources can be configured (refer to [`configs/workflow.yaml`](./configs/workflow.yaml) for more info). The file format should look like:
```yaml
ref: "path to ref genome path"
somalier:
sites: "path to somalier's site file"
labels_1kg: "path to somalier's ancestry-labels-1kg file"
somalier_1kg: "dirpath to somalier's 1kg-somalier files"
verifyBamID:
svd_dat_wgs: "path to WGS resources .dat files"
svd_dat_exome: "path to exome resources .dat files"
datasets:
ref: "path to ref genome"
somalier:
sites: "path to somalier's site file"
labels_1kg: "path to somalier's ancestry-labels-1kg file"
somalier_1kg: "dirpath to somalier's 1kg-somalier files"
verifyBamID:
svd_dat_wgs: "path to WGS resources .dat files"
svd_dat_exome: "path to exome resources .dat files"
#### hardware resources ####
resources:
...
...
```
#### Prepare verifybamid datasets for exome analysis
......
......@@ -20,4 +20,4 @@
"multiqc_aggregation_all_samples": {
"mem-per-cpu": "24G"
}
}
}
\ No newline at end of file
channels:
- conda-forge
- anaconda
- bioconda
- conda-forge
- defaults
dependencies:
- python =3.6
- multiqc=1.9
- python=3.6.13
- multiqc==1.9
- networkx=2.5
- numpy=1.19.5
- _libgcc_mutex=0.1
- _openmp_mutex=4.5
- brotlipy=0.7.0
- ca-certificates=2021.5.30
- certifi=2021.5.30
- cffi=1.14.6
- chardet=4.0.0
- charset-normalizer=2.0.0
- click=8.0.1
- coloredlogs=15.0.1
- colormath=3.0.0
- cryptography=3.4.7
- cycler=0.10.0
- decorator=5.0.9
- freetype=2.10.4
- future=0.18.2
- humanfriendly=9.2
- idna=3.1
- importlib-metadata=4.6.3
- jbig=2.1
- jinja2=3.0.1
- jpeg=9d
- kiwisolver=1.3.1
- lcms2=2.12
- ld_impl_linux-64=2.36.1
- lerc=2.2.1
- libblas=3.9.0
- libcblas=3.9.0
- libdeflate=1.7
- libffi=3.3
- libgcc-ng=11.1.0
- libgfortran-ng=11.1.0
- libgfortran5=11.1.0
- libgomp=11.1.0
- liblapack=3.9.0
- libopenblas=0.3.17
- libpng=1.6.37
- libstdcxx-ng=11.1.0
- libtiff=4.3.0
- libwebp-base=1.2.0
- lz4-c=1.9.3
- lzstring=1.0.4
- markdown=3.3.4
- markupsafe=2.0.1
- matplotlib-base=3.3.4
- ncurses=6.2
- olefile=0.46
- openjpeg=2.4.0
- openssl=1.1.1k
- pillow=8.3.1
- pip=21.2.3
- pycparser=2.20
- pyopenssl=20.0.1
- pyparsing=2.4.7
- pysocks=1.7.1
- python-dateutil=2.8.2
- python_abi=3.6
- pyyaml=5.4.1
- readline=8.1
- requests=2.26.0
- setuptools=49.6.0
- simplejson=3.8.1
- six=1.16.0
- spectra=0.0.11
- sqlite=3.36.0
- tk=8.6.10
- tornado=6.1
- typing_extensions=3.10.0.0
- urllib3=1.26.6
- wheel=0.37.0
- xz=5.2.5
- yaml=0.2.5
- zipp=3.5.0
- zlib=1.2.11
- zstd=1.5.0
ref: "/data/project/worthey_lab/datasets_central/human_reference_genome/processed/GRCh38/no_alt_rel20190408/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
somalier:
sites: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/sites/sites.hg38.vcf.gz"
labels_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/ancestry-labels-1kg.tsv"
somalier_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/1kg-somalier/"
verifyBamID:
svd_dat_wgs: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/wgs/1000g.phase3.100k.b38.vcf.gz.dat"
svd_dat_exome: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/exome/chr_added/1000g.phase3.10k.b38.exome.vcf.gz.dat"
datasets:
ref: "/data/project/worthey_lab/datasets_central/human_reference_genome/processed/GRCh38/no_alt_rel20190408/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
somalier:
sites: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/sites/sites.hg38.vcf.gz"
labels_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/ancestry-labels-1kg.tsv"
somalier_1kg: "/data/project/worthey_lab/manual_datasets_central/somalier/0.2.13/ancestry/1kg-somalier/"
verifyBamID:
svd_dat_wgs: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/wgs/1000g.phase3.100k.b38.vcf.gz.dat"
svd_dat_exome: "/data/project/worthey_lab/manual_datasets_central/verifyBamID/2.0.1/resource/exome/chr_added/1000g.phase3.10k.b38.exome.vcf.gz.dat"
#### hardware resources ####
resources:
qualimap_bamqc:
no_cpu: 2
mem_per_cpu: "24G"
mosdepth_coverage:
no_cpu: 4
verifybamid:
no_cpu: 4
......@@ -43,17 +43,17 @@ def read_workflow_config(workflow_config_fpath):
data = yaml.safe_load(fh)
mount_paths = set()
datasets = data["datasets"]
# ref genome
mount_paths.add(Path(data["ref"]).parent)
mount_paths.add(Path(datasets["ref"]).parent)
# somalier resource files
for resource in data["somalier"]:
mount_paths.add(Path(data["somalier"][resource]).parent)
for resource in datasets["somalier"]:
mount_paths.add(Path(datasets["somalier"][resource]).parent)
# verifyBamID resource files
for resource in data["verifyBamID"]:
mount_paths.add(Path(data["verifyBamID"][resource]).parent)
for resource in datasets["verifyBamID"]:
mount_paths.add(Path(datasets["verifyBamID"][resource]).parent)
return mount_paths
......
......@@ -50,6 +50,9 @@ rule multiqc_by_sample_initial_pass:
# multiqc uses fastq's filenames to identify sample names. Rename them to in-house names,
# using custom rename config file
extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}",
conda:
### see issue #47 on why a local conda env is used to sidestep the snakemake-wrapper's own conda env ###
str(WORKFLOW_PATH / "configs/env/multiqc.yaml")
wrapper:
"0.64.0/bio/multiqc"
......@@ -133,10 +136,14 @@ rule multiqc_by_sample_final_pass:
# multiqc uses fastq's filenames to identify sample names. Rename them to in-house names,
# using custom rename config file
extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}",
conda:
### see issue #47 on why a local conda env is used to sidestep the snakemake-wrapper's own conda env ###
str(WORKFLOW_PATH / "configs/env/multiqc.yaml")
wrapper:
"0.64.0/bio/multiqc"
########################## Multi-sample QC aggregation ##########################
localrules:
aggregate_sample_rename_configs,
......@@ -192,5 +199,8 @@ rule multiqc_aggregation_all_samples:
--sample-names {input.rename_config} \
--cl_config "max_table_rows: 2000"'
),
conda:
### see issue #47 on why a local conda env is used to sidestep the snakemake-wrapper's own conda env ###
str(WORKFLOW_PATH / "configs/env/multiqc.yaml")
wrapper:
"0.64.0/bio/multiqc"
......@@ -24,11 +24,11 @@ rule qualimap_bamqc:
"stats bam using qualimap. Sample: {wildcards.sample}"
conda:
str(WORKFLOW_PATH / "configs/env/qualimap.yaml")
threads: 2
threads: config["resources"]["qualimap_bamqc"]["no_cpu"]
params:
outdir=lambda wildcards, output: str(Path(output["html_report"]).parent),
capture_bed=lambda wildcards, input: f"--feature-file {input.target_regions}" if input.target_regions else "",
java_mem="24G",
java_mem=config["resources"]["qualimap_bamqc"]["mem_per_cpu"],
shell:
r"""
unset DISPLAY
......@@ -49,7 +49,7 @@ rule picard_collect_multiple_metrics:
input:
bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam",
index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai",
ref=config["ref"],
ref=config["datasets"]["ref"],
output:
multiext(
str(OUT_DIR / "{sample}" / "qc" / "picard-stats" / "{sample}"),
......@@ -68,7 +68,7 @@ rule picard_collect_wgs_metrics:
input:
bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam",
index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai",
ref=config["ref"],
ref=config["datasets"]["ref"],
output:
OUT_DIR / "{sample}" / "qc" / "picard-stats" / "{sample}.collect_wgs_metrics",
message:
......@@ -97,7 +97,7 @@ rule mosdepth_coverage:
"Running mosdepth for coverage. Sample: {wildcards.sample}"
conda:
str(WORKFLOW_PATH / "configs/env/mosdepth.yaml")
threads: 4
threads: config["resources"]["mosdepth_coverage"]["no_cpu"]
params:
out_prefix=lambda wildcards, output: output["summary"].replace(".mosdepth.summary.txt", ""),
capture_bed=lambda wildcards, input: f"--by {input.target_regions}" if input.target_regions else "",
......
......@@ -2,8 +2,8 @@ rule somalier_extract:
input:
bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam",
bam_index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai",
sites=config["somalier"]["sites"],
ref_genome=config["ref"],
sites=config["datasets"]["somalier"]["sites"],
ref_genome=config["datasets"]["ref"],
output:
protected(OUT_DIR / "project_level_qc" / "somalier" / "extract" / "{sample}.somalier"),
message:
......@@ -55,8 +55,8 @@ rule somalier_relate:
rule somalier_ancestry:
input:
extracted=expand(OUT_DIR / "project_level_qc" / "somalier" / "extract" / "{sample}.somalier", sample=SAMPLES),
labels_1kg=config["somalier"]["labels_1kg"],
somalier_1kg_directory=config["somalier"]["somalier_1kg"],
labels_1kg=config["datasets"]["somalier"]["labels_1kg"],
somalier_1kg_directory=config["datasets"]["somalier"]["somalier_1kg"],
output:
out=protected(
expand(
......
def get_svd(wildcards):
if EXOME_MODE:
return expand(f"{config['verifyBamID']['svd_dat_exome']}.{{ext}}", ext=["bed", "mu", "UD"])
return expand(f"{config['datasets']['verifyBamID']['svd_dat_exome']}.{{ext}}", ext=["bed", "mu", "UD"])
else:
return expand(f"{config['verifyBamID']['svd_dat_wgs']}.{{ext}}", ext=["bed", "mu", "UD"])
return expand(f"{config['datasets']['verifyBamID']['svd_dat_wgs']}.{{ext}}", ext=["bed", "mu", "UD"])
rule verifybamid:
input:
bam=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam",
bam_index=PROJECT_PATH / "{sample}" / "bam" / "{sample}.bam.bai",
ref_genome=config["ref"],
ref_genome=config["datasets"]["ref"],
svd=get_svd,
output:
ancestry=protected(OUT_DIR / "{sample}" / "qc" / "verifyBamID" / "{sample}.Ancestry"),
......@@ -22,7 +22,7 @@ rule verifybamid:
svd_prefix=lambda wildcards, input: input["svd"][0].replace(Path(input["svd"][0]).suffix, ""),
out_prefix=lambda wildcards, output: output["ancestry"].replace(".Ancestry", ""),
sanity_check="--DisableSanityCheck" if is_testing_mode() else "",
threads: 4
threads: config["resources"]["verifybamid"]["no_cpu"]
shell:
r"""
verifybamid2 {params.sanity_check} \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment