# aggregate_results.smk
##########################   Single-sample-level QC aggregation  ##########################
rule multiqc_by_sample_initial_pass:
    # First-pass, per-sample MultiQC aggregation.
    #
    # Collects the per-sample QC tool outputs listed under `input` and runs the
    # MultiQC wrapper on them. Besides the HTML report, three MultiQC data tables
    # (general stats, trimmed FastQC, FastQ Screen) are declared as outputs; the
    # same paths are the inputs of the QC checkup rule below.
    input:
        get_small_var_pipeline_targets,
        OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt",
        OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html",
        OUT_DIR / "{sample}" / "qc" / "mosdepth" / "{sample}.mosdepth.global.dist.txt",
        OUT_DIR / "{sample}" / "qc" / "verifyBamID" / "{sample}.Ancestry",
        OUT_DIR / "{sample}" / "qc" / "bcftools-stats" / "{sample}.bcftools.stats",
        multiqc_config="configs/multiqc_config.yaml",
        rename_config=PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv",
    output:
        protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc.html"),
        protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_general_stats.txt"),
        protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastqc_trimmed.txt"),
        protected(OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastq_screen.txt"),
    # WARNING: don't put this rule in a group, bad things will happen. see issue #23 in gitlab (small var caller pipeline repo)
    message:
        "Aggregates QC results using multiqc. First pass. Output will be used for the internal QC checkup. Sample: {wildcards.sample}"
    params:
        # multiqc uses fastq's filenames to identify sample names. Rename them to in-house names,
        # using custom rename config file
        extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}",
    wrapper:
        "0.64.0/bio/multiqc"


rule qc_checkup:
    # Runs the custom QC checkup script on one sample's QC outputs.
    #
    # Feeds the first-pass MultiQC tables and Qualimap's genome_results.txt to
    # src/qc_checkup/qc_checkup.py together with the threshold config, and
    # expects one `qc_checkup_<suffix>.yaml` per suffix listed under `output`.
    #
    # NOTE(review): the `rule ...:` header, the `input:`/`output:` keywords and
    # the closing parentheses of `protected(expand(...))` were absent from the
    # copy of this file under review and have been reconstructed. The rule name
    # `qc_checkup` is inferred from the output paths -- confirm against the
    # repository, and check that no input/output entries were lost.
    input:
        qc_config="configs/qc_checkup/qc_checkup_config.yaml",
        multiqc_stats=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_general_stats.txt",
        fastqc_trimmed=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastqc_trimmed.txt",
        fastq_screen=OUT_DIR / "{sample}" / "qc" / "multiqc_initial_pass" / "{sample}_multiqc_data" / "multiqc_fastq_screen.txt",
        qualimap=OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "genome_results.txt",
    output:
        # `{{sample}}` is double-braced so that expand() leaves it as the
        # `{sample}` wildcard and only substitutes `{suffix}`.
        protected(expand(
            OUT_DIR / "{{sample}}" / "qc" / "qc_checkup" / "qc_checkup_{suffix}.yaml",
            suffix=["overall_summary", "fastqc", "fastq_screen", "qualimap_overall", "qualimap_chromosome_stats"],
        )),
    # WARNING: don't put this rule in a group, bad things will happen. see issue #23 in gitlab
    message:
        "Runs QC checkup on various QC tool output, based on custom defined QC thresholds. "
        "Note that this will NOT work as expected for multi-sample analysis."
    params:
        sample="{sample}",
        # All outputs share one directory; pass the parent of the first output to the script.
        outdir=lambda wildcards, output: str(Path(output[0]).parent),
    conda:
        str(WORKFLOW_PATH / "configs/env/qc_checkup.yaml")
    shell:
        r"""
        python src/qc_checkup/qc_checkup.py \
            --config {input.qc_config} \
            --multiqc_stats {input.multiqc_stats} \
            --fastqc {input.fastqc_trimmed} \
            --fastq_screen {input.fastq_screen} \
            --qualimap {input.qualimap} \
            --sample {params.sample} \
            --outdir {params.outdir}
        """


rule multiqc_by_sample_final_pass:
    # Final-pass, per-sample MultiQC aggregation.
    #
    # Takes the same per-sample QC tool outputs as the initial pass, plus the
    # QC checkup overall summary, so that the checkup results are included in
    # the report written under `multiqc_final_pass`.
    #
    # NOTE(review): the `input:` and `output:` keywords were absent from the
    # copy of this file under review and have been restored here; confirm
    # against the repository that no entries were lost alongside them.
    input:
        get_small_var_pipeline_targets,
        OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt",
        OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html",
        OUT_DIR / "{sample}" / "qc" / "mosdepth" / "{sample}.mosdepth.global.dist.txt",
        OUT_DIR / "{sample}" / "qc" / "verifyBamID" / "{sample}.Ancestry",
        OUT_DIR / "{sample}" / "qc" / "bcftools-stats" / "{sample}.bcftools.stats",
        OUT_DIR / "{sample}" / "qc" / "qc_checkup" / "qc_checkup_overall_summary.yaml",
        multiqc_config="configs/multiqc_config.yaml",
        rename_config=PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv",
        # Not referenced by `params` below; presumably listed so the rule
        # depends on the QC threshold config -- TODO confirm.
        qc_config="configs/qc_checkup/qc_checkup_config.yaml",
    output:
        protected(OUT_DIR / "{sample}" / "qc" / "multiqc_final_pass" / "{sample}_multiqc.html"),
        protected(OUT_DIR / "{sample}" / "qc" / "multiqc_final_pass" / "{sample}_multiqc_data" / "multiqc_general_stats.txt"),
    # WARNING: don't put this rule in a group, bad things will happen. see issue #23 in gitlab
    message:
        "Aggregates QC results using multiqc. Final pass, where QC checkup results are also aggregated"
    params:
        # multiqc uses fastq's filenames to identify sample names. Rename them to in-house names,
        # using custom rename config file
        extra=lambda wildcards, input: f"--config {input.multiqc_config} --sample-names {input.rename_config}",
    wrapper:
        "0.64.0/bio/multiqc"
##########################   Multi-sample QC aggregation  ##########################
# Rules named here are executed locally rather than submitted as cluster jobs.
localrules: aggregate_sample_rename_configs


rule aggregate_sample_rename_configs:
    # Combines the per-sample MultiQC rename-config TSVs (one per entry in
    # SAMPLES) into a single project-level file. That file is the
    # `rename_config` input of the all-samples MultiQC rule.
    #
    # NOTE(review): the `output:` keyword was absent from the copy of this file
    # under review. Without it the protected() target is parsed as an input and
    # `output[0]` in the `run:` body has nothing to index, so it is restored here.
    input:
        expand(PROJECT_PATH / "{sample}" / "qc" / "multiqc_initial_pass" / "multiqc_sample_rename_config" / "{sample}_rename_config.tsv", sample=SAMPLES),
    output:
        protected(OUT_DIR / "project_level_qc" / "multiqc" / "aggregated_rename_configs.tsv"),
    message:
        "Aggregate all sample rename-config files"
    run:
        # aggregate_rename_configs is defined elsewhere in the workflow; it is
        # handed every input file and the single output path.
        aggregate_rename_configs(input, output[0])


rule multiqc_aggregation_all_samples:
    input:
        expand(
            [
                PROJECT_PATH / "{sample}" / "qc" / "fastqc-raw" / "{sample}-{unit}-{read}_fastqc.zip",
                PROJECT_PATH / "{sample}" / "qc" / "fastqc-trimmed" / "{sample}-{unit}-{read}_fastqc.zip",
                PROJECT_PATH / "{sample}" / "qc" / "fastq_screen-trimmed" / "{sample}-{unit}-{read}_screen.txt",
                PROJECT_PATH / "{sample}" / "qc" / "dedup" / "{sample}-{unit}.metrics.txt",
                OUT_DIR / "project_level_qc" / "somalier" / "relatedness" / "somalier.html",
                OUT_DIR / "project_level_qc" / "somalier" / "ancestry" / "somalier.somalier-ancestry.html",
                OUT_DIR / "{sample}" / "qc" / "samtools-stats" / "{sample}.txt",
                OUT_DIR / "{sample}" / "qc" / "qualimap" / "{sample}" / "qualimapReport.html",
                OUT_DIR / "{sample}" / "qc" / "mosdepth" / "{sample}.mosdepth.global.dist.txt",
                OUT_DIR / "{sample}" / "qc" / "verifyBamID" / "{sample}.Ancestry",
                OUT_DIR / "{sample}" / "qc" / "bcftools-stats" / "{sample}.bcftools.stats",
                OUT_DIR / "{sample}" / "qc" / "qc_checkup" / "qc_checkup_overall_summary.yaml",
            ],
            sample=SAMPLES,
            read=["R1", "R2"],
        ),
        multiqc_config="configs/multiqc_config.yaml",
        rename_config=OUT_DIR / "project_level_qc" / "multiqc" / "aggregated_rename_configs.tsv",
        protected(OUT_DIR / "project_level_qc" / "multiqc" / "multiqc_report.html"),
    message:
        "Running multiqc for all samples"
    params:
        # multiqc uses fastq's filenames to identify sample names. Rename them to in-house names,
        # using custom rename config file
        extra=(
            lambda wildcards, input: f'--config {input.multiqc_config} \
Manavalan Gajapathy's avatar
Manavalan Gajapathy committed
                            --sample-names {input.rename_config} \
                            --cl_config "max_table_rows: 2000"'