Commit cd053441 authored by Curtis Hendrickson's avatar Curtis Hendrickson
Browse files

version 1 - derived from cov2WuHan1_hg38_ref project

parents
# cov2Vero ref genome with STAR index
Combines, in this order
* SARS-CoV-2 strain WuHan-1 [NC_045512.2](https://www.ncbi.nlm.nih.gov/nuccore/NC_045512.2)
* Hg38 ([Human GRCh38/seqs_for_alignemtn_piplines.ucsc_ids](ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/))
and then indexes the combined genome with STAR (overhang=100), BWA,
For use with the [CCTS-Informatics-Pipelines](CCTS-Informatics-Pipelines)/[Snakemake-Eukaryotic-RNAseq-Pipeline](CCTS-Informatics-Pipelines/Snakemake-Eukaryotic-RNAseq-Pipeline)
Edit config.yaml of the Snakemake-Eukaryotic-RNAseq-Pipeline to set
```
# required for RNA-Seq analysis
index: "/data/project/public_datasets/ngs/genomes/cov2WuHan1_hg38/cov2WuHan1_hg38noAlt/STAR/STAR_2.5.3a-100"
annotation: "/data/project/public_datasets/ngs/genomes/cov2WuHan1_hg38/cov2WuHan1_hg38noAlt/cov2WuHan1_hg38noAlt.gtf"
# optional - for bwa alignment
bwa: "/data/project/public_datasets/ngs/genomes/cov2WuHan1_hg38/BWA/0.7.13/cov2WuHan1_hg38noAlt.fa"
# optional - for creating sample-specific genomes
cov2: "/data/project/public_datasets/ngs/genomes/cov2WuHan1_hg38/cov2WuHan1/NC_045512.2.fasta"
```
#
# SARS-CoV-2 genome
#
#
# to run
#
# module load snakemakeslurm/5.9.1-4
# snakemakeslurm --use-conda --use-singularity
#
from snakemake.remote.NCBI import RemoteProvider as NCBIRemoteProvider
my_NCBI_Email='@'.join([os.environ['USER'],'uab.edu'])
NCBI = NCBIRemoteProvider(email=my_NCBI_Email) # email required by NCBI to prevent abuse
cov2acc="NC_045512.2"
cov2abbrev="cov2WuHan1"
rule all:
input:
# SARS-CoV-2 download from Genbank and convert GFF3->GTF
cov2=expand("{strain}/{acc}.{ext}",strain=[cov2abbrev],acc=[cov2acc],ext=["fasta", "gff3", "gtf","rRNA_gene_list.txt","dict"])
# samtools faidx
,Idx=expand("{strain}/{genome}.{ext}",strain=[cov2abbrev],genome=[cov2acc],ext=["fasta.fai"]) #,"dict"])
# star index
,StarIDX=expand("{strain}/{genome}/STAR/STAR_2.5.3a-{overhang}/{index_file}",strain=[cov2abbrev],genome=[cov2acc],overhang=["100"],index_file=["SAindex"])
# bwa index
# MISSING - liamvdp did by hand
,BWAIdx=expand("{strain}/BWA/0.7.17/{genome}.{ext}",strain=[cov2abbrev],genome=[cov2acc],ext=["fasta.fai","fasta.bwt","fasta.sa"])#, "dict"])
ruleorder: AGAT_gff2gtf > AGAT_gff2ext_cleaner > gunzip
ruleorder: gtf_get_rRNA_gene_list > gunzip
rule gunzip:
wildcard_constraints: file=".+(?<!gz)" # not ends with gz
output: "x{dir}/{file}"
input: "{dir}/{file}.gz"
shell: "gunzip -c '{input}' > '{output}'"
#
# pull rRNA genes
#
rule gtf_get_rRNA_gene_list:
output: "{dir}/{genome}.rRNA_gene_list.txt"
input: "{dir}/{genome}.gtf"
shell:
"awk -e 'BEGIN{{FS=\"\\t\"}}($3=\"CDS\"){{print $9}}' {input} "
# grep will exit=1 if no lines match, so use awk
#" | grep rRNA "
" | awk -e '/rRNA/{{print}}' "
" | cut -d \; -f 1 "
" | uniq "
" | cut -d \\\" -f 2 "
# grep will exit=1 if no lines match, so use awk
#" | grep -v '^$' "
" | awk -e '($0!=\"\"){{print}}' "
" > {output}"
# ----------------------------------------------------------------------
#
# SARS-Cov-2
#
# PULL FROM GENBANK
#
# ----------------------------------------------------------------------
rule ncbi_remote_fasta:
wildcard_constraints: acc="[A-Z_]+[0-9]+\.[0-9]+"
input: NCBI.remote(["{acc}.fasta"], db="nuccore")
output: "{dir}/{acc}.fasta"
shell: "mv {input} {output}"
# no gff3 support for nuccore in NCBI.remote
# note: this will also fetch fastas, if changes report=gff3 to report=fasta
rule pull_genbank_gff3:
wildcard_constraints: acc="[A-Z_]+[0-9]+\.[0-9]+"
output: "{dir}/{acc}.gff3"
shell: "wget -O '{output}' 'https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&save=file&log$=seqview&db=nuccore&report=gff3&id={wildcards.acc}&extrafeat=null&conwithfeat=on&hide-cdd=on'"
#
# GFF3 -> GTF conversion (AGAT package)
#
rule AGAT_gff2gtf:
wildcard_constraints: file=".*", dir="cov2.*"
output: "{dir}/{file}.agat_raw.gtf"
input: "{dir}/{file}.gff3"
# not working; see: https://github.com/NBISweden/AGAT/issues/4
#conda: "envs/AGAT.yaml"
singularity: "docker://quay.io/biocontainers/agat:0.2.3--pl526r35_1"
# use "_sp_" (SLURP) verion instead of "_sq_" (sequential), as memory is not an issue
shell:
"agat_convert_sp_gff2gtf.pl --gff {input} "
"> {output}"
rule AGAT_gff2ext_cleaner:
wildcard_constraints: dir="cov2.*", file=".*"
output: "{dir}/{file}.gtf"
input: "{dir}/{file}.agat_raw.gtf"
shell:
"cat '{input}' "
# remove warning from AGAT about " Primary tag values (3rd column) not expected => region"
"| egrep -v '^-' "
# remove parser chatter
"| egrep -v '^(converting to GTF3|Bye Bye)' "
"> {output}"
# ----------------------------------------------------------------------
#
# STAR index
#
# ----------------------------------------------------------------------
rule STAR_index_genome_with_gtf:
output:
expand("{{strain}}/{{genome}}/STAR/STAR_2.5.3a-{{overhang}}/{index_file}",
index_file=["SAindex", "SA", "Genome", "sjdbInfo.txt"])
input:
fa="{strain}/{genome}.fasta",
gtf="{strain}/{genome}.gtf"
# MUST match that from CCTS-Informatics-Pipelines / Snakemake-Eukaryotic-RNAseq-Pipeline
# which comes from a plug-in. We grabbed the yaml file from .snakemake/conda/8a33389f.yaml after a run.
conda: "envs/STAR_2.5.3a.yaml"
threads: 12
log:
stdout="{strain}/{genome}/STAR/STAR_2.5.3a-{overhang}/log_{overhang}_stdout.txt"
, stderr="{strain}/{genome}/STAR/STAR_2.5.3a-{overhang}/log_{overhang}_stderr.txt"
shell:
"STAR "
" --runThreadN {threads} "
" --runMode genomeGenerate "
" --outFileNamePrefix $(dirname {output[0]}) "
" --genomeDir $(dirname {output[0]}) "
" --genomeFastaFiles {input.fa} "
" --sjdbGTFfile {input.gtf} "
" --sjdbOverhang {wildcards.overhang}"
" > {log.stdout} 2> {log.stderr} "
# ----------------------------------------------------------------------
#
# STAR index
#
# ----------------------------------------------------------------------
rule BWA_index_genome:
wildcard_constraints: ext="fasta"
output:
expand("{{strain}}/BWA/{{bwa_ver}}/{{genome}}.{{ext}}.{index_file}",
index_file=["amb", "pac", "ann", "sa","bwt"])
input:
fa="{strain}/{genome}.{ext}",
conda: "envs/bwa.{bwa_ver}.yaml"
shell:
"mkdir -p $(dirname {output[1]}) "
"&& cd $(dirname {output[1]}) "
"&& ln -fs ../../$(basename {input.fa}) . "
"&& bwa index $(basename {input.fa}) "
# ----------------------------------------------------------------------
#
# SAMtools FAI index
#
# ----------------------------------------------------------------------
rule SAMtools_index_fasta:
wildcard_constraints: ext="(fa|fasta|fna)"
output: "{file}.{ext}.fai"
input: "{file}.{ext}"
conda: "envs/samtools.yaml"
shell: "samtools faidx {input}"
# ----------------------------------------------------------------------
#
# Picard/GATK4 .dict index
#
# How to merge these two rules so input can be .fasta or .fna w/o dup rule?
# ----------------------------------------------------------------------
rule gatk4_fasta_dict:
#wildcard_constraints: ext="(fa|fasta|fna)"
output: "{dir}/{file}.dict"
input: fasta="{dir}/{file}.fasta"
conda: "envs/gatk4.yaml"
shell:
"JAVA_XMX= "
" && "
"if [ ! -z ${{SLURM_MEM_PER_CPU+x}} ]; then JAVA_XMX=-Xmx$(( $SLURM_MEM_PER_CPU / 1000 * $SLURM_CPUS_ON_NODE ))g; fi "
" && "
#"java $JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp -jar $EBROOTGATK/GenomeAnalysisTK.jar "
' gatk --java-options "$JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp" '
" CreateSequenceDictionary "
" -R {input.fasta} "
" -O {output} "
rule gatk4_fna_dict:
#wildcard_constraints: ext="(fa|fasta|fna)"
output: "{dir}/{file}.dict"
input: fasta="{dir}/{file}.fna"
conda: "envs/gatk4.yaml"
shell:
"JAVA_XMX= "
" && "
"if [ ! -z ${{SLURM_MEM_PER_CPU+x}} ]; then JAVA_XMX=-Xmx$(( $SLURM_MEM_PER_CPU / 1000 * $SLURM_CPUS_ON_NODE ))g; fi "
" && "
#"java $JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp -jar $EBROOTGATK/GenomeAnalysisTK.jar "
' gatk --java-options "$JAVA_XMX -Djava.io.tmpdir=tmp -Djava.io.tmpdir=tmp" '
" CreateSequenceDictionary "
" -R {input.fasta} "
" -O {output} "
{
"#documenation": {
"#A": "----------------------------------------------------------------------------------",
"#raw_cmdline": "Use with snakemake --cluster-config cluster.slurm.cheaha.json --cluster 'sbatch --job-name {cluster.job-name} --ntasks {cluster.ntasks} --cpus-per-task {threads} --mem-per-cpu {cluster.mem-per-cpu} --partition {cluster.partition} --time {cluster.time} --mail-user {cluster.mail-user} --mail-type {cluster.mail-type} --error {cluster.error} --output {cluster.output}'",
"#cmdline": "Or use with module ngs-ccts/snakemakeslurm which builds the raw commandline for you.",
"#slurm partitions": "See https://docs.uabgrid.uab.edu/wiki/Slurm#Slurm_Partitions",
"#Z": "----------------------------------------------------------------------------------"
},
"#__default__": {
"# documentation": "# This is the default SLURM job config, except threads which are annotated directly on the rule",
"#job-name": "SM.{rule}.{wildcards}",
"# partition": "# run sinfo to list available partitions and their run-time limitations",
"#partition": "express",
"# time": "# Runtime in D-HH:MM:SS",
"#time": "02:00:00",
"# ntasks": "# number of SLURM tasks. We assume that programs are SMP, so all cores must be on the same node, and thus ask for only 1 task",
"#ntasks": 1,
"# cpus": "# number of threads/cpus comes from the 'threads' attribute on the individual rules in the Snakefile",
"#mem-per-cpu-mb": 2000,
"# output/error": "# output filenames for stdout and stderr",
"# filename_pattern": "%j=jobid, %N=short_hostname, %x=job_name; see https://slurm.schedmd.com/sbatch.html#lbAH",
"#output": "logs/%j.%N.%x.out.txt",
"#error": "logs/%j.%N.%x.err.txt",
"# mail-user": "USER@uab.edu # this is set in snakemakeslurm, as there is no SLURM filename pattern for USER name",
"#mail-type": "ALL"
},
"#Template Rules": {
"#A": "----------------------------------------------------------------------------------",
"# Additional Rules": "If the name of the section matches the name of the rule, then the values in the section override the __default__ settings. For items not listed",
"#Z": "----------------------------------------------------------------------------------"
},
"STAR_index_genome_with_gtf": {
"partition": "long",
"time": "150:00:00",
"mem-per-cpu-mb": "20000"
},
"SAMtools_index_fasta": {
"mem-per-cpu-mb": "20000"
},
"BWA_index_genome": {
"partition": "long",
"time": "150:00:00",
"mem-per-cpu-mb": "40000"
}
}
channels:
- bioconda
- conda-forge
- defaults
dependencies:
- star ==2.5.3a
name: bwa
channels:
- bioconda
- defaults
dependencies:
- bwa=0.7.17
name: bwa
channels:
- bioconda
- defaults
dependencies:
- bwa
name: gatk4
channels:
- bioconda
- defaults
dependencies:
- gatk4
channels:
- bioconda
- defaults
dependencies:
- samtools
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment