diff --git a/MIRA.sh b/MIRA.sh index 43a00f1..2f2bb94 100755 --- a/MIRA.sh +++ b/MIRA.sh @@ -3,7 +3,7 @@ usage() { echo -e "Usage in Spyne container: \n docker exec spyne bash $0 -s {path to samplesheet.csv starting with /data/} -r -e \n Usage in git cloned CLI: \n bash $0 -s {path to samplesheet.csv } -r -e \n \ -\nExperiment type options: Flu-ONT, SC2-Spike-Only-ONT, Flu_Illumina, SC2-Whole-Genome-ONT, SC2-Whole-Genome-Illumina, RSV-illumina, RSV-ONT \n\ +\nExperiment type options: Flu-ONT, SC2-Spike-Only-ONT, Flu_Illumina, SC2-Whole-Genome-ONT, SC2-Whole-Genome-Illumina, RSV-illumina, RSV-ONT, SC2-Spike-Only-Illumina \n\ \nPrimer Schema options for SC2: articv3, articv4, articv4.1, articv5.3.2, qiagen, swift, swift_211206\n\ Primer Schema options for RSV: RSV_CDC_8amplicon_230901, dong_et_al, davina_nunez_wgs" 1>&2; exit 1;} diff --git a/scripts/config_create.py b/scripts/config_create.py index 7503168..bebd612 100755 --- a/scripts/config_create.py +++ b/scripts/config_create.py @@ -10,7 +10,7 @@ parser = argparse.ArgumentParser() parser.add_argument("-s","--samplesheet", help="Samplesheet with sample names") parser.add_argument("-r", "--runid", help="Full path to data directory containing either a fastq_pass subdirectory for ONT data or fastq subdirectory for Illumina") -parser.add_argument("-e", "--experiment_type", help="Experiment type options: Flu-ONT, SC2-Spike-Only-ONT, Flu_Illumina, SC2-Whole-Genome-ONT, SC2-Whole-Genome-Illumina" ) +parser.add_argument("-e", "--experiment_type", help="Experiment type options: Flu-ONT, SC2-Spike-Only-ONT, Flu_Illumina, SC2-Whole-Genome-ONT, SC2-Whole-Genome-Illumina, SC2-Spike-Only-Illumina" ) parser.add_argument("-p", "--primer_schema", required=False, help="For whole-genome SARS-CoV-2 Illumina data, which primer schema was used?") parser.add_argument("-c", "--cleanup", required=False, help="option for data cleanup, CLEANUP-FOOTPRINT, other options for development and testing") parser.add_argument("-m", "--mira", action='store_true', required=False, help="Command-line MIRA called from MIRA.sh bash script") @@ -137,6 +137,8 @@ def reverse_complement(seq): else: if "flu" in experiment_type.lower(): snakefile_path += "illumina_influenza_snakefile" + elif "spike" in experiment_type.lower(): + snakefile_path += "illumina_sc2_spike_snakefile" elif "sc2" in experiment_type.lower(): snakefile_path += "illumina_sc2_snakefile" elif "rsv" in experiment_type.lower(): diff --git a/workflow/illumina_sc2_spike_snakefile b/workflow/illumina_sc2_spike_snakefile new file mode 100644 index 0000000..00147f6 --- /dev/null +++ b/workflow/illumina_sc2_spike_snakefile @@ -0,0 +1,274 @@ +from os.path import basename, realpath, dirname, abspath +import sys +import glob +import os +import gzip + +shell.executable("bash") + +workdir: "." +configfile: "config.yaml" + +config["samples"] = {str(k): v for k, v in config["samples"].items()} +config["samples"] = OrderedDict([(key, config["samples"][key]) for key in config["samples"] ]) + +try: + cli = bool(config["cli"]) +except: + cli = False + + +def memory_for_bbduk(): + # Try to detect how much total memory is available on host (or what was + # alloted to Docker) and assign 50% of that to bbduk. Assure that number is + # not negative and if all this fails, make it a tiny 30 megabytes. + try: + return abs(int(int(os.popen('free -t -m').readlines()[-1].split()[-1])*0.5)) + except: + return 30 + +def find_chemistry(fastq, runid): + global irma_custom + global subsample + try: + with open(fastq) as infi: + contents = infi.readlines() + except: + with gzip.open(fastq) as infi: + contents = infi.readlines() + try: + if len(contents[1]) >= 80: + config_path = workflow.basedir + "/irma_config/SC2-spike-illumina.sh" + irma_custom = [f"cp {config_path} IRMA/ &&",f"--external-config /data/{runid}/IRMA/SC2-spike-illumina.sh"] + subsample = "50000" + else: + config_path = workflow.basedir + "/irma_config/SC2-spike-illumina-2x75.sh" + irma_custom = [f"cp {config_path} IRMA/ &&",f"--external-config /data/{runid}/IRMA/SC2-spike-illumina-2x75.sh"] + subsample = "100000" + except: + config_path = workflow.basedir + "/irma_config/SC2-spike-illumina.sh" + irma_custom = [f"cp {config_path} IRMA/ &&",f"--external-config /data/{runid}/IRMA/SC2-spike-illumina.sh"] + subsample = "50000" + return irma_custom, subsample + +rule all: + input: + expand("IRMA/SC2_{sample}.fin", sample=config["samples"].keys()) + shell: + "touch IRMA/spyne.fin" + + +rule subsample: + input: + R1_fastq = lambda wildcards: config["samples"][wildcards.sample]["R1_fastq"], + R2_fastq = lambda wildcards: config["samples"][wildcards.sample]["R2_fastq"] + output: + O1 = "IRMA/{sample}_subsampled_R1.fastq", + O2 = "IRMA/{sample}_subsampled_R2.fastq" + log: + out = "logs/{sample}.reformat.stdout.log", + err = "logs/{sample}.reformat.stderr.log" + group: + "trim-map" + params: + target = lambda wildcards: find_chemistry(config["samples"][wildcards.sample]["R1_fastq"], config["runid"])[1] + message: "Step 1 - subsampling cleaned up reads if excess > {params.target} exist" + shell: + "reformat.sh" + " in1={input.R1_fastq}" + " in2={input.R2_fastq}" + " out1={output.O1}" + " out2={output.O2}" + " samplereadstarget={params.target}" + " tossbrokenreads" + " 1> {log.out}" + " 2> {log.err}" + +primers = {'swift': {'bedpe':'{}/primers/SNAP_v2_amplicon_panel.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/SNAP_v2_amplicon_panel.fasta'.format(workflow.basedir)}, + 'articv3': {'bedpe':'{}/primers/artic_v3.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/artic_v3.fasta'.format(workflow.basedir)}, + 'sgene_v1': {'bedpe':'{}/primers/sgene_v1.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/sgene_v1.fasta'.format(workflow.basedir)}, + 'articv4': {'bedpe':'{}/primers/artic_v4_IRMAref.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/artic_v4.fasta'.format(workflow.basedir)}, + 'articv4.1': {'bedpe':'{}/primers/artic_v4.1_NC.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/artic_v4.1.fasta'.format(workflow.basedir)}, + 'varskip': {'bedpe':'{}/primers/neb_vss1a.primer.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/neb_vss1a.primer.fasta'.format(workflow.basedir)}, + 'qiagen': {'bedpe':'{}/primers/QIAseqDIRECTSARSCoV2primersfinal.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/QIAseqDIRECTSARSCoV2primersfinal.fasta'.format(workflow.basedir)}, + 'swift_211206': {'bedpe':'{}/primers/swift_211206.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/swift_211206.fasta'.format(workflow.basedir)}, + 'articv5.3.2' : {'bedpe':'{}/primers/artic_v5.3.2_IRMAref.bedpe'.format(workflow.basedir), + 'fasta':'{}/primers/artic_v5.3.2.fasta'.format(workflow.basedir)}, + 's_gene' : {'bedpe':'', + 'fasta':'{}/primers/sgene.fasta'.format(workflow.basedir)} + } + +rule bbduk_r1: + input: + r1 = rules.subsample.output.O1, + r2 = rules.subsample.output.O2 + output: + O1 = 'IRMA/{sample}_r1_trim.fastq', + O2 = 'IRMA/{sample}_r2_trim.fastq' + params: + bbdukParams1 = lambda wildcards: ' '.join(['ktrim=l', + 'mm=f', + 'hdist=1', + 'rcomp=t', + 'ordered=t', + 'ref='+primers["s_gene"]["fasta"], + 'k=19', + 'restrictleft=45']), + #ref = primers[s_gene[fasta]], #revisit + mem = memory_for_bbduk() + log: + out = "logs/{sample}.bbduk.trim_left.stdout.log", + err = "logs/{sample}.bbduk.trim_left.stderr.log" + threads: 16 + benchmark: + "logs/benchmarks/bbduk_r1_{sample}.log" + shell: + "bbduk.sh" + " -Xmx{params.mem}m" + " -Xms{params.mem}m" + " in1={input.r1} in2={input.r2}" + " out1={output.O1} out2={output.O2}" + " {params.bbdukParams1} " + " threads={threads}" + " 1> {log.out}" + " 2> {log.err}" + +rule bbduk_r2: + input: + r1 = ancient('IRMA/{sample}_r1_trim.fastq'), + r2 = ancient('IRMA/{sample}_r2_trim.fastq') + output: + o1 = 'IRMA/{sample}_r1_primertrimmed.fastq', + o2 = 'IRMA/{sample}_r2_primertrimmed.fastq' + params: + bbdukParams1 = lambda wildcards: ' '.join(['ktrim=r', + 'rcomp=t', + 'mm=f', + 'hdist=1', + 'ordered=t', + 'ref='+primers['s_gene']["fasta"], + 'k=19', + 'restrictright=45']), #revisit + mem = memory_for_bbduk() + threads: 16 + log: + out = "logs/{sample}.bbduk.trim_right.stdout.log", + err = "logs/{sample}.bbduk.trim_right.stderr.log" + benchmark: + "logs/benchmarks/bbduk_r2_{sample}.log" + shell: + "bbduk.sh" + " -Xmx{params.mem}m" + " -Xms{params.mem}m" + " in1={input.r1} in2={input.r2}" + " out1={output.o1} out2={output.o2}" + " {params.bbdukParams1} " + " threads={threads}" + " 1> {log.out}" + " 2> {log.err}" + +rule irma: + input: + rules.bbduk_r2.output + output: + touch("IRMA/{sample}.irma.fin") + log: + out = "logs/{sample}.irma.stdout.log", + err = "logs/{sample}.irma.stderr.log" + params: + command = lambda wildcards: find_chemistry(config["samples"][wildcards.sample]["R1_fastq"], config["runid"])[0] + benchmark: + "logs/benchmarks/irma_{sample}.log" + threads: 14 + message: "Step 5 - assembling genome with IRMA" + shell: + "{params.command[0]} IRMA CoV-s-gene /data/{config[runid]}/IRMA/{wildcards.sample}_r1_primertrimmed.fastq /data/{config[runid]}/IRMA/{wildcards.sample}_r2_primertrimmed.fastq /data/{config[runid]}/IRMA/{wildcards.sample} {params.command[1]} 2> {log.err} |tee -a {log.out}" + +checkpoint checkirma: + input: + rules.irma.output + output: + 'IRMA/{sample}.irma.decision' + log: + "logs/irma/checkirma_{sample}.log" + shell: + "[ -d IRMA/{wildcards.sample}/amended_consensus ] &&" + "[ \"$(ls -A IRMA/{wildcards.sample}/amended_consensus)\" ] &&" + " echo passed > {output} ||" + " echo failed > {output}" + +def passed_irma(wildcards): + with checkpoints.checkirma.get(sample=wildcards.sample).\ + output[0].open() as f: + if f.read().strip() == "passed": + if not cli: + return rules.prepareIRMAjson.output + else: + return rules.staticHTML.output + else: + return rules.pass_negatives.output + +rule pass_negatives: + input: + ancient(rules.checkirma.output) + output: + "IRMA_negative/{sample}" + shell: + "touch {output}" + +rule catfiles: + input: + expand('IRMA/{sample}.irma.decision', sample=config["samples"].keys()) + output: + "DAIS_ribosome_input.fasta" + message: "Step 6 - Collecting consensus genomes" + shell: + "for file in $(ls IRMA/*/amended_consensus/*.fa | grep -v pad); do cat $file >> {output}; done" + +rule dais_ribosome: + input: + rules.catfiles.output + output: + touch('DAIS_ribosome_output.fin') + message: "Step 7 - Translating sequences into open reading frames (ORFs) with DAIS-Ribosome" + log: + "logs/dais_ribosome/dais.ribosome.log" + shell: + "{workflow.basedir}/scripts/daiswrapper.sh -i {input} -m BETACORONAVIRUS" + +rule prepareIRMAjson: + input: + rules.dais_ribosome.output + output: + touch('IRMA/prepareIRMAjson.fin') + message: "Step 8 - Creating Plotly-Dash readable figures and tables for IRMA-SPY" + log: + "logs/prepareIRMAjson.log" + shell: + "python3 {workflow.basedir}/scripts/prepareIRMAjson.py IRMA samplesheet.csv illumina sc2-spike" + +if cli: + rule staticHTML: + input: + rules.prepareIRMAjson.output + output: + touch('IRMA/staticHTML.fin') + message: "Step 9 - Creating static HTML output" + log: + "logs/staticHTML.log" + shell: + "python3 {workflow.basedir}/scripts/static_report.py $(pwd)" + +rule finishup: + input: + passed_irma + output: + touch("IRMA/SC2_{sample}.fin") \ No newline at end of file diff --git a/workflow/irma_config/SC2-spike-illumina-2x75.sh b/workflow/irma_config/SC2-spike-illumina-2x75.sh new file mode 100644 index 0000000..17e6522 --- /dev/null +++ b/workflow/irma_config/SC2-spike-illumina-2x75.sh @@ -0,0 +1,14 @@ +# HEADER +PARAM_FILE_NAME="CoV S-gene" +PARAM_FILE_AUTHOR="K. Lacek" +PARAM_FILE_VERSION="1.0" +PARAM_FILE_DATE="2024-10" + +MIN_LEN=50 +MIN_CONS_SUPPORT=30 +#ASSEM_REF=1 + +PADDED_CONSENSUS=0 +#CUSTOM_REF_FILE="SARS-CoV-2_spike.fasta" # custom ref file +#REF_SET=$(dirname $REF_SET)/$CUSTOM_REF_FILE + diff --git a/workflow/irma_config/SC2-spike-illumina.sh b/workflow/irma_config/SC2-spike-illumina.sh new file mode 100644 index 0000000..85c5b4a --- /dev/null +++ b/workflow/irma_config/SC2-spike-illumina.sh @@ -0,0 +1,13 @@ +# HEADER +PARAM_FILE_NAME="CoV S-gene" +PARAM_FILE_AUTHOR="K. Lacek" +PARAM_FILE_VERSION="1.0" +PARAM_FILE_DATE="2024-10" + +MIN_CONS_SUPPORT=30 +#ASSEM_REF=1 + +PADDED_CONSENSUS=0 +#CUSTOM_REF_FILE="SARS-CoV-2_spike.fasta" # custom ref file +#REF_SET=$(dirname $REF_SET)/$CUSTOM_REF_FILE + diff --git a/workflow/irma_config/qc_pass_fail_settings.yaml b/workflow/irma_config/qc_pass_fail_settings.yaml index ea0c578..e051e5f 100644 --- a/workflow/irma_config/qc_pass_fail_settings.yaml +++ b/workflow/irma_config/qc_pass_fail_settings.yaml @@ -16,6 +16,15 @@ ont-sc2-spike: negative_control_perc_exception: 10 # ignore `negative_control_perc` is total negative reads mapping to reference is less than this value positive_control_minimum: 1000 # minimum number of reads required to match to expected reference to pass positive controls padded_consensus: False +illumina-sc2-spike: + med_cov: 100 # mean coverage depth across reference + minor_vars: 10 # maximum allowed minor variants >= 5% + allow_stop_codons: False # True = allow premature stop codons; False = do not allow + perc_ref_covered: 90 # Mimimum % of reference to cover + negative_control_perc: 1 # fail negative controls if this percent of reads map to reference + negative_control_perc_exception: 1000 # ignore `negative_control_perc` is total negative reads mapping to reference is less than this value + positive_control_minimum: 100000 # minimum number of reads required to match to expected reference to pass positive controls + padded_consensus: False illumina-flu: med_cov: 100 # mean coverage depth across reference minor_vars: 10 # maximum allowed minor variants >= 5% diff --git a/workflow/primers/sgene.fasta b/workflow/primers/sgene.fasta new file mode 100644 index 0000000..c600be8 --- /dev/null +++ b/workflow/primers/sgene.fasta @@ -0,0 +1,8 @@ +>S1F_21358 +ACAAATCCAATTCAGTTGTCTTCCTATTC +>S1R_23813 +TGCTGCATTCAGTTGAATCACC +>S2F_23288 +GTCCGTGATCCACAGACACTT +>S2R_25460 +GCATCCTTGATTTCACCTTGCTTC \ No newline at end of file