check platform, if mac (darwin) then don't use tar
cademirch committed Mar 7, 2024
1 parent 5b479be commit 5665b7c
Showing 2 changed files with 154 additions and 74 deletions.
220 changes: 147 additions & 73 deletions workflow/rules/bam2vcf_gatk_intervals.smk
@@ -74,85 +74,159 @@ rule create_db_mapfile:
        for file_path in input:
            sample_name = os.path.basename(file_path).replace(".g.vcf.gz", "")
            print(sample_name, file_path, sep="\t", file=f)
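The loop above emits GATK's sample-name-map format: one sample name and one GVCF path per line, tab-separated. A hypothetical two-sample mapfile would look like this (illustrative names and paths, not from the commit):

sampleA	results/myRef/gvcfs/sampleA.g.vcf.gz
sampleB	results/myRef/gvcfs/sampleB.g.vcf.gz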
if PLATFORM == "Darwin":
    rule gvcf2DB:
        """
        Creates GATK DB. On macOS (Darwin), the DB is left unarchived, since the tar step is skipped on this platform.
        """
        input:
            unpack(get_gvcfs_db),
            l = "results/{refGenome}/intervals/db_intervals/{l}-scattered.interval_list",
            db_mapfile = "results/{refGenome}/genomics_db_import/DB_mapfile.txt"
        output:
            db = temp(directory("results/{refGenome}/genomics_db_import/DB_L{l}"))
        resources:
            mem_mb = lambda wildcards, attempt: attempt * resources['gvcf2DB']['mem'], # this is the overall memory requested
            reduced = lambda wildcards, attempt: int(attempt * resources['gvcf2DB']['mem'] * 0.80), # this is the maximum amount given to java
            tmpdir = get_big_temp
        log:
            "logs/{refGenome}/gatk_db_import/{l}.txt"
        benchmark:
            "benchmarks/{refGenome}/gatk_db_import/{l}.txt"
        conda:
            "../envs/bam2vcf.yml"
        shell:
            # NOTE: reader-threads > 1 useless if you specify multiple intervals
            # a forum suggested TILEDB_DISABLE_FILE_LOCKING=1 to remedy sluggish performance
            """
            export TILEDB_DISABLE_FILE_LOCKING=1
            gatk GenomicsDBImport \
                --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
                --genomicsdb-shared-posixfs-optimizations true \
                --batch-size 25 \
                --genomicsdb-workspace-path {output.db} \
                --merge-input-intervals \
                -L {input.l} \
                --tmp-dir {resources.tmpdir} \
                --sample-name-map {input.db_mapfile} &> {log}
            """

    rule DB2vcf:
        """
        This rule uses the genomic databases from the previous step (gvcf2DB) to create VCF files, one per list file. Thus, lists
        are still scattered.
        """
        input:
            db = "results/{refGenome}/genomics_db_import/DB_L{l}",
            ref = "results/{refGenome}/data/genome/{refGenome}.fna",
            fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai",
            dictf = "results/{refGenome}/data/genome/{refGenome}.dict",
        output:
            vcf = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz"),
            vcfidx = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz.tbi"),
        params:
            het = config['het_prior'],
        resources:
            mem_mb = lambda wildcards, attempt: attempt * resources['DB2vcf']['mem'], # this is the overall memory requested
            reduced = lambda wildcards, attempt: attempt * (resources['DB2vcf']['mem'] - 3000), # this is the maximum amount given to java
            tmpdir = get_big_temp
        log:
            "logs/{refGenome}/gatk_genotype_gvcfs/{l}.txt"
        benchmark:
            "benchmarks/{refGenome}/gatk_genotype_gvcfs/{l}.txt"
        conda:
            "../envs/bam2vcf.yml"
        shell:
            """
            gatk GenotypeGVCFs \
                --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
                -R {input.ref} \
                --heterozygosity {params.het} \
                --genomicsdb-shared-posixfs-optimizations true \
                -V gendb://{input.db} \
                -O {output.vcf} \
                --tmp-dir {resources.tmpdir} &> {log}
            """
else:
    rule gvcf2DB:
        """
        Creates GATK DB. If on Linux, DB is archived to reduce temp space.
        """
        input:
            unpack(get_gvcfs_db),
            l = "results/{refGenome}/intervals/db_intervals/{l}-scattered.interval_list",
            db_mapfile = "results/{refGenome}/genomics_db_import/DB_mapfile.txt"
        output:
            db = temp(directory("results/{refGenome}/genomics_db_import/DB_L{l}")),
            tar = temp("results/{refGenome}/genomics_db_import/DB_L{l}.tar"),
        resources:
            mem_mb = lambda wildcards, attempt: attempt * resources['gvcf2DB']['mem'], # this is the overall memory requested
            reduced = lambda wildcards, attempt: int(attempt * resources['gvcf2DB']['mem'] * 0.80), # this is the maximum amount given to java
            tmpdir = get_big_temp
        log:
            "logs/{refGenome}/gatk_db_import/{l}.txt"
        benchmark:
            "benchmarks/{refGenome}/gatk_db_import/{l}.txt"
        conda:
            "../envs/bam2vcf.yml"
        shell:
            # NOTE: reader-threads > 1 useless if you specify multiple intervals
            # a forum suggested TILEDB_DISABLE_FILE_LOCKING=1 to remedy sluggish performance
            """
            export TILEDB_DISABLE_FILE_LOCKING=1
            gatk GenomicsDBImport \
                --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
                --genomicsdb-shared-posixfs-optimizations true \
                --batch-size 25 \
                --genomicsdb-workspace-path {output.db} \
                --merge-input-intervals \
                -L {input.l} \
                --tmp-dir {resources.tmpdir} \
                --sample-name-map {input.db_mapfile} &> {log}
            tar --overwrite -cf {output.tar} {output.db}
            """

    rule DB2vcf:
        """
        This rule uses the genomic databases from the previous step (gvcf2DB) to create VCF files, one per list file. Thus, lists
        are still scattered.
        """
        input:
            db = "results/{refGenome}/genomics_db_import/DB_L{l}.tar",
            ref = "results/{refGenome}/data/genome/{refGenome}.fna",
            fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai",
            dictf = "results/{refGenome}/data/genome/{refGenome}.dict",
        output:
            vcf = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz"),
            vcfidx = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz.tbi"),
        params:
            het = config['het_prior'],
            db = lambda wc, input: input.db[:-4]  # strip ".tar" to get the workspace path
        resources:
            mem_mb = lambda wildcards, attempt: attempt * resources['DB2vcf']['mem'], # this is the overall memory requested
            reduced = lambda wildcards, attempt: attempt * (resources['DB2vcf']['mem'] - 3000), # this is the maximum amount given to java
            tmpdir = get_big_temp
        log:
            "logs/{refGenome}/gatk_genotype_gvcfs/{l}.txt"
        benchmark:
            "benchmarks/{refGenome}/gatk_genotype_gvcfs/{l}.txt"
        conda:
            "../envs/bam2vcf.yml"
        shell:
            """
            tar -xf {input.db}
            gatk GenotypeGVCFs \
                --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
                -R {input.ref} \
                --heterozygosity {params.het} \
                --genomicsdb-shared-posixfs-optimizations true \
                -V gendb://{params.db} \
                -O {output.vcf} \
                --tmp-dir {resources.tmpdir} &> {log}
            """

rule filterVcfs:
"""
8 changes: 7 additions & 1 deletion workflow/rules/common.smk
@@ -6,13 +6,20 @@ import tempfile
import random
import string
import statistics
import platform
from urllib.request import urlopen
import pandas as pd
from yaml import safe_load
from collections import defaultdict, deque
from snakemake.exceptions import WorkflowError

PLATFORM = platform.system()

if PLATFORM == "Windows":
    raise WorkflowError("snpArcher does not currently support Windows.")
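For reference (editor's note, not part of the diff): platform.system() returns the strings "Linux", "Darwin", or "Windows" on the common operating systems, so a plain string comparison suffices both for this Windows guard and for the Darwin check in bam2vcf_gatk_intervals.smk. A minimal sketch:

# Sketch only: gate OS-specific behavior on platform.system().
import platform

system = platform.system()  # "Linux", "Darwin" (macOS), or "Windows"
print(f"Detected {system}; tar-based DB archiving applies: {system != 'Darwin'}")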


samples = pd.read_table(config["samples"], sep=",", dtype=str).replace(
    " ", "_", regex=True
)
@@ -56,7 +63,6 @@ def get_output():
        out.append(rules.trackhub_all.input)
    return out


def merge_bams_input(wc):
    return expand(
        "results/{{refGenome}}/bams/preMerge/{{sample}}/{run}.bam",