check platform, if mac (darwin) then don't use tar
cademirch committed Mar 7, 2024
1 parent 5b479be commit 5665b7c
Showing 2 changed files with 154 additions and 74 deletions.
220 changes: 147 additions & 73 deletions workflow/rules/bam2vcf_gatk_intervals.smk
@@ -74,85 +74,159 @@ rule create_db_mapfile:
        for file_path in input:
            sample_name = os.path.basename(file_path).replace(".g.vcf.gz", "")
            print(sample_name, file_path, sep="\t", file=f)
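The loop above emits GATK's sample-name-map format: one sample name and one GVCF path per line, tab-separated. A hypothetical two-sample mapfile would look like this (illustrative names and paths, not from the commit):

sampleA	results/myRef/gvcfs/sampleA.g.vcf.gz
sampleB	results/myRef/gvcfs/sampleB.g.vcf.gz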
if PLATFORM == "Darwin":
    rule gvcf2DB:
        """
        Creates GATK DB. On macOS (Darwin), the DB is left unarchived, since the tar step is skipped on this platform.
        """
        input:
            unpack(get_gvcfs_db),
            l = "results/{refGenome}/intervals/db_intervals/{l}-scattered.interval_list",
            db_mapfile = "results/{refGenome}/genomics_db_import/DB_mapfile.txt"
        output:
            db = temp(directory("results/{refGenome}/genomics_db_import/DB_L{l}"))
        resources:
            mem_mb = lambda wildcards, attempt: attempt * resources['gvcf2DB']['mem'], # this is the overall memory requested
            reduced = lambda wildcards, attempt: int(attempt * resources['gvcf2DB']['mem'] * 0.80), # this is the maximum amount given to java
            tmpdir = get_big_temp
        log:
            "logs/{refGenome}/gatk_db_import/{l}.txt"
        benchmark:
            "benchmarks/{refGenome}/gatk_db_import/{l}.txt"
        conda:
            "../envs/bam2vcf.yml"
        shell:
            # NOTE: reader-threads > 1 useless if you specify multiple intervals
            # a forum suggested TILEDB_DISABLE_FILE_LOCKING=1 to remedy sluggish performance
            """
            export TILEDB_DISABLE_FILE_LOCKING=1
            gatk GenomicsDBImport \
                --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
                --genomicsdb-shared-posixfs-optimizations true \
                --batch-size 25 \
                --genomicsdb-workspace-path {output.db} \
                --merge-input-intervals \
                -L {input.l} \
                --tmp-dir {resources.tmpdir} \
                --sample-name-map {input.db_mapfile} &> {log}
            """

    rule DB2vcf:
        """
        This rule uses the genomic databases from the previous step (gvcf2DB) to create VCF files, one per list file. Thus, lists
        are still scattered.
        """
        input:
            db = "results/{refGenome}/genomics_db_import/DB_L{l}",
            ref = "results/{refGenome}/data/genome/{refGenome}.fna",
            fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai",
            dictf = "results/{refGenome}/data/genome/{refGenome}.dict",
        output:
            vcf = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz"),
            vcfidx = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz.tbi"),
        params:
            het = config['het_prior'],
        resources:
            mem_mb = lambda wildcards, attempt: attempt * resources['DB2vcf']['mem'], # this is the overall memory requested
            reduced = lambda wildcards, attempt: attempt * (resources['DB2vcf']['mem'] - 3000), # this is the maximum amount given to java
            tmpdir = get_big_temp
        log:
            "logs/{refGenome}/gatk_genotype_gvcfs/{l}.txt"
        benchmark:
            "benchmarks/{refGenome}/gatk_genotype_gvcfs/{l}.txt"
        conda:
            "../envs/bam2vcf.yml"
        shell:
            """
            gatk GenotypeGVCFs \
                --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
                -R {input.ref} \
                --heterozygosity {params.het} \
                --genomicsdb-shared-posixfs-optimizations true \
                -V gendb://{input.db} \
                -O {output.vcf} \
                --tmp-dir {resources.tmpdir} &> {log}
            """
else:
    rule gvcf2DB:
        """
        Creates GATK DB. If on Linux, DB is archived to reduce temp space.
        """
        input:
            unpack(get_gvcfs_db),
            l = "results/{refGenome}/intervals/db_intervals/{l}-scattered.interval_list",
            db_mapfile = "results/{refGenome}/genomics_db_import/DB_mapfile.txt"
        output:
            db = temp(directory("results/{refGenome}/genomics_db_import/DB_L{l}")),
            tar = temp("results/{refGenome}/genomics_db_import/DB_L{l}.tar"),
        resources:
            mem_mb = lambda wildcards, attempt: attempt * resources['gvcf2DB']['mem'], # this is the overall memory requested
            reduced = lambda wildcards, attempt: int(attempt * resources['gvcf2DB']['mem'] * 0.80), # this is the maximum amount given to java
            tmpdir = get_big_temp
        log:
            "logs/{refGenome}/gatk_db_import/{l}.txt"
        benchmark:
            "benchmarks/{refGenome}/gatk_db_import/{l}.txt"
        conda:
            "../envs/bam2vcf.yml"
        shell:
            # NOTE: reader-threads > 1 useless if you specify multiple intervals
            # a forum suggested TILEDB_DISABLE_FILE_LOCKING=1 to remedy sluggish performance
            """
            export TILEDB_DISABLE_FILE_LOCKING=1
            gatk GenomicsDBImport \
                --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
                --genomicsdb-shared-posixfs-optimizations true \
                --batch-size 25 \
                --genomicsdb-workspace-path {output.db} \
                --merge-input-intervals \
                -L {input.l} \
                --tmp-dir {resources.tmpdir} \
                --sample-name-map {input.db_mapfile} &> {log}
            tar --overwrite -cf {output.tar} {output.db}
            """

    rule DB2vcf:
        """
        This rule uses the genomic databases from the previous step (gvcf2DB) to create VCF files, one per list file. Thus, lists
        are still scattered.
        """
        input:
            db = "results/{refGenome}/genomics_db_import/DB_L{l}.tar",
            ref = "results/{refGenome}/data/genome/{refGenome}.fna",
            fai = "results/{refGenome}/data/genome/{refGenome}.fna.fai",
            dictf = "results/{refGenome}/data/genome/{refGenome}.dict",
        output:
            vcf = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz"),
            vcfidx = temp("results/{refGenome}/vcfs/intervals/L{l}.vcf.gz.tbi"),
        params:
            het = config['het_prior'],
            db = lambda wc, input: input.db[:-4]  # strip ".tar" to get the workspace path
        resources:
            mem_mb = lambda wildcards, attempt: attempt * resources['DB2vcf']['mem'], # this is the overall memory requested
            reduced = lambda wildcards, attempt: attempt * (resources['DB2vcf']['mem'] - 3000), # this is the maximum amount given to java
            tmpdir = get_big_temp
        log:
            "logs/{refGenome}/gatk_genotype_gvcfs/{l}.txt"
        benchmark:
            "benchmarks/{refGenome}/gatk_genotype_gvcfs/{l}.txt"
        conda:
            "../envs/bam2vcf.yml"
        shell:
            """
            tar -xf {input.db}
            gatk GenotypeGVCFs \
                --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
                -R {input.ref} \
                --heterozygosity {params.het} \
                --genomicsdb-shared-posixfs-optimizations true \
                -V gendb://{params.db} \
                -O {output.vcf} \
                --tmp-dir {resources.tmpdir} &> {log}
            """

rule filterVcfs:
"""
8 changes: 7 additions & 1 deletion workflow/rules/common.smk
@@ -6,13 +6,20 @@ import tempfile
import random
import string
import statistics
import platform
from urllib.request import urlopen
import pandas as pd
from yaml import safe_load
from collections import defaultdict, deque
from snakemake.exceptions import WorkflowError

PLATFORM = platform.system()

if PLATFORM == "Windows":
    raise WorkflowError("snpArcher does not currently support Windows.")
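For reference (editor's note, not part of the diff): platform.system() returns the strings "Linux", "Darwin", or "Windows" on the common operating systems, so a plain string comparison suffices both for this Windows guard and for the Darwin check in bam2vcf_gatk_intervals.smk. A minimal sketch:

# Sketch only: gate OS-specific behavior on platform.system().
import platform

system = platform.system()  # "Linux", "Darwin" (macOS), or "Windows"
print(f"Detected {system}; tar-based DB archiving applies: {system != 'Darwin'}")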


samples = pd.read_table(config["samples"], sep=",", dtype=str).replace(
    " ", "_", regex=True
)
@@ -56,7 +63,6 @@ def get_output():
        out.append(rules.trackhub_all.input)
    return out


def merge_bams_input(wc):
    return expand(
        "results/{{refGenome}}/bams/preMerge/{{sample}}/{run}.bam",