diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d8c61f939..bcbbe35f12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [1660](https://github.com/nf-core/sarek/pull/1642) - Add `--length_required` for minimal reads length with `FASTP` - [1663](https://github.com/nf-core/sarek/pull/1663) - Massive conda modules update - [1664](https://github.com/nf-core/sarek/pull/1664) - Check if flowcell ID matches for read pair +- [1680](https://github.com/nf-core/sarek/pull/1682) - Add `bcftools_norm` in `POST_VARIANTCALLING` for normalization of all vcf files or for concatenated germline vcfs - [1730](https://github.com/nf-core/sarek/pull/1730) - Enable Harshil Alignment™️ in VS Code workspace settings ### Changed diff --git a/conf/modules/post_variant_calling.config b/conf/modules/post_variant_calling.config index 3354d4671f..742e0c03f9 100644 --- a/conf/modules/post_variant_calling.config +++ b/conf/modules/post_variant_calling.config @@ -16,7 +16,7 @@ process { - withName: 'GERMLINE_VCFS_CONCAT'{ + withName: 'GERMLINE_VCFS_CONCAT' { ext.args = { "-a" } ext.when = { params.concatenate_vcfs } publishDir = [ @@ -25,18 +25,41 @@ process { ] } - withName: 'GERMLINE_VCFS_CONCAT_SORT'{ + withName: 'GERMLINE_VCFS_CONCAT_SORT' { ext.prefix = { "${meta.id}.germline" } ext.when = { params.concatenate_vcfs } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/concat/${meta.id}/" } + path: { "${params.outdir}/variant_calling/concat/${meta.id}/" }, + pattern: "*vcf.gz" + ] + } + + withName: 'VCFS_NORM_SORT' { + ext.prefix = { "${meta.id}.${meta.variantcaller}.norm" } + ext.when = { params.normalize_vcfs } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/normalized/${meta.id}/" }, + pattern: "*vcf.gz" + ] + } + + withName: 'VCFS_NORM' { + ext.args = { [ + '--multiallelics -both', //split multiallelic sites into biallelic records and both SNPs and indels should be merged separately into two records + '--rm-dup all' //output only the first instance of a record which is present multiple times + ].join(' ') } + ext.when = { params.normalize_vcfs } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/normalized/${meta.id}/" } ] } withName: 'TABIX_EXT_VCF' { - ext.prefix = { "${input.baseName}" } - ext.when = { params.concatenate_vcfs } + ext.prefix = { "${input.baseName}" } + ext.when = { params.concatenate_vcfs || params.normalize_vcfs } } withName: 'TABIX_GERMLINE_VCFS_CONCAT_SORT'{ @@ -44,7 +67,19 @@ process { ext.when = { params.concatenate_vcfs } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/concat/${meta.id}/" } + path: { "${params.outdir}/variant_calling/concat/${meta.id}/" }, + pattern: "*.tbi" + ] + } + + withName: 'TABIX_VCFS_NORM_SORT'{ + ext.prefix = { "${meta.id}.${meta.variantcaller}.norm" } + ext.when = { params.normalize_vcfs } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/normalized/${meta.id}/" }, + pattern: "*.tbi" ] } } + diff --git a/modules.json b/modules.json index 75fec0dbc7..df24e040cd 100644 --- a/modules.json +++ b/modules.json @@ -26,6 +26,11 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["bam_ngscheckmate"] }, + "bcftools/norm": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "bcftools/sort": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/nf-core/bcftools/norm/environment.yml b/modules/nf-core/bcftools/norm/environment.yml new file mode 100644 index 0000000000..5c00b116ad --- /dev/null +++ b/modules/nf-core/bcftools/norm/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bcftools=1.20 diff --git a/modules/nf-core/bcftools/norm/main.nf b/modules/nf-core/bcftools/norm/main.nf new file mode 100644 index 0000000000..bd7a250127 --- /dev/null +++ b/modules/nf-core/bcftools/norm/main.nf @@ -0,0 +1,70 @@ +process BCFTOOLS_NORM { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.20--h8b25389_0': + 'biocontainers/bcftools:1.20--h8b25389_0' }" + + input: + tuple val(meta), path(vcf), path(tbi) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}"), emit: vcf + tuple val(meta), path("*.tbi") , emit: tbi, optional: true + tuple val(meta), path("*.csi") , emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '--output-type z' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf.gz" + + """ + bcftools norm \\ + --fasta-ref ${fasta} \\ + --output ${prefix}.${extension} \\ + $args \\ + --threads $task.cpus \\ + ${vcf} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '--output-type z' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf.gz" + def index = args.contains("--write-index=tbi") || args.contains("-W=tbi") ? "tbi" : + args.contains("--write-index=csi") || args.contains("-W=csi") ? "csi" : + args.contains("--write-index") || args.contains("-W") ? "csi" : + "" + def create_cmd = extension.endsWith(".gz") ? "echo '' | gzip >" : "touch" + def create_index = extension.endsWith(".gz") && index.matches("csi|tbi") ? "touch ${prefix}.${extension}.${index}" : "" + + """ + ${create_cmd} ${prefix}.${extension} + ${create_index} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/norm/meta.yml b/modules/nf-core/bcftools/norm/meta.yml new file mode 100644 index 0000000000..b6edeb4aae --- /dev/null +++ b/modules/nf-core/bcftools/norm/meta.yml @@ -0,0 +1,85 @@ +name: bcftools_norm +description: Normalize VCF file +keywords: + - normalize + - norm + - variant calling + - VCF +tools: + - norm: + description: | + Normalize VCF files. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + identifier: biotools:bcftools +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: | + The vcf file to be normalized + e.g. 'file1.vcf' + pattern: "*.{vcf,vcf.gz}" + - tbi: + type: file + description: | + An optional index of the VCF file (for when the VCF is compressed) + pattern: "*.vcf.gz.tbi" + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: FASTA reference file + pattern: "*.{fasta,fa}" +output: + - vcf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{vcf,vcf.gz,bcf,bcf.gz}": + type: file + description: One of uncompressed VCF (.vcf), compressed VCF (.vcf.gz), compressed + BCF (.bcf.gz) or uncompressed BCF (.bcf) normalized output file + pattern: "*.{vcf,vcf.gz,bcf,bcf.gz}" + - tbi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tbi": + type: file + description: Alternative VCF file index + pattern: "*.tbi" + - csi: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.csi": + type: file + description: Default VCF file index + pattern: "*.csi" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" + - "@ramprasadn" +maintainers: + - "@abhi18av" + - "@ramprasadn" diff --git a/modules/nf-core/bcftools/norm/tests/main.nf.test b/modules/nf-core/bcftools/norm/tests/main.nf.test new file mode 100644 index 0000000000..dbc4150237 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/main.nf.test @@ -0,0 +1,563 @@ +nextflow_process { + + name "Test Process BCFTOOLS_NORM" + script "../main.nf" + process "BCFTOOLS_NORM" + + tag "modules" + tag "modules_nfcore" + tag "bcftools" + tag "bcftools/norm" + + test("sarscov2 - [ vcf, [] ], fasta") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, [] ], fasta - vcf_gz_index") { + + config "./vcf_gz_index.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + process.out.csi.collect { it.collect { it instanceof Map ? it : file(it).name } } + ).match() }, + { assert process.out.csi[0][1].endsWith(".csi") } + ) + } + + } + + test("sarscov2 - [ vcf, [] ], fasta - vcf_gz_index_csi") { + + config "./vcf_gz_index_csi.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + process.out.csi.collect { it.collect { it instanceof Map ? it : file(it).name } } + ).match() }, + { assert process.out.csi[0][1].endsWith(".csi") } + ) + } + + } + + test("sarscov2 - [ vcf, [] ], fasta - vcf_gz_index_tbi") { + + config "./vcf_gz_index_tbi.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + process.out.csi.collect { it.collect { it instanceof Map ? it : file(it).name } } + ).match() }, + { assert process.out.tbi[0][1].endsWith(".tbi") } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta - vcf output") { + + config "./nextflow.vcf.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta - vcf_gz output") { + + config "./nextflow.vcf.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + process.out.csi.collect { it.collect { it instanceof Map ? it : file(it).name } }, + process.out.tbi.collect { it.collect { it instanceof Map ? it : file(it).name } }, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta - bcf output") { + + config "./nextflow.bcf.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta - bcf_gz output") { + + config "./nextflow.bcf_gz.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, [] ], fasta - stub") { + + config "./nextflow.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta -stub") { + + config "./nextflow.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta - vcf output -stub") { + + config "./nextflow.vcf.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta - vcf_gz output - stub") { + + config "./nextflow.vcf.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta - bcf output - stub") { + + config "./nextflow.bcf.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, tbi ], fasta - bcf_gz output - stub") { + + config "./nextflow.bcf_gz.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - [ vcf, [] ], fasta - vcf_gz_index - stub") { + + config "./vcf_gz_index.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.csi[0][1].endsWith(".csi") } + ) + } + + } + + test("sarscov2 - [ vcf, [] ], fasta - vcf_gz_index_csi - stub") { + + config "./vcf_gz_index_csi.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.csi[0][1].endsWith(".csi") } + ) + } + + } + + test("sarscov2 - [ vcf, [] ], fasta - vcf_gz_index_tbi - stub") { + + config "./vcf_gz_index_tbi.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.tbi[0][1].endsWith(".tbi") } + ) + } + + } + + +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/norm/tests/main.nf.test.snap b/modules/nf-core/bcftools/norm/tests/main.nf.test.snap new file mode 100644 index 0000000000..3be52116a9 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/main.nf.test.snap @@ -0,0 +1,758 @@ +{ + "sarscov2 - [ vcf, tbi ], fasta - vcf_gz output - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.vcf:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.vcf:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:38:42.639095032" + }, + "sarscov2 - [ vcf, [] ], fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:38:05.448449893" + }, + "sarscov2 - [ vcf, tbi ], fasta - vcf output": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.vcf:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.vcf:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:37:12.741719961" + }, + "sarscov2 - [ vcf, [] ], fasta - vcf_gz_index - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:39:22.875147941" + }, + "sarscov2 - [ vcf, tbi ], fasta - vcf_gz output": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_norm.vcf:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-05T08:15:23.38765384" + }, + "sarscov2 - [ vcf, [] ], fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.vcf.gz:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.vcf.gz:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:36:21.519977754" + }, + "sarscov2 - [ vcf, tbi ], fasta - vcf output -stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.vcf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.vcf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:38:27.8230994" + }, + "sarscov2 - [ vcf, tbi ], fasta - bcf_gz output": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.bcf:md5,f35545c26a788b5eb697d9c0490339d9" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.bcf:md5,f35545c26a788b5eb697d9c0490339d9" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:37:53.942403192" + }, + "sarscov2 - [ vcf, [] ], fasta - vcf_gz_index_csi - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-05T13:56:05.3799488" + }, + "sarscov2 - [ vcf, [] ], fasta - vcf_gz_index_tbi": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-05T13:53:28.356741947" + }, + "sarscov2 - [ vcf, tbi ], fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.vcf.gz:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.vcf.gz:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:36:58.39445154" + }, + "sarscov2 - [ vcf, tbi ], fasta -stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:38:16.259516142" + }, + "sarscov2 - [ vcf, tbi ], fasta - bcf_gz output - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.bcf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.bcf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:39:10.503208929" + }, + "sarscov2 - [ vcf, [] ], fasta - vcf_gz_index": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz.csi" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-05T07:52:58.381931979" + }, + "sarscov2 - [ vcf, tbi ], fasta - bcf output - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.bcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.bcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:38:59.121377258" + }, + "sarscov2 - [ vcf, [] ], fasta - vcf_gz_index_tbi - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-05T13:56:16.404380471" + }, + "sarscov2 - [ vcf, [] ], fasta - vcf_gz_index_csi": { + "content": [ + [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz:md5,63e5adbaf3dd94550e9e3d7935dd28db" + ] + ], + [ + [ + { + "id": "test" + }, + "test_vcf.vcf.gz.csi" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-05T13:53:09.808834237" + }, + "sarscov2 - [ vcf, tbi ], fasta - bcf output": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_norm.bcf.gz:md5,638c3c25bdd495c90ecbccb69ee77f07" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ], + "csi": [ + + ], + "tbi": [ + + ], + "vcf": [ + [ + { + "id": "test" + }, + "test_norm.bcf.gz:md5,638c3c25bdd495c90ecbccb69ee77f07" + ] + ], + "versions": [ + "versions.yml:md5,ff760495922469e56d0fc3372773000d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T14:37:42.141945244" + } +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/norm/tests/nextflow.bcf.config b/modules/nf-core/bcftools/norm/tests/nextflow.bcf.config new file mode 100644 index 0000000000..b79af86817 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/nextflow.bcf.config @@ -0,0 +1,4 @@ +process { + ext.args = '-m -any --output-type b --no-version' + ext.prefix = "test_norm" +} diff --git a/modules/nf-core/bcftools/norm/tests/nextflow.bcf_gz.config b/modules/nf-core/bcftools/norm/tests/nextflow.bcf_gz.config new file mode 100644 index 0000000000..f36f397c2c --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/nextflow.bcf_gz.config @@ -0,0 +1,4 @@ +process { + ext.args = '-m -any --output-type u --no-version' + ext.prefix = "test_norm" +} diff --git a/modules/nf-core/bcftools/norm/tests/nextflow.config b/modules/nf-core/bcftools/norm/tests/nextflow.config new file mode 100644 index 0000000000..510803b407 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/nextflow.config @@ -0,0 +1,4 @@ +process { + ext.args = '-m -any --no-version' + ext.prefix = "test_norm" +} diff --git a/modules/nf-core/bcftools/norm/tests/nextflow.vcf.config b/modules/nf-core/bcftools/norm/tests/nextflow.vcf.config new file mode 100644 index 0000000000..10bf93e320 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/nextflow.vcf.config @@ -0,0 +1,4 @@ +process { + ext.args = '-m -any --output-type v --no-version' + ext.prefix = "test_norm" +} diff --git a/modules/nf-core/bcftools/norm/tests/nextflow.vcf_gz.config b/modules/nf-core/bcftools/norm/tests/nextflow.vcf_gz.config new file mode 100644 index 0000000000..b31dd2de22 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/nextflow.vcf_gz.config @@ -0,0 +1,4 @@ +process { + ext.args = '-m -any --output-type z ---no-version' + ext.prefix = "test_norm" +} diff --git a/modules/nf-core/bcftools/norm/tests/tags.yml b/modules/nf-core/bcftools/norm/tests/tags.yml new file mode 100644 index 0000000000..f6f5e35616 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/tags.yml @@ -0,0 +1,2 @@ +bcftools/norm: + - "modules/nf-core/bcftools/norm/**" diff --git a/modules/nf-core/bcftools/norm/tests/vcf_gz_index.config b/modules/nf-core/bcftools/norm/tests/vcf_gz_index.config new file mode 100644 index 0000000000..7dd696ee26 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/vcf_gz_index.config @@ -0,0 +1,4 @@ +process { + ext.prefix = { "${meta.id}_vcf" } + ext.args = "--output-type z --write-index --no-version" +} diff --git a/modules/nf-core/bcftools/norm/tests/vcf_gz_index_csi.config b/modules/nf-core/bcftools/norm/tests/vcf_gz_index_csi.config new file mode 100644 index 0000000000..aebffb6fb7 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/vcf_gz_index_csi.config @@ -0,0 +1,4 @@ +process { + ext.prefix = { "${meta.id}_vcf" } + ext.args = "--output-type z --write-index=csi --no-version" +} diff --git a/modules/nf-core/bcftools/norm/tests/vcf_gz_index_tbi.config b/modules/nf-core/bcftools/norm/tests/vcf_gz_index_tbi.config new file mode 100644 index 0000000000..b192ae7d19 --- /dev/null +++ b/modules/nf-core/bcftools/norm/tests/vcf_gz_index_tbi.config @@ -0,0 +1,4 @@ +process { + ext.prefix = { "${meta.id}_vcf" } + ext.args = "--output-type z --write-index=tbi --no-version" +} diff --git a/nextflow.config b/nextflow.config index f01ed136d2..0e4ce693c8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,6 +73,7 @@ params { ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling + normalize_vcfs = false // by default we don't normalize the vcf-files only_paired_variant_calling = false // if true, skips germline variant calling for normal-paired sample sentieon_dnascope_emit_mode = 'variant' // default value for Sentieon dnascope sentieon_dnascope_pcr_indel_model = 'CONSERVATIVE' @@ -476,4 +477,3 @@ includeConfig 'conf/modules/lofreq.config' //annotate includeConfig 'conf/modules/annotate.config' - diff --git a/nextflow_schema.json b/nextflow_schema.json index 2e66ccdf53..3eaf2b1ea6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -402,6 +402,12 @@ "fa_icon": "fas fa-tape", "description": "Option for concatenating germline vcf-files.", "help_text": "Concatenating the germline vcf-files from each applied variant-caller into one vcf-file using bfctools concat." + }, + "normalize_vcfs": { + "type": "boolean", + "fa_icon": "fas fa-tape", + "description": "Option for normalization of vcf-files.", + "help_text": "Normalization of all vcf-files from each applied variant-caller using bfctools norm." } } }, diff --git a/subworkflows/local/post_variantcalling/main.nf b/subworkflows/local/post_variantcalling/main.nf index 6b75d2c6b8..86bcd47353 100644 --- a/subworkflows/local/post_variantcalling/main.nf +++ b/subworkflows/local/post_variantcalling/main.nf @@ -3,23 +3,36 @@ // include { CONCATENATE_GERMLINE_VCFS } from '../vcf_concatenate_germline/main' +include { NORMALIZE_VCFS } from '../vcf_normalization/main' workflow POST_VARIANTCALLING { take: - vcfs + germline_vcfs + tumor_only_vcfs + somatic_vcfs + fasta concatenate_vcfs + normalize_vcfs main: versions = Channel.empty() + vcfs = Channel.empty() if (concatenate_vcfs){ - CONCATENATE_GERMLINE_VCFS(vcfs) + CONCATENATE_GERMLINE_VCFS(germline_vcfs) vcfs = vcfs.mix(CONCATENATE_GERMLINE_VCFS.out.vcfs) versions = versions.mix(CONCATENATE_GERMLINE_VCFS.out.versions) } + if (normalize_vcfs){ + NORMALIZE_VCFS(germline_vcfs, tumor_only_vcfs, somatic_vcfs, fasta) + + vcfs = vcfs.mix(NORMALIZE_VCFS.out.vcfs) + versions = versions.mix(NORMALIZE_VCFS.out.versions) + } + emit: vcfs // post processed vcfs diff --git a/subworkflows/local/vcf_concatenate_germline/main.nf b/subworkflows/local/vcf_concatenate_germline/main.nf index 87f46b22e1..9d24420a2f 100644 --- a/subworkflows/local/vcf_concatenate_germline/main.nf +++ b/subworkflows/local/vcf_concatenate_germline/main.nf @@ -22,7 +22,7 @@ workflow CONCATENATE_GERMLINE_VCFS { TABIX_EXT_VCF(ADD_INFO_TO_VCF.out.vcf) // Gather vcfs and vcf-tbis for concatenating germline-vcfs - germline_vcfs_with_tbis = TABIX_EXT_VCF.out.gz_tbi.map{ meta, vcf, tbi -> [ meta.subMap('id'), vcf, tbi ] }.groupTuple() + germline_vcfs_with_tbis = TABIX_EXT_VCF.out.gz_tbi.groupTuple() GERMLINE_VCFS_CONCAT(germline_vcfs_with_tbis) GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT.out.vcf) @@ -32,11 +32,12 @@ workflow CONCATENATE_GERMLINE_VCFS { versions = versions.mix(ADD_INFO_TO_VCF.out.versions) versions = versions.mix(TABIX_EXT_VCF.out.versions) versions = versions.mix(GERMLINE_VCFS_CONCAT.out.versions) - versions = versions.mix(GERMLINE_VCFS_CONCAT.out.versions) + versions = versions.mix(GERMLINE_VCFS_CONCAT_SORT.out.versions) versions = versions.mix(TABIX_GERMLINE_VCFS_CONCAT_SORT.out.versions) emit: - vcfs = germline_vcfs_with_tbis // post processed vcfs + vcfs = GERMLINE_VCFS_CONCAT_SORT.out.vcf // concatenated vcfs versions // channel: [ versions.yml ] } + diff --git a/subworkflows/local/vcf_normalization/main.nf b/subworkflows/local/vcf_normalization/main.nf new file mode 100644 index 0000000000..3e1f9edf1e --- /dev/null +++ b/subworkflows/local/vcf_normalization/main.nf @@ -0,0 +1,50 @@ +// Normalize all unannotated VCFs + +// Import modules +include { ADD_INFO_TO_VCF } from '../../../modules/local/add_info_to_vcf/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF } from '../../../modules/nf-core/tabix/bgziptabix/main' +include { BCFTOOLS_NORM as VCFS_NORM } from '../../../modules/nf-core/bcftools/norm/main' +include { BCFTOOLS_SORT as VCFS_NORM_SORT } from '../../../modules/nf-core/bcftools/sort/main' +include { TABIX_TABIX as TABIX_VCFS_NORM_SORT } from '../../../modules/nf-core/tabix/tabix/main' + +// Workflow to normalize, compress, and index VCF files +workflow NORMALIZE_VCFS { + + take: + germline_vcfs + tumor_only_vcfs + somatic_vcfs + fasta + + main: + versions = Channel.empty() + + vcfs = germline_vcfs.mix(tumor_only_vcfs, somatic_vcfs) + + // Add additional information to VCF files + ADD_INFO_TO_VCF(vcfs) + + // Compress the VCF files with bgzip + TABIX_EXT_VCF(ADD_INFO_TO_VCF.out.vcf) + + // Normalize the VCF files with BCFTOOLS_NORM + VCFS_NORM(TABIX_EXT_VCF.out.gz_tbi, fasta) + + // Sort the normalized VCF files + VCFS_NORM_SORT(VCFS_NORM.out.vcf) + + // Index the sorted normalized VCF files + TABIX_VCFS_NORM_SORT(VCFS_NORM_SORT.out.vcf) + + // Gather versions of all tools used + versions = versions.mix(ADD_INFO_TO_VCF.out.versions) + versions = versions.mix(VCFS_NORM.out.versions) + versions = versions.mix(TABIX_EXT_VCF.out.versions) + versions = versions.mix(VCFS_NORM_SORT.out.versions) + versions = versions.mix(TABIX_VCFS_NORM_SORT.out.versions) + + emit: + vcfs = VCFS_NORM_SORT.out.vcf // normalized vcfs + versions // Channel: [versions.yml] +} + diff --git a/tests/config/pytesttags.yml b/tests/config/pytesttags.yml index 3f8723df7e..4262dece9f 100644 --- a/tests/config/pytesttags.yml +++ b/tests/config/pytesttags.yml @@ -394,6 +394,38 @@ concatenate_vcfs: - tests/csv/3.0/mapped_joint_bam.csv - tests/test_concat_germline_vcfs.yml +## normalize all vcfs +normalize_vcfs: + - conf/modules/post_variant_calling.config + - modules/nf-core/bcftools/concat/** + - modules/nf-core/bcftools/mpileup/** + - modules/nf-core/bcftools/norm/** + - modules/nf-core/bcftools/sort/** + - modules/nf-core/deepvariant/** + - modules/nf-core/freebayes/** + - modules/nf-core/gatk4/haplotypecaller/** + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/manta/germline/** + - modules/nf-core/samtools/mpileup/** + - modules/nf-core/strelka/germline/** + - modules/nf-core/tabix/bgziptabix/** + - modules/nf-core/tabix/tabix/** + - modules/nf-core/tiddit/sv/** + - subworkflows/local/bam_variant_calling_deepvariant/** + - subworkflows/local/bam_variant_calling_freebayes/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_germline_manta/** + - subworkflows/local/bam_variant_calling_haplotypecaller/** + - subworkflows/local/bam_variant_calling_mpileup/** + - subworkflows/local/bam_variant_calling_single_strelka/** + - subworkflows/local/bam_variant_calling_single_tiddit/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - subworkflows/local/post_variantcalling/** + - subworkflows/local/vcf_normalization/** + - tests/csv/3.0/mapped_joint_bam.csv + - tests/test_normalize_vcfs.yml + # sampleqc ## ngscheckmate diff --git a/tests/test_concat_germline_vcfs.yml b/tests/test_concat_germline_vcfs.yml index 97a2453fbc..bbc1ca3113 100644 --- a/tests/test_concat_germline_vcfs.yml +++ b/tests/test_concat_germline_vcfs.yml @@ -4,19 +4,9 @@ - concatenate_vcfs files: - path: results/variant_calling/concat/testN/testN.germline.vcf.gz - # binary changes md5sums on reruns - contains: - [ - "SOURCE=testN.deepvariant.vcf.gz", - "AB=0.167832;ABP=277.102;AC=1;AF=0.5;AN=2;AO=48;CIGAR=1X;DP=286;DPB=286;DPRA=0;EPP=3.0103;EPPR=3.0103;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=1;NUMALT=1;ODDS=105.855;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=2017;QR=9863;RO=238;RPL=0;RPP=107.241;RPPR=519.821;RPR=48;RUN=1;SAF=24;SAP=3.0103;SAR=24;SRF=119;SRP=3.0103;SRR=119;TYPE=snp;technology.illumina=1;", - "SOURCE=testN.freebayes.vcf.gz", - "SNVHPOL=7;MQ=60;", - "SOURCE=testN.strelka.variants.vcf.gz", - "SOURCE=testN.bcftools.vcf.gz", - ] - path: results/variant_calling/concat/testT/testT.germline.vcf.gz - # binary changes md5sums on reruns + # binary changes md5sums on reruns - path: results/variant_calling/concat/testN/testN.germline.vcf.gz.tbi - # binary changes md5sums on reruns + # binary changes md5sums on reruns - path: results/variant_calling/concat/testT/testT.germline.vcf.gz.tbi - # binary changes md5sums on reruns + # binary changes md5sums on reruns diff --git a/tests/test_normalize_vcfs.yml b/tests/test_normalize_vcfs.yml new file mode 100644 index 0000000000..d2e6813fd3 --- /dev/null +++ b/tests/test_normalize_vcfs.yml @@ -0,0 +1,33 @@ +- name: Run all variant callers and check for existence of normalized vcf-files + command: nextflow run main.nf -profile test --input ./tests/csv/3.0/mapped_joint_bam.csv --normalize_vcfs --tools deepvariant,freebayes,haplotypecaller,manta,mpileup,strelka,tiddit --step variant_calling --outdir results + tags: + - normalize_vcfs + files: + - path: results/variant_calling/normalized/testN/testN.deepvariant.norm.vcf.gz + - path: results/variant_calling/normalized/testT/testT.deepvariant.norm.vcf.gz + - path: results/variant_calling/normalized/testN/testN.deepvariant.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testT/testT.deepvariant.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testN/testN.freebayes.norm.vcf.gz + - path: results/variant_calling/normalized/testT/testT.freebayes.norm.vcf.gz + - path: results/variant_calling/normalized/testN/testN.freebayes.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testT/testT.freebayes.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testN/testN.haplotypecaller.norm.vcf.gz + - path: results/variant_calling/normalized/testT/testT.haplotypecaller.norm.vcf.gz + - path: results/variant_calling/normalized/testN/testN.haplotypecaller.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testT/testT.haplotypecaller.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testN/testN.manta.norm.vcf.gz + - path: results/variant_calling/normalized/testT/testT.manta.norm.vcf.gz + - path: results/variant_calling/normalized/testN/testN.manta.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testT/testT.manta.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testN/testN.bcftools.norm.vcf.gz + - path: results/variant_calling/normalized/testT/testT.bcftools.norm.vcf.gz + - path: results/variant_calling/normalized/testN/testN.bcftools.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testT/testT.bcftools.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testN/testN.strelka.norm.vcf.gz + - path: results/variant_calling/normalized/testT/testT.strelka.norm.vcf.gz + - path: results/variant_calling/normalized/testN/testN.strelka.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testT/testT.strelka.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testN/testN.tiddit.norm.vcf.gz + - path: results/variant_calling/normalized/testT/testT.tiddit.norm.vcf.gz + - path: results/variant_calling/normalized/testN/testN.tiddit.norm.vcf.gz.tbi + - path: results/variant_calling/normalized/testT/testT.tiddit.norm.vcf.gz.tbi diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 7595c279e7..ae7805fe59 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -776,21 +776,32 @@ workflow SAREK { // POST VARIANTCALLING POST_VARIANTCALLING(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_all, - params.concatenate_vcfs) + BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.vcf_all, + BAM_VARIANT_CALLING_SOMATIC_ALL.out.vcf_all, + fasta, + params.concatenate_vcfs, + params.normalize_vcfs) // Gather vcf files for annotation and QC vcf_to_annotate = Channel.empty() - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_deepvariant) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_freebayes) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_haplotypecaller) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_manta) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_dnascope) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_haplotyper) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_strelka) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_tiddit) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_mpileup) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.vcf_all) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_SOMATIC_ALL.out.vcf_all) + + // Check if normalization is requested + if (params.normalize_vcfs) { + vcf_to_annotate = vcf_to_annotate.mix(POST_VARIANTCALLING.out.vcfs) + } else { + // If not normalized, gather existing VCFs + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_deepvariant) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_freebayes) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_haplotypecaller) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_manta) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_dnascope) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_haplotyper) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_strelka) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_tiddit) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_mpileup) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.vcf_all) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_SOMATIC_ALL.out.vcf_all) + } // QC VCF_QC_BCFTOOLS_VCFTOOLS(vcf_to_annotate, intervals_bed_combined)