From ac0b3ffd94fd49b11ae968d8ee364315fccbab3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 25 Nov 2024 10:34:13 +0100 Subject: [PATCH 01/11] add vep ad module --- modules.json | 58 +++++++-- .../nf-core/ensemblvep/vep/environment.yml | 5 + modules/nf-core/ensemblvep/vep/main.nf | 70 +++++++++++ modules/nf-core/ensemblvep/vep/meta.yml | 114 ++++++++++++++++++ .../nf-core/ensemblvep/vep/tests/main.nf.test | 114 ++++++++++++++++++ .../ensemblvep/vep/tests/main.nf.test.snap | 26 ++++ .../ensemblvep/vep/tests/nextflow.config | 12 ++ .../ensemblvep/vep/tests/tab.gz.config | 5 + modules/nf-core/ensemblvep/vep/tests/tags.yml | 2 + .../nf-core/ensemblvep/vep/tests/vcf.config | 5 + 10 files changed, 400 insertions(+), 11 deletions(-) create mode 100644 modules/nf-core/ensemblvep/vep/environment.yml create mode 100644 modules/nf-core/ensemblvep/vep/main.nf create mode 100644 modules/nf-core/ensemblvep/vep/meta.yml create mode 100644 modules/nf-core/ensemblvep/vep/tests/main.nf.test create mode 100644 modules/nf-core/ensemblvep/vep/tests/main.nf.test.snap create mode 100644 modules/nf-core/ensemblvep/vep/tests/nextflow.config create mode 100644 modules/nf-core/ensemblvep/vep/tests/tab.gz.config create mode 100644 modules/nf-core/ensemblvep/vep/tests/tags.yml create mode 100644 modules/nf-core/ensemblvep/vep/tests/vcf.config diff --git a/modules.json b/modules.json index be3070d..9a47197 100644 --- a/modules.json +++ b/modules.json @@ -8,37 +8,60 @@ "bcftools/view": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "ensemblvep/vep": { + "branch": "master", + "git_sha": "6e3585d9ad20b41adc7d271009f8cb5e191ecab4", + "installed_by": [ + "modules", + "vcf_annotate_ensemblvep" + ] }, "gunzip": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { 
"branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "picard/liftovervcf": { "branch": "master", "git_sha": "49f4e50534fe4b64101e62ea41d5dc43b1324358", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tabix/tabix": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules", + "vcf_annotate_ensemblvep" + ] }, "untar": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "vcf2maf": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/vcf2maf/vcf2maf.diff" } } @@ -48,20 +71,33 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "1b6b9a3338d011367137808b49b923515080e3ba", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfschema_plugin": { "branch": "master", "git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] + }, + "vcf_annotate_ensemblvep": { + "branch": "master", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/ensemblvep/vep/environment.yml b/modules/nf-core/ensemblvep/vep/environment.yml new file mode 100644 index 0000000..3d36eb1 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::ensembl-vep=113.0 diff --git 
a/modules/nf-core/ensemblvep/vep/main.nf b/modules/nf-core/ensemblvep/vep/main.nf new file mode 100644 index 0000000..7d2c82f --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/main.nf @@ -0,0 +1,70 @@ +process ENSEMBLVEP_VEP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ensembl-vep:113.0--pl5321h2a3209d_0' : + 'biocontainers/ensembl-vep:113.0--pl5321h2a3209d_0' }" + + input: + tuple val(meta), path(vcf), path(custom_extra_files) + val genome + val species + val cache_version + path cache + tuple val(meta2), path(fasta) + path extra_files + + output: + tuple val(meta), path("*.vcf.gz") , optional:true, emit: vcf + tuple val(meta), path("*.tab.gz") , optional:true, emit: tab + tuple val(meta), path("*.json.gz") , optional:true, emit: json + path "*.html" , optional:true, emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def file_extension = args.contains("--vcf") ? 'vcf' : args.contains("--json")? 'json' : args.contains("--tab")? 'tab' : 'vcf' + def compress_cmd = args.contains("--compress_output") ? '' : '--compress_output bgzip' + def prefix = task.ext.prefix ?: "${meta.id}" + def dir_cache = cache ? "\${PWD}/${cache}" : "/.vep" + def reference = fasta ? 
"--fasta $fasta" : "" + """ + vep \\ + -i $vcf \\ + -o ${prefix}.${file_extension}.gz \\ + $args \\ + $compress_cmd \\ + $reference \\ + --assembly $genome \\ + --species $species \\ + --cache \\ + --cache_version $cache_version \\ + --dir_cache $dir_cache \\ + --fork $task.cpus + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo "" | gzip > ${prefix}.vcf.gz + echo "" | gzip > ${prefix}.tab.gz + echo "" | gzip > ${prefix}.json.gz + touch ${prefix}_summary.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ensemblvep/vep/meta.yml b/modules/nf-core/ensemblvep/vep/meta.yml new file mode 100644 index 0000000..9288a93 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/meta.yml @@ -0,0 +1,114 @@ +name: ensemblvep_vep +description: Ensembl Variant Effect Predictor (VEP). The output-file-format is controlled + through `task.ext.args`. +keywords: + - annotation + - vcf + - json + - tab +tools: + - ensemblvep: + description: | + VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs + or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. + homepage: https://www.ensembl.org/info/docs/tools/vep/index.html + documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html + licence: ["Apache-2.0"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: | + vcf to annotate + - custom_extra_files: + type: file + description: | + extra sample-specific files to be used with the `--custom` flag to be configured with ext.args + (optional) + - - genome: + type: string + description: | + which genome to annotate with + - - species: + type: string + description: | + which species to annotate with + - - cache_version: + type: integer + description: | + which version of the cache to annotate with + - - cache: + type: file + description: | + path to VEP cache (optional) + - - meta2: + type: map + description: | + Groovy Map containing fasta reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: | + reference FASTA file (optional) + pattern: "*.{fasta,fa}" + - - extra_files: + type: file + description: | + path to file(s) needed for plugins (optional) +output: + - vcf: + - meta: + type: file + description: | + annotated vcf (optional) + pattern: "*.ann.vcf.gz" + - "*.vcf.gz": + type: file + description: | + annotated vcf (optional) + pattern: "*.ann.vcf.gz" + - tab: + - meta: + type: file + description: | + tab file with annotated variants (optional) + pattern: "*.ann.tab.gz" + - "*.tab.gz": + type: file + description: | + tab file with annotated variants (optional) + pattern: "*.ann.tab.gz" + - json: + - meta: + type: file + description: | + json file with annotated variants (optional) + pattern: "*.ann.json.gz" + - "*.json.gz": + type: file + description: | + json file with annotated variants (optional) + pattern: "*.ann.json.gz" + - report: + - "*.html": + type: file + description: VEP report file + pattern: "*.html" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" +maintainers: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" diff --git a/modules/nf-core/ensemblvep/vep/tests/main.nf.test 
b/modules/nf-core/ensemblvep/vep/tests/main.nf.test new file mode 100644 index 0000000..3e8c0b5 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/main.nf.test @@ -0,0 +1,114 @@ +nextflow_process { + + name "Test Process ENSEMBLVEP_VEP" + script "../main.nf" + process "ENSEMBLVEP_VEP" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "ensemblvep" + tag "ensemblvep/vep" + tag "ensemblvep/download" + + test("test_ensemblvep_vep_fasta_vcf") { + config "./vcf.config" + + setup { + run("ENSEMBLVEP_DOWNLOAD") { + script "../../download/main.nf" + + process { + """ + input[0] = Channel.of([ + [id:"113_WBcel235"], + params.vep_genome, + params.vep_species, + params.vep_cache_version + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true), + [] + ]) + input[1] = params.vep_genome + input[2] = params.vep_species + input[3] = params.vep_cache_version + input[4] = ENSEMBLVEP_DOWNLOAD.out.cache.map{ meta, cache -> [cache] } + input[5] = Channel.value([ + [id:"fasta"], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[6] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert path(process.out.vcf.get(0).get(1)).linesGzip.contains("##fileformat=VCFv4.2") } + ) + } + + } + + test("test_ensemblvep_vep_fasta_tab_gz") { + config "./tab.gz.config" + + setup { + run("ENSEMBLVEP_DOWNLOAD") { + script "../../download/main.nf" + + process { + """ + input[0] = Channel.of([ + [id:"113_WBcel235"], + params.vep_genome, + params.vep_species, + params.vep_cache_version + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', 
checkIfExists: true), + [] + ]) + input[1] = params.vep_genome + input[2] = params.vep_species + input[3] = params.vep_cache_version + input[4] = ENSEMBLVEP_DOWNLOAD.out.cache.map{ meta, cache -> [cache] } + input[5] = Channel.value([ + [id:"fasta"], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[6] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert path(process.out.tab.get(0).get(1)).linesGzip.contains("## ENSEMBL VARIANT EFFECT PREDICTOR v113.0") } + ) + } + } +} diff --git a/modules/nf-core/ensemblvep/vep/tests/main.nf.test.snap b/modules/nf-core/ensemblvep/vep/tests/main.nf.test.snap new file mode 100644 index 0000000..1df9427 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/main.nf.test.snap @@ -0,0 +1,26 @@ +{ + "test_ensemblvep_vep_fasta_tab_gz": { + "content": [ + [ + "versions.yml:md5,4fbfeb73f0d4b4aa039f17be8ba9e1f2" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-21T09:12:23.474703494" + }, + "test_ensemblvep_vep_fasta_vcf": { + "content": [ + [ + "versions.yml:md5,4fbfeb73f0d4b4aa039f17be8ba9e1f2" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-21T09:11:54.343590485" + } +} \ No newline at end of file diff --git a/modules/nf-core/ensemblvep/vep/tests/nextflow.config b/modules/nf-core/ensemblvep/vep/tests/nextflow.config new file mode 100644 index 0000000..0a4ae1a --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/nextflow.config @@ -0,0 +1,12 @@ +params { + vep_cache_version = "113" + vep_genome = "WBcel235" + vep_species = "caenorhabditis_elegans" +} + +process { + withName: ENSEMBLVEP_DOWNLOAD { + ext.args = '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' + ext.prefix = { "${params.vep_cache_version}_${params.vep_genome}" } + } +} diff --git 
a/modules/nf-core/ensemblvep/vep/tests/tab.gz.config b/modules/nf-core/ensemblvep/vep/tests/tab.gz.config new file mode 100644 index 0000000..40eb03e --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/tab.gz.config @@ -0,0 +1,5 @@ +process { + withName: ENSEMBLVEP_VEP { + ext.args = '--tab --compress_output bgzip' + } +} diff --git a/modules/nf-core/ensemblvep/vep/tests/tags.yml b/modules/nf-core/ensemblvep/vep/tests/tags.yml new file mode 100644 index 0000000..4aa4aa4 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/tags.yml @@ -0,0 +1,2 @@ +ensemblvep/vep: + - "modules/nf-core/ensemblvep/vep/**" diff --git a/modules/nf-core/ensemblvep/vep/tests/vcf.config b/modules/nf-core/ensemblvep/vep/tests/vcf.config new file mode 100644 index 0000000..ad8955a --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/vcf.config @@ -0,0 +1,5 @@ +process { + withName: ENSEMBLVEP_VEP { + ext.args = '--vcf' + } +} From 45a8db47130b32ac9c89382093df5f082e4daef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 25 Nov 2024 10:34:29 +0100 Subject: [PATCH 02/11] add annotation sbwf --- .../nf-core/vcf_annotate_ensemblvep/main.nf | 45 +++++++++++++ .../nf-core/vcf_annotate_ensemblvep/meta.yml | 65 +++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf create mode 100644 subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml diff --git a/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf b/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf new file mode 100644 index 0000000..291eddc --- /dev/null +++ b/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf @@ -0,0 +1,45 @@ +// +// Run VEP to annotate VCF files +// + +include { ENSEMBLVEP_VEP } from '../../../modules/nf-core/ensemblvep/vep/main' +include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' + +workflow VCF_ANNOTATE_ENSEMBLVEP { + take: + ch_vcf // channel: [ val(meta), path(vcf), [path(custom_file1), 
path(custom_file2)... (optionnal)]] + ch_fasta // channel: [ val(meta2), path(fasta) ] (optional) + val_genome // value: genome to use + val_species // value: species to use + val_cache_version // value: cache version to use + ch_cache // channel: [ val(meta3), path(cache) ] (optional) + ch_extra_files // channel: [ path(file1), path(file2)... ] (optional) + + main: + ch_versions = Channel.empty() + + ENSEMBLVEP_VEP( + ch_vcf, + val_genome, + val_species, + val_cache_version, + ch_cache, + ch_fasta, + ch_extra_files + ) + + TABIX_TABIX(ENSEMBLVEP_VEP.out.vcf) + + ch_vcf_tbi = ENSEMBLVEP_VEP.out.vcf.join(TABIX_TABIX.out.tbi, failOnDuplicate: true, failOnMismatch: true) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions) + ch_versions = ch_versions.mix(TABIX_TABIX.out.versions) + + emit: + vcf_tbi = ch_vcf_tbi // channel: [ val(meta), path(vcf), path(tbi) ] + json = ENSEMBLVEP_VEP.out.json // channel: [ val(meta), path(json) ] + tab = ENSEMBLVEP_VEP.out.tab // channel: [ val(meta), path(tab) ] + reports = ENSEMBLVEP_VEP.out.report // channel: [ path(html) ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml b/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml new file mode 100644 index 0000000..15d42da --- /dev/null +++ b/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml @@ -0,0 +1,65 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: vcf_annotate_ensemblvep +description: Perform annotation with ensemblvep and bgzip + tabix index the resulting VCF file +keywords: + - vcf + - annotation + - ensemblvep +components: + - ensemblvep/vep + - tabix/tabix +input: + - ch_vcf: + description: | + vcf file to annotate + Structure: [ val(meta), path(vcf), [path(custom_file1), path(custom_file2)... 
(optionnal)] ] + - ch_fasta: + description: | + Reference genome fasta file (optional) + Structure: [ val(meta2), path(fasta) ] + - val_genome: + type: string + description: genome to use + - val_species: + type: string + description: species to use + - val_cache_version: + type: integer + description: cache version to use + - ch_cache: + description: | + the root cache folder for ensemblvep (optional) + Structure: [ val(meta3), path(cache) ] + - ch_extra_files: + description: | + any extra files needed by plugins for ensemblvep (optional) + Structure: [ path(file1), path(file2)... ] +output: + - vcf_tbi: + description: | + Compressed vcf file + tabix index + Structure: [ val(meta), path(vcf), path(tbi) ] + - json: + description: | + json file + Structure: [ val(meta), path(json) ] + - tab: + description: | + tab file + Structure: [ val(meta), path(tab) ] + - reports: + type: file + description: html reports + pattern: "*.html" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" +maintainers: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" From eedf08b5c000e3f66bb059a03e0c081b40dd5a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 25 Nov 2024 10:55:48 +0100 Subject: [PATCH 03/11] wip --- main.nf | 17 +++++++++++---- workflows/vcftomaf/main.nf | 44 ++++++++++++++++++++++++++------------ 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/main.nf b/main.nf index b5db4f8..f878568 100644 --- a/main.nf +++ b/main.nf @@ -37,8 +37,11 @@ workflow QBICPIPELINES_VCFTOMAF { // // SET PARAMETERS // - params.fasta = getGenomeAttribute('fasta') - params.dict = getGenomeAttribute('dict') + params.fasta = getGenomeAttribute('fasta') + params.dict = getGenomeAttribute('dict') + params.vep_cache_version = getGenomeAttribute('vep_cache_version') + params.vep_genome = getGenomeAttribute('vep_genome') + params.vep_species = 
getGenomeAttribute('vep_species') // Extra files intervals = params.intervals ? Channel.fromPath(params.intervals).collect() : Channel.value([]) @@ -52,8 +55,11 @@ workflow QBICPIPELINES_VCFTOMAF { genome = params.genome ?: Channel.empty() // VEP cache - vep_cache = Channel.value([]) //params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : Channel.value([]) + vep_cache = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : Channel.value([]) vep_cache_unpacked = Channel.value([]) + vep_cache_version = params.vep_cache_version + vep_genome = params.vep_genome + vep_species = params.vep_species // @@ -68,7 +74,10 @@ workflow QBICPIPELINES_VCFTOMAF { liftover_chain, genome, vep_cache, - vep_cache_unpacked + vep_cache_unpacked, + vep_cache_version, + vep_genome, + vep_species ) emit: diff --git a/workflows/vcftomaf/main.nf b/workflows/vcftomaf/main.nf index 25ac296..cebd36c 100644 --- a/workflows/vcftomaf/main.nf +++ b/workflows/vcftomaf/main.nf @@ -18,6 +18,7 @@ include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../../subworkflows/local/utils_nfcore_vcftomaf_pipeline' +include { VCF_ANNOTATE_ENSEMBLVEP } from '../../subworkflows/nf-core/vcf_annotate_ensemblvep/main' /* @@ -37,6 +38,9 @@ workflow VCFTOMAF { genome vep_cache vep_cache_unpacked + vep_cache_version + vep_genome + vep_species main: @@ -47,20 +51,6 @@ workflow VCFTOMAF { // SUBWORKFLOW: Read in samplesheet, validate and stage input files // - // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335 - // if (params.vep_cache){ - // ch_vep_cache = vep_cache.map{ - // it -> def new_id = "" - // if(it) { - // new_id = it[0].simpleName.toString() - // } - // [[id:new_id], it] - // } - // // UNTAR if available - // 
vep_cache_unpacked = UNTAR(ch_vep_cache).untar.map { it[1] } - // ch_versions = ch_versions.mix(UNTAR.out.versions) - // } - // BRANCH CHANNEL ch_samplesheet.branch{ is_indexed: it[0].index == true @@ -80,6 +70,32 @@ workflow VCFTOMAF { // Join both channels back together ch_vcf = ch_input.is_indexed.mix(ch_indexed_to_index) + // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335 + // Therefore we use the vcf_annotate_ensemblvep subworkflow here + + if (params.vep_cache){ + ch_vep_cache = vep_cache.map{ + it -> def new_id = "" + if(it) { + new_id = it[0].simpleName.toString() + } + [[id:new_id], it] + } + // UNTAR if available + vep_cache_unpacked = UNTAR(ch_vep_cache).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR.out.versions) + } + + VCF_ANNOTATE_ENSEMBLVEP( + ch_vcf, + fasta, + vep_genome, + vep_species,// species + vep_cache_version, // cache_version + vep_cache_unpacked, // ch_cache + [] // ch_extra_files + ) + // // MODULE: Run PASS + BED filtering // From cbc8eca2e5c9e9c88852c3c4cbded78c103412f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 25 Nov 2024 11:04:10 +0100 Subject: [PATCH 04/11] add vep --- nextflow.config | 4 ++-- workflows/vcftomaf/main.nf | 37 +++++++++++++++++++++---------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/nextflow.config b/nextflow.config index dc47825..f547e96 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,8 +13,8 @@ params { liftover_chain = null input = null intervals = null - //vep_cache = null - //run_vep = false + vep_cache = null + run_vep = false filter = false // References diff --git a/workflows/vcftomaf/main.nf b/workflows/vcftomaf/main.nf index cebd36c..97343b9 100644 --- a/workflows/vcftomaf/main.nf +++ b/workflows/vcftomaf/main.nf @@ -70,23 +70,24 @@ workflow VCFTOMAF { // Join both channels back together ch_vcf = ch_input.is_indexed.mix(ch_indexed_to_index) - // VEP annotation is 
currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335 - // Therefore we use the vcf_annotate_ensemblvep subworkflow here - - if (params.vep_cache){ - ch_vep_cache = vep_cache.map{ - it -> def new_id = "" - if(it) { - new_id = it[0].simpleName.toString() - } - [[id:new_id], it] + if (params.run_vep) { + // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335 + // Therefore we use the vcf_annotate_ensemblvep subworkflow here + + if (params.vep_cache){ + ch_vep_cache = vep_cache.map{ + it -> def new_id = "" + if(it) { + new_id = it[0].simpleName.toString() + } + [[id:new_id], it] + } + // UNTAR if available + vep_cache_unpacked = UNTAR(ch_vep_cache).untar.map { it[1] } + ch_versions = ch_versions.mix(UNTAR.out.versions) } - // UNTAR if available - vep_cache_unpacked = UNTAR(ch_vep_cache).untar.map { it[1] } - ch_versions = ch_versions.mix(UNTAR.out.versions) - } - VCF_ANNOTATE_ENSEMBLVEP( + VCF_ANNOTATE_ENSEMBLVEP( ch_vcf, fasta, vep_genome, @@ -94,7 +95,11 @@ workflow VCFTOMAF { vep_cache_version, // cache_version vep_cache_unpacked, // ch_cache [] // ch_extra_files - ) + ) + ch_vcf = VCF_ANNOTATE_ENSEMBLVEP.out.vcf_tbi + ch_versions = ch_versions.mix(VCF_ANNOTATE_ENSEMBLVEP.out.versions) + } + // // MODULE: Run PASS + BED filtering From 553fb292251ad5661ed701aaac360680a2da0788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 25 Nov 2024 12:46:52 +0100 Subject: [PATCH 05/11] add vep --- conf/test.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/test.config b/conf/test.config index a205f58..adca71e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,5 +29,6 @@ params { intervals = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.bed' genome = 'GATK.GRCh38' filter = false + run_vep = true } From a6681a816a0d7cbbce7e88764cb2509d68102bc9 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 25 Nov 2024 12:59:29 +0100 Subject: [PATCH 06/11] fix typo --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 76e7af8..5ff06b9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -94,7 +94,7 @@ Chain file, if liftover should be done The input vcf files are PASS filtered by default. Additionally, if the path to a file containing the targeted intervals for panel sequencing data is specified, off-target regions will be filtered out. -### `--filter`` +### `--filter` Boolean flag to enable filtering of the variants keeping only variants marked as `PASS` when set to `true`. Default is false. From 4599b6c66f986101ea8fbe475c3c4ceca2c8e3c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 25 Nov 2024 13:34:34 +0100 Subject: [PATCH 07/11] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4eccdfc..e84b4ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#21](https://github.com/qbic-pipelines/vcftomaf/pull/21) - Add VEP annotation via subworkflow (@famosab) + ### `Fixed` ### `Dependencies` From 8bb590582bf4d5c4482673c07878de3c7bcddafa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Tue, 26 Nov 2024 10:30:07 +0100 Subject: [PATCH 08/11] add meta to fasta --- workflows/vcftomaf/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/vcftomaf/main.nf b/workflows/vcftomaf/main.nf index 97343b9..571bc4d 100644 --- a/workflows/vcftomaf/main.nf +++ b/workflows/vcftomaf/main.nf @@ -89,7 +89,7 @@ workflow VCFTOMAF { VCF_ANNOTATE_ENSEMBLVEP( ch_vcf, - fasta, + fasta.map{ it -> [ [ id:it.baseName ], it ] }, vep_genome, vep_species,// species vep_cache_version, // cache_version From 4ea07b761d876e404ec3dfdd0bba772572ffaf12 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Tue, 26 Nov 2024 10:31:46 +0100 Subject: [PATCH 09/11] prettier --- modules.json | 52 +++++++++++++--------------------------------------- 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/modules.json b/modules.json index 9a47197..e7aed3a 100644 --- a/modules.json +++ b/modules.json @@ -8,60 +8,42 @@ "bcftools/view": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "ensemblvep/vep": { "branch": "master", "git_sha": "6e3585d9ad20b41adc7d271009f8cb5e191ecab4", - "installed_by": [ - "modules", - "vcf_annotate_ensemblvep" - ] + "installed_by": ["modules", "vcf_annotate_ensemblvep"] }, "gunzip": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "picard/liftovervcf": { "branch": "master", "git_sha": "49f4e50534fe4b64101e62ea41d5dc43b1324358", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tabix/tabix": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules", - "vcf_annotate_ensemblvep" - ] + "installed_by": ["modules", "vcf_annotate_ensemblvep"] }, "untar": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "vcf2maf": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/vcf2maf/vcf2maf.diff" } } @@ -71,33 +53,25 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082", - "installed_by": [ - "subworkflows" - ] + 
"installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "1b6b9a3338d011367137808b49b923515080e3ba", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", "git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "vcf_annotate_ensemblvep": { "branch": "master", "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} From 299dc778afaeef49ead6c19b7a9862e3433ec56e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Tue, 26 Nov 2024 10:37:42 +0100 Subject: [PATCH 10/11] add VEP options to schema --- nextflow_schema.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index d099aed..da7f89e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -108,6 +108,34 @@ } } }, + "annotation_options": { + "title": "Annotation options", + "type": "object", + "description": "Define parameters for VEP annotation", + "default": "", + "properties": { + "run_vep": { + "type": "boolean", + "description": "True if you want to annotate with VEP" + }, + "vep_cache": { + "type": "string", + "description": "Path to VEP cache which should contain the relevant species, genome and build directories at the path ${vep_species}/${vep_genome}_${vep_cache_version}" + }, + "vep_cache_version": { + "type": "string", + "description": "Alternatively cache version can be used to specify the correct Ensembl Genomes version number as these differ from the concurrent Ensembl/VEP version numbers" + }, + "vep_genome": { + "type": "string", + "description": "This is used to specify the genome when looking for local cache, or cloud based cache." 
+ }, + "vep_species": { + "type": "string", + "description": "Alternatively species listed in Ensembl Genomes caches can be used." + } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -255,6 +283,9 @@ { "$ref": "#/$defs/reference_genome_options" }, + { + "$ref": "#/$defs/annotation_options" + }, { "$ref": "#/$defs/institutional_config_options" }, From ffa2cf97d9ea0c354e3691be897d852c86e79af9 Mon Sep 17 00:00:00 2001 From: famosab Date: Tue, 26 Nov 2024 12:37:49 +0100 Subject: [PATCH 11/11] current status --- conf/modules.config | 5 +++ workflows/vcftomaf/main.nf | 66 +++++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f6774ab..ac4d085 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -57,6 +57,7 @@ process { //specify to avoid publishing, overwritten otherwise enabled: false ] + ext.args = { '-f' } } withName: UNTAR { @@ -93,4 +94,8 @@ process { ] } + withName: 'ENSEMBLVEP_VEP' { + ext.args = { '--vcf --compress_output bgzip' } + } + } diff --git a/workflows/vcftomaf/main.nf b/workflows/vcftomaf/main.nf index 571bc4d..05042ea 100644 --- a/workflows/vcftomaf/main.nf +++ b/workflows/vcftomaf/main.nf @@ -18,7 +18,7 @@ include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../../subworkflows/local/utils_nfcore_vcftomaf_pipeline' -include { VCF_ANNOTATE_ENSEMBLVEP } from '../../subworkflows/nf-core/vcf_annotate_ensemblvep/main' +include { ENSEMBLVEP_VEP } from '../../modules/nf-core/ensemblvep/vep/main' /* @@ -51,25 +51,6 @@ workflow VCFTOMAF { // SUBWORKFLOW: Read in samplesheet, validate and stage input files // - // BRANCH CHANNEL - ch_samplesheet.branch{ - is_indexed: it[0].index == true 
- to_index: it[0].index == false - }.set{ch_input} - - // Remove empty index [] from channel = it[2] - input_to_index = ch_input.to_index.map{ it -> [it[0], it[1]] } - - // Create tbi index only if not provided - TABIX_TABIX(input_to_index) - ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first()) - - // Join tbi index back to input - ch_indexed_to_index = input_to_index.join(TABIX_TABIX.out.tbi) - - // Join both channels back together - ch_vcf = ch_input.is_indexed.mix(ch_indexed_to_index) - if (params.run_vep) { // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335 // Therefore we use the vcf_annotate_ensemblvep subworkflow here @@ -87,19 +68,46 @@ workflow VCFTOMAF { ch_versions = ch_versions.mix(UNTAR.out.versions) } - VCF_ANNOTATE_ENSEMBLVEP( - ch_vcf, - fasta.map{ it -> [ [ id:it.baseName ], it ] }, + ENSEMBLVEP_VEP( + ch_samplesheet, vep_genome, - vep_species,// species - vep_cache_version, // cache_version - vep_cache_unpacked, // ch_cache - [] // ch_extra_files + vep_species, + vep_cache_version, + vep_cache_unpacked, + fasta.map{ it -> [ [ id:it.baseName ], it ] }, + [] ) - ch_vcf = VCF_ANNOTATE_ENSEMBLVEP.out.vcf_tbi - ch_versions = ch_versions.mix(VCF_ANNOTATE_ENSEMBLVEP.out.versions) + ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions) + + TABIX_TABIX(ENSEMBLVEP_VEP.out.vcf) + ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first()) + + ENSEMBLVEP_VEP.out.vcf.dump(tag: 'vcf-before') + + // Join tbi index back to input + ch_vcf = ENSEMBLVEP_VEP.out.vcf.join(TABIX_TABIX.out.tbi) + ch_vcf.dump(tag: 'vcf-sfter') } + else { + // BRANCH CHANNEL + ch_samplesheet.branch{ + is_indexed: it[0].index == true + to_index: it[0].index == false + }.set{ch_input} + // Remove empty index [] from channel = it[2] + input_to_index = ch_input.to_index.map{ it -> [it[0], it[1]] } + + // Create tbi index only if not provided + TABIX_TABIX(input_to_index) + ch_versions = 
ch_versions.mix(TABIX_TABIX.out.versions.first()) + + // Join tbi index back to input + ch_indexed_to_index = input_to_index.join(TABIX_TABIX.out.tbi) + + // Join both channels back together + ch_vcf = ch_input.is_indexed.mix(ch_indexed_to_index) + } // // MODULE: Run PASS + BED filtering