From ac0b3ffd94fd49b11ae968d8ee364315fccbab3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Mon, 25 Nov 2024 10:34:13 +0100
Subject: [PATCH 01/11] add vep as module

---
 modules.json                                  |  58 +++++++--
 .../nf-core/ensemblvep/vep/environment.yml    |   5 +
 modules/nf-core/ensemblvep/vep/main.nf        |  70 +++++++++++
 modules/nf-core/ensemblvep/vep/meta.yml       | 114 ++++++++++++++++++
 .../nf-core/ensemblvep/vep/tests/main.nf.test | 114 ++++++++++++++++++
 .../ensemblvep/vep/tests/main.nf.test.snap    |  26 ++++
 .../ensemblvep/vep/tests/nextflow.config      |  12 ++
 .../ensemblvep/vep/tests/tab.gz.config        |   5 +
 modules/nf-core/ensemblvep/vep/tests/tags.yml |   2 +
 .../nf-core/ensemblvep/vep/tests/vcf.config   |   5 +
 10 files changed, 400 insertions(+), 11 deletions(-)
 create mode 100644 modules/nf-core/ensemblvep/vep/environment.yml
 create mode 100644 modules/nf-core/ensemblvep/vep/main.nf
 create mode 100644 modules/nf-core/ensemblvep/vep/meta.yml
 create mode 100644 modules/nf-core/ensemblvep/vep/tests/main.nf.test
 create mode 100644 modules/nf-core/ensemblvep/vep/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/ensemblvep/vep/tests/nextflow.config
 create mode 100644 modules/nf-core/ensemblvep/vep/tests/tab.gz.config
 create mode 100644 modules/nf-core/ensemblvep/vep/tests/tags.yml
 create mode 100644 modules/nf-core/ensemblvep/vep/tests/vcf.config

diff --git a/modules.json b/modules.json
index be3070d..9a47197 100644
--- a/modules.json
+++ b/modules.json
@@ -8,37 +8,60 @@
                     "bcftools/view": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
+                    },
+                    "ensemblvep/vep": {
+                        "branch": "master",
+                        "git_sha": "6e3585d9ad20b41adc7d271009f8cb5e191ecab4",
+                        "installed_by": [
+                            "modules",
+                            "vcf_annotate_ensemblvep"
+                        ]
                     },
                     "gunzip": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "picard/liftovervcf": {
                         "branch": "master",
                         "git_sha": "49f4e50534fe4b64101e62ea41d5dc43b1324358",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "tabix/tabix": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules",
+                            "vcf_annotate_ensemblvep"
+                        ]
                     },
                     "untar": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": ["modules"]
+                        "installed_by": [
+                            "modules"
+                        ]
                     },
                     "vcf2maf": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": ["modules"],
+                        "installed_by": [
+                            "modules"
+                        ],
                         "patch": "modules/nf-core/vcf2maf/vcf2maf.diff"
                     }
                 }
@@ -48,20 +71,33 @@
                     "utils_nextflow_pipeline": {
                         "branch": "master",
                         "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     },
                     "utils_nfcore_pipeline": {
                         "branch": "master",
                         "git_sha": "1b6b9a3338d011367137808b49b923515080e3ba",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     },
                     "utils_nfschema_plugin": {
                         "branch": "master",
                         "git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c",
-                        "installed_by": ["subworkflows"]
+                        "installed_by": [
+                            "subworkflows"
+                        ]
+                    },
+                    "vcf_annotate_ensemblvep": {
+                        "branch": "master",
+                        "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f",
+                        "installed_by": [
+                            "subworkflows"
+                        ]
                     }
                 }
             }
         }
     }
-}
+}
\ No newline at end of file
diff --git a/modules/nf-core/ensemblvep/vep/environment.yml b/modules/nf-core/ensemblvep/vep/environment.yml
new file mode 100644
index 0000000..3d36eb1
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/environment.yml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::ensembl-vep=113.0
diff --git a/modules/nf-core/ensemblvep/vep/main.nf b/modules/nf-core/ensemblvep/vep/main.nf
new file mode 100644
index 0000000..7d2c82f
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/main.nf
@@ -0,0 +1,70 @@
+process ENSEMBLVEP_VEP {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ensembl-vep:113.0--pl5321h2a3209d_0' :
+        'biocontainers/ensembl-vep:113.0--pl5321h2a3209d_0' }"
+
+    input:
+    tuple val(meta), path(vcf), path(custom_extra_files)
+    val   genome
+    val   species
+    val   cache_version
+    path  cache
+    tuple val(meta2), path(fasta)
+    path  extra_files
+
+    output:
+    tuple val(meta), path("*.vcf.gz")  , optional:true, emit: vcf
+    tuple val(meta), path("*.tab.gz")  , optional:true, emit: tab
+    tuple val(meta), path("*.json.gz") , optional:true, emit: json
+    path "*.html"                      , optional:true, emit: report
+    path "versions.yml"                , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def file_extension = args.contains("--vcf") ? 'vcf' : args.contains("--json")? 'json' : args.contains("--tab")? 'tab' : 'vcf'
+    def compress_cmd = args.contains("--compress_output") ? '' : '--compress_output bgzip'
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def dir_cache = cache ? "\${PWD}/${cache}" : "/.vep"
+    def reference = fasta ? "--fasta $fasta" : ""
+    """
+    vep \\
+        -i $vcf \\
+        -o ${prefix}.${file_extension}.gz \\
+        $args \\
+        $compress_cmd \\
+        $reference \\
+        --assembly $genome \\
+        --species $species \\
+        --cache \\
+        --cache_version $cache_version \\
+        --dir_cache $dir_cache \\
+        --fork $task.cpus
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    echo "" | gzip > ${prefix}.vcf.gz
+    echo "" | gzip > ${prefix}.tab.gz
+    echo "" | gzip > ${prefix}.json.gz
+    touch ${prefix}_summary.html
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/ensemblvep/vep/meta.yml b/modules/nf-core/ensemblvep/vep/meta.yml
new file mode 100644
index 0000000..9288a93
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/meta.yml
@@ -0,0 +1,114 @@
+name: ensemblvep_vep
+description: Ensembl Variant Effect Predictor (VEP). The output-file-format is controlled
+  through `task.ext.args`.
+keywords:
+  - annotation
+  - vcf
+  - json
+  - tab
+tools:
+  - ensemblvep:
+      description: |
+        VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs
+        or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions.
+      homepage: https://www.ensembl.org/info/docs/tools/vep/index.html
+      documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html
+      licence: ["Apache-2.0"]
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - vcf:
+        type: file
+        description: |
+          vcf to annotate
+    - custom_extra_files:
+        type: file
+        description: |
+          extra sample-specific files to be used with the `--custom` flag to be configured with ext.args
+          (optional)
+  - - genome:
+        type: string
+        description: |
+          which genome to annotate with
+  - - species:
+        type: string
+        description: |
+          which species to annotate with
+  - - cache_version:
+        type: integer
+        description: |
+          which version of the cache to annotate with
+  - - cache:
+        type: file
+        description: |
+          path to VEP cache (optional)
+  - - meta2:
+        type: map
+        description: |
+          Groovy Map containing fasta reference information
+          e.g. [ id:'test' ]
+    - fasta:
+        type: file
+        description: |
+          reference FASTA file (optional)
+        pattern: "*.{fasta,fa}"
+  - - extra_files:
+        type: file
+        description: |
+          path to file(s) needed for plugins  (optional)
+output:
+  - vcf:
+      - meta:
+          type: file
+          description: |
+            annotated vcf (optional)
+          pattern: "*.ann.vcf.gz"
+      - "*.vcf.gz":
+          type: file
+          description: |
+            annotated vcf (optional)
+          pattern: "*.ann.vcf.gz"
+  - tab:
+      - meta:
+          type: file
+          description: |
+            tab file with annotated variants (optional)
+          pattern: "*.ann.tab.gz"
+      - "*.tab.gz":
+          type: file
+          description: |
+            tab file with annotated variants (optional)
+          pattern: "*.ann.tab.gz"
+  - json:
+      - meta:
+          type: file
+          description: |
+            json file with annotated variants (optional)
+          pattern: "*.ann.json.gz"
+      - "*.json.gz":
+          type: file
+          description: |
+            json file with annotated variants (optional)
+          pattern: "*.ann.json.gz"
+  - report:
+      - "*.html":
+          type: file
+          description: VEP report file
+          pattern: "*.html"
+  - versions:
+      - versions.yml:
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+authors:
+  - "@maxulysse"
+  - "@matthdsm"
+  - "@nvnieuwk"
+maintainers:
+  - "@maxulysse"
+  - "@matthdsm"
+  - "@nvnieuwk"
diff --git a/modules/nf-core/ensemblvep/vep/tests/main.nf.test b/modules/nf-core/ensemblvep/vep/tests/main.nf.test
new file mode 100644
index 0000000..3e8c0b5
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/tests/main.nf.test
@@ -0,0 +1,114 @@
+nextflow_process {
+
+    name "Test Process ENSEMBLVEP_VEP"
+    script "../main.nf"
+    process "ENSEMBLVEP_VEP"
+    config "./nextflow.config"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "ensemblvep"
+    tag "ensemblvep/vep"
+    tag "ensemblvep/download"
+
+    test("test_ensemblvep_vep_fasta_vcf") {
+        config "./vcf.config"
+
+        setup {
+            run("ENSEMBLVEP_DOWNLOAD") {
+                script "../../download/main.nf"
+
+                process {
+                    """
+                    input[0] = Channel.of([
+                        [id:"113_WBcel235"],
+                        params.vep_genome,
+                        params.vep_species,
+                        params.vep_cache_version
+                    ])
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = Channel.of([
+                    [ id:'test' ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true),
+                    []
+                ])
+                input[1] = params.vep_genome
+                input[2] = params.vep_species
+                input[3] = params.vep_cache_version
+                input[4] = ENSEMBLVEP_DOWNLOAD.out.cache.map{ meta, cache -> [cache] }
+                input[5] = Channel.value([
+                    [id:"fasta"],
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ])
+                input[6] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.versions).match() },
+                { assert path(process.out.vcf.get(0).get(1)).linesGzip.contains("##fileformat=VCFv4.2") }
+            )
+        }
+
+    }
+
+    test("test_ensemblvep_vep_fasta_tab_gz") {
+        config "./tab.gz.config"
+
+        setup {
+            run("ENSEMBLVEP_DOWNLOAD") {
+                script "../../download/main.nf"
+
+                process {
+                    """
+                    input[0] = Channel.of([
+                        [id:"113_WBcel235"],
+                        params.vep_genome,
+                        params.vep_species,
+                        params.vep_cache_version
+                    ])
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = Channel.of([
+                    [ id:'test' ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true),
+                    []
+                ])
+                input[1] = params.vep_genome
+                input[2] = params.vep_species
+                input[3] = params.vep_cache_version
+                input[4] = ENSEMBLVEP_DOWNLOAD.out.cache.map{ meta, cache -> [cache] }
+                input[5] = Channel.value([
+                    [id:"fasta"],
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ])
+                input[6] = []
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out.versions).match() },
+                { assert path(process.out.tab.get(0).get(1)).linesGzip.contains("## ENSEMBL VARIANT EFFECT PREDICTOR v113.0") }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/ensemblvep/vep/tests/main.nf.test.snap b/modules/nf-core/ensemblvep/vep/tests/main.nf.test.snap
new file mode 100644
index 0000000..1df9427
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/tests/main.nf.test.snap
@@ -0,0 +1,26 @@
+{
+    "test_ensemblvep_vep_fasta_tab_gz": {
+        "content": [
+            [
+                "versions.yml:md5,4fbfeb73f0d4b4aa039f17be8ba9e1f2"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-10-21T09:12:23.474703494"
+    },
+    "test_ensemblvep_vep_fasta_vcf": {
+        "content": [
+            [
+                "versions.yml:md5,4fbfeb73f0d4b4aa039f17be8ba9e1f2"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.0",
+            "nextflow": "24.04.4"
+        },
+        "timestamp": "2024-10-21T09:11:54.343590485"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/ensemblvep/vep/tests/nextflow.config b/modules/nf-core/ensemblvep/vep/tests/nextflow.config
new file mode 100644
index 0000000..0a4ae1a
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/tests/nextflow.config
@@ -0,0 +1,12 @@
+params {
+    vep_cache_version   = "113"
+    vep_genome          = "WBcel235"
+    vep_species         = "caenorhabditis_elegans"
+}
+
+process {
+    withName: ENSEMBLVEP_DOWNLOAD {
+        ext.args    = '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE'
+        ext.prefix  = { "${params.vep_cache_version}_${params.vep_genome}" }
+    }
+}
diff --git a/modules/nf-core/ensemblvep/vep/tests/tab.gz.config b/modules/nf-core/ensemblvep/vep/tests/tab.gz.config
new file mode 100644
index 0000000..40eb03e
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/tests/tab.gz.config
@@ -0,0 +1,5 @@
+process {
+        withName: ENSEMBLVEP_VEP {
+        ext.args = '--tab --compress_output bgzip'
+    }
+}
diff --git a/modules/nf-core/ensemblvep/vep/tests/tags.yml b/modules/nf-core/ensemblvep/vep/tests/tags.yml
new file mode 100644
index 0000000..4aa4aa4
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/tests/tags.yml
@@ -0,0 +1,2 @@
+ensemblvep/vep:
+  - "modules/nf-core/ensemblvep/vep/**"
diff --git a/modules/nf-core/ensemblvep/vep/tests/vcf.config b/modules/nf-core/ensemblvep/vep/tests/vcf.config
new file mode 100644
index 0000000..ad8955a
--- /dev/null
+++ b/modules/nf-core/ensemblvep/vep/tests/vcf.config
@@ -0,0 +1,5 @@
+process {
+        withName: ENSEMBLVEP_VEP {
+        ext.args = '--vcf'
+    }
+}

From 45a8db47130b32ac9c89382093df5f082e4daef3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Mon, 25 Nov 2024 10:34:29 +0100
Subject: [PATCH 02/11] add annotation subworkflow

---
 .../nf-core/vcf_annotate_ensemblvep/main.nf   | 45 +++++++++++++
 .../nf-core/vcf_annotate_ensemblvep/meta.yml  | 65 +++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf
 create mode 100644 subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml

diff --git a/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf b/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf
new file mode 100644
index 0000000..291eddc
--- /dev/null
+++ b/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf
@@ -0,0 +1,45 @@
+//
+// Run VEP to annotate VCF files
+//
+
+include { ENSEMBLVEP_VEP } from '../../../modules/nf-core/ensemblvep/vep/main'
+include { TABIX_TABIX    } from '../../../modules/nf-core/tabix/tabix/main'
+
+workflow VCF_ANNOTATE_ENSEMBLVEP {
+    take:
+    ch_vcf                      // channel: [ val(meta), path(vcf), [path(custom_file1), path(custom_file2)... (optionnal)]]
+    ch_fasta                    // channel: [ val(meta2), path(fasta) ] (optional)
+    val_genome                  //   value: genome to use
+    val_species                 //   value: species to use
+    val_cache_version           //   value: cache version to use
+    ch_cache                    // channel: [ val(meta3), path(cache) ] (optional)
+    ch_extra_files              // channel: [ path(file1), path(file2)... ] (optional)
+
+    main:
+    ch_versions = Channel.empty()
+
+    ENSEMBLVEP_VEP(
+        ch_vcf,
+        val_genome,
+        val_species,
+        val_cache_version,
+        ch_cache,
+        ch_fasta,
+        ch_extra_files
+    )
+
+    TABIX_TABIX(ENSEMBLVEP_VEP.out.vcf)
+
+    ch_vcf_tbi = ENSEMBLVEP_VEP.out.vcf.join(TABIX_TABIX.out.tbi, failOnDuplicate: true, failOnMismatch: true)
+
+    // Gather versions of all tools used
+    ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions)
+    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions)
+
+    emit:
+    vcf_tbi  = ch_vcf_tbi                  // channel: [ val(meta), path(vcf), path(tbi) ]
+    json     = ENSEMBLVEP_VEP.out.json     // channel: [ val(meta), path(json) ]
+    tab      = ENSEMBLVEP_VEP.out.tab      // channel: [ val(meta), path(tab) ]
+    reports  = ENSEMBLVEP_VEP.out.report   // channel: [ path(html) ]
+    versions = ch_versions                 // channel: [ versions.yml ]
+}
diff --git a/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml b/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml
new file mode 100644
index 0000000..15d42da
--- /dev/null
+++ b/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml
@@ -0,0 +1,65 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: vcf_annotate_ensemblvep
+description: Perform annotation with ensemblvep and bgzip + tabix index the resulting VCF file
+keywords:
+  - vcf
+  - annotation
+  - ensemblvep
+components:
+  - ensemblvep/vep
+  - tabix/tabix
+input:
+  - ch_vcf:
+      description: |
+        vcf file to annotate
+        Structure: [ val(meta), path(vcf), [path(custom_file1), path(custom_file2)... (optionnal)] ]
+  - ch_fasta:
+      description: |
+        Reference genome fasta file (optional)
+        Structure: [ val(meta2), path(fasta) ]
+  - val_genome:
+      type: string
+      description: genome to use
+  - val_species:
+      type: string
+      description: species to use
+  - val_cache_version:
+      type: integer
+      description: cache version to use
+  - ch_cache:
+      description: |
+        the root cache folder for ensemblvep (optional)
+        Structure: [ val(meta3), path(cache) ]
+  - ch_extra_files:
+      description: |
+        any extra files needed by plugins for ensemblvep (optional)
+        Structure: [ path(file1), path(file2)... ]
+output:
+  - vcf_tbi:
+      description: |
+        Compressed vcf file + tabix index
+        Structure: [ val(meta), path(vcf), path(tbi) ]
+  - json:
+      description: |
+        json file
+        Structure: [ val(meta), path(json) ]
+  - tab:
+      description: |
+        tab file
+        Structure: [ val(meta), path(tab) ]
+  - reports:
+      type: file
+      description: html reports
+      pattern: "*.html"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@maxulysse"
+  - "@matthdsm"
+  - "@nvnieuwk"
+maintainers:
+  - "@maxulysse"
+  - "@matthdsm"
+  - "@nvnieuwk"

From eedf08b5c000e3f66bb059a03e0c081b40dd5a27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Mon, 25 Nov 2024 10:55:48 +0100
Subject: [PATCH 03/11] wip: wire vep annotation subworkflow into pipeline

---
 main.nf                    | 17 +++++++++++----
 workflows/vcftomaf/main.nf | 44 ++++++++++++++++++++++++++------------
 2 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/main.nf b/main.nf
index b5db4f8..f878568 100644
--- a/main.nf
+++ b/main.nf
@@ -37,8 +37,11 @@ workflow QBICPIPELINES_VCFTOMAF {
     //
     // SET PARAMETERS
     //
-    params.fasta = getGenomeAttribute('fasta')
-    params.dict = getGenomeAttribute('dict')
+    params.fasta               = getGenomeAttribute('fasta')
+    params.dict                = getGenomeAttribute('dict')
+    params.vep_cache_version   = getGenomeAttribute('vep_cache_version')
+    params.vep_genome          = getGenomeAttribute('vep_genome')
+    params.vep_species         = getGenomeAttribute('vep_species')
 
     // Extra files
     intervals      = params.intervals      ? Channel.fromPath(params.intervals).collect()      : Channel.value([])
@@ -52,8 +55,11 @@ workflow QBICPIPELINES_VCFTOMAF {
     genome        = params.genome   ?: Channel.empty()
 
     // VEP cache
-    vep_cache          = Channel.value([]) //params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : Channel.value([])
+    vep_cache          = params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : Channel.value([])
     vep_cache_unpacked = Channel.value([])
+    vep_cache_version  = params.vep_cache_version
+    vep_genome         = params.vep_genome
+    vep_species        = params.vep_species
 
 
     //
@@ -68,7 +74,10 @@ workflow QBICPIPELINES_VCFTOMAF {
         liftover_chain,
         genome,
         vep_cache,
-        vep_cache_unpacked
+        vep_cache_unpacked,
+        vep_cache_version,
+        vep_genome,
+        vep_species
     )
 
     emit:
diff --git a/workflows/vcftomaf/main.nf b/workflows/vcftomaf/main.nf
index 25ac296..cebd36c 100644
--- a/workflows/vcftomaf/main.nf
+++ b/workflows/vcftomaf/main.nf
@@ -18,6 +18,7 @@ include { paramsSummaryMap            } from 'plugin/nf-schema'
 include { paramsSummaryMultiqc        } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML      } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
 include { methodsDescriptionText      } from '../../subworkflows/local/utils_nfcore_vcftomaf_pipeline'
+include { VCF_ANNOTATE_ENSEMBLVEP     } from '../../subworkflows/nf-core/vcf_annotate_ensemblvep/main'
 
 
 /*
@@ -37,6 +38,9 @@ workflow VCFTOMAF {
     genome
     vep_cache
     vep_cache_unpacked
+    vep_cache_version
+    vep_genome
+    vep_species
 
     main:
 
@@ -47,20 +51,6 @@ workflow VCFTOMAF {
     // SUBWORKFLOW: Read in samplesheet, validate and stage input files
     //
 
-    // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335
-    // if (params.vep_cache){
-    //     ch_vep_cache = vep_cache.map{
-    //         it -> def new_id = ""
-    //             if(it) {
-    //                 new_id = it[0].simpleName.toString()
-    //             }
-    //         [[id:new_id], it]
-    //     }
-    //     // UNTAR if available
-    //     vep_cache_unpacked  = UNTAR(ch_vep_cache).untar.map { it[1] }
-    //     ch_versions         = ch_versions.mix(UNTAR.out.versions)
-    // }
-
     // BRANCH CHANNEL
     ch_samplesheet.branch{
         is_indexed:  it[0].index == true
@@ -80,6 +70,32 @@ workflow VCFTOMAF {
     // Join both channels back together
     ch_vcf = ch_input.is_indexed.mix(ch_indexed_to_index)
 
+    // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335
+    // Therefore we use the vcf_annotate_ensemblvep subworkflow here
+
+    if (params.vep_cache){
+        ch_vep_cache = vep_cache.map{
+            it -> def new_id = ""
+                if(it) {
+                    new_id = it[0].simpleName.toString()
+                }
+            [[id:new_id], it]
+        }
+        // UNTAR if available
+        vep_cache_unpacked  = UNTAR(ch_vep_cache).untar.map { it[1] }
+        ch_versions         = ch_versions.mix(UNTAR.out.versions)
+    }
+
+    VCF_ANNOTATE_ENSEMBLVEP(
+        ch_vcf,
+        fasta,
+        vep_genome,
+        vep_species,// species
+        vep_cache_version,  // cache_version
+        vep_cache_unpacked, // ch_cache
+        [] // ch_extra_files
+    )
+
     //
     // MODULE: Run PASS + BED filtering
     //

From cbc8eca2e5c9e9c88852c3c4cbded78c103412f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Mon, 25 Nov 2024 11:04:10 +0100
Subject: [PATCH 04/11] gate vep annotation behind run_vep parameter

---
 nextflow.config            |  4 ++--
 workflows/vcftomaf/main.nf | 37 +++++++++++++++++++++----------------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index dc47825..f547e96 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -13,8 +13,8 @@ params {
     liftover_chain             = null
     input                      = null
     intervals                  = null
-    //vep_cache                  = null
-    //run_vep                    = false
+    vep_cache                  = null
+    run_vep                    = false
     filter                     = false
 
     // References
diff --git a/workflows/vcftomaf/main.nf b/workflows/vcftomaf/main.nf
index cebd36c..97343b9 100644
--- a/workflows/vcftomaf/main.nf
+++ b/workflows/vcftomaf/main.nf
@@ -70,23 +70,24 @@ workflow VCFTOMAF {
     // Join both channels back together
     ch_vcf = ch_input.is_indexed.mix(ch_indexed_to_index)
 
-    // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335
-    // Therefore we use the vcf_annotate_ensemblvep subworkflow here
-
-    if (params.vep_cache){
-        ch_vep_cache = vep_cache.map{
-            it -> def new_id = ""
-                if(it) {
-                    new_id = it[0].simpleName.toString()
-                }
-            [[id:new_id], it]
+    if (params.run_vep) {
+        // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335
+        // Therefore we use the vcf_annotate_ensemblvep subworkflow here
+
+        if (params.vep_cache){
+            ch_vep_cache = vep_cache.map{
+                it -> def new_id = ""
+                    if(it) {
+                        new_id = it[0].simpleName.toString()
+                    }
+                [[id:new_id], it]
+            }
+            // UNTAR if available
+            vep_cache_unpacked  = UNTAR(ch_vep_cache).untar.map { it[1] }
+            ch_versions         = ch_versions.mix(UNTAR.out.versions)
         }
-        // UNTAR if available
-        vep_cache_unpacked  = UNTAR(ch_vep_cache).untar.map { it[1] }
-        ch_versions         = ch_versions.mix(UNTAR.out.versions)
-    }
 
-    VCF_ANNOTATE_ENSEMBLVEP(
+        VCF_ANNOTATE_ENSEMBLVEP(
         ch_vcf,
         fasta,
         vep_genome,
@@ -94,7 +95,11 @@ workflow VCFTOMAF {
         vep_cache_version,  // cache_version
         vep_cache_unpacked, // ch_cache
         [] // ch_extra_files
-    )
+        )
+        ch_vcf = VCF_ANNOTATE_ENSEMBLVEP.out.vcf_tbi
+        ch_versions = ch_versions.mix(VCF_ANNOTATE_ENSEMBLVEP.out.versions)
+    }
+
 
     //
     // MODULE: Run PASS + BED filtering

From 553fb292251ad5661ed701aaac360680a2da0788 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Mon, 25 Nov 2024 12:46:52 +0100
Subject: [PATCH 05/11] enable vep in test config

---
 conf/test.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conf/test.config b/conf/test.config
index a205f58..adca71e 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -29,5 +29,6 @@ params {
     intervals    = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.bed'
     genome       = 'GATK.GRCh38'
     filter       = false
+    run_vep      = true
 
 }

From a6681a816a0d7cbbce7e88764cb2509d68102bc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Mon, 25 Nov 2024 12:59:29 +0100
Subject: [PATCH 06/11] fix typo

---
 docs/usage.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index 76e7af8..5ff06b9 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -94,7 +94,7 @@ Chain file, if liftover should be done
 The input vcf files are PASS filtered by default.
 Additionally, if the path to a file containing the targeted intervals for panel sequencing data is specified, off-target regions will be filtered out.
 
-### `--filter``
+### `--filter`
 
 Boolean flag to enable filtering of the variants keeping only variants marked as `PASS` when set to `true`. Default is false.
 

From 4599b6c66f986101ea8fbe475c3c4ceca2c8e3c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Mon, 25 Nov 2024 13:34:34 +0100
Subject: [PATCH 07/11] update changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4eccdfc..e84b4ff 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#21](https://github.com/qbic-pipelines/vcftomaf/pull/21) - Add VEP annotation via subworkflow (@famosab)
+
 ### `Fixed`
 
 ### `Dependencies`

From 8bb590582bf4d5c4482673c07878de3c7bcddafa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Tue, 26 Nov 2024 10:30:07 +0100
Subject: [PATCH 08/11] add meta to fasta

---
 workflows/vcftomaf/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/vcftomaf/main.nf b/workflows/vcftomaf/main.nf
index 97343b9..571bc4d 100644
--- a/workflows/vcftomaf/main.nf
+++ b/workflows/vcftomaf/main.nf
@@ -89,7 +89,7 @@ workflow VCFTOMAF {
 
         VCF_ANNOTATE_ENSEMBLVEP(
         ch_vcf,
-        fasta,
+        fasta.map{ it -> [ [ id:it.baseName ], it ] },
         vep_genome,
         vep_species,// species
         vep_cache_version,  // cache_version

From 4ea07b761d876e404ec3dfdd0bba772572ffaf12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Tue, 26 Nov 2024 10:31:46 +0100
Subject: [PATCH 09/11] prettier

---
 modules.json | 52 +++++++++++++---------------------------------------
 1 file changed, 13 insertions(+), 39 deletions(-)

diff --git a/modules.json b/modules.json
index 9a47197..e7aed3a 100644
--- a/modules.json
+++ b/modules.json
@@ -8,60 +8,42 @@
                     "bcftools/view": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "ensemblvep/vep": {
                         "branch": "master",
                         "git_sha": "6e3585d9ad20b41adc7d271009f8cb5e191ecab4",
-                        "installed_by": [
-                            "modules",
-                            "vcf_annotate_ensemblvep"
-                        ]
+                        "installed_by": ["modules", "vcf_annotate_ensemblvep"]
                     },
                     "gunzip": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "picard/liftovervcf": {
                         "branch": "master",
                         "git_sha": "49f4e50534fe4b64101e62ea41d5dc43b1324358",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "tabix/tabix": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": [
-                            "modules",
-                            "vcf_annotate_ensemblvep"
-                        ]
+                        "installed_by": ["modules", "vcf_annotate_ensemblvep"]
                     },
                     "untar": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": [
-                            "modules"
-                        ]
+                        "installed_by": ["modules"]
                     },
                     "vcf2maf": {
                         "branch": "master",
                         "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
-                        "installed_by": [
-                            "modules"
-                        ],
+                        "installed_by": ["modules"],
                         "patch": "modules/nf-core/vcf2maf/vcf2maf.diff"
                     }
                 }
@@ -71,33 +53,25 @@
                     "utils_nextflow_pipeline": {
                         "branch": "master",
                         "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082",
-                        "installed_by": [
-                            "subworkflows"
-                        ]
+                        "installed_by": ["subworkflows"]
                     },
                     "utils_nfcore_pipeline": {
                         "branch": "master",
                         "git_sha": "1b6b9a3338d011367137808b49b923515080e3ba",
-                        "installed_by": [
-                            "subworkflows"
-                        ]
+                        "installed_by": ["subworkflows"]
                     },
                     "utils_nfschema_plugin": {
                         "branch": "master",
                         "git_sha": "bbd5a41f4535a8defafe6080e00ea74c45f4f96c",
-                        "installed_by": [
-                            "subworkflows"
-                        ]
+                        "installed_by": ["subworkflows"]
                     },
                     "vcf_annotate_ensemblvep": {
                         "branch": "master",
                         "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f",
-                        "installed_by": [
-                            "subworkflows"
-                        ]
+                        "installed_by": ["subworkflows"]
                     }
                 }
             }
         }
     }
-}
\ No newline at end of file
+}

From 299dc778afaeef49ead6c19b7a9862e3433ec56e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= <famke.baeuerle@gmail.com>
Date: Tue, 26 Nov 2024 10:37:42 +0100
Subject: [PATCH 10/11] add VEP options to schema

---
 nextflow_schema.json | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/nextflow_schema.json b/nextflow_schema.json
index d099aed..da7f89e 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -108,6 +108,34 @@
                 }
             }
         },
+        "annotation_options": {
+            "title": "Annotation options",
+            "type": "object",
+            "description": "Define parameters for VEP annotation",
+            "default": "",
+            "properties": {
+                "run_vep": {
+                    "type": "boolean",
+                    "description": "Set to true to annotate the input VCF files with VEP"
+                },
+                "vep_cache": {
+                    "type": "string",
+                    "description": "Path to VEP cache which should contain the relevant species, genome and build directories at the path ${vep_species}/${vep_genome}_${vep_cache_version}"
+                },
+                "vep_cache_version": {
+                    "type": "string",
+                    "description": "Alternatively, the cache version can be used to specify the correct Ensembl Genomes version number, as these differ from the concurrent Ensembl/VEP version numbers"
+                },
+                "vep_genome": {
+                    "type": "string",
+                    "description": "This is used to specify the genome when looking for a local or cloud-based cache."
+                },
+                "vep_species": {
+                    "type": "string",
+                    "description": "Alternatively species listed in Ensembl Genomes caches can be used."
+                }
+            }
+        },
         "institutional_config_options": {
             "title": "Institutional config options",
             "type": "object",
@@ -255,6 +283,9 @@
         {
             "$ref": "#/$defs/reference_genome_options"
         },
+        {
+            "$ref": "#/$defs/annotation_options"
+        },
         {
             "$ref": "#/$defs/institutional_config_options"
         },

From ffa2cf97d9ea0c354e3691be897d852c86e79af9 Mon Sep 17 00:00:00 2001
From: famosab <famke.baeuerle@gmail.com>
Date: Tue, 26 Nov 2024 12:37:49 +0100
Subject: [PATCH 11/11] current status

---
 conf/modules.config        |  5 +++
 workflows/vcftomaf/main.nf | 66 +++++++++++++++++++++-----------------
 2 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/conf/modules.config b/conf/modules.config
index f6774ab..ac4d085 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -57,6 +57,7 @@ process {
             //specify to avoid publishing, overwritten otherwise
             enabled: false
         ]
+        ext.args = { '-f' }
     }
 
     withName: UNTAR {
@@ -93,4 +94,8 @@ process {
         ]
     }
 
+    withName: 'ENSEMBLVEP_VEP' {
+        ext.args = { '--vcf  --compress_output bgzip' }
+    }
+
 }
diff --git a/workflows/vcftomaf/main.nf b/workflows/vcftomaf/main.nf
index 571bc4d..05042ea 100644
--- a/workflows/vcftomaf/main.nf
+++ b/workflows/vcftomaf/main.nf
@@ -18,7 +18,7 @@ include { paramsSummaryMap            } from 'plugin/nf-schema'
 include { paramsSummaryMultiqc        } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML      } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
 include { methodsDescriptionText      } from '../../subworkflows/local/utils_nfcore_vcftomaf_pipeline'
-include { VCF_ANNOTATE_ENSEMBLVEP     } from '../../subworkflows/nf-core/vcf_annotate_ensemblvep/main'
+include { ENSEMBLVEP_VEP } from '../../modules/nf-core/ensemblvep/vep/main'
 
 
 /*
@@ -51,25 +51,6 @@ workflow VCFTOMAF {
     // SUBWORKFLOW: Read in samplesheet, validate and stage input files
     //
 
-    // BRANCH CHANNEL
-    ch_samplesheet.branch{
-        is_indexed:  it[0].index == true
-        to_index:    it[0].index == false
-    }.set{ch_input}
-
-    // Remove empty index [] from channel = it[2]
-    input_to_index = ch_input.to_index.map{ it -> [it[0], it[1]] }
-
-    // Create tbi index only if not provided
-    TABIX_TABIX(input_to_index)
-    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first())
-
-    // Join tbi index back to input
-    ch_indexed_to_index = input_to_index.join(TABIX_TABIX.out.tbi)
-
-    // Join both channels back together
-    ch_vcf = ch_input.is_indexed.mix(ch_indexed_to_index)
-
     if (params.run_vep) {
         // VEP annotation is currently not supported from within vcf2maf : https://github.com/mskcc/vcf2maf/issues/335
         // Therefore we use the vcf_annotate_ensemblvep subworkflow here
@@ -87,19 +68,46 @@ workflow VCFTOMAF {
             ch_versions         = ch_versions.mix(UNTAR.out.versions)
         }
 
-        VCF_ANNOTATE_ENSEMBLVEP(
-        ch_vcf,
-        fasta.map{ it -> [ [ id:it.baseName ], it ] },
+        ENSEMBLVEP_VEP(
+        ch_samplesheet,
         vep_genome,
-        vep_species,// species
-        vep_cache_version,  // cache_version
-        vep_cache_unpacked, // ch_cache
-        [] // ch_extra_files
+        vep_species,
+        vep_cache_version,
+        vep_cache_unpacked,
+        fasta.map{ it -> [ [ id:it.baseName ], it ] },
+        []
         )
-        ch_vcf = VCF_ANNOTATE_ENSEMBLVEP.out.vcf_tbi
-        ch_versions = ch_versions.mix(VCF_ANNOTATE_ENSEMBLVEP.out.versions)
+        ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions)
+
+        TABIX_TABIX(ENSEMBLVEP_VEP.out.vcf)
+        ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first())
+
+        ENSEMBLVEP_VEP.out.vcf.dump(tag: 'vcf-before')
+
+        // Join tbi index back to input
+        ch_vcf = ENSEMBLVEP_VEP.out.vcf.join(TABIX_TABIX.out.tbi)
+        ch_vcf.dump(tag: 'vcf-sfter')
     }
+    else {
+        // BRANCH CHANNEL
+        ch_samplesheet.branch{
+            is_indexed:  it[0].index == true
+            to_index:    it[0].index == false
+        }.set{ch_input}
 
+        // Remove empty index [] from channel = it[2]
+        input_to_index = ch_input.to_index.map{ it -> [it[0], it[1]] }
+
+        // Create tbi index only if not provided
+        TABIX_TABIX(input_to_index)
+        ch_versions = ch_versions.mix(TABIX_TABIX.out.versions.first())
+
+        // Join tbi index back to input
+        ch_indexed_to_index = input_to_index.join(TABIX_TABIX.out.tbi)
+
+        // Join both channels back together
+        ch_vcf = ch_input.is_indexed.mix(ch_indexed_to_index)
+    }
 
     //
     // MODULE: Run PASS + BED filtering