feat: deduplicate with UMIs (#1358)

#### Added - UMI extraction and deduplication to TGA workflow - Adapter trimming of fastqs to UMI workflow - Cap base quality in bam for Manta input #### Changed - Refactored multi workflow rule-files to separate files to decrease complexity - Refactored output files to in general comply with format {sample_type}.{sample_name} - Replaced Picard QC tools with matching Sentieon QC tools #### Removed - UMI specific rules for UMI-extraction and alignment (using new TGA-rules instead) - Fastq and UMI trimming command-line options Merged this PR into this one: #1465 #### Added - Added extension of target bed regions to a minimum size of 100 for CNV analysis - PON for: Exome comprehensive 10.2 - PON for: GMSsolid 15.2 - PON for: GMCKsolid 4.2 #### Changed - updated PON for GMCKSolid v4.1 - updated PON for GMSMyeloid v5.3 - updated PON for GMSlymphoid v7.3 Merged this PR into this one: #1448 #### Added - Script to post-process CNVkit output to GENS-format - DNAscope gnomad calling to TGA for GENS #### Changed - Parsing of GENS arguments changed to account for TGA Merged this PR: #1475 into this one #### Changed - Refactored rules for bcftools filters - Renamed final UMI bamfile to ensure hsmetrics are collected in multiqc json - Changed ranked VCF from research to clincial - Lowered min AF for TGA from 0.007 to 0.005 - Lowered maximal SOR for TNscope in TGA tumor only cases from 3 to 2.7 - Changed filter settings for research TNscope vcf, now either PASS or triallelic_site (fixing this issue: #1293) #### Added - TNscope for TGA workflows, merged with VarDict results - New filter for VarDict for tumor in normal contamination - Export TMP environment variables to rules that lack them - Added genmod ranked VCFs to be delivered - Added family-id to genmod in order to get ranked variants to Scout (solved this: #1045) - Added DP and AF to INFO-field of TNscope vcfs for ranking model - Raw TNscope calls and unfiltered research-annotated SNVs to delivery #### Removed - ML-model for TNscope is removed due to license issue with new version of Sentieon - All code associated with TNhaplotyper - Removed research.filtered.pass VCFs from delivery and storage list
Clinical-Genomics · Oct 16, 2024 · f654750 · f654750
1 parent 4937d2d
commit f654750
Show file tree

Hide file tree

Showing 119 changed files with 4,755 additions and 3,143 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.github/workflows/black_linter.yml b/.github/workflows/black_linter.yml
@@ -11,4 +11,4 @@ jobs:
       - uses: psf/black@stable
         with:
           options: "--check --verbose"
-          version: "22.3.0"
+          version: "23.7.0"
diff --git a/BALSAMIC/assets/scripts/cap_base_quality_in_bam.py b/BALSAMIC/assets/scripts/cap_base_quality_in_bam.py
@@ -0,0 +1,37 @@
+import click
+import pysam
+import numpy as np
+
+
+@click.command()
+@click.argument("input_bam", type=click.Path(exists=True))
+@click.argument("output_bam", type=click.Path())
+@click.option(
+    "--max-quality",
+    default=70,
+    type=int,
+    help="Maximum quality value to cap to.",
+)
+def cap_base_qualities(input_bam: str, output_bam: str, max_quality: int):
+    """
+    Cap the base qualities in a BAM file.
+
+    Args:
+        input_bam (str): Input BAM file path.
+        output_bam (str): Output BAM file path.
+        max_quality (int): Maximum quality value to cap to.
+    """
+    # Open input BAM file for reading
+    samfile = pysam.AlignmentFile(input_bam, "rb")
+    out_bam = pysam.AlignmentFile(output_bam, "wb", header=samfile.header)
+    for read in samfile.fetch():
+        qualities = np.array(read.query_qualities)
+        capped_qualities = np.minimum(qualities, max_quality)
+        # Update the base qualities in the read
+        read.query_qualities = capped_qualities.tolist()
+        # Write the modified read to the output BAM file
+        out_bam.write(read)
+
+
+if __name__ == "__main__":
+    cap_base_qualities()
diff --git a/BALSAMIC/assets/scripts/collect_qc_metrics.py b/BALSAMIC/assets/scripts/collect_qc_metrics.py
@@ -204,7 +204,6 @@ def get_sample_id(multiqc_key: str) -> str:
     Returns
         str: The extracted sample ID with the ACCXXXXXX format.
     """
-
     if "_align_sort_" in multiqc_key:
         return multiqc_key.split("_")[0]
     return multiqc_key.split(".")[1].split("_")[0]

diff --git a/BALSAMIC/assets/scripts/extend_bedfile.py b/BALSAMIC/assets/scripts/extend_bedfile.py
@@ -0,0 +1,44 @@
+import click
+
+
+@click.command()
+@click.argument("input_bedfile", type=click.Path(exists=True))
+@click.argument("output_bedfile", type=click.Path())
+@click.option(
+    "--extend-to-min-region-size",
+    default=100,
+    help="Will extend regions shorter than the specified size to this minimum size.",
+)
+def extend_bedfile(
+    input_bedfile: str, output_bedfile: str, extend_to_min_region_size: int
+):
+    """
+    Process a BED file to ensure regions are at least a minimum size.
+
+    Args:
+        input_bedfile (str): Input BED file path.
+        output_bedfile (str): Output BED file path.
+        min_region_size (int): Minimum region size to enforce.
+    """
+    with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
+        for line in infile:
+            fields = line.strip().split("\t")
+
+            chrom: str = fields[0]
+            start = int(fields[1])
+            end = int(fields[2])
+
+            region_length: int = end - start
+            if region_length < extend_to_min_region_size:
+                center = (start + end) // 2
+                half_size = extend_to_min_region_size // 2
+                start = max(0, center - half_size)
+                end = center + half_size
+                if extend_to_min_region_size % 2 != 0:
+                    end += 1
+
+            outfile.write(f"{chrom}\t{start}\t{end}\n")
+
+
+if __name__ == "__main__":
+    extend_bedfile()
diff --git a/BALSAMIC/assets/scripts/modify_tnscope_infofield.py b/BALSAMIC/assets/scripts/modify_tnscope_infofield.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+import vcfpy
+import click
+import sys
+import logging
+from typing import List, Optional
+
+LOG = logging.getLogger(__name__)
+
+
+def summarize_ad_to_dp(ad_list):
+    """
+    Summarizes the AD (allelic depth) field into total DP (read depth).
+
+    Parameters:
+    ad_list (list): List of read depths supporting each allele, [ref_depth, alt1_depth, alt2_depth, ...]
+
+    Returns:
+    int: Total read depth (DP) across all alleles.
+    """
+    if ad_list is None:
+        return 0  # Return 0 if AD field is not present
+    return sum(ad_list)
+
+
+@click.command()
+@click.argument("input_vcf", type=click.Path(exists=True))
+@click.argument("output_vcf", type=click.Path())
+def process_vcf(input_vcf: str, output_vcf: str):
+    """
+    Processes the input VCF file and writes the updated information to the output VCF file.
+
+    INPUT_VCF: Path to the input VCF file.
+    OUTPUT_VCF: Path to the output VCF file.
+    """
+
+    # Open the input VCF file
+    reader: vcfpy.Reader = vcfpy.Reader.from_path(input_vcf)
+
+    # Ensure the sample name is 'TUMOR'
+    sample_name: str = reader.header.samples.names[0]
+    if sample_name != "TUMOR":
+        LOG.warning(
+            f"Error: The first sample is named '{sample_name}', but 'TUMOR' is expected."
+        )
+        sys.exit(1)
+
+    # Add AF and DP fields to the header if not already present
+    if "AF" not in reader.header.info_ids():
+        reader.header.add_info_line(
+            vcfpy.OrderedDict(
+                [
+                    ("ID", "AF"),
+                    ("Number", "A"),
+                    ("Type", "Float"),
+                    ("Description", "Allele Frequency"),
+                ]
+            )
+        )
+
+    if "DP" not in reader.header.info_ids():
+        reader.header.add_info_line(
+            vcfpy.OrderedDict(
+                [
+                    ("ID", "DP"),
+                    ("Number", "1"),
+                    ("Type", "Integer"),
+                    ("Description", "Total Depth"),
+                ]
+            )
+        )
+
+    # Open the output VCF file for writing
+    with vcfpy.Writer.from_path(output_vcf, reader.header) as writer:
+        # Loop through each record (variant)
+        for record in reader:
+            # Get the TUMOR sample data
+            sample_index: int = reader.header.samples.names.index(sample_name)
+            tumor_call: vcfpy.Call = record.calls[sample_index]
+
+            # Check and process AD field
+            tumor_ad: Optional[List[int]] = tumor_call.data.get(
+                "AD", None
+            )  # AD is a list [ref_count, alt_count]
+            if tumor_ad is None:
+                LOG.warning(
+                    f"Warning: AD field is missing for record at position {record.POS} on {record.CHROM}"
+                )
+            else:
+                record.INFO["DP"] = summarize_ad_to_dp(tumor_ad)
+
+            # Check and process AF field
+            tumor_af: Optional[float] = tumor_call.data.get("AF", None)
+            if tumor_af is None:
+                LOG.warning(
+                    f"Warning: AF field is missing for record at position {record.POS} on {record.CHROM}"
+                )
+                record.INFO["AF"] = [0.0]  # Default AF to 0.0 if missing
+            else:
+                record.INFO["AF"] = [tumor_af]  # Wrap AF in a list
+
+            # Write the updated record to the output VCF file
+            writer.write_record(record)
+
+    click.echo(f"VCF file processed and saved to {output_vcf}")
+
+
+if __name__ == "__main__":
+    process_vcf()
diff --git a/BALSAMIC/assets/scripts/postprocess_fixmate_bam.awk b/BALSAMIC/assets/scripts/postprocess_fixmate_bam.awk
@@ -0,0 +1,32 @@
+#!/usr/bin/awk -f
+
+BEGIN { OFS = "\t" }
+/^@/ { print; next }
+{
+    flag = $2
+
+    # If the mate is unmapped, remove the MC tag if it's "*"
+    if (and(flag, 8) != 0) {
+        for (i = 12; i <= NF; i++) {
+            if ($i ~ /^MC:Z:\*$/) {
+                $i = ""
+            }
+        }
+    }
+
+    # Check if any of the specific bitwise flags are set (2, 8, 32, 64, 128)
+    if (and(flag, 2 + 8 + 32 + 64 + 128) != 0) {
+        # Add mate unmapped flag if the mate is unmapped
+        if ($7 == "*" || and(flag, 8) != 0) {
+            flag = or(flag, 8)
+        }
+
+        # Ensure the read paired flag (1) is set if any of these are present
+        flag = or(flag, 1)
+    }
+
+    # Set the modified flag
+    $2 = flag
+
+    print
+}
diff --git a/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py b/BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
@@ -0,0 +1,102 @@
+import click
+from BALSAMIC.utils.io import read_csv, write_list_of_strings
+
+
+def calculate_log2_ratio(purity, log2_ratio, ploidy):
+    """Adjuts log2 ratio according to purity and ploidy.
+
+    Based on method in PureCN: https://github.com/lima1/PureCN/issues/40
+
+    Method is not used currently due to questionable results.
+    """
+    # Ensure that the inputs are within valid ranges
+    if not (0 <= purity <= 1):
+        raise ValueError("Purity must be between 0 and 1")
+
+    if ploidy <= 0:
+        raise ValueError("Ploidy must be a positive number")
+
+    # Ploidy and purity adjustment formula
+    log2_adjusted = (
+        purity * log2_ratio * ploidy + 2 * (1 - purity) * log2_ratio - 2 * (1 - purity)
+    ) / (purity * ploidy)
+
+    return log2_adjusted
+
+
+@click.command()
+@click.option(
+    "-o",
+    "--output-file",
+    required=True,
+    type=click.Path(exists=False),
+    help="Name of output-file.",
+)
+@click.option(
+    "-c",
+    "--normalised-coverage-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Input CNVkit cnr. result.",
+)
+@click.option(
+    "-p",
+    "--tumor-purity-path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Tumor purity file from PureCN",
+)
+def create_gens_cov_file(
+    output_file: str, normalised_coverage_path: str, tumor_purity_path: str
+):
+    """Post-processes the CNVkit .cnr output for upload to GENS.
+
+    Removes Antitarget regions, adjusts for purity and ploidy and outputs the coverages in multiple resolution-formats.
+
+    Args:
+        output_file: Path to GENS output.cov file
+        normalised_coverage_path: Path to input CNVkit cnr file.
+        tumor_purity_path: Path to PureCN purity estimate csv file
+    """
+    log2_data = []
+
+    # Process CNVkit file
+    cnr_dict_list: list[dict] = read_csv(
+        csv_path=normalised_coverage_path, delimeter="\t"
+    )
+
+    # Process PureCN purity file
+    purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",")
+    purity = float(purecn_dict_list[0]["Purity"])
+    ploidy = float(purecn_dict_list[0]["Ploidy"])
+
+    for row in cnr_dict_list:
+        if row["gene"] == "Antitarget":
+            continue
+
+        # find midpoint
+        start: int = int(row["start"])
+        end: int = int(row["end"])
+        region_size: int = end - start
+        midpoint: int = start + int(region_size / 2)
+
+        # adjust log2 ratio
+        log2: float = float(row["log2"])
+
+        # De-activate purity and ploidy adjustment
+        # log2: float = calculate_log2_ratio(purity, log2, ploidy)
+        # log2: float = round(log2, 4)
+
+        # store values in list
+        log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")
+
+    # Write log2 data to output file
+    resolutions = ["o", "a", "b", "c", "d"]
+    resolution_log2_lines = [
+        f"{resolution}_{line}" for resolution in resolutions for line in log2_data
+    ]
+    write_list_of_strings(resolution_log2_lines, output_file)
+
+
+if __name__ == "__main__":
+    create_gens_cov_file()
diff --git a/BALSAMIC/assets/scripts/preprocess_gens.py b/BALSAMIC/assets/scripts/preprocess_gens.py
@@ -26,7 +26,7 @@
     "-s",
     "--sequencing-type",
     required=True,
-    type=click.Choice([SequencingType.WGS]),
+    type=click.Choice([SequencingType.WGS, SequencingType.TARGETED]),
     help="Sequencing type used.",
 )
 @click.pass_context

diff --git a/BALSAMIC/assets/scripts/sort_vcf.awk b/BALSAMIC/assets/scripts/sort_vcf.awk
@@ -0,0 +1,16 @@
+#!/usr/bin/awk -f
+
+BEGIN {
+    ENVIRON["LC_ALL"] = "en_US.UTF-8"
+}
+
+# If the line starts with a '#', it's a header, so print it as is
+$1 ~ /^#/ {
+    print $0;
+    next;
+}
+
+# Otherwise, send the body lines to an external sort command
+{
+    print $0 | "/usr/bin/sort -k1,1V -k2,2n"
+}