Skip to content

Commit

Permalink
feat: deduplicate with UMIs (#1358)
Browse files Browse the repository at this point in the history
#### Added

- UMI extraction and deduplication to TGA workflow
- Adapter trimming of fastqs to UMI workflow
- Cap base quality in bam for Manta input

#### Changed

- Refactored multi workflow rule-files to separate files to decrease complexity
- Refactored output files to in general comply with format {sample_type}.{sample_name}
- Replaced Picard QC tools with matching Sentieon QC tools

#### Removed

- UMI specific rules for UMI-extraction and alignment (using new TGA-rules instead) 
- Fastq and UMI trimming command-line options


Merged this PR into this one: #1465

#### Added

- Added extension of target bed regions to a minimum size of 100 for CNV analysis
- PON for: Exome comprehensive 10.2 
- PON for: GMSsolid 15.2 
- PON for: GMCKsolid 4.2

#### Changed

- updated PON for GMCKSolid v4.1 
- updated PON for GMSMyeloid v5.3 
- updated PON for GMSlymphoid v7.3

Merged this PR into this one: #1448

#### Added

- Script to post-process CNVkit output to GENS-format
- DNAscope gnomad calling to TGA for GENS

#### Changed

- Parsing of GENS arguments changed to account for TGA

Merged this PR: #1475 into this one

#### Changed

- Refactored rules for bcftools filters
- Renamed final UMI bamfile to ensure hsmetrics are collected in multiqc json
- Changed ranked VCF from research to clincial
- Lowered min AF for TGA from 0.007 to 0.005
- Lowered maximal SOR for TNscope in TGA tumor only cases from 3 to 2.7
- Changed filter settings for research TNscope vcf, now either PASS or triallelic_site (fixing this issue: #1293)

#### Added

- TNscope for TGA workflows, merged with VarDict results
- New filter for VarDict for tumor in normal contamination
- Export TMP environment variables to rules that lack them
- Added genmod ranked VCFs to be delivered
- Added family-id to genmod in order to get ranked variants to Scout (solved this: #1045)
- Added DP and AF to INFO-field of TNscope vcfs for ranking model
- Raw TNscope calls and unfiltered research-annotated SNVs to delivery

#### Removed

- ML-model for TNscope is removed due to license issue with new version of Sentieon
- All code associated with TNhaplotyper
- Removed research.filtered.pass VCFs from delivery and storage list
  • Loading branch information
mathiasbio authored Oct 16, 2024
1 parent 4937d2d commit f654750
Show file tree
Hide file tree
Showing 119 changed files with 4,755 additions and 3,143 deletions.
Binary file added .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion .github/workflows/black_linter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ jobs:
- uses: psf/black@stable
with:
options: "--check --verbose"
version: "22.3.0"
version: "23.7.0"
37 changes: 37 additions & 0 deletions BALSAMIC/assets/scripts/cap_base_quality_in_bam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import click
import pysam
import numpy as np


@click.command()
@click.argument("input_bam", type=click.Path(exists=True))
@click.argument("output_bam", type=click.Path())
@click.option(
"--max-quality",
default=70,
type=int,
help="Maximum quality value to cap to.",
)
def cap_base_qualities(input_bam: str, output_bam: str, max_quality: int):
"""
Cap the base qualities in a BAM file.
Args:
input_bam (str): Input BAM file path.
output_bam (str): Output BAM file path.
max_quality (int): Maximum quality value to cap to.
"""
# Open input BAM file for reading
samfile = pysam.AlignmentFile(input_bam, "rb")
out_bam = pysam.AlignmentFile(output_bam, "wb", header=samfile.header)
for read in samfile.fetch():
qualities = np.array(read.query_qualities)
capped_qualities = np.minimum(qualities, max_quality)
# Update the base qualities in the read
read.query_qualities = capped_qualities.tolist()
# Write the modified read to the output BAM file
out_bam.write(read)


if __name__ == "__main__":
cap_base_qualities()
1 change: 0 additions & 1 deletion BALSAMIC/assets/scripts/collect_qc_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,6 @@ def get_sample_id(multiqc_key: str) -> str:
Returns
str: The extracted sample ID with the ACCXXXXXX format.
"""

if "_align_sort_" in multiqc_key:
return multiqc_key.split("_")[0]
return multiqc_key.split(".")[1].split("_")[0]
Expand Down
44 changes: 44 additions & 0 deletions BALSAMIC/assets/scripts/extend_bedfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import click


@click.command()
@click.argument("input_bedfile", type=click.Path(exists=True))
@click.argument("output_bedfile", type=click.Path())
@click.option(
"--extend-to-min-region-size",
default=100,
help="Will extend regions shorter than the specified size to this minimum size.",
)
def extend_bedfile(
input_bedfile: str, output_bedfile: str, extend_to_min_region_size: int
):
"""
Process a BED file to ensure regions are at least a minimum size.
Args:
input_bedfile (str): Input BED file path.
output_bedfile (str): Output BED file path.
min_region_size (int): Minimum region size to enforce.
"""
with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
for line in infile:
fields = line.strip().split("\t")

chrom: str = fields[0]
start = int(fields[1])
end = int(fields[2])

region_length: int = end - start
if region_length < extend_to_min_region_size:
center = (start + end) // 2
half_size = extend_to_min_region_size // 2
start = max(0, center - half_size)
end = center + half_size
if extend_to_min_region_size % 2 != 0:
end += 1

outfile.write(f"{chrom}\t{start}\t{end}\n")


if __name__ == "__main__":
extend_bedfile()
109 changes: 109 additions & 0 deletions BALSAMIC/assets/scripts/modify_tnscope_infofield.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env python
import vcfpy
import click
import sys
import logging
from typing import List, Optional

LOG = logging.getLogger(__name__)


def summarize_ad_to_dp(ad_list):
"""
Summarizes the AD (allelic depth) field into total DP (read depth).
Parameters:
ad_list (list): List of read depths supporting each allele, [ref_depth, alt1_depth, alt2_depth, ...]
Returns:
int: Total read depth (DP) across all alleles.
"""
if ad_list is None:
return 0 # Return 0 if AD field is not present
return sum(ad_list)


@click.command()
@click.argument("input_vcf", type=click.Path(exists=True))
@click.argument("output_vcf", type=click.Path())
def process_vcf(input_vcf: str, output_vcf: str):
"""
Processes the input VCF file and writes the updated information to the output VCF file.
INPUT_VCF: Path to the input VCF file.
OUTPUT_VCF: Path to the output VCF file.
"""

# Open the input VCF file
reader: vcfpy.Reader = vcfpy.Reader.from_path(input_vcf)

# Ensure the sample name is 'TUMOR'
sample_name: str = reader.header.samples.names[0]
if sample_name != "TUMOR":
LOG.warning(
f"Error: The first sample is named '{sample_name}', but 'TUMOR' is expected."
)
sys.exit(1)

# Add AF and DP fields to the header if not already present
if "AF" not in reader.header.info_ids():
reader.header.add_info_line(
vcfpy.OrderedDict(
[
("ID", "AF"),
("Number", "A"),
("Type", "Float"),
("Description", "Allele Frequency"),
]
)
)

if "DP" not in reader.header.info_ids():
reader.header.add_info_line(
vcfpy.OrderedDict(
[
("ID", "DP"),
("Number", "1"),
("Type", "Integer"),
("Description", "Total Depth"),
]
)
)

# Open the output VCF file for writing
with vcfpy.Writer.from_path(output_vcf, reader.header) as writer:
# Loop through each record (variant)
for record in reader:
# Get the TUMOR sample data
sample_index: int = reader.header.samples.names.index(sample_name)
tumor_call: vcfpy.Call = record.calls[sample_index]

# Check and process AD field
tumor_ad: Optional[List[int]] = tumor_call.data.get(
"AD", None
) # AD is a list [ref_count, alt_count]
if tumor_ad is None:
LOG.warning(
f"Warning: AD field is missing for record at position {record.POS} on {record.CHROM}"
)
else:
record.INFO["DP"] = summarize_ad_to_dp(tumor_ad)

# Check and process AF field
tumor_af: Optional[float] = tumor_call.data.get("AF", None)
if tumor_af is None:
LOG.warning(
f"Warning: AF field is missing for record at position {record.POS} on {record.CHROM}"
)
record.INFO["AF"] = [0.0] # Default AF to 0.0 if missing
else:
record.INFO["AF"] = [tumor_af] # Wrap AF in a list

# Write the updated record to the output VCF file
writer.write_record(record)

click.echo(f"VCF file processed and saved to {output_vcf}")


if __name__ == "__main__":
process_vcf()
32 changes: 32 additions & 0 deletions BALSAMIC/assets/scripts/postprocess_fixmate_bam.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/awk -f

BEGIN { OFS = "\t" }
/^@/ { print; next }
{
flag = $2

# If the mate is unmapped, remove the MC tag if it's "*"
if (and(flag, 8) != 0) {
for (i = 12; i <= NF; i++) {
if ($i ~ /^MC:Z:\*$/) {
$i = ""
}
}
}

# Check if any of the specific bitwise flags are set (2, 8, 32, 64, 128)
if (and(flag, 2 + 8 + 32 + 64 + 128) != 0) {
# Add mate unmapped flag if the mate is unmapped
if ($7 == "*" || and(flag, 8) != 0) {
flag = or(flag, 8)
}

# Ensure the read paired flag (1) is set if any of these are present
flag = or(flag, 1)
}

# Set the modified flag
$2 = flag

print
}
102 changes: 102 additions & 0 deletions BALSAMIC/assets/scripts/postprocess_gens_cnvkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import click
from BALSAMIC.utils.io import read_csv, write_list_of_strings


def calculate_log2_ratio(purity, log2_ratio, ploidy):
"""Adjuts log2 ratio according to purity and ploidy.
Based on method in PureCN: https://github.com/lima1/PureCN/issues/40
Method is not used currently due to questionable results.
"""
# Ensure that the inputs are within valid ranges
if not (0 <= purity <= 1):
raise ValueError("Purity must be between 0 and 1")

if ploidy <= 0:
raise ValueError("Ploidy must be a positive number")

# Ploidy and purity adjustment formula
log2_adjusted = (
purity * log2_ratio * ploidy + 2 * (1 - purity) * log2_ratio - 2 * (1 - purity)
) / (purity * ploidy)

return log2_adjusted


@click.command()
@click.option(
"-o",
"--output-file",
required=True,
type=click.Path(exists=False),
help="Name of output-file.",
)
@click.option(
"-c",
"--normalised-coverage-path",
required=True,
type=click.Path(exists=True),
help="Input CNVkit cnr. result.",
)
@click.option(
"-p",
"--tumor-purity-path",
required=True,
type=click.Path(exists=True),
help="Tumor purity file from PureCN",
)
def create_gens_cov_file(
output_file: str, normalised_coverage_path: str, tumor_purity_path: str
):
"""Post-processes the CNVkit .cnr output for upload to GENS.
Removes Antitarget regions, adjusts for purity and ploidy and outputs the coverages in multiple resolution-formats.
Args:
output_file: Path to GENS output.cov file
normalised_coverage_path: Path to input CNVkit cnr file.
tumor_purity_path: Path to PureCN purity estimate csv file
"""
log2_data = []

# Process CNVkit file
cnr_dict_list: list[dict] = read_csv(
csv_path=normalised_coverage_path, delimeter="\t"
)

# Process PureCN purity file
purecn_dict_list: list[dict] = read_csv(csv_path=tumor_purity_path, delimeter=",")
purity = float(purecn_dict_list[0]["Purity"])
ploidy = float(purecn_dict_list[0]["Ploidy"])

for row in cnr_dict_list:
if row["gene"] == "Antitarget":
continue

# find midpoint
start: int = int(row["start"])
end: int = int(row["end"])
region_size: int = end - start
midpoint: int = start + int(region_size / 2)

# adjust log2 ratio
log2: float = float(row["log2"])

# De-activate purity and ploidy adjustment
# log2: float = calculate_log2_ratio(purity, log2, ploidy)
# log2: float = round(log2, 4)

# store values in list
log2_data.append(f"{row['chromosome']}\t{midpoint - 1}\t{midpoint}\t{log2}")

# Write log2 data to output file
resolutions = ["o", "a", "b", "c", "d"]
resolution_log2_lines = [
f"{resolution}_{line}" for resolution in resolutions for line in log2_data
]
write_list_of_strings(resolution_log2_lines, output_file)


if __name__ == "__main__":
create_gens_cov_file()
2 changes: 1 addition & 1 deletion BALSAMIC/assets/scripts/preprocess_gens.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
"-s",
"--sequencing-type",
required=True,
type=click.Choice([SequencingType.WGS]),
type=click.Choice([SequencingType.WGS, SequencingType.TARGETED]),
help="Sequencing type used.",
)
@click.pass_context
Expand Down
16 changes: 16 additions & 0 deletions BALSAMIC/assets/scripts/sort_vcf.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/awk -f

BEGIN {
ENVIRON["LC_ALL"] = "en_US.UTF-8"
}

# If the line starts with a '#', it's a header, so print it as is
$1 ~ /^#/ {
print $0;
next;
}

# Otherwise, send the body lines to an external sort command
{
print $0 | "/usr/bin/sort -k1,1V -k2,2n"
}
Loading

0 comments on commit f654750

Please sign in to comment.