feat: extend bedfiles for CNV analysis (#1469)
## Added
- padding of bed-regions for CNVkit to minimum 100 bases
mathiasbio authored Jul 30, 2024
1 parent 8d6736a commit 2a2dcfe
Showing 19 changed files with 201 additions and 29 deletions.
38 changes: 38 additions & 0 deletions BALSAMIC/assets/scripts/extend_bedfile.py
@@ -0,0 +1,38 @@
import click


@click.command()
@click.argument("input_bedfile", type=click.Path(exists=True))
@click.argument("output_bedfile", type=click.Path())
@click.option("--min-region-size", default=20, help="Minimum region size to enforce.")
def extend_bedfile(input_bedfile: str, output_bedfile: str, min_region_size: int):
"""
Process a BED file to ensure regions are at least a minimum size.
Args:
input_bedfile (str): Input BED file path.
output_bedfile (str): Output BED file path.
min_region_size (int): Minimum region size to enforce.
"""
with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
for line in infile:
fields = line.strip().split("\t")

chrom: str = fields[0]
start = int(fields[1])
end = int(fields[2])

region_length: int = end - start
if region_length < min_region_size:
center = (start + end) // 2
half_size = min_region_size // 2
start = max(0, center - half_size)
end = center + half_size
if min_region_size % 2 != 0:
end += 1

outfile.write(f"{chrom}\t{start}\t{end}\n")


if __name__ == "__main__":
extend_bedfile()
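
For orientation, a minimal sketch of what the padding arithmetic in extend_bedfile.py does to a region shorter than the minimum; the coordinates below are hypothetical and not part of the commit:

# Hypothetical 30 bp region padded to a 100 bp minimum, mirroring the logic above.
start, end, min_region_size = 1000, 1030, 100
center = (start + end) // 2          # 1015
half_size = min_region_size // 2     # 50
start = max(0, center - half_size)   # 965
end = center + half_size             # 1065
if min_region_size % 2 != 0:         # 100 is even, so no extra base is appended
    end += 1
print(start, end, end - start)       # 965 1065 100

Note that the script's own default of 20 is overridden to 100 via --min-region-size in the new extend_short_bedregions rule.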
8 changes: 8 additions & 0 deletions BALSAMIC/constants/cluster_analysis.json
@@ -23,6 +23,10 @@
"time": "05:00:00",
"n": 10
},
"extend_short_bedregions": {
"time": "01:00:00",
"n": 1
},
"cnvkit_create_coverage": {
"time": "6:00:00",
"n": 18
@@ -424,6 +428,10 @@
"time": "01:00:00",
"n": 1
},
"bedtools_merge": {
"time": "01:00:00",
"n": 1
},
"bam_compress": {
"time": "04:00:00",
"n": 20
6 changes: 6 additions & 0 deletions BALSAMIC/constants/cluster_cache.json
@@ -53,6 +53,12 @@
"mail_type": "FAIL",
"partition": "core"
},
"expand_short_bedregions": {
"n": 1,
"time": "00:15:00",
"mail_type": "FAIL",
"partition": "core"
},
"fasta_index_reference_genome": {
"n": 24,
"time": "01:00:00",
6 changes: 4 additions & 2 deletions BALSAMIC/constants/rules.py
@@ -76,7 +76,8 @@
"snakemake_rules/umi/sentieon_consensuscall.rule",
],
"varcall": [
"snakemake_rules/variant_calling/cnvkit_preprocess.rule"
"snakemake_rules/variant_calling/extend_bed.rule",
"snakemake_rules/variant_calling/cnvkit_preprocess.rule",
"snakemake_rules/variant_calling/germline_tga.rule",
"snakemake_rules/variant_calling/split_bed.rule",
"snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule",
@@ -108,7 +109,8 @@
"snakemake_rules/umi/umi_sentieon_alignment.rule",
],
"varcall": [
"snakemake_rules/variant_calling/cnvkit_preprocess.rule"
"snakemake_rules/variant_calling/extend_bed.rule",
"snakemake_rules/variant_calling/cnvkit_preprocess.rule",
"snakemake_rules/variant_calling/germline_tga.rule",
"snakemake_rules/variant_calling/split_bed.rule",
"snakemake_rules/variant_calling/somatic_tumor_normal.rule",
3 changes: 3 additions & 0 deletions BALSAMIC/constants/workflow_params.py
@@ -121,6 +121,9 @@
"bam_post_processing": {
"manta_max_base_quality": 70,
},
"bed_pre_processing": {
"minimum_region_size": 100,
},
"common": {
"header_per_lane": "'@RG\\tID:{fastq_pattern}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
"header_per_sample": "'@RG\\tID:{sample}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
11 changes: 11 additions & 0 deletions BALSAMIC/models/params.py
@@ -194,6 +194,16 @@ class BAMPostProcessingParams(BaseModel):
manta_max_base_quality: int


class BEDPreProcessingParams(BaseModel):
"""This class defines the params settings used as constants in bed pre-processing rules
Attributes:
minimum_region_size: int (required); the minimum region size in input bedfiles for CNV analysis
"""

minimum_region_size: int


class BalsamicWorkflowConfig(BaseModel):
"""Defines set of rules in balsamic workflow
@@ -216,6 +226,7 @@ class BalsamicWorkflowConfig(BaseModel):
"""

bam_post_processing: BAMPostProcessingParams
bed_pre_processing: BEDPreProcessingParams
common: ParamsCommon
insert_size_metrics: ParamsInsertSizeMetrics
manta: ParamsManta
10 changes: 5 additions & 5 deletions BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule
@@ -1,8 +1,8 @@
rule create_target:
input:
target_bait = target_bed,
refgene_flat = refgene_flat,
access_bed = access_5kb_hg19,
bed_expanded_merged = Path(cnv_dir + "capture_kit_expanded_merged.bed").as_posix(),
refgene_flat = config_model.reference["refgene_flat"],
access_bed = config_model.reference["access_regions"],
wake_up = result_dir + "start_analysis",
output:
targets = cnv_dir + "targets.bed",
@@ -15,8 +15,8 @@ rule create_target:
Path(benchmark_dir, "cnvkit.targets.tsv").as_posix()
shell:
"""
cnvkit.py target {input.target_bait} --annotate {input.refgene_flat} --split -o {output.targets};
cnvkit.py antitarget {input.target_bait} -g {input.access_bed} -o {output.antitargets};
cnvkit.py target {input.bed_expanded_merged} --annotate {input.refgene_flat} --split -o {output.targets};
cnvkit.py antitarget {input.bed_expanded_merged} -g {input.access_bed} -o {output.antitargets};
"""

rule create_coverage:
42 changes: 42 additions & 0 deletions BALSAMIC/snakemake_rules/variant_calling/extend_bed.rule
@@ -0,0 +1,42 @@

rule extend_short_bedregions:
input:
baits_bed = config_model.panel.capture_kit,
wake_up= result_dir + "start_analysis",
output:
baits_bed_expanded=Path(cnv_dir + "capture_kit_expanded.bed").as_posix(),
benchmark:
Path(benchmark_dir,"extend_short_bedregions.tsv").as_posix()
singularity:
Path(singularity_image,config["bioinfo_tools"].get("pysam") + ".sif").as_posix()
params:
bedfile_extend = get_script_path("extend_bedfile.py"),
minimum_region_size = params.bed_pre_processing.minimum_region_size
threads:
get_threads(cluster_config, "extend_short_bedregions")
message:
"Extending regions in bedfiel to a minimum size of {params.minimum_region_size}."
shell:
"""
python {params.bedfile_extend} --min-region-size {params.minimum_region_size} {input.baits_bed} {output.baits_bed_expanded} ;
"""


rule bedtools_sort_and_merge:
input:
bed_expanded = Path(cnv_dir + "capture_kit_expanded.bed").as_posix(),
output:
bed_expanded_merged = Path(cnv_dir + "capture_kit_expanded_merged.bed").as_posix(),
benchmark:
Path(benchmark_dir, 'bedtools_merge_expanded_bedfile.tsv').as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix()
threads:
get_threads(cluster_config, "bedtools_merge")
message:
"Running bedtools sort and merge to merge potentially overlapping regions."
shell:
"""
bedtools sort -i {input.bed_expanded} > {input.bed_expanded}_sorted.bed
bedtools merge -i {input.bed_expanded}_sorted.bed > {output.bed_expanded_merged}
"""
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@

rule gatk_collectreadcounts:
input:
bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample),
bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample, specified_suffix="dedup"),
genome_interval = config["reference"]["genome_interval"]
output:
readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5"
Original file line number Diff line number Diff line change
@@ -43,7 +43,6 @@ rm -rf {params.tmpdir};
rule cnvkit_segment_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
baits_bed = config["panel"]["capture_kit"],
fasta = config["reference"]["reference_genome"],
refgene_flat = config["reference"]["refgene_flat"],
tumor_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn", sample=tumor_sample),
@@ -78,21 +77,21 @@ if [[ ! -f "{params.pon}" ]]; then
# Compile a coverage reference from the given list of files
cnvkit.py reference \
{output.normal_target_coverage} \
{output.normal_antitarget_coverage} \
{input.normal_target_cnn} \
{input.normal_antitarget_cnn} \
--fasta {input.fasta} \
--output {params.normal_reference_cnn};
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.normal_reference_cnn} \
--output {output.cnr};
else
echo "PON reference exists- Using it for coverage correction"
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.pon} \
--output {output.cnr};
@@ -203,14 +202,11 @@ rule cnvkit_call_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
fasta = config["reference"]["reference_genome"],
baits_bed = config["panel"]["capture_kit"],
refgene_flat = config["reference"]["refgene_flat"],
purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv",
cns_initial = cnv_dir + "tumor.initial.cns",
cnr = cnv_dir + "tumor.merged.cnr",
snv_merged = vcf_dir + "SNV.germline.merged.dnascope.vcf.gz",
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample),
output:
cns = cnv_dir + "tumor.merged.cns",
gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks",
Original file line number Diff line number Diff line change
@@ -4,7 +4,6 @@
rule cnvkit_segment_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
baits_bed = config["panel"]["capture_kit"],
fasta = config["reference"]["reference_genome"],
refgene_flat = config["reference"]["refgene_flat"],
tumor_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn",sample=tumor_sample),
@@ -34,19 +33,19 @@ rule cnvkit_segment_CNV_research:
if [[ ! -f "{params.pon}" ]]; then
cnvkit.py reference --output {params.flat_reference_cnn} \
--fasta {input.fasta} \
--targets {output.targets} \
--antitargets {output.antitargets};
--targets {input.tumor_target_cnn} \
--antitargets {input.tumor_antitarget_cnn};
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.flat_reference_cnn} \
--output {output.cnr};
else
echo "PON reference exists- Using it for coverage correction"
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.pon} \
--output {output.cnr};
@@ -153,7 +152,6 @@ rule cnvkit_call_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
fasta = config["reference"]["reference_genome"],
baits_bed = config["panel"]["capture_kit"],
refgene_flat = config["reference"]["refgene_flat"],
purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv",
cns_initial= cnv_dir + "tumor.initial.cns",
3 changes: 2 additions & 1 deletion BALSAMIC/workflows/PON.smk
@@ -68,6 +68,7 @@ if sequence_type == SequencingType.TARGETED:
rules_to_include.append("snakemake_rules/quality_control/fastp_tga.rule")
rules_to_include.append("snakemake_rules/align/tga_sentieon_alignment.rule")
rules_to_include.append("snakemake_rules/align/tga_bam_postprocess.rule")
rules_to_include.append("snakemake_rules/variant_calling/extend_bed.rule")
rules_to_include.append("snakemake_rules/variant_calling/cnvkit_preprocess.rule")
else:
rules_to_include.append("snakemake_rules/quality_control/fastp_wgs.rule")
@@ -91,7 +92,7 @@ if pon_workflow in [PONWorkflow.GENS_MALE, PONWorkflow.GENS_FEMALE]:
rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule")
rules_to_include.append("snakemake_rules/pon/gens_create_pon.rule")

pon_finish = expand(analysis_dir + "analysis_PON_finish")
pon_finish = Path(analysis_dir + "analysis_PON_finish").as_posix()

for r in rules_to_include:
include: Path(BALSAMIC_DIR, r).as_posix()
1 change: 0 additions & 1 deletion BALSAMIC/workflows/balsamic.smk
@@ -90,7 +90,6 @@ delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/"
umi_dir: str = Path(result_dir, "umi").as_posix() + "/"
umi_qc_dir: str = Path(qc_dir, "umi_qc").as_posix() + "/"


# Annotations
research_annotations = []
clinical_annotations = []
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -7,6 +7,7 @@ Added:
* MSI analysis to the tumor-normal workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1454
* Sentieon install directory path to case config arguments https://github.com/Clinical-Genomics/BALSAMIC/pull/1461
* UMI extraction and deduplication to TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1358
* Padding of bed-regions for CNVkit to minimum 100 bases https://github.com/Clinical-Genomics/BALSAMIC/pull/1469
* Added min mapq 20 to CNVkit PON workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1465

Changed:
2 changes: 2 additions & 0 deletions docs/balsamic_pon.rst
@@ -54,6 +54,8 @@ When creating a new PON reference file, the next steps have to be followed:
/path/analysis/analysis_PON_finish
/path/analysis/cnv/<panel_name>_CNVkit_PON_reference_<version>.cnn
``Note: The bedfile from the bait-set is padded to the minimum bed region size set in Balsamic both when the PON is generated and during the CNVkit analysis, to avoid CNVkit filtering out short regions.``

Using the PON during analysis
-----------------------------

6 changes: 6 additions & 0 deletions tests/conftest.py
@@ -667,6 +667,12 @@ def references_dir(test_data_dir) -> Path:
return Path(test_data_dir, "references")


@pytest.fixture(scope="session")
def bedfile_path(test_data_dir) -> Path:
"""Return bedfile path."""
return Path(test_data_dir, "bedfiles", "test_bedfile.bed")


@pytest.fixture(scope="session")
def purity_csv_path(test_data_dir) -> Path:
"""Return pureCN purity CSV path."""
2 changes: 1 addition & 1 deletion tests/models/test_config_models.py
@@ -418,5 +418,5 @@ def test_get_final_bam_name_pon(balsamic_pon_model: ConfigModel):
)

# Then retrieved final bam names should match the expected format and be identical regardless of request parameter
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam"
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.fixmate.bam"
assert expected_final_bam_name == bam_name_sample_name