feat: extend bedfiles for CNV analysis (#1469)
## Added
- padding of bed-regions for CNVkit to minimum 100 bases
mathiasbio authored Jul 30, 2024
1 parent 8d6736a commit 2a2dcfe
Showing 19 changed files with 201 additions and 29 deletions.
38 changes: 38 additions & 0 deletions BALSAMIC/assets/scripts/extend_bedfile.py
@@ -0,0 +1,38 @@
import click


@click.command()
@click.argument("input_bedfile", type=click.Path(exists=True))
@click.argument("output_bedfile", type=click.Path())
@click.option("--min-region-size", default=20, help="Minimum region size to enforce.")
def extend_bedfile(input_bedfile: str, output_bedfile: str, min_region_size: int):
"""
Process a BED file to ensure regions are at least a minimum size.
Args:
input_bedfile (str): Input BED file path.
output_bedfile (str): Output BED file path.
min_region_size (int): Minimum region size to enforce.
"""
with open(input_bedfile, "r") as infile, open(output_bedfile, "w") as outfile:
for line in infile:
fields = line.strip().split("\t")

chrom: str = fields[0]
start = int(fields[1])
end = int(fields[2])

region_length: int = end - start
if region_length < min_region_size:
center = (start + end) // 2
half_size = min_region_size // 2
start = max(0, center - half_size)
end = center + half_size
if min_region_size % 2 != 0:
end += 1

outfile.write(f"{chrom}\t{start}\t{end}\n")


if __name__ == "__main__":
extend_bedfile()
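
For orientation, a minimal sketch of what the padding arithmetic in extend_bedfile.py does to a region shorter than the minimum; the coordinates below are hypothetical and not part of the commit:

# Hypothetical 30 bp region padded to a 100 bp minimum, mirroring the logic above.
start, end, min_region_size = 1000, 1030, 100
center = (start + end) // 2          # 1015
half_size = min_region_size // 2     # 50
start = max(0, center - half_size)   # 965
end = center + half_size             # 1065
if min_region_size % 2 != 0:         # 100 is even, so no extra base is appended
    end += 1
print(start, end, end - start)       # 965 1065 100

Note that the script's own default of 20 is overridden to 100 via --min-region-size in the new extend_short_bedregions rule.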
8 changes: 8 additions & 0 deletions BALSAMIC/constants/cluster_analysis.json
@@ -23,6 +23,10 @@
"time": "05:00:00",
"n": 10
},
"extend_short_bedregions": {
"time": "01:00:00",
"n": 1
},
"cnvkit_create_coverage": {
"time": "6:00:00",
"n": 18
@@ -424,6 +428,10 @@
"time": "01:00:00",
"n": 1
},
"bedtools_merge": {
"time": "01:00:00",
"n": 1
},
"bam_compress": {
"time": "04:00:00",
"n": 20
6 changes: 6 additions & 0 deletions BALSAMIC/constants/cluster_cache.json
@@ -53,6 +53,12 @@
"mail_type": "FAIL",
"partition": "core"
},
"expand_short_bedregions": {
"n": 1,
"time": "00:15:00",
"mail_type": "FAIL",
"partition": "core"
},
"fasta_index_reference_genome": {
"n": 24,
"time": "01:00:00",
6 changes: 4 additions & 2 deletions BALSAMIC/constants/rules.py
@@ -76,7 +76,8 @@
"snakemake_rules/umi/sentieon_consensuscall.rule",
],
"varcall": [
"snakemake_rules/variant_calling/cnvkit_preprocess.rule"
"snakemake_rules/variant_calling/extend_bed.rule",
"snakemake_rules/variant_calling/cnvkit_preprocess.rule",
"snakemake_rules/variant_calling/germline_tga.rule",
"snakemake_rules/variant_calling/split_bed.rule",
"snakemake_rules/variant_calling/somatic_cnv_tumor_only_tga.rule",
@@ -108,7 +109,8 @@
"snakemake_rules/umi/umi_sentieon_alignment.rule",
],
"varcall": [
"snakemake_rules/variant_calling/cnvkit_preprocess.rule"
"snakemake_rules/variant_calling/extend_bed.rule",
"snakemake_rules/variant_calling/cnvkit_preprocess.rule",
"snakemake_rules/variant_calling/germline_tga.rule",
"snakemake_rules/variant_calling/split_bed.rule",
"snakemake_rules/variant_calling/somatic_tumor_normal.rule",
3 changes: 3 additions & 0 deletions BALSAMIC/constants/workflow_params.py
@@ -121,6 +121,9 @@
"bam_post_processing": {
"manta_max_base_quality": 70,
},
"bed_pre_processing": {
"minimum_region_size": 100,
},
"common": {
"header_per_lane": "'@RG\\tID:{fastq_pattern}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
"header_per_sample": "'@RG\\tID:{sample}\\tSM:{sample_type}\\tPL:ILLUMINAi'",
11 changes: 11 additions & 0 deletions BALSAMIC/models/params.py
@@ -194,6 +194,16 @@ class BAMPostProcessingParams(BaseModel):
manta_max_base_quality: int


class BEDPreProcessingParams(BaseModel):
"""This class defines the params settings used as constants in bed pre-processing rules
Attributes:
minimum_region_size: int (required); the minimum region size in input bedfiles for CNV analysis
"""

minimum_region_size: int


class BalsamicWorkflowConfig(BaseModel):
"""Defines set of rules in balsamic workflow
@@ -216,6 +226,7 @@ class BalsamicWorkflowConfig(BaseModel):
"""

bam_post_processing: BAMPostProcessingParams
bed_pre_processing: BEDPreProcessingParams
common: ParamsCommon
insert_size_metrics: ParamsInsertSizeMetrics
manta: ParamsManta
10 changes: 5 additions & 5 deletions BALSAMIC/snakemake_rules/variant_calling/cnvkit_preprocess.rule
@@ -1,8 +1,8 @@
rule create_target:
input:
target_bait = target_bed,
refgene_flat = refgene_flat,
access_bed = access_5kb_hg19,
bed_expanded_merged = Path(cnv_dir + "capture_kit_expanded_merged.bed").as_posix(),
refgene_flat = config_model.reference["refgene_flat"],
access_bed = config_model.reference["access_regions"],
wake_up = result_dir + "start_analysis",
output:
targets = cnv_dir + "targets.bed",
@@ -15,8 +15,8 @@ rule create_target:
Path(benchmark_dir, "cnvkit.targets.tsv").as_posix()
shell:
"""
cnvkit.py target {input.target_bait} --annotate {input.refgene_flat} --split -o {output.targets};
cnvkit.py antitarget {input.target_bait} -g {input.access_bed} -o {output.antitargets};
cnvkit.py target {input.bed_expanded_merged} --annotate {input.refgene_flat} --split -o {output.targets};
cnvkit.py antitarget {input.bed_expanded_merged} -g {input.access_bed} -o {output.antitargets};
"""

rule create_coverage:
42 changes: 42 additions & 0 deletions BALSAMIC/snakemake_rules/variant_calling/extend_bed.rule
@@ -0,0 +1,42 @@

rule extend_short_bedregions:
input:
baits_bed = config_model.panel.capture_kit,
wake_up= result_dir + "start_analysis",
output:
baits_bed_expanded=Path(cnv_dir + "capture_kit_expanded.bed").as_posix(),
benchmark:
Path(benchmark_dir,"extend_short_bedregions.tsv").as_posix()
singularity:
Path(singularity_image,config["bioinfo_tools"].get("pysam") + ".sif").as_posix()
params:
bedfile_extend = get_script_path("extend_bedfile.py"),
minimum_region_size = params.bed_pre_processing.minimum_region_size
threads:
get_threads(cluster_config, "extend_short_bedregions")
message:
"Extending regions in bedfiel to a minimum size of {params.minimum_region_size}."
shell:
"""
python {params.bedfile_extend} --min-region-size {params.minimum_region_size} {input.baits_bed} {output.baits_bed_expanded} ;
"""


rule bedtools_sort_and_merge:
input:
bed_expanded = Path(cnv_dir + "capture_kit_expanded.bed").as_posix(),
output:
bed_expanded_merged = Path(cnv_dir + "capture_kit_expanded_merged.bed").as_posix(),
benchmark:
Path(benchmark_dir, 'bedtools_merge_expanded_bedfile.tsv').as_posix()
singularity:
Path(singularity_image, config["bioinfo_tools"].get("bedtools") + ".sif").as_posix()
threads:
get_threads(cluster_config, "bedtools_merge")
message:
"Running bedtools sort and merge to merge potentially overlapping regions."
shell:
"""
bedtools sort -i {input.bed_expanded} > {input.bed_expanded}_sorted.bed
bedtools merge -i {input.bed_expanded}_sorted.bed > {output.bed_expanded_merged}
"""
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@

rule gatk_collectreadcounts:
input:
bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample),
bam = lambda wildcards: config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = wildcards.sample, specified_suffix="dedup"),
genome_interval = config["reference"]["genome_interval"]
output:
readcounts_hdf5 = cnv_dir + "{sample}.collectreadcounts.hdf5"
Original file line number Diff line number Diff line change
@@ -43,7 +43,6 @@ rm -rf {params.tmpdir};
rule cnvkit_segment_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
baits_bed = config["panel"]["capture_kit"],
fasta = config["reference"]["reference_genome"],
refgene_flat = config["reference"]["refgene_flat"],
tumor_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn", sample=tumor_sample),
@@ -78,21 +77,21 @@ if [[ ! -f "{params.pon}" ]]; then
# Compile a coverage reference from the given list of files
cnvkit.py reference \
{output.normal_target_coverage} \
{output.normal_antitarget_coverage} \
{input.normal_target_cnn} \
{input.normal_antitarget_cnn} \
--fasta {input.fasta} \
--output {params.normal_reference_cnn};
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.normal_reference_cnn} \
--output {output.cnr};
else
echo "PON reference exists- Using it for coverage correction"
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.pon} \
--output {output.cnr};
@@ -203,14 +202,11 @@ rule cnvkit_call_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
fasta = config["reference"]["reference_genome"],
baits_bed = config["panel"]["capture_kit"],
refgene_flat = config["reference"]["refgene_flat"],
purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv",
cns_initial = cnv_dir + "tumor.initial.cns",
cnr = cnv_dir + "tumor.merged.cnr",
snv_merged = vcf_dir + "SNV.germline.merged.dnascope.vcf.gz",
bamT = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = tumor_sample),
bamN = config_model.get_final_bam_name(bam_dir = bam_dir, sample_name = normal_sample),
output:
cns = cnv_dir + "tumor.merged.cns",
gene_breaks = cnv_dir + config["analysis"]["case_id"] + ".gene_breaks",
Original file line number Diff line number Diff line change
@@ -4,7 +4,6 @@
rule cnvkit_segment_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
baits_bed = config["panel"]["capture_kit"],
fasta = config["reference"]["reference_genome"],
refgene_flat = config["reference"]["refgene_flat"],
tumor_target_cnn=expand(cnv_dir + "{sample}.targetcoverage.cnn",sample=tumor_sample),
@@ -34,19 +33,19 @@ rule cnvkit_segment_CNV_research:
if [[ ! -f "{params.pon}" ]]; then
cnvkit.py reference --output {params.flat_reference_cnn} \
--fasta {input.fasta} \
--targets {output.targets} \
--antitargets {output.antitargets};
--targets {input.tumor_target_cnn} \
--antitargets {input.tumor_antitarget_cnn};
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.flat_reference_cnn} \
--output {output.cnr};
else
echo "PON reference exists- Using it for coverage correction"
cnvkit.py fix {output.tumor_target_coverage} \
{output.tumor_antitarget_coverage} \
cnvkit.py fix {input.tumor_target_cnn} \
{input.tumor_antitarget_cnn} \
{params.pon} \
--output {output.cnr};
@@ -153,7 +152,6 @@ rule cnvkit_call_CNV_research:
input:
access_bed = config["reference"]["access_regions"],
fasta = config["reference"]["reference_genome"],
baits_bed = config["panel"]["capture_kit"],
refgene_flat = config["reference"]["refgene_flat"],
purity_ploidy = cnv_dir + "CNV.somatic." + config["analysis"]["case_id"] + ".purecn.purity.csv",
cns_initial= cnv_dir + "tumor.initial.cns",
3 changes: 2 additions & 1 deletion BALSAMIC/workflows/PON.smk
@@ -68,6 +68,7 @@ if sequence_type == SequencingType.TARGETED:
rules_to_include.append("snakemake_rules/quality_control/fastp_tga.rule")
rules_to_include.append("snakemake_rules/align/tga_sentieon_alignment.rule")
rules_to_include.append("snakemake_rules/align/tga_bam_postprocess.rule")
rules_to_include.append("snakemake_rules/variant_calling/extend_bed.rule")
rules_to_include.append("snakemake_rules/variant_calling/cnvkit_preprocess.rule")
else:
rules_to_include.append("snakemake_rules/quality_control/fastp_wgs.rule")
@@ -91,7 +92,7 @@ if pon_workflow in [PONWorkflow.GENS_MALE, PONWorkflow.GENS_FEMALE]:
rules_to_include.append("snakemake_rules/variant_calling/gatk_read_counts.rule")
rules_to_include.append("snakemake_rules/pon/gens_create_pon.rule")

pon_finish = expand(analysis_dir + "analysis_PON_finish")
pon_finish = Path(analysis_dir + "analysis_PON_finish").as_posix()

for r in rules_to_include:
include: Path(BALSAMIC_DIR, r).as_posix()
1 change: 0 additions & 1 deletion BALSAMIC/workflows/balsamic.smk
@@ -90,7 +90,6 @@ delivery_dir: str = Path(result_dir, "delivery").as_posix() + "/"
umi_dir: str = Path(result_dir, "umi").as_posix() + "/"
umi_qc_dir: str = Path(qc_dir, "umi_qc").as_posix() + "/"


# Annotations
research_annotations = []
clinical_annotations = []
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -7,6 +7,7 @@ Added:
* MSI analysis to the tumor-normal workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1454
* Sentieon install directory path to case config arguments https://github.com/Clinical-Genomics/BALSAMIC/pull/1461
* UMI extraction and deduplication to TGA workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1358
* Padding of bed-regions for CNVkit to minimum 100 bases https://github.com/Clinical-Genomics/BALSAMIC/pull/1469
* Added min mapq 20 to CNVkit PON workflow https://github.com/Clinical-Genomics/BALSAMIC/pull/1465

Changed:
2 changes: 2 additions & 0 deletions docs/balsamic_pon.rst
@@ -54,6 +54,8 @@ When creating a new PON reference file, the next steps have to be followed:
/path/analysis/analysis_PON_finish
/path/analysis/cnv/<panel_name>_CNVkit_PON_reference_<version>.cnn
``Note: The bedfile from the bait-set is padded to the minimum bed region size set in Balsamic both when the PON is generated and during the CNVkit analysis, to avoid CNVkit filtering out short regions.``

Using the PON during analysis
-----------------------------

6 changes: 6 additions & 0 deletions tests/conftest.py
@@ -667,6 +667,12 @@ def references_dir(test_data_dir) -> Path:
return Path(test_data_dir, "references")


@pytest.fixture(scope="session")
def bedfile_path(test_data_dir) -> Path:
"""Return bedfile path."""
return Path(test_data_dir, "bedfiles", "test_bedfile.bed")


@pytest.fixture(scope="session")
def purity_csv_path(test_data_dir) -> Path:
"""Return pureCN purity CSV path."""
2 changes: 1 addition & 1 deletion tests/models/test_config_models.py
@@ -418,5 +418,5 @@ def test_get_final_bam_name_pon(balsamic_pon_model: ConfigModel):
)

# Then retrieved final bam names should match the expected format and be identical regardless of request parameter
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.bam"
expected_final_bam_name = f"{bam_dir}{sample_type}.{sample_name}.dedup.fixmate.bam"
assert expected_final_bam_name == bam_name_sample_name