Merge pull request #53 from HomoPolyethylen/dev

civic+cgi evidence bug fixes & example data update closes #51 closes #52
qbic-pipelines · Sep 9, 2024 · 21b8e58 · 21b8e58
2 parents c0d53ac + c1b68fb
commit 21b8e58
Show file tree

Hide file tree

Showing 18 changed files with 839 additions and 715 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,20 @@
 Changelog
 ============
 
+0.5.5 - Sulfur Io  (2024-09-09)
+---------------------------------------------
+
+**Added**
+
+**Fixed**
+
+* [#52](https://github.com/qbic-pipelines/querynator/issues/52): issue that led to an inconsistent number of fields for the CIViC evidences
+* [#51](https://github.com/qbic-pipelines/querynator/issues/51): CGI evidences are now filtered by the specified cancer type
+
+**Dependencies**
+
+**Deprecated**
+
 0.5.4 - Sulfur Io  (2024-07-24)
 ---------------------------------------------
 

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -277,7 +277,7 @@ The command above generates the following result directory:
     outdir
     ├── combined_files
     |   ├── alterations_vep.tsv
-    |   ├── biomarkers_linked.tsv
+    |   ├── biomarkers_linked_filtered.tsv
     |   ├── civic_cgi_vep.tsv
     |   └── civic_vep.tsv
     ├── report

diff --git a/example_files/cgi_test_out/cgi_test_out.cgi_results.zip b/example_files/cgi_test_out/cgi_test_out.cgi_results.zip
diff --git a/example_files/cgi_test_out/cgi_test_out.cgi_results/alterations.tsv b/example_files/cgi_test_out/cgi_test_out.cgi_results/alterations.tsv
diff --git a/example_files/cgi_test_out/cgi_test_out.cgi_results/biomarkers.tsv b/example_files/cgi_test_out/cgi_test_out.cgi_results/biomarkers.tsv
diff --git a/example_files/cgi_test_out/cgi_test_out.cgi_results/cna_analysis.tsv b/example_files/cgi_test_out/cgi_test_out.cgi_results/cna_analysis.tsv
diff --git a/example_files/cgi_test_out/cgi_test_out.cgi_results/input01.tsv b/example_files/cgi_test_out/cgi_test_out.cgi_results/input01.tsv
diff --git a/example_files/cgi_test_out/cgi_test_out.cgi_results/metadata.txt b/example_files/cgi_test_out/cgi_test_out.cgi_results/metadata.txt
@@ -1,5 +1,5 @@
-CGI query date: 2023-05-30
+CGI query date: 2024-09-09
 API version: https://www.cancergenomeinterpreter.org/api/v1/
-Input mutations: /Users/students/Documents/work_dir/querynator/example_files/example.vcf
+Input mutations: /home-link/zxmgc83/querynator/example_files/example.vcf
 Reference genome: GRCh37
 Filtered out synonymous & low impact variants based on VEP annotation
diff --git a/example_files/cgi_test_out/cgi_test_out.cgi_results/report.txt b/example_files/cgi_test_out/cgi_test_out.cgi_results/report.txt
diff --git a/example_files/cgi_test_out/cgi_test_out.cgi_results/summary.txt b/example_files/cgi_test_out/cgi_test_out.cgi_results/summary.txt
@@ -0,0 +1,29 @@
+===============================
+=== Summary of the analysis ===
+===============================
+
+=== CGI-ANALYSIS ===
+Analysis Code: CGI_query
+Analysis ID: 45474d2bd5ff2e2a81e3
+CGI version: v23.12.2
+Date: 2024-09-09 12:17:40
+
+=== INPUT ===
+Analysed mutations: 88
+Analysed cnas: 0
+Analysed fusions: 0
+Total samples: 1
+Cancer type: BRCA
+Reference genome: hg19
+
+=== ALTERATIONS ===
+Driver mutations: 84
+Predicted and annotated drivers: 47
+Predicted drivers: 4
+Annotated drivers: 33
+
+=== BIOMARKERS ===
+Biomarkers in cancer type: 33
+Biomarkers in cancer type - Level A: 1
+Biomarkers in other cancer type: 441
+
diff --git a/example_files/cgi_test_out/vcf_files/cgi_test_out.filtered_variants.vcf b/example_files/cgi_test_out/vcf_files/cgi_test_out.filtered_variants.vcf
diff --git a/example_files/civic_test_out/civic_test_out.civic_results.tsv b/example_files/civic_test_out/civic_test_out.civic_results.tsv
diff --git a/example_files/civic_test_out/metadata.txt b/example_files/civic_test_out/metadata.txt
@@ -1,6 +1,6 @@
-CIViC query date: 2023-05-30
+CIViC query date: 2024-09-09
 CIViCpy version: 3.0.0
 Search mode: exact
 Reference genome: GRCh37
 Filtered out synonymous & low impact variants based on VEP annotation
-Input File: /Users/students/Documents/work_dir/querynator/example_files/example.vcf
+Input File: /home-link/zxmgc83/querynator/example_files/example.vcf
diff --git a/example_files/civic_test_out/vcf_files/civic_test_out.filtered_variants.vcf b/example_files/civic_test_out/vcf_files/civic_test_out.filtered_variants.vcf
diff --git a/querynator/query_api/civic_api.py b/querynator/query_api/civic_api.py
@@ -159,7 +159,7 @@ def append_to_dict(dict1, dict2):
                 dict1[key].append(value)
             else:
                 if dict1[key] == "":
-                    dict1[key] = dict2[key]
+                    dict1[key] = [dict2[key]]
                 else:
                     dict1[key] = [dict1[key], dict2[key]]
 
@@ -417,12 +417,12 @@ def get_evidence_information_from_variant(variant_obj, diseases):
                     "evidence_level": evidence.evidence_level,
                     "evidence_support": evidence.evidence_direction,
                     "evidence_type": evidence.evidence_type,
-                    "evidence_phenotypes": ", ".join([i.name for i in evidence.phenotypes]),
+                    "evidence_phenotypes": "+".join([i.name for i in evidence.phenotypes]),
                     "evidence_rating": evidence.rating,
                     "evidence_significance": evidence.significance,
                     "evidence_source": evidence.source.name,
                     "evidence_status": evidence.status,
-                    "evidence_therapies": ", ".join([i.name for i in evidence.therapies]),
+                    "evidence_therapies": "+".join([i.name for i in evidence.therapies]),
                     "evidence_therapy_interaction_type": evidence.therapy_interaction_type,
                 }
             except IndexError:

diff --git a/querynator/report_scripts/combine_cgi.py b/querynator/report_scripts/combine_cgi.py
@@ -293,7 +293,9 @@ def link_biomarkers(biomarkers_df, logger):
 
 def get_highest_evidence(row, biomarkers_linked):
     """
-    get highest associated CGI evidence of the current alteration (A-D) from the biomarkers datafrane
+    get highest associated CGI evidence of the current alteration (A-D) from the biomarkers dataframe.
+
+    consider evidence matched on gene, alteration and cancer type, as well as off-label use (level A evidence for different cancer is level C evidence for this cancer).
 
     :param row: row of a pandas DataFrame
     :type row: pandas Series
@@ -308,10 +310,8 @@ def get_highest_evidence(row, biomarkers_linked):
         if row["Protein Change_CGI"].startswith("*"):
             row["Protein Change_CGI"] = row["Protein Change_CGI"].replace("*", "\*")
 
-        # highest evidence level has lowest char value (A<B<C<D)
-        max_evidence_level = biomarkers_linked.loc[
-            (biomarkers_linked["alterations_link"].str.contains(row["Protein Change_CGI"]))
-        ]["Evidence"].min()
+        curr_alteration_msk = biomarkers_linked["alterations_link"].str.contains(row["Protein Change_CGI"])
+        max_evidence_level = biomarkers_linked.loc[curr_alteration_msk, "Evidence"].min()
 
         return max_evidence_level
 
@@ -342,6 +342,30 @@ def check_wildtypes(biomarkers: pd.DataFrame, vcf: pd.DataFrame, logger) -> None
     return
 
 
+def filter_biomarkers(biomarkers_df: pd.DataFrame, logger) -> pd.DataFrame:
+    """
+    adapt biomarkers to only consider
+    - "complete" biomarkers (gene, alteration)
+    - matches between alteration & biomarker (gene, alteration, cancer type)
+    - off-label use (level A evidence for different cancer is level C evidence for this cancer)
+
+    :biomarkers_df : the dataframe containing the cgi result 'biomarkers.tsv'
+    :logger        : the logger
+    :return        : the adapted biomarkers dataframe
+    """
+    complete_biom_msk = biomarkers_df.BioM == "complete"
+    match_msk = biomarkers_df["Match"] == "YES"
+    off_label_msk = ~match_msk & (biomarkers_df["Evidence"] == "A")
+    filter = complete_biom_msk & (match_msk | off_label_msk)
+
+    biomarkers_df.loc[off_label_msk, "Evidence"] = "C"
+    biomarkers_df["alterations_link"] = biomarkers_df["alterations_link"].astype(str)
+
+    logger.info(f"CGI: filtered {(~filter).sum()} irrelevant biomarkers, {filter.sum()} remaining")
+
+    return biomarkers_df.loc[filter]
+
+
 def combine_cgi(cgi_path, outdir, logger):
     """
     Command to combine the cgi results with the vcf's VEP annotation
@@ -369,19 +393,14 @@ def combine_cgi(cgi_path, outdir, logger):
     alterations_df = read_modify_alterations(alterations_path)
     merged_df = merge_alterations_vep(vep_df, alterations_df)
 
-    # link alterations in biomarkers
+    # link alterations in biomarkers & filter
     biomarkers_df = link_biomarkers(biomarkers_df, logger)
-    biomarkers_df.to_csv(f"{outdir}/combined_files/biomarkers_linked.tsv", sep="\t", index=False)
+    biomarkers_df = filter_biomarkers(biomarkers_df, logger)
+    biomarkers_df.to_csv(f"{outdir}/combined_files/biomarkers_linked_filtered.tsv", sep="\t", index=False)
 
     check_wildtypes(biomarkers_df, vep_df, logger)
 
     # add CGI evidence col to merged_df
-
-    # adapt biomarkers to only consider "complete" matches between alteration & biomarker
-    biomarkers_df = biomarkers_df[biomarkers_df.BioM == "complete"]
-    # biomarkers_linked["alterations_link"] = biomarkers_linked["alterations_link"].astype(str)
-    biomarkers_df["alterations_link"] = biomarkers_df["alterations_link"].apply(str)
-    # add CGI evidence col
     merged_df["evidence_CGI"] = merged_df.apply(lambda x: get_highest_evidence(x, biomarkers_df), axis=1)
     # write merged to report dir
     merged_df.to_csv(f"{outdir}/combined_files/alterations_vep.tsv", sep="\t", index=False)
diff --git a/querynator/report_scripts/create_report.py b/querynator/report_scripts/create_report.py
@@ -705,7 +705,7 @@ def create_report_htmls(outdir, basename, civic_path, logger):
 
     # read in files
     vep_civic_cgi_merge = pd.read_csv(f"{outdir}/combined_files/civic_cgi_vep.tsv", sep="\t")
-    biomarkers_df = pd.read_csv(f"{outdir}/combined_files/biomarkers_linked.tsv", sep="\t")
+    biomarkers_df = pd.read_csv(f"{outdir}/combined_files/biomarkers_linked_filtered.tsv", sep="\t")
     metadata_civic = f"{civic_path}/metadata.txt"  # read reference genome from metadata file
     # get path to save individual reports
     report_path = f"{os.path.abspath(outdir)}/report/variant_reports"

diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 
 from setuptools import find_packages, setup
 
-VERSION = "0.5.4"
+VERSION = "0.5.5"
 
 with open("README.rst") as readme_file:
     readme = readme_file.read()