Merge pull request #267 from theGreatHerrLebert/feature/property-pred…

…iction Feature/property prediction
theGreatHerrLebert · Oct 9, 2024 · 3b2f7e6 · 3b2f7e6
2 parents 9736147 + f33d2d2
commit 3b2f7e6
Show file tree

Hide file tree

Showing 5 changed files with 108 additions and 25 deletions.
diff --git a/imspy/imspy/algorithm/intensity/predictors.py b/imspy/imspy/algorithm/intensity/predictors.py
@@ -16,8 +16,8 @@
                                                post_process_predicted_fragment_spectra, reshape_dims, beta_score)
 
 from imspy.data.peptide import PeptideProductIonSeriesCollection, PeptideSequence
-
 from imspy.simulation.utility import flatten_prosit_array
+from sagepy.rescore.utility import dict_to_dense_array, spectral_entropy_similarity, spectral_correlation
 
 def predict_intensities_prosit(
         psm_collection: List[PeptideSpectrumMatch],
@@ -70,12 +70,18 @@ def predict_intensities_prosit(
     psm_collection_intensity = associate_fragment_ions_with_prosit_predicted_intensities(
         psm_collection, intensity_pred, num_threads=num_threads)
 
-    for psm, psm_intensity in zip(psm_collection, psm_collection_intensity):
+    # calculate the spectral similarity metrics
+    for psm, psm_intensity, prosit_intensity in tqdm(zip(psm_collection, psm_collection_intensity, intensity_pred),
+                                                      desc='Calc spectral similarity metrics', ncols=100, disable=not verbose):
+
         psm.fragments_predicted = psm_intensity.fragments_predicted
         psm.cosine_similarity = psm_intensity.cosine_similarity
+        psm.prosit_intensities = prosit_intensity
 
-    for ps in psm_collection:
-        ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted)
+        psm.spectral_correlation_similarity_pearson = psm_intensity.spectral_correlation_similarity_pearson
+        psm.spectral_correlation_similarity_spearman = psm_intensity.spectral_correlation_similarity_spearman
+        psm.spectral_entropy_similarity = psm_intensity.spectral_entropy_similarity
+        psm.beta_score = beta_score(psm.fragments_observed, psm.fragments_predicted)
 
 
 def get_collision_energy_calibration_factor(

diff --git a/imspy/imspy/timstof/dbsearch/imspy_dda.py b/imspy/imspy/timstof/dbsearch/imspy_dda.py
@@ -5,6 +5,8 @@
 import time
 import toml
 
+import mokapot
+
 import pandas as pd
 import numpy as np
 
@@ -15,6 +17,7 @@
 from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, json_bin_to_psms, ScoreType
 
 from sagepy.qfdr.tdc import target_decoy_competition_pandas
+from tqdm import tqdm
 
 from imspy.algorithm.ccs.predictors import DeepPeptideIonMobilityApex, load_deep_ccs_predictor
 from imspy.algorithm.intensity.utility import beta_score
@@ -26,7 +29,8 @@
 
 from imspy.timstof.dbsearch.utility import sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta, \
     write_psms_binary, re_score_psms, \
-    merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map
+    merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map, \
+    transform_psm_to_pin
 
 from imspy.algorithm.intensity.predictors import get_collision_energy_calibration_factor
 
@@ -129,6 +133,15 @@ def main():
         help="Batch size for fasta file (default: 1)"
     )
 
+    parser.add_argument(
+        "--no_re_score_mokapot",
+        dest="re_score_mokapot",
+        action="store_false",
+        help="Re-score PSMs using mokapot"
+    )
+
+    parser.set_defaults(re_score_mokapot=True)
+
     # SAGE isolation window settings
     parser.add_argument("--isolation_window_lower", type=float, default=-3.0, help="Isolation window (default: -3.0)")
     parser.add_argument("--isolation_window_upper", type=float, default=3.0, help="Isolation window (default: 3.0)")
@@ -240,7 +253,7 @@ def main():
     parser.add_argument("--fdr_threshold", type=float, default=0.01, help="FDR threshold (default: 0.01)")
 
     # number of threads
-    parser.add_argument("--num_threads", type=int, default=16, help="Number of threads (default: 16)")
+    parser.add_argument("--num_threads", type=int, default=-1, help="Number of threads (default: -1)")
 
     # if train splits should be balanced
     parser.add_argument(
@@ -251,14 +264,6 @@ def main():
     )
     parser.set_defaults(balanced_re_score=True)
 
-    # TDC method
-    parser.add_argument(
-        "--tdc_method",
-        type=str,
-        default="peptide_psm_peptide",
-        help="TDC method (default: peptide_psm_peptide aka double competition)"
-    )
-
     # load dataset in memory
     parser.add_argument(
         "--in_memory",
@@ -330,6 +335,10 @@ def main():
     logging.basicConfig(filename=f"{write_folder_path}/imspy/imspy-{current_time}.log",
                         level=logging.INFO, format='%(asctime)s %(message)s')
 
+    if args.num_threads == -1 and args.verbose:
+        print(f"Using all available CPU threads: {os.cpu_count()} ...")
+        args.num_threads = os.cpu_count()
+
     logging.info("Arguments settings:")
     for arg in vars(args):
         logging.info(f"{arg}: {getattr(args, arg)}")
@@ -628,10 +637,7 @@ def main():
         psm = associate_fragment_ions_with_prosit_predicted_intensities(psm, intensity_pred,
                                                                         num_threads=args.num_threads)
 
-        if args.verbose:
-            print("calculating beta score ...")
-
-        for ps in psm:
+        for ps in tqdm(psm, desc="calculating beta scores", ncols=100, disable=(not args.verbose)):
             ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted)
 
         if args.verbose:
@@ -744,19 +750,47 @@ def main():
     write_psms_binary(byte_array=bts, folder_path=write_folder_path, file_name="total_psms", total=True)
 
     PSM_pandas = peptide_spectrum_match_collection_to_pandas(psms)
+
+    if args.re_score_mokapot:
+        if args.verbose:
+            print("re-scoring PSMs using mokapot ...")
+
+        # create a mokapot folder in the imspy folder
+        if not os.path.exists(write_folder_path + "/imspy/mokapot"):
+            os.makedirs(write_folder_path + "/imspy/mokapot")
+
+        # create a PIN file from the PSMs
+        PSM_pin = transform_psm_to_pin(PSM_pandas)
+        PSM_pin.to_csv(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin", index=False, sep="\t")
+
+        psms_moka = mokapot.read_pin(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin")
+        results, models = mokapot.brew(psms_moka)
+
+        results.to_txt(dest_dir=f"{write_folder_path}" + "/imspy/mokapot/")
+
     PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"])
 
     if args.verbose:
-        print(f"FDR calculation, using target decoy competition: {args.tdc_method} ...")
+        print(f"FDR calculation ...")
 
     psms_rescored = target_decoy_competition_pandas(peptide_spectrum_match_collection_to_pandas(psms, re_score=True),
-                                                    method=args.tdc_method)
+                                                    method="psm")
 
     psms_rescored = psms_rescored[(psms_rescored.q_value <= 0.01) & (psms_rescored.decoy == False)]
 
     TDC = pd.merge(psms_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"],
                    right_on=["spec_idx", "match_idx", "decoy"]).sort_values(by="score", ascending=False)
 
+    TDC.to_csv(f"{write_folder_path}" + "/imspy/PSMs.csv", index=False)
+
+    peptides_rescored = target_decoy_competition_pandas(peptide_spectrum_match_collection_to_pandas(psms, re_score=True),
+                                                    method="peptide_psm_peptide")
+
+    peptides_rescored = peptides_rescored[(peptides_rescored.q_value <= 0.01) & (peptides_rescored.decoy == False)]
+
+    TDC = pd.merge(peptides_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"],
+                   right_on=["spec_idx", "match_idx", "decoy"]).sort_values(by="score", ascending=False)
+
     TDC.to_csv(f"{write_folder_path}" + "/imspy/Peptides.csv", index=False)
 
     end_time = time.time()

diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py
@@ -448,3 +448,45 @@ def extract_timstof_dda_data(path: str,
     fragments['processed_spec'] = processed_spec
 
     return fragments
+
+
+def transform_psm_to_pin(psm_df):
+    columns_map = {
+        'spec_idx': 'SpecId',
+        'decoy': 'Label',
+        'charge': 'Charge',
+        'sequence': 'Peptide',
+        'proteins': 'Proteins',
+        # feature mapping for re-scoring
+        'hyper_score': 'Feature1',
+        'isotope_error': 'Feature2',
+        'delta_mass': 'Feature3',
+        'delta_rt': 'Feature4',
+        'delta_ims': 'Feature5',
+        'matched_peaks': 'Feature6',
+        'matched_intensity_pct': 'Feature7',
+        'intensity_ms1': 'Feature8',
+        'intensity_ms2': 'Feature9',
+        'average_ppm': 'Feature1ß',
+        'poisson': 'Feature11',
+        'spectral_entropy_similarity': 'Feature12',
+        'spectral_correlation_similarity_pearson': 'Feature13',
+        'spectral_correlation_similarity_spearman': 'Feature14',
+        'spectral_normalized_intensity_difference': 'Feature15',
+        'collision_energy': 'Feature16',
+        'delta_next': 'Feature17',
+        'delta_best': 'Feature18',
+        'longest_b': 'Feature19',
+        'longest_y': 'Feature20',
+        'longest_y_pct': 'Feature21',
+    }
+
+    psm_df = psm_df[list(columns_map.keys())]
+    df_pin = psm_df.rename(columns=columns_map)
+    df_pin_clean = df_pin.dropna(axis=1, how='all')
+    df_pin_clean = df_pin_clean.dropna()
+
+    df_pin_clean['Label'] = df_pin_clean['Label'].apply(lambda x: -1 if x else 1)
+    df_pin_clean['ScanNr'] = range(1, len(df_pin_clean) + 1)
+
+    return df_pin_clean
diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "imspy"
-version = "0.2.32"
+version = "0.2.33"
 description = ""
 authors = ["theGreatHerrLebert <davidteschner@googlemail.com>"]
 readme = "README.md"
@@ -18,9 +18,10 @@ tensorflow-probability = ">=0.22.1"
 wandb = ">=0.12.1"
 
 pyopenms = ">=2.6.0"
+mokapot = ">=0.10.0"
 
-sagepy = ">=0.2.22"
-imspy-connector = ">=0.2.32"
+sagepy = ">=0.2.23"
+imspy-connector = ">=0.2.33"
 
 scipy = ">=1.7.1"
 tqdm = ">=4.66"

diff --git a/imspy_connector/Cargo.toml b/imspy_connector/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "imspy-connector"
-version = "0.2.32"
+version = "0.2.33"
 edition = "2021"
 
 [lib]
@@ -14,4 +14,4 @@ mscore = {path = "../mscore"}
 rustdf = {path = "../rustdf"}
 serde = "1.0.202"
 serde_json = "1.0.117"
-ndarray = "0.15.6"
+ndarray = "0.16.1"