From 116f65ec8654a6d198ade8389cdce0984eb48685 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Thu, 12 Sep 2024 17:54:53 +0200 Subject: [PATCH 1/6] adding additional spec sim features --- imspy/imspy/algorithm/intensity/predictors.py | 23 +++++++++++++++---- imspy/imspy/timstof/dbsearch/imspy_dda.py | 18 +++++++++++++++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/imspy/imspy/algorithm/intensity/predictors.py b/imspy/imspy/algorithm/intensity/predictors.py index 374be0e4..792a062a 100644 --- a/imspy/imspy/algorithm/intensity/predictors.py +++ b/imspy/imspy/algorithm/intensity/predictors.py @@ -16,8 +16,8 @@ post_process_predicted_fragment_spectra, reshape_dims, beta_score) from imspy.data.peptide import PeptideProductIonSeriesCollection, PeptideSequence - from imspy.simulation.utility import flatten_prosit_array +from sagepy.rescore.utility import dict_to_dense_array, spectral_entropy_similarity, spectral_correlation def predict_intensities_prosit( psm_collection: List[PeptideSpectrumMatch], @@ -70,12 +70,25 @@ def predict_intensities_prosit( psm_collection_intensity = associate_fragment_ions_with_prosit_predicted_intensities( psm_collection, intensity_pred, num_threads=num_threads) - for psm, psm_intensity in zip(psm_collection, psm_collection_intensity): + # calculate the spectral similarity metrics + for psm, psm_intensity, prosit_intensity in tqdm(zip(psm_collection, psm_collection_intensity, intensity_pred, psm_collection), + desc='Calculating spectral similarity metrics', ncols=100, disable=not verbose): psm.fragments_predicted = psm_intensity.fragments_predicted psm.cosine_similarity = psm_intensity.cosine_similarity - - for ps in psm_collection: - ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted) + psm.prosit_intensities = prosit_intensity + + i_obs = dict_to_dense_array(psm.fragments_observed) + i_pred = dict_to_dense_array(psm.prosit_fragment_map()) + diff = np.sum(i_pred - i_obs) / (np.sum(i_obs) + np.sum(i_pred)) + corr_pearson = spectral_correlation(i_obs, i_pred, method='pearson') + corr_spearman = spectral_correlation(i_obs, i_pred, method='spearman') + entropy = spectral_entropy_similarity(i_obs, i_pred) + + psm.spectral_normalized_intensity_difference = diff + psm.spectral_correlation_pearson = corr_pearson + psm.spectral_correlation_spearman = corr_spearman + psm.spectral_entropy_similarity = entropy + psm.beta_score = beta_score(i_obs, i_pred) def get_collision_energy_calibration_factor( diff --git a/imspy/imspy/timstof/dbsearch/imspy_dda.py b/imspy/imspy/timstof/dbsearch/imspy_dda.py index 885aa8ce..ffef54ed 100644 --- a/imspy/imspy/timstof/dbsearch/imspy_dda.py +++ b/imspy/imspy/timstof/dbsearch/imspy_dda.py @@ -15,6 +15,7 @@ from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, json_bin_to_psms, ScoreType from sagepy.qfdr.tdc import target_decoy_competition_pandas +from tqdm import tqdm from imspy.algorithm.ccs.predictors import DeepPeptideIonMobilityApex, load_deep_ccs_predictor from imspy.algorithm.intensity.utility import beta_score @@ -33,6 +34,7 @@ from sagepy.core.scoring import psms_to_json_bin from sagepy.utility import peptide_spectrum_match_collection_to_pandas from sagepy.utility import apply_mz_calibration +from sagepy.rescore.utility import dict_to_dense_array, spectral_entropy_similarity, spectral_correlation # suppress tensorflow warnings os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -628,6 +630,22 @@ def main(): psm = associate_fragment_ions_with_prosit_predicted_intensities(psm, intensity_pred, num_threads=args.num_threads) + for ps in tqdm(psm, desc="Calculating spectral similarity metrics", ncols=100, disable=(not args.verbose)): + i_obs = dict_to_dense_array(ps.fragments_observed) + i_pred = dict_to_dense_array(ps.prosit_fragment_map()) + + # calculate spectral similarity metrics + diff = np.sum(i_pred - i_obs) / (np.sum(i_obs) + np.sum(i_pred)) + corr_pearson = spectral_correlation(i_obs, i_pred, method='pearson') + corr_spearman = spectral_correlation(i_obs, i_pred, method='spearman') + entropy = spectral_entropy_similarity(i_obs, i_pred) + + # set spectral similarity metrics + ps.spectral_normalized_intensity_difference = diff + ps.spectral_correlation_pearson = corr_pearson + ps.spectral_correlation_spearman = corr_spearman + ps.spectral_entropy_similarity = entropy + if args.verbose: print("calculating beta score ...") From 55d59724e1d2a3d6a060d87ffc3380e621a77982 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Thu, 12 Sep 2024 19:16:08 +0200 Subject: [PATCH 2/6] fixing setter call --- imspy/imspy/algorithm/intensity/predictors.py | 13 +++++++------ imspy/imspy/timstof/dbsearch/imspy_dda.py | 13 ++++--------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/imspy/imspy/algorithm/intensity/predictors.py b/imspy/imspy/algorithm/intensity/predictors.py index 792a062a..c737868f 100644 --- a/imspy/imspy/algorithm/intensity/predictors.py +++ b/imspy/imspy/algorithm/intensity/predictors.py @@ -71,24 +71,25 @@ def predict_intensities_prosit( psm_collection, intensity_pred, num_threads=num_threads) # calculate the spectral similarity metrics - for psm, psm_intensity, prosit_intensity in tqdm(zip(psm_collection, psm_collection_intensity, intensity_pred, psm_collection), - desc='Calculating spectral similarity metrics', ncols=100, disable=not verbose): + for psm, psm_intensity, prosit_intensity in tqdm(zip(psm_collection, psm_collection_intensity, intensity_pred), + desc='Calc spectral similarity metrics', ncols=100, disable=not verbose): psm.fragments_predicted = psm_intensity.fragments_predicted psm.cosine_similarity = psm_intensity.cosine_similarity psm.prosit_intensities = prosit_intensity - i_obs = dict_to_dense_array(psm.fragments_observed) + i_obs = dict_to_dense_array(psm.observed_fragment_map()) i_pred = dict_to_dense_array(psm.prosit_fragment_map()) diff = np.sum(i_pred - i_obs) / (np.sum(i_obs) + np.sum(i_pred)) corr_pearson = spectral_correlation(i_obs, i_pred, method='pearson') corr_spearman = spectral_correlation(i_obs, i_pred, method='spearman') entropy = spectral_entropy_similarity(i_obs, i_pred) + # set spectral similarity metrics psm.spectral_normalized_intensity_difference = diff - psm.spectral_correlation_pearson = corr_pearson - psm.spectral_correlation_spearman = corr_spearman + psm.spectral_correlation_similarity_pearson = corr_pearson + psm.spectral_correlation_similarity_spearman = corr_spearman psm.spectral_entropy_similarity = entropy - psm.beta_score = beta_score(i_obs, i_pred) + psm.beta_score = beta_score(psm.fragments_observed, psm.fragments_predicted) def get_collision_energy_calibration_factor( diff --git a/imspy/imspy/timstof/dbsearch/imspy_dda.py b/imspy/imspy/timstof/dbsearch/imspy_dda.py index ffef54ed..ee20b541 100644 --- a/imspy/imspy/timstof/dbsearch/imspy_dda.py +++ b/imspy/imspy/timstof/dbsearch/imspy_dda.py @@ -630,8 +630,8 @@ def main(): psm = associate_fragment_ions_with_prosit_predicted_intensities(psm, intensity_pred, num_threads=args.num_threads) - for ps in tqdm(psm, desc="Calculating spectral similarity metrics", ncols=100, disable=(not args.verbose)): - i_obs = dict_to_dense_array(ps.fragments_observed) + for ps in tqdm(psm, desc="Calc spectral similarity metrics", ncols=100, disable=(not args.verbose)): + i_obs = dict_to_dense_array(ps.observed_fragment_map()) i_pred = dict_to_dense_array(ps.prosit_fragment_map()) # calculate spectral similarity metrics @@ -642,14 +642,9 @@ def main(): # set spectral similarity metrics ps.spectral_normalized_intensity_difference = diff - ps.spectral_correlation_pearson = corr_pearson - ps.spectral_correlation_spearman = corr_spearman + ps.spectral_correlation_similarity_pearson = corr_pearson + ps.spectral_correlation_similarity_spearman = corr_spearman ps.spectral_entropy_similarity = entropy - - if args.verbose: - print("calculating beta score ...") - - for ps in psm: ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted) if args.verbose: From 1413ddbb01644e492a8dec3e48a77338f212fd47 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Sat, 14 Sep 2024 11:07:40 +0200 Subject: [PATCH 3/6] moved sim calc to parallel execution --- imspy/imspy/algorithm/intensity/predictors.py | 16 ++++------------ imspy/imspy/timstof/dbsearch/imspy_dda.py | 16 +--------------- 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/imspy/imspy/algorithm/intensity/predictors.py b/imspy/imspy/algorithm/intensity/predictors.py index c737868f..9714b19f 100644 --- a/imspy/imspy/algorithm/intensity/predictors.py +++ b/imspy/imspy/algorithm/intensity/predictors.py @@ -73,22 +73,14 @@ def predict_intensities_prosit( # calculate the spectral similarity metrics for psm, psm_intensity, prosit_intensity in tqdm(zip(psm_collection, psm_collection_intensity, intensity_pred), desc='Calc spectral similarity metrics', ncols=100, disable=not verbose): + psm.fragments_predicted = psm_intensity.fragments_predicted psm.cosine_similarity = psm_intensity.cosine_similarity psm.prosit_intensities = prosit_intensity - i_obs = dict_to_dense_array(psm.observed_fragment_map()) - i_pred = dict_to_dense_array(psm.prosit_fragment_map()) - diff = np.sum(i_pred - i_obs) / (np.sum(i_obs) + np.sum(i_pred)) - corr_pearson = spectral_correlation(i_obs, i_pred, method='pearson') - corr_spearman = spectral_correlation(i_obs, i_pred, method='spearman') - entropy = spectral_entropy_similarity(i_obs, i_pred) - - # set spectral similarity metrics - psm.spectral_normalized_intensity_difference = diff - psm.spectral_correlation_similarity_pearson = corr_pearson - psm.spectral_correlation_similarity_spearman = corr_spearman - psm.spectral_entropy_similarity = entropy + psm.spectral_correlation_similarity_pearson = psm_intensity.spectral_correlation_similarity_pearson + psm.spectral_correlation_similarity_spearman = psm_intensity.spectral_correlation_similarity_spearman + psm.spectral_entropy_similarity = psm_intensity.spectral_entropy_similarity psm.beta_score = beta_score(psm.fragments_observed, psm.fragments_predicted) diff --git a/imspy/imspy/timstof/dbsearch/imspy_dda.py b/imspy/imspy/timstof/dbsearch/imspy_dda.py index ee20b541..2f3762fc 100644 --- a/imspy/imspy/timstof/dbsearch/imspy_dda.py +++ b/imspy/imspy/timstof/dbsearch/imspy_dda.py @@ -630,21 +630,7 @@ def main(): psm = associate_fragment_ions_with_prosit_predicted_intensities(psm, intensity_pred, num_threads=args.num_threads) - for ps in tqdm(psm, desc="Calc spectral similarity metrics", ncols=100, disable=(not args.verbose)): - i_obs = dict_to_dense_array(ps.observed_fragment_map()) - i_pred = dict_to_dense_array(ps.prosit_fragment_map()) - - # calculate spectral similarity metrics - diff = np.sum(i_pred - i_obs) / (np.sum(i_obs) + np.sum(i_pred)) - corr_pearson = spectral_correlation(i_obs, i_pred, method='pearson') - corr_spearman = spectral_correlation(i_obs, i_pred, method='spearman') - entropy = spectral_entropy_similarity(i_obs, i_pred) - - # set spectral similarity metrics - ps.spectral_normalized_intensity_difference = diff - ps.spectral_correlation_similarity_pearson = corr_pearson - ps.spectral_correlation_similarity_spearman = corr_spearman - ps.spectral_entropy_similarity = entropy + for ps in tqdm(psm, desc="calculating beta scores", ncols=100, disable=(not args.verbose)): ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted) if args.verbose: From 1da14595024766c06196d5a88d88f8a4b0df296c Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Sat, 14 Sep 2024 11:25:17 +0200 Subject: [PATCH 4/6] default all threads --- imspy/imspy/timstof/dbsearch/imspy_dda.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/imspy/imspy/timstof/dbsearch/imspy_dda.py b/imspy/imspy/timstof/dbsearch/imspy_dda.py index 2f3762fc..427211ea 100644 --- a/imspy/imspy/timstof/dbsearch/imspy_dda.py +++ b/imspy/imspy/timstof/dbsearch/imspy_dda.py @@ -242,7 +242,7 @@ def main(): parser.add_argument("--fdr_threshold", type=float, default=0.01, help="FDR threshold (default: 0.01)") # number of threads - parser.add_argument("--num_threads", type=int, default=16, help="Number of threads (default: 16)") + parser.add_argument("--num_threads", type=int, default=-1, help="Number of threads (default: -1)") # if train splits should be balanced parser.add_argument( @@ -332,6 +332,10 @@ def main(): logging.basicConfig(filename=f"{write_folder_path}/imspy/imspy-{current_time}.log", level=logging.INFO, format='%(asctime)s %(message)s') + if args.num_threads == -1 and args.verbose: + print(f"Using all available CPU threads: {os.cpu_count()} ...") + args.num_threads = os.cpu_count() + logging.info("Arguments settings:") for arg in vars(args): logging.info(f"{arg}: {getattr(args, arg)}") From 9655ce5cdc128e13484ff8965234ee9c27ca6a4b Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 9 Oct 2024 16:37:13 +0200 Subject: [PATCH 5/6] adding mokkapot post-processing as option to imspy_dda --- imspy/imspy/timstof/dbsearch/imspy_dda.py | 55 ++++++++++++++++++----- imspy/imspy/timstof/dbsearch/utility.py | 42 +++++++++++++++++ imspy/pyproject.toml | 1 + 3 files changed, 86 insertions(+), 12 deletions(-) diff --git a/imspy/imspy/timstof/dbsearch/imspy_dda.py b/imspy/imspy/timstof/dbsearch/imspy_dda.py index 427211ea..5ca9bc98 100644 --- a/imspy/imspy/timstof/dbsearch/imspy_dda.py +++ b/imspy/imspy/timstof/dbsearch/imspy_dda.py @@ -5,6 +5,8 @@ import time import toml +import mokapot + import pandas as pd import numpy as np @@ -27,14 +29,14 @@ from imspy.timstof.dbsearch.utility import sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta, \ write_psms_binary, re_score_psms, \ - merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map + merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map, \ + transform_psm_to_pin from imspy.algorithm.intensity.predictors import get_collision_energy_calibration_factor from sagepy.core.scoring import psms_to_json_bin from sagepy.utility import peptide_spectrum_match_collection_to_pandas from sagepy.utility import apply_mz_calibration -from sagepy.rescore.utility import dict_to_dense_array, spectral_entropy_similarity, spectral_correlation # suppress tensorflow warnings os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' @@ -131,6 +133,15 @@ def main(): help="Batch size for fasta file (default: 1)" ) + parser.add_argument( + "--no_re_score_mokapot", + dest="re_score_mokapot", + action="store_false", + help="Re-score PSMs using mokapot" + ) + + parser.set_defaults(re_score_mokapot=True) + # SAGE isolation window settings parser.add_argument("--isolation_window_lower", type=float, default=-3.0, help="Isolation window (default: -3.0)") parser.add_argument("--isolation_window_upper", type=float, default=3.0, help="Isolation window (default: 3.0)") @@ -253,14 +264,6 @@ def main(): ) parser.set_defaults(balanced_re_score=True) - # TDC method - parser.add_argument( - "--tdc_method", - type=str, - default="peptide_psm_peptide", - help="TDC method (default: peptide_psm_peptide aka double competition)" - ) - # load dataset in memory parser.add_argument( "--in_memory", @@ -747,19 +750,47 @@ def main(): write_psms_binary(byte_array=bts, folder_path=write_folder_path, file_name="total_psms", total=True) PSM_pandas = peptide_spectrum_match_collection_to_pandas(psms) + + if args.re_score_mokapot: + if args.verbose: + print("re-scoring PSMs using mokapot ...") + + # create a mokapot folder in the imspy folder + if not os.path.exists(write_folder_path + "/imspy/mokapot"): + os.makedirs(write_folder_path + "/imspy/mokapot") + + # create a PIN file from the PSMs + PSM_pin = transform_psm_to_pin(PSM_pandas) + PSM_pin.to_csv(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin", index=False, sep="\t") + + psms_moka = mokapot.read_pin(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin") + results, models = mokapot.brew(psms_moka) + + results.to_txt(dest_dir=f"{write_folder_path}" + "/imspy/mokapot/") + PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"]) if args.verbose: - print(f"FDR calculation, using target decoy competition: {args.tdc_method} ...") + print(f"FDR calculation ...") psms_rescored = target_decoy_competition_pandas(peptide_spectrum_match_collection_to_pandas(psms, re_score=True), - method=args.tdc_method) + method="psm") psms_rescored = psms_rescored[(psms_rescored.q_value <= 0.01) & (psms_rescored.decoy == False)] TDC = pd.merge(psms_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], right_on=["spec_idx", "match_idx", "decoy"]).sort_values(by="score", ascending=False) + TDC.to_csv(f"{write_folder_path}" + "/imspy/PSMs.csv", index=False) + + peptides_rescored = target_decoy_competition_pandas(peptide_spectrum_match_collection_to_pandas(psms, re_score=True), + method="peptide_psm_peptide") + + peptides_rescored = peptides_rescored[(peptides_rescored.q_value <= 0.01) & (peptides_rescored.decoy == False)] + + TDC = pd.merge(peptides_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], + right_on=["spec_idx", "match_idx", "decoy"]).sort_values(by="score", ascending=False) + TDC.to_csv(f"{write_folder_path}" + "/imspy/Peptides.csv", index=False) end_time = time.time() diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py index fcb37a94..ffea6074 100644 --- a/imspy/imspy/timstof/dbsearch/utility.py +++ b/imspy/imspy/timstof/dbsearch/utility.py @@ -448,3 +448,45 @@ def extract_timstof_dda_data(path: str, fragments['processed_spec'] = processed_spec return fragments + + +def transform_psm_to_pin(psm_df): + columns_map = { + 'spec_idx': 'SpecId', + 'decoy': 'Label', + 'charge': 'Charge', + 'sequence': 'Peptide', + 'proteins': 'Proteins', + # feature mapping for re-scoring + 'hyper_score': 'Feature1', + 'isotope_error': 'Feature2', + 'delta_mass': 'Feature3', + 'delta_rt': 'Feature4', + 'delta_ims': 'Feature5', + 'matched_peaks': 'Feature6', + 'matched_intensity_pct': 'Feature7', + 'intensity_ms1': 'Feature8', + 'intensity_ms2': 'Feature9', + 'average_ppm': 'Feature1ß', + 'poisson': 'Feature11', + 'spectral_entropy_similarity': 'Feature12', + 'spectral_correlation_similarity_pearson': 'Feature13', + 'spectral_correlation_similarity_spearman': 'Feature14', + 'spectral_normalized_intensity_difference': 'Feature15', + 'collision_energy': 'Feature16', + 'delta_next': 'Feature17', + 'delta_best': 'Feature18', + 'longest_b': 'Feature19', + 'longest_y': 'Feature20', + 'longest_y_pct': 'Feature21', + } + + psm_df = psm_df[list(columns_map.keys())] + df_pin = psm_df.rename(columns=columns_map) + df_pin_clean = df_pin.dropna(axis=1, how='all') + df_pin_clean = df_pin_clean.dropna() + + df_pin_clean['Label'] = df_pin_clean['Label'].apply(lambda x: -1 if x else 1) + df_pin_clean['ScanNr'] = range(1, len(df_pin_clean) + 1) + + return df_pin_clean diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml index 59f8277b..769a20d9 100644 --- a/imspy/pyproject.toml +++ b/imspy/pyproject.toml @@ -18,6 +18,7 @@ tensorflow-probability = ">=0.22.1" wandb = ">=0.12.1" pyopenms = ">=2.6.0" +mokapot = ">=0.10.0" sagepy = ">=0.2.22" imspy-connector = ">=0.2.32" From f33d2d2636f8e2c0b3be18447272317bbc6464e6 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 9 Oct 2024 16:44:55 +0200 Subject: [PATCH 6/6] bumping version --- imspy/pyproject.toml | 6 +++--- imspy_connector/Cargo.toml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml index 769a20d9..fea0b80f 100644 --- a/imspy/pyproject.toml +++ b/imspy/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "imspy" -version = "0.2.32" +version = "0.2.33" description = "" authors = ["theGreatHerrLebert "] readme = "README.md" @@ -20,8 +20,8 @@ wandb = ">=0.12.1" pyopenms = ">=2.6.0" mokapot = ">=0.10.0" -sagepy = ">=0.2.22" -imspy-connector = ">=0.2.32" +sagepy = ">=0.2.23" +imspy-connector = ">=0.2.33" scipy = ">=1.7.1" tqdm = ">=4.66" diff --git a/imspy_connector/Cargo.toml b/imspy_connector/Cargo.toml index 59f41a21..dff71316 100644 --- a/imspy_connector/Cargo.toml +++ b/imspy_connector/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "imspy-connector" -version = "0.2.32" +version = "0.2.33" edition = "2021" [lib] @@ -14,4 +14,4 @@ mscore = {path = "../mscore"} rustdf = {path = "../rustdf"} serde = "1.0.202" serde_json = "1.0.117" -ndarray = "0.15.6" +ndarray = "0.16.1"