diff --git a/imspy/imspy/algorithm/intensity/predictors.py b/imspy/imspy/algorithm/intensity/predictors.py index 374be0e4..9714b19f 100644 --- a/imspy/imspy/algorithm/intensity/predictors.py +++ b/imspy/imspy/algorithm/intensity/predictors.py @@ -16,8 +16,8 @@ post_process_predicted_fragment_spectra, reshape_dims, beta_score) from imspy.data.peptide import PeptideProductIonSeriesCollection, PeptideSequence - from imspy.simulation.utility import flatten_prosit_array +from sagepy.rescore.utility import dict_to_dense_array, spectral_entropy_similarity, spectral_correlation def predict_intensities_prosit( psm_collection: List[PeptideSpectrumMatch], @@ -70,12 +70,18 @@ def predict_intensities_prosit( psm_collection_intensity = associate_fragment_ions_with_prosit_predicted_intensities( psm_collection, intensity_pred, num_threads=num_threads) - for psm, psm_intensity in zip(psm_collection, psm_collection_intensity): + # calculate the spectral similarity metrics + for psm, psm_intensity, prosit_intensity in tqdm(zip(psm_collection, psm_collection_intensity, intensity_pred), + desc='Calc spectral similarity metrics', ncols=100, disable=not verbose): + psm.fragments_predicted = psm_intensity.fragments_predicted psm.cosine_similarity = psm_intensity.cosine_similarity + psm.prosit_intensities = prosit_intensity - for ps in psm_collection: - ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted) + psm.spectral_correlation_similarity_pearson = psm_intensity.spectral_correlation_similarity_pearson + psm.spectral_correlation_similarity_spearman = psm_intensity.spectral_correlation_similarity_spearman + psm.spectral_entropy_similarity = psm_intensity.spectral_entropy_similarity + psm.beta_score = beta_score(psm.fragments_observed, psm.fragments_predicted) def get_collision_energy_calibration_factor( diff --git a/imspy/imspy/timstof/dbsearch/imspy_dda.py b/imspy/imspy/timstof/dbsearch/imspy_dda.py index 885aa8ce..5ca9bc98 100644 --- a/imspy/imspy/timstof/dbsearch/imspy_dda.py +++ b/imspy/imspy/timstof/dbsearch/imspy_dda.py @@ -5,6 +5,8 @@ import time import toml +import mokapot + import pandas as pd import numpy as np @@ -15,6 +17,7 @@ from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, json_bin_to_psms, ScoreType from sagepy.qfdr.tdc import target_decoy_competition_pandas +from tqdm import tqdm from imspy.algorithm.ccs.predictors import DeepPeptideIonMobilityApex, load_deep_ccs_predictor from imspy.algorithm.intensity.utility import beta_score @@ -26,7 +29,8 @@ from imspy.timstof.dbsearch.utility import sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta, \ write_psms_binary, re_score_psms, \ - merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map + merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map, \ + transform_psm_to_pin from imspy.algorithm.intensity.predictors import get_collision_energy_calibration_factor @@ -129,6 +133,15 @@ def main(): help="Batch size for fasta file (default: 1)" ) + parser.add_argument( + "--no_re_score_mokapot", + dest="re_score_mokapot", + action="store_false", + help="Re-score PSMs using mokapot" + ) + + parser.set_defaults(re_score_mokapot=True) + # SAGE isolation window settings parser.add_argument("--isolation_window_lower", type=float, default=-3.0, help="Isolation window (default: -3.0)") parser.add_argument("--isolation_window_upper", type=float, default=3.0, help="Isolation window (default: 3.0)") @@ -240,7 +253,7 @@ def main(): parser.add_argument("--fdr_threshold", type=float, default=0.01, help="FDR threshold (default: 0.01)") # number of threads - parser.add_argument("--num_threads", type=int, default=16, help="Number of threads (default: 16)") + parser.add_argument("--num_threads", type=int, default=-1, help="Number of threads (default: -1)") # if train splits should be balanced parser.add_argument( @@ -251,14 +264,6 @@ def main(): ) parser.set_defaults(balanced_re_score=True) - # TDC method - parser.add_argument( - "--tdc_method", - type=str, - default="peptide_psm_peptide", - help="TDC method (default: peptide_psm_peptide aka double competition)" - ) - # load dataset in memory parser.add_argument( "--in_memory", @@ -330,6 +335,10 @@ def main(): logging.basicConfig(filename=f"{write_folder_path}/imspy/imspy-{current_time}.log", level=logging.INFO, format='%(asctime)s %(message)s') + if args.num_threads == -1 and args.verbose: + print(f"Using all available CPU threads: {os.cpu_count()} ...") + args.num_threads = os.cpu_count() + logging.info("Arguments settings:") for arg in vars(args): logging.info(f"{arg}: {getattr(args, arg)}") @@ -628,10 +637,7 @@ def main(): psm = associate_fragment_ions_with_prosit_predicted_intensities(psm, intensity_pred, num_threads=args.num_threads) - if args.verbose: - print("calculating beta score ...") - - for ps in psm: + for ps in tqdm(psm, desc="calculating beta scores", ncols=100, disable=(not args.verbose)): ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted) if args.verbose: @@ -744,19 +750,47 @@ def main(): write_psms_binary(byte_array=bts, folder_path=write_folder_path, file_name="total_psms", total=True) PSM_pandas = peptide_spectrum_match_collection_to_pandas(psms) + + if args.re_score_mokapot: + if args.verbose: + print("re-scoring PSMs using mokapot ...") + + # create a mokapot folder in the imspy folder + if not os.path.exists(write_folder_path + "/imspy/mokapot"): + os.makedirs(write_folder_path + "/imspy/mokapot") + + # create a PIN file from the PSMs + PSM_pin = transform_psm_to_pin(PSM_pandas) + PSM_pin.to_csv(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin", index=False, sep="\t") + + psms_moka = mokapot.read_pin(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin") + results, models = mokapot.brew(psms_moka) + + results.to_txt(dest_dir=f"{write_folder_path}" + "/imspy/mokapot/") + PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"]) if args.verbose: - print(f"FDR calculation, using target decoy competition: {args.tdc_method} ...") + print(f"FDR calculation ...") psms_rescored = target_decoy_competition_pandas(peptide_spectrum_match_collection_to_pandas(psms, re_score=True), - method=args.tdc_method) + method="psm") psms_rescored = psms_rescored[(psms_rescored.q_value <= 0.01) & (psms_rescored.decoy == False)] TDC = pd.merge(psms_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], right_on=["spec_idx", "match_idx", "decoy"]).sort_values(by="score", ascending=False) + TDC.to_csv(f"{write_folder_path}" + "/imspy/PSMs.csv", index=False) + + peptides_rescored = target_decoy_competition_pandas(peptide_spectrum_match_collection_to_pandas(psms, re_score=True), + method="peptide_psm_peptide") + + peptides_rescored = peptides_rescored[(peptides_rescored.q_value <= 0.01) & (peptides_rescored.decoy == False)] + + TDC = pd.merge(peptides_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], + right_on=["spec_idx", "match_idx", "decoy"]).sort_values(by="score", ascending=False) + TDC.to_csv(f"{write_folder_path}" + "/imspy/Peptides.csv", index=False) end_time = time.time() diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py index fcb37a94..ffea6074 100644 --- a/imspy/imspy/timstof/dbsearch/utility.py +++ b/imspy/imspy/timstof/dbsearch/utility.py @@ -448,3 +448,45 @@ def extract_timstof_dda_data(path: str, fragments['processed_spec'] = processed_spec return fragments + + +def transform_psm_to_pin(psm_df): + columns_map = { + 'spec_idx': 'SpecId', + 'decoy': 'Label', + 'charge': 'Charge', + 'sequence': 'Peptide', + 'proteins': 'Proteins', + # feature mapping for re-scoring + 'hyper_score': 'Feature1', + 'isotope_error': 'Feature2', + 'delta_mass': 'Feature3', + 'delta_rt': 'Feature4', + 'delta_ims': 'Feature5', + 'matched_peaks': 'Feature6', + 'matched_intensity_pct': 'Feature7', + 'intensity_ms1': 'Feature8', + 'intensity_ms2': 'Feature9', + 'average_ppm': 'Feature1ß', + 'poisson': 'Feature11', + 'spectral_entropy_similarity': 'Feature12', + 'spectral_correlation_similarity_pearson': 'Feature13', + 'spectral_correlation_similarity_spearman': 'Feature14', + 'spectral_normalized_intensity_difference': 'Feature15', + 'collision_energy': 'Feature16', + 'delta_next': 'Feature17', + 'delta_best': 'Feature18', + 'longest_b': 'Feature19', + 'longest_y': 'Feature20', + 'longest_y_pct': 'Feature21', + } + + psm_df = psm_df[list(columns_map.keys())] + df_pin = psm_df.rename(columns=columns_map) + df_pin_clean = df_pin.dropna(axis=1, how='all') + df_pin_clean = df_pin_clean.dropna() + + df_pin_clean['Label'] = df_pin_clean['Label'].apply(lambda x: -1 if x else 1) + df_pin_clean['ScanNr'] = range(1, len(df_pin_clean) + 1) + + return df_pin_clean diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml index 59f8277b..fea0b80f 100644 --- a/imspy/pyproject.toml +++ b/imspy/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "imspy" -version = "0.2.32" +version = "0.2.33" description = "" authors = ["theGreatHerrLebert "] readme = "README.md" @@ -18,9 +18,10 @@ tensorflow-probability = ">=0.22.1" wandb = ">=0.12.1" pyopenms = ">=2.6.0" +mokapot = ">=0.10.0" -sagepy = ">=0.2.22" -imspy-connector = ">=0.2.32" +sagepy = ">=0.2.23" +imspy-connector = ">=0.2.33" scipy = ">=1.7.1" tqdm = ">=4.66" diff --git a/imspy_connector/Cargo.toml b/imspy_connector/Cargo.toml index 59f41a21..dff71316 100644 --- a/imspy_connector/Cargo.toml +++ b/imspy_connector/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "imspy-connector" -version = "0.2.32" +version = "0.2.33" edition = "2021" [lib] @@ -14,4 +14,4 @@ mscore = {path = "../mscore"} rustdf = {path = "../rustdf"} serde = "1.0.202" serde_json = "1.0.117" -ndarray = "0.15.6" +ndarray = "0.16.1"