From ae44d30a189d17d77404888df96d94ef6c6c0fab Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Tue, 7 May 2024 16:34:47 +0200 Subject: [PATCH 01/19] adding predict intensities --- imspy/imspy/algorithm/intensity/predictors.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/imspy/imspy/algorithm/intensity/predictors.py b/imspy/imspy/algorithm/intensity/predictors.py index d17db3a5..32d67fd4 100644 --- a/imspy/imspy/algorithm/intensity/predictors.py +++ b/imspy/imspy/algorithm/intensity/predictors.py @@ -130,6 +130,49 @@ def simulate_ion_intensities_pandas(self, data: pd.DataFrame, batch_size: int = return data + def predict_intensities( + self, + sequences: List[str], + charges: List[int], + collision_energies: List[float], + divide_collision_energy_by: float = 1e2, + batch_size: int = 512, + flatten: bool = False, + ) -> List[NDArray]: + sequences_unmod = [remove_unimod_annotation(s) for s in sequences] + sequence_length = [len(s) for s in sequences_unmod] + collision_energies_norm = [ce / divide_collision_energy_by for ce in collision_energies] + + tf_ds = generate_prosit_intensity_prediction_dataset( + sequences_unmod, + charges, + np.expand_dims(collision_energies_norm, 1)).batch(batch_size) + + ds_unpacked = tf_ds.map(unpack_dict) + + intensity_predictions = [] + for peptides_in, precursor_charge_in, collision_energy_in in tqdm(ds_unpacked, desc='Predicting intensities', + total=len(sequences) // batch_size + 1, + ncols=100, + disable=not self.verbose): + model_input = [peptides_in, precursor_charge_in, collision_energy_in] + model_output = self.model(model_input).numpy() + intensity_predictions.append(model_output) + + I_pred = list(np.vstack(intensity_predictions)) + I_pred = np.squeeze(reshape_dims(post_process_predicted_fragment_spectra(pd.DataFrame({ + 'sequence': sequences, + 'charge': charges, + 'collision_energy': collision_energies, + 'sequence_length': sequence_length, + 'intensity_raw': I_pred, + })))) + + if flatten: + I_pred = np.vstack([flatten_prosit_array(r) for r in I_pred]) + + return I_pred + def simulate_ion_intensities( self, sequences: List[str], From b925958f87125d907e7a8ab6b3efb4530c26402a Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Tue, 7 May 2024 18:57:55 +0200 Subject: [PATCH 02/19] adding re-fit to model --- imspy/imspy/algorithm/rt/predictors.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/imspy/imspy/algorithm/rt/predictors.py b/imspy/imspy/algorithm/rt/predictors.py index 438bd422..6638e988 100644 --- a/imspy/imspy/algorithm/rt/predictors.py +++ b/imspy/imspy/algorithm/rt/predictors.py @@ -101,6 +101,16 @@ def simulate_separation_times(self, sequences: list[str], batch_size: int = 1024 return self.model.predict(tf_ds, verbose=self.verbose) + def fit_model(self, data: pd.DataFrame, epochs: int = 10, batch_size: int = 1024, re_compile=False): + assert 'sequence' in data.columns, 'Data must contain a column named "sequence"' + assert 'retention_time_observed' in data.columns, 'Data must contain a column named "retention_time_observed"' + tokens = self._preprocess_sequences(data.sequence.values) + rts = data.retention_time.values + tf_ds = tf.data.Dataset.from_tensor_slices((tokens, rts)).batch(batch_size) + if re_compile: + self.model.compile(optimizer='adam', loss='mean_squared_error') + self.model.fit(tf_ds, epochs=epochs, verbose=self.verbose) + def simulate_separation_times_pandas(self, data: pd.DataFrame, gradient_length: float, batch_size: int = 1024) -> pd.DataFrame: From 
b68836d611e582f80538757836e6061a469530f1 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Tue, 7 May 2024 19:04:37 +0200 Subject: [PATCH 03/19] fixing --- imspy/imspy/algorithm/rt/predictors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/imspy/imspy/algorithm/rt/predictors.py b/imspy/imspy/algorithm/rt/predictors.py index 6638e988..cb4981bd 100644 --- a/imspy/imspy/algorithm/rt/predictors.py +++ b/imspy/imspy/algorithm/rt/predictors.py @@ -105,15 +105,14 @@ def fit_model(self, data: pd.DataFrame, epochs: int = 10, batch_size: int = 1024 assert 'sequence' in data.columns, 'Data must contain a column named "sequence"' assert 'retention_time_observed' in data.columns, 'Data must contain a column named "retention_time_observed"' tokens = self._preprocess_sequences(data.sequence.values) - rts = data.retention_time.values + rts = data.retention_time_observed.values tf_ds = tf.data.Dataset.from_tensor_slices((tokens, rts)).batch(batch_size) if re_compile: self.model.compile(optimizer='adam', loss='mean_squared_error') self.model.fit(tf_ds, epochs=epochs, verbose=self.verbose) def simulate_separation_times_pandas(self, data: pd.DataFrame, - gradient_length: float, - batch_size: int = 1024) -> pd.DataFrame: + gradient_length: float, batch_size: int = 1024) -> pd.DataFrame: tokens = self._preprocess_sequences(data.sequence.values) tf_ds = tf.data.Dataset.from_tensor_slices(tokens).batch(batch_size) From f3373d431e20d7a4fd3af7d3e3acf0c2a36a6ad4 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Tue, 7 May 2024 19:12:24 +0200 Subject: [PATCH 04/19] adding fit --- imspy/imspy/algorithm/rt/predictors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imspy/imspy/algorithm/rt/predictors.py b/imspy/imspy/algorithm/rt/predictors.py index cb4981bd..3e78abe4 100644 --- a/imspy/imspy/algorithm/rt/predictors.py +++ b/imspy/imspy/algorithm/rt/predictors.py @@ -106,7 +106,7 @@ def fit_model(self, data: pd.DataFrame, epochs: int = 10, batch_size: int = 1024 assert 'retention_time_observed' in data.columns, 'Data must contain a column named "retention_time_observed"' tokens = self._preprocess_sequences(data.sequence.values) rts = data.retention_time_observed.values - tf_ds = tf.data.Dataset.from_tensor_slices((tokens, rts)).batch(batch_size) + tf_ds = tf.data.Dataset.from_tensor_slices((tokens, rts)).shuffle(len(data)).batch(batch_size) if re_compile: self.model.compile(optimizer='adam', loss='mean_squared_error') self.model.fit(tf_ds, epochs=epochs, verbose=self.verbose) From 67b4438e0e0371a065f8d4170d4acead624c735c Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Thu, 9 May 2024 19:20:53 +0200 Subject: [PATCH 05/19] testing --- imspy/imspy/timstof/dda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imspy/imspy/timstof/dda.py b/imspy/imspy/timstof/dda.py index fe04c131..3b2551d4 100644 --- a/imspy/imspy/timstof/dda.py +++ b/imspy/imspy/timstof/dda.py @@ -55,7 +55,7 @@ def _load_pasef_meta_data(self): return pd.read_sql_query("SELECT * from PasefFrameMsMsInfo", sqlite3.connect(self.data_path + "/analysis.tdf")) - def get_pasef_fragments(self) -> pd.DataFrame: + def get_pasef_fragments(self, num_threads: int = 1) -> pd.DataFrame: """Get PASEF fragments. Args: @@ -65,7 +65,7 @@ def get_pasef_fragments(self) -> pd.DataFrame: pd.DataFrame: DataFrame of PASEF fragments. 
""" pasef_fragments = [FragmentDDA.from_py_tims_fragment_dda(fragment) - for fragment in self.__dataset.get_pasef_fragments(1)] + for fragment in self.__dataset.get_pasef_fragments(num_threads)] pasef_fragments = pd.DataFrame({ 'frame_id': [s.frame_id for s in pasef_fragments], From 8c21b6173c977ced249b42b7ea83753dde74bd08 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Thu, 9 May 2024 19:21:27 +0200 Subject: [PATCH 06/19] fixing --- imspy/imspy/timstof/dda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imspy/imspy/timstof/dda.py b/imspy/imspy/timstof/dda.py index 3b2551d4..47e7221f 100644 --- a/imspy/imspy/timstof/dda.py +++ b/imspy/imspy/timstof/dda.py @@ -59,7 +59,7 @@ def get_pasef_fragments(self, num_threads: int = 1) -> pd.DataFrame: """Get PASEF fragments. Args: - num_threads (int, optional): Number of threads. Defaults to 4. + num_threads (int, optional): Number of threads. Defaults to 1. Returns: List[FragmentDDA]: List of PASEF fragments. From 0285f0d59f6af371fc5e4d473144d8013dcfbc18 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Thu, 9 May 2024 23:05:40 +0200 Subject: [PATCH 07/19] version bump --- imspy/pyproject.toml | 4 ++-- rustms/Cargo.toml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml index fe8e7794..d1f3a19e 100644 --- a/imspy/pyproject.toml +++ b/imspy/pyproject.toml @@ -18,8 +18,8 @@ wandb = ">=0.12.1" pyopenms = ">=2.6.0" -sagepy = ">=0.2.7" -sagepy-connector = ">=0.2.7" +sagepy = ">=0.2.9 +sagepy-connector = ">=0.2.9" imspy-connector = ">=0.2.17" diff --git a/rustms/Cargo.toml b/rustms/Cargo.toml index 9d0f9b9f..69a17121 100644 --- a/rustms/Cargo.toml +++ b/rustms/Cargo.toml @@ -4,8 +4,8 @@ version = "0.1.0" edition = "2021" [dependencies] -serde = { version = "1.0.198", features = ["derive"] } -serde_json = "1.0.116" +serde = { version = "1.0.201", features = ["derive"] } +serde_json = "1.0.117" regex = "1.10.4" rayon = "1.10.0" itertools = "0.12.1" From a781531ba428ed451370206cee92dbc5d1eb78c9 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 08:56:06 +0200 Subject: [PATCH 08/19] adding DDA search --- imspy/imspy/timstof/dbsearch/__init__.py | 0 imspy/imspy/timstof/dbsearch/dda.py | 0 imspy/imspy/timstof/dbsearch/utility.py | 0 imspy/pyproject.toml | 2 +- 4 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 imspy/imspy/timstof/dbsearch/__init__.py create mode 100644 imspy/imspy/timstof/dbsearch/dda.py create mode 100644 imspy/imspy/timstof/dbsearch/utility.py diff --git a/imspy/imspy/timstof/dbsearch/__init__.py b/imspy/imspy/timstof/dbsearch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py new file mode 100644 index 00000000..e69de29b diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py new file mode 100644 index 00000000..e69de29b diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml index d1f3a19e..019f45c0 100644 --- a/imspy/pyproject.toml +++ b/imspy/pyproject.toml @@ -18,7 +18,7 @@ wandb = ">=0.12.1" pyopenms = ">=2.6.0" -sagepy = ">=0.2.9 +sagepy = ">=0.2.9" sagepy-connector = ">=0.2.9" imspy-connector = ">=0.2.17" From beed6a1943a29a773a0b8534389ed3598a188aa0 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 08:56:30 +0200 Subject: [PATCH 09/19] adding utilities --- imspy/imspy/timstof/dbsearch/utility.py | 128 ++++++++++++++++++++++++ 1 file changed, 128 
insertions(+) diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py index e69de29b..935fe207 100644 --- a/imspy/imspy/timstof/dbsearch/utility.py +++ b/imspy/imspy/timstof/dbsearch/utility.py @@ -0,0 +1,128 @@ +import re +from typing import List, Tuple +from tqdm import tqdm + +import numpy as np +from typing import Optional +from sagepy.core import Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor, Tolerance, Scorer, Representation +from sagepy.core.scoring import PeptideSpectrumMatch + +from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper + + +def sanitize_charge(charge: Optional[float]) -> Optional[int]: + if charge is not None and not np.isnan(charge): + return int(charge) + return None + + +def sanitize_mz(mz: Optional[float], mz_highest: float) -> Optional[float]: + if mz is not None and not np.isnan(mz): + return mz + return mz_highest + + +def split_fasta(fasta: str, num_splits: int = 16) -> List[str]: + """ Split a fasta file into multiple fasta files. + Args: + fasta: Fasta file as string. + num_splits: Number of splits fasta file should be split into. + + Returns: + List of fasta files as strings, will contain num_splits fasta files with equal number of sequences. + """ + split_strings = re.split(r'\n>', fasta) + + if not split_strings[0].startswith('>'): + split_strings[0] = '>' + split_strings[0] + + total_items = len(split_strings) + items_per_batch = total_items // num_splits + remainder = total_items % num_splits + + fastas = [] + start_index = 0 + + for i in range(num_splits): + extra = 1 if i < remainder else 0 + stop_index = start_index + items_per_batch + extra + + if start_index >= total_items: + break + + batch = '\n>'.join(split_strings[start_index:stop_index]) + + if not batch.startswith('>'): + batch = '>' + batch + + fastas.append(batch) + start_index = stop_index + + return fastas + + +def get_searchable_spec(row, + precursor: Precursor, + raw_fragment_spec: RawSpectrum, + spec_processor: SpectrumProcessor) -> ProcessedSpectrum: + precursor = row['sage_precursor'] + flat_spec = row['raw_data'].to_indexed_mz_spectrum() + time = row['time'] + spec = RawSpectrum( + file_id=25, + ms_level=2, + spec_id=row['spec_id'], + representation=Representation(), + precursors=[precursor], + scan_start_time=float(time), + ion_injection_time=float(time), + total_ion_current=np.sum(flat_spec.intensity), + mz=flat_spec.mz.astype(np.float32), + intensity=flat_spec.intensity.astype(np.float32) + ) + + processed_spec = spec_processor.process(spec) + return processed_spec + + +def get_collision_energy_callibration_factor( + sample: List[PeptideSpectrumMatch], + model: Prosit2023TimsTofWrapper, + lower: int = -30, + upper: int = 30, + verbose: bool = False, +) -> Tuple[float, List[float]]: + """ + Get the collision energy calibration factor for a given sample. 
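+    The factor is determined by a grid search over integer offsets in [lower, upper):
+    fragment intensities are predicted at each shifted collision energy, and the
+    offset that maximizes the mean cosine similarity of target PSMs is returned.
+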
+ Args: + sample: a list of PeptideSpectrumMatch objects + model: a Prosit2023TimsTofWrapper object + lower: lower bound for the search + upper: upper bound for the search + disable_progress_bar: whether to disable the progress bar + + Returns: + Tuple[float, List[float]]: the collision energy calibration factor and the cosine similarities + """ + cos_target, cos_decoy = [], [] + + if verbose: + print(f"Searching for collision energy calibration factor between {lower} and {upper}...") + + for i in tqdm(range(lower, upper), disable=verbose): + I = model.predict_intensities( + [p.sequence for p in sample], + np.array([p.charge for p in sample]), + [p.collision_energy + i for p in sample], + batch_size=2048, + flatten=True + ) + + psm_i = associate_fragment_ions_with_prosit_predicted_intensities(sample, I) + target = list(filter(lambda x: not x.decoy, psm_i)) + decoy = list(filter(lambda x: x.decoy, psm_i)) + + cos_target.append((i, np.mean([x.cosine_similarity for x in target]))) + cos_decoy.append((i, np.mean([x.cosine_similarity for x in decoy]))) + + return cos_target[np.argmax([x[1] for x in cos_target])][0], [x[1] for x in cos_target] From 3454b4b88834dcc69e50d8fcf401a7e262406e9d Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 16:40:56 +0200 Subject: [PATCH 10/19] adding timsTOF dda processing --- imspy/imspy/timstof/dbsearch/dda.py | 365 ++++++++++++++++++++++++ imspy/imspy/timstof/dbsearch/utility.py | 79 +++-- 2 files changed, 427 insertions(+), 17 deletions(-) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index e69de29b..29727710 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ b/imspy/imspy/timstof/dbsearch/dda.py @@ -0,0 +1,365 @@ +import argparse +import os +import sys +import time + +import pandas as pd +import numpy as np +from sagepy.core import Precursor, Tolerance, SpectrumProcessor, Scorer, EnzymeBuilder, SAGE_KNOWN_MODS, validate_mods, \ + validate_var_mods, SageSearchConfiguration +from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, \ + peptide_spectrum_match_list_to_pandas, json_bin_to_psms +from sagepy.qfdr.tdc import target_decoy_competition_pandas + +from imspy.algorithm import DeepPeptideIonMobilityApex, DeepChromatographyApex, load_deep_ccs_predictor, \ + load_tokenizer_from_resources, load_deep_retention_time +from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper +from imspy.timstof import TimsDatasetDDA +from imspy.timstof.dbsearch.utility import sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta, \ + get_collision_energy_calibration_factor, write_psms_binary + +from sagepy.core.scoring import psms_to_json_bin + + +def main(): + # use argparse to parse command line arguments + parser = argparse.ArgumentParser(description='🦀💻 IMSPY - timsTOF DDA 🔬🐍 - PROTEOMICS IMS DDA data analysis ' + 'using imspy and sagepy.') + + # Required string argument for path and fasta file + parser.add_argument( + "path", + type=str, + help="Path to bruker raw folders (.d) containing RAW files" + ) + parser.add_argument( + "fasta", + type=str, + help="Path to the fasta file of proteins to be digested" + ) + + # Optional verbosity flag + parser.add_argument( + "-v", + "--verbose", + type=bool, + default=True, + help="Increase output verbosity" + ) + # Optional flag for fasta batch size, defaults to 1 + parser.add_argument( + "-fbs", + "--fasta_batch_size", + type=int, + default=1, + help="Batch size for fasta file (default: 1)" + ) + + # SAGE 
isolation window settings + parser.add_argument("--isolation_window_lower", type=float, default=-3.0, help="Isolation window (default: -3.0)") + parser.add_argument("--isolation_window_upper", type=float, default=3.0, help="Isolation window (default: 3.0)") + + # SAGE Scoring settings + # precursor tolerance lower and upper + parser.add_argument("--precursor_tolerance_lower", type=float, default=-5.0, + help="Precursor tolerance lower (default: -5.0)") + parser.add_argument("--precursor_tolerance_upper", type=float, default=5.0, + help="Precursor tolerance upper (default: 5.0)") + + # fragment tolerance lower and upper + parser.add_argument("--fragment_tolerance_lower", type=float, default=-25.0, + help="Fragment tolerance lower (default: -25.0)") + parser.add_argument("--fragment_tolerance_upper", type=float, default=25.0, + help="Fragment tolerance upper (default: 25.0)") + + # number of psms to report + parser.add_argument("--report_psms", type=int, default=10, help="Number of PSMs to report (default: 10)") + # minimum number of matched peaks + parser.add_argument("--min_matched_peaks", type=int, default=5, help="Minimum number of matched peaks (default: 5)") + # annotate matches + parser.add_argument("--annotate_matches", type=bool, default=True, help="Annotate matches (default: True)") + # SAGE Preprocessing settings + parser.add_argument("--take_top_n", type=int, default=100, help="Take top n peaks (default: 100)") + + # SAGE settings for digest of fasta file + parser.add_argument("--missed_cleavages", type=int, default=2, help="Number of missed cleavages (default: 2)") + parser.add_argument("--min_len", type=int, default=7, help="Minimum peptide length (default: 7)") + parser.add_argument("--max_len", type=int, default=30, help="Maximum peptide length (default: 30)") + parser.add_argument("--cleave_at", type=str, default='KR', help="Cleave at (default: KR)") + parser.add_argument("--restrict", type=str, default='P', help="Restrict (default: P)") + parser.add_argument("--decoys", type=bool, default=True, help="Generate decoys (default: True)") + parser.add_argument("--c_terminal", type=bool, default=True, help="C terminal (default: True)") + + # sage search configuration + parser.add_argument("--fragment_max_mz", type=float, default=4000, help="Fragment max mz (default: 4000)") + parser.add_argument("--bucket_size", type=int, default=16384, help="Bucket size (default: 16384)") + + # randomize fasta + parser.add_argument("--randomize_fasta_split", type=bool, default=True, + help="Randomize fasta split (default: True)") + + # number of threads + parser.add_argument("--num_threads", type=int, default=16, help="Number of threads (default: 16)") + args = parser.parse_args() + + paths = [] + + # Check if path exists + if not os.path.exists(args.path): + print(f"Path {args.path} does not exist. Exiting.") + sys.exit(1) + + # Check if path is a directory + if not os.path.isdir(args.path): + print(f"Path {args.path} is not a directory. Exiting.") + sys.exit(1) + + # check for fasta file + if not os.path.exists(args.fasta): + print(f"Path {args.fasta} does not exist. Exiting.") + sys.exit(1) + + # check for fasta file + if not os.path.isfile(args.fasta): + print(f"Path {args.fasta} is not a file. 
Exiting.") + sys.exit(1) + + for root, dirs, _ in os.walk(args.path): + for file in dirs: + if file.endswith(".d"): + paths.append(os.path.join(root, file)) + + # get the write folder path + write_folder_path = "/".join(args.path.split("/")[:-1]) + + # get time + start_time = time.time() + + if args.verbose: + print(f"Found {len(paths)} RAW data folders in {args.path} ...") + + # go over RAW data one file at a time + for path in paths: + if args.verbose: + print(f"Processing {path} ...") + + ds_name = os.path.basename(path).split(".")[0] + dataset = TimsDatasetDDA(str(path), in_memory=False) + + if args.verbose: + print("loading PASEF fragments ...") + + fragments = dataset.get_pasef_fragments() + + mobility = fragments.apply(lambda r: np.mean(r.raw_data.mobility), axis=1) + fragments['mobility'] = mobility + + spec_id = fragments.apply(lambda r: str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name, axis=1) + fragments['spec_id'] = spec_id + + if args.verbose: + print("loading precursor data ...") + + sage_precursor = fragments.apply(lambda r: Precursor( + mz=sanitize_mz(r['monoisotopic_mz'], r['largest_peak_mz']), + intensity=r['intensity'], + charge=sanitize_charge(r['charge']), + isolation_window=Tolerance(da=(args.isolation_window_lower, args.isolation_window_upper)), + collision_energy=r.collision_energy, + inverse_ion_mobility=r.mobility, + ), axis=1) + + fragments['sage_precursor'] = sage_precursor + + if args.verbose: + print("preprocessing spectra ...") + + processed_spec = fragments.apply( + lambda r: get_searchable_spec( + precursor=r.sage_precursor, + raw_fragment_data=r.raw_data, + spec_processor=SpectrumProcessor(take_top_n=args.take_top_n), + spec_id=r.spec_id, + time=r['time'], + ), + axis=1 + ) + + fragments['processed_spec'] = processed_spec + + scorer = Scorer( + precursor_tolerance=Tolerance(da=(args.precursor_tolerance_lower, args.precursor_tolerance_upper)), + fragment_tolerance=Tolerance(ppm=(args.fragment_tolerance_lower, args.fragment_tolerance_upper)), + report_psms=args.report_psms, + min_matched_peaks=args.min_matched_peaks, + annotate_matches=args.annotate_matches + ) + + if args.verbose: + print("generating fasta digest ...") + + # configure a trypsin-like digestor of fasta files + enzyme_builder = EnzymeBuilder( + missed_cleavages=args.missed_cleavages, + min_len=args.min_len, + max_len=args.max_len, + cleave_at=args.cleave_at, + restrict=args.restrict, + c_terminal=args.c_terminal, + ) + + # generate static cysteine modification TODO: make configurable + static_mods = {k: v for k, v in [SAGE_KNOWN_MODS.cysteine_static()]} + + # generate variable methionine modification TODO: make configurable + variable_mods = {k: v for k, v in [SAGE_KNOWN_MODS.methionine_variable()]} + + # generate SAGE compatible mod representations + static = validate_mods(static_mods) + variab = validate_var_mods(variable_mods) + + with open(args.fasta, 'r') as infile: + fasta = infile.read() + + fasta_list = split_fasta(fasta, args.fasta_batch_size, randomize=args.randomize_fasta_split) + + if args.verbose: + print("loading deep learning models for intensity, retention time and ion mobility prediction ...") + + prosit_model = Prosit2023TimsTofWrapper(verbose=False) + im_predictor = DeepPeptideIonMobilityApex(load_deep_ccs_predictor(), + load_tokenizer_from_resources("tokenizer_ionmob")) + rt_predictor = DeepChromatographyApex(load_deep_retention_time(), + load_tokenizer_from_resources("tokenizer-ptm"), verbose=True) + + psm_list = [] + + if args.verbose: + print("generating 
search configuration ...") + + for fasta in fasta_list: + sage_config = SageSearchConfiguration( + fasta=fasta, + static_mods=static, + variable_mods=variab, + enzyme_builder=enzyme_builder, + generate_decoys=args.decoys, + fragment_max_mz=args.fragment_max_mz, + bucket_size=int(args.bucket_size), + ) + + if args.verbose: + print("generating indexed database ...") + + # generate the database for searching against + indexed_db = sage_config.generate_indexed_database() + + psm = scorer.score_collection_psm( + db=indexed_db, + spectrum_collection=fragments['processed_spec'].values, + num_threads=args.num_threads, + ) + + for p in psm: + p.file_name = ds_name + + psm_list.extend(psm) + + psm = psm_list + + sample = np.random.choice(psm, 4096) + collision_energy_calibration_factor, _ = get_collision_energy_calibration_factor( + sample, + prosit_model, + verbose=args.verbose, + ) + + for p in psm: + p.collision_energy_calibrated = p.collision_energy + collision_energy_calibration_factor + + if args.verbose: + print("predicting ion intensities ...") + + intensity_pred = prosit_model.predict_intensities( + [p.sequence for p in psm], + np.array([p.charge for p in psm]), + [p.collision_energy_calibrated for p in psm], + batch_size=2048, + flatten=True, + ) + + psm = associate_fragment_ions_with_prosit_predicted_intensities(psm, intensity_pred, num_threads=args.num_threads) + + if args.verbose: + print("predicting ion mobilities ...") + + # predict ion mobilities + inv_mob = im_predictor.simulate_ion_mobilities( + sequences=[x.sequence for x in psm], + charges=[x.charge for x in psm], + mz=[x.mono_mz_calculated for x in psm] + ) + + # set ion mobilities + for mob, p in zip(inv_mob, psm): + p.inverse_mobility_predicted = mob + + # calculate calibration factor + inv_mob_calibration_factor = np.mean( + [x.inverse_mobility_observed - x.inverse_mobility_predicted for x in psm]) + + # set calibrated ion mobilities + for p in psm: + p.inverse_mobility_predicted += inv_mob_calibration_factor + + PSM_pandas = peptide_spectrum_match_list_to_pandas(psm) + PSM_q = target_decoy_competition_pandas(PSM_pandas) + + PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"]) + + # use good psm hits to fine tune RT predictor for dataset + TDC = pd.merge(PSM_q, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], + right_on=["spec_idx", "match_idx", "decoy"]) + + TDC_filtered = TDC[TDC.q_value <= 0.001] + + rt_predictor.fit_model(TDC_filtered[TDC_filtered.decoy == False], epochs=5, batch_size=2048) + + if args.verbose: + print("predicting retention times ...") + + # predict retention times + rt_pred = rt_predictor.simulate_separation_times( + sequences=[x.sequence for x in psm] + ) + + # set retention times + for rt, p in zip(rt_pred, psm): + p.retention_time_predicted = rt + + # serialize PSMs to JSON binary + bts = psms_to_json_bin(psm) + + # write PSMs to binary file + write_psms_binary(byte_array=bts, folder_path=write_folder_path, file_name=ds_name) + + psms = [] + + # read PSMs from binary files + for file in os.listdir(write_folder_path): + if file.endswith(".bin"): + f = open(os.path.join(write_folder_path, file), 'rb') + data = f.read() + f.close() + psms.extend(json_bin_to_psms(data)) + + end_time = time.time() + + if args.verbose: + print("Done processing all RAW files.") + minutes, seconds = divmod(end_time - start_time, 60) + print(f"Processed {len(paths)} RAW files in {minutes} minutes and {seconds:.2f} seconds.") + + +if __name__ == "__main__": + main() diff --git 
a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py index 935fe207..e1f08f6d 100644 --- a/imspy/imspy/timstof/dbsearch/utility.py +++ b/imspy/imspy/timstof/dbsearch/utility.py @@ -1,3 +1,4 @@ +import os import re from typing import List, Tuple from tqdm import tqdm @@ -5,9 +6,10 @@ import numpy as np from typing import Optional from sagepy.core import Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor, Tolerance, Scorer, Representation -from sagepy.core.scoring import PeptideSpectrumMatch +from sagepy.core.scoring import PeptideSpectrumMatch, associate_fragment_ions_with_prosit_predicted_intensities from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper +from imspy.timstof.frame import TimsFrame def sanitize_charge(charge: Optional[float]) -> Optional[int]: @@ -22,17 +24,25 @@ def sanitize_mz(mz: Optional[float], mz_highest: float) -> Optional[float]: return mz_highest -def split_fasta(fasta: str, num_splits: int = 16) -> List[str]: +def split_fasta(fasta: str, num_splits: int = 16, randomize: bool = True) -> List[str]: """ Split a fasta file into multiple fasta files. Args: fasta: Fasta file as string. num_splits: Number of splits fasta file should be split into. + randomize: Whether to randomize the order of sequences before splitting. Returns: List of fasta files as strings, will contain num_splits fasta files with equal number of sequences. """ + + if num_splits == 1: + return [fasta] + split_strings = re.split(r'\n>', fasta) + if randomize: + np.random.shuffle(split_strings) + if not split_strings[0].startswith('>'): split_strings[0] = '>' + split_strings[0] @@ -61,21 +71,38 @@ def split_fasta(fasta: str, num_splits: int = 16) -> List[str]: return fastas -def get_searchable_spec(row, - precursor: Precursor, - raw_fragment_spec: RawSpectrum, - spec_processor: SpectrumProcessor) -> ProcessedSpectrum: - precursor = row['sage_precursor'] - flat_spec = row['raw_data'].to_indexed_mz_spectrum() - time = row['time'] +def get_searchable_spec(precursor: Precursor, + raw_fragment_data: TimsFrame, + spec_processor: SpectrumProcessor, + time: float, + spec_id: str, + file_id: int = 0, + ms_level: int = 2) -> ProcessedSpectrum: + """ + Get SAGE searchable spectrum from raw data. 
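+    The TimsFrame is flattened to an indexed m/z spectrum, wrapped together with
+    its precursor into a SAGE RawSpectrum, and run through the given
+    SpectrumProcessor to yield a searchable ProcessedSpectrum.
+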
+ Args: + precursor: Precursor object + raw_fragment_data: TimsFrame holding the raw fragment data + spec_processor: SpectrumProcessor used to process the spectrum + time: scan start time, also used as the ion injection time + spec_id: unique spectrum identifier + file_id: file identifier + ms_level: MS level of the spectrum + + Returns: + ProcessedSpectrum: ProcessedSpectrum object + """ + + flat_spec = raw_fragment_data.to_indexed_mz_spectrum() + + spec = RawSpectrum( - file_id=25, - ms_level=2, - spec_id=row['spec_id'], + file_id=file_id, + ms_level=ms_level, + spec_id=spec_id, representation=Representation(), precursors=[precursor], - scan_start_time=float(time), - ion_injection_time=float(time), + scan_start_time=time, + ion_injection_time=time, total_ion_current=np.sum(flat_spec.intensity), mz=flat_spec.mz.astype(np.float32), intensity=flat_spec.intensity.astype(np.float32) @@ -85,7 +112,7 @@ def get_searchable_spec(row, return processed_spec -def get_collision_energy_callibration_factor( +def get_collision_energy_calibration_factor( sample: List[PeptideSpectrumMatch], model: Prosit2023TimsTofWrapper, lower: int = -30, @@ -99,7 +126,7 @@ def get_collision_energy_callibration_factor( model: a Prosit2023TimsTofWrapper object lower: lower bound for the search upper: upper bound for the search - disable_progress_bar: whether to disable the progress bar + verbose: whether to print progress Returns: Tuple[float, List[float]]: the collision energy calibration factor and the cosine similarities @@ -107,7 +134,7 @@ def get_collision_energy_callibration_factor( cos_target, cos_decoy = [], [] if verbose: - print(f"Searching for collision energy calibration factor between {lower} and {upper}...") + print(f"Searching for collision energy calibration factor between {lower} and {upper} ...") for i in tqdm(range(lower, upper), disable=verbose): I = model.predict_intensities( [p.sequence for p in sample], np.array([p.charge for p in sample]), @@ -126,3 +153,21 @@ def get_collision_energy_callibration_factor( cos_decoy.append((i, np.mean([x.cosine_similarity for x in decoy]))) return cos_target[np.argmax([x[1] for x in cos_target])][0], [x[1] for x in cos_target] + + +def write_psms_binary(byte_array, folder_path: str, file_name: str): + """ Write PSMs to binary file. 
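+    The bytes are written to {folder_path}/imspy/psm/{file_name}.bin; the target
+    directory is created if it does not exist.
+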
+ Args: + byte_array: byte array of serialized PSMs (as produced by psms_to_json_bin) + folder_path: base folder to write to + file_name: name of the output file (without the .bin extension) + """ + # create folder if it doesn't exist + if not os.path.exists(f'{folder_path}/imspy/psm'): + os.makedirs(f'{folder_path}/imspy/psm') + + file = open(f'{folder_path}/imspy/psm/{file_name}.bin', 'wb') + try: + file.write(bytearray(byte_array)) + finally: + file.close() From 4818a2f980d6d3d9a6e079b9d980845cc42fa652 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 19:43:24 +0200 Subject: [PATCH 11/19] adding re-scoring --- imspy/imspy/timstof/dbsearch/dda.py | 36 +++++-- imspy/imspy/timstof/dbsearch/utility.py | 122 +++++++++++++++++++++++- 2 files changed, 149 insertions(+), 9 deletions(-) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index 29727710..6d20aabb 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ b/imspy/imspy/timstof/dbsearch/dda.py @@ -16,7 +16,7 @@ from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper from imspy.timstof import TimsDatasetDDA from imspy.timstof.dbsearch.utility import sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta, \ - get_collision_energy_calibration_factor, write_psms_binary + get_collision_energy_calibration_factor, write_psms_binary, re_score_psms from sagepy.core.scoring import psms_to_json_bin @@ -73,7 +73,7 @@ def main(): help="Fragment tolerance upper (default: 25.0)") # number of psms to report - parser.add_argument("--report_psms", type=int, default=10, help="Number of PSMs to report (default: 10)") + parser.add_argument("--report_psms", type=int, default=5, help="Number of PSMs to report (default: 5)") # minimum number of matched peaks parser.add_argument("--min_matched_peaks", type=int, default=5, help="Minimum number of matched peaks (default: 5)") # annotate matches @@ -95,8 +95,8 @@ def main(): parser.add_argument("--bucket_size", type=int, default=16384, help="Bucket size (default: 16384)") # randomize fasta - parser.add_argument("--randomize_fasta_split", type=bool, default=True, - help="Randomize fasta split (default: True)") + parser.add_argument("--randomize_fasta_split", type=bool, default=False, + help="Randomize fasta split (default: False)") @@ -154,7 +154,8 @@ def main(): mobility = fragments.apply(lambda r: np.mean(r.raw_data.mobility), axis=1) fragments['mobility'] = mobility - spec_id = fragments.apply(lambda r: str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name, axis=1) + # generate random string for spec_id + spec_id = fragments.apply(lambda r: str(np.random.randint(1000000000)) + '-' + str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name, axis=1) fragments['spec_id'] = spec_id @@ -226,9 +227,12 @@ def main(): if args.verbose: print("loading deep learning models for intensity, retention time and ion mobility prediction ...") + # the intensity predictor model prosit_model = Prosit2023TimsTofWrapper(verbose=False) + # the ion mobility predictor model im_predictor = DeepPeptideIonMobilityApex(load_deep_ccs_predictor(), load_tokenizer_from_resources("tokenizer_ionmob")) + # the retention time predictor model rt_predictor = DeepChromatographyApex(load_deep_retention_time(), load_tokenizer_from_resources("tokenizer-ptm"), verbose=True) @@ -237,7 +241,7 @@ def main(): if args.verbose: print("generating search configuration ...") - for fasta in fasta_list: + for i, fasta in 
enumerate(fasta_list): sage_config = SageSearchConfiguration( fasta=fasta, static_mods=static, @@ -249,11 +253,14 @@ def main(): ) if args.verbose: - print("generating indexed database ...") + print(f"generating indexed database for fasta {i + 1} of {len(fasta_list)} ...") # generate the database for searching against indexed_db = sage_config.generate_indexed_database() + if args.verbose: + print("searching database ...") + psm = scorer.score_collection_psm( db=indexed_db, spectrum_collection=fragments['processed_spec'].values, @@ -323,7 +330,10 @@ def main(): TDC_filtered = TDC[TDC.q_value <= 0.001] - rt_predictor.fit_model(TDC_filtered[TDC_filtered.decoy == False], epochs=5, batch_size=2048) + if args.verbose: + print("fine tuning retention time predictor ...") + + rt_predictor.fit_model(TDC_filtered[TDC_filtered.decoy == False], epochs=4, batch_size=2048) if args.verbose: print("predicting retention times ...") @@ -353,6 +363,16 @@ def main(): f.close() psms.extend(json_bin_to_psms(data)) + psms = re_score_psms(psms, verbose=args.verbose) + + R = target_decoy_competition_pandas(peptide_spectrum_match_list_to_pandas(psms, re_score=True)) + R_after = R[(R.q_value <= 0.01) & (R.decoy == False)] + + # use good psm hits to fine tune RT predictor for dataset + PSM_pandas = peptide_spectrum_match_list_to_pandas(psms, re_score=True).drop(columns=["score", "q_value"]) + R_f = pd.merge(R_after, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], + right_on=["spec_idx", "match_idx", "decoy"]) + end_time = time.time() if args.verbose: diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py index e1f08f6d..a6679011 100644 --- a/imspy/imspy/timstof/dbsearch/utility.py +++ b/imspy/imspy/timstof/dbsearch/utility.py @@ -1,24 +1,48 @@ import os import re from typing import List, Tuple + +import pandas as pd from tqdm import tqdm import numpy as np from typing import Optional -from sagepy.core import Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor, Tolerance, Scorer, Representation +from sagepy.core import Precursor, RawSpectrum, ProcessedSpectrum, SpectrumProcessor, Representation from sagepy.core.scoring import PeptideSpectrumMatch, associate_fragment_ions_with_prosit_predicted_intensities from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper from imspy.timstof.frame import TimsFrame +from sagepy.utility import get_features +from sagepy.qfdr.tdc import target_decoy_competition_pandas +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis + +from sagepy.utility import peptide_spectrum_match_list_to_pandas +from numpy.typing import NDArray + def sanitize_charge(charge: Optional[float]) -> Optional[int]: + """Sanitize charge value. + Args: + charge: Charge value as float. + + Returns: + int: Charge value as int. + """ if charge is not None and not np.isnan(charge): return int(charge) return None def sanitize_mz(mz: Optional[float], mz_highest: float) -> Optional[float]: + """Sanitize mz value. + Args: + mz: Mz value as float. + mz_highest: Highest mz value. + + Returns: + float: Mz value as float. + """ if mz is not None and not np.isnan(mz): return mz return mz_highest @@ -171,3 +195,99 @@ def write_psms_binary(byte_array, folder_path: str, file_name: str): file.write(bytearray(byte_array)) finally: file.close() + + +def generate_training_data(psms: List[PeptideSpectrumMatch]) -> Tuple[NDArray, NDArray]: + """ Generate training data. 
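+    Target PSMs passing a 0.1% FDR threshold serve as positive examples and
+    all decoy PSMs as negative examples.
+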
+ Args: + psms: List of PeptideSpectrumMatch objects + + Returns: + Tuple[NDArray, NDArray]: X_train and Y_train + """ + + # create pandas table from psms + PSM_pandas = peptide_spectrum_match_list_to_pandas(psms) + + # calculate q-values to get initial "good" hits + PSM_q = target_decoy_competition_pandas(PSM_pandas) + PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"]) + + # merge data with q-values + TDC = pd.merge(PSM_q, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], + right_on=["spec_idx", "match_idx", "decoy"]) + + # select best positive examples + TARGET = TDC[(TDC.decoy == False) & (TDC.q_value <= 0.001)] + X_target, Y_target = get_features(TARGET) + + # select all decoys + DECOY = TDC[TDC.decoy] + X_decoy, Y_decoy = get_features(DECOY) + X_train = np.vstack((X_target, X_decoy)) + Y_train = np.hstack((Y_target, Y_decoy)) + + return X_train, Y_train + + +def split_psms(psms: List[PeptideSpectrumMatch], num_splits: int = 10) -> List[List[PeptideSpectrumMatch]]: + """ Split PSMs into multiple splits. + + Args: + psms: List of PeptideSpectrumMatch objects + num_splits: Number of splits + + Returns: + List[List[PeptideSpectrumMatch]]: List of splits + """ + # floor division by num_splits + split_size = len(psms) // num_splits + # the remainder is distributed over the first splits + remainder = len(psms) % num_splits + splits = [] + start_index = 0 + for i in range(num_splits): + end_index = start_index + split_size + (1 if i < remainder else 0) + splits.append(psms[start_index:end_index]) + start_index = end_index + + return splits + + +def re_score_psms(psms: List[PeptideSpectrumMatch], num_splits: int = 10, + verbose: bool = True) -> List[PeptideSpectrumMatch]: + """ Re-score PSMs using LDA. + Args: + psms: List of PeptideSpectrumMatch objects + num_splits: Number of splits + verbose: Whether to print progress + + Returns: + List[PeptideSpectrumMatch]: List of PeptideSpectrumMatch objects + """ + + splits = split_psms(psms=psms, num_splits=num_splits) + + predictions = [] + + for i in tqdm(range(num_splits), disable=not verbose): + + target = splits[i] + features = [] + + for j in range(num_splits): + if j != i: + features.extend(splits[j]) + + X_train, Y_train = generate_training_data(features) + X, _ = get_features(peptide_spectrum_match_list_to_pandas(target)) + + lda = LinearDiscriminantAnalysis() + lda.fit(X_train, Y_train) + Y_pred = np.squeeze(lda.transform(X)) + predictions.extend(Y_pred) + + for score, match in zip(predictions, psms): + match.re_score = score + + return psms From 69570093e7755c2b39b9fc2843d1f3abd25044f7 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 19:55:43 +0200 Subject: [PATCH 12/19] adding re-scoring --- imspy/imspy/timstof/dbsearch/dda.py | 10 ++++++++-- imspy/imspy/timstof/dbsearch/utility.py | 9 ++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index 6d20aabb..8c618627 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ b/imspy/imspy/timstof/dbsearch/dda.py @@ -98,6 +98,12 @@ def main(): parser.add_argument("--randomize_fasta_split", type=bool, default=False, help="Randomize fasta split (default: False)") + # re-scoring settings + parser.add_argument("--num_splits", type=int, default=10, help="Number of splits (default: 10)") + + # fdr threshold + parser.add_argument("--fdr_threshold", type=float, default=0.01, help="FDR threshold (default: 0.01)") + # number of threads parser.add_argument("--num_threads", type=int, default=16, help="Number 
of threads (default: 16)") args = parser.parse_args() @@ -363,10 +369,10 @@ def main(): f.close() psms.extend(json_bin_to_psms(data)) - psms = re_score_psms(psms, verbose=args.verbose) + psms = re_score_psms(psms, verbose=args.verbose, num_splits=args.num_splits) R = target_decoy_competition_pandas(peptide_spectrum_match_list_to_pandas(psms, re_score=True)) - R_after = R[(R.q_value <= 0.01) & (R.decoy == False)] + R_after = R[(R.q_value <= args.fdr_threshold) & (R.decoy == False)] # use good psm hits to fine tune RT predictor for dataset PSM_pandas = peptide_spectrum_match_list_to_pandas(psms, re_score=True).drop(columns=["score", "q_value"]) R_f = pd.merge(R_after, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], right_on=["spec_idx", "match_idx", "decoy"]) diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py index a6679011..74c2847f 100644 --- a/imspy/imspy/timstof/dbsearch/utility.py +++ b/imspy/imspy/timstof/dbsearch/utility.py @@ -254,8 +254,11 @@ def split_psms(psms: List[PeptideSpectrumMatch], num_splits: int = 10) -> List[L return splits -def re_score_psms(psms: List[PeptideSpectrumMatch], num_splits: int = 10, - verbose: bool = True) -> List[PeptideSpectrumMatch]: +def re_score_psms( + psms: List[PeptideSpectrumMatch], + num_splits: int = 10, + verbose: bool = True, +) -> List[PeptideSpectrumMatch]: """ Re-score PSMs using LDA. Args: psms: List of PeptideSpectrumMatch objects num_splits: Number of splits @@ -270,7 +273,7 @@ def re_score_psms(psms: List[PeptideSpectrumMatch], num_splits: int = 10, predictions = [] - for i in tqdm(range(num_splits), disable=not verbose): + for i in tqdm(range(num_splits), disable=not verbose, desc='Re-scoring PSMs', ncols=100): target = splits[i] features = [] From aa489e5de6594331870cf574573995efff3be8c4 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 20:19:31 +0200 Subject: [PATCH 13/19] adding re-scoring --- imspy/imspy/timstof/dbsearch/dda.py | 14 ++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index 8c618627..6ebf4126 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ b/imspy/imspy/timstof/dbsearch/dda.py @@ -161,7 +161,7 @@ def main(): fragments['mobility'] = mobility # generate random string for spec_id - spec_id = fragments.apply(lambda r: str(np.random.randint(1000000000)) + '-' + str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name, axis=1) + spec_id = fragments.apply(lambda r: str(np.random.randint(int(1e9))) + '-' + str(r['frame_id']) + '-' + str(r['precursor_id']) + '-' + ds_name, axis=1) fragments['spec_id'] = spec_id @@ -370,15 +370,17 @@ def main(): psms = re_score_psms(psms, verbose=args.verbose, num_splits=args.num_splits) + PSM_pandas = peptide_spectrum_match_list_to_pandas(psms) + PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"]) - R = target_decoy_competition_pandas(peptide_spectrum_match_list_to_pandas(psms, re_score=True)) - R_after = R[(R.q_value <= args.fdr_threshold) & (R.decoy == False)] + psms_rescored = target_decoy_competition_pandas(peptide_spectrum_match_list_to_pandas(psms, re_score=True)) + psms_rescored = psms_rescored[(psms_rescored.q_value <= args.fdr_threshold) & (psms_rescored.decoy == False)] - # use good psm hits to fine tune RT predictor for dataset - PSM_pandas = peptide_spectrum_match_list_to_pandas(psms, re_score=True).drop(columns=["score", "q_value"]) - R_f = pd.merge(R_after, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], + TDC = pd.merge(psms_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", 
"decoy"], right_on=["spec_idx", "match_idx", "decoy"]) + TDC.to_csv(f"{write_folder_path}/Peptides.csv", index=False) + end_time = time.time() if args.verbose: From ec93e8ff4425bdfbdd86ee82350bf5f5b542a155 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 20:39:34 +0200 Subject: [PATCH 14/19] adding re-scoring --- imspy/imspy/timstof/dbsearch/dda.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index 6ebf4126..dc2a845d 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ b/imspy/imspy/timstof/dbsearch/dda.py @@ -369,6 +369,8 @@ def main(): f.close() psms.extend(json_bin_to_psms(data)) + psms = list(sorted(psms, key=lambda psm: (psm.spec_idx, psm.peptide_idx))) + psms = re_score_psms(psms, verbose=args.verbose, num_splits=args.num_splits) PSM_pandas = peptide_spectrum_match_list_to_pandas(psms) PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"]) From ad7171dd8aeb25b1035bd77ba11952675bc1515a Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 20:45:01 +0200 Subject: [PATCH 15/19] adding psm --- imspy/imspy/timstof/dbsearch/dda.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index dc2a845d..30b4cd64 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ b/imspy/imspy/timstof/dbsearch/dda.py @@ -5,16 +5,22 @@ import pandas as pd import numpy as np + from sagepy.core import Precursor, Tolerance, SpectrumProcessor, Scorer, EnzymeBuilder, SAGE_KNOWN_MODS, validate_mods, \ validate_var_mods, SageSearchConfiguration + from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, \ peptide_spectrum_match_list_to_pandas, json_bin_to_psms + from sagepy.qfdr.tdc import target_decoy_competition_pandas from imspy.algorithm import DeepPeptideIonMobilityApex, DeepChromatographyApex, load_deep_ccs_predictor, \ load_tokenizer_from_resources, load_deep_retention_time + from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper + from imspy.timstof import TimsDatasetDDA + from imspy.timstof.dbsearch.utility import sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta, \ get_collision_energy_calibration_factor, write_psms_binary, re_score_psms From 0efd3e5a886be9beed60f070196f951f4c4ab730 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 21:16:44 +0200 Subject: [PATCH 16/19] adding psm-rescoring --- imspy/imspy/timstof/dbsearch/dda.py | 9 +++++---- imspy/imspy/timstof/dbsearch/utility.py | 2 -- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index 30b4cd64..aa314c31 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ b/imspy/imspy/timstof/dbsearch/dda.py @@ -368,16 +368,17 @@ def main(): psms = [] # read PSMs from binary files - for file in os.listdir(write_folder_path): + for file in os.listdir(write_folder_path + "/imspy/psm/"): if file.endswith(".bin"): - f = open(os.path.join(write_folder_path, file), 'rb') + f = open(os.path.join(write_folder_path + "/imspy/psm/", file), 'rb') data = f.read() f.close() psms.extend(json_bin_to_psms(data)) + # sort PSMs to avoid leaking information into predictions during re-scoring psms = list(sorted(psms, key=lambda psm: (psm.spec_idx, psm.peptide_idx))) - psms = re_score_psms(psms, verbose=args.verbose, num_splits=args.num_splits) + psms = re_score_psms(psms=psms, verbose=args.verbose, 
num_splits=args.num_splits) PSM_pandas = peptide_spectrum_match_list_to_pandas(psms) PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"]) @@ -387,7 +388,7 @@ def main(): TDC = pd.merge(psms_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], right_on=["spec_idx", "match_idx", "decoy"]) - TDC.to_csv(f"{write_folder_path}/Peptides.csv", index=False) + TDC.to_csv(f"{write_folder_path} + /imspy/Peptides.csv", index=False) end_time = time.time() diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py index 74c2847f..30d284e3 100644 --- a/imspy/imspy/timstof/dbsearch/utility.py +++ b/imspy/imspy/timstof/dbsearch/utility.py @@ -205,7 +205,6 @@ def generate_training_data(psms: List[PeptideSpectrumMatch]) -> Tuple[NDArray, N Returns: Tuple[NDArray, NDArray]: X_train and Y_train """ - # create pandas table from psms PSM_pandas = peptide_spectrum_match_list_to_pandas(psms) @@ -270,7 +269,6 @@ def re_score_psms( """ splits = split_psms(psms=psms, num_splits=num_splits) - predictions = [] for i in tqdm(range(num_splits), disable=not verbose, desc='Re-scoring PSMs', ncols=100): From a82e0b6c09ec931ee2d4c1f347eb61454eafb819 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 15 May 2024 21:40:24 +0200 Subject: [PATCH 17/19] testing --- imspy/imspy/timstof/dbsearch/dda.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index aa314c31..dd11c836 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ b/imspy/imspy/timstof/dbsearch/dda.py @@ -151,9 +151,10 @@ def main(): print(f"Found {len(paths)} RAW data folders in {args.path} ...") # go over RAW data one file at a time - for path in paths: + for p, path in enumerate(paths): if args.verbose: print(f"Processing {path} ...") + print(f"Processing {p + 1} of {len(paths)} ...") ds_name = os.path.basename(path).split(".")[0] dataset = TimsDatasetDDA(str(path), in_memory=False) From 14acc0b0a229f4f8d37dc316c1b75cc33e7f9e30 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Thu, 16 May 2024 10:56:28 +0200 Subject: [PATCH 18/19] adding desc --- imspy/imspy/timstof/dbsearch/utility.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imspy/imspy/timstof/dbsearch/utility.py b/imspy/imspy/timstof/dbsearch/utility.py index 30d284e3..db69527d 100644 --- a/imspy/imspy/timstof/dbsearch/utility.py +++ b/imspy/imspy/timstof/dbsearch/utility.py @@ -160,7 +160,7 @@ def get_collision_energy_calibration_factor( if verbose: print(f"Searching for collision energy calibration factor between {lower} and {upper} ...") - for i in tqdm(range(lower, upper), disable=verbose): + for i in tqdm(range(lower, upper), disable=not verbose, desc='calibrating CE', ncols=100): I = model.predict_intensities( [p.sequence for p in sample], np.array([p.charge for p in sample]), @@ -291,4 +291,4 @@ def re_score_psms( for score, match in zip(predictions, psms): match.re_score = score - return psms + return list(filter(lambda s: s.re_score <= 100, psms)) From 992f70df72493090ece12d3d96dffde9708ff538 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Thu, 16 May 2024 14:13:09 +0200 Subject: [PATCH 19/19] adding fasta split to script --- imspy/imspy/timstof/dbsearch/dda.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/imspy/imspy/timstof/dbsearch/dda.py b/imspy/imspy/timstof/dbsearch/dda.py index dd11c836..224288f6 100644 --- a/imspy/imspy/timstof/dbsearch/dda.py +++ 
b/imspy/imspy/timstof/dbsearch/dda.py @@ -9,8 +9,7 @@ from sagepy.core import Precursor, Tolerance, SpectrumProcessor, Scorer, EnzymeBuilder, SAGE_KNOWN_MODS, validate_mods, \ validate_var_mods, SageSearchConfiguration -from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, \ - peptide_spectrum_match_list_to_pandas, json_bin_to_psms +from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, json_bin_to_psms, merge_psm_dicts from sagepy.qfdr.tdc import target_decoy_competition_pandas @@ -25,6 +24,7 @@ get_collision_energy_calibration_factor, write_psms_binary, re_score_psms from sagepy.core.scoring import psms_to_json_bin +from sagepy.utility import peptide_spectrum_match_list_to_pandas def main(): @@ -249,11 +249,11 @@ def main(): rt_predictor = DeepChromatographyApex(load_deep_retention_time(), load_tokenizer_from_resources("tokenizer-ptm"), verbose=True) - psm_list = [] - if args.verbose: print("generating search configuration ...") + merged_dict = {} + for i, fasta in enumerate(fasta_list): sage_config = SageSearchConfiguration( fasta=fasta, @@ -266,7 +266,7 @@ def main(): ) if args.verbose: - print(f"generating indexed database for fasta {i + 1} of {len(fasta_list)} ...") + print(f"generating indexed database for fasta split {i + 1} of {len(fasta_list)} ...") # generate the database for searching against indexed_db = sage_config.generate_indexed_database() @@ -280,12 +280,19 @@ def main(): num_threads=args.num_threads, ) - for p in psm: - p.file_name = ds_name + for _, values in psm.items(): + for value in values: + value.file_name = ds_name + + if i == 0: + merged_dict = psm + else: + merged_dict = merge_psm_dicts(right_psms=psm, left_psms=merged_dict, max_hits=args.report_psms) - psm_list.extend(psm) + psm = [] - psm = psm_list + for _, values in merged_dict.items(): + psm.extend(values) sample = np.random.choice(psm, 4096) collision_energy_calibration_factor, _ = get_collision_energy_calibration_factor( @@ -332,7 +339,7 @@ def main(): for p in psm: p.inverse_mobility_predicted += inv_mob_calibration_factor - PSM_pandas = peptide_spectrum_match_list_to_pandas(psm) + PSM_pandas = peptide_spectrum_match_list_to_pandas(psm, use_sequence_as_match_idx=True) PSM_q = target_decoy_competition_pandas(PSM_pandas) PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"]) @@ -389,7 +396,7 @@ def main(): TDC = pd.merge(psms_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"], right_on=["spec_idx", "match_idx", "decoy"]) - TDC.to_csv(f"{write_folder_path} + /imspy/Peptides.csv", index=False) + TDC.to_csv(f"{write_folder_path}" + "/imspy/Peptides.csv", index=False) end_time = time.time()
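
The re-scoring added in [PATCH 11/19] scores each split of PSMs with a linear discriminant trained only on the remaining splits, so no PSM is scored by a model that has seen it. The following is a minimal, self-contained sketch of that scheme; the synthetic target/decoy feature matrix (shapes, class balance, effect size) is an illustrative assumption, not real PSM features:

    import numpy as np
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    rng = np.random.default_rng(42)

    # synthetic stand-in for PSM features: 1000 targets (label 1), 1000 decoys (label 0)
    X = np.vstack([rng.normal(1.0, 1.0, (1000, 5)),   # targets score higher on average
                   rng.normal(0.0, 1.0, (1000, 5))])  # decoys
    y = np.hstack([np.ones(1000), np.zeros(1000)])

    # a fixed, deterministic order plays the role of the (spec_idx, peptide_idx)
    # sort added in [PATCH 14/19] to keep the splits reproducible
    num_splits = 10
    splits = np.array_split(np.arange(len(y)), num_splits)

    re_scores = np.empty(len(y))
    for i, held_out in enumerate(splits):
        # train on all splits except the held-out one
        train_idx = np.hstack([s for j, s in enumerate(splits) if j != i])
        lda = LinearDiscriminantAnalysis()
        lda.fit(X[train_idx], y[train_idx])
        # the one-dimensional discriminant projection becomes the new score
        re_scores[held_out] = np.squeeze(lda.transform(X[held_out]))

As in the pipeline, the re-scores replace the raw search score and are handed to target_decoy_competition_pandas for q-value estimation.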