Skip to content

Commit

Permalink
Merge pull request #260 from theGreatHerrLebert/feature/property-pred…
Browse files Browse the repository at this point in the history
…iction

Feature/property prediction
  • Loading branch information
theGreatHerrLebert authored Sep 10, 2024
2 parents bdc87d4 + c09ef65 commit 47c9dc0
Show file tree
Hide file tree
Showing 14 changed files with 294 additions and 102 deletions.
4 changes: 3 additions & 1 deletion imspy/imspy/algorithm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# from .mixture import GaussianMixtureModel
from .ccs.predictors import DeepPeptideIonMobilityApex, GRUCCSPredictor, load_deep_ccs_predictor
from .ionization.predictors import DeepChargeStateDistribution, GRUChargeStatePredictor, load_deep_charge_state_predictor
from .utility import load_tokenizer_from_resources

from .ccs.predictors import predict_inverse_ion_mobility
from .rt.predictors import predict_retention_time
from .intensity.predictors import predict_intensities_prosit
47 changes: 47 additions & 0 deletions imspy/imspy/algorithm/ccs/predictors.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,63 @@
from typing import List

import numpy as np
import pandas as pd
import tensorflow as tf
from sagepy.core import PeptideSpectrumMatch

from sagepy.utility import peptide_spectrum_match_collection_to_pandas
from tensorflow.keras.models import load_model

from abc import ABC, abstractmethod
from numpy.typing import NDArray

from imspy.algorithm.utility import load_tokenizer_from_resources
from imspy.chemistry.mobility import ccs_to_one_over_k0, one_over_k0_to_ccs
from scipy.optimize import curve_fit

from imspy.timstof.dbsearch.utility import generate_balanced_im_dataset
from imspy.utility import tokenize_unimod_sequence
from imspy.algorithm.utility import get_model_path, InMemoryCheckpoint


def predict_inverse_ion_mobility(
psm_collection: List[PeptideSpectrumMatch],
refine_model: bool = True,
verbose: bool = False) -> None:
"""
Predict the inverse mobility of a peptide-spectrum match collection
Args:
psm_collection: a list of peptide-spectrum matches
refine_model: whether to refine the model
verbose: whether to print verbose output
Returns:
None, inverse mobility values are added to the PSMs
"""

im_predictor = DeepPeptideIonMobilityApex(load_deep_ccs_predictor(),
load_tokenizer_from_resources("tokenizer-ptm"),
verbose=verbose)
if refine_model:
im_predictor.fine_tune_model(
peptide_spectrum_match_collection_to_pandas(generate_balanced_im_dataset(psm_collection)),
batch_size=128,
re_compile=True,
verbose=verbose
)

# predict ion mobilities
inv_mob = im_predictor.simulate_ion_mobilities(
sequences=[x.sequence for x in psm_collection],
charges=[x.charge for x in psm_collection],
mz=[x.mono_mz_calculated for x in psm_collection]
)

# set ion mobilities
for mob, ps in zip(inv_mob, psm_collection):
ps.inverse_mobility_predicted = mob


def load_deep_ccs_predictor() -> tf.keras.models.Model:
""" Get a pretrained deep predictor model
Expand Down
112 changes: 109 additions & 3 deletions imspy/imspy/algorithm/intensity/predictors.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,126 @@
import os
import re
from typing import List
from typing import List, Tuple

from numpy.typing import NDArray
import pandas as pd
import numpy as np
import tensorflow as tf
from abc import ABC, abstractmethod

from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, PeptideSpectrumMatch
from tqdm import tqdm

from imspy.algorithm.utility import get_model_path
from imspy.algorithm.intensity.utility import (generate_prosit_intensity_prediction_dataset, unpack_dict,
post_process_predicted_fragment_spectra, reshape_dims)
from .utility import (generate_prosit_intensity_prediction_dataset, unpack_dict,
post_process_predicted_fragment_spectra, reshape_dims, beta_score)

from imspy.data.peptide import PeptideProductIonSeriesCollection, PeptideSequence

from imspy.simulation.utility import flatten_prosit_array

def predict_intensities_prosit(
psm_collection: List[PeptideSpectrumMatch],
calibrate_collision_energy: bool = True,
verbose: bool = False,
num_threads: int = -1,
) -> None:
"""
Predict the fragment ion intensities using Prosit.
Args:
psm_collection: a list of peptide-spectrum matches
calibrate_collision_energy: whether to calibrate the collision energy
verbose:
num_threads:
Returns:
"""
# check if num_threads is -1, if so, use all available threads
if num_threads == -1:
num_threads = os.cpu_count()

# the intensity predictor model
prosit_model = Prosit2023TimsTofWrapper(verbose=False)

# sample for collision energy calibration
sample = list(sorted(psm_collection, key=lambda x: x.hyper_score, reverse=True))[:int(2 ** 11)]

if calibrate_collision_energy:
collision_energy_calibration_factor, _ = get_collision_energy_calibration_factor(
list(filter(lambda match: match.decoy is not True, sample)),
prosit_model,
verbose=verbose
)

else:
collision_energy_calibration_factor = 0.0

for ps in psm_collection:
ps.collision_energy_calibrated = ps.collision_energy + collision_energy_calibration_factor

intensity_pred = prosit_model.predict_intensities(
[p.sequence for p in psm_collection],
np.array([p.charge for p in psm_collection]),
[p.collision_energy_calibrated for p in psm_collection],
batch_size=2048,
flatten=True,
)

psm_collection_intensity = associate_fragment_ions_with_prosit_predicted_intensities(
psm_collection, intensity_pred, num_threads=num_threads)

for psm, psm_intensity in zip(psm_collection, psm_collection_intensity):
psm.fragments_predicted = psm_intensity.fragments_predicted
psm.cosine_similarity = psm_intensity.cosine_similarity

for ps in psm_collection:
ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted)


def get_collision_energy_calibration_factor(
sample: List[PeptideSpectrumMatch],
model: 'Prosit2023TimsTofWrapper',
lower: int = -30,
upper: int = 30,
verbose: bool = False,
) -> Tuple[float, List[float]]:
"""
Get the collision energy calibration factor for a given sample.
Args:
sample: a list of PeptideSpectrumMatch objects
model: a Prosit2023TimsTofWrapper object
lower: lower bound for the search
upper: upper bound for the search
verbose: whether to print progress
Returns:
Tuple[float, List[float]]: the collision energy calibration factor and the cosine similarities
"""
cos_target, cos_decoy = [], []

if verbose:
print(f"Searching for collision energy calibration factor between {lower} and {upper} ...")

for i in tqdm(range(lower, upper), disable=not verbose, desc='calibrating CE', ncols=100):
I = model.predict_intensities(
[p.sequence for p in sample],
np.array([p.charge for p in sample]),
[p.collision_energy + i for p in sample],
batch_size=2048,
flatten=True
)

psm_i = associate_fragment_ions_with_prosit_predicted_intensities(sample, I)
target = list(filter(lambda x: not x.decoy, psm_i))
decoy = list(filter(lambda x: x.decoy, psm_i))

cos_target.append((i, np.mean([x.cosine_similarity for x in target])))
cos_decoy.append((i, np.mean([x.cosine_similarity for x in decoy])))

return cos_target[np.argmax([x[1] for x in cos_target])][0], [x[1] for x in cos_target]



def remove_unimod_annotation(sequence: str) -> str:
"""Remove [UNIMOD:N] annotations from the sequence."""
Expand Down
45 changes: 43 additions & 2 deletions imspy/imspy/algorithm/intensity/utility.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from numpy.typing import NDArray
from typing import List
import os

from numpy.typing import NDArray
from typing import List, Tuple
import numba
import numpy as np
import pandas as pd

Expand All @@ -10,9 +12,48 @@

from dlomix.reports.postprocessing import (reshape_flat, reshape_dims,
normalize_base_peak, mask_outofcharge, mask_outofrange)
from sagepy.core import IonType

from imspy.utility import tokenize_unimod_sequence

@numba.njit
def log_factorial(n: int, k: int) -> float:
k = max(k, 2)
result = 0.0
for i in range(n, k - 1, -1):
result += np.log(i)
return result


def beta_score(fragments_observed, fragments_predicted) -> float:
"""
The beta score is a variant of the OpenMS proposed score calculation, using predicted intensities instead of a constant value for the expected intensity.
Args:
fragments_observed: The Sage Fragment object containing the observed intensities
fragments_predicted: The Sage Fragment object containing the predicted intensities, e.g. from Prosit
Returns:
float: The beta score, hyper score variant using predicted intensities instead of a constant value for the expected intensity
"""

intensity = np.dot(fragments_observed.intensities, fragments_predicted.intensities)

len_b, len_y = 0, 0

b_type = IonType("b")
y_type = IonType("y")

for t in fragments_observed.ion_types:
if t == b_type:
len_b += 1
elif t == y_type:
len_y += 1

i_min = min(len_b, len_y)
i_max = max(len_b, len_y)

return np.log1p(intensity) + 2.0 * log_factorial(int(i_min), 2) + log_factorial(int(i_max), int(i_min) + 1)


def seq_to_index(seq: str, max_length: int = 30) -> NDArray:
"""Convert a sequence to a list of indices into the alphabet.
Expand Down
51 changes: 48 additions & 3 deletions imspy/imspy/algorithm/rt/predictors.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,63 @@
from typing import Union
from typing import Union, List

import pandas as pd
import numpy as np
import tensorflow as tf
from abc import ABC, abstractmethod
from numpy.typing import NDArray
from sagepy.core import PeptideSpectrumMatch
from sagepy.utility import peptide_spectrum_match_collection_to_pandas

from imspy.algorithm.utility import get_model_path, InMemoryCheckpoint
from imspy.timstof.dbsearch.utility import linear_map
from imspy.algorithm.utility import get_model_path, InMemoryCheckpoint, load_tokenizer_from_resources
from imspy.timstof.dbsearch.utility import linear_map, generate_balanced_rt_dataset
from imspy.utility import tokenize_unimod_sequence

from tensorflow.keras.models import load_model


def predict_retention_time(
psm_collection: List[PeptideSpectrumMatch],
refine_model: bool = True,
verbose: bool = False) -> None:
"""
Predict the retention times for a collection of peptide-spectrum matches
Args:
psm_collection: a list of peptide-spectrum matches
refine_model: whether to refine the model
verbose: whether to print verbose output
Returns:
None, retention times are set in the peptide-spectrum matches
"""

rt_predictor = DeepChromatographyApex(load_deep_retention_time_predictor(),
load_tokenizer_from_resources("tokenizer-ptm"),
verbose=verbose)

rt_min = np.min([x.retention_time_observed for x in psm_collection])
rt_max = np.max([x.retention_time_observed for x in psm_collection])

for psm in psm_collection:
psm.projected_rt = linear_map(psm.retention_time_observed, old_min=rt_min, old_max=rt_max, new_min=0, new_max=60)

if refine_model:
rt_predictor.fine_tune_model(
peptide_spectrum_match_collection_to_pandas(generate_balanced_rt_dataset(psm_collection)),
batch_size=128,
re_compile=True,
verbose=verbose
)

# predict retention times
rt_predicted = rt_predictor.simulate_separation_times(
sequences=[x.sequence for x in psm_collection],
)

# set the predicted retention times
for rt, ps in zip(rt_predicted, psm_collection):
ps.retention_time_predicted = rt


def get_rt_train_set(tokenizer, sequence, rt) -> tf.data.Dataset:
sequences = tokenizer.texts_to_sequences(sequence)
seq_padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, 50, padding='post')
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd

from imspy.algorithm import load_tokenizer_from_resources
from imspy.algorithm.utility import load_tokenizer_from_resources
from imspy.algorithm.ionization.predictors import BinomialChargeStateDistributionModel, DeepChargeStateDistribution, \
load_deep_charge_state_predictor
from imspy.chemistry.utility import calculate_mz
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

from imspy.algorithm import DeepPeptideIonMobilityApex, load_deep_ccs_predictor, load_tokenizer_from_resources
from imspy.algorithm.ccs.predictors import DeepPeptideIonMobilityApex, load_deep_ccs_predictor
from imspy.algorithm.utility import load_tokenizer_from_resources


def simulate_ion_mobilities(
Expand Down
10 changes: 7 additions & 3 deletions imspy/imspy/simulation/timsim/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def main():
"--num_sample_peptides",
type=int,
default=25_000,
help="Sample fraction, fraction of peptides to be sampled at random from digested fasta (default: 0.005)")
help="Number of peptides to sample from the digested fasta (default: 25_000)")

parser.add_argument("--missed_cleavages", type=int, default=2, help="Number of missed cleavages (default: 2)")
parser.add_argument("--min_len", type=int, default=7, help="Minimum peptide length (default: 7)")
Expand Down Expand Up @@ -314,8 +314,12 @@ def main():
peptides = pd.concat(peptide_list)

if args.sample_peptides:
peptides = peptides.sample(n=args.num_sample_peptides, random_state=41)
peptides.reset_index(drop=True, inplace=True)
try:
peptides = peptides.sample(n=args.num_sample_peptides, random_state=41)
peptides.reset_index(drop=True, inplace=True)
except ValueError:
print(f"Warning: Not enough peptides to sample {args.num_sample_peptides}, "
f"using all {peptides.shape[0]} peptides.")

if verbose:
print(f"Simulating {peptides.shape[0]} peptides...")
Expand Down
1 change: 1 addition & 0 deletions imspy/imspy/timstof/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .data import TimsDataset
from .dia import TimsDatasetDIA
from .dda import TimsDatasetDDA, FragmentDDA
from .dbsearch.utility import extract_timstof_dda_data
10 changes: 7 additions & 3 deletions imspy/imspy/timstof/dbsearch/imspy_dda.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@

from sagepy.qfdr.tdc import target_decoy_competition_pandas

from imspy.algorithm import DeepPeptideIonMobilityApex, load_deep_ccs_predictor, load_tokenizer_from_resources
from imspy.algorithm.ccs.predictors import DeepPeptideIonMobilityApex, load_deep_ccs_predictor
from imspy.algorithm.intensity.utility import beta_score
from imspy.algorithm.utility import load_tokenizer_from_resources
from imspy.algorithm.rt.predictors import DeepChromatographyApex, load_deep_retention_time_predictor
from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper

from imspy.timstof import TimsDatasetDDA

from imspy.timstof.dbsearch.utility import sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta, \
get_collision_energy_calibration_factor, write_psms_binary, re_score_psms, \
merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map, beta_score
write_psms_binary, re_score_psms, \
merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map

from imspy.algorithm.intensity.predictors import get_collision_energy_calibration_factor

from sagepy.core.scoring import psms_to_json_bin
from sagepy.utility import peptide_spectrum_match_collection_to_pandas
Expand Down
Loading

0 comments on commit 47c9dc0

Please sign in to comment.