Skip to content

Commit

Permalink
Merge pull request #267 from theGreatHerrLebert/feature/property-pred…
Browse files Browse the repository at this point in the history
…iction

Feature/property prediction
  • Loading branch information
theGreatHerrLebert authored Oct 9, 2024
2 parents 9736147 + f33d2d2 commit 3b2f7e6
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 25 deletions.
14 changes: 10 additions & 4 deletions imspy/imspy/algorithm/intensity/predictors.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
post_process_predicted_fragment_spectra, reshape_dims, beta_score)

from imspy.data.peptide import PeptideProductIonSeriesCollection, PeptideSequence

from imspy.simulation.utility import flatten_prosit_array
from sagepy.rescore.utility import dict_to_dense_array, spectral_entropy_similarity, spectral_correlation

def predict_intensities_prosit(
psm_collection: List[PeptideSpectrumMatch],
Expand Down Expand Up @@ -70,12 +70,18 @@ def predict_intensities_prosit(
psm_collection_intensity = associate_fragment_ions_with_prosit_predicted_intensities(
psm_collection, intensity_pred, num_threads=num_threads)

for psm, psm_intensity in zip(psm_collection, psm_collection_intensity):
# calculate the spectral similarity metrics
for psm, psm_intensity, prosit_intensity in tqdm(zip(psm_collection, psm_collection_intensity, intensity_pred),
desc='Calc spectral similarity metrics', ncols=100, disable=not verbose):

psm.fragments_predicted = psm_intensity.fragments_predicted
psm.cosine_similarity = psm_intensity.cosine_similarity
psm.prosit_intensities = prosit_intensity

for ps in psm_collection:
ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted)
psm.spectral_correlation_similarity_pearson = psm_intensity.spectral_correlation_similarity_pearson
psm.spectral_correlation_similarity_spearman = psm_intensity.spectral_correlation_similarity_spearman
psm.spectral_entropy_similarity = psm_intensity.spectral_entropy_similarity
psm.beta_score = beta_score(psm.fragments_observed, psm.fragments_predicted)


def get_collision_energy_calibration_factor(
Expand Down
66 changes: 50 additions & 16 deletions imspy/imspy/timstof/dbsearch/imspy_dda.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import time
import toml

import mokapot

import pandas as pd
import numpy as np

Expand All @@ -15,6 +17,7 @@
from sagepy.core.scoring import associate_fragment_ions_with_prosit_predicted_intensities, json_bin_to_psms, ScoreType

from sagepy.qfdr.tdc import target_decoy_competition_pandas
from tqdm import tqdm

from imspy.algorithm.ccs.predictors import DeepPeptideIonMobilityApex, load_deep_ccs_predictor
from imspy.algorithm.intensity.utility import beta_score
Expand All @@ -26,7 +29,8 @@

from imspy.timstof.dbsearch.utility import sanitize_mz, sanitize_charge, get_searchable_spec, split_fasta, \
write_psms_binary, re_score_psms, \
merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map
merge_dicts_with_merge_dict, generate_balanced_rt_dataset, generate_balanced_im_dataset, linear_map, \
transform_psm_to_pin

from imspy.algorithm.intensity.predictors import get_collision_energy_calibration_factor

Expand Down Expand Up @@ -129,6 +133,15 @@ def main():
help="Batch size for fasta file (default: 1)"
)

parser.add_argument(
"--no_re_score_mokapot",
dest="re_score_mokapot",
action="store_false",
help="Re-score PSMs using mokapot"
)

parser.set_defaults(re_score_mokapot=True)

# SAGE isolation window settings
parser.add_argument("--isolation_window_lower", type=float, default=-3.0, help="Isolation window (default: -3.0)")
parser.add_argument("--isolation_window_upper", type=float, default=3.0, help="Isolation window (default: 3.0)")
Expand Down Expand Up @@ -240,7 +253,7 @@ def main():
parser.add_argument("--fdr_threshold", type=float, default=0.01, help="FDR threshold (default: 0.01)")

# number of threads
parser.add_argument("--num_threads", type=int, default=16, help="Number of threads (default: 16)")
parser.add_argument("--num_threads", type=int, default=-1, help="Number of threads (default: -1)")

# if train splits should be balanced
parser.add_argument(
Expand All @@ -251,14 +264,6 @@ def main():
)
parser.set_defaults(balanced_re_score=True)

# TDC method
parser.add_argument(
"--tdc_method",
type=str,
default="peptide_psm_peptide",
help="TDC method (default: peptide_psm_peptide aka double competition)"
)

# load dataset in memory
parser.add_argument(
"--in_memory",
Expand Down Expand Up @@ -330,6 +335,10 @@ def main():
logging.basicConfig(filename=f"{write_folder_path}/imspy/imspy-{current_time}.log",
level=logging.INFO, format='%(asctime)s %(message)s')

if args.num_threads == -1 and args.verbose:
print(f"Using all available CPU threads: {os.cpu_count()} ...")
args.num_threads = os.cpu_count()

logging.info("Arguments settings:")
for arg in vars(args):
logging.info(f"{arg}: {getattr(args, arg)}")
Expand Down Expand Up @@ -628,10 +637,7 @@ def main():
psm = associate_fragment_ions_with_prosit_predicted_intensities(psm, intensity_pred,
num_threads=args.num_threads)

if args.verbose:
print("calculating beta score ...")

for ps in psm:
for ps in tqdm(psm, desc="calculating beta scores", ncols=100, disable=(not args.verbose)):
ps.beta_score = beta_score(ps.fragments_observed, ps.fragments_predicted)

if args.verbose:
Expand Down Expand Up @@ -744,19 +750,47 @@ def main():
write_psms_binary(byte_array=bts, folder_path=write_folder_path, file_name="total_psms", total=True)

PSM_pandas = peptide_spectrum_match_collection_to_pandas(psms)

if args.re_score_mokapot:
if args.verbose:
print("re-scoring PSMs using mokapot ...")

# create a mokapot folder in the imspy folder
if not os.path.exists(write_folder_path + "/imspy/mokapot"):
os.makedirs(write_folder_path + "/imspy/mokapot")

# create a PIN file from the PSMs
PSM_pin = transform_psm_to_pin(PSM_pandas)
PSM_pin.to_csv(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin", index=False, sep="\t")

psms_moka = mokapot.read_pin(f"{write_folder_path}" + "/imspy/mokapot/PSMs.pin")
results, models = mokapot.brew(psms_moka)

results.to_txt(dest_dir=f"{write_folder_path}" + "/imspy/mokapot/")

PSM_pandas = PSM_pandas.drop(columns=["q_value", "score"])

if args.verbose:
print(f"FDR calculation, using target decoy competition: {args.tdc_method} ...")
print(f"FDR calculation ...")

psms_rescored = target_decoy_competition_pandas(peptide_spectrum_match_collection_to_pandas(psms, re_score=True),
method=args.tdc_method)
method="psm")

psms_rescored = psms_rescored[(psms_rescored.q_value <= 0.01) & (psms_rescored.decoy == False)]

TDC = pd.merge(psms_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"],
right_on=["spec_idx", "match_idx", "decoy"]).sort_values(by="score", ascending=False)

TDC.to_csv(f"{write_folder_path}" + "/imspy/PSMs.csv", index=False)

peptides_rescored = target_decoy_competition_pandas(peptide_spectrum_match_collection_to_pandas(psms, re_score=True),
method="peptide_psm_peptide")

peptides_rescored = peptides_rescored[(peptides_rescored.q_value <= 0.01) & (peptides_rescored.decoy == False)]

TDC = pd.merge(peptides_rescored, PSM_pandas, left_on=["spec_idx", "match_idx", "decoy"],
right_on=["spec_idx", "match_idx", "decoy"]).sort_values(by="score", ascending=False)

TDC.to_csv(f"{write_folder_path}" + "/imspy/Peptides.csv", index=False)

end_time = time.time()
Expand Down
42 changes: 42 additions & 0 deletions imspy/imspy/timstof/dbsearch/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,3 +448,45 @@ def extract_timstof_dda_data(path: str,
fragments['processed_spec'] = processed_spec

return fragments


def transform_psm_to_pin(psm_df):
columns_map = {
'spec_idx': 'SpecId',
'decoy': 'Label',
'charge': 'Charge',
'sequence': 'Peptide',
'proteins': 'Proteins',
# feature mapping for re-scoring
'hyper_score': 'Feature1',
'isotope_error': 'Feature2',
'delta_mass': 'Feature3',
'delta_rt': 'Feature4',
'delta_ims': 'Feature5',
'matched_peaks': 'Feature6',
'matched_intensity_pct': 'Feature7',
'intensity_ms1': 'Feature8',
'intensity_ms2': 'Feature9',
'average_ppm': 'Feature1ß',
'poisson': 'Feature11',
'spectral_entropy_similarity': 'Feature12',
'spectral_correlation_similarity_pearson': 'Feature13',
'spectral_correlation_similarity_spearman': 'Feature14',
'spectral_normalized_intensity_difference': 'Feature15',
'collision_energy': 'Feature16',
'delta_next': 'Feature17',
'delta_best': 'Feature18',
'longest_b': 'Feature19',
'longest_y': 'Feature20',
'longest_y_pct': 'Feature21',
}

psm_df = psm_df[list(columns_map.keys())]
df_pin = psm_df.rename(columns=columns_map)
df_pin_clean = df_pin.dropna(axis=1, how='all')
df_pin_clean = df_pin_clean.dropna()

df_pin_clean['Label'] = df_pin_clean['Label'].apply(lambda x: -1 if x else 1)
df_pin_clean['ScanNr'] = range(1, len(df_pin_clean) + 1)

return df_pin_clean
7 changes: 4 additions & 3 deletions imspy/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "imspy"
version = "0.2.32"
version = "0.2.33"
description = ""
authors = ["theGreatHerrLebert <davidteschner@googlemail.com>"]
readme = "README.md"
Expand All @@ -18,9 +18,10 @@ tensorflow-probability = ">=0.22.1"
wandb = ">=0.12.1"

pyopenms = ">=2.6.0"
mokapot = ">=0.10.0"

sagepy = ">=0.2.22"
imspy-connector = ">=0.2.32"
sagepy = ">=0.2.23"
imspy-connector = ">=0.2.33"

scipy = ">=1.7.1"
tqdm = ">=4.66"
Expand Down
4 changes: 2 additions & 2 deletions imspy_connector/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "imspy-connector"
version = "0.2.32"
version = "0.2.33"
edition = "2021"

[lib]
Expand All @@ -14,4 +14,4 @@ mscore = {path = "../mscore"}
rustdf = {path = "../rustdf"}
serde = "1.0.202"
serde_json = "1.0.117"
ndarray = "0.15.6"
ndarray = "0.16.1"

0 comments on commit 3b2f7e6

Please sign in to comment.