Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

transfer simulation framework from proteolizard #36

Merged
merged 8 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions imspy/examples/simulation/run_example_simulation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from imspy.simulation.experiment import LcImsMsMs
from imspy.simulation.hardware_models import (NeuralChromatographyApex,
NormalChromatographyProfileModel,
LiquidChromatography,
ElectroSpray,
TrappedIon,
TOF,
NeuralIonMobilityApex,
NormalIonMobilityProfileModel,
AveragineModel,
BinomialIonSource
)
from imspy.proteome import ProteinSample, Trypsin, ORGANISM
from imspy.chemistry.mass import BufferGas

import pandas as pd
import numpy as np


def irt_to_rt(irt):
return irt


def scan_im_interval(scan_id):
intercept = 1451.357
slope = -877.361
scan_id = np.atleast_1d(scan_id)
lower = ( scan_id - intercept ) / slope
upper = ((scan_id+1) - intercept ) / slope
return np.stack([1/lower, 1/upper], axis=1)


def im_to_scan(reduced_ion_mobility):
intercept = 1451.357
slope = -877.361
# TODO more appropriate function here ?
one_over_k0 = 1/reduced_ion_mobility
return np.round(one_over_k0 * slope + intercept).astype(np.int16)


def build_experiment():
t = LcImsMsMs("./timstofexp1_binomial_ion_source_21_7/") # maybe rather call this class LCIMSMSExperiment

lc = LiquidChromatography()
lc.frame_length = 1200 #ms
lc.gradient_length = 120 # min
esi = ElectroSpray()
tims = TrappedIon()
tims.scan_time = 110 # us
tof_mz = TOF()

t.lc_method = lc
t.ionization_method = esi
t.ion_mobility_separation_method = tims
t.mz_separation_method = tof_mz

N2 = BufferGas("N2")

tokenizer_path = '/home/tim/Workspaces/ionmob/pretrained-models/tokenizers/tokenizer.json'

rt_model_weights = "/home/tim/Workspaces/Resources/models/DeepChromatograpy/"
t.lc_method.apex_model = NeuralChromatographyApex(rt_model_weights,tokenizer_path = tokenizer_path)

t.lc_method.profile_model = NormalChromatographyProfileModel()
t.lc_method.irt_to_rt_converter = irt_to_rt

im_model_weights = "/home/tim/Workspaces/ionmob/pretrained-models/GRUPredictor"
t.ion_mobility_separation_method.apex_model = NeuralIonMobilityApex(im_model_weights, tokenizer_path = tokenizer_path)

t.ion_mobility_separation_method.profile_model = NormalIonMobilityProfileModel()
t.ion_mobility_separation_method.buffer_gas = N2
t.ion_mobility_separation_method.scan_to_reduced_im_interval_converter = scan_im_interval
t.ion_mobility_separation_method.reduced_im_to_scan_converter = im_to_scan

t.ionization_method.ionization_model = BinomialIonSource()

t.mz_separation_method.model = AveragineModel()

rng = np.random.default_rng(2023)
# read proteome
proteome = pd.read_feather('/home/tim/Workspaces/Resources/Homo-sapiens-proteome.feather')
random_abundances = rng.integers(1e3,1e7,size=proteome.shape[0])
proteome = proteome.assign(abundancy= random_abundances)
# create sample and sample digest; TODO: add missed cleavages to ENZYMEs
sample = ProteinSample(proteome, ORGANISM.HOMO_SAPIENS)
sample_digest = sample.digest(Trypsin())

# to reduce computational load in example
sample_digest.data = sample_digest.data.sample(100, random_state= rng)


t.load_sample(sample_digest)
return t


if __name__ == "__main__":

t = build_experiment()

#cProfile.run("t.run(10000)", filename="profiler_10000_8_process",sort="cumtime")
t.run(100, frames_per_assemble_process=10)
162 changes: 160 additions & 2 deletions imspy/imspy/chemistry/mass.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
import re
from collections import defaultdict
import numpy as np
import mendeleev as me

MASS_PROTON = 1.007276466583
MASS_NEUTRON = 1.00866491597
MASS_ELECTRON = 0.00054857990946
MASS_H2O = 18.0105646863
MASS_WATER = 18.0105646863

# IUPAC standard in Kelvin
STANDARD_TEMPERATURE = 273.15
# IUPAC standard in Pa
STANDARD_PRESSURE = 1e5
# IUPAC elementary charge
ELEMENTARY_CHARGE = 1.602176634e-19
# IUPAC BOLTZMANN'S CONSTANT
K_BOLTZMANN = 1.380649e-23

AMINO_ACIDS = {
'Lysine': 'K',
Expand Down Expand Up @@ -146,7 +156,7 @@ def calculate_monoisotopic_mass(sequence):
mass_sequence = np.sum([AMINO_ACID_MASSES[amino_acid] * count for amino_acid, count in aa_counts.items()])
mass_modifics = np.sum([MODIFICATIONS_MZ_NUMERICAL[mod] * count for mod, count in mod_counts.items()])

return mass_sequence + mass_modifics + MASS_H2O
return mass_sequence + mass_modifics + MASS_WATER


def calculate_mz(monoisotopic_mass, charge):
Expand Down Expand Up @@ -174,3 +184,151 @@ def calculate_mz_from_sequence(sequence, charge):
float: The m/z of the sequence.
"""
return calculate_mz(calculate_monoisotopic_mass(sequence), charge)


def get_monoisotopic_token_weight(token:str):
"""
Gets monoisotopic weight of token

:param token: Token of aa sequence e.g. "<START>[UNIMOD:1]"
:type token: str
:return: Weight in Dalton.
:rtype: float
"""
splits = token.split("[")
for i in range(1, len(splits)):
splits[i] = "["+splits[i]

mass = 0
for split in splits:
mass += AMINO_ACID_MASSES[split]
return mass


def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
mass = 0
for token in sequence_tokenized:
mass += get_monoisotopic_token_weight(token)
return mass + MASS_WATER


def get_mass_over_charge(mass: float, charge: int) -> float:
return (mass / charge) + MASS_PROTON


def get_num_protonizable_sites(sequence: str) -> int:
"""
Gets number of sites that can be protonized.
This function does not yet account for PTMs.

:param sequence: Amino acid sequence
:type sequence: str
:return: Number of protonizable sites
:rtype: int
"""
sites = 1 # n-terminus
for s in sequence:
if s in ["H", "R", "K"]:
sites += 1
return sites


class ChemicalCompound:

def _calculate_molecular_mass(self):
mass = 0
for (atom, abundance) in self.element_composition.items():
mass += me.element(atom).atomic_weight * abundance
return mass

def __init__(self, formula):
self.element_composition = self.get_composition(formula)
self.mass = self._calculate_molecular_mass()

def get_composition(self, formula: str):
"""
Parse chemical formula into Dict[str:int] with
atoms as keys and the respective counts as values.

:param formula: Chemical formula of compound e.g. 'C6H12O6'
:type formula: str
:return: Dictionary Atom: Count
:rtype: Dict[str:int]
"""
if formula.startswith("("):
assert formula.endswith(")")
formula = formula[1:-1]

tmp_group = ""
tmp_group_count = ""
depth = 0
comp_list = []
comp_counts = []

# extract components: everything in brackets and atoms
# extract component counts: number behind component or 1
for (i, e) in enumerate(formula):
if e == "(":
depth += 1
if depth == 1:
if tmp_group != "":
comp_list.append(tmp_group)
tmp_group = ""
if tmp_group_count == "":
comp_counts.append(1)
else:
comp_counts.append(int(tmp_group_count))
tmp_group_count = ""
tmp_group += e
continue
if e == ")":
depth -= 1
tmp_group += e
continue
if depth > 0:
tmp_group += e
continue
if e.isupper():
if tmp_group != "":
comp_list.append(tmp_group)
tmp_group = ""
if tmp_group_count == "":
comp_counts.append(1)
else:
comp_counts.append(int(tmp_group_count))
tmp_group_count = ""
tmp_group += e
continue
if e.islower():
tmp_group += e
continue
if e.isnumeric():
tmp_group_count += e
if tmp_group != "":
comp_list.append(tmp_group)
if tmp_group_count == "":
comp_counts.append(1)
else:
comp_counts.append(int(tmp_group_count))

# assemble dictionary from component lists
atom_dict = {}
for (comp, count) in zip(comp_list, comp_counts):
if not comp.startswith("("):
atom_dict[comp] = count
else:
atom_dicts_depth = self.get_composition(comp)
for atom in atom_dicts_depth:
atom_dicts_depth[atom] *= count
if atom in atom_dict:
atom_dict[atom] += atom_dicts_depth[atom]
else:
atom_dict[atom] = atom_dicts_depth[atom]
atom_dicts_depth = {}
return atom_dict


class BufferGas(ChemicalCompound):

def __init__(self, formula: str):
super().__init__(formula)
4 changes: 2 additions & 2 deletions imspy/imspy/chemistry/mobility.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np

SUMMARY_CONSTANT = 18509.8632163405


def one_over_k0_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
"""
Expand All @@ -11,7 +13,6 @@ def one_over_k0_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_d
:param temp: temperature of the drift gas in C°
:param t_diff: factor to translate from C° to K
"""
SUMMARY_CONSTANT = 18509.8632163405
reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0)

Expand All @@ -26,6 +27,5 @@ def ccs_to_one_over_k0(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.
:param temp: temperature of the drift gas in C°
:param t_diff: factor to translate from C° to K
"""
SUMMARY_CONSTANT = 18509.8632163405
reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge)
1 change: 1 addition & 0 deletions imspy/imspy/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import numpy as np
import imspy_connector as pims

from imspy.core.spectrum import MzSpectrum, TimsSpectrum, IndexedMzSpectrum

from imspy.utility.utilities import re_index_indices
Expand Down
1 change: 1 addition & 0 deletions imspy/imspy/core/slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from imspy.utility.utilities import re_index_indices

import imspy_connector as pims

from imspy.core.frame import TimsFrame, TimsFrameVectorized
from imspy.core.spectrum import MzSpectrum, TimsSpectrum

Expand Down
Loading