diff --git a/imspy/examples/simulation/run_example_simulation.py b/imspy/examples/simulation/run_example_simulation.py
new file mode 100644
index 00000000..640b876e
--- /dev/null
+++ b/imspy/examples/simulation/run_example_simulation.py
@@ -0,0 +1,101 @@
+from imspy.simulation.experiment import LcImsMsMs
+from imspy.simulation.hardware_models import (NeuralChromatographyApex,
+                                              NormalChromatographyProfileModel,
+                                              LiquidChromatography,
+                                              ElectroSpray,
+                                              TrappedIon,
+                                              TOF,
+                                              NeuralIonMobilityApex,
+                                              NormalIonMobilityProfileModel,
+                                              AveragineModel,
+                                              BinomialIonSource
+                                              )
+from imspy.proteome import ProteinSample, Trypsin, ORGANISM
+from imspy.chemistry.mass import BufferGas
+
+import pandas as pd
+import numpy as np
+
+
+def irt_to_rt(irt):
+    return irt
+
+
+def scan_im_interval(scan_id):
+    intercept = 1451.357
+    slope = -877.361
+    scan_id = np.atleast_1d(scan_id)
+    lower = (scan_id - intercept) / slope
+    upper = ((scan_id + 1) - intercept) / slope
+    return np.stack([1 / lower, 1 / upper], axis=1)
+
+
+def im_to_scan(reduced_ion_mobility):
+    intercept = 1451.357
+    slope = -877.361
+    # TODO: find a more appropriate function here?
+    one_over_k0 = 1 / reduced_ion_mobility
+    return np.round(one_over_k0 * slope + intercept).astype(np.int16)
+
+
+def build_experiment():
+    t = LcImsMsMs("./timstofexp1_binomial_ion_source_21_7/")  # maybe rather call this class LCIMSMSExperiment
+
+    lc = LiquidChromatography()
+    lc.frame_length = 1200  # ms
+    lc.gradient_length = 120  # min
+    esi = ElectroSpray()
+    tims = TrappedIon()
+    tims.scan_time = 110  # us
+    tof_mz = TOF()
+
+    t.lc_method = lc
+    t.ionization_method = esi
+    t.ion_mobility_separation_method = tims
+    t.mz_separation_method = tof_mz
+
+    N2 = BufferGas("N2")
+
+    tokenizer_path = '/home/tim/Workspaces/ionmob/pretrained-models/tokenizers/tokenizer.json'
+
+    rt_model_weights = "/home/tim/Workspaces/Resources/models/DeepChromatograpy/"
+    t.lc_method.apex_model = NeuralChromatographyApex(rt_model_weights, tokenizer_path=tokenizer_path)
+
+    t.lc_method.profile_model = NormalChromatographyProfileModel()
+    t.lc_method.irt_to_rt_converter = irt_to_rt
+
+    im_model_weights = "/home/tim/Workspaces/ionmob/pretrained-models/GRUPredictor"
+    t.ion_mobility_separation_method.apex_model = NeuralIonMobilityApex(im_model_weights, tokenizer_path=tokenizer_path)
+
+    t.ion_mobility_separation_method.profile_model = NormalIonMobilityProfileModel()
+    t.ion_mobility_separation_method.buffer_gas = N2
+    t.ion_mobility_separation_method.scan_to_reduced_im_interval_converter = scan_im_interval
+    t.ion_mobility_separation_method.reduced_im_to_scan_converter = im_to_scan
+
+    t.ionization_method.ionization_model = BinomialIonSource()
+
+    t.mz_separation_method.model = AveragineModel()
+
+    rng = np.random.default_rng(2023)
+    # read proteome
+    proteome = pd.read_feather('/home/tim/Workspaces/Resources/Homo-sapiens-proteome.feather')
+    random_abundances = rng.integers(1e3, 1e7, size=proteome.shape[0])
+    proteome = proteome.assign(abundancy=random_abundances)
+    # create sample and sample digest; TODO: add missed cleavages to ENZYMEs
+    sample = ProteinSample(proteome, ORGANISM.HOMO_SAPIENS)
+    sample_digest = sample.digest(Trypsin())
+
+    # down-sample the digest to reduce computational load in this example
+    sample_digest.data = sample_digest.data.sample(100, random_state=rng)
+
+    t.load_sample(sample_digest)
+    return t
+
+
+if __name__ == "__main__":
+
+    t = build_experiment()
+
+    # cProfile.run("t.run(10000)", filename="profiler_10000_8_process", sort="cumtime")
+    t.run(100, frames_per_assemble_process=10)
\ No newline at end of file
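# --- Example (not part of the patch): sanity check of the linear scan <-> 1/K0
# calibration hard-coded in run_example_simulation.py above. The intercept and
# slope are the same values used there; the round trip should reproduce the
# scan ids exactly after rounding.
import numpy as np

intercept, slope = 1451.357, -877.361

def scan_to_one_over_k0(scan_id):
    return (scan_id - intercept) / slope

def one_over_k0_to_scan(one_over_k0):
    return np.round(one_over_k0 * slope + intercept).astype(int)

scans = np.array([100, 400, 700])
assert np.all(one_over_k0_to_scan(scan_to_one_over_k0(scans)) == scans)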
diff --git a/imspy/imspy/chemistry/mass.py b/imspy/imspy/chemistry/mass.py
index 1cba89da..a1819638 100644
--- a/imspy/imspy/chemistry/mass.py
+++ b/imspy/imspy/chemistry/mass.py
@@ -1,11 +1,21 @@
 import re
 from collections import defaultdict
 import numpy as np
+import mendeleev as me
 
 MASS_PROTON = 1.007276466583
 MASS_NEUTRON = 1.00866491597
 MASS_ELECTRON = 0.00054857990946
-MASS_H2O = 18.0105646863
+MASS_WATER = 18.0105646863
+
+# IUPAC standard temperature in Kelvin
+STANDARD_TEMPERATURE = 273.15
+# IUPAC standard pressure in Pa
+STANDARD_PRESSURE = 1e5
+# elementary charge in C
+ELEMENTARY_CHARGE = 1.602176634e-19
+# Boltzmann constant in J/K
+K_BOLTZMANN = 1.380649e-23
 
 AMINO_ACIDS = {
     'Lysine': 'K',
@@ -146,7 +156,7 @@ def calculate_monoisotopic_mass(sequence):
     mass_sequence = np.sum([AMINO_ACID_MASSES[amino_acid] * count for amino_acid, count in aa_counts.items()])
     mass_modifics = np.sum([MODIFICATIONS_MZ_NUMERICAL[mod] * count for mod, count in mod_counts.items()])
 
-    return mass_sequence + mass_modifics + MASS_H2O
+    return mass_sequence + mass_modifics + MASS_WATER
 
 
 def calculate_mz(monoisotopic_mass, charge):
@@ -174,3 +184,151 @@ def calculate_mz_from_sequence(sequence, charge):
         float: The m/z of the sequence.
     """
     return calculate_mz(calculate_monoisotopic_mass(sequence), charge)
+
+
+def get_monoisotopic_token_weight(token: str):
+    """
+    Gets the monoisotopic weight of a token.
+
+    :param token: Token of an amino acid sequence, e.g. "[UNIMOD:1]"
+    :type token: str
+    :return: Weight in Dalton.
+    :rtype: float
+    """
+    splits = token.split("[")
+    for i in range(1, len(splits)):
+        splits[i] = "[" + splits[i]
+
+    mass = 0
+    for split in splits:
+        mass += AMINO_ACID_MASSES[split]
+    return mass
+
+
+def get_mono_isotopic_weight(sequence_tokenized: list[str]) -> float:
+    mass = 0
+    for token in sequence_tokenized:
+        mass += get_monoisotopic_token_weight(token)
+    return mass + MASS_WATER
+
+
+def get_mass_over_charge(mass: float, charge: int) -> float:
+    return (mass / charge) + MASS_PROTON
+
+
+def get_num_protonizable_sites(sequence: str) -> int:
+    """
+    Gets the number of sites that can be protonated.
+    This function does not yet account for PTMs.
+
+    :param sequence: Amino acid sequence
+    :type sequence: str
+    :return: Number of protonizable sites
+    :rtype: int
+    """
+    sites = 1  # n-terminus
+    for s in sequence:
+        if s in ["H", "R", "K"]:
+            sites += 1
+    return sites
+
+
+class ChemicalCompound:
+
+    def _calculate_molecular_mass(self):
+        mass = 0
+        for (atom, abundance) in self.element_composition.items():
+            mass += me.element(atom).atomic_weight * abundance
+        return mass
+
+    def __init__(self, formula):
+        self.element_composition = self.get_composition(formula)
+        self.mass = self._calculate_molecular_mass()
+
+    def get_composition(self, formula: str):
+        """
+        Parse a chemical formula into a Dict[str, int] with
+        atoms as keys and the respective counts as values.
+
+        :param formula: Chemical formula of compound, e.g. 'C6H12O6'
+        :type formula: str
+        :return: Dictionary Atom: Count
+        :rtype: Dict[str, int]
+        """
+        if formula.startswith("("):
+            assert formula.endswith(")")
+            formula = formula[1:-1]
+
+        tmp_group = ""
+        tmp_group_count = ""
+        depth = 0
+        comp_list = []
+        comp_counts = []
+
+        # extract components: everything in brackets and atoms
+        # extract component counts: number behind component or 1
+        for (i, e) in enumerate(formula):
+            if e == "(":
+                depth += 1
+                if depth == 1:
+                    if tmp_group != "":
+                        comp_list.append(tmp_group)
+                        tmp_group = ""
+                        if tmp_group_count == "":
+                            comp_counts.append(1)
+                        else:
+                            comp_counts.append(int(tmp_group_count))
+                        tmp_group_count = ""
+                tmp_group += e
+                continue
+            if e == ")":
+                depth -= 1
+                tmp_group += e
+                continue
+            if depth > 0:
+                tmp_group += e
+                continue
+            if e.isupper():
+                if tmp_group != "":
+                    comp_list.append(tmp_group)
+                    tmp_group = ""
+                    if tmp_group_count == "":
+                        comp_counts.append(1)
+                    else:
+                        comp_counts.append(int(tmp_group_count))
+                    tmp_group_count = ""
+                tmp_group += e
+                continue
+            if e.islower():
+                tmp_group += e
+                continue
+            if e.isnumeric():
+                tmp_group_count += e
+        if tmp_group != "":
+            comp_list.append(tmp_group)
+            if tmp_group_count == "":
+                comp_counts.append(1)
+            else:
+                comp_counts.append(int(tmp_group_count))
+
+        # assemble dictionary from component lists
+        atom_dict = {}
+        for (comp, count) in zip(comp_list, comp_counts):
+            if not comp.startswith("("):
+                atom_dict[comp] = count
+            else:
+                atom_dicts_depth = self.get_composition(comp)
+                for atom in atom_dicts_depth:
+                    atom_dicts_depth[atom] *= count
+                    if atom in atom_dict:
+                        atom_dict[atom] += atom_dicts_depth[atom]
+                    else:
+                        atom_dict[atom] = atom_dicts_depth[atom]
+                atom_dicts_depth = {}
+        return atom_dict
+
+
+class BufferGas(ChemicalCompound):
+
+    def __init__(self, formula: str):
+        super().__init__(formula)
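# --- Example (not part of the patch): parsing formulas with the new
# ChemicalCompound class. Assumes the `mendeleev` package is installed;
# bracketed groups are expanded recursively by get_composition.
from imspy.chemistry.mass import BufferGas, ChemicalCompound

glucose = ChemicalCompound("C6H12O6")
print(glucose.element_composition)      # {'C': 6, 'H': 12, 'O': 6}

slaked_lime = ChemicalCompound("Ca(OH)2")
print(slaked_lime.element_composition)  # {'Ca': 1, 'O': 2, 'H': 2}

n2 = BufferGas("N2")                    # buffer gas used by the IMS simulation
print(round(n2.mass, 2))                # ~28.01, average mass from atomic weights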
diff --git a/imspy/imspy/chemistry/mobility.py b/imspy/imspy/chemistry/mobility.py
index 796f15c8..d668d527 100644
--- a/imspy/imspy/chemistry/mobility.py
+++ b/imspy/imspy/chemistry/mobility.py
@@ -1,5 +1,7 @@
 import numpy as np
 
+SUMMARY_CONSTANT = 18509.8632163405
+
 
 def one_over_k0_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
     """
@@ -11,7 +13,6 @@ def one_over_k0_to_ccs(one_over_k0, mz, charge, mass_gas=28.013, temp=31.85, t_d
     :param temp: temperature of the drift gas in C°
     :param t_diff: factor to translate from C° to K
     """
-    SUMMARY_CONSTANT = 18509.8632163405
    reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
    return (SUMMARY_CONSTANT * charge) / (np.sqrt(reduced_mass * (temp + t_diff)) * 1 / one_over_k0)
 
@@ -26,6 +27,5 @@ def ccs_to_one_over_k0(ccs, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.
     :param temp: temperature of the drift gas in C°
     :param t_diff: factor to translate from C° to K
     """
-    SUMMARY_CONSTANT = 18509.8632163405
    reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas)
    return ((np.sqrt(reduced_mass * (temp + t_diff))) * ccs) / (SUMMARY_CONSTANT * charge)
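# --- Example (not part of the patch): the two conversions are exact algebraic
# inverses, so hoisting SUMMARY_CONSTANT to module level must not change the
# round trip. Values below are an arbitrary doubly charged ion with the N2
# drift-gas defaults.
from imspy.chemistry.mobility import ccs_to_one_over_k0, one_over_k0_to_ccs

ccs = one_over_k0_to_ccs(one_over_k0=1.2, mz=650.0, charge=2)
assert abs(ccs_to_one_over_k0(ccs, mz=650.0, charge=2) - 1.2) < 1e-9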
+ + Returns: + MzSpectrum: Scaled spectrum + """ + tmp: pims.PyMzSpectrum = self.__spec_ptr * scale + return self.from_py_mz_spectrum(tmp) def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5, - min_intensity: float = 1) -> Tuple[NDArray, List['MzSpectrum']]: + min_intensity: float = 1) -> Tuple[NDArray, List[MzSpectrum]]: """Convert the spectrum to a list of windows. Args: @@ -178,8 +229,20 @@ def to_windows(self, window_length: float = 10, overlapping: bool = True, min_nu indices, windows = self.__spec_ptr.to_windows(window_length, overlapping, min_num_peaks, min_intensity) return indices, [MzSpectrum.from_py_mz_spectrum(window) for window in windows] + def to_resolution(self, resolution: int) -> MzSpectrum: + """Bins the spectrum's m/z values to a + given resolution and sums the intensities. + + Args: + resolution (int): Negative decadic logarithm of bin size. + + Returns: + MzSpectrum: A new `MzSpectrum` where m/z values are binned according to the given resolution. + """ + return MzSpectrum.from_py_mz_spectrum(self.__spec_ptr.to_resolution(resolution)) + def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: float = 0.0, - intensity_max: float = 1e9) -> 'MzSpectrum': + intensity_max: float = 1e9) -> MzSpectrum: """Filter the spectrum for a given m/z range and intensity range. Args: @@ -194,7 +257,7 @@ def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: flo return MzSpectrum.from_py_mz_spectrum( self.__spec_ptr.filter_ranged(mz_min, mz_max, intensity_min, intensity_max)) - def vectorized(self, resolution: int = 2) -> 'MzSpectrumVectorized': + def vectorized(self, resolution: int = 2) -> MzSpectrumVectorized: """Convert the spectrum to a vectorized spectrum. Args: @@ -204,8 +267,18 @@ def vectorized(self, resolution: int = 2) -> 'MzSpectrumVectorized': MzSpectrumVectorized: Vectorized spectrum. """ return MzSpectrumVectorized.from_py_mz_spectrum_vectorized(self.__spec_ptr.vectorized(resolution)) + + def to_jsons(self) -> str: + """ + generates json string representation of MzSpectrum + """ + json_dict = {} + json_dict["mz"] = self.mz.tolist() + json_dict["intensity"] = self.intensity.tolist() + return json.dumps(json_dict) + class MzSpectrumVectorized: def __init__(self, indices: NDArray[np.int32], values: NDArray[np.float64], resolution: int): """MzSpectrum class. @@ -237,7 +310,7 @@ def from_py_mz_spectrum_vectorized(cls, spec: pims.PyMzSpectrumVectorized): @property def resolution(self) -> float: """Resolution. - + Returns: float: Resolution. """ @@ -261,6 +334,21 @@ def values(self) -> NDArray[np.float64]: """ return self.__spec_ptr.values + def to_centroided(self, integrate_method: Callable = get_peak_integral) -> MzSpectrum: + """Convert the spectrum to a centroided spectrum. + + Returns: + MzSpectrum: Centroided spectrum. 
+ """ + # first generate dense spectrum + dense_spectrum = MzSpectrumVectorized.from_py_mz_spectrum_vectorized(self.__spec_ptr.to_dense_spectrum(None)) + # find peaks in the dense spectrum and widths with scipy + peaks, peak_info = find_peaks(dense_spectrum.values, height=0, width=(0,0.5)) + # then get the peak integrals + integrals = integrate_method(peaks, peak_info) + # then create a new spectrum with the peak indices and the integrals + return MzSpectrum.from_py_mz_spectrum(pims.PyMzSpectrum(dense_spectrum.indices[peaks]/np.power(10,dense_spectrum.resolution), integrals)) + def __repr__(self): return f"MzSpectrumVectorized(num_values={len(self.values)})" diff --git a/imspy/imspy/feature.py b/imspy/imspy/feature.py new file mode 100644 index 00000000..53ab596f --- /dev/null +++ b/imspy/imspy/feature.py @@ -0,0 +1,211 @@ +import pandas as pd +import numpy as np +from numpy.typing import ArrayLike + +import json + +from imspy.data import TimsSlice, TimsFrame +from imspy.utility import gaussian, exp_gaussian +from imspy.isotopes import IsotopePatternGenerator, create_initial_feature_distribution +from abc import ABC, abstractmethod +from typing import Optional, Dict +class Profile: + + def __init__(self,positions:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + if jsons is not None: + self._jsons = jsons + self.positions, self.rel_abundancies, self.model_params, self.access_dictionary = self._from_jsons(jsons) + else: + self.positions = np.asarray(positions) + self.rel_abundancies = np.asarray(rel_abundancies) + self.model_params = model_params + self.access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + self._jsons = self._to_jsons() + + def __iter__(self): + self.__size = len(self.positions) + self.__iterator_pos = 0 + return self + + def __next__(self): + if self.__iterator_pos < self.__size: + p = self.positions[self.__iterator_pos] + ra = self.rel_abundancies[self.__iterator_pos] + self.__iterator_pos += 1 + return p,ra + raise StopIteration + + def __getitem__(self, position: int): + return self.access_dictionary[position] + + def _to_jsons(self): + mp = {} + for key,val in self.model_params.items(): + if isinstance(val,np.generic): + mp[key] = val.item() + else: + mp[key] = val + + json_dict = {"positions": self.positions.tolist(), + "rel_abundancies": self.rel_abundancies.tolist(), + "model_params": mp} + return json.dumps(json_dict, allow_nan = False) + + def _from_jsons(self, jsons:str): + json_dict = json.loads(jsons) + positions = json_dict["positions"] + rel_abundancies = json_dict["rel_abundancies"] + access_dictionary = {p:i for p,i in zip(positions, rel_abundancies)} + return positions,rel_abundancies,json_dict["model_params"], access_dictionary + + @property + def jsons(self): + return self._jsons + +class RTProfile(Profile): + + def __init__(self,frames:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(frames, rel_abundancies, model_params, jsons) + + @property + def frames(self): + return self.positions + +class ScanProfile(Profile): + + def __init__(self,scans:Optional[ArrayLike] = None, rel_abundancies:Optional[ArrayLike] = None, model_params: Optional[Dict] = None, jsons:Optional[str] = None): + super().__init__(scans, rel_abundancies, model_params, jsons) + + @property + def scans(self): + return self.positions + +class ChargeProfile(Profile): + def 
diff --git a/imspy/imspy/feature.py b/imspy/imspy/feature.py
new file mode 100644
index 00000000..53ab596f
--- /dev/null
+++ b/imspy/imspy/feature.py
@@ -0,0 +1,211 @@
+import pandas as pd
+import numpy as np
+from numpy.typing import ArrayLike
+
+import json
+
+from imspy.data import TimsSlice, TimsFrame
+from imspy.utility import gaussian, exp_gaussian
+from imspy.isotopes import IsotopePatternGenerator, create_initial_feature_distribution
+from abc import ABC, abstractmethod
+from typing import Optional, Dict
+
+
+class Profile:
+
+    def __init__(self, positions: Optional[ArrayLike] = None, rel_abundancies: Optional[ArrayLike] = None,
+                 model_params: Optional[Dict] = None, jsons: Optional[str] = None):
+        if jsons is not None:
+            self._jsons = jsons
+            self.positions, self.rel_abundancies, self.model_params, self.access_dictionary = self._from_jsons(jsons)
+        else:
+            self.positions = np.asarray(positions)
+            self.rel_abundancies = np.asarray(rel_abundancies)
+            self.model_params = model_params
+            self.access_dictionary = {p: i for p, i in zip(positions, rel_abundancies)}
+            self._jsons = self._to_jsons()
+
+    def __iter__(self):
+        self.__size = len(self.positions)
+        self.__iterator_pos = 0
+        return self
+
+    def __next__(self):
+        if self.__iterator_pos < self.__size:
+            p = self.positions[self.__iterator_pos]
+            ra = self.rel_abundancies[self.__iterator_pos]
+            self.__iterator_pos += 1
+            return p, ra
+        raise StopIteration
+
+    def __getitem__(self, position: int):
+        return self.access_dictionary[position]
+
+    def _to_jsons(self):
+        mp = {}
+        for key, val in self.model_params.items():
+            if isinstance(val, np.generic):
+                mp[key] = val.item()
+            else:
+                mp[key] = val
+
+        json_dict = {"positions": self.positions.tolist(),
+                     "rel_abundancies": self.rel_abundancies.tolist(),
+                     "model_params": mp}
+        return json.dumps(json_dict, allow_nan=False)
+
+    def _from_jsons(self, jsons: str):
+        json_dict = json.loads(jsons)
+        positions = json_dict["positions"]
+        rel_abundancies = json_dict["rel_abundancies"]
+        access_dictionary = {p: i for p, i in zip(positions, rel_abundancies)}
+        return positions, rel_abundancies, json_dict["model_params"], access_dictionary
+
+    @property
+    def jsons(self):
+        return self._jsons
+
+
+class RTProfile(Profile):
+
+    def __init__(self, frames: Optional[ArrayLike] = None, rel_abundancies: Optional[ArrayLike] = None,
+                 model_params: Optional[Dict] = None, jsons: Optional[str] = None):
+        super().__init__(frames, rel_abundancies, model_params, jsons)
+
+    @property
+    def frames(self):
+        return self.positions
+
+
+class ScanProfile(Profile):
+
+    def __init__(self, scans: Optional[ArrayLike] = None, rel_abundancies: Optional[ArrayLike] = None,
+                 model_params: Optional[Dict] = None, jsons: Optional[str] = None):
+        super().__init__(scans, rel_abundancies, model_params, jsons)
+
+    @property
+    def scans(self):
+        return self.positions
+
+
+class ChargeProfile(Profile):
+    def __init__(self, charges: Optional[ArrayLike] = None, rel_abundancies: Optional[ArrayLike] = None,
+                 model_params: Optional[Dict] = None, jsons: Optional[str] = None):
+        # guard the JSON path: charges is None when constructing from `jsons`
+        if jsons is not None:
+            super().__init__(None, None, model_params, jsons)
+        else:
+            charges = np.asarray(charges)
+            rel_abundancies = np.asarray(rel_abundancies)
+            abundant_charges = charges[rel_abundancies > 0]
+            relative_abundancies_over_0 = rel_abundancies[rel_abundancies > 0]
+            super().__init__(abundant_charges, relative_abundancies_over_0, model_params, jsons)
+
+    @property
+    def charges(self):
+        return self.positions
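# --- Example (not part of the patch): Profile subclasses behave like small
# position -> relative-abundance mappings and serialize to JSON for the
# experiment database.
from imspy.feature import ScanProfile

p = ScanProfile(scans=[10, 11, 12], rel_abundancies=[0.2, 0.5, 0.3],
                model_params={"name": "Normal", "mu": 11.0, "sigma": 0.8})
print(p[11])              # 0.5 -> dict-style access by position
for scan, ra in p:        # iterable as (position, abundance) pairs
    print(scan, ra)
restored = ScanProfile(jsons=p.jsons)  # JSON round trip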
+
+
+class FeatureGenerator(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def generate_feature(self, mz: float, charge: int):
+        pass
+
+
+class PrecursorFeatureGenerator(FeatureGenerator):
+
+    def __init__(self):
+        super().__init__()
+
+    def generate_feature(self, mz: float, charge: int,
+                         pattern_generator: IsotopePatternGenerator,
+                         num_rt: int = 64,
+                         num_im: int = 32,
+                         distr_im=gaussian,
+                         distr_rt=exp_gaussian,
+                         rt_lower: float = -9,
+                         rt_upper: float = 18,
+                         im_lower: float = -4,
+                         im_upper: float = 4,
+                         intensity_amplitude: float = 1e3,
+                         min_intensity: int = 5) -> TimsSlice:
+
+        I = create_initial_feature_distribution(num_rt, num_im, rt_lower, rt_upper,
+                                                im_lower, im_upper, distr_im, distr_rt)
+
+        frame_list = []
+
+        spec = pattern_generator.generate_spectrum(mz, charge, min_intensity=5, centroided=True, k=12)
+        mz, intensity = spec.mz, spec.intensity / np.max(spec.intensity) * 100
+
+        for i in range(num_rt):
+
+            scan_arr, mz_arr, intensity_arr, tof_arr, inv_mob_arr = [], [], [], [], []
+
+            for j in range(num_im):
+                intensity_scaled = intensity * I[i, j]
+                intensity_scaled = intensity_scaled * intensity_amplitude
+                int_intensity = np.array([int(x) for x in intensity_scaled])
+
+                bt = [(x, y) for x, y in zip(mz, int_intensity) if y >= min_intensity]
+
+                mz_tmp = np.array([x for x, y in bt])
+                intensity_tmp = np.array([y for x, y in bt]).astype(np.int32)
+
+                rts = np.repeat(i, intensity_tmp.shape[0]).astype(np.float32)
+                scans = np.repeat(j, intensity_tmp.shape[0]).astype(np.int32)
+                tof = np.ones_like(rts).astype(np.int32)
+                inv_mob = np.ones_like(scans)
+
+                scan_arr.append(scans)
+                mz_arr.append(mz_tmp)
+                intensity_arr.append(intensity_tmp)
+                tof_arr.append(tof)
+                inv_mob_arr.append(inv_mob)
+
+            frame = TimsFrame(None, i, float(i),
+                              np.hstack(scan_arr), np.hstack(mz_arr), np.hstack(intensity_arr),
+                              np.hstack(tof_arr), np.hstack(inv_mob_arr))
+
+            frame_list.append(frame)
+
+        return TimsSlice(None, frame_list, [])
+
+
+class FeatureBatch:
+    def __init__(self, feature_table: pd.DataFrame, raw_data: TimsSlice):
+        """
+        :param feature_table: pandas DataFrame of detected features
+        :param raw_data: TimsSlice holding the raw data the features were detected in
+        """
+        self.feature_table = feature_table.sort_values(['rt_length'], ascending=False)
+        self.raw_data = raw_data
+        self.__feature_counter = 0
+        self.current_row = self.feature_table.iloc[0]
+        r = self.current_row
+        self.current_feature = self.get_feature(r.mz - 0.1, r.mz + (r.num_peaks / r.charge) + 0.1,
+                                                r.im_start,
+                                                r.im_stop,
+                                                r.rt_start,
+                                                r.rt_stop)
+
+    def __repr__(self):
+        return f'FeatureBatch(features={self.feature_table.shape[0]}, slice={self.raw_data})'
+
+    def get_feature(self, mz_min, mz_max, scan_min, scan_max, rt_min, rt_max, intensity_min=20):
+        return self.raw_data.filter_ranged(mz_min=mz_min,
+                                           mz_max=mz_max,
+                                           scan_min=scan_min,
+                                           scan_max=scan_max,
+                                           intensity_min=intensity_min,
+                                           rt_min=rt_min,
+                                           rt_max=rt_max)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        # guard against running past the table; raise StopIteration instead of IndexError
+        if self.__feature_counter >= self.feature_table.shape[0]:
+            raise StopIteration
+
+        feature = self.current_row
+        data = self.current_feature
+
+        self.__feature_counter += 1
+        if self.__feature_counter < self.feature_table.shape[0]:
+            self.current_row = self.feature_table.iloc[self.__feature_counter]
+            r = self.current_row
+            self.current_feature = self.get_feature(r.mz - 0.1,
+                                                    r.mz + (r.num_peaks / r.charge) + 0.1,
+                                                    r.im_start,
+                                                    r.im_stop,
+                                                    r.rt_start,
+                                                    r.rt_stop)
+        return feature, data
diff --git a/imspy/imspy/isotopes.py b/imspy/imspy/isotopes.py
new file mode 100644
index 00000000..da75dae2
--- /dev/null
+++ b/imspy/imspy/isotopes.py
@@ -0,0 +1,242 @@
+from __future__ import annotations
+from typing import Optional, Tuple
+import warnings
+import numpy as np
+from numpy.typing import ArrayLike
+from abc import ABC, abstractmethod
+
+from scipy.signal import argrelextrema
+from imspy.data import MzSpectrum
+from imspy.utility import gaussian, exp_gaussian, normal_pdf
+import numba
+import pyopenms
+
+from imspy.noise import detection_noise
+
+MASS_PROTON = 1.007276466621
+MASS_NEUTRON = 1.00866491595
+
+
+@numba.jit(nopython=True)
+def factorial(n: int):
+    if n <= 0:
+        return 1
+    return n * factorial(n - 1)
+
+
+# linear dependency of mass
+@numba.jit(nopython=True)
+def lam(mass: float, slope: float = 0.000594, intercept: float = -0.03091):
+    """
+    :param mass: peptide mass
+    :param slope: slope of the linear model
+    :param intercept: intercept of the linear model
+    :return: lambda, the Poisson parameter of the isotope distribution
+    """
+    return slope * mass + intercept
+
+
+@numba.jit(nopython=True)
+def weight(mass: float, peak_nums: ArrayLike, normalize: bool = True):
+    """
+    :param mass: peptide mass
+    :param peak_nums: isotopic peak numbers (0 = monoisotopic peak)
+    :param normalize: whether to normalize the weights to sum 1
+    :return: relative intensities of the isotopic peaks
+    """
+    factorials = np.zeros_like(peak_nums)
+    norm = 1
+    for i, k in enumerate(peak_nums):
+        factorials[i] = factorial(k)
+    weights = np.exp(-lam(mass)) * np.power(lam(mass), peak_nums) / factorials
+    if normalize:
+        norm = weights.sum()
+    return weights / norm
+
+
+def get_pyopenms_weights(sequence: str, peak_nums: ArrayLike, generator: pyopenms.CoarseIsotopePatternGenerator):
+    """
+    Gets hypothetical intensities of isotopic peaks.
+
+    :param sequence: Peptide sequence in ProForma format.
+    :type sequence: str
+    :param generator: pyopenms generator for isotopic pattern calculation
+    :type generator: pyopenms.CoarseIsotopePatternGenerator
+    :return: List of isotopic peak intensities
+    :rtype: List
+    """
+    n = peak_nums.shape[0]
+    generator.setMaxIsotope(n)
+    aa_sequence = sequence.strip("_")
+    peptide = pyopenms.AASequence().fromString(aa_sequence.replace("[", "(").replace("]", ")"))
+    formula = peptide.getFormula()
+    distribution = generator.run(formula)
+    intensities = [i.getIntensity() for i in distribution.getContainer()]
+    return intensities
+
+
+@numba.jit(nopython=True)
+def iso(x: ArrayLike, mass: float, charge: float, sigma: float, amp: float, K: int, step_size: float,
+        add_detection_noise: bool = False, mass_neutron: float = MASS_NEUTRON):
+    """
+    :param x: m/z grid to evaluate the pattern on
+    :param mass: peptide mass
+    :param charge: charge state
+    :param sigma: peak width
+    :param amp: amplitude
+    :param K: number of isotopic peaks
+    :param step_size: integration step size
+    :param add_detection_noise: whether to add detection noise
+    :param mass_neutron: mass of a neutron
+    :return: intensities on the m/z grid
+    """
+    k = np.arange(K)
+    means = ((mass + mass_neutron * k) / charge).reshape((1, -1))
+    weights = weight(mass, k).reshape((1, -1))
+    intensities = np.sum(weights * normal_pdf(x.reshape((-1, 1)), means, sigma), axis=1) * step_size
+    if add_detection_noise:
+        return detection_noise(intensities * amp)
+    else:
+        return intensities * amp
+
+
+@numba.jit(nopython=True)
+def numba_generate_pattern(lower_bound: float,
+                           upper_bound: float,
+                           mass: float,
+                           charge: float,
+                           amp: float,
+                           k: int,
+                           sigma: float = 0.008492569002123142,
+                           resolution: int = 3):
+    """
+    :param lower_bound: lower m/z bound of the pattern
+    :param upper_bound: upper m/z bound of the pattern
+    :param mass: peptide mass
+    :param charge: charge state
+    :param amp: amplitude
+    :param k: number of isotopic peaks
+    :param sigma: peak width
+    :param resolution: negative decadic logarithm of the grid spacing
+    :return: (mz, intensities)
+    """
+    step_size = min(sigma / 10, 1 / np.power(10, resolution))
+    size = int((upper_bound - lower_bound) // step_size + 1)
+    mzs = np.linspace(lower_bound, upper_bound, size)
+    intensities = iso(mzs, mass, charge, sigma, amp, k, step_size)
+
+    return mzs + MASS_PROTON, intensities
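# --- Example (not part of the patch): the averagine-style Poisson envelope used
# by iso() and numba_generate_pattern(). lam() grows linearly with mass, so
# heavier peptides spread intensity over more isotopic peaks. The first call
# triggers numba compilation.
import numpy as np
from imspy.isotopes import lam, weight

mass = 2000.0
print(round(lam(mass), 3))         # ~1.157, expected number of extra neutrons
print(weight(mass, np.arange(5)))  # normalized intensities of peaks 0..4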
+
+#@numba.jit(nopython=True)
+def numba_ion_sampler(mass: float, charge: int, sigma: ArrayLike, k: int, ion_count: int, intensity_per_ion: int):
+    sigma = np.atleast_1d(sigma)
+    if sigma.size == 1:
+        sigma = np.repeat(sigma, k)
+    if sigma.size != k:
+        raise ValueError("Sigma must be either length 1 or k")
+
+    us = np.zeros(ion_count)
+    devs_std = np.zeros(ion_count)
+    for ion in range(ion_count):
+        u = np.random.uniform()
+        dev_std = np.random.normal()
+        us[ion] = u
+        devs_std[ion] = dev_std
+
+    weights = weight(mass, np.arange(k)).cumsum()
+
+    def _get_component(u: float, weights_cum_sum: ArrayLike = weights) -> int:
+        for idx, weight_sum in enumerate(weights_cum_sum):
+            if u < weight_sum:
+                return idx
+
+    comps = np.zeros_like(us)
+    devs = np.zeros_like(us)
+    for idx, u in enumerate(us):
+        comp = _get_component(u)
+        comps[idx] = comp
+        devs[idx] = sigma[comp] * devs_std[idx]
+
+    mz = ((comps * MASS_NEUTRON) + mass) / charge + MASS_PROTON + devs
+    i = np.ones_like(mz) * intensity_per_ion
+    return (mz, i)
+
+
+@numba.jit
+def create_initial_feature_distribution(num_rt: int, num_im: int,
+                                        rt_lower: float = -9,
+                                        rt_upper: float = 18,
+                                        im_lower: float = -4,
+                                        im_upper: float = 4,
+                                        distr_im=gaussian,
+                                        distr_rt=exp_gaussian) -> np.array:
+
+    I = np.ones((num_rt, num_im)).astype(np.float32)
+
+    for i, x in enumerate(np.linspace(im_lower, im_upper, num_im)):
+        for j, y in enumerate(np.linspace(rt_lower, rt_upper, num_rt)):
+            I[j, i] *= (distr_im(x) * distr_rt(y))
+
+    return I
+
+
+class IsotopePatternGenerator(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def generate_pattern(self, mass: float, charge: int) -> Tuple[ArrayLike, ArrayLike]:
+        pass
+
+    @abstractmethod
+    def generate_spectrum(self, mass: int, charge: int, min_intensity: int) -> MzSpectrum:
+        pass
+
+
+class AveragineGenerator(IsotopePatternGenerator):
+    def __init__(self):
+        super().__init__()
+        self.default_abundancy = 1e4
+
+    def generate_pattern(self, mass: float, charge: int, k: int = 7,
+                         amp: Optional[float] = None, resolution: float = 3,
+                         min_intensity: int = 5) -> Tuple[ArrayLike, ArrayLike]:
+        pass
+
+    def generate_spectrum(self, mass: int, charge: int, min_intensity: int = 5, k: int = 7,
+                          amp: Optional[float] = None, resolution: float = 3, centroided: bool = True) -> MzSpectrum:
+        if amp is None:
+            amp = self.default_abundancy
+
+        lb = mass / charge - .2
+        ub = mass / charge + k + .2
+
+        mz, i = numba_generate_pattern(lower_bound=lb, upper_bound=ub,
+                                       mass=mass, charge=charge, amp=amp, k=k, resolution=resolution)
+        # TODO: implement centroid method
+        if centroided:
+            return MzSpectrum(mz, i)\
+                .to_resolution(resolution).filter(lb, ub, min_intensity)\
+                .to_centroided(baseline_noise_level=max(min_intensity, 1), sigma=1/np.power(10, resolution-1))
+
+        return MzSpectrum(mz, i).to_resolution(resolution).filter(lb, ub, min_intensity)
+
+    def generate_spectrum_by_sampling(self,
+                                      mass: float,
+                                      charge: int,
+                                      k: int = 7,
+                                      sigma: ArrayLike = 0.008492569002123142,
+                                      ion_count: int = 1000000,
+                                      resolution: float = 3,
+                                      intensity_per_ion: int = 1,
+                                      centroided=True,
+                                      min_intensity=5) -> MzSpectrum:
+
+        assert 100 <= mass / charge <= 2000, f"m/z should be between 100 and 2000, was: {mass / charge}"
+        mz, i = numba_ion_sampler(mass, charge, sigma, k, ion_count, intensity_per_ion)
+
+        if centroided:
+            return (MzSpectrum(mz, i)
+                    .to_resolution(resolution).filter(-1, -1, min_intensity)
+                    .to_centroided(baseline_noise_level=max(min_intensity, 1), sigma=1/np.power(10, resolution-1))
+                    )
+        return MzSpectrum(mz, i)\
+            .to_resolution(resolution).filter(-1, -1, min_intensity)
\ No newline at end of file
+""" +import numpy as np +import numba +from typing import Callable, Optional, Tuple +from numpy.typing import ArrayLike +from imspy.data import MzSpectrum +from imspy.utility import normal_pdf + +@numba.jit(nopython=True) +def mu_function_normal_default(intensity: ArrayLike) -> ArrayLike: + return intensity + +@numba.jit(nopython=True) +def sigma_function_normal_default(intensity:ArrayLike) -> ArrayLike: + return np.sqrt(intensity) + +@numba.jit(nopython=True) +def mu_function_poisson_default(intensity:ArrayLike) -> ArrayLike: + offset = 0 + return intensity+ offset + +@numba.jit(nopython=True) +def detection_noise(signal: ArrayLike, + method: str = "poisson", + custom_mu_function: Optional[Callable] = None, + custom_sigma_function: Optional[Callable] = None) -> ArrayLike: + """ + + :param signal: + :param method: + :param custom_mu_function: + :param custom_sigma_function: + """ + if method == "normal": + if custom_sigma_function is None: + sigma_function:Callable = sigma_function_normal_default + else: + sigma_function:Callable = custom_sigma_function + if custom_mu_function is None: + mu_function:Callable = mu_function_normal_default + else: + mu_function:Callable = custom_mu_function + + sigmas = sigma_function(signal) + mus = mu_function(signal) + noised_signal = np.zeros_like(mus) + + for i in range(noised_signal.size): + s = sigmas[i] + m = mus[i] + n = np.random.normal()*s+m + noised_signal[i] = n + + elif method == "poisson": + if custom_sigma_function is not None: + raise ValueError("Sigma function is not used if method is 'poisson'") + if custom_mu_function is None: + mu_function:Callable = mu_function_poisson_default + else: + mu_function:Callable = custom_mu_function + mus = mu_function(signal) + noised_signal = np.zeros_like(mus) + + for i in range(noised_signal.size): + mu = mus[i] + n = np.random.poisson(lam = mu) + noised_signal[i] = n + + else: + raise NotImplementedError("This method is not implemented, choose 'normal' or 'poisson'.") + + return noised_signal + +@numba.jit(nopython=True) +def generate_noise_peak(pos:float, sigma: float, intensity: float, min_intensity:int = 0, resolution:float = 3): + step_size = min(sigma/10,1/np.power(10,resolution)) + lower = int((pos-4*sigma)//step_size) + upper = int((pos+4*sigma)//step_size) + mzs = np.arange(lower,upper)*step_size + intensities = normal_pdf(mzs,pos,sigma,normalize=True)*intensity*step_size + return (mzs[intensities>=min_intensity], intensities[intensities>=min_intensity].astype(np.int64)) + + +def baseline_shot_noise_window(window:MzSpectrum, + window_theoretical_mz_min:float, + window_theoretical_mz_max:float, + expected_noise_peaks: int = 5, + expected_noise_intensity: float = 10, + expected_noise_sigma:float = 0.001, + resolution:float = 3) -> MzSpectrum: + """ + + """ + num_noise_peaks = np.random.poisson(lam=expected_noise_peaks) + noised_window = MzSpectrum([],[]) + for i in range(num_noise_peaks): + location_i = np.random.uniform(window_theoretical_mz_min,window_theoretical_mz_max) + intensity_i = np.random.exponential(expected_noise_intensity) + sigma_i = np.random.exponential(expected_noise_sigma) + noise_mz, noise_intensity = generate_noise_peak(location_i,sigma_i,intensity_i,resolution=resolution) + noise_peak = MzSpectrum(noise_mz, noise_intensity) + noised_window = noised_window+noise_peak + return noised_window + + +def baseline_shot_noise(spectrum:MzSpectrum,window_size:float=50,expected_noise_peaks_per_Th:int=10,min_intensity:int = 5, resolution = 3): + """ + + + """ + min_mz = 
+
+
+def baseline_shot_noise_window(window: MzSpectrum,
+                               window_theoretical_mz_min: float,
+                               window_theoretical_mz_max: float,
+                               expected_noise_peaks: int = 5,
+                               expected_noise_intensity: float = 10,
+                               expected_noise_sigma: float = 0.001,
+                               resolution: float = 3) -> MzSpectrum:
+    """
+    Generates a shot-noise spectrum for a single m/z window.
+    """
+    num_noise_peaks = np.random.poisson(lam=expected_noise_peaks)
+    noised_window = MzSpectrum([], [])
+    for i in range(num_noise_peaks):
+        location_i = np.random.uniform(window_theoretical_mz_min, window_theoretical_mz_max)
+        intensity_i = np.random.exponential(expected_noise_intensity)
+        sigma_i = np.random.exponential(expected_noise_sigma)
+        noise_mz, noise_intensity = generate_noise_peak(location_i, sigma_i, intensity_i, resolution=resolution)
+        noise_peak = MzSpectrum(noise_mz, noise_intensity)
+        noised_window = noised_window + noise_peak
+    return noised_window
+
+
+def baseline_shot_noise(spectrum: MzSpectrum, window_size: float = 50, expected_noise_peaks_per_Th: int = 10,
+                        min_intensity: int = 5, resolution=3):
+    """
+    Adds shot-noise peaks across the m/z range of `spectrum`, window by window.
+    """
+    min_mz = spectrum.mz.min() - 0.1
+    max_mz = spectrum.mz.max() + 0.1
+    bins, windows = spectrum.windows(window_size, overlapping=False, min_peaks=0, min_intensity=0)
+    noised_spectrum = spectrum
+    for b, w in zip(bins, windows):
+        lower = b * window_size
+        upper = (b + 1) * window_size
+        noised_spectrum = noised_spectrum + baseline_shot_noise_window(w, lower, upper, window_size * expected_noise_peaks_per_Th)
+    return noised_spectrum.to_resolution(resolution).filter(min_mz, max_mz, min_intensity)
+
+
+def baseline_noise():
+    pass
\ No newline at end of file
diff --git a/imspy/imspy/proteome.py b/imspy/imspy/proteome.py
new file mode 100644
index 00000000..85103430
--- /dev/null
+++ b/imspy/imspy/proteome.py
@@ -0,0 +1,322 @@
+from __future__ import annotations
+import os
+import warnings
+import numpy as np
+from numpy.typing import ArrayLike
+import pandas as pd
+import sqlite3
+from imspy.data import MzSpectrum
+from imspy.chemistry import get_mono_isotopic_weight, MASS_PROTON
+
+from imspy.feature import RTProfile, ScanProfile, ChargeProfile
+from imspy.utility import tokenize_proforma_sequence, TokenSequence, get_aa_num_proforma_sequence
+from enum import Enum
+from abc import ABC, abstractmethod
+
+from typing import Optional, List, Tuple, Union
+
+
+class ENZYME(Enum):
+    TRYPSIN = 1
+
+
+class ORGANISM(Enum):
+    HOMO_SAPIENS = 1
+
+
+class Enzyme(ABC):
+    def __init__(self, name: ENZYME):
+        self.name = name
+
+    @abstractmethod
+    def calculate_cleavages(self, sequence: str) -> np.array:
+        pass
+
+    def digest(self, sequence: str, missed_cleavages: int, min_length) -> list[str]:
+        pass
+
+
+class Trypsin(Enzyme):
+    def __init__(self, name: ENZYME = ENZYME.TRYPSIN):
+        super().__init__(name)
+        self.name = name
+
+    def calculate_cleavages(self, sequence: str):
+        """
+        Scans the sequence for cleavage motifs
+        and returns a list of peptide intervals.
+
+        :param sequence: protein sequence to digest
+        :type sequence: str
+        :return: list of (start, end) peptide intervals
+        :rtype: List[Tuple[int,int]]
+        """
+        cut_sites = []
+        start = 0
+        end = 0
+        in_unimod_bracket = False
+        for i, aa in enumerate(sequence[:-1]):
+            # skip unimod brackets
+            if aa == "(":
+                in_unimod_bracket = True
+            if in_unimod_bracket:
+                if aa == ")":
+                    in_unimod_bracket = False
+                continue
+
+            # cut after every 'K' or 'R' that is not followed by 'P'
+            next_aa = sequence[i + 1]
+            if ((aa == 'K') or (aa == 'R')) and next_aa != 'P':
+                end = i + 1
+                cut_sites.append((start, end))
+                start = end
+
+        cut_sites.append((start, len(sequence)))
+
+        return np.array(cut_sites)
+
+    def __repr__(self):
+        return f'Enzyme(name: {self.name.name})'
+
+    def digest(self, sequence: str, abundancy: float, missed_cleavages: int = 0, min_length: int = 7):
+        """
+        Digests a protein into peptides.
+
+        :param sequence: Sequence of protein in ProForma standard.
+        :type sequence: str
+        :param abundancy: Abundance of protein
+        :type abundancy: float
+        :param missed_cleavages: Number of allowed missed cleavages, defaults to 0
+        :type missed_cleavages: int, optional
+        :param min_length: Min length of recorded peptides, defaults to 7
+        :type min_length: int, optional
+        :return: List of dictionaries with `sequence`, `start`, `end` and `abundancy` as keys.
+        :rtype: List[Dict]
+        """
+        assert 0 <= missed_cleavages <= 2, f'Number of missed cleavages must be between 0 and 2, was: {missed_cleavages}'
+
+        peptide_intervals = self.calculate_cleavages(sequence)
+
+        # TODO: implement missed cleavages
+        if missed_cleavages == 1:
+            pass
+        if missed_cleavages == 2:
+            pass
+
+        dig_list = []
+
+        for s, e in peptide_intervals:
+            dig_list.append(sequence[s: e])
+
+        # pair sequence digests with indices
+        wi = zip(dig_list, peptide_intervals)
+        # filter out short sequences and clean index display
+        wi = map(lambda p: (p[0], p[1][0], p[1][1]), filter(lambda s: get_aa_num_proforma_sequence(s[0]) >= min_length, wi))
+
+        return list(map(lambda e: {'sequence': e[0], 'start': e[1], 'end': e[2], 'abundancy': abundancy}, wi))
+
+
+class PeptideDigest:
+    def __init__(self, data: pd.DataFrame, organism: ORGANISM, enzyme: ENZYME):
+        self.data = data
+        self.organism = organism
+        self.enzyme = enzyme
+
+    def __repr__(self):
+        return f'PeptideDigest(Sample: {self.organism.name}, Enzyme: {self.enzyme.name}, number peptides: {self.data.shape[0]})'
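# --- Example (not part of the patch): tryptic cleavage intervals and digestion.
# Cuts fall after K/R unless followed by P; peptides shorter than min_length
# (default 7) are dropped from the digest.
from imspy.proteome import Trypsin

enzyme = Trypsin()
print(enzyme.calculate_cleavages("MKTAYIAKQRLP"))
# -> [[ 0  2] [ 2  8] [ 8 10] [10 12]]
peptides = enzyme.digest("MKTAYIAKQRLPGFGDSIEAQCGK", abundancy=1e5)
print([p["sequence"] for p in peptides])  # only 'LPGFGDSIEAQCGK' passes min_length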
if_exists="append") + df_ions.to_sql("Ions", self.con , if_exists="append") + + def load(self, table_name:str, query:Optional[str] = None): + if query is None: + query = f"SELECT * FROM {table_name}" + return pd.read_sql(query,self.con, index_col="index") + + def load_chunks(self, chunk_size: int, query:Optional[str] = None): + if query is None: + query = "SELECT * FROM PeptideDigest" + self.__chunk_generator = pd.read_sql(query,self.con, chunksize=chunk_size, index_col="index") + for chunk in self.__chunk_generator: + chunk.loc[:,"sequence_tokenized"] = chunk.loc[:,"sequence_tokenized"].transform(lambda st: TokenSequence(None, jsons=st)) + yield(ProteomicsExperimentSampleSlice(peptides = chunk)) + + def load_frames(self, frame_range:Tuple[int,int], spectra_as_jsons = False): + query = ( + "SELECT SeparatedPeptides.pep_id, " + "SeparatedPeptides.sequence, " + "SeparatedPeptides.simulated_frame_profile, " + "SeparatedPeptides.mass_theoretical, " + "SeparatedPeptides.abundancy, " + "SeparatedPeptides.frame_min, " + "SeparatedPeptides.frame_max, " + "Ions.mz, " + "Ions.charge, " + "Ions.relative_abundancy, " + "Ions.scan_min, " + "Ions.scan_max, " + "Ions.simulated_scan_profile, " + "Ions.simulated_mz_spectrum " + "FROM SeparatedPeptides " + "INNER JOIN Ions " + "ON SeparatedPeptides.pep_id = Ions.pep_id " + f"AND SeparatedPeptides.frame_min < {frame_range[1]} " + f"AND SeparatedPeptides.frame_max >= {frame_range[0]} " + ) + df = pd.read_sql(query, self.con) + + # unzip jsons + df.loc[:,"simulated_scan_profile"] = df["simulated_scan_profile"].transform(lambda sp: ScanProfile(jsons=sp)) + df.loc[:,"simulated_frame_profile"] = df["simulated_frame_profile"].transform(lambda rp: RTProfile(jsons=rp)) + if spectra_as_jsons: + return df + else: + df.loc[:,"simulated_mz_spectrum"] = df["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s)) + + return df + + @staticmethod + def make_sql_compatible(column): + if column.size < 1: + return + if isinstance(column.iloc[0], (RTProfile, ScanProfile, ChargeProfile, TokenSequence)): + return column.apply(lambda x: x.jsons) + elif isinstance(column.iloc[0], MzSpectrum): + return column.apply(lambda x: x.to_jsons()) + else: + return column + +class ProteomicsExperimentSampleSlice: + """ + exposed dataframe of database + """ + def __init__(self, peptides:pd.DataFrame, ions:Optional[pd.DataFrame]=None): + self.peptides = peptides + self.ions = ions + + def add_simulation(self, simulation_name:str, simulation_data: ArrayLike): + accepted_peptide_simulations = [ + "simulated_irt_apex", + "simulated_frame_apex", + "simulated_frame_profile", + "simulated_charge_profile", + ] + + accepted_ion_simulations = [ + "simulated_scan_apex", + "simulated_k0", + "simulated_scan_profile", + "simulated_mz_spectrum", + ] + + # for profiles store min and max values + get_min_position = np.vectorize(lambda p:p.positions.min(), otypes=[int]) + get_max_position = np.vectorize(lambda p:p.positions.max(), otypes=[int]) + + if simulation_name == "simulated_frame_profile": + + self.peptides["frame_min"] = get_min_position(simulation_data) + self.peptides["frame_max"] = get_max_position(simulation_data) + + elif simulation_name == "simulated_charge_profile": + ions_dict = { + "sequence":[], + "pep_id":[], + "mz":[], + "charge":[], + "relative_abundancy":[] + } + sequences = self.peptides["sequence"].values + pep_ids = self.peptides["pep_id"].values + masses = self.peptides["mass_theoretical"].values + + for (s, pi, m, charge_profile) in zip(sequences, pep_ids, masses, 
+
+
+class ProteomicsExperimentSampleSlice:
+    """
+    Exposed dataframe of the database.
+    """
+    def __init__(self, peptides: pd.DataFrame, ions: Optional[pd.DataFrame] = None):
+        self.peptides = peptides
+        self.ions = ions
+
+    def add_simulation(self, simulation_name: str, simulation_data: ArrayLike):
+        accepted_peptide_simulations = [
+            "simulated_irt_apex",
+            "simulated_frame_apex",
+            "simulated_frame_profile",
+            "simulated_charge_profile",
+        ]
+
+        accepted_ion_simulations = [
+            "simulated_scan_apex",
+            "simulated_k0",
+            "simulated_scan_profile",
+            "simulated_mz_spectrum",
+        ]
+
+        # for profiles store min and max values
+        get_min_position = np.vectorize(lambda p: p.positions.min(), otypes=[int])
+        get_max_position = np.vectorize(lambda p: p.positions.max(), otypes=[int])
+
+        if simulation_name == "simulated_frame_profile":
+
+            self.peptides["frame_min"] = get_min_position(simulation_data)
+            self.peptides["frame_max"] = get_max_position(simulation_data)
+
+        elif simulation_name == "simulated_charge_profile":
+            ions_dict = {
+                "sequence": [],
+                "pep_id": [],
+                "mz": [],
+                "charge": [],
+                "relative_abundancy": []
+            }
+            sequences = self.peptides["sequence"].values
+            pep_ids = self.peptides["pep_id"].values
+            masses = self.peptides["mass_theoretical"].values
+
+            for (s, pi, m, charge_profile) in zip(sequences, pep_ids, masses, simulation_data):
+                for (c, r) in charge_profile:
+                    ions_dict["sequence"].append(s)
+                    ions_dict["pep_id"].append(pi)
+                    ions_dict["charge"].append(c)
+                    ions_dict["relative_abundancy"].append(r)
+                    ions_dict["mz"].append(m / c + MASS_PROTON)
+            self.ions = pd.DataFrame(ions_dict)
+
+            self.peptides["charge_min"] = get_min_position(simulation_data)
+            self.peptides["charge_max"] = get_max_position(simulation_data)
+
+        elif simulation_name == "simulated_scan_profile":
+
+            self.ions["scan_min"] = get_min_position(simulation_data)
+            self.ions["scan_max"] = get_max_position(simulation_data)
+
+        if simulation_name in accepted_peptide_simulations:
+            self.peptides[simulation_name] = simulation_data
+
+        elif simulation_name in accepted_ion_simulations:
+            self.ions[simulation_name] = simulation_data
+
+        else:
+            raise ValueError(f"Simulation name '{simulation_name}' is not defined")
\ No newline at end of file
diff --git a/imspy/imspy/simulation/__init__.py b/imspy/imspy/simulation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/imspy/imspy/simulation/experiment.py b/imspy/imspy/simulation/experiment.py
new file mode 100644
index 00000000..df60d5d8
--- /dev/null
+++ b/imspy/imspy/simulation/experiment.py
@@ -0,0 +1,203 @@
+import os
+from multiprocessing import Pool
+import functools
+from abc import ABC, abstractmethod
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+from tqdm import tqdm
+from imspy.core import MzSpectrum
+from imspy.proteome import PeptideDigest, ProteomicsExperimentDatabaseHandle
+import imspy.simulation.hardware_models as hardware
+
+
+class ProteomicsExperiment(ABC):
+    def __init__(self, path: str):
+
+        # path strings to experiment folder, database and output subfolder
+        self.path = path
+        self.output_path = f"{os.path.dirname(path)}/output"
+        self.database_path = f"{self.path}/experiment_database.db"
+
+        if not os.path.exists(self.path):
+            os.makedirs(self.path)
+
+        if os.path.exists(self.database_path):
+            raise FileExistsError("Experiment found in the given path.")
+
+        if not os.path.exists(self.output_path):
+            os.mkdir(self.output_path)
+
+        # output folder must be empty, otherwise it is
+        # assumed that it contains old experiments
+        if len(os.listdir(self.output_path)) > 0:
+            raise FileExistsError("Experiment found in the given path.")
+
+        # init database and loaded sample
+        self.database = ProteomicsExperimentDatabaseHandle(self.database_path)
+        self.loaded_sample = None
+
+        # hardware methods
+        self._lc_method = None
+        self._ionization_method = None
+        self._ion_mobility_separation_method = None
+        self._mz_separation_method = None
+
+    @property
+    def lc_method(self):
+        return self._lc_method
+
+    @lc_method.setter
+    def lc_method(self, method: hardware.LiquidChromatography):
+        self._lc_method = method
+
+    @property
+    def ionization_method(self):
+        return self._ionization_method
+
+    @ionization_method.setter
+    def ionization_method(self, method: hardware.IonSource):
+        self._ionization_method = method
+
+    @property
+    def ion_mobility_separation_method(self):
+        return self._ion_mobility_separation_method
+
+    @ion_mobility_separation_method.setter
+    def ion_mobility_separation_method(self, method: hardware.IonMobilitySeparation):
+        self._ion_mobility_separation_method = method
+
+    @property
+    def mz_separation_method(self):
+        return self._mz_separation_method
+
+    @mz_separation_method.setter
+    def mz_separation_method(self, method: hardware.MzSeparation):
+        self._mz_separation_method = method
+
+    @abstractmethod
+    def load_sample(self, sample: PeptideDigest):
+        self.database.push("PeptideDigest", sample)
+
+    @abstractmethod
+    def run(self):
+        pass
+
+
+class LcImsMsMs(ProteomicsExperiment):
+    def __init__(self, path: str):
+        super().__init__(path)
+
+    def load_sample(self, sample: PeptideDigest):
+        return super().load_sample(sample)
+
+    def run(self, chunk_size: int = 1000, assemble_processes: int = 8, frames_per_assemble_process: int = 20):
+        self._simulate_features(chunk_size)
+        self._assemble(frames_per_assemble_process, assemble_processes)
+
+    def _simulate_features(self, chunk_size):
+        # load bulks of data here as dataframe if necessary
+        for data_chunk in self.database.load_chunks(chunk_size):
+            self.lc_method.run(data_chunk)
+            self.ionization_method.run(data_chunk)
+            self.ion_mobility_separation_method.run(data_chunk)
+            self.mz_separation_method.run(data_chunk)
+            self.database.update(data_chunk)
+
+    @staticmethod
+    def _assemble_frame_range(frame_range, scan_id_min, scan_id_max, default_abundance, resolution, output_path, database_path):
+
+        frame_range_start = frame_range[0]
+        frame_range_end = frame_range[1]
+
+        # generate file name
+        file_name = f"frames_{frame_range_start}_{frame_range_end}.parquet"
+        output_file_path = f"{output_path}/{file_name}"
+
+        frame_range = range(frame_range_start, frame_range_end)
+        scan_range = range(scan_id_min, scan_id_max + 1)
+
+        thread_db_handle = ProteomicsExperimentDatabaseHandle(database_path)
+        ions_in_split = thread_db_handle.load_frames((frame_range_start, frame_range_end), spectra_as_jsons=True)
+
+        # skip if no peptides in split
+        if ions_in_split.shape[0] == 0:
+            return {}
+
+        # spectra are currently stored in json format (from SQL db)
+        ions_in_split.loc[:, "simulated_mz_spectrum"] = ions_in_split["simulated_mz_spectrum"].transform(lambda s: MzSpectrum.from_jsons(jsons=s))
+
+        # construct signal data set
+        signal = {f_id: {s_id: [] for s_id in scan_range} for f_id in frame_range}
+
+        for _, row in ions_in_split.iterrows():
+
+            ion_frame_start = max(frame_range_start, row["frame_min"])
+            ion_frame_end = min(frame_range_end - 1, row["frame_max"])  # -1 here because frame_range_end is covered by next frame range
+
+            ion_scan_start = max(scan_id_min, row["scan_min"])
+            ion_scan_end = min(scan_id_max, row["scan_max"])
+
+            ion_frame_profile = row["simulated_frame_profile"]
+            ion_scan_profile = row["simulated_scan_profile"]
+
+            ion_charge_abundance = row["abundancy"] * row["relative_abundancy"]
+
+            ion_spectrum = row["simulated_mz_spectrum"]
+
+            # frame start and end inclusive
+            for f_id in range(ion_frame_start, ion_frame_end + 1):
+                # scan start and end inclusive
+                for s_id in range(ion_scan_start, ion_scan_end + 1):
+
+                    abundance = ion_charge_abundance * ion_frame_profile[f_id] * ion_scan_profile[s_id]
+                    rel_to_default_abundance = abundance / default_abundance
+
+                    signal[f_id][s_id].append(ion_spectrum * rel_to_default_abundance)
+
+        output_dict = {"frame_id": [],
+                       "scan_id": [],
+                       "mz": [],
+                       "intensity": [],
+                       }
+
+        for (f_id, frame_dict) in signal.items():
+            for (s_id, scan_spectra_list) in frame_dict.items():
+
+                if len(scan_spectra_list) > 0:
+                    scan_spectrum = MzSpectrum.from_mz_spectra_list(scan_spectra_list, resolution=resolution).vectorized(resolution=resolution).to_centroided()
+                    output_dict["mz"].append(scan_spectrum.mz.tolist())
+                    output_dict["intensity"].append(scan_spectrum.intensity.tolist())
+                    output_dict["scan_id"].append(s_id)
+                    output_dict["frame_id"].append(f_id)
+                    signal[f_id][s_id] = None
+
+        for key, value in output_dict.items():
+            output_dict[key] = pa.array(value)
+
+        pa_table = pa.Table.from_pydict(output_dict)
+
+        pq.write_table(pa_table, output_file_path, compression=None)
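# --- Example (not part of the patch): each assembled frame range is written as
# one parquet file holding per-(frame, scan) centroided peak lists. File name
# pattern and columns follow _assemble_frame_range above; the path assumes a
# completed run of run_example_simulation.py.
import pyarrow.parquet as pq

table = pq.read_table("./timstofexp1_binomial_ion_source_21_7/output/frames_0_10.parquet")
print(table.column_names)  # ['frame_id', 'scan_id', 'mz', 'intensity']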
+
+    def _assemble(self, frames_per_process: int, num_processes: int):
+
+        scan_id_min = self.ion_mobility_separation_method.scan_id_min
+        scan_id_max = self.ion_mobility_separation_method.scan_id_max
+        default_abundance = self.mz_separation_method.model.default_abundance
+        resolution = self.mz_separation_method.resolution
+
+        # include the endpoint so the final (possibly partial) frame range is assembled too
+        split_positions = np.arange(0, self.lc_method.num_frames + frames_per_process, step=frames_per_process).astype(int)
+        split_start = split_positions[:-1]
+        split_end = split_positions[1:]
+
+        assemble_frame_range = functools.partial(self._assemble_frame_range,
+                                                 scan_id_min=scan_id_min,
+                                                 scan_id_max=scan_id_max,
+                                                 default_abundance=default_abundance,
+                                                 resolution=resolution,
+                                                 output_path=self.output_path,
+                                                 database_path=self.database_path)
+
+        if num_processes > 1:
+            with Pool(num_processes) as pool:
+                list(tqdm(pool.imap(assemble_frame_range, zip(split_start, split_end)), total=len(split_start)))
+        else:
+            for start, end in tqdm(zip(split_start, split_end), total=len(split_start)):
+                assemble_frame_range((start, end))
diff --git a/imspy/imspy/simulation/hardware_models.py b/imspy/imspy/simulation/hardware_models.py
new file mode 100644
index 00000000..b2e29b1c
--- /dev/null
+++ b/imspy/imspy/simulation/hardware_models.py
@@ -0,0 +1,774 @@
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import List, Tuple
+from multiprocessing import Pool
+
+import tensorflow as tf
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+import pandas as pd
+from scipy.stats import exponnorm, norm, binom, gamma
+
+from imspy.chemistry.mass import STANDARD_TEMPERATURE, STANDARD_PRESSURE, BufferGas, get_num_protonizable_sites
+from imspy.chemistry.mobility import SUMMARY_CONSTANT as CCS_K0_CONVERSION_CONSTANT
+from imspy.proteome import ProteomicsExperimentSampleSlice
+from imspy.feature import RTProfile, ScanProfile, ChargeProfile
+from imspy.isotopes import AveragineGenerator
+from imspy.utility.utilities import tokenizer_from_json
+
+
+class Device(ABC):
+    def __init__(self, name: str):
+        self.name = name
+        self._temperature = STANDARD_TEMPERATURE
+        self._pressure = STANDARD_PRESSURE
+
+    @property
+    def temperature(self):
+        """
+        Get device temperature
+
+        :return: Temperature of device in Kelvin.
+        :rtype: float
+        """
+        return self._temperature
+
+    @temperature.setter
+    def temperature(self, T: float):
+        """
+        Set device temperature
+
+        :param T: Temperature in Kelvin.
+        :type T: float
+        """
+        self._temperature = T
+
+    @property
+    def pressure(self):
+        """
+        Get device pressure
+
+        :return: Pressure of device in Pa.
+        :rtype: float
+        """
+        return self._pressure
+
+    @pressure.setter
+    def pressure(self, p: float):
+        """
+        Set device pressure
+
+        :param p: Pressure in Pa.
+        :type p: float
+        """
+        self._pressure = p
+
+    @abstractmethod
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+
+class Model(ABC):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Device):
+        pass
+
+
+class Chromatography(Device):
+    def __init__(self, name: str = "ChromatographyDevice"):
+        super().__init__(name)
+        self._apex_model = None
+        self._profile_model = None
+        self._irt_to_rt_converter = None
+        self._frame_length = 1200
+        self._gradient_length = 120 * 60 * 1000  # 120 minutes in milliseconds
+
+    @property
+    def frame_length(self):
+        return self._frame_length
+
+    @frame_length.setter
+    def frame_length(self, milliseconds: int):
+        self._frame_length = milliseconds
+
+    @property
+    def gradient_length(self):
+        return self._gradient_length / (60 * 1000)
+
+    @gradient_length.setter
+    def gradient_length(self, minutes: int):
+        self._gradient_length = minutes * 60 * 1000
+
+    @property
+    def num_frames(self):
+        return self._gradient_length // self._frame_length
+
+    @property
+    def apex_model(self):
+        return self._apex_model
+
+    @apex_model.setter
+    def apex_model(self, model: ChromatographyApexModel):
+        self._apex_model = model
+
+    @property
+    def profile_model(self):
+        return self._profile_model
+
+    @profile_model.setter
+    def profile_model(self, model: ChromatographyProfileModel):
+        self._profile_model = model
+
+    @property
+    def irt_to_rt_converter(self):
+        return self._irt_to_rt_converter
+
+    @irt_to_rt_converter.setter
+    def irt_to_rt_converter(self, converter: callable):
+        self._irt_to_rt_converter = converter
+
+    def irt_to_frame_id(self, irt: float):
+        return self.rt_to_frame_id(self.irt_to_rt(irt))
+
+    @abstractmethod
+    def rt_to_frame_id(self, rt: float):
+        pass
+
+    def irt_to_rt(self, irt):
+        return self.irt_to_rt_converter(irt)
+
+    def frame_time_interval(self, frame_id: ArrayLike):
+        frame_id = np.atleast_1d(frame_id)
+        frame_length_minutes = self.frame_length / (60 * 1000)
+        s = (frame_id - 1) * frame_length_minutes
+        e = frame_id * frame_length_minutes
+        return np.stack([s, e], axis=1)
+
+    def frame_time_middle(self, frame_id: ArrayLike):
+        return np.mean(self.frame_time_interval(frame_id), axis=1)
+
+    def frame_time_end(self, frame_id: ArrayLike):
+        return self.frame_time_interval(frame_id)[:, 1]
+
+    def frame_time_start(self, frame_id: ArrayLike):
+        return self.frame_time_interval(frame_id)[:, 0]
+
+
+class LiquidChromatography(Chromatography):
+    def __init__(self, name: str = "LiquidChromatographyDevice"):
+        super().__init__(name)
+
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        # retention time apex simulation
+        retention_time_apex = self._apex_model.simulate(sample, self)
+        # in irt and rt
+        sample.add_simulation("simulated_irt_apex", retention_time_apex)
+        # in frame id
+        sample.add_simulation("simulated_frame_apex", self.irt_to_frame_id(retention_time_apex))
+
+        # profile simulation
+        retention_profile = self._profile_model.simulate(sample, self)
+        sample.add_simulation("simulated_frame_profile", retention_profile)
+
+    def rt_to_frame_id(self, rt_minutes: ArrayLike):
+        rt_minutes = np.asarray(rt_minutes)
+        # first frame is completed not at 0 but at frame_length
+        frame_id = (rt_minutes / self.frame_length * 1000 * 60).astype(np.int64) + 1
+        return frame_id
+
+
+class ChromatographyApexModel(Model):
+    def __init__(self):
+        self._device = None
+
+    @abstractmethod
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]:
+        pass
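# --- Example (not part of the patch): frame/gradient bookkeeping on the
# chromatography device. With 1200 ms frames over a 120 min gradient the run
# has 120*60*1000 / 1200 = 6000 frames.
from imspy.simulation.hardware_models import LiquidChromatography

lc = LiquidChromatography()
lc.frame_length = 1200     # ms
lc.gradient_length = 120   # min
print(lc.num_frames)               # 6000
print(lc.frame_time_interval(1))   # [[0.   0.02]] -> frame 1 spans 0-0.02 min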
ChromatographyProfileModel(Model): + def __init__(self): + pass + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + pass + + +class EMGChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + profile_list = [] + + for mu in mus: + σ = 0.1 # must be sampled + λ = 10 # must be sampled + K = 1/(σ*λ) + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "lambda":λ, + "mu":μ, + "name":"EMG" + } + + emg = exponnorm(loc=μ, scale=σ, K = K) + # start and end value (in retention times) + s_rt, e_rt = emg.ppf([0.01,0.9]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # starting with s_frame-1 for cdf interval calculation + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = emg.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list + + +class NormalChromatographyProfileModel(ChromatographyProfileModel): + + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> List[RTProfile]: + mus = sample.peptides["simulated_irt_apex"].values + sigmas = gamma(a=4.929,scale=1/18.784).rvs(mus.size)/6 + profile_list = [] + + for mu,σ in zip(mus,sigmas): + + μ = device.irt_to_rt(mu) + model_params = { "sigma":σ, + "mu":μ, + "name":"Normal" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (in retention times) + s_rt, e_rt = normal.ppf([0.02,0.98]) + # as frames + s_frame, e_frame = device.rt_to_frame_id(s_rt), device.rt_to_frame_id(e_rt) + + profile_frames = np.arange(s_frame-1,e_frame+1) # starting with s_frame-1 for cdf interval calculation + profile_rt_ends = device.frame_time_end(profile_frames) + profile_rt_cdfs = normal.cdf(profile_rt_ends) + profile_rel_intensities = np.diff(profile_rt_cdfs) + + profile_list.append(RTProfile(profile_frames[1:],profile_rel_intensities,model_params)) + return profile_list + + +class NeuralChromatographyApex(ChromatographyApexModel): + + def __init__(self, model_path: str, tokenizer_path: str): + super().__init__() + self.model_path = model_path + self.tokenizer = tokenizer_from_json(tokenizer_path) + + def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array: + print('tokenizing sequences...') + tokens = self.tokenizer.texts_to_sequences(sequences_tokenized) + tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post') + return tokens_padded + + @staticmethod + def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs: int = 2048): + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) + if batched: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices((tokens_padded, pseudo_target)) + model = tf.keras.models.load_model(model_path) + print('predicting irts...') + return_val = model.predict(dataset) + return return_val + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]: + + data = sample.peptides + tokens = 
data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized).to_numpy()
+        print('generating tf dataset...')
+        tokens_padded = self.sequences_to_tokens(tokens)
+
+        # run the prediction in a fresh process so tensorflow releases its
+        # resources once the worker exits
+        with Pool(1) as pool:
+            r = pool.starmap(self._worker, [(self.model_path, tokens_padded)])
+            return r[0]
+
+
+class IonSource(Device):
+    def __init__(self, name: str = "IonizationDevice"):
+        super().__init__(name)
+        self._ionization_model = None
+
+    @property
+    def ionization_model(self):
+        return self._ionization_model
+
+    @ionization_model.setter
+    def ionization_model(self, model: IonizationModel):
+        self._ionization_model = model
+
+    @abstractmethod
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+
+class ElectroSpray(IonSource):
+    def __init__(self, name: str = "ElectrosprayDevice"):
+        super().__init__(name)
+
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        charge_profiles = self.ionization_model.simulate(sample, self)
+        sample.add_simulation("simulated_charge_profile", charge_profiles)
+
+
+class IonizationModel(Model):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]:
+        pass
+
+
+class RandomIonSource(IonizationModel):
+    def __init__(self):
+        super().__init__()
+        self._charge_probabilities = np.array([0.1, 0.5, 0.3, 0.1])
+        self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8)
+
+    @property
+    def allowed_charges(self):
+        return self._allowed_charges
+
+    @allowed_charges.setter
+    def allowed_charges(self, charges: ArrayLike):
+        self._allowed_charges = np.asarray(charges, dtype=np.int8)
+
+    @property
+    def charge_probabilities(self):
+        return self._charge_probabilities
+
+    @charge_probabilities.setter
+    def charge_probabilities(self, probabilities: ArrayLike):
+        self._charge_probabilities = np.asarray(probabilities)
+
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]:
+        if self.charge_probabilities.shape != self.allowed_charges.shape:
+            raise ValueError("Number of allowed charges must match number of charge probabilities")
+
+        data = sample.peptides
+        charge = np.random.choice(self.allowed_charges, data.shape[0], p=self.charge_probabilities)
+        rel_intensity = np.ones_like(charge)
+        charge_profiles = []
+        for c, i in zip(charge, rel_intensity):
+            charge_profiles.append(ChargeProfile([c], [i], model_params={"name": "RandomIonSource"}))
+        return charge_profiles
+
+
+class BinomialIonSource(IonizationModel):
+    def __init__(self):
+        super().__init__()
+        self._charged_probability = 0.5
+        self._allowed_charges = np.array([1, 2, 3, 4], dtype=np.int8)
+
+    @property
+    def allowed_charges(self):
+        return self._allowed_charges
+
+    @allowed_charges.setter
+    def allowed_charges(self, charges: ArrayLike):
+        self._allowed_charges = np.asarray(charges, dtype=np.int8)
+
+    @property
+    def charged_probability(self):
+        return self._charged_probability
+
+    @charged_probability.setter
+    def charged_probability(self, probability: float):
+        self._charged_probability = probability
+
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonSource) -> List[ChargeProfile]:
+        sequences = sample.peptides.sequence
+        vec_get_num_protonizable_sites = np.vectorize(get_num_protonizable_sites)
+        basic_aa_nums = vec_get_num_protonizable_sites(sequences)
+
+        charge_profiles = []
+        for baa_num in basic_aa_nums:
+            rel_intensities = binom(baa_num, self.charged_probability).pmf(self.allowed_charges)
charge_profiles.append(ChargeProfile(self.allowed_charges,rel_intensities,model_params={"name":"BinomialIonSource","basic_aa_num":baa_num})) + return charge_profiles + +class IonMobilitySeparation(Device): + def __init__(self, name:str = "IonMobilityDevice"): + super().__init__(name) + # models + self._apex_model = None + self._profile_model = None + + # hardware parameter + self._scan_intervall = 1 + self._scan_time = None + self._scan_id_min = None + self._scan_id_max = None + self._buffer_gas = None + + # converters + self._reduced_im_to_scan_converter = None + self._scan_to_reduced_im_interval_converter = None + + @property + def reduced_im_to_scan_converter(self): + return self._reduced_im_to_scan_converter + + @reduced_im_to_scan_converter.setter + def reduced_im_to_scan_converter(self, converter:callable): + self._reduced_im_to_scan_converter = converter + + @property + def scan_to_reduced_im_interval_converter(self): + return self._scan_to_reduced_im_interval_converter + + @scan_to_reduced_im_interval_converter.setter + def scan_to_reduced_im_interval_converter(self, converter:callable): + self._scan_to_reduced_im_interval_converter = converter + + @property + def buffer_gas(self): + return self._buffer_gas + + @buffer_gas.setter + def buffer_gas(self, gas: BufferGas): + self._buffer_gas = gas + + @property + def scan_intervall(self): + return self._scan_intervall + + @scan_intervall.setter + def scan_intervall(self, number:int): + self._scan_intervall = number + + @property + def scan_id_min(self): + return self._scan_id_min + + @scan_id_min.setter + def scan_id_min(self, number:int): + self._scan_id_min = number + + @property + def scan_id_max(self): + return self._scan_id_max + + @scan_id_max.setter + def scan_id_max(self, number:int): + self._scan_id_max = number + + @property + def scan_time(self): + return self._scan_time + + @scan_time.setter + def scan_time(self, microseconds:int): + self._scan_time = microseconds + + @property + def apex_model(self): + return self._apex_model + + @apex_model.setter + def apex_model(self, model: IonMobilityApexModel): + self._apex_model = model + + @property + def profile_model(self): + return self._profile_model + + @profile_model.setter + def profile_model(self, model: IonMobilityProfileModel): + self._profile_model = model + + def scan_to_reduced_im_interval(self, scan_id: ArrayLike): + return self.scan_to_reduced_im_interval_converter(scan_id) + + def reduced_im_to_scan(self, ion_mobility): + return self.reduced_im_to_scan_converter(ion_mobility) + + def scan_im_middle(self, scan_id: ArrayLike): + return np.mean(self.scan_to_reduced_im_interval(scan_id), axis = 1) + + def scan_im_lower(self, scan_id: ArrayLike): + return self.scan_to_reduced_im_interval(scan_id)[:,0] + + def scan_im_upper(self, scan_id:ArrayLike): + return self.scan_to_reduced_im_interval(scan_id)[:,1] + + def im_to_reduced_im(self, ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE): + """ + Calculate reduced ion mobility K_0 + (normalized to standard pressure p_0 + and standard temperature T_0), from + ion mobility K. + + K_0 = K * p/p_0 * T_0/T + + [1] J. N. Dodds and E. S. Baker, + “Ion mobility spectrometry: fundamental concepts, + instrumentation, applications, and the road ahead,” + Journal of the American Society for Mass Spectrometry, + vol. 30, no. 11, pp. 2185–2195, 2019, + doi: 10.1007/s13361-019-02288-2. + + :param ion_mobility: Ion mobility K to + normalize to standard conditions + :param p_0: Standard pressure (Pa). 
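A quick standalone check of the pressure/temperature normalization documented here (a sketch: the constants mirror imspy.chemistry.mass, while the drift conditions and mobility value are made up):

# K_0 = K * (p / p_0) * (T_0 / T), with IUPAC standard conditions
STANDARD_TEMPERATURE = 273.15  # K
STANDARD_PRESSURE = 1e5        # Pa


def im_to_reduced_im(K, T, p, T_0=STANDARD_TEMPERATURE, p_0=STANDARD_PRESSURE):
    return K * p / p_0 * T_0 / T


def reduced_im_to_im(K0, T, p, T_0=STANDARD_TEMPERATURE, p_0=STANDARD_PRESSURE):
    return K0 * p_0 / p * T / T_0


# hypothetical drift conditions: 305 K, 250 Pa
K = 0.47  # measured mobility, cm^2 V^-1 s^-1
K0 = im_to_reduced_im(K, T=305.0, p=250.0)
assert abs(reduced_im_to_im(K0, T=305.0, p=250.0) - K) < 1e-12  # round trip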
+        :param T_0: Standard temperature (K).
+        """
+        T = self.temperature
+        p = self.pressure
+        return ion_mobility*p/p_0*T_0/T
+
+    def reduced_im_to_im(self, reduced_ion_mobility: float, p_0: float = STANDARD_PRESSURE, T_0: float = STANDARD_TEMPERATURE):
+        """
+        Inverse of `.im_to_reduced_im()`.
+        """
+        T = self.temperature
+        p = self.pressure
+        return reduced_ion_mobility*p_0/p*T/T_0
+
+    def ccs_to_reduced_im(self, ccs: float, mz: float, charge: int):
+        # TODO Citation
+        """
+        Conversion of collision cross-section values (ccs)
+        to reduced ion mobility according to the
+        Mason-Schamp equation.
+
+        :param ccs: collision cross-section (ccs)
+        :type ccs: float
+        :param mz: mass-to-charge ratio of peptide (Da/e)
+        :type mz: float
+        :param charge: Charge of peptide
+        :type charge: int
+        :return: Reduced ion mobility
+        :rtype: float
+        """
+
+        T = self.temperature
+        mass = mz*charge
+        μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass)
+        z = charge
+
+        K0 = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*ccs)
+        return K0
+
+    def reduced_im_to_ccs(self, reduced_ion_mobility: float, mz: float, charge: int):
+        """
+        Conversion of reduced ion mobility
+        to collision cross-section values (ccs)
+        according to the Mason-Schamp equation.
+
+        :param reduced_ion_mobility: reduced ion mobility K0
+        :type reduced_ion_mobility: float
+        :param mz: mass-to-charge ratio of peptide (Da/e)
+        :type mz: float
+        :param charge: Charge of peptide
+        :type charge: int
+        :return: Collision cross-section (ccs)
+        :rtype: float
+        """
+
+        T = self.temperature
+        mass = mz*charge
+        μ = self.buffer_gas.mass*mass/(self.buffer_gas.mass+mass)
+        z = charge
+        K0 = reduced_ion_mobility
+
+        ccs = CCS_K0_CONVERSION_CONSTANT*z*1/(np.sqrt(μ*T)*K0)
+        return ccs
+
+    @abstractmethod
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        pass
+
+
+class TrappedIon(IonMobilitySeparation):
+
+    def __init__(self, name: str = "TrappedIonMobilitySeparation"):
+        # pass the name through; otherwise it is silently discarded
+        super().__init__(name)
+        self._scan_id_min = 1
+        self._scan_id_max = 918
+
+    def run(self, sample: ProteomicsExperimentSampleSlice):
+        # scan apex simulation
+        reduced_mobility, scan_apex = self._apex_model.simulate(sample, self)
+        # stored as scan id and as reduced ion mobility K0
+        sample.add_simulation("simulated_scan_apex", scan_apex)
+        sample.add_simulation("simulated_k0", reduced_mobility)
+        # scan profile simulation
+        scan_profile = self._profile_model.simulate(sample, self)
+        sample.add_simulation("simulated_scan_profile", scan_profile)
+
+
+class IonMobilityApexModel(Model):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray[np.float64]:
+        pass
+
+
+class IonMobilityProfileModel(Model):
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> NDArray:
+        pass
+
+
+class NeuralIonMobilityApex(IonMobilityApexModel):
+
+    def __init__(self, model_path: str, tokenizer_path: str):
+        super().__init__()
+        self.model_path = model_path
+        self.tokenizer = tokenizer_from_json(tokenizer_path)
+
+    def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array:
+        print('tokenizing sequences...')
+        tokens = self.tokenizer.texts_to_sequences(sequences_tokenized)
+        tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
+        return tokens_padded
+
+    @staticmethod
+    def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.array, batched: bool = 
True, bs: int = 2048): + + mz = np.expand_dims(mz, 1) + c = tf.one_hot(charges - 1, depth=4) + pseudo_target = np.expand_dims(np.zeros_like(tokens_padded[:, 0]), axis=1) + if batched: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)).batch(bs) + else: + dataset = tf.data.Dataset.from_tensor_slices(((mz, c, tokens_padded), pseudo_target)) + + print('predicting mobilities...') + model = tf.keras.models.load_model(model_path) + ccs, _ = model.predict(dataset) + return ccs + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]: + data = sample.ions.merge(sample.peptides.loc[:,["pep_id","sequence_tokenized"]],on="pep_id",validate="many_to_one") + tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized).to_numpy() + tokens_padded = self.sequences_to_tokens(tokens) + mz = data['mz'].values + charges = data["charge"].values + with Pool(1) as pool: + r = pool.starmap(self._worker, [(self.model_path, tokens_padded, mz, charges)]) + ccs = r[0] + K0s = device.ccs_to_reduced_im(np.squeeze(ccs), mz, data['charge'].values) + scans = device.reduced_im_to_scan(K0s) + return K0s,scans + +class NormalIonMobilityProfileModel(IonMobilityProfileModel): + def __init__(self): + super().__init__() + + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> List[ScanProfile]: + mus = 1/sample.ions["simulated_k0"].values #1 over k0 + sigmas = gamma(a=3.703,scale=1/79.614).rvs(mus.size)/6 + profile_list = [] + for μ,σ in zip(mus,sigmas): + + model_params = { "sigma":σ, + "mu":μ, + "name":"NORMAL" + } + + normal = norm(loc=μ, scale=σ) + # start and end value (k0) + s_one_over_k0, e_one_over_k0 = normal.ppf([0.01,0.99]) + # as scan ids, remember first scans elutes largest ions + e_scan, s_scan = device.reduced_im_to_scan(1/s_one_over_k0), device.reduced_im_to_scan(1/e_one_over_k0) + + profile_scans = np.arange(s_scan-1,e_scan+1) # starting s_scan-1 is necessary here to include its end value for cdf interval + profile_end_im = 1/device.scan_im_upper(profile_scans) + profile_end_cdfs = 1-normal.cdf(profile_end_im) + profile_rel_intensities = np.diff(profile_end_cdfs) + + profile_list.append(ScanProfile(profile_scans[1:],profile_rel_intensities,model_params)) + return profile_list + + +class MzSeparation(Device): + def __init__(self, name:str = "MassSpectrometer"): + super().__init__(name) + self._model = None + self._resolution = 3 + + @property + def resolution(self): + return self._resolution + + @resolution.setter + def resolution(self, res: int): + self._resolution = res + + @property + def model(self): + return self._model + + @model.setter + def model(self, model: MzSeparationModel): + self._model = model + + @abstractmethod + def run(self, sample: ProteomicsExperimentSampleSlice): + pass + +class TOF(MzSeparation): + def __init__(self, name:str = "TimeOfFlightMassSpectrometer"): + super().__init__(name) + + def run(self, sample: ProteomicsExperimentSampleSlice): + spectra = self.model.simulate(sample, self) + sample.add_simulation("simulated_mz_spectrum", spectra) + +class MzSeparationModel(Model): + def __init__(self): + self._default_abundance = 1e4 + + @property + def default_abundance(self): + return self._default_abundance + + @default_abundance.setter + def default_abundance(self, abundance:int): + self._default_abundance = abundance + + @abstractmethod + def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation): + return 
super().simulate(sample, device)
+
+
+class AveragineModel(MzSeparationModel):
+    def __init__(self):
+        super().__init__()
+
+    def simulate(self, sample: ProteomicsExperimentSampleSlice, device: MzSeparation):
+
+        avg = AveragineGenerator()
+        # right join keeps the key order of the ions (right) dataframe
+        joined_df = pd.merge(sample.peptides[["pep_id", "mass_theoretical"]],
+                             sample.ions[["pep_id", "charge"]],
+                             how="right",
+                             on="pep_id")
+        masses = joined_df["mass_theoretical"].values
+        charges = joined_df["charge"].values
+        spectra = []
+        for (m, c) in zip(masses, charges):
+            spectrum = avg.generate_spectrum(m, c, amp=self.default_abundance, centroided=False)
+            spectra.append(spectrum)
+        return spectra
diff --git a/imspy/imspy/timstof/data.py b/imspy/imspy/timstof/data.py
index c1e35ca6..a2121910 100644
--- a/imspy/imspy/timstof/data.py
+++ b/imspy/imspy/timstof/data.py
@@ -8,8 +8,8 @@
 from abc import ABC
 
-from imspy.core import TimsFrame
-from imspy.core import TimsSlice
+from imspy.core.frame import TimsFrame
+from imspy.core.slice import TimsSlice
 
 
 class TimsDataset(ABC):
diff --git a/imspy/imspy/utility.py b/imspy/imspy/utility.py
new file mode 100644
index 00000000..dd504b39
--- /dev/null
+++ b/imspy/imspy/utility.py
@@ -0,0 +1,508 @@
+import io
+import json
+from typing import List, Optional
+import tensorflow as tf
+import numpy as np
+from numpy.typing import ArrayLike
+
+import numba
+import math
+import pandas as pd
+
+from tqdm import tqdm
+from imspy.data import MzSpectrum
+# TODO bring to imspy
+#from proteolizardalgo.hashing import ReferencePattern
+
+
+class TokenSequence:
+
+    def __init__(self, sequence_tokenized: Optional[List[str]] = None, jsons: Optional[str] = None):
+        if jsons is not None:
+            self.sequence_tokenized = self._from_jsons(jsons)
+            self._jsons = jsons
+        else:
+            self.sequence_tokenized = sequence_tokenized
+            self._jsons = self._to_jsons()
+
+    def _to_jsons(self):
+        json_dict = self.sequence_tokenized
+        return json.dumps(json_dict)
+
+    def _from_jsons(self, jsons: str):
+        return json.loads(jsons)
+
+    @property
+    def jsons(self):
+        return self._jsons
+
+
+def proteome_from_fasta(path: str) -> pd.DataFrame:
+    """
+    Read a fasta file and return a dataframe with the protein name and sequence.
+    :param path: path to the fasta file
+    :return: a dataframe with the protein name and sequence
+    """
+    d = {}
+    with open(path) as infile:
+        gene = ''
+        header = ''
+        for i, line in enumerate(infile):
+            if line.find('>') == -1:
+                gene += line
+            elif line.find('>') != -1 and i > 0:
+                # store the collected sequence under the *previous* header
+                # before starting the next record
+                d[header.replace('\n', '')[4:]] = gene.replace('\n', '')
+                gene = ''
+                header = line
+            elif i == 0:
+                header = line
+        # flush the last record, which has no following header line
+        if header != '':
+            d[header.replace('\n', '')[4:]] = gene.replace('\n', '')
+
+    row_list = []
+
+    for key, value in d.items():
+        split_index = key.find(' ')
+        gene_id = key[:split_index].split('|')[0]
+        rest = key[split_index:]
+        row = {'id': gene_id, 'meta_data': rest, 'sequence': value}
+        row_list.append(row)
+
+    return pd.DataFrame(row_list)
+
+
+def peak_width_preserving_mz_transform(
+        mz: np.array,
+        M0: float = 500,
+        resolution: float = 50_000):
+    """
+    Transform m/z values onto an index scale on which peaks have a constant
+    full width at half maximum.
+    Arguments:
+        mz (np.array): An array of mass to charge ratios to transform.
+        M0 (float): The m/z value at which the TOF's resolution is reported.
+        resolution (float): The resolution of the TOF instrument.
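A quick numeric check of this transform (a sketch; `transform` mirrors the function defined here): at resolution R a peak at m/z M has FWHM M/R, and in the transformed coordinate that width is exactly one index unit at every m/z, since ln(M(1 + 1/R)) - ln(M) = ln(1 + 1/R).

import numpy as np


def transform(mz, M0=500.0, resolution=50_000.0):
    return (np.log(mz) - np.log(M0)) / np.log1p(1.0 / resolution)


R = 50_000.0
for M in (200.0, 500.0, 1500.0):
    width = transform(M * (1.0 + 1.0 / R)) - transform(M)
    print(f"m/z {M:7.1f} -> transformed peak width {width:.6f}")  # always 1.0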
+ """ + return (np.log(mz) - np.log(M0)) / np.log1p(1 / resolution) + + +def bins_to_mz(mz_bin, win_len): + return np.abs(mz_bin) * win_len + (int(mz_bin < 0) * (0.5 * win_len)) + + +def get_ref_pattern_as_spectra(ref_patterns): + """ + Get the reference patterns as a list of spectra. + :param ref_patterns: the reference patterns + :return: a list of spectra + """ + spec_list = [] + + for index, row in ref_patterns.iterrows(): + + if 150 <= (row['m'] + 1) / row['z'] <= 1700: + last_contrib = row['last_contrib'] + + pattern = row['pattern'][:1000] + + mz_bin = np.arange(1000) + + both = list(zip(mz_bin, pattern)) + + both = list(filter(lambda x: x[1] >= 0.001, both)) + + mz = [x[0] for x in both] + i = [x[1] for x in both] + + first_peak = np.where(np.diff(mz) > 1)[0][0] + max_i = np.argmax(i[:first_peak]) + mono_mz_max = mz[max_i] + + mz = (mz - mono_mz_max) / 100 + row['m'] / row['z'] + + mz = np.round(np.array(mz), 2) + 1.0 + i = np.array([int(x) for x in i]) + + spectrum = MzSpectrum(mz, i) + spec_list.append((row['m'], row['z'], spectrum, last_contrib)) + + return spec_list + + +def get_refspec_list(ref_patterns, win_len=5): + """ + + """ + + d_list = [] + + for spec in tqdm(ref_patterns, desc='creating reference patterns', ncols=100): + m = spec[0] + z = spec[1] + sp = spec[2] + lc = spec[3] + peak_width = 0.04 + mz_mono = np.round((m + 1.001) / z, 2) + peak_width + mz_bin = np.floor(mz_mono / win_len) + ref_spec = ReferencePattern(mz_bin=mz_bin, mz_mono=mz_mono, charge=z, mz_last_contrib=lc, + spectrum=sp, window_length=win_len) + + d_list.append((mz_bin, ref_spec)) + + return d_list + + +def create_reference_dict(D): + # create members + keys = set([t[0] for t in D]) + ref_d = dict([(k, []) for k in keys]) + + for b, p in D: + ref_d[b].append(p) + + tmp_dict = {} + # go over all keys + for key, values in ref_d.items(): + + # create charges + r = np.arange(5) + 1 + + # go over all charge states + for c in r: + + # go over all reference pattern + for ca in values: + + # append exactly one charge state per reference bin + if c == ca.charge: + # if key already exists, append + if key in tmp_dict: + tmp_dict[key].append(ca) + break + + # if key does not exist, create new list + else: + tmp_dict[key] = [ca] + break + + return tmp_dict + +@numba.jit(nopython=True) +def normal_pdf(x: ArrayLike, mass: float, s: float = 0.001, inv_sqrt_2pi: float = 0.3989422804014327, normalize: bool = False): + """ + :param inv_sqrt_2pi: + :param x: + :param mass: + :param s: + :return: + """ + a = (x - mass) / s + if normalize: + return np.exp(-0.5 * np.power(a,2)) + else: + return inv_sqrt_2pi / s * np.exp(-0.5 * np.power(a,2)) + + +@numba.jit +def gaussian(x, μ=0, σ=1): + """ + Gaussian function + :param x: + :param μ: + :param σ: + :return: + """ + A = 1 / np.sqrt(2 * np.pi * np.power(σ, 2)) + B = np.exp(- (np.power(x - μ, 2) / 2 * np.power(σ, 2))) + + return A * B + + +@numba.jit +def exp_distribution(x, λ=1): + """ + Exponential function + :param x: + :param λ: + :return: + """ + if x > 0: + return λ * np.exp(-λ * x) + return 0 + + +@numba.jit +def exp_gaussian(x, μ=-3, σ=1, λ=.25): + """ + laplacian distribution with exponential decay + :param x: + :param μ: + :param σ: + :param λ: + :return: + """ + A = λ / 2 * np.exp(λ / 2 * (2 * μ + λ * np.power(σ, 2) - 2 * x)) + B = math.erfc((μ + λ * np.power(σ, 2) - x) / (np.sqrt(2) * σ)) + return A * B + + +class NormalDistribution: + def __init__(self, μ: float, σ: float): + self.μ = μ + self.σ = σ + + def __call__(self, x): + return gaussian(x, self.μ, 
self.σ) + + +class ExponentialGaussianDistribution: + def __init__(self, μ: float = -3, σ: float = 1, λ: float = .25): + self.μ = μ + self.σ = σ + self.λ = λ + + def __call__(self, x): + return exp_gaussian(x, self.μ, self.σ, self.λ) + + +def preprocess_max_quant_evidence(exp: pd.DataFrame) -> pd.DataFrame: + """ + select columns from evidence txt, rename to ionmob naming convention and transform to raw data rt in seconds + Args: + exp: a MaxQuant evidence dataframe from evidence.txt table + Returns: cleaned evidence dataframe, columns renamed to ionmob naming convention + """ + + # select columns + exp = exp[['m/z', 'Mass', 'Charge', 'Modified sequence', 'Retention time', + 'Retention length', 'Ion mobility index', 'Ion mobility length', '1/K0', '1/K0 length', + 'Number of isotopic peaks', 'Max intensity m/z 0', 'Intensity', 'Raw file', 'CCS']].rename( + # rename columns to ionmob naming convention + columns={'m/z': 'mz', 'Mass': 'mass', + 'Charge': 'charge', 'Modified sequence': 'sequence', 'Retention time': 'rt', + 'Retention length': 'rt_length', 'Ion mobility index': 'im', 'Ion mobility length': 'im_length', + '1/K0': 'inv_ion_mob', '1/K0 length': 'inv_ion_mob_length', 'CCS': 'ccs', + 'Number of isotopic peaks': 'num_peaks', 'Max intensity m/z 0': 'mz_max_intensity', + 'Intensity': 'intensity', 'Raw file': 'raw'}).dropna() + + # transform retention time from minutes to seconds as stored in tdf raw data + exp['rt'] = exp.apply(lambda r: r['rt'] * 60, axis=1) + exp['rt_length'] = exp.apply(lambda r: r['rt_length'] * 60, axis=1) + exp['rt_start'] = exp.apply(lambda r: r['rt'] - r['rt_length'] / 2, axis=1) + exp['rt_stop'] = exp.apply(lambda r: r['rt'] + r['rt_length'] / 2, axis=1) + + exp['im_start'] = exp.apply(lambda r: int(np.round(r['im'] - r['im_length'] / 2)), axis=1) + exp['im_stop'] = exp.apply(lambda r: int(np.round(r['im'] + r['im_length'] / 2)), axis=1) + + exp['inv_ion_mob_start'] = exp.apply(lambda r: r['inv_ion_mob'] - r['inv_ion_mob_length'] / 2, axis=1) + exp['inv_ion_mob_stop'] = exp.apply(lambda r: r['inv_ion_mob'] + r['inv_ion_mob_length'] / 2, axis=1) + + # remove duplicates + exp = exp.drop_duplicates(['sequence', 'charge', 'rt', 'ccs']) + + return exp + +def max_quant_to_proforma(s:str, old_annotation:bool=False): + """ + Convert MaxQuant amino acid sequences (with PTM) to + proForma format. 
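Example inputs and outputs for this converter (the sequences are invented for illustration and assume the `max_quant_to_proforma` defined here is in scope):

# new-style MaxQuant annotation
print(max_quant_to_proforma("_AM(Oxidation (M))DDEK_"))
# -> _AM[UNIMOD:35]DDEK_

# old-style annotation, including an N-terminal acetylation
print(max_quant_to_proforma("_(ac)AM(ox)DDEK_", old_annotation=True))
# -> _[UNIMOD:1]AM[UNIMOD:35]DDEK_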
+ + :param s: amino acid sequence + :type s: str + :param old_annotation: Wether old annotation is used in `s`, defaults to False + :type old_annotation: bool, optional + """ + seq = s.strip("_") + + proforma_seq = "" + if old_annotation: + proforma_seq = (seq + .replace("(ox)", "[UNIMOD:35]") + .replace("(ac)", "[UNIMOD:1]") + ) + else: + proforma_seq = (seq + .replace("(Oxidation (M))", "[UNIMOD:35]") + .replace("(Phospho (STY))", "[UNIMOD:21]") + .replace("(Acetyl (Protein N-term))", "[UNIMOD:1]") + ) + return f"_{proforma_seq}_" + +def preprocess_max_quant_sequence(s, old_annotation=False): + """ + :param s: + :param old_annotation: + """ + + seq = s[1:-1] + + is_acc = False + + if old_annotation: + seq = seq.replace('(ox)', '$') + + if seq.find('(ac)') != -1: + is_acc = True + seq = seq.replace('(ac)', '') + + else: + seq = seq.replace('(Oxidation (M))', '$') + seq = seq.replace('(Phospho (STY))', '&') + + if seq.find('(Acetyl (Protein N-term))') != -1: + is_acc = True + seq = seq.replace('(Acetyl (Protein N-term))', '') + + # form list from string + slist = list(seq) + + tmp_list = [] + + for item in slist: + if item == '$': + tmp_list.append('') + + elif item == '&': + tmp_list.append('') + + else: + tmp_list.append(item) + + slist = tmp_list + + r_list = [] + + for i, char in enumerate(slist): + + if char == '': + C = slist[i - 1] + C = C + '-' + r_list = r_list[:-1] + r_list.append(C) + + elif char == 'C': + r_list.append('C-') + + elif char == '': + C = slist[i - 1] + C = C + '-' + r_list = r_list[:-1] + r_list.append(C) + + else: + r_list.append(char) + + if is_acc: + return ['-'] + r_list + [''] + + return [''] + r_list + [''] + +def is_unimod_start(char:str): + """ + Tests if char is start of unimod + bracket + + :param char: Character of a proForma formatted aa sequence + :type char: str + :return: Wether char is start of unimod bracket + :rtype: bool + """ + if char in ["(","[","{"]: + return True + else: + return False + +def is_unimod_end(char:str): + """ + Tests if char is end of unimod + bracket + + :param char: Character of a proForma formatted aa sequence + :type char: str + :return: Wether char is end of unimod bracket + :rtype: bool + """ + if char in [")","]","}"]: + return True + else: + return False + +def tokenize_proforma_sequence(sequence: str): + """ + Tokenize a ProForma formatted sequence string. 
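For orientation, a sketch of what the tokenizer produces. Note that the start/end sentinel strings in `token_list` appear to have been stripped to empty strings in this rendering of the patch, so they show up as '' below; whatever the original sentinels were, the per-residue splitting is unaffected:

# assumes tokenize_proforma_sequence as defined here is in scope
tokens = tokenize_proforma_sequence("AM[UNIMOD:35]DEK")
# -> ['', 'A', 'M[UNIMOD:35]', 'D', 'E', 'K', '']
# a modified residue stays fused with its UNIMOD tag as a single token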
+ + :param sequence: Sequence string (ProForma formatted) + :type sequence: str + :return: List of tokens + :rtype: List + """ + sequence = sequence.upper().replace("(","[").replace(")","]") + token_list = [""] + in_unimod_bracket = False + tmp_token = "" + + for aa in sequence: + if is_unimod_start(aa): + in_unimod_bracket = True + if in_unimod_bracket: + if is_unimod_end(aa): + in_unimod_bracket = False + tmp_token += aa + continue + if tmp_token != "": + token_list.append(tmp_token) + tmp_token = "" + tmp_token += aa + + if tmp_token != "": + token_list.append(tmp_token) + + if len(token_list) > 1: + if token_list[1].find("UNIMOD:1") != -1: + token_list[1] = ""+token_list[1] + token_list = token_list[1:] + token_list.append("") + + return token_list + +def get_aa_num_proforma_sequence(sequence:str): + """ + get number of amino acids in sequence + + :param sequence: proforma formatted aa sequence + :type sequence: str + :return: Number of amino acids + :rtype: int + """ + num_aa = 0 + inside_bracket = False + + for aa in sequence: + if is_unimod_start(aa): + inside_bracket = True + if inside_bracket: + if is_unimod_end(aa): + inside_bracket = False + continue + num_aa += 1 + return num_aa + + + +def tokenizer_to_json(tokenizer: tf.keras.preprocessing.text.Tokenizer, path: str): + """ + save a fit keras tokenizer to json for later use + :param tokenizer: fit keras tokenizer to save + :param path: path to save json to + """ + tokenizer_json = tokenizer.to_json() + with io.open(path, 'w', encoding='utf-8') as f: + f.write(json.dumps(tokenizer_json, ensure_ascii=False)) + + +def tokenizer_from_json(path: str): + """ + load a pre-fit tokenizer from a json file + :param path: path to tokenizer as json file + :return: a keras tokenizer loaded from json + """ + with open(path) as f: + data = json.load(f) + return tf.keras.preprocessing.text.tokenizer_from_json(data) + diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml index e672cf12..f95a7f9d 100644 --- a/imspy/pyproject.toml +++ b/imspy/pyproject.toml @@ -11,9 +11,13 @@ pandas = ">=2.1" numpy = ">=1.21" tensorflow = ">=2.14" tensorflow-probability = ">=0.22.1" + imspy-connector = ">=0.2.7" tqdm = ">=4.62.0" scipy = ">=1.7.1" +tqdm = ">=4.66" +pyarrow =">=13.0" +mendeleev = ">=0.7.0" [build-system] requires = ["poetry-core"] diff --git a/imspy_connector/src/py_mz_spectrum.rs b/imspy_connector/src/py_mz_spectrum.rs index 46e01577..4efc8ec8 100644 --- a/imspy_connector/src/py_mz_spectrum.rs +++ b/imspy_connector/src/py_mz_spectrum.rs @@ -25,11 +25,13 @@ impl PyMsType { } #[pyclass] +#[derive(Clone)] pub struct PyMzSpectrum { pub inner: MzSpectrum, } #[pymethods] + impl PyMzSpectrum { #[new] pub unsafe fn new(mz: &PyArray1, intensity: &PyArray1) -> PyResult { @@ -37,9 +39,27 @@ impl PyMzSpectrum { inner: MzSpectrum { mz: mz.as_slice()?.to_vec(), intensity: intensity.as_slice()?.to_vec(), - }, + } }) } + #[staticmethod] + pub unsafe fn from_mzspectra_list(list: Vec, resolution: i32) -> PyResult { + if list.is_empty(){ + Ok(PyMzSpectrum { + inner: MzSpectrum { + mz: Vec::new(), + intensity: Vec::new(), + } + }) + } + else { + let mut convoluted: MzSpectrum = MzSpectrum { mz: vec![], intensity: vec![] }; + for spectrum in list { + convoluted = convoluted + spectrum.inner; + } + Ok(PyMzSpectrum { inner: convoluted.to_resolution(resolution) }) + } + } #[getter] pub fn mz(&self, py: Python) -> Py> { @@ -86,6 +106,12 @@ impl PyMzSpectrum { }; Ok(py_filtered) } + pub fn __add__(&self, other: PyMzSpectrum) -> PyResult { + Ok(PyMzSpectrum { inner: 
(self.inner.clone() + other.inner) })
+    }
+    pub fn __mul__(&self, scale: f64) -> PyResult<Self> {
+        Ok(PyMzSpectrum { inner: (self.inner.clone() * scale) })
+    }
 }
 
 #[pyclass]
@@ -107,6 +133,10 @@ impl PyMzSpectrumVectorized {
         })
     }
 
+    pub fn to_dense_spectrum(&self, max_index: Option<usize>) -> PyMzSpectrumVectorized {
+        PyMzSpectrumVectorized { inner: self.inner.to_dense_spectrum(max_index) }
+    }
+
     #[getter]
     pub fn resolution(&self) -> i32 {
         self.inner.resolution
@@ -242,4 +272,8 @@ impl PyTimsSpectrum {
     pub fn mz_spectrum(&self) -> PyMzSpectrum {
         PyMzSpectrum { inner: self.inner.spectrum.mz_spectrum.clone() }
     }
+
+    pub fn to_resolution(&self, resolution: i32) -> Self {
+        PyTimsSpectrum { inner: self.inner.to_resolution(resolution) }
+    }
 }
diff --git a/mscore/src/mz_spectrum.rs b/mscore/src/mz_spectrum.rs
index 88daf84f..1f2728bb 100644
--- a/mscore/src/mz_spectrum.rs
+++ b/mscore/src/mz_spectrum.rs
@@ -293,6 +293,17 @@ impl std::ops::Add for MzSpectrum {
     }
 }
 
+impl std::ops::Mul<f64> for MzSpectrum {
+    type Output = Self;
+    fn mul(self, scale: f64) -> Self::Output {
+        let mut scaled_intensities: Vec<f64> = vec![0.0; self.intensity.len()];
+        for (idx, intensity) in self.intensity.iter().enumerate() {
+            scaled_intensities[idx] = scale * intensity;
+        }
+        Self { mz: self.mz.clone(), intensity: scaled_intensities }
+    }
+}
+
 /// Represents a mass spectrum with associated m/z indices, m/z values, and intensities
 #[derive(Clone)]
 pub struct IndexedMzSpectrum {
@@ -559,15 +570,36 @@ impl MzSpectrumVectorized {
     /// # Arguments
     ///
     /// * `max_index` - The maximum index for the dense vector.
-    pub fn to_dense(&self, max_index: usize) -> DVector<f64> {
-        let mut dense = DVector::zeros(max_index + 1);
+
+    fn get_max_index(&self) -> usize {
+        let base: i32 = 10;
+        let max_mz: i32 = 2000;
+        let max_index: usize = (max_mz * base.pow(self.resolution as u32)) as usize;
+        max_index
+    }
+
+    pub fn to_dense(&self, max_index: Option<usize>) -> DVector<f64> {
+        let max_index = match max_index {
+            Some(max_index) => max_index,
+            None => self.get_max_index(),
+        };
+        let mut dense_intensities: DVector<f64> = DVector::<f64>::zeros(max_index + 1);
         for (&index, &intensity) in self.indices.iter().zip(self.values.iter()) {
             if (index as usize) <= max_index {
-                dense[index as usize] = intensity;
+                dense_intensities[index as usize] = intensity;
             }
         }
-        dense
+        dense_intensities
+    }
+
+    pub fn to_dense_spectrum(&self, max_index: Option<usize>) -> MzSpectrumVectorized {
+        let max_index = match max_index {
+            Some(max_index) => max_index,
+            None => self.get_max_index(),
+        };
+        let dense_intensities: Vec<f64> = self.to_dense(Some(max_index)).data.into();
+        let dense_indices: Vec<i32> = (0..=max_index).map(|i| i as i32).collect();
+        let dense_spectrum: MzSpectrumVectorized = MzSpectrumVectorized { resolution: self.resolution, indices: dense_indices, values: dense_intensities };
+        dense_spectrum
     }
 }
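Hypothetical Python-side usage of the bindings added in this patch. The import path and positional constructor call are assumptions inferred from the #[pyclass]/#[new] definitions above, not documented API:

import numpy as np
from imspy_connector import PyMzSpectrum  # assumed import path

a = PyMzSpectrum(np.array([100.0, 200.0]), np.array([10.0, 20.0]))
b = PyMzSpectrum(np.array([100.0, 300.0]), np.array([5.0, 7.0]))

summed = a + b         # __add__: peak-wise sum of the two spectra
scaled = summed * 0.5  # __mul__: scale every intensity by a constant
# sum a list of spectra and re-bin the result at a given decimal resolution
combined = PyMzSpectrum.from_mzspectra_list([a, b], resolution=2)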