prevent circular import and fix tokenizer error
TimOliverMaier committed Oct 27, 2023
1 parent c0ba7d9 commit 85fbd8d
Showing 8 changed files with 31 additions and 15 deletions.
2 changes: 1 addition & 1 deletion pyims/examples/simulation/run_example_simulation.py
@@ -89,7 +89,7 @@ def build_experiment():


# to reduce computational load in example
sample_digest.data = sample_digest.data.sample(100000, random_state= rng)
sample_digest.data = sample_digest.data.sample(100, random_state= rng)


t.load_sample(sample_digest)
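Note: the example script now draws 100 rows from the digest instead of 100,000, which keeps the example simulation fast. A minimal sketch of the same seeded-downsampling pattern, using a hypothetical DataFrame in place of sample_digest.data:

import numpy as np
import pandas as pd

# Hypothetical stand-in for sample_digest.data from the example script.
data = pd.DataFrame({
    "sequence": [f"PEPTIDE{i}" for i in range(1_000)],
    "mass": np.linspace(500.0, 3000.0, 1_000),
})

rng = np.random.default_rng(42)              # seeded generator, like `rng` in the example
subset = data.sample(100, random_state=rng)  # reproducible 100-row subsample
print(subset.shape)                          # (100, 2)
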
2 changes: 1 addition & 1 deletion pyims/pyims/data/frame.py
@@ -5,7 +5,7 @@

import numpy as np
import pyims_connector as pims
from pyims.spectrum import MzSpectrum, TimsSpectrum
from pyims.data.spectrum import MzSpectrum, TimsSpectrum


class TimsFrame:
3 changes: 2 additions & 1 deletion pyims/pyims/data/handle.py
@@ -10,7 +10,8 @@

from abc import ABC

from pyims.data import TimsFrame, TimsSlice
from pyims.data.frame import TimsFrame
from pyims.data.slice import TimsSlice



3 changes: 2 additions & 1 deletion pyims/pyims/data/slice.py
@@ -3,7 +3,8 @@
from typing import List

import pyims_connector as pims
from pyims.data import TimsFrame, MzSpectrum
from pyims.data.frame import TimsFrame
from pyims.data.spectrum import MzSpectrum


class TimsSlice:
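Note: the import edits in frame.py, handle.py, and slice.py address two related problems. frame.py imported from pyims.spectrum, apparently a stale path for what is now pyims.data.spectrum, while handle.py and slice.py imported through the pyims.data package itself, which re-runs the package __init__ and can close an import cycle. A sketch of the failure mode and the fix, assuming a pyims/data/__init__.py that re-exports the submodules (its contents are not part of this diff):

# --- pyims/data/__init__.py (assumed re-exports, not shown in this commit) ---
from pyims.data.spectrum import MzSpectrum, TimsSpectrum
from pyims.data.frame import TimsFrame
from pyims.data.slice import TimsSlice

# --- pyims/data/slice.py, before the fix ---
# Importing names from the package runs pyims/data/__init__.py, which in turn
# imports slice.py again while it is only partially initialized. Depending on
# the entry point, this fails with an ImportError mentioning a "partially
# initialized module ... (most likely due to a circular import)".
from pyims.data import TimsFrame, MzSpectrum

# --- pyims/data/slice.py, after the fix ---
# Binding directly to the defining submodules never re-enters __init__.py.
from pyims.data.frame import TimsFrame
from pyims.data.spectrum import MzSpectrum
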
22 changes: 17 additions & 5 deletions pyims/pyims/data/spectrum.py
@@ -1,6 +1,6 @@
import numpy as np
from typing import List, Tuple

from __future__ import annotations
import pandas as pd
from numpy.typing import NDArray

@@ -67,7 +67,7 @@ def __repr__(self):
return f"MzSpectrum(num_peaks={len(self.mz)})"

def to_windows(self, window_length: float = 10, overlapping: bool = True, min_num_peaks: int = 5,
min_intensity: float = 1) -> Tuple[NDArray, List['MzSpectrum']]:
min_intensity: float = 1) -> Tuple[NDArray, List[MzSpectrum]]:
"""Convert the spectrum to a list of windows.
Args:
@@ -83,8 +83,20 @@ def to_windows(self, window_length: float = 10, overlapping: bool = True, min_nu
indices, windows = self.__spec_ptr.to_windows(window_length, overlapping, min_num_peaks, min_intensity)
return indices, [MzSpectrum.from_py_mz_spectrum(window) for window in windows]

def to_resolution(self, resolution: int) -> MzSpectrum:
"""Bins the spectrum's m/z values to a
given resolution and sums the intensities.
Args:
resolution (int): Negative decadic logarithm of bin size.
Returns:
MzSpectrum: A new `MzSpectrum` where m/z values are binned according to the given resolution.
"""
return self.__spec_ptr.to_resolution(resolution)

def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: float = 0.0,
intensity_max: float = 1e9) -> 'MzSpectrum':
intensity_max: float = 1e9) -> MzSpectrum:
"""Filter the spectrum for a given m/z range and intensity range.
Args:
@@ -99,7 +111,7 @@ def filter(self, mz_min: float = 0.0, mz_max: float = 2000.0, intensity_min: flo
return MzSpectrum.from_py_mz_spectrum(
self.__spec_ptr.filter_ranged(mz_min, mz_max, intensity_min, intensity_max))

def vectorized(self, resolution: int = 2) -> 'MzSpectrumVectorized':
def vectorized(self, resolution: int = 2) -> MzSpectrumVectorized:
"""Convert the spectrum to a vectorized spectrum.
Args:
@@ -145,7 +157,7 @@ def from_py_mz_spectrum_vectorized(cls, spec: pims.PyMzSpectrumVectorized):
@property
def resolution(self) -> float:
"""Resolution.
Returns:
float: Resolution.
"""
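Note: two things change in spectrum.py. from __future__ import annotations postpones annotation evaluation, so the quoted forward references ('MzSpectrum', 'MzSpectrumVectorized') can be written without quotes, and the new to_resolution method bins m/z values at a given resolution (the negative decadic logarithm of the bin width, so resolution 2 means 0.01-wide bins) and sums the intensities per bin. The real method delegates to pyims_connector; the numpy sketch below only illustrates the assumed binning semantics with a toy class, not the pyims implementation:

from __future__ import annotations  # must precede all other statements in the module

import numpy as np

class ToySpectrum:
    """Toy stand-in for MzSpectrum, used only to illustrate the binning."""

    def __init__(self, mz: np.ndarray, intensity: np.ndarray):
        self.mz = mz
        self.intensity = intensity

    # With postponed evaluation, the return annotation needs no quotes even
    # though ToySpectrum is still being defined at this point.
    def to_resolution(self, resolution: int) -> ToySpectrum:
        binned = np.round(self.mz, resolution)      # bin width is 10**-resolution
        unique_mz, inverse = np.unique(binned, return_inverse=True)
        summed = np.zeros(unique_mz.shape)
        np.add.at(summed, inverse, self.intensity)  # sum intensities per bin
        return ToySpectrum(unique_mz, summed)

spec = ToySpectrum(np.array([500.001, 500.004, 600.02]), np.array([10.0, 5.0, 1.0]))
binned = spec.to_resolution(2)
print(binned.mz, binned.intensity)  # two bins: ~500.00 carries 15.0, ~600.02 carries 1.0
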
2 changes: 1 addition & 1 deletion pyims/pyims/feature.py
@@ -4,7 +4,7 @@

import json

from pyims.data import TimsSlice, TimsFrame, MzSpectrum
from pyims.data import TimsSlice, TimsFrame
from pyims.utility import gaussian, exp_gaussian
from pyims.isotopes import IsotopePatternGenerator, create_initial_feature_distribution
from abc import ABC, abstractmethod
8 changes: 4 additions & 4 deletions pyims/pyims/simulation/hardware_models.py
@@ -267,7 +267,7 @@ def __init__(self, model_path: str, tokenizer_path: str):

def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array:
print('tokenizing sequences...')
tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized)
tokens = self.tokenizer.texts_to_sequences(sequences_tokenized)
tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
return tokens_padded

@@ -286,7 +286,7 @@ def _worker(model_path: str, tokens_padded: np.array, batched: bool = True, bs:
def simulate(self, sample: ProteomicsExperimentSampleSlice, device: Chromatography) -> NDArray[np.float64]:

data = sample.peptides
tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized)
tokens = data["sequence_tokenized"].apply(lambda st: st.sequence_tokenized).to_numpy()
print('generating tf dataset...')
tokens_padded = self.sequences_to_tokens(tokens)

@@ -632,7 +632,7 @@ def __init__(self, model_path: str, tokenizer_path: str):

def sequences_to_tokens(self, sequences_tokenized: np.array) -> np.array:
print('tokenizing sequences...')
tokens = np.apply_along_axis(self.tokenizer.texts_to_sequences, 0, sequences_tokenized)
tokens = self.tokenizer.texts_to_sequences(sequences_tokenized)
tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
return tokens_padded

@@ -654,7 +654,7 @@ def _worker(model_path: str, tokens_padded: np.array, mz: np.array, charges: np.

def simulate(self, sample: ProteomicsExperimentSampleSlice, device: IonMobilitySeparation) -> Tuple[NDArray]:
data = sample.ions.merge(sample.peptides.loc[:,["pep_id","sequence_tokenized"]],on="pep_id",validate="many_to_one")
tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized)
tokens = data.sequence_tokenized.apply(lambda st: st.sequence_tokenized).to_numpy()
tokens_padded = self.sequences_to_tokens(tokens)
mz = data['mz'].values
charges = data["charge"].values
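Note: the tokenizer fix is identical in both hardware models. np.apply_along_axis tried to funnel the ragged output of texts_to_sequences back into a rectangular array, which fails for variable-length peptides and is presumably the tokenizer error named in the commit message. The Keras Tokenizer already accepts a sequence of texts and returns one integer list per entry, so converting the pandas Series with .to_numpy() and calling texts_to_sequences once is enough before padding to length 50. A minimal sketch with a freshly fitted tokenizer and made-up token strings (the committed code loads a pre-trained tokenizer from tokenizer_path instead):

import numpy as np
import tensorflow as tf

# Made-up whitespace-separated token strings standing in for st.sequence_tokenized.
sequences = np.array(["<START> P E P T I D E <END>", "<START> A C D E F K <END>"], dtype=object)

tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="", lower=False)
tokenizer.fit_on_texts(sequences.tolist())        # the committed code loads a saved tokenizer instead

tokens = tokenizer.texts_to_sequences(sequences)  # one list of ints per sequence
tokens_padded = tf.keras.preprocessing.sequence.pad_sequences(tokens, 50, padding='post')
print(tokens_padded.shape)                        # (2, 50)
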
4 changes: 3 additions & 1 deletion pyims/pyproject.toml
@@ -8,12 +8,14 @@ readme = "README.md"
[tool.poetry.dependencies]
python = ">=3.10, <3.12"
pandas = ">=2.1"
numpy = ">=1.21"
numpy = ">=1.21, <1.25"
mendeleev = ">=0.14"
pyopenms = ">=3.1"
scipy = ">=1.11.2"
tqdm = ">=4.66"
pyarrow =">=13.0"
tensorflow = ">=2.14"
numba = ">=0.57"

[build-system]
requires = ["poetry-core"]
