Merge pull request #201 from theGreatHerrLebert/david@simulation

David@simulation
theGreatHerrLebert · May 16, 2024 · 04f9ddb · 04f9ddb
2 parents 001d492 + 992f70d
commit 04f9ddb
Show file tree

Hide file tree

Showing 8 changed files with 765 additions and 9 deletions.
diff --git a/imspy/imspy/algorithm/intensity/predictors.py b/imspy/imspy/algorithm/intensity/predictors.py
@@ -130,6 +130,49 @@ def simulate_ion_intensities_pandas(self, data: pd.DataFrame, batch_size: int =
 
         return data
 
+    def predict_intensities(
+            self,
+            sequences: List[str],
+            charges: List[int],
+            collision_energies: List[float],
+            divide_collision_energy_by: float = 1e2,
+            batch_size: int = 512,
+            flatten: bool = False,
+    ) -> NDArray:
+        """Predict fragment ion intensities for a list of peptides.
+
+        Args:
+            sequences: Peptide sequences. They are passed through ``remove_unimod_annotation``
+                before being fed to the model; the original strings are handed to post-processing.
+            charges: One precursor charge per sequence.
+            collision_energies: One collision energy per sequence.
+            divide_collision_energy_by: Divisor applied to every collision energy before it is
+                given to the model (the un-divided values are handed to post-processing).
+            batch_size: Number of peptides per model call.
+            flatten: If True, every post-processed prediction is passed through
+                ``flatten_prosit_array`` and the results are stacked row-wise.
+
+        Returns:
+            The post-processed predictions as a NumPy array (the output of ``np.squeeze``, or of
+            ``np.vstack`` when ``flatten`` is True).
+        """
+        sequences_unmod = [remove_unimod_annotation(s) for s in sequences]
+        sequence_length = [len(s) for s in sequences_unmod]
+        collision_energies_norm = [ce / divide_collision_energy_by for ce in collision_energies]
+
+        tf_ds = generate_prosit_intensity_prediction_dataset(
+            sequences_unmod,
+            charges,
+            np.expand_dims(collision_energies_norm, 1)).batch(batch_size)
+
+        ds_unpacked = tf_ds.map(unpack_dict)
+
+        # Ceiling division: the exact number of batches the dataset yields. The former
+        # `len // batch_size + 1` over-counted by one for inputs that fill the last batch.
+        n_batches = (len(sequences) + batch_size - 1) // batch_size
+
+        intensity_predictions = []
+        for peptides_in, precursor_charge_in, collision_energy_in in tqdm(ds_unpacked, desc='Predicting intensities',
+                                                                          total=n_batches,
+                                                                          ncols=100,
+                                                                          disable=not self.verbose):
+            model_input = [peptides_in, precursor_charge_in, collision_energy_in]
+            model_output = self.model(model_input).numpy()
+            intensity_predictions.append(model_output)
+
+        # One raw prediction array per peptide, so each can live in a DataFrame cell.
+        I_pred = list(np.vstack(intensity_predictions))
+        # NOTE(review): np.squeeze removes every length-1 axis; with a single input peptide this
+        # may also drop the leading (per-peptide) axis -- confirm against reshape_dims' output.
+        I_pred = np.squeeze(reshape_dims(post_process_predicted_fragment_spectra(pd.DataFrame({
+            'sequence': sequences,
+            'charge': charges,
+            'collision_energy': collision_energies,
+            'sequence_length': sequence_length,
+            'intensity_raw': I_pred,
+        }))))
+
+        if flatten:
+            I_pred = np.vstack([flatten_prosit_array(r) for r in I_pred])
+
+        return I_pred
+
     def simulate_ion_intensities(
             self,
             sequences: List[str],

diff --git a/imspy/imspy/algorithm/rt/predictors.py b/imspy/imspy/algorithm/rt/predictors.py
@@ -101,9 +101,18 @@ def simulate_separation_times(self, sequences: list[str], batch_size: int = 1024
 
         return self.model.predict(tf_ds, verbose=self.verbose)
 
+    def fit_model(self, data: pd.DataFrame, epochs: int = 10, batch_size: int = 1024, re_compile=False):
+        """Train ``self.model`` on the sequences and observed retention times in ``data``.
+
+        Args:
+            data: Frame with a ``sequence`` and a ``retention_time_observed`` column.
+            epochs: Number of epochs passed to ``self.model.fit``.
+            batch_size: Batch size of the training dataset.
+            re_compile: If truthy, the model is compiled again with the ``adam`` optimizer and
+                ``mean_squared_error`` loss before fitting.
+
+        Note:
+            The column checks are ``assert`` statements, so they are skipped when Python runs
+            with ``-O``.
+        """
+        available = data.columns
+        assert 'sequence' in available, 'Data must contain a column named "sequence"'
+        assert 'retention_time_observed' in available, 'Data must contain a column named "retention_time_observed"'
+
+        encoded = self._preprocess_sequences(data.sequence.values)
+        targets = data.retention_time_observed.values
+
+        # Shuffle buffer spans the whole frame, then the pairs are grouped into batches.
+        n_rows = len(data)
+        train_ds = tf.data.Dataset.from_tensor_slices((encoded, targets))
+        train_ds = train_ds.shuffle(n_rows).batch(batch_size)
+
+        if re_compile:
+            self.model.compile(optimizer='adam', loss='mean_squared_error')
+
+        self.model.fit(train_ds, epochs=epochs, verbose=self.verbose)
+
     def simulate_separation_times_pandas(self, data: pd.DataFrame,
-                                         gradient_length: float,
-                                         batch_size: int = 1024) -> pd.DataFrame:
+                                         gradient_length: float, batch_size: int = 1024) -> pd.DataFrame:
         tokens = self._preprocess_sequences(data.sequence.values)
         tf_ds = tf.data.Dataset.from_tensor_slices(tokens).batch(batch_size)
 

diff --git a/imspy/imspy/timstof/dbsearch/__init__.py b/imspy/imspy/timstof/dbsearch/__init__.py