theGreatHerrLebert · theGreatHerrLebert · Nov 16, 2023 · Nov 15, 2023 · Nov 15, 2023 · Nov 15, 2023
diff --git a/imspy/imspy/algorithm/hashing.py b/imspy/imspy/algorithm/hashing.py
@@ -1,22 +1,10 @@
 import tensorflow as tf
 import numpy as np
-import pandas as pd
 import warnings
 
 
-class TimsHasher:
-    """
-    Class to create hash keys from a given set of weights.
-
-    Args:
-
-        trials (int): number of trials to use for random projection.
-        len_trial (int): length of each trial.
-        seed (int): seed for random projection.
-        resolution (int): resolution of the random projection.
-        num_dalton (int): number of dalton to use for random projection.
-    """
-    def __init__(self, trials=32, len_trial=20, seed=5671, resolution=1, num_dalton=10):
+class CosimHasher:
+    def __init__(self, target_vector_length, trials: int = 32, len_trial: int = 20, seed: int = 42):
 
         assert 0 < trials, f'trials variable needs to be greater then 1, was: {trials}'
         assert 0 < len_trial, f'length trial variable needs to be greater then 1, was: {trials}'
@@ -31,42 +19,62 @@ def __init__(self, trials=32, len_trial=20, seed=5671, resolution=1, num_dalton=
                           f"using int64 which might slow down computation significantly.")
             self.V = tf.constant(
                 np.expand_dims(np.array([np.power(2, i) for i in range(len_trial)]).astype(np.int64), 1))
-
         else:
             raise ValueError(f"bit number per hash cannot be greater then 64 or smaller 1, was: {len_trial}.")
 
-        self.trails = trials
+        self.trials = trials
         self.len_trial = len_trial
         self.seed = seed
-        self.resolution = resolution
-        self.num_dalton = num_dalton
+        self.target_vector_length = target_vector_length
 
         np.random.seed(seed)
-        res_factor = int(np.power(10, resolution))
-        size = (len_trial * trials, num_dalton * res_factor + 1)
+        size = (len_trial * self.trials, target_vector_length)
         X = np.random.normal(0, 1, size=size).astype(np.float32)
         self.hash_tensor = tf.transpose(tf.constant(X))
 
-    def __repr__(self):
-        return f"TimsHasher(trials={self.trails}, len_trial={self.len_trial}, seed={self.seed}, " \
-               f"resolution={self.resolution}, num_dalton={self.num_dalton})"
+    def __repr__(self) -> str:
+        return f"CosimHasher(trials={self.trials}, len_trial={self.len_trial}, seed={self.seed}, " \
+               f"target_vector_length={self.target_vector_length})"
 
-    # create keys by random projection
-    def calculate_keys(self, W: tf.Tensor):
+    def calculate_keys(self, W: tf.Tensor) -> tf.Tensor:
 
         S = (tf.sign(W @ self.hash_tensor) + 1) / 2
 
         if self.len_trial <= 32:
             # reshape into window, num_hashes, len_single_hash
-            S = tf.cast(tf.reshape(S, shape=(S.shape[0], self.trails, self.len_trial)), tf.int32)
+            S = tf.cast(tf.reshape(S, shape=(S.shape[0], self.trials, self.len_trial)), tf.int32)
 
             # calculate int key from binary by base-transform
             H = tf.squeeze(S @ self.V)
             return H
         else:
             # reshape into window, num_hashes, len_single_hash
-            S = tf.cast(tf.reshape(S, shape=(S.shape[0], self.trails, self.len_trial)), tf.int64)
+            S = tf.cast(tf.reshape(S, shape=(S.shape[0], self.trials, self.len_trial)), tf.int64)
 
             # calculate int key from binary by base-transform
             H = tf.squeeze(S @ self.V)
             return H
+
+
+class TimsHasher(CosimHasher):
+    """
+    Class to create hash keys from a given set of weights.
+
+    Args:
+
+        trials (int): number of trials to use for random projection.
+        len_trial (int): length of each trial.
+        seed (int): seed for random projection.
+        resolution (int): resolution of the random projection.
+        num_dalton (int): number of dalton to use for random projection.
+    """
+    def __init__(self, trials: int = 32, len_trial: int = 20, seed: int = 5671, resolution: int = 1, num_dalton: int = 10):
+        res_factor = 10 ** resolution
+        target_vector_length = num_dalton * res_factor + 1
+        super().__init__(target_vector_length, trials, len_trial, seed)
+        self.resolution = resolution
+        self.num_dalton = num_dalton
+
+    def __repr__(self) -> str:
+        return f"TimsHasher(trials={self.trials}, len_trial={self.len_trial}, seed={self.seed}, " \
+               f"resolution={self.resolution}, num_dalton={self.num_dalton})"
diff --git a/imspy/imspy/core/frame.py b/imspy/imspy/core/frame.py
@@ -237,6 +237,15 @@ def from_windows(cls, windows: List[TimsSpectrum]) -> 'TimsFrame':
             [spec.get_spec_ptr() for spec in windows]
         ))
 
+    def to_dense_windows(self, window_length: float = 10, resolution: int = 1, overlapping: bool = True,
+                         min_num_peaks: int = 5, min_intensity: float = 0.0) -> NDArray[np.float64]:
+
+        rows, cols, values, scans, window_indices = self.__frame_ptr.to_dense_windows(window_length, resolution,
+                                                                                      overlapping, min_num_peaks,
+                                                                                      min_intensity)
+
+        return scans, window_indices, np.reshape(values, (rows, cols))
+
     def get_fragment_ptr(self):
         return self.__frame_ptr
 

diff --git a/imspy/imspy/core/slice.py b/imspy/imspy/core/slice.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-from typing import List
+from typing import List, Tuple, Any
 
 from numpy.typing import NDArray
 from tensorflow import sparse as sp
@@ -147,6 +147,23 @@ def to_windows(self, window_length: float = 10, overlapping: bool = True, min_nu
         return [TimsSpectrum.from_py_tims_spectrum(spec) for spec in self.__slice_ptr.to_windows(
             window_length, overlapping, min_num_peaks, min_intensity, num_threads)]
 
+    def to_dense_windows(self, window_length: float = 10, resolution: int = 1, overlapping: bool = True,
+                         min_num_peaks: int = 5, min_intensity: float = 0.0, num_theads: int = 4) -> (
+            tuple)[list[NDArray], list[NDArray], list[NDArray]]:
+
+        DW = self.__slice_ptr.to_dense_windows(window_length, overlapping, min_num_peaks, min_intensity, resolution,
+                                               num_theads)
+
+        scan_list, window_indices_list, values_list = [], [], []
+
+        for values, scans, bins, row, col in DW:
+            W = np.reshape(values, (row, col))
+            scan_list.append(scans)
+            window_indices_list.append(bins)
+            values_list.append(W)
+
+        return scan_list, window_indices_list, values_list
+
     @property
     def df(self) -> pd.DataFrame:
         """Get the data as a pandas DataFrame.

diff --git a/imspy/pyproject.toml b/imspy/pyproject.toml
@@ -12,6 +12,7 @@ numpy = ">=1.21"
 tensorflow = ">=2.14"
 tensorflow-probability = ">=0.22.1"
 imspy-connector = ">=0.2.5"
+tqdm = ">=4.62.0"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/imspy/tests/test_s_curve.py b/imspy/tests/test_s_curve.py
@@ -0,0 +1,106 @@
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from scipy.stats import binom
+from tqdm import tqdm
+
+from imspy.algorithm.hashing import TimsHasher
+
+np.random.seed(0)
+
+
+def get_signal_noise(sigma, n_windows, n_bins):
+    # generate F and F_prime
+    F = np.random.randn(n_windows, n_bins)
+
+    noise = np.random.randn(1, n_bins) * sigma
+    F_prime = F + noise
+
+    signal = np.zeros((n_windows, n_bins))
+    noise = np.zeros_like(signal)
+
+    for i in np.arange(n_windows):
+        mat_i = get_random_rotation(n_bins)
+
+        new_sig = mat_i @ F[0, :]
+        signal[i, :] = new_sig
+
+        new_noise = mat_i @ F_prime[0, :]
+        noise[i, :] = new_noise
+
+    return signal, noise
+
+
+def get_p_estimate(ors, ands, F, F_prime, seed):
+    hasher = TimsHasher(trials=ors, len_trial=ands, seed=seed, num_dalton=10, resolution=1)
+    H = hasher.calculate_keys(F)
+    H_p = hasher.calculate_keys(F_prime)
+    return H[np.any((H == H_p).numpy(), axis=1)].shape[0] / H.shape[0], H, H_p
+
+
+def and_or(p, n, m):
+    # n times AND
+    # m times OR
+    return 1.0 - np.power((1.0 - np.power(p, n)), m)
+
+
+def get_p_model(s, ands, ors):
+    pSim = 1.0 - (np.arccos(s)) / np.pi
+    return and_or(pSim, ands, ors)
+
+
+def get_random_rotation(dim):
+    m = np.random.randn(dim, dim)
+    m_s = (m + m.T) / np.sqrt(2)
+    v, mat = np.linalg.eig(m_s)
+    return mat
+
+
+def get_estimates(ors, ands, signal, noise, seed, n_windows):
+    p_est, H, H_p = get_p_estimate(ors, ands, signal, noise, seed)
+
+    sims = []
+    for (s, n) in zip(signal, noise):
+        sim = cosine_similarity(s.reshape(1, -1), n.reshape(1, -1))[0][0]
+        sims.append(sim)
+
+    sim_median = np.median(sims)
+
+    p_mod = get_p_model(sim_median, ands, ors)
+    inter = binom.ppf([0.025, 0.975], n_windows, p_mod) / n_windows
+
+    return sim_median, p_est, p_mod, inter
+
+
+def test_s_curve():
+    """
+    """
+    # Arrange
+    n_windows = 3000
+    n_bins = 101
+
+    ors = 128
+    ands = 31
+
+    seed = 1354656
+
+    # Action
+    dict_values = {}
+
+    for sigma in tqdm([0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75],
+                      ncols=100,
+                      desc="iterating collision probabilities"):
+        signal, noise = get_signal_noise(sigma, n_windows, n_bins)
+        sim_median, p_est, p_mod, inter = get_estimates(ors, ands, signal, noise, seed, n_windows)
+        dict_values[sigma] = (sim_median, p_est, p_mod, inter)
+
+    # Assert
+    for (k, (sim_median, p_est, p_mod, inter)) in dict_values.items():
+        assert inter[0] <= p_est <= inter[1]
+
+
+def main():
+    test_s_curve()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/imspy_connector/src/py_tims_frame.rs b/imspy_connector/src/py_tims_frame.rs
@@ -1,5 +1,6 @@
 use pyo3::prelude::*;
 use pyo3::types::PyList;
+use pyo3::types::PyTuple;
 use numpy::{PyArray1, IntoPyArray};
 use mscore::{TimsFrame, ImsFrame, MsType, TimsFrameVectorized, ImsFrameVectorized, ToResolution, Vectorized, RawTimsFrame, TimsSpectrum};
 
@@ -192,6 +193,17 @@ impl PyTimsFrame {
 
         Ok(PyTimsFrame { inner: TimsFrame::from_windows(spectra) })
     }
+
+    pub fn to_dense_windows(&self, py: Python, window_length: f64, resolution: i32, overlapping: bool, min_peaks: usize, min_intensity: f64) -> PyResult<PyObject> {
+
+        let (data, scans, window_indices, rows, cols) = self.inner.to_dense_windows(window_length, overlapping, min_peaks, min_intensity, resolution);
+        let py_array: &PyArray1<f64> = data.into_pyarray(py);
+        let py_scans: &PyArray1<i32> = scans.into_pyarray(py);
+        let py_window_indices: &PyArray1<i32> = window_indices.into_pyarray(py);
+        let tuple = PyTuple::new(py, &[rows.into_py(py), cols.into_py(py), py_array.to_owned().into_py(py), py_scans.to_owned().into_py(py), py_window_indices.to_owned().into_py(py)]);
+
+        Ok(tuple.into())
+    }
 }
 
 #[pyclass]

diff --git a/imspy_connector/src/py_tims_slice.rs b/imspy_connector/src/py_tims_slice.rs
@@ -96,6 +96,11 @@ impl PyTimsSlice {
 
         Ok(list.into())
     }
+
+    pub fn to_dense_windows(&self, window_length: f64, overlapping: bool, min_peaks: usize, min_intensity: f64, resolution: i32, num_threads: usize) -> Vec<(Vec<f64>, Vec<i32>, Vec<i32>, usize, usize)> {
+        self.inner.to_dense_windows(window_length, overlapping, min_peaks, min_intensity, resolution, num_threads)
+    }
+
     pub fn get_frame_at_index(&self, index: i32) -> PyTimsFrame {
         PyTimsFrame { inner: self.inner.frames[index as usize].clone() }
     }

diff --git a/mscore/src/tims_frame.rs b/mscore/src/tims_frame.rs
@@ -201,12 +201,18 @@ impl TimsFrame {
         // split by scan (ion mobility)
         let spectra = self.to_tims_spectra();
 
-        let scan_indices: Vec<_> = spectra.iter().map(|spectrum| spectrum.scan).collect();
-
         let windows: Vec<_> = spectra.iter().map(|spectrum|
             spectrum.to_windows(window_length, overlapping, min_peaks, min_intensity))
             .collect();
 
+        let mut scan_indices = Vec::new();
+
+        for tree in windows.iter() {
+            for (_, window) in tree {
+                scan_indices.push(window.scan)
+            }
+        }
+
         let mut spectra = Vec::new();
         let mut window_indices = Vec::new();
 
@@ -248,11 +254,32 @@ impl TimsFrame {
         TimsFrame::new(first_window.frame_id, first_window.ms_type.clone(), first_window.retention_time, scan, mobility, tof, mzs, intensity)
     }
 
-    /*
-    pub fn to_dense_windows(&self, window_length: f64, overlapping: bool, min_peaks: usize, min_intensity: f64) {
-        let (scan_indices, window_indices, spectra) = self.to_windows_indexed(window_length, overlapping, min_peaks, min_intensity);
+    pub fn to_dense_windows(&self, window_length: f64, overlapping: bool, min_peaks: usize, min_intensity: f64, resolution: i32) -> (Vec<f64>, Vec<i32>, Vec<i32>, usize, usize) {
+        let factor = (10.0f64).powi(resolution);
+        let num_colums = ((window_length * factor).round() + 1.0) as usize;
+
+        let (scans, window_indices, spectra) = self.to_windows_indexed(window_length, overlapping, min_peaks, min_intensity);
+        let vectorized_spectra = spectra.iter().map(|spectrum| spectrum.vectorized(resolution)).collect::<Vec<_>>();
+
+        let mut flat_matrix: Vec<f64> = vec![0.0; spectra.len() * num_colums];
+
+        for (row_index, (window_index, spectrum)) in itertools::multizip((&window_indices, vectorized_spectra)).enumerate() {
+
+            let vectorized_window_index = match *window_index >= 0 {
+                true => (*window_index as f64 * window_length * factor).round() as i32,
+                false => (((-1.0 * (*window_index as f64)) * window_length - (0.5 * window_length)) * factor).round() as i32,
+            };
+
+            for (i, index) in spectrum.vector.mz_vector.indices.iter().enumerate() {
+                let zero_based_index = (index - vectorized_window_index) as usize;
+                let current_index = row_index * num_colums + zero_based_index;
+                flat_matrix[current_index] = spectrum.vector.mz_vector.values[i];
+            }
+
+        }
+        (flat_matrix, scans, window_indices, spectra.len(), num_colums)
     }
-    */
+
 
     pub fn to_indexed_mz_spectrum(&self) -> IndexedMzSpectrum {
         let mut grouped_data: BTreeMap<i32, Vec<(f64, f64)>> = BTreeMap::new();

diff --git a/mscore/src/tims_slice.rs b/mscore/src/tims_slice.rs
@@ -218,6 +218,17 @@ impl TimsSlice {
         windows
     }
 
+    pub fn to_dense_windows(&self, window_length: f64, overlapping: bool, min_peaks: usize, min_intensity: f64, resolution: i32, num_threads: usize) -> Vec<(Vec<f64>, Vec<i32>, Vec<i32>, usize, usize)> {
+        let pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap();
+
+        let result = pool.install(|| {
+            let t = self.frames.par_iter().map(|f| f.to_dense_windows(window_length, overlapping, min_peaks, min_intensity, resolution)).collect::<Vec<_>>();
+            t
+        });
+
+        result
+    }
+
     pub fn to_tims_planes(&self, tof_max_value: i32, num_chunks: i32, num_threads: usize) -> Vec<TimsPlane> {
 
         let flat_slice = self.flatten();