Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

David@mixture #50

Merged
merged 8 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 35 additions & 27 deletions imspy/imspy/algorithm/hashing.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,10 @@
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings


class TimsHasher:
"""
Class to create hash keys from a given set of weights.

Args:

trials (int): number of trials to use for random projection.
len_trial (int): length of each trial.
seed (int): seed for random projection.
resolution (int): resolution of the random projection.
num_dalton (int): number of dalton to use for random projection.
"""
def __init__(self, trials=32, len_trial=20, seed=5671, resolution=1, num_dalton=10):
class CosimHasher:
def __init__(self, target_vector_length, trials: int = 32, len_trial: int = 20, seed: int = 42):

assert 0 < trials, f'trials variable needs to be greater then 1, was: {trials}'
assert 0 < len_trial, f'length trial variable needs to be greater then 1, was: {trials}'
Expand All @@ -31,42 +19,62 @@ def __init__(self, trials=32, len_trial=20, seed=5671, resolution=1, num_dalton=
f"using int64 which might slow down computation significantly.")
self.V = tf.constant(
np.expand_dims(np.array([np.power(2, i) for i in range(len_trial)]).astype(np.int64), 1))

else:
raise ValueError(f"bit number per hash cannot be greater then 64 or smaller 1, was: {len_trial}.")

self.trails = trials
self.trials = trials
self.len_trial = len_trial
self.seed = seed
self.resolution = resolution
self.num_dalton = num_dalton
self.target_vector_length = target_vector_length

np.random.seed(seed)
res_factor = int(np.power(10, resolution))
size = (len_trial * trials, num_dalton * res_factor + 1)
size = (len_trial * self.trials, target_vector_length)
X = np.random.normal(0, 1, size=size).astype(np.float32)
self.hash_tensor = tf.transpose(tf.constant(X))

def __repr__(self):
return f"TimsHasher(trials={self.trails}, len_trial={self.len_trial}, seed={self.seed}, " \
f"resolution={self.resolution}, num_dalton={self.num_dalton})"
def __repr__(self) -> str:
return f"CosimHasher(trials={self.trials}, len_trial={self.len_trial}, seed={self.seed}, " \
f"target_vector_length={self.target_vector_length})"

# create keys by random projection
def calculate_keys(self, W: tf.Tensor):
def calculate_keys(self, W: tf.Tensor) -> tf.Tensor:

S = (tf.sign(W @ self.hash_tensor) + 1) / 2

if self.len_trial <= 32:
# reshape into window, num_hashes, len_single_hash
S = tf.cast(tf.reshape(S, shape=(S.shape[0], self.trails, self.len_trial)), tf.int32)
S = tf.cast(tf.reshape(S, shape=(S.shape[0], self.trials, self.len_trial)), tf.int32)

# calculate int key from binary by base-transform
H = tf.squeeze(S @ self.V)
return H
else:
# reshape into window, num_hashes, len_single_hash
S = tf.cast(tf.reshape(S, shape=(S.shape[0], self.trails, self.len_trial)), tf.int64)
S = tf.cast(tf.reshape(S, shape=(S.shape[0], self.trials, self.len_trial)), tf.int64)

# calculate int key from binary by base-transform
H = tf.squeeze(S @ self.V)
return H


class TimsHasher(CosimHasher):
"""
Class to create hash keys from a given set of weights.

Args:

trials (int): number of trials to use for random projection.
len_trial (int): length of each trial.
seed (int): seed for random projection.
resolution (int): resolution of the random projection.
num_dalton (int): number of dalton to use for random projection.
"""
def __init__(self, trials: int = 32, len_trial: int = 20, seed: int = 5671, resolution: int = 1, num_dalton: int = 10):
res_factor = 10 ** resolution
target_vector_length = num_dalton * res_factor + 1
super().__init__(target_vector_length, trials, len_trial, seed)
self.resolution = resolution
self.num_dalton = num_dalton

def __repr__(self) -> str:
return f"TimsHasher(trials={self.trials}, len_trial={self.len_trial}, seed={self.seed}, " \
f"resolution={self.resolution}, num_dalton={self.num_dalton})"
9 changes: 9 additions & 0 deletions imspy/imspy/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,15 @@ def from_windows(cls, windows: List[TimsSpectrum]) -> 'TimsFrame':
[spec.get_spec_ptr() for spec in windows]
))

def to_dense_windows(self, window_length: float = 10, resolution: int = 1, overlapping: bool = True,
min_num_peaks: int = 5, min_intensity: float = 0.0) -> NDArray[np.float64]:

rows, cols, values, scans, window_indices = self.__frame_ptr.to_dense_windows(window_length, resolution,
overlapping, min_num_peaks,
min_intensity)

return scans, window_indices, np.reshape(values, (rows, cols))

def get_fragment_ptr(self):
return self.__frame_ptr

Expand Down
19 changes: 18 additions & 1 deletion imspy/imspy/core/slice.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np
import pandas as pd
from typing import List
from typing import List, Tuple, Any

from numpy.typing import NDArray
from tensorflow import sparse as sp
Expand Down Expand Up @@ -147,6 +147,23 @@ def to_windows(self, window_length: float = 10, overlapping: bool = True, min_nu
return [TimsSpectrum.from_py_tims_spectrum(spec) for spec in self.__slice_ptr.to_windows(
window_length, overlapping, min_num_peaks, min_intensity, num_threads)]

def to_dense_windows(self, window_length: float = 10, resolution: int = 1, overlapping: bool = True,
min_num_peaks: int = 5, min_intensity: float = 0.0, num_theads: int = 4) -> (
tuple)[list[NDArray], list[NDArray], list[NDArray]]:

DW = self.__slice_ptr.to_dense_windows(window_length, overlapping, min_num_peaks, min_intensity, resolution,
num_theads)

scan_list, window_indices_list, values_list = [], [], []

for values, scans, bins, row, col in DW:
W = np.reshape(values, (row, col))
scan_list.append(scans)
window_indices_list.append(bins)
values_list.append(W)

return scan_list, window_indices_list, values_list

@property
def df(self) -> pd.DataFrame:
"""Get the data as a pandas DataFrame.
Expand Down
1 change: 1 addition & 0 deletions imspy/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ numpy = ">=1.21"
tensorflow = ">=2.14"
tensorflow-probability = ">=0.22.1"
imspy-connector = ">=0.2.5"
tqdm = ">=4.62.0"

[build-system]
requires = ["poetry-core"]
Expand Down
106 changes: 106 additions & 0 deletions imspy/tests/test_s_curve.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import binom
from tqdm import tqdm

from imspy.algorithm.hashing import TimsHasher

np.random.seed(0)


def get_signal_noise(sigma, n_windows, n_bins):
# generate F and F_prime
F = np.random.randn(n_windows, n_bins)

noise = np.random.randn(1, n_bins) * sigma
F_prime = F + noise

signal = np.zeros((n_windows, n_bins))
noise = np.zeros_like(signal)

for i in np.arange(n_windows):
mat_i = get_random_rotation(n_bins)

new_sig = mat_i @ F[0, :]
signal[i, :] = new_sig

new_noise = mat_i @ F_prime[0, :]
noise[i, :] = new_noise

return signal, noise


def get_p_estimate(ors, ands, F, F_prime, seed):
hasher = TimsHasher(trials=ors, len_trial=ands, seed=seed, num_dalton=10, resolution=1)
H = hasher.calculate_keys(F)
H_p = hasher.calculate_keys(F_prime)
return H[np.any((H == H_p).numpy(), axis=1)].shape[0] / H.shape[0], H, H_p


def and_or(p, n, m):
# n times AND
# m times OR
return 1.0 - np.power((1.0 - np.power(p, n)), m)


def get_p_model(s, ands, ors):
pSim = 1.0 - (np.arccos(s)) / np.pi
return and_or(pSim, ands, ors)


def get_random_rotation(dim):
m = np.random.randn(dim, dim)
m_s = (m + m.T) / np.sqrt(2)
v, mat = np.linalg.eig(m_s)
return mat


def get_estimates(ors, ands, signal, noise, seed, n_windows):
p_est, H, H_p = get_p_estimate(ors, ands, signal, noise, seed)

sims = []
for (s, n) in zip(signal, noise):
sim = cosine_similarity(s.reshape(1, -1), n.reshape(1, -1))[0][0]
sims.append(sim)

sim_median = np.median(sims)

p_mod = get_p_model(sim_median, ands, ors)
inter = binom.ppf([0.025, 0.975], n_windows, p_mod) / n_windows

return sim_median, p_est, p_mod, inter


def test_s_curve():
"""
"""
# Arrange
n_windows = 3000
n_bins = 101

ors = 128
ands = 31

seed = 1354656

# Action
dict_values = {}

for sigma in tqdm([0.2, 0.25, 0.3, 0.35, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75],
ncols=100,
desc="iterating collision probabilities"):
signal, noise = get_signal_noise(sigma, n_windows, n_bins)
sim_median, p_est, p_mod, inter = get_estimates(ors, ands, signal, noise, seed, n_windows)
dict_values[sigma] = (sim_median, p_est, p_mod, inter)

# Assert
for (k, (sim_median, p_est, p_mod, inter)) in dict_values.items():
assert inter[0] <= p_est <= inter[1]


def main():
test_s_curve()


if __name__ == '__main__':
main()
12 changes: 12 additions & 0 deletions imspy_connector/src/py_tims_frame.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use pyo3::prelude::*;
use pyo3::types::PyList;
use pyo3::types::PyTuple;
use numpy::{PyArray1, IntoPyArray};
use mscore::{TimsFrame, ImsFrame, MsType, TimsFrameVectorized, ImsFrameVectorized, ToResolution, Vectorized, RawTimsFrame, TimsSpectrum};

Expand Down Expand Up @@ -192,6 +193,17 @@ impl PyTimsFrame {

Ok(PyTimsFrame { inner: TimsFrame::from_windows(spectra) })
}

pub fn to_dense_windows(&self, py: Python, window_length: f64, resolution: i32, overlapping: bool, min_peaks: usize, min_intensity: f64) -> PyResult<PyObject> {

let (data, scans, window_indices, rows, cols) = self.inner.to_dense_windows(window_length, overlapping, min_peaks, min_intensity, resolution);
let py_array: &PyArray1<f64> = data.into_pyarray(py);
let py_scans: &PyArray1<i32> = scans.into_pyarray(py);
let py_window_indices: &PyArray1<i32> = window_indices.into_pyarray(py);
let tuple = PyTuple::new(py, &[rows.into_py(py), cols.into_py(py), py_array.to_owned().into_py(py), py_scans.to_owned().into_py(py), py_window_indices.to_owned().into_py(py)]);

Ok(tuple.into())
}
}

#[pyclass]
Expand Down
5 changes: 5 additions & 0 deletions imspy_connector/src/py_tims_slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ impl PyTimsSlice {

Ok(list.into())
}

pub fn to_dense_windows(&self, window_length: f64, overlapping: bool, min_peaks: usize, min_intensity: f64, resolution: i32, num_threads: usize) -> Vec<(Vec<f64>, Vec<i32>, Vec<i32>, usize, usize)> {
self.inner.to_dense_windows(window_length, overlapping, min_peaks, min_intensity, resolution, num_threads)
}

pub fn get_frame_at_index(&self, index: i32) -> PyTimsFrame {
PyTimsFrame { inner: self.inner.frames[index as usize].clone() }
}
Expand Down
39 changes: 33 additions & 6 deletions mscore/src/tims_frame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,12 +201,18 @@ impl TimsFrame {
// split by scan (ion mobility)
let spectra = self.to_tims_spectra();

let scan_indices: Vec<_> = spectra.iter().map(|spectrum| spectrum.scan).collect();

let windows: Vec<_> = spectra.iter().map(|spectrum|
spectrum.to_windows(window_length, overlapping, min_peaks, min_intensity))
.collect();

let mut scan_indices = Vec::new();

for tree in windows.iter() {
for (_, window) in tree {
scan_indices.push(window.scan)
}
}

let mut spectra = Vec::new();
let mut window_indices = Vec::new();

Expand Down Expand Up @@ -248,11 +254,32 @@ impl TimsFrame {
TimsFrame::new(first_window.frame_id, first_window.ms_type.clone(), first_window.retention_time, scan, mobility, tof, mzs, intensity)
}

/*
pub fn to_dense_windows(&self, window_length: f64, overlapping: bool, min_peaks: usize, min_intensity: f64) {
let (scan_indices, window_indices, spectra) = self.to_windows_indexed(window_length, overlapping, min_peaks, min_intensity);
pub fn to_dense_windows(&self, window_length: f64, overlapping: bool, min_peaks: usize, min_intensity: f64, resolution: i32) -> (Vec<f64>, Vec<i32>, Vec<i32>, usize, usize) {
let factor = (10.0f64).powi(resolution);
let num_colums = ((window_length * factor).round() + 1.0) as usize;

let (scans, window_indices, spectra) = self.to_windows_indexed(window_length, overlapping, min_peaks, min_intensity);
let vectorized_spectra = spectra.iter().map(|spectrum| spectrum.vectorized(resolution)).collect::<Vec<_>>();

let mut flat_matrix: Vec<f64> = vec![0.0; spectra.len() * num_colums];

for (row_index, (window_index, spectrum)) in itertools::multizip((&window_indices, vectorized_spectra)).enumerate() {

let vectorized_window_index = match *window_index >= 0 {
true => (*window_index as f64 * window_length * factor).round() as i32,
false => (((-1.0 * (*window_index as f64)) * window_length - (0.5 * window_length)) * factor).round() as i32,
};

for (i, index) in spectrum.vector.mz_vector.indices.iter().enumerate() {
let zero_based_index = (index - vectorized_window_index) as usize;
let current_index = row_index * num_colums + zero_based_index;
flat_matrix[current_index] = spectrum.vector.mz_vector.values[i];
}

}
(flat_matrix, scans, window_indices, spectra.len(), num_colums)
}
*/


pub fn to_indexed_mz_spectrum(&self) -> IndexedMzSpectrum {
let mut grouped_data: BTreeMap<i32, Vec<(f64, f64)>> = BTreeMap::new();
Expand Down
11 changes: 11 additions & 0 deletions mscore/src/tims_slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,17 @@ impl TimsSlice {
windows
}

pub fn to_dense_windows(&self, window_length: f64, overlapping: bool, min_peaks: usize, min_intensity: f64, resolution: i32, num_threads: usize) -> Vec<(Vec<f64>, Vec<i32>, Vec<i32>, usize, usize)> {
let pool = ThreadPoolBuilder::new().num_threads(num_threads).build().unwrap();

let result = pool.install(|| {
let t = self.frames.par_iter().map(|f| f.to_dense_windows(window_length, overlapping, min_peaks, min_intensity, resolution)).collect::<Vec<_>>();
t
});

result
}

pub fn to_tims_planes(&self, tof_max_value: i32, num_chunks: i32, num_threads: usize) -> Vec<TimsPlane> {

let flat_slice = self.flatten();
Expand Down