Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use mzdata::mz_read, adding support for gzipped MGF and mzML, and for Thermo raw and mzMLb #1

Merged
merged 16 commits into from
Oct 24, 2024
Merged
13 changes: 12 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,19 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install stable toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true

- name: Run Clippy
run: cargo clippy --all-targets --all-features
uses: actions-rs/cargo@v1
with:
command: clippy
args: --all-targets --all-features

pytest:
runs-on: ${{ matrix.os }}
Expand Down
9 changes: 7 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
[package]
name = "ms2rescore-rs"
version = "0.3.0"
version = "0.4.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "ms2rescore_rs"
crate-type = ["cdylib"]

[features]
default = ["thermo"]

thermo = ["mzdata/thermo"]

[dependencies]
pyo3 = "0.20.0"
mzdata = "0.20.0"
mzdata = "0.33.0"
timsrust = "0.3.0"
37 changes: 23 additions & 14 deletions src/file_types.rs
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
use mzdata::io::MassSpectrometryFormat;

/// Kind of spectrum file, as classified by `match_file_type`.
///
/// The Python-facing functions in `lib.rs` dispatch on this value: the
/// MGF, mzML, mzMLb and Thermo raw variants are handed to the `parse_mzdata`
/// readers, `BrukerRaw` is handed to the `parse_timsrust` readers, and
/// `Unknown` is reported to the caller as "Unsupported file type".
pub enum SpectrumFileType {
/// Path that mzdata's format inference reports as MGF.
MascotGenericFormat,
/// Path that mzdata's format inference reports as mzML.
MzML,
/// Path that mzdata's format inference reports as mzMLb.
MzMLb,
/// Path with a `d` or `ms2` extension, or a location for which both the
/// `bin` and `parquet` `folder_contains_extension` checks succeed
/// (NOTE(review): presumably a folder holding files with those extensions;
/// confirm against `folder_contains_extension`).
BrukerRaw,
// ThermoRaw,
/// Path that mzdata's format inference reports as Thermo raw.
ThermoRaw,
/// Fallback when none of the checks above recognise the path.
Unknown,
}

pub fn match_file_type(spectrum_path: &str) -> SpectrumFileType {
let extension = spectrum_path.split('.').last().unwrap_or("").to_lowercase();
match extension.as_str() {
"mgf" => SpectrumFileType::MascotGenericFormat,
"mzml" => SpectrumFileType::MzML,
"d" | "ms2" => SpectrumFileType::BrukerRaw,
// "raw" => SpectrumFileType::ThermoRaw,
_ => match (
folder_contains_extension(spectrum_path, "bin"),
folder_contains_extension(spectrum_path, "parquet"),
) {
(true, true) => SpectrumFileType::BrukerRaw,
_ => SpectrumFileType::Unknown,
},
match mzdata::io::infer_from_path(spectrum_path).0 {
MassSpectrometryFormat::MGF => SpectrumFileType::MascotGenericFormat,
MassSpectrometryFormat::MzML => SpectrumFileType::MzML,
MassSpectrometryFormat::MzMLb => SpectrumFileType::MzMLb,
MassSpectrometryFormat::ThermoRaw => SpectrumFileType::ThermoRaw,
MassSpectrometryFormat::Unknown => {
let extension = spectrum_path.split('.').last().unwrap_or("").to_lowercase();
match extension.as_str() {
"d" | "ms2" => SpectrumFileType::BrukerRaw,
_ => match (
folder_contains_extension(spectrum_path, "bin"),
folder_contains_extension(spectrum_path, "parquet"),
) {
(true, true) => SpectrumFileType::BrukerRaw,
_ => SpectrumFileType::Unknown,
},
}
}
_ => SpectrumFileType::Unknown
}
}

Expand Down
29 changes: 18 additions & 11 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,37 @@ mod ms2_spectrum;

use std::collections::HashMap;

use pyo3::exceptions::PyOSError;
use pyo3::exceptions::{PyException, PyValueError};
use pyo3::prelude::*;

use file_types::{match_file_type, SpectrumFileType};
use precursor::Precursor;
use ms2_spectrum::MS2Spectrum;

/// Check if spectrum path matches a supported file type.
#[pyfunction]
pub fn is_supported_file_type(spectrum_path: String) -> bool {
    // A path counts as supported unless classification falls through to the
    // `Unknown` variant.
    let is_unknown = matches!(match_file_type(&spectrum_path), SpectrumFileType::Unknown);
    !is_unknown
}

/// Get mapping of spectrum identifiers to precursor information.
#[pyfunction]
pub fn get_precursor_info(spectrum_path: String) -> PyResult<HashMap<String, Precursor>> {
let file_type = match_file_type(&spectrum_path);

let precursors = match file_type {
SpectrumFileType::MascotGenericFormat | SpectrumFileType::MzML => {
parse_mzdata::parse_precursor_info(&spectrum_path, file_type)
SpectrumFileType::MascotGenericFormat | SpectrumFileType::MzML | SpectrumFileType::MzMLb | SpectrumFileType:: ThermoRaw => {
parse_mzdata::parse_precursor_info(&spectrum_path)
}
SpectrumFileType::BrukerRaw => parse_timsrust::parse_precursor_info(&spectrum_path),
// SpectrumFileType::ThermoRaw => parse_with_mzdata_thermo(&spectrum_path, file_type),
SpectrumFileType::Unknown => return Err(PyOSError::new_err("Unsupported file type")),
SpectrumFileType::Unknown => return Err(PyValueError::new_err("Unsupported file type")),
};

match precursors {
Ok(precursors) => Ok(precursors),
Err(e) => Err(PyOSError::new_err(e.to_string())),
Err(e) => Err(PyException::new_err(e.to_string())),
}
}

Expand All @@ -39,17 +46,16 @@ pub fn get_ms2_spectra(spectrum_path: String) -> PyResult<Vec<ms2_spectrum::MS2S
let file_type = match_file_type(&spectrum_path);

let spectra = match file_type {
SpectrumFileType::MascotGenericFormat | SpectrumFileType::MzML => {
parse_mzdata::read_ms2_spectra(&spectrum_path, file_type)
SpectrumFileType::MascotGenericFormat | SpectrumFileType::MzML | SpectrumFileType::MzMLb | SpectrumFileType:: ThermoRaw => {
parse_mzdata::read_ms2_spectra(&spectrum_path)
}
SpectrumFileType::BrukerRaw => parse_timsrust::read_ms2_spectra(&spectrum_path),
// SpectrumFileType::ThermoRaw => parse_with_mzdata_thermo(&spectrum_path, file_type),
SpectrumFileType::Unknown => return Err(PyOSError::new_err("Unsupported file type")),
SpectrumFileType::Unknown => return Err(PyValueError::new_err("Unsupported file type")),
};

match spectra {
Ok(spectra) => Ok(spectra),
Err(e) => Err(PyOSError::new_err(e.to_string())),
Err(e) => Err(PyException::new_err(e.to_string())),
}
}

Expand All @@ -59,6 +65,7 @@ pub fn get_ms2_spectra(spectrum_path: String) -> PyResult<Vec<ms2_spectrum::MS2S
fn ms2rescore_rs(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<Precursor>()?;
m.add_class::<MS2Spectrum>()?;
m.add_function(wrap_pyfunction!(is_supported_file_type, m)?)?;
m.add_function(wrap_pyfunction!(get_precursor_info, m)?)?;
m.add_function(wrap_pyfunction!(get_ms2_spectra, m)?)?;
Ok(())
Expand Down
63 changes: 9 additions & 54 deletions src/parse_mzdata.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
use std::collections::HashMap;
use std::fs::File;

use mzdata::io::{MGFReader, MzMLReader};
use mzdata::params::ParamValue;
use mzdata::mz_read;

use crate::file_types::SpectrumFileType;
use crate::ms2_spectrum::MS2Spectrum;
use crate::precursor::Precursor;

Expand Down Expand Up @@ -50,71 +48,28 @@ impl From<mzdata::spectrum::MultiLayerSpectrum> for MS2Spectrum {
/// Parse precursor info from spectrum files with mzdata
pub fn parse_precursor_info(
spectrum_path: &str,
file_type: SpectrumFileType,
) -> Result<HashMap<String, Precursor>, std::io::Error> {
let file = File::open(spectrum_path)?;
match file_type {
SpectrumFileType::MascotGenericFormat => Ok(MGFReader::new(file)
mz_read!(spectrum_path.as_ref(), reader => {
reader.filter(|spectrum| spectrum.description.ms_level == 2)
.filter_map(|spectrum| {
spectrum.description.precursor.as_ref()?;
Some((spectrum.description.id.clone(), Precursor::from(&spectrum)))
})
.collect::<HashMap<String, Precursor>>()),

SpectrumFileType::MzML => Ok(MzMLReader::new(file)
.filter_map(|spectrum| {
if spectrum.description.ms_level != 2 {
return None;
}
spectrum.description.precursor.as_ref()?;
Some((spectrum.description.id.clone(), Precursor::from(&spectrum)))
})
.collect::<HashMap<String, Precursor>>()),

_ => Err(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"Unsupported file type for mzdata",
)),
}
.collect::<HashMap<String, Precursor>>()
})
}

/// Read MS2 spectra from spectrum files with mzdata
pub fn read_ms2_spectra(
spectrum_path: &str,
file_type: SpectrumFileType,
) -> Result<Vec<MS2Spectrum>, std::io::Error> {
let file = File::open(spectrum_path)?;
match file_type {
SpectrumFileType::MascotGenericFormat => Ok(MGFReader::new(file)
.map(MS2Spectrum::from)
.collect::<Vec<MS2Spectrum>>()),

SpectrumFileType::MzML => Ok(MzMLReader::new(file)
.filter(|spectrum| spectrum.description.ms_level == 2)
mz_read!(spectrum_path.as_ref(), reader => {
reader.filter(|spectrum| spectrum.description.ms_level == 2)
.map(MS2Spectrum::from)
.collect::<Vec<MS2Spectrum>>()),

_ => Err(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"Unsupported file type for mzdata",
)),
}
.collect::<Vec<MS2Spectrum>>()
})
}

// pub fn parse_precursor_info_thermo(
// spectrum_path: &str,
// file_type: SpectrumFileType,
// ) -> Result<HashMap<String, Precursor>, std::io::Error> {
// let reader = mzdata::io::ThermoRawReader::open_path(spectrum_path)?;
// Ok(reader
// .into_iter()
// .filter(|spectrum| {
// (spectrum.description.ms_level == 2) && (spectrum.description.precursor.is_some())
// })
// .map(|spectrum| (spectrum.description.id, Precursor::from(spectrum)))
// .collect::<HashMap<String, Precursor>>())
// }

fn get_charge_from_spectrum(spectrum: &mzdata::spectrum::MultiLayerSpectrum) -> Option<usize> {
spectrum
.description
Expand Down
2 changes: 1 addition & 1 deletion tests/data/test.mgf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
BEGIN IONS
TITLE=peptide1
CHARGE=2+
PEPMASS=475.137295
CHARGE=2+
ION_MOBILITY=42.42
RTINSECONDS=51.2
72.04439 100
Expand Down
Loading