rformassspectrometry · jorainer · Sep 19, 2024 · Sep 17, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -168,6 +168,7 @@ jobs:
           BiocManager::install('rhdf5', dependencies = TRUE, ask = FALSE, update = FALSE, INSTALL_opts = '--force-biarch')
           BiocManager::install("mzR", dependencies = TRUE, ask = FALSE, update = FALSE)
           BiocManager::install("msdata")
+          BiocManager::install("RforMassSpectrometry/MsBackendMetaboLights")
 
           message(paste('****', Sys.time(), 'force installation of selected packages ****'))
           BiocManager::install(c("BiocStyle", "rmarkdown", "magick", "Spectra"))

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -44,7 +44,8 @@ Suggests:
     testthat,
     xcms,
     alabaster.se,
-    alabaster.matrix
+    alabaster.matrix, 
+    MsBackendMetaboLights
 License: Artistic-2.0
 Encoding: UTF-8
 VignetteBuilder: knitr

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,8 @@
-# Version 0.0
+# Version 0.0.5
+
+## Changes in 0.0.5
+
+- Add *MetaboLights* `readMsObject()` method for `MsExpriment()` objects. 
 
 ## Changes in 0.0.4
 

diff --git a/R/MetaboLightsParam.R b/R/MetaboLightsParam.R
@@ -11,64 +11,114 @@
 #' The `MetaboLightsParam` class and the associated `readMsObject()` method
 #' allow users to load an `MsExperiment` object from a study in the
 #' MetaboLights database (https://www.ebi.ac.uk/metabolights/index) by
-#' providing its unique study `studyId`. This function is particularly useful
+#' providing its unique study `mtblsId`. This function is particularly useful
 #' for importing metabolomics data into an `MsExperiment` object for further
-#' analysis within the R environment. It's important to note that this method
-#' can *only* be used for import into an R environement using `readMsObject()`.
+#' analysis in the R environment. It is important to note that this method
+#' can *only* be used for importing into an R environment via `readMsObject()`.
 #' It cannot be used with the `saveMsObject()` method.
 #'
 #' If the study contains multiple assays, the user will be prompted to select
 #' which assay to load. The resulting `MsExperiment` object will include a
 #' `sampleData` slot populated with data extracted from the selected assay.
-#' Columns in the `sampleData` that contain only `NA` values are automatically
-#' removed, and an additional column is added to track the injection index.
 #'
-#' @param studyId `character(1)` The MetaboLights study studyId, which should
+#' Users can define how to filter this `sampleData` table by specifying a few
+#' parameters. The `keepOntology` parameter is set to `TRUE` by default, meaning
+#' that all columns related to ontology are retained. If set to `FALSE`, they are
+#' removed. If ontology columns are kept, some column names may be duplicated and
+#' therefore numbered. The order of these columns is important, as it reflects the
+#' assay and sample information available in MetaboLights.
+#'
+#' The `keepProtocol` parameter is also set to `TRUE` by default, meaning that
+#' all columns related to protocols are kept. If set to `FALSE`, they are removed.
+#' The `simplify` parameter, defaulted to `TRUE`, means that columns filled with
+#' NAs or duplicated content will be removed. In the case of duplicated content,
+#' only the first occurring column will be retained.
+#'
+#' Further filtering can be performed using the `filePattern` slot of the
+#' `MetaboLightsParam` object. The default for this slot is
+#' `"mzML$|CDF$|cdf$|mzXML$"`, which corresponds to the supported raw data file
+#' types.
+#'
+#' @param mtblsId `character(1)` The MetaboLights study ID, which should
 #' start with "MTBL". This identifier uniquely specifies the study within the
 #' MetaboLights database.
 #'
+#' @param assayName `character(1)` The name of the assay to load. If the study
+#' contains multiple assays and this parameter is not specified, the user will
+#' be prompted to select which assay to load.
+#'
+#' @param filePattern `character(1)` A regular expression pattern to filter the
+#' raw data files associated with the selected assay. The default value is
+#' `"mzML$|CDF$|cdf$|mzXML$"`, corresponding to the supported raw data file
+#' types.
+#'
+#' @param keepOntology `logical(1)` Whether to keep columns related to ontology
+#' in the `sampleData` slot. Default is `TRUE`.
+#'
+#' @param keepProtocol `logical(1)` Whether to keep columns related to protocols
+#' information in the `sampleData` slot. Default is `TRUE`.
+#'
+#' @param simplify `logical(1)` Whether to simplify the `sampleData` table by
+#' removing columns filled with NAs or duplicated content. Default is `TRUE`.
+#'
 #' @inheritParams saveMsObject
 #'
-#' @returns (for now ?) A `MsExperiment` object with only the sampleData slots
-#' filled (will be updated when MetaboLightsBackend available ?).
+#' @returns An `MsExperiment` object with the `sampleData` slot populated using
+#' MetaboLights sample and assay information. The spectra data is represented
+#' as a `MsBackendMetabolights` object, generated from the raw data files
+#' associated with the selected assay of the specified MetaboLights ID
+#' (`mtblsId`).
 #'
-#' @author Philippine Louail
+#' @author Philippine Louail,Johannes Rainer
 #'
 #' @importFrom methods new
 #'
 #' @importClassesFrom ProtGenerics Param
 #'
 #' @examples
 #' library(MsExperiment)
-#' # Load a study with the studyId "MTBLS10035"
-#' param <- MetaboLightsParam(studyId = "MTBLS10035")
-#' ms_experiment <- readMsObject(MsExperiment(), param)
+#' # Load a study with the mtblsId "MTBLS39" and selecting specific file pattern
+#' # as well as removing ontology and protocol information in the metadata.
+#' param <- MetaboLightsParam(mtblsId = "MTBLS39", filePattern = "63A.cdf")
+#' ms_experiment <- readMsObject(MsExperiment(), param , keepOntology = FALSE,
+#'                               keepProtocol = FALSE)
 #'
 #' @seealso
 #' - `MsExperiment` object, defined in the
 #'   ([MsExperiment](https://bioconductor.org/packages/MsExperiment)) package.
+#'
+#' - `MsBackendMetaboLights` object, defined in the
+#'  ([MsBackendMetaboLights](https://github.com/rformassspectrometry/MsBackendMetaboLights))
+#'  repository.
+#'
 #' - [MetaboLights](https://www.ebi.ac.uk/metabolights/index) for accessing
 #'   the MetaboLights database.
 #'
 NULL
 
 #' @noRd
 setClass("MetaboLightsParam",
-         slots = c(studyId = "character"),
+         slots = c(mtblsId = "character",
+                   assayName = "character",
+                   filePattern = "character"),
          contains = "Param",
          prototype = list(
-             studyId = character(1)
+             mtblsId = character(1),
+             assayName = character(1),
+             filePattern = character(1)
          ),
          validity = function(object) {
              msg <- NULL
-             if (!grepl("^MTBLS", object@studyId))
-                 msg <- c("'studyId' must start with 'MTBLS'")
+             if (!grepl("^MTBLS", object@mtblsId))
+                 msg <- c("'mtblsId' must start with 'MTBLS'")
              msg
          })
 
 #' @rdname MetaboLightsParam
 #'
 #' @export
-MetaboLightsParam <- function(studyId = character(1)) {
-    new("MetaboLightsParam", studyId = studyId)
+MetaboLightsParam <- function(mtblsId = character(), assayName = character(),
+                              filePattern = "mzML$|CDF$|cdf$|mzXML$"){
+    new("MetaboLightsParam", mtblsId = mtblsId, assayName = assayName,
+        filePattern = filePattern)
 }
diff --git a/R/MsExperiment.R b/R/MsExperiment.R
@@ -214,48 +214,99 @@ setMethod("readMsObject", signature(object = "MsExperiment",
 setMethod("readMsObject",
           signature(object = "MsExperiment",
                     param = "MetaboLightsParam"),
-          function(object, param, ...) {
-              url <- file.path(
-                  "https://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/",
-                  param@studyId, "/")
-              ## Retrieve the HTML content using curl system command
-              html_content <- system(paste("curl -s", url), intern = TRUE) ##idk if that works for all OS
-              if (all(grepl("The requested URL was not found on this server",
-                       html_content)))
-                  stop("Study not found. Please check the study ID and try again.")
-              files <- regmatches(html_content,
-                                  gregexpr("href=\"([^\"]+\\.txt|[^\"]+\\.tsv)\"",
-                                           html_content))
-              files <- unlist(files)
-              files <- gsub("href=\"|\"", "", files)
+          function(object, param, keepOntology = TRUE, keepProtocol = TRUE,
+                   simplify = TRUE, ...) {
+              if (!requireNamespace("MsBackendMetaboLights", quietly = TRUE)) {
+                  stop("Required package 'MsBackendMetaboLights' is missing. ",
+                       "Please install it and try again.", call. = FALSE)
+              }
+              pth <- MsBackendMetaboLights::mtbls_ftp_path(param@mtblsId)
+              all_fls <- MsBackendMetaboLights::mtbls_list_files(param@mtblsId)
 
-              ## check assay files
-              assays <- grep("a_", files, value = TRUE)
-              if (length(assays) > 1) {
+              ## Extract and read assay files
+              assays <- all_fls[grepl("^a_", all_fls)]
+              if (length(param@assayName) > 0) selected_assay <- param@assayName
+              else if (length(assays) > 1 && length(param@assayName) == 0) {
                   cat("Multiple assay files found:\n")
                   selection <- menu(assays,
                                     title = paste("Please choose the assay",
                                     "file you want to use:"))
                   selected_assay <- assays[selection]
               } else if (length(assays) == 1) {
-                  selected_assay <- assays[1]
+                  selected_assay <- assays
                   cat("Only one assay file found:", selected_assay, "\n")
-              } else stop("No assay files found.") ## i don't think that would happen...
-
-              assay_data <- read.table(file.path(url, selected_assay),
-                                       header = TRUE, sep = "\t")
-              assay_data$injection_index <- seq_len(nrow(assay_data))
+              }
+              assay_data <- read.table(paste0(pth, selected_assay),
+                                       header = TRUE, sep = "\t",
+                                       check.names = FALSE)
 
               ## Extract and read sample info files
-              sample_info_files <- grep("s_", files, value = TRUE)
-              sample_info <- read.table(file.path(url, sample_info_files),
-                                        header = TRUE, sep = "\t")
-              merged_data <- merge(assay_data, sample_info, by = "Sample.Name")
-              merged_data <- merged_data[order(merged_data$injection_index), ]
-              merged_data <- merged_data[, colSums(is.na(merged_data)) <
-                                             nrow(merged_data)]
+              s_files <- all_fls[grepl("^s_", all_fls)]
+              sample_info <- read.table(paste0(pth, s_files),
+                                        header = TRUE, sep = "\t",
+                                        check.names = FALSE)
+
+              # merging
+              ord <- match(assay_data$`Sample Name`, sample_info$`Sample Name`)
+              merged_data <- cbind(assay_data, sample_info[ord, ])
+              names(merged_data) <- gsub(" ", "_", names(merged_data))
+              if (keepProtocol || keepOntology || simplify)
+                  merged_data <- .clean_merged(x = merged_data,
+                                               keepProtocol = keepProtocol,
+                                               keepOntology = keepOntology,
+                                               simplify = simplify)
+
+              ## Assemble object
+              object@spectra <- Spectra::Spectra(mtblsId = param@mtblsId,
+                                                 source = MsBackendMetaboLights::MsBackendMetaboLights(),
+                                                 assayName = selected_assay,
+                                                 filePattern = param@filePattern)
+
+              ## sample to spectra link
+              fl <- object@spectra@backend@spectraData[1, "derived_spectral_data_file"]
+              nme <- colnames(merged_data)[which(merged_data[1, ] == fl)]
+              merged_data <- merged_data[grepl(param@filePattern,
+                                               merged_data[, nme]), ]
 
               object@sampleData <- DataFrame(merged_data)
+              object <- MsExperiment::linkSampleData(object,
+                                                     with = paste0("sampleData.",
+                                                                nme,
+                                                                "= spectra.derived_spectral_data_file"))
               validObject(object)
               object
           })
+
+
+#####HELPERS
+
+#' function that takes the extra parameters and clean the metadata if asked by
+#' the user.
+#'
+#' Note:  the subsetting of the merged data is done here, which WILL rename the
+#' duplicated columns. I could fix that by first transforming the data.frame into
+#' a list but I am not sure that it is useful.. The user might do some
+#' subsetting too later and then the same thing will happen. Might as well have
+#' it from the beginning.
+#'
+#' Note2: I would move that function later, keeping it her for review to help
+#' clarity
+#'
+#' @noRd
+.clean_merged <- function(x, keepProtocol, keepOntology, simplify) {
+    # remove ontology
+    if (!keepOntology)
+        x <- x[, -which(grepl("Term", names(x)))]
+
+    # remove protocol
+    if (!keepProtocol)
+        x <- x[, -which(grepl("Protocol|Parameter", names(x)))]
+
+    # remove duplicated columns contents and NAs
+    if (simplify) {
+        x <- x[, !duplicated(as.list(x))]
+        x <- x[, colSums(is.na(x)) != nrow(x)]
+    }
+    return(x)
+}
+