Merge external counts (#23)

* add merge of external counts functionality * add the citation to nature communications * fix rd man page linking * fix minor warnings in BiocCheck/RCheck * fix/catch biomaRt specific issues
gagneurlab · Jan 27, 2021 · 772196c · 772196c
2 parents ee7f1f7 + 2f565ef
commit 772196c
Show file tree

Hide file tree

Showing 18 changed files with 277 additions and 51 deletions.
diff --git a/setupEnv.R → .github/helperScripts/setupEnv.R b/setupEnv.R → .github/helperScripts/setupEnv.R
@@ -53,19 +53,16 @@ for(p in c("getopt", "XML", "xml2", "testthat", "devtools", "covr",
 R.utils::withTimeout(timeout=2400, {
     try({
         print_log("Update packages")
-	BTYPE <- ifelse(.Platform$OS.type == 'unix', "source", "win.binary")
+        BTYPE <- ifelse(.Platform$OS.type == 'unix', "source", "win.binary")
         INSTALL(ask=FALSE, type=BTYPE, Ncpus=NCPUS)
 
-        print_log("Install dev package")
-        try({ devtools::install(".", dependencies=TRUE, type=BTYPE) })
-
         if(R.version[['major']] == "3"){
             print_log("Install updated source package")
-            devtools::install_github("gagneurlab/OUTRIDER", dependencies=FALSE)
+            devtools::install_github("gagneurlab/OUTRIDER", dependencies=TRUE)
         }
-
-        print_log("Install package")
-        devtools::install(".", dependencies=FALSE, type=BTYPE)
+        
+        print_log("Install dev package")
+        devtools::install(".", dependencies=TRUE, type=BTYPE)
     })
 })
 
@@ -74,6 +71,13 @@ R.utils::withTimeout(timeout=2400, {
 if(R.version[['major']] == "3"){
     BiocManager::install(ask=FALSE, update=FALSE, c(
             "Bioconductor/BiocFileCache", "grimbough/biomaRt", "yihui/knitr@v1.29"))
+} else {
+    inst_biomaRt_version <- as.character(utils::packageVersion("biomaRt"))
+    if(utils::compareVersion("2.46.2", inst_biomaRt_version) == 1){
+        BiocManager::install("grimbough/biomaRt", ref="3_12_testing")
+    }
 }
 
+# to get FRASER session info
+try({ library(FRASER) })
 print(BiocManager::valid())
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
@@ -75,14 +75,9 @@ jobs:
 
       - name: Install dependencies
         run: |
-          source("setupEnv.R")
+          source(".github/helperScripts/setupEnv.R")
           remotes::install_cran("rcmdcheck")
         shell: Rscript {0}
-
-        #run: |
-        #  remotes::install_deps(dependencies = TRUE)
-        #  remotes::install_cran("rcmdcheck")
-        #shell: Rscript {0}
 
       - name: Check build
         run: |

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -97,6 +97,7 @@ Collate:
     find_encoding_dimensions.R
     getURLs.R
     helper-functions.R
+    mergeExternalData.R
     saveHDF5Objects.R
     RcppExports.R
     autoencoder.R

diff --git a/NAMESPACE b/NAMESPACE
@@ -52,6 +52,7 @@ export(loadFraserDataSet)
 export(makeSimulatedFraserDataSet)
 export(mapSeqlevels)
 export(mergeCounts)
+export(mergeExternalData)
 export(name)
 export(nonSplicedReads)
 export(optimHyperParams)
@@ -215,6 +216,7 @@ importFrom(SummarizedExperiment,"assay<-")
 importFrom(SummarizedExperiment,"assays<-")
 importFrom(SummarizedExperiment,"colData<-")
 importFrom(SummarizedExperiment,"rowRanges<-")
+importFrom(SummarizedExperiment,Assays)
 importFrom(SummarizedExperiment,SummarizedExperiment)
 importFrom(SummarizedExperiment,assay)
 importFrom(SummarizedExperiment,assayNames)

diff --git a/R/FRASER-package.R b/R/FRASER-package.R
@@ -27,7 +27,7 @@
 #'          readGAlignmentPairs
 #' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames
 #'          colData colData<- rowData rowRanges rowRanges<- SummarizedExperiment
-#'          rbind
+#'          rbind Assays
 #' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList
 #'          makeGRangesFromDataFrame invertStrand
 #' @importFrom IRanges subsetByOverlaps from to IRanges ranges

diff --git a/R/annotationOfRanges.R b/R/annotationOfRanges.R
@@ -18,8 +18,9 @@
 #' @param orgDb An \code{orgDb} object or a data table to map the feature names.
 #'             If this is NULL, then \code{org.Hs.eg.db} is used as the default.
 #' @param keytype The keytype or column name of gene IDs in the \code{TxDb}
-#'             object (see \code{\link[AnnotationDbi]{keytypes}} for a list
-#'             of available ID types).
+#'             object (see 
+#'             \code{\link[AnnotationDbi:AnnotationDb-class]{keytypes}}
+#'             for a list of available ID types).
 #' 
 #' @return FraserDataSet
 #' 
@@ -28,18 +29,24 @@
 #' fds <- createTestFraserDataSet()
 #' 
 #' ### Two ways to annotage ranges with gene names: 
-#' # either using biomart:
-#' fds <- annotateRanges(fds, GRCh=38)
-#' fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37)
-#' rowRanges(fds, type="psi5")[,c("hgnc_symbol", "hgnc_symbol_37")]
+#' # either using biomart with GRCh38
+#' try({
+#'   fds <- annotateRanges(fds, GRCh=38)
+#'   rowRanges(fds, type="psi5")[,c("hgnc_symbol")]
+#' })
+#' 
+#' # either using biomart with GRCh37
+#' try({
+#'   fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37)
+#'   rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")]
+#' })
 #'  
-#' # or with a TxDb object
+#' # or with a provided TxDb object
 #' require(TxDb.Hsapiens.UCSC.hg19.knownGene)
 #' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
 #' require(org.Hs.eg.db)
 #' orgDb <- org.Hs.eg.db
 #' fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
-#'
 #' rowRanges(fds, type="psi5")[,"hgnc_symbol"]
 #' 
 #' @rdname annotateRanges
@@ -63,13 +70,11 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
         tryCatch({
             ensemblOutput <- capture.output(ensembl <- useEnsembl(
                     biomart="ensembl", dataset="hsapiens_gene_ensembl", 
-                    GRCh=GRCh
-            ))
+                    GRCh=GRCh))
         },
         error=function(e){
                 message("\nCheck if we have a internet connection!",
-                        " Could not connect to ENSEMBL."
-                )
+                        " Could not connect to ENSEMBL. With error: `", e, "`.")
         })
         if(is.null(ensembl)){
             message("Nothing was annotated!")

diff --git a/R/countRNAseqData.R b/R/countRNAseqData.R
@@ -45,8 +45,8 @@
 #'     used for all samples. If \code{genome} is supplied and 
 #'     \code{strandSpecific(fds) == 0L} (unstranded), then the strand
 #'     information will be estimated by checking the dinucleotides found at the
-#'     intron boundaries 
-#'     (see \code{\link[GenomicAlignments]{summarizeJunctions}}
+#'     intron boundaries (see 
+#'     \code{\link[GenomicAlignments:junctions-methods]{summarizeJunctions}}
 #'     in GenomicAlignments package for details). This can e.g. help to avoid 
 #'     ambiguities when adding gene names from a gene annotation to the introns 
 #'     in a later step. 

diff --git a/R/mergeExternalData.R b/R/mergeExternalData.R
@@ -0,0 +1,163 @@
+#' 
+#' Merge external data
+#' 
+#' To boost its own sequencing data, one can download existing and precounted 
+#' data. This function merges the existing \code{FraserDataSet} with
+#' external count data.
+#' 
+#' For more details on existing datasets have a look at:
+#' <https://github.com/gagneurlab/drop#datasets>
+#' 
+#' Since FRASER can not hand NA values, the merge will return only the 
+#' intersecting regions and will drop any non overlapping features. This has to
+#' be kept in mind when analysing rare disease samples.
+#' 
+#' @param fds A \code{FraserDataSet}
+#' @param countFiles A character vector of file names pointing to the external 
+#'           count data. The vector has to be names or the files have to start 
+#'           with \code{k_j}, \code{k_theta}, \code{n_psi3}, \code{n_psi5}, 
+#'           \code{n_theta}.
+#' @param sampleIDs The samples to be merged from the external data. 
+#' @param annotation A sample annotation of the external data (optional).
+#' 
+#' @return Merged \code{FraserDataSet} object.
+#' 
+#' @examples
+#' anno <- fread(system.file("extdata", "externalCounts", 
+#'         "annotation.tsv.gz", package="FRASER"))
+#' ctsFiles <- list.files(full.names = TRUE, pattern="counts",
+#'         system.file("extdata", "externalCounts", package="FRASER"))
+#' 
+#' fds <- createTestFraserDataSet()
+#' fds_merged <- mergeExternalData(fds, ctsFiles, anno[,sampleID], anno)
+#' 
+#' K(fds, "psi5")
+#' K(fds_merged, "psi5")
+#' 
+#' @export
+mergeExternalData <- function(fds, countFiles, sampleIDs, annotation=NULL){
+
+    # check input
+    checkFraserDataSet(fds)
+    checkCountData(fds)
+
+    if(length(countFiles) != 5){
+        stop("You have to provide 5 files, but only ", length(countFiles), 
+                " were provided.")
+    }
+    if(is.null(names(countFiles))){
+        names(countFiles) <- gsub("(_counts)?.tsv.gz", "", basename(countFiles))
+    }
+    reqNames <- c("k_j", "k_theta", "n_psi3", "n_psi5", "n_theta")
+    if(any(!reqNames %in% unique(names(countFiles)))){
+        stop("Please name the input or the files correctly. We are missing: ", 
+                paste(collapse=", ",
+                        reqNames[!reqNames %in% names(countFiles)]))
+    }
+    if(any(!file.exists(countFiles))){
+        stop("Provided files are missing! We are missing: ",
+                paste(collapse=", ", countFiles[!file.exists(countFiles)]))
+    }
+    if(any(unique(sampleIDs) != sampleIDs)){
+        stop("Provided sampleIDs have to be unique!")
+    }
+    sampleIDs <- as.character(sampleIDs)
+
+    # load external counts
+    message("Loading provided counts")
+    names(reqNames) <- reqNames
+    extCts <- lapply(reqNames, function(id){
+        gr <- makeGRangesFromDataFrame(fread(countFiles[id]), 
+                keep.extra.columns=TRUE)
+        if(any(!sampleIDs %in% colnames(mcols(gr)))){
+            stop("Can not find provided sampleID in count data. Missing IDs: ",
+                    paste(collapse=", ", 
+                            sampleIDs[!sampleIDs %in% colnames(mcols(gr))]))
+        }
+        gr[,sampleIDs]
+    })
+    stopifnot(all(granges(extCts[['k_j']]) == granges(extCts[['n_psi3']])))
+    stopifnot(all(granges(extCts[['k_j']]) == granges(extCts[['n_psi5']])))
+    stopifnot(all(granges(extCts[['k_theta']]) == granges(extCts[['n_theta']])))
+
+    # 
+    # merging colData
+    # 
+    message(date(), ": Merging data ...")
+    if(!is.null(annotation)){
+        annotation <- DataFrame(annotation)
+    } else {
+        annotation <- DataFrame(sampleID=as.character(sampleIDs))
+    }
+    rownames(annotation) <- annotation[,"sampleID"]
+    newColData <- DataFrame(rbind(fill=TRUE,
+            as.data.table(colData(fds)), 
+            as.data.table(annotation[sampleIDs,])))
+    rownames(newColData) <- newColData[,"sampleID"]
+
+    # 
+    # merge psi5/psi3 data
+    # 
+    extractExtData <- function(fds, countFun, type, ov, extData, extName){
+        ans <- as.matrix(cbind(countFun(fds, type=type)[from(ov),],
+                mcols(extData[[extName]])[to(ov),]))
+        mode(ans) <- "integer"
+        ans
+    }
+
+    # find overlap
+    if(all(strand(rowRanges(fds, type="j")) == "*")){
+        for(id in reqNames){
+            strand(extCts[[id]]) <- "*"
+        }
+    }
+    ov <- findOverlaps(rowRanges(fds, type="j"), extCts[['k_j']], type="equal")
+
+    newCtsK_J    <- extractExtData(fds, K, "j",    ov, extCts, "k_j")
+    newCtsN_psi5 <- extractExtData(fds, N, "psi5", ov, extCts, "n_psi5")
+    newCtsN_psi3 <- extractExtData(fds, N, "psi3", ov, extCts, "n_psi3")
+
+
+    # 
+    # merge theta data
+    # 
+    # find overlap
+    ovss <- findOverlaps(rowRanges(fds, type="theta"), 
+            extCts[['k_theta']], type="equal")
+
+    newCtsK_theta <- extractExtData(fds, K, "theta", ovss, extCts, "k_theta")
+    newCtsN_theta <- extractExtData(fds, N, "theta", ovss, extCts, "n_theta")
+
+
+    # 
+    # finalize merged FraserDataObject
+    # 
+    nsr <- SummarizedExperiment(
+            colData=newColData,
+            assays=SimpleList(
+                    rawCountsSS=newCtsK_theta,
+                    rawOtherCounts_theta=newCtsN_theta - newCtsK_theta),
+            rowRanges=rowRanges(fds, type="theta")[
+                    from(ovss),c("spliceSiteID", "type")])
+
+    ans <- new("FraserDataSet", 
+            name = name(fds),
+            bamParam = scanBamParam(fds),
+            strandSpecific = strandSpecific(fds),
+            workingDir = workingDir(fds),
+            colData = newColData,
+            assays=Assays(SimpleList(
+                    rawCountsJ=newCtsK_J,
+                    rawOtherCounts_psi5=newCtsN_psi5 - newCtsK_J,
+                    rawOtherCounts_psi3=newCtsN_psi3 - newCtsK_J)),
+            nonSplicedReads = nsr,
+            rowRanges = rowRanges(fds)[from(ov),c("startID", "endID")],
+            elementMetadata = DataFrame(newCtsK_J[,integer(0)]))
+
+    # 
+    # compute new psi values
+    # 
+    ans <- calculatePSIValues(ans)
+
+    ans
+}
diff --git a/inst/CITATION b/inst/CITATION
@@ -1,24 +1,24 @@
 citHeader("To cite FRASER in publications please use:")
 citEntry(entry = "article",
-    title    = "Detection of aberrant splicing events in RNA-seq data with FRASER",
+    title    = "Detection of aberrant splicing events in RNA-seq data using FRASER",
     author   = personList(
                     person("Christian", "Mertes"),
-                    person("Ines", "Scheller"),
+                    person(c("Ines", "F"), "Scheller"),
                     person(c("Vicente", "A"), "Y\\'{e}pez"),
                     person(c("Muhammed", "H"), "\\c{C}elik"),
                     person("Yingjiqiong", "Liang"),
                     person(c("Laura", "S"), "Kremer"),
                     person("Mirjana", "Gusic"),
                     person("Holger", "Prokisch"),
                     person("Julien", "Gagneur")),
-    journal  = "biorxiv",
-    volume   = "",
-    number   = "",
-    pages    = "",
-    year     = "2019",
-    issn     = "",
-    doi      = "10.1101/2019.12.18.866830",
+    journal  = "Nature Communications",
+    volume   = "12",
+    issue    = "1",
+    pages    = "529",
+    year     = "2021",
+    issn     = "2041-1723",
+    doi      = "10.1038/s41467-020-20573-7",
     textVersion  = paste(
-            "Mertes C, Scheller I et al.,",
-            "Detection of aberrant splicing events in RNA-seq data with FRASER.",
-            "biorxiv, 2019, 10.1101/2019.12.18.866830.") )
+            "Mertes, C., Scheller, I.F., Yepez, V.A. et al.",
+            "Detection of aberrant splicing events in RNA-seq data using FRASER.",
+            "Nat Commun 12, 529 2021. https://doi.org/10.1038/s41467-020-20573-7.") )
diff --git a/inst/extdata/externalCounts/annotation.tsv.gz b/inst/extdata/externalCounts/annotation.tsv.gz
diff --git a/inst/extdata/externalCounts/k_j_counts.tsv.gz b/inst/extdata/externalCounts/k_j_counts.tsv.gz
diff --git a/inst/extdata/externalCounts/k_theta_counts.tsv.gz b/inst/extdata/externalCounts/k_theta_counts.tsv.gz
diff --git a/inst/extdata/externalCounts/n_psi3_counts.tsv.gz b/inst/extdata/externalCounts/n_psi3_counts.tsv.gz
diff --git a/inst/extdata/externalCounts/n_psi5_counts.tsv.gz b/inst/extdata/externalCounts/n_psi5_counts.tsv.gz
diff --git a/inst/extdata/externalCounts/n_theta_counts.tsv.gz b/inst/extdata/externalCounts/n_theta_counts.tsv.gz