Skip to content

Commit

Permalink
Merge external counts (#23)
Browse files Browse the repository at this point in the history
* add merge of external counts functionality
* add the citation to nature communications
* fix rd man page linking 
* fix minor warnings in BiocCheck/RCheck
* fix/catch biomaRt specific issues
  • Loading branch information
c-mertes authored Jan 27, 2021
2 parents ee7f1f7 + 2f565ef commit 772196c
Show file tree
Hide file tree
Showing 18 changed files with 277 additions and 51 deletions.
20 changes: 12 additions & 8 deletions setupEnv.R → .github/helperScripts/setupEnv.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,16 @@ for(p in c("getopt", "XML", "xml2", "testthat", "devtools", "covr",
R.utils::withTimeout(timeout=2400, {
try({
print_log("Update packages")
BTYPE <- ifelse(.Platform$OS.type == 'unix', "source", "win.binary")
BTYPE <- ifelse(.Platform$OS.type == 'unix', "source", "win.binary")
INSTALL(ask=FALSE, type=BTYPE, Ncpus=NCPUS)

print_log("Install dev package")
try({ devtools::install(".", dependencies=TRUE, type=BTYPE) })

if(R.version[['major']] == "3"){
print_log("Install updated source package")
devtools::install_github("gagneurlab/OUTRIDER", dependencies=FALSE)
devtools::install_github("gagneurlab/OUTRIDER", dependencies=TRUE)
}

print_log("Install package")
devtools::install(".", dependencies=FALSE, type=BTYPE)
print_log("Install dev package")
devtools::install(".", dependencies=TRUE, type=BTYPE)
})
})

Expand All @@ -74,6 +71,13 @@ R.utils::withTimeout(timeout=2400, {
if(R.version[['major']] == "3"){
BiocManager::install(ask=FALSE, update=FALSE, c(
"Bioconductor/BiocFileCache", "grimbough/biomaRt", "yihui/knitr@v1.29"))
} else {
inst_biomaRt_version <- as.character(utils::packageVersion("biomaRt"))
if(utils::compareVersion("2.46.2", inst_biomaRt_version) == 1){
BiocManager::install("grimbough/biomaRt", ref="3_12_testing")
}
}

# to get FRASER session info
try({ library(FRASER) })
print(BiocManager::valid())
7 changes: 1 addition & 6 deletions .github/workflows/r.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,9 @@ jobs:

- name: Install dependencies
run: |
source("setupEnv.R")
source(".github/helperScripts/setupEnv.R")
remotes::install_cran("rcmdcheck")
shell: Rscript {0}

#run: |
# remotes::install_deps(dependencies = TRUE)
# remotes::install_cran("rcmdcheck")
#shell: Rscript {0}

- name: Check build
run: |
Expand Down
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ Collate:
find_encoding_dimensions.R
getURLs.R
helper-functions.R
mergeExternalData.R
saveHDF5Objects.R
RcppExports.R
autoencoder.R
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ export(loadFraserDataSet)
export(makeSimulatedFraserDataSet)
export(mapSeqlevels)
export(mergeCounts)
export(mergeExternalData)
export(name)
export(nonSplicedReads)
export(optimHyperParams)
Expand Down Expand Up @@ -215,6 +216,7 @@ importFrom(SummarizedExperiment,"assay<-")
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
importFrom(SummarizedExperiment,"rowRanges<-")
importFrom(SummarizedExperiment,Assays)
importFrom(SummarizedExperiment,SummarizedExperiment)
importFrom(SummarizedExperiment,assay)
importFrom(SummarizedExperiment,assayNames)
Expand Down
2 changes: 1 addition & 1 deletion R/FRASER-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
#' readGAlignmentPairs
#' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames
#' colData colData<- rowData rowRanges rowRanges<- SummarizedExperiment
#' rbind
#' rbind Assays
#' @importFrom GenomicRanges findOverlaps granges GRanges GRangesList
#' makeGRangesFromDataFrame invertStrand
#' @importFrom IRanges subsetByOverlaps from to IRanges ranges
Expand Down
29 changes: 17 additions & 12 deletions R/annotationOfRanges.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
#' @param orgDb An \code{orgDb} object or a data table to map the feature names.
#' If this is NULL, then \code{org.Hs.eg.db} is used as the default.
#' @param keytype The keytype or column name of gene IDs in the \code{TxDb}
#' object (see \code{\link[AnnotationDbi]{keytypes}} for a list
#' of available ID types).
#' object (see
#' \code{\link[AnnotationDbi:AnnotationDb-class]{keytypes}}
#' for a list of available ID types).
#'
#' @return FraserDataSet
#'
Expand All @@ -28,18 +29,24 @@
#' fds <- createTestFraserDataSet()
#'
#' ### Two ways to annotage ranges with gene names:
#' # either using biomart:
#' fds <- annotateRanges(fds, GRCh=38)
#' fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37)
#' rowRanges(fds, type="psi5")[,c("hgnc_symbol", "hgnc_symbol_37")]
#' # either using biomart with GRCh38
#' try({
#' fds <- annotateRanges(fds, GRCh=38)
#' rowRanges(fds, type="psi5")[,c("hgnc_symbol")]
#' })
#'
#' # either using biomart with GRCh37
#' try({
#' fds <- annotateRanges(fds, featureName="hgnc_symbol_37", GRCh=37)
#' rowRanges(fds, type="psi5")[,c("hgnc_symbol_37")]
#' })
#'
#' # or with a TxDb object
#' # or with a provided TxDb object
#' require(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#' require(org.Hs.eg.db)
#' orgDb <- org.Hs.eg.db
#' fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)
#'
#' rowRanges(fds, type="psi5")[,"hgnc_symbol"]
#'
#' @rdname annotateRanges
Expand All @@ -63,13 +70,11 @@ annotateRanges <- function(fds, feature="hgnc_symbol", featureName=feature,
tryCatch({
ensemblOutput <- capture.output(ensembl <- useEnsembl(
biomart="ensembl", dataset="hsapiens_gene_ensembl",
GRCh=GRCh
))
GRCh=GRCh))
},
error=function(e){
message("\nCheck if we have a internet connection!",
" Could not connect to ENSEMBL."
)
" Could not connect to ENSEMBL. With error: `", e, "`.")
})
if(is.null(ensembl)){
message("Nothing was annotated!")
Expand Down
4 changes: 2 additions & 2 deletions R/countRNAseqData.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@
#' used for all samples. If \code{genome} is supplied and
#' \code{strandSpecific(fds) == 0L} (unstranded), then the strand
#' information will be estimated by checking the dinucleotides found at the
#' intron boundaries
#' (see \code{\link[GenomicAlignments]{summarizeJunctions}}
#' intron boundaries (see
#' \code{\link[GenomicAlignments:junctions-methods]{summarizeJunctions}}
#' in GenomicAlignments package for details). This can e.g. help to avoid
#' ambiguities when adding gene names from a gene annotation to the introns
#' in a later step.
Expand Down
163 changes: 163 additions & 0 deletions R/mergeExternalData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
#'
#' Merge external data
#'
#' To boost its own sequencing data, one can download existing and precounted
#' data. This function merges the existing \code{FraserDataSet} with
#' external count data.
#'
#' For more details on existing datasets have a look at:
#' <https://github.com/gagneurlab/drop#datasets>
#'
#' Since FRASER can not hand NA values, the merge will return only the
#' intersecting regions and will drop any non overlapping features. This has to
#' be kept in mind when analysing rare disease samples.
#'
#' @param fds A \code{FraserDataSet}
#' @param countFiles A character vector of file names pointing to the external
#' count data. The vector has to be names or the files have to start
#' with \code{k_j}, \code{k_theta}, \code{n_psi3}, \code{n_psi5},
#' \code{n_theta}.
#' @param sampleIDs The samples to be merged from the external data.
#' @param annotation A sample annotation of the external data (optional).
#'
#' @return Merged \code{FraserDataSet} object.
#'
#' @examples
#' anno <- fread(system.file("extdata", "externalCounts",
#' "annotation.tsv.gz", package="FRASER"))
#' ctsFiles <- list.files(full.names = TRUE, pattern="counts",
#' system.file("extdata", "externalCounts", package="FRASER"))
#'
#' fds <- createTestFraserDataSet()
#' fds_merged <- mergeExternalData(fds, ctsFiles, anno[,sampleID], anno)
#'
#' K(fds, "psi5")
#' K(fds_merged, "psi5")
#'
#' @export
mergeExternalData <- function(fds, countFiles, sampleIDs, annotation=NULL){

# check input
checkFraserDataSet(fds)
checkCountData(fds)

if(length(countFiles) != 5){
stop("You have to provide 5 files, but only ", length(countFiles),
" were provided.")
}
if(is.null(names(countFiles))){
names(countFiles) <- gsub("(_counts)?.tsv.gz", "", basename(countFiles))
}
reqNames <- c("k_j", "k_theta", "n_psi3", "n_psi5", "n_theta")
if(any(!reqNames %in% unique(names(countFiles)))){
stop("Please name the input or the files correctly. We are missing: ",
paste(collapse=", ",
reqNames[!reqNames %in% names(countFiles)]))
}
if(any(!file.exists(countFiles))){
stop("Provided files are missing! We are missing: ",
paste(collapse=", ", countFiles[!file.exists(countFiles)]))
}
if(any(unique(sampleIDs) != sampleIDs)){
stop("Provided sampleIDs have to be unique!")
}
sampleIDs <- as.character(sampleIDs)

# load external counts
message("Loading provided counts")
names(reqNames) <- reqNames
extCts <- lapply(reqNames, function(id){
gr <- makeGRangesFromDataFrame(fread(countFiles[id]),
keep.extra.columns=TRUE)
if(any(!sampleIDs %in% colnames(mcols(gr)))){
stop("Can not find provided sampleID in count data. Missing IDs: ",
paste(collapse=", ",
sampleIDs[!sampleIDs %in% colnames(mcols(gr))]))
}
gr[,sampleIDs]
})
stopifnot(all(granges(extCts[['k_j']]) == granges(extCts[['n_psi3']])))
stopifnot(all(granges(extCts[['k_j']]) == granges(extCts[['n_psi5']])))
stopifnot(all(granges(extCts[['k_theta']]) == granges(extCts[['n_theta']])))

#
# merging colData
#
message(date(), ": Merging data ...")
if(!is.null(annotation)){
annotation <- DataFrame(annotation)
} else {
annotation <- DataFrame(sampleID=as.character(sampleIDs))
}
rownames(annotation) <- annotation[,"sampleID"]
newColData <- DataFrame(rbind(fill=TRUE,
as.data.table(colData(fds)),
as.data.table(annotation[sampleIDs,])))
rownames(newColData) <- newColData[,"sampleID"]

#
# merge psi5/psi3 data
#
extractExtData <- function(fds, countFun, type, ov, extData, extName){
ans <- as.matrix(cbind(countFun(fds, type=type)[from(ov),],
mcols(extData[[extName]])[to(ov),]))
mode(ans) <- "integer"
ans
}

# find overlap
if(all(strand(rowRanges(fds, type="j")) == "*")){
for(id in reqNames){
strand(extCts[[id]]) <- "*"
}
}
ov <- findOverlaps(rowRanges(fds, type="j"), extCts[['k_j']], type="equal")

newCtsK_J <- extractExtData(fds, K, "j", ov, extCts, "k_j")
newCtsN_psi5 <- extractExtData(fds, N, "psi5", ov, extCts, "n_psi5")
newCtsN_psi3 <- extractExtData(fds, N, "psi3", ov, extCts, "n_psi3")


#
# merge theta data
#
# find overlap
ovss <- findOverlaps(rowRanges(fds, type="theta"),
extCts[['k_theta']], type="equal")

newCtsK_theta <- extractExtData(fds, K, "theta", ovss, extCts, "k_theta")
newCtsN_theta <- extractExtData(fds, N, "theta", ovss, extCts, "n_theta")


#
# finalize merged FraserDataObject
#
nsr <- SummarizedExperiment(
colData=newColData,
assays=SimpleList(
rawCountsSS=newCtsK_theta,
rawOtherCounts_theta=newCtsN_theta - newCtsK_theta),
rowRanges=rowRanges(fds, type="theta")[
from(ovss),c("spliceSiteID", "type")])

ans <- new("FraserDataSet",
name = name(fds),
bamParam = scanBamParam(fds),
strandSpecific = strandSpecific(fds),
workingDir = workingDir(fds),
colData = newColData,
assays=Assays(SimpleList(
rawCountsJ=newCtsK_J,
rawOtherCounts_psi5=newCtsN_psi5 - newCtsK_J,
rawOtherCounts_psi3=newCtsN_psi3 - newCtsK_J)),
nonSplicedReads = nsr,
rowRanges = rowRanges(fds)[from(ov),c("startID", "endID")],
elementMetadata = DataFrame(newCtsK_J[,integer(0)]))

#
# compute new psi values
#
ans <- calculatePSIValues(ans)

ans
}
24 changes: 12 additions & 12 deletions inst/CITATION
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
citHeader("To cite FRASER in publications please use:")
citEntry(entry = "article",
title = "Detection of aberrant splicing events in RNA-seq data with FRASER",
title = "Detection of aberrant splicing events in RNA-seq data using FRASER",
author = personList(
person("Christian", "Mertes"),
person("Ines", "Scheller"),
person(c("Ines", "F"), "Scheller"),
person(c("Vicente", "A"), "Y\\'{e}pez"),
person(c("Muhammed", "H"), "\\c{C}elik"),
person("Yingjiqiong", "Liang"),
person(c("Laura", "S"), "Kremer"),
person("Mirjana", "Gusic"),
person("Holger", "Prokisch"),
person("Julien", "Gagneur")),
journal = "biorxiv",
volume = "",
number = "",
pages = "",
year = "2019",
issn = "",
doi = "10.1101/2019.12.18.866830",
journal = "Nature Communications",
volume = "12",
issue = "1",
pages = "529",
year = "2021",
issn = "2041-1723",
doi = "10.1038/s41467-020-20573-7",
textVersion = paste(
"Mertes C, Scheller I et al.,",
"Detection of aberrant splicing events in RNA-seq data with FRASER.",
"biorxiv, 2019, 10.1101/2019.12.18.866830.") )
"Mertes, C., Scheller, I.F., Yepez, V.A. et al.",
"Detection of aberrant splicing events in RNA-seq data using FRASER.",
"Nat Commun 12, 529 2021. https://doi.org/10.1038/s41467-020-20573-7.") )
Binary file added inst/extdata/externalCounts/annotation.tsv.gz
Binary file not shown.
Binary file added inst/extdata/externalCounts/k_j_counts.tsv.gz
Binary file not shown.
Binary file added inst/extdata/externalCounts/k_theta_counts.tsv.gz
Binary file not shown.
Binary file added inst/extdata/externalCounts/n_psi3_counts.tsv.gz
Binary file not shown.
Binary file added inst/extdata/externalCounts/n_psi5_counts.tsv.gz
Binary file not shown.
Binary file added inst/extdata/externalCounts/n_theta_counts.tsv.gz
Binary file not shown.
Loading

0 comments on commit 772196c

Please sign in to comment.