Skip to content

Commit

Permalink
Merge branch 'ciri' into dev2
Browse files Browse the repository at this point in the history
  • Loading branch information
retaj committed Mar 9, 2017
2 parents 6d9eef1 + 66e6fdd commit 1e1810b
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 42 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
^\.git
^\.gitignore
^\.travis\.yml$
^CHANGELOG\.md$
9 changes: 7 additions & 2 deletions R/annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,13 @@ setMethod("annotateCircs", signature("RangedSummarizedExperiment"),
message('annotating splice junctions...')
se <- annotateFlanks(se, annot.list$gene.feats)
se <- annotateJunctions(se, annot.list$junctions)
message('calculating circular/linear ratios...')
se <- circLinRatio(se)

if (grepl("linear", names(assays(se)))) {
message('calculating circular/linear ratios...')
se <- circLinRatio(se)
} else {
message('no linear splicing info found, skipping circular/linear ratios...')
}

return(se)

Expand Down
81 changes: 47 additions & 34 deletions R/readData.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ readCircs <- function(file, subs="all", qualfilter=TRUE, keepCols=1:6, ...) {
}

} else if (ncol(DT) >= 21 & names(DT)[1] == '#chrom'){

# read find_circ2
DT.lin <- fread(sub("circ_splice_sites.bed$", "lin_splice_sites.bed", file), sep="\t", header=T)
DT.lin$name <- sub("lin", "norm", DT.lin$name)
DT <- rbind(DT.lin, DT)
Expand All @@ -53,6 +53,9 @@ readCircs <- function(file, subs="all", qualfilter=TRUE, keepCols=1:6, ...) {
setnames(DT, names(DT)[5], "n_reads") # TODO: this is dirty af

DT <- DT[!grepl("#", DT$chrom)]
} else if (ncol(DT) == 12 & names(DT)[1] == 'circRNA_ID') {
# read CIRI2
setnames(DT, c('name', 'chrom', 'start', 'end', 'n_reads', 'SM_MS_SMS', 'n_reads_nonjunction', 'junction_reads_ratio', 'circRNA_type', 'gene_id', 'strand', 'junction_reads_ID'))
}


Expand Down Expand Up @@ -113,38 +116,46 @@ setMethod("summarizeCircs", signature("data.frame"),
function(colData, keep.linear, wobble, subs, qualfilter, keepCols){


# munge input
# -------------------------------------- #
coldata.cnams = c('sample','filename')
if(!all(coldata.cnams %in% colnames(colData)))
stop(paste(setdiff(coldata.cnams, colnames(colData),
'is missing from colData')))


# -------------------------------------- #
circ.files = as.character(colData$filename)

# -------------------------------------- #
if(!all(file.exists(circ.files)))
stop('Supplied circ files do not exist')

# load circs
# -------------------------------------- #
circs = lapply(circ.files, readCircs, subs, qualfilter, keepCols)
names(circs) <- colData$sample

dcircs = rbindlist(circs)
if (grepl("_circ_norm_", dcircs$name[1]) | grepl("_circ_circ_", dcircs$name[1])) {
message('funky naming scheme used, will convert _circ_norm_ to _norm_ and _circ_circ_ to _circ_ before everything crashes')
dcircs$name <- sub("_circ_norm_", "_norm_", dcircs$name) # TODO: add exception somewhere, if there is _circ_norm_ in names, boom
dcircs$name <- sub("_circ_circ_", "_circ_", dcircs$name)
}
dcircs$type = ifelse(grepl('circ', dcircs$name), 'circ', 'linear')
#dcircs$set = factor(sub('norm_.+','',sub('circ_.+','',dcircs$name))) # old
dcircs$set = factor(rep(names(circs), sapply(circs, nrow)), levels=names(circs))
dcircs = split(dcircs, dcircs$type)

# -------------------------------------- #
message('Processing circular transcripts')
circ.gr = makeGRangesFromDataFrame(as.data.frame(dcircs[['circ']]), keep.extra.columns=TRUE)
# process circular and linear if input is find_circ
if (!("SM_MS_SMS" %in% names(circs[[1]]))) { #TODO: find a better way to recognize CIRI2
# find_circ
if (grepl("_circ_norm_", dcircs$name[1]) | grepl("_circ_circ_", dcircs$name[1])) {
message('funky naming scheme used, will convert _circ_norm_ to _norm_ and _circ_circ_ to _circ_ before everything crashes')
dcircs$name <- sub("_circ_norm_", "_norm_", dcircs$name)
dcircs$name <- sub("_circ_circ_", "_circ_", dcircs$name)
}
dcircs$type = ifelse(grepl('circ', dcircs$name), 'circ', 'linear')

dcircs = split(dcircs, dcircs$type)

circ.gr = makeGRangesFromDataFrame(as.data.frame(dcircs[['circ']]), keep.extra.columns=TRUE)
} else {
# CIRI2
circ.gr = makeGRangesFromDataFrame(as.data.frame(dcircs), keep.extra.columns=TRUE)
}

# prepare the wobble
# -------------------------------------- #
circ.gr.s = resize(resize(circ.gr, fix='start', width=1), fix='center', width=wobble)
circ.gr.e = resize(resize(circ.gr, fix='end', width=1), fix='center', width=wobble)

Expand All @@ -157,25 +168,27 @@ setMethod("summarizeCircs", signature("data.frame"),
#circ.gr.reduced = sort(unlist(range(split(circ.gr, merge.fos$fac), ignore.strand=FALSE)))
# TODO: wobble logic is naive, should be improved to select the best expressed circRNA as a referent one
circ.gr.reduced = unlist(range(split(circ.gr, merge.fos$fac), ignore.strand=FALSE))

# prepare the assays object
# TODO: this screams refactoring
# -------------------------------------- #
message('Fetching circular expression')
# TODO: pull this out as a helper function that can go through all circ.gr metadata
# columns and assign them to assays
# circ.ex = merge.fos[,.(queryHits, fac)]
# circ.ex$nreads = circ.gr$n_reads[circ.ex$queryHits]
# circ.ex$set = circ.gr$set[circ.ex$queryHits]
# circ.ex.matrix = dcast.data.table(formula=fac~set,
# fun.aggregate=sum,
# fill=0,
# value.var='nreads',
# data=circ.ex)
# circ.ex.matrix = circ.ex.matrix[match(names(circ.gr.reduced), circ.ex.matrix$fac)]
n_reads.dt <- MungeColumn(merge.fos, circ.gr, circ.gr.reduced, "n_reads")
n_uniq.dt <- MungeColumn(merge.fos, circ.gr, circ.gr.reduced, "n_uniq")

assays = list()
assays$circ = as.matrix(n_reads.dt[,-1,with=FALSE])
assays$circ.uniq = as.matrix( n_uniq.dt[,-1,with=FALSE])
#assays$circ = as.matrix(circ.ex.matrix[,-1,with=FALSE])
if (!("SM_MS_SMS" %in% names(circs[[1]]))) { #TODO: find a better way to recognize CIRI2

n_reads.dt <- MungeColumn(merge.fos, circ.gr, circ.gr.reduced, "n_reads")
n_uniq.dt <- MungeColumn(merge.fos, circ.gr, circ.gr.reduced, "n_uniq")

assays$circ = as.matrix(n_reads.dt[, -1, with=FALSE])
assays$circ.uniq = as.matrix( n_uniq.dt[, -1, with=FALSE])

} else {

n_reads.dt <- MungeColumn(merge.fos, circ.gr, circ.gr.reduced, "n_reads")
assays$circ = as.matrix(n_reads.dt[, -1, with=FALSE])

}

if(keep.linear==TRUE){
message('Processing linear transcripts')
Expand Down Expand Up @@ -203,9 +216,9 @@ setMethod("summarizeCircs", signature("character"),
function(colData, keep.linear, wobble, subs, qualfilter,keepCols){

message('Constructing colData...')
colData = data.frame(sample = sub('.candidates.bed','',basename(colData)),
filename=colData,
stringsAsFactors=FALSE)
colData = data.frame(sample = sub('.candidates.bed', '', basename(colData)),
filename = colData,
stringsAsFactors=FALSE)

summarizeCircs(colData=colData,
keep.linear=keep.linear,
Expand Down
17 changes: 12 additions & 5 deletions R/visualize.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,22 @@ setMethod("uniqReadsQC",
signature("RangedSummarizedExperiment"),
definition=function(se, sample) {

if (sample != "all" & !(sample %in% colnames(se))) stop(sample, ' is not a valid sample.')
# this makes sense only for find_circ analyses,
# CIRI does not report unique reads
if (!("circ.uniq" %in% names(assays(se)))) {
stop('circ.uniq assay does not exist')
}
if (sample != "all" & !(sample %in% colnames(se))) {
stop(sample, ' is not a valid sample.')
}

SMPL <- sample
if (sample == "all") {
totals <- rowSums(assays(circs.se)$circ)
uniqs <- rowSums(assays(circs.se)$circ.uniq)
totals <- rowSums(assays(se)$circ)
uniqs <- rowSums(assays(se)$circ.uniq)
} else {
totals <- assays(circs.se)$circ[, SMPL]
uniqs <- assays(circs.se)$circ.uniq[, SMPL]
totals <- assays(se)$circ[, SMPL]
uniqs <- assays(se)$circ.uniq[, SMPL]
}

tmp.dt <- data.table(totals = totals,
Expand Down
19 changes: 18 additions & 1 deletion inst/scripts/test_load_data.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,20 @@
#CIRI2
annot.list <- loadAnnotation("inst/extdata/db/test.sqlite")
cdata <- data.frame(sample=c("riboz", "RNaseR"),
filename=c("inst/extdata/ciri_demo_hek/HEK_riborezo_CIRI_sites.txt",
"inst/extdata/ciri_demo_hek/HEK_RNaseR_CIRI_sites.txt"))

# colData <- cdata
# keep.linear <- TRUE
# wobble <- 1
# subs <- "all"
# qualfilter <- FALSE
# keepCols <- 1:12


circs.se <- summarizeCircs(colData = cdata, keep.linear = FALSE, wobble = 1, subs = "all", qualfilter = FALSE, keepCols = 1:12)
circs.se <- annotateCircs(se = circs.se, annot.list = annot.list, assembly = "hg19")

# find_circ2
annot.list <- loadAnnotation("data/test.sqlite")

Expand All @@ -14,7 +31,7 @@ circs.se <- annotateCircs(se = circs.se, annot.list = annot.list, assembly = "hg

histogram(circs.se, 0.5)
annotPie(circs.se, 0.02)
uniqReadsQC(circs.se, "D0")
uniqReadsQC(circs.se, "all")

# development
#library(data.table)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,19 @@ test_that('sample labels are robust upon resorting colData', {
# sorted vs. unsorted, resTable()
expect_equal(resTable(se.sorted)$FrontalCortex_rep1_sites.bed_circ, resTable(se.unsorted)$FrontalCortex_rep1_sites.bed_circ)
})

test_that('CIRI2 input can be digested by ciRcus', {

circ.files = list.files(system.file('extdata/ciri_demo_hek', package='ciRcus'),
pattern='HEK',
full.names=TRUE)

se <- summarizeCircs(circ.files, keep.linear = FALSE, wobble = 1, subs = "all", qualfilter = FALSE, keepCols = 1:12)

# total circRNAs in both samples
expect_equal(nrow(resTable(se)), 14936)
# there should be 4127 ribozero circRNAs
expect_equal(sum(resTable(se)[, 6, with=F] > 0), 4127)
# there should be 14013 RNaseR circRNAs
expect_equal(sum(resTable(se)[, 7, with=F] > 0), 14013)
})

0 comments on commit 1e1810b

Please sign in to comment.