Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/vastgroup/vast-tools
Browse files Browse the repository at this point in the history
  • Loading branch information
mirimia committed Jul 15, 2024
2 parents 72fd500 + fa636f1 commit 7ab9196
Show file tree
Hide file tree
Showing 5 changed files with 310 additions and 221 deletions.
301 changes: 187 additions & 114 deletions R/Rlib/include_diff.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
#!/usr/bin/env Rscript
#
# Author: Tim Sterne-Weiler, 2014
# tim.sterne.weiler@utoronto.ca
# Modifications by Ulrich Braunschweig 2018
# Author: Tim Sterne-Weiler, Ulrich Braunschweig 2014-2024
# u.braunschweig@utoronto.ca

# This function takes a qual and returns c(post_alpha, post_beta)
# Increments by prior alpha and prior distribution beta, uniform by default
## Check that columns of INCLUSION... table are what we think they are
checkHeader <- function(x, replicateA, replicateB) {
reps <- c(replicateA, replicateB)
sampInd <- unlist(sapply(reps, FUN=function(y) {which(x == y)}))
if (length(sampInd) != length(reps)) {
stop("[vast diff error]: Not all replicate names found in input header!\n")
}
namesOK <- all(paste0(reps, "-Q") == x[sampInd + 1])
if (!namesOK | !all(x[1:3] == c("GENE", "EVENT", "COORD"))) {
stop("[vast diff error]: Input table does not have expected format!\n")
}
}

## This function takes a qual and returns c(post_alpha, post_beta)
## Increments by prior alpha and prior distribution beta, uniform by default
parseQual <- function(qual, prior_alpha=1, prior_beta=1) {
res1 <- as.numeric(sub("[^@]*@([^,]*),.*", "\\1", qual))
res2 <- as.numeric(sub("[^@]*@[^,]*,(.*)", "\\1", qual))
Expand All @@ -19,44 +31,43 @@ parseQual <- function(qual, prior_alpha=1, prior_beta=1) {
cbind(res1, res2)
}

# calculate the probability that the first dist is > than second
# P(psi1 > psi2) when alpha=0; more generally we are determining the probability
# that Psi1 is greater than Psi2 by alpha.. eg. P((psi1 - psi2) > alpha) =
## Calculate the probability that the first dist is > than second
## P(psi1 > psi2) when alpha=0; more generally we are determining the probability
## that psi1 is greater than psi2 by alpha.. eg. P((psi1 - psi2) > alpha) =
## IMPORTANT: run this function as sample(firstDist, length(firstDist)) AND
## sample(secondDist, length(secondDist))
## sample(secondDist, length(secondDist))
## UNLESS you have paired data, then don't sample.
pDiff <- function(firstDist, secondDist, alpha=0.15) {
N <- length(firstDist)
pass <- length( which( (firstDist - secondDist) > alpha ) )
N <- length(firstDist)
pass <- length(which((firstDist - secondDist) > alpha))
pass / N
}

# This function finds the maximum difference between psi1 and psi2 given
# at least some acceptable probability for difference. Defaults to 0.8
# distributions where no diff value exists with a probability > 0.8 are given
# maxDiff of 0.
maxDiff <- function(firstDist, secondDist, acceptProb=0.9) {
alphaSet <- seq(0,1,0.01) #make this global?
probs <- unlist(lapply(alphaSet, function(x) { pDiff(firstDist, secondDist, x) }))
## This function finds the maximum difference between psi1 and psi2 given
## at least some acceptable probability for difference. Defaults to 0.8
## distributions where no diff value exists with a probability > 0.8 are given
## maxDiff of 0.
maxDiff <- function(firstDist, secondDist, acceptProb=0.9, alphaSet) {
probs <- unlist(lapply(alphaSet, pDiff,
firstDist=firstDist, secondDist=secondDist))
ind <- max(c(which(probs > acceptProb), 1))
alphaSet[ind]
}

#
# return the beta variance
## Return the beta variance
betaVar <- function(alpha, beta) {
var <- alpha*beta / (
var <- alpha * beta / (
((alpha + beta) ** 2) * (alpha + beta + 1)
)
var
}

#
## Return a confidence interval
betaCI <- function(betaDist, percentile = c(0.05, 0.95)) {
quantile(betaDist, p=percentile, na.rm = T)
}

# Extension of betaCI function that includes the sampling step
## Extension of betaCI function that includes the sampling step
betaCISample <- function(alpha, beta, n = 5000) {
if (is.na(alpha) || is.na(beta)) {
sample <- NA
Expand All @@ -66,43 +77,44 @@ betaCISample <- function(alpha, beta, n = 5000) {
return(betaCI(sample))
}


### MAKE VISUAL OUTPUT
plotDiff <- function(inpOne, inpTwo, expOne, expTwo, maxD, medOne, medTwo, sampOneName, sampTwoName, rever) {
plotDiff <- function(inpOne, inpTwo, expOne, expTwo, maxD, medOne, medTwo,
sampOneName, sampTwoName, alphaSet, rever) {
### Make visual output

if(rever) { #write this better. ;-)
curCol <- cbb[3:2]
} else {
curCol <- cbb[2:3]
}

if(length(expOne) == 0 || length(expTwo) == 0) { return(NULL) }
if (length(expOne) == 0 || length(expTwo) == 0) {return(NULL)}
one <- data.frame(x=expOne, y=-0.5)
two <- data.frame(x=expTwo, y=-0.5)

distPlot <- ggplot(melt(as.data.frame(
do.call(cbind,list(inpOne, inpTwo))
), measure.vars=c("V1","V2")), aes(fill=variable, x=value))+
geom_histogram(aes(y=..density..), binwidth=0.03333,alpha=0.5, col="grey", position="identity")+
theme_bw()+xlim(c(0,1))+xlab(expression(hat(Psi)))+
scale_fill_manual(values=curCol, labels=c(sampOneName, sampTwoName), name="Samples")+
geom_point(data=one, mapping=aes(x=x, y=y), col=cbb[2], fill=cbb[2], alpha=0.85, inherit.aes = FALSE)+
geom_point(data=two, mapping=aes(x=x, y=y), col=cbb[3], fill=cbb[3], alpha=0.85, inherit.aes = FALSE)
distPlot <- ggplot(melt(as.data.frame(do.call(cbind, list(inpOne, inpTwo))),
measure.vars=c("V1","V2")),
aes(fill=variable, x=value)) +
geom_histogram(aes(y=after_stat(density)), binwidth=0.03333, alpha=0.5, col="grey", position="identity") +
theme_bw() + scale_x_continuous(limits = c(0, 1), oob = scales::oob_keep) + xlab(expression(hat(Psi))) +
scale_fill_manual(values=curCol, labels=c(sampOneName, sampTwoName), name="Samples") +
geom_point(data=one, mapping=aes(x=x, y=y), col=cbb[2], fill=cbb[2], alpha=0.50, inherit.aes = FALSE) +
geom_point(data=two, mapping=aes(x=x, y=y), col=cbb[3], fill=cbb[3], alpha=0.50, inherit.aes = FALSE)

probPlot <- ggplot(as.data.frame(cbind(seq(0,1,0.01),
unlist(lapply(alphaList, function(x) {
unlist(lapply(alphaSet, function(x) {
pDiff(inpOne, inpTwo, x)
})))), aes(x=V1, y=V2))+
geom_line()+theme_bw()+
geom_vline(xintercept=maxD, lty="dashed", col=cbb[7])+
ylab(expression(P((hat(Psi)[1]-hat(Psi)[2]) > x)))+
xlab(expression(x))+ylim(c(0,1))+
annotate("text", x=(maxD+0.08), y=0.05, label=maxD, col=cbb[7])

return(list(distPlot, probPlot))
})))), aes(x=V1, y=V2)) +
geom_line() + theme_bw() +
geom_vline(xintercept=maxD, lty="dashed", col=cbb[7]) +
ylab(expression(P((hat(Psi)[1] - hat(Psi)[2]) > x))) +
xlab(expression(x)) + scale_y_continuous(limits = c(0, 1), oob = scales::oob_keep) +
annotate("text", x=(maxD + 0.08), y=0.05, label=maxD, col=cbb[7])

return(list(distPlot, probPlot))#
}

plotPrint <- function(plotTitle, plotCoord, plotList) {
### Print saved plotting information to output file
grid.newpage()
pushViewport(viewport(layout = grid.layout(3, 2, widths = unit(c(5, 4), "null"), heights = unit(c(1, 0.5, 5), "null"))))
grid.text(as.character(plotTitle), check.overlap=TRUE, gp=gpar(font=2), draw=T, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:2))
Expand All @@ -117,106 +129,167 @@ diffBeta <- function(i, lines, opt,
totalFirst, totalSecond,
shapeFirstAve, shapeSecondAve,
expFirst, expSecond,
repA.qualInd, repB.qualInd) {
### Main diff functionality; fit beta distributions to sample groups for one event and compare
### Is applied to each line of the current nLines of the INCLUSION... table

## if no data, next; # adapted from @lpantano's fork 7/22/2015
if( (sum(totalFirst[i,] > (opt$minReads + opt$alpha + opt$beta)) < opt$minSamples) ||
(sum(totalSecond[i,] > (opt$minReads + opt$alpha + opt$beta)) < opt$minSamples) ) {
return(NULL)
}
repA.qualInd, repB.qualInd,
okFirst, okSecond, skip,
alphaSet) {
## Main diff functionality; fit beta distributions to sample groups for one event and compare
## Is applied to each line of the current nLines of the INCLUSION... table

## Sample Posterior Distributions
psiFirst <- lapply(1:(dim(shapeFirst)[2]), function(x) {
#sample here from rbeta(N, alpha, beta) if > -e
if(totalFirst[i,x] < opt$minReads) { return(NULL) }
rbeta(opt$size, shape1=shapeFirst[i,x,1], shape2=shapeFirst[i,x,2])
## sample here from rbeta(N, alpha, beta) if > -e
if (!okFirst[i,x]) {return(NULL)}
rbeta(opt$size, shape1=shapeFirst[i,x,1], shape2=shapeFirst[i,x,2])
})

psiSecond <- lapply(1:(dim(shapeSecond)[2]), function(x) {
#sample here from rbeta(N, alpha, beta) if > -e
if(totalSecond[i,x] < opt$minReads) { return(NULL) }
rbeta(opt$size, shape1=shapeSecond[i,x,1], shape2=shapeSecond[i,x,2])
## sample here from rbeta(N, alpha, beta) if > -e
if (!okSecond[i,x]) {return(NULL)}
rbeta(opt$size, shape1=shapeSecond[i,x,1], shape2=shapeSecond[i,x,2])
})

if(opt$paired) { #make sure both samples have a non-NULL replicate
for(lstInd in 1:length(psiFirst)) {
if(is.null(psiFirst[[lstInd]]) || is.null(psiSecond[[lstInd]])) {
psiFirst[[lstInd]] <- NULL
psiSecond[[lstInd]] <- NULL
}
}
}

## Create non-parametric Joint Distributions
psiFirstComb <- do.call(c, psiFirst)
psiSecondComb <- do.call(c, psiSecond)

if( length(psiFirstComb) <= 0 || length(psiSecondComb) <= 0 ) { return(NULL) }

## if they aren't paired, then shuffle the joint distributions...
if( !opt$paired ) {
paramFirst <- try (suppressWarnings(
fitdistr(psiFirstComb,
"beta",
list( shape1=shapeFirstAve[i,1], shape2=shapeFirstAve[i,2])
)$estimate ), TRUE )
paramSecond <- try (suppressWarnings(
fitdistr(psiSecondComb,
"beta",
list( shape1=shapeSecondAve[i,1], shape2=shapeSecondAve[i,2])
)$estimate ), TRUE )
## if optimization fails its because the distribution is too narrow
## in which case our starting shapes should already be good enough
if(class(paramFirst) != "try-error") {
psiFirstComb <- rbeta(opt$size, shape1=paramFirst[1], shape2=paramFirst[2])
}
if(class(paramSecond) != "try-error") {
psiSecondComb <- rbeta(opt$size, shape1=paramSecond[1], shape2=paramSecond[2])
}
## Try to to fit a beta distribution on the replicates' parameter estimates.
if (!opt$paired) {
if (length(psiFirstComb) > 0) {
paramFirst <- try(suppressWarnings(
fitdistr(psiFirstComb,
"beta",
list(shape1=shapeFirstAve[i,1], shape2=shapeFirstAve[i,2])
)$estimate ), TRUE )
}
if (length(psiSecondComb)) {
paramSecond <- try(suppressWarnings(
fitdistr(psiSecondComb,
"beta",
list(shape1=shapeSecondAve[i,1], shape2=shapeSecondAve[i,2])
)$estimate ), TRUE )
}
## If optimization fails it's because the distribution is too narrow
## in which case our starting shapes should already be good enough.
## Shuffle since not paired, and downsample to size of smaller number
## but don't downsize if the other sample type is absent anyway.
minSample <- min(c(length(psiFirstComb), length(psiSecondComb)))

if (length(psiFirstComb) > 0) {
if (!inherits(paramFirst, "try-error")) {
psiFirstComb <- rbeta(opt$size, shape1=paramFirst[1], shape2=paramFirst[2])
} else {
if (length(psiSecondComb) > 0) {
psiFirstComb <- sample(psiFirstComb, minSample)
}
}
}
if (length(psiSecondComb) > 0) {
if (!inherits(paramSecond, "try-error")) {
psiSecondComb <- rbeta(opt$size, shape1=paramSecond[1], shape2=paramSecond[2])
} else {
if (length(psiFirstComb) > 0) {
psiSecondComb <- sample(psiSecondComb, minSample)
}
}
}
}

## get empirical posterior median of psi
medOne <- median(psiFirstComb)
medTwo <- median(psiSecondComb)

## look for a max difference given prob cutoff...
if(medOne > medTwo) {
max <- maxDiff(psiFirstComb, psiSecondComb, opt$prob)
if (skip[i]) {
if (is.null(medOne)) {medOne <- NA}
if (is.null(medTwo)) {medTwo <- NA}
max <- NA
} else {
max <- maxDiff(psiSecondComb, psiFirstComb, opt$prob)
if (medOne > medTwo) {
max <- maxDiff(psiFirstComb, psiSecondComb, opt$prob, alphaSet)
} else {
max <- maxDiff(psiSecondComb, psiFirstComb, opt$prob, alphaSet)
}
}

## SIGNIFICANT from here on out:

filtOut <- sprintf("%s\t%s\t%f\t%f\t%f\t%s", lines[[i]][1], lines[[i]][2], medOne, medTwo, medOne - medTwo, round(max,2))
filtOut <- sprintf("%s\t%s\t%f\t%f\t%f\t%s",
lines[[i]][1], lines[[i]][2], medOne, medTwo, medOne-medTwo, round(max,2))

## check for significant difference
if(max < opt$minDiff) {
## non-sig, return null plots and text output
return(list(NULL, NULL, NULL, NULL, filtOut))
} else {
sigInd <- i

if (opt$noPDF) {
eventTitle <- NULL
eventCoord <- NULL
retPlot <- NULL
} else {
if (opt$noPDF || (is.na(max) || max < opt$minDiff)) {
## non-sig, return null plots and text output
return(list(NULL, NULL, NULL, NULL, filtOut))
} else {
## significant; plot
sigInd <- i
eventTitle <- paste(c("Gene: ", lines[[i]][1], " Event: ", lines[[i]][2]), collapse="")
eventCoord <- paste(c("Coordinates: ", lines[[i]][3]), collapse="")

## Print visual output to pdf;
if( medOne > medTwo ) {
## Store visual output to be printed to pdf
if (medOne > medTwo) {
retPlot <- plotDiff(psiFirstComb, psiSecondComb,
expFirst[i,], expSecond[i,], max, medOne, medTwo, sampOneName, sampTwoName, FALSE)
expFirst[i,][okFirst[i,]], expSecond[i,][okSecond[i,]],
max, medOne, medTwo, sampOneName, sampTwoName, alphaSet, FALSE)
} else {
retPlot <- plotDiff(psiSecondComb, psiFirstComb,
expFirst[i,], expSecond[i,], max, medTwo, medOne, sampTwoName, sampOneName, TRUE)
expFirst[i,][okFirst[i,]], expSecond[i,][okSecond[i,]],
max, medTwo, medOne, sampTwoName, sampOneName, alphaSet, TRUE)
}
}
## sig event return
return(list(retPlot, eventTitle, eventCoord, sigInd, filtOut)) #return of mclapply function

## sig event return
return(list(retPlot, eventTitle, eventCoord, sigInd, filtOut))
}
}

writeLog <- function(vastPath, opt) {
### Add a line with call information to the general vast-tools log file

logName <- file.path(opt$output, "VTS_LOG_commands.txt")
vastVer <- scan(file.path(vastPath, "VERSION"), what="character", quiet = T)

## Get date, time and vast-tools version
systime <- Sys.time()
sdate <- paste(
sub("([0-9]{4})-.+", "\\1", systime),
sub("[0-9]{4}-[0-9]{2}-([0-9]{2}).+", "\\1", systime),
sub("[0-9]{4}-([0-9]{2})-.+", "\\1", systime),
sep="-")
stime <- sub("[^ ]+ (.{5}):[0-9]{2}.*", "\\1", systime)

msg1 <- paste0(
"[VAST-TOOLS v", vastVer, ", ",
sdate, " (", stime, ")]"
)

msg2 <- "vast-tools diff"

## Format input options
msg3 <- paste0(
"-a ", opt$replicateA, " ",
"-b ", opt$replicateB, " ",
"--sampleNameA ", opt$sampleNameA, " ",
"--sampleNameB ", opt$sampleNameB, " ",
"-i ", opt$input, " ",
"-n ", opt$nLines, " ",
"-p ", opt$paired, " ",
"-d ", opt$baseName, " ",
"-o ", opt$output, " ",
"-r ", opt$prob, " ",
"-m ", opt$minDiff, " ",
"-e ", opt$minReads, " ",
"-S ", opt$minSamples, " ",
"--alpha ", opt$alpha, " ",
"--beta ", opt$beta, " ",
"-s ", opt$size, " ",
"-z ", opt$seed
)

msg <- paste(msg1, msg2, msg3, paste = " ")

try_error <- try(logFile <- file(logName, "a"), silent = TRUE)
if (inherits(try_error, "try-error")) {
cat("[vast diff warning]: Could not open log file\n")
} else {
write(msg, logFile)
close(logFile)
}
}
Loading

0 comments on commit 7ab9196

Please sign in to comment.