-
Notifications
You must be signed in to change notification settings - Fork 2
/
motifDynAna.R
executable file
·204 lines (189 loc) · 9.58 KB
/
motifDynAna.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("optparse"))
## parse command line arguments
option_list <- list(
make_option(c("-i", "--inFile"), help="input file containing motif enrichment dynamics (can be stdin)"),
make_option(c("-n", "--minFreq"), default="50", help="minimum frequency of NFRs in each nfr dynamic class (defaut=%default)"),
make_option(c("-y", "--minIdentity"), default="0.8", help="minimum identity of denovo motif with the known motif (defaut=%default)"),
make_option(c("-d", "--diffFreq"), default="0", help="minimum difference in enrichment between categories (defaut=%default)"),
make_option(c("-o", "--outPdfFile"), help="output pdf image file"),
make_option(c("-f", "--onlyDiffFreq"), action="store_true", help="less stringent criteria. use only difference in frequency"),
make_option(c("-t", "--plotRange"), action="store_true", help="even less stringent criteria. plot motifs within a range of differential enrichment"),
make_option(c("-v", "--plotSim"), action="store_true", help="instead plot motifs similar in frequency"),
make_option(c("-r", "--rescale"), action="store_true", help="rescale color bar between min and max value"),
make_option(c("-k", "--breaks"), default="0.3", help="breaks for rescale color bar (default=%default)"),
make_option(c("-c", "--color"), default="RdBu", help="color palette (default=%default)"),
make_option(c("-b", "--colorBias"), default="1", help="bias in color (default=%default)"),
make_option(c("-l", "--mustIncludeMotif"), help="name of motifs that must be included in the final output (if multiple, separate them by a comma)")
)
parser <- OptionParser(usage = "%prog [options]", option_list=option_list)
opt <- parse_args(parser)
## check, if all required arguments are given
if(is.null(opt$inFile) | is.null(opt$outPdfFile)) {
cat("\nProgram: motifDynAna.R (R script to plot motif dynamics)\n")
cat("Author: BRIC, University of Copenhagen, Denmark\n")
cat("Version: 1.0\n")
cat("Contact: pundhir@binf.ku.dk\n");
print_help(parser)
q()
}
## load libraries
suppressPackageStartupMessages(library(reshape))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(session))
suppressPackageStartupMessages(library(pheatmap))
## header of DENOVO- or KNOWN-MOTIF_ENRICHMENT_DYNAMICS.TXT
# 1. SAMPLE_ID
# 2. MOTIF_NAME
# 3. MOTIF
# 4. P-value
# 5. log_P-value
# 6. q-value
# 7. No_of_target_sequences_with_motif
# 8. %_of_target_sequences_with_motif
# 9. No_of_background_sequences_with_motif
# 10. %_of_background_sequences_with_motif
# 11. No_of_target_sequences
# 12. No_of_background_sequences
if(identical(opt$inFile, "stdin")==T) {
dataRaw <- read.table(file("stdin"))
} else {
dataRaw <- read.table(opt$inFile)
}
dataRaw$V13 <- log2((dataRaw$V8+0.01)/(dataRaw$V10+0.01))
dataRaw$V14 <- gsub("^.*BestGuess:", "", dataRaw$V2)
dataRaw$V14 <- gsub("\\(.*", "", dataRaw$V14)
dataRaw$V15 <- gsub("^.*BestGuess:", "", dataRaw$V2)
dataRaw <- dataRaw[which(dataRaw$V11>=as.numeric(opt$minFreq)),]
dataRaw$V15 <- as.numeric(gsub("\\)", "", gsub("^.*\\(", "", dataRaw$V15)))
if(length(which( !is.na(dataRaw$V15), arr.ind=TRUE))>0) {
data <- dataRaw[which(dataRaw$V11>=as.numeric(opt$minFreq) & dataRaw$V15>=as.numeric(opt$minIdentity)),]
} else {
data <- dataRaw[which(dataRaw$V11>=as.numeric(opt$minFreq)),]
data$V15 <- 0
}
if(!is.null(opt$mustIncludeMotif)) {
data <- rbind(data, dataRaw[grep(paste(unlist(strsplit(as.character(opt$mustInclude),",")),collapse="\\/|"), dataRaw$V2),])
data <- data[order(data$V1, data$V2),]
data[is.na(data$V15),]$V15 <- 0
data <- unique(data)
}
cat(sprintf("%d out of %d motifs passed identity (%s) and frequency (%s) score criteria..", nrow(data)/length(unique(data$V1)), nrow(dataRaw)/length(unique(dataRaw$V1)), opt$minIdentity, opt$minFreq))
cat("\n")
no_rows=nrow(data)/length(unique(data$V1))
tf_info <- as.data.frame(data[1:no_rows, c(14,15)])
dat <- matrix(data$V8, nrow=no_rows)
pat <- matrix(data$V4, nrow=no_rows)
## use log fold change
mat <- matrix(data$V13, nrow=no_rows)
## use q-value
#mat <- matrix(data$V6, nrow=no_rows)
data$V1 <- sprintf("%s_%s", data$V1, data$V11)
colnames(mat) <- as.vector(unique(data$V1))
row.names(mat) <- data[1:no_rows,2]
colnames(dat) <- as.vector(unique(data$V1))
row.names(dat) <- data[1:no_rows,2]
if(!is.null(opt$plotSim)) {
#sig_rows <- which(apply(mat, 1, function(x) max(x) > 0.1 & min(x) > 0.1 & max(x)-min(x) <= as.numeric(opt$diffFreq)))
#sig_rows <- which(apply(mat, 1, function(x) max(x)-min(x) <= as.numeric(opt$diffFreq)))
sig_rows <- row(mat)[,1]
} else if(!is.null(opt$plotRange)) {
sig_rows <- which(apply(mat, 1, function(x) max(x) > 1.5 & max(x)-min(x) > 0.3 & max(x)-min(x) <= as.numeric(opt$diffFreq)))
} else if(!is.null(opt$onlyDiffFreq)) {
sig_rows <- which(apply(mat, 1, function(x) max(x)-min(x)>as.numeric(opt$diffFreq)))
} else {
sig_rows <- which(apply(mat, 1, function(x) max(x) > 0 & min(x) < 0 & max(x)-min(x) > as.numeric(opt$diffFreq)))
#sig_rows <- which(apply(mat, 1, function(x) max(x) > as.numeric(opt$diffFreq) & min(x) < -1*as.numeric(opt$diffFreq) & max(x)-min(x) > 1))
}
#sig_rows <- which(apply(dat, 1, function(x) max(x)>3))
cat(sprintf("%d out of %d motifs passed filter criteria..", length(sig_rows), nrow(mat)))
cat("\n")
if(length(sig_rows)>2) {
pdf(opt$outPdfFile, height=20)
#breaks <- as.vector(summary(as.vector(mat[sig_rows,])))
#len=2
#breaks1 <- seq(breaks[1], breaks[2], length=len)
#breaks2 <- seq(breaks[2], breaks[3], length=len)
#breaks3 <- seq(breaks[3], breaks[4], length=len)
#breaks4 <- seq(breaks[4], breaks[5], length=len)
#breaks5 <- seq(breaks[5], breaks[6], length=len)
#breaks <- c(breaks1[1:length(breaks1)], breaks2[2:length(breaks2)],
# breaks3[2:length(breaks3)], breaks4[2:length(breaks4)],
# breaks5[2:length(breaks5)])
if(!is.null(opt$rescale)) {
breaks <- seq(min(mat[sig_rows,]), max(mat[sig_rows,]), by=as.numeric(opt$breaks))
#if(length(breaks)<=12) {
# myCol <- rev(brewer.pal(length(breaks)-1, "RdBu"))
#} else {
# myCol <- colorRampPalette(c("dodgerblue4","grey97", "sienna3"))(length(breaks)-1)
# #myCol <- colorpanel(n=length(breaks)-1,low="blue",mid="#f7f7f7",high="red")
# #myCol <- colorpanel(n=length(breaks)-1,low="#2166ac",mid="#f7f7f7",high="#b2182b")
#}
myCol <- rev(colorRampPalette(brewer.pal(11, opt$color), bias=as.numeric(opt$colorBias))(length(breaks)-1))
#heatmap.2(mat[sig_rows,], trace="none", col=myCol, margins=c(15,25), cexCol=1, cexRow=1, breaks=breaks)
pheatmap(mat[sig_rows,], col=myCol, border_color = NA, cluster_rows=F, breaks=breaks)
} else {
myCol <- rev(colorRampPalette(brewer.pal(11, opt$color), bias=as.numeric(opt$colorBias))(256))
#myCol <- rev(brewer.pal(11, opt$color))
#heatmap.2(mat[sig_rows,], trace="none", col=myCol, margins=c(15,25), cexCol=1, cexRow=1)
pheatmap(mat[sig_rows,], col=myCol, border_color = NA, cluster_rows=F)
#heatmap.2(mat, trace="none", col=myCol, margins=c(15,20), cexCol=1, cexRow=1)
}
dev.off()
txtFile=sprintf("%s.txt", opt$outPdfFile)
mat_df <- as.data.frame(mat)
mat_df$diff <- apply(mat_df, 1, function(x) max(x)-min(x))
dat_df <- as.data.frame(dat)
dat_df$class <- 0
dat_df[which(apply(dat_df, 1, function(x) max(x)>5)==T),]$class <- 1
dat_df[which(apply(dat_df, 1, function(x) max(x)>10)==T),]$class <- 2
#dat_df[which(apply(dat_df, 1, function(x) max(x)>20)==T),]$class <- 3
mat_df <- merge(mat_df, dat_df, by=0)
write.table(mat_df[order(-mat_df[,5]),], txtFile, quote=F, append=F, sep="\t", col.names=T, row.names=T)
} else if(length(sig_rows)>=1) {
txtFile=sprintf("%s.txt", opt$outPdfFile)
cat("\nNumber of significant motifs are too low for a heatmap. Instead writing results to txtFile\n\n")
if(length(sig_rows)==1) {
write(rownames(mat)[sig_rows], file="motif_dynamics.txt", sep="\t")
write.table(mat[sig_rows,], txtFile, quote=F, append=TRUE, sep="\t")
} else {
write.table(mat[sig_rows,], txtFile, quote=F, append=FALSE, sep="\t")
}
} else {
cat("\nNo significant motif is found\n")
}
#save.session("test.session")
## old code (v2.0)
#data <- read.table(opt$inFile)
#data$V10 <- apply(data, 1, function(x) binom.test(as.numeric(x[6]), as.numeric(x[7]), as.numeric(x[8])/as.numeric(x[9]), alternative="g")$p.value)
#no_rows=nrow(data)/length(unique(data$V5))
#dat <- matrix(data$V2, nrow=no_rows)
## use log fold change
#mat <- matrix(data$V4, nrow=no_rows)
## use difference between real and background percentages
#mat <- matrix(data$V2-data$V3, nrow=no_rows)
## use p-value computed using binomial test
#mat <- matrix(data$V10, nrow=no_rows)
#colnames(mat) <- as.vector(unique(data$V5))
#row.names(mat) <- data[1:no_rows,1]
#myCol <- brewer.pal(9, "Blues")
#sig_rows <- which(apply(dat, 1, function(x) max(x))>0.33)
#pdf(opt$outPdfFile)
#heatmap.2(mat[sig_rows,], trace="none", col=myCol, margins=c(12,20), cexCol=1, cexRow=1)
#dev.off()
## old code (v1.0)
#data$count <- 1
#dat <- dcast(data, V2 ~ V1)
#dat[is.na(dat)] <- 0
#cols <- ncol(dat)
#dat$mean <- apply(dat[,c(2:cols)],1, mean)
#dat <- dat[with(dat, order(-mean)),]
#mat <- data.matrix(dat)[,c(2:cols)]
#rownames(mat) <- dat$V2
#mycol <- colorpanel(n=99,low="white",high="black")
#colnames(mat) <- gsub("granulocytes", "grn", colnames(mat))
#pdf(opt$outPdfFile)
#heatmap.2(mat, trace="none", col=mycol, margins=c(11,10), cexCol=1)
#dev.off()