-
Notifications
You must be signed in to change notification settings - Fork 2
/
ngsPlot
executable file
·349 lines (314 loc) · 14.5 KB
/
ngsPlot
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#!/bin/bash
#PBS -l nodes=1:ppn=4
GENOME="mm9";
YLIM="auto";
L=1000;
CD=0.6
SC="global";
CO="Reds";
SHIFT=0;
GO="none"
RR=30
RZ=0
KNC=5
SS="both"
MW=13
REORDER_NGSPLOT="lsk,pregm,gmp,granulocytes"
GENOMIC_REGIONS="bed"
PLOTTITLE="ngsplot"
#### usage ####
usage() {
echo Program: "ngsPlot (plot histone profiles for a set of genomic regions)"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: pundhir@binf.ku.dk
echo "Usage: ngsPlot -i <file | stdin> -j <dir> [OPTIONS]"
echo "Options:"
echo " -i <file> [input file(s) having regions of interest in BED format or gene id's, one per line ]"
echo " [if multiple seperate them by a comma]"
echo " [FORMAT: chr start end name score strand]"
echo " -j <dir> [input directory containing BAM files for various histone marks]"
echo " [if multiple seperate them by a comma]"
echo " -o <dir> [output directory to store results]"
echo "[OPTIONS]"
echo " -f <string> [filter BAM files matching identifiers eg. wt or gmp]"
echo " [if multiple seperate them by a comma]"
echo " -d <string> [filter BAM files not matching identifiers eg. wt or gmp]"
echo " [if multiple seperate them by a comma]"
echo " -g <string> [genome (default: mm9)]"
echo " -t <string> [name of the output histone profile plot (default: ngsplot)]"
echo " -u <string> [text identifier for input file(s)]"
echo " [if multiple seperate them by a comma]"
echo " -c <string> [specify line color for average profile in hexadecimal code (enclose in double quotes)]"
echo " [if multiple seperate them by a comma]"
echo " -r <string> [specify color for heatmap (default: Reds); suffix -R to reverse the order of color palette, eg. RdBu-R]"
echo " [for bam-pair, use color-pair(neg_color:pos_color)]"
echo " -p <float> [color distribution for heatmap (positive number; default: 0.6)]"
echo " [ Hint: lower values give more widely spaced colors at the negative end]"
echo " [ In other words, they shift the neutral color to positive values]"
echo " [ If set to 1, the neutral color represents 0 (i.e. no bias)]"
echo " -m [merge replicates of input BAM files]"
echo " -w [ngsPlot description file already exists]"
echo " -y <string> [y-axis limit (min,max)]"
echo " -s <string> [color scale for heatmap (min,max) (default: global)]"
echo " [local: base on each individual heatmap]"
echo " [region: base on all heatmaps belong to the same region]"
echo " [global: base on all heatmaps together]"
echo " [min_val,max_val: custom scale using a pair of numerics]"
echo " -l <int> [flanking region size (default: 1000)]"
echo " -z <int> [shift center position by input number of bases (default: 0)]"
echo " -a <string> [gene order algorithm for heatmap (default: none)]"
echo " [total, hc, max, prod, diff, km, width, none]"
echo " -b <int> [reduce ratio for heatmap (default: 30)]"
echo " [the parameter controls the heatmap height. The smaller the value, the taller the heatmap]"
echo " -q <int> [K-means or HC number of clusters (default=5)]"
echo " -k <int> [remove all zero profiles in heatmap (default: 0) ]"
echo " [set 1 to remove them]"
echo " -n [scale signal profile between 0 and 1]"
echo " -v <string> [reorder NGSPLOT.TXT based on input keywords (default: lsk,pregm,gmp,granulocytes)]"
echo " -e <string> [genomic regions (bed or genebody; default: bed)]"
echo " -x <string> [strand-specific coverage calculation: both (default), same, opposite]"
echo " -M <int> [moving window width to smooth avg. profiles, must be integer]"
echo " [1=no; 3=slightly; 5=somewhat; 9=quite; 13=super (default)]"
echo " -h [help]"
echo
exit 0
}
#### parse options ####
while getopts i:j:o:f:d:g:t:u:c:r:p:mwy:s:l:z:a:b:q:k:nv:e:x:M:h ARG; do
case "$ARG" in
i) REGION=$OPTARG;;
j) BAMDIR=$OPTARG;;
o) OUTDIR=$OPTARG;;
f) FILTER_MATCH=$OPTARG;;
d) FILTER_NOMATCH=$OPTARG;;
g) GENOME=$OPTARG;;
t) PLOTTITLE=$OPTARG;;
u) REGIONTITLE=$OPTARG;;
c) COLOR=$OPTARG;;
r) CO=$OPTARG;;
p) CD=$OPTARG;;
m) MERGE=1;;
w) EXIST=1;;
y) YLIM=$OPTARG;;
s) SC=$OPTARG;;
l) L=$OPTARG;;
z) SHIFT=$OPTARG;;
a) GO=$OPTARG;;
b) RR=$OPTARG;;
q) KNC=$OPTARG;;
k) RZ=$OPTARG;;
n) SCALE=1;;
v) REORDER_NGSPLOT=$OPTARG;;
e) GENOMIC_REGIONS=$OPTARG;;
x) SS=$OPTARG;;
M) MW=$OPTARG;;
h) HELP=1;;
esac
done
## usage, if necessary file and directories are given/exist
if [ -z "$REGION" -o -z "$BAMDIR" -o -z "$OUTDIR" -o "$HELP" ]; then
usage
fi
###################
#helperfunction
function wait_for_jobs_to_finish {
for job in `jobs -p`
do
echo $job
wait $job
done
echo $1
}
###############
<<"COMMENT1"
COMMENT1
echo -n "Create directory structure... "
if [ ! -d "$OUTDIR" ]; then
mkdir -p $OUTDIR
fi
echo "done"
oIFS=$IFS
IFS=","
REGIONS=($REGION)
IFS=$oIFS
## initialize a text title for each input file, if not provided
if [ -z "$REGIONTITLE" ]; then
REGIONTITLE=""
for(( i=0; i<${#REGIONS[@]}; i++ )); do
REGIONTITLE="$REGIONTITLE,$i"
done
REGIONTITLE=`echo $REGIONTITLE | perl -ane '$_=~s/^\,//g; print $_;'`;
fi
IFS=","
REGIONTITLES=($REGIONTITLE)
IFS=$oIFS
if [ "${#REGIONS[@]}" -ne "${#REGIONTITLES[@]}" ]; then
echo
echo "ERROR: a text title should be provided for each input file"
echo
usage
fi
if [ -z "$EXIST" ]; then
for (( j=0; j<${#REGIONS[@]}; j++ )); do
## determine, if the input peaks are from a file or stdin
echo -n "Create peak file depending on if the input is from file or STDIN... " >&2
if [ -f "${REGIONS[$j]}" ]; then
if [ "$GENOMIC_REGIONS" == "bed" -a "$L" -gt 0 ]; then
zless ${REGIONS[$j]} | perl -ane '$mid=sprintf("%0.0f", ($F[1]+$F[2])/2); $mid=$mid+'$SHIFT'; $start=$mid; $end=$mid+1; $line="$F[0]\t$start\t$end"; foreach(@F[3..scalar(@F)-1]) { chomp($_); $line.="\t$_"; } print "$line\n";' > $OUTDIR/REGIONS_INTEREST$j.bed
elif [ "$GENOMIC_REGIONS" == "bed" ]; then
zless ${REGIONS[$j]} > $OUTDIR/REGIONS_INTEREST$j.bed
else
zless ${REGIONS[$j]} | perl -ane 'chomp($F[0]); print "$F[0]\n";' > $OUTDIR/REGIONS_INTEREST$j.bed
fi
elif [ "${REGIONS[$j]}" == "stdin" ]; then
if [ "$GENOMIC_REGIONS" == "bed" -a "$L" -gt 0 ]; then
while read LINE; do echo ${LINE}; done | perl -ane '$mid=sprintf("%0.0f", ($F[1]+$F[2])/2); $mid=$mid+'$SHIFT'; $start=$mid; $end=$mid+1; $line="$F[0]\t$start\t$end"; foreach(@F[3..scalar(@F)-1]) { chomp($_); $line.="\t$_"; } print "$line\n";' > $OUTDIR/REGIONS_INTEREST$j.bed
elif [ "$GENOMIC_REGIONS" == "bed" ]; then
while read LINE; do echo ${LINE}; done | perl -ane '$line="$F[0]\t$F[1]\t$F[2]"; foreach(@F[3..scalar(@F)-1]) { chomp($_); $line.="\t$_"; } print "$line\n";' > $OUTDIR/REGIONS_INTEREST$j.bed
else
while read LINE; do echo ${LINE}; done | perl -ane 'chomp($_); print "$F[0]\n";' > $OUTDIR/REGIONS_INTEREST$j.bed
fi
else
usage
fi
## sort input bed file by width, if required
if [ $GO == "width" ]; then
TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
bed2sort -i $OUTDIR/REGIONS_INTEREST$j.bed > $TMP
mv $TMP $OUTDIR/REGIONS_INTEREST$j.bed
GO="none"
fi
echo "done" >&2
echo -n "Compute histone profile for each peak file... " >&2
IFS=","
FILTERS_MATCH=($FILTER_MATCH)
IFS=$oIFS
IFS=","
FILTERS_NOMATCH=($FILTER_NOMATCH)
IFS=$oIFS
REGIONS_COUNT=`less $OUTDIR/REGIONS_INTEREST$j.bed | wc -l`;
IFS=","
BAMDIRS=($BAMDIR)
IFS=$oIFS
for DIR in "${BAMDIRS[@]}"; do
for file in $DIR/*.bam; do
MATCH_COUNT=0
for (( i=0; i<${#FILTERS_MATCH[@]}; i++ )); do
FOUND=`echo $file | perl -ane '$_=~s/^.*\///g; if($_=~/'${FILTERS_MATCH[$i]}'/) {print "1\n";}else{print "0\n";}'`;
#echo "${FILTERS_MATCH[$i]} $file $FOUND $i"
MATCH_COUNT=$(($MATCH_COUNT + $FOUND))
done
NOMATCH_COUNT=0
for (( i=0; i<${#FILTERS_NOMATCH[@]}; i++ )); do
FOUND=`echo $file | perl -ane '$_=~s/^.*\///g; if($_=~/'${FILTERS_NOMATCH[$i]}'/) {print "1\n";}else{print "0\n";}'`;
#echo "${FILTERS_NOMATCH[$i]} $file $FOUND $i"
NOMATCH_COUNT=$(($NOMATCH_COUNT + $FOUND))
done
TITLE=`echo $file | sed 's/^.*\///g;' | sed 's/\..*//g' | perl -ane 'chomp($_); print "$_#'${REGIONTITLES[$j]}'";'`;
if [ "$MATCH_COUNT" == ${#FILTERS_MATCH[@]} -a "$NOMATCH_COUNT" -eq 0 ]; then
echo -e "$file\t$OUTDIR/REGIONS_INTEREST$j.bed\t\"$TITLE\""
#echo -e "$file\t$OUTDIR/REGIONS_INTEREST$j.bed\t\"$TITLE\"\t150\t\"${COLOR[$k]}\""
fi
done
done
echo "done" >&2
done > $OUTDIR/NGSPLOT.txt.tmp
## reorder NGSPLOT file
#IFS=","
#REORDER_ITEM=($REORDER_NGSPLOT)
#IFS=$oIFS
#for ITEM in "${REORDER_ITEM[@]}"; do
# grep "_"$ITEM"_" $OUTDIR/NGSPLOT.txt.tmp
#done > $OUTDIR/NGSPLOT.txt.tmp.reorder
#comm -3 <(sort $OUTDIR/NGSPLOT.txt.tmp.reorder) <(sort $OUTDIR/NGSPLOT.txt.tmp) | sed -E 's/^\s+//g' >> $OUTDIR/NGSPLOT.txt.tmp.reorder
#mv $OUTDIR/NGSPLOT.txt.tmp.reorder $OUTDIR/NGSPLOT.txt.tmp
## define color codes (http://colorbrewer2.org)
if [ -z "$COLOR" ]; then
IFS=" "
COLORS=($(getRColor.R -i $(cat $OUTDIR/NGSPLOT.txt.tmp | wc -l) -p));
IFS=$oIFS
else
COLOR=$(echo $COLOR | perl -ane '$_=~s/\"//g; print $_;')
IFS=","
COLORS=($COLOR);
IFS=$oIFS
fi
k=-1;
LAST_SEEN_BAM_FILE="NA";
LAST_SEEN_BED_FILE="NA";
echo -n "using colors: " >&2
while read LINE; do
BAM_FILE=$(echo ${LINE} | cut -f 1 -d ' ');
BED_FILE=$(echo ${LINE} | cut -f 2 -d ' ');
if [ "${#REGIONS[@]}" -gt "1" -a -z "$MERGE" ]; then
if [ "$LAST_SEEN_BED_FILE" != "$BED_FILE" -o "$LAST_SEEN_BAM_FILE" != "$BAM_FILE" ]; then
k=$(( k+1 ))
fi
else
k=$(( k+1 ))
fi
if [ -z "${COLORS[$k]}" ]; then
k=-1;
echo >&2
echo "WARNING: not enough line colors provided for region classes" >&2
echo >&2
fi
echo -e "${LINE}\t150\t\"${COLORS[$k]}\""
echo -n "${COLORS[$k]} " >&2
LAST_SEEN_BED_FILE=$BED_FILE;
LAST_SEEN_BAM_FILE=$BAM_FILE;
done < $OUTDIR/NGSPLOT.txt.tmp > $OUTDIR/NGSPLOT.txt
rm $OUTDIR/NGSPLOT.txt.tmp
echo "done" >&2
fi
echo -n "plot signal profile.. "
FORBIDIMG=1
#if [ ! -f "$OUTDIR/$PLOTTITLE.zip" ]; then
ngs.plot.r -G $GENOME -R $GENOMIC_REGIONS -C $OUTDIR/NGSPLOT.txt -O $OUTDIR/$PLOTTITLE -T "$REGIONTITLE" -RZ $RZ -MW $MW -Al spline -FL 150 -GO $GO -RR $RR -SC $SC -L $L -YAS $YLIM -SE 1 -FI $FORBIDIMG -KNC $KNC -D refseq -SS $SS
#fi
echo "done"
FORBIDIMG=0
if [ ! -z "$MERGE" ]; then
replotMergedReplicates.r prof -I $OUTDIR/$PLOTTITLE".zip" -O $OUTDIR/$PLOTTITLE".avgprof" -FI $FORBIDIMG -YAS $YLIM -MW $MW -FS 10 -GO $GO -KNC $KNC
replotMergedReplicates.r heatmap -I $OUTDIR/$PLOTTITLE".zip" -O $OUTDIR/$PLOTTITLE".heatmap" -FI $FORBIDIMG -MW $MW -SC $SC -FS 15 -CO $CO -CD $CD -RR $RR -GO $GO -KNC $KNC
else
replot.r prof -I $OUTDIR/$PLOTTITLE".zip" -O $OUTDIR/$PLOTTITLE".avgprof" -FI $FORBIDIMG -YAS $YLIM -MW $MW -FS 10 -GO $GO -KNC $KNC
replot.r heatmap -I $OUTDIR/$PLOTTITLE".zip" -O $OUTDIR/$PLOTTITLE".heatmap" -FI $FORBIDIMG -MW $MW -SC $SC -FS 15 -CO $CO -CD $CD -RR $RR -GO $GO -KNC $KNC
fi
echo -n "scale signal profile between 0 and 1.. "
if [ ! -z "$SCALE" ]; then
ngsPlot.R -i $OUTDIR/$PLOTTITLE.avgprof.RData -o $OUTDIR/$PLOTTITLE.avgprof.pdf
fi
echo "done"
if [ -s "$OUTDIR/$PLOTTITLE.heatmap.clusterInfo" -a "${#REGIONS[@]}" -eq 1 ]; then
TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
join -1 4 -2 1 <(sort -k 4,4 $OUTDIR/REGIONS_INTEREST0.bed) <(cat $OUTDIR/$PLOTTITLE.heatmap.clusterInfo | perl -ane '$F[0]=~s/\:[^\:]+$//g; print "$F[0]\t$F[1]\n";' | sort -k 1,1) | perl -ane 'print "$F[1]\t$F[2]\t$F[3]\t$F[0]"; foreach(@F[4..scalar(@F)-1]) { print "\t$_"; } print "\n";' > $OUTDIR/$TMP
mv $OUTDIR/$TMP $OUTDIR/$PLOTTITLE.heatmap.clusterInfo
fi
## OLD CODE
<<"COMMENT"
## merge replicates of input bam files
if [ ! -z "$MERGE" ]; then
for (( j=0; j<${#REGIONS[@]}; j++ )); do
for BAM_ID in $(cat $OUTDIR/NGSPLOT.txt | grep -w $OUTDIR/REGIONS_INTEREST$j | cut -f 1 | sed 's/_Rep.*//g' | sed 's/\.bam//g' | sort | uniq); do
BAM_FILES=$(grep $BAM_ID $OUTDIR/NGSPLOT.txt | grep -w $OUTDIR/REGIONS_INTEREST$j | perl -ane '$bam_file.="$F[0] "; END { print "$bam_file\n"; }');
NEW_BAM_ID=$(echo $BAM_ID | perl -ane '$_=~s/^.*\///g; print $_;')
if [ ! -f "$OUTDIR/$NEW_BAM_ID.bam" ]; then
samtools merge $OUTDIR/$NEW_BAM_ID.bam $BAM_FILES
fi
grep $BAM_ID $OUTDIR/NGSPLOT.txt | grep -w $OUTDIR/REGIONS_INTEREST$j | head -n 1 | perl -ane 'print '$OUTDIR/$NEW_BAM_ID.bam'\t$F[1]\t'$NEW_BAM_ID'\t$F[3]\t$F[4]\n"; }';
done
done
#done > $OUTDIR/NGSPLOT.txt
fi
exit
COMMENT
#wait
#wait_for_jobs_to_finish "Wait for jobs to finish... "
#echo "done"
#CURRDIR=`pwd`;
#echo -n "plot a single plot showing the histone profile for all peaks... "
#ngsPlot.R -i $CURRDIR/$OUTDIR -t "$PLOTTITLE ($REGIONS_COUNT)" -o $PLOTTITLE.pdf &>/dev/null
#pdfjam $CURRDIR/$OUTDIR/*h3k4me1*heatmap.pdf $CURRDIR/$OUTDIR/*h3k4me3*heatmap.pdf $CURRDIR/$OUTDIR/*h3k27ac*heatmap.pdf --frame true --nup 6x1 --landscape -o $CURRDIR/$OUTDIR/REGIONS_INTEREST_HISTONE_PROFILE_HEAT.tmp.pdf && pdfcrop $CURRDIR/$OUTDIR/REGIONS_INTEREST_HISTONE_PROFILE_HEAT.tmp.pdf $CURRDIR/$OUTDIR/REGIONS_INTEREST_HISTONE_PROFILE_HEAT.pdf && rm $CURRDIR/$OUTDIR/REGIONS_INTEREST_HISTONE_PROFILE_HEAT.tmp.pdf
#echo "done"