-
Notifications
You must be signed in to change notification settings - Fork 2
/
multiIntersectBed.sh
executable file
·150 lines (136 loc) · 6.75 KB
/
multiIntersectBed.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/bin/bash
#PBS -l nodes=1:ppn=4
CONFIG_FILTER="multiIn"
#### usage ####
usage() {
echo
echo Program: "multiIntersectBed.sh (intersect genomic coordinates from multiple BED files)"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: pundhir@binf.ku.dk
echo "Usage: multiIntersectBed.sh -i <files> [OPTIONS]"
echo "Note: Different from multiIntersectBed (bedtools) since, this only selects the single most observed coordinate among the consecutive overlapping coordinates"
echo "Options:"
echo " -i <files> [input BED files seperated by a comma]"
echo " **OR**"
echo " [input configuration file containing bed file information]"
echo " [<id> <uniqueId> <bed file> (id should be multiIn]"
echo "[OPTIONS]"
echo " -j <string> [unique name to describe each input bed file separated by a comma]"
echo " **OR**"
echo " [can be specified in the config file]"
echo " -f <string> [filter in input BED files for this input string parameter]"
echo " -F <string> [filter out input BED files for this input string parameter]"
echo " -h [help]"
echo
exit 0
}
#### parse options ####
while getopts i:j:f:F:h ARG; do
case "$ARG" in
i) BEDFILE=$OPTARG;;
j) NAME=$OPTARG;;
f) FILTER_IN=$OPTARG;;
F) FILTER_OUT=$OPTARG;;
h) HELP=1;;
esac
done
## usage, if necessary file and directories are given/exist
if [ -z "$BEDFILE" -o "$HELP" ]; then
usage
fi
###################
#helperfunction
function wait_for_jobs_to_finish {
for job in `jobs -p`
do
echo $job
wait $job
done
echo $1
}
###############
## check if input is BED files or configuration file containing BED file information
INPUT=$(echo $BEDFILE | perl -ane '$_=~s/\,.*//g; print $_;')
if [ "$(zless $INPUT | grep -wvi start | sortBed -i stdin 2>/dev/null | wc -l)" -le 0 ]; then
## read configuration file
NAME=$(cat $BEDFILE | perl -ane '
if($_=~/^'$CONFIG_FILTER'/) {
if(!$seen{$F[1]}) {
$file_name.="$F[1],";
$seen{$F[1]}=1;
}
} END {
$file_name=~s/\,$//g;
print "$file_name\n";
}'
)
INPUT=$(cat $BEDFILE | perl -ane '
if($_=~/^'$CONFIG_FILTER'/) {
$file.="$F[2],";
} END {
$file=~s/\,$//g;
print "$file\n";
}'
)
BEDFILE=$INPUT
fi
#echo -e "$BEDFILE\t$NAME"; exit
## parse input bam files in an array
IFS=","
BEDFILES=($BEDFILE)
BEDFILES_COUNT=${#BEDFILES[@]}
IFS=" "
## initialize name parameter, if provided
if [ ! -z "$NAME" ]; then
IFS=","
NAMES=($NAME)
NAMES_COUNT=${#NAMES[@]}
IFS=" "
else
NAMES_COUNT=0
fi
if [ "$BEDFILES_COUNT" -lt 2 -o ! -z "$NAME" -a "$BEDFILES_COUNT" -ne "$NAMES_COUNT" ]; then
echo -n "minimum two input bed files are required as input. Also provide name for each input bed file";
usage
fi
COMMAND_BED=""
COMMAND_NAME=""
for (( i=0; i<$BEDFILES_COUNT; i++ )); do
TMP_NAME[i]=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
#TMP_NAME[i]=$RANDOM
if [ ! -z "$FILTER_IN" ]; then
zless ${BEDFILES[$i]} | perl -ane 'for($i=0; $i<scalar(@F); $i++) { if($F[$i]=~/^'$FILTER_IN'$/) { print $_; last; } }' | bedtools sort -i - > ${TMP_NAME[$i]}.bed
elif [ ! -z "$FILTER_OUT" ]; then
zless ${BEDFILES[$i]} | perl -ane '$found=0; for($i=0; $i<scalar(@F); $i++) { if($F[$i]=~/^'$FILTER_OUT'$/i) { $found=1; last; } } if($found==0) { print $_; }' | bedtools sort -i - > ${TMP_NAME[$i]}.bed
else
bedtools sort -i ${BEDFILES[$i]} > ${TMP_NAME[$i]}.bed
fi
COMMAND_BED="$COMMAND_BED ${TMP_NAME[$i]}.bed"
if [ ! -z "$NAME" ]; then
COMMAND_NAME="$COMMAND_NAME ${NAMES[$i]}"
fi
done
wait
#echo "bedtools multiinter -i $COMMAND_BED -names $COMMAND_NAME"; exit
if [ ! -z "$NAME" ]; then
bedtools multiinter -i $COMMAND_BED -names $COMMAND_NAME | perl -ane 'if(defined($line)) { if($F[1]==$last_coor) { if($F[3]>$last_counter) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } else { $last_coor=$F[2]; } } elsif($last_coor!=$F[1]) { print "$line"; $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } } elsif(!defined($line)) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } END { print "$line"; }'
## last_counter=$F[3] removed from else condition (Feb 22, 2017)
#bedtools multiinter -i $COMMAND_BED -names $COMMAND_NAME | perl -ane 'if(defined($line)) { if($F[1]==$last_coor) { if($F[3]>$last_counter) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } else { $last_coor=$F[2]; $last_counter=$F[3]; } } elsif($last_coor!=$F[1]) { print "$line"; $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } } elsif(!defined($line)) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } END { print "$line"; }'
#bedtools multiinter -i $COMMAND_BED -names $COMMAND_NAME | perl -ane 'if(defined($line)) { if($F[1]==$last_coor) { if($F[3]>$last_counter) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } else { $last_coor=$F[2]; $last_counter=$F[3]; } } elsif($last_counter!=$F[1]) { print "$line"; $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } } elsif(!defined($line)) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } END { print "$line"; }'
else
bedtools multiinter -i $COMMAND_BED | perl -ane 'if(defined($line)) { if($F[1]==$last_coor) { if($F[3]>$last_counter) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } else { $last_coor=$F[2]; } } elsif($last_coor!=$F[1]) { print "$line"; $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } } elsif(!defined($line)) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } END { print "$line"; }'
## last_counter=$F[3] removed from else condition (Feb 22, 2017)
#bedtools multiinter -i $COMMAND_BED | perl -ane 'if(defined($line)) { if($F[1]==$last_coor) { if($F[3]>$last_counter) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } else { $last_coor=$F[2]; $last_counter=$F[3]; } } elsif($last_coor!=$F[1]) { print "$line"; $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } } elsif(!defined($line)) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } END { print "$line"; }'
#bedtools multiinter -i $COMMAND_BED | perl -ane 'if(defined($line)) { if($F[1]==$last_coor) { if($F[3]>$last_counter) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } else { $last_coor=$F[2]; $last_counter=$F[3]; } } elsif($last_counter!=$F[1]) { print "$line"; $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } } elsif(!defined($line)) { $line=$_; $last_coor=$F[2]; $last_counter=$F[3]; } END { print "$line"; }'
fi | while read line; do
if [ ! -z "$FILTER_IN" ]; then
echo $line$'\t'$FILTER_IN;
else
echo "$line";
fi
done
## remove temporary files
for (( i=0; i<$BEDFILES_COUNT; i++ )); do
rm ${TMP_NAME[$i]}.bed
done