-
Notifications
You must be signed in to change notification settings - Fork 2
/
tabEdit
executable file
·89 lines (82 loc) · 3.65 KB
/
tabEdit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/bin/bash
# Resource request for a PBS/Torque style batch scheduler (one node, four
# processors per node). Bash itself treats the next line as a plain comment.
#PBS -l nodes=1:ppn=4
# Default 1-based column that holds the gene name; replaced by the -e option.
GENECOL=4
#### usage ####
#######################################
# Print the tabEdit help text and terminate the script.
# Arguments:
#   $1 - exit status to terminate with (optional; defaults to 0, which is
#        what every existing caller gets)
# Outputs:
#   The help text on stdout, followed by one empty line.
# Returns:
#   Never returns to the caller; always calls exit.
#######################################
usage() {
    # Quoted delimiter: the text is emitted literally, nothing is expanded.
    cat <<'EOF'
Program: tabEdit (edit or gather information about a tab delimited file)
Author: BRIC, University of Copenhagen, Denmark
Version: 1.0
Contact: pundhir@binf.ku.dk
Usage: tabEdit -i <file> [OPTIONS]
Options:
 -i <file> [input tab delimited file (can be stdin)]
 -c [count number of columns (default)]
 -r [remove empty columns]
 -t [convert spaces to tab]
 -m [convert comma to tab]
 -n <int> [reformat file to ensure input number of columns]
 [fill missing columns with NA]
 -f [format input file into proper five column BED format]
 -a <file> [input file in BED format containing gene coordinates]
 [used to add gene coordinate information to file]
 [assumes gene name in fourth column]
 -e <int> [column containing gene name information (default: 4)]
 -h [help]

EOF
    exit "${1:-0}"
}
#### parse options ####
# Start from a clean slate so that same-named variables inherited from the
# environment cannot select an action. TMP matters most: if it arrived set
# from the environment, the cleanup step would try to delete whatever it
# names. GENECOL is deliberately not reset here; it keeps whatever value it
# had before this section unless -e is given.
INFILE="" COUNT="" REMOVE="" TAB="" COMMA="" NCOL=""
FORMATBED="" ADDGENE="" HELP="" TMP=""

# Every option letter is declared exactly once ("i" used to be listed twice,
# once with and once without an argument).
while getopts i:crtmn:fa:e:h ARG; do
    case "$ARG" in
        i) INFILE=$OPTARG;;
        c) COUNT=1;;   # counting columns is the fall-through action; flag accepted for compatibility
        r) REMOVE=1;;
        t) TAB=1;;
        m) COMMA=1;;
        n) NCOL=$OPTARG;;
        f) FORMATBED=1;;
        a) ADDGENE=$OPTARG;;
        e) GENECOL=$OPTARG;;
        h) HELP=1;;
    esac
done

## usage, if the input file is missing or help was requested
if [[ -z "$INFILE" || -n "$HELP" ]]; then
    usage
fi

## create temporary file if input is from stdin
if [[ "$INFILE" == "stdin" ]]; then
    TMP=$(mktemp "${TMPDIR:-/tmp}/tabEdit.XXXXXX") || {
        echo "tabEdit: cannot create temporary file" >&2
        exit 1
    }
    # Remove the temporary copy on every exit path, including failures.
    trap 'rm -f -- "$TMP"' EXIT
    # Byte-for-byte copy: a read/echo loop would strip leading and trailing
    # tabs (changing the column count), mangle backslashes and drop a final
    # line that lacks a newline.
    cat > "$TMP"
    INFILE=$TMP
elif [[ ! -r "$INFILE" ]]; then
    echo "tabEdit: cannot read input file '$INFILE'" >&2
    exit 1
fi

## run exactly one action; priority order: -a, -f, -t, -m, -n, -r, count
if [[ -n "$ADDGENE" ]]; then
    if [[ ! "$GENECOL" =~ ^[1-9][0-9]*$ ]]; then
        echo "tabEdit: -e expects a positive integer, got '$GENECOL'" >&2
        exit 1
    fi
    if [[ ! -r "$ADDGENE" ]]; then
        echo "tabEdit: cannot read gene file '$ADDGENE'" >&2
        exit 1
    fi
    # For every input line, look up the first line of the gene file in which
    # the gene name appears surrounded by whitespace. zgrep is started through
    # a list-form pipe open, so neither the gene name nor the file name ever
    # passes through a shell. Both values reach perl via the environment
    # instead of being pasted into the program text.
    #   gene name in column 1 : matched -> six gene-file columns, then the line
    #                           unmatched -> the line prefixed with "#"
    #   any other column      : matched -> line + "chr:start-end|strand"
    #                           unmatched -> line + "NA"
    zless "$INFILE" | TABEDIT_GENES="$ADDGENE" TABEDIT_GENECOL="$GENECOL" perl -ane '
        chomp($_);
        $gene = $F[$ENV{TABEDIT_GENECOL} - 1];
        $match = "";
        if(open(my $hit, "-|", "zgrep", "-m", "1", "-E", "\\s+$gene\\s+", $ENV{TABEDIT_GENES})) {
            $match = <$hit>;
            $match = "" if(!defined($match));
            close($hit);
        }
        if($match!~/^$/) {
            @coor = split(/\s+/, $match);
            if($ENV{TABEDIT_GENECOL} == 1) {
                print "$coor[0]\t$coor[1]\t$coor[2]\t$coor[3]\t$coor[4]\t$coor[5]\t$_\n";
            }
            else {
                print "$_\t$coor[0]:$coor[1]-$coor[2]|$coor[5]\n";
            }
        }
        elsif($ENV{TABEDIT_GENECOL} == 1) {
            print "#$_\n";
        }
        else {
            print "$_\tNA\n";
        }
    '
elif [[ -n "$FORMATBED" ]]; then
    # Drop comment lines, then emit six columns: chr, start, end, peak_<n>,
    # score and ".". The score is column 5 when it is a positive number,
    # otherwise 1.
    zless "$INFILE" | perl -ane 'if($_!~/^#/) { print $_; }' | perl -ane '$j++; if($F[4]!~/^[0-9\.]+$/ || $F[4]<=0) { print "$F[0]\t$F[1]\t$F[2]\tpeak_$j\t1\t.\n"; } else { print "$F[0]\t$F[1]\t$F[2]\tpeak_$j\t$F[4]\t.\n"; }'
elif [[ -n "$TAB" ]]; then
    # Re-join whitespace separated fields with single tabs; comment lines pass through.
    zless "$INFILE" | perl -ane 'if($_=~/^#/) { print $_; next; } $line=""; foreach(@F) { $line.="$_\t"; } $line=~s/\t+$//g; $line=~s/\s+$//g; print "$line\n";'
elif [[ -n "$COMMA" ]]; then
    # Same as above, but the fields are split on commas.
    zless "$INFILE" | perl -an -F'/\,/' -e 'if($_=~/^#/) { print $_; next; } $line=""; foreach(@F) { $line.="$_\t"; } $line=~s/\t+$//g; $line=~s/\s+$//g; print "$line\n";'
elif [[ -n "$NCOL" ]]; then
    if [[ ! "$NCOL" =~ ^[0-9]+$ ]]; then
        echo "tabEdit: -n expects a non-negative integer, got '$NCOL'" >&2
        exit 1
    fi
    # Pad short rows with NA up to NCOL columns. The count is read from the
    # environment rather than spliced into the perl source, so values such as
    # "08" are treated as numbers instead of invalid octal literals.
    zless "$INFILE" | TABEDIT_NCOL="$NCOL" perl -an -F'/\s+/' -e 'print $F[0]; foreach(@F[1..scalar(@F)-1]) { chomp($_); print "\t$_"; } if(scalar(@F)<$ENV{TABEDIT_NCOL}) { $i=$ENV{TABEDIT_NCOL}-scalar(@F); while($i>0) { print "\tNA"; $i--; } } print "\n";'
elif [[ -n "$REMOVE" ]]; then
    ## remove empty columns (strips trailing whitespace from every line)
    # Read through zless like every other action so compressed input works here too.
    zless "$INFILE" | perl -ane '$_=~s/\s+$//g; print "$_\n";'
else
    ## default: histogram of the number of tab separated columns per line
    zless "$INFILE" | perl -an -F'/\t{1}/' -e 'print scalar(@F)."\n";' | sort | uniq -c
fi
## the temporary file, if any, is removed by the EXIT trap installed above