tabEdit

#!/bin/bash
#PBS -l nodes=1:ppn=4

## default 1-based column that holds the gene name; the -e option overrides it
GENECOL=4

#### usage ####
## Print the help text to stdout and terminate the script with exit status 0.
## Takes no arguments. Because it calls exit, callers that only want the text
## must run it in a subshell or a command substitution.
usage() {
    # One printf with one argument per output line keeps the text aligned and
    # avoids word-splitting of unquoted echo arguments.
    printf '%s\n' \
        'Program: tabEdit (edit or gather information about a tab delimited file)' \
        'Author: BRIC, University of Copenhagen, Denmark' \
        'Version: 1.0' \
        'Contact: pundhir@binf.ku.dk' \
        'Usage: tabEdit -i <file> [OPTIONS]' \
        'Options:' \
        ' -i <file>   [input tab delimited file (can be stdin)]' \
        ' -c          [count number of columns (default)]' \
        ' -r          [remove empty columns]' \
        ' -t          [convert spaces to tab]' \
        ' -m          [convert comma to tab]' \
        ' -n <int>    [reformat file to ensure input number of columns]' \
        '             [fill missing columns with NA]' \
        ' -f          [format input file into proper five column BED format]' \
        ' -a <file>   [input file in BED format containing gene coordinates]' \
        '             [used to add gene coordinate information to file]' \
        '             [assumes gene name in fourth column]' \
        ' -e <int>    [column containing gene name information (default: 4)]' \
        ' -h          [help]' \
        ''
    exit 0
}

#### parse options ####
## Parse the command-line flags into the global option variables that the
## rest of the script reads:
##   -i INFILE   -c COUNT   -r REMOVE   -t TAB   -m COMMA   -n NCOL
##   -f FORMATBED   -a ADDGENE   -e GENECOL   -h HELP
## Arguments: the script's positional parameters ("$@").
## An unknown option or a missing option argument (already reported on stderr
## by getopts) shows the usage text instead of being silently ignored.
parse_options() {
    # OPTIND is made local so the function can be called more than once.
    local opt OPTIND=1
    # Each letter appears exactly once; a trailing ':' marks options that take a value.
    while getopts i:crtmn:fa:e:h opt; do
        case "$opt" in
            i) INFILE=$OPTARG ;;
            c) COUNT=1 ;;
            r) REMOVE=1 ;;
            t) TAB=1 ;;
            m) COMMA=1 ;;
            n) NCOL=$OPTARG ;;
            f) FORMATBED=1 ;;
            a) ADDGENE=$OPTARG ;;
            e) GENECOL=$OPTARG ;;
            h) HELP=1 ;;
            *) usage ;;
        esac
    done
}

parse_options "$@"

## show the usage text (which exits) when no input file was given with -i
## or when help was requested with -h
if [[ -z "$INFILE" || -n "$HELP" ]]; then
    usage
fi

## create a temporary file holding the input if it is read from stdin

## Copy standard input byte-for-byte into a newly created temporary file and
## store the file's path in the global TMP.
## A plain cat is used instead of a read/echo loop so that leading and trailing
## tabs (empty edge columns), backslashes, lines such as "-n" and a final line
## without a newline all survive unchanged.
## Returns non-zero when the file cannot be created or written.
buffer_stdin() {
    TMP=$(mktemp "${TMPDIR:-/tmp}/tabEdit.XXXXXXXX") || return 1
    cat > "$TMP"
}

if [[ "$INFILE" == "stdin" ]]; then
    # Remove the temporary file on every exit path, including early failures.
    trap '[[ -n "$TMP" ]] && rm -f -- "$TMP"' EXIT
    if ! buffer_stdin; then
        echo "tabEdit: cannot copy stdin to a temporary file" >&2
        exit 1
    fi
    INFILE=$TMP
fi

## Stream INFILE to stdout. gzip -f combined with -c and -d copies data that
## is not gzip-compressed through unchanged, so plain and gzip-compressed
## inputs are both handled, and no pager needs to be installed.
read_input() {
    gzip -cdfq -- "$INFILE"
}

## Run the single action selected by the option flags and write the result to
## stdout. When several flags are set the first of -a, -f, -t, -m, -n, -r
## wins; with none of them the number of lines per column count is reported.
## Globals read: INFILE, ADDGENE, GENECOL, FORMATBED, TAB, COMMA, NCOL, REMOVE
## Values are handed to perl through the environment rather than spliced into
## the program text, so paths with spaces and odd values cannot alter the code.
run_action() {
    # Shared by -t and -m: re-join the fields of each non-comment line with
    # tabs and strip trailing whitespace; lines starting with '#' pass through.
    local retab='
        if ($_ =~ /^#/) { print $_; next; }
        $line = "";
        foreach (@F) { $line .= "$_\t"; }
        $line =~ s/\t+$//g;
        $line =~ s/\s+$//g;
        print "$line\n";'

    if [[ -n "$ADDGENE" ]]; then
        # For each input line, fetch the first line of the ADDGENE file in
        # which the gene name occurs surrounded by whitespace. zgrep is started
        # with an argument list (no shell), so the gene name is never
        # interpreted by a shell. With the gene name in column 1 the six BED
        # fields are prepended (unmatched lines are prefixed with '#');
        # otherwise "chr:start-end|strand" is appended (NA when unmatched).
        read_input | GENE_IDX=$((GENECOL - 1)) GENE_BED="$ADDGENE" perl -ane '
            chomp($_);
            $gene = $F[$ENV{GENE_IDX}];
            $match = "";
            if (open(my $fh, "-|", "zgrep", "-E", "-m", "1", "\\s+$gene\\s+", $ENV{GENE_BED})) {
                local $/;
                $match = <$fh>;
                $match = "" unless defined($match);
                close($fh);
            }
            $found = ($match !~ /^$/);
            @coor = split(/\s+/, $match) if ($found);
            if ($ENV{GENE_IDX} == 0) {
                if ($found) { print "$coor[0]\t$coor[1]\t$coor[2]\t$coor[3]\t$coor[4]\t$coor[5]\t$_\n"; }
                else { print "#$_\n"; }
            } else {
                if ($found) { print "$_\t$coor[0]:$coor[1]-$coor[2]|$coor[5]\n"; }
                else { print "$_\tNA\n"; }
            }'
    elif [[ -n "$FORMATBED" ]]; then
        # Drop comment lines, name the remaining lines peak_1, peak_2, ... and
        # use 1 as score when column 5 is not a positive number.
        read_input | perl -ane '
            next if ($_ =~ /^#/);
            $j++;
            if ($F[4] !~ /^[0-9\.]+$/ || $F[4] <= 0) { print "$F[0]\t$F[1]\t$F[2]\tpeak_$j\t1\t.\n"; }
            else { print "$F[0]\t$F[1]\t$F[2]\tpeak_$j\t$F[4]\t.\n"; }'
    elif [[ -n "$TAB" ]]; then
        read_input | perl -ane "$retab"
    elif [[ -n "$COMMA" ]]; then
        read_input | perl -an -F'/\,/' -e "$retab"
    elif [[ -n "$NCOL" ]]; then
        # Re-join whitespace separated fields with tabs and pad short lines
        # with NA up to NCOL columns.
        read_input | NCOL="$NCOL" perl -an -F'/\s+/' -e '
            print $F[0];
            foreach (@F[1..$#F]) { chomp($_); print "\t$_"; }
            for ($i = scalar(@F); $i < $ENV{NCOL}; $i++) { print "\tNA"; }
            print "\n";'
    elif [[ -n "$REMOVE" ]]; then
        # Strip trailing whitespace (i.e. trailing empty columns) from each line.
        read_input | perl -ne '$_ =~ s/\s+$//g; print "$_\n";'
    else
        # Default: how many lines have each number of tab-separated columns.
        read_input | perl -an -F'/\t{1}/' -e 'print scalar(@F)."\n";' | sort | uniq -c
    fi
}

## INFILE is always set at this point when the whole script runs; the guard
## only prevents the pipelines from being started without an input path.
if [[ -n "$INFILE" ]]; then
    run_action
fi

## remove the temporary file, if one was created for stdin input.
## The -f test skips a TMP value that does not name a regular file (for example
## a TMP directory variable inherited from the environment), which would
## otherwise make rm fail and the script end with a non-zero status.
if [[ -n "$TMP" && -f "$TMP" ]]; then
    rm -f -- "$TMP"
fi