multiClassMotifAna

#!/bin/bash
#PBS -l nodes=1:ppn=4

## ---- built-in defaults; each one can be overridden by a command-line switch ----

## genome assembly (-g) and PWM library of known TFBS motifs (-s)
GENOME='mm9'
TFBS_FILE='/localhome/bric/xfd783/software/homer/data/knownTFs/vertebrates/jaspar_uniprobe_jolma.motifs'

## execution and region geometry
PROCESSOR=1       ## number of processors (-r)
SIZE=200          ## size of region used for motif finding (-d)
MIN_N_CLASS=1     ## minimum number of elements a class needs to be analysed (-x)

## motif discovery settings; "recommended" values are those of Andersson et al. 2014
NMOTIFS=50                          ## top motifs analysed from each sample (-c)
LENGTH='7,8,9,10,11,12,13,14'       ## motif lengths (-t); recommended 7,8,9,10,11,12,13,14
PVALUE='1e-15'                      ## p-value (-p); recommended 1e-15
MIN_P_T=3                           ## minimum target percentage (-y); recommended 3
B=100                               ## maximum background percentage (-z)
                                    ## NOTE(review): the original note recommends 30, yet the default is 100 - confirm
S=100                               ## motifs optimised per motif length (-n); recommended 100

## filters forwarded as compareMotifs.pl arguments
REDUCE_THRESHOLD='0.75'             ## -reduceThreshold; recommended 0.75
INFO='1.5'                          ## -info
#MIN_T=50 ## argument in compareMotifs.pl (-minT); unset unless -w is given

#### usage ####
## Print the command-line help to stdout and terminate the script.
## The exit status is always 0, also when the help is shown because of
## missing or invalid arguments.
usage() {
    # quoted delimiter: the text below is emitted literally, nothing is expanded
    cat <<'USAGE_TEXT'
Program: multiClassMotifAna (perform motif analysis on genomic regions from multiple classes)
Author: BRIC, University of Copenhagen, Denmark
Version: 1.0
Contact: pundhir@binf.ku.dk
Usage: multiClassMotifAna -i <file> -o <dir> [OPTIONS]
Options:
 -i <file>   [input file containing gene list(s) (can be stdin)]
             [format]
             => chr start end name score strand class
 -o <dir>    [output directory]
[OPTIONS]
 -s <file>   [input file having PWM of TFBS motifs]
             [default: /localhome/bric/xfd783/software/homer/data/knownTFs/vertebrates/jaspar_uniprobe_jolma.motifs]
 -g <string> [genome for which to perform the analysis (mm9, hg19, mm10, hg38, danRer7; default: mm9)]
 -r <int>    [number of processors to use (default: 1)]
 -c <int>    [number of top motifs to analyze from each sample (default: 50)]
 -t <string> [motif length (default: 7,8,9,10,11,12,13,14)]
 -u <float>  [similarity threshold used to remove similar motifs (default: 0.75; **OBSOLETE**)]
 -v <float>  [remove motifs with information content less than # (default: 1.5; **OBSOLETE**)]
 -w <int>    [remove motifs with less than # number of target instances (default: not used; **OBSOLETE**)]
 -p <float>  [p-value (default: 1e-15)]
 -y <float>  [remove motifs with target percentage less than # (default: 3)]
 -z <float>  [remove motifs with background percentage greater than # (default: 100)]
 -n <int>    [number of motifs to optimize for each motif length (default: 100)]
 -b <file>   [custom background file in bed format]
             [can also be a class identifier, where class is the seventh column (sixth, if strand is omitted) in input bed file (-i)]
 -x <int>    [minimum number of elements within each region class to plot (default: 1)]
 -d <int>    [size of region (default: 200bp)]
             [The size of the region used for motif finding is important.  If analyzing ChIP-Seq peaks from a transcription factor,]
             [Chuck would recommend 50 bp for establishing the primary motif bound by a given transcription factor and 200 bp for  ]
             [finding both primary and 'co-enriched' motifs for a transcription factor.  When looking at histone marked regions,   ]
             [500-1000 bp is probably a good idea (i.e. H3K4me or H3/H4 acetylated regions).  In theory, HOMER can work with very  ]
             [large regions (i.e. 10kb), but with the larger the regions comes more sequence and longer execution time.  These     ]
             [regions will be based off the center of the peaks.  If you prefer an offset, you can specify '-d -300,100' to     ]
             [search a region of size 400 that is centered 100 bp upstream of the peak center (useful if doing motif finding on    ]
             [putative TSS regions).  If you have variable length regions, use the option '-d given' and HOMER will use the     ]
             [exact regions that were used as input.]
 -l <string> [name of motifs that must be included in the final output]
             [if multiple, separate them by a comma]
 -h          [help]

USAGE_TEXT
    exit 0
}

#### parse options ####
## Consume the command-line switches. Every switch except -h carries a value
## that is stored in the matching global; switches not listed here fall
## through the case statement without any effect.
while getopts i:o:s:g:r:c:t:u:v:w:p:y:z:n:b:x:d:l:h ARG
do
    case "$ARG" in
        # mandatory input and output
        i) BEDFILE="$OPTARG" ;;
        o) OUTDIR="$OPTARG" ;;
        # genome, motif library, resources
        g) GENOME="$OPTARG" ;;
        s) TFBS_FILE="$OPTARG" ;;
        r) PROCESSOR="$OPTARG" ;;
        # motif discovery and filtering
        c) NMOTIFS="$OPTARG" ;;
        t) LENGTH="$OPTARG" ;;
        u) REDUCE_THRESHOLD="$OPTARG" ;;
        v) INFO="$OPTARG" ;;
        w) MIN_T="$OPTARG" ;;
        p) PVALUE="$OPTARG" ;;
        y) MIN_P_T="$OPTARG" ;;
        z) B="$OPTARG" ;;
        n) S="$OPTARG" ;;
        # background (file or class keyword), class size filter, region size
        b) BKG_FILE="$OPTARG" ;;
        x) MIN_N_CLASS="$OPTARG" ;;
        d) SIZE="$OPTARG" ;;
        # motifs that must appear in the final output
        l) MUST_INCLUDE_MOTIF="$OPTARG" ;;
        # help flag, acted upon once parsing is finished
        h) HELP=1 ;;
    esac
done

## Show the help text (usage terminates the script) when -h was given or when
## one of the mandatory arguments, -i (input) or -o (output directory), is missing.
## [[ ... || ... ]] is used instead of the ambiguous multi-operand "[ ... -o ... ]".
if [[ -z "$BEDFILE" || -z "$OUTDIR" || -n "$HELP" ]]; then
	usage
fi

###################
#helperfunction
# Block until every background job of the current shell has finished.
# The process id of each job is printed before it is waited for; afterwards
# the message given as $1 is printed verbatim (an empty line if omitted).
# Arguments: $1 - message to print once all jobs are done
wait_for_jobs_to_finish() {
    local job
    # process ids contain no whitespace, so word splitting of the list is intended
    for job in $(jobs -p); do
        echo "$job"
        wait "$job"
    done
    # quoted: keeps the spacing of the message and prevents glob expansion
    printf '%s\n' "${1-}"
}
###############

<<"COMMENT1"
COMMENT1
## Make sure the output directory exists before anything is written into it.
echo -n "Create directory structure... "
if [[ -n "$OUTDIR" && ! -d "$OUTDIR" ]]; then
    # stop right here on failure: every later step writes below $OUTDIR
    if ! mkdir -p -- "$OUTDIR"; then
        echo "cannot create output directory: $OUTDIR" >&2
        exit 1
    fi
fi
echo "done"

## resolve_genome_annotations <genome>
## Set the globals GENOME_FILE, REPEAT_FILE and GENOME_MOTIF for a supported
## assembly (mm9, mm10, hg19, hg38, danRer7) and return 0. For any other value
## a message naming the supported assemblies is printed and 1 is returned.
resolve_genome_annotations() {
    local assembly=$1
    local species
    case "$assembly" in
        mm9|mm10)  species="mouse" ;;
        hg19|hg38) species="human" ;;
        danRer7)   species="zebrafish" ;;
        *)
            echo "Presently the program only supports analysis for mm9, hg19, mm10, hg38 or danRer7"
            echo
            return 1
            ;;
    esac
    # all annotation files follow the pattern <species>.<assembly>.<suffix>
    GENOME_FILE="/scratch/genomes/annotations/SIZE/${species}.${assembly}.genome"
    REPEAT_FILE="/scratch/genomes/annotations/REPEATS/${species}.${assembly}.simpleRepeat.gz"
    GENOME_MOTIF="${assembly}r"
}

echo -n "Populating files based on input genome, $GENOME ($(date)).. "
if ! resolve_genome_annotations "$GENOME"; then
    # unsupported assembly: show the help text, which terminates the script
    usage
fi
echo "done"

## determine, if the input genes are from a file or stdin
echo -n "Create gene file depending on if the input is from file or STDIN (`date`).. "
if [ -f "$BEDFILE" ]; then
    zless $BEDFILE | perl -ane '$line=(); foreach(@F) { chomp($_); $line.="$_\t"; } $line=~s/\t$//g; print "$line\n";' > $OUTDIR/BEDFILE_INTEREST.TXT
elif [ "$BEDFILE" == "stdin" ]; then
    while read LINE; do echo ${LINE}; done | perl -ane '$line=(); foreach(@F) { chomp($_); $line.="$_\t"; } $line=~s/\t$//g; print "$line\n";' > $OUTDIR/BEDFILE_INTEREST.TXT
else
    usage
fi

echo "Initiate motif analysis ($(date)).. "

## split_by_class_keyword <bed> <keyword> <target_out> <background_out>
## Write every line of <bed> that contains <keyword> as a whole word to
## <background_out> and all remaining lines to <target_out>. The keyword is
## taken literally (grep -F), so characters such as "." are not wildcards.
split_by_class_keyword() {
    grep -F -w -v -- "$2" "$1" > "$3"
    grep -F -w -- "$2" "$1" > "$4"
}

## -b given but not an existing file: treat it as a class keyword and use the
## regions carrying that keyword as background for all other regions
if [[ -n "$BKG_FILE" && ! -f "$BKG_FILE" ]]; then
    echo -e "Reformatting input bed file as -b is a keyword ($(date)).. "
    # random name component; LC_ALL=C keeps tr from choking on arbitrary bytes
    TMP=$(LC_ALL=C tr -dc 'a-zA-Z0-9' < /dev/urandom | head -c 32)
    split_by_class_keyword "$OUTDIR/BEDFILE_INTEREST.TXT" "$BKG_FILE" "$OUTDIR/$TMP.target" "$OUTDIR/$TMP.background"
    if [[ ! -s "$OUTDIR/$TMP.background" ]]; then
        # no region carries the keyword: most likely a mistyped path or class name
        echo "-b '$BKG_FILE' is neither an existing file nor a class found in the input" >&2
        rm -f -- "$OUTDIR/$TMP.target" "$OUTDIR/$TMP.background"
        exit 1
    fi
    mv -- "$OUTDIR/$TMP.target" "$OUTDIR/BEDFILE_INTEREST.TXT"
    BKG_FILE="$OUTDIR/$TMP.background"
    echo "done"
fi

MOTIFDIR_ALL=""
CLASS_COUNT=0

## count_non_strand_values <bed>
## Print the number of distinct values in column 6 of the data rows (rows whose
## second column is an integer) that are not exactly "+", "-" or ".".
## A result above 0 means column 6 does not hold strand information.
count_non_strand_values() {
    awk '$2 ~ /^[0-9]+$/ { print $6 }' "$1" | sort -u | grep -v -c -x '[+.-]'
}

## list_classes <bed> <class_column> <min_count>
## Print, one per line, every class label found in <class_column> of the data
## rows that occurs at least <min_count> times.
list_classes() {
    awk -v col="$2" '$2 ~ /^[0-9]+$/ && ($col "") != "" { print $col }' "$1" \
        | sort | uniq -c \
        | awk -v min="$3" '$1 >= min { print $2 }'
}

## extract_class <bed> <class_column> <class>
## Print the lines of <bed> whose <class_column> equals <class>. The comparison
## is an exact string match: the label is never interpreted as a pattern and
## the appended "" keeps awk from comparing numerically.
extract_class() {
    CLASS_LABEL="$3" awk -v col="$2" '($col "") == (ENVIRON["CLASS_LABEL"] "")' "$1"
}

## Date: Aug 2, 2019; changed input file requirement to ensure strand information is provided. If missing, motif position as histogram was wrong in analysis using motifAna
STRAND_NOTGIVEN=$(count_non_strand_values "$OUTDIR/BEDFILE_INTEREST.TXT")
## without a strand column the class label sits in column 6, otherwise in column 7
if (( STRAND_NOTGIVEN > 0 )); then
    CLASS_COL=6
else
    CLASS_COL=7
fi

## one motifAna run per class, all started in the background.
## The class list is read from file descriptor 3 so that nothing inside the
## loop can consume it by accident.
while IFS= read -r -u 3 CLASS; do
    # directory-friendly identifier: "." becomes "other", commas become underscores
    if [[ "$CLASS" == "." ]]; then
        ID="other"
    else
        ID=${CLASS//,/_}
    fi
    extract_class "$OUTDIR/BEDFILE_INTEREST.TXT" "$CLASS_COL" "$CLASS" > "$OUTDIR/enhancer_$ID.bed"

    MOTIFDIR="$OUTDIR/$ID"
    mkdir -p -- "$MOTIFDIR"

    MOTIFANA_ARGS=(-i "$OUTDIR/enhancer_$ID.bed" -o "$MOTIFDIR" -m 1 -t "$LENGTH"
                   -u "$REDUCE_THRESHOLD" -v "$INFO" -x "$PVALUE" -y "$MIN_P_T"
                   -n "$S" -g "$GENOME" -p "$PROCESSOR" -s "$TFBS_FILE")
    # custom background only when one was supplied with -b
    if [[ -n "$BKG_FILE" ]]; then
        MOTIFANA_ARGS+=(-b "$BKG_FILE")
    fi
    MOTIFANA_ARGS+=(-d "$SIZE")
    # 3<&- : the background job must not inherit the class-list descriptor
    motifAna "${MOTIFANA_ARGS[@]}" &> "$MOTIFDIR/$ID.log" 3<&- &

    CLASS_COUNT=$((CLASS_COUNT+1))
    MOTIFDIR_ALL="$MOTIFDIR_ALL,$MOTIFDIR"
done 3< <(list_classes "$OUTDIR/BEDFILE_INTEREST.TXT" "$CLASS_COL" "$MIN_N_CLASS")
## barrier: continue only after every motifAna run has finished
wait

## Differential motif enrichment across classes with motifDynAna; it is only
## started when more than one class went through motifAna.
if (( CLASS_COUNT > 1 )); then
    # the list was built as ",dir1,dir2,..." - drop the leading comma
    MOTIFDIR_ALL=${MOTIFDIR_ALL#,}

    # arguments are collected in an array so that each value stays one word
    DYNANA_ARGS=(-i "$MOTIFDIR_ALL" -o "$OUTDIR/motifDynAna" -g "$GENOME" -p "$PROCESSOR"
                 -c "$NMOTIFS" -t "$LENGTH" -u "$REDUCE_THRESHOLD" -v "$INFO" -x 1.1)

    if [[ -n "$MUST_INCLUDE_MOTIF" ]]; then
        # motifs requested with -l are extracted from the PWM library and passed on
        extractMotifs -i "$TFBS_FILE" -j "$MUST_INCLUDE_MOTIF" -g "$GENOME" > "$OUTDIR/MUST_INCLUDE.MOTIFS"
        DYNANA_ARGS+=(-l "$MUST_INCLUDE_MOTIF" -M "$OUTDIR/MUST_INCLUDE.MOTIFS")
    fi

    # custom background only when one was supplied with -b
    if [[ -n "$BKG_FILE" ]]; then
        DYNANA_ARGS+=(-b "$BKG_FILE")
    fi

    motifDynAna "${DYNANA_ARGS[@]}" &> "$OUTDIR/motifDynAna.log"
else
    echo "Only one class is found. Not performing differential motif enrichment analysis using motifDynAna"
fi
echo "done"