-
Notifications
You must be signed in to change notification settings - Fork 2
/
nanoPARE_setup.sh
executable file
·189 lines (155 loc) · 6.29 KB
/
nanoPARE_setup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#!/bin/bash
function usage() {
cat <<HELP
### nanoPARE Setup ###
Usage: ./nanoPARE_setup.sh [options]
Sets up a collection of files required to run the nanoPARE tools:
1) Writes a length.table recording the length of each chromosome in the genome FASTA
2) Generates a STAR genome index using the reference genome and transcriptome
3) Writes a BED file of putative TSO strand invasion sites based on mask_sequences.table
4) Writes a transcriptome FASTA file by combinding -G and -A
5) Parses reference annotations to identify 5'-terminal exons and single-exon transcripts
The genome index is written to temp/genome_index.
Other setup files are added to the resources/ directory.
Optional arguments:
-R | --reference Reference table (default: resources/reference.table)
-G | --genome Genome FASTA file (default: resources/genome.fasta)
-A | --annotation Transcript GFF file (default: resources/annotation.gff)
--lmod Load required modules with Lmod (default: false)
--ram Amount of available RAM in gigabytes (default: 30)
--cpus Number of cores available for multithreaded programs (default: 1)
--gtf_gene_tag Passes to STAR --sjdbGTFtagExonParentGene (default: gene_id)
--gtf_transcript_tag Passes to STAR --sjdbGTFtagExonParentTranscript (default: transcript_id)
All steps of this pipeline access the default files for -R, -G, and -A, respectively.
You can replace these 3 items in the resources folder for simplicity.
HELP
}
################
# CONFIG SETUP #
################
# Storing all default global environment variables
if [ -z "$root_dir" ]
then
root_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Temporary $root_dir relative to this script's location if it isn't already in the environment
fi
bash_dir=$root_dir/scripts/bash_scripts
python_dir=$root_dir/scripts/python_scripts
resource_dir=$root_dir/resources
temp_dir=$root_dir/temp
log_dir=$root_dir/log
results_dir=$root_dir/results
genome_index_dir=$temp_dir/genome_index
. $bash_dir/read_cmdline.sh
# Set defaults for variables if they are not already in the environment
if [ -z "$GENOME_FASTA" ]
then
GENOME_FASTA=$resource_dir/genome.fasta # If not already in environment, set as default value
fi
if [ -z "$GTF_TRANSCRIPT_TAG" ]
then
GTF_TRANSCRIPT_TAG="transcript_id"
fi
if [ -z "$GTF_GENE_TAG" ]
then
GTF_GENE_TAG="gene_id"
fi
if [ -z "$REFERENCE_TABLE" ]
then
REFERENCE_TABLE=$resource_dir/reference.table
fi
if [ -z "$ANNOTATION_GFF" ]
then
ANNOTATION_GFF=$resource_dir/annotation.gff
fi
if [ -z "$LMOD" ]
then
LMOD=0
fi
if [ -z "$CPUS" ]
then
CPUS=1
fi
. $bash_dir/list_settings.sh
# Environment modules to load with Lmod (if option --lmod is passed)
REQUIRED_MODULES=( --rna-star --bedtools --python2018 )
. $bash_dir/load_modules.sh
cd $root_dir
echo "####################################"
echo "### SETUP: GENERATE GENOME INDEX ###"
echo "####################################"
# A table of chromosome names and lengths derived from the input FASTA genome file
length_table_command="python $python_dir/fasta_lengths.py $GENOME_FASTA > $resource_dir/length.table"
echo "$length_table_command"
eval "$length_table_command"
echo "Length table generated."
echo " "
length_array=($(cut -f 2 $resource_dir/length.table | tr '\n' ' '))
genome_length=$(echo ${length_array[@]} | tr ' ' '+' | bc)
index_base_number=$(echo "l($genome_length)/l(2)/2 -1" | bc -l | cut -d '.' -f 1)
if [ $index_base_number -gt 14 ]
then
index_base_number=14
fi
# Genome index required by STAR for short read alignments
rm -rf $genome_index_dir
mkdir -p $genome_index_dir
genome_index_command="STAR \
--runThreadN $CPUS \
--runMode genomeGenerate \
--genomeDir $genome_index_dir \
--outFileNamePrefix $log_dir/ \
--genomeFastaFiles $GENOME_FASTA \
--genomeSAindexNbases $index_base_number \
--sjdbGTFfile $ANNOTATION_GFF \
--sjdbGTFtagExonParentTranscript $GTF_TRANSCRIPT_TAG \
--sjdbGTFtagExonParentGene $GTF_GENE_TAG"
echo "$genome_index_command"
# eval "$genome_index_command"
echo "Genome index complete."
echo " "
echo "### GENERATE TSO MASKING FILES ###"
#python $python_dir/fasta_sequence_search.py \
# $GENOME_FASTA \
# $resource_dir/mask_sequences.table \
# -O $resource_dir
echo "Masking BED files generated."
echo " "
echo "Generating transcriptome FASTA file."
#python $python_dir/gtf_to_fasta.py \
# -G $GENOME_FASTA \
# -A $ANNOTATION_GFF \
# > $resource_dir/transcriptome.fasta
echo "### GENERATE ANNOTATION CLASS REFERENCE FILES ###"
cd $resource_dir
echo " Getting transcript-level exons"
grep -P "exon\t" $ANNOTATION_GFF |\
sed 's/\t[^\t]*transcript_id=\([^;]*\);.*/\t\1\t/' |\
sed 's/Parent=\(transcript:\)\?\([^;]*\).*/\2/' |\
sort -k 9 > class.exons_by_transcript.gff
echo " Getting 5'-most exons"
python $python_dir/bed_deduplicate.py\
-F 8 -S upstream --startline 3 --endline 4 --strandline 6 class.exons_by_transcript.gff > class.terminal_exons_by_transcript.gff
sed 's/\.[0-9]\{1,\}$//' class.terminal_exons_by_transcript.gff | sort -k 9 > class.terminal_exons_by_gene.gff
echo " Converting transcript-level to gene-level annotations"
grep -P "exon\t" $ANNOTATION_GFF | sed 's/gene_id=\([^;]*\);.*/\1\t/' | sed 's/Parent=\(transcript:\)\?\([^;\.]*\).*/\2/' | sort -k 9 > class.exons_by_gene.gff
echo " Reformatting to GFF files to BED files"
bedtools sort -i class.exons_by_gene.gff |\
awk '{ printf $1"\t"$4-1"\t"$5"\t"$9"\t0\t"$7"\n" }' |\
bedtools merge -s -c 4,5,6 -o distinct,sum,first -delim ";" |\
sed 's/;[^\t]*//' |\
bedtools sort | sort -k 4 > class.exons_by_gene.geneorder.bed
bedtools sort -i class.exons_by_gene.geneorder.bed > class.exons_by_gene.bed
echo " Reformatting to GFF files to BED files"
bedtools sort -i class.terminal_exons_by_gene.gff |\
awk '{ printf $1"\t"$4-1"\t"$5"\t"$9"\t0\t"$7"\n" }' |\
bedtools merge -s -c 4,5,6 -o distinct,sum,first -delim ";" |\
sed 's/;[^\t]*//' |\
bedtools sort > class.terminal_exons_by_gene.bed
python $python_dir/bed_deduplicate.py \
--only_unique class.exons_by_gene.geneorder.bed -F 3 |\
bedtools sort > class.single_exon_genes.bed
echo " Cleaning up temporary files"
rm class.terminal_exons_by_gene.gff class.exons_by_gene.gff class.exons_by_gene.geneorder.bed
rm class.terminal_exons_by_transcript.gff class.exons_by_transcript.gff
echo "Setup complete."
exit 0