-
Notifications
You must be signed in to change notification settings - Fork 0
/
03_genomic_annotation
149 lines (100 loc) · 4.15 KB
/
03_genomic_annotation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/bin/bash
## scripts to convert and annotate sites using genomation
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
-c|--chromolist)
chromolist="$2"
shift # past argument
shift # past value
;;
-o|--organism)
organism="$2"
shift # past argument
shift # past value
;;
-s|--sites)
sites="$2"
shift # past argument
shift # past value
;;
-g|--genome)
genome="$2"
shift # past argument
shift # past value
;;
*) # unknown option
echo "Unknown option: $1"
shift # past argument
;;
esac
done
# Check if any required arguments are missing
if [ -z "$chromolist" ] || [ -z "$organism" ] || [ -z "$sites" ] || [ -z "$genome" ]; then
echo "Usage: $0 -c|--chromolist <chromosome_list_file> -o|--organism <organism_name> -s|--sites <sites_file> -g|--genome <genome_file>"
exit 1
fi
#### get chromosomes as number
cat ${chromolist} | while read line
do
code=$(echo $line | cut -d " " -f1)
number=$(echo $line | cut -d " " -f2)
echo "s/${code}/${number}/"
done > chr.sed.script
### converting csv to bed ####
cat ${sites} | sed 's/,/\t/g' | awk 'NR > 1 {print $2,$3,$3}' | sed 's/ * /\t/g' | sed 's/"//g' > ${organism}.sites.bed
### converting gtf files to bed12 and right chromosome codes ####
./gtfToGenePred -ignoreGroupsWithoutExons ${genome} genes.pred
./genePredToBed genes.pred ${organism}.ann.bed
rm genes.pred
sed -f chr.sed.script ${organism}.ann.bed > ${organism}.coded.ann.bed
rm ${organism}.ann.bed
echo "file conversions done"
#### R part to annotate with genomation (building R script) ####
cat > genomation_run.r << EOF
## loading libraries
if (!requireNamespace("genomationData", quietly = TRUE)) install.packages("genomationData", repos = "https://cloud.r-project.org/")
if (!requireNamespace("genomation", quietly = TRUE)) install.packages("genomation", repos = "https://cloud.r-project.org/")
if (!requireNamespace("GenomicRanges", quietly = TRUE)) install.packages("GenomicRanges", repos = "https://cloud.r-project.org/")
if (!requireNamespace("readr", quietly = TRUE)) install.packages("readr", repos = "https://cloud.r-project.org/")
library(genomationData)
library(GenomicRanges)
library(readr)
library(genomation)
## loading data
## cpg sites
#coord = read.table("${organism}.sites.bed")
#cpgsites = readBed("${organism}.sites.bed")
coord = read.csv("${sites}")
cpgsites = readBed("${organism}.sites.bed")
## annotated genome
genomeann = readTranscriptFeatures("${organism}.coded.ann.bed")
## running annotation
annot.list = annotateWithGeneParts(cpgsites, genomeann)
## get distance from TSS and gene ids
tssdist = getAssociationWithTSS(annot.list)
position = getMembers(annot.list)
output <- cbind(coord,tssdist,position)
tooutput <- output[, c(1, 2, 5, 6, 7, 8, 9,10)]
write.table(tooutput, "${organism}_annotated_sites.txt", sep = "\t", row.names = FALSE, quote = FALSE)
EOF
module load R
Rscript genomation_run.r
echo "annotation done"
### getting transcript to gene information
cat ${genome} | cut -d "\"" -f2,4 | sed 's/\"/\t/g' | awk -v OFS="\t" '!/^#/ && NF == 2 {print $2,$1}' | sort | uniq > ${organism}_gene_transcript_info
## running association of transcripts to gene
cat > gene_names.py << EOF
import pandas as pd
existing_table = pd.read_table('${organism}_annotated_sites.txt')
info_to_append = pd.read_table('${organism}_gene_transcript_info', header=None, names=['Column1', 'Column2'])
merged_table = pd.merge(existing_table, info_to_append, left_on='feature.name', right_on='Column1', how='left')
merged_table.rename(columns={'Column2': 'gene.name'}, inplace=True)
merged_table.rename(columns={'V1': 'chromossome'}, inplace=True)
merged_table.rename(columns={'V2': 'site'}, inplace=True)
merged_table.drop(columns=['Column1'], inplace=True)
merged_table.to_csv('${organism}_annotated_sites_complete.txt', sep='\t', index=False)
EOF
echo "running final python steps"
python gene_names.py
echo "script done final annotation file : ${organism}_annotated_sites_complete.txt"