-
Notifications
You must be signed in to change notification settings - Fork 0
/
pathways_genes.sh
executable file
·28 lines (19 loc) · 1.17 KB
/
pathways_genes.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
##################################
# This script contains the commands used to create all the files that store the gene sets
# of the child pathways that relate directly to the parent pathway, whose ID is specified on the
# file name.
# The parent pathway name can be found if the ID on the file name is looked up in the file
# Reactome_TopLevel_Pathways.tsv.
##################################
for id in $(gawk '{ if(NR>1){ gsub(/\.[[:digit:]]+/, "", $1); print $1 }}' Reactome_TopLevel_Pathways.tsv);
do child_ids=$(grep -E "$id" ReactomePathwaysRelation.txt | cut -f2);
for c_id in $child_ids;
do grep -E $c_id ReactomePathways.gmt >> gene_set_${id};
echo "Finished one child ID";
done;
echo "Finished one Parent ID";
done
# In case we wanted to add at the beginning of each gene_set_* file a header with the tuple (pathway_id, pathway_name)
# we just have to run the following
# Adding a -i flag to the last sed parameter to modify the files in-place
for i in $(ls gene_set_* | gawk '{gsub(/gene_set_/, "", $1); print}'); do line=$(grep -E $i Reactome_TopLevel_Pathways.tsv); header=$(echo $line | sed 's/ /\t/'); sed "1s/^/$header\n/" gene_set_${i}; done