# map_tables.yml
---
# Workflow: map NCBI / Ensembl annotation files to NCBI assembly metadata.
#
# On every push to main (or a manual dispatch) the job:
#   1. downloads the NCBI `datasets` and `dataformat` command-line tools,
#   2. merges the data rows of ncbi.tsv and ensembl_rapid_release.tsv
#      (column 1 = accession, column 2 = full path of the annotation file),
#   3. keeps only annotations whose name is not yet in mapped_annotations.tsv,
#   4. fetches assembly metadata for those accessions,
#   5. appends the new rows to mapped_annotations.tsv and pushes the change.
#
# Tools and intermediate tables live under $RUNNER_TEMP so that the final
# commit step can only ever pick up mapped_annotations.tsv.
name: Map NCBI and Ensembl files

on:
  push:
    branches: ["main"]
  workflow_dispatch:

jobs:
  map-files:
    runs-on: ubuntu-latest
    # The last step pushes a commit with GITHUB_TOKEN, which needs write
    # access to repository contents.
    permissions:
      contents: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Dependencies
        run: |
          # Install outside the workspace so the binaries are never committed.
          tools_dir="$RUNNER_TEMP/ncbi-tools"
          mkdir -p "$tools_dir"
          base_url='https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64'
          for tool in datasets dataformat; do
            # --fail: stop on HTTP errors instead of saving an error page
            # as the "binary".
            curl --fail --silent --show-error --location --retry 3 \
              --output "$tools_dir/$tool" "$base_url/$tool"
          done
          chmod +x "$tools_dir/datasets" "$tools_dir/dataformat"
          # Make both tools callable by name in the following steps.
          echo "$tools_dir" >> "$GITHUB_PATH"

      - name: Set Environment Variables
        run: |
          # Metadata columns requested from `dataformat tsv genome`.
          fields=accession,assminfo-name,organism-name,organism-tax-id,annotinfo-name,annotinfo-method,annotinfo-featcount-gene-total,annotinfo-pipeline,annotinfo-provider,annotinfo-release-date
          # Output header: the two "\t" escapes stay literal here and are
          # expanded by `echo -e` when the file is initialised; the field
          # names are joined with real tab characters by `tr`.
          header="annotation_name\tfull_path\t$(echo "$fields" | tr ',' '\t')"
          {
            # datasets
            echo "FIELDS=$fields"
            # input files
            echo "NCBI=ncbi.tsv"
            echo "ENSEMBL_RR=ensembl_rapid_release.tsv"
            # assembly info (intermediate)
            echo "ASSEMBLY_TABLE=$RUNNER_TEMP/assemblies_table.tsv"
            # output file header
            echo "HEADER=$header"
            # output file name
            echo "OUTPUT=mapped_annotations.tsv"
            # tmp files
            echo "MERGED_TABLE=$RUNNER_TEMP/merged_table.tsv"
            echo "NEW_ANNOTATIONS=$RUNNER_TEMP/new_annotations.tsv"
          } >> "$GITHUB_ENV"

      - name: Init Output File
        run: |
          # Create the output table with its header on the first run only.
          if [[ ! -f "$OUTPUT" ]]; then
            echo -e "$HEADER" > "$OUTPUT"
          fi

      - name: Collect and Filter Data from TSVs
        run: |
          # Concatenate the data rows (header skipped) of both inputs and
          # write header-less rows of the form:
          #   annotation_name <TAB> accession <TAB> full_path
          # annotation_name is the file name of the path with its .gff.gz or
          # .gff3.gz suffix removed; it is left empty when the path does not
          # end in one of those suffixes. Only POSIX awk features are used so
          # the script does not depend on gawk being the default awk.
          { tail -n +2 "$NCBI"; tail -n +2 "$ENSEMBL_RR"; } | awk -F'\t' '{
            name = ""
            if ($2 ~ /\.gff3?\.gz$/) {
              name = $2
              sub(/^.*\//, "", name)         # drop the directory part
              sub(/\.gff3?\.gz$/, "", name)  # drop the extension
            }
            print name "\t" $1 "\t" $2
          }' > "$MERGED_TABLE"

      - name: Filter Out Existing Annotations
        id: filter
        run: |
          # Keep merged rows whose annotation name is not already present in
          # the output table. Only the output table has a header line; the
          # merged table does not, so every one of its rows is a candidate.
          awk -F'\t' '
            FILENAME == ARGV[1] { if (FNR > 1) known[$1]; next }
            !($1 in known)
          ' "$OUTPUT" "$MERGED_TABLE" > "$NEW_ANNOTATIONS"
          # Publish the result so the metadata steps can be skipped cleanly
          # when there is nothing to add.
          if [[ -s "$NEW_ANNOTATIONS" ]]; then
            echo "has_new=true" >> "$GITHUB_OUTPUT"
          else
            echo "Output file is empty. Exiting.."
            echo "has_new=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Get New Assemblies NCBI Metadata
        if: steps.filter.outputs.has_new == 'true'
        run: |
          # Without pipefail a failing `datasets` call would be hidden by a
          # successful `dataformat` at the end of the pipe.
          set -o pipefail
          accessions="$RUNNER_TEMP/accessions.txt"
          # Column 2 of the new-annotations table is the assembly accession;
          # query each accession once.
          awk -F'\t' '$2 != "" { print $2 }' "$NEW_ANNOTATIONS" |
            sort -u > "$accessions"
          datasets summary genome accession --inputfile "$accessions" --as-json-lines |
            dataformat tsv genome --fields "$FIELDS" > "$ASSEMBLY_TABLE"

      - name: Insert new annotations
        if: steps.filter.outputs.has_new == 'true'
        run: |
          # Join new annotations with the assembly metadata on accession and
          # append them in header order:
          #   annotation_name, full_path, <assembly fields starting at accession>
          # The assembly table has a header line (skipped); the
          # new-annotations table has none. Annotations whose accession has
          # no metadata row are not written.
          # NOTE(review): rows appended by earlier versions of this workflow
          # used a different column order (full_path last) - regenerate or
          # migrate them if the file already contains data.
          awk -F'\t' -v OFS='\t' '
            FILENAME == ARGV[1] { if (FNR > 1) assembly[$1] = $0; next }
            ($2 in assembly)    { print $1, $3, assembly[$2] }
          ' "$ASSEMBLY_TABLE" "$NEW_ANNOTATIONS" >> "$OUTPUT"

      - name: Commit & Push changes
        # NOTE(review): this action is referenced by a mutable branch; pin it
        # to a release tag or commit SHA once a version has been vetted.
        uses: actions-js/push@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          # Push back to the branch the workflow ran on instead of relying on
          # the action's default target branch.
          branch: ${{ github.ref_name }}