-
Notifications
You must be signed in to change notification settings - Fork 0
151 lines (114 loc) · 4.96 KB
/
ncbi_all.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Collects, for every annotated chromosome- or complete-level NCBI genome
# assembly under TAXID, the HTTPS URL of its GFF3 annotation file(s) and
# appends them to the tab-separated file named by OUTPUT (columns:
# accession, full_path). Accessions already listed in OUTPUT are skipped.
# The updated file is committed and pushed by the last step.
name: Retrieve NCBI GFF3 Paths

on:
  schedule:
    # Sundays at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 0 * * 0'
  workflow_dispatch:

jobs:
  retrieve-ncbi-gff3:
    runs-on: ubuntu-latest
    # The final step pushes a commit with GITHUB_TOKEN, which requires write
    # access to the repository contents.
    permissions:
      contents: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y lftp
          # Keep the NCBI CLI binaries outside the checkout so the push step
          # can never commit them, and expose them to later steps via PATH.
          tools_dir="$RUNNER_TEMP/ncbi-cli"
          mkdir -p "$tools_dir"
          cd "$tools_dir"
          base_url='https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64'
          # -f makes an HTTP error fail the step instead of silently saving
          # the error page as the "binary".
          curl -fsSL --retry 3 -O "$base_url/dataformat"
          curl -fsSL --retry 3 -O "$base_url/datasets"
          chmod +x datasets dataformat
          echo "$tools_dir" >> "$GITHUB_PATH"

      - name: Set Environment Variables
        run: |
          # Everything written to GITHUB_ENV becomes an ordinary environment
          # variable in all following steps.
          {
            # datasets
            echo "TAXID=2759"
            echo "FIELDS=accession"
            # ncbi ftp
            echo "FTP=ftp.ncbi.nlm.nih.gov"
            echo "FTP_URL=ftp://ftp.ncbi.nlm.nih.gov"
            echo "DIR=/genomes/all"
            # tmp files (in the runner temp dir, so they are never committed)
            echo "PATHS=$RUNNER_TEMP/paths.tsv"
            echo "EXISTING_ACCESSIONS=$RUNNER_TEMP/existing_accessions.tsv"
            echo "NCBI_ACCESSIONS=$RUNNER_TEMP/ncbi_accessions.tsv"
            echo "DIRS=$RUNNER_TEMP/directories.txt"
            # output (relative to the checkout; this is the committed file)
            echo "OUTPUT=ncbi.tsv"
          } >> "$GITHUB_ENV"

      - name: Get existing Assemblies or Initialize Output File
        run: |
          # Initialize the output file for FTP paths if it doesn't exist
          if [[ ! -f "$OUTPUT" ]]; then
            printf 'accession\tfull_path\n' > "$OUTPUT"
          fi
          # Save existing accessions to a file, skipping the header
          awk 'NR > 1 {print $1}' "$OUTPUT" | sort -u > "$EXISTING_ACCESSIONS"

      - name: Get NCBI Assemblies
        run: |
          # Without pipefail only the exit status of `tail` counts, so a
          # failed `datasets` call would look like "no assemblies".
          set -o pipefail
          datasets summary genome taxon "$TAXID" \
            --assembly-level chromosome,complete --annotated --as-json-lines |
            dataformat tsv genome --fields "$FIELDS" |
            tail -n +2 > "$NCBI_ACCESSIONS"
          cat "$NCBI_ACCESSIONS"

      - name: Set FTP paths
        run: |
          # Always create the file so later steps can read it even when
          # every accession is already known.
          : > "$PATHS"
          while read -r accession; do
            [[ -z "$accession" ]] && continue
            # Skip if accession exists (-x: compare whole lines, so one
            # accession can never match as a part of another)
            grep -Fxq -- "$accession" "$EXISTING_ACCESSIONS" && continue
            # e.g. GCF_000001405.40 -> <FTP_URL><DIR>/GCF/000/001/405
            base_path="${FTP_URL}${DIR}/${accession:0:3}/${accession:4:3}/${accession:7:3}/${accession:10:3}"
            printf '%s\t%s\n' "$accession" "$base_path" >> "$PATHS"
          done < "$NCBI_ACCESSIONS"

      - name: Get FTP Directories
        run: |
          # Always create the file: the next step reads it unconditionally.
          : > "$DIRS"
          # Nothing to look up when no new accession was found.
          if [[ ! -s "$PATHS" ]]; then
            echo "No new paths found. Exiting."
            exit 0
          fi
          # Loop through base FTP paths and capture the directories
          while read -r accession base_path; do
            echo "Searching for directory in $base_path"
            # Server-side path: the base path without the scheme and host.
            remote_dir="${base_path#"$FTP_URL"}"
            # `|| true`: the step runs under `bash -e`, and a failed lookup
            # must only skip this accession, not abort the whole job.
            # stdin is detached so lftp cannot consume the loop's input.
            dir_to_cd=$(lftp -c "
              set net:timeout 10
              set net:max-retries 3
              open $FTP
              cd $remote_dir
              cls -1 ${accession}* | head -n 1
              quit
            " < /dev/null || true)
            # Strip a trailing classification marker (@ or /) from the entry
            dir_to_cd="${dir_to_cd%[@/]}"
            # If no directory is found, skip to the next accession
            if [[ -z "$dir_to_cd" ]]; then
              echo "No directory for $accession, skipping..."
              continue
            fi
            echo "Found directory: $dir_to_cd"
            # Save accession and directory to file
            printf '%s\t%s\n' "$accession" "${base_path}/${dir_to_cd}" >> "$DIRS"
          done < "$PATHS"

      - name: Search for .gff.gz Files
        run: |
          # Loop through the directories and search for .gff.gz files
          while read -r accession dir_to_cd; do
            echo "Searching for .gff.gz files in $dir_to_cd"
            remote_dir="${dir_to_cd#"$FTP_URL"}"
            # List matching files, then emit "<accession>\t<https url>" for
            # each. sub() is used instead of gensub() because gensub() is a
            # gawk extension and is not available in other awk variants.
            lftp -c "
              set net:timeout 10
              set net:max-retries 3
              open $FTP
              cd $remote_dir
              cls -1 ${accession}*.gff.gz
              quit
            " < /dev/null | awk -v acc="$accession" -v dir="$dir_to_cd" '
              BEGIN { sub(/^ftp:\/\//, "https://", dir) }
              /\.gff\.gz$/ { print acc "\t" dir "/" $NF }
            ' >> "$OUTPUT"
          done < "$DIRS"

      - name: Commit & Push changes
        # NOTE(review): this third-party action is referenced by a mutable
        # branch; pin it to a reviewed release tag or commit SHA.
        uses: actions-js/push@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}