# Retrieve NCBI GFF3 Paths #31
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Looks up NCBI genome assemblies for a taxon and records the HTTPS location
# of each assembly's .gff.gz annotation file in a TSV that is committed back
# to the repository.
name: Retrieve NCBI GFF3 Paths

on:
  schedule:
    # Weekly, Sundays at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 0 * * 0'
  # Also allow manual runs from the Actions tab.
  workflow_dispatch:

# The last step pushes a commit using GITHUB_TOKEN, which requires write
# access to repository contents; the token may default to read-only.
permissions:
  contents: write

jobs:
  retrieve-ncbi-gff3:
    runs-on: ubuntu-latest
    steps:
# Check out the repository so the existing results file is available and the
# updated one can be committed at the end of the job.
- name: Checkout code
  # v3 runs on the deprecated Node 16 runtime; v4 is its drop-in successor.
  uses: actions/checkout@v4
# Install lftp and download the NCBI Datasets command-line tools into the
# working directory.
- name: Install Dependencies
  run: |
    # lftp is used by later steps to list directories on the NCBI FTP server.
    sudo apt-get update && sudo apt-get install -y lftp
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page under the tool's name; --location follows redirects.
    ncbi_cli_url='https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64'
    for tool in dataformat datasets; do
      curl --fail --location --remote-name "${ncbi_cli_url}/${tool}"
    done
    chmod +x datasets dataformat
# Publish the settings shared by the later steps (read there as env.<NAME>).
- name: Set Environment Variables
  run: |
    # Everything echoed inside the group is appended to the GITHUB_ENV file
    # in one redirect, one KEY=value pair per line.
    {
      # datasets query: taxon ID and the report field to extract
      echo "TAXID=2759"
      echo "FIELDS=accession"
      # NCBI FTP host, URL prefix, and root directory of the assemblies tree
      echo "FTP=ftp.ncbi.nlm.nih.gov"
      echo "FTP_URL=ftp://ftp.ncbi.nlm.nih.gov"
      echo "DIR=/genomes/all"
      # temporary working files
      echo "PATHS=paths.tsv"
      echo "EXISTING_ACCESSIONS=existing_accessions.tsv"
      echo "NCBI_ACCESSIONS=ncbi_accessions.tsv"
      echo "DIRS=directories.txt"
      # results file kept in the repository
      echo "OUTPUT=ncbi.tsv"
    } >> "$GITHUB_ENV"
# Make sure the results file exists, then list the accessions it already
# contains so a later step can skip them.
- name: Get existing Assemblies or Initialize Output File
  run: |
    results_file="${{env.OUTPUT}}"
    # First run: create the results file holding only the header row.
    [[ -f "$results_file" ]] || printf 'accession\tfull_path\n' > "$results_file"
    # First column of every row after the header, written out sorted.
    awk 'NR == 1 { next } { print $1 }' "$results_file" |
      sort -o "${{env.EXISTING_ACCESSIONS}}"
# Query NCBI Datasets for the annotated chromosome/complete-level assemblies
# of the configured taxon and save their accessions, one per line.
- name: Get NCBI Assemblies
  run: |
    # Without pipefail only the last command's exit status counts, so a
    # failed `datasets` or `dataformat` call would silently produce an empty
    # accession list and the run would look like "nothing new".
    set -o pipefail
    ./datasets summary genome taxon "${{env.TAXID}}" \
      --assembly-level chromosome,complete --annotated --as-json-lines |
      ./dataformat tsv genome --fields "${{env.FIELDS}}" |
      tail -n +2 > "${{env.NCBI_ACCESSIONS}}"  # tail drops the TSV header row
    # Print the list to the job log for inspection.
    cat "${{env.NCBI_ACCESSIONS}}"
# For every accession not yet in the results file, derive the FTP directory
# that should contain it and write "accession<TAB>base_path" to the paths file.
- name: Set FTP paths
  run: |
    # Start from an empty paths file so it exists even when every accession
    # is already known and nothing is appended below.
    : > "${{env.PATHS}}"
    while IFS= read -r accession; do
      # Ignore blank lines in the accession list.
      [[ -z "$accession" ]] && continue
      # Skip accessions already recorded; -x compares whole lines so one
      # accession can never match as a fragment of another.
      if grep -Fxq -- "$accession" "${{env.EXISTING_ACCESSIONS}}"; then
        continue
      fi
      # The directory is built from three-character slices of the accession,
      # e.g. GCF_000001405.40 -> GCF/000/001/405.
      base_path="${{env.FTP_URL}}${{env.DIR}}/${accession:0:3}/${accession:4:3}/${accession:7:3}/${accession:10:3}"
      printf '%s\t%s\n' "$accession" "$base_path" >> "${{env.PATHS}}"
    done < "${{env.NCBI_ACCESSIONS}}"
    # -f: cleanup must not fail the step if a file is already gone.
    rm -f "${{env.NCBI_ACCESSIONS}}" "${{env.EXISTING_ACCESSIONS}}"
# Resolve each base path to the assembly directory beneath it and write
# "accession<TAB>full directory URL" to the directories file.
- name: Get FTP Directories
  run: |
    # Always leave a directories file behind so the next step can read it,
    # even when there is nothing new.
    : > "${{env.DIRS}}"
    # Check for work BEFORE looping: redirecting the loop from a missing
    # paths file would abort the step instead of ending it cleanly.
    if [[ ! -s "${{env.PATHS}}" ]]; then
      echo "No new paths found. Exiting.."
      rm -f "${{env.PATHS}}"
      exit 0
    fi
    ftp_url="${{env.FTP_URL}}"
    while read -r accession base_path; do
      echo "Searching for directory in $base_path"
      # Server-side path: the base path without the ftp://host prefix.
      remote_dir="${base_path#"$ftp_url"}"
      # List entries whose name starts with the accession; keep the first.
      # stdin is detached so lftp cannot consume the loop's input, and a
      # failed listing yields an empty name instead of aborting under bash -e.
      dir_to_cd=$(lftp -c "
        set net:timeout 10
        set net:max-retries 3
        open ${{env.FTP}}
        cd ${remote_dir}
        cls -1 ${accession}* | head -n 1
        quit
      " < /dev/null) || dir_to_cd=""
      # Drop a trailing "@" from the listed name (presumably the symlink
      # marker cls appends - confirm).
      dir_to_cd="${dir_to_cd%@}"
      # No match: report it and move on to the next accession.
      if [[ -z "$dir_to_cd" ]]; then
        echo "No directory for $accession, skipping..."
        continue
      fi
      echo "Found directory: $dir_to_cd"
      printf '%s\t%s\n' "$accession" "${base_path}/${dir_to_cd}" >> "${{env.DIRS}}"
    done < "${{env.PATHS}}"
    rm -f "${{env.PATHS}}"
# For each resolved assembly directory, list its .gff.gz files and append
# "accession<TAB>https URL" rows to the results file; then clean up.
- name: Search for .gff.gz Files
  run: |
    # A missing or empty directories file just means there is nothing new.
    if [[ -s "${{env.DIRS}}" ]]; then
      while read -r accession dir_to_cd; do
        echo "Searching for .gff.gz files in $dir_to_cd"
        # Same directory addressed over HTTPS. Rewritten in the shell rather
        # than with awk's gensub(), which is a gawk-only extension.
        https_dir="${dir_to_cd/#ftp:\/\//https://}"
        # Server-side path: the directory URL without the ftp://host prefix.
        remote_dir="${dir_to_cd#ftp://ftp.ncbi.nlm.nih.gov}"
        # stdin is detached so lftp cannot consume the loop's input.
        lftp -c "
          set net:timeout 10
          set net:max-retries 3
          open ftp.ncbi.nlm.nih.gov
          cd ${remote_dir}
          cls -1 ${accession}*.gff.gz
          quit
        " < /dev/null | awk -v acc="$accession" -v base="$https_dir" '
          # Keep only .gff.gz entries; the last field is the file name.
          /\.gff\.gz$/ { print acc "\t" base "/" $NF }
        ' >> "${{env.OUTPUT}}"
      done < "${{env.DIRS}}"
    fi
    # Remove working files and the downloaded CLI tools from the working
    # directory; -f keeps cleanup from failing on files that are already gone.
    rm -f "${{env.DIRS}}" datasets dataformat
# Commit whatever changed in the working tree and push it back to the repo.
- name: Commit & Push changes
  # NOTE(review): third-party action referenced by a mutable branch while
  # being handed a write-capable token; consider pinning it to a release tag
  # or commit SHA.
  uses: actions-js/push@master
  with:
    github_token: ${{ secrets.GITHUB_TOKEN }}
    # Push to the branch this run was started from rather than relying on
    # the action's built-in default branch name (verify against its README).
    branch: ${{ github.ref_name }}