Skip to content

Retrieve NCBI GFF3 Paths #31

Retrieve NCBI GFF3 Paths

Retrieve NCBI GFF3 Paths #31

Workflow file for this run

# Scheduled workflow that looks up GFF3 annotation files for NCBI genome
# assemblies and records their download paths in a TSV file in this repository.
name: Retrieve NCBI GFF3 Paths
on:
  schedule:
    # Weekly: every Sunday at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 0 * * 0'
  # Also allow manual runs from the Actions tab.
  workflow_dispatch:
# The last step pushes a commit using GITHUB_TOKEN. Declare the required
# permission explicitly so the job does not depend on the repository-wide
# default, which may be read-only.
permissions:
  contents: write
jobs:
  retrieve-ncbi-gff3:
    runs-on: ubuntu-latest
    steps:
- name: Checkout code
  # checkout@v3 runs on the deprecated Node 16 runtime; v4 is the drop-in
  # replacement and takes the same inputs.
  uses: actions/checkout@v4
- name: Install Dependencies
  run: |
    # lftp is used by later steps to list directories on the NCBI FTP server.
    sudo apt-get update && sudo apt-get install -y lftp
    # Download the NCBI Datasets command-line tools into the workspace.
    # -f: fail on HTTP errors instead of saving the error page as the binary
    # -sS: no progress meter, but still print errors
    # -L: follow redirects
    base_url='https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64'
    for tool in dataformat datasets; do
      curl -fsSL -O "${base_url}/${tool}"
    done
    chmod +x datasets dataformat
- name: Set Environment Variables
  run: |
    # Everything written to the GITHUB_ENV file becomes an environment
    # variable for all subsequent steps of this job.
    {
      # Query parameters for the datasets CLI
      echo "TAXID=2759"
      echo "FIELDS=accession"
      # NCBI FTP server location
      echo "FTP=ftp.ncbi.nlm.nih.gov"
      echo "FTP_URL=ftp://ftp.ncbi.nlm.nih.gov"
      echo "DIR=/genomes/all"
      # Temporary working files
      echo "PATHS=paths.tsv"
      echo "EXISTING_ACCESSIONS=existing_accessions.tsv"
      echo "NCBI_ACCESSIONS=ncbi_accessions.tsv"
      echo "DIRS=directories.txt"
      # Result file
      echo "OUTPUT=ncbi.tsv"
    } >> "$GITHUB_ENV"
- name: Get existing Assemblies or Initialize Output File
  run: |
    # On the first run the result file does not exist yet: create it with
    # just the header row.
    [[ -f "$OUTPUT" ]] || printf 'accession\tfull_path\n' > "$OUTPUT"
    # Write the first column of every data row (header excluded), sorted,
    # to the list of accessions that are already recorded.
    tail -n +2 "$OUTPUT" | awk '{ print $1 }' | sort -o "$EXISTING_ACCESSIONS"
- name: Get NCBI Assemblies
  run: |
    # The default `run` shell is `bash -e` without pipefail, so a failure of
    # `datasets` or `dataformat` in the pipeline below would go unnoticed and
    # leave an empty accession list. Make any failing stage fail the step.
    set -o pipefail
    # Query assemblies for the configured taxon (chromosome/complete level,
    # annotated only), convert the JSON lines to TSV with the configured
    # fields, and drop the TSV header row.
    ./datasets summary genome taxon "$TAXID" \
      --assembly-level chromosome,complete --annotated --as-json-lines |
      ./dataformat tsv genome --fields "$FIELDS" |
      tail -n +2 > "$NCBI_ACCESSIONS"
    # Print the retrieved accessions to the job log.
    cat "$NCBI_ACCESSIONS"
- name: Set FTP paths
  run: |
    # Always create the paths file, even when no accession is new, so the
    # next step can read it instead of failing on a missing file.
    : > "$PATHS"
    while read -r accession; do
      # Ignore blank lines
      [[ -n "$accession" ]] || continue
      # Skip accessions that are already recorded. -x matches the whole line,
      # so one accession can never match as a fragment of another.
      grep -Fxq -- "$accession" "$EXISTING_ACCESSIONS" && continue
      # Build the base directory from the accession: the 3-character prefix
      # and three 3-digit groups,
      # e.g. GCF_000001405.40 -> <FTP_URL><DIR>/GCF/000/001/405
      base_path="${FTP_URL}${DIR}/${accession:0:3}/${accession:4:3}/${accession:7:3}/${accession:10:3}"
      printf '%s\t%s\n' "$accession" "$base_path" >> "$PATHS"
    done < "$NCBI_ACCESSIONS"
    # The accession lists are no longer needed
    rm -f "$NCBI_ACCESSIONS" "$EXISTING_ACCESSIONS"
- name: Get FTP Directories
  run: |
    # Always create the directories file so the next step has something to
    # read even when nothing is found.
    : > "$DIRS"
    # Nothing to do when there are no new paths (file missing or empty).
    # This must be checked before the loop tries to read the file.
    if [[ ! -s "$PATHS" ]]; then
      echo "No new paths found. Exiting.."
      rm -f "$PATHS"
      exit 0
    fi
    # Loop through base FTP paths and capture the assembly directory of each
    while read -r accession base_path; do
      echo "Searching for directory in $base_path"
      # Server-side path: the base path without the ftp://host prefix
      remote_dir=${base_path#"$FTP_URL"}
      # List entries starting with the accession and keep the first one.
      # `|| true`: a failed listing must fall through to the "skipping"
      # branch below rather than abort the step under `bash -e`.
      dir_to_cd=$(lftp -c "
        set net:timeout 10
        set net:max-retries 3
        open ${FTP}
        cd ${remote_dir}
        cls -1 ${accession}* | head -n 1
        quit
      ") || true
      # Strip the trailing @ that the listing appends to some entries
      dir_to_cd=${dir_to_cd%@}
      # If no directory is found, skip to the next accession
      if [[ -z "$dir_to_cd" ]]; then
        echo "No directory for $accession, skipping..."
        continue
      fi
      echo "Found directory: $dir_to_cd"
      # Save accession and full directory URL
      printf '%s\t%s\n' "$accession" "${base_path}/${dir_to_cd}" >> "$DIRS"
    done < "$PATHS"
    rm -f "$PATHS"
- name: Search for .gff.gz Files
  run: |
    # Only search when the previous step recorded at least one directory
    # (the file may be missing or empty when nothing new was found).
    if [[ -s "$DIRS" ]]; then
      while read -r accession dir_to_cd; do
        echo "Searching for .gff.gz files in $dir_to_cd"
        # Server-side path: the directory URL without the ftp://host prefix
        remote_dir=${dir_to_cd#"$FTP_URL"}
        # The same location over HTTPS. Done in the shell rather than with
        # awk's gensub(), which is a gawk extension that other awk
        # implementations lack. Directory URLs are built from FTP_URL and
        # therefore start with ftp://.
        https_base="https://${dir_to_cd#ftp://}"
        # List the accession's .gff.gz files and append one
        # "accession<TAB>url" row per file to the output table. The pipeline
        # status is awk's, so a directory without a matching file is skipped
        # without failing the step.
        lftp -c "
          set net:timeout 10
          set net:max-retries 3
          open ${FTP}
          cd ${remote_dir}
          cls -1 ${accession}*.gff.gz
          quit
        " | awk -v acc="$accession" -v base="$https_base" '
          /\.gff\.gz$/ { print acc "\t" base "/" $NF }
        ' >> "$OUTPUT"
      done < "$DIRS"
    else
      echo "No directories to search."
    fi
    # Remove the temporary file and the downloaded CLI binaries so they do
    # not end up in the commit made by the next step.
    rm -f "$DIRS" datasets dataformat
- name: Commit & Push changes
  # Commits whatever changed in the workspace and pushes it back to the
  # repository, authenticating with the job's automatic token.
  # NOTE(review): `@master` is a mutable ref; consider pinning to a release
  # tag or commit SHA once the intended version is confirmed.
  # NOTE(review): no `branch` input is passed, so the action's default target
  # branch applies - confirm it matches this repository's branch.
  uses: actions-js/push@master
  with:
    github_token: ${{ secrets.GITHUB_TOKEN }}