# Skip to content
#
# Workflow file for this run
# Builds ncbi.tsv: a table of eukaryotic assembly accessions and HTTPS URLs
# of their annotation (.gff.gz) files on the NCBI FTP site, then commits the
# updated table back to the repository.
name: Retrieve NCBI GFF3 Paths

on:
  push:
    branches:
      - main
  workflow_dispatch:

# The final step commits ncbi.tsv back to the repo, which requires the
# GITHUB_TOKEN to have write access to repository contents.
permissions:
  contents: write

jobs:
  retrieve-ncbi-gff3:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install Dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y lftp
          # NCBI Datasets CLI: 'datasets' fetches genome metadata,
          # 'dataformat' converts its JSON-lines output to TSV.
          curl -O 'https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64/dataformat'
          curl -O 'https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64/datasets'
          chmod +x datasets dataformat

      - name: Set Environment Variables
        run: |
          # TAXID 2759 = Eukaryota; OUTPUT is the committed result table,
          # TMP holds accession/base-path pairs between steps.
          echo "TAXID=2759" >> $GITHUB_ENV
          echo "FIELDS=accession" >> $GITHUB_ENV
          echo "OUTPUT=ncbi.tsv" >> $GITHUB_ENV
          echo "FTP=ftp.ncbi.nlm.nih.gov" >> $GITHUB_ENV
          echo "FTP_URL=ftp://ftp.ncbi.nlm.nih.gov" >> $GITHUB_ENV
          echo "DIR=/genomes/all" >> $GITHUB_ENV
          echo "TMP=paths.tsv" >> $GITHUB_ENV

      - name: Retrieve Eukaryotic Genomes and Generate FTP Paths
        run: |
          # Create a TSV file with a header row unless it already exists.
          initialize_file() {
            file=$1
            header=$2
            [[ ! -f "$file" ]] && echo -e "$header" > "$file"
          }
          # Map an assembly accession (e.g. GCA_000001405.39) to its NCBI FTP
          # base directory, e.g. ftp://.../genomes/all/GCA/000/001/405
          generate_ftp_path() {
            accession=$1
            echo "${{ env.FTP_URL }}${{ env.DIR }}/${accession:0:3}/${accession:4:3}/${accession:7:3}/${accession:10:3}"
          }
          output="${{ env.OUTPUT }}"
          tmp="${{ env.TMP }}"
          # Initialize the output and temporary files
          initialize_file "$output" "accession\tfull_path"
          > "$tmp"
          # Load accessions already recorded in the output file so reruns
          # only append genomes that are not present yet.
          existing_accessions=$(awk 'NR > 1 {print $1}' "$output" | sort)
          # FIX: was 'cat existing_accessions', which tried to read a file of
          # that name; the list lives in the shell variable.
          echo "$existing_accessions"
          # Retrieve genomes and generate FTP paths.
          # NOTE(review): 'head -n 10' caps each run at 10 new accessions —
          # presumably a rate limit / incremental-update choice; confirm.
          ./datasets summary genome taxon "${{ env.TAXID }}" --assembly-level chromosome,complete --annotated --as-json-lines |
          ./dataformat tsv genome --fields "${{ env.FIELDS }}" | tail -n +2 | head -n 10 |
          while read -r accession; do
            # Skip if accession exists
            echo "$existing_accessions" | grep -qw "$accession" && continue
            # Append new FTP paths to the temporary file
            base_path=$(generate_ftp_path "$accession")
            echo -e "${accession}\t${base_path}" >> "$tmp"
          done
          cat "$tmp"

      - name: Recursively Search for .gff.gz Files on NCBI FTP
        run: |
          # List .gff.gz files for one accession inside its versioned
          # assembly directory on the NCBI FTP server.
          search_gff_files() {
            accession=$1
            base_path=$2
            lftp -c "
            set net:timeout 10
            set net:max-retries 3
            open ${{ env.FTP }}
            cd ${base_path#${{ env.FTP_URL }}}
            cls -1 ${accession}*.gff.gz
            quit
            "
          }
          tmp="${{ env.TMP }}"
          output="${{ env.OUTPUT }}"
          # Loop through base FTP paths and search for .gff.gz files
          while read -r accession base_path; do
            # The base path contains one versioned directory per assembly
            # (e.g. GCA_000001405.39_GRCh38.p13); take the first match.
            dir_to_cd=$(lftp -c "
            set net:timeout 10
            set net:max-retries 3
            open ${{ env.FTP }}
            cd ${base_path#${{ env.FTP_URL }}}
            cls -1 ${accession}* | head -n 1
            quit
            ")
            # Strip '@' from the directory name if it exists (lftp marks
            # symlinks with a trailing '@').
            dir_to_cd=$(echo "$dir_to_cd" | sed 's/@$//')
            [[ -z "$dir_to_cd" ]] && echo "No directory for $accession, skipping..." && continue
            echo "Found directory: $dir_to_cd"
            # FIX: gensub() is GNU-awk-only and ubuntu-latest's default awk is
            # mawk, so the original printed nothing; use POSIX sub() instead.
            # FIX: a '/' was missing between the directory and the file name.
            search_gff_files "$accession" "${base_path}/${dir_to_cd}" | awk -v acc="$accession" -v base="$base_path" -v dir="$dir_to_cd" '
            /\.gff\.gz$/ {
              https_base = base
              sub(/^ftp:\/\//, "https://", https_base)
              print acc "\t" https_base "/" dir "/" $NF
            }
            ' >> "$output"
          done < "$tmp"

      - name: Remove Intermediary Files
        run: |
          # -f: do not fail the job if an earlier step already errored out
          # before creating these files.
          rm -f "${{ env.TMP }}"
          rm -f datasets dataformat

      - name: Commit & Push changes
        uses: actions-js/push@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          # NOTE(review): this action defaults to pushing to 'master', but
          # the workflow triggers on 'main'; push back to the ref that
          # triggered the run instead — confirm against repo settings.
          branch: ${{ github.ref_name }}