# debug #20
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Lists annotated genome assemblies for one taxon with the NCBI `datasets`
# CLI, looks up their .gff.gz annotation files on the NCBI FTP server,
# appends "accession<TAB>url" rows to a TSV file and pushes the result.
name: Retrieve NCBI GFF3 Paths

on:
  push:
    branches:
      - main
  workflow_dispatch:

# The final step pushes a commit using GITHUB_TOKEN, which requires write
# access to the repository contents.
permissions:
  contents: write

jobs:
  retrieve-ncbi-gff3:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y lftp
          # --fail makes curl exit non-zero on an HTTP error instead of
          # saving the error page under the binary's name.
          curl -fsSLO 'https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64/dataformat'
          curl -fsSLO 'https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64/datasets'
          chmod +x datasets dataformat

      - name: Set Environment Variables
        run: |
          # Values written to GITHUB_ENV are exported to all later steps.
          {
            echo "TAXID=2759"
            echo "FIELDS=accession"
            echo "OUTPUT=ncbi.tsv"
            echo "FTP=ftp.ncbi.nlm.nih.gov"
            echo "FTP_URL=ftp://ftp.ncbi.nlm.nih.gov"
            echo "DIR=/genomes/all"
            echo "TMP=paths.tsv"
          } >> "$GITHUB_ENV"

      - name: Retrieve Eukaryotic Genomes and Generate FTP Paths
        run: |
          # Create file "$1" holding the header line "$2" (backslash escapes
          # are expanded) unless the file already exists.
          initialize_file() {
            local file=$1 header=$2
            # A plain `if` keeps the exit status at 0 when the file exists;
            # `[[ ... ]] && ...` would return 1 and abort the step under
            # the runner's `bash -e`.
            if [[ ! -f "$file" ]]; then
              printf '%b\n' "$header" > "$file"
            fi
          }

          # Print the FTP directory for accession "$1", built from characters
          # 0-2, 4-6, 7-9 and 10-12 of the accession,
          # e.g. GCF_000001405.40 -> <FTP_URL><DIR>/GCF/000/001/405
          generate_ftp_path() {
            local accession=$1
            echo "${FTP_URL}${DIR}/${accession:0:3}/${accession:4:3}/${accession:7:3}/${accession:10:3}"
          }

          output="$OUTPUT"
          tmp="$TMP"

          # Initialize the output file (once) and empty the temporary file.
          initialize_file "$output" "accession\tfull_path"
          : > "$tmp"

          # Accessions already recorded in the output file (header skipped).
          existing_accessions=$(awk 'NR > 1 {print $1}' "$output" | sort)
          echo "$existing_accessions"

          # Retrieve accessions and write "accession<TAB>ftp_dir" rows for the
          # ones not recorded yet. `tail -n +2` drops the TSV header and
          # `head -n 10` caps the run at the first 10 accessions.
          ./datasets summary genome taxon "$TAXID" --assembly-level chromosome,complete --annotated --as-json-lines |
            ./dataformat tsv genome --fields "$FIELDS" | tail -n +2 | head -n 10 |
            while read -r accession; do
              # -x -F: compare whole lines literally, so the "." in an
              # accession is not treated as a regex wildcard.
              if grep -qxF -- "$accession" <<< "$existing_accessions"; then
                continue
              fi
              base_path=$(generate_ftp_path "$accession")
              printf '%s\t%s\n' "$accession" "$base_path" >> "$tmp"
            done
          cat "$tmp"

      - name: Recursively Search for .gff.gz Files on NCBI FTP
        run: |
          # List, one per line, the files matching "<accession>*.gff.gz" in
          # the FTP directory "$2" (a full URL starting with FTP_URL).
          search_gff_files() {
            local accession=$1 base_path=$2
            local remote_dir=${base_path#"$FTP_URL"}
            lftp -c "
              set net:timeout 10
              set net:max-retries 3
              open ${FTP}
              cd ${remote_dir}
              cls -1 ${accession}*.gff.gz
              quit
            "
          }

          tmp="$TMP"
          output="$OUTPUT"

          # For every "accession<TAB>ftp_dir" row, find the first entry named
          # after the accession and record the .gff.gz files it contains.
          while read -r accession base_path; do
            remote_dir=${base_path#"$FTP_URL"}

            # `|| true`: a failed listing must leave dir_to_cd empty so the
            # accession is skipped below instead of aborting under `bash -e`.
            dir_to_cd=$(lftp -c "
              set net:timeout 10
              set net:max-retries 3
              open ${FTP}
              cd ${remote_dir}
              cls -1 ${accession}* | head -n 1
              quit
            ") || true

            # Drop a trailing "/" or "@" from the listed name.
            # NOTE(review): presumably these are lftp's directory/symlink
            # classification suffixes - confirm against the lftp manual.
            dir_to_cd=${dir_to_cd%[@/]}

            if [[ -z "$dir_to_cd" ]]; then
              echo "No directory for $accession, skipping..."
              continue
            fi
            echo "Found directory: $dir_to_cd"

            # Same location over HTTPS; done in bash because awk's gensub()
            # is a gawk-only extension.
            https_base="https://${base_path#ftp://}"

            search_gff_files "$accession" "${base_path}/${dir_to_cd}" |
              awk -v acc="$accession" -v url="${https_base}/${dir_to_cd}" '
                /\.gff\.gz$/ { print acc "\t" url "/" $NF }
              ' >> "$output"
          done < "$tmp"

      - name: Remove Intermediary Files
        run: |
          rm "$TMP"
          rm datasets dataformat

      - name: Commit & Push changes
        uses: actions-js/push@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          # Push back to the branch this run was started from.
          # NOTE(review): the action's default target branch is believed to
          # be "master" - confirm against the action's README.
          branch: ${{ github.ref_name }}