# Skip to content
#
# Workflow file for this run
# Builds ncbi.tsv: a table of eukaryotic assembly accessions and HTTPS URLs
# of their annotation (.gff.gz) files on the NCBI FTP site, then commits the
# updated table back to the repository.
name: Retrieve NCBI GFF3 Paths

on:
  push:
    branches:
      - main
  workflow_dispatch:

# The final step commits ncbi.tsv back to the repo, which requires the
# GITHUB_TOKEN to have write access to repository contents.
permissions:
  contents: write

jobs:
  retrieve-ncbi-gff3:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install Dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y lftp
          # NCBI Datasets CLI: 'datasets' fetches genome metadata,
          # 'dataformat' converts its JSON-lines output to TSV.
          curl -O 'https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64/dataformat'
          curl -O 'https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64/datasets'
          chmod +x datasets dataformat

      - name: Set Environment Variables
        run: |
          # TAXID 2759 = Eukaryota; OUTPUT is the committed result table,
          # TMP holds accession/base-path pairs between steps.
          echo "TAXID=2759" >> $GITHUB_ENV
          echo "FIELDS=accession" >> $GITHUB_ENV
          echo "OUTPUT=ncbi.tsv" >> $GITHUB_ENV
          echo "FTP=ftp.ncbi.nlm.nih.gov" >> $GITHUB_ENV
          echo "FTP_URL=ftp://ftp.ncbi.nlm.nih.gov" >> $GITHUB_ENV
          echo "DIR=/genomes/all" >> $GITHUB_ENV
          echo "TMP=paths.tsv" >> $GITHUB_ENV

      - name: Retrieve Eukaryotic Genomes and Generate FTP Paths
        run: |
          # Create a TSV file with a header row unless it already exists.
          initialize_file() {
            file=$1
            header=$2
            [[ ! -f "$file" ]] && echo -e "$header" > "$file"
          }
          # Map an assembly accession (e.g. GCA_000001405.39) to its NCBI FTP
          # base directory, e.g. ftp://.../genomes/all/GCA/000/001/405
          generate_ftp_path() {
            accession=$1
            echo "${{ env.FTP_URL }}${{ env.DIR }}/${accession:0:3}/${accession:4:3}/${accession:7:3}/${accession:10:3}"
          }
          output="${{ env.OUTPUT }}"
          tmp="${{ env.TMP }}"
          # Initialize the output and temporary files
          initialize_file "$output" "accession\tfull_path"
          > "$tmp"
          # Load accessions already recorded in the output file so reruns
          # only append genomes that are not present yet.
          existing_accessions=$(awk 'NR > 1 {print $1}' "$output" | sort)
          # FIX: was 'cat existing_accessions', which tried to read a file of
          # that name; the list lives in the shell variable.
          echo "$existing_accessions"
          # Retrieve genomes and generate FTP paths.
          # NOTE(review): 'head -n 10' caps each run at 10 new accessions —
          # presumably a rate limit / incremental-update choice; confirm.
          ./datasets summary genome taxon "${{ env.TAXID }}" --assembly-level chromosome,complete --annotated --as-json-lines |
          ./dataformat tsv genome --fields "${{ env.FIELDS }}" | tail -n +2 | head -n 10 |
          while read -r accession; do
            # Skip if accession exists
            echo "$existing_accessions" | grep -qw "$accession" && continue
            # Append new FTP paths to the temporary file
            base_path=$(generate_ftp_path "$accession")
            echo -e "${accession}\t${base_path}" >> "$tmp"
          done
          cat "$tmp"

      - name: Recursively Search for .gff.gz Files on NCBI FTP
        run: |
          # List .gff.gz files for one accession inside its versioned
          # assembly directory on the NCBI FTP server.
          search_gff_files() {
            accession=$1
            base_path=$2
            lftp -c "
            set net:timeout 10
            set net:max-retries 3
            open ${{ env.FTP }}
            cd ${base_path#${{ env.FTP_URL }}}
            cls -1 ${accession}*.gff.gz
            quit
            "
          }
          tmp="${{ env.TMP }}"
          output="${{ env.OUTPUT }}"
          # Loop through base FTP paths and search for .gff.gz files
          while read -r accession base_path; do
            # The base path contains one versioned directory per assembly
            # (e.g. GCA_000001405.39_GRCh38.p13); take the first match.
            dir_to_cd=$(lftp -c "
            set net:timeout 10
            set net:max-retries 3
            open ${{ env.FTP }}
            cd ${base_path#${{ env.FTP_URL }}}
            cls -1 ${accession}* | head -n 1
            quit
            ")
            # Strip '@' from the directory name if it exists (lftp marks
            # symlinks with a trailing '@').
            dir_to_cd=$(echo "$dir_to_cd" | sed 's/@$//')
            [[ -z "$dir_to_cd" ]] && echo "No directory for $accession, skipping..." && continue
            echo "Found directory: $dir_to_cd"
            # FIX: gensub() is GNU-awk-only and ubuntu-latest's default awk is
            # mawk, so the original printed nothing; use POSIX sub() instead.
            # FIX: a '/' was missing between the directory and the file name.
            search_gff_files "$accession" "${base_path}/${dir_to_cd}" | awk -v acc="$accession" -v base="$base_path" -v dir="$dir_to_cd" '
            /\.gff\.gz$/ {
              https_base = base
              sub(/^ftp:\/\//, "https://", https_base)
              print acc "\t" https_base "/" dir "/" $NF
            }
            ' >> "$output"
          done < "$tmp"

      - name: Remove Intermediary Files
        run: |
          # -f: do not fail the job if an earlier step already errored out
          # before creating these files.
          rm -f "${{ env.TMP }}"
          rm -f datasets dataformat

      - name: Commit & Push changes
        uses: actions-js/push@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          # NOTE(review): this action defaults to pushing to 'master', but
          # the workflow triggers on 'main'; push back to the ref that
          # triggered the run instead — confirm against repo settings.
          branch: ${{ github.ref_name }}