Retrieve NCBI GFF3 Paths #30

Workflow file for this run

.github/workflows/ncbi_all.yml at dc5c5bd

	# Workflow identity and triggers.
	name: Retrieve NCBI GFF3 Paths

	on:
	  # Manual trigger.
	  workflow_dispatch:
	  # Scheduled trigger: minute 0, hour 0, every Sunday (day-of-week 0).
	  schedule:
	    - cron: "0 0 * * 0"


	jobs:
	  retrieve-ncbi-gff3:
	    runs-on: ubuntu-latest
	    # The last step of this job pushes a commit using GITHUB_TOKEN, so
	    # request write access to the repository contents explicitly instead
	    # of relying on the repository-wide default token permissions.
	    permissions:
	      contents: write

	    steps:
	- name: Checkout code
	  # v4 runs on a supported Node.js runtime; v3 targets the deprecated
	  # Node 16 runtime.
	  uses: actions/checkout@v4

	- name: Install Dependencies
	  run: |
	    # lftp is the FTP client used by the directory/file listing steps.
	    sudo apt-get update && sudo apt-get install -y lftp

	    # Download the NCBI Datasets command-line tools into the working
	    # directory. --fail makes curl exit non-zero on an HTTP error instead
	    # of saving the error page under the tool's name; --retry covers
	    # transient network failures.
	    base_url='https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64'
	    for tool in dataformat datasets; do
	      curl --fail --location --silent --show-error --retry 3 -O "${base_url}/${tool}"
	    done
	    chmod +x datasets dataformat

	- name: Set Environment Variables
	  run: |
	    # Every KEY=value line appended to $GITHUB_ENV is exported to the
	    # steps that run after this one. All entries are written through a
	    # single grouped redirection.
	    {
	      # datasets query parameters
	      echo "TAXID=2759"
	      echo "FIELDS=accession"

	      # NCBI FTP location
	      echo "FTP=ftp.ncbi.nlm.nih.gov"
	      echo "FTP_URL=ftp://ftp.ncbi.nlm.nih.gov"
	      echo "DIR=/genomes/all"

	      # temporary working files
	      echo "PATHS=paths.tsv"
	      echo "EXISTING_ACCESSIONS=existing_accessions.tsv"
	      echo "NCBI_ACCESSIONS=ncbi_accessions.tsv"
	      echo "DIRS=directories.txt"

	      # final output table
	      echo "OUTPUT=ncbi.tsv"
	    } >> "$GITHUB_ENV"

	- name: Get existing Assemblies or Initialize Output File
	  run: |
	    results_file="${{env.OUTPUT}}"

	    # First run only: create the results table with just its header row.
	    [[ -f "$results_file" ]] || echo -e "accession\tfull_path" > "$results_file"

	    # Write the first column of every row after the header, sorted, to
	    # the existing-accessions file.
	    awk 'NR > 1 {print $1}' "$results_file" | sort > "${{env.EXISTING_ACCESSIONS}}"

	- name: Get NCBI Assemblies
	  run: |
	    # Without pipefail only the exit status of the last pipeline command
	    # (tail) is checked, so a failing datasets/dataformat call would leave
	    # an empty accession list while the step still reports success.
	    set -o pipefail

	    # Query annotated assemblies at chromosome/complete level for the
	    # taxon, convert the JSON lines to TSV with the requested fields and
	    # drop the TSV header row.
	    ./datasets summary genome taxon "${{env.TAXID}}" --assembly-level chromosome,complete --annotated --as-json-lines |
	      ./dataformat tsv genome --fields "${{env.FIELDS}}" |
	      tail -n +2 > "${{env.NCBI_ACCESSIONS}}"

	    # Print the retrieved accessions to the job log.
	    cat "${{env.NCBI_ACCESSIONS}}"

	- name: Set FTP paths
	  run: |
	    # Always start from an existing, empty paths file: when every
	    # accession is already known the loop appends nothing, and the next
	    # step would fail trying to read a file that was never created.
	    : > "${{env.PATHS}}"

	    while read -r accession; do
	      # Ignore blank lines in the accession list.
	      [[ -z "$accession" ]] && continue

	      # Skip accessions that are already recorded. -x compares whole
	      # lines, so only an exact accession match counts as "existing".
	      grep -Fxq -- "$accession" "${{env.EXISTING_ACCESSIONS}}" && continue

	      # Build the base FTP path from fixed-width slices of the accession:
	      # characters 0-2, 4-6, 7-9 and 10-12 become nested directories
	      # (e.g. GCF_000001405.40 -> GCF/000/001/405).
	      base_path="${{env.FTP_URL}}${{env.DIR}}/${accession:0:3}/${accession:4:3}/${accession:7:3}/${accession:10:3}"
	      echo -e "${accession}\t${base_path}" >> "${{env.PATHS}}"
	    done < "${{env.NCBI_ACCESSIONS}}"

	    # The accession lists are no longer needed.
	    rm "${{env.NCBI_ACCESSIONS}}"
	    rm "${{env.EXISTING_ACCESSIONS}}"

	- name: Get FTP Directories
	  run: |
	    # Create the directories file up front so the following step can
	    # always read it, even when no directory is found.
	    : > "${{env.DIRS}}"

	    # Check for work BEFORE reading the paths file; -s is false both for
	    # an empty and for a missing file.
	    if [[ -s "${{env.PATHS}}" ]]; then
	      # Each line holds an accession and its base FTP path.
	      while read -r accession base_path; do
	        echo "Searching for directory in $base_path"

	        # Server-side path: the base path without the ftp://host prefix.
	        remote_dir="${base_path#"$FTP_URL"}"

	        # List entries starting with the accession and keep the first one.
	        # A non-zero lftp exit must not abort the step (the shell runs
	        # with -e), so it is mapped to "nothing found". stdin is
	        # redirected defensively so lftp cannot consume the loop's input.
	        dir_to_cd=$(lftp -c "
	          set net:timeout 10
	          set net:max-retries 3
	          open $FTP
	          cd $remote_dir
	          cls -1 ${accession}* | head -n 1
	          quit
	        " < /dev/null) || dir_to_cd=""

	        # Strip a trailing @ from the listed name.
	        dir_to_cd="${dir_to_cd%@}"

	        # If no directory is found, skip to the next accession.
	        if [[ -z "$dir_to_cd" ]]; then
	          echo "No directory for $accession, skipping..."
	          continue
	        fi

	        echo "Found directory: $dir_to_cd"

	        # Save accession and full directory URL for the next step.
	        echo -e "${accession}\t${base_path}/${dir_to_cd}" >> "${{env.DIRS}}"
	      done < "${{env.PATHS}}"
	    else
	      echo "No new paths found; nothing to look up."
	    fi

	    # Remove the temporary paths file in every case so it is not left in
	    # the working tree.
	    rm -f "${{env.PATHS}}"

	- name: Search for .gff.gz Files
	  run: |
	    # Only loop when there is something to search; -s is false both for
	    # an empty and for a missing directories file.
	    if [[ -s "${{env.DIRS}}" ]]; then
	      # Each line holds an accession and the FTP URL of its directory.
	      while read -r accession dir_to_cd; do
	        echo "Searching for .gff.gz files in $dir_to_cd"

	        # Server-side path: the directory URL without the ftp://host prefix.
	        remote_dir="${dir_to_cd#"$FTP_URL"}"

	        # List the accession's .gff.gz files and append one
	        # "accession<TAB>https URL" row per file to the output table.
	        # The ftp:// -> https:// rewrite uses POSIX sub() rather than
	        # gensub(), which is a gawk-only extension.
	        lftp -c "
	          set net:timeout 10
	          set net:max-retries 3
	          open $FTP
	          cd $remote_dir
	          cls -1 ${accession}*.gff.gz
	          quit
	        " < /dev/null | awk -v acc="$accession" -v dir="$dir_to_cd" '
	          BEGIN { sub(/^ftp:\/\//, "https://", dir) }
	          /\.gff\.gz$/ { print acc "\t" dir "/" $NF }
	        ' >> "${{env.OUTPUT}}"
	      done < "${{env.DIRS}}"
	    else
	      echo "No directories to search."
	    fi

	    # Remove the temporary file and the downloaded tools from the working
	    # tree; -f keeps the cleanup from failing if one is already gone.
	    rm -f "${{env.DIRS}}" datasets dataformat

	- name: Commit & Push changes
	  # NOTE(review): the action is referenced by its mutable "master" branch
	  # and is handed the workflow token; consider pinning it to a release
	  # tag or commit SHA.
	  uses: actions-js/push@master
	  with:
	    github_token: "${{ secrets.GITHUB_TOKEN }}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Retrieve NCBI GFF3 Paths #30

Workflow file

Retrieve NCBI GFF3 Paths #30

Jobs

Run details

Workflow file for this run