Map NCBI and Ensembl files #20

Summary
Jobs
- map-files
Run details
- Usage
- Workflow file

Workflow file for this run

.github/workflows/map_tables.yml at 3ed7afe

	# Scheduled workflow that maps NCBI and Ensembl annotation files to their
	# assembly metadata and pushes the updated table back to the repository.
	name: Map NCBI and Ensembl files

	on:
	  schedule:
	    # Minute 0, hour 0, every Monday (GitHub evaluates schedules in UTC).
	    - cron: '0 0 * * 1'
	  # Also allow manual runs.
	  workflow_dispatch:

	# The last step pushes a commit using GITHUB_TOKEN, which requires write
	# access to repository contents. Declaring it explicitly keeps the job
	# working when the repository default for the token is read-only, and leaves
	# every other scope without access.
	permissions:
	  contents: write

	jobs:
	  map-files:
	    runs-on: ubuntu-latest

	    steps:
	# Check out the repository so the later steps can read and update its files.
	# v4 replaces v3, which runs on the deprecated Node 16 action runtime.
	- name: Checkout code
	  uses: actions/checkout@v4

	# Download the NCBI command-line tools (datasets, dataformat) into the
	# working directory and make them executable.
	- name: Install Dependencies
	  run: |
	    base_url='https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64'

	    # -f: fail on HTTP errors instead of saving the error page as the binary.
	    # -sS: no progress meter, but still print errors.
	    # -L / --retry: follow redirects and retry transient failures.
	    for tool in datasets dataformat; do
	      curl -fsSL --retry 3 -O "$base_url/$tool"
	    done

	    chmod +x datasets dataformat

	# Export the field list, file names and output header used by later steps.
	- name: Set Environment Variables
	  run: |
	    # Metadata fields requested from the datasets/dataformat tools.
	    fields=accession,assminfo-name,organism-name,organism-tax-id

	    # Header of the output file: annotation name, one column per field
	    # (commas turned into tabs), then the full path.
	    header="annotation_name\t$(echo $fields | tr ',' '\t')\tfull_path"

	    # Everything printed inside the group is appended to the environment file.
	    {
	      echo "FIELDS=$fields"

	      # Input files.
	      echo "NCBI=ncbi.tsv"
	      echo "ENSEMBL_RR=ensembl_rapid_release.tsv"

	      # Assembly info.
	      echo "ASSEMBLY_TABLE=assemblies_table.tsv"

	      # Output file header and name.
	      echo "HEADER=$header"
	      echo "OUTPUT=mapped_annotations.tsv"

	      # Temporary files.
	      echo "MERGED_TABLE=merged_table.tsv"
	      echo "NEW_ANNOTATIONS=new_annotations.tsv"
	    } >> "$GITHUB_ENV"

	# Make sure the output table exists before anything is appended to it.
	- name: Init Output File
	  run: |
	    output="${{env.OUTPUT}}"

	    # An existing file is left untouched; otherwise it is created with only
	    # the header row (echo -e expands the \t sequences in the header).
	    [[ -f "$output" ]] || echo -e "${{env.HEADER}}" > "$output"

	# Merge the two input TSVs into one headerless table with the columns
	# annotation name, accession and full path.
	- name: Collect and Filter Data from TSVs
	  run: |
	    # Without pipefail, a failing tail (for example a missing input file)
	    # would be hidden behind the exit status of awk.
	    set -o pipefail

	    # Input tables and the merged temporary table.
	    ncbi="${{env.NCBI}}"
	    ensembl_rr="${{env.ENSEMBL_RR}}"
	    merged_table="${{env.MERGED_TABLE}}"

	    # Drop the header row of each input, then reshape every remaining row.
	    { tail -n +2 "$ncbi"; tail -n +2 "$ensembl_rr"; } | awk -F'\t' '
	      {
	        accession = $1
	        path = $2

	        # The annotation name is the last path component without its
	        # .gff.gz or .gff3.gz extension. Only POSIX awk features are used,
	        # so the result does not depend on which awk is installed.
	        n = split(path, parts, "/")
	        name = parts[n]
	        if (n == 0 || sub(/\.gff3?\.gz$/, "", name) == 0 || name == "") {
	          # Rows that are not GFF paths would otherwise get an empty name.
	          print "Skipping row with unexpected path: " $0 > "/dev/stderr"
	          next
	        }

	        print name "\t" accession "\t" path
	      }' > "$merged_table"

	# Keep only the merged rows whose annotation name is not yet present in the
	# output table.
	- name: Filter Out Existing Annotations
	  run: |
	    new_annotations="${{env.NEW_ANNOTATIONS}}"
	    merged_table="${{env.MERGED_TABLE}}"
	    existing_annotations="${{env.OUTPUT}}"

	    # First file (existing output): remember every annotation name, skipping
	    # its header row. Second file (merged table): it is written without a
	    # header row, so every row is a candidate; skipping its first row would
	    # drop a real annotation. Fields are split on tabs only.
	    awk -F'\t' '
	      FILENAME == ARGV[1] { if (FNR > 1) seen[$1]; next }
	      !($1 in seen)
	    ' "$existing_annotations" "$merged_table" > "$new_annotations"

	    # Remove the temporary table before the early exit below, so it is never
	    # left behind in the workspace.
	    rm -f "$merged_table"

	    # Note: exit 0 ends only this step; the following steps still run.
	    if [ ! -s "$new_annotations" ]; then
	      echo "Output file is empty. Exiting.."
	      exit 0
	    fi

	# Fetch the assembly metadata for the accessions of the new annotations and
	# store it as a TSV table.
	- name: Get New Assemblies NCBI Metadata
	  run: |
	    # Without pipefail, a failing datasets call would be hidden behind the
	    # exit status of dataformat.
	    set -o pipefail

	    new_annotations="${{env.NEW_ANNOTATIONS}}"
	    assembly_table="${{env.ASSEMBLY_TABLE}}"

	    # Nothing new to look up: leave an empty table and skip the query.
	    if [ ! -s "$new_annotations" ]; then
	      : > "$assembly_table"
	      exit 0
	    fi

	    # The accession list lives outside the workspace and is removed on exit,
	    # including when a command below fails.
	    tmp="$(mktemp)"
	    trap 'rm -f "$tmp"' EXIT

	    # One accession per line (second column), without blanks or duplicates.
	    awk -F'\t' '$2 != "" {print $2}' "$new_annotations" | sort -u > "$tmp"

	    ./datasets summary genome accession --inputfile "$tmp" --as-json-lines |
	      ./dataformat tsv genome --fields "${{env.FIELDS}}" > "$assembly_table"

	# Append the new annotations, joined with their assembly metadata, to the
	# output table, then remove temporary files and the downloaded tools.
	- name: Insert new annotations
	  run: |
	    new_annotations="${{env.NEW_ANNOTATIONS}}"
	    existing_annotations="${{env.OUTPUT}}"
	    assembly_table="${{env.ASSEMBLY_TABLE}}"

	    # First file (assembly table): index every metadata row by its first
	    # column (the accession), skipping the header row. Second file (new
	    # annotations): it has no header row, so every row is processed;
	    # skipping its first row would drop a real annotation. Fields are split
	    # on tabs only, because metadata values may contain spaces.
	    awk -F'\t' '
	      FILENAME == ARGV[1] { if (FNR > 1) assembly[$1] = $0; next }
	      {
	        name = $1
	        accession = $2
	        full_path = $3

	        if (accession in assembly) {
	          # Output row: annotation name, the whole metadata row (which
	          # starts with the accession), then the full path.
	          print name "\t" assembly[accession] "\t" full_path
	        } else {
	          print "No assembly metadata for " accession ", skipping " name > "/dev/stderr"
	        }
	      }
	    ' "$assembly_table" "$new_annotations" >> "$existing_annotations"

	    # Remove temporary files and the downloaded tools from the workspace
	    # before the push step runs.
	    rm -f "$assembly_table" "$new_annotations" datasets dataformat

	# Commit the workspace changes and push them back to the repository.
	- name: Commit & Push changes
	  # Pinned to a release tag: the master branch of a third-party action can
	  # change without notice. NOTE(review): confirm the tag to pin against the
	  # action's release list.
	  uses: actions-js/push@v1.4
	  with:
	    github_token: ${{ secrets.GITHUB_TOKEN }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Map NCBI and Ensembl files #20

Workflow file

Map NCBI and Ensembl files #20

Jobs

Run details

Workflow file for this run