# Web-page residue from the GitHub Actions run view, kept as comments so the
# workflow below parses as YAML:
#   Skip to content
#   Map NCBI and Ensembl files #22
#   Workflow file for this run

# Maps NCBI and Ensembl annotation files to their assembly metadata and pushes
# the updated table back to the repository.
name: Map NCBI and Ensembl files
on:
  schedule:
    # Every Monday at 00:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 0 * * 1'
  # Also allow manual runs from the Actions tab.
  workflow_dispatch:
jobs:
  map-files:
    runs-on: ubuntu-latest
    # The last step pushes with GITHUB_TOKEN. Request write access to the
    # repository contents explicitly, because the default token permissions
    # can be read-only, in which case the push is rejected.
    permissions:
      contents: write
    steps:
# Check out the repository; later steps read and write files in the
# working tree.
- name: Checkout code
  # v3 runs on the deprecated Node 16 runtime; v4 is the drop-in successor.
  uses: actions/checkout@v4
# Download the NCBI Datasets command-line tools into the working directory,
# where later steps invoke them as ./datasets and ./dataformat.
- name: Install Dependencies
  run: |
    base_url='https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64'
    for tool in dataformat datasets; do
      # --fail: an HTTP error aborts the step instead of saving the server's
      # error page under the tool's name; --retry covers transient failures;
      # --location follows redirects.
      curl --fail --silent --show-error --location --retry 3 \
        -O "$base_url/$tool"
    done
    chmod +x datasets dataformat
# Publish the file names and column list shared by the following steps
# through GITHUB_ENV.
- name: Set Environment Variables
  run: |
    # Columns requested from `dataformat`; also reused to build the header.
    fields=accession,assminfo-name,organism-name,organism-tax-id
    # Output header: the commas of $fields become real tabs, while the
    # surrounding "\t" stay as literal escapes that `echo -e` expands later.
    header="annotation_name\t$(tr ',' '\t' <<< "$fields")\tfull_path"
    # One KEY=value line per variable, in this order:
    #   datasets fields, input files, assembly info, output header,
    #   output file name, temporary files.
    printf '%s\n' \
      "FIELDS=$fields" \
      "NCBI=ncbi.tsv" \
      "ENSEMBL_RR=ensembl_rapid_release.tsv" \
      "ASSEMBLY_TABLE=assemblies_table.tsv" \
      "HEADER=$header" \
      "OUTPUT=mapped_annotations.tsv" \
      "MERGED_TABLE=merged_table.tsv" \
      "NEW_ANNOTATIONS=new_annotations.tsv" >> "$GITHUB_ENV"
# Create the output table with its header row the first time the workflow
# runs; an existing file is left untouched.
- name: Init Output File
  run: |
    target="${{env.OUTPUT}}"
    # -e makes echo expand the "\t" escapes stored in HEADER.
    [[ -f "$target" ]] || echo -e "${{env.HEADER}}" > "$target"
# Merge the two input TSVs into one header-less table with the columns:
#   annotation name <TAB> accession <TAB> full URL
- name: Collect and Filter Data from TSVs
  run: |
    # The default run shell has no pipefail, so a missing input file would
    # otherwise be ignored and a partial table written.
    set -euo pipefail
    # Inputs: column 1 = accession, column 2 = full path of a .gff.gz or
    # .gff3.gz file. The first line of each file is skipped.
    ncbi="${{env.NCBI}}"
    ensembl_rr="${{env.ENSEMBL_RR}}"
    merged_table="${{env.MERGED_TABLE}}"
    { tail -n +2 "$ncbi"; tail -n +2 "$ensembl_rr"; } | awk -F'\t' '
      {
        # Annotation name = last path component minus its .gff.gz/.gff3.gz
        # suffix. split()/sub() are POSIX awk, unlike the 3-argument match()
        # which only gawk provides.
        n = split($2, parts, "/")
        name = parts[n]
        if (!sub(/\.gff3?\.gz$/, "", name) || name == "") {
          # No usable name: skip the row rather than emit an empty first
          # column that later steps cannot key on.
          print "Skipping unrecognised annotation path: " $2 > "/dev/stderr"
          next
        }
        print name "\t" $1 "\t" $2
      }' > "$merged_table"
# Write to NEW_ANNOTATIONS the merged rows whose annotation name (column 1)
# is not yet present in the output table.
- name: Filter Out Existing Annotations
  run: |
    new_annotations="${{env.NEW_ANNOTATIONS}}"
    merged_table="${{env.MERGED_TABLE}}"
    existing_annotations="${{env.OUTPUT}}"
    # Only the output table starts with a header line. The merged table is
    # built from data rows alone, so none of its rows may be skipped
    # (skipping FNR == 1 there dropped the first annotation on every run).
    # FILENAME == ARGV[1] stays correct even if the first file is empty,
    # which the NR == FNR idiom does not.
    awk -F'\t' '
      FILENAME == ARGV[1] { if (FNR > 1) seen[$1]; next }
      !($1 in seen)
    ' "$existing_annotations" "$merged_table" > "$new_annotations"
    # Delete the intermediate table before the early exit below so it cannot
    # linger in the workspace when the push step runs.
    rm -f "$merged_table"
    # Check if the output file is empty
    if [ ! -s "$new_annotations" ]; then
      # Note: exit 0 ends this step only; the job continues.
      echo "Output file is empty. Exiting.."
      exit 0
    fi
# Look up NCBI metadata for the accessions of the new annotations and store
# it as a TSV (one header line, then one row per assembly) in ASSEMBLY_TABLE.
- name: Get New Assemblies NCBI Metadata
  run: |
    # The default run shell has no pipefail, so a failing `datasets` call
    # would be hidden behind a successful `dataformat`.
    set -euo pipefail
    new_annotations="${{env.NEW_ANNOTATIONS}}"
    assembly_table="${{env.ASSEMBLY_TABLE}}"
    if [ ! -s "$new_annotations" ]; then
      echo "No new annotations; skipping NCBI metadata lookup."
      # Leave an empty table so steps that read or delete it still find it.
      : > "$assembly_table"
      exit 0
    fi
    # Scratch file outside the workspace, removed however the step ends.
    tmp="$(mktemp)"
    trap 'rm -f "$tmp"' EXIT
    # Column 2 holds the accession; query each accession only once.
    awk -F'\t' '$2 != "" {print $2}' "$new_annotations" | sort -u > "$tmp"
    ./datasets summary genome accession --inputfile "$tmp" --as-json-lines |
      ./dataformat tsv genome --fields "${{env.FIELDS}}" > "$assembly_table"
# Append the new annotations, joined with their assembly metadata, to the
# output table, then remove the intermediate files and downloaded tools.
- name: Insert new annotations
  run: |
    new_annotations="${{env.NEW_ANNOTATIONS}}"
    existing_annotations="${{env.OUTPUT}}"
    assembly_table="${{env.ASSEMBLY_TABLE}}"
    # For every new annotation whose accession (column 2) appears in
    # assembly_table, append:
    #   annotation name <TAB> assembly_table row (accession first) <TAB> full path
    # Only assembly_table starts with a header line. new_annotations has
    # none, so all of its rows are processed (skipping FNR == 1 there lost
    # the first new annotation). Fields are split on tabs only.
    if [ -s "$new_annotations" ] && [ -s "$assembly_table" ]; then
      awk -F'\t' '
        FILENAME == ARGV[1] { if (FNR > 1) assembly[$1] = $0; next }
        ($2 in assembly) { print $1 "\t" assembly[$2] "\t" $3 }
      ' "$assembly_table" "$new_annotations" >> "$existing_annotations"
    fi
    # Clean up; -f keeps the step from failing when a file is already gone.
    rm -f "$assembly_table" "$new_annotations" datasets dataformat
# Presumably commits and pushes the workspace changes, going by the step
# name; confirm against the action's documentation.
- name: 'Commit & Push changes'
  # NOTE(review): @master is a moving ref; consider pinning to a release tag
  # or commit SHA.
  uses: actions-js/push@master
  with:
    # Token the action authenticates with.
    github_token: ${{ secrets.GITHUB_TOKEN }}