# Map NCBI and Ensembl files (#23)
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that may be interpreted or
# compiled differently than it appears. Review it in an editor that reveals hidden Unicode characters.
# Maps NCBI and Ensembl annotation files to assembly metadata on a schedule.
name: Map NCBI and Ensembl files

on:
  schedule:
    # Minute 0, hour 0, every Monday.
    - cron: '0 0 * * 1'
  # Also allow the workflow to be started manually.
  workflow_dispatch:

# The final step pushes a commit using GITHUB_TOKEN, which requires write
# access to repository contents. Declaring it here keeps the push working
# even when the repository default for the token is read-only.
permissions:
  contents: write

jobs:
  map-files:
    runs-on: ubuntu-latest
    steps:
- name: Checkout code
  # v4 replaces v3, which targets the deprecated Node 16 action runtime.
  uses: actions/checkout@v4
- name: Install Dependencies
  run: |
    # Download the NCBI command-line tools (datasets, dataformat) into the
    # working directory and make them executable.
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page under the tool name; --location follows redirects.
    base_url='https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/v2/linux-amd64'
    for tool in datasets dataformat; do
      curl --fail --silent --show-error --location --retry 3 -O "$base_url/$tool"
    done
    chmod +x datasets dataformat
- name: Set Environment Variables
  run: |
    # Field list passed to `dataformat`; it also forms the middle columns of
    # the output header.
    dataset_fields=accession,assminfo-name,organism-name,organism-tax-id
    # Header row: commas become tabs; the two literal "\t" sequences are left
    # for whoever writes the header to expand.
    header_row="annotation_name\t$(tr ',' '\t' <<< "$dataset_fields")\tfull_path"
    # Export every value to the following steps in one append.
    {
      # datasets
      echo "FIELDS=$dataset_fields"
      # input files
      echo "NCBI=ncbi.tsv"
      echo "ENSEMBL_RR=ensembl_rapid_release.tsv"
      # assembly info
      echo "ASSEMBLY_TABLE=assemblies_table.tsv"
      # output file header
      echo "HEADER=$header_row"
      # output file name
      echo "OUTPUT=mapped_annotations.tsv"
      # tmp files
      echo "MERGED_TABLE=merged_table.tsv"
      echo "NEW_ANNOTATIONS=new_annotations.tsv"
    } >> "$GITHUB_ENV"
- name: Init Output File
  run: |
    # Create the output table containing only its header row when it does not
    # exist yet; an existing file is left untouched. `echo -e` expands the
    # "\t" escapes stored in HEADER.
    [[ -f "$OUTPUT" ]] || echo -e "$HEADER" > "$OUTPUT"
- name: Collect and Filter Data from TSVs
  run: |
    # Build the merged table (annotation name, accession, full path) from the
    # two input TSVs, skipping the first (header) line of each.
    ncbi="${{ env.NCBI }}"
    ensembl_rr="${{ env.ENSEMBL_RR }}"
    merged_table="${{ env.MERGED_TABLE }}"
    { tail -n +2 "$ncbi"; tail -n +2 "$ensembl_rr"; } | awk -F'\t' '
      {
        # Column 1: accession. Column 2: full path or URL of the annotation.
        accession = $1
        # Annotation name = last path component without its .gff.gz or
        # .gff3.gz suffix. Only POSIX awk features (split/sub) are used, so
        # this does not depend on gawk and its three-argument match().
        n = split($2, parts, "/")
        filename = parts[n]
        if (filename ~ /.\.gff3?\.gz$/) {
          sub(/\.gff3?\.gz$/, "", filename)
        } else {
          # Same result as before for paths that are not GFF archives.
          filename = ""
        }
        print filename "\t" accession "\t" $2
      }' > "$merged_table"
- name: Filter Out Existing Annotations
  run: |
    new_annotations="${{ env.NEW_ANNOTATIONS }}"
    merged_table="${{ env.MERGED_TABLE }}"
    existing_annotations="${{ env.OUTPUT }}"
    # Keep the merged rows whose annotation name (column 1) is not already in
    # the output file. Only the output file has a header row to skip: the
    # merged table is written without one, so all of its rows are candidates
    # (skipping its first row would silently drop a real annotation).
    # Fields are split on tabs only, matching how both tables are written.
    awk -F'\t' '
      FILENAME == ARGV[1] { if (FNR > 1) exclude[$1]; next }
      !($1 in exclude)
    ' "$existing_annotations" "$merged_table" > "$new_annotations"
    # The merged table is temporary; remove it before the early exit below so
    # it is never left behind in the workspace.
    rm "$merged_table"
    # Check if the output file is empty
    if [ ! -s "$new_annotations" ]; then
      echo "Output file is empty. Exiting.."
      exit 0
    fi
- name: Get New Assemblies NCBI Metadata
  run: |
    # Make the step fail when `datasets` fails, not only when `dataformat`
    # (the last command of the pipeline) does.
    set -o pipefail
    new_annotations="${{ env.NEW_ANNOTATIONS }}"
    assembly_table="${{ env.ASSEMBLY_TABLE }}"
    # Nothing to look up: leave an empty assembly table and skip the query.
    if [ ! -s "$new_annotations" ]; then
      echo "No new annotations; skipping NCBI metadata lookup."
      : > "$assembly_table"
      exit 0
    fi
    # Column 2 of the new-annotations table holds the accession; pass each
    # non-empty accession to `datasets` once.
    tmp=tmp.txt
    awk -F'\t' '$2 != "" { print $2 }' "$new_annotations" | sort -u > "$tmp"
    ./datasets summary genome accession --inputfile "$tmp" --as-json-lines |
      ./dataformat tsv genome --fields "${{ env.FIELDS }}" > "$assembly_table"
    rm "$tmp"
- name: Insert new annotations
  run: |
    new_annotations="${{ env.NEW_ANNOTATIONS }}"
    existing_annotations="${{ env.OUTPUT }}"
    assembly_table="${{ env.ASSEMBLY_TABLE }}"
    # Append one row per new annotation whose accession is present in the
    # assembly table: annotation name, the full assembly-table row, full path.
    # The first row of the assembly table is treated as a header and skipped.
    # The new-annotations table is written without a header, so every one of
    # its rows is processed (skipping its first row would lose an annotation).
    # Fields are split on tabs only, so values containing spaces stay intact.
    if [ -s "$new_annotations" ] && [ -s "$assembly_table" ]; then
      awk -F'\t' '
        NR == FNR { if (FNR > 1) assembly[$1] = $0; next }
        {
          accession = $2   # column 2 of new_annotations: accession
          full_path = $3   # column 3 of new_annotations: full path
          if (accession in assembly) {
            print $1 "\t" assembly[accession] "\t" full_path
          }
        }
      ' "$assembly_table" "$new_annotations" >> "$existing_annotations"
    fi
    # Remove the temporary tables and the downloaded CLI binaries; -f keeps
    # the cleanup from failing when one of them is already gone.
    rm -f "$assembly_table" "$new_annotations" datasets dataformat
- name: Commit & Push changes
  # NOTE(review): `@master` is a mutable ref; consider pinning a release tag
  # or commit SHA once the intended version is confirmed.
  uses: actions-js/push@master
  with:
    # Token the action uses to push.
    github_token: "${{ secrets.GITHUB_TOKEN }}"