Skip to content

Commit

Permalink
attempt fix blast report
Browse files Browse the repository at this point in the history
  • Loading branch information
gregdenay committed Jul 24, 2024
1 parent 6f5940a commit 65cdbdc
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 39 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
### 1.7.1

#### Bugfix

Fixing some issues with blast report generation when using newer BLAST databases. Source of the problemis still unclear but appears to be linked to new realeses of the BLAST db and concerns only specific taxids or sequences.

### 1.7.0

#### Breaking changes
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.7.0
1.7.1
2 changes: 1 addition & 1 deletion workflow/envs/blast.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ channels:
- bioconda
- defaults
dependencies:
- blast=2.12.0
- blast=2.16.0
78 changes: 41 additions & 37 deletions workflow/rules/blast.smk
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ rule blast_otus:
-qcov_hsp_perc {params.qcov} $masking \
-outfmt '6 qseqid sseqid evalue pident bitscore sacc staxid length mismatch gaps stitle' \
-num_threads {threads} \
-mt_mode 1 \
2> {log}
sed -i '1 i\query\tsubject\tevalue\tidentity\tbitscore\tsubject_acc\tsubject_taxid\talignment_length\tmismatch\tgaps\tsubject_name' {output.report}
Expand Down Expand Up @@ -235,49 +236,52 @@ rule blast_stats:
"{sample}/reports/{sample}_blast_stats.tsv",
params:
bit_diff=config["bit_score_diff"],
sample=lambda w: w.sample,
message:
"[{wildcards.sample}][assignement] collecting BLAST stats"
conda:
"../envs/pandas.yaml"
log:
"logs/{sample}/blast_stats.log",
shell:
"""
exec 2> {log}
if [ -s {input.blast} ]
then
# Get list of all OTUs
OTUs=$(grep "^>" {input.otus} | cut -d";" -f1 | tr -d '>' | sort -u)
for otu in $OTUs
do
size=$(grep -E "^>${{otu}}\>" {input.otus} | cut -d"=" -f2)
bhits=$(grep -c -E "^${{otu}};" {input.blast} || true)
if [ $bhits -eq 0 ]
then
# When there is no blast hit
echo "{wildcards.sample}\t$otu\t$size\t0\t0\t0\t0\t0\t-\t-\t-\t- (1.0)\t../{input.blast}\t../{input.filtered}" >> {output}
else
# Otherwise collect and print stats to file
bit_best=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -rn | head -n1)
bit_low=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -n | head -n1)
bit_thr=$(($bit_best - {params.bit_diff}))
shits=$(grep -c -E "^${{otu}}\>" {input.filtered})
cons=$(grep -E "^${{otu}}\>" {input.lca} | cut -d'\t' -f2-5)
echo "{wildcards.sample}\t$otu\t$size\t$bhits\t$bit_best\t$bit_low\t$bit_thr\t$shits\t$cons\t../{input.blast}\t../{input.filtered}" >> {output}
fi
done
# Sort by size and add header (just to get hits on top)
sort -k3,3nr -o {output} {output}
sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output}
else
echo "{wildcards.sample}\t-\t-\t0\t0\t0\t0\t0\t-\t-\t-\t-\t../{input.blast}\t../{input.filtered}" > {output}
sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output}
fi
"""
script:
"../scripts/blast_stats.py"
# shell:
# """
# exec 2> {log}

# if [ -s {input.blast} ]
# then
# # Get list of all OTUs
# OTUs=$(grep "^>" {input.otus} | cut -d";" -f1 | tr -d '>' | sort -u)

# for otu in $OTUs
# do
# size=$(grep -E "^>${{otu}}\>" {input.otus} | cut -d"=" -f2)
# bhits=$(grep -c -E "^${{otu}};" {input.blast} || true)
# if [ $bhits -eq 0 ]
# then
# # When there is no blast hit
# echo "{wildcards.sample}\t$otu\t$size\t0\t0\t0\t0\t0\t-\t-\t-\t- (1.0)\t../{input.blast}\t../{input.filtered}" >> {output}
# else
# # Otherwise collect and print stats to file
# bit_best=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -rn | head -n1)
# bit_low=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -n | head -n1)
# bit_thr=$(($bit_best - {params.bit_diff}))
# shits=$(grep -c -E "^${{otu}}\>" {input.filtered})
# cons=$(grep -E "^${{otu}}\>" {input.lca} | cut -d'\t' -f2-5)

# echo "{wildcards.sample}\t$otu\t$size\t$bhits\t$bit_best\t$bit_low\t$bit_thr\t$shits\t$cons\t../{input.blast}\t../{input.filtered}" >> {output}
# fi
# done
# # Sort by size and add header (just to get hits on top)
# sort -k3,3nr -o {output} {output}
# sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output}

# else
# echo "{wildcards.sample}\t-\t-\t0\t0\t0\t0\t0\t-\t-\t-\t-\t../{input.blast}\t../{input.filtered}" > {output}
# sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output}
# fi
# """


rule collect_blast_stats:
Expand Down
135 changes: 135 additions & 0 deletions workflow/scripts/blast_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import sys


sys.stderr = open(snakemake.log[0], "w")


import os
import pandas as pd


HEADER = "\t".join([
"Sample",
"Query",
"Count",
"Blast hits",
"Best bit-score",
"Lowest bit-score",
"Bit-score threshold",
"Saved Blast hits",
"Consensus",
"Rank",
"Taxid",
"Disambiguation",
"link_report",
"link_filtered"
])


def parse_fasta(fasta):
ids, sizes = [], []
with open(fasta, "r") as fi:
for l in fi:
if l[0] ==">":
ids.append(l.split(";")[0][1:].strip())
sizes.append(l.split("=")[1].strip())
return ids, sizes


def count_blasthits(id, report):
i = 0
with open(report, "r") as fi:
for l in fi:
if l.split(";")[0] == id:
i += 1
return i


def main(otus_in, blast_in, filtered_in, lca_in, report_out, bit_diff, sample):
with open(report_out, "w") as fo:
fo.write(f"{HEADER}\n")
if not os.path.isfile(blast_in) or os.stat(blast_in).st_size == 0:
with open(report_out, "a") as fo:
fo.write("\t".join([
sample,
"-",
"-",
"0",
"0",
"0",
"0",
"0",
"-",
"-",
"-",
"-",
f"../{blast_in}",
f"../{filtered_in}"
])+"\n")

else:
otus, sizes = parse_fasta(otus_in)
blast_df = pd.read_csv(blast_in, sep="\t")
filtered_df = pd.read_csv(filtered_in, sep="\t")
lca_df = pd.read_csv(lca_in, sep="\t")

for otu, size in zip(otus, sizes):
bhits = count_blasthits(otu, blast_in)
if bhits == 0:
with open(report_out, "a") as fo:
fo.write("\t".join([
sample,
otu,
str(size),
"0",
"0",
"0",
"0",
"0",
"-",
"-",
"-",
"- (1.0)",
f"../{blast_in}",
f"../{filtered_in}"
])+"\n")
else:
bit_best = blast_df["bitscore"].max()
bit_low = blast_df["bitscore"].min()
bit_thr = bit_best - bit_diff
shits = filtered_df["query"].str.count(otu).sum()
cons = lca_df[lca_df["queryID"] == otu]

with open(report_out, "a") as fo:
fo.write("\t".join([
sample,
otu,
str(size),
str(bhits),
str(bit_best),
str(bit_low),
str(bit_thr),
str(shits),
cons["Consensus"].values[0],
cons["Rank"].values[0],
str(cons["Taxid"].values[0]),
cons["Disambiguation"].values[0],
f"../{blast_in}",
f"../{filtered_in}"
])+"\n")


if __name__ == '__main__':
main(
snakemake.input['otus'],
snakemake.input['blast'],
snakemake.input['filtered'],
snakemake.input['lca'],
snakemake.output[0],
snakemake.params['bit_diff'],
snakemake.params['sample'],
)

0 comments on commit 65cdbdc

Please sign in to comment.