Skip to content

Commit

Permalink
Use floor rather than round for phred scores
Browse files Browse the repository at this point in the history
  • Loading branch information
rhpvorderman committed Aug 9, 2024
1 parent eda2c55 commit 1c5a428
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ Changelog
version 0.11.0-dev
------------------
+ Fix a bug where the average phred score per read would be rounded, not
floored. This would lead reads with a phred score such as 9.7 to be counted
towards the Q>=10 results.
+ Replace some of the hand vectorized code with more generic code that can be
automatically be optimized by the compiler. This should make things faster on
Windows and ARM64 platforms.
Expand Down
4 changes: 3 additions & 1 deletion src/sequali/_qcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -1849,7 +1849,9 @@ QCMetrics_add_meta(QCMetrics *self, struct FastqMeta *meta)
meta->accumulated_error_rate = accumulated_error_rate;
double average_error_rate = accumulated_error_rate / (double)sequence_length;
double average_phred = -10.0 * log10(average_error_rate);
uint64_t phred_score_index = (uint64_t)round(average_phred);
// Floor the average phred so q9.7 does not get represented as q10 but
// q9. Otherwise the Q>=10 count is going to be off.
uint64_t phred_score_index = (uint64_t)floor(average_phred);
assert(phred_score_index >= 0);
assert(phred_score_index <= PHRED_MAX);
self->phred_scores[phred_score_index] += 1;
Expand Down
10 changes: 7 additions & 3 deletions tests/test_qc_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_qc_metrics():
phred_content = metrics.phred_scores()
this_read_error = (10 ** -1) * 25 + (10 ** -3) * 25
this_read_phred = -10 * math.log10(this_read_error / len(sequence))
phred_index = round(this_read_phred)
phred_index = math.floor(this_read_phred)
assert phred_content[phred_index] == 1
assert sum(phred_content) == 1
phred_array = metrics.phred_count_table()
Expand Down Expand Up @@ -81,8 +81,12 @@ def test_long_sequence():
# properly.
sequence = 4096 * 'A' + 4096 * 'C'
qualities = 8192 * chr(20 + 33)
# Use a sum range and multiply by four because that is how the algorithm
# in sequali works. Keeping 4 separate counters for efficiency.
calculated_phred = -10 * math.log10(sum(0.01 for _ in range(2048)) * 4 / 8192)
floored_phred = math.floor(calculated_phred)
metrics.add_read(FastqRecordView("name", sequence, qualities))
assert metrics.phred_scores()[20] == 1
assert metrics.phred_scores()[floored_phred] == 1
assert metrics.gc_content()[50] == 1


Expand All @@ -97,4 +101,4 @@ def test_average_long_quality():
error_rate = (1 + 19999 * 10 ** -5) / 20000
phred = -10 * math.log10(error_rate)
metrics.add_read(FastqRecordView("name", sequence, qualities))
assert metrics.phred_scores()[round(phred)] == 1
assert metrics.phred_scores()[math.floor(phred)] == 1

0 comments on commit 1c5a428

Please sign in to comment.