Use floor rather than round for phred scores

rhpvorderman · Aug 9, 2024 · 1c5a428 · 1c5a428
1 parent eda2c55
commit 1c5a428
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 4 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -9,6 +9,9 @@ Changelog
 
 version 0.11.0-dev
 ------------------
++ Fix a bug where the average phred score per read would be rounded, not
+  floored. This would lead reads with a phred score such as 9.7 to be counted
+  towards the Q>=10 results.
 + Replace some of the hand vectorized code with more generic code that can be
   automatically be optimized by the compiler. This should make things faster on
   Windows and ARM64 platforms.

diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
@@ -1849,7 +1849,9 @@ QCMetrics_add_meta(QCMetrics *self, struct FastqMeta *meta)
     meta->accumulated_error_rate = accumulated_error_rate;
     double average_error_rate = accumulated_error_rate / (double)sequence_length;
     double average_phred = -10.0 * log10(average_error_rate);
-    uint64_t phred_score_index = (uint64_t)round(average_phred);
+    // Floor the average phred so q9.7 does not get represented as q10 but
+    // q9. Otherwise the Q>=10 count is going to be off.
+    uint64_t phred_score_index = (uint64_t)floor(average_phred);
     assert(phred_score_index >= 0);
     assert(phred_score_index <= PHRED_MAX);
     self->phred_scores[phred_score_index] += 1;

diff --git a/tests/test_qc_metrics.py b/tests/test_qc_metrics.py
@@ -42,7 +42,7 @@ def test_qc_metrics():
     phred_content = metrics.phred_scores()
     this_read_error = (10 ** -1) * 25 + (10 ** -3) * 25
     this_read_phred = -10 * math.log10(this_read_error / len(sequence))
-    phred_index = round(this_read_phred)
+    phred_index = math.floor(this_read_phred)
     assert phred_content[phred_index] == 1
     assert sum(phred_content) == 1
     phred_array = metrics.phred_count_table()
@@ -81,8 +81,12 @@ def test_long_sequence():
     # properly.
     sequence = 4096 * 'A' + 4096 * 'C'
     qualities = 8192 * chr(20 + 33)
+    # Use a sum range and multiply by four because that is how the algorithm
+    # in sequali works. Keeping 4 separate counters for efficiency.
+    calculated_phred = -10 * math.log10(sum(0.01 for _ in range(2048)) * 4 / 8192)
+    floored_phred = math.floor(calculated_phred)
     metrics.add_read(FastqRecordView("name", sequence, qualities))
-    assert metrics.phred_scores()[20] == 1
+    assert metrics.phred_scores()[floored_phred] == 1
     assert metrics.gc_content()[50] == 1
 
 
@@ -97,4 +101,4 @@ def test_average_long_quality():
     error_rate = (1 + 19999 * 10 ** -5) / 20000
     phred = -10 * math.log10(error_rate)
     metrics.add_read(FastqRecordView("name", sequence, qualities))
-    assert metrics.phred_scores()[round(phred)] == 1
+    assert metrics.phred_scores()[math.floor(phred)] == 1