Merge pull request #170 from rhpvorderman/release_0.10.0

Release 0.10.0
rhpvorderman · Jun 3, 2024 · 802a0f9 · 802a0f9
2 parents df22ab5 + 5ad7a47
commit 802a0f9
Show file tree

Hide file tree

Showing 12 changed files with 470 additions and 99 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,17 @@ Changelog
 .. This document is user facing. Please word the changes in such a way
 .. that users understand how the changes affect the new version.
 
+version 0.10.0
+------------------
++ Make overrepresented sequences table scrollable and smaller so it is easier
+  to skip over when lots of entries are found.
++ Overrepresented sequence analysis now only counts unique fragments per read.
+  Fragments that are duplicated inside the read are counted only once. This
+  prevents long stretches of genomic repeats getting higher reported
+  frequencies.
++ Speed up sequence identity calculations on AVX2 enabled CPUs by using a
+  reverse-diagonal parallelized version of Smith-Waterman.
+
 version 0.9.1
 -----------------
 + Fix an issue where the insert size metrics module would crash when no

diff --git a/docs/module_options.rst b/docs/module_options.rst
@@ -64,9 +64,12 @@ such the adapter sequences will always be sampled in the same frame.
     sequence is sampled when the length is not divisible by the size of the
     fragments.
 
-Fragments are stored and counted in a hash table. When the hash table is full
-only fragments that are already present will be counted. To diminish the time
-spent on the module, by default 1 in 8 sequences is analysed.
+For each sequence only unique fragments within the sequence are sampled, to
+avoid overrepresenting common genomic repeats.
+unique fragments are then stored and counted in a hash table. When the hash
+table is full only fragments that are already present will be counted.
+To diminish the time spent on the module, by default 1 in 8 sequences is
+analysed.
 
 After the module is run, stored fragments are checked for their counts. If the
 count exceeds a certain threshold it is considered overrepresented. Sequali

diff --git a/scripts/benchmark_sequence_identity.py b/scripts/benchmark_sequence_identity.py
@@ -10,5 +10,7 @@
         data = json.load(f)
     sequence_dicts = data["overrepresented_sequences"]["overrepresented_sequences"]
     for seqdict in sequence_dicts:
-        identify_sequence_builtin(seqdict["sequence"])
-
+        best_match = seqdict["best_match"]
+        total, max, found_match = identify_sequence_builtin(seqdict["sequence"])
+        if best_match != found_match:
+            raise ValueError(f"{best_match} != {found_match}")
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
@@ -3154,6 +3154,8 @@ typedef struct _SequenceDuplicationStruct {
     size_t fragment_length;
     uint64_t number_of_sequences;
     uint64_t sampled_sequences;
+    uint64_t staging_hash_table_size; 
+    uint64_t *staging_hash_table;
     uint64_t hash_table_size;
     uint64_t *hashes; 
     uint32_t *counts;
@@ -3166,6 +3168,7 @@ typedef struct _SequenceDuplicationStruct {
 static void 
 SequenceDuplication_dealloc(SequenceDuplication *self)
 {
+    PyMem_Free(self->staging_hash_table);
     PyMem_Free(self->hashes);
     PyMem_Free(self->counts);
     Py_TYPE(self)->tp_free((PyObject *)self);
@@ -3232,6 +3235,8 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     self->hash_table_size = hash_table_size;
     self->total_fragments = 0;
     self->fragment_length = fragment_length;
+    self->staging_hash_table_size = 0;
+    self->staging_hash_table = NULL;
     self->hashes = hashes;
     self->counts = counts;
     self->sample_every = sample_every;
@@ -3265,6 +3270,44 @@ Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash)
     }
 }
 
+static int 
+SequenceDuplication_resize_staging(SequenceDuplication *self, uint64_t new_size)
+{
+    if (new_size <= self->staging_hash_table_size) {
+        return 0;
+    }
+    uint64_t *tmp = PyMem_Realloc(self->staging_hash_table, 
+                                  new_size * sizeof(uint64_t));
+    if (tmp == NULL) {
+        PyErr_NoMemory();
+        return -1; 
+    }
+    self->staging_hash_table = tmp;
+    self->staging_hash_table_size = new_size;
+    return 0;
+}
+
+static inline void 
+add_to_staging(uint64_t *staging_hash_table, uint64_t staging_hash_table_size, 
+               uint64_t hash) 
+{
+    /* Works because size is a power of 2 */
+    uint64_t hash_to_index_int = staging_hash_table_size - 1; 
+    uint64_t index = hash & hash_to_index_int;
+    while (true) {
+        uint64_t current_entry  = staging_hash_table[index];
+        if (current_entry == 0) {
+            staging_hash_table[index] = hash;
+            break;
+        } else if (current_entry == hash) {
+            break;
+        }
+        index += 1; 
+        index &= hash_to_index_int;
+    }
+    return;
+}
+
 static int
 SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
 {
@@ -3299,6 +3342,16 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
        still all of the sequence is being sampled.
     */
     Py_ssize_t total_fragments = (sequence_length + fragment_length - 1) / fragment_length;
+    size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5));
+    size_t staging_hash_size = 1ULL << staging_hash_bits;
+    if (staging_hash_size > self->staging_hash_table_size) {
+        if (SequenceDuplication_resize_staging(self, staging_hash_size) < 0) {
+            return -1;
+        }
+    }
+    uint64_t *staging_hash_table = self->staging_hash_table;
+    memset(staging_hash_table, 0 , staging_hash_size * sizeof(uint64_t));
+
     Py_ssize_t from_mid_point_fragments = total_fragments / 2;
     Py_ssize_t mid_point = sequence_length - (from_mid_point_fragments * fragment_length);
     bool warn_unknown = false;
@@ -3313,7 +3366,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
         }
         fragments += 1;
         uint64_t hash = wanghash64(kmer);
-        Sequence_duplication_insert_hash(self, hash);
+        add_to_staging(staging_hash_table, staging_hash_size, hash);    
     }
 
     // Sample back sequences
@@ -3327,7 +3380,13 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
         }
         fragments += 1;
         uint64_t hash = wanghash64(kmer);
-        Sequence_duplication_insert_hash(self, hash);
+        add_to_staging(staging_hash_table, staging_hash_size, hash);    
+    }
+    for (size_t i=0; i<staging_hash_size; i++) {
+        uint64_t hash = staging_hash_table[i];
+        if (hash != 0) {
+            Sequence_duplication_insert_hash(self, hash);
+        }
     }
     if (warn_unknown) {
         PyObject *culprit = PyUnicode_DecodeASCII((char *)sequence, sequence_length, NULL);