Skip to content

Commit

Permalink
Merge pull request #170 from rhpvorderman/release_0.10.0
Browse files Browse the repository at this point in the history
Release 0.10.0
  • Loading branch information
rhpvorderman authored Jun 3, 2024
2 parents df22ab5 + 5ad7a47 commit 802a0f9
Show file tree
Hide file tree
Showing 12 changed files with 470 additions and 99 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@ Changelog
.. This document is user facing. Please word the changes in such a way
.. that users understand how the changes affect the new version.
version 0.10.0
------------------
+ Make overrepresented sequences table scrollable and smaller so it is easier
to skip over when lots of entries are found.
+ Overrepresented sequence analysis now only counts unique fragments per read.
Fragments that are duplicated inside the read are counted only once. This
prevents long stretches of genomic repeats getting higher reported
frequencies.
+ Speed up sequence identity calculations on AVX2 enabled CPUs by using a
reverse-diagonal parallelized version of Smith-Waterman.

version 0.9.1
-----------------
+ Fix an issue where the insert size metrics module would crash when no
Expand Down
9 changes: 6 additions & 3 deletions docs/module_options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,12 @@ such the adapter sequences will always be sampled in the same frame.
sequence is sampled when the length is not divisible by the size of the
fragments.

Fragments are stored and counted in a hash table. When the hash table is full
only fragments that are already present will be counted. To diminish the time
spent on the module, by default 1 in 8 sequences is analysed.
For each sequence only unique fragments within the sequence are sampled, to
avoid overrepresenting common genomic repeats.
unique fragments are then stored and counted in a hash table. When the hash
table is full only fragments that are already present will be counted.
To diminish the time spent on the module, by default 1 in 8 sequences is
analysed.

After the module is run, stored fragments are checked for their counts. If the
count exceeds a certain threshold it is considered overrepresented. Sequali
Expand Down
6 changes: 4 additions & 2 deletions scripts/benchmark_sequence_identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,7 @@
data = json.load(f)
sequence_dicts = data["overrepresented_sequences"]["overrepresented_sequences"]
for seqdict in sequence_dicts:
identify_sequence_builtin(seqdict["sequence"])

best_match = seqdict["best_match"]
total, max, found_match = identify_sequence_builtin(seqdict["sequence"])
if best_match != found_match:
raise ValueError(f"{best_match} != {found_match}")
63 changes: 61 additions & 2 deletions src/sequali/_qcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -3154,6 +3154,8 @@ typedef struct _SequenceDuplicationStruct {
size_t fragment_length;
uint64_t number_of_sequences;
uint64_t sampled_sequences;
uint64_t staging_hash_table_size;
uint64_t *staging_hash_table;
uint64_t hash_table_size;
uint64_t *hashes;
uint32_t *counts;
Expand All @@ -3166,6 +3168,7 @@ typedef struct _SequenceDuplicationStruct {
static void
SequenceDuplication_dealloc(SequenceDuplication *self)
{
PyMem_Free(self->staging_hash_table);
PyMem_Free(self->hashes);
PyMem_Free(self->counts);
Py_TYPE(self)->tp_free((PyObject *)self);
Expand Down Expand Up @@ -3232,6 +3235,8 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
self->hash_table_size = hash_table_size;
self->total_fragments = 0;
self->fragment_length = fragment_length;
self->staging_hash_table_size = 0;
self->staging_hash_table = NULL;
self->hashes = hashes;
self->counts = counts;
self->sample_every = sample_every;
Expand Down Expand Up @@ -3265,6 +3270,44 @@ Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash)
}
}

static int
SequenceDuplication_resize_staging(SequenceDuplication *self, uint64_t new_size)
{
if (new_size <= self->staging_hash_table_size) {
return 0;
}
uint64_t *tmp = PyMem_Realloc(self->staging_hash_table,
new_size * sizeof(uint64_t));
if (tmp == NULL) {
PyErr_NoMemory();
return -1;
}
self->staging_hash_table = tmp;
self->staging_hash_table_size = new_size;
return 0;
}

static inline void
add_to_staging(uint64_t *staging_hash_table, uint64_t staging_hash_table_size,
uint64_t hash)
{
/* Works because size is a power of 2 */
uint64_t hash_to_index_int = staging_hash_table_size - 1;
uint64_t index = hash & hash_to_index_int;
while (true) {
uint64_t current_entry = staging_hash_table[index];
if (current_entry == 0) {
staging_hash_table[index] = hash;
break;
} else if (current_entry == hash) {
break;
}
index += 1;
index &= hash_to_index_int;
}
return;
}

static int
SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
{
Expand Down Expand Up @@ -3299,6 +3342,16 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
still all of the sequence is being sampled.
*/
Py_ssize_t total_fragments = (sequence_length + fragment_length - 1) / fragment_length;
size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5));
size_t staging_hash_size = 1ULL << staging_hash_bits;
if (staging_hash_size > self->staging_hash_table_size) {
if (SequenceDuplication_resize_staging(self, staging_hash_size) < 0) {
return -1;
}
}
uint64_t *staging_hash_table = self->staging_hash_table;
memset(staging_hash_table, 0 , staging_hash_size * sizeof(uint64_t));

Py_ssize_t from_mid_point_fragments = total_fragments / 2;
Py_ssize_t mid_point = sequence_length - (from_mid_point_fragments * fragment_length);
bool warn_unknown = false;
Expand All @@ -3313,7 +3366,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
}
fragments += 1;
uint64_t hash = wanghash64(kmer);
Sequence_duplication_insert_hash(self, hash);
add_to_staging(staging_hash_table, staging_hash_size, hash);
}

// Sample back sequences
Expand All @@ -3327,7 +3380,13 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta)
}
fragments += 1;
uint64_t hash = wanghash64(kmer);
Sequence_duplication_insert_hash(self, hash);
add_to_staging(staging_hash_table, staging_hash_size, hash);
}
for (size_t i=0; i<staging_hash_size; i++) {
uint64_t hash = staging_hash_table[i];
if (hash != 0) {
Sequence_duplication_insert_hash(self, hash);
}
}
if (warn_unknown) {
PyObject *culprit = PyUnicode_DecodeASCII((char *)sequence, sequence_length, NULL);
Expand Down
Loading

0 comments on commit 802a0f9

Please sign in to comment.