diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 119675b..bb5614a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ Changelog .. This document is user facing. Please word the changes in such a way .. that users understand how the changes affect the new version. +version 0.13.0-dev +------------------ ++ Only sample the first 100 bp from the beginning and end of the read by + default for the overrepresented sequences analysis. This prevents a lot of + false positives from common human genome repeats. The amount of base pairs + that are sampled from the beginning and end is user settable with an option + to sample everything. + version 0.12.0 ------------------ + Properly name percentiles as such in the sequence length distribution rather diff --git a/docs/_static/images/overrepresented_sampling.fodg b/docs/_static/images/overrepresented_sampling.fodg index 27b19a0..26205c3 100644 --- a/docs/_static/images/overrepresented_sampling.fodg +++ b/docs/_static/images/overrepresented_sampling.fodg @@ -1,11 +1,11 @@ - Ruben Vorderman2024-03-27T12:13:53.658159835LibreOffice/7.4.7.2$Linux_X86_64 LibreOffice_project/40$Build-22024-03-27T12:27:29.181257342Ruben VordermanPT13M36S3 + Ruben Vorderman2024-03-27T12:13:53.658159835LibreOffice/7.4.7.2$Linux_X86_64 LibreOffice_project/40$Build-22024-11-20T11:17:18.804415312Ruben VordermanPT20M2S4 - -5024 - -849 + -1890 + -1872 44763 30807 @@ -35,10 +35,10 @@ true 4 0 - -5024 - -849 - 31530 - 21700 + -1890 + -1872 + 33617 + 21741 1000 1000 500 @@ -100,8 +100,8 @@ false false false - lAH+/0dlbmVyaWMgUHJpbnRlcgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAU0dFTlBSVAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAWAAMAtQAAAAAAAAAEAAhSAAAEdAAASm9iRGF0YSAxCnByaW50ZXI9R2VuZXJpYyBQcmludGVyCm9yaWVudGF0aW9uPVBvcnRyYWl0CmNvcGllcz0xCmNvbGxhdGU9ZmFsc2UKbWFyZ2luYWRqdXN0bWVudD0wLDAsMCwwCmNvbG9yZGVwdGg9MjQKcHNsZXZlbD0wCnBkZmRldmljZT0xCmNvbG9yZGV2aWNlPTAKUFBEQ29udGV4dERhdGEKUGFnZVNpemU6QTQAABIAQ09NUEFUX0RVUExFWF9NT0RFDwBEdXBsZXhNb2RlOjpPZmY= - Generic Printer + owH+/0hQX0VOVllfNzY0MF9zZXJpZXNfNEMzRDAzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQ1VQUzpIUF9FTlZZXzc2NDBfc2VyaWVzXzRDM0QwMwAWAAMAxAAAAAAAAAAIAFZUAAAkbQAASm9iRGF0YSAxCnByaW50ZXI9SFBfRU5WWV83NjQwX3Nlcmllc180QzNEMDMKb3JpZW50YXRpb249UG9ydHJhaXQKY29waWVzPTEKY29sbGF0ZT1mYWxzZQptYXJnaW5hZGp1c3RtZW50PTAsMCwwLDAKY29sb3JkZXB0aD0yNApwc2xldmVsPTAKcGRmZGV2aWNlPTEKY29sb3JkZXZpY2U9MApQUERDb250ZXh0RGF0YQpQYWdlU2l6ZTpMZXR0ZXIAABIAQ09NUEFUX0RVUExFWF9NT0RFDwBEdXBsZXhNb2RlOjpPZmY= + HP_ENVY_7640_series_4C3D03 1250 @@ -131,9 +131,12 @@ + + + - + @@ -291,7 +294,7 @@ - + @@ -312,20 +315,24 @@ + + + + - + - + - + - + @@ -387,7 +394,7 @@ - + Sequence #1 @@ -395,7 +402,7 @@ barcode - + adapter @@ -423,11 +430,11 @@ - + - + @@ -435,19 +442,15 @@ - - - - - + - + - + @@ -503,6 +506,66 @@ + + Sequence #3 + + + + barcode + + + + adapter + + + + adapter + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/_static/images/overrepresented_sampling.svg b/docs/_static/images/overrepresented_sampling.svg index 3cf7716..45871bd 100644 --- a/docs/_static/images/overrepresented_sampling.svg +++ b/docs/_static/images/overrepresented_sampling.svg @@ -1,6 +1,6 @@ - + @@ -18,6 +18,7 @@ + @@ -59,9 +60,9 @@ - - - Sequence #1 + + + Sequence #1 @@ -73,9 +74,9 @@ - - - adapter + + + adapter @@ -122,16 +123,16 @@ - - - + + + - - - + + + @@ -143,122 +144,220 @@ - - - + + + - - - + + + - - - + + + - - - - - - - Sequence #2 - + barcode - + adapter - + adapter - + - + - + - + - + - + - + - + - + + + + + + Sequence #3 + + + + + + + barcode + + + + + + + adapter + + + + + + + adapter + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/module_options.rst b/docs/module_options.rst index a4d2cc9..4fd53e7 100644 --- a/docs/module_options.rst +++ b/docs/module_options.rst @@ -53,6 +53,10 @@ ends towards the middle. This means that the first and last fragment will always be the first 21 bp of the beginning and the last 21 bp in the end. As such the adapter sequences will always be sampled in the same frame. +To prevent many common genome repeats from ending up in the report, Sequali +limits the amount of sampling inwards to approximately 100 bp. With the default +fragment size of 21 bp, this means that 5 fragments are taken from each side. + .. figure:: _static/images/overrepresented_sampling.svg This figure shows how fragments are sampled in sequali. The silver elements @@ -60,9 +64,9 @@ such the adapter sequences will always be sampled in the same frame. sampled. Despite the length differences between sequence #1 and sequence #2 the fragments for the adapter and barcode sequences are the same. In Sequence #1 the fragments sampled from the back end overlap somewhat - with sequences from the front end. This is a necessity to ensure all of the - sequence is sampled when the length is not divisible by the size of the - fragments. + with sequences from the front end. In sequence #3 the sequence is quite + long so the middle is not sampled. This prevents many common genome repeats + from ending up in the report. For each sequence only unique fragments within the sequence are sampled, to avoid overrepresenting common genomic repeats. @@ -89,6 +93,10 @@ The following command line parameters affect this module: store. + ``--overrepresentation-sample-every``: How often a sequence is sampled. Default is every 8 sequences. ++ ``--overrepresentation-bases-from-start``: the minimum amount of bases to + take from the start of the sequence. Default is 100. ++ ``--overrepresentation-bases-from-end``: the minimum amount of bases to + take from the end of the sequence. Default is 100. .. [#F1] A canonical k-mer is the k-mer that has the lowest sort order compared to itself and its reverse complement. This way the canonical-kmer is diff --git a/src/sequali/__init__.py b/src/sequali/__init__.py index cea6b5d..4cfa811 100644 --- a/src/sequali/__init__.py +++ b/src/sequali/__init__.py @@ -17,7 +17,7 @@ from ._qc import A, C, G, N, T from ._qc import ( AdapterCounter, BamParser, FastqParser, FastqRecordArrayView, - FastqRecordView, PerTileQuality, QCMetrics, SequenceDuplication + FastqRecordView, OverrepresentedSequences, PerTileQuality, QCMetrics, ) from ._qc import NUMBER_OF_NUCS, NUMBER_OF_PHREDS, PHRED_MAX, TABLE_SIZE from ._version import __version__ @@ -32,7 +32,7 @@ "FastqRecordArrayView", "PerTileQuality", "QCMetrics", - "SequenceDuplication", + "OverrepresentedSequences", "NUMBER_OF_NUCS", "NUMBER_OF_PHREDS", "PHRED_MAX", diff --git a/src/sequali/__main__.py b/src/sequali/__main__.py index 0c7568e..2dc171f 100644 --- a/src/sequali/__main__.py +++ b/src/sequali/__main__.py @@ -23,6 +23,8 @@ from ._qc import ( AdapterCounter, + DEFAULT_BASES_FROM_END, + DEFAULT_BASES_FROM_START, DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS, DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH, DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET, @@ -34,9 +36,9 @@ DedupEstimator, InsertSizeMetrics, NanoStats, + OverrepresentedSequences, PerTileQuality, QCMetrics, - SequenceDuplication ) from ._version import __version__ from .adapters import DEFAULT_ADAPTER_FILE, adapters_from_file @@ -119,6 +121,22 @@ def argument_parser() -> argparse.ArgumentParser: f"gets filled up with more sequences from the " f"beginning. " f"Default: 1 in {DEFAULT_UNIQUE_SAMPLE_EVERY}.") + parser.add_argument("--overrepresentation-bases-from-start", type=int, + default=DEFAULT_BASES_FROM_START, + metavar="BP", + help=f"The minimum amount of bases sampled from the " + f"start of the read. There might be slight overshoot " + f"depending on the fragment length. Set to a " + f"negative value to sample the entire read. " + f"Default: {DEFAULT_BASES_FROM_START}") + parser.add_argument("--overrepresentation-bases-from-end", type=int, + default=DEFAULT_BASES_FROM_END, + metavar="BP", + help=f"The minimum amount of bases sampled from the " + f"end of the read. There might be slight overshoot " + f"depending on the fragment length. Set to a " + f"negative value to sample the entire read. " + f"Default: {DEFAULT_BASES_FROM_END}") parser.add_argument("--duplication-max-stored-fingerprints", type=int, default=DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS, metavar="N", @@ -188,7 +206,7 @@ def main() -> None: metrics1 = QCMetrics() per_tile_quality1 = PerTileQuality() nanostats1 = NanoStats() - sequence_duplication1 = SequenceDuplication( + overrepresented_sequences1 = OverrepresentedSequences( max_unique_fragments=args.overrepresentation_max_unique_fragments, fragment_length=args.overrepresentation_fragment_length, sample_every=args.overrepresentation_sample_every @@ -219,7 +237,7 @@ def main() -> None: insert_size_metrics = InsertSizeMetrics() metrics2 = QCMetrics() per_tile_quality2 = PerTileQuality() - sequence_duplication2 = SequenceDuplication( + overrepresented_sequences2 = OverrepresentedSequences( max_unique_fragments=args.overrepresentation_max_unique_fragments, fragment_length=args.overrepresentation_fragment_length, sample_every=args.overrepresentation_sample_every @@ -227,7 +245,7 @@ def main() -> None: else: metrics2 = None per_tile_quality2 = None - sequence_duplication2 = None + overrepresented_sequences2 = None insert_size_metrics = None with contextlib.ExitStack() as exit_stack: reader1 = NGSFile(args.input, threads - 1) @@ -253,7 +271,7 @@ def main() -> None: for record_array1 in reader1: metrics1.add_record_array(record_array1) per_tile_quality1.add_record_array(record_array1) - sequence_duplication1.add_record_array(record_array1) + overrepresented_sequences1.add_record_array(record_array1) nanostats1.add_record_array(record_array1) if paired: record_array2 = reader2.read(len(record_array1)) @@ -274,7 +292,7 @@ def main() -> None: insert_size_metrics.add_record_array_pair(record_array1, record_array2) # type: ignore # noqa: E501 metrics2.add_record_array(record_array2) # type: ignore # noqa: E501 per_tile_quality2.add_record_array(record_array2) # type: ignore # noqa: E501 - sequence_duplication2.add_record_array(record_array2) # type: ignore # noqa: E501 + overrepresented_sequences2.add_record_array(record_array2) # type: ignore # noqa: E501 else: adapter_counter1.add_record_array(record_array1) # type: ignore # noqa: E501 dedup_estimator.add_record_array(record_array1) @@ -289,14 +307,14 @@ def main() -> None: metrics=metrics1, adapter_counter=adapter_counter1, per_tile_quality=per_tile_quality1, - sequence_duplication=sequence_duplication1, + sequence_duplication=overrepresented_sequences1, dedup_estimator=dedup_estimator, nanostats=nanostats1, insert_size_metrics=insert_size_metrics, filename_reverse=args.input_reverse, metrics_reverse=metrics2, per_tile_quality_reverse=per_tile_quality2, - sequence_duplication_reverse=sequence_duplication2, + sequence_duplication_reverse=overrepresented_sequences2, adapters=adapters, fraction_threshold=fraction_threshold, min_threshold=min_threshold, diff --git a/src/sequali/_qc.pyi b/src/sequali/_qc.pyi index 06fd9ba..77fdaf8 100644 --- a/src/sequali/_qc.pyi +++ b/src/sequali/_qc.pyi @@ -32,6 +32,8 @@ DEFAULT_MAX_UNIQUE_FRAGMENTS: int DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS: int DEFAULT_FRAGMENT_LENGTH: int DEFAULT_UNIQUE_SAMPLE_EVERY: int +DEFAULT_BASES_FROM_START: int +DEFAULT_BASES_FROM_END: int DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH: int DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH: int DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET: int @@ -94,7 +96,7 @@ class PerTileQuality: def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ... def get_tile_counts(self) -> List[Tuple[int, List[float], List[int]]]: ... -class SequenceDuplication: +class OverrepresentedSequences: number_of_sequences: int sampled_sequences: int collected_unique_fragments: int @@ -106,7 +108,10 @@ class SequenceDuplication: def __init__(self, max_unique_fragments: int = DEFAULT_MAX_UNIQUE_FRAGMENTS, fragment_length: int = DEFAULT_FRAGMENT_LENGTH, - sample_every: int = DEFAULT_UNIQUE_SAMPLE_EVERY): ... + sample_every: int = DEFAULT_UNIQUE_SAMPLE_EVERY, + bases_from_start: int = DEFAULT_BASES_FROM_START, + bases_from_end = DEFAULT_BASES_FROM_END, + ): ... def add_read(self, __read: FastqRecordView) -> None: ... def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ... def sequence_counts(self) -> Dict[str, int]: ... diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c index bb8642e..57e3033 100644 --- a/src/sequali/_qcmodule.c +++ b/src/sequali/_qcmodule.c @@ -3135,9 +3135,9 @@ kmer_to_sequence(uint64_t kmer, size_t k, uint8_t *sequence) } } -/************************* - * SEQUENCE DUPLICATION * - *************************/ +/**************************** + * OverrepresentedSequences * + ****************************/ /* A module that cuts the sequence in bits of k size. The canonical (lowest) representation of the bit is used. @@ -3151,8 +3151,10 @@ kmer_to_sequence(uint64_t kmer, size_t k, uint8_t *sequence) #define DEFAULT_MAX_UNIQUE_FRAGMENTS 5000000 #define DEFAULT_FRAGMENT_LENGTH 21 #define DEFAULT_UNIQUE_SAMPLE_EVERY 8 +#define DEFAULT_BASES_FROM_START 100 +#define DEFAULT_BASES_FROM_END 100 -typedef struct _SequenceDuplicationStruct { +typedef struct _OverrepresentedSequencesStruct { PyObject_HEAD size_t fragment_length; uint64_t number_of_sequences; @@ -3166,10 +3168,12 @@ typedef struct _SequenceDuplicationStruct { uint64_t number_of_unique_fragments; uint64_t total_fragments; size_t sample_every; -} SequenceDuplication; + Py_ssize_t fragments_from_start; + Py_ssize_t fragments_from_end; +} OverrepresentedSequences; static void -SequenceDuplication_dealloc(SequenceDuplication *self) +OverrepresentedSequences_dealloc(OverrepresentedSequences *self) { PyMem_Free(self->staging_hash_table); PyMem_Free(self->hashes); @@ -3178,17 +3182,20 @@ SequenceDuplication_dealloc(SequenceDuplication *self) } static PyObject * -SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) +OverrepresentedSequences__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) { Py_ssize_t max_unique_fragments = DEFAULT_MAX_UNIQUE_FRAGMENTS; Py_ssize_t fragment_length = DEFAULT_FRAGMENT_LENGTH; Py_ssize_t sample_every = DEFAULT_UNIQUE_SAMPLE_EVERY; + Py_ssize_t bases_from_start = DEFAULT_BASES_FROM_START; + Py_ssize_t bases_from_end = DEFAULT_BASES_FROM_END; static char *kwargnames[] = {"max_unique_fragments", "fragment_length", - "sample_every", NULL}; - static char *format = "|nnn:SequenceDuplication"; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames, - &max_unique_fragments, &fragment_length, - &sample_every)) { + "sample_every", "bases_from_start", + "bases_from_end", NULL}; + static char *format = "|nnnnn:OverrepresentedSequences"; + if (!PyArg_ParseTupleAndKeywords( + args, kwargs, format, kwargnames, &max_unique_fragments, + &fragment_length, &sample_every, &bases_from_start, &bases_from_end)) { return NULL; } if (max_unique_fragments < 1) { @@ -3209,6 +3216,12 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) "sample_every must be 1 or greater. Got %zd", sample_every); return NULL; } + if (bases_from_start < 0) { + bases_from_start = UINT32_MAX; + } + if (bases_from_end < 0) { + bases_from_end = UINT32_MAX; + } /* If size is a power of 2, the modulo HASH_TABLE_SIZE can be optimised to a bitwise AND. Using 1.5 times as a base we ensure that the hashtable is utilized for at most 2/3. (Increased business degrades performance.) */ @@ -3221,7 +3234,7 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) PyMem_Free(counts); return PyErr_NoMemory(); } - SequenceDuplication *self = PyObject_New(SequenceDuplication, type); + OverrepresentedSequences *self = PyObject_New(OverrepresentedSequences, type); if (self == NULL) { PyMem_Free(hashes); PyMem_Free(counts); @@ -3239,11 +3252,15 @@ SequenceDuplication__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) self->hashes = hashes; self->counts = counts; self->sample_every = sample_every; + self->fragments_from_start = + (bases_from_start + fragment_length - 1) / fragment_length; + self->fragments_from_end = + (bases_from_end + fragment_length - 1) / fragment_length; return (PyObject *)self; } static void -Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash) +Sequence_duplication_insert_hash(OverrepresentedSequences *self, uint64_t hash) { uint64_t hash_to_index_int = self->hash_table_size - 1; uint64_t *hashes = self->hashes; @@ -3271,7 +3288,8 @@ Sequence_duplication_insert_hash(SequenceDuplication *self, uint64_t hash) } static int -SequenceDuplication_resize_staging(SequenceDuplication *self, uint64_t new_size) +OverrepresentedSequences_resize_staging(OverrepresentedSequences *self, + uint64_t new_size) { if (new_size <= self->staging_hash_table_size) { return 0; @@ -3310,7 +3328,8 @@ add_to_staging(uint64_t *staging_hash_table, uint64_t staging_hash_table_size, } static int -SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) +OverrepresentedSequences_add_meta(OverrepresentedSequences *self, + struct FastqMeta *meta) { if (self->number_of_sequences % self->sample_every != 0) { self->number_of_sequences += 1; @@ -3341,25 +3360,44 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) If the sequence length is exactly divisible by the fragment length, this results in exactly no overlap between front and back fragments, while still all of the sequence is being sampled. + + If the sequence is very large the amount of samples is taken is limited + by a user-settable maximum. + */ + /* Vader: Luke, Obi-Wan never told you about the algorithm... + Luke: He told me enough! It uses integer division! + Vader: Yes Luke, we have to use integer division. + Luke: No, that's not true! The compiler can optimize integer division + away for constants! + Vader: Search your feelings Luke! The fragment length has to be user + settable! + Luke: NOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO */ - Py_ssize_t total_fragments = + Py_ssize_t max_fragments = (sequence_length + fragment_length - 1) / fragment_length; + Py_ssize_t from_mid_point_fragments = max_fragments / 2; + Py_ssize_t max_start_fragments = max_fragments - from_mid_point_fragments; + Py_ssize_t fragments_from_start = + Py_MIN(self->fragments_from_start, max_start_fragments); + Py_ssize_t fragments_from_end = + Py_MIN(self->fragments_from_end, from_mid_point_fragments); + Py_ssize_t total_fragments = fragments_from_start + fragments_from_end; size_t staging_hash_bits = (size_t)ceil(log2((double)total_fragments * 1.5)); size_t staging_hash_size = 1ULL << staging_hash_bits; if (staging_hash_size > self->staging_hash_table_size) { - if (SequenceDuplication_resize_staging(self, staging_hash_size) < 0) { + if (OverrepresentedSequences_resize_staging(self, staging_hash_size) < 0) { return -1; } } uint64_t *staging_hash_table = self->staging_hash_table; memset(staging_hash_table, 0, staging_hash_size * sizeof(uint64_t)); - Py_ssize_t from_mid_point_fragments = total_fragments / 2; - Py_ssize_t mid_point = - sequence_length - (from_mid_point_fragments * fragment_length); + Py_ssize_t start_end = fragments_from_start * fragment_length; + Py_ssize_t end_start = + sequence_length - (fragments_from_end * fragment_length); bool warn_unknown = false; // Sample front sequences - for (Py_ssize_t i = 0; i < mid_point; i += fragment_length) { + for (Py_ssize_t i = 0; i < start_end; i += fragment_length) { int64_t kmer = sequence_to_canonical_kmer(sequence + i, fragment_length); if (kmer < 0) { if (kmer == TWOBIT_UNKNOWN_CHAR) { @@ -3373,7 +3411,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) } // Sample back sequences - for (Py_ssize_t i = mid_point; i < sequence_length; i += fragment_length) { + for (Py_ssize_t i = end_start; i < sequence_length; i += fragment_length) { int64_t kmer = sequence_to_canonical_kmer(sequence + i, fragment_length); if (kmer < 0) { if (kmer == TWOBIT_UNKNOWN_CHAR) { @@ -3404,7 +3442,7 @@ SequenceDuplication_add_meta(SequenceDuplication *self, struct FastqMeta *meta) return 0; } -PyDoc_STRVAR(SequenceDuplication_add_read__doc__, +PyDoc_STRVAR(OverrepresentedSequences_add_read__doc__, "add_read($self, read, /)\n" "--\n" "\n" @@ -3413,10 +3451,11 @@ PyDoc_STRVAR(SequenceDuplication_add_read__doc__, " read\n" " A FastqRecordView object.\n"); -#define SequenceDuplication_add_read_method METH_O +#define OverrepresentedSequences_add_read_method METH_O static PyObject * -SequenceDuplication_add_read(SequenceDuplication *self, FastqRecordView *read) +OverrepresentedSequences_add_read(OverrepresentedSequences *self, + FastqRecordView *read) { if (!FastqRecordView_CheckExact(read)) { PyErr_Format(PyExc_TypeError, @@ -3424,13 +3463,13 @@ SequenceDuplication_add_read(SequenceDuplication *self, FastqRecordView *read) Py_TYPE(read)->tp_name); return NULL; } - if (SequenceDuplication_add_meta(self, &read->meta) != 0) { + if (OverrepresentedSequences_add_meta(self, &read->meta) != 0) { return NULL; } Py_RETURN_NONE; } -PyDoc_STRVAR(SequenceDuplication_add_record_array__doc__, +PyDoc_STRVAR(OverrepresentedSequences_add_record_array__doc__, "add_record_array($self, record_array, /)\n" "--\n" "\n" @@ -3439,11 +3478,11 @@ PyDoc_STRVAR(SequenceDuplication_add_record_array__doc__, " record_array\n" " A FastqRecordArrayView object.\n"); -#define SequenceDuplication_add_record_array_method METH_O +#define OverrepresentedSequences_add_record_array_method METH_O static PyObject * -SequenceDuplication_add_record_array(SequenceDuplication *self, - FastqRecordArrayView *record_array) +OverrepresentedSequences_add_record_array(OverrepresentedSequences *self, + FastqRecordArrayView *record_array) { if (!FastqRecordArrayView_CheckExact(record_array)) { PyErr_Format( @@ -3455,24 +3494,24 @@ SequenceDuplication_add_record_array(SequenceDuplication *self, Py_ssize_t number_of_records = Py_SIZE(record_array); struct FastqMeta *records = record_array->records; for (Py_ssize_t i = 0; i < number_of_records; i++) { - if (SequenceDuplication_add_meta(self, records + i) != 0) { + if (OverrepresentedSequences_add_meta(self, records + i) != 0) { return NULL; } } Py_RETURN_NONE; } -PyDoc_STRVAR(SequenceDuplication_sequence_counts__doc__, +PyDoc_STRVAR(OverrepresentedSequences_sequence_counts__doc__, "sequence_counts($self, /)\n" "--\n" "\n" "Get a dictionary with sequence counts \n"); -#define SequenceDuplication_sequence_counts_method METH_NOARGS +#define OverrepresentedSequences_sequence_counts_method METH_NOARGS static PyObject * -SequenceDuplication_sequence_counts(SequenceDuplication *self, - PyObject *Py_UNUSED(ignore)) +OverrepresentedSequences_sequence_counts(OverrepresentedSequences *self, + PyObject *Py_UNUSED(ignore)) { PyObject *count_dict = PyDict_New(); if (count_dict == NULL) { @@ -3511,7 +3550,7 @@ SequenceDuplication_sequence_counts(SequenceDuplication *self, } PyDoc_STRVAR( - SequenceDuplication_overrepresented_sequences__doc__, + OverrepresentedSequences_overrepresented_sequences__doc__, "overrepresented_sequences($self, threshold=0.001)\n" "--\n" "\n" @@ -3534,19 +3573,21 @@ PyDoc_STRVAR( "high " " numbers of sequences."); -#define SequenceDuplication_overrepresented_sequences_method \ +#define OverrepresentedSequences_overrepresented_sequences_method \ METH_VARARGS | METH_KEYWORDS static PyObject * -SequenceDuplication_overrepresented_sequences(SequenceDuplication *self, - PyObject *args, PyObject *kwargs) +OverrepresentedSequences_overrepresented_sequences(OverrepresentedSequences *self, + PyObject *args, + PyObject *kwargs) { double threshold = 0.0001; // 0.01 % Py_ssize_t min_threshold = 1; Py_ssize_t max_threshold = PY_SSIZE_T_MAX; static char *kwargnames[] = {"threshold_fraction", "min_threshold", "max_threshold", NULL}; - static char *format = "|dnn:SequenceDuplication.overrepresented_sequences"; + static char *format = + "|dnn:OverrepresentedSequences.overrepresented_sequences"; if (!PyArg_ParseTupleAndKeywords(args, kwargs, format, kwargnames, &threshold, &min_threshold, &max_threshold)) { return NULL; @@ -3627,51 +3668,53 @@ SequenceDuplication_overrepresented_sequences(SequenceDuplication *self, return NULL; } -static PyMethodDef SequenceDuplication_methods[] = { - {"add_read", (PyCFunction)SequenceDuplication_add_read, - SequenceDuplication_add_read_method, SequenceDuplication_add_read__doc__}, - {"add_record_array", (PyCFunction)SequenceDuplication_add_record_array, - SequenceDuplication_add_record_array_method, - SequenceDuplication_add_record_array__doc__}, - {"sequence_counts", (PyCFunction)SequenceDuplication_sequence_counts, - SequenceDuplication_sequence_counts_method, - SequenceDuplication_sequence_counts__doc__}, +static PyMethodDef OverrepresentedSequences_methods[] = { + {"add_read", (PyCFunction)OverrepresentedSequences_add_read, + OverrepresentedSequences_add_read_method, + OverrepresentedSequences_add_read__doc__}, + {"add_record_array", (PyCFunction)OverrepresentedSequences_add_record_array, + OverrepresentedSequences_add_record_array_method, + OverrepresentedSequences_add_record_array__doc__}, + {"sequence_counts", (PyCFunction)OverrepresentedSequences_sequence_counts, + OverrepresentedSequences_sequence_counts_method, + OverrepresentedSequences_sequence_counts__doc__}, {"overrepresented_sequences", - (PyCFunction)(void (*)(void))SequenceDuplication_overrepresented_sequences, - SequenceDuplication_overrepresented_sequences_method, - SequenceDuplication_overrepresented_sequences__doc__}, + (PyCFunction)(void (*)(void))OverrepresentedSequences_overrepresented_sequences, + OverrepresentedSequences_overrepresented_sequences_method, + OverrepresentedSequences_overrepresented_sequences__doc__}, {NULL}, }; -static PyMemberDef SequenceDuplication_members[] = { +static PyMemberDef OverrepresentedSequences_members[] = { {"number_of_sequences", T_ULONGLONG, - offsetof(SequenceDuplication, number_of_sequences), READONLY, + offsetof(OverrepresentedSequences, number_of_sequences), READONLY, "The total number of sequences submitted."}, {"sampled_sequences", T_ULONGLONG, - offsetof(SequenceDuplication, sampled_sequences), READONLY, + offsetof(OverrepresentedSequences, sampled_sequences), READONLY, "The total number of sequences that were analysed."}, {"collected_unique_fragments", T_ULONGLONG, - offsetof(SequenceDuplication, number_of_unique_fragments), READONLY, + offsetof(OverrepresentedSequences, number_of_unique_fragments), READONLY, "The number of unique fragments collected."}, {"max_unique_fragments", T_ULONGLONG, - offsetof(SequenceDuplication, max_unique_fragments), READONLY, + offsetof(OverrepresentedSequences, max_unique_fragments), READONLY, "The maximum number of unique sequences stored in the object."}, - {"fragment_length", T_BYTE, offsetof(SequenceDuplication, fragment_length), + {"fragment_length", T_BYTE, offsetof(OverrepresentedSequences, fragment_length), READONLY, "The length of the sampled sequences"}, - {"sample_every", T_PYSSIZET, offsetof(SequenceDuplication, sample_every), + {"sample_every", T_PYSSIZET, offsetof(OverrepresentedSequences, sample_every), READONLY, "One in this many reads is sampled"}, - {"total_fragments", T_ULONGLONG, offsetof(SequenceDuplication, total_fragments), - READONLY, "Total number of fragments."}, + {"total_fragments", T_ULONGLONG, + offsetof(OverrepresentedSequences, total_fragments), READONLY, + "Total number of fragments."}, {NULL}, }; -static PyTypeObject SequenceDuplication_Type = { - .tp_name = "_qc.SequenceDuplication", - .tp_basicsize = sizeof(SequenceDuplication), - .tp_dealloc = (destructor)(SequenceDuplication_dealloc), - .tp_new = (newfunc)SequenceDuplication__new__, - .tp_members = SequenceDuplication_members, - .tp_methods = SequenceDuplication_methods, +static PyTypeObject OverrepresentedSequences_Type = { + .tp_name = "_qc.OverrepresentedSequences", + .tp_basicsize = sizeof(OverrepresentedSequences), + .tp_dealloc = (destructor)(OverrepresentedSequences_dealloc), + .tp_new = (newfunc)OverrepresentedSequences__new__, + .tp_members = OverrepresentedSequences_members, + .tp_methods = OverrepresentedSequences_methods, }; /******************* @@ -5193,7 +5236,7 @@ PyInit__qc(void) if (python_module_add_type(m, &PerTileQuality_Type) != 0) { return NULL; } - if (python_module_add_type(m, &SequenceDuplication_Type) != 0) { + if (python_module_add_type(m, &OverrepresentedSequences_Type) != 0) { return NULL; } if (python_module_add_type(m, &DedupEstimator_Type) != 0) { @@ -5226,6 +5269,8 @@ PyInit__qc(void) PyModule_AddIntMacro(m, DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS); PyModule_AddIntMacro(m, DEFAULT_FRAGMENT_LENGTH); PyModule_AddIntMacro(m, DEFAULT_UNIQUE_SAMPLE_EVERY); + PyModule_AddIntMacro(m, DEFAULT_BASES_FROM_START); + PyModule_AddIntMacro(m, DEFAULT_BASES_FROM_END); PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH); PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH); PyModule_AddIntMacro(m, DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET); diff --git a/src/sequali/report_modules.py b/src/sequali/report_modules.py index d539c73..0c9af2e 100644 --- a/src/sequali/report_modules.py +++ b/src/sequali/report_modules.py @@ -35,9 +35,16 @@ import pygal.style # type: ignore from ._qc import A, C, G, N, T -from ._qc import (AdapterCounter, DedupEstimator, - INSERT_SIZE_MAX_ADAPTER_STORE_SIZE, InsertSizeMetrics, - NanoStats, PerTileQuality, QCMetrics, SequenceDuplication) +from ._qc import ( + AdapterCounter, + DedupEstimator, + INSERT_SIZE_MAX_ADAPTER_STORE_SIZE, + InsertSizeMetrics, + NanoStats, + OverrepresentedSequences, + PerTileQuality, + QCMetrics, +) from ._qc import NUMBER_OF_NUCS, NUMBER_OF_PHREDS, PHRED_MAX from ._version import __version__ from .adapters import Adapter @@ -1495,7 +1502,7 @@ def to_html(self) -> str: @classmethod def from_sequence_duplication( cls, - seqdup: SequenceDuplication, + seqdup: OverrepresentedSequences, fraction_threshold: float = DEFAULT_FRACTION_THRESHOLD, min_threshold: int = DEFAULT_MIN_THRESHOLD, max_threshold: int = DEFAULT_MAX_THRESHOLD, @@ -2131,7 +2138,7 @@ def calculate_stats( filename: str, metrics: QCMetrics, per_tile_quality: PerTileQuality, - sequence_duplication: SequenceDuplication, + sequence_duplication: OverrepresentedSequences, dedup_estimator: DedupEstimator, nanostats: NanoStats, adapters: List[Adapter], @@ -2140,7 +2147,7 @@ def calculate_stats( insert_size_metrics: Optional[InsertSizeMetrics] = None, metrics_reverse: Optional[QCMetrics] = None, per_tile_quality_reverse: Optional[PerTileQuality] = None, - sequence_duplication_reverse: Optional[SequenceDuplication] = None, + sequence_duplication_reverse: Optional[OverrepresentedSequences] = None, graph_resolution: int = 200, fraction_threshold: float = DEFAULT_FRACTION_THRESHOLD, min_threshold: int = DEFAULT_MIN_THRESHOLD, diff --git a/tests/test_sequence_duplication.py b/tests/test_overrepresented_sequences.py similarity index 54% rename from tests/test_sequence_duplication.py rename to tests/test_overrepresented_sequences.py index b7cee7f..e7391b7 100644 --- a/tests/test_sequence_duplication.py +++ b/tests/test_overrepresented_sequences.py @@ -19,7 +19,7 @@ import pytest -from sequali import FastqRecordView, SequenceDuplication +from sequali import FastqRecordView, OverrepresentedSequences def view_from_sequence(sequence: str) -> FastqRecordView: @@ -30,12 +30,14 @@ def view_from_sequence(sequence: str) -> FastqRecordView: ) -def test_sequence_duplication(): +def test_overrepresented_sequences(): max_unique_fragments = 100_000 fragment_length = 31 - seqdup = SequenceDuplication(max_unique_fragments=max_unique_fragments, - fragment_length=fragment_length, - sample_every=1) + seqdup = OverrepresentedSequences( + max_unique_fragments=max_unique_fragments, + fragment_length=fragment_length, + sample_every=1 + ) # Create unique sequences by using all combinations of ACGT for the amount # of letters that is necessary to completely saturate the maximum unique # sequences @@ -58,41 +60,42 @@ def test_sequence_duplication(): assert sequence_counts[fragment_length * "A"] == 2 -def test_sequence_duplication_add_read_no_view(): - seqdup = SequenceDuplication() +def test_overrepresented_sequences_add_read_no_view(): + seqs = OverrepresentedSequences() with pytest.raises(TypeError) as error: - seqdup.add_read(b"ACGT") # type: ignore + seqs.add_read(b"ACGT") # type: ignore error.match("FastqRecordView") error.match("bytes") @pytest.mark.parametrize("threshold", [-0.1, 1.1]) -def test_sequence_duplication_overrepresented_sequences_faulty_threshold(threshold): - seqdup = SequenceDuplication() +def test_overrepresented_sequences_overrepresented_sequences_faulty_threshold( + threshold): + seqs = OverrepresentedSequences() with pytest.raises(ValueError) as error: - seqdup.overrepresented_sequences(threshold_fraction=threshold) + seqs.overrepresented_sequences(threshold_fraction=threshold) error.match(str(threshold)) error.match("between") error.match("0.0") error.match("1.0") -def test_sequence_duplication_overrepresented_sequences(): +def test_overrepresented_sequences_overrepresented_sequences(): fragment_length = 31 - seqdup = SequenceDuplication(sample_every=1, fragment_length=fragment_length) + seqs = OverrepresentedSequences(sample_every=1, fragment_length=fragment_length) for i in range(100): - seqdup.add_read(view_from_sequence("A" * fragment_length)) + seqs.add_read(view_from_sequence("A" * fragment_length)) for i in range(200): - seqdup.add_read(view_from_sequence("C" * fragment_length)) + seqs.add_read(view_from_sequence("C" * fragment_length)) for i in range(2000): - seqdup.add_read(view_from_sequence("G" * fragment_length)) + seqs.add_read(view_from_sequence("G" * fragment_length)) for i in range(10): - seqdup.add_read(view_from_sequence("T" * fragment_length)) - seqdup.add_read(view_from_sequence("C" * (fragment_length - 1) + "A")) + seqs.add_read(view_from_sequence("T" * fragment_length)) + seqs.add_read(view_from_sequence("C" * (fragment_length - 1) + "A")) for i in range(100_000 - (100 + 200 + 2000 + 10 + 1)): # Count up to 100_000 to get nice fractions for all the sequences - seqdup.add_read(view_from_sequence("A" * (fragment_length - 1) + "C")) - overrepresented = seqdup.overrepresented_sequences(threshold_fraction=0.001) + seqs.add_read(view_from_sequence("A" * (fragment_length - 1) + "C")) + overrepresented = seqs.overrepresented_sequences(threshold_fraction=0.001) assert overrepresented[0][2] == "A" * (fragment_length - 1) + "C" assert overrepresented[1][2] == "C" * fragment_length assert overrepresented[1][0] == 2200 @@ -100,14 +103,14 @@ def test_sequence_duplication_overrepresented_sequences(): assert overrepresented[2][1] == 110 / 100_000 # Assert no other sequences recorded as overrepresented. assert len(overrepresented) == 3 - overrepresented = seqdup.overrepresented_sequences(threshold_fraction=0.00001) + overrepresented = seqs.overrepresented_sequences(threshold_fraction=0.00001) assert overrepresented[-1][2] == "C" * (fragment_length - 1) + "A" - overrepresented = seqdup.overrepresented_sequences( + overrepresented = seqs.overrepresented_sequences( threshold_fraction=0.00001, min_threshold=2, ) assert overrepresented[-1][2] == "A" * fragment_length - overrepresented = seqdup.overrepresented_sequences( + overrepresented = seqs.overrepresented_sequences( threshold_fraction=0.1, min_threshold=2, max_threshold=1000, @@ -116,29 +119,29 @@ def test_sequence_duplication_overrepresented_sequences(): assert overrepresented[1][2] == "C" * fragment_length -def test_sequence_duplication_case_insensitive(): +def test_overrepresented_sequences_case_insensitive(): fragment_length = 31 - seqdup = SequenceDuplication(fragment_length=fragment_length, sample_every=1) - seqdup.add_read(view_from_sequence("aaTTaca" * 5)) - seqdup.add_read(view_from_sequence("AAttACA" * 5)) - seqcounts = seqdup.sequence_counts() - assert seqdup.number_of_sequences == 2 - assert seqdup.total_fragments == 4 - assert seqdup.collected_unique_fragments == 2 + seqs = OverrepresentedSequences(fragment_length=fragment_length, sample_every=1) + seqs.add_read(view_from_sequence("aaTTaca" * 5)) + seqs.add_read(view_from_sequence("AAttACA" * 5)) + seqcounts = seqs.sequence_counts() + assert seqs.number_of_sequences == 2 + assert seqs.total_fragments == 4 + assert seqs.collected_unique_fragments == 2 assert len(seqcounts) == 2 assert seqcounts[("AATTACA" * 5)[:fragment_length]] == 2 assert seqcounts[("AATTACA" * 5)[-fragment_length:]] == 2 @pytest.mark.parametrize("divisor", list(range(1, 21))) -def test_sequence_duplication_sampling_rate(divisor): - seqdup = SequenceDuplication(sample_every=divisor) +def test_overrepresented_sequences_sampling_rate(divisor): + seqs = OverrepresentedSequences(sample_every=divisor) read = view_from_sequence("AAAA") number_of_sequences = 10_000 for i in range(number_of_sequences): - seqdup.add_read(read) - assert seqdup.number_of_sequences == number_of_sequences - assert seqdup.sampled_sequences == (number_of_sequences + divisor - 1) // divisor + seqs.add_read(read) + assert seqs.number_of_sequences == number_of_sequences + assert seqs.sampled_sequences == (number_of_sequences + divisor - 1) // divisor @pytest.mark.parametrize(["sequence", "result"], [ @@ -149,30 +152,57 @@ def test_sequence_duplication_sampling_rate(divisor): # Fragments that are duplicated in the sequence should only be recorded once. ("GATTACGATTAC", {"ATC": 1, "GTA": 1}), ]) -def test_sequence_duplication_all_fragments(sequence, result): - seqdup = SequenceDuplication(fragment_length=3, sample_every=1) - seqdup.add_read(view_from_sequence(sequence)) - seq_counts = seqdup.sequence_counts() +def test_overrepresented_sequences_all_fragments(sequence, result): + seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) + seqs.add_read(view_from_sequence(sequence)) + seq_counts = seqs.sequence_counts() assert seq_counts == result def test_very_short_sequence(): # With 32 byte load this will overflow the used memory. - seqdup = SequenceDuplication(fragment_length=3, sample_every=1) - seqdup.add_read(view_from_sequence("ACT")) - assert seqdup.sequence_counts() == {"ACT": 1} + seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) + seqs.add_read(view_from_sequence("ACT")) + assert seqs.sequence_counts() == {"ACT": 1} def test_non_iupac_warning(): - seqdup = SequenceDuplication(fragment_length=3, sample_every=1) + seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) with pytest.warns(UserWarning, match="KKK"): - seqdup.add_read(view_from_sequence("KKK")) + seqs.add_read(view_from_sequence("KKK")) def test_valid_does_not_warn_for_n(): - seqdup = SequenceDuplication(fragment_length=3, sample_every=1) + seqs = OverrepresentedSequences(fragment_length=3, sample_every=1) with warnings.catch_warnings(): warnings.simplefilter("error") - seqdup.add_read(view_from_sequence("ACGTN")) + seqs.add_read(view_from_sequence("ACGTN")) # N does lead to a sample not being loaded. - assert seqdup.sampled_sequences == 1 + assert seqs.sampled_sequences == 1 + + +@pytest.mark.parametrize( + ["bases_from_start", "bases_from_end", "result"], [ + (0, 0, ()), + (1, 1, ("AAC", "CAA")), + (2, 2, ("AAC", "CAA")), + (3, 3, ("AAC", "CAA")), + (4, 4, ("AAC", "CAA", "CCG", "GCC")), + (1, 0, ("AAC",)), + (0, 1, ("CAA",)), + (100, 100, ("AAA", "AAC", "CAA", "CCG", "GCC")), + (-1, -1, ("AAA", "AAC", "CAA", "CCG", "GCC")), + ] +) +def test_overrepresented_sequences_sample_from_begin_and_end( + bases_from_start, bases_from_end, result): + seqs = OverrepresentedSequences( + fragment_length=3, + sample_every=1, + bases_from_start=bases_from_start, + bases_from_end=bases_from_end + ) + seqs.add_read(view_from_sequence("AACCGGTTTTGGCCAA")) + overrepresented = [x[2] for x in seqs.overrepresented_sequences(min_threshold=1)] + overrepresented.sort() + assert tuple(overrepresented) == result