Only sample the read extremities rather than the entire read to prevent a lot of false positive overrepresented sequences. #205

Merged (7 commits, Nov 20, 2024)
8 changes: 8 additions & 0 deletions CHANGELOG.rst
@@ -7,6 +7,14 @@ Changelog
.. This document is user facing. Please word the changes in such a way
.. that users understand how the changes affect the new version.

version 0.13.0-dev
------------------
+ Only sample the first 100 bp from the beginning and end of the read by
  default for the overrepresented sequences analysis. This prevents a lot of
  false positives from common human genome repeats. The number of base pairs
  sampled from the beginning and end is user-settable, with an option to
  sample everything.

version 0.12.0
------------------
+ Properly name percentiles as such in the sequence length distribution rather
115 changes: 89 additions & 26 deletions docs/_static/images/overrepresented_sampling.fodg

181 changes: 140 additions & 41 deletions docs/_static/images/overrepresented_sampling.svg
14 changes: 11 additions & 3 deletions docs/module_options.rst
@@ -53,16 +53,20 @@ ends towards the middle. This means that the first and last fragment will
always be the first 21 bp at the beginning and the last 21 bp at the end. As
such, the adapter sequences will always be sampled in the same frame.

To prevent many common genome repeats from ending up in the report, Sequali
limits the amount of sampling inwards to approximately 100 bp. With the default
fragment size of 21 bp, this means that 5 fragments are taken from each side.
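
The exact positions are computed inside Sequali's C extension; the snippet
below is only a rough Python sketch of the scheme described above, with
illustrative names, assuming the default fragment length of 21 bp and an
approximate 100 bp limit per side.

.. code-block:: python

    import math

    FRAGMENT_LENGTH = 21   # default fragment size
    BASES_PER_SIDE = 100   # approximate inward sampling limit per side

    def sampled_fragments(sequence):
        """Rough sketch: take roughly 100 bp of 21 bp fragments from each end."""
        length = len(sequence)
        # ceil(100 / 21) == 5 fragments per side, i.e. up to 105 bp sampled.
        per_side = math.ceil(BASES_PER_SIDE / FRAGMENT_LENGTH)
        fragments = []
        # Front fragments start at the first base: 0-21, 21-42, ...
        for i in range(per_side):
            start = i * FRAGMENT_LENGTH
            if start >= length:
                break
            fragments.append(sequence[start:start + FRAGMENT_LENGTH])
        # Back fragments are anchored on the last base, so adapters and
        # barcodes are always sampled in the same frame.
        for i in range(per_side):
            end = length - i * FRAGMENT_LENGTH
            if end <= 0:
                break
            fragments.append(sequence[max(end - FRAGMENT_LENGTH, 0):end])
        # Only unique fragments within the read are kept, which avoids
        # oversampling common genomic repeats.
        return list(dict.fromkeys(fragments))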

.. figure:: _static/images/overrepresented_sampling.svg

This figure shows how fragments are sampled in sequali. The silver elements
represent the fragments. Sequence #1 is longer and hence more fragments are
sampled. Despite the length differences between sequence #1 and sequence #2
the fragments for the adapter and barcode sequences are the same.
In Sequence #1 the fragments sampled from the back end overlap somewhat
with sequences from the front end. This is a necessity to ensure all of the
sequence is sampled when the length is not divisible by the size of the
fragments.
with sequences from the front end. In sequence #3 the sequence is quite
long so the middle is not sampled. This prevents many common genome repeats
from ending up in the report.

For each sequence only unique fragments within the sequence are sampled, to
avoid overrepresenting common genomic repeats.
@@ -89,6 +93,10 @@ The following command line parameters affect this module:
store.
+ ``--overrepresentation-sample-every``: How often a sequence is sampled. Default
is every 8 sequences.
+ ``--overrepresentation-bases-from-start``: the minimum amount of bases to
take from the start of the sequence. Default is 100.
+ ``--overrepresentation-bases-from-end``: the minimum amount of bases to
take from the end of the sequence. Default is 100.

.. [#F1] A canonical k-mer is the k-mer that has the lowest sort order compared
to itself and its reverse complement. This way the canonical-kmer is
4 changes: 2 additions & 2 deletions src/sequali/__init__.py
@@ -17,7 +17,7 @@
from ._qc import A, C, G, N, T
from ._qc import (
AdapterCounter, BamParser, FastqParser, FastqRecordArrayView,
FastqRecordView, PerTileQuality, QCMetrics, SequenceDuplication
FastqRecordView, OverrepresentedSequences, PerTileQuality, QCMetrics,
)
from ._qc import NUMBER_OF_NUCS, NUMBER_OF_PHREDS, PHRED_MAX, TABLE_SIZE
from ._version import __version__
@@ -32,7 +32,7 @@
"FastqRecordArrayView",
"PerTileQuality",
"QCMetrics",
"SequenceDuplication",
"OverrepresentedSequences",
"NUMBER_OF_NUCS",
"NUMBER_OF_PHREDS",
"PHRED_MAX",
34 changes: 26 additions & 8 deletions src/sequali/__main__.py
@@ -23,6 +23,8 @@

from ._qc import (
AdapterCounter,
DEFAULT_BASES_FROM_END,
DEFAULT_BASES_FROM_START,
DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS,
DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH,
DEFAULT_FINGERPRINT_BACK_SEQUENCE_OFFSET,
@@ -34,9 +36,9 @@
DedupEstimator,
InsertSizeMetrics,
NanoStats,
OverrepresentedSequences,
PerTileQuality,
QCMetrics,
SequenceDuplication
)
from ._version import __version__
from .adapters import DEFAULT_ADAPTER_FILE, adapters_from_file
@@ -119,6 +121,22 @@ def argument_parser() -> argparse.ArgumentParser:
f"gets filled up with more sequences from the "
f"beginning. "
f"Default: 1 in {DEFAULT_UNIQUE_SAMPLE_EVERY}.")
parser.add_argument("--overrepresentation-bases-from-start", type=int,
default=DEFAULT_BASES_FROM_START,
metavar="BP",
help=f"The minimum amount of bases sampled from the "
f"start of the read. There might be slight overshoot "
f"depending on the fragment length. Set to a "
f"negative value to sample the entire read. "
f"Default: {DEFAULT_BASES_FROM_START}")
parser.add_argument("--overrepresentation-bases-from-end", type=int,
default=DEFAULT_BASES_FROM_END,
metavar="BP",
help=f"The minimum amount of bases sampled from the "
f"end of the read. There might be slight overshoot "
f"depending on the fragment length. Set to a "
f"negative value to sample the entire read. "
f"Default: {DEFAULT_BASES_FROM_END}")
parser.add_argument("--duplication-max-stored-fingerprints", type=int,
default=DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS,
metavar="N",
@@ -188,7 +206,7 @@ def main() -> None:
metrics1 = QCMetrics()
per_tile_quality1 = PerTileQuality()
nanostats1 = NanoStats()
sequence_duplication1 = SequenceDuplication(
overrepresented_sequences1 = OverrepresentedSequences(
max_unique_fragments=args.overrepresentation_max_unique_fragments,
fragment_length=args.overrepresentation_fragment_length,
sample_every=args.overrepresentation_sample_every
@@ -219,15 +237,15 @@
insert_size_metrics = InsertSizeMetrics()
metrics2 = QCMetrics()
per_tile_quality2 = PerTileQuality()
sequence_duplication2 = SequenceDuplication(
overrepresented_sequences2 = OverrepresentedSequences(
max_unique_fragments=args.overrepresentation_max_unique_fragments,
fragment_length=args.overrepresentation_fragment_length,
sample_every=args.overrepresentation_sample_every
)
else:
metrics2 = None
per_tile_quality2 = None
sequence_duplication2 = None
overrepresented_sequences2 = None
insert_size_metrics = None
with contextlib.ExitStack() as exit_stack:
reader1 = NGSFile(args.input, threads - 1)
@@ -253,7 +271,7 @@
for record_array1 in reader1:
metrics1.add_record_array(record_array1)
per_tile_quality1.add_record_array(record_array1)
sequence_duplication1.add_record_array(record_array1)
overrepresented_sequences1.add_record_array(record_array1)
nanostats1.add_record_array(record_array1)
if paired:
record_array2 = reader2.read(len(record_array1))
@@ -274,7 +292,7 @@
insert_size_metrics.add_record_array_pair(record_array1, record_array2) # type: ignore # noqa: E501
metrics2.add_record_array(record_array2) # type: ignore # noqa: E501
per_tile_quality2.add_record_array(record_array2) # type: ignore # noqa: E501
sequence_duplication2.add_record_array(record_array2) # type: ignore # noqa: E501
overrepresented_sequences2.add_record_array(record_array2) # type: ignore # noqa: E501
else:
adapter_counter1.add_record_array(record_array1) # type: ignore # noqa: E501
dedup_estimator.add_record_array(record_array1)
@@ -289,14 +307,14 @@
metrics=metrics1,
adapter_counter=adapter_counter1,
per_tile_quality=per_tile_quality1,
sequence_duplication=sequence_duplication1,
sequence_duplication=overrepresented_sequences1,
dedup_estimator=dedup_estimator,
nanostats=nanostats1,
insert_size_metrics=insert_size_metrics,
filename_reverse=args.input_reverse,
metrics_reverse=metrics2,
per_tile_quality_reverse=per_tile_quality2,
sequence_duplication_reverse=sequence_duplication2,
sequence_duplication_reverse=overrepresented_sequences2,
adapters=adapters,
fraction_threshold=fraction_threshold,
min_threshold=min_threshold,
9 changes: 7 additions & 2 deletions src/sequali/_qc.pyi
@@ -32,6 +32,8 @@ DEFAULT_MAX_UNIQUE_FRAGMENTS: int
DEFAULT_DEDUP_MAX_STORED_FINGERPRINTS: int
DEFAULT_FRAGMENT_LENGTH: int
DEFAULT_UNIQUE_SAMPLE_EVERY: int
DEFAULT_BASES_FROM_START: int
DEFAULT_BASES_FROM_END: int
DEFAULT_FINGERPRINT_FRONT_SEQUENCE_LENGTH: int
DEFAULT_FINGERPRINT_BACK_SEQUENCE_LENGTH: int
DEFAULT_FINGERPRINT_FRONT_SEQUENCE_OFFSET: int
@@ -94,7 +96,7 @@ class PerTileQuality:
def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ...
def get_tile_counts(self) -> List[Tuple[int, List[float], List[int]]]: ...

class SequenceDuplication:
class OverrepresentedSequences:
number_of_sequences: int
sampled_sequences: int
collected_unique_fragments: int
@@ -106,7 +108,10 @@ class SequenceDuplication:
def __init__(self,
max_unique_fragments: int = DEFAULT_MAX_UNIQUE_FRAGMENTS,
fragment_length: int = DEFAULT_FRAGMENT_LENGTH,
sample_every: int = DEFAULT_UNIQUE_SAMPLE_EVERY): ...
sample_every: int = DEFAULT_UNIQUE_SAMPLE_EVERY,
bases_from_start: int = DEFAULT_BASES_FROM_START,
bases_from_end: int = DEFAULT_BASES_FROM_END,
): ...
def add_read(self, __read: FastqRecordView) -> None: ...
def add_record_array(self, __record_array: FastqRecordArrayView) -> None: ...
def sequence_counts(self) -> Dict[str, int]: ...
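# A minimal usage sketch against this stub (illustrative only; the record
# arrays come from this package's parsers, e.g. FastqParser or BamParser):
#
#     overrep = OverrepresentedSequences(bases_from_start=100,
#                                        bases_from_end=100)
#     for record_array in record_arrays:     # FastqRecordArrayView objects
#         overrep.add_record_array(record_array)
#     counts = overrep.sequence_counts()     # Dict[str, int]: fragment -> count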