From 4e6a17094aaaec1fafe696f4d3ced3e121797750 Mon Sep 17 00:00:00 2001
From: Michael Hall <michael@mbh.sh>
Date: Fri, 22 Nov 2024 10:39:16 +1000
Subject: [PATCH] docs(lib): finialise liblrge docs

---
 Cargo.lock              |  1 +
 Cargo.toml              |  1 +
 liblrge/Cargo.toml      | 14 ++++---
 liblrge/README.md       | 10 +++++
 liblrge/src/ava.rs      | 38 ++++++++++++++++++-
 liblrge/src/estimate.rs | 18 +++++----
 liblrge/src/lib.rs      | 81 +++++++++++++++++++++++++++++++++++++++--
 liblrge/src/twoset.rs   | 43 ++++++++++++++++++++++
 lrge/Cargo.toml         |  1 +
 lrge/src/main.rs        |  2 +-
 10 files changed, 191 insertions(+), 18 deletions(-)
 create mode 100644 liblrge/README.md

diff --git a/Cargo.lock b/Cargo.lock
index af76366..402dcc7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -359,6 +359,7 @@ dependencies = [
  "bzip2",
  "crossbeam-channel",
  "csv",
+ "env_logger",
  "flate2",
  "libc",
  "liblzma",
diff --git a/Cargo.toml b/Cargo.toml
index 0966474..57fef0c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,7 @@ readme = "README.md"
 keywords = ["bioinformatics", "long-reads", "overlaps", "estimation", "genome-size"]
 license-file = "LICENSE"
 exclude = ["paper/*"]
+rust-version = "1.74.1"
 
 [workspace.dependencies]
 log = "0.4.22"
diff --git a/liblrge/Cargo.toml b/liblrge/Cargo.toml
index 5ab2489..518c5e9 100644
--- a/liblrge/Cargo.toml
+++ b/liblrge/Cargo.toml
@@ -7,10 +7,11 @@ authors.workspace = true
 edition.workspace = true
 repository.workspace = true
 homepage.workspace = true
-readme.workspace = true
+readme = "README.md"
 keywords.workspace = true
 license-file.workspace = true
 exclude.workspace = true
+rust-version.workspace = true
 
 [dependencies]
 log.workspace = true
@@ -25,13 +26,16 @@ serde = { version = "1.0.215", features = ["derive"] }
 csv = "1.3.1"
 
 # for compression support
-flate2 = { version = "1.0.34", optional = true, package = "flate2" }  # For gzip files
-zstd = { version = "0.13.2", optional = true }   # For zstd files
-bzip2 = { version = "0.4.4", optional = true }   # For bzip2 files
-liblzma = { version = "0.3.5", optional = true, package = "liblzma" } # For lzma (xz) files.
+flate2 = { version = "1.0.34", optional = true, package = "flate2" }
+zstd = { version = "0.13.2", optional = true }
+bzip2 = { version = "0.4.4", optional = true }
+liblzma = { version = "0.3.5", optional = true, package = "liblzma" }
 
 [features]
 compression = ["gzip", "zstd", "bzip2", "xz"]  # Enable compression support
 default = ["compression"]  # Enable compression by default
 xz = ["liblzma"]  # Alias "xz" to "liblzma" dependency
 gzip = ["flate2"]  # Alias "gzip" to "flate2" dependency
+
+[dev-dependencies]
+env_logger = "0.11.5"  # for documentation tests
\ No newline at end of file
diff --git a/liblrge/README.md b/liblrge/README.md
new file mode 100644
index 0000000..1681810
--- /dev/null
+++ b/liblrge/README.md
@@ -0,0 +1,10 @@
+# liblrge
+
+![docs.rs](https://img.shields.io/docsrs/liblrge)
+![Crates.io Version](https://img.shields.io/crates/v/liblrge)
+![Crates.io Total Downloads](https://img.shields.io/crates/d/liblrge)
+
+This is a Rust library for estimating genome size from long read overlaps. The library is used by the `lrge` command 
+line tool documented in the [root of this repository](https://www.github.com/mbhall88/lrge).
+
+See the [documentation](https:docs.rs/liblrge) for example usage and API documentation.
\ No newline at end of file
diff --git a/liblrge/src/ava.rs b/liblrge/src/ava.rs
index cbd778f..6fba03e 100644
--- a/liblrge/src/ava.rs
+++ b/liblrge/src/ava.rs
@@ -1,7 +1,40 @@
 //! A strategy that compares overlaps between the same set of reads - i.e., all-vs-all.
 //!
-//! In general, this strategy is less computationally efficient than [`TwoSetStrategy`], but it
+//! In general, this strategy is less computationally efficient than [`TwoSetStrategy`][crate::TwoSetStrategy], but it
 //! is slightly more accurate - though that difference in accuracy is not statistically significant.
+//!
+//! # Examples
+//!
+//! You probably want to use the [`Builder`] interface to customise the strategy.
+//!
+//! ```no_run
+//! use liblrge::{Estimate, AvaStrategy};
+//! use liblrge::estimate::{LOWER_QUANTILE, UPPER_QUANTILE};
+//! use liblrge::ava::{Builder, DEFAULT_AVA_NUM_READS};
+//!
+//! let input = "path/to/reads.fastq";
+//! let mut strategy = Builder::new()
+//!    .num_reads(DEFAULT_AVA_NUM_READS)
+//!    .threads(4)
+//!    .seed(Some(42))  // makes the estimate reproducible
+//!    .build(input);
+//!
+//! let finite = true;  // estimate the genome size based on the finite estimates (recommended)
+//! let low_q = Some(LOWER_QUANTILE);   // lower quantile for the confidence interval
+//! let upper_q = Some(UPPER_QUANTILE); // upper quantile for the confidence interval
+//! let est_result = strategy.estimate(finite, low_q, upper_q).expect("Failed to generate estimate");
+//! let estimate = est_result.estimate;
+//!
+//! let no_mapping_count = est_result.no_mapping_count;
+//! // you might want to handle cases where some proportion of query reads did not overlap with target reads
+//! ```
+//!
+//! By default, the intermediate reads and overlap files are written to a temporary directory and
+//! cleaned up after the strategy object is dropped. This is done via the use of the [`tempfile`](https://crates.io/crates/tempfile) crate.
+//! The intermediate reads file will be placed inside the temporary directory and names `reads.fq`,
+//! while the overlap file will be named `overlaps.paf`.
+//!
+//! You can set your own temporary directory by using the [`Builder::tmpdir`] method.
 mod builder;
 
 use std::collections::{HashMap, HashSet};
@@ -23,6 +56,7 @@ use crate::io::FastqRecordExt;
 use crate::minimap2::{AlignerWrapper, Preset};
 use crate::{io, unique_random_set, Estimate, Platform};
 
+/// The default number of reads to use in the all-vs-all strategy.
 pub const DEFAULT_AVA_NUM_READS: usize = 25_000;
 
 /// A strategy that compares overlaps between two sets of reads.
@@ -30,6 +64,8 @@ pub const DEFAULT_AVA_NUM_READS: usize = 25_000;
 /// The convention is to use a smaller set of query reads and a larger set of target reads. The
 /// query reads are overlapped with the target reads and an estimated genome size is calculated
 /// for **each query read** based on the number of overlaps it has with the target set.
+///
+/// See the [module-level documentation](crate::ava) for more information and examples.
 pub struct AvaStrategy {
     /// Path to the FASTQ file.
     input: PathBuf,
diff --git a/liblrge/src/estimate.rs b/liblrge/src/estimate.rs
index 0f88ff9..e05624a 100644
--- a/liblrge/src/estimate.rs
+++ b/liblrge/src/estimate.rs
@@ -9,7 +9,7 @@ pub struct EstimateResult {
     /// The lower quantile of the estimates
     pub lower: Option<f32>,
     /// The median of the estimates - this is the genome size estimate
-    pub median: Option<f32>,
+    pub estimate: Option<f32>,
     /// The upper quantile of the estimates
     pub upper: Option<f32>,
     /// The number of reads that did not have an overlap
@@ -34,9 +34,9 @@ pub trait Estimate {
     /// * `finite`: Whether to consider only finite estimates. We found setting this to `true` gave
     ///   more accurate results (see the paper).
     /// * `lower_quant`: The lower percentile to calculate. If `None`, this will not be calculated.
-    ///   This value should be between 0 and 1. So, for the 25th percentile, you would pass `0.25`.
+    ///   This value should be between 0 and 0.5. So, for the 25th percentile, you would pass `0.25`.
     /// * `upper_quant`: The upper percentile to calculate. If `None`, this will not be calculated.
-    ///   This value should be between 0 and 1. So, for the 75th percentile, you would pass `0.75`.
+    ///   This value should be between 0.5 and 1.0. So, for the 75th percentile, you would pass `0.75`.
     ///
     /// In our analysis, we found that the 15th and 65th percentiles gave the highest confidence (~92%).
     /// If you want to use our most current recommended values, you can use the constants [`LOWER_QUANTILE`]
@@ -44,11 +44,13 @@ pub trait Estimate {
     ///
     /// # Returns
     ///
-    /// A tuple containing the lower percentile (if requested), median, and upper percentile (if
-    /// requested) of the estimates - in that order.
+    /// An [`EstimateResult`] containing the lower, median, and upper estimates, as well as the number
+    /// of reads that did not have an overlap. This number can be important for quality control. For
+    /// examples, if you have a high number of reads that did not overlap, you may want to investigate
+    /// why that is (e.g., contamination, poor quality reads, etc.).
     ///
-    /// Will return `None` if there are no finite estimates when `finite` is `true`. Will also return
-    /// `None` if there are no estimates at all.
+    /// The estimate will be `None` if there are no finite estimates when `finite` is `true`, or if
+    /// there are no estimates at all.
     fn estimate(
         &mut self,
         finite: bool,
@@ -67,7 +69,7 @@ pub trait Estimate {
 
         Ok(EstimateResult {
             lower,
-            median,
+            estimate: median,
             upper,
             no_mapping_count,
         })
diff --git a/liblrge/src/lib.rs b/liblrge/src/lib.rs
index fb22b38..862e235 100644
--- a/liblrge/src/lib.rs
+++ b/liblrge/src/lib.rs
@@ -3,6 +3,59 @@
 //! `liblrge` is a Rust library that provides utilities for estimating genome size for a given set
 //! of reads.
 //!
+//! You can find a command-line interface (CLI) tool that uses this library in the [`lrge`][lrge] crate.
+//!
+//! [lrge]: https://crates.io/crates/lrge
+//!
+//! ## Usage
+//!
+//! The library provides two strategies for estimating genome size:
+//!
+//! ### [`TwoSetStrategy`]
+//!
+//! The two-set strategy uses two (random) sets of reads to estimate the genome size. The query set, which is
+//! generally smaller, is overlapped against a target set of reads. A genome size estimate is generated
+//! for each read in the query set, based on the number of overlaps and the average read length.
+//! The median of these estimates is taken as the final genome size estimate.
+//!
+//! ```no_run
+//! use liblrge::{Estimate, TwoSetStrategy};
+//! use liblrge::twoset::{Builder, DEFAULT_TARGET_NUM_READS, DEFAULT_QUERY_NUM_READS};
+//!
+//! let input = "path/to/reads.fastq";
+//! let mut strategy = Builder::new()
+//!    .target_num_reads(DEFAULT_TARGET_NUM_READS)
+//!    .query_num_reads(DEFAULT_QUERY_NUM_READS)
+//!    .threads(4)
+//!    .build(input);
+//!
+//! let est_result = strategy.estimate(false, None, None).expect("Failed to generate estimate");
+//! let estimate = est_result.estimate;
+//! // do something with the estimate
+//! ```
+//!
+//! ### [`AvaStrategy`]
+//!
+//! The all-vs-all (ava) strategy takes a (random) set of reads and overlaps it against itself to
+//! estimate the genome size. The genome size estimate is generated for each read in the set, based on the
+//! number of overlaps and the average read length - minus the read being assessed. The median of these
+//! estimates is taken as the final genome size estimate.
+//!
+//! ```no_run
+//! use liblrge::{Estimate, AvaStrategy};
+//! use liblrge::ava::{Builder, DEFAULT_AVA_NUM_READS};
+//!
+//! let input = "path/to/reads.fastq";
+//! let mut strategy = Builder::new()
+//!    .num_reads(DEFAULT_AVA_NUM_READS)
+//!   .threads(4)
+//!   .build(input);
+//!
+//! let est_result = strategy.estimate(false, None, None).expect("Failed to generate estimate");
+//! let estimate = est_result.estimate;
+//! // do something with the estimate
+//! ```
+//!
 //! ## Features
 //!
 //! This library includes optional support for compressed file formats, controlled by feature flags.
@@ -48,9 +101,31 @@
 //! [xz]: https://crates.io/liblzma
 //! [bzip2]: https://crates.io/crates/bzip2
 //! [magic]: https://en.wikipedia.org/wiki/Magic_number_(programming)#In_files
+//!
+//! ## Disabling logging
+//!
+//! `liblrge` will output some logging information via the [`log`][log] crate. If you wish to
+//! suppress this logging you can configure the logging level in your application. For example, using
+//! the [`env_logger`][env_logger] crate you can do the following:
+//!
+//! ```
+//! use log::LevelFilter;
+//!
+//! let mut log_builder = env_logger::Builder::new();
+//! log_builder
+//!     .filter(None, LevelFilter::Info)
+//!     .filter_module("liblrge", LevelFilter::Off);
+//! log_builder.init();
+//!
+//! // Your application code here
+//! ```
+//!
+//! This will set the global logging level to `Info` and disable all logging from the `liblrge` library.
+//!
+//! [log]: https://crates.io/crates/log
+//! [env_logger]: https://crates.io/crates/env_logger
 // todo add link to paper
-// todo add library denies such as #![deny(missing_docs)]
-// todo add info on how to suppress logging
+#[deny(missing_docs)]
 pub mod ava;
 pub mod error;
 pub mod estimate;
@@ -66,7 +141,7 @@ pub use self::estimate::Estimate;
 pub use self::twoset::TwoSetStrategy;
 use std::str::FromStr;
 
-/// A type alias for `Result` with [`LrgeError`] as the error type.
+/// A type alias for `Result` with [`LrgeError`][crate::error::LrgeError] as the error type.
 pub type Result<T> = std::result::Result<T, error::LrgeError>;
 
 /// The sequencing platform used to generate the reads.
diff --git a/liblrge/src/twoset.rs b/liblrge/src/twoset.rs
index 1e8fb92..a681479 100644
--- a/liblrge/src/twoset.rs
+++ b/liblrge/src/twoset.rs
@@ -1,4 +1,45 @@
 //! A strategy that compares overlaps between two different sets of reads.
+//!
+//! The convention is to use a smaller set of query reads and a larger set of target reads. The query
+//! reads are overlapped with the target reads and an estimated genome size is calculated for
+//! **each query read** based on the number of overlaps it has with the target set.
+//!
+//! This strategy is generally faster than the [`AvaStrategy`][crate::AvaStrategy] and uses less memory.
+//!
+//! # Examples
+//!
+//! You probably want to use the [`Builder`] interface to customise the strategy.
+//!
+//! ```no_run
+//! use liblrge::{Estimate, TwoSetStrategy};
+//! use liblrge::estimate::{LOWER_QUANTILE, UPPER_QUANTILE};
+//! use liblrge::twoset::{Builder, DEFAULT_TARGET_NUM_READS, DEFAULT_QUERY_NUM_READS};
+//!
+//! let input = "path/to/reads.fastq";
+//! let mut strategy = Builder::new()
+//!    .target_num_reads(DEFAULT_TARGET_NUM_READS)
+//!    .query_num_reads(DEFAULT_QUERY_NUM_READS)
+//!    .threads(4)
+//!    .seed(Some(42))  // makes the estimate reproducible
+//!    .build(input);
+//!
+//! let finite = true;  // estimate the genome size based on the finite estimates (recommended)
+//! let low_q = Some(LOWER_QUANTILE);   // lower quantile for the confidence interval
+//! let upper_q = Some(UPPER_QUANTILE); // upper quantile for the confidence interval
+//! let est_result = strategy.estimate(finite, low_q, upper_q).expect("Failed to generate estimate");
+//! let estimate = est_result.estimate;
+//!
+//! let no_mapping_count = est_result.no_mapping_count;
+//! // you might want to handle cases where some proportion of query reads did not overlap with target reads
+//! ```
+//!
+//! By default, the intermediate target and query reads and overlap files are written to a temporary
+//! directory and cleaned up after the strategy object is dropped. This is done via the use of the
+//! [`tempfile`](https://crates.io/crates/tempfile) crate. The intermediate read files are placed in
+//! the temporary directory and named `target.fq` and `query.fq`, while the overlap file is named
+//! `overlaps.paf`.
+//!
+//! You can set your own temporary directory by using the [`Builder::tmpdir`] method.
 mod builder;
 
 use std::collections::HashSet;
@@ -27,6 +68,8 @@ pub const DEFAULT_QUERY_NUM_READS: usize = 5_000;
 /// The convention is to use a smaller set of query reads and a larger set of target reads. The
 /// query reads are overlapped with the target reads and an estimated genome size is calculated
 /// for **each query read** based on the number of overlaps it has with the target set.
+///
+/// See the [module-level documentation](crate::twoset) for more information and examples.
 pub struct TwoSetStrategy {
     /// Path to the FASTQ file.
     input: PathBuf,
diff --git a/lrge/Cargo.toml b/lrge/Cargo.toml
index 814f1d5..b4b8754 100644
--- a/lrge/Cargo.toml
+++ b/lrge/Cargo.toml
@@ -11,6 +11,7 @@ readme.workspace = true
 keywords.workspace = true
 license-file.workspace = true
 exclude.workspace = true
+rust-version.workspace = true
 
 [dependencies]
 log.workspace = true
diff --git a/lrge/src/main.rs b/lrge/src/main.rs
index c5a5027..bce0f23 100644
--- a/lrge/src/main.rs
+++ b/lrge/src/main.rs
@@ -85,7 +85,7 @@ fn main() -> Result<()> {
         .estimate(!args.with_infinity, Some(args.lower_q), Some(args.upper_q))
         .context("Failed to generate estimate")?;
 
-    let estimate = est_result.median;
+    let estimate = est_result.estimate;
     let low_q = est_result.lower;
     let upper_q = est_result.upper;