From 4e6a17094aaaec1fafe696f4d3ced3e121797750 Mon Sep 17 00:00:00 2001 From: Michael Hall Date: Fri, 22 Nov 2024 10:39:16 +1000 Subject: [PATCH] docs(lib): finialise liblrge docs --- Cargo.lock | 1 + Cargo.toml | 1 + liblrge/Cargo.toml | 14 ++++--- liblrge/README.md | 10 +++++ liblrge/src/ava.rs | 38 ++++++++++++++++++- liblrge/src/estimate.rs | 18 +++++---- liblrge/src/lib.rs | 81 +++++++++++++++++++++++++++++++++++++++-- liblrge/src/twoset.rs | 43 ++++++++++++++++++++++ lrge/Cargo.toml | 1 + lrge/src/main.rs | 2 +- 10 files changed, 191 insertions(+), 18 deletions(-) create mode 100644 liblrge/README.md diff --git a/Cargo.lock b/Cargo.lock index af76366..402dcc7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -359,6 +359,7 @@ dependencies = [ "bzip2", "crossbeam-channel", "csv", + "env_logger", "flate2", "libc", "liblzma", diff --git a/Cargo.toml b/Cargo.toml index 0966474..57fef0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ readme = "README.md" keywords = ["bioinformatics", "long-reads", "overlaps", "estimation", "genome-size"] license-file = "LICENSE" exclude = ["paper/*"] +rust-version = "1.74.1" [workspace.dependencies] log = "0.4.22" diff --git a/liblrge/Cargo.toml b/liblrge/Cargo.toml index 5ab2489..518c5e9 100644 --- a/liblrge/Cargo.toml +++ b/liblrge/Cargo.toml @@ -7,10 +7,11 @@ authors.workspace = true edition.workspace = true repository.workspace = true homepage.workspace = true -readme.workspace = true +readme = "README.md" keywords.workspace = true license-file.workspace = true exclude.workspace = true +rust-version.workspace = true [dependencies] log.workspace = true @@ -25,13 +26,16 @@ serde = { version = "1.0.215", features = ["derive"] } csv = "1.3.1" # for compression support -flate2 = { version = "1.0.34", optional = true, package = "flate2" } # For gzip files -zstd = { version = "0.13.2", optional = true } # For zstd files -bzip2 = { version = "0.4.4", optional = true } # For bzip2 files -liblzma = { version = "0.3.5", optional = true, package = "liblzma" } # For lzma (xz) files. +flate2 = { version = "1.0.34", optional = true, package = "flate2" } +zstd = { version = "0.13.2", optional = true } +bzip2 = { version = "0.4.4", optional = true } +liblzma = { version = "0.3.5", optional = true, package = "liblzma" } [features] compression = ["gzip", "zstd", "bzip2", "xz"] # Enable compression support default = ["compression"] # Enable compression by default xz = ["liblzma"] # Alias "xz" to "liblzma" dependency gzip = ["flate2"] # Alias "gzip" to "flate2" dependency + +[dev-dependencies] +env_logger = "0.11.5" # for documentation tests \ No newline at end of file diff --git a/liblrge/README.md b/liblrge/README.md new file mode 100644 index 0000000..1681810 --- /dev/null +++ b/liblrge/README.md @@ -0,0 +1,10 @@ +# liblrge + +![docs.rs](https://img.shields.io/docsrs/liblrge) +![Crates.io Version](https://img.shields.io/crates/v/liblrge) +![Crates.io Total Downloads](https://img.shields.io/crates/d/liblrge) + +This is a Rust library for estimating genome size from long read overlaps. The library is used by the `lrge` command +line tool documented in the [root of this repository](https://www.github.com/mbhall88/lrge). + +See the [documentation](https:docs.rs/liblrge) for example usage and API documentation. \ No newline at end of file diff --git a/liblrge/src/ava.rs b/liblrge/src/ava.rs index cbd778f..6fba03e 100644 --- a/liblrge/src/ava.rs +++ b/liblrge/src/ava.rs @@ -1,7 +1,40 @@ //! A strategy that compares overlaps between the same set of reads - i.e., all-vs-all. //! -//! In general, this strategy is less computationally efficient than [`TwoSetStrategy`], but it +//! In general, this strategy is less computationally efficient than [`TwoSetStrategy`][crate::TwoSetStrategy], but it //! is slightly more accurate - though that difference in accuracy is not statistically significant. +//! +//! # Examples +//! +//! You probably want to use the [`Builder`] interface to customise the strategy. +//! +//! ```no_run +//! use liblrge::{Estimate, AvaStrategy}; +//! use liblrge::estimate::{LOWER_QUANTILE, UPPER_QUANTILE}; +//! use liblrge::ava::{Builder, DEFAULT_AVA_NUM_READS}; +//! +//! let input = "path/to/reads.fastq"; +//! let mut strategy = Builder::new() +//! .num_reads(DEFAULT_AVA_NUM_READS) +//! .threads(4) +//! .seed(Some(42)) // makes the estimate reproducible +//! .build(input); +//! +//! let finite = true; // estimate the genome size based on the finite estimates (recommended) +//! let low_q = Some(LOWER_QUANTILE); // lower quantile for the confidence interval +//! let upper_q = Some(UPPER_QUANTILE); // upper quantile for the confidence interval +//! let est_result = strategy.estimate(finite, low_q, upper_q).expect("Failed to generate estimate"); +//! let estimate = est_result.estimate; +//! +//! let no_mapping_count = est_result.no_mapping_count; +//! // you might want to handle cases where some proportion of query reads did not overlap with target reads +//! ``` +//! +//! By default, the intermediate reads and overlap files are written to a temporary directory and +//! cleaned up after the strategy object is dropped. This is done via the use of the [`tempfile`](https://crates.io/crates/tempfile) crate. +//! The intermediate reads file will be placed inside the temporary directory and names `reads.fq`, +//! while the overlap file will be named `overlaps.paf`. +//! +//! You can set your own temporary directory by using the [`Builder::tmpdir`] method. mod builder; use std::collections::{HashMap, HashSet}; @@ -23,6 +56,7 @@ use crate::io::FastqRecordExt; use crate::minimap2::{AlignerWrapper, Preset}; use crate::{io, unique_random_set, Estimate, Platform}; +/// The default number of reads to use in the all-vs-all strategy. pub const DEFAULT_AVA_NUM_READS: usize = 25_000; /// A strategy that compares overlaps between two sets of reads. @@ -30,6 +64,8 @@ pub const DEFAULT_AVA_NUM_READS: usize = 25_000; /// The convention is to use a smaller set of query reads and a larger set of target reads. The /// query reads are overlapped with the target reads and an estimated genome size is calculated /// for **each query read** based on the number of overlaps it has with the target set. +/// +/// See the [module-level documentation](crate::ava) for more information and examples. pub struct AvaStrategy { /// Path to the FASTQ file. input: PathBuf, diff --git a/liblrge/src/estimate.rs b/liblrge/src/estimate.rs index 0f88ff9..e05624a 100644 --- a/liblrge/src/estimate.rs +++ b/liblrge/src/estimate.rs @@ -9,7 +9,7 @@ pub struct EstimateResult { /// The lower quantile of the estimates pub lower: Option, /// The median of the estimates - this is the genome size estimate - pub median: Option, + pub estimate: Option, /// The upper quantile of the estimates pub upper: Option, /// The number of reads that did not have an overlap @@ -34,9 +34,9 @@ pub trait Estimate { /// * `finite`: Whether to consider only finite estimates. We found setting this to `true` gave /// more accurate results (see the paper). /// * `lower_quant`: The lower percentile to calculate. If `None`, this will not be calculated. - /// This value should be between 0 and 1. So, for the 25th percentile, you would pass `0.25`. + /// This value should be between 0 and 0.5. So, for the 25th percentile, you would pass `0.25`. /// * `upper_quant`: The upper percentile to calculate. If `None`, this will not be calculated. - /// This value should be between 0 and 1. So, for the 75th percentile, you would pass `0.75`. + /// This value should be between 0.5 and 1.0. So, for the 75th percentile, you would pass `0.75`. /// /// In our analysis, we found that the 15th and 65th percentiles gave the highest confidence (~92%). /// If you want to use our most current recommended values, you can use the constants [`LOWER_QUANTILE`] @@ -44,11 +44,13 @@ pub trait Estimate { /// /// # Returns /// - /// A tuple containing the lower percentile (if requested), median, and upper percentile (if - /// requested) of the estimates - in that order. + /// An [`EstimateResult`] containing the lower, median, and upper estimates, as well as the number + /// of reads that did not have an overlap. This number can be important for quality control. For + /// examples, if you have a high number of reads that did not overlap, you may want to investigate + /// why that is (e.g., contamination, poor quality reads, etc.). /// - /// Will return `None` if there are no finite estimates when `finite` is `true`. Will also return - /// `None` if there are no estimates at all. + /// The estimate will be `None` if there are no finite estimates when `finite` is `true`, or if + /// there are no estimates at all. fn estimate( &mut self, finite: bool, @@ -67,7 +69,7 @@ pub trait Estimate { Ok(EstimateResult { lower, - median, + estimate: median, upper, no_mapping_count, }) diff --git a/liblrge/src/lib.rs b/liblrge/src/lib.rs index fb22b38..862e235 100644 --- a/liblrge/src/lib.rs +++ b/liblrge/src/lib.rs @@ -3,6 +3,59 @@ //! `liblrge` is a Rust library that provides utilities for estimating genome size for a given set //! of reads. //! +//! You can find a command-line interface (CLI) tool that uses this library in the [`lrge`][lrge] crate. +//! +//! [lrge]: https://crates.io/crates/lrge +//! +//! ## Usage +//! +//! The library provides two strategies for estimating genome size: +//! +//! ### [`TwoSetStrategy`] +//! +//! The two-set strategy uses two (random) sets of reads to estimate the genome size. The query set, which is +//! generally smaller, is overlapped against a target set of reads. A genome size estimate is generated +//! for each read in the query set, based on the number of overlaps and the average read length. +//! The median of these estimates is taken as the final genome size estimate. +//! +//! ```no_run +//! use liblrge::{Estimate, TwoSetStrategy}; +//! use liblrge::twoset::{Builder, DEFAULT_TARGET_NUM_READS, DEFAULT_QUERY_NUM_READS}; +//! +//! let input = "path/to/reads.fastq"; +//! let mut strategy = Builder::new() +//! .target_num_reads(DEFAULT_TARGET_NUM_READS) +//! .query_num_reads(DEFAULT_QUERY_NUM_READS) +//! .threads(4) +//! .build(input); +//! +//! let est_result = strategy.estimate(false, None, None).expect("Failed to generate estimate"); +//! let estimate = est_result.estimate; +//! // do something with the estimate +//! ``` +//! +//! ### [`AvaStrategy`] +//! +//! The all-vs-all (ava) strategy takes a (random) set of reads and overlaps it against itself to +//! estimate the genome size. The genome size estimate is generated for each read in the set, based on the +//! number of overlaps and the average read length - minus the read being assessed. The median of these +//! estimates is taken as the final genome size estimate. +//! +//! ```no_run +//! use liblrge::{Estimate, AvaStrategy}; +//! use liblrge::ava::{Builder, DEFAULT_AVA_NUM_READS}; +//! +//! let input = "path/to/reads.fastq"; +//! let mut strategy = Builder::new() +//! .num_reads(DEFAULT_AVA_NUM_READS) +//! .threads(4) +//! .build(input); +//! +//! let est_result = strategy.estimate(false, None, None).expect("Failed to generate estimate"); +//! let estimate = est_result.estimate; +//! // do something with the estimate +//! ``` +//! //! ## Features //! //! This library includes optional support for compressed file formats, controlled by feature flags. @@ -48,9 +101,31 @@ //! [xz]: https://crates.io/liblzma //! [bzip2]: https://crates.io/crates/bzip2 //! [magic]: https://en.wikipedia.org/wiki/Magic_number_(programming)#In_files +//! +//! ## Disabling logging +//! +//! `liblrge` will output some logging information via the [`log`][log] crate. If you wish to +//! suppress this logging you can configure the logging level in your application. For example, using +//! the [`env_logger`][env_logger] crate you can do the following: +//! +//! ``` +//! use log::LevelFilter; +//! +//! let mut log_builder = env_logger::Builder::new(); +//! log_builder +//! .filter(None, LevelFilter::Info) +//! .filter_module("liblrge", LevelFilter::Off); +//! log_builder.init(); +//! +//! // Your application code here +//! ``` +//! +//! This will set the global logging level to `Info` and disable all logging from the `liblrge` library. +//! +//! [log]: https://crates.io/crates/log +//! [env_logger]: https://crates.io/crates/env_logger // todo add link to paper -// todo add library denies such as #![deny(missing_docs)] -// todo add info on how to suppress logging +#[deny(missing_docs)] pub mod ava; pub mod error; pub mod estimate; @@ -66,7 +141,7 @@ pub use self::estimate::Estimate; pub use self::twoset::TwoSetStrategy; use std::str::FromStr; -/// A type alias for `Result` with [`LrgeError`] as the error type. +/// A type alias for `Result` with [`LrgeError`][crate::error::LrgeError] as the error type. pub type Result = std::result::Result; /// The sequencing platform used to generate the reads. diff --git a/liblrge/src/twoset.rs b/liblrge/src/twoset.rs index 1e8fb92..a681479 100644 --- a/liblrge/src/twoset.rs +++ b/liblrge/src/twoset.rs @@ -1,4 +1,45 @@ //! A strategy that compares overlaps between two different sets of reads. +//! +//! The convention is to use a smaller set of query reads and a larger set of target reads. The query +//! reads are overlapped with the target reads and an estimated genome size is calculated for +//! **each query read** based on the number of overlaps it has with the target set. +//! +//! This strategy is generally faster than the [`AvaStrategy`][crate::AvaStrategy] and uses less memory. +//! +//! # Examples +//! +//! You probably want to use the [`Builder`] interface to customise the strategy. +//! +//! ```no_run +//! use liblrge::{Estimate, TwoSetStrategy}; +//! use liblrge::estimate::{LOWER_QUANTILE, UPPER_QUANTILE}; +//! use liblrge::twoset::{Builder, DEFAULT_TARGET_NUM_READS, DEFAULT_QUERY_NUM_READS}; +//! +//! let input = "path/to/reads.fastq"; +//! let mut strategy = Builder::new() +//! .target_num_reads(DEFAULT_TARGET_NUM_READS) +//! .query_num_reads(DEFAULT_QUERY_NUM_READS) +//! .threads(4) +//! .seed(Some(42)) // makes the estimate reproducible +//! .build(input); +//! +//! let finite = true; // estimate the genome size based on the finite estimates (recommended) +//! let low_q = Some(LOWER_QUANTILE); // lower quantile for the confidence interval +//! let upper_q = Some(UPPER_QUANTILE); // upper quantile for the confidence interval +//! let est_result = strategy.estimate(finite, low_q, upper_q).expect("Failed to generate estimate"); +//! let estimate = est_result.estimate; +//! +//! let no_mapping_count = est_result.no_mapping_count; +//! // you might want to handle cases where some proportion of query reads did not overlap with target reads +//! ``` +//! +//! By default, the intermediate target and query reads and overlap files are written to a temporary +//! directory and cleaned up after the strategy object is dropped. This is done via the use of the +//! [`tempfile`](https://crates.io/crates/tempfile) crate. The intermediate read files are placed in +//! the temporary directory and named `target.fq` and `query.fq`, while the overlap file is named +//! `overlaps.paf`. +//! +//! You can set your own temporary directory by using the [`Builder::tmpdir`] method. mod builder; use std::collections::HashSet; @@ -27,6 +68,8 @@ pub const DEFAULT_QUERY_NUM_READS: usize = 5_000; /// The convention is to use a smaller set of query reads and a larger set of target reads. The /// query reads are overlapped with the target reads and an estimated genome size is calculated /// for **each query read** based on the number of overlaps it has with the target set. +/// +/// See the [module-level documentation](crate::twoset) for more information and examples. pub struct TwoSetStrategy { /// Path to the FASTQ file. input: PathBuf, diff --git a/lrge/Cargo.toml b/lrge/Cargo.toml index 814f1d5..b4b8754 100644 --- a/lrge/Cargo.toml +++ b/lrge/Cargo.toml @@ -11,6 +11,7 @@ readme.workspace = true keywords.workspace = true license-file.workspace = true exclude.workspace = true +rust-version.workspace = true [dependencies] log.workspace = true diff --git a/lrge/src/main.rs b/lrge/src/main.rs index c5a5027..bce0f23 100644 --- a/lrge/src/main.rs +++ b/lrge/src/main.rs @@ -85,7 +85,7 @@ fn main() -> Result<()> { .estimate(!args.with_infinity, Some(args.lower_q), Some(args.upper_q)) .context("Failed to generate estimate")?; - let estimate = est_result.median; + let estimate = est_result.estimate; let low_q = est_result.lower; let upper_q = est_result.upper;