Skip to content

Commit

Permalink
docs(lib): finialise liblrge docs
Browse files Browse the repository at this point in the history
  • Loading branch information
mbhall88 committed Nov 22, 2024
1 parent 917b786 commit 4e6a170
Show file tree
Hide file tree
Showing 10 changed files with 191 additions and 18 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ readme = "README.md"
keywords = ["bioinformatics", "long-reads", "overlaps", "estimation", "genome-size"]
license-file = "LICENSE"
exclude = ["paper/*"]
rust-version = "1.74.1"

[workspace.dependencies]
log = "0.4.22"
Expand Down
14 changes: 9 additions & 5 deletions liblrge/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ authors.workspace = true
edition.workspace = true
repository.workspace = true
homepage.workspace = true
readme.workspace = true
readme = "README.md"
keywords.workspace = true
license-file.workspace = true
exclude.workspace = true
rust-version.workspace = true

[dependencies]
log.workspace = true
Expand All @@ -25,13 +26,16 @@ serde = { version = "1.0.215", features = ["derive"] }
csv = "1.3.1"

# for compression support
flate2 = { version = "1.0.34", optional = true, package = "flate2" } # For gzip files
zstd = { version = "0.13.2", optional = true } # For zstd files
bzip2 = { version = "0.4.4", optional = true } # For bzip2 files
liblzma = { version = "0.3.5", optional = true, package = "liblzma" } # For lzma (xz) files.
flate2 = { version = "1.0.34", optional = true, package = "flate2" }
zstd = { version = "0.13.2", optional = true }
bzip2 = { version = "0.4.4", optional = true }
liblzma = { version = "0.3.5", optional = true, package = "liblzma" }

[features]
compression = ["gzip", "zstd", "bzip2", "xz"] # Enable compression support
default = ["compression"] # Enable compression by default
xz = ["liblzma"] # Alias "xz" to "liblzma" dependency
gzip = ["flate2"] # Alias "gzip" to "flate2" dependency

[dev-dependencies]
env_logger = "0.11.5" # for documentation tests
10 changes: 10 additions & 0 deletions liblrge/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# liblrge

![docs.rs](https://img.shields.io/docsrs/liblrge)
![Crates.io Version](https://img.shields.io/crates/v/liblrge)
![Crates.io Total Downloads](https://img.shields.io/crates/d/liblrge)

This is a Rust library for estimating genome size from long read overlaps. The library is used by the `lrge` command
line tool documented in the [root of this repository](https://www.github.com/mbhall88/lrge).

See the [documentation](https:docs.rs/liblrge) for example usage and API documentation.
38 changes: 37 additions & 1 deletion liblrge/src/ava.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,40 @@
//! A strategy that compares overlaps between the same set of reads - i.e., all-vs-all.
//!
//! In general, this strategy is less computationally efficient than [`TwoSetStrategy`], but it
//! In general, this strategy is less computationally efficient than [`TwoSetStrategy`][crate::TwoSetStrategy], but it
//! is slightly more accurate - though that difference in accuracy is not statistically significant.
//!
//! # Examples
//!
//! You probably want to use the [`Builder`] interface to customise the strategy.
//!
//! ```no_run
//! use liblrge::{Estimate, AvaStrategy};
//! use liblrge::estimate::{LOWER_QUANTILE, UPPER_QUANTILE};
//! use liblrge::ava::{Builder, DEFAULT_AVA_NUM_READS};
//!
//! let input = "path/to/reads.fastq";
//! let mut strategy = Builder::new()
//! .num_reads(DEFAULT_AVA_NUM_READS)
//! .threads(4)
//! .seed(Some(42)) // makes the estimate reproducible
//! .build(input);
//!
//! let finite = true; // estimate the genome size based on the finite estimates (recommended)
//! let low_q = Some(LOWER_QUANTILE); // lower quantile for the confidence interval
//! let upper_q = Some(UPPER_QUANTILE); // upper quantile for the confidence interval
//! let est_result = strategy.estimate(finite, low_q, upper_q).expect("Failed to generate estimate");
//! let estimate = est_result.estimate;
//!
//! let no_mapping_count = est_result.no_mapping_count;
//! // you might want to handle cases where some proportion of query reads did not overlap with target reads
//! ```
//!
//! By default, the intermediate reads and overlap files are written to a temporary directory and
//! cleaned up after the strategy object is dropped. This is done via the use of the [`tempfile`](https://crates.io/crates/tempfile) crate.
//! The intermediate reads file will be placed inside the temporary directory and names `reads.fq`,
//! while the overlap file will be named `overlaps.paf`.
//!
//! You can set your own temporary directory by using the [`Builder::tmpdir`] method.
mod builder;

use std::collections::{HashMap, HashSet};
Expand All @@ -23,13 +56,16 @@ use crate::io::FastqRecordExt;
use crate::minimap2::{AlignerWrapper, Preset};
use crate::{io, unique_random_set, Estimate, Platform};

/// The default number of reads to use in the all-vs-all strategy.
pub const DEFAULT_AVA_NUM_READS: usize = 25_000;

/// A strategy that compares overlaps between two sets of reads.
///
/// The convention is to use a smaller set of query reads and a larger set of target reads. The
/// query reads are overlapped with the target reads and an estimated genome size is calculated
/// for **each query read** based on the number of overlaps it has with the target set.
///
/// See the [module-level documentation](crate::ava) for more information and examples.
pub struct AvaStrategy {
/// Path to the FASTQ file.
input: PathBuf,
Expand Down
18 changes: 10 additions & 8 deletions liblrge/src/estimate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pub struct EstimateResult {
/// The lower quantile of the estimates
pub lower: Option<f32>,
/// The median of the estimates - this is the genome size estimate
pub median: Option<f32>,
pub estimate: Option<f32>,
/// The upper quantile of the estimates
pub upper: Option<f32>,
/// The number of reads that did not have an overlap
Expand All @@ -34,21 +34,23 @@ pub trait Estimate {
/// * `finite`: Whether to consider only finite estimates. We found setting this to `true` gave
/// more accurate results (see the paper).
/// * `lower_quant`: The lower percentile to calculate. If `None`, this will not be calculated.
/// This value should be between 0 and 1. So, for the 25th percentile, you would pass `0.25`.
/// This value should be between 0 and 0.5. So, for the 25th percentile, you would pass `0.25`.
/// * `upper_quant`: The upper percentile to calculate. If `None`, this will not be calculated.
/// This value should be between 0 and 1. So, for the 75th percentile, you would pass `0.75`.
/// This value should be between 0.5 and 1.0. So, for the 75th percentile, you would pass `0.75`.
///
/// In our analysis, we found that the 15th and 65th percentiles gave the highest confidence (~92%).
/// If you want to use our most current recommended values, you can use the constants [`LOWER_QUANTILE`]
/// and [`UPPER_QUANTILE`]. You can of course use any values you like.
///
/// # Returns
///
/// A tuple containing the lower percentile (if requested), median, and upper percentile (if
/// requested) of the estimates - in that order.
/// An [`EstimateResult`] containing the lower, median, and upper estimates, as well as the number
/// of reads that did not have an overlap. This number can be important for quality control. For
/// examples, if you have a high number of reads that did not overlap, you may want to investigate
/// why that is (e.g., contamination, poor quality reads, etc.).
///
/// Will return `None` if there are no finite estimates when `finite` is `true`. Will also return
/// `None` if there are no estimates at all.
/// The estimate will be `None` if there are no finite estimates when `finite` is `true`, or if
/// there are no estimates at all.
fn estimate(
&mut self,
finite: bool,
Expand All @@ -67,7 +69,7 @@ pub trait Estimate {

Ok(EstimateResult {
lower,
median,
estimate: median,
upper,
no_mapping_count,
})
Expand Down
81 changes: 78 additions & 3 deletions liblrge/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,59 @@
//! `liblrge` is a Rust library that provides utilities for estimating genome size for a given set
//! of reads.
//!
//! You can find a command-line interface (CLI) tool that uses this library in the [`lrge`][lrge] crate.
//!
//! [lrge]: https://crates.io/crates/lrge
//!
//! ## Usage
//!
//! The library provides two strategies for estimating genome size:
//!
//! ### [`TwoSetStrategy`]
//!
//! The two-set strategy uses two (random) sets of reads to estimate the genome size. The query set, which is
//! generally smaller, is overlapped against a target set of reads. A genome size estimate is generated
//! for each read in the query set, based on the number of overlaps and the average read length.
//! The median of these estimates is taken as the final genome size estimate.
//!
//! ```no_run
//! use liblrge::{Estimate, TwoSetStrategy};
//! use liblrge::twoset::{Builder, DEFAULT_TARGET_NUM_READS, DEFAULT_QUERY_NUM_READS};
//!
//! let input = "path/to/reads.fastq";
//! let mut strategy = Builder::new()
//! .target_num_reads(DEFAULT_TARGET_NUM_READS)
//! .query_num_reads(DEFAULT_QUERY_NUM_READS)
//! .threads(4)
//! .build(input);
//!
//! let est_result = strategy.estimate(false, None, None).expect("Failed to generate estimate");
//! let estimate = est_result.estimate;
//! // do something with the estimate
//! ```
//!
//! ### [`AvaStrategy`]
//!
//! The all-vs-all (ava) strategy takes a (random) set of reads and overlaps it against itself to
//! estimate the genome size. The genome size estimate is generated for each read in the set, based on the
//! number of overlaps and the average read length - minus the read being assessed. The median of these
//! estimates is taken as the final genome size estimate.
//!
//! ```no_run
//! use liblrge::{Estimate, AvaStrategy};
//! use liblrge::ava::{Builder, DEFAULT_AVA_NUM_READS};
//!
//! let input = "path/to/reads.fastq";
//! let mut strategy = Builder::new()
//! .num_reads(DEFAULT_AVA_NUM_READS)
//! .threads(4)
//! .build(input);
//!
//! let est_result = strategy.estimate(false, None, None).expect("Failed to generate estimate");
//! let estimate = est_result.estimate;
//! // do something with the estimate
//! ```
//!
//! ## Features
//!
//! This library includes optional support for compressed file formats, controlled by feature flags.
Expand Down Expand Up @@ -48,9 +101,31 @@
//! [xz]: https://crates.io/liblzma
//! [bzip2]: https://crates.io/crates/bzip2
//! [magic]: https://en.wikipedia.org/wiki/Magic_number_(programming)#In_files
//!
//! ## Disabling logging
//!
//! `liblrge` will output some logging information via the [`log`][log] crate. If you wish to
//! suppress this logging you can configure the logging level in your application. For example, using
//! the [`env_logger`][env_logger] crate you can do the following:
//!
//! ```
//! use log::LevelFilter;
//!
//! let mut log_builder = env_logger::Builder::new();
//! log_builder
//! .filter(None, LevelFilter::Info)
//! .filter_module("liblrge", LevelFilter::Off);
//! log_builder.init();
//!
//! // Your application code here
//! ```
//!
//! This will set the global logging level to `Info` and disable all logging from the `liblrge` library.
//!
//! [log]: https://crates.io/crates/log
//! [env_logger]: https://crates.io/crates/env_logger
// todo add link to paper
// todo add library denies such as #![deny(missing_docs)]
// todo add info on how to suppress logging
#[deny(missing_docs)]
pub mod ava;
pub mod error;
pub mod estimate;
Expand All @@ -66,7 +141,7 @@ pub use self::estimate::Estimate;
pub use self::twoset::TwoSetStrategy;
use std::str::FromStr;

/// A type alias for `Result` with [`LrgeError`] as the error type.
/// A type alias for `Result` with [`LrgeError`][crate::error::LrgeError] as the error type.
pub type Result<T> = std::result::Result<T, error::LrgeError>;

/// The sequencing platform used to generate the reads.
Expand Down
43 changes: 43 additions & 0 deletions liblrge/src/twoset.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,45 @@
//! A strategy that compares overlaps between two different sets of reads.
//!
//! The convention is to use a smaller set of query reads and a larger set of target reads. The query
//! reads are overlapped with the target reads and an estimated genome size is calculated for
//! **each query read** based on the number of overlaps it has with the target set.
//!
//! This strategy is generally faster than the [`AvaStrategy`][crate::AvaStrategy] and uses less memory.
//!
//! # Examples
//!
//! You probably want to use the [`Builder`] interface to customise the strategy.
//!
//! ```no_run
//! use liblrge::{Estimate, TwoSetStrategy};
//! use liblrge::estimate::{LOWER_QUANTILE, UPPER_QUANTILE};
//! use liblrge::twoset::{Builder, DEFAULT_TARGET_NUM_READS, DEFAULT_QUERY_NUM_READS};
//!
//! let input = "path/to/reads.fastq";
//! let mut strategy = Builder::new()
//! .target_num_reads(DEFAULT_TARGET_NUM_READS)
//! .query_num_reads(DEFAULT_QUERY_NUM_READS)
//! .threads(4)
//! .seed(Some(42)) // makes the estimate reproducible
//! .build(input);
//!
//! let finite = true; // estimate the genome size based on the finite estimates (recommended)
//! let low_q = Some(LOWER_QUANTILE); // lower quantile for the confidence interval
//! let upper_q = Some(UPPER_QUANTILE); // upper quantile for the confidence interval
//! let est_result = strategy.estimate(finite, low_q, upper_q).expect("Failed to generate estimate");
//! let estimate = est_result.estimate;
//!
//! let no_mapping_count = est_result.no_mapping_count;
//! // you might want to handle cases where some proportion of query reads did not overlap with target reads
//! ```
//!
//! By default, the intermediate target and query reads and overlap files are written to a temporary
//! directory and cleaned up after the strategy object is dropped. This is done via the use of the
//! [`tempfile`](https://crates.io/crates/tempfile) crate. The intermediate read files are placed in
//! the temporary directory and named `target.fq` and `query.fq`, while the overlap file is named
//! `overlaps.paf`.
//!
//! You can set your own temporary directory by using the [`Builder::tmpdir`] method.
mod builder;

use std::collections::HashSet;
Expand Down Expand Up @@ -27,6 +68,8 @@ pub const DEFAULT_QUERY_NUM_READS: usize = 5_000;
/// The convention is to use a smaller set of query reads and a larger set of target reads. The
/// query reads are overlapped with the target reads and an estimated genome size is calculated
/// for **each query read** based on the number of overlaps it has with the target set.
///
/// See the [module-level documentation](crate::twoset) for more information and examples.
pub struct TwoSetStrategy {
/// Path to the FASTQ file.
input: PathBuf,
Expand Down
1 change: 1 addition & 0 deletions lrge/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ readme.workspace = true
keywords.workspace = true
license-file.workspace = true
exclude.workspace = true
rust-version.workspace = true

[dependencies]
log.workspace = true
Expand Down
2 changes: 1 addition & 1 deletion lrge/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ fn main() -> Result<()> {
.estimate(!args.with_infinity, Some(args.lower_q), Some(args.upper_q))
.context("Failed to generate estimate")?;

let estimate = est_result.median;
let estimate = est_result.estimate;
let low_q = est_result.lower;
let upper_q = est_result.upper;

Expand Down

0 comments on commit 4e6a170

Please sign in to comment.