Skip to content

Commit

Permalink
add jank v1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
pre-mRNA committed Apr 10, 2023
1 parent 63c414f commit a0e1662
Show file tree
Hide file tree
Showing 4 changed files with 202 additions and 0 deletions.
13 changes: 13 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "jank"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
bio = "0.38.0"
rayon = "1.5.1"
clap = "2.33.3"
flate2 = "1.0.22"
gzp = "0.4.0"
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Just ANother K-mer Counter (JANK)
<p align="center">
<img src="./img/jank.png" alt="JANK logo" width="400" height="300">
</p>
JANK is a high-performance k-mer counter developed in Rust, designed to efficiently process both FASTA and FASTQ files, including gzipped files. It utilizes parallel processing to enhance performance on multi-core systems.

## Features
- Compatible with FASTA and FASTQ input files
- Supports gzipped files
- Employs parallel processing for optimal performance on multi-core systems
- Offers the option to count reverse complement k-mers
- Allows user-defined k-mer size
- Automatically multithreads, but enables used-defined threadcount

## Dependencies
- Rust programming language (version >= 1.54.0)
- clap (version >= 3.0.0)
- flate2 (version >= 1.0.20)
- rayon (version >= 1.5.1)
**Note: Dependencies will be automatically installed during compilation.**

## Installation
```
# Clone the repository
git clone https://github.com/your_username/jank.git
# Compile the code
cd jank && cargo build --release
# Add jank to your path
export PATH="$PATH:$(pwd)/target/release"
```
The compiled binary will be located in the target/release directory, and added to your current PATH.

## Usage
```
jank [FLAGS] [OPTIONS] <input_file>
```
<input_file>: The input FASTA or FASTQ file (required)
### Flags
-r, --count-reverse-complement: Count reverse complement k-mers as well
### Options
-k, --kmer-size <k_value>: Size of k-mers to count (default: 9)
-c, --cpu-count <cpu_count>: Number of CPU cores to use (default: 0, which means the program will use all available cores)

```
jank -k 7 -c 4 -r input.fasta
```
This command will count 7-mers in the input.fasta file, using 4 CPU cores, and also counting reverse complement k-mers.
Binary file added img/jank.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
140 changes: 140 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
use std::fs::File;
use std::io::{BufRead, BufReader, Write, stderr};
use std::collections::HashMap;
use clap::{Arg, App, value_t};
use rayon::prelude::*;
use std::sync::Mutex;
use flate2::bufread::GzDecoder;

fn main() {
let matches = App::new("Just ANother Kmer counter")
.arg(Arg::with_name("input_file")
.help("Input FASTA or FASTQ file")
.required(true)
.index(1))
.arg(Arg::with_name("k_value")
.short("k")
.long("kmer-size")
.help("Size of k-mers to count")
.takes_value(true)
.default_value("9"))
.arg(Arg::with_name("cpu_count")
.short("c")
.long("cpu-count")
.help("Number of CPU cores to use")
.takes_value(true)
.default_value("0"))
.arg(Arg::with_name("count_reverse_complement")
.short("r")
.long("count-reverse-complement")
.help("Count reverse complement k-mers as well")
.takes_value(false))
.get_matches();

let input_file = matches.value_of("input_file").unwrap();
let kmer_size: usize = value_t!(matches.value_of("k_value"), usize).unwrap_or_else(|e| e.exit());
let cpu_count: usize = value_t!(matches.value_of("cpu_count"), usize).unwrap_or_else(|e| e.exit());
let count_reverse_complement = matches.is_present("count_reverse_complement");

if cpu_count > 0 {
rayon::ThreadPoolBuilder::new().num_threads(cpu_count).build_global().unwrap();
}

let sequences = read_sequences(input_file);
let kmer_counts = Mutex::new(HashMap::new());

let total_length: usize = sequences.par_iter()
.map(|seq| {
if seq.len() >= kmer_size {
for idx in 0..=seq.len() - kmer_size {
let kmer = String::from_utf8_lossy(&seq[idx..idx + kmer_size]).to_string();
let mut kmer_counts = kmer_counts.lock().unwrap();
kmer_counts.entry(kmer.clone()).and_modify(|c| *c += 1).or_insert(1);
if count_reverse_complement {
let reverse_complement = reverse_complement(&kmer);
kmer_counts.entry(reverse_complement).and_modify(|c| *c += 1).or_insert(1);
}
}
seq.len()
} else {
writeln!(stderr(), "Discarding short sequence with length: {} (less than k-mer size: {})", seq.len(), kmer_size).unwrap();
0
}
})
.reduce(|| 0, |a, b| a + b);

let total_kmers: usize = kmer_counts.lock().unwrap().values().map(|v| *v).sum();

writeln!(stderr(), "Total length of input sequences: {}", total_length).unwrap();
writeln!(stderr(), "Total number of k-mers counted: {}", total_kmers).unwrap();
writeln!(stderr(), "Using {} CPU cores", rayon::current_num_threads()).unwrap();

println!("# Command: kmer_counter {} -k {}", input_file, kmer_size);
for (kmer, count) in kmer_counts.lock().unwrap().iter() {
println!("{} {}", kmer, count);
}
}

fn read_sequences(filename: &str) -> Vec<Vec<u8>> {
let file = File::open(filename).expect("Error opening file");
let is_gzipped = filename.to_lowercase().ends_with(".gz");

let reader: Box<dyn BufRead> = if is_gzipped {
Box::new(BufReader::new(GzDecoder::new(BufReader::new(file))))
} else {
Box::new(BufReader::new(file))
};

let is_fasta = filename.to_lowercase().contains(".fasta") || filename.to_lowercase().contains(".fa");
let is_fastq = filename.to_lowercase().contains(".fastq") || filename.to_lowercase().contains(".fq");

let mut sequences = Vec::new();
let mut current_seq = Vec::new();
let mut in_sequence = false;
let mut ignore_lines = 0;

eprintln!("Reading sequences...");
for line in reader.lines() {
let line = line.unwrap();
if ignore_lines > 0 {
ignore_lines -= 1;
continue;
}

if line.starts_with('>') && is_fasta || line.starts_with('@') && is_fastq {
if !current_seq.is_empty() {
sequences.push(current_seq);
current_seq = Vec::new();
}
in_sequence = true;
} else if in_sequence && is_fastq && line.starts_with('+') {
ignore_lines = 1;
in_sequence = false;
} else if in_sequence {
current_seq.extend(line.bytes());
}
}

if !current_seq.is_empty() {
sequences.push(current_seq);
}

eprintln!("Finished reading sequences. Sequences count: {}", sequences.len());

sequences
}

fn reverse_complement(kmer: &str) -> String {
let mut rev_comp = String::with_capacity(kmer.len());
for base in kmer.chars().rev() {
let complement = match base {
'A' => 'T',
'T' => 'A',
'C' => 'G',
'G' => 'C',
_ => panic!("Invalid base in k-mer: {}", base),
};
rev_comp.push(complement);
}
rev_comp
}

0 comments on commit a0e1662

Please sign in to comment.