Skip to content

Commit

Permalink
expose options for long reads splitting and edit distance threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
sfchen committed Apr 23, 2020
1 parent b08ecea commit 5f29d9b
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 3 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,10 @@ Key options:
-g, --genomes the Genomes file in fasta format. data/SARS-CoV-2.genomes.fa will be used if neither KMER file (-k) nor Genomes file (-g) is specified (string [=])
-p, --positive_threshold the data is considered as POSITIVE, when its mean coverage of unique kmer >= positive_threshold (0.001 ~ 100). 0.1 by default. (float [=0.1])
-d, --depth_threshold For coverage calculation. A region is considered covered when its mean depth >= depth_threshold (0.001 ~ 1000). 1.0 by default. (float [=1])
--bin_size For coverage calculation. The genome is split to many bins, with the size of each bin is bin_size (1 ~ 100000), default 0 means adaptive. (int [=0])
-E, --ed_threshold If the edit distance of a sequence and a genome region is <=ed_threshold, then consider it a match (0 ~ 50). 8 by default. (int [=8])
--long_read_threshold A read will be considered as long read if its length >= long_read_threshold (100 ~ 10000). 200 by default. (int [=200])
--read_segment_len A long read will be splitted to read segments, with each <= read_segment_len (50 ~ 5000, should be < long_read_threshold). 100 by default. (int [=100])
--bin_size For coverage calculation. The genome is splitted to many bins, with each bin has a length of bin_size (1 ~ 100000), default 0 means adaptive. (int [=0])
-j, --json the json format report file name (string [=fastv.json])
-h, --html the html format report file name (string [=fastv.html])
-R, --report_title should be quoted with ' or ", default is "fastv report" (string [=fastv report])
Expand Down
2 changes: 1 addition & 1 deletion src/genomes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ MapResult Genomes::mapToGenome(string& seq, uint32 seqPos, string& genome, uint3
ret.ed = ed;
ret.start = gp;
ret.len = seq.length();
ret.mapped = ed < 10 && ed < seq.length()/4; // TODO: export to options
ret.mapped = ed <= mOptions->edThreshold && ed < seq.length()/4; // TODO: export to options

return ret;
}
Expand Down
8 changes: 7 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ int main(int argc, char* argv[]){
cmd.add<string>("genomes", 'g', "the Genomes file in fasta format. data/SARS-CoV-2.genomes.fa will be used if neither KMER file (-k) nor Genomes file (-g) is specified", false, "");
cmd.add<float>("positive_threshold", 'p', "the data is considered as POSITIVE, when its mean coverage of unique kmer >= positive_threshold (0.001 ~ 100). 0.1 by default.", false, 0.1);
cmd.add<float>("depth_threshold", 'd', "For coverage calculation. A region is considered covered when its mean depth >= depth_threshold (0.001 ~ 1000). 1.0 by default.", false, 1.0);
cmd.add<int>("bin_size", 0, "For coverage calculation. The genome is split to many bins, with the size of each bin is bin_size (1 ~ 100000), default 0 means adaptive.", false, 0);
cmd.add<int>("ed_threshold", 'E', "If the edit distance of a sequence and a genome region is <=ed_threshold, then consider it a match (0 ~ 50). 8 by default.", false, 8);
cmd.add<int>("long_read_threshold", 0, "A read will be considered as long read if its length >= long_read_threshold (100 ~ 10000). 200 by default.", false, 200);
cmd.add<int>("read_segment_len", 0, "A long read will be splitted to read segments, with each <= read_segment_len (50 ~ 5000, should be < long_read_threshold). 100 by default.", false, 100);
cmd.add<int>("bin_size", 0, "For coverage calculation. The genome is splitted to many bins, with each bin has a length of bin_size (1 ~ 100000), default 0 means adaptive.", false, 0);

// reporting
cmd.add<string>("json", 'j', "the json format report file name", false, "fastv.json");
Expand Down Expand Up @@ -176,6 +179,9 @@ int main(int argc, char* argv[]){

opt.positiveThreshold = cmd.get<float>("positive_threshold");
opt.depthThreshold = cmd.get<float>("depth_threshold");
opt.edThreshold = cmd.get<int>("ed_threshold");
opt.longReadThreshold = cmd.get<int>("long_read_threshold");
opt.segmentLength = cmd.get<int>("read_segment_len");
opt.statsBinSize = cmd.get<int>("bin_size");

opt.compression = cmd.get<int>("compression");
Expand Down
13 changes: 13 additions & 0 deletions src/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Options::Options(){
statsBinSize = 0;
longReadThreshold = 200;
segmentLength = 100;
edThreshold = 8;
}

void Options::init() {
Expand Down Expand Up @@ -177,6 +178,18 @@ bool Options::validate() {
if(depthThreshold < 0.001 || depthThreshold > 1000)
error_exit("depth threshold (-d) should be 0.001 ~ 1000, suggest 1");

if(longReadThreshold < 100 || longReadThreshold > 10000)
error_exit("long read threshold (--long_read_threshold) should be 100 ~ 10000, suggest 200");

if(segmentLength < 50 || segmentLength > 5000)
error_exit("segment length for splitted long reads (--read_segment_len) should be 50 ~ 5000, suggest 100");

if(segmentLength >= longReadThreshold)
error_exit("segment length for splitted long reads (--read_segment_len) must less than long read threshold (--long_read_threshold)");

if(edThreshold < 0 || edThreshold > 50)
error_exit("edit distance threshold (-E) should be 0 ~ 50, suggest 8");

if(trim.front1 < 0 || trim.front1 > 30)
error_exit("trim_front1 (--trim_front1) should be 0 ~ 30, suggest 0 ~ 4");

Expand Down
2 changes: 2 additions & 0 deletions src/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,8 @@ class Options{
double positiveThreshold;
// the threshold of depth for a region considered as covered
double depthThreshold;
// if ed(read, genome) <= edThreshold, then think it as a match
int edThreshold;
// the bin size for stat coverage and edit distance
int statsBinSize;
// read with length >= longReadThreshold will be considered as long reads
Expand Down

0 comments on commit 5f29d9b

Please sign in to comment.