Skip to content

Commit

Permalink
Illumina Complete Long Read presets (#1069)
Browse files Browse the repository at this point in the history
* Implements a transition-aware alignment scoring scheme and configuration presets for ICLR

* Fix to enable use of general scoring matrix in ksw as suggested by lh3

---------

Co-authored-by: koadman <>
  • Loading branch information
koadman authored Jun 4, 2023
1 parent e28a55b commit ace990c
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 6 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
```sh
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
```
The difference between `map-pb` and `map-ont` is that `map-pb` uses
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
Nanopore reads.
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
accounts for the low overall error rate in the reads, with transversion errors
being less frequent than transitions.

#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads

Expand Down
16 changes: 14 additions & 2 deletions align.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,17 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
mat[(m - 1) * m + j] = sc_ambi;
}

static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
{
assert(m==5);
ksw_gen_simple_mat(m,mat,a,b,sc_ambi);
transition = transition > 0? -transition : transition;
mat[0*m+2]=transition; // A->G
mat[1*m+3]=transition; // C->T
mat[2*m+0]=transition; // G->A
mat[3*m+1]=transition; // T->C
}

static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
{
uint32_t i;
Expand Down Expand Up @@ -323,6 +334,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
fputc('\n', stderr);
}
if (opt->b != opt->transition) flag |= KSW_EZ_GENERIC_SC;
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
ksw_reset_extz(ez);
ez->zdropped = 1;
Expand Down Expand Up @@ -586,7 +598,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int

r2->cnt = 0;
if (r->cnt == 0) return;
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
bw = (int)(opt->bw * 1.5 + 1.);
bw_long = (int)(opt->bw_long * 1.5 + 1.);
if (bw_long < bw) bw_long = bw;
Expand Down Expand Up @@ -844,7 +856,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;

ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
tseq = (uint8_t*)kmalloc(km, tl);
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];
Expand Down
5 changes: 3 additions & 2 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const

int main(int argc, char *argv[])
{
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
ketopt_t o = KETOPT_INIT;
mm_mapopt_t opt;
mm_idxopt_t ipt;
Expand Down Expand Up @@ -178,6 +178,7 @@ int main(int argc, char *argv[])
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
else if (c == 'A') opt.a = atoi(o.arg);
else if (c == 'B') opt.b = atoi(o.arg);
else if (c == 'b') opt.transition = atoi(o.arg);
else if (c == 's') opt.min_dp_max = atoi(o.arg);
else if (c == 'C') opt.noncan = atoi(o.arg);
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
Expand Down Expand Up @@ -367,7 +368,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " --version show version number\n");
fprintf(fp_help, " Preset:\n");
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
fprintf(fp_help, " - map-pb/map-ont/map-iclr-prerender/map-iclr - PacBio/Nanopore/ICLR vs reference mapping\n");
fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");
Expand Down
1 change: 1 addition & 0 deletions minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ typedef struct {
float alt_drop;

int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int transition; // transition mismatch score (A:G, C:T)
int sc_ambi; // score when one or both bases are "N"
int noncan; // cost of non-canonical splicing sites
int junc_bonus;
Expand Down
9 changes: 9 additions & 0 deletions options.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
opt->alt_drop = 0.15f;

opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
opt->transition = opt->b;
opt->sc_ambi = 1;
opt->zdrop = 400, opt->zdrop_inv = 200;
opt->end_bonus = -1;
Expand Down Expand Up @@ -112,6 +113,14 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
mo->occ_dist = 500;
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
mo->min_dp_max = 200;
} else if (strcmp(preset, "map-iclr-prerender") == 0) {
io->flag = 0, io->k = 15;
mo->b = 6, mo->transition = 1;
mo->q = 10, mo->q2 = 50;
} else if (strcmp(preset, "map-iclr") == 0) {
io->flag = 0, io->k = 19;
mo->b = 6, mo->transition = 4;
mo->q = 10, mo->q2 = 50;
} else if (strncmp(preset, "asm", 3) == 0) {
io->flag = 0, io->k = 19, io->w = 19;
mo->bw = 1000, mo->bw_long = 100000;
Expand Down

0 comments on commit ace990c

Please sign in to comment.