-
Notifications
You must be signed in to change notification settings - Fork 0
/
bdbwt2brindex.hpp
186 lines (141 loc) · 4.17 KB
/
bdbwt2brindex.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
/* Wrapper using part of br-index API
* with the bdbwt as needed for MEM finding
*/
#include "../br-index/src/definitions.hpp"
#include "../bdbwt/include/BD_BWT_index.hh"
namespace bri {
// sample maintained during the search
struct bd_sample {
/*
* state variables for left_extension & right_extension
* range: SA range of P
* rangeR: correspondents to range in reversed text
*/
range_t range, rangeR;
bd_sample(): range(), rangeR() {}
bd_sample(range_t range_,
range_t rangeR_)
:
range(range_),
rangeR(rangeR_) {}
void set_values(range_t range_,
range_t rangeR_)
{
range = range_;
rangeR = rangeR_;
}
// the pattern does not exist
bool is_invalid()
{
return (range.first > range.second) || (rangeR.first > rangeR.second);
}
// range size
ulint size()
{
return range.second + 1 - range.first;
}
};
class bdbwt_index {
public:
bdbwt_index() {}
bdbwt_index(std::string const& input)
{
bdbwt = BD_BWT_index<>((uint8_t*)input.c_str());
saidx = new ulint[bdbwt.size()];
ulint j = LF(0);
for (ulint i=0; i<bdbwt.size()-2; i++) {
saidx[j] = bdbwt.size()-i-2;
j = LF(j);
}
}
/*
* get full BWT range
*/
range_t full_range()
{
return {0,bdbwt.size()-1};
}
/*
* get a sample corresponding to an empty string
*/
bd_sample get_initial_sample(bool reverse = false)
{
return bd_sample(full_range(), // entire SA range
full_range()); // entire SAR range
}
ulint LF(ulint j)
{
return bdbwt.backward_step(j);
}
/*
* search the pattern cP (P:the current pattern)
* returns SA&SAR range corresponding to cP
*/
bd_sample left_extension(uchar c, bd_sample const& prev_sample)
{
Interval_pair pair = bdbwt.left_extend(Interval_pair(prev_sample.range.first,prev_sample.range.second,prev_sample.rangeR.first,prev_sample.rangeR.second),c);
bd_sample sample;
sample.range.first=pair.forward.left;
sample.range.second=pair.forward.right;
sample.rangeR.first=pair.reverse.left;
sample.rangeR.second=pair.reverse.right;
return sample;
}
/*
* search the pattern Pc (P:the current pattern)
* return SAR&SA range corresponding to Pc
*/
bd_sample right_extension(uchar c, bd_sample const& prev_sample)
{
Interval_pair pair = bdbwt.right_extend(Interval_pair(prev_sample.range.first,prev_sample.range.second,prev_sample.rangeR.first,prev_sample.rangeR.second),c);
bd_sample sample;
sample.range.first=pair.forward.left;
sample.range.second=pair.forward.right;
sample.rangeR.first=pair.reverse.left;
sample.rangeR.second=pair.reverse.right;
return sample;
}
/*
* get BWT[i] or BWT^R[i]
*/
uchar bwt_at(ulint i, bool reversed = false)
{
if (!reversed) bdbwt.forward_bwt_at(i);
return bdbwt.backward_bwt_at(i);
}
/*
* save index to "{path_prefix}.bdbwt" file
*/
void save_to_file(std::string const& path_prefix)
{
std::string path = path_prefix + ".bdbwt";
bdbwt.save_to_disk(path,"");
}
/*
* load index file from path
*/
void load_from_file(std::string const& path)
{
bdbwt.load_from_disk(path,"");
}
void load(std::istream& in) {
// not used, defined for compatibility
}
ulint text_size() { return bdbwt.size() - 1; }
ulint bwt_size(bool reversed=false) { return bdbwt.size(); }
uchar get_terminator() {
return bdbwt.END;
}
std::vector<ulint> locate_sample(bd_sample const& sample)
{
ulint n_occ = sample.range.second + 1 - sample.range.first;
std::vector<ulint> res(n_occ);
for (ulint i=sample.range.first; i <= sample.range.second; i++)
res[i-sample.range.first] = saidx[i];
return res;
}
private:
BD_BWT_index<> bdbwt;
ulint* saidx;
};
};