-
-
Notifications
You must be signed in to change notification settings - Fork 8
/
naivebayes.js
293 lines (241 loc) · 8.87 KB
/
naivebayes.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
/**
 * Names of the instance properties that together form a classifier's
 * serializable state. `toJsonObject` copies exactly these properties and
 * `fromJson` requires every one of them when restoring a classifier.
 */
const STATE_KEYS = [
  'categories',
  'docCount',
  'totalDocuments',
  'vocabulary',
  'wordCount',
  'wordFrequencyCount',
  'options'
]
/**
 * Default tokenizer: turns an input string into an array of word tokens.
 * Used when the caller does not provide a `tokenizer` in `options`.
 *
 * - Characters that are not on the allow-list (letters, digits, `_`,
 *   CJK ideographs U+4E00-U+9FA5, whitespace) are replaced by a space.
 * - Every CJK ideograph becomes a token of its own, even when it directly
 *   touches other word characters (`abc你def` -> `abc`, `你`, `def`).
 * - The remaining words are split on runs of whitespace.
 * - Empty strings are never returned, so leading/trailing whitespace or an
 *   empty input do not produce a bogus `''` token.
 *
 * @param {String} text
 * @return {Array} word tokens (possibly empty)
 */
const defaultTokenizer = text => {
  // Allow-list of characters that may appear inside a token.
  // NOTE(review): inside a character class `(`, `)` and `+` are literals, so
  // those three characters are kept as part of tokens as well - confirm
  // whether that is intended before tightening the pattern.
  const rgxPunctuation = /[^(a-zA-ZA-Яa-я\u4e00-\u9fa50-9_)+\s]/g
  return text.replace(rgxPunctuation, ' ')
    // pad each ideograph on BOTH sides so it is isolated from its neighbours
    .replace(/[\u4e00-\u9fa5]/g, word => ` ${word} `)
    .split(/\s+/)
    // splitting text that starts or ends with whitespace yields '' entries
    .filter(token => token !== '')
}
/**
 * Own-property test that stays correct for keys that collide with
 * `Object.prototype` members such as `constructor` or `toString`.
 */
const hasOwn = (object, key) => Object.prototype.hasOwnProperty.call(object, key)

/**
 * Creates (or overwrites) `key` as an own, enumerable data property of `object`.
 * Plain assignment is not enough: on an ordinary object `object['__proto__'] = 0`
 * goes through the inherited setter and stores nothing.
 */
const setOwn = (object, key, value) => {
  Object.defineProperty(object, key, {
    value,
    writable: true,
    enumerable: true,
    configurable: true
  })
}

/**
 * Naive-Bayes Classifier
 *
 * A multinomial naive-bayes text classifier that uses Laplace (add-one)
 * smoothing. Scores are accumulated as sums of logarithms.
 */
class NaiveBayes {
  /**
   * @param {Object} [options]
   * @param {Function} [options.tokenizer] function turning a text into an
   *   array of tokens; `defaultTokenizer` is used when it is absent or falsy
   * @throws {TypeError} when `options` is given but is not a plain (non-array) object
   */
  constructor(options) {
    // set options object
    this.options = {}
    if (typeof options !== 'undefined') {
      if (!options || typeof options !== 'object' || Array.isArray(options)) {
        throw new TypeError('NaiveBayes got invalid `options`: `' + options + '`. Pass in an object.')
      }
      this.options = options
    }
    // tokenizer used by `learn` and `probabilities`
    this.tokenizer = this.options.tokenizer || defaultTokenizer
    // every distinct token seen so far, across all categories
    this.vocabulary = []
    // number of documents we have learned from
    this.totalDocuments = 0
    // for each category, how many documents were mapped to it
    this.docCount = {}
    // for each category, how many tokens in total were mapped to it
    this.wordCount = {}
    // for each category, a token -> occurrence count table
    this.wordFrequencyCount = {}
    // list of known category names, in first-seen order
    this.categories = []
  }

  /**
   * Initialize the data structure entries for a category that has not been
   * seen before. Does nothing for an already known category.
   *
   * @param {String} categoryName
   * @return {NaiveBayes} this, for chaining
   */
  initializeCategory(categoryName) {
    if (!this.categories.includes(categoryName)) {
      // setOwn keeps names like `__proto__` usable as ordinary keys
      setOwn(this.docCount, categoryName, 0)
      setOwn(this.wordCount, categoryName, 0)
      setOwn(this.wordFrequencyCount, categoryName, {})
      this.categories.push(categoryName)
    }
    return this
  }

  /**
   * Train the classifier by telling it which `category` the `text`
   * corresponds to.
   *
   * @param {String} text
   * @param {String} category
   * @return {NaiveBayes} this, for chaining
   */
  learn(text, category) {
    // initialize category data structures if we've never seen this category
    this.initializeCategory(category)
    // update our count of how many documents mapped to this category
    // (used later for the prior P(C))
    this.docCount[category]++
    // update the total number of documents we have learned from
    this.totalDocuments++
    // normalize the text into a token array
    const tokens = this.tokenizer(text)
    // get a frequency count for each distinct token in the text
    const frequencyTable = this.frequencyTable(tokens)
    const categoryFrequencies = this.wordFrequencyCount[category]

    // update our vocabulary and our token frequency counts for this category
    Object.keys(frequencyTable).forEach(token => {
      // add this token to our vocabulary if not already existing
      if (!this.vocabulary.includes(token)) {
        this.vocabulary.push(token)
      }
      const frequencyInText = frequencyTable[token]
      // Only own entries count as "already learned": an inherited member such
      // as `constructor` must not be mistaken for an existing counter.
      if (hasOwn(categoryFrequencies, token)) {
        categoryFrequencies[token] += frequencyInText
      } else {
        setOwn(categoryFrequencies, token, frequencyInText)
      }
      // update the count of all tokens we have seen mapped to this category
      this.wordCount[category] += frequencyInText
    })
    return this
  }

  /**
   * Determine which category `text` most likely belongs to.
   *
   * @param {String} text
   * @param {Boolean} [probability] when truthy, return the whole
   *   `{ category, probability }` entry instead of just the category name
   * @return {String|Object|undefined} best category (or its entry);
   *   `undefined` when no category has been learned yet
   */
  categorize(text, probability) {
    const best = this.probabilities(text)[0]
    // an untrained classifier has no best entry to read `.category` from
    if (probability || best === undefined) {
      return best
    }
    return best.category
  }

  /**
   * Score `text` against every known category.
   *
   * @param {String} text
   * @return {Array} one `{ category, probability }` entry per category, sorted
   *   from highest to lowest score. `probability` is the log of
   *   P(C) * P(W1|C) * ... * P(Wn|C), not a normalized probability.
   */
  probabilities(text) {
    // [W1, W2, W3, ..., Wn]
    const tokens = this.tokenizer(text)
    const frequencyTable = this.frequencyTable(tokens)
    // iterate through our categories to calculate the score for this text
    return this.categories.map(category => {
      // overall probability of this category: out of all documents we have
      // ever looked at, how many were mapped to this category
      const categoryProbability = this.docCount[category] / this.totalDocuments
      // take the log to avoid underflow
      let logProbability = Math.log(categoryProbability)
      // now add log P( w | c ) for each token `w` in the text, weighted by
      // how often the token occurs in the text
      Object.keys(frequencyTable).forEach(token => {
        const frequencyInText = frequencyTable[token]
        const tokenProbability = this.tokenProbability(token, category)
        logProbability += frequencyInText * Math.log(tokenProbability)
      })
      return {
        category: category,
        probability: logProbability
      }
    }).sort((prev, next) => next.probability - prev.probability)
  }

  /**
   * Calculate the Laplace-smoothed probability P(token | category).
   *
   * @param {String} token
   * @param {String} category
   * @return {Number} probability
   */
  tokenProbability(token, category) {
    const categoryFrequencies = this.wordFrequencyCount[category]
    // how often this token was seen in this category; inherited members
    // (e.g. `toString`) are not learned counts and must read as 0
    const wordFrequencyCount = (hasOwn(categoryFrequencies, token) && categoryFrequencies[token]) || 0
    // total number of tokens seen in this category
    const wordCount = this.wordCount[category]
    // add-one smoothing so that an unseen token never yields probability 0
    return (wordFrequencyCount + 1) / (wordCount + this.vocabulary.length)
  }

  /**
   * Build a frequency hashmap where
   * - the keys are the entries in `tokens`
   * - the values are the frequency of each entry in `tokens`
   *
   * @param {Array} tokens Normalized word array
   * @return {Object} prototype-less object mapping token -> count
   */
  frequencyTable(tokens) {
    const frequencyTable = Object.create(null)
    tokens.forEach(token => {
      if (!frequencyTable[token]) {
        frequencyTable[token] = 1
      } else {
        frequencyTable[token]++
      }
    })
    return frequencyTable
  }

  /**
   * Dump the classifier's state as a JSON string.
   *
   * @param {Boolean} [prettyPrint] indent the output with 2 spaces for easier human consumption
   * @return {String} Representation of the classifier.
   */
  toJson(prettyPrint) {
    const prettyPrintSpaces = prettyPrint ? 2 : 0
    return JSON.stringify(this.toJsonObject(), null, prettyPrintSpaces)
  }

  /**
   * Collect the classifier's state into a plain object keyed by `STATE_KEYS`.
   * The values are the live instance properties, not copies.
   *
   * @return {Object} state
   */
  toJsonObject() {
    const state = {}
    STATE_KEYS.forEach(key => {
      state[key] = this[key]
    })
    return state
  }

  /**
   * Initializes a NaiveBayes instance from a state representation.
   * Accepts either the string produced by `classifier.toJson()` or the
   * object produced by `classifier.toJsonObject()`. The passed object is not
   * modified; a missing `options` entry is treated as `{}`.
   *
   * @param {String|Object} json state representation
   * @return {NaiveBayes} Classifier
   * @throws {Error} when a string is given that is not valid JSON, or when
   *   an expected state property is missing
   * @throws {TypeError} when the (parsed) state is not an object
   */
  static fromJson(json) {
    if (typeof json === 'string') {
      try {
        json = JSON.parse(json)
      } catch (err) {
        throw new Error('Naivebayes.fromJson expects a valid JSON string.', { cause: err })
      }
    }
    if (json === null || typeof json !== 'object') {
      throw new TypeError('NaiveBayes.fromJson expects a JSON string or a state object.')
    }
    // default the options locally instead of writing into the caller's object
    const options = json.options || {}
    // init a new classifier
    const classifier = new NaiveBayes(options)
    // override the classifier's state
    STATE_KEYS.forEach(key => {
      const value = key === 'options' ? options : json[key]
      if (value == null) {
        throw new Error(`NaiveBayes.fromJson: JSON string is missing an expected property: '${key}'.`)
      }
      classifier[key] = value
    })
    return classifier
  }

  /**
   * @return {Array} the names of the properties that are serialized
   */
  static getStateKeys() {
    return STATE_KEYS
  }
}
// Expose the NaiveBayes class as the sole export of this CommonJS module.
module.exports = NaiveBayes