diff --git a/README.md b/README.md index 4d8ff6b..2cd8304 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,28 @@ [![Build status](https://travis-ci.org/Hexagon/thinker-fts.svg)](https://travis-ci.org/Hexagon/thinker-fts) [![npm version](https://badge.fury.io/js/thinker-fts.svg)](https://badge.fury.io/js/thinker-fts) -Fast and extendible Node.js/Javascript full text search engine. +Fast and extendible pure JavaScript full text search engine. ## Features * Highly optimized, will give a ranked resultset within 20 ms on a 5000 (average wikipedia sized) document dataset. * In-memory operation * Few external dependencies - * Natural language search + * Natural language search * Partial matching * Expression correction / suggestions * Weighted ranker (configurable weights for each field, all-expression-match-factor, partial vs exact factor etc.) + * Search modifiers (+ require, - exclude, "searchword" precise match - bypasses word processors) * Field preprocessors * HTML-Stripper * Word preprocessors - * Swedish stemmer with stemmer stop words - * Stop words - * Wordforms - * Stripper for multiple characters + * [Stemmers](https://en.wikipedia.org/wiki/Stemming) + * Swedish + * English + * [Stop words](https://en.wikipedia.org/wiki/Stop_words) + * Word forms + * [Soundex](https://en.wikipedia.org/wiki/Soundex) + * Stripper for repeated characters * Allows saving/loading the index to/from disk, but for small datasets you can feed the index on-the-fly. @@ -300,13 +304,19 @@ An optional feature of the stemmers is to supply a list of words that you don't Currently there is two stemmers available, swedish through a custom version of the Snowball algorithm, and english through the Porter algorithm. -Example setting up thinker with standard ranker and english stemming +Example setting up thinker with standard ranker, english stemming and some stemmer stopwords. 
```javascript var thinker = Thinker(), ranker = Thinker.rankers.standard(), - stemmer = Thinker.processors.stemmers.english(); + stemmer = Thinker.processors.stemmers.english({ + "stemmer": true, + "stemming": true, + "dontstemthiseither": true, + "leonardo": true, + "anders": true + }); thinker.addWordProcessor(stemmer); @@ -322,9 +332,8 @@ var thinker = Thinker(), ranker = Thinker.rankers.standard(), stemmer = Thinker.processors.stemmers.swedish({ - "stemmer": true, - "stemming": true, - "dontstemthiseither": true, + "berta": true, + "jonas": true, "leonardo": true, "anders", true }); @@ -334,6 +343,23 @@ thinker.addWordProcessor(stemmer); thinker.ranker = ranker; ``` +#### Soundex + +Soundex preprocesses words in such a way that words that sound alike match each other. + +Example setting up thinker with Soundex processing. + +```javascript +var + thinker = Thinker(), + ranker = Thinker.rankers.standard(), + soundex = Thinker.processors.soundex(); + +thinker.addWordProcessor(soundex); + +thinker.ranker = ranker; +``` + ## Dependencies @@ -343,6 +369,8 @@ Note: Dependencies is installed automatically by npm [stemmer](https://github.com/wooorm/stemmer) (https://github.com/wooorm/stemmer) + [node-soundex](https://github.com/LouisT/node-soundex) (https://github.com/LouisT/node-soundex) + ## Development dependencies diff --git a/lib/Thinker.js b/lib/Thinker.js index 4e59b42..727d582 100644 --- a/lib/Thinker.js +++ b/lib/Thinker.js @@ -22,13 +22,6 @@ THE SOFTWARE. 
*/ -/* ToDo: - - * Vikta titel efter hur mycket plats sökorden tar i titeln - * Missingspacesnurra - -*/ - var Index = require('./index.js'), processors = require('./processors.js'), rankers = require('./rankers.js'); @@ -118,9 +111,6 @@ function Thinker (opts) { return new Thinker(opts); } - - // Can be set afterwards - self.enableSuggestions = false; self.ranker = function() {}; // All these options must be set before indexing and @@ -134,7 +124,8 @@ function Thinker (opts) { maxWordLen: 32, wordProcessors: [], fieldProcessors: [], - suggestionMinWordCount: 6 + suggestionMinWordCount: 6, + enableSuggestions: false }, opts ); // Index backend @@ -170,9 +161,9 @@ Thinker.prototype.feed = function (texts, opts) { for (i = opts.minWildcardWordLen; i < word.original.length && i < opts.maxWildcardWordLen; i++) { for (j = 0; j < (word.original.length - i) + 1; j++) { // Do not input partial if equals processed - //if( word.original.substr(j,i) !== word.processed ) { + if( word.original.substr(j,i) !== word.processed ) { self.index.populatePartial(word.original.substr(j, i), wIndex); - //} + } } } } @@ -256,16 +247,16 @@ Thinker.prototype.find = function (string) { continue; } - // + // queryResult = self.index.query(word, exact); - // + // Enable suggestions if self.options.enableSuggestions is true suggestion = undefined; - if (!queryResult.direct.length && self.enableSuggestions) { + if (!queryResult.direct.length && self.options.enableSuggestions) { suggestion = self.index.findClosestWord(word.original); } - // + // Push this expression to result array resultSet.expressions.push({ interpretation: exact ? 
word.original : word.processed, original: word.original, @@ -281,6 +272,11 @@ Thinker.prototype.find = function (string) { time('rankTime') resultSet.documents = self.ranker(resultSet,self.index.getWordCount()); + // Remove expression[m].hits from resultset, not needed anymore + for (i = 0; i < resultSet.expressions.length; i++) { + delete resultSet.expressions[i].hits; + } + // Add timers to resultset resultSet.findTime = time('findTime'); resultSet.rankTime = time('rankTime'); diff --git a/lib/index.js b/lib/index.js index f0f3b28..b325b6f 100644 --- a/lib/index.js +++ b/lib/index.js @@ -173,7 +173,7 @@ function index(opts) { } else { direct = queryProcessed( location.processed ); } - partial = queryPartial( location.original ) || queryPartial( location.partial ); + partial = queryPartial( location.original ) || queryPartial( location.processed ); // Add object return { diff --git a/lib/processors.js b/lib/processors.js index 648d69f..63595b4 100644 --- a/lib/processors.js +++ b/lib/processors.js @@ -24,7 +24,8 @@ THE SOFTWARE. 'use strict'; -var porterStemmer = require('stemmer'); +var porterStemmer = require('stemmer'), + Soundex = require('soundex'); function stopwords ( stopwords ) { var stopwords = stopwords || {}; @@ -262,17 +263,27 @@ function swedishStemmer ( stopwords ) { }*/ -function englishStemmer ( ) { +function englishStemmer ( stopwords ) { + var stopwords = stopwords || {}; return function ( w ) { + // Dont process stopwords + if ( stopwords[w] === true ) return w; return porterStemmer( w ); }; }; +function soundex ( ) { + return function ( w ) { + return Soundex( w ); + }; +}; + module.exports = { stemmers: { swedish: swedishStemmer, english: englishStemmer }, + soundex: soundex, stopwords: stopwords, wordforms: wordforms, multiples: multiples, diff --git a/lib/rankers.js b/lib/rankers.js index fe15109..2f6d680 100644 --- a/lib/rankers.js +++ b/lib/rankers.js @@ -26,6 +26,7 @@ THE SOFTWARE. 
/* Default ranker */ function standard (options) { + // Defaults var defaultFieldOptions = { weight: 1, @@ -68,6 +69,7 @@ function standard (options) { j = 0; while ((word = resultSet.expressions[j++])) { + matches = [ { flag: 1, @@ -100,15 +102,16 @@ function standard (options) { // current field or fall back on the default settings. fieldOptions = options.fields[fieldIndex] || defaultFieldOptions; - // + // Multiply match weight with field-specific weight weight = match.weight * fieldOptions.weight; - // Not sure what this is + // For field with boostPercentage flag enabled - add extra weight the more of the field that is matched. + // 1 + (noOfMatchedWords / totalWordsInField) if (fieldOptions.boostPercentage) { weight *= (1 + (matchCount / wordCount[documentId][fieldIndex - 1])); } - // Something explanatory + // Add this fields calculated weight to the document total getDocument(documentId).weight += weight; getDocument(documentId).expressions[j - 1] = match.flag; @@ -123,8 +126,6 @@ function standard (options) { // Convert document results from object to array (to be sortable) documentResultsFinal = Object.keys(documentResults).map(function (key) { return documentResults[key]; }); - // Remove unwanted documents - // Sort documents by total weight documentResultsFinal.sort(function(a, b) { return b.weight - a.weight @@ -164,6 +165,7 @@ function standard (options) { if (!toss) { temp.push(documentResultsFinal[i]); } + } resultSet = temp; diff --git a/package.json b/package.json index 9671bc9..c64c303 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "thinker-fts", - "version": "1.0.7", - "description": "Javascript/Node.js in-memory full text search engine.", + "version": "1.0.8", + "description": "Pure Javascript/Node.js in-memory full text search engine.", "author": "Hexagon <github.com/hexagon>", "contributors": [{ "name": "Pehr Boman", @@ -22,11 +22,17 @@ "thinker", "fts", "fulltext", - "in-memory" + "in-memory", + "levenshtein", + 
"soundex", + "porter", + "stemmer", + "full text search" ], "dependencies": { "fast-levenshtein": "*", - "stemmer": "*" + "stemmer": "*", + "soundex": "*" }, "devDependencies": { "mocha": "*", diff --git a/test/test.js b/test/test.js index 1e122bd..aab9a3d 100644 --- a/test/test.js +++ b/test/test.js @@ -29,7 +29,7 @@ var should = require('should'), /* START OF EXAMPLE DATA */ var exampleTexts = [ - [0,"Artikel nummer noll","Det här är ettan i det hela, Anders är ett namn. Jonas likaså antikvitets. Bemötandet. effektivitet Kalle olle lars"], + [0,"Artikel nummer noll","Det här är ettan i det hela, Anders är ett namn. Jonas likaså antikvitets. Bemötandet. effektivitet Kalle olle lars considerable"], [1,"Bemötande testtitel med extra ord","Brödtext nummer ett. Ander antikviteten olle lars sven"], [2,"Titeln med extra Testning","Brödtext i sanden artikeln två. Bemött namn Andersson antikvitet nyhet, nyheter, nyheten, nyhetens, nya olle"], ]; @@ -405,10 +405,9 @@ describe('Advanced ranker', function () { }); describe('Suggestion', function () { - var thinker = Thinker({suggestionMinWordCount: 1}); + var thinker = Thinker({suggestionMinWordCount: 1, enableSuggestions: true}); var ranker = Thinker.rankers.standard(); - thinker.enableSuggestions = true; thinker.ranker = ranker; // We need to make a copy of exampletexts, as feed consumes the object @@ -805,7 +804,9 @@ describe('Word processor: English stemmer', function () { var thinker = Thinker(); var ranker = Thinker.rankers.standard(); - var stemmer = Thinker.processors.stemmers.english(); + var stemmer = Thinker.processors.stemmers.english({ + "considerable": true + }); var exampleTextsCopy = JSON.parse(JSON.stringify(exampleTexts)); @@ -817,8 +818,40 @@ describe('Word processor: English stemmer', function () { describe('Search for "considerable"', function () { var result = thinker.find("considerable"); - it('Should be interpreted as "consider"', function () { - 
result.expressions[0].interpretation.should.equal("consider"); + it('Should be interpreted as "considerable"', function () { + result.expressions[0].interpretation.should.equal("considerable"); + }); + + it('Should give one result"', function () { + result.documents.length.should.equal(1); + }); + + }); + + describe('Search for "considering"', function () { + var result = thinker.find("considering"); + + it('Should be interpreted as "consid"', function () { + result.expressions[0].interpretation.should.equal("consid"); + }); + + it('Should give one PARTIAL result"', function () { + result.documents.length.should.equal(1); + result.documents[0].expressions[0].should.equal(1); + }); + + }); + + describe('Search for "consider"', function () { + var result = thinker.find("consider"); + + it('Should be interpreted as "consid"', function () { + result.expressions[0].interpretation.should.equal("consid"); + }); + + it('Should give one PARTIAL result"', function () { + result.documents.length.should.equal(1); + result.documents[0].expressions[0].should.equal(1); }); }); @@ -870,6 +903,47 @@ describe('Word processor: English stemmer', function () { }); +describe('Word processor: English soundex', function () { + + var thinker = Thinker(); + var ranker = Thinker.rankers.standard(); + var soundex = Thinker.processors.soundex(); + + + thinker.addWordProcessor(soundex); + thinker.ranker = ranker; + + thinker.feed([ + [0,"This is a tile","This is a textual"], + [1,"This is a tilly","This is a sexual"], + + ]); + + describe('Search for "tile"', function () { + var result = thinker.find("tile"); + + it('Should be interpreted as "T400"', function () { + result.expressions[0].interpretation.should.equal("T400"); + }); + + it('Should give two results', function () { + result.documents.length.should.equal(2); + }); + + }); + + describe('Search for "textual"', function () { + var result = thinker.find("textual"); + + it('Should give one results', function () { + 
result.documents.length.should.equal(1); + }); + + }); + +}); + + describe('Field processor: HTML-Stripper', function () { var thinker = Thinker(); var ranker = Thinker.rankers.standard();