diff --git a/README.md b/README.md index 46c313b..3dc1728 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,8 @@ var thinker = Thinker({ minWildcardWordLen: 4, maxWildcardWordLen: 32, minWordLen: 2, - maxWordLen: 32 + maxWordLen: 32, + suggestionMinWordCount: 6 }); // Options available on run time @@ -115,6 +116,10 @@ The shortest word to index, default is 2 which adds 'ex' to the index, but not ' Same as above, but max. +#### opts.suggestionMinWordCount + +Set how many times a word has to exist in the index to be used for suggestions. Defaults to 6. + #### thinker.enableSuggestions If this is enabled, thinker will use unprocessed words from the inputted texts to give suggestions when expressions doesn't give an direct match. diff --git a/lib/Thinker.js b/lib/Thinker.js index f325e3a..bcf349c 100644 --- a/lib/Thinker.js +++ b/lib/Thinker.js @@ -115,11 +115,9 @@ function Thinker (opts) { // Optional `new` keyword if (!(self instanceof Thinker)) { - return new Thinker; + return new Thinker(opts); } - // Index backend - self.index = new Index(); // Can be set afterwards self.enableSuggestions = false; @@ -135,8 +133,13 @@ function Thinker (opts) { minWordLen: 2, maxWordLen: 32, wordProcessors: [], - fieldProcessors: [] - }, opts && opts.options); + fieldProcessors: [], + suggestionMinWordCount: 6 + }, opts ); + + // Index backend + self.index = new Index(self.options); + }; Thinker.prototype.feed = function (texts, opts) { @@ -176,25 +179,29 @@ Thinker.prototype.feed = function (texts, opts) { // split text into separate words, removing empty results // Loop through all textfields (index > 0) + + for (j = 1 ; j < currentDocument.length; j++) { // Extract current field currentField = currentDocument[j]; + if (currentField) { - // Apply all fieldProcessors - for (i = 0; i < opts.fieldProcessors.length; i++) 
{ + if (currentField) { + currentField = opts.fieldProcessors[i](currentField); + } } - } - // Split field into separate words - currentField = currentField.match(opts.characters); + // Split field into separate words + currentField = currentField.match(opts.characters); - // Extract unique words - for (k = 0; k < currentField.length; k++) { - if (currentWord !== '' && (currentWord = processWord(currentField[k], opts))) { - addWord(currentWord, currentDocument[0], j); + // Extract unique words + for (k = 0; k < currentField.length; k++) { + if (currentWord !== '' && (currentWord = processWord(currentField[k], opts))) { + addWord(currentWord, currentDocument[0], j); + } } } } @@ -228,6 +235,7 @@ Thinker.prototype.find = function (string, exact) { exact = !!exact; for (i = 0; i < words.length; i++) { + // Normalize and validate word if (!(word = words[i]) || !(word = processWord(words[i], self.options))) { continue; @@ -237,6 +245,7 @@ Thinker.prototype.find = function (string, exact) { queryResult = self.index.query(word, exact); // + suggestion = undefined; if (!queryResult.direct && self.enableSuggestions) { suggestion = self.index.findClosestWord(word.original); } diff --git a/lib/index.js b/lib/index.js index 8e1f8e8..1f8667f 100644 --- a/lib/index.js +++ b/lib/index.js @@ -26,12 +26,13 @@ THE SOFTWARE. 
var levenshtein = require('fast-levenshtein'); -function index() { +function index(opts) { - var data = [], + var options = opts, + data = [], lookupPartial = {}, lookupFull = {}, - lookupOriginal = {}, + lookupSuggestion = {}, i, found, @@ -87,7 +88,7 @@ function index() { match,found; indexProcessed = lookupFull[location.processed]; - indexOriginal = lookupOriginal[location.original]; + indexOriginal = lookupSuggestion[location.original]; // Index processed if(indexProcessed === undefined) { @@ -109,9 +110,11 @@ function index() { } } - // Index original + // Index original words for expression suggestions if(indexOriginal === undefined) { - lookupOriginal[location.original] = true; + lookupSuggestion[location.original] = 1; + } else { + lookupSuggestion[location.original]++; } return indexProcessed; @@ -125,33 +128,38 @@ function index() { return { direct: result, partial: resultPartial }; }, getData: function ( ) { - return [data,lookupPartial,lookupFull,lookupOriginal]; + return [data,lookupPartial,lookupFull,lookupSuggestion]; }, setData: function ( d ) { data = d[0]; lookupPartial = d[1]; lookupFull = d[2]; - lookupOriginal = d[3]; + lookupSuggestion = d[3]; }, findClosestWord: function ( w ) { var i, closestValue = Infinity, closestIndex, distance; - // Convert to array on first run - if ( Object.prototype.toString.call( lookupOriginal ) !== '[object Array]' ) { - lookupOriginal = Object.keys(lookupOriginal).map(function (key) { return key; }); + // Convert to array and filter on first run + if ( Object.prototype.toString.call( lookupSuggestion ) !== '[object Array]' ) { + var result = []; + Object.keys(lookupSuggestion).forEach(function (key) { + if (lookupSuggestion[key] >= options.suggestionMinWordCount) { + result.push(key); + } + }); + lookupSuggestion = result; } - for (i = 0; i < lookupOriginal.length; i++) { - distance = levenshtein.get(w, lookupOriginal[i]); + for (i = 0; i < lookupSuggestion.length; i++) { + distance = levenshtein.get(w, 
lookupSuggestion[i]); if (distance < closestValue) { closestIndex = i; closestValue = distance; } } - - if (closestIndex !== undefined) { - return lookupOriginal[closestIndex]; + if (closestIndex !== undefined && closestValue < 5) { + return lookupSuggestion[closestIndex]; } } }; diff --git a/package.json b/package.json index 5f7c235..abe0f87 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "thinker-fts", - "version": "1.0.4", + "version": "1.0.5", "description": "Javascript/Node.js in-memory full text search engine.", "author": "Hexagon ", "contributors": [{ diff --git a/test/test.js b/test/test.js index 2ffd533..948edd9 100644 --- a/test/test.js +++ b/test/test.js @@ -80,6 +80,30 @@ describe('Simple usage', function () { }); }); +describe('Simple usage', function () { + var thinker = Thinker({characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g}); + + thinker.ranker = Thinker.rankers.standard(); + + // We need to make a copy of exampleTexts, as feed consumes the object + var exampleTextsCopy = JSON.parse(JSON.stringify(exampleTexts)); + thinker.feed(exampleTextsCopy); + + describe('opts.characters', function () { + var result = thinker.find("ånglok"); + + // The second expression is ignored as default minWordLength is 2 + it('Should return one expression', function () { + result.results.expressions.length.should.equal(1); + }); + + it('Expression interpretation should equal "ånglok"', function () { + result.results.expressions[0].interpretation.should.equal("ånglok"); + }); + + }); +}); + describe('Partial match', function () { var thinker = Thinker(); @@ -286,7 +310,7 @@ describe('Advanced ranker', function () { }); describe('Suggestion', function () { - var thinker = Thinker(); + var thinker = Thinker({suggestionMinWordCount: 1}); var ranker = Thinker.rankers.standard(); thinker.enableSuggestions = true;