Minor fixes, version bump

Hexagon · Nov 23, 2015 · 864dacd · 864dacd
1 parent eb824c3
commit 864dacd
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 35 deletions.
diff --git a/README.md b/README.md
@@ -78,7 +78,8 @@ var thinker = Thinker({
 	minWildcardWordLen: 4,
 	maxWildcardWordLen: 32,
 	minWordLen: 2,
-	maxWordLen: 32
+	maxWordLen: 32,
+	suggestionMinWordCount: 6
 });
 
 // Options available on run time
@@ -115,6 +116,10 @@ The shortest word to index, default is 2 which adds 'ex' to the index, but not '
 
 Same as above, but max.
 
+#### opts.suggestionMinWordCount
+
+Set how many times a word has to exist in the index before it is used for suggestions. Defaults to 6.
+
 #### thinker.enableSuggestions
 
 If this is enabled, thinker will use unprocessed words from the inputted texts to give suggestions when expressions don't give a direct match.

diff --git a/lib/Thinker.js b/lib/Thinker.js
@@ -115,11 +115,9 @@ function Thinker (opts) {
 
 	// Optional `new` keyword
 	if (!(self instanceof Thinker)) {
-		return new Thinker;
+		return new Thinker(opts);
 	}
 
-	// Index backend
-	self.index = new Index();
 
 	// Can be set afterwards
 	self.enableSuggestions = false;
@@ -135,8 +133,13 @@ function Thinker (opts) {
 		minWordLen: 2,
 		maxWordLen: 32,
 		wordProcessors: [],
-		fieldProcessors: []
-	}, opts && opts.options);
+		fieldProcessors: [],
+		suggestionMinWordCount: 6
+	}, opts );
+
+	// Index backend
+	self.index = new Index(self.options);
+
 };
 
 Thinker.prototype.feed = function (texts, opts) {
@@ -176,25 +179,29 @@ Thinker.prototype.feed = function (texts, opts) {
 
 		// split text into separate words, removing empty results
 		// Loop through all textfields (index > 0)
+
+
 		for (j = 1 ; j < currentDocument.length; j++) {
 
 			// Extract current field
 			currentField = currentDocument[j];
+			if (currentField) {
 
-			// Apply all fieldProcessors
-			for (i = 0; i < opts.fieldProcessors.length; i++) {
-				if (currentField) {
-					currentField = opts.fieldProcessors[i](currentField);
+				// Apply all fieldProcessors
+				for (i = 0; i < opts.fieldProcessors.length; i++) {
+					if (currentField) {
+						currentField = opts.fieldProcessors[i](currentField);
+					}
 				}
-			}
 
-			// Split field into separate words
-			currentField = currentField.match(opts.characters);
+				// Split field into separate words
+				currentField = currentField.match(opts.characters);
 
-			// Extract unique words
-			for (k = 0; k < currentField.length; k++) {
-				if (currentWord !== '' && (currentWord = processWord(currentField[k], opts))) {
-					addWord(currentWord, currentDocument[0], j);
+				// Extract unique words
+				for (k = 0; k < currentField.length; k++) {
+					if (currentWord !== '' && (currentWord = processWord(currentField[k], opts))) {
+						addWord(currentWord, currentDocument[0], j);
+					}
 				}
 			}
 		}
@@ -228,6 +235,7 @@ Thinker.prototype.find = function (string, exact) {
 	exact = !!exact;
 
 	for (i = 0; i < words.length; i++) {
+
 		// Normalize and validate word
 		if (!(word = words[i]) || !(word = processWord(words[i], self.options))) {
 			continue;
@@ -237,6 +245,7 @@ Thinker.prototype.find = function (string, exact) {
 		queryResult = self.index.query(word, exact);
 
 		// 
+		suggestion = undefined;
 		if (!queryResult.direct && self.enableSuggestions) {
 			suggestion = self.index.findClosestWord(word.original);
 		}

diff --git a/lib/index.js b/lib/index.js
@@ -26,12 +26,13 @@ THE SOFTWARE.
 
 var levenshtein = require('fast-levenshtein');
 
-function index() {
+function index(opts) {
 
-	var	data = [],
+	var	options = opts,
+		data = [],
 		lookupPartial = {},
 		lookupFull = {},
-		lookupOriginal = {},
+		lookupSuggestion = {},
 
 		i, found,
 
@@ -87,7 +88,7 @@ function index() {
 					match,found;
 
 				indexProcessed = lookupFull[location.processed];
-				indexOriginal = lookupOriginal[location.original];
+				indexOriginal = lookupSuggestion[location.original];
 
 				// Index processed
 				if(indexProcessed === undefined) {
@@ -109,9 +110,11 @@ function index() {
 					}
 				}
 
-				// Index original
+				// Index original words for expression suggestions
 				if(indexOriginal === undefined) {
-					lookupOriginal[location.original] = true;
+					lookupSuggestion[location.original] = 1;
+				} else {
+					lookupSuggestion[location.original]++;
 				}
 
 				return indexProcessed;
@@ -125,33 +128,38 @@ function index() {
 				return { direct: result, partial: resultPartial };
 			},
 			getData: function ( ) {
-				return [data,lookupPartial,lookupFull,lookupOriginal];
+				return [data,lookupPartial,lookupFull,lookupSuggestion];
 			},
 			setData: function ( d ) {
 				data = d[0];
 				lookupPartial = d[1];
 				lookupFull = d[2];
-				lookupOriginal = d[3];
+				lookupSuggestion = d[3];
 			},
 			findClosestWord: function ( w ) {
 				var i, closestValue = Infinity, closestIndex, distance;
 
-				// Convert to array on first run
-				if ( Object.prototype.toString.call( lookupOriginal ) !== '[object Array]' ) {
-					lookupOriginal = Object.keys(lookupOriginal).map(function (key) { return key; });
+				// Convert to array and filter on first run
+				if ( Object.prototype.toString.call( lookupSuggestion ) !== '[object Array]' ) {
+					var result = [];
+					Object.keys(lookupSuggestion).forEach(function (key) { 
+						if (lookupSuggestion[key] >= options.suggestionMinWordCount) {
+							result.push(key);
+						}
+					});
+					lookupSuggestion = result;
 				}
 
-				for (i = 0; i < lookupOriginal.length; i++) {
-					distance = levenshtein.get(w, lookupOriginal[i]);
+				for (i = 0; i < lookupSuggestion.length; i++) {
+					distance = levenshtein.get(w, lookupSuggestion[i]);
 					if (distance < closestValue) {
 						closestIndex = i;
 						closestValue = distance;
 					}
 				}
 
-
-				if (closestIndex !== undefined) {
-					return lookupOriginal[closestIndex];
+				if (closestIndex !== undefined && closestValue < 5) {
+					return lookupSuggestion[closestIndex];
 				}
 			}
 		};

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "thinker-fts",
-  "version": "1.0.4",
+  "version": "1.0.5",
   "description": "Javascript/Node.js in-memory full text search engine.",
   "author": "Hexagon <github.com/hexagon>",
   "contributors": [{

diff --git a/test/test.js b/test/test.js
@@ -80,6 +80,30 @@ describe('Simple usage', function () {
 	});
 });
 
+describe('Simple usage', function () {
+	var thinker = Thinker({characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g});
+
+	thinker.ranker = Thinker.rankers.standard();
+
+	// We need to make a copy of exampletexts, as feed consumes the object
+	var exampleTextsCopy = JSON.parse(JSON.stringify(exampleTexts));
+	thinker.feed(exampleTextsCopy);
+
+	describe('opts.characters', function () {	
+		var result = thinker.find("ånglok");
+
+		// The second expression is ignored as default minWordLen is 2
+		it('Should return one expression', function () {	
+			result.results.expressions.length.should.equal(1);
+		});
+
+		it('Expression interpretation should equal "ånglok"', function () {
+			result.results.expressions[0].interpretation.should.equal("ånglok");
+		});
+
+	});
+});
+
 describe('Partial match', function () {
 	var thinker = Thinker();
 
@@ -286,7 +310,7 @@ describe('Advanced ranker', function () {
 });
 
 describe('Suggestion', function () {
-	var thinker = Thinker();
+	var thinker = Thinker({suggestionMinWordCount: 1});
 	var ranker = Thinker.rankers.standard();
 
 	thinker.enableSuggestions = true;