Soundex, bugfixes, documentation, enableSuggestions moved to opts

Hexagon · Nov 24, 2015 · a013868 · a013868
1 parent a41c7f5
commit a013868
Show file tree

Hide file tree

Showing 7 changed files with 163 additions and 46 deletions.
diff --git a/README.md b/README.md
@@ -2,24 +2,28 @@
 
 [![Build status](https://travis-ci.org/Hexagon/thinker-fts.svg)](https://travis-ci.org/Hexagon/thinker-fts) [![npm version](https://badge.fury.io/js/thinker-fts.svg)](https://badge.fury.io/js/thinker-fts)
 
-Fast and extendible Node.js/Javascript full text search engine.
+Fast and extendible pure JavaScript full text search engine.
 
 ## Features
 
   * Highly optimized, will give a ranked resultset within 20 ms on a 5000 (average wikipedia sized) document dataset.
   * In-memory operation
   * Few external dependencies
-  * Natural language search
+  * Natural language search
   * Partial matching
   * Expression correction / suggestions
   * Weighted ranker (configurable weights for each field, all-expression-match-factor, partial vs exact factor etc.)
+  * Search modifiers (+ require, - exclude, "searchword" exact match - bypasses word processors)
   * Field preprocessors
 	 * HTML-Stripper
   * Word preprocessors
-	 * Swedish stemmer with stemmer stop words
-	 * Stop words
-	 * Wordforms
-	 * Stripper for multiple characters
+	 * [Stemmers](https://en.wikipedia.org/wiki/Stemming)
+	    * Swedish
+	    * English
+	 * [Stop words](https://en.wikipedia.org/wiki/Stop_words)
+	 * Word forms
+	 * [Soundex](https://en.wikipedia.org/wiki/Soundex)
+	 * Stripper for repeated characters	
   * Allows saving/loading the index to/from disk, but for small datasets you can feed the index on-the-fly.
 
 
@@ -300,13 +304,19 @@ An optional feature of the stemmers is to supply a list of words that you don't
 
 Currently there are two stemmers available: Swedish, through a custom version of the Snowball algorithm, and English, through the Porter algorithm.
 
-Example setting up thinker with standard ranker and english stemming
+Example setting up thinker with standard ranker, english stemming and some stemmer stopwords.
 
 ```javascript
 var
 	thinker 	= Thinker(),
 	ranker 		= Thinker.rankers.standard(),
-	stemmer 	= Thinker.processors.stemmers.english();
+	stemmer 	= Thinker.processors.stemmers.english({
+		"stemmer": true,
+		"stemming": true,
+		"dontstemthiseither": true,
+		"leonardo": true,
+		"anders": true
+	});
 
 thinker.addWordProcessor(stemmer);
 
@@ -322,9 +332,8 @@ var
 	thinker 	= Thinker(),
 	ranker 		= Thinker.rankers.standard(),
 	stemmer 	= Thinker.processors.stemmers.swedish({
-		"stemmer": true,
-		"stemming": true,
-		"dontstemthiseither": true,
+		"berta": true,
+		"jonas": true,
 		"leonardo": true,
 		"anders": true
 	});
@@ -334,6 +343,23 @@ thinker.addWordProcessor(stemmer);
 thinker.ranker = ranker;
 ```
 
+#### Soundex
+
+Soundex preprocesses words in such a way that words that sound alike match each other.
+
+Example setting up thinker with Soundex processing.
+
+```javascript
+var
+	thinker 	= Thinker(),
+	ranker 		= Thinker.rankers.standard(),
+	soundex 	= Thinker.processors.soundex();
+
+thinker.addWordProcessor(soundex);
+
+thinker.ranker = ranker;
+```
+
 
 ## Dependencies
 
@@ -343,6 +369,8 @@ Note: Dependencies is installed automatically by npm
 
   [stemmer](https://github.com/wooorm/stemmer) (https://github.com/wooorm/stemmer)
 
+  [node-soundex](https://github.com/LouisT/node-soundex) (https://github.com/LouisT/node-soundex)
+
 
 ## Development dependencies
 

diff --git a/lib/Thinker.js b/lib/Thinker.js
@@ -22,13 +22,6 @@ THE SOFTWARE.
 
 */
 
-/* ToDo:
-
-	* Vikta titel efter hur mycket plats sökorden tar i titeln
-	* Missingspacesnurra
-
-*/
-
 var	Index = require('./index.js'),
 	processors = require('./processors.js'),
 	rankers = require('./rankers.js');
@@ -118,9 +111,6 @@ function Thinker (opts) {
 		return new Thinker(opts);
 	}
 
-
-	// Can be set afterwards
-	self.enableSuggestions = false;
 	self.ranker = function() {};
 
 	// All these options must be set before indexing and
@@ -134,7 +124,8 @@ function Thinker (opts) {
 		maxWordLen: 32,
 		wordProcessors: [],
 		fieldProcessors: [],
-		suggestionMinWordCount: 6
+		suggestionMinWordCount: 6,
+		enableSuggestions: false
 	}, opts );
 
 	// Index backend
@@ -170,9 +161,9 @@ Thinker.prototype.feed = function (texts, opts) {
 		for (i = opts.minWildcardWordLen; i < word.original.length && i < opts.maxWildcardWordLen; i++) {
 			for (j = 0; j < (word.original.length - i) + 1; j++) {
 				// Do not input partial if equals processed
-				//if( word.original.substr(j,i) !== word.processed ) {
+				if( word.original.substr(j,i) !== word.processed ) {
 					self.index.populatePartial(word.original.substr(j, i), wIndex);
-				//}
+				}
 			}
 		}
 	}
@@ -256,16 +247,16 @@ Thinker.prototype.find = function (string) {
 			continue;
 		}
 
-		// 
+		//
 		queryResult = self.index.query(word, exact);
 
-		// 
+		// Enable suggestions if self.options.enableSuggestions is true 
 		suggestion = undefined;
-		if (!queryResult.direct.length && self.enableSuggestions) {
+		if (!queryResult.direct.length && self.options.enableSuggestions) {
 			suggestion = self.index.findClosestWord(word.original);
 		}
 
-		// 
+		// Push this expression to result array
 		resultSet.expressions.push({
 			interpretation: exact ? word.original : word.processed,
 			original: word.original,
@@ -281,6 +272,11 @@ Thinker.prototype.find = function (string) {
 	time('rankTime')
 	resultSet.documents = self.ranker(resultSet,self.index.getWordCount());
 
+	// Remove expression[m].hits from resultset, not needed anymore
+	for (i = 0; i < resultSet.expressions.length; i++) {
+		delete resultSet.expressions[i].hits;
+	}
+
 	// Add timers to resultset
 	resultSet.findTime = time('findTime');
 	resultSet.rankTime = time('rankTime');

diff --git a/lib/index.js b/lib/index.js
@@ -173,7 +173,7 @@ function index(opts) {
 				} else {
 					direct = queryProcessed( location.processed );	
 				}
-				partial = queryPartial( location.original ) || queryPartial( location.partial );
+				partial = queryPartial( location.original ) || queryPartial( location.processed );
 
 				// Add object
 				return {

diff --git a/lib/processors.js b/lib/processors.js
@@ -24,7 +24,8 @@ THE SOFTWARE.
 
 'use strict';
 
-var porterStemmer = require('stemmer');
+var porterStemmer = require('stemmer'),
+	Soundex = require('soundex');
 
 function stopwords ( stopwords ) {
 	var stopwords = stopwords || {};
@@ -262,17 +263,27 @@ function swedishStemmer ( stopwords ) {
 
 }*/
 
-function englishStemmer ( ) {
+function englishStemmer ( stopwords ) {
+	var stopwords = stopwords || {};
 	return function ( w ) {
+		// Dont process stopwords
+		if ( stopwords[w] === true ) return w;
 		return porterStemmer( w );
 	};
 };
 
+function soundex ( ) {
+	return function ( w ) {
+		return Soundex( w );
+	};
+};
+
 module.exports = {
 	stemmers: {
 		swedish: swedishStemmer,
 		english: englishStemmer
 	},
+	soundex: soundex,
 	stopwords: stopwords,
 	wordforms: wordforms,
 	multiples: multiples,

diff --git a/lib/rankers.js b/lib/rankers.js
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 /* Default ranker */
 function standard (options) {
+
 	// Defaults
 	var	defaultFieldOptions = {
 			weight: 1,
@@ -68,6 +69,7 @@ function standard (options) {
 
 		j = 0;
 		while ((word = resultSet.expressions[j++]))  {
+
 			matches = [
 				{
 					flag: 1,
@@ -100,15 +102,16 @@ function standard (options) {
 				// current field or fall back on the default settings. 
 				fieldOptions = options.fields[fieldIndex] || defaultFieldOptions;
 
-				// 
+				// Multiply match weight with field-specific weight
 				weight = match.weight * fieldOptions.weight;
 
-				// Not sure what this is
+				// For field with boostPercentage flag enabled - add extra weight the more of the field that is matched.
+				// 1 + (noOfMatchedWords / totalWordsInField)
 				if (fieldOptions.boostPercentage) {
 					weight *= (1 + (matchCount / wordCount[documentId][fieldIndex - 1]));
 				}
 
-				// Something explanatory
+				// Add this fields calculated weight to the document total
 				getDocument(documentId).weight += weight;
 				getDocument(documentId).expressions[j - 1] = match.flag;
 
@@ -123,8 +126,6 @@ function standard (options) {
 		// Convert document results from object to array (to be sortable)
 		documentResultsFinal = Object.keys(documentResults).map(function (key) { return documentResults[key]; });
 
-		// Remove unwanted documents
-
 		// Sort documents by total weight
 		documentResultsFinal.sort(function(a, b) {
 			return b.weight - a.weight
@@ -164,6 +165,7 @@ function standard (options) {
 			if (!toss) {
 				temp.push(documentResultsFinal[i]);
 			}
+
 		}
 
 		resultSet = temp;

diff --git a/package.json b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "thinker-fts",
-  "version": "1.0.7",
-  "description": "Javascript/Node.js in-memory full text search engine.",
+  "version": "1.0.8",
+  "description": "Pure Javascript/Node.js in-memory full text search engine.",
   "author": "Hexagon <github.com/hexagon>",
   "contributors": [{
     "name": "Pehr Boman",
@@ -22,11 +22,17 @@
     "thinker",
     "fts",
     "fulltext",
-    "in-memory"
+    "in-memory",
+    "levenshtein",
+    "soundex",
+    "porter",
+    "stemmer",
+    "full text search"
   ],
   "dependencies": {
     "fast-levenshtein": "*",
-    "stemmer": "*"
+    "stemmer": "*",
+    "soundex": "*"
   },
   "devDependencies": {
     "mocha": "*",