Skip to content

Commit

Permalink
Soundex, bugfixes, documentation, enableSuggestions moved to opts
Browse files Browse the repository at this point in the history
  • Loading branch information
Hexagon committed Nov 24, 2015
1 parent a41c7f5 commit a013868
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 46 deletions.
50 changes: 39 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,28 @@

[![Build status](https://travis-ci.org/Hexagon/thinker-fts.svg)](https://travis-ci.org/Hexagon/thinker-fts) [![npm version](https://badge.fury.io/js/thinker-fts.svg)](https://badge.fury.io/js/thinker-fts)

Fast and extendible Node.js/Javascript full text search engine.
Fast and extendible pure JavaScript full text search engine.

## Features

* Highly optimized, will give a ranked resultset within 20 ms on a 5000 (average wikipedia sized) document dataset.
* In-memory operation
* Few external dependencies
* Natural language search
* Natural language search
* Partial matching
* Expression correction / suggestions
* Weighted ranker (configurable weights for each field, all-expression-match-factor, partial vs exact factor etc.)
* Search modifiers (+ require, - exclude, "searchword" precise match - bypasses word processors)
* Field preprocessors
* HTML-Stripper
* Word preprocessors
* Swedish stemmer with stemmer stop words
* Stop words
* Wordforms
* Stripper for multiple characters
* [Stemmers](https://en.wikipedia.org/wiki/Stemming)
* Swedish
* English
* [Stop words](https://en.wikipedia.org/wiki/Stop_words)
* Word forms
* [Soundex](https://en.wikipedia.org/wiki/Soundex)
* Stripper for repeated characters
* Allows saving/loading the index to/from disk, but for small datasets you can feed the index on-the-fly.


Expand Down Expand Up @@ -300,13 +304,19 @@ An optional feature of the stemmers is to supply a list of words that you don't

Currently there are two stemmers available: Swedish, through a custom version of the Snowball algorithm, and English, through the Porter algorithm.

Example setting up thinker with standard ranker and english stemming
Example setting up thinker with standard ranker, english stemming and some stemmer stopwords.

```javascript
var
thinker = Thinker(),
ranker = Thinker.rankers.standard(),
stemmer = Thinker.processors.stemmers.english();
stemmer = Thinker.processors.stemmers.english({
"stemmer": true,
"stemming": true,
"dontstemthiseither": true,
"leonardo": true,
"anders": true
});

thinker.addWordProcessor(stemmer);

Expand All @@ -322,9 +332,8 @@ var
thinker = Thinker(),
ranker = Thinker.rankers.standard(),
stemmer = Thinker.processors.stemmers.swedish({
"stemmer": true,
"stemming": true,
"dontstemthiseither": true,
"berta": true,
"jonas": true,
"leonardo": true,
"anders": true
});
Expand All @@ -334,6 +343,23 @@ thinker.addWordProcessor(stemmer);
thinker.ranker = ranker;
```

#### Soundex

Soundex preprocesses words in such a way that words that sound alike match each other.

Example setting up thinker with Soundex processing.

```javascript
var
thinker = Thinker(),
ranker = Thinker.rankers.standard(),
soundex = Thinker.processors.soundex();

thinker.addWordProcessor(soundex);

thinker.ranker = ranker;
```


## Dependencies

Expand All @@ -343,6 +369,8 @@ Note: Dependencies is installed automatically by npm

[stemmer](https://github.com/wooorm/stemmer) (https://github.com/wooorm/stemmer)

[node-soundex](https://github.com/LouisT/node-soundex) (https://github.com/LouisT/node-soundex)


## Development dependencies

Expand Down
30 changes: 13 additions & 17 deletions lib/Thinker.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,6 @@ THE SOFTWARE.
*/

/* ToDo:
* Vikta titel efter hur mycket plats sökorden tar i titeln
* Missingspacesnurra
*/

var Index = require('./index.js'),
processors = require('./processors.js'),
rankers = require('./rankers.js');
Expand Down Expand Up @@ -118,9 +111,6 @@ function Thinker (opts) {
return new Thinker(opts);
}


// Can be set afterwards
self.enableSuggestions = false;
self.ranker = function() {};

// All these options must be set before indexing and
Expand All @@ -134,7 +124,8 @@ function Thinker (opts) {
maxWordLen: 32,
wordProcessors: [],
fieldProcessors: [],
suggestionMinWordCount: 6
suggestionMinWordCount: 6,
enableSuggestions: false
}, opts );

// Index backend
Expand Down Expand Up @@ -170,9 +161,9 @@ Thinker.prototype.feed = function (texts, opts) {
for (i = opts.minWildcardWordLen; i < word.original.length && i < opts.maxWildcardWordLen; i++) {
for (j = 0; j < (word.original.length - i) + 1; j++) {
// Do not input partial if equals processed
//if( word.original.substr(j,i) !== word.processed ) {
if( word.original.substr(j,i) !== word.processed ) {
self.index.populatePartial(word.original.substr(j, i), wIndex);
//}
}
}
}
}
Expand Down Expand Up @@ -256,16 +247,16 @@ Thinker.prototype.find = function (string) {
continue;
}

//
//
queryResult = self.index.query(word, exact);

//
// Enable suggestions if self.options.enableSuggestions is true
suggestion = undefined;
if (!queryResult.direct.length && self.enableSuggestions) {
if (!queryResult.direct.length && self.options.enableSuggestions) {
suggestion = self.index.findClosestWord(word.original);
}

//
// Push this expression to result array
resultSet.expressions.push({
interpretation: exact ? word.original : word.processed,
original: word.original,
Expand All @@ -281,6 +272,11 @@ Thinker.prototype.find = function (string) {
time('rankTime')
resultSet.documents = self.ranker(resultSet,self.index.getWordCount());

// Remove expression[m].hits from resultset, not needed anymore
for (i = 0; i < resultSet.expressions.length; i++) {
delete resultSet.expressions[i].hits;
}

// Add timers to resultset
resultSet.findTime = time('findTime');
resultSet.rankTime = time('rankTime');
Expand Down
2 changes: 1 addition & 1 deletion lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ function index(opts) {
} else {
direct = queryProcessed( location.processed );
}
partial = queryPartial( location.original ) || queryPartial( location.partial );
partial = queryPartial( location.original ) || queryPartial( location.processed );

// Add object
return {
Expand Down
15 changes: 13 additions & 2 deletions lib/processors.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ THE SOFTWARE.

'use strict';

var porterStemmer = require('stemmer');
var porterStemmer = require('stemmer'),
Soundex = require('soundex');

function stopwords ( stopwords ) {
var stopwords = stopwords || {};
Expand Down Expand Up @@ -262,17 +263,27 @@ function swedishStemmer ( stopwords ) {
}*/

function englishStemmer ( ) {
/**
 * Create an English word processor backed by the Porter stemmer.
 *
 * @param {Object} [stopwords] - Map of words that must not be stemmed.
 *   Only entries whose value is exactly `true` are protected.
 * @returns {Function} Processor taking a word and returning either the word
 *   itself (when protected) or its stemmed form.
 */
function englishStemmer ( stopwords ) {
    // Fall back to an empty map so lookups are safe when nothing is supplied
    var protectedWords = stopwords || {};

    return function ( w ) {
        // Protected words pass through untouched, everything else is stemmed
        return protectedWords[w] === true ? w : porterStemmer( w );
    };
};

/**
 * Create a word processor that replaces each word with the result of
 * passing it to the imported Soundex function.
 *
 * @returns {Function} Processor taking a single word and returning the
 *   value produced by `Soundex` for that word.
 */
function soundex ( ) {
    // Wrap the call so that only the word itself is handed to Soundex,
    // regardless of how many arguments the processor is invoked with.
    var encode = function ( w ) {
        return Soundex( w );
    };

    return encode;
};

module.exports = {
stemmers: {
swedish: swedishStemmer,
english: englishStemmer
},
soundex: soundex,
stopwords: stopwords,
wordforms: wordforms,
multiples: multiples,
Expand Down
12 changes: 7 additions & 5 deletions lib/rankers.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ THE SOFTWARE.

/* Default ranker */
function standard (options) {

// Defaults
var defaultFieldOptions = {
weight: 1,
Expand Down Expand Up @@ -68,6 +69,7 @@ function standard (options) {

j = 0;
while ((word = resultSet.expressions[j++])) {

matches = [
{
flag: 1,
Expand Down Expand Up @@ -100,15 +102,16 @@ function standard (options) {
// current field or fall back on the default settings.
fieldOptions = options.fields[fieldIndex] || defaultFieldOptions;

//
// Multiply match weight with field-specific weight
weight = match.weight * fieldOptions.weight;

// Not sure what this is
// For field with boostPercentage flag enabled - add extra weight the more of the field that is matched.
// 1 + (noOfMatchedWords / totalWordsInField)
if (fieldOptions.boostPercentage) {
weight *= (1 + (matchCount / wordCount[documentId][fieldIndex - 1]));
}

// Something explanatory
// Add this fields calculated weight to the document total
getDocument(documentId).weight += weight;
getDocument(documentId).expressions[j - 1] = match.flag;

Expand All @@ -123,8 +126,6 @@ function standard (options) {
// Convert document results from object to array (to be sortable)
documentResultsFinal = Object.keys(documentResults).map(function (key) { return documentResults[key]; });

// Remove unwanted documents

// Sort documents by total weight
documentResultsFinal.sort(function(a, b) {
return b.weight - a.weight
Expand Down Expand Up @@ -164,6 +165,7 @@ function standard (options) {
if (!toss) {
temp.push(documentResultsFinal[i]);
}

}

resultSet = temp;
Expand Down
14 changes: 10 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "thinker-fts",
"version": "1.0.7",
"description": "Javascript/Node.js in-memory full text search engine.",
"version": "1.0.8",
"description": "Pure Javascript/Node.js in-memory full text search engine.",
"author": "Hexagon <github.com/hexagon>",
"contributors": [{
"name": "Pehr Boman",
Expand All @@ -22,11 +22,17 @@
"thinker",
"fts",
"fulltext",
"in-memory"
"in-memory",
"levenshtein",
"soundex",
"porter",
"stemmer",
"full text search"
],
"dependencies": {
"fast-levenshtein": "*",
"stemmer": "*"
"stemmer": "*",
"soundex": "*"
},
"devDependencies": {
"mocha": "*",
Expand Down
Loading

0 comments on commit a013868

Please sign in to comment.