From e68dd8fb304f484f3a8dcdeb8c3420c40792e8f0 Mon Sep 17 00:00:00 2001 From: Hexagon Date: Mon, 6 Feb 2017 22:42:10 +0100 Subject: [PATCH] Major update, release 1.1.0 * Reduced memory usage * API Breaking changes * Various bugfixes * Performance improvements * New features * Document metadata * Filters * Resultset sorting on metadata --- .travis.yml | 7 +- LICENSE | 2 +- README.md | 88 ++-- benchmark/soundex.js | 13 - benchmark/stemmers.english.js | 13 - benchmark/stemmers.swedish.js | 13 - index.js | 4 +- lib/Thinker.js | 256 +++++++++-- lib/index.js | 267 ++++++----- lib/processors.js | 40 +- lib/rankers.js | 248 +++++++--- lib/utils.js | 4 +- package.json | 6 +- test/test.js | 824 ++++++++++++++++++++++++++++------ 14 files changed, 1330 insertions(+), 455 deletions(-) delete mode 100644 benchmark/soundex.js delete mode 100644 benchmark/stemmers.english.js delete mode 100644 benchmark/stemmers.swedish.js diff --git a/.travis.yml b/.travis.yml index 3e167c5..3ae1c05 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,5 @@ language: node_js node_js: - - "4.1" - - "4.0" - - "0.12" - - "0.11" + - "6" + - "5" + - "4" \ No newline at end of file diff --git a/LICENSE b/LICENSE index e528d58..bae76cb 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2015 Hexagon +Copyright (c) 2015-2017 Hexagon Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 3d25427..b2da188 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,19 @@ [![Build status](https://travis-ci.org/Hexagon/thinker-fts.svg)](https://travis-ci.org/Hexagon/thinker-fts) [![npm version](https://badge.fury.io/js/thinker-fts.svg)](https://badge.fury.io/js/thinker-fts) [![Codacy 
Badge](https://api.codacy.com/project/badge/Grade/f4a95b3f01b644d9af07476e4e048c60)](https://www.codacy.com/app/robinnilsson/thinker-fts?utm_source=github.com&utm_medium=referral&utm_content=Hexagon/thinker-fts&utm_campaign=Badge_Grade) [![MIT License](https://img.shields.io/badge/license-MIT-blue.svg)](https://img.shields.io/badge/license-MIT-blue.svg) -Fast and extendible pure JavaScript full text search engine. +Fast, extendible and stand alone pure JavaScript full text search engine. ## Features - * Highly optimized, will give a ranked resultset within 20 ms on a 5000 (average wikipedia sized) document dataset. * In-memory operation + * Highly optimized, will give a ranked resultset within 10 ms on a 5000 (average wikipedia sized) document dataset. * Few external dependencies - * Natural language searchx + * Natural language search * Partial matching * Expression correction / suggestions * Weighted ranker (configurable weights for each field, all-expression-match-factor, partial vs exact factor etc.) - * Search modifiers (+ require, - exclude, "searchword" precise match - excepts wordprocessors) + * Search modifiers (+ require, - exclude, "searchword" precise match which excepts wordprocessors) + * Result filters * Field preprocessors * HTML-Stripper * Word preprocessors @@ -23,8 +24,7 @@ Fast and extendible pure JavaScript full text search engine. * [Stop words](https://en.wikipedia.org/wiki/Stop_words) * Word forms * [Soundex](https://en.wikipedia.org/wiki/Soundex) - * Stripper for repeated characters - * Allows saving/loading the index to/from disk, but for small datasets you can feed the index on-the-fly. + * Stripper for repeated characters ## Installation @@ -43,10 +43,10 @@ var Thinker = require('thinker-fts'), // Connect standard ranker thinker.ranker = Thinker.rankers.standard(); -// Feed thinker with documents of format [id, textfield, textfield, ...] 
+// Feed thinker with an array of documents formatted like { id: id, fields: [textfield, textfield] } thinker.feed([ - [1, 'Lorem', 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.'], - [2, 'Ipsum', 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.'] + { id: 1, fields: ['Lorem', 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.'] }, + { id: 2, fields: ['Ipsum', 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.'] } ]); // Search for text @@ -54,43 +54,42 @@ var result = thinker.find('ut in'); // Show result console.log(result); - -{ - expressions: [ - { - interpretation: 'ut', +{ + expressions: [ + { original: 'ut', + interpretation: [Object], suggestion: undefined, modifier: undefined, - exactMode: false + exactMode: false }, { - interpretation: 'in', original: 'in', + interpretation: [Object], suggestion: undefined, modifier: undefined, - exactMode: false + exactMode: false } ], + performance: { + find: 1.107075, + rank: 0.598558, + sort: 0.688598, + filter: 0.060182, + total: 2.639159 + }, documents: [ - { - id: 1, - weight: 12, - expressions: [1,0] // <- Array where index 0 correspods to first expression, - // 1 to second expression etc. - // Value is 2 for exact match - // 1 for partial match and 0 for no match - } + { id: 2, weight: 1.5, expressions: [Object] }, + { id: 1, weight: 1.5, expressions: [Object] } ], - findTime: 0.908248, // ms - rankTime: 0.109632 // ms + totalHits: 2, + returnedHits: 2 } ``` Please not that you _have to_ connect a ranker, else find won't provide a result set. The ranker build the result set. - ## Basic configuration Thinkers default configuration is overridden by supplying an options object to Thinkers constructor. 
@@ -99,14 +98,16 @@ Thinkers default configuration is overridden by supplying an options object to T // Options only available at initialization var thinker = Thinker({ - characters: /([a-zA-Z0-9']*)/g, + characters: /([a-zA-Z0-9]*)/g, caseSensitive: false, - minWildcardWordLen: 4, + minWildcardWordLen: 3, maxWildcardWordLen: 32, minWordLen: 2, maxWordLen: 32, suggestionMinWordCount: 6, - enableSuggestions: false + enableSuggestions: false, + optionalPlusFromExpressions: 1, + coalesceWords: 1 }); ``` @@ -150,10 +151,25 @@ If this is enabled, thinker will use unprocessed words from the inputted texts t This is what results.expressions[n] will look like when you search for 'exression' (missing p) +#### opts.optionalPlusFromExpressions + +Will be renamed, I promise. + +This is how many words there should be in the expression before all words become optional. Defaults to 1 (disabled). + +If you set this to 4, and search for a three word expression, all words will need to exist in the document to give a match. In the background ```what you want``` becomes ```+what +you +want```. +If you give a four word expression, all words become optional as usual. + +#### opts.coalesceWords + +When this property is set to greater than one, augmented words will be inserted into the index, consisting of current and next word. If this property is set to 3 and the field is "i want cookies today", a search for ```iwantcookies```, ```wantcookiestoday``` or ```wantcookies``` will give a match. ```javascript { - interpretation: 'exression', + interpretation: { + original: 'exression', + ... + }, ... suggestion: 'expression', ... @@ -198,7 +214,7 @@ Object defining a different base weight for a match in each field of a document, ```javascript var docs = [ - [1,"This is the title", "This is the ingress", "This is the text"], + { id: 1, fields: ["This is the title", "This is the ingress", "This is the text"] }, ... 
]; ``` @@ -207,9 +223,9 @@ and your fields weights look like ```javascript fields: { - 1: { weight: 4, boostPercentage: true }, - 2: { weight: 2, boostPercentage: false }, - 3: { weight: 2, boostPercentage: false } + 0: { weight: 4, boostPercentage: true }, + 1: { weight: 2, boostPercentage: false }, + 2: { weight: 2, boostPercentage: false } } ``` diff --git a/benchmark/soundex.js b/benchmark/soundex.js deleted file mode 100644 index 420f7b3..0000000 --- a/benchmark/soundex.js +++ /dev/null @@ -1,13 +0,0 @@ -var - Benchmark = require("benchmark"), - Thinker = require("../"), - suite = new Benchmark.Suite, - soundex = Thinker.processors.soundex(); - -suite.add("Soundex", function() { - var result = soundex("convolution"); -}) -.on("cycle", function(event) { - console.log(String(event.target)); -}) -.run(); diff --git a/benchmark/stemmers.english.js b/benchmark/stemmers.english.js deleted file mode 100644 index 7562c03..0000000 --- a/benchmark/stemmers.english.js +++ /dev/null @@ -1,13 +0,0 @@ -var - Benchmark = require('benchmark'), - Thinker = require('../'), - suite = new Benchmark.Suite, - englishStemmer = Thinker.processors.stemmers.english(); - -suite.add('English stemmer', function() { - var result = englishStemmer('convolution'); -}) -.on('cycle', function(event) { - console.log(String(event.target)); -}) -.run(); diff --git a/benchmark/stemmers.swedish.js b/benchmark/stemmers.swedish.js deleted file mode 100644 index c9cbd5b..0000000 --- a/benchmark/stemmers.swedish.js +++ /dev/null @@ -1,13 +0,0 @@ -var - Benchmark = require("benchmark"), - Thinker = require("../"), - suite = new Benchmark.Suite, - swedishStemmer = Thinker.processors.stemmers.swedish(); - -suite.add("Swedish stemmer", function() { - var result = swedishStemmer("friserandets"); -}) -.on("cycle", function(event) { - console.log(String(event.target)); -}) -.run(); diff --git a/index.js b/index.js index d61fee4..a1da04f 100644 --- a/index.js +++ b/index.js @@ -1,3 +1 @@ -var Thinker = 
require('./lib/Thinker.js'); - -module.exports = Thinker; \ No newline at end of file +module.exports = require('./lib/Thinker.js'); \ No newline at end of file diff --git a/lib/Thinker.js b/lib/Thinker.js index 469cb45..61fb372 100644 --- a/lib/Thinker.js +++ b/lib/Thinker.js @@ -21,6 +21,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +"use strict"; var Index = require("./index.js"), processors = require("./processors.js"), @@ -28,6 +29,7 @@ var Index = require("./index.js"), utils = require("./utils.js"); function processWord (word, opts) { + var result, i; @@ -47,7 +49,23 @@ function processWord (word, opts) { } // Prepare object - result = { original: word, processed: undefined }; + result = { original: word, preprocessed: undefined, processed: undefined }; + + // Apply all wordProcessors + for (i = 0; i < opts.wordPreProcessors.length; i++) { + if (!word) { + break; + } + + word = opts.wordPreProcessors[i](word); + } + + // Check if the preprocessor disabled this word + if (!word) { + return; + } + + result.preprocessed = word; // Apply all wordProcessors for (i = 0; i < opts.wordProcessors.length; i++) { @@ -58,7 +76,7 @@ function processWord (word, opts) { word = opts.wordProcessors[i](word); } - // Check if the preprocessor disabled this word + // Check if the wordProcessors disabled this word if (!word) { return; } @@ -67,9 +85,10 @@ function processWord (word, opts) { result.processed = word; return result; -}; +} function Thinker (opts) { + var self = this; // Optional `new` keyword @@ -78,20 +97,24 @@ function Thinker (opts) { } self.ranker = function() {}; + self.propertyRanker = rankers.property(); // All these options must be set before indexing and // cannot change afterwards (the object will also be frozen). 
self.options = utils.defaults({ characters: /([a-zA-Z0-9]*)/g, caseSensitive: false, - minWildcardWordLen: 4, + minWildcardWordLen: 3, maxWildcardWordLen: 32, minWordLen: 2, maxWordLen: 32, wordProcessors: [], + wordPreProcessors: [], fieldProcessors: [], suggestionMinWordCount: 6, - enableSuggestions: false + enableSuggestions: false, + optionalPlusFromExpressions: 1, + coalesceWords: 1 }, opts ); // Changing settings after initializing the index would break things, we will try to prevent that @@ -100,9 +123,10 @@ function Thinker (opts) { // Index backend self.index = new Index(self.options); -}; +} Thinker.prototype.feed = function (texts) { + var self = this, opts = self.options, currentDocument, @@ -110,38 +134,46 @@ Thinker.prototype.feed = function (texts) { currentWord, i,j,k; - // Helper function adding a single word to the index - function addWord (word, docid, fieldIdx) { + function addWord (word, docid, fieldIdx, augmented) { + var wIndex, i, j; - - // Add original - wIndex = self.index.populate(word, docid, fieldIdx); - // Add processed + // Add original, preprocessed and processed + wIndex = self.index.populate(word, docid, fieldIdx, augmented); + self.index.populatePreProcessed(word.preprocessed, wIndex); self.index.populateProcessed(word.processed, wIndex); - // Add partials - for (i = opts.minWildcardWordLen; i < word.original.length && i < opts.maxWildcardWordLen; i++) { - for (j = 0; j < (word.original.length - i) + 1; j++) { - // Do not input partial if equals processed or equals original - if( word.original.substr(j,i) !== word.processed && word.original.substr(j,i) !== word.original ) { - self.index.populatePartial(word.original.substr(j, i), wIndex); + if (!augmented) { + for (i = opts.minWildcardWordLen; i < word.original.length && i < opts.maxWildcardWordLen; i++) { + for (j = 0; j < (word.original.length - i) + 1; j++) { + // Do not input partial if equals processed or equals original + if( word.preprocessed.substr(j,i) !== word.processed 
&& word.preprocessed.substr(j,i) !== word.preprocessed ) { + self.index.populatePartial(word.preprocessed.substr(j, i), wIndex); + } } } } + } /* Stage 1, query index for each individual word */ - while (currentDocument = texts.pop()) { - + while ( (currentDocument = texts.pop() ) ) { + + // Add metatada for current document + if (currentDocument.metadata) { + self.index.populateMetadata(currentDocument.id, currentDocument.metadata); + } + // split text into separate words, removing empty results // Loop through all textfields (index > 0) - for (j = 1 ; j < currentDocument.length; j++) { + for (var j = 0; j < currentDocument.fields.length; j++) { // Extract current field - if ( (currentField = currentDocument[j]) ) { + if ( (currentField = currentDocument.fields[j]) ) { + + var wordHistory = []; // Apply all fieldProcessors for (i = 0; i < opts.fieldProcessors.length; i++) { @@ -155,48 +187,142 @@ Thinker.prototype.feed = function (texts) { // Extract unique words for (k = 0; k < currentField.length; k++) { + + // Check that the current word is't invalidated by the word processors, and add it to the index if (currentWord !== "" && (currentWord = processWord(currentField[k], opts))) { - addWord(currentWord, currentDocument[0], j); + addWord(currentWord, currentDocument.id, j); } + + // Coalesce words (making separate words and written together words equal) + // This bypasses the valid word check, allowing single character words etc to be concatenated + if (opts.coalesceWords > 1 && currentField[k] !== "") { + + wordHistory.push(currentField[k]); + + if (wordHistory.length > 1 ) { + for(var i = 0; i < wordHistory.length - 1; i++) { + var augmentedWord = processWord(wordHistory.slice(i,wordHistory.length).join(""), opts); + addWord(augmentedWord, currentDocument.id, j, true); + } + if (wordHistory.length >= opts.coalesceWords) { + wordHistory.shift(); + } + + } + + } + } + } + } + } + }; Thinker.prototype.addFieldProcessor = function (fn) { return 
(this.options.fieldProcessors.push(fn), this); }; +Thinker.prototype.addWordPreProcessor = function (fn) { + return (this.options.wordPreProcessors.push(fn), this); +}; + Thinker.prototype.addWordProcessor = function (fn) { return (this.options.wordProcessors.push(fn), this); }; -Thinker.prototype.find = function (string) { +Thinker.prototype.find = function (params) { + + utils.time("totalFindTime"); utils.time("findTime"); + // Allow search string instead of params + // Ignore that f-ed up strings can be typeof "object" :) + if (typeof params === "string") { + params = { expression: params }; + } + + // Exapand params with refaults + params = utils.defaults({ + + // Search string + // Value: String + expression: null, + + // Search only in specifiec field + // Value: Array or nullFmeta + fields: null, + + // Direction + // Value: Boolean + // true = descending + // false = ascending + direction: true, + + // Filter function + // Filter results on + // filter: function (metadata) { + // return metadata.active; + // } + filter: null, + + // Sort by + // Value: String + // sortBy: weight <- Default, sort by ranker weight + // sortBy: anything <- Sort by metadata propert "anything" + sortBy: "weight", + + // Limit number of results + // Value: null or integer + limit: null + + }, params); + + // Handle inconsistencies + if (!params.expression) params.expression = ''; + var self = this, - // Extract valid parts of the expression - words = string.split(" "), + words, word, - // Find matching texts - resultSet = { expressions: [] }, + resultSet = { expressions: [], performance: {} }, queryResult, suggestion, - i; + i, + + expression; + + // Remove trailing spaces after + and - + expression = params.expression.replace(/([+-])+(\s)+/g, '$1'); + + // Remove dashes without space in front + expression = expression.replace(/([^\s]){1}-/, '$1'); + + // Remove leading and trailing spaces from search query + expression = expression.trim(" "); + + // Split query into searate 
words on whitespace charcter + words = expression.split(" "); for (i = 0; i < words.length; i++) { var modifier=undefined, exact=false; // Find modifiers, set flags, and remove their textual representation + // Plus modifier is automagically applied to each word(expression) if total if ( ["+","-"].indexOf(words[i][0]) !== -1) { modifier = words[i][0]; words[i] = words[i].substring(1,words[i].length); + } else { + if ( words.length < self.options.optionalPlusFromExpressions ) { + modifier = "+"; + } } // Trigger exact mode @@ -206,23 +332,23 @@ Thinker.prototype.find = function (string) { words[i] = words[i].replace(/\"/g,""); // Normalize and validate word - if (!(word = words[i]) || !(word = processWord(words[i], self.options))) { + if (!(word = processWord(words[i], self.options))) { continue; } // - queryResult = self.index.query(word, exact); + queryResult = self.index.query(word, exact, params.filter); // Enable suggestions if self.options.enableSuggestions is true suggestion = undefined; - if (!queryResult.direct.length && self.options.enableSuggestions) { + if ((!queryResult.exact.length && !queryResult.processed.length) && self.options.enableSuggestions) { suggestion = self.index.findClosestWord(word.original); } // Push this expression to result array resultSet.expressions.push({ - interpretation: exact ? 
word.original : word.processed, - original: word.original, + original: words[i], + interpretation: word, suggestion: suggestion, modifier: modifier, exactMode: exact, @@ -232,21 +358,73 @@ Thinker.prototype.find = function (string) { } // Done finding - resultSet.findTime = utils.time("findTime"); + resultSet.performance.find = utils.time("findTime"); + + // Start ranking + utils.time("rankTime"); + + // Rank by weight + if (params.sortBy === "weight") { + resultSet.documents = self.ranker(resultSet, self.index.getWordCount()); + + // Rank by metadata + } else { + resultSet.documents = self.propertyRanker({ + resultSet: resultSet, + index: self.index, + sortBy: params.sortBy + }); + + } + + // Done ranking + resultSet.performance.rank = utils.time("rankTime"); + + // Start sorting + utils.time("sortTime"); + + // Sort documents by total weight + resultSet.documents = resultSet.documents.sort(function(a, b) { + return params.direction ? (b.weight - a.weight) : (a.weight - b.weight); + }); + + // Done sorting + resultSet.performance.sort = utils.time("sortTime"); // Start ranking - utils.time("rankTime") - resultSet.documents = self.ranker(resultSet,self.index.getWordCount()); + utils.time("filterTime"); + + resultSet.totalHits = resultSet.documents.length; + + if (params.limit) { + resultSet.documents = resultSet.documents.slice(0, params.limit); + } + + resultSet.returnedHits = resultSet.documents.length; // Remove expression[m].hits from resultset, not needed anymore for (i = 0; i < resultSet.expressions.length; i++) { delete resultSet.expressions[i].hits; } - // Done fanking - resultSet.rankTime = utils.time("rankTime"); + // Restore document ids, append filters, append meta + for (i = 0; i < resultSet.documents.length; i++) { + + let docIdx = resultSet.documents[i].id; + + // Restore metadata and document id + resultSet.documents[i].metadata = self.index.getMetadata(docIdx); + resultSet.documents[i].id = self.index.docIndexToId(resultSet.documents[i].id); + + + 
} + + resultSet.performance.filter = utils.time("filterTime"); + + resultSet.performance.total = utils.time("totalFindTime"); return resultSet; + }; Thinker.processors = processors; diff --git a/lib/index.js b/lib/index.js index 308a2a3..71f1ded 100644 --- a/lib/index.js +++ b/lib/index.js @@ -28,191 +28,227 @@ var levenshtein = require('fast-levenshtein'); function index(options) { - var data = [], + var + // Array of Array with DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField + // Index of outer array is WordIndex, matched to an actual word through lookupOriginal, lookupPartial, lookupProcessed och lookupPreProcessed + // [ + // [DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField, DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField, ... ], + // [DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField, DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField, ... ], + // [DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField, DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField, ... ], + // [DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField, DocumentIndex, FieldIndex, OcurrencesOfWordInCurrentDocumentAndField, ... 
] + // } + data = [], + + // Lookup Maps of original, partial, processed and preprocessed words + // { + // "ActualWord": WordIndexIn data + // } + lookupOriginal = new Map(null), lookupPartial = new Map(null), lookupProcessed = new Map(null), - lookupOriginal = new Map(null), + lookupPreProcessed = new Map(null), + + // Map of every word and how many times it is used lookupSuggestion = new Map(null), - wordCount = new Object(null), - - found, - - concatArray = function ( arr1, arr2 ) { - var j, i, found, newIdx; - for(i = 0; i < arr2.length; i+=3) { - found = false; - for(j = 0; j < arr1.length; j+=3) { - if (arr2[i] == arr1[j] && arr2[i+1] == arr1[j+1]) { - arr1[j+2]++; - found = true; - break; + + // Map documentIndex => { + // supplied: "Document metadata" + // } + lookupMetadata = new Map(null), + + // Map documentId: documentIndex + lookupDocId = new Map(null), + + // Map documentIndex: documentId + lookupDocIdReverse = new Map(null), + + // Keep track of next available unique documentId + currentDocIndex = 0, + + // Keep track of number of words in each document and field + wordCount = Object.create(null), + + query = function ( location, lookupMap, filterFunc ) { + + // Add object + var words = lookupMap.get(location), + arr1 = [], + arr2, + i, j, + newIdx, + idx, + idxKey, + iEntry; + + var subroutine = function (word, arr1) { + + var j; + + arr2 = data[word]; + idx = new Map(); + + for(j = 0; j < arr2.length; j+=3) { + + if ( ! 
( filterFunc && !filterFunc(lookupMetadata.get(arr2[j])))) { + + idxKey = arr2[j]*1e10+arr2[j+1]; + + if ( (iEntry = idx.get(idxKey)) ) { + arr1[iEntry+2]++; + } else { + newIdx = arr1.length; + idx.set(idxKey, newIdx); + arr1[newIdx] = arr2[j]; + arr1[newIdx+1] = arr2[j+1]; + arr1[newIdx+2] = arr2[j+2]; + } + } + } - if (!found) { - newIdx = arr1.length-1; - arr1[++newIdx] = arr2[i]; - arr1[++newIdx] = arr2[i+1]; - arr1[++newIdx] = arr2[i+2]; - } + } - return arr1; - }, - queryPartial = function ( location ) { + if( words !== undefined ) { - // Add object - var words = lookupPartial.get(location), - currentResult, - result = [], - i; - - if( words !== void 0 ) { - for( i = 0; i < words.length; i++) { - currentResult = data[words[i]]; - result = concatArray (result, currentResult); + if (words.constructor === Array) { + for( i = 0; i < words.length; i++) { + subroutine(words[i], arr1); + } + } else { + subroutine(words, arr1); } - return result; + + return arr1; } else { return; } }, - queryProcessed = function ( location ) { + populate = function ( location, wordIdx, lookup ) { + + var dest = lookup.get(location); // Add object - var words = lookupProcessed.get(location), - currentResult, - currentIndex, - addedIndex = [], - result = [], - i; - - if( words !== void 0 ) { - for( i = 0; i < words.length; i++) { - currentResult = data[words[i]]; - result = concatArray (result, currentResult); - } - return result; + if(dest === undefined) { + lookup.set(location,[wordIdx]); + } else { - return; + // Only insert if not already existing + if( dest.indexOf(wordIdx) === -1) { + dest[dest.length] = wordIdx; + } + } }, - exports = { - populateProcessed: function ( location, wordIdx ) { - - var i, location, dest = lookupProcessed.get(location); + docIdToIndex = function (docId) { + let d = lookupDocId.get(docId); + if(d === undefined) { + d = currentDocIndex++; + lookupDocId.set(docId, d); + lookupDocIdReverse.set(d, docId); + } + return d; + }, - // Add object - if(dest === 
void 0) { - lookupProcessed.set(location,[wordIdx]); - } else { - // Only insert if not already existing - if( dest.indexOf(wordIdx) === -1) { - dest.push(wordIdx); - } - } + docIndexToId = function (docIndex) { + return lookupDocIdReverse.get(docIndex); + }; - }, + exports = { populatePartial: function ( location, wordIdx ) { - - var i, dest = lookupPartial.get(location); - - // Add object - if(dest === void 0) { - lookupPartial.set(location,[wordIdx]); - } else { - // Only insert if not already existing - if( dest.indexOf(wordIdx) === -1) { - dest.push(wordIdx); - } - } - + populate( location, wordIdx, lookupPartial); + }, + populateProcessed: function ( location, wordIdx ) { + populate( location, wordIdx, lookupProcessed); + }, + populatePreProcessed: function ( location, wordIdx ) { + populate( location, wordIdx, lookupPreProcessed); + }, + populateMetadata: function (docid, meta) { + lookupMetadata.set(docIdToIndex(docid), meta); + }, + getMetadata: function (docidx) { + return lookupMetadata.get(docidx); }, + docIndexToId: docIndexToId, populate: function ( location, docId, fieldIdx ) { - // Add object - var i, + var + docIdx = docIdToIndex(docId), + i, indexOriginal, - indexSuggestion, + suggestionCounter, match, found; // Index original words indexOriginal = lookupOriginal.get(location.original); - if(indexOriginal === void 0) { + if(indexOriginal === undefined) { indexOriginal = data.length; - lookupOriginal.set(location.original,indexOriginal); - data[indexOriginal] = [docId, fieldIdx, 1]; + lookupOriginal.set(location.original, indexOriginal); + data[indexOriginal] = [docIdx, fieldIdx, 1]; } else { found = false; match = data[indexOriginal]; for (i = 0; i < match.length; i+=3) { - if(match[i] === docId && match[i+1] === fieldIdx ) { + if(match[i] === docIdx && match[i+1] === fieldIdx ) { match[i+2]++; found = true; break; } } if (!found) { - match.push(docId, fieldIdx, 1); + match.push(docIdx, fieldIdx, 1); } } // Update wordcount of current document and 
field - if (wordCount[docId] === void 0 ) { - wordCount[docId] = []; + if (wordCount[docIdx] === undefined ) { + wordCount[docIdx] = []; } - wordCount[docId][fieldIdx-1] = (wordCount[docId][fieldIdx-1] || 0) + 1; + wordCount[docIdx][fieldIdx-1] = (wordCount[docIdx][fieldIdx-1] || 0) + 1; // Index original words for expression suggestions, this is filtered on // first run of 'findClosestWord' - indexSuggestion = lookupSuggestion.get(location.original); - if(indexSuggestion === void 0) { + suggestionCounter = lookupSuggestion.get(location.original); + if(suggestionCounter === undefined) { lookupSuggestion.set(location.original,1); } else { - lookupSuggestion.set(location.original,indexSuggestion++); + lookupSuggestion.set(location.original,++suggestionCounter); } return indexOriginal; }, - query: function ( location, exact ) { - var direct, - partial; + query: function ( location, exact, filterFunc ) { + + var hits = {}; if ( exact ) { - direct = ((index = lookupOriginal.get(location.original)) !== void 0) ? 
data[index] : void 0; + hits.exact = query( location.preprocessed, lookupPreProcessed, filterFunc ) || []; + hits.processed = []; + hits.partial = []; + } else { - direct = queryProcessed( location.processed ); - if ( (partial = queryPartial( location.original )) === void 0 ) partial = queryPartial( location.processed ); + hits.exact = query( location.preprocessed, lookupPreProcessed, filterFunc ) || []; + hits.processed = query( location.processed, lookupProcessed, filterFunc ) || []; + if ( (hits.partial = query( location.preprocessed, lookupPartial, filterFunc )) === undefined ) hits.partial = query( location.processed, lookupPartial, filterFunc ) || []; + } - return { - direct: direct || [], - partial: partial || [] - }; + return hits; + }, getWordCount: function ( ) { return wordCount; }, - getData: function ( ) { - return [data,lookupPartial,lookupProcessed,lookupOriginal,lookupSuggestion,wordCount]; - }, - setData: function ( d ) { - data = d[0]; - lookupPartial = d[1]; - lookupProcessed = d[2]; - lookupOriginal = d[3]; - lookupSuggestion = d[4]; - wordCount = d[5]; - - }, findClosestWord: function ( w ) { var closestValue = Infinity, closestIndex, distance; - lookupSuggestion.forEach(function(value,key) { + lookupSuggestion.forEach(function(value, key) { if(value >= options.suggestionMinWordCount) { distance = levenshtein.get(w, key); if (distance < closestValue) { @@ -223,8 +259,7 @@ function index(options) { lookupSuggestion.delete(key); } }); - - if (closestIndex !== void 0 && closestValue < 5) { + if (closestIndex !== undefined && closestValue < 5) { return closestIndex; } } diff --git a/lib/processors.js b/lib/processors.js index df4042c..80b016c 100644 --- a/lib/processors.js +++ b/lib/processors.js @@ -27,29 +27,37 @@ THE SOFTWARE. 
var porterStemmer = require("stemmer"), Soundex = require("soundex"); -function stopwords ( stopwords ) { - stopwords = stopwords || {}; +function stopwords ( stopword ) { + stopword = stopword || {}; return function ( w ) { - if ( stopwords[w] === true ) return; + if ( stopword[w] === true ) return; return w; }; -}; +} -function wordforms ( wordforms ) { - var wordforms = wordforms || {}; +function wordforms ( wordform ) { + wordform = wordform || {}; return function ( w ) { - return wordforms[w] || w; + return wordform[w] || w; }; -}; +} -function multiples ( stopwords ) { - var stopwords = stopwords || {}; +function multiples ( stopword ) { + stopword = stopword || {}; return function ( w ) { - if ( stopwords[w] === true ) return; + if ( stopword[w] === true ) return; return w.replace(/([a-zåäö])\1+/gi, "$1"); }; -}; +} + +function dashes ( stopword ) { + stopword = stopword || {}; + return function ( w ) { + if ( stopword[w] === true ) return; + return w.replace(/([^\s]){1}-/, '$1'); + }; +} function stripHtml ( ) { @@ -117,7 +125,8 @@ function swedishStemmer(stopwords) { stopwords = stopwords || {}, suffix = ["dd", "gd", "nn", "dt", "gt", "mm", "tt"], - endings = ["iteten", "anden", "andet", "orna", "aste", "aren", "arna", "ande", "erna", "arne", "itet", "ning", "het", "ast", "ade", "ern", "ing", "are", "en", "ad", "an", "ar", "ig", "er", "et", "or", "at", "na", "e", "a"]; + endings = ["igheter", "igheten", "ingarna", "iteten", "ingen", "anden", "andet", "orna", "aste", "aren", "arna", "ande", "erna", "arne", "itet", "ndet", "orn","het", "ast", "and", "ade", "ern", "ing", "are", "en", "ad", "an", "ar", "ig", "er", "et", "or", "at", "e", "a"]; + return function (w) { @@ -133,7 +142,7 @@ function swedishStemmer(stopwords) { // Remove trailing s if (r1[r1.length-1]==="s") r1 = r1.substring(0,r1.length-1); - // Return of we didnt find r1 + // Return if we didnt find r1 if (r1.length === 0) return word; // Stage 1a-1 @@ -188,5 +197,6 @@ module.exports = { 
stopwords: stopwords, wordforms: wordforms, multiples: multiples, - stripHtml: stripHtml + stripHtml: stripHtml, + dashes: dashes } \ No newline at end of file diff --git a/lib/rankers.js b/lib/rankers.js index 1c0510b..45c972b 100644 --- a/lib/rankers.js +++ b/lib/rankers.js @@ -36,10 +36,9 @@ function standard (options) { }, options = utils.defaults({ - directHit: 1, + exactHit: 1.5, + processedHit: 1, partialHit: 0.5, - eachPartialExpressionFactor: 1.5, - eachDirectExpressionFactor: 2, fields: {}, minimumWeight: 0 },options); @@ -47,7 +46,7 @@ function standard (options) { return function (resultSet, wordCount) { var documentResultsFinal = [], - documentResults = {}, + documentResultsLookup = {}, i, j, @@ -68,71 +67,73 @@ function standard (options) { while ((word = resultSet.expressions[j++])) { matches = [ - { - flag: 2, - rows: word.hits.direct, - weight: options.directHit, - length: word.hits.direct.length - }, { flag: 1, rows: word.hits.partial, weight: options.partialHit, length: word.hits.partial.length + }, + { + flag: 2, + rows: word.hits.processed, + weight: options.processedHit, + length: word.hits.processed.length + }, + { + flag: 3, + rows: word.hits.exact, + weight: options.exactHit, + length: word.hits.exact.length } ]; - // Get first match (partial) - match = matches.pop(); + // Jump to processed if it"s empty + while(match = matches.pop()) { - // Jump to partials if it"s empty - if (!match.length) { - match = matches.pop(); - } + for (i = 0; i < match.length; i) { - for (i = 0; i < match.length; i) { - documentId = match.rows[i++]; - fieldIndex = match.rows[i++]; - matchCount = match.rows[i++]; + documentId = match.rows[i++]; + fieldIndex = match.rows[i++]; + matchCount = match.rows[i++]; - // Get the specific user-specified settings for the - // current field or fall back on the default settings. 
- fieldOptions = options.fields[fieldIndex] || (options.fields[fieldIndex] = defaultFieldOptions); + // Ensure that document exists in results + if (documentResultsLookup[documentId] === void 0) { + documentResultsLookup[documentId] = documentResultsFinal.length; - // Multiply match weight with field-specific weight - weight = match.weight * fieldOptions.weight; + doc = documentResultsFinal[documentResultsLookup[documentId]] = { + id: documentId, + weight: 0, + expressions: [], + }; - // For field with boostPercentage flag enabled - add extra weight the more of the field that is matched. - // 1 + (noOfMatchedWords / totalWordsInField) - if (fieldOptions.boostPercentage) { - weight *= (1 + 2.8*(matchCount / wordCount[documentId][fieldIndex - 1])); - } + } else { + doc = documentResultsFinal[documentResultsLookup[documentId]]; + } - doc = documentResults[documentId] || (documentResults[documentId] = { - id: documentId, - weight: 0, - expressions: [] - }); + // Don't do unnessesary work + if ( !doc.expressions[j-1] ) { - doc.weight += weight; - doc.expressions[j-1] = match.flag; + // Get the specific user-specified settings for the + // current field or fall back on the default settings. + fieldOptions = options.fields[fieldIndex] || (options.fields[fieldIndex] = defaultFieldOptions); - // Jump to the next match when the current is exhausted - if (i === match.length && matches.length) { - match = matches.pop(); - i = 0; - } + // Multiply match weight with field-specific weight + weight = match.weight * fieldOptions.weight; - } - } + // For field with boostPercentage flag enabled - add extra weight the more of the field that is matched. 
+ // 1 + (noOfMatchedWords / totalWordsInField) + if (fieldOptions.boostPercentage) { + weight *= (1 + 2.8*(matchCount / wordCount[documentId][fieldIndex - 1])); + } - // Convert document results from object to array (to be sortable) - documentResultsFinal = Object.keys(documentResults).map(function (key) { return documentResults[key]; }); + doc.weight += weight; + doc.expressions[j-1] = match.flag; - // Sort documents by total weight - documentResultsFinal.sort(function(a, b) { - return b.weight - a.weight; - }); + } + + } + } + } // Postprocess resultset, multiplying total weight with a factor under certain circumstances, var temp = []; @@ -141,51 +142,158 @@ function standard (options) { var toss = false; if ( documentResultsFinal[i].weight < options.minimumWeight ) { - toss = true; - } else { - // - Multiply document weight by a factor for ( j = 0; j < resultSet.expressions.length; j++ ) { - - // 2 == Exact match - if (documentResultsFinal[i].expressions[j]==2) { - documentResultsFinal[i].weight *= options.eachDirectExpressionFactor; - // 1 == Partial match - } else if (documentResultsFinal[i].expressions[j]==1) { - documentResultsFinal[i].weight *= options.eachPartialExpressionFactor; + if (!documentResultsFinal[i].expressions[j]) documentResultsFinal[i].expressions[j] = 0; + + // Keep this row? 
+ if ( resultSet.expressions[j].modifier === "-" && documentResultsFinal[i].expressions[j] > 0 ) { + toss = true; + } else if ( resultSet.expressions[j].modifier === "+" && documentResultsFinal[i].expressions[j] === 0) { + toss = true; + } + } + } + + if (!toss) { + temp[temp.length] = documentResultsFinal[i]; + } + + } + + return temp; + }; + +} + +/* Rank by generic property */ +function property () { + + return function (options) { + + options = utils.defaults({ + resultSet: null, + index: null, + sortBy: null + },options); + + var documentResultsFinal = [], + documentResultsLookup = {}, + + i, j, + + documentId, + fieldIndex, + matchCount, + + doc, + + fieldOptions, + weight, + + matches, + word, + match; + + j = 0; + while ((word = options.resultSet.expressions[j++])) { + + matches = [ + { + flag: 1, + rows: word.hits.partial, + weight: options.partialHit, + length: word.hits.partial.length + }, + { + flag: 2, + rows: word.hits.processed, + weight: options.processedHit, + length: word.hits.processed.length + }, + { + flag: 3, + rows: word.hits.exact, + weight: options.exactHit, + length: word.hits.exact.length + } + ]; + + // Jump to processed if it"s empty + while(match = matches.pop()) { + + for (i = 0; i < match.length; i) { + + documentId = match.rows[i++]; + fieldIndex = match.rows[i++]; + matchCount = match.rows[i++]; + + // Ensure that document exists in results + if (documentResultsLookup[documentId] === void 0) { + documentResultsLookup[documentId] = documentResultsFinal.length; + + doc = documentResultsFinal[documentResultsLookup[documentId]] = { + id: documentId, + weight: 0, + expressions: [], + }; - // Else set to zero } else { - documentResultsFinal[i].expressions[j] = 0; + doc = documentResultsFinal[documentResultsLookup[documentId]]; + } + + // Don't do unnessesary work + if ( !doc.expressions[j-1] ) { + + // Multiply match weight with field-specific weight + weight = options.index.getMetadata(documentId)[options.sortBy]; + + doc.weight += 
weight; + doc.expressions[j-1] = match.flag; } + } + } + } + + // Postprocess resultset, dropping records and stuff + var temp = []; + for ( i = 0; i < documentResultsFinal.length; i++) { + + var toss = false; + + if ( documentResultsFinal[i].weight < options.minimumWeight ) { + toss = true; + } else { + for ( j = 0; j < options.resultSet.expressions.length; j++ ) { + + if (!documentResultsFinal[i].expressions[j]) documentResultsFinal[i].expressions[j] = 0; + // Keep this row? - if ( resultSet.expressions[j].modifier === "-" && documentResultsFinal[i].expressions[j] > 0 ) { + if ( options.resultSet.expressions[j].modifier === "-" && documentResultsFinal[i].expressions[j] > 0 ) { toss = true; - } else if ( resultSet.expressions[j].modifier === "+" && documentResultsFinal[i].expressions[j] === 0) { + } else if ( options.resultSet.expressions[j].modifier === "+" && documentResultsFinal[i].expressions[j] === 0) { toss = true; } } - } if (!toss) { - temp.push(documentResultsFinal[i]); + temp[temp.length] = documentResultsFinal[i]; } } - - resultSet = temp; - return resultSet; + return temp; }; } module.exports = { - standard: standard + standard: standard, + property: property } \ No newline at end of file diff --git a/lib/utils.js b/lib/utils.js index 56cea39..997a76b 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -1,3 +1,5 @@ +"use strict"; + // Helper function for measuring execution time var time = (function () { var times = {}; @@ -11,7 +13,7 @@ var time = (function () { } diff = process.hrtime(times[id]); - times[id] = null; + times[id] = undefined; return (diff[0] * 1e9 + diff[1]) / 1E6; }; diff --git a/package.json b/package.json index 5a926af..ba14109 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "thinker-fts", - "version": "1.0.11", + "version": "1.1.0", "description": "Pure Javascript/Node.js in-memory full text search engine.", "author": "Hexagon ", "contributors": [{ @@ -22,12 +22,12 @@ "thinker", "fts", "fulltext", + 
"full-text-search", "in-memory", "levenshtein", "soundex", "porter", - "stemmer", - "full text search" + "stemmer" ], "dependencies": { "fast-levenshtein": "*", diff --git a/test/test.js b/test/test.js index 656f4dc..2036348 100644 --- a/test/test.js +++ b/test/test.js @@ -22,19 +22,20 @@ THE SOFTWARE. */ -'use strict'; +"use strict"; var should = require('should'), Thinker = require('../lib/Thinker.js'); /* START OF EXAMPLE DATA */ var exampleTexts = [ - [0,"Artikel nummer noll","Det här är ettan i det hela, Anders är ett namn. Jonas likaså antikvitets. Bemötandet. effektivitet Kalle olle lars considerable"], - [1,"Bemötande testtitel med extra ord","Brödtext nummer ett. Ander antikviteten olle lars sven"], - [2,"Titeln med extra Testning","Brödtext i sanden artikeln artikeln artikeln artikeln två. Bemött namn Andersson antikvitet nyhet, nyheter, nyheten, nyhetens, nya olle"], + {id: 0, fields: [ "Artikel nummer noll","Det här är ettan i det hela, Anders är ett namn. Stavros likaså antikvitets. Bemötandet. kreativitet Kalle olle lars considerable"] }, + {id: 1, fields: [ "Bemötande testtitel med extra ord","Brödtext nummer ett. Ander antikviteten olle lars sven"] }, + {id: 2, fields: [ "Titeln med extra Testning","Brödtext i sanden artikeln artikeln artikeln artikeln två. 
Bemött namn Andersson antikvitet nyhet, nyheter, nyheten, nyhetens, nya olle"] } ]; /* END OF EXAMPLE DATA */ + describe('Simple usage', function () { var thinker = Thinker(); @@ -52,8 +53,8 @@ describe('Simple usage', function () { result.expressions.length.should.equal(1); }); - it('Expression interpretation should equal "artikel"', function () { - result.expressions[0].interpretation.should.equal("artikel"); + it('Expression processed should equal "artikel"', function () { + result.expressions[0].interpretation.processed.should.equal("artikel"); }); it('Should return two results', function () { @@ -65,7 +66,7 @@ describe('Simple usage', function () { }); it('First result should be an direct match', function () { - result.documents[0].expressions[0].should.equal(2); + result.documents[0].expressions[0].should.equal(3); }); it('Second result should have id 2', function () { @@ -78,7 +79,6 @@ describe('Simple usage', function () { }); }); - describe('Simple usage: Local characters', function () { var thinker = Thinker({characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g}); @@ -96,8 +96,8 @@ describe('Simple usage: Local characters', function () { result.expressions.length.should.equal(1); }); - it('Expression interpretation should equal "ånglok"', function () { - result.expressions[0].interpretation.should.equal("ånglok"); + it('Expression processed should equal "ånglok"', function () { + result.expressions[0].interpretation.processed.should.equal("ånglok"); }); }); @@ -120,8 +120,8 @@ describe('Simple usage: Exact mode', function () { result.expressions.length.should.equal(1); }); - it('Expression interpretation should equal "ånglok"', function () { - result.expressions[0].interpretation.should.equal("ånglok"); + it('Expression processed should equal "ånglok"', function () { + result.expressions[0].interpretation.processed.should.equal("ånglok"); }); }); @@ -152,10 +152,9 @@ describe('Simple usage: Modifiers', function () { result.expressions[2].modifier.should.equal("-"); }); - 
it('Expression interpretation two should equal "lars"', function () { - result.expressions[1].interpretation.should.equal("lars"); + it('Expression processed two should equal "lars"', function () { + result.expressions[1].interpretation.processed.should.equal("lars"); }); - it('Should return one result', function () { result.documents.length.should.equal(1); }); @@ -185,8 +184,8 @@ describe('Partial match', function () { result.expressions.length.should.equal(1); }); - it('Expression interpretation should equal "emöt"', function () { - result.expressions[0].interpretation.should.equal("emöt"); + it('Expression processed should equal "emöt"', function () { + result.expressions[0].interpretation.processed.should.equal("emöt"); }); it('Should return three results (bemötandet, bemötande, bemött)', function () { @@ -219,8 +218,8 @@ describe('Partial match with minimum word length match 5', function () { result.expressions.length.should.equal(1); }); - it('Expression interpretation should equal "emöt"', function () { - result.expressions[0].interpretation.should.equal("emöt"); + it('Expression processed should equal "emöt"', function () { + result.expressions[0].interpretation.processed.should.equal("emöt"); }); it('Should return zero results', function () { @@ -234,13 +233,12 @@ describe('Ranker', function () { characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g }), ranker = Thinker.rankers.standard({ - directHit: 1, + exactHit: 1, + processedHit: 0.75, partialHit: 0.5, - eachPartialExpressionFactor: 1.5, - eachDirectExpressionFactor: 2, fields: { - 1: { weight: 4}, - 2: { weight: 2} + 0: { weight: 4}, + 1: { weight: 2} } }); @@ -272,7 +270,7 @@ describe('Ranker', function () { describe('Result type', function () { it('First result should be direct', function () { - result.documents[0].expressions[0].should.equal(2); + result.documents[0].expressions[0].should.equal(3); }); @@ -282,12 +280,12 @@ describe('Ranker', function () { }); describe('Result weight', function () { - it('First 
result should have a weight of 4*1*2', function () { - result.documents[0].weight.should.equal(8); + it('First result should have a weight of 4*1', function () { + result.documents[0].weight.should.equal(4); }); - it('Second result should have a weight of 2*0.5*1.5', function () { - result.documents[1].weight.should.equal(1.5); + it('Second result should have a weight of 2*0.5', function () { + result.documents[1].weight.should.equal(1); }); }); }); @@ -297,13 +295,12 @@ describe('Ranker', function () { describe('Ranker: Boost percentage', function () { var thinker = Thinker({ characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g }), ranker = Thinker.rankers.standard({ - directHit: 1, + exactHit: 1, + processedHit: 0.75, partialHit: 0.5, - eachPartialExpressionFactor: 1.5, - eachDirectExpressionFactor: 2, fields: { - 1: { weight: 4, boostPercentage: true}, - 2: { weight: 2} + 0: { weight: 4, boostPercentage: true}, + 1: { weight: 2} } }); @@ -315,6 +312,7 @@ describe('Ranker: Boost percentage', function () { thinker.feed(exampleTextsCopy); describe('Basic search "artikel"', function () { + var result = thinker.find("artikel"); it('Should return two results', function () { @@ -322,12 +320,12 @@ describe('Ranker: Boost percentage', function () { }); describe('Result weight', function () { - it('First result should have a weight of 4*1*2*1.3333', function () { - result.documents[0].weight.toFixed(4).should.equal('15.4667'); + it('First result should have a weight of 4*1*1.9333', function () { + result.documents[0].weight.toFixed(4).should.equal('7.7333'); }); - it('Second result should have a weight of 2*0.5*1.5', function () { - result.documents[1].weight.should.equal(1.5); + it('Second result should have a weight of 2*0.5', function () { + result.documents[1].weight.should.equal(1); }); }); }); @@ -338,13 +336,12 @@ describe('Advanced ranker', function () { characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g }); var ranker = Thinker.rankers.standard({ - directHit: 1, - partialHit: 0.5, - 
eachPartialExpressionFactor: 1.5, - eachDirectExpressionFactor: 2, + exactHit: 1, + processedHit: 0.75, + partialHit: 0.5, fields: { - 1: {weight: 4 }, - 2: {weight: 2 } + 0: {weight: 4 }, + 1: {weight: 2 } } }); @@ -376,26 +373,26 @@ describe('Advanced ranker', function () { describe('Result type', function () { it('First result should be 3 direct matches', function () { - result.documents[0].expressions[0].should.equal(2); - result.documents[0].expressions[1].should.equal(2); - result.documents[0].expressions[2].should.equal(2); + result.documents[0].expressions[0].should.equal(3); + result.documents[0].expressions[1].should.equal(3); + result.documents[0].expressions[2].should.equal(3); }); it('Second result should be 2 partial and one direct', function () { result.documents[1].expressions[0].should.equal(1); result.documents[1].expressions[1].should.equal(1); - result.documents[1].expressions[2].should.equal(2); + result.documents[1].expressions[2].should.equal(3); }); }); describe('Result weight', function () { - it('First result should have a weight of (((4*1)+(2*1)+(2*1)))*2*2*2', function () { - result.documents[0].weight.should.equal(64); + it('First result should have a weight of (((4*1)+(2*1)+(2*1)))', function () { + result.documents[0].weight.should.equal((((4*1)+(2*1)+(2*1)))); }); - it('Second result should have a weight of (((2*0.5)+(2*0.5)+(2*1)))*2*1.5*1.5', function () { - result.documents[1].weight.should.equal(18); + it('Second result should have a weight of (((2*0.5)+(2*0.5)+(2*1)))', function () { + result.documents[1].weight.should.equal((((2*0.5)+(2*0.5)+(2*1)))); }); }); }); @@ -492,17 +489,16 @@ describe('Word-processor: Multiples', function () { describe('Search "k000aaaallle"', function () { var result = thinker.find("k000aaaallle"); - it('Expression interpretation should equal "k000ale"', function () { - result.expressions[0].interpretation.should.equal("k000ale"); + it('Expression processed should equal "k000ale"', function () { + 
result.expressions[0].interpretation.processed.should.equal("k000ale"); }); }); }); - describe('Word processor: Swedish stemmer', function () { var stemmerStopwords = { "anders": true, - "jonas": true + "stavros": true }; var thinker = Thinker({characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g}); @@ -522,8 +518,8 @@ describe('Word processor: Swedish stemmer', function () { result.expressions.length.should.equal(1); }); - it('Expression interpretation be unchanged("anders")', function () { - result.expressions[0].interpretation.should.equal("anders"); + it('Expression processed be unchanged("anders")', function () { + result.expressions[0].interpretation.processed.should.equal("anders"); }); it('Should return two results', function () { @@ -531,7 +527,7 @@ describe('Word processor: Swedish stemmer', function () { }); it('First result should be a direct match (anders)', function () { - result.documents[0].expressions[0].should.equal(2); + result.documents[0].expressions[0].should.equal(3); }); it('Second result should be a partial match (andersson)', function () { @@ -546,8 +542,8 @@ describe('Word processor: Swedish stemmer', function () { result.expressions.length.should.equal(1); }); - it('Expression interpretation be unchanged("bemötandet")', function () { - result.expressions[0].interpretation.should.equal("bemötandet"); + it('Expression processed be unchanged("bemötandet")', function () { + result.expressions[0].interpretation.preprocessed.should.equal("bemötandet"); }); it('Expression should be in exact mode', function () { @@ -559,7 +555,7 @@ describe('Word processor: Swedish stemmer', function () { }); it('First result should be a direct match (anders)', function () { - result.documents[0].expressions[0].should.equal(2); + result.documents[0].expressions[0].should.equal(3); }); it('First result should have document id 0', function () { @@ -578,22 +574,112 @@ describe('Word processor: Swedish stemmer', function () { result.expressions.length.should.equal(1); }); - 
it('Expression interpretation should equal "bemöt"', function () { - result.expressions[0].interpretation.should.equal("bemöt"); + it('Expression processed should equal "bemöt"', function () { + result.expressions[0].interpretation.processed.should.equal("bemöt"); }); it('Should return three results (bemötandet, bemötande, bemött)', function () { result.documents.length.should.equal(3); }); - it('All results should be a direct match', function () { + it('All results should be a processed match', function () { result.documents[0].expressions[0].should.equal(2); result.documents[1].expressions[0].should.equal(2); result.documents[2].expressions[0].should.equal(2); }); }); - + + describe('Search for "lyssningarna"', function () { + + var result = thinker.find("lyssningarna"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + + describe('Search for "lyssna"', function () { + + var result = thinker.find("lyssning"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + + describe('Search for "lyssning"', function () { + + var result = thinker.find("lyssning"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + + describe('Search for "lyssnarens"', function () { + + var result = thinker.find("lyssnarens"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + + describe('Search for "lyssningens"', function () { + + var result = thinker.find("lyssningens"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + + describe('Search for "lyssningen"', function () { + + var 
result = thinker.find("lyssningen"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + + describe('Search for "lyssnandet"', function () { + + var result = thinker.find("lyssnandet"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + + describe('Search for "lyssnare"', function () { + + var result = thinker.find("lyssnare"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + + describe('Search for "lyssna"', function () { + + var result = thinker.find("lyssna"); + + it('Expression processed should equal "lyssn"', function () { + result.expressions[0].interpretation.processed.should.equal("lyssn"); + }); + + }); + describe('Search for "nyheternas"', function () { var result = thinker.find("nyheternas"); @@ -602,20 +688,21 @@ describe('Word processor: Swedish stemmer', function () { result.expressions.length.should.equal(1); }); - it('Expression interpretation should equal "ny"', function () { - result.expressions[0].interpretation.should.equal("ny"); + it('Expression processed should equal "ny"', function () { + result.expressions[0].interpretation.processed.should.equal("ny"); }); it('Should return 1 document', function () { result.documents.length.should.equal(1); }); - it('All four (nyhet, nyheter, nyheten, nyhetens)results should be a direct match on the first result', function () { + it('All four (nyhet, nyheter, nyheten, nyhetens)results should be a processed match on the first result', function () { result.documents[0].expressions[0].should.equal(2); }); }); + describe('Search for "nya"', function () { var result = thinker.find("nya"); @@ -624,8 +711,8 @@ describe('Word processor: Swedish stemmer', function () { result.expressions.length.should.equal(1); 
}); - it('Expression interpretation should equal "ny"', function () { - result.expressions[0].interpretation.should.equal("ny"); + it('Expression processed should equal "ny"', function () { + result.expressions[0].interpretation.processed.should.equal("ny"); }); it('Should return one document', function () { @@ -633,7 +720,7 @@ describe('Word processor: Swedish stemmer', function () { }); it('The result should be a direct match on the first result', function () { - result.documents[0].expressions[0].should.equal(2); + result.documents[0].expressions[0].should.equal(3); }); }); @@ -642,8 +729,8 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("radioar"); - it('Expression interpretation should equal "radio"', function () { - result.expressions[0].interpretation.should.equal("radio"); + it('Expression processed should equal "radio"', function () { + result.expressions[0].interpretation.processed.should.equal("radio"); }); }); @@ -652,8 +739,8 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("sprit"); - it('Expression interpretation should equal "sprit"', function () { - result.expressions[0].interpretation.should.equal("sprit"); + it('Expression processed should equal "sprit"', function () { + result.expressions[0].interpretation.processed.should.equal("sprit"); }); }); @@ -662,8 +749,8 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("produktutveckling"); - it('Expression interpretation should equal "produktutveckl"', function () { - result.expressions[0].interpretation.should.equal("produktutveckl"); + it('Expression processed should equal "produktutveckl"', function () { + result.expressions[0].interpretation.processed.should.equal("produktutveckl"); }); }); @@ -672,8 +759,8 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("produktutvecklare"); - it('Expression interpretation should equal "produktutveckl"', function 
() { - result.expressions[0].interpretation.should.equal("produktutveckl"); + it('Expression processed should equal "produktutveckl"', function () { + result.expressions[0].interpretation.processed.should.equal("produktutveckl"); }); }); @@ -682,8 +769,8 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("produktutvecklarens"); - it('Expression interpretation should equal "produktutveckl"', function () { - result.expressions[0].interpretation.should.equal("produktutveckl"); + it('Expression processed should equal "produktutveckl"', function () { + result.expressions[0].interpretation.processed.should.equal("produktutveckl"); }); }); @@ -692,8 +779,8 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("skrotverktyget"); - it('Expression interpretation should equal "skrotverktyg"', function () { - result.expressions[0].interpretation.should.equal("skrotverktyg"); + it('Expression processed should equal "skrotverktyg"', function () { + result.expressions[0].interpretation.processed.should.equal("skrotverktyg"); }); }); @@ -704,49 +791,170 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("skrotverktygets"); - it('Expression interpretation should equal "skrotverktyg"', function () { - result.expressions[0].interpretation.should.equal("skrotverktyg"); + it('Expression processed should equal "skrotverktyg"', function () { + result.expressions[0].interpretation.processed.should.equal("skrotverktyg"); }); }); - - describe('Search for "sandning"', function () { + + describe('Search for "sand"', function () { - var result = thinker.find("sandning"); + var result = thinker.find("sand"); - it('Expression interpretation should equal "sand"', function () { - result.expressions[0].interpretation.should.equal("sand"); + it('Expression processed should equal "sand"', function () { + result.expressions[0].interpretation.processed.should.equal("sand"); }); }); - 
describe('Search for "sand"', function () { + describe('Search for "sandarens"', function () { - var result = thinker.find("sand"); + var result = thinker.find("sandarens"); - it('Expression interpretation should equal "sand"', function () { - result.expressions[0].interpretation.should.equal("sand"); + it('Expression processed should equal "sand"', function () { + result.expressions[0].interpretation.processed.should.equal("sand"); }); }); + + describe('Search for "faktura"', function () { + + var result = thinker.find("faktura"); - describe('Search for "sandarens"', function () { + it('Expression processed should equal "faktur"', function () { + result.expressions[0].interpretation.processed.should.equal("faktur"); + }); - var result = thinker.find("sandarens"); + }); + + describe('Search for "fakturan"', function () { + + var result = thinker.find("fakturan"); + + it('Expression processed should equal "faktur"', function () { + result.expressions[0].interpretation.processed.should.equal("faktur"); + }); + + }); + + describe('Search for "fakturans"', function () { + + var result = thinker.find("fakturans"); - it('Expression interpretation should equal "sand"', function () { - result.expressions[0].interpretation.should.equal("sand"); + it('Expression processed should equal "faktur"', function () { + result.expressions[0].interpretation.processed.should.equal("faktur"); }); - }); + }); + + describe('Search for "fakturor"', function () { + + var result = thinker.find("fakturor"); + + it('Expression processed should equal "faktur"', function () { + result.expressions[0].interpretation.processed.should.equal("faktur"); + }); + + }); + + describe('Search for "fakturorna"', function () { + + var result = thinker.find("fakturorna"); + + it('Expression processed should equal "faktur"', function () { + result.expressions[0].interpretation.processed.should.equal("faktur"); + }); + + }); + + describe('Search for "fakturornas"', function () { + + var result = 
thinker.find("fakturornas"); + + it('Expression processed should equal "faktur"', function () { + result.expressions[0].interpretation.processed.should.equal("faktur"); + }); + + }); + + describe('Search for "fakturors"', function () { + + var result = thinker.find("fakturors"); + + it('Expression processed should equal "faktur"', function () { + result.expressions[0].interpretation.processed.should.equal("faktur"); + }); + + }); + + describe('Search for "kampanj"', function () { + + var result = thinker.find("kampanj"); + + it('Expression processed should equal "kampanj"', function () { + result.expressions[0].interpretation.processed.should.equal("kampanj"); + }); + + }); + + describe('Search for "kampanjer"', function () { + + var result = thinker.find("kampanjer"); + it('Expression processed should equal "kampanj"', function () { + result.expressions[0].interpretation.processed.should.equal("kampanj"); + }); + + }); + + describe('Search for "kampanjen"', function () { + + var result = thinker.find("kampanjen"); + it('Expression processed should equal "kampanj"', function () { + result.expressions[0].interpretation.processed.should.equal("kampanj"); + }); + + }); + + describe('Search for "kampanjens"', function () { + + var result = thinker.find("kampanjens"); + + it('Expression processed should equal "kampanj"', function () { + result.expressions[0].interpretation.processed.should.equal("kampanj"); + }); + + }); + + describe('Search for "kampanjernas"', function () { + + var result = thinker.find("kampanjernas"); + + it('Expression processed should equal "kampanj"', function () { + result.expressions[0].interpretation.processed.should.equal("kampanj"); + }); + + }); + + + describe('Search for "kampanjerna"', function () { + + var result = thinker.find("kampanjerna"); + + it('Expression processed should equal "kampanj"', function () { + result.expressions[0].interpretation.processed.should.equal("kampanj"); + }); + + }); + + describe('Search for "skrotverktyg"', 
function () { var result = thinker.find("skrotverktyg"); - it('Expression interpretation should equal "skrotverktyg"', function () { - result.expressions[0].interpretation.should.equal("skrotverktyg"); + it('Expression processed should equal "skrotverktyg"', function () { + result.expressions[0].interpretation.processed.should.equal("skrotverktyg"); }); }); @@ -755,8 +963,8 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("inbyggda"); - it('Expression interpretation should equal "inbygg"', function () { - result.expressions[0].interpretation.should.equal("inbygg"); + it('Expression processed should equal "inbygg"', function () { + result.expressions[0].interpretation.processed.should.equal("inbygg"); }); }); @@ -765,31 +973,163 @@ describe('Word processor: Swedish stemmer', function () { var result = thinker.find("inbyggd"); - it('Expression interpretation should equal "inbygg"', function () { - result.expressions[0].interpretation.should.equal("inbygg"); + it('Expression processed should equal "inbygg"', function () { + result.expressions[0].interpretation.processed.should.equal("inbygg"); }); }); - + + describe('Search for "inbyggda"', function () { + + var result = thinker.find("inbyggda"); + + it('Expression processed should equal "inbygg"', function () { + result.expressions[0].interpretation.processed.should.equal("inbygg"); + }); + + }); + + describe('Search for "hastighet"', function () { + + var result = thinker.find("hastighet"); + + it('Expression processed should equal "hast"', function () { + result.expressions[0].interpretation.processed.should.equal("hast"); + }); + + }); + + describe('Search for "hastighetens"', function () { + + var result = thinker.find("hastighetens"); + + it('Expression processed should equal "hast"', function () { + result.expressions[0].interpretation.processed.should.equal("hast"); + }); + + }); + + describe('Search for "hastigheter"', function () { + + var result = 
thinker.find("hastigheter"); + + it('Expression processed should equal "hast"', function () { + result.expressions[0].interpretation.processed.should.equal("hast"); + }); + + }); + + describe('Search for "hastigheternas"', function () { + + var result = thinker.find("hastigheternas"); + + it('Expression processed should equal "hast"', function () { + result.expressions[0].interpretation.processed.should.equal("hast"); + }); + + }); + + + describe('Search for "hastigheterna"', function () { + + var result = thinker.find("hastigheterna"); + + it('Expression processed should equal "hast"', function () { + result.expressions[0].interpretation.processed.should.equal("hast"); + }); + + }); + + + describe('Search for "bredband"', function () { + + var result = thinker.find("bredband"); + + it('Expression processed should equal "bredb"', function () { + result.expressions[0].interpretation.processed.should.equal("bredb"); + }); + + }); + + describe('Search for "bredbandet"', function () { + + var result = thinker.find("bredbandet"); + + it('Expression processed should equal "bredb"', function () { + result.expressions[0].interpretation.processed.should.equal("bredb"); + }); + + }); + + describe('Search for "bredbandens"', function () { + + var result = thinker.find("bredbandens"); + + it('Expression processed should equal "bredb"', function () { + result.expressions[0].interpretation.processed.should.equal("bredb"); + }); + + }); + + describe('Search for "bredbandets"', function () { + + var result = thinker.find("bredbandets"); + + it('Expression processed should equal "bredb"', function () { + result.expressions[0].interpretation.processed.should.equal("bredb"); + }); + + }); + + describe('Search for "sökmotorn"', function () { + + var result = thinker.find("sökmotorn"); + + it('Expression processed should equal "sökmot"', function () { + result.expressions[0].interpretation.processed.should.equal("sökmot"); + }); + }); + + describe('Search for "sökmotor"', function () { 
+ + var result = thinker.find("sökmotor"); + + it('Expression processed should equal "sökmot"', function () { + result.expressions[0].interpretation.processed.should.equal("sökmot"); + }); + + }); + + describe('Search for "sökmotorer"', function () { + + var result = thinker.find("sökmotorer"); + + it('Expression processed should equal "sökmot"', function () { + result.expressions[0].interpretation.processed.should.equal("sökmot"); + }); + + }); + describe('Search for "antikviteten"', function () { - + var result = thinker.find("antikviteten"); - + it('Should return one expression', function () { result.expressions.length.should.equal(1); }); - - it('Expression interpretation should equal "antikv"', function () { - result.expressions[0].interpretation.should.equal("antikv"); + + it('Expression processed should equal "antikv"', function () { + result.expressions[0].interpretation.processed.should.equal("antikv"); }); - + it('Should return one result (antikviteten, antivitet, antikvitets)', function () { result.documents.length.should.equal(3); }); - + it('All results should be a direct match', function () { - result.documents[0].expressions[0].should.equal(2); + result.documents[0].expressions[0].should.equal(3); }); + }); }); @@ -813,7 +1153,7 @@ describe('Word processor: English stemmer', function () { var result = thinker.find("considerable"); it('Should be interpreted as "considerable"', function () { - result.expressions[0].interpretation.should.equal("considerable"); + result.expressions[0].interpretation.processed.should.equal("considerable"); }); it('Should give one result"', function () { @@ -826,7 +1166,7 @@ describe('Word processor: English stemmer', function () { var result = thinker.find("considering"); it('Should be interpreted as "consid"', function () { - result.expressions[0].interpretation.should.equal("consid"); + result.expressions[0].interpretation.processed.should.equal("consid"); }); it('Should give one PARTIAL result"', function () { @@ 
-840,7 +1180,7 @@ describe('Word processor: English stemmer', function () { var result = thinker.find("consider"); it('Should be interpreted as "consid"', function () { - result.expressions[0].interpretation.should.equal("consid"); + result.expressions[0].interpretation.processed.should.equal("consid"); }); it('Should give one PARTIAL result"', function () { @@ -854,7 +1194,7 @@ describe('Word processor: English stemmer', function () { var result = thinker.find("triplicate"); it('Should be interpreted as "triplic"', function () { - result.expressions[0].interpretation.should.equal("triplic"); + result.expressions[0].interpretation.processed.should.equal("triplic"); }); }); @@ -863,7 +1203,7 @@ describe('Word processor: English stemmer', function () { var result = thinker.find("dependent"); it('Should be interpreted as "depend"', function () { - result.expressions[0].interpretation.should.equal("depend"); + result.expressions[0].interpretation.processed.should.equal("depend"); }); }); @@ -872,7 +1212,7 @@ describe('Word processor: English stemmer', function () { var result = thinker.find("probate"); it('Should be interpreted as "probat"', function () { - result.expressions[0].interpretation.should.equal("probat"); + result.expressions[0].interpretation.processed.should.equal("probat"); }); }); @@ -881,7 +1221,7 @@ describe('Word processor: English stemmer', function () { var result = thinker.find("controllable"); it('Should be interpreted as "control"', function () { - result.expressions[0].interpretation.should.equal("control"); + result.expressions[0].interpretation.processed.should.equal("control"); }); }); @@ -890,7 +1230,7 @@ describe('Word processor: English stemmer', function () { var result = thinker.find("rolling"); it('Should be interpreted as "roll"', function () { - result.expressions[0].interpretation.should.equal("roll"); + result.expressions[0].interpretation.processed.should.equal("roll"); }); }); @@ -908,16 +1248,15 @@ describe('Word processor: 
English soundex', function () { thinker.ranker = ranker; thinker.feed([ - [0,"This is a tile","This is a textual"], - [1,"This is a tilly","This is a sexual"], - + { id: 0, fields: ["This is a tile","This is a textual"] }, + { id: 1, fields: ["This is a tilly","This is a sexual"] } ]); describe('Search for "tile"', function () { var result = thinker.find("tile"); it('Should be interpreted as "T400"', function () { - result.expressions[0].interpretation.should.equal("T400"); + result.expressions[0].interpretation.processed.should.equal("T400"); }); it('Should give two results', function () { @@ -937,6 +1276,89 @@ describe('Word processor: English soundex', function () { }); +describe('coalesceWords option', function () { + + var thinker = Thinker({ + coalesceWords: 3 + }); + var ranker = Thinker.rankers.standard(); + + thinker.ranker = ranker; + + thinker.feed([ + { id: 0, fields: ["This is a tile","This is a textual"] }, + { id: 1, fields: ["This is a tilly","This is a sexual"] } + ]); + + describe('Search for "isatextual"', function () { + var result = thinker.find("isatextual"); + + it('Should be interpreted as "isatextual"', function () { + result.expressions[0].interpretation.processed.should.equal("isatextual"); + }); + + it('Should give one result', function () { + result.documents.length.should.equal(1); + }); + + }); + + describe('Search for "thisisatextual"', function () { + var result = thinker.find("thisisatextual"); + + it('Should give zero result', function () { + result.documents.length.should.equal(0); + }); + + }); + + describe('Search for "thisisa"', function () { + var result = thinker.find("thisisa"); + console.log(result); + it('Should give two result', function () { + result.documents.length.should.equal(2); + }); + + }); + + describe('Search for "thisis"', function () { + var result = thinker.find("thisis"); + + it('Should give zero result', function () { + result.documents.length.should.equal(2); + }); + + }); + + describe('Search for "isa"', 
function () { + var result = thinker.find("isa"); + + it('Should give zero result', function () { + result.documents.length.should.equal(2); + }); + + }); + + describe('Search for "atextual"', function () { + var result = thinker.find("atextual"); + + it('Should give zero result', function () { + result.documents.length.should.equal(1); + }); + + }); + + describe('Search for "isa"', function () { + var result = thinker.find("isa"); + + it('Should give two result', function () { + result.documents.length.should.equal(2); + }); + + }); + +}); + describe('Field processor: HTML-Stripper', function () { var thinker = Thinker({characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g}); @@ -948,7 +1370,7 @@ describe('Field processor: HTML-Stripper', function () { // We need to make a copy of exampletexts, as feed consumes the object var exampleHtml = [ - [0,"title","

atitle

\"imgdescription\"linktext

awordÅrsringar <innanför>

"] + { id: 0, fields: [ "title","

atitle

\"imgdescription\"linktext

awordÅrsringar <innanför>

"] } ]; thinker.feed(exampleHtml); @@ -1040,4 +1462,150 @@ describe('Field processor: HTML-Stripper', function () { result.documents.length.should.equal(1); }); }); +}); + +describe('Filters', function () { + var thinker = Thinker({characters: /([a-zA-Z0-9åäöÅÄÖ]*)/g}); + var ranker = Thinker.rankers.standard(); + var stripHtml = Thinker.processors.stripHtml(); + + thinker.addFieldProcessor(stripHtml); + thinker.ranker = ranker; + + // We need to make a copy of exampletexts, as feed consumes the object + var exampleHtml = [ + { id: 0, metadata: { testfilterbool: true, testfilterstring: "adfa", testfilterarr: [1,4,5] }, fields: ["Detta är en text som innehåller apa"] }, + { id: 1, metadata: { testfilterbool: false, testfilterstring: "asdf", testfilterarr: [2,5] }, fields: [ "Detta är en text som innehåller kamel"] }, + { id: 2, metadata: { testfilterbool: false, testfilterstring: "asdf", testfilterarr: [] }, fields: [ "Detta är en text som innehåller kanel"] }, + { id: 3, metadata: { testfilterbool: false, testfilterstring: "asd", testfilterarr: [5] }, fields: [ "Detta är en text som innehåller kanel"] }, + ]; + + thinker.feed(exampleHtml); + + describe('Search "apa"', function () { + var result = thinker.find( { expression: "apa", filter: () => true }); + + it('Should return one result', function () { + result.documents.length.should.equal(1); + }); + }); + + describe('Search "text"', function () { + var result = thinker.find( { expression: "text", filter: () => true} ); + + it('Should return three result', function () { + result.documents.length.should.equal(4); + }); + }); + + describe('Search "text" with filter "testfilterbool: true"', function () { + var result = thinker.find( { expression: "text", filter: (filterData) => filterData.testfilterbool} ); + + it('Should return one result', function () { + result.documents.length.should.equal(1); + }); + }); + + describe('Search "text" with filter "testfilterbool: false"', function () { + var result = thinker.find( 
{ expression: "text", filter: (filterData) => !filterData.testfilterbool} ); + + it('Should return three result', function () { + result.documents.length.should.equal(3); + }); + }); + + describe('Search "text" with filter "testfilterstring: asdf"', function () { + var result = thinker.find( { expression: "text", filter: (filterData) => filterData.testfilterstring === "asdf" } ); + it('Should return two result', function () { + result.documents.length.should.equal(2); + }); + }); + + describe('Search "text" with filter "testfilterstring: adfa"', function () { + var result = thinker.find( { expression: "text", filter: (filterData) => filterData.testfilterstring === "adfa" } ); + it('Should return one result', function () { + result.documents.length.should.equal(1); + }); + }); + + describe('Search "text" with filter "testfilterstring: fafa"', function () { + var result = thinker.find( { expression: "text", filter: (filterData) => filterData.testfilterstring === "fafa" } ); + it('Should return zero result', function () { + result.documents.length.should.equal(0); + }); + }); + + describe('Search "text" with filter "testfilterarr has 5"', function () { + var result = thinker.find( { expression: "text", filter: (filterData) => ~filterData.testfilterarr.indexOf(5) } ); + it('Should return three result', function () { + result.documents.length.should.equal(3); + }); + }); + + describe('Search "text" with filter "testfilterarr has 5 && not testfilterbool"', function () { + var result = thinker.find( { expression: "text", filter: (filterData) => ~filterData.testfilterarr.indexOf(5) && !filterData.testfilterbool } ); + it('Should return two result', function () { + result.documents.length.should.equal(2); + }); + }); + + describe('Search exact ""text"" with filter "testfilterbool"', function () { + var result = thinker.find( { expression: "\"text\"", filter: (filterData) => filterData.testfilterbool } ); + it('Should return one result', function () { + 
result.documents.length.should.equal(1); + }); + }); + + describe('Search exact ""text"" with filter "!testfilterbool"', function () { + var result = thinker.find( { expression: "\"text\"", filter: (filterData) => !filterData.testfilterbool } ); + it('Should return one result', function () { + result.documents.length.should.equal(3); + }); + }); + + describe('Ranker: Sort by metadata parameter', function () { + + var thinker = Thinker(); + + thinker.feed([ + { id: 0, metadata: {a:2}, fields: ["This is a tile","This is a textual"] }, + { id: 1, metadata: {a:1}, fields: ["This is a tilly","This is a sexual"] }, + { id: 2, metadata: {a:3}, fields: ["This is a tilly","This is a usual"] }, + { id: 3, metadata: {a:0}, fields: ["This is a tilly","This is a muse"] } + ]); + + describe('Search for "tile"', function () { + + var result = thinker.find({ + expression: "this", + sortBy: "a", + direction: true + }); + + it('Should be interpreted as "this"', function () { + result.expressions[0].interpretation.processed.should.equal("this"); + }); + + it('Should give four results', function () { + result.documents.length.should.equal(4); + }); + + it('First result should have id 2', function () { + result.documents[0].id.should.equal(2); + }); + + it('Second result should have id 0', function () { + result.documents[1].id.should.equal(0); + }); + + it('Third result should have id 1', function () { + result.documents[2].id.should.equal(1); + }); + + it('Fourth result should have id 3', function () { + result.documents[3].id.should.equal(3); + }); + }); + + }); }); \ No newline at end of file