diff --git a/index.bat b/index.bat
index 54c22724..5712b5a3 100644
--- a/index.bat
+++ b/index.bat
@@ -1 +1 @@
-sir.bat indexwikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --collection wikipedia --skip 0 --take 200000 --pageSize 100000 --sampleSize 1000 %*
\ No newline at end of file
+sir.bat indexwikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --collection wikipedia --skip 0 --take 100000 --pageSize 10000 --sampleSize 1000 %*
\ No newline at end of file
diff --git a/src/Sir.Document/DocmentMapReader.cs b/src/Sir.Document/DocmentMapReader.cs
index df3c7b79..cc912716 100644
--- a/src/Sir.Document/DocmentMapReader.cs
+++ b/src/Sir.Document/DocmentMapReader.cs
@@ -4,7 +4,7 @@
namespace Sir.Documents
{
///
- /// Read document maps (key_id/val_id) from the document map stream.
+ /// Read document maps (key_id/val_id, each pair 2*sizeof(long)) from the document map stream.
/// A document map is needed to re-contruct a complete document.
///
public class DocmentMapReader : IDisposable
diff --git a/src/Sir.Document/DocumentIndexWriter.cs b/src/Sir.Document/DocumentIndexWriter.cs
index 4d924760..66307392 100644
--- a/src/Sir.Document/DocumentIndexWriter.cs
+++ b/src/Sir.Document/DocumentIndexWriter.cs
@@ -16,11 +16,6 @@ public DocumentIndexWriter(Stream stream)
_stream = stream;
}
- public void Flush()
- {
- _stream.Flush();
- }
-
///
/// Get the next auto-incrementing doc id
///
diff --git a/src/Sir.Document/DocumentMapWriter.cs b/src/Sir.Document/DocumentMapWriter.cs
index d0a6895b..4ab9a5ec 100644
--- a/src/Sir.Document/DocumentMapWriter.cs
+++ b/src/Sir.Document/DocumentMapWriter.cs
@@ -5,7 +5,7 @@
namespace Sir.Documents
{
///
- /// Writes document maps (key_id/val_id) to a bitmap.
+ /// Writes document maps (key_id/val_id, each pair 2*sizeof(long)) to a bitmap.
///
public class DocumentMapWriter : IDisposable
{
diff --git a/src/Sir.Document/DocumentReader.cs b/src/Sir.Document/DocumentReader.cs
index 0a3805d5..fec5ecee 100644
--- a/src/Sir.Document/DocumentReader.cs
+++ b/src/Sir.Document/DocumentReader.cs
@@ -21,7 +21,7 @@ public static Document Read(long documentId, DocumentRegistryReader documentRead
var vInfo = documentReader.GetAddressOfValue(kvp.valId);
var val = documentReader.GetValue(vInfo.offset, vInfo.len, vInfo.dataType);
- fields.Add(new Field(key, val, kvp.keyId));
+ fields.Add(new Field(key, val, kvp.keyId, documentId));
}
}
diff --git a/src/Sir.Document/DocumentRegistryWriter.cs b/src/Sir.Document/DocumentRegistryWriter.cs
index 1e3479d7..71ae042f 100644
--- a/src/Sir.Document/DocumentRegistryWriter.cs
+++ b/src/Sir.Document/DocumentRegistryWriter.cs
@@ -9,8 +9,8 @@ namespace Sir.Documents
///
public class DocumentRegistryWriter : IDisposable
{
- private DocumentMapWriter _docs;
- private DocumentIndexWriter _docIx;
+ private DocumentMapWriter _documentMapWriter;
+ private DocumentIndexWriter _documentIndexWriter;
private KeyValueWriter _kvWriter;
private readonly string _directory;
private readonly ulong _collectionId;
@@ -19,8 +19,8 @@ public class DocumentRegistryWriter : IDisposable
public DocumentRegistryWriter(string directory, ulong collectionId)
{
- _docs = new DocumentMapWriter(KeyValueWriter.CreateAppendStream(directory, collectionId, "docs"));
- _docIx = new DocumentIndexWriter(KeyValueWriter.CreateAppendStream(directory, collectionId, "dix"));
+ _documentMapWriter = new DocumentMapWriter(KeyValueWriter.CreateAppendStream(directory, collectionId, "docs"));
+ _documentIndexWriter = new DocumentIndexWriter(KeyValueWriter.CreateAppendStream(directory, collectionId, "dix"));
_kvWriter = new KeyValueWriter(directory, collectionId);
_directory = directory;
_collectionId = collectionId;
@@ -28,39 +28,39 @@ public DocumentRegistryWriter(string directory, ulong collectionId)
public long IncrementDocId()
{
- return _docIx.IncrementDocId();
+ return _documentIndexWriter.IncrementDocId();
}
public (long offset, int length) PutDocumentMap(IList<(long keyId, long valId)> doc)
{
- return _docs.Put(doc);
+ return _documentMapWriter.Put(doc);
}
public void UpdateDocumentMap(long offsetOfMap, int indexInMap, long keyId, long valId)
{
- _docs.Overwrite(offsetOfMap, indexInMap, keyId, valId);
+ _documentMapWriter.Overwrite(offsetOfMap, indexInMap, keyId, valId);
}
public void PutDocumentAddress(long docId, long offset, int len)
{
- _docIx.Put(docId, offset, len);
+ _documentIndexWriter.Put(docId, offset, len);
}
public void Commit()
{
- _docs.Dispose();
- _docIx.Dispose();
+ _documentMapWriter.Dispose();
+ _documentIndexWriter.Dispose();
_kvWriter.Dispose();
- _docs = new DocumentMapWriter(KeyValueWriter.CreateAppendStream(_directory, _collectionId, "docs"));
- _docIx = new DocumentIndexWriter(KeyValueWriter.CreateAppendStream(_directory, _collectionId, "dix"));
+ _documentMapWriter = new DocumentMapWriter(KeyValueWriter.CreateAppendStream(_directory, _collectionId, "docs"));
+ _documentIndexWriter = new DocumentIndexWriter(KeyValueWriter.CreateAppendStream(_directory, _collectionId, "dix"));
_kvWriter = new KeyValueWriter(_directory, _collectionId);
}
public void Dispose()
{
- _docs.Dispose();
- _docIx.Dispose();
+ _documentMapWriter.Dispose();
+ _documentIndexWriter.Dispose();
_kvWriter.Dispose();
}
}
diff --git a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs
index c06dd91a..b2d41d38 100644
--- a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs
+++ b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs
@@ -28,8 +28,8 @@ public DocumentDatabase(string directory, ulong collectionId, IModel model =
{
_directory = directory ?? throw new ArgumentNullException(nameof(directory));
_collectionId = collectionId;
- _model = model ?? throw new ArgumentNullException(nameof(model));
- _indexStrategy = indexStrategy ?? throw new ArgumentNullException(nameof(indexStrategy));
+ _model = model;
+ _indexStrategy = indexStrategy;
_writeSession = new WriteSession(new DocumentRegistryWriter(directory, collectionId));
_indexSession = new IndexSession(directory, collectionId, model, indexStrategy, logger);
_searchSession = new SearchSession(directory, _model, _indexStrategy, logger);
@@ -89,9 +89,6 @@ public void OptimizeAllIndices(int skipDocuments = 0, int takeDocuments = int.Ma
public void Truncate()
{
DisposeInternal();
- _writeSession = null;
- _indexSession = null;
- _searchSession = null;
var count = 0;
@@ -153,9 +150,6 @@ public void TruncateIndexOnly()
public void Rename(ulong newCollectionId)
{
DisposeInternal();
- _writeSession = null;
- _indexSession = null;
- _searchSession = null;
var count = 0;
var from = _collectionId.ToString();
@@ -186,7 +180,7 @@ private void LogInformation(string message)
_logger.LogInformation(message);
}
- public void Commit()
+ public void CommitIndexAndClearSearchCache()
{
_writeSession.Commit();
_indexSession.Commit();
@@ -195,7 +189,7 @@ public void Commit()
public void Dispose()
{
- Commit();
+ CommitIndexAndClearSearchCache();
DisposeInternal();
}
diff --git a/src/Sir.InformationRetreival/Session/WriteSession.cs b/src/Sir.InformationRetreival/Session/WriteSession.cs
index 98724f25..38ef68ff 100644
--- a/src/Sir.InformationRetreival/Session/WriteSession.cs
+++ b/src/Sir.InformationRetreival/Session/WriteSession.cs
@@ -29,7 +29,7 @@ public void Put(Document document)
if (field.Value != null)
{
- Write(field, docMap);
+ WriteField(field, docMap);
}
}
@@ -38,7 +38,7 @@ public void Put(Document document)
_documentWriter.PutDocumentAddress(document.Id, docMeta.offset, docMeta.length);
}
- private void Write(Field field, IList<(long, long)> docMap)
+ private void WriteField(Field field, IList<(long, long)> docMap)
{
field.KeyId = EnsureKeyExists(field.Name);
@@ -59,11 +59,6 @@ public long EnsureKeyExists(string key)
return _documentWriter.KeyValueWriter.EnsureKeyExists(key);
}
- public long EnsureKeyExistsSafely(string key)
- {
- return _documentWriter.KeyValueWriter.EnsureKeyExistsSafely(key);
- }
-
public void Commit()
{
_documentWriter.Commit();
diff --git a/src/Sir.KeyValue/KeyValueReader.cs b/src/Sir.KeyValue/KeyValueReader.cs
index ddb392c7..d525f521 100644
--- a/src/Sir.KeyValue/KeyValueReader.cs
+++ b/src/Sir.KeyValue/KeyValueReader.cs
@@ -48,10 +48,24 @@ public bool TryGetKeyId(ulong keyHash, out long keyId)
if (!_keyCache.TryGetValue(key, out keys))
{
ReadKeysIntoCache();
+
+ if (!_keyCache.TryGetValue(key, out keys))
+ {
+ // there are no keys registered for this collection, even on disk.
+
+ keyId = -1;
+ return false;
+ }
}
- if (keys != null || _keyCache.TryGetValue(key, out keys))
+ if (keys.TryGetValue(keyHash, out keyId))
+ {
+ return true;
+ }
+ else
{
+ ReadKeysIntoCache();
+
if (keys.TryGetValue(keyHash, out keyId))
{
return true;
@@ -64,31 +78,29 @@ public bool TryGetKeyId(ulong keyHash, out long keyId)
private void ReadKeysIntoCache()
{
+ _keyCache.Clear();
+
foreach (var keyFile in System.IO.Directory.GetFiles(_directory, "*.kmap"))
{
var collectionId = ulong.Parse(Path.GetFileNameWithoutExtension(keyFile));
var key = Path.Combine(_directory, collectionId.ToString()).ToHash();
+ var keys = new ConcurrentDictionary();
- var keys = _keyCache.GetOrAdd(key, (k) =>
+ using (var stream = new FileStream(keyFile, FileMode.OpenOrCreate, FileAccess.Read, FileShare.ReadWrite))
{
- var ks = new ConcurrentDictionary();
+ long i = 0;
+ var buf = new byte[sizeof(ulong)];
+ var read = stream.Read(buf, 0, buf.Length);
- using (var stream = new FileStream(keyFile, FileMode.OpenOrCreate, FileAccess.Read, FileShare.ReadWrite))
+ while (read > 0)
{
- long i = 0;
- var buf = new byte[sizeof(ulong)];
- var read = stream.Read(buf, 0, buf.Length);
-
- while (read > 0)
- {
- ks.TryAdd(BitConverter.ToUInt64(buf, 0), i++);
+ keys.TryAdd(BitConverter.ToUInt64(buf, 0), i++);
- read = stream.Read(buf, 0, buf.Length);
- }
+ read = stream.Read(buf, 0, buf.Length);
}
+ }
- return ks;
- });
+ _keyCache.GetOrAdd(key, keys);
}
}
diff --git a/src/Sir.KeyValue/KeyValueWriter.cs b/src/Sir.KeyValue/KeyValueWriter.cs
index 5350a886..07b983f9 100644
--- a/src/Sir.KeyValue/KeyValueWriter.cs
+++ b/src/Sir.KeyValue/KeyValueWriter.cs
@@ -15,9 +15,8 @@ public class KeyValueWriter : IDisposable
private readonly ValueIndexWriter _keyIx;
private readonly ulong _collectionId;
private readonly string _directory;
- private readonly object _keyLock = new object();
- private ConcurrentDictionary> _keyCache;
private readonly KeyValueReader _kvReader;
+ private static object _keyLock = new object();
public KeyValueWriter(string directory, ulong collectionId)
: this(
@@ -29,7 +28,6 @@ public KeyValueWriter(string directory, ulong collectionId)
{
_collectionId = collectionId;
_directory = directory;
- _keyCache = new ConcurrentDictionary>();
_kvReader = new KeyValueReader(directory, collectionId);
}
@@ -75,7 +73,7 @@ public static Stream CreateAppendStream(string directory, ulong collectionId, lo
return new FileStream(fileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite);
}
- public long EnsureKeyExistsSafely(string keyStr)
+ public long EnsureKeyExists(string keyStr)
{
var keyHash = keyStr.ToHash();
long keyId;
@@ -94,7 +92,10 @@ public long EnsureKeyExistsSafely(string keyStr)
keyId = PutKeyInfo(keyInfo.offset, keyInfo.len, keyInfo.dataType);
// store key mapping
- RegisterKeyMapping(_directory, _collectionId, keyHash, keyId);
+ using (var stream = CreateAppendStream(_directory, _collectionId, "kmap"))
+ {
+ stream.Write(BitConverter.GetBytes(keyHash), 0, sizeof(ulong));
+ }
}
}
}
@@ -102,27 +103,6 @@ public long EnsureKeyExistsSafely(string keyStr)
return keyId;
}
- public long EnsureKeyExists(string keyStr)
- {
- var keyHash = keyStr.ToHash();
- long keyId;
-
- if (!_kvReader.TryGetKeyId(keyHash, out keyId))
- {
- // We have a new key!
-
- // store key
- var keyInfo = PutKey(keyStr);
-
- keyId = PutKeyInfo(keyInfo.offset, keyInfo.len, keyInfo.dataType);
-
- // store key mapping
- RegisterKeyMapping(_directory, _collectionId, keyHash, keyId);
- }
-
- return keyId;
- }
-
public (long keyId, long valueId) PutValue(long keyId, object val, out byte dataType)
{
// store value
@@ -164,20 +144,6 @@ public void OverwriteFixedLengthValue(long offset, object value, Type type)
_vals.Put(value);
}
- public void RegisterKeyMapping(string directory, ulong collectionId, ulong keyHash, long keyId)
- {
- var key = Path.Combine(directory, collectionId.ToString()).ToHash();
- var keys = _keyCache.GetOrAdd(key, (key) => { return new ConcurrentDictionary(); });
- var keyMapping = keys.GetOrAdd(keyHash, (key) =>
- {
- using (var stream = CreateAppendStream(directory, collectionId, "kmap"))
- {
- stream.Write(BitConverter.GetBytes(keyHash), 0, sizeof(ulong));
- }
- return keyId;
- });
- }
-
public void Dispose()
{
_vals.Dispose();
diff --git a/src/Sir.StringTests/BagOfCharsDatabaseTests.cs b/src/Sir.StringTests/BagOfCharsDatabaseTests.cs
index b63991b8..9ae53786 100644
--- a/src/Sir.StringTests/BagOfCharsDatabaseTests.cs
+++ b/src/Sir.StringTests/BagOfCharsDatabaseTests.cs
@@ -32,7 +32,7 @@ public void Can_stream()
database.Write(document, index: false);
}
- database.Commit();
+ database.CommitIndexAndClearSearchCache();
var i = 0;
@@ -69,7 +69,7 @@ public void Can_read_and_write()
database.Write(document);
}
- database.Commit();
+ database.CommitIndexAndClearSearchCache();
var queryParser = database.CreateQueryParser();
@@ -109,7 +109,7 @@ public void Can_optimize_index()
database.Write(document, store:true, index:false); // note: no indexing going on here
}
- database.Commit();
+ database.CommitIndexAndClearSearchCache();
var queryParser = database.CreateQueryParser();
diff --git a/write.bat b/write.bat
index 2e937d90..5db8472a 100644
--- a/write.bat
+++ b/write.bat
@@ -1 +1 @@
-sir.bat writewikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --file d:\enwiki-20211122-cirrussearch-content.json.gz --collection wikipedia --skip 100000 --take 100000 --sampleSize 10000 %*
\ No newline at end of file
+sir.bat writewikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --file d:\enwiki-20211122-cirrussearch-content.json.gz --collection wikipedia --skip 0 --take 10000 --sampleSize 1000 %*
\ No newline at end of file