From 4a0a6bdf9d69c7e0fa72b2cc92c2f3a40f9c450d Mon Sep 17 00:00:00 2001 From: Kreeben Date: Wed, 8 May 2024 15:26:03 +0200 Subject: [PATCH] bugfix: mapping between long keyId and ulong keyHash broke --- index.bat | 2 +- src/Sir.Document/DocmentMapReader.cs | 2 +- src/Sir.Document/DocumentIndexWriter.cs | 5 -- src/Sir.Document/DocumentMapWriter.cs | 2 +- src/Sir.Document/DocumentReader.cs | 2 +- src/Sir.Document/DocumentRegistryWriter.cs | 28 +++++------ .../Session/DocumentDatabase.cs | 14 ++---- .../Session/WriteSession.cs | 9 +--- src/Sir.KeyValue/KeyValueReader.cs | 42 +++++++++++------ src/Sir.KeyValue/KeyValueWriter.cs | 46 +++---------------- .../BagOfCharsDatabaseTests.cs | 6 +-- write.bat | 2 +- 12 files changed, 61 insertions(+), 99 deletions(-) diff --git a/index.bat b/index.bat index 54c22724..5712b5a3 100644 --- a/index.bat +++ b/index.bat @@ -1 +1 @@ -sir.bat indexwikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --collection wikipedia --skip 0 --take 200000 --pageSize 100000 --sampleSize 1000 %* \ No newline at end of file +sir.bat indexwikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --collection wikipedia --skip 0 --take 100000 --pageSize 10000 --sampleSize 1000 %* \ No newline at end of file diff --git a/src/Sir.Document/DocmentMapReader.cs b/src/Sir.Document/DocmentMapReader.cs index df3c7b79..cc912716 100644 --- a/src/Sir.Document/DocmentMapReader.cs +++ b/src/Sir.Document/DocmentMapReader.cs @@ -4,7 +4,7 @@ namespace Sir.Documents { /// - /// Read document maps (key_id/val_id) from the document map stream. + /// Read document maps (key_id/val_id, each pair 2*sizeof(long)) from the document map stream. /// A document map is needed to re-contruct a complete document. /// public class DocmentMapReader : IDisposable diff --git a/src/Sir.Document/DocumentIndexWriter.cs b/src/Sir.Document/DocumentIndexWriter.cs index 4d924760..66307392 100644 --- a/src/Sir.Document/DocumentIndexWriter.cs +++ b/src/Sir.Document/DocumentIndexWriter.cs @@ -16,11 +16,6 @@ public DocumentIndexWriter(Stream stream) _stream = stream; } - public void Flush() - { - _stream.Flush(); - } - /// /// Get the next auto-incrementing doc id /// diff --git a/src/Sir.Document/DocumentMapWriter.cs b/src/Sir.Document/DocumentMapWriter.cs index d0a6895b..4ab9a5ec 100644 --- a/src/Sir.Document/DocumentMapWriter.cs +++ b/src/Sir.Document/DocumentMapWriter.cs @@ -5,7 +5,7 @@ namespace Sir.Documents { /// - /// Writes document maps (key_id/val_id) to a bitmap. + /// Writes document maps (key_id/val_id, each pair 2*sizeof(long)) to a bitmap. /// public class DocumentMapWriter : IDisposable { diff --git a/src/Sir.Document/DocumentReader.cs b/src/Sir.Document/DocumentReader.cs index 0a3805d5..fec5ecee 100644 --- a/src/Sir.Document/DocumentReader.cs +++ b/src/Sir.Document/DocumentReader.cs @@ -21,7 +21,7 @@ public static Document Read(long documentId, DocumentRegistryReader documentRead var vInfo = documentReader.GetAddressOfValue(kvp.valId); var val = documentReader.GetValue(vInfo.offset, vInfo.len, vInfo.dataType); - fields.Add(new Field(key, val, kvp.keyId)); + fields.Add(new Field(key, val, kvp.keyId, documentId)); } } diff --git a/src/Sir.Document/DocumentRegistryWriter.cs b/src/Sir.Document/DocumentRegistryWriter.cs index 1e3479d7..71ae042f 100644 --- a/src/Sir.Document/DocumentRegistryWriter.cs +++ b/src/Sir.Document/DocumentRegistryWriter.cs @@ -9,8 +9,8 @@ namespace Sir.Documents /// public class DocumentRegistryWriter : IDisposable { - private DocumentMapWriter _docs; - private DocumentIndexWriter _docIx; + private DocumentMapWriter _documentMapWriter; + private DocumentIndexWriter _documentIndexWriter; private KeyValueWriter _kvWriter; private readonly string _directory; private readonly ulong _collectionId; @@ -19,8 +19,8 @@ public class DocumentRegistryWriter : IDisposable public DocumentRegistryWriter(string directory, ulong collectionId) { - _docs = new DocumentMapWriter(KeyValueWriter.CreateAppendStream(directory, collectionId, "docs")); - _docIx = new DocumentIndexWriter(KeyValueWriter.CreateAppendStream(directory, collectionId, "dix")); + _documentMapWriter = new DocumentMapWriter(KeyValueWriter.CreateAppendStream(directory, collectionId, "docs")); + _documentIndexWriter = new DocumentIndexWriter(KeyValueWriter.CreateAppendStream(directory, collectionId, "dix")); _kvWriter = new KeyValueWriter(directory, collectionId); _directory = directory; _collectionId = collectionId; @@ -28,39 +28,39 @@ public DocumentRegistryWriter(string directory, ulong collectionId) public long IncrementDocId() { - return _docIx.IncrementDocId(); + return _documentIndexWriter.IncrementDocId(); } public (long offset, int length) PutDocumentMap(IList<(long keyId, long valId)> doc) { - return _docs.Put(doc); + return _documentMapWriter.Put(doc); } public void UpdateDocumentMap(long offsetOfMap, int indexInMap, long keyId, long valId) { - _docs.Overwrite(offsetOfMap, indexInMap, keyId, valId); + _documentMapWriter.Overwrite(offsetOfMap, indexInMap, keyId, valId); } public void PutDocumentAddress(long docId, long offset, int len) { - _docIx.Put(docId, offset, len); + _documentIndexWriter.Put(docId, offset, len); } public void Commit() { - _docs.Dispose(); - _docIx.Dispose(); + _documentMapWriter.Dispose(); + _documentIndexWriter.Dispose(); _kvWriter.Dispose(); - _docs = new DocumentMapWriter(KeyValueWriter.CreateAppendStream(_directory, _collectionId, "docs")); - _docIx = new DocumentIndexWriter(KeyValueWriter.CreateAppendStream(_directory, _collectionId, "dix")); + _documentMapWriter = new DocumentMapWriter(KeyValueWriter.CreateAppendStream(_directory, _collectionId, "docs")); + _documentIndexWriter = new DocumentIndexWriter(KeyValueWriter.CreateAppendStream(_directory, _collectionId, "dix")); _kvWriter = new KeyValueWriter(_directory, _collectionId); } public void Dispose() { - _docs.Dispose(); - _docIx.Dispose(); + _documentMapWriter.Dispose(); + _documentIndexWriter.Dispose(); _kvWriter.Dispose(); } } diff --git a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs index c06dd91a..b2d41d38 100644 --- a/src/Sir.InformationRetreival/Session/DocumentDatabase.cs +++ b/src/Sir.InformationRetreival/Session/DocumentDatabase.cs @@ -28,8 +28,8 @@ public DocumentDatabase(string directory, ulong collectionId, IModel model = { _directory = directory ?? throw new ArgumentNullException(nameof(directory)); _collectionId = collectionId; - _model = model ?? throw new ArgumentNullException(nameof(model)); - _indexStrategy = indexStrategy ?? throw new ArgumentNullException(nameof(indexStrategy)); + _model = model; + _indexStrategy = indexStrategy; _writeSession = new WriteSession(new DocumentRegistryWriter(directory, collectionId)); _indexSession = new IndexSession(directory, collectionId, model, indexStrategy, logger); _searchSession = new SearchSession(directory, _model, _indexStrategy, logger); @@ -89,9 +89,6 @@ public void OptimizeAllIndices(int skipDocuments = 0, int takeDocuments = int.Ma public void Truncate() { DisposeInternal(); - _writeSession = null; - _indexSession = null; - _searchSession = null; var count = 0; @@ -153,9 +150,6 @@ public void TruncateIndexOnly() public void Rename(ulong newCollectionId) { DisposeInternal(); - _writeSession = null; - _indexSession = null; - _searchSession = null; var count = 0; var from = _collectionId.ToString(); @@ -186,7 +180,7 @@ private void LogInformation(string message) _logger.LogInformation(message); } - public void Commit() + public void CommitIndexAndClearSearchCache() { _writeSession.Commit(); _indexSession.Commit(); @@ -195,7 +189,7 @@ public void Commit() public void Dispose() { - Commit(); + CommitIndexAndClearSearchCache(); DisposeInternal(); } diff --git a/src/Sir.InformationRetreival/Session/WriteSession.cs b/src/Sir.InformationRetreival/Session/WriteSession.cs index 98724f25..38ef68ff 100644 --- a/src/Sir.InformationRetreival/Session/WriteSession.cs +++ b/src/Sir.InformationRetreival/Session/WriteSession.cs @@ -29,7 +29,7 @@ public void Put(Document document) if (field.Value != null) { - Write(field, docMap); + WriteField(field, docMap); } } @@ -38,7 +38,7 @@ public void Put(Document document) _documentWriter.PutDocumentAddress(document.Id, docMeta.offset, docMeta.length); } - private void Write(Field field, IList<(long, long)> docMap) + private void WriteField(Field field, IList<(long, long)> docMap) { field.KeyId = EnsureKeyExists(field.Name); @@ -59,11 +59,6 @@ public long EnsureKeyExists(string key) return _documentWriter.KeyValueWriter.EnsureKeyExists(key); } - public long EnsureKeyExistsSafely(string key) - { - return _documentWriter.KeyValueWriter.EnsureKeyExistsSafely(key); - } - public void Commit() { _documentWriter.Commit(); diff --git a/src/Sir.KeyValue/KeyValueReader.cs b/src/Sir.KeyValue/KeyValueReader.cs index ddb392c7..d525f521 100644 --- a/src/Sir.KeyValue/KeyValueReader.cs +++ b/src/Sir.KeyValue/KeyValueReader.cs @@ -48,10 +48,24 @@ public bool TryGetKeyId(ulong keyHash, out long keyId) if (!_keyCache.TryGetValue(key, out keys)) { ReadKeysIntoCache(); + + if (!_keyCache.TryGetValue(key, out keys)) + { + // there are no keys registered for this collection, even on disk. + + keyId = -1; + return false; + } } - if (keys != null || _keyCache.TryGetValue(key, out keys)) + if (keys.TryGetValue(keyHash, out keyId)) + { + return true; + } + else { + ReadKeysIntoCache(); + if (keys.TryGetValue(keyHash, out keyId)) { return true; @@ -64,31 +78,29 @@ public bool TryGetKeyId(ulong keyHash, out long keyId) private void ReadKeysIntoCache() { + _keyCache.Clear(); + foreach (var keyFile in System.IO.Directory.GetFiles(_directory, "*.kmap")) { var collectionId = ulong.Parse(Path.GetFileNameWithoutExtension(keyFile)); var key = Path.Combine(_directory, collectionId.ToString()).ToHash(); + var keys = new ConcurrentDictionary(); - var keys = _keyCache.GetOrAdd(key, (k) => + using (var stream = new FileStream(keyFile, FileMode.OpenOrCreate, FileAccess.Read, FileShare.ReadWrite)) { - var ks = new ConcurrentDictionary(); + long i = 0; + var buf = new byte[sizeof(ulong)]; + var read = stream.Read(buf, 0, buf.Length); - using (var stream = new FileStream(keyFile, FileMode.OpenOrCreate, FileAccess.Read, FileShare.ReadWrite)) + while (read > 0) { - long i = 0; - var buf = new byte[sizeof(ulong)]; - var read = stream.Read(buf, 0, buf.Length); - - while (read > 0) - { - ks.TryAdd(BitConverter.ToUInt64(buf, 0), i++); + keys.TryAdd(BitConverter.ToUInt64(buf, 0), i++); - read = stream.Read(buf, 0, buf.Length); - } + read = stream.Read(buf, 0, buf.Length); } + } - return ks; - }); + _keyCache.GetOrAdd(key, keys); } } diff --git a/src/Sir.KeyValue/KeyValueWriter.cs b/src/Sir.KeyValue/KeyValueWriter.cs index 5350a886..07b983f9 100644 --- a/src/Sir.KeyValue/KeyValueWriter.cs +++ b/src/Sir.KeyValue/KeyValueWriter.cs @@ -15,9 +15,8 @@ public class KeyValueWriter : IDisposable private readonly ValueIndexWriter _keyIx; private readonly ulong _collectionId; private readonly string _directory; - private readonly object _keyLock = new object(); - private ConcurrentDictionary> _keyCache; private readonly KeyValueReader _kvReader; + private static object _keyLock = new object(); public KeyValueWriter(string directory, ulong collectionId) : this( @@ -29,7 +28,6 @@ public KeyValueWriter(string directory, ulong collectionId) { _collectionId = collectionId; _directory = directory; - _keyCache = new ConcurrentDictionary>(); _kvReader = new KeyValueReader(directory, collectionId); } @@ -75,7 +73,7 @@ public static Stream CreateAppendStream(string directory, ulong collectionId, lo return new FileStream(fileName, FileMode.Append, FileAccess.Write, FileShare.ReadWrite); } - public long EnsureKeyExistsSafely(string keyStr) + public long EnsureKeyExists(string keyStr) { var keyHash = keyStr.ToHash(); long keyId; @@ -94,7 +92,10 @@ public long EnsureKeyExistsSafely(string keyStr) keyId = PutKeyInfo(keyInfo.offset, keyInfo.len, keyInfo.dataType); // store key mapping - RegisterKeyMapping(_directory, _collectionId, keyHash, keyId); + using (var stream = CreateAppendStream(_directory, _collectionId, "kmap")) + { + stream.Write(BitConverter.GetBytes(keyHash), 0, sizeof(ulong)); + } } } } @@ -102,27 +103,6 @@ public long EnsureKeyExistsSafely(string keyStr) return keyId; } - public long EnsureKeyExists(string keyStr) - { - var keyHash = keyStr.ToHash(); - long keyId; - - if (!_kvReader.TryGetKeyId(keyHash, out keyId)) - { - // We have a new key! - - // store key - var keyInfo = PutKey(keyStr); - - keyId = PutKeyInfo(keyInfo.offset, keyInfo.len, keyInfo.dataType); - - // store key mapping - RegisterKeyMapping(_directory, _collectionId, keyHash, keyId); - } - - return keyId; - } - public (long keyId, long valueId) PutValue(long keyId, object val, out byte dataType) { // store value @@ -164,20 +144,6 @@ public void OverwriteFixedLengthValue(long offset, object value, Type type) _vals.Put(value); } - public void RegisterKeyMapping(string directory, ulong collectionId, ulong keyHash, long keyId) - { - var key = Path.Combine(directory, collectionId.ToString()).ToHash(); - var keys = _keyCache.GetOrAdd(key, (key) => { return new ConcurrentDictionary(); }); - var keyMapping = keys.GetOrAdd(keyHash, (key) => - { - using (var stream = CreateAppendStream(directory, collectionId, "kmap")) - { - stream.Write(BitConverter.GetBytes(keyHash), 0, sizeof(ulong)); - } - return keyId; - }); - } - public void Dispose() { _vals.Dispose(); diff --git a/src/Sir.StringTests/BagOfCharsDatabaseTests.cs b/src/Sir.StringTests/BagOfCharsDatabaseTests.cs index b63991b8..9ae53786 100644 --- a/src/Sir.StringTests/BagOfCharsDatabaseTests.cs +++ b/src/Sir.StringTests/BagOfCharsDatabaseTests.cs @@ -32,7 +32,7 @@ public void Can_stream() database.Write(document, index: false); } - database.Commit(); + database.CommitIndexAndClearSearchCache(); var i = 0; @@ -69,7 +69,7 @@ public void Can_read_and_write() database.Write(document); } - database.Commit(); + database.CommitIndexAndClearSearchCache(); var queryParser = database.CreateQueryParser(); @@ -109,7 +109,7 @@ public void Can_optimize_index() database.Write(document, store:true, index:false); // note: no indexing going on here } - database.Commit(); + database.CommitIndexAndClearSearchCache(); var queryParser = database.CreateQueryParser(); diff --git a/write.bat b/write.bat index 2e937d90..5db8472a 100644 --- a/write.bat +++ b/write.bat @@ -1 +1 @@ -sir.bat writewikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --file d:\enwiki-20211122-cirrussearch-content.json.gz --collection wikipedia --skip 100000 --take 100000 --sampleSize 10000 %* \ No newline at end of file +sir.bat writewikipedia --directory C:\projects\resin\src\Sir.HttpServer\AppData\database --file d:\enwiki-20211122-cirrussearch-content.json.gz --collection wikipedia --skip 0 --take 10000 --sampleSize 1000 %* \ No newline at end of file