From 4ac33f9d4021e24c090e1c7184ba329186a4df8c Mon Sep 17 00:00:00 2001 From: Tony Solomonik Date: Sun, 26 May 2024 21:09:26 +0300 Subject: [PATCH] metadata: Use postgres to store and read index file metadata like path --- .env | 7 + ...c429fa08f1f2d988f5df839783df0b4321d2c.json | 32 + Cargo.lock | 691 +++++++++++++++++- Cargo.toml | 2 + migrations/0001_indexes.down.sql | 1 + migrations/0001_indexes.up.sql | 5 + src/args.rs | 26 +- src/main.rs | 149 ++-- 8 files changed, 829 insertions(+), 84 deletions(-) create mode 100644 .env create mode 100644 .sqlx/query-01c21a2c5f1799c72bd2ef9c7ecc429fa08f1f2d988f5df839783df0b4321d2c.json create mode 100644 migrations/0001_indexes.down.sql create mode 100644 migrations/0001_indexes.up.sql diff --git a/.env b/.env new file mode 100644 index 0000000..1239380 --- /dev/null +++ b/.env @@ -0,0 +1,7 @@ +# sudo -u postgres psql +# ALTER USER postgres PASSWORD 'postgres'; +DATABASE_URL=postgres://postgres:postgres@127.0.0.1/toshokan + +# cargo sqlx prepare +# https://github.com/launchbadge/sqlx/blob/main/sqlx-cli/README.md +SQLX_OFFLINE=true diff --git a/.sqlx/query-01c21a2c5f1799c72bd2ef9c7ecc429fa08f1f2d988f5df839783df0b4321d2c.json b/.sqlx/query-01c21a2c5f1799c72bd2ef9c7ecc429fa08f1f2d988f5df839783df0b4321d2c.json new file mode 100644 index 0000000..e7ad264 --- /dev/null +++ b/.sqlx/query-01c21a2c5f1799c72bd2ef9c7ecc429fa08f1f2d988f5df839783df0b4321d2c.json @@ -0,0 +1,32 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT id, file_name, footer_len FROM indexes", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "id", + "type_info": "Varchar" + }, + { + "ordinal": 1, + "name": "file_name", + "type_info": "Text" + }, + { + "ordinal": 2, + "name": "footer_len", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [] + }, + "nullable": [ + false, + false, + false + ] + }, + "hash": "01c21a2c5f1799c72bd2ef9c7ecc429fa08f1f2d988f5df839783df0b4321d2c" +} diff --git a/Cargo.lock b/Cargo.lock index 043bd1c..781a959 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,6 +24,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", + "getrandom", "once_cell", "version_check", "zerocopy", @@ -137,7 +138,16 @@ checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", ] [[package]] @@ -179,12 +189,24 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + [[package]] name = "bincode" version = "1.3.3" @@ -194,11 +216,20 @@ dependencies = [ "serde", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +dependencies = [ + "serde", +] [[package]] name = "bitpacking" @@ -299,10 +330,10 @@ version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", - "syn", + "syn 2.0.61", ] [[package]] @@ -330,12 +361,42 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + [[package]] name = "core-foundation-sys" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crc32c" version = "0.6.5" @@ -382,6 +443,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.19" @@ -404,6 +474,17 @@ dependencies = [ "typenum", ] +[[package]] +name = "der" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f55bf8e7b65898637379c1b74eb1551107c8294ed26d855ceb9fd1a09cfc9bc0" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + [[package]] name = "deranged" version = "0.3.11" @@ -421,9 +502,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", + "subtle", ] +[[package]] +name = "dotenvy" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + [[package]] name = "downcast-rs" version = "1.2.1" @@ -435,6 +524,9 @@ name = "either" version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" +dependencies = [ + "serde", +] [[package]] name = "env_logger" @@ -449,6 +541,12 @@ dependencies = [ "termcolor", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.9" @@ -459,6 +557,23 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "eyre" version = "0.6.12" @@ -487,6 +602,17 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdeb3aa5e95cf9aabc17f060cfa0ced7b83f042390760ca53bf09df9968acaa1" +[[package]] +name = "flume" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181" +dependencies = [ + "futures-core", + "futures-sink", + "spin 0.9.8", +] + [[package]] name = "fnv" version = "1.0.7" @@ -554,6 +680,17 @@ dependencies = [ "futures-util", ] +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot", +] + [[package]] name = "futures-io" version = "0.3.30" @@ -568,7 +705,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", ] [[package]] @@ -653,6 +790,24 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashlink" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "heck" version = "0.5.0" @@ -665,6 +820,39 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "htmlescape" version = "0.3.1" @@ -812,6 +1000,16 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "instant" version = "0.1.12" @@ -885,6 +1083,9 @@ name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] [[package]] name = "levenshtein_automata" @@ -904,6 +1105,17 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +[[package]] +name = "libsqlite3-sys" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.4.13" @@ -1057,12 +1269,49 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand", + "smallvec", + "zeroize", +] + [[package]] name = "num-conv" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1116,7 +1365,7 @@ dependencies = [ "anyhow", "async-trait", "backon", - "base64", + "base64 0.22.1", "bytes", "chrono", "crc32c", @@ -1175,11 +1424,26 @@ checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.1", "smallvec", "windows-targets 0.52.5", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -1203,7 +1467,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", ] [[package]] @@ -1218,6 +1482,27 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.30" @@ -1334,13 +1619,22 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "redox_syscall" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" dependencies = [ - "bitflags", + "bitflags 2.5.0", ] [[package]] @@ -1393,7 +1687,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "566cafdd92868e0939d3fb961bd0dc25fcfaaed179291093b3d43e6b3150ea10" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-core", "futures-util", @@ -1440,11 +1734,31 @@ dependencies = [ "cfg-if", "getrandom", "libc", - "spin", + "spin 0.9.8", "untrusted", "windows-sys 0.52.0", ] +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core", + "signature", + "spki", + "subtle", + "zeroize", +] + [[package]] name = "rust-stemmers" version = "1.2.0" @@ -1482,7 +1796,7 @@ version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags", + "bitflags 2.5.0", "errno", "libc", "linux-raw-sys", @@ -1509,7 +1823,7 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" dependencies = [ - "base64", + "base64 0.22.1", "rustls-pki-types", ] @@ -1577,7 +1891,7 @@ checksum = "c5e405930b9796f1c00bee880d03fc7e0bb4b9a11afc776885ffe84320da2865" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", ] [[package]] @@ -1603,6 +1917,28 @@ dependencies = [ "serde", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -1621,6 +1957,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core", +] + [[package]] name = "sketches-ddsketch" version = "0.2.2" @@ -1655,11 +2001,235 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + [[package]] name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlformat" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce81b7bd7c4493975347ef60d8c7e8b742d4694f4c49f93e0a12ea263938176c" +dependencies = [ + "itertools", + "nom", + "unicode_categories", +] + +[[package]] +name = "sqlx" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9a2ccff1a000a5a59cd33da541d9f2fdcd9e6e8229cc200565942bff36d0aaa" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24ba59a9342a3d9bab6c56c118be528b27c9b60e490080e9711a04dccac83ef6" +dependencies = [ + "ahash", + "atoi", + "byteorder", + "bytes", + "crc", + "crossbeam-queue", + "either", + "event-listener", + "futures-channel", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashlink", + "hex", + "indexmap", + "log", + "memchr", + "once_cell", + "paste", + "percent-encoding", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlformat", + "thiserror", + "tokio", + "tokio-stream", + "tracing", + "url", +] + +[[package]] +name = "sqlx-macros" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ea40e2345eb2faa9e1e5e326db8c34711317d2b5e08d0d5741619048a803127" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 1.0.109", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5833ef53aaa16d860e92123292f1f6a3d53c34ba8b1969f152ef1a7bb803f3c8" +dependencies = [ + "dotenvy", + "either", + "heck 0.4.1", + "hex", + "once_cell", + "proc-macro2", + "quote", + "serde", + "serde_json", + "sha2", + "sqlx-core", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", + "syn 1.0.109", + "tempfile", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ed31390216d20e538e447a7a9b959e06ed9fc51c37b514b46eb758016ecd418" +dependencies = [ + "atoi", + "base64 0.21.7", + "bitflags 2.5.0", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c824eb80b894f926f89a0b9da0c7f435d27cdd35b8c655b114e58223918577e" +dependencies = [ + "atoi", + "base64 0.21.7", + "bitflags 2.5.0", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "rand", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b244ef0a8414da0bed4bb1910426e890b19e5e9bccc27ada6b797d05c55ae0aa" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "sqlx-core", + "tracing", + "url", + "urlencoding", +] [[package]] name = "stable_deref_trait" @@ -1667,6 +2237,17 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stringprep" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", + "unicode-properties", +] + [[package]] name = "strsim" version = "0.11.1" @@ -1679,6 +2260,17 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.61" @@ -1704,7 +2296,7 @@ checksum = "f8d0582f186c0a6d55655d24543f15e43607299425c5ad8352c242b914b31856" dependencies = [ "aho-corasick", "arc-swap", - "base64", + "base64 0.22.1", "bitpacking", "byteorder", "census", @@ -1875,7 +2467,7 @@ checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", ] [[package]] @@ -1961,7 +2553,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", ] [[package]] @@ -1975,6 +2567,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.11" @@ -1997,6 +2600,7 @@ dependencies = [ "bincode", "clap", "color-eyre", + "dotenvy", "futures", "log", "once_cell", @@ -2004,6 +2608,7 @@ dependencies = [ "pretty_env_logger", "serde", "serde_json", + "sqlx", "tantivy", "tokio", "tokio-util", @@ -2058,7 +2663,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", ] [[package]] @@ -2133,6 +2738,24 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + +[[package]] +name = "unicode-segmentation" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "untrusted" version = "0.9.0" @@ -2150,6 +2773,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf8-ranges" version = "1.0.5" @@ -2179,6 +2808,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -2200,6 +2835,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + [[package]] name = "wasm-bindgen" version = "0.2.92" @@ -2221,7 +2862,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.61", "wasm-bindgen-shared", ] @@ -2255,7 +2896,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2298,6 +2939,16 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "whoami" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" +dependencies = [ + "redox_syscall 0.4.1", + "wasite", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2513,7 +3164,7 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.61", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index b9fa134..f873a10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ lto = "thin" bincode = "1.3.3" clap = { version = "4.5.4", features = ["derive"] } color-eyre = { version = "0.6.3", default-features = false } +dotenvy = "0.15.7" futures = "0.3.30" log = "0.4.21" once_cell = "1.19.0" @@ -27,6 +28,7 @@ opendal = { version = "0.46.0", features = ["services-fs"] } pretty_env_logger = "0.5.0" serde = { version = "1.0.201", features = ["derive", "rc"] } serde_json = "1.0.117" +sqlx = { version = "0.7.4", features = ["postgres", "macros", "runtime-tokio"] } tantivy = "0.22.0" tokio = { version = "1.37.0", features = ["full"] } tokio-util = { version = "0.7.11", features = ["compat"] } diff --git a/migrations/0001_indexes.down.sql b/migrations/0001_indexes.down.sql new file mode 100644 index 0000000..b3afadf --- /dev/null +++ b/migrations/0001_indexes.down.sql @@ -0,0 +1 @@ +DROP TABLE indexes; diff --git a/migrations/0001_indexes.up.sql b/migrations/0001_indexes.up.sql new file mode 100644 index 0000000..dc58c89 --- /dev/null +++ b/migrations/0001_indexes.up.sql @@ -0,0 +1,5 @@ +CREATE TABLE IF NOT EXISTS indexes( + id VARCHAR(36) PRIMARY KEY, + file_name TEXT NOT NULL, + footer_len BIGINT NOT NULL +); diff --git a/src/args.rs b/src/args.rs index 65bfb77..dacdbdb 100644 --- a/src/args.rs +++ b/src/args.rs @@ -3,6 +3,21 @@ use clap::Parser; #[derive(Parser, Debug, Clone)] #[command(author, version, about, long_about = None)] pub struct Args { + #[clap( + short, + long, + help = "Path to the index dir.", + default_value = "/tmp/toshokan" + )] + pub index_dir: String, + + #[clap( + long, + help = "Postgres DB connection url. +Can also be provided by a DATABASE_URL env var, but only if this arg is not provided." + )] + pub db: Option, + #[clap(subcommand)] pub subcmd: SubCommand, } @@ -24,14 +39,11 @@ pub struct IndexArgs { #[clap(help = "Path to the input jsonl file you want to index.")] pub input_path: String, - #[clap(help = "Path to the index dir.")] - pub index_dir: String, - #[clap( short, long, help = "Path to the dir to build in the inverted indexes.", - default_value = "/tmp/toshokan" + default_value = "/tmp/toshokan_build" )] pub build_dir: String, @@ -46,9 +58,6 @@ The memory is split evenly between all indexing threads, once a thread reaches i #[derive(Parser, Debug, Clone)] pub struct MergeArgs { - #[clap(help = "Path to the index dir.")] - pub index_dir: String, - #[clap( short, long, @@ -60,9 +69,6 @@ pub struct MergeArgs { #[derive(Parser, Debug, Clone)] pub struct SearchArgs { - #[clap(help = "Path to the index dir.")] - pub index_dir: String, - #[clap(help = "Query in tantivy syntax.")] pub query: String, diff --git a/src/main.rs b/src/main.rs index e822f21..0cdc5de 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,7 +5,6 @@ mod opendal_file_handle; mod unified_index; use std::{ - collections::HashSet, path::{Path, PathBuf}, sync::Arc, time::Duration, @@ -13,9 +12,11 @@ use std::{ use args::{IndexArgs, MergeArgs, SearchArgs}; use color_eyre::eyre::Result; +use dotenvy::dotenv; use futures::future::{try_join, try_join_all}; use opendal::{layers::LoggingLayer, BlockingOperator, Operator}; use pretty_env_logger::formatted_timed_builder; +use sqlx::{postgres::PgPoolOptions, query, PgPool}; use tantivy::{ collector::TopDocs, directory::{DirectoryClone, FileSlice, MmapDirectory}, @@ -27,7 +28,7 @@ use tantivy::{ DateTime, Document, Index, IndexWriter, ReloadPolicy, TantivyDocument, }; use tokio::{ - fs::{create_dir, read_dir, read_to_string, write, File}, + fs::{create_dir, create_dir_all, remove_file, File}, io::{AsyncBufReadExt, AsyncWriteExt, BufReader}, runtime::Builder, spawn, @@ -51,19 +52,15 @@ extern crate log; const DEFAULT_DEBUG_LOG_LEVEL: &str = "toshokan=trace,opendal::services=info"; const DEFAULT_RELEASE_LOG_LEVEL: &str = "toshokan=info,opendal::services=info"; -async fn open_unified_directories(index_dir: &str) -> Result> { - let mut index_ids = HashSet::new(); - let mut dir_reader = read_dir(index_dir).await?; - while let Some(entry) = dir_reader.next_entry().await? { - if let Some(filename) = entry.file_name().to_str() { - index_ids.insert( - filename - .chars() - .take_while(|&c| c != '.') - .collect::(), - ); - } - } +const MAX_DB_CONNECTIONS: u32 = 100; + +async fn open_unified_directories( + index_dir: &str, + pool: &PgPool, +) -> Result> { + let items = query!("SELECT id, file_name, footer_len FROM indexes") + .fetch_all(pool) + .await?; let mut builder = opendal::services::Fs::default(); builder.root(index_dir); @@ -73,28 +70,26 @@ async fn open_unified_directories(index_dir: &str) -> Result()?; - - directories_args.push((file_slice, footer_len)) + directories_args.push((item.id, file_slice, item.footer_len)) } let results = try_join_all( directories_args .into_iter() - .map(|(file_slice, footer_len)| { - spawn_blocking(move || -> Result { - UnifiedDirectory::open_with_len(file_slice, footer_len as usize) + .map(|(id, file_slice, footer_len)| { + spawn_blocking(move || -> Result<(String, UnifiedDirectory)> { + Ok(( + id, + UnifiedDirectory::open_with_len(file_slice, footer_len as usize)?, + )) }) }), ) @@ -103,9 +98,14 @@ async fn open_unified_directories(index_dir: &str) -> Result>() } -async fn write_unified_index(index: Index, input_dir: &str, output_dir: &str) -> Result<()> { - let build_dir_path = PathBuf::from(input_dir); - let file_cache = spawn_blocking(move || build_file_cache(&build_dir_path)).await??; +async fn write_unified_index( + index: Index, + input_dir: &str, + output_dir: &str, + pool: &PgPool, +) -> Result<()> { + let cloned_input_dir = PathBuf::from(input_dir); + let file_cache = spawn_blocking(move || build_file_cache(&cloned_input_dir)).await??; let unified_index_writer = UnifiedIndexWriter::from_file_paths( Path::new(input_dir), @@ -121,8 +121,9 @@ async fn write_unified_index(index: Index, input_dir: &str, output_dir: &str) -> .finish(); let id = uuid::Uuid::now_v7(); + let file_name = format!("{}.index", id); let mut writer = op - .writer_with(&format!("{}.index", id)) + .writer_with(&file_name) .content_type("application/octet-stream") .chunk(5_000_000) .await? @@ -134,11 +135,12 @@ async fn write_unified_index(index: Index, input_dir: &str, output_dir: &str) -> let (total_len, footer_len) = unified_index_writer.write(&mut writer, file_cache).await?; writer.shutdown().await?; - write( - Path::new(output_dir).join(format!("{}.footer", id)), - footer_len.to_string(), - ) - .await?; + query("INSERT INTO indexes (id, file_name, footer_len) VALUES ($1, $2, $3)") + .bind(&id.to_string()) + .bind(&file_name) + .bind(footer_len as i64) + .execute(pool) + .await?; debug!( "Index file length: {}. Footer length: {}", @@ -148,7 +150,7 @@ async fn write_unified_index(index: Index, input_dir: &str, output_dir: &str) -> Ok(()) } -async fn index(args: IndexArgs) -> Result<()> { +async fn index(args: IndexArgs, pool: PgPool, index_dir: &str) -> Result<()> { let mut schema_builder = Schema::builder(); let dynamic_field = schema_builder.add_json_field( "_dynamic", @@ -161,7 +163,7 @@ async fn index(args: IndexArgs) -> Result<()> { let schema = schema_builder.build(); - let _ = create_dir(&args.build_dir).await; + let _ = create_dir_all(&args.build_dir).await; let index = Index::open_or_create(MmapDirectory::open(&args.build_dir)?, schema.clone())?; let mut index_writer: IndexWriter = index.writer(args.memory_budget)?; index_writer.set_merge_policy(Box::new(NoMergePolicy)); @@ -202,25 +204,26 @@ async fn index(args: IndexArgs) -> Result<()> { spawn_blocking(move || index_writer.wait_merging_threads()).await??; - write_unified_index(index, &args.build_dir, &args.index_dir).await?; + write_unified_index(index, &args.build_dir, index_dir, &pool).await?; Ok(()) } -async fn merge(args: MergeArgs) -> Result<()> { - let _ = create_dir(&args.merge_dir).await; - let output_dir = MmapDirectory::open(&args.merge_dir)?; - - let directories = open_unified_directories(&args.index_dir) +async fn merge(args: MergeArgs, pool: PgPool, index_dir: &str) -> Result<()> { + let (ids, directories): (Vec<_>, Vec<_>) = open_unified_directories(index_dir, &pool) .await? .into_iter() - .map(|x| x.box_clone()) - .collect::>(); + .map(|(id, dir)| (id, dir.box_clone())) + .unzip(); + if directories.len() <= 1 { info!("Need at least 2 files in index directory to be able to merge"); return Ok(()); } + let _ = create_dir(&args.merge_dir).await; + let output_dir = MmapDirectory::open(&args.merge_dir)?; + let index = Index::open(MergeDirectory::new(directories, output_dir.box_clone())?)?; let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?; index_writer.set_merge_policy(Box::new(NoMergePolicy)); @@ -233,18 +236,33 @@ async fn merge(args: MergeArgs) -> Result<()> { spawn_blocking(move || index_writer.wait_merging_threads()).await??; - write_unified_index(index, &args.merge_dir, &args.index_dir).await?; + write_unified_index(index, &args.merge_dir, index_dir, &pool).await?; + + let delete_result = query("DELETE FROM indexes WHERE id = ANY($1)") + .bind(&ids) + .execute(&pool) + .await; + + for id in ids { + let _ = remove_file( + Path::new(index_dir) + .join(format!("{}.index", id)) + .to_str() + .expect("failed to build index path"), + ) + .await; + } + + delete_result?; Ok(()) } -async fn search(args: SearchArgs) -> Result<()> { +async fn search(args: SearchArgs, directories: Vec) -> Result<()> { if args.limit == 0 { return Ok(()); } - let directories = open_unified_directories(&args.index_dir).await?; - let (tx, mut rx) = channel(args.limit); let mut tx_handles = Vec::with_capacity(directories.len()); @@ -307,9 +325,19 @@ async fn search(args: SearchArgs) -> Result<()> { Ok(()) } +async fn open_db_pool(url: &str) -> Result { + Ok(PgPoolOptions::new() + .max_connections(MAX_DB_CONNECTIONS) + .connect(url) + .await?) +} + async fn async_main() -> Result<()> { color_eyre::install()?; + // Load vars inside .env into env vars, does nothing if the file does not exist. + let _ = dotenv(); + let default_log_level = if cfg!(debug_assertions) { DEFAULT_DEBUG_LOG_LEVEL } else { @@ -323,15 +351,27 @@ async fn async_main() -> Result<()> { log_builder.try_init()?; let args = parse_args(); + + let pool = open_db_pool(&args.db.unwrap_or_else(|| { + std::env::var("DATABASE_URL") + .expect("database url must be provided using either --db or DATABASE_URL env var") + })) + .await?; + match args.subcmd { SubCommand::Index(index_args) => { - index(index_args).await?; + index(index_args, pool, &args.index_dir).await?; } SubCommand::Merge(merge_args) => { - merge(merge_args).await?; + merge(merge_args, pool, &args.index_dir).await?; } SubCommand::Search(search_args) => { - search(search_args).await?; + let directories = open_unified_directories(&args.index_dir, &pool) + .await? + .into_iter() + .map(|(_, x)| x) + .collect::>(); + search(search_args, directories).await?; } } @@ -341,6 +381,7 @@ async fn async_main() -> Result<()> { fn main() -> Result<()> { let runtime = Builder::new_multi_thread() .thread_keep_alive(Duration::from_secs(20)) + .enable_all() .build()?; runtime.block_on(async_main()) }