-
Notifications
You must be signed in to change notification settings - Fork 0
/
Cargo.toml
40 lines (38 loc) · 1.48 KB
/
Cargo.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
[package]
# Manifest key reference: https://doc.rust-lang.org/cargo/reference/manifest.html
name = "alea-preprocess"
version = "0.1.13"
authors = ["ALEA Institute <hello@aleainstitute.ai>"]
edition = "2021"
# Paths kept out of the packaged crate: Python bytecode caches, tests,
# resources, docker files, and repository tooling configuration.
exclude = [
    "**/__pycache__",
    "**/*.pyc",
    "**/*.pyo",
    "tests/",
    "resources/",
    "docker/",
    ".github/",
    ".pre-commit-config.yaml",
    ".gitignore",
]
homepage = "https://aleainstitute.ai/"
keywords = ["alea", "llm", "data", "preprocess", "kl3m"]
license = "MIT"
readme = "README.md"
repository = "https://github.com/alea-institute/alea-preprocess"
description = "Efficient, accessible preprocessing routines for pretrain, SFT, and DPO training data preparation from the ALEA Institute."
# Library target. The target name uses an underscore, unlike the hyphenated
# package name "alea-preprocess".
[lib]
name = "alea_preprocess"
# Build a C-compatible dynamic library only (no rlib is produced).
# NOTE(review): presumably loaded as a Python extension module, since pyo3's
# "extension-module" feature is enabled in [dependencies] - confirm.
crate-type = ["cdylib"]
[dependencies]
# Entries are kept in alphabetical order; versions are caret (SemVer-compatible)
# requirements.
base64 = "0.22.1"
# NOTE(review): pinned to a pre-release; confirm a stable release is not usable.
blake2 = "0.11.0-pre.4"
blake3 = { version = "1.5.3", features = ["rayon", "mmap"] }
file-format = { version = "0.25.0", features = ["reader"] }
flate2 = { version = "1.0.31", features = ["zlib-ng"] }
hex = "0.4.3"
html-escape = "0.2.13"
icu = "1.5.0"
lazy_static = "1.5.0"
pdfium-render = "0.8.24"
pyo3 = { version = "0.22.0", features = ["extension-module", "serde"] }
rand = "0.8.5"
rayon = "1.10.0"
regex = "1.10.6"
reqwest = { version = "0.12.7", features = ["blocking", "json"] }
serde_json = "1.0.125"
strsim = "0.11.1"
tl = { version = "0.7.8", features = ["simd"] }
tokenizers = { version = "0.20.0", features = ["http"] }
# NOTE(review): "test-util" is enabled for regular (non-dev) builds; confirm it
# is needed outside of tests.
tokio = { version = "1.39.3", features = ["full", "test-util"] }
walkdir = "2.5.0"