[workspace]
members = ["bindings/*"]

[workspace.package]
version = "0.18.1"
authors = ["Ben Brandt"]
edition = "2021"
description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python."
repository = "https://github.com/benbrandt/text-splitter"
license = "MIT"
keywords = ["text", "split", "tokenizer", "nlp", "ai"]
categories = ["text-processing"]

[workspace.lints.rust]
future_incompatible = { level = "warn", priority = -1 }
missing_debug_implementations = "warn"
missing_docs = "warn"
nonstandard_style = { level = "warn", priority = -1 }
rust_2018_compatibility = { level = "warn", priority = -1 }
rust_2018_idioms = { level = "warn", priority = -1 }
rust_2021_compatibility = { level = "warn", priority = -1 }
rust_2024_compatibility = { level = "warn", priority = -1 }
unused = { level = "warn", priority = -1 }

[workspace.lints.clippy]
cargo = "warn"
pedantic = "warn"

[package]
name = "text-splitter"
version.workspace = true
authors.workspace = true
edition.workspace = true
description.workspace = true
repository.workspace = true
license.workspace = true
keywords.workspace = true
categories.workspace = true
exclude = [
    ".github/**",
    ".vscode/**",
    "/bindings/**",
    "/benches/output.txt",
    "/docs/**",
    # Rely on large test files
    "/tests/snapshots/**",
    "/tests/text_splitter_snapshots.rs",
    "/tests/inputs/**",
    "/tests/tokenizers/**",
    "*.yml",
    "*.yaml",
]
rust-version = "1.75.0"

[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
ahash = "0.8.7"
auto_enums = "0.8"
either = "1.6"
itertools = "0.13"
once_cell = "1.20"
pulldown-cmark = { version = "0.12", default-features = false, optional = true }
regex = "1.10.6"
rust_tokenizers = { version = "8", optional = true }
strum = { version = "0.26", features = ["derive"] }
thiserror = "1.0.65"
tiktoken-rs = { version = "0.6", optional = true }
tokenizers = { version = "0.20", default-features = false, optional = true }
tree-sitter = { version = "0.24", optional = true }
unicode-segmentation = "1.12"

[dev-dependencies]
cached-path = { version = "0.6", default-features = false, features = [
    "rustls-tls",
] }
dirs = "5.0.1"
divan = "0.1.14"
fake = "2"
insta = { version = "1.40", features = ["glob", "yaml"] }
more-asserts = "0.3"
rayon = "1.10"
tokenizers = { version = "0.20", default-features = false, features = [
    "onig",
    "http",
] }
tree-sitter-rust = "0.23"

[[bench]]
name = "chunk_size"
harness = false

[features]
code = ["dep:tree-sitter"]
markdown = ["dep:pulldown-cmark"]
rust-tokenizers = ["dep:rust_tokenizers"]
tiktoken-rs = ["dep:tiktoken-rs"]
tokenizers = ["dep:tokenizers", "tokenizers/onig"]

[lints]
workspace = true

# Tokenizers and indirect deps can cause slow runtime
[profile.dev.package."*"]
opt-level = 3
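
# A minimal sketch of how a downstream crate might depend on this library and
# enable some of the optional features declared in [features] above. This is a
# hypothetical consumer Cargo.toml, not part of this workspace; the chosen
# feature names match those defined here, but which ones to enable depends on
# the tokenizer backends the consumer actually needs.
#
# [dependencies]
# text-splitter = { version = "0.18", features = ["markdown", "tiktoken-rs"] }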