[package]
name = "charabia"
version = "0.9.2"
license = "MIT"
authors = ["Many <many@meilisearch.com>"]
edition = "2021"
description = "A simple library to detect the language, tokenize the text and normalize the tokens"
documentation = "https://docs.rs/charabia"
repository = "https://github.com/meilisearch/charabia"
keywords = ["segmenter", "tokenizer", "normalize", "language"]
categories = ["text-processing"]
exclude = ["dictionaries/txt/thai/words.txt"]

[dependencies]
aho-corasick = "1.1.3"
csv = "1.3.0"
either = "1.13.0"
finl_unicode = { version = "1.2.0", optional = true }
fst = "0.4"
jieba-rs = { version = "0.7", optional = true }
once_cell = "1.19.0"
serde = "1.0.192"
slice-group-by = "0.3.1"
whatlang = "0.16.4"
lindera = { version = "=0.32.2", default-features = false, optional = true }
pinyin = { version = "0.10", default-features = false, features = [
    "with_tone",
], optional = true }
wana_kana = { version = "4.0.0", optional = true }
unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]

# allow chinese specialized tokenization
chinese = ["chinese-segmentation", "chinese-normalization"]
chinese-segmentation = ["dep:jieba-rs"]
chinese-normalization = []
chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"]

# allow hebrew specialized tokenization
hebrew = []

# allow japanese specialized tokenization
japanese = ["japanese-segmentation-unidic", "japanese-transliteration"]
japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"]
japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
korean = ["lindera/ko-dic", "lindera/compress"]

# allow thai specialized tokenization
thai = []

# allow greek specialized tokenization
greek = []

# allow splitting camelCase latin words
latin-camelcase = ["dep:finl_unicode"]

# allow khmer specialized tokenization
khmer = []

# allow vietnamese specialized tokenization
vietnamese = []

# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]

# force Charabia to recompose Swedish characters
swedish-recomposition = []

# allow turkish specialized tokenization
turkish = []

# allow decomposition of German composite words
german-segmentation = []

[dev-dependencies]
criterion = "0.5"
quickcheck = "1"
quickcheck_macros = "1"
mimalloc = "0.1.43"

[[bench]]
name = "bench"
harness = false
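
# Usage sketch (comment only, not part of the crate's build configuration):
# a downstream crate can trim the default feature set above by disabling
# default features and opting into specific languages, e.g. in its own
# Cargo.toml:
#
#     charabia = { version = "0.9.2", default-features = false, features = ["hebrew", "greek"] }
#
# Features declared with a `dep:` prefix (such as `chinese-segmentation =
# ["dep:jieba-rs"]`) only pull in their optional dependency when the feature
# is enabled, so the unused tokenizers stay out of the build.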