# Cargo manifest for the `charabia` crate.

[package]
name = "charabia"
version = "0.8.12"
# NOTE(review): the author entry has a trailing space and no e-mail address;
# it looks truncated — confirm against the upstream manifest.
authors = ["Many "]
categories = ["text-processing"]
documentation = "https://docs.rs/charabia"
edition = "2021"
# Keep this dictionary file out of the packaged crate.
exclude = ["dictionaries/txt/thai/words.txt"]
keywords = ["segmenter", "tokenizer", "normalize", "language"]
license = "MIT"
repository = "https://github.com/meilisearch/charabia"
description = "A simple library to detect the language, tokenize the text and normalize the tokens"

[dependencies]
aho-corasick = "1.1.3"
cow-utils = "0.1"
csv = "1.3.0"
deunicode = "1.6.0"
either = "1.13.0"
# Optional: only pulled in by the `latin-camelcase` / `latin-snakecase` features.
finl_unicode = { version = "1.2.0", optional = true }
fst = "0.4"
# Workspace-local crate, pinned to an exact version.
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }
# Optional: only pulled in by the `chinese-segmentation` feature.
jieba-rs = { version = "0.7", optional = true }
# Optional and pinned to an exact version; the dictionaries are selected through
# the `japanese-segmentation-*` and `korean` features below.
lindera = { version = "=0.32.2", default-features = false, optional = true }
litemap = "0.7.3"
once_cell = "1.19.0"
# Optional: only pulled in by the `chinese-normalization-pinyin` feature.
pinyin = { version = "0.10", default-features = false, features = ["with_tone"], optional = true }
serde = "1.0"
slice-group-by = "0.3.1"
unicode-normalization = "0.1.23"
# Optional: only pulled in by the `japanese-transliteration` feature.
wana_kana = { version = "3.0.0", optional = true }
whatlang = "0.16.4"
zerovec = "0.10.4"

[features]
default = [
    "chinese",
    "hebrew",
    "japanese",
    "thai",
    "korean",
    "greek",
    "latin-camelcase",
    "latin-snakecase",
    "khmer",
    "vietnamese",
]

# allow chinese specialized tokenization
chinese = ["chinese-segmentation", "chinese-normalization"]
chinese-segmentation = ["dep:jieba-rs"]
chinese-normalization = []
chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"]

# allow hebrew specialized tokenization
hebrew = []

# allow japanese specialized tokenization
# (`japanese` selects the unidic dictionary; ipadic is available as an alternative feature)
japanese = ["japanese-segmentation-unidic"]
japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"]
japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
korean = ["lindera/ko-dic", "lindera/compress"]

# allow thai specialized tokenization
thai = []

# allow greek specialized tokenization
greek = []

# allow splitting camelCase latin words
latin-camelcase = ["dep:finl_unicode"]

# NOTE(review): presumably allows khmer specialized tokenization, like the
# sibling language features — confirm.
khmer = []

# allow vietnamese specialized tokenization
vietnamese = []

# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]

# force Charabia to recompose Swedish characters
swedish-recomposition = []

[dev-dependencies]
criterion = "0.5"
jemallocator = "0.5.4"
quickcheck = "1"
quickcheck_macros = "1"

[[bench]]
name = "bench"
# Disable the built-in libtest harness so the bench target supplies its own `main`.
harness = false