[package]
name = "charabia"
version = "0.9.1"
license = "MIT"
authors = ["Many <many@meilisearch.com>"]
edition = "2021"
description = "A simple library to detect the language, tokenize the text and normalize the tokens"
documentation = "https://docs.rs/charabia"
repository = "https://github.com/meilisearch/charabia"
keywords = ["segmenter", "tokenizer", "normalize", "language"]
categories = ["text-processing"]
exclude = ["dictionaries/txt/thai/words.txt"]

[dependencies]
aho-corasick = "1.1.3"
csv = "1.3.0"
either = "1.13.0"
finl_unicode = { version = "1.2.0", optional = true }
fst = "0.4"
jieba-rs = { version = "0.7", optional = true }
once_cell = "1.19.0"
serde = "1.0.192"
slice-group-by = "0.3.1"
whatlang = "0.16.4"
lindera = { version = "=0.32.2", default-features = false, optional = true }
pinyin = { version = "0.10", default-features = false, features = [
    "with_tone",
], optional = true }
wana_kana = { version = "3.0.0", optional = true }
unicode-normalization = "0.1.23"
irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" }

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"]

# allow chinese specialized tokenization
chinese = ["chinese-segmentation", "chinese-normalization"]
chinese-segmentation = ["dep:jieba-rs"]
chinese-normalization = []
chinese-normalization-pinyin = ["dep:pinyin", "chinese-normalization"]

# allow hebrew specialized tokenization
hebrew = []

# allow japanese specialized tokenization
japanese = ["japanese-segmentation-unidic"]
japanese-segmentation-ipadic = ["lindera/ipadic", "lindera/compress"]
japanese-segmentation-unidic = ["lindera/unidic", "lindera/compress"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
korean = ["lindera/ko-dic", "lindera/compress"]

# allow thai specialized tokenization
thai = []

# allow greek specialized tokenization
greek = []

# allow splitting camelCase latin words
latin-camelcase = ["dep:finl_unicode"]

# allow khmer specialized tokenization
khmer = []

# allow vietnamese specialized tokenization
vietnamese = []

# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]

# force Charabia to recompose Swedish characters
swedish-recomposition = []

# allow turkish specialized tokenization
turkish = []

# allow decomposition of German composite words
german-segmentation = []

[dev-dependencies]
criterion = "0.5"
jemallocator = "0.5.4"
quickcheck = "1"
quickcheck_macros = "1"

[[bench]]
name = "bench"
harness = false
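
# A minimal sketch of how a downstream crate might consume this manifest's
# feature flags: disable the (large) default set and opt in to only the
# language pipelines it needs. The dependent crate's manifest shown below is
# hypothetical; the feature names are the ones declared in [features] above.
#
#   [dependencies]
#   charabia = { version = "0.9.1", default-features = false, features = [
#       "chinese",
#       "japanese",
#   ] }
#
# Because `japanese` pulls in `japanese-segmentation-unidic` (and thus a
# lindera dictionary), trimming unused language features can noticeably
# reduce compile time and binary size.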