# Cargo manifest for the `ungoliant` crate.
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[package]
name = "ungoliant"
version = "2.0.0"
# One array element per author.
# NOTE(review): no email addresses are present in this manifest — add them
# as "Name <email>" if they should be published.
authors = ["Julien Abadji", "Pedro J. Ortiz"]
edition = "2021"
homepage = "https://github.com/oscar-project/ungoliant"
license = "Apache-2.0"
repository = "https://github.com/oscar-project/ungoliant"
description = "The pipeline for the OSCAR corpus."

[dependencies]
avro-rs = { version = "0.13.0", features = ["snappy"] }
bytes = "1"
csv = "1.1.6"
# Optional: only compiled when the `kenlm` feature is enabled.
ctclib-pp = { version = "0.2.0", optional = true }
env_logger = "0.8.3"
fasttext = "0.7.6"
flate2 = "1.0.20"
futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
glob = "0.3.0"
itertools = "0.10.0"
language-tags = "0.3.2"
lazy_static = "1.4.0"
log = "0.4.14"
oscar-io = "0.2.2"
oxilangtag = { version = "0.1.3", features = ["serde"] }
rand = "0.8.4"
rayon = "1"
# Default features are disabled; TLS is provided through the `rustls-tls` feature.
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls", "blocking", "stream"] }
runiq-lib = "1.2.2"
schemars = "0.8.3"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
sha2 = "0.9.5"
structopt = "0.3.21"
# Previously pulled from a git branch; kept for reference:
# tlsh = { git = "https://github.com/Uinelj/tlsh-rs", branch = "fix-q3-panic" }
tlsh-fixed = "0.1.1"
tokio = { version = "1", features = ["full"] }
tokio-util = { version = "0.6.6", features = ["compat"] }
twox-hash = "1.6"
unic-ucd = "0.9.0"
unicode-script = "0.5.4"
unicode-segmentation = "1.8.0"
url = "2.2.2"
ut1_blocklist = "0.3.0"
warc = { version = "0.3.0", features = ["with_serde"] }

[features]
# Turns on the optional `ctclib-pp` dependency. The `dep:` prefix keeps Cargo
# from also exposing an implicit feature named `ctclib-pp`.
kenlm = ["dep:ctclib-pp"]

[dev-dependencies]
criterion = "0.3"
rand_distr = "0.4.2"
serial_test = "0.5.1"
sha-1 = "0.9"
tempfile = "3.2.0"
test-log = "0.2.11"

# `harness = false` disables the built-in libtest bench harness so each
# benchmark supplies its own entry point (presumably via criterion — confirm
# against the files under benches/).
[[bench]]
name = "fasttext_bench"
harness = false

[[bench]]
name = "pipeline_bench_rayon"
harness = false

[[bench]]
name = "annotate_noisy"
harness = false