ultra-nlp

Crates.ioultra-nlp
lib.rsultra-nlp
version0.8.0
sourcesrc
created_at2022-07-05 08:50:03.883651
updated_at2024-02-27 20:02:10.497426
descriptionAn NLP library.
homepage
repositoryhttps://github.com/BlackGlory/ultra-nlp
max_upload_size
id619562
size146,829
(BlackGlory)

documentation

README

ultra-nlp

Install

cargo add ultra-nlp

Usage

ngrams

let text = "你好世界";

let result = ngrams(text, 2);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["你好", "好世", "世界"]
);

extract_consecutive_chinese_chars

let text = "foo中文bar字符baz";

let result = extract_consecutive_chinese_chars(text);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["中文", "字符"]
);

extract_consecutive_letters

let text = "foo中文,bar,字符baz";

let result = extract_consecutive_letters(text);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["foo中文", "bar", "字符baz"]
);

cedarwood (slow, low memory usage)

Ignore unmatched contents

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::Ignore
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec!["南京", "南京市", "市长", "长江", "大桥"]
);

Keep unmatched contents as chars

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::KeepAsChars
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
    ]
);

Keep unmatched contents as words

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::KeepAsWords
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
    ]
);

daachorse (fast, high memory usage)

Ignore unmatched contents

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::Ignore);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
      "南京", "南京市", "市长", "长江", "大桥",
    ]
);

Keep unmatched contents as chars

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsChars);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
    ]
);

Keep unmatched contents as words

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsWords);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
    ]
);
Commit count: 70

cargo fmt