use bert_tokenizer::{FullTokenizer, Tokenizer};

#[test]
fn test_full_tokenizer_uncased() {
    let vocab_file = "tests/uncased_L-12_H-768_A-12/vocab.txt";
    let do_lower_case = true;
    let tokenizer = FullTokenizer::new()
        .vocab_from_file(vocab_file)
        .do_lower_case(do_lower_case)
        .build();

    // The uncased tokenizer lowercases input and splits off punctuation.
    let tokens = tokenizer.tokenize("Hello world!");
    assert_eq!(tokens, vec!["hello", "world", "!"]);

    // Round trip tokens -> ids -> tokens; the conversion must be lossless.
    let ids = tokenizer.convert_tokens_to_ids(&tokens);
    let tokens = tokenizer.convert_ids_to_tokens(&ids);
    assert_eq!(tokens, vec!["hello", "world", "!"]);

    let text = tokenizer.convert_tokens_to_string(&tokens);
    assert_eq!(text, "hello world !");
}

#[test]
fn test_full_tokenizer_cased() {
    let vocab_file = "tests/cased_L-12_H-768_A-12/vocab.txt";
    let do_lower_case = false;
    let tokenizer = FullTokenizer::new()
        .vocab_from_file(vocab_file)
        .do_lower_case(do_lower_case)
        .build();

    // The cased tokenizer preserves the original casing.
    let tokens = tokenizer.tokenize("Hello world!");
    assert_eq!(tokens, vec!["Hello", "world", "!"]);

    let ids = tokenizer.convert_tokens_to_ids(&tokens);
    let tokens = tokenizer.convert_ids_to_tokens(&ids);
    assert_eq!(tokens, vec!["Hello", "world", "!"]);

    let text = tokenizer.convert_tokens_to_string(&tokens);
    assert_eq!(text, "Hello world !");
}

#[test]
fn test_full_tokenizer_cased_strip_accents() {
    let vocab_file = "tests/cased_L-12_H-768_A-12/vocab.txt";
    let do_lower_case = false;
    let do_strip_accents = true;
    let tokenizer = FullTokenizer::new()
        .vocab_from_file(vocab_file)
        .do_lower_case(do_lower_case)
        .do_strip_accents(do_strip_accents)
        .build();

    // With accent stripping enabled, "wörld" is normalized to "world",
    // which is an in-vocabulary token.
    let tokens = tokenizer.tokenize("Hello wörld!");
    assert_eq!(tokens, vec!["Hello", "world", "!"]);

    let ids = tokenizer.convert_tokens_to_ids(&tokens);
    let tokens = tokenizer.convert_ids_to_tokens(&ids);
    assert_eq!(tokens, vec!["Hello", "world", "!"]);

    let text = tokenizer.convert_tokens_to_string(&tokens);
    assert_eq!(text, "Hello world !");
}

#[test]
fn test_full_tokenizer_cased_no_strip_accents() {
    let vocab_file = "tests/cased_L-12_H-768_A-12/vocab.txt";
    let tokenizer = FullTokenizer::new()
        .vocab_from_file(vocab_file)
        .build();

    // With the builder defaults (no accent stripping), "wörld" is not in the
    // vocab and falls back to WordPiece subword pieces.
    let tokens = tokenizer.tokenize("Hello wörld!");
    assert_eq!(tokens, vec!["Hello", "w", "##ö", "##rl", "##d", "!"]);

    let ids = tokenizer.convert_tokens_to_ids(&tokens);
    let tokens = tokenizer.convert_ids_to_tokens(&ids);
    assert_eq!(tokens, vec!["Hello", "w", "##ö", "##rl", "##d", "!"]);

    // Detokenization rejoins "##" continuation pieces onto the previous token.
    let text = tokenizer.convert_tokens_to_string(&tokens);
    assert_eq!(text, "Hello wörld !");
}