use tokenizers::models::bpe::BPE;
use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::{DecoderWrapper, NormalizerWrapper, PostProcessorWrapper, PreTokenizerWrapper};
use tokenizers::{Model, Tokenizer, TokenizerBuilder};

#[test]
fn bpe_values_after_training() {
    // Build a tokenizer with a BPE model configured with an unknown token and dropout,
    // train it on a small corpus, and check that the configured values survive training.
    let mut tokenizer = TokenizerBuilder::<
        BPE,
        NormalizerWrapper,
        PreTokenizerWrapper,
        PostProcessorWrapper,
        DecoderWrapper,
    >::default()
    .with_model(
        BPE::builder()
            .unk_token("[UNK]".to_string())
            .dropout(0.1)
            .build()
            .unwrap(),
    )
    .build()
    .unwrap();

    let mut trainer = tokenizer.get_model().get_trainer();
    tokenizer
        .train_from_files(&mut trainer, vec!["./data/small.txt".to_string()])
        .unwrap();

    assert_eq!(tokenizer.get_model().dropout, Some(0.1));
    assert_eq!(tokenizer.get_model().unk_token, Some("[UNK]".to_string()));
}

#[test]
fn bpe_continuing_subword_prefix_error() {
    // Train a BPE model that uses a continuing-subword prefix together with a
    // Whitespace pre-tokenizer, then round-trip it through a file and verify
    // that the reloaded tokenizer reports the expected vocabulary size.
    let mut tokenizer = TokenizerBuilder::<
        BPE,
        NormalizerWrapper,
        PreTokenizerWrapper,
        PostProcessorWrapper,
        DecoderWrapper,
    >::default()
    .with_model(
        BPE::builder()
            .unk_token("[UNK]".to_string())
            .continuing_subword_prefix("##".to_string())
            .build()
            .unwrap(),
    )
    .with_pre_tokenizer(Some(PreTokenizerWrapper::Whitespace(Whitespace {})))
    .build()
    .unwrap();

    let mut trainer = tokenizer.get_model().get_trainer();
    tokenizer
        .train_from_files(&mut trainer, vec!["./data/small.txt".to_string()])
        .unwrap();

    tokenizer.save("tokenizer.json", true).unwrap();
    let tokenizer = Tokenizer::from_file("tokenizer.json").unwrap();
    assert_eq!(tokenizer.get_vocab_size(false), 1526);

    std::fs::remove_file("tokenizer.json").unwrap();
}