use tokenizers::models::wordpiece::WordPiece;
use tokenizers::{AddedToken, Tokenizer};

fn main() {
    let start = std::time::Instant::now();
    let mut tokenizer = Tokenizer::new(WordPiece::default());

    // Mix special and non-special tokens; this lets you verify that the ids
    // stay in order and that the special status round-trips correctly.
    let tokens: Vec<_> = (0..120_000)
        .map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
        .collect();
    tokenizer.add_tokens(&tokens);

    // Time serialization to disk (pretty-printed JSON).
    tokenizer.save("_tok.json", true).unwrap();
    println!("Save took {:?}", start.elapsed());

    // Time deserialization from the file we just wrote.
    let start = std::time::Instant::now();
    let _tok = Tokenizer::from_file("_tok.json").unwrap();
    println!("Load took {:?}", start.elapsed());

    std::fs::remove_file("_tok.json").unwrap();
}