//! The library comes with a default tokenizer, `Default`, a struct that loads the internal
//! `encoder.json` and `vocab.bpe` files. It simplifies the creation of the `encode` and
//! `decode` functions. This is especially useful when you just want to estimate the number
//! of tokens your prompt will consume.
//!
//! > As a rule of thumb, OpenAI suggests that 100 tokens equal roughly 75 words.

use gpt_tokenizer::Default;

fn main() {
    let tokenizer = Default::new();

    let text = r#"Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890"#;

    let encoded = &tokenizer.encode(text);
    let decoded = &tokenizer.decode(encoded);

    println!("Original text: {}", text);
    println!("Encoded text: {:#?}", encoded);
    println!("Decoded text: {}", decoded);

    println!("Text size: {}", text.len());
    println!("Words: {}", text.split(" ").count());
    // Rule of thumb: 100 tokens ~= 75 words, so expected tokens ~= words * 4 / 3.
    println!("Rule of Thumb: {}", text.split(" ").count() * 4 / 3);
    println!("Tokens: {}", encoded.len());
}
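
// A minimal sketch (not part of the example above) of how the rule of thumb could sit next
// to an exact count. `estimate_tokens` and `count_tokens` are hypothetical helper names;
// `count_tokens` reuses the `gpt_tokenizer::Default` struct imported at the top of this file
// and the same `encode` call shown in `main`.

/// Cheap estimate: 100 tokens ~= 75 words, i.e. words * 4 / 3. No tokenizer needed.
fn estimate_tokens(text: &str) -> usize {
    text.split_whitespace().count() * 4 / 3
}

/// Exact count: encode the text and measure the length of the resulting token sequence.
fn count_tokens(tokenizer: &Default, text: &str) -> usize {
    tokenizer.encode(text).len()
}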