// Build script: decodes the bundled gzip-compressed tiktoken dictionaries, builds a
// `BytePairEncoding` from each of them, and writes the serialized result to
// `$OUT_DIR/bpe_<name>.dict` using `rmp_serde`.

use std::env;
use std::fs::File;
use std::io::{BufWriter, Read, Write};
use std::path::PathBuf;

use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
use serde::Serialize;

/// Hash factor handed to `BytePairEncoding::from_dictionary` for both bundled dictionaries.
// NOTE(review): presumably precomputed for these dictionaries — confirm against the `bpe`
// crate documentation before changing it.
const HASH_FACTOR: u64 = 17846336922010275747;

fn main() {
    serialize_tiktoken_bpe(
        "cl100k_base",
        include_bytes!("data/cl100k_base.tiktoken.gz"),
        HASH_FACTOR,
    );
    serialize_tiktoken_bpe(
        "o200k_base",
        include_bytes!("data/o200k_base.tiktoken.gz"),
        HASH_FACTOR,
    );
    println!("cargo::rerun-if-changed=build.rs");
}

/// Builds a `BytePairEncoding` from a gzip-compressed tiktoken dictionary and serializes it
/// to `$OUT_DIR/bpe_{name}.dict`.
///
/// * `name` - dictionary name; only used to derive the output file name.
/// * `data` - gzip-compressed tiktoken dictionary text.
/// * `hash_factor` - forwarded as `Some(hash_factor)` to `BytePairEncoding::from_dictionary`.
///
/// # Panics
///
/// Panics (and thereby fails the build) if the data cannot be decompressed or parsed, if
/// `OUT_DIR` is not set, or if the output file cannot be created, written, or flushed.
fn serialize_tiktoken_bpe(name: &str, data: &[u8], hash_factor: u64) {
    // Decompress the embedded dictionary into its textual tiktoken form.
    let mut dec = flate2::read::GzDecoder::new(data);
    let mut tiktoken = String::new();
    dec.read_to_string(&mut tiktoken).expect("can decode data");
    let tokens = read_tiktoken(&tiktoken).expect("can read data");

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR is set during build");
    let path = PathBuf::from(out_dir).join(format!("bpe_{name}.dict"));
    let file = File::create(path).expect("can create output file");

    let bpe = BytePairEncoding::from_dictionary(tokens, Some(hash_factor));

    // Buffer the output: the serializer issues many small writes, which would otherwise each
    // hit the file directly.
    let mut writer = BufWriter::new(file);
    let mut serializer = rmp_serde::Serializer::new(&mut writer);
    bpe.serialize(&mut serializer)
        .expect("serialization succeeds");

    // Dropping a `BufWriter` ignores flush errors, so flush explicitly to surface them.
    writer.flush().expect("can flush output file");
}