//! This is a just awful, awful implementation of a markov chain generator, but //! it serves the purpose, which is to use ngrams to generate subsequences of words //! in a corpus extern crate ngrams; extern crate rand; extern crate flate2; extern crate curl; use ngrams::Ngrams; use std::collections::HashMap; use std::io::{Cursor, BufRead, BufReader}; use flate2::read::GzDecoder; use curl::easy::Easy; use rand::distributions::{Weighted, WeightedChoice, IndependentSample}; #[derive(Debug, Clone, PartialEq)] struct Markov { data: HashMap>, } fn extract_sentence(s: String) -> String { s.split('\t').nth(1).unwrap().into() } fn tokenize_sentence(sentence: String) -> Vec { let tokens: Vec = sentence.split(|c| { match c { '"' | ',' | ';' | ':' => true, a if a.is_whitespace() => true, _ => false } }).filter(|a| a.len() > 0).map(|a| a.trim().to_owned()).collect(); tokens } impl Markov { fn new(url: &'static str) -> Markov { let mut map: HashMap> = HashMap::new(); let mut handle = Easy::new(); let mut data = Vec::new(); handle.url(url).unwrap(); { let mut transfer = handle.transfer(); transfer.write_function(|new_data| { data.extend_from_slice(new_data); Ok(new_data.len()) }).unwrap(); transfer.perform().unwrap(); } let file = BufReader::new( GzDecoder::new(Cursor::new(data) ).unwrap()); println!("extracting sentences..."); let sentences = file.lines().map(|a| a.unwrap()).map(extract_sentence).map(tokenize_sentence); print!("Building map of ngrams..."); for tokenized in sentences { let grams = Ngrams::new(tokenized.into_iter(), 2).pad(); for gram in grams { let first = gram[0].clone(); let second = gram[1].clone(); let entry = map.entry(first).or_insert(HashMap::new()); let entry = entry.entry(second).or_insert(0); *entry += 1; } } println!("Done!"); Markov { data: map } } fn random_word(&self, s: &str) -> String { match self.data.get(s) { Some(h) => { let mut choices = vec![]; for (word, count) in h { choices.push(Weighted { weight: *count, item: word }); } let wc = WeightedChoice::new(&mut choices); let mut rng = rand::thread_rng(); wc.ind_sample(&mut rng).clone() }, None => "whoops...".to_owned() } } fn sentence_generator(&self) -> SentenceGenerator { SentenceGenerator { markov: self.clone(), state: "\u{2060}".to_owned(), } } } struct SentenceGenerator { markov: Markov, state: String, } impl Iterator for SentenceGenerator { type Item = String; fn next(&mut self) -> Option { if self.state.ends_with('.') { return None; } self.state = self.markov.random_word(&self.state); Some(self.state.clone()) } } fn main() { let url = "https://gitlab.com/pwoolcoc/ngrams/raw/master/examples/eng_news_2005_1M-sentences.gz"; println!("Generating markov chain from input data\n\n\t{}\n\nThis is gonna take a while...", url); let chain = Markov::new(url); for _ in 0..10 { println!("{:?}", chain.sentence_generator().collect::>().join(" ")); } }