use colored::*;
use rayon::prelude::*;
use std::time::Instant;
//use std::alloc;
//use cap::Cap;

//#[global_allocator]
//static ALLOCATOR: Cap<alloc::System> = Cap::new(alloc::System, usize::max_value());

fn main() {
    //ALLOCATOR.set_limit(60 * 1024 * 1024).unwrap();
    let arguments: Vec<String> = std::env::args().collect();

    let help_documentation = "
------------
--help: Print this help message
-f: The file pithy will read from. Required.
--sentences: The number of sentences for pithy to return. Defaults to 3.
--approximate: Will return a decent approximation of the summary. Good for extremely long texts where you don't care about precision.
--context: A second file to use as context for the summary. Experimental. May not actually affect the final results, but can improve biased summaries.
--bias: Slash-separated (i.e. \"/\") list of words to bias the summary towards. Very experimental; try lots of synonyms. If you are using pithy on a large text, increase chunk_size to 2500-5000 to get relevant results. Note that this doesn't work in approximate mode.
--bias_strength: The strength of the bias. Defaults to 6.
--by_section: If set, pithy splits the text into sections and summarises each section separately. Defaults to false.
--chunk_size: The number of sentences to read at a time. Defaults to 500 if unspecified.
--force_all: If set, pithy reads the text all at once. Can be quite slow once you go past the 7k mark. Defaults to false.
--force_chunk: If set, pithy splits the text into chunks regardless of how large it is. Should be used in combination with chunk_size and by_section.
--ngrams: If set, pithy uses ngrams rather than words. It's usually crap, but you might use it as a last resort for non-spaced languages that you can't pre-tokenise. Defaults to false.
--min_length: The minimum sentence length before filtering. Defaults to 50.
--max_length: The maximum sentence length before filtering. Defaults to 1500.
--separator: The separator used to split the text into sentences. Defaults to '.'. You can type newline to separate by newlines.
--clean_whitespace: If set, removes sentences with excessive whitespace. Useful for pdfs and copy-pastes from websites.
--clean_nonalphabetic: If set, removes sentences with too many non-alphabetic characters.
--clean_caps: If set, removes sentences with too many capital letters. Useful if the text contains a lot of references or indices.
--length_penalty: The length penalty. Defaults to 0.6. Decrease to favour longer sentences, increase to favour shorter ones.
--density: Experimental setting. Defaults to 3. Setting it lower seems to bias pithy's summaries towards more common words; setting it higher seems to bias them towards rarer but more informative words.
--no_context: If set, the context surrounding each sentence isn't printed. Defaults to false.
--relevance: If set, the sentences are sorted by their relevance rather than their order in the original text. Defaults to false.
--nobar: If set, the progress bar is not printed. Defaults to false because progress bars are cool.
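    //Flag parsing below follows one convention: a flag's value is the argument
    //immediately after it. A hypothetical helper capturing the pattern (shown
    //for clarity only, not used below):
    //
    //    fn arg_value<'a>(args: &'a [String], flag: &str) -> Option<&'a String> {
    //        args.iter().position(|x| x == flag).and_then(|i| args.get(i + 1))
    //    }
    //
    //e.g. for ["pithy", "-f", "book.txt"], arg_value(&arguments, "-f") would be Some("book.txt").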
------------
pithy 0.1.0 - an absurdly fast, strangely accurate, summariser
------------
Quick example: pithy -f your_file_here.txt --sentences 4
";

    if arguments.contains(&"--help".to_string()) || arguments.len() == 1 {
        println!("{}", help_documentation);
        return;
    }
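    let by_section = arguments.contains(&"--by_section".to_string());
    let approximate = arguments.contains(&"--approximate".to_string());

    let chunk_size = if arguments.contains(&"--chunk_size".to_string()) {
        Some(
            arguments
                .get(arguments.iter().position(|x| x == "--chunk_size").unwrap() + 1)
                .expect("No chunk size provided")
                .parse::<usize>()
                .unwrap(),
        )
    } else {
        None
    };

    let second_file = if arguments.contains(&"--context".to_string()) {
        Some(
            arguments
                .get(arguments.iter().position(|x| x == "--context").unwrap() + 1)
                .expect("No context file provided"),
        )
    } else {
        None
    };

    let bias_list = if arguments.contains(&"--bias".to_string()) {
        Some(
            arguments
                .get(arguments.iter().position(|x| x == "--bias").unwrap() + 1)
                .expect("No bias list provided")
                .split('/')
                .map(|x| x.to_string())
                .collect::<Vec<String>>(),
        )
    } else {
        None
    };

    let bias_strength = if arguments.contains(&"--bias_strength".to_string()) {
        Some(
            arguments
                .get(
                    arguments
                        .iter()
                        .position(|x| x == "--bias_strength")
                        .unwrap()
                        + 1,
                )
                .expect("No bias strength provided")
                .parse::<f32>()
                .unwrap(),
        )
    } else {
        Some(6.0)
    };

    let filename = arguments
        .get(arguments.iter().position(|x| x == "-f").unwrap() + 1)
        .expect("No filename provided");

    let number_of_sentences_to_return = if arguments.contains(&"--sentences".to_string()) {
        arguments
            .get(arguments.iter().position(|x| x == "--sentences").unwrap() + 1)
            .expect("No number of sentences provided")
            .parse::<usize>()
            .unwrap()
    } else {
        3
    };

    let force_all = arguments.contains(&"--force_all".to_string());
    let force_chunk = arguments.contains(&"--force_chunk".to_string());

    //"." is the default separator; passing the literal word "newline" selects "\n".
    let separator = if arguments.contains(&"--separator".to_string()) {
        let arg = arguments
            .get(arguments.iter().position(|x| x == "--separator").unwrap() + 1)
            .expect("No separator provided");
        if arg == "newline" {
            "\n"
        } else {
            arg.as_str()
        }
    } else {
        "."
    };

    let ngrams = arguments.contains(&"--ngrams".to_string());

    let min_length = if arguments.contains(&"--min_length".to_string()) {
        arguments
            .get(arguments.iter().position(|x| x == "--min_length").unwrap() + 1)
            .expect("No minimum length provided")
            .parse::<usize>()
            .unwrap()
    } else {
        50
    };

    let max_length = if arguments.contains(&"--max_length".to_string()) {
        arguments
            .get(arguments.iter().position(|x| x == "--max_length").unwrap() + 1)
            .expect("No maximum length provided")
            .parse::<usize>()
            .unwrap()
    } else {
        1500
    };

    let relevance = arguments.contains(&"--relevance".to_string());
    let no_context = arguments.contains(&"--no_context".to_string());
    let clean_whitespace = arguments.contains(&"--clean_whitespace".to_string());
    let clean_nonalphabetic = arguments.contains(&"--clean_nonalphabetic".to_string());
    let clean_caps = arguments.contains(&"--clean_caps".to_string());

    let length_penalty = if arguments.contains(&"--length_penalty".to_string()) {
        arguments
            .get(
                arguments
                    .iter()
                    .position(|x| x == "--length_penalty")
                    .unwrap()
                    + 1,
            )
            .expect("No length penalty provided")
            .parse::<f32>()
            .unwrap()
    } else {
        0.6
    };

    let density = if arguments.contains(&"--density".to_string()) {
        arguments
            .get(arguments.iter().position(|x| x == "--density").unwrap() + 1)
            .expect("No density provided")
            .parse::<f32>()
            .unwrap()
    } else {
        3.0
    };

    //true when --nobar is passed; note that the library call below takes the opposite (show-bar) flag.
    let no_bar = arguments.contains(&"--nobar".to_string());

    //For instance, with the flags documented in the help text above, a call like
    //  pithy -f moby_dick.txt --sentences 5 --bias whale/ocean --chunk_size 2500
    //parses to number_of_sentences_to_return = 5, bias_list = Some(["whale", "ocean"])
    //and chunk_size = Some(2500); everything else keeps its default.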
    let now = Instant::now();
    let raw_text = std::fs::read_to_string(filename).expect("Could not open the file");

    //api: add_raw_text(name: String, raw_text: String, separator: &str, min_length: usize, max_length: usize, ngrams: bool)
    let mut summariser = pithy::Summariser::new();
    summariser.add_raw_text(
        filename.clone(),
        raw_text,
        separator,
        min_length,
        max_length,
        ngrams,
    );

    if let Some(context_file) = second_file {
        let raw_text =
            std::fs::read_to_string(context_file).expect("Could not open the context file");
        summariser.add_raw_text(
            context_file.clone(),
            raw_text,
            separator,
            min_length,
            max_length,
            ngrams,
        );
    }

    //api: clean_sentences(excessive_whitespace: bool, excessive_punctuation_and_nums: bool, excessive_caps: bool)
    if clean_whitespace || clean_nonalphabetic || clean_caps {
        summariser.clean_sentences(clean_whitespace, clean_nonalphabetic, clean_caps);
    }

    //api: top_sentences(number_of_sentences_to_return: usize, return_summaries_for_each: bool, chunk_size: Option<usize>, force_sum_all: bool, length_penalty: f32, ...)
    let mut summary = if !approximate {
        summariser.top_sentences(
            number_of_sentences_to_return,
            by_section,
            chunk_size,
            force_all,
            length_penalty,
            force_chunk,
            density,
            bias_list,
            bias_strength,
            !no_bar,
            filename.clone(),
        )
    } else {
        summariser.approximate_top_sentences(
            number_of_sentences_to_return,
            density,
            length_penalty,
        )
    };

    if !no_bar {
        println!("Summarising took {} seconds", now.elapsed().as_secs_f32());
    }

    //Unless --relevance is set, restore the summary sentences to their original order in the text.
    if !relevance {
        summary.par_sort_unstable_by(|a, b| a.index.partial_cmp(&b.index).unwrap());
    }

    //Pretty-print the summary, retrieving each sentence's neighbours by index.
    let mut sentence_indices = summariser.sentences.keys().cloned().collect::<Vec<_>>();
    sentence_indices.sort_unstable();
    //summariser.semirandom_walk(summary[0].index, 5);
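    //Sentence indices can have gaps, because length filtering and the clean_*
    //options remove sentences. A summary sentence's context is therefore the
    //closest surviving index on either side: e.g. if the surviving indices are
    //[2, 5, 9], the context of sentence 5 is sentences 2 and 9.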
    for sentence in &summary {
        let index_number = sentence.index;
        if !no_context {
            //There might be missing indices, so the previous sentence is the one
            //with the closest surviving index below this sentence's index, and
            //the next sentence the one with the closest surviving index above it.
            let previous_sentence = sentence_indices
                .iter()
                .filter(|x| **x < index_number)
                .last()
                .and_then(|idx| summariser.sentences.get(idx))
                .map(|s| s.text.clone())
                .unwrap_or_default();
            let next_sentence = sentence_indices
                .iter()
                .find(|x| **x > index_number)
                .and_then(|idx| summariser.sentences.get(idx))
                .map(|s| s.text.clone())
                .unwrap_or_default();
            print!(
                "\n{}\n{}{}{}{}{}{}",
                sentence.index,
                separator,
                previous_sentence.italic(),
                separator,
                sentence.text.bold().red(),
                separator,
                next_sentence.italic()
            );
        } else {
            println!(
                "{}",
                sentence.index.to_string().underline().italic().magenta()
            );
            println!("{}", sentence.text.bold().cyan());
            println!();
        }
    }
    println!();

    //If the bar is turned off, concatenate sentence.text and write it to stdout,
    //so that pithy can be used in pipes:
    //use std::io::Write;
    //if no_bar {
    //    let stdout = std::io::stdout();
    //    let lock = stdout.lock();
    //    let mut w = std::io::BufWriter::new(lock);
    //    let mut output = String::new();
    //    for sentence in summary {
    //        output.push_str(&sentence.text);
    //        output.push_str(separator);
    //    }
    //    w.write_all(output.as_bytes()).unwrap();
    //}
}
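//Hypothetical shell usage once the pipe-friendly block above is enabled:
//  pithy -f notes.txt --sentences 3 --nobar --no_context | less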