use fnv::{FnvBuildHasher, FnvHashSet};
use indexmap::IndexMap;
use itertools::Itertools;
use num::ToPrimitive;
use std::{cmp::Ordering, collections::BTreeMap, num::NonZeroU64};
use tangram_finite::Finite;
use tangram_progress_counter::ProgressCounter;
use tangram_table::prelude::*;
use tangram_text::{NGram, NGramType, Tokenizer};
use tangram_zip::zip;

/// This struct contains settings used to compute stats.
#[derive(Clone, Debug)]
pub struct StatsSettings {
	/// This is the maximum number of unique numeric values to store in the histogram.
	pub number_histogram_max_size: usize,
	/// This is the maximum number of ngrams to track for text columns.
	pub ngrams_max_count: usize,
	/// This setting specifies which ngram types should be computed.
	pub ngram_types: FnvHashSet<NGramType>,
}

impl Default for StatsSettings {
	fn default() -> StatsSettings {
		StatsSettings {
			number_histogram_max_size: 100,
			ngrams_max_count: 20_000,
			ngram_types: vec![NGramType::Unigram, NGramType::Bigram]
				.into_iter()
				.collect(),
		}
	}
}

#[derive(Clone, Debug)]
pub struct TextColumnStatsSettings {
	pub tokenizer: Tokenizer,
}

/// This struct holds column stats.
#[derive(Clone, Debug)]
pub struct Stats(pub Vec<ColumnStats>);

/// This is an enum describing the different types of stats where the type matches the type of the source column.
#[derive(Clone, Debug)]
pub enum ColumnStats {
	Unknown(UnknownColumnStats),
	Number(NumberColumnStats),
	Enum(EnumColumnStats),
	Text(TextColumnStats),
}

/// This struct contains stats for unknown columns.
#[derive(Clone, Debug)]
pub struct UnknownColumnStats {
	/// This is the name of the column.
	pub column_name: String,
	pub count: usize,
	pub invalid_count: usize,
}

/// This struct contains stats for number columns.
#[derive(Clone, Debug)]
pub struct NumberColumnStats {
	/// This is the name of the column.
	pub column_name: String,
	/// This is the total number of values.
	pub count: usize,
	/// This is the total number of valid values.
	pub valid_count: usize,
	/// This is the total number of invalid values. Invalid values are values that fail to parse as finite f32.
	pub invalid_count: usize,
	/// This stores counts for each unique value.
	pub histogram: BTreeMap<Finite<f32>, usize>,
}

/// This struct contains stats for enum columns.
#[derive(Clone, Debug)]
pub struct EnumColumnStats {
	/// This is the name of the column.
	pub column_name: String,
	/// This is the total number of values.
	pub count: usize,
	/// These are the enum variants.
	pub variants: Vec<String>,
	/// This is the total number of valid values.
	pub valid_count: usize,
	/// This is the total number of invalid values.
	pub invalid_count: usize,
	/// This is the histogram.
	pub histogram: Vec<usize>,
}

/// This struct contains stats for text columns.
#[derive(Clone, Debug)]
pub struct TextColumnStats {
	/// This is the name of the column.
	pub column_name: String,
	/// This is the total number of values.
	pub row_count: usize,
	/// This is the tokenizer used to split the text into tokens.
	pub tokenizer: Tokenizer,
	/// These are the types of ngrams to collect.
	pub ngram_types: FnvHashSet<NGramType>,
	/// These are stats collected on ngrams.
	pub ngrams: IndexMap<NGram, TextColumnStatsNGramEntry, FnvBuildHasher>,
}

#[derive(Clone, Debug, Default)]
pub struct TextColumnStatsNGramEntry {
	pub row_count: usize,
	pub occurrence_count: usize,
}

pub struct StatsOutput(pub Vec<ColumnStatsOutput>);

/// This enum describes the different types of column stats.
#[derive(Debug)]
pub enum ColumnStatsOutput {
	Unknown(UnknownColumnStatsOutput),
	Number(NumberColumnStatsOutput),
	Enum(EnumColumnStatsOutput),
	Text(TextColumnStatsOutput),
}

impl ColumnStatsOutput {
	/// Return the name of the source column.
	pub fn column_name(&self) -> &str {
		match self {
			ColumnStatsOutput::Unknown(value) => &value.column_name,
			ColumnStatsOutput::Number(value) => &value.column_name,
			ColumnStatsOutput::Enum(value) => &value.column_name,
			ColumnStatsOutput::Text(value) => &value.column_name,
		}
	}
}

/// This struct contains stats for unknown columns.
#[derive(Debug)]
pub struct UnknownColumnStatsOutput {
	/// This is the name of the column as it appears in the csv.
	pub column_name: String,
	/// This is the total number of examples that these stats were computed on.
	pub count: usize,
}

/// This struct contains stats for number columns.
#[derive(Debug)]
pub struct NumberColumnStatsOutput {
	/// This is the name of the column as it appears in the csv.
	pub column_name: String,
	/// This is the total number of examples that these stats were computed on.
	pub count: usize,
	/// This is a histogram mapping unique values to their counts. It is `None` if the number of unique values exceeds [`number_histogram_max_size`](StatsSettings#number_histogram_max_size).
	pub histogram: Option<Vec<(Finite<f32>, usize)>>,
	/// This is the total number of unique values.
	pub unique_count: usize,
	/// This is the max of the values in the column.
	pub max: f32,
	/// This is the mean of the values in the column.
	pub mean: f32,
	/// This is the min of the values in the column.
	pub min: f32,
	/// This is the total number of invalid values. Invalid values are values that fail to parse as floating point numbers.
	pub invalid_count: usize,
	/// This is the variance of the values in the column.
	pub variance: f32,
	/// This is the standard deviation of the values in the column. It is equal to the square root of the variance.
	pub std: f32,
	/// This is the p25, or 25th-percentile value in the column.
	pub p25: f32,
	/// This is the p50, or 50th-percentile value in the column, i.e. the median.
	pub p50: f32,
	/// This is the p75, or 75th-percentile value in the column.
	pub p75: f32,
}

/// This struct contains stats for enum columns.
#[derive(Debug)]
pub struct EnumColumnStatsOutput {
	/// This is the name of the column as it appears in the csv.
	pub column_name: String,
	/// This is the total number of examples that these stats were computed on.
	pub count: u64,
	/// This is a histogram mapping unique variants of the enum to the total count of occurrences of the variant in the dataset.
	pub histogram: Vec<(String, usize)>,
	/// This is the total number of values in the dataset that are invalid. A value is invalid if it is not one of the enum's variants.
	pub invalid_count: usize,
	/// This is the total number of unique values, excluding invalid values.
	pub unique_count: usize,
}

/// This struct contains stats for text columns.
#[derive(Debug)]
pub struct TextColumnStatsOutput {
	/// This is the name of the column.
	pub column_name: String,
	/// This is the number of rows that these stats were computed for.
	pub row_count: u64,
	/// This is the tokenizer that was used to separate values into tokens.
	pub tokenizer: Tokenizer,
	/// This is the list of ngram types.
	pub ngram_types: FnvHashSet<NGramType>,
	/// This is the number of unique ngrams encountered.
	pub ngrams_count: usize,
	/// This contains stats for up to `stats_settings.ngrams_max_count` ngrams with the highest `entry.row_count`s.
	pub top_ngrams: IndexMap<NGram, TextColumnStatsOutputTopNGramsEntry, FnvBuildHasher>,
}

/// This struct contains stats for individual ngrams.
#[derive(Clone, Debug)]
pub struct TextColumnStatsOutputTopNGramsEntry {
	/// This is the number of rows that contain at least one occurrence of this ngram.
	pub row_count: usize,
	/// This is the number of occurrences of this ngram across all rows.
	pub occurrence_count: usize,
	/// This is the inverse document frequency of this ngram. [Learn more](https://en.wikipedia.org/wiki/Tf%E2%80%93idf).
	pub idf: f32,
}

impl Stats {
	pub fn compute(
		table: &TableView,
		settings: &StatsSettings,
		handle_progress_event: &mut dyn FnMut(ProgressCounter),
	) -> Stats {
		let progress_total = table.ncols() as u64 * table.nrows() as u64;
		let progress_counter = ProgressCounter::new(progress_total);
		handle_progress_event(progress_counter.clone());
		let progress = &|progress| progress_counter.inc(progress);
		let column_stats = table
			.columns()
			.iter()
			.map(|column| ColumnStats::compute(column.view(), settings, progress))
			.collect();
		Stats(column_stats)
	}

	pub fn merge(self, other: Stats) -> Stats {
		let column_stats: Vec<ColumnStats> =
			zip!(self.0, other.0).map(|(a, b)| a.merge(b)).collect();
		Stats(column_stats)
	}

	pub fn finalize(self, settings: &StatsSettings) -> StatsOutput {
		let column_stats = self
			.0
			.into_iter()
			.map(|column_stats| column_stats.finalize(settings))
			.collect();
		StatsOutput(column_stats)
	}
}

impl ColumnStats {
	fn compute(
		column: TableColumnView,
		settings: &StatsSettings,
		progress: impl Fn(u64),
	) -> ColumnStats {
		match column {
			TableColumnView::Unknown(column) => {
				progress(column.len() as u64);
				ColumnStats::Unknown(UnknownColumnStats {
					column_name: column.name().unwrap().to_owned(),
					count: column.len(),
					invalid_count: column.len(),
				})
			}
			TableColumnView::Number(column) => ColumnStats::Number(NumberColumnStats::compute(
				column.view(),
				settings,
				progress,
			)),
			TableColumnView::Enum(column) => {
				ColumnStats::Enum(EnumColumnStats::compute(column, settings, progress))
			}
			TableColumnView::Text(column) => {
				ColumnStats::Text(TextColumnStats::compute(column, settings, progress))
			}
		}
	}

	fn merge(self, other: ColumnStats) -> ColumnStats {
		match (self, other) {
			(ColumnStats::Unknown(a), ColumnStats::Unknown(b)) => {
				ColumnStats::Unknown(UnknownColumnStats {
					column_name: a.column_name.clone(),
					count: a.count + b.count,
					invalid_count: a.invalid_count + b.invalid_count,
				})
			}
			(ColumnStats::Number(a), ColumnStats::Number(b)) => ColumnStats::Number(a.merge(b)),
			(ColumnStats::Enum(a), ColumnStats::Enum(b)) => ColumnStats::Enum(a.merge(b)),
			(ColumnStats::Text(a), ColumnStats::Text(b)) => ColumnStats::Text(a.merge(b)),
			_ => unreachable!(),
		}
	}

	fn finalize(self, settings: &StatsSettings) -> ColumnStatsOutput {
		match self {
			ColumnStats::Unknown(column_stats_output) => {
				ColumnStatsOutput::Unknown(UnknownColumnStatsOutput {
					column_name: column_stats_output.column_name,
					count: column_stats_output.count,
				})
			}
			ColumnStats::Number(column_stats_output) => {
				ColumnStatsOutput::Number(column_stats_output.finalize(settings))
			}
			ColumnStats::Enum(column_stats_output) => {
				ColumnStatsOutput::Enum(column_stats_output.finalize(settings))
			}
			ColumnStats::Text(column_stats_output) => {
				ColumnStatsOutput::Text(column_stats_output.finalize(settings))
			}
		}
	}
}

impl NumberColumnStats {
	fn compute(
		column: NumberTableColumnView,
		_settings: &StatsSettings,
		progress: impl Fn(u64),
	) -> NumberColumnStats {
		let mut stats = NumberColumnStats {
			column_name: column.name().unwrap().to_owned(),
			count: column.len(),
			histogram: BTreeMap::new(),
			invalid_count: 0,
			valid_count: 0,
		};
		for value in column.iter() {
			// If the value parses as a finite f32, add it to the histogram. Otherwise, increment the invalid count.
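			// Note: `Finite::new` is assumed to return `Err` for NaN and ±infinity, so those
			// are the values that end up in `invalid_count`.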
			if let Ok(value) = <Finite<f32>>::new(*value) {
				*stats.histogram.entry(value).or_insert(0) += 1;
				stats.valid_count += 1;
			} else {
				stats.invalid_count += 1;
			}
			progress(1);
		}
		stats
	}

	fn merge(mut self, other: NumberColumnStats) -> NumberColumnStats {
		for (value, count) in other.histogram.iter() {
			*self.histogram.entry(*value).or_insert(0) += count;
		}
		self.count += other.count;
		self.invalid_count += other.invalid_count;
		self.valid_count += other.valid_count;
		self
	}

	fn finalize(self, settings: &StatsSettings) -> NumberColumnStatsOutput {
		let unique_values_count = self.histogram.len();
		let invalid_count = self.invalid_count;
		let histogram = if self.histogram.len() <= settings.number_histogram_max_size {
			Some(self.histogram.iter().map(|(k, v)| (*k, *v)).collect())
		} else {
			None
		};
		let min = self.histogram.iter().next().unwrap().0.get();
		let max = self.histogram.iter().next_back().unwrap().0.get();
		let total_values_count = self.valid_count.to_f32().unwrap();
		let quantiles: Vec<f32> = vec![0.25, 0.50, 0.75];
		// Find the index of each quantile given the total number of values in the dataset.
		let quantile_indexes: Vec<usize> = quantiles
			.iter()
			.map(|q| ((total_values_count - 1.0) * q).trunc().to_usize().unwrap())
			.collect();
		// This is the fractional part of the index used to interpolate values if the index is not an integer value.
		let quantile_fracts: Vec<f32> = quantiles
			.iter()
			.map(|q| ((total_values_count - 1.0) * q).fract())
			.collect();
		let mut quantiles: Vec<Option<f32>> = vec![None; quantiles.len()];
		let mut current_count: usize = 0;
		let mut mean = 0.0;
		let mut m2 = 0.0;
		let mut iter = self.histogram.iter().peekable();
		while let Some((value, count)) = iter.next() {
			let value = value.get();
			let (new_mean, new_m2) = tangram_metrics::merge_mean_m2(
				current_count.to_u64().unwrap(),
				mean,
				m2,
				count.to_u64().unwrap(),
				value as f64,
				0.0,
			);
			mean = new_mean;
			m2 = new_m2;
			current_count += count;
			let quantiles_iter = zip!(
				quantiles.iter_mut(),
				quantile_indexes.iter(),
				quantile_fracts.iter(),
			)
			.filter(|(quantile, _, _)| quantile.is_none());
			for (quantile, index, fract) in quantiles_iter {
				match (current_count - 1).cmp(index) {
					Ordering::Equal => {
						if *fract > 0.0 {
							// Interpolate between two values.
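							// The quantile falls between two adjacent unique values, so take
							// the linearly interpolated value:
							// value * (1 - fract) + next_value * fract.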
							let next_value = iter.peek().unwrap().0.get();
							*quantile = Some(value * (1.0 - fract) + next_value * fract);
						} else {
							*quantile = Some(value);
						}
					}
					Ordering::Greater => *quantile = Some(value),
					Ordering::Less => {}
				}
			}
		}
		let quantiles: Vec<f32> = quantiles.into_iter().map(|q| q.unwrap()).collect();
		let p25 = quantiles[0];
		let p50 = quantiles[1];
		let p75 = quantiles[2];
		let mean = mean.to_f32().unwrap();
		let variance = tangram_metrics::m2_to_variance(
			m2,
			NonZeroU64::new(current_count.to_u64().unwrap()).unwrap(),
		);
		NumberColumnStatsOutput {
			column_name: self.column_name,
			count: self.count,
			histogram,
			unique_count: unique_values_count,
			max,
			mean,
			min,
			invalid_count,
			variance,
			std: variance.sqrt(),
			p25,
			p50,
			p75,
		}
	}
}

impl EnumColumnStats {
	fn compute(
		column: EnumTableColumnView,
		_settings: &StatsSettings,
		progress: impl Fn(u64),
	) -> EnumColumnStats {
		let mut histogram = vec![0; column.variants().len() + 1];
		for value in column.iter() {
			let index = value.map(|v| v.get()).unwrap_or(0);
			histogram[index] += 1;
			progress(1);
		}
		let invalid_count = histogram[0];
		EnumColumnStats {
			column_name: column.name().unwrap().to_owned(),
			count: column.len(),
			variants: column.variants().to_owned(),
			histogram,
			invalid_count,
			valid_count: 0,
		}
	}

	fn merge(mut self, other: EnumColumnStats) -> EnumColumnStats {
		for (a, b) in zip!(self.histogram.iter_mut(), other.histogram.iter()) {
			*a += b;
		}
		self.count += other.count;
		self.invalid_count += other.invalid_count;
		self.valid_count += other.valid_count;
		self
	}

	fn finalize(self, _settings: &StatsSettings) -> EnumColumnStatsOutput {
		EnumColumnStatsOutput {
			column_name: self.column_name,
			count: self.count.to_u64().unwrap(),
			invalid_count: self.invalid_count,
			unique_count: self.variants.len(),
			histogram: zip!(self.variants, self.histogram.into_iter().skip(1))
				.map(|(value, count)| (value, count))
				.collect(),
		}
	}
}

#[derive(Clone, Debug, Eq)]
struct TokenEntry(pub NGram, pub usize);

impl std::cmp::Ord for TokenEntry {
	fn cmp(&self, other: &Self) -> std::cmp::Ordering {
		self.1.cmp(&other.1)
	}
}

impl std::cmp::PartialOrd for TokenEntry {
	fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
		self.1.partial_cmp(&other.1)
	}
}

impl std::cmp::PartialEq for TokenEntry {
	fn eq(&self, other: &Self) -> bool {
		self.1.eq(&other.1)
	}
}

impl TextColumnStats {
	fn compute(
		column: TextTableColumnView,
		settings: &StatsSettings,
		progress: impl Fn(u64),
	) -> TextColumnStats {
		let tokenizer = Tokenizer::default();
		let mut stats = TextColumnStats {
			column_name: column.name().unwrap().to_owned(),
			row_count: column.len(),
			tokenizer,
			ngrams: IndexMap::default(),
			ngram_types: settings.ngram_types.to_owned(),
		};
		let mut ngrams_for_row = FnvHashSet::default();
		for value in column.iter() {
			ngrams_for_row.clear();
			let unigram_iter = if stats.ngram_types.contains(&NGramType::Unigram) {
				Some(
					stats
						.tokenizer
						.tokenize(value)
						.map(tangram_text::NGramRef::Unigram),
				)
			} else {
				None
			};
			let bigram_iter = if stats.ngram_types.contains(&NGramType::Bigram) {
				Some(
					stats.tokenizer.tokenize(value).tuple_windows().map(
						|(token_a, token_b)| tangram_text::NGramRef::Bigram(token_a, token_b),
					),
				)
			} else {
				None
			};
			let ngram_iter = unigram_iter
				.into_iter()
				.flatten()
				.chain(bigram_iter.into_iter().flatten());
			for ngram in ngram_iter {
				if let Some(entry) = stats.ngrams.get_mut(&ngram) {
					entry.occurrence_count += 1;
				} else {
					let ngram = ngram.to_ngram();
					let entry = TextColumnStatsNGramEntry {
						row_count: 0,
						occurrence_count: 1,
					};
					stats.ngrams.insert(ngram, entry);
				}
				ngrams_for_row.insert(ngram);
			}
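			// `ngrams_for_row` holds each ngram at most once, so a row contributes at most
			// one to an ngram's `row_count`, while `occurrence_count` counts every occurrence.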
			for ngram in ngrams_for_row.iter() {
				stats.ngrams.get_mut(ngram).unwrap().row_count += 1;
			}
			progress(1);
		}
		stats
	}

	fn merge(mut self, other: TextColumnStats) -> TextColumnStats {
		self.row_count += other.row_count;
		for (other_ngram, other_entry) in other.ngrams.into_iter() {
			if let Some(entry) = self.ngrams.get_mut(&other_ngram) {
				entry.row_count += other_entry.row_count;
				entry.occurrence_count += other_entry.occurrence_count;
			} else {
				self.ngrams.insert(other_ngram, other_entry);
			}
		}
		self
	}

	fn finalize(mut self, settings: &StatsSettings) -> TextColumnStatsOutput {
		self.ngrams
			.sort_by(|_, entry_a, _, entry_b| entry_a.row_count.cmp(&entry_b.row_count));
		let row_count = self.row_count.to_f32().unwrap();
		let ngrams_count = self.ngrams.len();
		let ngrams: IndexMap<NGram, TextColumnStatsOutputTopNGramsEntry, FnvBuildHasher> = self
			.ngrams
			.into_iter()
			.rev()
			.take(settings.ngrams_max_count)
			.map(|(ngram, entry)| {
				let entry_row_count = entry.row_count.to_f32().unwrap();
				// Smoothed inverse document frequency: ln((1 + n_rows) / (1 + row_count)) + 1.
				let idf = ((1.0 + row_count) / (1.0 + entry_row_count)).ln() + 1.0;
				let entry = TextColumnStatsOutputTopNGramsEntry {
					idf,
					occurrence_count: entry.occurrence_count,
					row_count: entry.row_count,
				};
				(ngram, entry)
			})
			.collect();
		TextColumnStatsOutput {
			column_name: self.column_name,
			tokenizer: self.tokenizer,
			row_count: self.row_count.to_u64().unwrap(),
			ngram_types: settings.ngram_types.clone(),
			ngrams_count,
			top_ngrams: ngrams,
		}
	}
}
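
// A minimal, self-contained sketch of the arithmetic used above: the smoothed idf from
// `TextColumnStats::finalize` and the linear interpolation behind the percentiles in
// `NumberColumnStats::finalize`. It uses only std and hypothetical numbers, so it does not
// exercise the crate's own types.
#[cfg(test)]
mod arithmetic_sketch {
	#[test]
	fn smoothed_idf_example() {
		// Hypothetical counts: 8 rows total, and the ngram appears in 2 of them.
		let row_count = 8.0f32;
		let entry_row_count = 2.0f32;
		let idf = ((1.0 + row_count) / (1.0 + entry_row_count)).ln() + 1.0;
		// ln(9 / 3) + 1 = ln(3) + 1.
		assert!((idf - (3.0f32.ln() + 1.0)).abs() < 1e-6);
	}

	#[test]
	fn percentile_interpolation_example() {
		// The p50 of these four values falls halfway between the second and third value.
		let sorted = [1.0f32, 2.0, 3.0, 4.0];
		let q = 0.50f32;
		let position = (sorted.len() as f32 - 1.0) * q;
		let index = position.trunc() as usize;
		let fract = position.fract();
		let p50 = sorted[index] * (1.0 - fract) + sorted[index + 1] * fract;
		assert!((p50 - 2.5).abs() < 1e-6);
	}
}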