#![forbid(unsafe_code)] mod read; mod write; // The dynamic representation of values in native Rust. This is not exaustive. // todo: maybe refactor this into serde/json? #[derive(Debug, PartialEq)] pub enum Array { UInt32(Vec>), Int32(Vec>), Int64(Vec>), Int96(Vec>), Float(Vec>), Double(Vec>), Boolean(Vec>), Binary(Vec>>), FixedLenBinary(Vec>>), List(Vec>), Struct(Vec, Vec), } impl Array { pub fn len(&self) -> usize { match self { Array::UInt32(a) => a.len(), Array::Int32(a) => a.len(), Array::Int64(a) => a.len(), Array::Int96(a) => a.len(), Array::Float(a) => a.len(), Array::Double(a) => a.len(), Array::Boolean(a) => a.len(), Array::Binary(a) => a.len(), Array::FixedLenBinary(a) => a.len(), Array::List(a) => a.len(), Array::Struct(a, _) => a[0].len(), } } pub fn is_empty(&self) -> bool { self.len() == 0 } } // The dynamic representation of values in native Rust. This is not exaustive. // todo: maybe refactor this into serde/json? #[derive(Debug, PartialEq)] pub enum Value { UInt32(Option), Int32(Option), Int64(Option), Int96(Option<[u32; 3]>), Float32(Option), Float64(Option), Boolean(Option), Binary(Option>), FixedLenBinary(Option>), List(Option), } use std::path::PathBuf; use std::sync::Arc; use parquet2::schema::types::PhysicalType; use parquet2::schema::types::PrimitiveType; use parquet2::statistics::*; pub fn get_path() -> PathBuf { let dir = env!("CARGO_MANIFEST_DIR"); PathBuf::from(dir).join("testing/parquet-testing/data") } pub fn alltypes_plain(column: &str) -> Array { match column { "id" => { let expected = vec![4, 5, 6, 7, 2, 3, 0, 1]; let expected = expected.into_iter().map(Some).collect::>(); Array::Int32(expected) } "id-short-array" => { let expected = vec![4]; let expected = expected.into_iter().map(Some).collect::>(); Array::Int32(expected) } "bool_col" => { let expected = vec![true, false, true, false, true, false, true, false]; let expected = expected.into_iter().map(Some).collect::>(); Array::Boolean(expected) } "tinyint_col" => { let expected = vec![0, 1, 0, 1, 0, 1, 0, 1]; let expected = expected.into_iter().map(Some).collect::>(); Array::Int32(expected) } "smallint_col" => { let expected = vec![0, 1, 0, 1, 0, 1, 0, 1]; let expected = expected.into_iter().map(Some).collect::>(); Array::Int32(expected) } "int_col" => { let expected = vec![0, 1, 0, 1, 0, 1, 0, 1]; let expected = expected.into_iter().map(Some).collect::>(); Array::Int32(expected) } "bigint_col" => { let expected = vec![0, 10, 0, 10, 0, 10, 0, 10]; let expected = expected.into_iter().map(Some).collect::>(); Array::Int64(expected) } "float_col" => { let expected = vec![0.0, 1.1, 0.0, 1.1, 0.0, 1.1, 0.0, 1.1]; let expected = expected.into_iter().map(Some).collect::>(); Array::Float(expected) } "double_col" => { let expected = vec![0.0, 10.1, 0.0, 10.1, 0.0, 10.1, 0.0, 10.1]; let expected = expected.into_iter().map(Some).collect::>(); Array::Double(expected) } "date_string_col" => { let expected = vec![ vec![48, 51, 47, 48, 49, 47, 48, 57], vec![48, 51, 47, 48, 49, 47, 48, 57], vec![48, 52, 47, 48, 49, 47, 48, 57], vec![48, 52, 47, 48, 49, 47, 48, 57], vec![48, 50, 47, 48, 49, 47, 48, 57], vec![48, 50, 47, 48, 49, 47, 48, 57], vec![48, 49, 47, 48, 49, 47, 48, 57], vec![48, 49, 47, 48, 49, 47, 48, 57], ]; let expected = expected.into_iter().map(Some).collect::>(); Array::Binary(expected) } "string_col" => { let expected = vec![ vec![48], vec![49], vec![48], vec![49], vec![48], vec![49], vec![48], vec![49], ]; let expected = expected.into_iter().map(Some).collect::>(); Array::Binary(expected) } "timestamp_col" => { todo!() } _ => unreachable!(), } } pub fn alltypes_statistics(column: &str) -> Arc { match column { "id" => Arc::new(PrimitiveStatistics:: { primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int32), null_count: Some(0), distinct_count: None, min_value: Some(0), max_value: Some(7), }), "id-short-array" => Arc::new(PrimitiveStatistics:: { primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int32), null_count: Some(0), distinct_count: None, min_value: Some(4), max_value: Some(4), }), "bool_col" => Arc::new(BooleanStatistics { null_count: Some(0), distinct_count: None, min_value: Some(false), max_value: Some(true), }), "tinyint_col" | "smallint_col" | "int_col" => Arc::new(PrimitiveStatistics:: { primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int32), null_count: Some(0), distinct_count: None, min_value: Some(0), max_value: Some(1), }), "bigint_col" => Arc::new(PrimitiveStatistics:: { primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int64), null_count: Some(0), distinct_count: None, min_value: Some(0), max_value: Some(10), }), "float_col" => Arc::new(PrimitiveStatistics:: { primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Float), null_count: Some(0), distinct_count: None, min_value: Some(0.0), max_value: Some(1.1), }), "double_col" => Arc::new(PrimitiveStatistics:: { primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Double), null_count: Some(0), distinct_count: None, min_value: Some(0.0), max_value: Some(10.1), }), "date_string_col" => Arc::new(BinaryStatistics { primitive_type: PrimitiveType::from_physical( "col".to_string(), PhysicalType::ByteArray, ), null_count: Some(0), distinct_count: None, min_value: Some(vec![48, 49, 47, 48, 49, 47, 48, 57]), max_value: Some(vec![48, 52, 47, 48, 49, 47, 48, 57]), }), "string_col" => Arc::new(BinaryStatistics { primitive_type: PrimitiveType::from_physical( "col".to_string(), PhysicalType::ByteArray, ), null_count: Some(0), distinct_count: None, min_value: Some(vec![48]), max_value: Some(vec![49]), }), "timestamp_col" => { todo!() } _ => unreachable!(), } } // these values match the values in `integration` pub fn pyarrow_optional(column: &str) -> Array { let i64_values = &[ Some(0), Some(1), None, Some(3), None, Some(5), Some(6), Some(7), None, Some(9), ]; let f64_values = &[ Some(0.0), Some(1.0), None, Some(3.0), None, Some(5.0), Some(6.0), Some(7.0), None, Some(9.0), ]; let string_values = &[ Some(b"Hello".to_vec()), None, Some(b"aa".to_vec()), Some(b"".to_vec()), None, Some(b"abc".to_vec()), None, None, Some(b"def".to_vec()), Some(b"aaa".to_vec()), ]; let bool_values = &[ Some(true), None, Some(false), Some(false), None, Some(true), None, None, Some(true), Some(true), ]; let binary_values = &[ Some(b"aa".to_vec()), None, Some(b"cc".to_vec()), Some(b"dd".to_vec()), None, Some(b"ff".to_vec()), None, None, Some(b"ii".to_vec()), Some(b"jj".to_vec()), ]; match column { "int64" => Array::Int64(i64_values.to_vec()), "float64" => Array::Double(f64_values.to_vec()), "string" => Array::Binary(string_values.to_vec()), "bool" => Array::Boolean(bool_values.to_vec()), "date" => Array::Int64(i64_values.to_vec()), "uint32" => Array::Int32(i64_values.iter().map(|i| i.map(|x| x as i32)).collect()), "fixed_binary" => Array::FixedLenBinary(binary_values.to_vec()), _ => unreachable!(), } } pub fn pyarrow_optional_stats(column: &str) -> (Option, Value, Value) { match column { "int64" => (Some(3), Value::Int64(Some(0)), Value::Int64(Some(9))), "float64" => ( Some(3), Value::Float64(Some(0.0)), Value::Float64(Some(9.0)), ), "string" => ( Some(4), Value::Binary(Some(b"".to_vec())), Value::Binary(Some(b"def".to_vec())), ), "bool" => ( Some(4), Value::Boolean(Some(false)), Value::Boolean(Some(true)), ), "date" => (Some(3), Value::Int64(Some(0)), Value::Int64(Some(9))), "fixed_binary" => ( Some(3), Value::FixedLenBinary(Some(b"aa".to_vec())), Value::FixedLenBinary(Some(b"jj".to_vec())), ), _ => unreachable!(), } } // these values match the values in `integration` pub fn pyarrow_required(column: &str) -> Array { let i64_values = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; let f64_values = &[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]; let string_values = &[ "Hello", "bbb", "aa", "", "bbb", "abc", "bbb", "bbb", "def", "aaa", ]; let bool_values = &[ true, true, false, false, false, true, true, true, true, true, ]; let binary_values = &["aa", "bb", "cc", "dd", "ee", "ff", "gg", "hh", "ii", "jj"]; match column { "int64" => Array::Int64(i64_values.iter().map(|i| Some(*i as i64)).collect()), "float64" => Array::Double(f64_values.iter().map(|f| Some(*f)).collect()), "string" => Array::Binary( string_values .iter() .map(|s| Some(s.as_bytes().to_vec())) .collect(), ), "bool" => Array::Boolean(bool_values.iter().map(|b| Some(*b)).collect()), "date" => Array::Int64(i64_values.iter().map(|i| Some(*i as i64)).collect()), "uint32" => Array::Int32(i64_values.iter().map(|i| Some(*i)).collect()), "fixed_binary" => Array::FixedLenBinary( binary_values .iter() .map(|s| Some(s.as_bytes().to_vec())) .collect(), ), _ => unreachable!(), } } pub fn pyarrow_required_stats(column: &str) -> (Option, Value, Value) { match column { "int64" => (Some(0), Value::Int64(Some(0)), Value::Int64(Some(9))), "float64" => ( Some(3), Value::Float64(Some(0.0)), Value::Float64(Some(9.0)), ), "string" => ( Some(4), Value::Binary(Some(b"".to_vec())), Value::Binary(Some(b"def".to_vec())), ), "bool" => ( Some(4), Value::Boolean(Some(false)), Value::Boolean(Some(true)), ), "date" => (Some(3), Value::Int64(Some(0)), Value::Int64(Some(9))), "uint32" => (Some(0), Value::Int32(Some(0)), Value::Int32(Some(9))), "fixed_binary" => ( Some(4), Value::FixedLenBinary(Some(b"aa".to_vec())), Value::FixedLenBinary(Some(b"jj".to_vec())), ), _ => unreachable!(), } } // these values match the values in `integration` pub fn pyarrow_nested_optional(column: &str) -> Array { // [[0, 1], None, [2, None, 3], [4, 5, 6], [], [7, 8, 9], None, [10]] // def: 3, 3, 0, 3, 2, 3, 3, 3, 3, 1 3 3 3 0 3 // rep: 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0 let data = vec![ Some(Array::Int64(vec![Some(0), Some(1)])), None, Some(Array::Int64(vec![Some(2), None, Some(3)])), Some(Array::Int64(vec![Some(4), Some(5), Some(6)])), Some(Array::Int64(vec![])), Some(Array::Int64(vec![Some(7), Some(8), Some(9)])), None, Some(Array::Int64(vec![Some(10)])), ]; match column { "list_int64" => Array::List(data), _ => unreachable!(), } } // these values match the values in `integration` pub fn pyarrow_struct_optional(column: &str) -> Array { let validity = vec![false, true, true, true, true, true, true, true, true, true]; let string = vec![ Some("Hello".to_string()), None, Some("aa".to_string()), Some("".to_string()), None, Some("abc".to_string()), None, None, Some("def".to_string()), Some("aaa".to_string()), ] .into_iter() .map(|s| s.map(|s| s.as_bytes().to_vec())) .collect::>(); let boolean = vec![ Some(true), None, Some(false), Some(false), None, Some(true), None, None, Some(true), Some(true), ]; match column { "struct_nullable" => { let string = string .iter() .zip(validity.iter()) .map(|(item, valid)| if *valid { item.clone() } else { None }) .collect(); let boolean = boolean .iter() .zip(validity.iter()) .map(|(item, valid)| if *valid { *item } else { None }) .collect(); Array::Struct( vec![Array::Binary(string), Array::Boolean(boolean)], validity, ) } "struct_required" => Array::Struct( vec![Array::Binary(string), Array::Boolean(boolean)], vec![true; validity.len()], ), _ => unreachable!(), } }