mod read; use arrow2::array::*; use arrow2::bitmap::Bitmap; use arrow2::datatypes::*; use arrow2::error::Result; use arrow2::io::ndjson::write as ndjson_write; use read::{infer, read_and_deserialize}; fn round_trip(ndjson: String) -> Result<()> { let data_type = infer(&ndjson)?; let expected = read_and_deserialize(&ndjson, &data_type, 1000)?; let arrays = expected.clone().into_iter().map(Ok); let serializer = ndjson_write::Serializer::new(arrays, vec![]); let mut writer = ndjson_write::FileWriter::new(vec![], serializer); writer.by_ref().collect::>()?; // write let buf = writer.into_inner().0; let new_chunk = read_and_deserialize(std::str::from_utf8(&buf).unwrap(), &data_type, 1000)?; assert_eq!(expected, new_chunk); Ok(()) } #[test] fn round_trip_basics() -> Result<()> { let (data, _) = case_basics(); round_trip(data) } #[test] fn round_trip_list() -> Result<()> { let (data, _) = case_list(); round_trip(data) } fn case_list() -> (String, Box) { let data = r#"{"a":1, "b":[2.0, 1.3, -6.1], "c":[false, true], "d":"4"} {"a":-10, "b":null, "c":[true, true]} {"a":null, "b":[2.1, null, -6.2], "c":[false, null], "d":"text"} "# .to_string(); let data_type = DataType::Struct(vec![ Field::new("a", DataType::Int64, true), Field::new( "b", DataType::List(Box::new(Field::new("item", DataType::Float64, true))), true, ), Field::new( "c", DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), true, ), Field::new("d", DataType::Utf8, true), ]); let a = Int64Array::from(&[Some(1), Some(-10), None]); let mut b = MutableListArray::>::new(); b.try_extend(vec![ Some(vec![Some(2.0), Some(1.3), Some(-6.1)]), None, Some(vec![Some(2.1), None, Some(-6.2)]), ]) .unwrap(); let b: ListArray = b.into(); let mut c = MutableListArray::::new(); c.try_extend(vec![ Some(vec![Some(false), Some(true)]), Some(vec![Some(true), Some(true)]), Some(vec![Some(false), None]), ]) .unwrap(); let c: ListArray = c.into(); let d = Utf8Array::::from([Some("4"), None, Some("text")]); let array = StructArray::new( data_type, vec![a.boxed(), b.boxed(), c.boxed(), d.boxed()], None, ); (data, array.boxed()) } fn case_dict() -> (String, Box) { let data = r#"{"machine": "a", "events": [null, "Elect Leader", "Do Ballot"]} {"machine": "b", "events": ["Do Ballot", null, "Send Data", "Elect Leader"]} {"machine": "c", "events": ["Send Data"]} {"machine": "c"} {"machine": "c", "events": null} "# .to_string(); let data_type = DataType::List(Box::new(Field::new( "item", DataType::Dictionary(u64::KEY_TYPE, Box::new(DataType::Utf8), false), true, ))); let fields = vec![Field::new("events", data_type, true)]; type A = MutableDictionaryArray>; let mut array = MutableListArray::::new(); array .try_extend(vec![ Some(vec![None, Some("Elect Leader"), Some("Do Ballot")]), Some(vec![ Some("Do Ballot"), None, Some("Send Data"), Some("Elect Leader"), ]), Some(vec![Some("Send Data")]), None, None, ]) .unwrap(); let array: ListArray = array.into(); ( data, StructArray::new(DataType::Struct(fields), vec![array.boxed()], None).boxed(), ) } fn case_basics() -> (String, Box) { let data = r#"{"a":1, "b":2.0, "c":false, "d":"4"} {"a":-10, "b":-3.5, "c":true, "d":null} {"a":100000000, "b":0.6, "d":"text"}"# .to_string(); let data_type = DataType::Struct(vec![ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Float64, true), Field::new("c", DataType::Boolean, true), Field::new("d", DataType::Utf8, true), ]); let array = StructArray::new( data_type, vec![ Int64Array::from_slice([1, -10, 100000000]).boxed(), Float64Array::from_slice([2.0, -3.5, 0.6]).boxed(), BooleanArray::from(&[Some(false), Some(true), None]).boxed(), Utf8Array::::from([Some("4"), None, Some("text")]).boxed(), ], None, ); (data, array.boxed()) } fn case_projection() -> (String, Box) { let data = r#"{"a":1, "b":2.0, "c":false, "d":"4", "e":"4"} {"a":10, "b":-3.5, "c":true, "d":null, "e":"text"} {"a":100000000, "b":0.6, "d":"text"}"# .to_string(); let data_type = DataType::Struct(vec![ Field::new("a", DataType::UInt32, true), Field::new("b", DataType::Float32, true), Field::new("c", DataType::Boolean, true), // note how "d" is not here Field::new("e", DataType::Binary, true), ]); let array = StructArray::new( data_type, vec![ UInt32Array::from_slice([1, 10, 100000000]).boxed(), Float32Array::from_slice([2.0, -3.5, 0.6]).boxed(), BooleanArray::from(&[Some(false), Some(true), None]).boxed(), BinaryArray::::from([Some(b"4".as_ref()), Some(b"text".as_ref()), None]).boxed(), ], None, ); (data, array.boxed()) } fn case_struct() -> (String, Box) { let data = r#"{"a": {"b": true, "c": {"d": "text"}}} {"a": {"b": false, "c": null}} {"a": {"b": true, "c": {"d": "text"}}} {"a": 1}"# .to_string(); let d_field = Field::new("d", DataType::Utf8, true); let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); let a_field = Field::new( "a", DataType::Struct(vec![ Field::new("b", DataType::Boolean, true), c_field.clone(), ]), true, ); let fields = vec![a_field]; // build expected output let d = Utf8Array::::from([Some("text"), None, Some("text"), None]); let c = StructArray::new( DataType::Struct(vec![d_field]), vec![d.boxed()], Some([true, false, true, true].into()), ); let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); let inner = DataType::Struct(vec![Field::new("b", DataType::Boolean, true), c_field]); let expected = StructArray::new( inner, vec![b.boxed(), c.boxed()], Some([true, true, true, false].into()), ); let data_type = DataType::Struct(fields); ( data, StructArray::new(data_type, vec![expected.boxed()], None).boxed(), ) } fn case_nested_list() -> (String, Box) { let d_field = Field::new("d", DataType::Utf8, true); let c_field = Field::new("c", DataType::Struct(vec![d_field.clone()]), true); let b_field = Field::new("b", DataType::Boolean, true); let a_struct_field = Field::new( "a", DataType::Struct(vec![b_field.clone(), c_field.clone()]), true, ); let a_list_data_type = DataType::List(Box::new(a_struct_field)); let a_field = Field::new("a", a_list_data_type.clone(), true); let data = r#" {"a": [{"b": true, "c": {"d": "a_text"}}, {"b": false, "c": {"d": "b_text"}}]} {"a": [{"b": false, "c": null}]} {"a": [{"b": true, "c": {"d": "c_text"}}, {"b": null, "c": {"d": "d_text"}}, {"b": true, "c": {"d": null}}]} {"a": null} {"a": []} "#.to_string(); // build expected output let d = Utf8Array::::from([ Some("a_text"), Some("b_text"), None, Some("c_text"), Some("d_text"), None, ]); let c = StructArray::new( DataType::Struct(vec![d_field]), vec![d.boxed()], Some(Bitmap::from_u8_slice([0b11111011], 6)), ); let b = BooleanArray::from(vec![ Some(true), Some(false), Some(false), Some(true), None, Some(true), ]); let a_struct = StructArray::new( DataType::Struct(vec![b_field, c_field]), vec![b.boxed(), c.boxed()], None, ); let expected = ListArray::new( a_list_data_type, vec![0i32, 2, 3, 6, 6, 6].try_into().unwrap(), a_struct.boxed(), Some([true, true, true, false, true].into()), ); let array = StructArray::new( DataType::Struct(vec![a_field]), vec![expected.boxed()], None, ) .boxed(); (data, array) } fn case(case: &str) -> (String, Box) { match case { "basics" => case_basics(), "projection" => case_projection(), "list" => case_list(), "dict" => case_dict(), "struct" => case_struct(), "nested_list" => case_nested_list(), _ => todo!(), } } #[test] fn infer_object() -> Result<()> { let data = r#"{"i64": 1, "f64": 0.1, "utf8": "foo1", "bools": true} {"i64": 2, "f64": 0.2, "utf8": "foo2", "bools": false} {"i64": 3, "f64": 0.3, "utf8": "foo3"} {"i64": 4, "f64": 0.4, "utf8": "foo4", "bools": false} "#; let u64_fld = Field::new("i64", DataType::Int64, true); let f64_fld = Field::new("f64", DataType::Float64, true); let utf8_fld = Field::new("utf8", DataType::Utf8, true); let bools_fld = Field::new("bools", DataType::Boolean, true); let expected = DataType::Struct(vec![u64_fld, f64_fld, utf8_fld, bools_fld]); let actual = infer(data)?; assert_eq!(expected, actual); Ok(()) }