extern crate etl;
#[macro_use]
extern crate unittest;

use std::path::PathBuf;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::ops::Shl;

use etl::dataframe::DataFrame;

/// Computes the hashed-feature column name and sign that the test expects for `value`.
///
/// `value` is fed to a fresh `DefaultHasher`. The resulting 64-bit hash modulo 4 selects the
/// column name (`vec_hash_e_0` through `vec_hash_e_3`); the sign is `1.0` when the hash is at
/// least `2^63` and `-1.0` otherwise.
///
/// Returns the `(column name, sign)` pair.
fn hash_details(value: &str) -> (String, f64) {
    // NOTE(review): presumably mirrors the hashing performed by the transform under test;
    // confirm against the etl crate before changing either constant.
    let bucket_count: u64 = 4;
    let sign_threshold = 1u64.shl(63);

    let mut state = DefaultHasher::new();
    // `str` and `String` feed identical data to the hasher, so no owned copy is needed.
    value.hash(&mut state);
    let digest = state.finish();

    let sign = if digest < sign_threshold { -1.0 } else { 1.0 };
    (format!("vec_hash_e_{}", digest % bucket_count), sign)
}

/// Loads `data/transform_test.toml` from the directory containing this source file and checks
/// the set of field names plus the contents of every field in the resulting data frame.
#[test]
fn test_transform() {
    // The fixture path is resolved relative to this source file's own path.
    let source_file = PathBuf::from(file!());
    let data_path = source_file.parent().unwrap().join("data/transform_test.toml");
    let (config, df) = DataFrame::load(data_path.as_path()).unwrap();
    println!("{:?}", config);
    println!("{:?}", df);

    // Field names are sorted first so the comparison does not depend on their original order.
    let mut names = df.fieldnames();
    names.sort();
    assert_eq!(
        names,
        [
            "c",
            "cat_ab",
            "d",
            "e",
            "map_convert_e",
            "map_e",
            "norm_f_sample",
            "norm_f_uncorr",
            "scaled_f_custom",
            "scaled_f_default",
            "vec_hash_e_0",
            "vec_hash_e_1",
            "vec_hash_e_2",
            "vec_hash_e_3",
            "vec_onehot_e_F",
            "vec_onehot_e_M",
            "vec_onehot_e_e2",
            "vec_onehot_e_n11_F",
            "vec_onehot_e_n11_M",
            "vec_onehot_e_n11_e2"
        ]
    );

    // Signed integer fields.
    let c = df.get_signed_field("c");
    assert!(c.is_some());
    assert_eq!(c.unwrap(), &[1, 2, 3, 4, 5, 6, 7, 8, 9]);

    let d = df.get_signed_field("d");
    assert!(d.is_some());
    assert_eq!(d.unwrap(), &[9, 8, 7, 6, 5, 4, 3, 2, 1]);

    // Text fields.
    let e = df.get_text_field("e");
    assert!(e.is_some());
    assert_eq!(e.unwrap(), &["M", "e2", "F", "M", "M", "F", "F", "F", "F"]);

    let cat_ab = df.get_text_field("cat_ab");
    assert!(cat_ab.is_some());
    assert_eq!(
        cat_ab.unwrap(),
        &["a1!!b1", "a2!!b2", "a3!!b3", "a4!!b4", "a5!!b5", "a6!!b6", "a7!!b7", "a8!!b8", "a9!!b9"]
    );

    // `map_e` is checked as text, `map_convert_e` as signed integers.
    let map_e = df.get_text_field("map_e");
    assert!(map_e.is_some());
    assert_eq!(map_e.unwrap(), &["0", "-1", "1", "0", "0", "1", "1", "1", "1"]);

    let map_convert_e = df.get_signed_field("map_convert_e");
    assert!(map_convert_e.is_some());
    assert_eq!(map_convert_e.unwrap(), &[0, -1, 1, 0, 0, 1, 1, 1, 1]);

    // Scaled float fields, compared with a 1e-12 tolerance.
    let scaled_custom = df.get_float_field("scaled_f_custom");
    assert!(scaled_custom.is_some());
    assert_fpvec_eq!(
        scaled_custom.unwrap(),
        [-1.0, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 1.0],
        1e-12
    );

    let scaled_default = df.get_float_field("scaled_f_default");
    assert!(scaled_default.is_some());
    assert_fpvec_eq!(
        scaled_default.unwrap(),
        [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1.0],
        1e-12
    );

    // Normalized float fields, compared with a 1e-9 tolerance.
    let norm_uncorr = df.get_float_field("norm_f_uncorr");
    assert!(norm_uncorr.is_some());
    assert_fpvec_eq!(
        norm_uncorr.unwrap(),
        [
            -1.4201266762,
            -1.0837808845,
            -0.7474350928,
            -0.411089301,
            -0.0747435093,
            0.2616022825,
            0.5979480742,
            0.9342938659,
            1.9433312412
        ],
        1e-9
    );

    let norm_sample = df.get_float_field("norm_f_sample");
    assert!(norm_sample.is_some());
    assert_fpvec_eq!(
        norm_sample.unwrap(),
        [
            -1.3389082705,
            -1.021798417,
            -0.7046885634,
            -0.3875787099,
            -0.0704688563,
            0.2466409972,
            0.5637508508,
            0.8808607043,
            1.8321902649
        ],
        1e-9
    );

    // One-hot fields for "e": 1.0 on matching rows, 0.0 elsewhere.
    let onehot_e2 = df.get_float_field("vec_onehot_e_e2");
    assert!(onehot_e2.is_some());
    assert_fpvec_eq!(onehot_e2.unwrap(), [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);

    let onehot_f = df.get_float_field("vec_onehot_e_F");
    assert!(onehot_f.is_some());
    assert_fpvec_eq!(onehot_f.unwrap(), [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0]);

    let onehot_m = df.get_float_field("vec_onehot_e_M");
    assert!(onehot_m.is_some());
    assert_fpvec_eq!(onehot_m.unwrap(), [1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]);

    // The "n11" one-hot fields: 1.0 on matching rows, -1.0 elsewhere.
    let onehot_n11_e2 = df.get_float_field("vec_onehot_e_n11_e2");
    assert!(onehot_n11_e2.is_some());
    assert_fpvec_eq!(
        onehot_n11_e2.unwrap(),
        [-1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]
    );

    let onehot_n11_f = df.get_float_field("vec_onehot_e_n11_F");
    assert!(onehot_n11_f.is_some());
    assert_fpvec_eq!(
        onehot_n11_f.unwrap(),
        [-1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 1.0]
    );

    let onehot_n11_m = df.get_float_field("vec_onehot_e_n11_M");
    assert!(onehot_n11_m.is_some());
    assert_fpvec_eq!(
        onehot_n11_m.unwrap(),
        [1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0]
    );

    // Hashed fields: each distinct label lands in one column with one sign.
    let m_details = hash_details("M");
    let e2_details = hash_details("e2");
    let f_details = hash_details("F");

    // Builds the expected contents of one hashed column: a row holds the label's sign when the
    // label hashes to `column`, and 0.0 otherwise.
    let expected_hash = |column: &str| {
        let contribution = |details: &(String, f64)| -> f64 {
            if details.0 == column {
                details.1
            } else {
                0.0
            }
        };
        let m = contribution(&m_details);
        let e2 = contribution(&e2_details);
        let f = contribution(&f_details);
        // Same row order as field "e": M, e2, F, M, M, F, F, F, F.
        [m, e2, f, m, m, f, f, f, f]
    };

    let hash_0 = df.get_float_field("vec_hash_e_0");
    assert!(hash_0.is_some());
    assert_fpvec_eq!(hash_0.unwrap(), expected_hash("vec_hash_e_0"));

    let hash_1 = df.get_float_field("vec_hash_e_1");
    assert!(hash_1.is_some());
    assert_fpvec_eq!(hash_1.unwrap(), expected_hash("vec_hash_e_1"));

    let hash_2 = df.get_float_field("vec_hash_e_2");
    assert!(hash_2.is_some());
    assert_fpvec_eq!(hash_2.unwrap(), expected_hash("vec_hash_e_2"));

    let hash_3 = df.get_float_field("vec_hash_e_3");
    assert!(hash_3.is_some());
    assert_fpvec_eq!(hash_3.unwrap(), expected_hash("vec_hash_e_3"));
}