syntax = "proto3";

package sarus_data_spec;

// A Scalar represents data that does not fulfill the promise of a Dataset.
// A Dataset promises to have a schema and the possibility to iterate on
// pyarrow.RecordBatches; a Scalar does not ensure this possibility.
// As a consequence, operations from standard libraries are allowed
// (pandas.mean, numpy.std, ...) but operations implemented for Datasets by
// Sarus, like computing marginals or fitting a Keras model, cannot be
// performed on a Scalar.
// Scalars are generated by transforms that explicitly require a specific
// format (e.g. as_pandas, as_numpy, ...) or as byproducts of transforms
// (model weights, training history, ...).
message Scalar {
  // Identifier used to refer to this scalar, e.g. an RFC 4122 UUID
  // (content linked?).
  string uuid = 1;
  // Name of the scalar.
  string name = 2;
  // Free-form documentation string.
  string doc = 3;
  // How the scalar is obtained; see Spec.
  Spec spec = 4;
  // Other properties.
  // NOTE(review): the key/value types were missing from the declaration and
  // have been restored as string -> string; confirm against generated code.
  map<string, string> properties = 5;

  // Definitions

  // How to obtain the scalar. Exactly one of the variants may be set.
  message Spec {
    oneof spec {
      Transformed transformed = 1;
      Model model = 2;
      PrivacyParameters privacy_params = 3;
      RandomSeed random_seed = 4;
      SyntheticModel synthetic_model = 5;
    }
  }

  // A scalar produced by applying a transform to other objects.
  message Transformed {
    // Transform id.
    string transform = 1;
    // Dataset or other object ids, passed positionally.
    repeated string arguments = 2;
    // Arguments passed by name.
    // NOTE(review): the key/value types were missing from the declaration and
    // have been restored as string -> string (presumably argument name ->
    // object id); confirm against generated code.
    map<string, string> named_arguments = 3;
  }

  // A model of a given class together with its construction arguments.
  message Model {
    // Positional arguments, as opaque bytes.
    // NOTE(review): the serialization format is not defined in this file.
    bytes arguments = 1;
    // Named arguments, as opaque bytes (same caveat as `arguments`).
    bytes named_arguments = 2;
    // Which model class this message describes.
    ModelClass model_class = 3;

    // Supported model classes.
    // The zero value (TF_KERAS) is what an unset `model_class` reads as in
    // proto3. Value numbers are part of the wire contract and must not be
    // changed or reused.
    enum ModelClass {
      TF_KERAS = 0;
      SK_SVC = 1;
      SK_ONEHOT = 2;
      SK_PCA = 3;

      // Cluster
      SK_AFFINITY_PROPAGATION = 4;
      SK_AGGLOMERATIVE_CLUSTERING = 5;
      SK_BIRCH = 6;
      SK_DBSCAN = 7;
      SK_FEATURE_AGGLOMERATION = 8;
      SK_KMEANS = 9;
      SK_MINIBATCH_KMEANS = 10;
      SK_MEAN_SHIFT = 11;
      SK_OPTICS = 12;
      SK_SPECTRAL_CLUSTERING = 13;
      SK_SPECTRAL_BICLUSTERING = 14;
      SK_SPECTRAL_COCLUSTERING = 15;

      // Ensemble
      SK_ADABOOST_CLASSIFIER = 60;
      SK_ADABOOST_REGRESSOR = 61;
      SK_BAGGING_CLASSIFIER = 62;
      SK_BAGGING_REGRESSOR = 63;
      SK_EXTRA_TREES_REGRESSOR = 64;
      SK_EXTRA_TREES_CLASSIFIER = 65;
      SK_GRADIENT_BOOSTING_CLASSIFIER = 66;
      SK_GRADIENT_BOOSTING_REGRESSOR = 67;
      SK_ISOLATION_FOREST = 68;
      SK_RANDOM_FOREST_CLASSIFIER = 69;
      SK_RANDOM_FOREST_REGRESSOR = 70;
      SK_RANDOM_TREES_EMBEDDING = 71;
      SK_STACKING_CLASSIFIER = 72;
      SK_STACKING_REGRESSOR = 73;
      SK_VOTING_CLASSIFIER = 74;
      SK_VOTING_REGRESSOR = 75;
      SK_HIST_GRADIENT_BOOSTING_REGRESSOR = 76;
      SK_HIST_GRADIENT_BOOSTING_CLASSIFIER = 77;

      // Model selection
      SK_REPEATED_STRATIFIED_KFOLD = 80;

      // XGB
      XGB_CLASSIFIER = 92;

      // Other SK_* entries, numbered outside the groups above.
      SK_LABEL_ENCODER = 98;
      SK_KFOLD = 99;
    }
  }

  // A list of (epsilon, delta) points.
  // NOTE(review): presumably differential-privacy budgets; confirm.
  message PrivacyParameters {
    repeated Point points = 1;

    // A single (epsilon, delta) pair.
    message Point {
      double epsilon = 1;
      double delta = 2;
    }
  }

  // A random seed value.
  message RandomSeed {
    int32 value = 1;
  }

  // Marker variant with no fields.
  message SyntheticModel {}
}