syntax = "proto3";

import "sarus_data_spec/protobuf/type.proto";
import "sarus_data_spec/protobuf/path.proto";
import "sarus_data_spec/protobuf/scalar.proto";

// A dataset transform.
message Transform {
  // Identifier used to refer to the transform, e.g. an RFC 4122 id.
  string uuid = 1;
  // Name of the transform.
  string name = 2;
  // Documentation string of the transform.
  string doc = 3;
  // Definition of the transform; at most one variant of `Spec.spec` is set.
  Spec spec = 4;
  // Other properties.
  // NOTE(review): the key/value types were missing from the reviewed text;
  // string -> string is assumed. Confirm against existing generated code.
  map<string, string> properties = 5;
  // NOTE(review): presumably whether the transform can be inverted - confirm.
  bool inversible = 6;
  // NOTE(review): presumably whether the output schema equals the input
  // schema - confirm.
  bool schema_preserving = 7;

  // Definitions

  // The kind of transform together with its parameters.
  message Spec {
    // Field numbers below are part of the wire contract; never renumber or
    // reuse them.
    oneof spec {
      Identity identity = 1;
      Variable variable = 2;
      Composed composed = 3;
      Project project = 4;
      Filter filter = 5;
      Shuffle shuffle = 6;
      Join join = 7;
      Cast cast = 8;
      Sample sample = 9;
      UserSettings user_settings = 10;
      Protect protect_dataset = 11;
      // np transforms, pd transforms,...
      External external = 12;
      Synthetic synthetic = 13;
      Transcode transcode = 14;
      InverseTranscode inverse_transcode = 15;
      GetItem get_item = 16;
      ProtectedPaths protected_paths = 17;
      AutomaticUserSettings automatic_user_settings = 18;
      PublicPaths public_paths = 19;
      AssignBudget assign_budget = 20;
      AutomaticBudget automatic_budget = 21;
      AttributesBudget attribute_budget = 22;
      SDBudget sd_budget = 23;
      DeriveSeed derive_seed = 24;
      GroupByPE group_by_pe = 25;
      SamplingRatios sampling_ratios = 26;
      SelectSql select_sql = 27;
      Extract extract = 28;
      RelationshipSpec relationship_spec = 29;
      DifferentiatedSample differentiated_sample = 30;
      ValidatedUserType validated_user_type = 31;
      DPSelectSql dp_select_sql = 32;
    }
  }

  // A transform implemented by an external library operation.
  message External {
    // Positional arguments as opaque bytes; the encoding is not defined in
    // this file.
    bytes arguments = 1;
    // Named arguments as opaque bytes; the encoding is not defined in this
    // file.
    bytes named_arguments = 2;
    // Which external operation to apply.
    OpIdentifier op_identifier = 3;

    // Identifies the operation; at most one library-specific variant is set.
    message OpIdentifier {
      oneof op {
        Std std = 1;
        Pandas pandas = 2;
        Numpy numpy = 3;
        Tensorflow tensorflow = 4;
        Sklearn sklearn = 5;
        PandasProfiling pandas_profiling = 6;
        XGBoost xgboost = 7;
        Skopt skopt = 8;
        Imblearn imblearn = 9;
      }
    }

    // Each variant below carries a single `name` string.
    message Std {
      string name = 1;
    }

    message Pandas {
      string name = 1;
    }

    message Numpy {
      string name = 1;
    }

    message Tensorflow {
      string name = 1;
    }

    message Sklearn {
      string name = 1;
    }

    message PandasProfiling {
      string name = 1;
    }

    message XGBoost {
      string name = 1;
    }

    message Skopt {
      string name = 1;
    }

    message Imblearn {
      string name = 1;
    }
  }

  // Does nothing.
  message Identity {}

  // Numbered or named identity to use as input of the composed transform.
  message Variable {
    int32 position = 1;
    string name = 2;
  }

  // A transform applied to arguments that are themselves transforms.
  message Composed {
    // Transform.
    // NOTE(review): presumably the uuid of the applied transform - confirm.
    string transform = 1;
    // Arguments of the current transform are transforms.
    repeated string arguments = 2;
    // NOTE(review): the key/value types were missing from the reviewed text;
    // string -> string is assumed, mirroring `arguments`. Confirm against
    // existing generated code.
    map<string, string> named_arguments = 3;
  }

  message Project {
    // This should be a 'supertype' the type the data can project into.
    // For product types (struct or tuple), this is a type with a subset of
    // the fields.
    // For map types, this is a map type with subset (subtype) of key and
    // superset (supertype) of value.
    // For union types, this is a type with more terms in the union.
    // To start with let's simply use this with structs.
    sarus_data_spec.Type projection = 1;
  }

  message Filter {
    // This should be a 'subtype' the type the data can be restricted to.
    // For union types, this is a type with less terms in the union.
    // Optional types can for instance be filtered to non-optional.
    // Value type and Predicate types can be used to restrict values.
    sarus_data_spec.Type filter = 1;
  }

  // Carries no parameters.
  message Shuffle {}

  message Join {
    // This should be a common 'supertype' between tables.
    sarus_data_spec.Type on = 1;
  }

  message Cast {
    // Type to cast into.
    sarus_data_spec.Type type = 1;
  }

  // Sample a dataset.
  message Sample {
    // Amount to sample, given either as a fraction or as a size.
    oneof proportion {
      double fraction = 1;
      int64 size = 2;
    }
    sarus_data_spec.Scalar seed = 3;
  }

  // Not referenced by `Spec` in this file.
  message SchemaInference {
    CastPolicy cast_policy = 1;

    // NONE is the zero value and therefore the default when the field is
    // unset.
    enum CastPolicy {
      NONE = 0;
      MOST_LIKELY = 1;
    }
  }

  // Not referenced by `Spec` in this file.
  message GroupBy {
    string key = 1;
  }

  // The messages below without fields carry no parameters; selecting the
  // corresponding `Spec` variant is the only information they convey.

  message Synthetic {}

  message UserSettings {}

  message AutomaticUserSettings {
    int64 max_categories = 1;
  }

  message Protect {}

  message Transcode {}

  message InverseTranscode {}

  // Same fields as `Sample`: a fraction or a size, plus a seed.
  message DifferentiatedSample {
    oneof proportion {
      double fraction = 1;
      int64 size = 2;
    }
    sarus_data_spec.Scalar seed = 3;
  }

  message ProtectedPaths {}

  message PublicPaths {}

  message GetItem {
    sarus_data_spec.Path path = 1;
  }

  message AssignBudget {}

  message AutomaticBudget {}

  message AttributesBudget {}

  message SDBudget {}

  message DeriveSeed {
    int64 random_integer = 1;
  }

  message GroupByPE {}

  message SamplingRatios {}

  message RelationshipSpec {}

  // A SQL selection, given either as a single query or as a list of
  // per-path queries, together with the dialect the SQL is written in.
  message SelectSql {
    oneof select {
      string query = 1;
      AliasedQueries aliased_queries = 2;
    }
    SQLDialect sql_dialect = 3;
  }

  // Same fields as `SelectSql`.
  message DPSelectSql {
    oneof select {
      string query = 1;
      AliasedQueries aliased_queries = 2;
    }
    SQLDialect sql_dialect = 3;
  }

  // SQL dialect of a query. NONE is the zero value and therefore the default
  // when the field is unset. Value names (including the SQLLITE spelling) are
  // kept as-is because renaming them would change generated identifiers.
  enum SQLDialect {
    NONE = 0;
    POSTGRES = 1;
    SQL_SERVER = 2;
    MY_SQL = 3;
    SQLLITE = 4;
    ORACLE = 5;
    BIG_QUERY = 6;
    REDSHIFT = 7;
    HIVE = 8;
  }

  // A list of queries, each paired with a path.
  message AliasedQueries {
    repeated AliasedQuery aliased_query = 1;
  }

  // A query paired with a path.
  message AliasedQuery {
    sarus_data_spec.Path path = 1;
    string query = 2;
  }

  message Extract {
    int32 size = 1;
  }

  message ValidatedUserType {}
}