// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1.schema.trainingjob.definition;

import "google/cloud/aiplatform/v1beta1/schema/trainingjob/definition/export_evaluated_data_items_config.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1.Schema.TrainingJob.Definition";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/schema/trainingjob/definition/definitionpb;definitionpb";
option java_multiple_files = true;
option java_outer_classname = "AutoMLTablesProto";
option java_package = "com.google.cloud.aiplatform.v1beta1.schema.trainingjob.definition";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1\\Schema\\TrainingJob\\Definition";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1::Schema::TrainingJob::Definition";

// A TrainingJob that trains and uploads an AutoML Tables Model.
message AutoMlTables {
  // The input parameters of this TrainingJob.
  AutoMlTablesInputs inputs = 1;

  // The metadata information.
  AutoMlTablesMetadata metadata = 2;
}

// The input parameters of an AutoML Tables TrainingJob.
message AutoMlTablesInputs {
  // A transformation that the training pipeline applies to one input column.
  // Exactly one member of `transformation_detail` selects the kind of
  // transformation.
  message Transformation {
    // Training pipeline will infer the proper transformation based on the
    // statistics of the dataset.
    message AutoTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Training pipeline will perform following transformation functions.
    // *  The value converted to float32.
    // *  The z_score of the value.
    // *  log(value+1) when the value is greater than or equal to 0. Otherwise,
    //    this transformation is not applied and the value is considered a
    //    missing value.
    // *  z_score of log(value+1) when the value is greater than or equal to 0.
    //    Otherwise, this transformation is not applied and the value is
    //    considered a missing value.
    // *  A boolean value that indicates whether the value is valid.
    message NumericTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 2;
    }

    // Training pipeline will perform following transformation functions.
    // *  The categorical string as is--no change to case, punctuation,
    //    spelling, tense, and so on.
    // *  Convert the category name to a dictionary lookup index and generate an
    //    embedding for each index.
    // *  Categories that appear less than 5 times in the training dataset are
    //    treated as the "unknown" category. The "unknown" category gets its own
    //    special lookup index and resulting embedding.
    message CategoricalTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Training pipeline will perform following transformation functions.
    // *  Apply the transformation functions for Numerical columns.
    // *  Determine the year, month, day, and weekday. Treat each value from the
    //    timestamp as a Categorical column.
    // *  Invalid numerical values (for example, values that fall outside of a
    //    typical timestamp range, or are extreme values) receive no special
    //    treatment and are not removed.
    message TimestampTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;

      // The format in which that time field is expressed. The time_format must
      // either be one of:
      // * `unix-seconds`
      // * `unix-milliseconds`
      // * `unix-microseconds`
      // * `unix-nanoseconds`
      // (for respectively number of seconds, milliseconds, microseconds and
      // nanoseconds since start of the Unix epoch);
      // or be written in `strftime` syntax. If time_format is not set, then the
      // default format is RFC 3339 `date-time` format, where
      // `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z)
      string time_format = 2;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 3;
    }

    // Training pipeline will perform following transformation functions.
    // *  The text as is--no change to case, punctuation, spelling, tense, and
    //    so on.
    // *  Tokenize text to words. Convert each word to a dictionary lookup
    //    index and generate an embedding for each index. Combine the embedding
    //    of all elements into a single embedding using the mean.
    // *  Tokenization is based on unicode script boundaries.
    // *  Missing values get their own lookup index and resulting embedding.
    // *  Stop-words receive no special treatment and are not removed.
    message TextTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Treats the column as numerical array and performs following
    // transformation functions.
    // *  All transformations for Numerical types applied to the average of
    //    all the elements.
    // *  The average of empty arrays is treated as zero.
    message NumericArrayTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 2;
    }

    // Treats the column as categorical array and performs following
    // transformation functions.
    // *  For each element in the array, convert the category name to a
    //    dictionary lookup index and generate an embedding for each index.
    //    Combine the embedding of all elements into a single embedding using
    //    the mean.
    // *  Empty arrays are treated as an embedding of zeroes.
    message CategoricalArrayTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Treats the column as text array and performs following transformation
    // functions.
    // *  Concatenate all text values in the array into a single text value
    //    using a space (" ") as a delimiter, and then treat the result as a
    //    single text value. Apply the transformations for Text columns.
    // *  Empty arrays are treated as an empty text.
    message TextArrayTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // The transformation that the training pipeline will apply to the input
    // columns.
    oneof transformation_detail {
      AutoTransformation auto = 1;

      NumericTransformation numeric = 2;

      CategoricalTransformation categorical = 3;

      TimestampTransformation timestamp = 4;

      TextTransformation text = 5;

      NumericArrayTransformation repeated_numeric = 6;

      CategoricalArrayTransformation repeated_categorical = 7;

      TextArrayTransformation repeated_text = 8;
    }
  }

  // Additional optimization objective configuration. Required for
  // `maximize-precision-at-recall` and `maximize-recall-at-precision`,
  // otherwise unused.
  oneof additional_optimization_objective_config {
    // Required when optimization_objective is "maximize-precision-at-recall".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_recall_value = 5;

    // Required when optimization_objective is "maximize-recall-at-precision".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_precision_value = 6;
  }

  // The type of prediction the Model is to produce.
  //   "classification" - Predict one out of multiple target values for
  //                      each row.
  //   "regression" - Predict a value based on its relation to other values.
  //                  This type is available only to columns that contain
  //                  semantically numeric values, i.e. integers or floating
  //                  point numbers, even if stored as e.g. strings.
  string prediction_type = 1;

  // The column name of the target column that the model is to predict.
  string target_column = 2;

  // Each transformation will apply a transform function to the given input
  // column, and the result will be used for training.
  // When creating transformation for BigQuery Struct column, the column should
  // be flattened using "." as the delimiter.
  repeated Transformation transformations = 3;

  // Objective function the model is optimizing towards. The training process
  // creates a model that maximizes/minimizes the value of the objective
  // function over the validation set.
  //
  // The supported optimization objectives depend on the prediction type.
  // If the field is not set, a default objective function is used.
  //
  // classification (binary):
  //   "maximize-au-roc" (default) - Maximize the area under the receiver
  //                                 operating characteristic (ROC) curve.
  //   "minimize-log-loss" - Minimize log loss.
  //   "maximize-au-prc" - Maximize the area under the precision-recall curve.
  //   "maximize-precision-at-recall" - Maximize precision for a specified
  //                                    recall value.
  //   "maximize-recall-at-precision" - Maximize recall for a specified
  //                                    precision value.
  //
  // classification (multi-class):
  //   "minimize-log-loss" (default) - Minimize log loss.
  //
  // regression:
  //   "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
  //   "minimize-mae" - Minimize mean-absolute error (MAE).
  //   "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
  string optimization_objective = 4;

  // Required. The train budget of creating this model, expressed in milli node
  // hours i.e. 1,000 value in this field means 1 node hour.
  //
  // The training cost of the model will not exceed this budget. The final cost
  // will be attempted to be close to the budget, though may end up being (even)
  // noticeably smaller - at the backend's discretion. This especially may
  // happen when further model training ceases to provide any improvements.
  //
  // If the budget is set to a value known to be insufficient to train a
  // model for the given dataset, the training won't be attempted and
  // will error.
  //
  // The train budget must be between 1,000 and 72,000 milli node hours,
  // inclusive.
  int64 train_budget_milli_node_hours = 7;

  // Use the entire training budget. This disables the early stopping feature.
  // By default, the early stopping feature is enabled, which means that AutoML
  // Tables might stop training before the entire training budget has been used.
  bool disable_early_stopping = 8;

  // Column name that should be used as the weight column.
  // Higher values in this column give more importance to the row
  // during model training. The column must have numeric values between 0 and
  // 10000 inclusively; 0 means the row is ignored for training. If weight
  // column field is not set, then all rows are assumed to have equal weight
  // of 1.
  string weight_column_name = 9;

  // Configuration for exporting test set predictions to a BigQuery table. If
  // this configuration is absent, then the export is not performed.
  ExportEvaluatedDataItemsConfig export_evaluated_data_items_config = 10;

  // Additional experiment flags for the Tables training pipeline.
  repeated string additional_experiments = 11;
}

// Model metadata specific to AutoML Tables.
message AutoMlTablesMetadata {
  // Output only. The actual training cost of the model, expressed in milli
  // node hours, i.e. 1,000 value in this field means 1 node hour. Guaranteed
  // to not exceed the train budget.
  int64 train_cost_milli_node_hours = 1;
}