// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1.schema.trainingjob.definition;

import "google/cloud/aiplatform/v1beta1/schema/trainingjob/definition/export_evaluated_data_items_config.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1Beta1.Schema.TrainingJob.Definition";
option go_package = "cloud.google.com/go/aiplatform/apiv1beta1/schema/trainingjob/definition/definitionpb;definitionpb";
option java_multiple_files = true;
option java_outer_classname = "AutoMLTablesProto";
option java_package = "com.google.cloud.aiplatform.v1beta1.schema.trainingjob.definition";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1beta1\\Schema\\TrainingJob\\Definition";
option ruby_package = "Google::Cloud::AIPlatform::V1beta1::Schema::TrainingJob::Definition";

// A TrainingJob that trains and uploads an AutoML Tables Model.
message AutoMlTables {
  // The input parameters of this TrainingJob.
  AutoMlTablesInputs inputs = 1;

  // The metadata reported for this TrainingJob, such as the actual training
  // cost (see `AutoMlTablesMetadata`).
  AutoMlTablesMetadata metadata = 2;
}

// The input parameters of an AutoML Tables TrainingJob.
message AutoMlTablesInputs {
  // Describes the transformation that the training pipeline applies to a
  // single input column. At most one member of `transformation_detail` is set.
  message Transformation {
    // Training pipeline will infer the proper transformation based on the
    // statistics of the dataset.
    message AutoTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Training pipeline will perform the following transformation functions.
    // *  The value converted to float32.
    // *  The z_score of the value.
    // *  log(value+1) when the value is greater than or equal to 0. Otherwise,
    //    this transformation is not applied and the value is considered a
    //    missing value.
    // *  z_score of log(value+1) when the value is greater than or equal to 0.
    //    Otherwise, this transformation is not applied and the value is
    //    considered a missing value.
    // *  A boolean value that indicates whether the value is valid.
    message NumericTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 2;
    }

    // Training pipeline will perform the following transformation functions.
    // *  The categorical string as is--no change to case, punctuation,
    //    spelling, tense, and so on.
    // *  Convert the category name to a dictionary lookup index and generate an
    //    embedding for each index.
    // *  Categories that appear less than 5 times in the training dataset are
    //    treated as the "unknown" category. The "unknown" category gets its own
    //    special lookup index and resulting embedding.
    message CategoricalTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Training pipeline will perform the following transformation functions.
    // *  Apply the transformation functions for Numerical columns.
    // *  Determine the year, month, day, and weekday. Treat each value from the
    //    timestamp as a Categorical column.
    // *  Invalid numerical values (for example, values that fall outside of a
    //    typical timestamp range, or are extreme values) receive no special
    //    treatment and are not removed.
    message TimestampTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;

      // The format in which the time field is expressed. The time_format must
      // either be one of:
      // * `unix-seconds`
      // * `unix-milliseconds`
      // * `unix-microseconds`
      // * `unix-nanoseconds`
      // (for respectively number of seconds, milliseconds, microseconds and
      // nanoseconds since start of the Unix epoch);
      // or be written in `strftime` syntax. If time_format is not set, then the
      // default format is RFC 3339 `date-time` format, where
      // `time-offset` = `"Z"` (e.g. 1985-04-12T23:20:50.52Z).
      string time_format = 2;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 3;
    }

    // Training pipeline will perform the following transformation functions.
    // *  The text as is--no change to case, punctuation, spelling, tense, and
    //    so on.
    // *  Tokenize text to words. Convert each word to a dictionary lookup
    //    index and generate an embedding for each index. Combine the embedding
    //    of all elements into a single embedding using the mean.
    // *  Tokenization is based on unicode script boundaries.
    // *  Missing values get their own lookup index and resulting embedding.
    // *  Stop-words receive no special treatment and are not removed.
    message TextTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Treats the column as a numerical array and performs the following
    // transformation functions.
    // *  All transformations for Numerical types applied to the average of
    //    all the elements.
    // *  The average of empty arrays is treated as zero.
    message NumericArrayTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;

      // If invalid values are allowed, the training pipeline will create a
      // boolean feature that indicates whether the value is valid.
      // Otherwise, the training pipeline will discard the input row from
      // the training data.
      bool invalid_values_allowed = 2;
    }

    // Treats the column as a categorical array and performs the following
    // transformation functions.
    // *  For each element in the array, convert the category name to a
    //    dictionary lookup index and generate an embedding for each index.
    //    Combine the embedding of all elements into a single embedding using
    //    the mean.
    // *  Empty arrays are treated as an embedding of zeroes.
    message CategoricalArrayTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // Treats the column as a text array and performs the following
    // transformation functions.
    // *  Concatenate all text values in the array into a single text value
    //    using a space (" ") as a delimiter, and then treat the result as a
    //    single text value. Apply the transformations for Text columns.
    // *  Empty arrays are treated as an empty text.
    message TextArrayTransformation {
      // Name of the input column this transformation applies to.
      string column_name = 1;
    }

    // The transformation that the training pipeline will apply to the input
    // columns.
    oneof transformation_detail {
      // Let the training pipeline infer the transformation for the column.
      AutoTransformation auto = 1;

      // Treat the column as numeric.
      NumericTransformation numeric = 2;

      // Treat the column as categorical.
      CategoricalTransformation categorical = 3;

      // Treat the column as a timestamp.
      TimestampTransformation timestamp = 4;

      // Treat the column as text.
      TextTransformation text = 5;

      // Treat the column as a numerical array.
      NumericArrayTransformation repeated_numeric = 6;

      // Treat the column as a categorical array.
      CategoricalArrayTransformation repeated_categorical = 7;

      // Treat the column as a text array.
      TextArrayTransformation repeated_text = 8;
    }
  }

  // Additional optimization objective configuration. Required for
  // `maximize-precision-at-recall` and `maximize-recall-at-precision`,
  // otherwise unused.
  oneof additional_optimization_objective_config {
    // Required when optimization_objective is "maximize-precision-at-recall".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_recall_value = 5;

    // Required when optimization_objective is "maximize-recall-at-precision".
    // Must be between 0 and 1, inclusive.
    float optimization_objective_precision_value = 6;
  }

  // The type of prediction the Model is to produce.
  //   "classification" - Predict one out of multiple target values for
  //                      each row.
  //   "regression" - Predict a value based on its relation to other values.
  //                  This type is available only to columns that contain
  //                  semantically numeric values, i.e. integers or floating
  //                  point numbers, even if stored as e.g. strings.
  string prediction_type = 1;

  // The column name of the target column that the model is to predict.
  string target_column = 2;

  // Each transformation applies a transform function to the given input
  // column, and the result is used for training.
  // When creating a transformation for a BigQuery Struct column, the column
  // should be flattened using "." as the delimiter.
  repeated Transformation transformations = 3;

  // Objective function the model is optimizing towards. The training process
  // creates a model that maximizes/minimizes the value of the objective
  // function over the validation set.
  //
  // The supported optimization objectives depend on the prediction type.
  // If the field is not set, a default objective function is used.
  //
  // classification (binary):
  //   "maximize-au-roc" (default) - Maximize the area under the receiver
  //                                 operating characteristic (ROC) curve.
  //   "minimize-log-loss" - Minimize log loss.
  //   "maximize-au-prc" - Maximize the area under the precision-recall curve.
  //   "maximize-precision-at-recall" - Maximize precision for a specified
  //                                   recall value.
  //   "maximize-recall-at-precision" - Maximize recall for a specified
  //                                    precision value.
  //
  // classification (multi-class):
  //   "minimize-log-loss" (default) - Minimize log loss.
  //
  // regression:
  //   "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
  //   "minimize-mae" - Minimize mean-absolute error (MAE).
  //   "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
  string optimization_objective = 4;

  // Required. The train budget of creating this model, expressed in milli node
  // hours, i.e. a value of 1,000 in this field means 1 node hour.
  //
  // The training cost of the model will not exceed this budget. The pipeline
  // attempts to bring the final cost close to the budget, though it may end up
  // being noticeably smaller, at the backend's discretion. This especially may
  // happen when further model training ceases to provide any improvements.
  //
  // If the budget is set to a value known to be insufficient to train a
  // model for the given dataset, the training won't be attempted and
  // will error.
  //
  // The train budget must be between 1,000 and 72,000 milli node hours,
  // inclusive.
  int64 train_budget_milli_node_hours = 7;

  // Use the entire training budget. This disables the early stopping feature.
  // By default, the early stopping feature is enabled, which means that AutoML
  // Tables might stop training before the entire training budget has been used.
  bool disable_early_stopping = 8;

  // Column name that should be used as the weight column.
  // Higher values in this column give more importance to the row
  // during model training. The column must have numeric values between 0 and
  // 10000, inclusive; 0 means the row is ignored for training. If the weight
  // column field is not set, then all rows are assumed to have equal weight
  // of 1.
  string weight_column_name = 9;

  // Configuration for exporting test set predictions to a BigQuery table. If
  // this configuration is absent, then the export is not performed.
  ExportEvaluatedDataItemsConfig export_evaluated_data_items_config = 10;

  // Additional experiment flags for the Tables training pipeline.
  repeated string additional_experiments = 11;
}

// Model metadata specific to AutoML Tables.
message AutoMlTablesMetadata {
  // Output only. The actual training cost of the model, expressed in milli
  // node hours, i.e. a value of 1,000 in this field means 1 node hour.
  // Guaranteed not to exceed the train budget
  // (`AutoMlTablesInputs.train_budget_milli_node_hours`).
  int64 train_cost_milli_node_hours = 1;
}