// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.v2;

import "google/api/field_behavior.proto";
import "google/cloud/bigquery/v2/decimal_target_types.proto";
import "google/cloud/bigquery/v2/file_set_specification_type.proto";
import "google/cloud/bigquery/v2/hive_partitioning.proto";
import "google/cloud/bigquery/v2/json_extension.proto";
import "google/cloud/bigquery/v2/map_target_type.proto";
import "google/cloud/bigquery/v2/table_schema.proto";
import "google/protobuf/wrappers.proto";

// Code generation options: the Go import path and package name, and the Java
// package and outer class name used for the generated sources.
option go_package = "cloud.google.com/go/bigquery/apiv2/bigquerypb;bigquerypb";
option java_outer_classname = "ExternalDataConfigProto";
option java_package = "com.google.cloud.bigquery.v2";

// Options for external data sources in Avro format.
message AvroOptions {
  // Optional. If sourceFormat is set to "AVRO", indicates whether to interpret
  // logical types as the corresponding BigQuery data type (for example,
  // TIMESTAMP), instead of using the raw type (for example, INTEGER).
  //
  // The BoolValue wrapper gives this field explicit presence, so an unset
  // value can be distinguished from an explicit false.
  google.protobuf.BoolValue use_avro_logical_types = 1
      [(google.api.field_behavior) = OPTIONAL];
}

// Parquet options used when loading data and when making external tables.
message ParquetOptions {
  // Optional. Indicates whether to infer the Parquet ENUM logical type as
  // STRING instead of BYTES, which is the default.
  google.protobuf.BoolValue enum_as_string = 1
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Indicates whether to use schema inference specifically for the
  // Parquet LIST logical type.
  google.protobuf.BoolValue enable_list_inference = 2
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Indicates how to represent a Parquet map, if one is present.
  MapTargetType map_target_type = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Information related to a CSV data source.
message CsvOptions {
  // Optional. The separator character for fields in a CSV file. The separator
  // is interpreted as a single byte. For files encoded in ISO-8859-1, any
  // single character can be used as a separator. For files encoded in UTF-8,
  // characters represented in decimal range 1-127 (U+0001-U+007F) can be used
  // without any modification. UTF-8 characters encoded with multiple bytes
  // (i.e. U+0080 and above) will have only the first byte used for separating
  // fields. The remaining bytes will be treated as a part of the field.
  // BigQuery also supports the escape sequence "\t" (U+0009) to specify a tab
  // separator. The default value is comma (",", U+002C).
  string field_delimiter = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The number of rows at the top of a CSV file that BigQuery will
  // skip when reading the data. The default value is 0. This property is
  // useful if you have header rows in the file that should be skipped.
  // When autodetect is on, the behavior is the following:
  //
  // * skipLeadingRows unspecified - Autodetect tries to detect headers in the
  //   first row. If they are not detected, the row is read as data. Otherwise
  //   data is read starting from the second row.
  // * skipLeadingRows is 0 - Instructs autodetect that there are no headers and
  //   data should be read starting from the first row.
  // * skipLeadingRows = N > 0 - Autodetect skips N-1 rows and tries to detect
  //   headers in row N. If headers are not detected, row N is just skipped.
  //   Otherwise row N is used to extract column names for the detected schema.
  google.protobuf.Int64Value skip_leading_rows = 2
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The value that is used to quote data sections in a CSV file.
  // BigQuery converts the string to ISO-8859-1 encoding, and then uses the
  // first byte of the encoded string to split the data in its raw, binary
  // state.
  // The default value is a double-quote (").
  // If your data does not contain quoted sections, set the property value to
  // an empty string.
  // If your data contains quoted newline characters, you must also set the
  // allowQuotedNewlines property to true.
  // To include the specific quote character within a quoted value, precede it
  // with an additional matching quote character. For example, if you want to
  // escape the default character ' " ', use ' "" '.
  google.protobuf.StringValue quote = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Indicates if BigQuery should allow quoted data sections that
  // contain newline characters in a CSV file. The default value is false.
  google.protobuf.BoolValue allow_quoted_newlines = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Indicates if BigQuery should accept rows that are missing
  // trailing optional columns. If true, BigQuery treats missing trailing
  // columns as null values.
  // If false, records with missing trailing columns are treated as bad records,
  // and if there are too many bad records, an invalid error is returned in the
  // job result. The default value is false.
  google.protobuf.BoolValue allow_jagged_rows = 5
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The character encoding of the data.
  // The supported values are UTF-8, ISO-8859-1, UTF-16BE, UTF-16LE, UTF-32BE,
  // and UTF-32LE. The default value is UTF-8.
  // BigQuery decodes the data after the raw, binary data has been split using
  // the values of the quote and fieldDelimiter properties.
  string encoding = 6 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Indicates if the embedded ASCII control characters (the first 32
  // characters in the ASCII-table, from '\x00' to '\x1F') are preserved.
  google.protobuf.BoolValue preserve_ascii_control_characters = 7
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Specifies a string that represents a null value in a CSV file.
  // For example, if you specify "\N", BigQuery interprets "\N" as a null value
  // when querying a CSV file.
  // The default value is the empty string. If you set this property to a custom
  // value, BigQuery throws an error if an empty string is present for all data
  // types except for STRING and BYTES. For STRING and BYTES columns, BigQuery
  // interprets the empty string as an empty value.
  google.protobuf.StringValue null_marker = 8
      [(google.api.field_behavior) = OPTIONAL];
}

// JSON options used when loading data and when making external tables.
message JsonOptions {
  // Optional. The character encoding of the data.
  // The supported values are UTF-8, UTF-16BE, UTF-16LE, UTF-32BE,
  // and UTF-32LE. The default value is UTF-8.
  string encoding = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Information related to a Bigtable column.
message BigtableColumn {
  // [Required] Qualifier of the column.
  // Columns in the parent column family that have this exact qualifier are
  // exposed as the `<family field name>.<column field name>` field.
  // If the qualifier is a valid UTF-8 string, it can be specified in the
  // qualifier_string field. Otherwise, a base-64 encoded value must be set to
  // qualifier_encoded.
  // The column field name is the same as the column qualifier. However, if the
  // qualifier is not a valid BigQuery field identifier, i.e. does not match
  // [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as field_name.
  google.protobuf.BytesValue qualifier_encoded = 1;

  // Qualifier of the column as a string. Can be used instead of
  // qualifier_encoded when the qualifier is a valid UTF-8 string.
  google.protobuf.StringValue qualifier_string = 2;

  // Optional. If the qualifier is not a valid BigQuery field identifier, i.e.
  // does not match [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided
  // as the column field name and is used as field name in queries.
  string field_name = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The type to convert the value in cells of this column.
  // The values are expected to be encoded using HBase Bytes.toBytes function
  // when using the BINARY encoding value.
  // Following BigQuery types are allowed (case-sensitive):
  //
  // * BYTES
  // * STRING
  // * INTEGER
  // * FLOAT
  // * BOOLEAN
  // * JSON
  //
  // Default type is BYTES.
  // 'type' can also be set at the column family level. However, the setting at
  // this level takes precedence if 'type' is set at both levels.
  string type = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The encoding of the values when the type is not STRING.
  // Acceptable encoding values are:
  //
  // * TEXT - indicates values are alphanumeric text strings.
  // * BINARY - indicates values are encoded using HBase Bytes.toBytes family
  //   of functions.
  //
  // 'encoding' can also be set at the column family level. However, the setting
  // at this level takes precedence if 'encoding' is set at both levels.
  string encoding = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If this is set, only the latest version of the value in this
  // column is exposed.
  // 'onlyReadLatest' can also be set at the column family level. However, the
  // setting at this level takes precedence if 'onlyReadLatest' is set at both
  // levels.
  google.protobuf.BoolValue only_read_latest = 6
      [(google.api.field_behavior) = OPTIONAL];
}

// Information related to a Bigtable column family.
message BigtableColumnFamily {
  // Identifier of the column family.
  string family_id = 1;

  // Optional. The type to convert the value in cells of this column family.
  // The values are expected to be encoded using HBase Bytes.toBytes function
  // when using the BINARY encoding value.
  // Following BigQuery types are allowed (case-sensitive):
  //
  // * BYTES
  // * STRING
  // * INTEGER
  // * FLOAT
  // * BOOLEAN
  // * JSON
  //
  // Default type is BYTES.
  // This can be overridden for a specific column by listing that column in
  // 'columns' and specifying a type for it.
  string type = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The encoding of the values when the type is not STRING.
  // Acceptable encoding values are:
  //
  // * TEXT - indicates values are alphanumeric text strings.
  // * BINARY - indicates values are encoded using HBase Bytes.toBytes family
  //   of functions.
  //
  // This can be overridden for a specific column by listing that column in
  // 'columns' and specifying an encoding for it.
  string encoding = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. List of columns that should be exposed as individual fields as
  // opposed to a list of (column name, value) pairs.
  // All columns whose qualifier matches a qualifier in this list can be
  // accessed as `<family field name>.<column field name>`.
  // Other columns can be accessed as a list through
  // the `<family field name>.Column` field.
  repeated BigtableColumn columns = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. If this is set, only the latest version of the value is exposed
  // for all columns in this column family.
  // This can be overridden for a specific column by listing that column in
  // 'columns' and specifying a different setting for that column.
  google.protobuf.BoolValue only_read_latest = 5
      [(google.api.field_behavior) = OPTIONAL];
}

// Options specific to Google Cloud Bigtable data sources.
message BigtableOptions {
  // Optional. List of column families to expose in the table schema along with
  // their types.
  // This list restricts the column families that can be referenced in queries
  // and specifies their value types.
  // You can use this list to do type conversions - see the 'type' field of
  // BigtableColumnFamily for more details.
  // If you leave this list empty, all column families are present in the table
  // schema and their values are read as BYTES.
  // During a query only the column families referenced in that query are read
  // from Bigtable.
  repeated BigtableColumnFamily column_families = 1
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If true, the column families that are not specified in the
  // columnFamilies list are not exposed in the table schema.
  // Otherwise, they are read with BYTES type values.
  // The default value is false.
  google.protobuf.BoolValue ignore_unspecified_column_families = 2
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If true, the rowkey column families will be read and converted
  // to string. Otherwise they are read with BYTES type values and users need
  // to manually cast them with CAST if necessary.
  // The default value is false.
  google.protobuf.BoolValue read_rowkey_as_string = 3
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. If true, each column family will be read as a single JSON
  // column. Otherwise they are read as a repeated cell structure containing
  // timestamp/value tuples. The default value is false.
  google.protobuf.BoolValue output_column_families_as_json = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// Options specific to Google Sheets data sources.
message GoogleSheetsOptions {
  // Optional. The number of rows at the top of a sheet that BigQuery will skip
  // when reading the data. The default value is 0. This property is useful if
  // you have header rows that should be skipped. When autodetect is on,
  // the behavior is the following:
  //
  // * skipLeadingRows unspecified - Autodetect tries to detect headers in the
  //   first row. If they are not detected, the row is read as data. Otherwise
  //   data is read starting from the second row.
  // * skipLeadingRows is 0 - Instructs autodetect that there are no headers and
  //   data should be read starting from the first row.
  // * skipLeadingRows = N > 0 - Autodetect skips N-1 rows and tries to detect
  //   headers in row N. If headers are not detected, row N is just skipped.
  //   Otherwise row N is used to extract column names for the detected schema.
  google.protobuf.Int64Value skip_leading_rows = 1
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Range of a sheet to query from. Only used when non-empty.
  // Typical format: sheet_name!top_left_cell_id:bottom_right_cell_id
  // For example: sheet1!A1:B20
  string range = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Configuration for reading data from an external data source: where the data
// lives (source_uris), what format it is in (source_format), and the
// format-specific options that apply to it.
message ExternalDataConfiguration {
  // Supported Object Metadata Types.
  enum ObjectMetadata {
    // Unspecified by default.
    OBJECT_METADATA_UNSPECIFIED = 0;

    // A synonym for `SIMPLE`.
    DIRECTORY = 1;

    // Directory listing of objects.
    SIMPLE = 2;
  }

  // MetadataCacheMode identifies if the table should use metadata caching for
  // files from an external source (e.g. Google Cloud Storage).
  enum MetadataCacheMode {
    // Unspecified metadata cache mode.
    METADATA_CACHE_MODE_UNSPECIFIED = 0;

    // Set this mode to trigger automatic background refresh of metadata cache
    // from the external source. Queries will use the latest available cache
    // version within the table's maxStaleness interval.
    AUTOMATIC = 1;

    // Set this mode to enable triggering manual refresh of the metadata cache
    // from external source. Queries will use the latest manually triggered
    // cache version within the table's maxStaleness interval.
    MANUAL = 2;
  }

  // [Required] The fully-qualified URIs that point to your data in Google
  // Cloud.
  //
  // * For Google Cloud Storage URIs: Each URI can contain one '*' wildcard
  //   character and it must come after the 'bucket' name. Size limits related
  //   to load jobs apply to external data sources.
  // * For Google Cloud Bigtable URIs: Exactly one URI can be specified and it
  //   has to be a fully specified and valid HTTPS URL for a Google Cloud
  //   Bigtable table.
  // * For Google Cloud Datastore backups: Exactly one URI can be specified.
  //   Also, the '*' wildcard character is not allowed.
  repeated string source_uris = 1;

  // Optional. Specifies how source URIs are interpreted for constructing the
  // file set to load. By default source URIs are expanded against the
  // underlying storage. Other options include specifying manifest files. Only
  // applicable to object storage systems.
  FileSetSpecType file_set_spec_type = 25
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The schema for the data.
  // Schema is required for CSV and JSON formats if autodetect is not on.
  // Schema is disallowed for Google Cloud Bigtable, Cloud Datastore backups,
  // Avro, ORC and Parquet formats.
  TableSchema schema = 2 [(google.api.field_behavior) = OPTIONAL];

  // [Required] The data format.
  //
  // * For CSV files, specify "CSV".
  // * For Google sheets, specify "GOOGLE_SHEETS".
  // * For newline-delimited JSON, specify "NEWLINE_DELIMITED_JSON".
  // * For Avro files, specify "AVRO".
  // * For Google Cloud Datastore backups, specify "DATASTORE_BACKUP".
  // * For Apache Iceberg tables, specify "ICEBERG".
  // * For ORC files, specify "ORC".
  // * For Parquet files, specify "PARQUET".
  // * [Beta] For Google Cloud Bigtable, specify "BIGTABLE".
  string source_format = 3;

  // Optional. The maximum number of bad records that BigQuery can ignore when
  // reading data. If the number of bad records exceeds this value, an invalid
  // error is returned in the job result. The default value is 0, which requires
  // that all records are valid. This setting is ignored for Google Cloud
  // Bigtable, Google Cloud Datastore backups, Avro, ORC and Parquet formats.
  google.protobuf.Int32Value max_bad_records = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Try to detect schema and format options automatically.
  // Any option specified explicitly will be honored.
  google.protobuf.BoolValue autodetect = 5;

  // Optional. Indicates if BigQuery should allow extra values that are not
  // represented in the table schema.
  // If true, the extra values are ignored.
  // If false, records with extra columns are treated as bad records, and if
  // there are too many bad records, an invalid error is returned in the job
  // result.
  // The default value is false.
  // The sourceFormat property determines what BigQuery treats as an extra
  // value:
  //
  // * CSV: Trailing columns
  // * JSON: Named values that don't match any column names
  // * Google Cloud Bigtable: This setting is ignored.
  // * Google Cloud Datastore backups: This setting is ignored.
  // * Avro: This setting is ignored.
  // * ORC: This setting is ignored.
  // * Parquet: This setting is ignored.
  google.protobuf.BoolValue ignore_unknown_values = 6
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The compression type of the data source.
  // Possible values include GZIP and NONE. The default value is NONE.
  // This setting is ignored for Google Cloud Bigtable, Google Cloud Datastore
  // backups, Avro, ORC and Parquet formats. An empty string is an invalid
  // value.
  string compression = 7 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Additional properties to set if sourceFormat is set to CSV.
  CsvOptions csv_options = 8 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Additional properties to set if sourceFormat is set to JSON.
  JsonOptions json_options = 26 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Additional options if sourceFormat is set to BIGTABLE.
  BigtableOptions bigtable_options = 9 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Additional options if sourceFormat is set to GOOGLE_SHEETS.
  GoogleSheetsOptions google_sheets_options = 10
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. When set, configures hive partitioning support. Not all storage
  // formats support hive partitioning -- requesting hive partitioning on an
  // unsupported format will lead to an error, as will providing an invalid
  // specification.
  HivePartitioningOptions hive_partitioning_options = 13
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The connection specifying the credentials to be used to read
  // external storage, such as Azure Blob, Cloud Storage, or S3. The
  // connection_id can have the form
  // `{project_id}.{location_id};{connection_id}` or
  // `projects/{project_id}/locations/{location_id}/connections/{connection_id}`.
  string connection_id = 14 [(google.api.field_behavior) = OPTIONAL];

  // Defines the list of possible SQL data types to which the source decimal
  // values are converted. This list and the precision and the scale parameters
  // of the decimal field determine the target type. In the order of NUMERIC,
  // BIGNUMERIC, and STRING, a type is picked if it is in the specified list
  // and if it supports the precision and the scale. STRING supports all
  // precision and scale values.
  // If none of the listed types supports the precision and the scale, the type
  // supporting the widest range in the specified list is picked, and if a value
  // exceeds the supported range when reading the data, an error will be thrown.
  //
  // Example: Suppose the value of this field is ["NUMERIC", "BIGNUMERIC"].
  // If (precision,scale) is:
  //
  // * (38,9) -> NUMERIC;
  // * (39,9) -> BIGNUMERIC (NUMERIC cannot hold 30 integer digits);
  // * (38,10) -> BIGNUMERIC (NUMERIC cannot hold 10 fractional digits);
  // * (76,38) -> BIGNUMERIC;
  // * (77,38) -> BIGNUMERIC (error if value exceeds supported range).
  //
  // This field cannot contain duplicate types. The order of the types in this
  // field is ignored. For example, ["BIGNUMERIC", "NUMERIC"] is the same as
  // ["NUMERIC", "BIGNUMERIC"] and NUMERIC always takes precedence over
  // BIGNUMERIC.
  //
  // Defaults to ["NUMERIC", "STRING"] for ORC and ["NUMERIC"] for the other
  // file formats.
  repeated DecimalTargetType decimal_target_types = 16;

  // Optional. Additional properties to set if sourceFormat is set to AVRO.
  AvroOptions avro_options = 17 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Load option to be used together with source_format
  // newline-delimited JSON to indicate that a variant of JSON is being loaded.
  // To load newline-delimited GeoJSON, specify GEOJSON (and source_format must
  // be set to NEWLINE_DELIMITED_JSON).
  JsonExtension json_extension = 18 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Additional properties to set if sourceFormat is set to PARQUET.
  ParquetOptions parquet_options = 19 [(google.api.field_behavior) = OPTIONAL];

  // Optional. ObjectMetadata is used to create Object Tables. Object Tables
  // contain a listing of objects (with their metadata) found at the
  // source_uris. If ObjectMetadata is set, source_format should be omitted.
  //
  // Currently SIMPLE is the only supported Object Metadata type.
  optional ObjectMetadata object_metadata = 22
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. When creating an external table, the user can provide a reference
  // file with the table schema. This is enabled for the following formats:
  // AVRO, PARQUET, ORC.
  google.protobuf.StringValue reference_file_schema_uri = 23
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. Metadata Cache Mode for the table. Set this to enable caching of
  // metadata from external data source.
  MetadataCacheMode metadata_cache_mode = 24
      [(google.api.field_behavior) = OPTIONAL];
}