// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/cloud/dataplex/v1/processing.proto";

option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
option java_multiple_files = true;
option java_outer_classname = "DataProfileProto";
option java_package = "com.google.cloud.dataplex.v1";

// DataProfileScan related setting.
message DataProfileSpec {
  // The configuration of post scan actions of DataProfileScan job.
  message PostScanActions {
    // The configuration of BigQuery export post scan action.
    message BigQueryExport {
      // Optional. The BigQuery table to export DataProfileScan results to.
      // Format:
      // //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
      string results_table = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. If set, results will be exported to the provided BigQuery
    // table.
    BigQueryExport bigquery_export = 1
        [(google.api.field_behavior) = OPTIONAL];
  }

  // The specification for fields to include or exclude in data profile scan.
  message SelectedFields {
    // Optional. Expected input is a list of fully qualified names of fields as
    // in the schema.
    //
    // Only top-level field names for nested fields are supported.
    // For instance, if 'x' is of nested field type, listing 'x' is supported
    // but 'x.y.z' is not supported. Here 'y' and 'y.z' are nested fields of
    // 'x'.
    repeated string field_names = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Optional. The percentage of the records to be selected from the dataset
  // for DataScan.
  //
  // * Value can range between 0.0 and 100.0 with up to 3 significant decimal
  // digits.
  // * Sampling is not applied if `sampling_percent` is not specified, 0 or
  // 100.
  float sampling_percent = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A filter applied to all rows in a single DataScan job.
  // The filter needs to be a valid SQL expression for a WHERE clause in
  // BigQuery standard SQL syntax.
  // Example: col1 >= 0 AND col2 < 10
  string row_filter = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Actions to take upon job completion.
  PostScanActions post_scan_actions = 4
      [(google.api.field_behavior) = OPTIONAL];

  // Optional. The fields to include in data profile.
  //
  // If not specified, all fields at the time of profile scan job execution are
  // included, except for ones listed in `exclude_fields`.
  SelectedFields include_fields = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. The fields to exclude from data profile.
  //
  // If specified, the fields will be excluded from data profile, regardless of
  // `include_fields` value.
  SelectedFields exclude_fields = 6 [(google.api.field_behavior) = OPTIONAL];
}

// DataProfileResult defines the output of DataProfileScan. Each field of the
// table will have field type specific profile result.
message DataProfileResult {
  // Contains name, type, mode and field type specific profile information.
  message Profile {
    // A field within a table.
    message Field {
      // The profile information for each field type.
      message ProfileInfo {
        // The profile information for a string type field.
        message StringFieldInfo {
          // Minimum length of non-null values in the scanned data.
          int64 min_length = 1;

          // Maximum length of non-null values in the scanned data.
          int64 max_length = 2;

          // Average length of non-null values in the scanned data.
          double average_length = 3;
        }

        // The profile information for an integer type field.
        message IntegerFieldInfo {
          // Average of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double average = 1;

          // Standard deviation of non-null values in the scanned data. NaN, if
          // the field has a NaN.
          double standard_deviation = 3;

          // Minimum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          int64 min = 4;

          // A quartile divides the number of data points into four parts, or
          // quarters, of more-or-less equal size. Three main quartiles used
          // are: The first quartile (Q1) splits off the lowest 25% of data from
          // the highest 75%. It is also known as the lower or 25th empirical
          // quartile, as 25% of the data is below this point. The second
          // quartile (Q2) is the median of a data set. So, 50% of the data lies
          // below this point. The third quartile (Q3) splits off the highest
          // 25% of data from the lowest 75%. It is known as the upper or 75th
          // empirical quartile, as 75% of the data lies below this point.
          // Here, the quartiles are provided as an ordered list of approximate
          // quartile values for the scanned data, occurring in order Q1,
          // median, Q3.
          repeated int64 quartiles = 6;

          // Maximum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          int64 max = 5;
        }

        // The profile information for a double type field.
        message DoubleFieldInfo {
          // Average of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double average = 1;

          // Standard deviation of non-null values in the scanned data. NaN, if
          // the field has a NaN.
          double standard_deviation = 3;

          // Minimum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double min = 4;

          // A quartile divides the number of data points into four parts, or
          // quarters, of more-or-less equal size. Three main quartiles used
          // are: The first quartile (Q1) splits off the lowest 25% of data from
          // the highest 75%. It is also known as the lower or 25th empirical
          // quartile, as 25% of the data is below this point. The second
          // quartile (Q2) is the median of a data set. So, 50% of the data lies
          // below this point. The third quartile (Q3) splits off the highest
          // 25% of data from the lowest 75%. It is known as the upper or 75th
          // empirical quartile, as 75% of the data lies below this point.
          // Here, the quartiles are provided as an ordered list of quartile
          // values for the scanned data, occurring in order Q1, median, Q3.
          repeated double quartiles = 6;

          // Maximum of non-null values in the scanned data. NaN, if the field
          // has a NaN.
          double max = 5;
        }

        // Top N non-null values in the scanned data.
        message TopNValue {
          // String value of a top N non-null value.
          string value = 1;

          // Count of the corresponding value in the scanned data.
          int64 count = 2;

          // Ratio of the corresponding value in the field against the total
          // number of rows in the scanned data.
          double ratio = 3;
        }

        // Ratio of rows with null value against total scanned rows.
        double null_ratio = 2;

        // Ratio of rows with distinct values against total scanned rows.
        // Not available for complex non-groupable field type RECORD and fields
        // with REPEATABLE mode.
        double distinct_ratio = 3;

        // The list of top N non-null values, frequency and ratio with which
        // they occur in the scanned data. N is 10 or equal to the number of
        // distinct values in the field, whichever is smaller. Not available for
        // complex non-groupable field type RECORD and fields with REPEATABLE
        // mode.
        repeated TopNValue top_n_values = 4;

        // Structural and profile information for specific field type. Not
        // available, if mode is REPEATABLE.
        oneof field_info {
          // String type field information.
          StringFieldInfo string_profile = 101;

          // Integer type field information.
          IntegerFieldInfo integer_profile = 102;

          // Double type field information.
          DoubleFieldInfo double_profile = 103;
        }
      }

      // The name of the field.
      string name = 1;

      // The data type retrieved from the schema of the data source. For
      // instance, for a BigQuery native table, it is the [BigQuery Table
      // Schema](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablefieldschema).
      // For a Dataplex Entity, it is the [Entity
      // Schema](https://cloud.google.com/dataplex/docs/reference/rpc/google.cloud.dataplex.v1#type_3).
      string type = 2;

      // The mode of the field. Possible values include:
      //
      // * REQUIRED, if it is a required field.
      // * NULLABLE, if it is an optional field.
      // * REPEATED, if it is a repeated field.
      string mode = 3;

      // Profile information for the corresponding field.
      ProfileInfo profile = 4;
    }

    // List of fields with structural and profile information for each field.
    repeated Field fields = 2;
  }

  // The result of post scan actions of DataProfileScan job.
  message PostScanActionsResult {
    // The result of BigQuery export post scan action.
    message BigQueryExportResult {
      // Execution state for the exporting.
      enum State {
        // The exporting state is unspecified.
        STATE_UNSPECIFIED = 0;

        // The exporting completed successfully.
        SUCCEEDED = 1;

        // The exporting is no longer running due to an error.
        FAILED = 2;

        // The exporting is skipped due to no valid scan result to export
        // (usually caused by scan failed).
        SKIPPED = 3;
      }

      // Output only. Execution state for the BigQuery exporting.
      State state = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

      // Output only. Additional information about the BigQuery exporting.
      string message = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
    }

    // Output only. The result of BigQuery export post scan action.
    BigQueryExportResult bigquery_export_result = 1
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // The count of rows scanned.
  int64 row_count = 3;

  // The profile information per field.
  Profile profile = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 5;

  // Output only. The result of post scan actions.
  PostScanActionsResult post_scan_actions_result = 6
      [(google.api.field_behavior) = OUTPUT_ONLY];
}