// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/dataplex/v1/processing.proto";

option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
option java_multiple_files = true;
option java_outer_classname = "DataQualityProto";
option java_package = "com.google.cloud.dataplex.v1";
option (google.api.resource_definition) = {
  type: "bigquery.googleapis.com/Table"
  pattern: "projects/{project}/datasets/{dataset}/tables/{table}"
};

// DataQualityScan related setting.
message DataQualitySpec {
  // The configuration of post scan actions of DataQualityScan.
  message PostScanActions {
    // The configuration of BigQuery export post scan action.
    message BigQueryExport {
      // Optional. The BigQuery table to export DataQualityScan results to.
      // Format:
      // //bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID/tables/TABLE_ID
      string results_table = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // The individuals or groups who are designated to receive notifications
    // upon triggers.
    message Recipients {
      // Optional. The email recipients who will receive the DataQualityScan
      // results report.
      repeated string emails = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // This trigger is triggered when the DQ score in the job result is less
    // than a specified input score.
    message ScoreThresholdTrigger {
      // Optional. The score range is in [0,100].
      float score_threshold = 2 [(google.api.field_behavior) = OPTIONAL];
    }

    // This trigger is triggered when the scan job itself fails, regardless of
    // the result.
    message JobFailureTrigger {}

    // This trigger is triggered whenever a scan job run ends, regardless
    // of the result.
    message JobEndTrigger {}

    // The configuration of notification report post scan action.
    message NotificationReport {
      // Required. The recipients who will receive the notification report.
      Recipients recipients = 1 [(google.api.field_behavior) = REQUIRED];

      // Optional. If set, report will be sent when score threshold is met.
      ScoreThresholdTrigger score_threshold_trigger = 2
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. If set, report will be sent when a scan job fails.
      JobFailureTrigger job_failure_trigger = 4
          [(google.api.field_behavior) = OPTIONAL];

      // Optional. If set, report will be sent when a scan job ends.
      JobEndTrigger job_end_trigger = 5
          [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. If set, results will be exported to the provided BigQuery
    // table.
    BigQueryExport bigquery_export = 1
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. If set, results will be sent to the provided notification
    // recipients upon triggers.
    NotificationReport notification_report = 2
        [(google.api.field_behavior) = OPTIONAL];
  }

  // Required. The list of rules to evaluate against a data source. At least one
  // rule is required.
  repeated DataQualityRule rules = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. The percentage of the records to be selected from the dataset for
  // DataScan.
  //
  // * Value can range between 0.0 and 100.0 with up to 3 significant decimal
  // digits.
  // * Sampling is not applied if `sampling_percent` is not specified, 0 or
  // 100.
  float sampling_percent = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A filter applied to all rows in a single DataScan job.
  // The filter needs to be a valid SQL expression for a WHERE clause in
  // BigQuery standard SQL syntax.
  // Example: col1 >= 0 AND col2 < 10
  string row_filter = 5 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Actions to take upon job completion.
  PostScanActions post_scan_actions = 6
      [(google.api.field_behavior) = OPTIONAL];
}

// The output of a DataQualityScan.
message DataQualityResult {
  // The result of post scan actions of DataQualityScan job.
  message PostScanActionsResult {
    // The result of BigQuery export post scan action.
    message BigQueryExportResult {
      // Execution state for the exporting.
      enum State {
        // The exporting state is unspecified.
        STATE_UNSPECIFIED = 0;

        // The exporting completed successfully.
        SUCCEEDED = 1;

        // The exporting is no longer running due to an error.
        FAILED = 2;

        // The exporting is skipped due to no valid scan result to export
        // (usually caused by scan failed).
        SKIPPED = 3;
      }

      // Output only. Execution state for the BigQuery exporting.
      State state = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

      // Output only. Additional information about the BigQuery exporting.
      string message = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
    }

    // Output only. The result of BigQuery export post scan action.
    BigQueryExportResult bigquery_export_result = 1
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // Overall data quality result -- `true` if all rules passed.
  bool passed = 5;

  // Output only. The overall data quality score.
  //
  // The score ranges between [0, 100] (up to two decimal points).
  optional float score = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // A list of results at the dimension level.
  //
  // A dimension will have a corresponding `DataQualityDimensionResult` if and
  // only if there is at least one rule with the 'dimension' field set to it.
  repeated DataQualityDimensionResult dimensions = 2;

  // Output only. A list of results at the column level.
  //
  // A column will have a corresponding `DataQualityColumnResult` if and only if
  // there is at least one rule with the 'column' field set to it.
  repeated DataQualityColumnResult columns = 10
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // A list of all the rules in a job, and their results.
  repeated DataQualityRuleResult rules = 3;

  // The count of rows processed.
  int64 row_count = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 7;

  // Output only. The result of post scan actions.
  PostScanActionsResult post_scan_actions_result = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// DataQualityRuleResult provides a more detailed, per-rule view of the results.
message DataQualityRuleResult {
  // The rule specified in the DataQualitySpec, as is.
  DataQualityRule rule = 1;

  // Whether the rule passed or failed.
  bool passed = 7;

  // The number of rows a rule was evaluated against.
  //
  // This field is only valid for row-level type rules.
  //
  // Evaluated count can be configured to either
  //
  // * include all rows (default) - with `null` rows automatically failing rule
  // evaluation, or
  // * exclude `null` rows from the `evaluated_count`, by setting
  // `ignore_null = true`.
  int64 evaluated_count = 9;

  // The number of rows which passed a rule evaluation.
  //
  // This field is only valid for row-level type rules.
  int64 passed_count = 8;

  // The number of rows with null values in the specified column.
  int64 null_count = 5;

  // The ratio of **passed_count / evaluated_count**.
  //
  // This field is only valid for row-level type rules.
  double pass_ratio = 6;

  // The query to find rows that did not pass this rule.
  //
  // This field is only valid for row-level type rules.
  string failing_rows_query = 10;

  // Output only. The number of rows returned by the sql statement in the
  // SqlAssertion rule.
  //
  // This field is only valid for SqlAssertion rules.
  int64 assertion_row_count = 11 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// DataQualityDimensionResult provides a more detailed, per-dimension view of
// the results.
message DataQualityDimensionResult {
  // Output only. The dimension config specified in the DataQualitySpec, as is.
  DataQualityDimension dimension = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Whether the dimension passed or failed.
  bool passed = 3;

  // Output only. The dimension-level data quality score for this data scan job
  // if and only if the 'dimension' field is set.
  //
  // The score ranges between [0, 100] (up to two decimal
  // points).
  optional float score = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A dimension captures data quality intent about a defined subset of the rules
// specified.
message DataQualityDimension {
  // The dimension name a rule belongs to. Supported dimensions are
  // ["COMPLETENESS", "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS",
  // "INTEGRITY"]
  string name = 1;
}

// A rule captures data quality intent about a data source.
message DataQualityRule {
  // Evaluates whether each column value lies between a specified range.
  message RangeExpectation {
    // Optional. The minimum column value allowed for a row to pass this
    // validation. At least one of `min_value` and `max_value` needs to be
    // provided.
    string min_value = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The maximum column value allowed for a row to pass this
    // validation. At least one of `min_value` and `max_value` needs to be
    // provided.
    string max_value = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly greater than ('>') the
    // minimum, or if equality is allowed.
    //
    // Only relevant if a `min_value` has been defined. Default = false.
    bool strict_min_enabled = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly less than ('<') the
    // maximum, or if equality is allowed.
    //
    // Only relevant if a `max_value` has been defined. Default = false.
    bool strict_max_enabled = 4 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each column value is null.
  message NonNullExpectation {}

  // Evaluates whether each column value is contained by a specified set.
  message SetExpectation {
    // Optional. Expected values for the column value.
    repeated string values = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each column value matches a specified regex.
  message RegexExpectation {
    // Optional. A regular expression the column value is expected to match.
    string regex = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether the column has duplicates.
  message UniquenessExpectation {}

  // Evaluates whether the column aggregate statistic lies between a specified
  // range.
  message StatisticRangeExpectation {
    // The list of aggregate metrics a rule can be evaluated against.
    enum ColumnStatistic {
      // Unspecified statistic type
      STATISTIC_UNDEFINED = 0;

      // Evaluate the column mean
      MEAN = 1;

      // Evaluate the column min
      MIN = 2;

      // Evaluate the column max
      MAX = 3;
    }

    // Optional. The aggregate metric to evaluate.
    ColumnStatistic statistic = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The minimum column statistic value allowed for a row to pass
    // this validation.
    //
    // At least one of `min_value` and `max_value` needs to be provided.
    string min_value = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The maximum column statistic value allowed for a row to pass
    // this validation.
    //
    // At least one of `min_value` and `max_value` needs to be provided.
    string max_value = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether column statistic needs to be strictly greater than
    // ('>') the minimum, or if equality is allowed.
    //
    // Only relevant if a `min_value` has been defined. Default = false.
    bool strict_min_enabled = 4 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether column statistic needs to be strictly less than ('<')
    // the maximum, or if equality is allowed.
    //
    // Only relevant if a `max_value` has been defined. Default = false.
    bool strict_max_enabled = 5 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each row passes the specified condition.
  //
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a boolean value per row as the result.
  //
  // Example: col1 >= 0 AND col2 < 10
  message RowConditionExpectation {
    // Optional. The SQL expression.
    string sql_expression = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether the provided expression is true.
  //
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a scalar boolean result.
  //
  // Example: MIN(col1) >= 0
  message TableConditionExpectation {
    // Optional. The SQL expression.
    string sql_expression = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // Queries for rows returned by the provided SQL statement. If any rows are
  // returned, this rule fails.
  //
  // The SQL statement needs to use BigQuery standard SQL syntax, and must not
  // contain any semicolons.
  //
  // ${data()} can be used to reference the rows being evaluated, i.e. the table
  // after all additional filters (row filters, incremental data filters,
  // sampling) are applied.
  //
  // Example: SELECT * FROM ${data()} WHERE price < 0
  message SqlAssertion {
    // Optional. The SQL statement.
    string sql_statement = 1 [(google.api.field_behavior) = OPTIONAL];
  }

  // The rule-specific configuration.
  oneof rule_type {
    // Row-level rule which evaluates whether each column value lies between a
    // specified range.
    RangeExpectation range_expectation = 1;

    // Row-level rule which evaluates whether each column value is null.
    NonNullExpectation non_null_expectation = 2;

    // Row-level rule which evaluates whether each column value is contained by
    // a specified set.
    SetExpectation set_expectation = 3;

    // Row-level rule which evaluates whether each column value matches a
    // specified regex.
    RegexExpectation regex_expectation = 4;

    // Row-level rule which evaluates whether each column value is unique.
    UniquenessExpectation uniqueness_expectation = 100;

    // Aggregate rule which evaluates whether the column aggregate
    // statistic lies between a specified range.
    StatisticRangeExpectation statistic_range_expectation = 101;

    // Row-level rule which evaluates whether each row in a table passes the
    // specified condition.
    RowConditionExpectation row_condition_expectation = 200;

    // Aggregate rule which evaluates whether the provided expression is true
    // for a table.
    TableConditionExpectation table_condition_expectation = 201;

    // Aggregate rule which evaluates the number of rows returned for the
    // provided statement.
    SqlAssertion sql_assertion = 202;
  }

  // Optional. The unnested column which this rule is evaluated against.
  string column = 500 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Rows with `null` values will automatically fail a rule, unless
  // `ignore_null` is `true`. In that case, such `null` rows are trivially
  // considered passing.
  //
  // This field is only valid for the following type of rules:
  //
  // * RangeExpectation
  // * RegexExpectation
  // * SetExpectation
  // * UniquenessExpectation
  bool ignore_null = 501 [(google.api.field_behavior) = OPTIONAL];

  // Required. The dimension a rule belongs to. Results are also aggregated at
  // the dimension level. Supported dimensions are **["COMPLETENESS",
  // "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS", "INTEGRITY"]**
  string dimension = 502 [(google.api.field_behavior) = REQUIRED];

  // Optional. The minimum ratio of **passing_rows / total_rows** required to
  // pass this rule, with a range of [0.0, 1.0].
  //
  // 0 indicates default value (i.e. 1.0).
  //
  // This field is only valid for row-level type rules.
  double threshold = 503 [(google.api.field_behavior) = OPTIONAL];

  // Optional. A mutable name for the rule.
  //
  // * The name must contain only letters (a-z, A-Z), numbers (0-9), or
  // hyphens (-).
  // * The maximum length is 63 characters.
  // * Must start with a letter.
  // * Must end with a number or a letter.
  string name = 504 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Description of the rule.
  //
  // * The maximum length is 1,024 characters.
  string description = 505 [(google.api.field_behavior) = OPTIONAL];
}

// DataQualityColumnResult provides a more detailed, per-column view of
// the results.
message DataQualityColumnResult {
  // Output only. The column specified in the DataQualityRule.
  string column = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The column-level data quality score for this data scan job if
  // and only if the 'column' field is set.
  //
  // The score ranges between [0, 100] (up to two decimal
  // points).
  optional float score = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}