// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/cloud/dataplex/v1/processing.proto";

option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
option java_multiple_files = true;
option java_outer_classname = "DataQualityProto";
option java_package = "com.google.cloud.dataplex.v1";

// DataQualityScan related setting.
message DataQualitySpec {
  // The list of rules to evaluate against a data source. At least one rule is
  // required.
  repeated DataQualityRule rules = 1;
}

// The output of a DataQualityScan.
message DataQualityResult {
  // Overall data quality result -- `true` if all rules passed.
  bool passed = 5;

  // A list of results at the dimension level.
  repeated DataQualityDimensionResult dimensions = 2;

  // A list of all the rules in a job, and their results.
  repeated DataQualityRuleResult rules = 3;

  // The count of rows processed.
  int64 row_count = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 7;
}

// DataQualityRuleResult provides a more detailed, per-rule view of the results.
message DataQualityRuleResult {
  // The rule specified in the DataQualitySpec, as is.
  DataQualityRule rule = 1;

  // Whether the rule passed or failed.
  bool passed = 7;

  // The number of rows a rule was evaluated against. This field is only valid
  // for ColumnMap type rules.
  //
  // Evaluated count can be configured to either
  //
  // * include all rows (default) - with `null` rows automatically failing rule
  // evaluation, or
  // * exclude `null` rows from the `evaluated_count`, by setting
  // `ignore_null = true` on the rule.
  int64 evaluated_count = 9;

  // The number of rows which passed a rule evaluation.
  // This field is only valid for ColumnMap type rules.
  int64 passed_count = 8;

  // The number of rows with null values in the specified column.
  int64 null_count = 5;

  // The ratio of **passed_count / evaluated_count**.
  // This field is only valid for ColumnMap type rules.
  double pass_ratio = 6;

  // The query to find rows that did not pass this rule.
  // Only applies to ColumnMap and RowCondition rules.
  string failing_rows_query = 10;
}

// DataQualityDimensionResult provides a more detailed, per-dimension view of
// the results.
message DataQualityDimensionResult {
  // Whether the dimension passed or failed.
  bool passed = 3;
}

// A rule captures data quality intent about a data source.
message DataQualityRule {
  // Evaluates whether each column value lies between a specified range.
  message RangeExpectation {
    // Optional. The minimum column value allowed for a row to pass this
    // validation. At least one of `min_value` and `max_value` needs to be
    // provided.
    string min_value = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The maximum column value allowed for a row to pass this
    // validation. At least one of `min_value` and `max_value` needs to be
    // provided.
    string max_value = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly greater than ('>') the
    // minimum, or if equality is allowed.
    //
    // Only relevant if a `min_value` has been defined. Default = false.
    bool strict_min_enabled = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly less than ('<') the
    // maximum, or if equality is allowed.
    //
    // Only relevant if a `max_value` has been defined. Default = false.
    bool strict_max_enabled = 4 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each column value is null.
  message NonNullExpectation {}

  // Evaluates whether each column value is contained by a specified set.
  message SetExpectation {
    // Expected values for the column value.
    repeated string values = 1;
  }

  // Evaluates whether each column value matches a specified regex.
  message RegexExpectation {
    // A regular expression the column value is expected to match.
    string regex = 1;
  }

  // Evaluates whether the column has duplicates.
  message UniquenessExpectation {}

  // Evaluates whether the column aggregate statistic lies between a specified
  // range.
  message StatisticRangeExpectation {
    // The list of aggregate metrics a rule can be evaluated against.
    enum ColumnStatistic {
      // Unspecified statistic type
      STATISTIC_UNDEFINED = 0;

      // Evaluate the column mean
      MEAN = 1;

      // Evaluate the column min
      MIN = 2;

      // Evaluate the column max
      MAX = 3;
    }

    // The aggregate metric to evaluate.
    ColumnStatistic statistic = 1;

    // The minimum column statistic value allowed for a row to pass this
    // validation.
    //
    // At least one of `min_value` and `max_value` needs to be provided.
    string min_value = 2;

    // The maximum column statistic value allowed for a row to pass this
    // validation.
    //
    // At least one of `min_value` and `max_value` needs to be provided.
    string max_value = 3;

    // Whether column statistic needs to be strictly greater than ('>')
    // the minimum, or if equality is allowed.
    //
    // Only relevant if a `min_value` has been defined. Default = false.
    bool strict_min_enabled = 4;

    // Whether column statistic needs to be strictly less than ('<') the
    // maximum, or if equality is allowed.
    //
    // Only relevant if a `max_value` has been defined. Default = false.
    bool strict_max_enabled = 5;
  }

  // Evaluates whether each row passes the specified condition.
  //
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a boolean value per row as the result.
  //
  // Example: col1 >= 0 AND col2 < 10
  message RowConditionExpectation {
    // The SQL expression.
    string sql_expression = 1;
  }

  // Evaluates whether the provided expression is true.
  //
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a scalar boolean result.
  //
  // Example: MIN(col1) >= 0
  message TableConditionExpectation {
    // The SQL expression.
    string sql_expression = 1;
  }

  // The rule-specific configuration.
  oneof rule_type {
    // ColumnMap rule which evaluates whether each column value lies between a
    // specified range.
    RangeExpectation range_expectation = 1;

    // ColumnMap rule which evaluates whether each column value is null.
    NonNullExpectation non_null_expectation = 2;

    // ColumnMap rule which evaluates whether each column value is contained by
    // a specified set.
    SetExpectation set_expectation = 3;

    // ColumnMap rule which evaluates whether each column value matches a
    // specified regex.
    RegexExpectation regex_expectation = 4;

    // ColumnAggregate rule which evaluates whether the column has duplicates.
    UniquenessExpectation uniqueness_expectation = 100;

    // ColumnAggregate rule which evaluates whether the column aggregate
    // statistic lies between a specified range.
    StatisticRangeExpectation statistic_range_expectation = 101;

    // Table rule which evaluates whether each row passes the specified
    // condition.
    RowConditionExpectation row_condition_expectation = 200;

    // Table rule which evaluates whether the provided expression is true.
    TableConditionExpectation table_condition_expectation = 201;
  }

  // Optional. The unnested column which this rule is evaluated against.
  string column = 500 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Rows with `null` values will automatically fail a rule, unless
  // `ignore_null` is `true`. In that case, such `null` rows are trivially
  // considered passing.
  //
  // Only applicable to ColumnMap rules.
  bool ignore_null = 501 [(google.api.field_behavior) = OPTIONAL];

  // Required. The dimension a rule belongs to. Results are also aggregated at
  // the dimension level. Supported dimensions are **["COMPLETENESS",
  // "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS", "INTEGRITY"]**
  string dimension = 502 [(google.api.field_behavior) = REQUIRED];

  // Optional. The minimum ratio of **passing_rows / total_rows** required to
  // pass this rule, with a range of [0.0, 1.0].
  //
  // 0 indicates default value (i.e. 1.0).
  double threshold = 503 [(google.api.field_behavior) = OPTIONAL];
}