// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1";
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "EvaluationServiceProto";
option java_package = "com.google.cloud.aiplatform.v1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
option ruby_package = "Google::Cloud::AIPlatform::V1";

// Vertex AI Online Evaluation Service.
service EvaluationService {
  option (google.api.default_host) = "aiplatform.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Evaluates instances based on a given metric.
  //
  // Exposed over HTTP as a POST on the location resource with the custom verb
  // `:evaluateInstances`; the whole request message is sent as the body.
  rpc EvaluateInstances(EvaluateInstancesRequest)
      returns (EvaluateInstancesResponse) {
    option (google.api.http) = {
      post: "/v1/{location=projects/*/locations/*}:evaluateInstances"
      body: "*"
    };
  }
}

// Pairwise prediction autorater preference.
//
// NOTE: the non-zero values are not prefixed with the enum name. They are
// kept as-is because renaming enum values would break generated code and the
// JSON representation for existing clients.
enum PairwiseChoice {
  // Unspecified prediction choice.
  PAIRWISE_CHOICE_UNSPECIFIED = 0;

  // Baseline prediction wins.
  BASELINE = 1;

  // Candidate prediction wins.
  CANDIDATE = 2;

  // Winner cannot be determined.
  TIE = 3;
}

// Request message for EvaluationService.EvaluateInstances.
message EvaluateInstancesRequest {
  // Instances and specs for evaluation. Exactly one metric input can be set
  // per request because the fields below share a oneof.
  oneof metric_inputs {
    // Auto metric instances.

    // Instances and metric spec for exact match metric.
    ExactMatchInput exact_match_input = 2;

    // Instances and metric spec for bleu metric.
    BleuInput bleu_input = 3;

    // Instances and metric spec for rouge metric.
    RougeInput rouge_input = 4;

    // LLM-based metric instance.
    // General text generation metrics, applicable to other categories.

    // Input for fluency metric.
    FluencyInput fluency_input = 5;

    // Input for coherence metric.
    CoherenceInput coherence_input = 6;

    // Input for safety metric.
    SafetyInput safety_input = 8;

    // Input for groundedness metric.
    GroundednessInput groundedness_input = 9;

    // Input for fulfillment metric.
    FulfillmentInput fulfillment_input = 12;

    // Summarization only metrics.

    // Input for summarization quality metric.
    SummarizationQualityInput summarization_quality_input = 7;

    // Input for pairwise summarization quality metric.
    PairwiseSummarizationQualityInput pairwise_summarization_quality_input =
        23;

    // Input for summarization helpfulness metric.
    SummarizationHelpfulnessInput summarization_helpfulness_input = 14;

    // Input for summarization verbosity metric.
    SummarizationVerbosityInput summarization_verbosity_input = 15;

    // Question answering only metrics.

    // Input for question answering quality metric.
    QuestionAnsweringQualityInput question_answering_quality_input = 10;

    // Input for pairwise question answering quality metric.
    PairwiseQuestionAnsweringQualityInput
        pairwise_question_answering_quality_input = 24;

    // Input for question answering relevance metric.
    QuestionAnsweringRelevanceInput question_answering_relevance_input = 16;

    // Input for question answering helpfulness metric.
    QuestionAnsweringHelpfulnessInput question_answering_helpfulness_input =
        17;

    // Input for question answering correctness metric.
    QuestionAnsweringCorrectnessInput question_answering_correctness_input =
        18;

    // Generic metrics.

    // Input for pointwise metric.
    PointwiseMetricInput pointwise_metric_input = 28;

    // Input for pairwise metric.
    PairwiseMetricInput pairwise_metric_input = 29;

    // Tool call metric instances.

    // Input for tool call valid metric.
    ToolCallValidInput tool_call_valid_input = 19;

    // Input for tool name match metric.
    ToolNameMatchInput tool_name_match_input = 20;

    // Input for tool parameter key match metric.
    ToolParameterKeyMatchInput tool_parameter_key_match_input = 21;

    // Input for tool parameter key value match metric.
    ToolParameterKVMatchInput tool_parameter_kv_match_input = 22;
  }

  // Required. The resource name of the Location to evaluate the instances.
  // Format: `projects/{project}/locations/{location}`
  string location = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    }
  ];
}

// Response message for EvaluationService.EvaluateInstances.
message EvaluateInstancesResponse {
  // Evaluation results will be served in the same order as the instances
  // presented in the metric input of EvaluateInstancesRequest.
  oneof evaluation_results {
    // Auto metric evaluation results.

    // Results for exact match metric.
    ExactMatchResults exact_match_results = 1;

    // Results for bleu metric.
    BleuResults bleu_results = 2;

    // Results for rouge metric.
    RougeResults rouge_results = 3;

    // LLM-based metric evaluation result.
    // General text generation metrics, applicable to other categories.

    // Result for fluency metric.
    FluencyResult fluency_result = 4;

    // Result for coherence metric.
    CoherenceResult coherence_result = 5;

    // Result for safety metric.
    SafetyResult safety_result = 7;

    // Result for groundedness metric.
    GroundednessResult groundedness_result = 8;

    // Result for fulfillment metric.
    FulfillmentResult fulfillment_result = 11;

    // Summarization only metrics.

    // Result for summarization quality metric.
    SummarizationQualityResult summarization_quality_result = 6;

    // Result for pairwise summarization quality metric.
    PairwiseSummarizationQualityResult pairwise_summarization_quality_result =
        22;

    // Result for summarization helpfulness metric.
    SummarizationHelpfulnessResult summarization_helpfulness_result = 13;

    // Result for summarization verbosity metric.
    SummarizationVerbosityResult summarization_verbosity_result = 14;

    // Question answering only metrics.

    // Result for question answering quality metric.
    QuestionAnsweringQualityResult question_answering_quality_result = 9;

    // Result for pairwise question answering quality metric.
    PairwiseQuestionAnsweringQualityResult
        pairwise_question_answering_quality_result = 23;

    // Result for question answering relevance metric.
    QuestionAnsweringRelevanceResult question_answering_relevance_result = 15;

    // Result for question answering helpfulness metric.
    QuestionAnsweringHelpfulnessResult question_answering_helpfulness_result =
        16;

    // Result for question answering correctness metric.
    QuestionAnsweringCorrectnessResult question_answering_correctness_result =
        17;

    // Generic metrics.

    // Result for pointwise metric.
    PointwiseMetricResult pointwise_metric_result = 27;

    // Result for pairwise metric.
    PairwiseMetricResult pairwise_metric_result = 28;

    // Tool call metrics.

    // Results for tool call valid metric.
    ToolCallValidResults tool_call_valid_results = 18;

    // Results for tool name match metric.
    ToolNameMatchResults tool_name_match_results = 19;

    // Results for tool parameter key match metric.
    ToolParameterKeyMatchResults tool_parameter_key_match_results = 20;

    // Results for tool parameter key value match metric.
    ToolParameterKVMatchResults tool_parameter_kv_match_results = 21;
  }
}

// Input for exact match metric.
message ExactMatchInput {
  // Required. Spec for exact match metric.
  ExactMatchSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated exact match instances.
  repeated ExactMatchInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for exact match metric.
message ExactMatchInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for exact match metric - returns 1 if prediction and reference exactly
// match, otherwise 0.
message ExactMatchSpec {}

// Results for exact match metric.
message ExactMatchResults {
  // Output only. Exact match metric values.
  repeated ExactMatchMetricValue exact_match_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Exact match metric value for an instance.
message ExactMatchMetricValue {
  // Output only. Exact match score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for bleu metric.
message BleuInput {
  // Required. Spec for bleu score metric.
  BleuSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated bleu instances.
  repeated BleuInstance instances = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for bleu metric.
message BleuInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for bleu score metric - calculates the precision of n-grams in the
// prediction as compared to reference - returns a score ranging between 0 and
// 1.
message BleuSpec {
  // Optional. Whether to use_effective_order to compute bleu score.
  bool use_effective_order = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Results for bleu metric.
message BleuResults {
  // Output only. Bleu metric values.
  repeated BleuMetricValue bleu_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Bleu metric value for an instance.
message BleuMetricValue {
  // Output only. Bleu score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for rouge metric.
message RougeInput {
  // Required. Spec for rouge score metric.
  RougeSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated rouge instances.
  repeated RougeInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for rouge metric.
message RougeInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for rouge score metric - calculates the recall of n-grams in prediction
// as compared to reference - returns a score ranging between 0 and 1.
message RougeSpec {
  // Optional. Supported rouge types are rougen[1-9], rougeL, and rougeLsum.
  string rouge_type = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Whether to use stemmer to compute rouge score.
  bool use_stemmer = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Whether to split summaries while using rougeLsum.
  bool split_summaries = 3 [(google.api.field_behavior) = OPTIONAL];
}

// Results for rouge metric.
message RougeResults {
  // Output only. Rouge metric values.
  repeated RougeMetricValue rouge_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Rouge metric value for an instance.
message RougeMetricValue {
  // Output only. Rouge score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for coherence metric.
message CoherenceInput {
  // Required. Spec for coherence score metric.
  CoherenceSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Coherence instance.
  CoherenceInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for coherence metric.
message CoherenceInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for coherence score metric.
message CoherenceSpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for coherence metric.
message CoherenceResult {
  // Output only. Coherence score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for coherence score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for coherence score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for fluency metric.
message FluencyInput {
  // Required. Spec for fluency score metric.
  FluencySpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Fluency instance.
  FluencyInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for fluency metric.
message FluencyInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for fluency score metric.
message FluencySpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for fluency metric.
message FluencyResult {
  // Output only. Fluency score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for fluency score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for fluency score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for safety metric.
message SafetyInput {
  // Required. Spec for safety metric.
  SafetySpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Safety instance.
  SafetyInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for safety metric.
message SafetyInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];
}

// Spec for safety metric.
message SafetySpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for safety metric.
message SafetyResult {
  // Output only. Safety score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for safety score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for safety score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for groundedness metric.
message GroundednessInput {
  // Required. Spec for groundedness metric.
  GroundednessSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Groundedness instance.
  GroundednessInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for groundedness metric.
message GroundednessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Background information provided in context used to compare
  // against the prediction.
  optional string context = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for groundedness metric.
message GroundednessSpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for groundedness metric.
message GroundednessResult {
  // Output only. Groundedness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for groundedness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for groundedness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for fulfillment metric.
message FulfillmentInput {
  // Required. Spec for fulfillment score metric.
  FulfillmentSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Fulfillment instance.
  FulfillmentInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Instance for fulfillment metric.
message FulfillmentInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Inference instruction prompt to compare prediction with.
  optional string instruction = 2 [(google.api.field_behavior) = REQUIRED];
}

// Spec for fulfillment metric.
message FulfillmentSpec {
  // Optional. Which version to use for evaluation.
  int32 version = 1 [(google.api.field_behavior) = OPTIONAL];
}

// Result for fulfillment metric.
message FulfillmentResult {
  // Output only. Fulfillment score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for fulfillment score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for fulfillment score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization quality metric.
message SummarizationQualityInput {
  // Required. Spec for summarization quality score metric.
  SummarizationQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization quality instance.
  SummarizationQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for summarization quality metric.
message SummarizationQualityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for summarization quality score metric.
message SummarizationQualitySpec {
  // Optional. Whether to use instance.reference to compute summarization
  // quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization quality metric.
message SummarizationQualityResult {
  // Output only. Summarization Quality score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization quality score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization quality score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for pairwise summarization quality metric.
message PairwiseSummarizationQualityInput {
  // Required. Spec for pairwise summarization quality score metric.
  PairwiseSummarizationQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Pairwise summarization quality instance.
  PairwiseSummarizationQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for pairwise summarization quality metric.
message PairwiseSummarizationQualityInstance {
  // Required. Output of the candidate model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Output of the baseline model.
  optional string baseline_prediction = 2
      [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization prompt for LLM.
  optional string instruction = 5 [(google.api.field_behavior) = REQUIRED];
}

// Spec for pairwise summarization quality score metric.
message PairwiseSummarizationQualitySpec {
  // Optional. Whether to use instance.reference to compute pairwise
  // summarization quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for pairwise summarization quality metric.
message PairwiseSummarizationQualityResult {
  // Output only. Pairwise summarization prediction choice.
  PairwiseChoice pairwise_choice = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for the pairwise summarization quality choice.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for the pairwise summarization quality choice.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization helpfulness metric.
message SummarizationHelpfulnessInput {
  // Required. Spec for summarization helpfulness score metric.
  SummarizationHelpfulnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization helpfulness instance.
  SummarizationHelpfulnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for summarization helpfulness metric.
message SummarizationHelpfulnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Spec for summarization helpfulness score metric.
message SummarizationHelpfulnessSpec {
  // Optional. Whether to use instance.reference to compute summarization
  // helpfulness.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization helpfulness metric.
message SummarizationHelpfulnessResult {
  // Output only. Summarization Helpfulness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization helpfulness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization helpfulness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for summarization verbosity metric.
message SummarizationVerbosityInput {
  // Required. Spec for summarization verbosity score metric.
  SummarizationVerbositySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Summarization verbosity instance.
  SummarizationVerbosityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for summarization verbosity metric.
message SummarizationVerbosityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to be summarized.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Optional. Summarization prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = OPTIONAL];
}

// Spec for summarization verbosity score metric.
message SummarizationVerbositySpec {
  // Optional. Whether to use instance.reference to compute summarization
  // verbosity.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for summarization verbosity metric.
message SummarizationVerbosityResult {
  // Output only. Summarization Verbosity score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for summarization verbosity score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for summarization verbosity score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering quality metric.
message QuestionAnsweringQualityInput {
  // Required. Spec for question answering quality score metric.
  QuestionAnsweringQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering quality instance.
  QuestionAnsweringQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for question answering quality metric.
message QuestionAnsweringQualityInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to answer the question.
  optional string context = 3 [(google.api.field_behavior) = REQUIRED];

  // Required. Question Answering prompt for LLM.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering quality score metric.
message QuestionAnsweringQualitySpec {
  // Optional. Whether to use instance.reference to compute question answering
  // quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering quality metric.
message QuestionAnsweringQualityResult {
  // Output only. Question Answering Quality score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering quality score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering quality score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for pairwise question answering quality metric.
message PairwiseQuestionAnsweringQualityInput {
  // Required. Spec for pairwise question answering quality score metric.
  PairwiseQuestionAnsweringQualitySpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Pairwise question answering quality instance.
  PairwiseQuestionAnsweringQualityInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for pairwise question answering quality metric.
message PairwiseQuestionAnsweringQualityInstance {
  // Required. Output of the candidate model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Output of the baseline model.
  optional string baseline_prediction = 2
      [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. Text to answer the question.
  optional string context = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. Question Answering prompt for LLM.
  optional string instruction = 5 [(google.api.field_behavior) = REQUIRED];
}

// Spec for pairwise question answering quality score metric.
message PairwiseQuestionAnsweringQualitySpec {
  // Optional. Whether to use instance.reference to compute pairwise question
  // answering quality.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for pairwise question answering quality metric.
message PairwiseQuestionAnsweringQualityResult {
  // Output only. Pairwise question answering prediction choice.
  PairwiseChoice pairwise_choice = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for the pairwise question answering quality
  // choice.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for the pairwise question answering quality
  // choice.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering relevance metric.
message QuestionAnsweringRelevanceInput {
  // Required. Spec for question answering relevance score metric.
  QuestionAnsweringRelevanceSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering relevance instance.
  QuestionAnsweringRelevanceInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for question answering relevance metric.
message QuestionAnsweringRelevanceInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference
  // prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering relevance metric.
message QuestionAnsweringRelevanceSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // relevance.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering relevance metric.
message QuestionAnsweringRelevanceResult {
  // Output only. Question Answering Relevance score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering relevance score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering relevance score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering helpfulness metric.
message QuestionAnsweringHelpfulnessInput {
  // Required. Spec for question answering helpfulness score metric.
  QuestionAnsweringHelpfulnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering helpfulness instance.
  QuestionAnsweringHelpfulnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for question answering helpfulness metric.
message QuestionAnsweringHelpfulnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference
  // prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering helpfulness metric.
message QuestionAnsweringHelpfulnessSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // helpfulness.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering helpfulness metric.
message QuestionAnsweringHelpfulnessResult {
  // Output only. Question Answering Helpfulness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering helpfulness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering helpfulness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for question answering correctness metric.
message QuestionAnsweringCorrectnessInput {
  // Required. Spec for question answering correctness score metric.
  QuestionAnsweringCorrectnessSpec metric_spec = 1
      [(google.api.field_behavior) = REQUIRED];

  // Required. Question answering correctness instance.
  QuestionAnsweringCorrectnessInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Instance for question answering correctness metric.
message QuestionAnsweringCorrectnessInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Optional. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Text provided as context to answer the question.
  optional string context = 3 [(google.api.field_behavior) = OPTIONAL];

  // Required. The question asked and other instruction in the inference
  // prompt.
  optional string instruction = 4 [(google.api.field_behavior) = REQUIRED];
}

// Spec for question answering correctness metric.
message QuestionAnsweringCorrectnessSpec {
  // Optional. Whether to use instance.reference to compute question answering
  // correctness.
  bool use_reference = 1 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Which version to use for evaluation.
  int32 version = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Result for question answering correctness metric.
message QuestionAnsweringCorrectnessResult {
  // Output only. Question Answering Correctness score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for question answering correctness score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Confidence for question answering correctness score.
  optional float confidence = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for pointwise metric.
message PointwiseMetricInput {
  // Required. Spec for pointwise metric.
  PointwiseMetricSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Pointwise metric instance.
  PointwiseMetricInstance instance = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Pointwise metric instance. Usually one instance corresponds to one row in an
// evaluation dataset.
message PointwiseMetricInstance {
  // Instance for pointwise metric.
  oneof instance {
    // Instance specified as a json string. String key-value pairs are expected
    // in the json_instance to render
    // PointwiseMetricSpec.metric_prompt_template.
    string json_instance = 1;
  }
}

// Spec for pointwise metric.
message PointwiseMetricSpec {
  // Required. Metric prompt template for pointwise metric.
  optional string metric_prompt_template = 1
      [(google.api.field_behavior) = REQUIRED];
}

// Result for pointwise metric.
message PointwiseMetricResult {
  // Output only. Pointwise metric score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for pointwise metric score.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for pairwise metric.
message PairwiseMetricInput {
  // Required. Spec for pairwise metric.
  PairwiseMetricSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Pairwise metric instance.
  PairwiseMetricInstance instance = 2 [(google.api.field_behavior) = REQUIRED];
}

// Pairwise metric instance. Usually one instance corresponds to one row in an
// evaluation dataset.
message PairwiseMetricInstance {
  // Instance for pairwise metric.
  oneof instance {
    // Instance specified as a json string. String key-value pairs are expected
    // in the json_instance to render
    // PairwiseMetricSpec.metric_prompt_template.
    string json_instance = 1;
  }
}

// Spec for pairwise metric.
message PairwiseMetricSpec {
  // Required. Metric prompt template for pairwise metric.
  optional string metric_prompt_template = 1
      [(google.api.field_behavior) = REQUIRED];
}

// Result for pairwise metric.
message PairwiseMetricResult {
  // Output only. Pairwise metric choice.
  PairwiseChoice pairwise_choice = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Explanation for pairwise metric choice.
  string explanation = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool call valid metric.
message ToolCallValidInput {
  // Required. Spec for tool call valid metric.
  ToolCallValidSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Repeated tool call valid instances.
  repeated ToolCallValidInstance instances = 2
      [(google.api.field_behavior) = REQUIRED];
}

// Spec for tool call valid metric.
message ToolCallValidSpec {}

// Instance for tool call valid metric.
message ToolCallValidInstance {
  // Required. Output of the evaluated model.
  optional string prediction = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Ground truth used to compare against the prediction.
  optional string reference = 2 [(google.api.field_behavior) = REQUIRED];
}

// Results for tool call valid metric.
message ToolCallValidResults {
  // Output only. Tool call valid metric values.
  repeated ToolCallValidMetricValue tool_call_valid_metric_values = 1
      [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Tool call valid metric value for an instance.
message ToolCallValidMetricValue {
  // Output only. Tool call valid score.
  optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Input for tool name match metric.
message ToolNameMatchInput {
  // Required. Spec for tool name match metric.
ToolNameMatchSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED]; // Required. Repeated tool name match instances. repeated ToolNameMatchInstance instances = 2 [(google.api.field_behavior) = REQUIRED]; } // Spec for tool name match metric. message ToolNameMatchSpec {} // Spec for tool name match instance. message ToolNameMatchInstance { // Required. Output of the evaluated model. optional string prediction = 1 [(google.api.field_behavior) = REQUIRED]; // Required. Ground truth used to compare against the prediction. optional string reference = 2 [(google.api.field_behavior) = REQUIRED]; } // Results for tool name match metric. message ToolNameMatchResults { // Output only. Tool name match metric values. repeated ToolNameMatchMetricValue tool_name_match_metric_values = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; } // Tool name match metric value for an instance. message ToolNameMatchMetricValue { // Output only. Tool name match score. optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; } // Input for tool parameter key match metric. message ToolParameterKeyMatchInput { // Required. Spec for tool parameter key match metric. ToolParameterKeyMatchSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED]; // Required. Repeated tool parameter key match instances. repeated ToolParameterKeyMatchInstance instances = 2 [(google.api.field_behavior) = REQUIRED]; } // Spec for tool parameter key match metric. message ToolParameterKeyMatchSpec {} // Spec for tool parameter key match instance. message ToolParameterKeyMatchInstance { // Required. Output of the evaluated model. optional string prediction = 1 [(google.api.field_behavior) = REQUIRED]; // Required. Ground truth used to compare against the prediction. optional string reference = 2 [(google.api.field_behavior) = REQUIRED]; } // Results for tool parameter key match metric. message ToolParameterKeyMatchResults { // Output only. Tool parameter key match metric values. 
repeated ToolParameterKeyMatchMetricValue tool_parameter_key_match_metric_values = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; } // Tool parameter key match metric value for an instance. message ToolParameterKeyMatchMetricValue { // Output only. Tool parameter key match score. optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; } // Input for tool parameter key value match metric. message ToolParameterKVMatchInput { // Required. Spec for tool parameter key value match metric. ToolParameterKVMatchSpec metric_spec = 1 [(google.api.field_behavior) = REQUIRED]; // Required. Repeated tool parameter key value match instances. repeated ToolParameterKVMatchInstance instances = 2 [(google.api.field_behavior) = REQUIRED]; } // Spec for tool parameter key value match metric. message ToolParameterKVMatchSpec { // Optional. Whether to use STRCIT string match on parameter values. bool use_strict_string_match = 1 [(google.api.field_behavior) = OPTIONAL]; } // Spec for tool parameter key value match instance. message ToolParameterKVMatchInstance { // Required. Output of the evaluated model. optional string prediction = 1 [(google.api.field_behavior) = REQUIRED]; // Required. Ground truth used to compare against the prediction. optional string reference = 2 [(google.api.field_behavior) = REQUIRED]; } // Results for tool parameter key value match metric. message ToolParameterKVMatchResults { // Output only. Tool parameter key value match metric values. repeated ToolParameterKVMatchMetricValue tool_parameter_kv_match_metric_values = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; } // Tool parameter key value match metric value for an instance. message ToolParameterKVMatchMetricValue { // Output only. Tool parameter key value match score. optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; }