// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1beta1.schema;

import "google/cloud/aiplatform/v1beta1/schema/annotation_spec_color.proto";
import "google/cloud/aiplatform/v1beta1/schema/geometry.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/wrappers.proto";
import "google/api/annotations.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1beta1/schema;schema";
option java_multiple_files = true;
option java_outer_classname = "IoFormatProto";
option java_package = "com.google.cloud.aiplatform.v1beta1.schema";

// Prediction input format for Image Classification.
message ImageClassificationPredictionInstance {
  // The image bytes or GCS URI to make the prediction on.
  string content = 1;

  // The MIME type of the content of the image. Only images with one of the
  // MIME types listed below are supported.
  // - image/jpeg
  // - image/gif
  // - image/png
  // - image/webp
  // - image/bmp
  // - image/tiff
  // - image/vnd.microsoft.icon
  string mime_type = 2;
}

// Prediction input format for Image Object Detection.
message ImageObjectDetectionPredictionInstance {
  // The image bytes or GCS URI to make the prediction on.
  string content = 1;

  // The MIME type of the content of the image. Only images with one of the
  // MIME types listed below are supported.
  // - image/jpeg
  // - image/gif
  // - image/png
  // - image/webp
  // - image/bmp
  // - image/tiff
  // - image/vnd.microsoft.icon
  string mime_type = 2;
}

// Prediction input format for Image Segmentation.
message ImageSegmentationPredictionInstance {
  // The image bytes to make the predictions on.
  string content = 1;

  // The MIME type of the content of the image. Only images with one of the
  // MIME types listed below are supported.
  // - image/jpeg
  // - image/png
  string mime_type = 2;
}

// Prediction input format for Video Classification.
message VideoClassificationPredictionInstance {
  // The Google Cloud Storage location of the video on which to perform the
  // prediction.
  string content = 1;

  // The MIME type of the content of the video. Only the following are
  // supported: video/mp4 video/avi video/quicktime
  string mime_type = 2;

  // The beginning, inclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision.
  string time_segment_start = 3;

  // The end, exclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision, and "Infinity" is allowed, which means the
  // end of the video.
  string time_segment_end = 4;
}

// Prediction input format for Video Object Tracking.
message VideoObjectTrackingPredictionInstance {
  // The Google Cloud Storage location of the video on which to perform the
  // prediction.
  string content = 1;

  // The MIME type of the content of the video. Only the following are
  // supported: video/mp4 video/avi video/quicktime
  string mime_type = 2;

  // The beginning, inclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision.
  string time_segment_start = 3;

  // The end, exclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision, and "Infinity" is allowed, which means the
  // end of the video.
  string time_segment_end = 4;
}

// Prediction input format for Video Action Recognition.
message VideoActionRecognitionPredictionInstance {
  // The Google Cloud Storage location of the video on which to perform the
  // prediction.
  string content = 1;

  // The MIME type of the content of the video. Only the following are
  // supported: video/mp4 video/avi video/quicktime
  string mime_type = 2;

  // The beginning, inclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision.
  string time_segment_start = 3;

  // The end, exclusive, of the video's time segment on which to perform
  // the prediction. Expressed as a number of seconds as measured from the
  // start of the video, with "s" appended at the end. Fractions are allowed,
  // up to a microsecond precision, and "Infinity" is allowed, which means the
  // end of the video.
  string time_segment_end = 4;
}

// Prediction input format for Text Classification.
message TextClassificationPredictionInstance {
  // The text snippet to make the predictions on.
  string content = 1;

  // The MIME type of the text snippet. The supported MIME types are listed
  // below.
  // - text/plain
  string mime_type = 2;
}

// Prediction input format for Text Sentiment.
message TextSentimentPredictionInstance {
  // The text snippet to make the predictions on.
  string content = 1;

  // The MIME type of the text snippet. The supported MIME types are listed
  // below.
  // - text/plain
  string mime_type = 2;
}

// Prediction input format for Text Extraction.
message TextExtractionPredictionInstance {
  // The text snippet to make the predictions on.
  string content = 1;

  // The MIME type of the text snippet. The supported MIME types are listed
  // below.
  // - text/plain
  string mime_type = 2;

  // This field is only used for batch prediction. If a key is provided, the
  // batch prediction result will be mapped to this key. If omitted, then the
  // batch prediction result will contain the entire input instance. AI Platform
  // will not check if keys in the request are duplicates, so it is up to the
  // caller to ensure the keys are unique.
  string key = 3;
}

// Prediction model parameters for Image Classification.
message ImageClassificationPredictionParams {
  // The Model only returns predictions with at least this confidence score.
  // Default value is 0.0
  float confidence_threshold = 1;

  // The Model only returns up to that many top, by confidence score,
  // predictions per instance. If this number is very high, the Model may return
  // fewer predictions. Default value is 10.
  int32 max_predictions = 2;
}

// Prediction model parameters for Image Object Detection.
message ImageObjectDetectionPredictionParams {
  // The Model only returns predictions with at least this confidence score.
  // Default value is 0.0
  float confidence_threshold = 1;

  // The Model only returns up to that many top, by confidence score,
  // predictions per instance. Note that number of returned predictions is also
  // limited by metadata's predictionsLimit. Default value is 10.
  int32 max_predictions = 2;
}

// Prediction model parameters for Image Segmentation.
message ImageSegmentationPredictionParams {
  // When the model predicts category of pixels of the image, it will only
  // provide predictions for pixels that it is at least this much confident
  // about. All other pixels will be classified as background. Default value is
  // 0.5.
  float confidence_threshold = 1;
}

// Prediction model parameters for Video Classification.
message VideoClassificationPredictionParams {
  // The Model only returns predictions with at least this confidence score.
  // Default value is 0.0
  float confidence_threshold = 1;

  // The Model only returns up to that many top, by confidence score,
  // predictions per instance. If this number is very high, the Model may return
  // fewer predictions. Default value is 10,000.
  int32 max_predictions = 2;

  // Set to true to request segment-level classification. AI Platform returns
  // labels and their confidence scores for the entire time segment of the
  // video that user specified in the input instance.
  // Default value is true
  bool segment_classification = 3;

  // Set to true to request shot-level classification. AI Platform determines
  // the boundaries for each camera shot in the entire time segment of the
  // video that user specified in the input instance. AI Platform then
  // returns labels and their confidence scores for each detected shot, along
  // with the start and end time of the shot.
  // WARNING: Model evaluation is not done for this classification type,
  // the quality of it depends on the training data, but there are no metrics
  // provided to describe that quality.
  // Default value is false
  bool shot_classification = 4;

  // Set to true to request classification for a video at one-second intervals.
  // AI Platform returns labels and their confidence scores for each second of
  // the entire time segment of the video that user specified in the input
  // instance.
  // WARNING: Model evaluation is not done for this classification type, the
  // quality of it depends on the training data, but there are no metrics
  // provided to describe that quality.
  // Default value is false
  bool one_sec_interval_classification = 5;
}

// Prediction model parameters for Video Object Tracking.
message VideoObjectTrackingPredictionParams {
  // Minimum confidence score a prediction must have for the Model to return
  // it. Default value is 0.0
  float confidence_threshold = 1;

  // Upper bound on the number of predictions, ranked by confidence score,
  // that the model returns per frame of the video. If this number is very
  // high, the Model may return fewer predictions per frame. Default value is
  // 50.
  int32 max_predictions = 2;

  // Bounding boxes are returned only when their shortest edge is at least
  // this long, expressed as a relative value of the video frame size.
  // Default value is 0.0.
  float min_bounding_box_size = 3;
}

// Prediction model parameters for Video Action Recognition.
message VideoActionRecognitionPredictionParams {
  // Minimum confidence score a prediction must have for the Model to return
  // it. Default value is 0.0
  float confidence_threshold = 1;

  // Upper bound on the number of predictions, ranked by confidence score,
  // that the model returns per frame of the video. If this number is very
  // high, the Model may return fewer predictions per frame. Default value is
  // 50.
  int32 max_predictions = 2;
}

// Represents a line of JSONL in the batch prediction output file.
message PredictionResult {
  // Some identifier from the input so that the prediction can be mapped back to
  // the input instance.
  oneof input {
    // User's input instance.
    // Typed as Struct rather than Any so that JsonFormat does not append an
    // extra "@type" field when the proto is converted to JSON.
    google.protobuf.Struct instance = 1;

    // Optional user-provided key from the input instance.
    string key = 2;
  }

  // The prediction result.
  // Typed as Value rather than Any so that JsonFormat does not append an
  // extra "@type" field when the proto is converted to JSON, and so that an
  // array of objects can be represented.
  google.protobuf.Value prediction = 3;
}

// Represents a line of JSONL in the text sentiment batch prediction output
// file. This is a hack to allow printing of integer values.
message TextSentimentPredictionResult {
  // Prediction output format for Text Sentiment.
  message Prediction {
    // The integer sentiment labels between 0 (inclusive) and sentimentMax label
    // (inclusive), while 0 maps to the least positive sentiment and
    // sentimentMax maps to the most positive one. The higher the score is, the
    // more positive the sentiment in the text snippet is. Note: sentimentMax is
    // an integer value between 1 (inclusive) and 10 (inclusive).
    int32 sentiment = 1;
  }

  // User's input instance.
  TextSentimentPredictionInstance instance = 1;

  // The prediction result.
  Prediction prediction = 2;
}

// Prediction output format for Image Classification.
message ClassificationPredictionResult {
  // The resource IDs of the AnnotationSpecs that had been identified, ordered
  // by the confidence score descendingly.
  repeated int64 ids = 1;

  // The display names of the AnnotationSpecs that had been identified, order
  // matches the IDs.
  repeated string display_names = 2;

  // The Model's confidences in correctness of the predicted IDs, higher value
  // means higher confidence. Order matches the IDs.
  repeated float confidences = 3;
}

// Prediction output format for Image Object Detection.
message ImageObjectDetectionPredictionResult {
  // The resource IDs of the AnnotationSpecs that had been identified, ordered
  // by the confidence score descendingly.
  repeated int64 ids = 1;

  // The display names of the AnnotationSpecs that had been identified, order
  // matches the IDs.
  repeated string display_names = 2;

  // The Model's confidences in correctness of the predicted IDs, higher value
  // means higher confidence. Order matches the IDs.
  repeated float confidences = 3;

  // Bounding boxes, i.e. the rectangles over the image, that pinpoint
  // the found AnnotationSpecs. Given in order that matches the IDs. Each
  // bounding box is an array of 4 numbers `xMin`, `xMax`, `yMin`, and
  // `yMax`, which represent the extremal coordinates of the box. They are
  // relative to the image size, and the point 0,0 is in the top left
  // of the image.
  repeated google.protobuf.ListValue bboxes = 4;
}

// Prediction output format for Video Classification.
message VideoClassificationPredictionResult {
  // The resource ID of the AnnotationSpec that had been identified.
  string id = 1;

  // The display name of the AnnotationSpec that had been identified.
  string display_name = 2;

  // The type of the prediction. The requested types can be configured
  // via parameters. This will be one of
  // - segment-classification
  // - shot-classification
  // - one-sec-interval-classification
  string type = 3;

  // The beginning, inclusive, of the video's time segment in which the
  // AnnotationSpec has been identified. Expressed as a number of seconds as
  // measured from the start of the video, with fractions up to a microsecond
  // precision, and with "s" appended at the end. Note that for
  // 'segment-classification' prediction type, this equals the original
  // 'timeSegmentStart' from the input instance, for other types it is the
  // start of a shot or a 1 second interval respectively.
  google.protobuf.Duration time_segment_start = 4;

  // The end, exclusive, of the video's time segment in which the
  // AnnotationSpec has been identified. Expressed as a number of seconds as
  // measured from the start of the video, with fractions up to a microsecond
  // precision, and with "s" appended at the end. Note that for
  // 'segment-classification' prediction type, this equals the original
  // 'timeSegmentEnd' from the input instance, for other types it is the end
  // of a shot or a 1 second interval respectively.
  google.protobuf.Duration time_segment_end = 5;

  // The Model's confidence in correctness of this prediction, higher
  // value means higher confidence.
  google.protobuf.FloatValue confidence = 6;
}

// Prediction output format for Video Object Tracking.
message VideoObjectTrackingPredictionResult {
  // The fields `xMin`, `xMax`, `yMin`, and `yMax` refer to a bounding box,
  // i.e. the rectangle over the video frame pinpointing the found
  // AnnotationSpec. The coordinates are relative to the frame size, and the
  // point 0,0 is in the top left of the frame.
  message Frame {
    // A time (frame) of a video in which the object has been detected.
    // Expressed as a number of seconds as measured from the
    // start of the video, with fractions up to a microsecond precision, and
    // with "s" appended at the end.
    google.protobuf.Duration time_offset = 1;

    // The leftmost coordinate of the bounding box.
    google.protobuf.FloatValue x_min = 2;

    // The rightmost coordinate of the bounding box.
    google.protobuf.FloatValue x_max = 3;

    // The topmost coordinate of the bounding box.
    google.protobuf.FloatValue y_min = 4;

    // The bottommost coordinate of the bounding box.
    google.protobuf.FloatValue y_max = 5;
  }

  // The resource ID of the AnnotationSpec that had been identified.
  string id = 1;

  // The display name of the AnnotationSpec that had been identified.
  string display_name = 2;

  // The beginning, inclusive, of the video's time segment in which the
  // object instance has been detected. Expressed as a number of seconds as
  // measured from the start of the video, with fractions up to a microsecond
  // precision, and with "s" appended at the end.
  google.protobuf.Duration time_segment_start = 3;

  // The end, inclusive, of the video's time segment in which the
  // object instance has been detected. Expressed as a number of seconds as
  // measured from the start of the video, with fractions up to a microsecond
  // precision, and with "s" appended at the end.
  google.protobuf.Duration time_segment_end = 4;

  // The Model's confidence in correctness of this prediction, higher
  // value means higher confidence.
  google.protobuf.FloatValue confidence = 5;

  // All of the frames of the video in which a single object instance has been
  // detected. The bounding boxes in the frames identify the same object.
  repeated Frame frames = 6;
}

// Prediction output format for Text Extraction.
message TextExtractionPredictionResult {
  // The resource IDs of the AnnotationSpecs that had been identified,
  // ordered by the confidence score descendingly.
  repeated int64 ids = 1;

  // The display names of the AnnotationSpecs that had been identified,
  // order matches the IDs.
  repeated string display_names = 2;

  // The start offsets, inclusive, of the text segment in which the
  // AnnotationSpec has been identified. Expressed as a zero-based number
  // of characters as measured from the start of the text snippet.
  repeated int64 text_segment_start_offsets = 3;

  // The end offsets, inclusive, of the text segment in which the
  // AnnotationSpec has been identified. Expressed as a zero-based number
  // of characters as measured from the start of the text snippet.
  repeated int64 text_segment_end_offsets = 4;

  // The Model's confidences in correctness of the predicted IDs, higher
  // value means higher confidence. Order matches the IDs.
  repeated float confidences = 5;
}