// Copyright 2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package google.cloud.vision.v1p3beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/cloud/vision/v1p3beta1/geometry.proto";
import "google/cloud/vision/v1p3beta1/product_search.proto";
import "google/cloud/vision/v1p3beta1/text_annotation.proto";
import "google/cloud/vision/v1p3beta1/web_detection.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/latlng.proto";

option cc_enable_arenas = true;
option go_package = "cloud.google.com/go/vision/apiv1p3beta1/visionpb;visionpb";
option java_multiple_files = true;
option java_outer_classname = "ImageAnnotatorProto";
option java_package = "com.google.cloud.vision.v1p3beta1";

// Service that performs Google Cloud Vision API detection tasks over client
// images, such as face, landmark, logo, label, and text detection. The
// ImageAnnotator service returns detected entities from the images.
service ImageAnnotator {
  option (google.api.default_host) = "vision.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform,"
      "https://www.googleapis.com/auth/cloud-vision";

  // Run image detection and annotation for a batch of images.
  rpc BatchAnnotateImages(BatchAnnotateImagesRequest)
      returns (BatchAnnotateImagesResponse) {
    option (google.api.http) = {
      post: "/v1p3beta1/images:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "requests";
  }

  // Run asynchronous image detection and annotation for a list of generic
  // files, such as PDF files, which may contain multiple pages and multiple
  // images per page. Progress and results can be retrieved through the
  // `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `OperationMetadata` (metadata).
  // `Operation.response` contains `AsyncBatchAnnotateFilesResponse` (results).
  rpc AsyncBatchAnnotateFiles(AsyncBatchAnnotateFilesRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p3beta1/files:asyncBatchAnnotate"
      body: "*"
    };
    option (google.api.method_signature) = "requests";
    option (google.longrunning.operation_info) = {
      response_type: "AsyncBatchAnnotateFilesResponse"
      metadata_type: "OperationMetadata"
    };
  }
}

// The type of Google Cloud Vision API detection to perform, and the maximum
// number of results to return for that type. Multiple `Feature` objects can
// be specified in the `features` list.
message Feature {
  // Type of Google Cloud Vision API feature to be extracted.
  //
  // Note: values are not declared in numeric order; the numbers are part of
  // the wire contract and must not be renumbered.
  enum Type {
    // Unspecified feature type.
    TYPE_UNSPECIFIED = 0;

    // Run face detection.
    FACE_DETECTION = 1;

    // Run landmark detection.
    LANDMARK_DETECTION = 2;

    // Run logo detection.
    LOGO_DETECTION = 3;

    // Run label detection.
    LABEL_DETECTION = 4;

    // Run text detection / optical character recognition (OCR). Text detection
    // is optimized for areas of text within a larger image; if the image is
    // a document, use `DOCUMENT_TEXT_DETECTION` instead.
    TEXT_DETECTION = 5;

    // Run dense text document OCR. Takes precedence when both
    // `DOCUMENT_TEXT_DETECTION` and `TEXT_DETECTION` are present.
    DOCUMENT_TEXT_DETECTION = 11;

    // Run Safe Search to detect potentially unsafe
    // or undesirable content.
    SAFE_SEARCH_DETECTION = 6;

    // Compute a set of image properties, such as the
    // image's dominant colors.
    IMAGE_PROPERTIES = 7;

    // Run crop hints.
    CROP_HINTS = 9;

    // Run web detection.
    WEB_DETECTION = 10;

    // Run Product Search.
    PRODUCT_SEARCH = 12;

    // Run localizer for object detection.
    OBJECT_LOCALIZATION = 19;
  }

  // The feature type.
  Type type = 1;

  // Maximum number of results of this type. Does not apply to
  // `TEXT_DETECTION`, `DOCUMENT_TEXT_DETECTION`, or `CROP_HINTS`.
  int32 max_results = 2;

  // Model to use for the feature.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest". `DOCUMENT_TEXT_DETECTION` and `TEXT_DETECTION` also
  // support "builtin/weekly" for the bleeding edge release updated weekly.
  string model = 3;
}

// External image source (Google Cloud Storage or web URL image location).
message ImageSource {
  // **Use `image_uri` instead.**
  //
  // The Google Cloud Storage URI of the form
  // `gs://bucket_name/object_name`. Object versioning is not supported. See
  // [Google Cloud Storage Request
  // URIs](https://cloud.google.com/storage/docs/reference-uris) for more info.
  string gcs_image_uri = 1;

  // The URI of the source image. Can be either:
  //
  // 1. A Google Cloud Storage URI of the form
  //    `gs://bucket_name/object_name`. Object versioning is not supported. See
  //    [Google Cloud Storage Request
  //    URIs](https://cloud.google.com/storage/docs/reference-uris) for more
  //    info.
  //
  // 2. A publicly-accessible image HTTP/HTTPS URL. When fetching images from
  //    HTTP/HTTPS URLs, Google cannot guarantee that the request will be
  //    completed. Your request may fail if the specified host denies the
  //    request (e.g. due to request throttling or DOS prevention), or if Google
  //    throttles requests to the site for abuse prevention. You should not
  //    depend on externally-hosted images for production applications.
  //
  // When both `gcs_image_uri` and `image_uri` are specified, `image_uri` takes
  // precedence.
  string image_uri = 2;
}

// Client image to perform Google Cloud Vision API tasks over.
message Image {
  // Image content, represented as a stream of bytes.
  // Note: As with all `bytes` fields, protobuffers use a pure binary
  // representation, whereas JSON representations use base64.
  bytes content = 1;

  // Google Cloud Storage image location, or publicly-accessible image
  // URL. If both `content` and `source` are provided for an image, `content`
  // takes precedence and is used to perform the image annotation request.
  ImageSource source = 2;
}

// A face annotation object contains the results of face detection.
message FaceAnnotation {
  // A face-specific landmark (for example, a face feature).
  message Landmark {
    // Face landmark (feature) type.
    // Left and right are defined from the vantage of the viewer of the image
    // without considering mirror projections typical of photos. So,
    // `LEFT_EYE`, typically, is the person's right eye.
    enum Type {
      // Unknown face landmark detected. Should not be filled.
      UNKNOWN_LANDMARK = 0;

      // Left eye.
      LEFT_EYE = 1;

      // Right eye.
      RIGHT_EYE = 2;

      // Left of left eyebrow.
      LEFT_OF_LEFT_EYEBROW = 3;

      // Right of left eyebrow.
      RIGHT_OF_LEFT_EYEBROW = 4;

      // Left of right eyebrow.
      LEFT_OF_RIGHT_EYEBROW = 5;

      // Right of right eyebrow.
      RIGHT_OF_RIGHT_EYEBROW = 6;

      // Midpoint between eyes.
      MIDPOINT_BETWEEN_EYES = 7;

      // Nose tip.
      NOSE_TIP = 8;

      // Upper lip.
      UPPER_LIP = 9;

      // Lower lip.
      LOWER_LIP = 10;

      // Mouth left.
      MOUTH_LEFT = 11;

      // Mouth right.
      MOUTH_RIGHT = 12;

      // Mouth center.
      MOUTH_CENTER = 13;

      // Nose, bottom right.
      NOSE_BOTTOM_RIGHT = 14;

      // Nose, bottom left.
      NOSE_BOTTOM_LEFT = 15;

      // Nose, bottom center.
      NOSE_BOTTOM_CENTER = 16;

      // Left eye, top boundary.
      LEFT_EYE_TOP_BOUNDARY = 17;

      // Left eye, right corner.
      LEFT_EYE_RIGHT_CORNER = 18;

      // Left eye, bottom boundary.
      LEFT_EYE_BOTTOM_BOUNDARY = 19;

      // Left eye, left corner.
      LEFT_EYE_LEFT_CORNER = 20;

      // Right eye, top boundary.
      RIGHT_EYE_TOP_BOUNDARY = 21;

      // Right eye, right corner.
      RIGHT_EYE_RIGHT_CORNER = 22;

      // Right eye, bottom boundary.
      RIGHT_EYE_BOTTOM_BOUNDARY = 23;

      // Right eye, left corner.
      RIGHT_EYE_LEFT_CORNER = 24;

      // Left eyebrow, upper midpoint.
      LEFT_EYEBROW_UPPER_MIDPOINT = 25;

      // Right eyebrow, upper midpoint.
      RIGHT_EYEBROW_UPPER_MIDPOINT = 26;

      // Left ear tragion.
      LEFT_EAR_TRAGION = 27;

      // Right ear tragion.
      RIGHT_EAR_TRAGION = 28;

      // Left eye pupil.
      LEFT_EYE_PUPIL = 29;

      // Right eye pupil.
      RIGHT_EYE_PUPIL = 30;

      // Forehead glabella.
      FOREHEAD_GLABELLA = 31;

      // Chin gnathion.
      CHIN_GNATHION = 32;

      // Chin left gonion.
      CHIN_LEFT_GONION = 33;

      // Chin right gonion.
      CHIN_RIGHT_GONION = 34;
    }

    // Face landmark type.
    Type type = 3;

    // Face landmark position.
    Position position = 4;
  }

  // The bounding polygon around the face. The coordinates of the bounding box
  // are in the original image's scale, as returned in `ImageParams`.
  // The bounding box is computed to "frame" the face in accordance with human
  // expectations. It is based on the landmarker results.
  // Note that one or more x and/or y coordinates may not be generated in the
  // `BoundingPoly` (the polygon will be unbounded) if only a partial face
  // appears in the image to be annotated.
  BoundingPoly bounding_poly = 1;

  // The `fd_bounding_poly` bounding polygon is tighter than the
  // `bounding_poly`, and encloses only the skin part of the face. Typically, it
  // is used to eliminate the face from any image analysis that detects the
  // "amount of skin" visible in an image. It is not based on the
  // landmarker results, only on the initial face detection, hence
  // the <code>fd</code> (face detection) prefix.
  BoundingPoly fd_bounding_poly = 2;

  // Detected face landmarks.
  repeated Landmark landmarks = 3;

  // Roll angle, which indicates the amount of clockwise/anti-clockwise rotation
  // of the face relative to the image vertical about the axis perpendicular to
  // the face. Range [-180,180].
  float roll_angle = 4;

  // Yaw angle, which indicates the leftward/rightward angle that the face is
  // pointing relative to the vertical plane perpendicular to the image. Range
  // [-180,180].
  float pan_angle = 5;

  // Pitch angle, which indicates the upwards/downwards angle that the face is
  // pointing relative to the image's horizontal plane. Range [-180,180].
  float tilt_angle = 6;

  // Detection confidence. Range [0, 1].
  float detection_confidence = 7;

  // Face landmarking confidence. Range [0, 1].
  float landmarking_confidence = 8;

  // Joy likelihood.
  Likelihood joy_likelihood = 9;

  // Sorrow likelihood.
  Likelihood sorrow_likelihood = 10;

  // Anger likelihood.
  Likelihood anger_likelihood = 11;

  // Surprise likelihood.
  Likelihood surprise_likelihood = 12;

  // Under-exposed likelihood.
  Likelihood under_exposed_likelihood = 13;

  // Blurred likelihood.
  Likelihood blurred_likelihood = 14;

  // Headwear likelihood.
  Likelihood headwear_likelihood = 15;
}

// Detected entity location information.
message LocationInfo {
  // lat/long location coordinates.
  google.type.LatLng lat_lng = 1;
}

// A `Property` consists of a user-supplied name/value pair.
message Property {
  // Name of the property.
  string name = 1;

  // Value of the property.
  string value = 2;

  // Value of numeric properties.
  uint64 uint64_value = 3;
}

// Set of detected entity features.
message EntityAnnotation {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string mid = 1;

  // The language code for the locale in which the entity textual
  // `description` is expressed.
  string locale = 2;

  // Entity textual description, expressed in its `locale` language.
  string description = 3;

  // Overall score of the result. Range [0, 1].
  float score = 4;

  // **Deprecated. Use `score` instead.**
  // The accuracy of the entity detection in an image.
  // For example, for an image in which the "Eiffel Tower" entity is detected,
  // this field represents the confidence that there is a tower in the query
  // image. Range [0, 1].
  float confidence = 5 [deprecated = true];

  // The relevancy of the ICA (Image Content Annotation) label to the
  // image. For example, the relevancy of "tower" is likely higher to an image
  // containing the detected "Eiffel Tower" than to an image containing a
  // detected distant towering building, even though the confidence that
  // there is a tower in each image may be the same. Range [0, 1].
  float topicality = 6;

  // Image region to which this entity belongs. Not produced
  // for `LABEL_DETECTION` features.
  BoundingPoly bounding_poly = 7;

  // The location information for the detected entity. Multiple
  // `LocationInfo` elements can be present because one location may
  // indicate the location of the scene in the image, and another location
  // may indicate the location of the place where the image was taken.
  // Location information is usually present for landmarks.
  repeated LocationInfo locations = 8;

  // Some entities may have optional user-supplied `Property` (name/value)
  // fields, such as a score or string that qualifies the entity.
  repeated Property properties = 9;
}

// Set of detected objects with bounding boxes.
message LocalizedObjectAnnotation {
  // Object ID that should align with EntityAnnotation mid.
  string mid = 1;

  // The BCP-47 language code, such as "en-US" or "sr-Latn". For more
  // information, see
  // http://www.unicode.org/reports/tr35/#Unicode_locale_identifier.
  string language_code = 2;

  // Object name, expressed in its `language_code` language.
  string name = 3;

  // Score of the result. Range [0, 1].
  float score = 4;

  // Image region to which this object belongs. This must be populated.
  BoundingPoly bounding_poly = 5;
}

// Set of features pertaining to the image, computed by computer vision
// methods over safe-search verticals (for example, adult, spoof, medical,
// violence).
message SafeSearchAnnotation {
  // Represents the adult content likelihood for the image. Adult content may
  // contain elements such as nudity, pornographic images or cartoons, or
  // sexual activities.
  Likelihood adult = 1;

  // Spoof likelihood. The likelihood that a modification
  // was made to the image's canonical version to make it appear
  // funny or offensive.
  Likelihood spoof = 2;

  // Likelihood that this is a medical image.
  Likelihood medical = 3;

  // Likelihood that this image contains violent content.
  Likelihood violence = 4;

  // Likelihood that the request image contains racy content. Racy content may
  // include (but is not limited to) skimpy or sheer clothing, strategically
  // covered nudity, lewd or provocative poses, or close-ups of sensitive
  // body areas.
  Likelihood racy = 9;
}

// Rectangle determined by min and max `LatLng` pairs.
message LatLongRect {
  // Min lat/long pair.
  google.type.LatLng min_lat_lng = 1;

  // Max lat/long pair.
  google.type.LatLng max_lat_lng = 2;
}

// Color information consists of RGB channels, score, and the fraction of
// the image that the color occupies in the image.
message ColorInfo {
  // RGB components of the color.
  google.type.Color color = 1;

  // Image-specific score for this color. Value in range [0, 1].
  float score = 2;

  // The fraction of pixels the color occupies in the image.
  // Value in range [0, 1].
  float pixel_fraction = 3;
}

// Set of dominant colors and their corresponding scores.
message DominantColorsAnnotation {
  // RGB color values with their score and pixel fraction.
  repeated ColorInfo colors = 1;
}

// Stores image properties, such as dominant colors.
message ImageProperties {
  // If present, dominant colors completed successfully.
  DominantColorsAnnotation dominant_colors = 1;
}

// Single crop hint that is used to generate a new crop when serving an image.
message CropHint {
  // The bounding polygon for the crop region. The coordinates of the bounding
  // box are in the original image's scale, as returned in `ImageParams`.
  BoundingPoly bounding_poly = 1;

  // Confidence of this being a salient region. Range [0, 1].
  float confidence = 2;

  // Fraction of importance of this salient region with respect to the original
  // image.
  float importance_fraction = 3;
}

// Set of crop hints that are used to generate new crops when serving images.
message CropHintsAnnotation {
  // Crop hint results.
  repeated CropHint crop_hints = 1;
}

// Parameters for crop hints annotation request.
message CropHintsParams {
  // Aspect ratios in floats, representing the ratio of the width to the height
  // of the image. For example, if the desired aspect ratio is 4/3, the
  // corresponding float value should be 1.33333. If not specified, the
  // best possible crop is returned. The number of provided aspect ratios is
  // limited to a maximum of 16; any aspect ratios provided after the 16th are
  // ignored.
  repeated float aspect_ratios = 1;
}

// Parameters for web detection request.
message WebDetectionParams {
  // Whether to include results derived from the geo information in the image.
  bool include_geo_results = 2;
}

// Parameters for text detections. This is used to control TEXT_DETECTION and
// DOCUMENT_TEXT_DETECTION features.
message TextDetectionParams {
  // By default, Cloud Vision API only includes confidence score for
  // DOCUMENT_TEXT_DETECTION result. Set the flag to true to include confidence
  // score for TEXT_DETECTION as well.
  bool enable_text_detection_confidence_score = 9;

  // A list of advanced OCR options to fine-tune OCR behavior.
  repeated string advanced_ocr_options = 11;
}

// Image context and/or feature-specific parameters.
message ImageContext {
  // Not used.
  LatLongRect lat_long_rect = 1;

  // List of languages to use for TEXT_DETECTION. In most cases, an empty value
  // yields the best results since it enables automatic language detection. For
  // languages based on the Latin alphabet, setting `language_hints` is not
  // needed. In rare cases, when the language of the text in the image is known,
  // setting a hint will help get better results (although it will be a
  // significant hindrance if the hint is wrong). Text detection returns an
  // error if one or more of the specified languages is not one of the
  // [supported languages](https://cloud.google.com/vision/docs/languages).
  repeated string language_hints = 2;

  // Parameters for crop hints annotation request.
  CropHintsParams crop_hints_params = 4;

  // Parameters for product search.
  google.cloud.vision.v1p3beta1.ProductSearchParams product_search_params = 5;

  // Parameters for web detection.
  WebDetectionParams web_detection_params = 6;

  // Parameters for text detection and document text detection.
  TextDetectionParams text_detection_params = 12;
}

// Request for performing Google Cloud Vision API tasks over a user-provided
// image, with user-requested features.
message AnnotateImageRequest {
  // The image to be processed.
  Image image = 1;

  // Requested features.
  repeated Feature features = 2;

  // Additional context that may accompany the image.
  ImageContext image_context = 3;
}

// If an image was produced from a file (e.g. a PDF), this message gives
// information about the source of that image.
message ImageAnnotationContext {
  // The URI of the file used to produce the image.
  string uri = 1;

  // If the file was a PDF or TIFF, this field gives the page number within
  // the file used to produce the image.
  int32 page_number = 2;
}

// Response to an image annotation request.
message AnnotateImageResponse {
  // If present, face detection has completed successfully.
  repeated FaceAnnotation face_annotations = 1;

  // If present, landmark detection has completed successfully.
  repeated EntityAnnotation landmark_annotations = 2;

  // If present, logo detection has completed successfully.
  repeated EntityAnnotation logo_annotations = 3;

  // If present, label detection has completed successfully.
  repeated EntityAnnotation label_annotations = 4;

  // If present, localized object detection has completed successfully.
  // This will be sorted descending by confidence score.
  repeated LocalizedObjectAnnotation localized_object_annotations = 22;

  // If present, text (OCR) detection has completed successfully.
  repeated EntityAnnotation text_annotations = 5;

  // If present, text (OCR) detection or document (OCR) text detection has
  // completed successfully.
  // This annotation provides the structural hierarchy for the OCR detected
  // text.
  TextAnnotation full_text_annotation = 12;

  // If present, safe-search annotation has completed successfully.
  SafeSearchAnnotation safe_search_annotation = 6;

  // If present, image properties were extracted successfully.
  ImageProperties image_properties_annotation = 8;

  // If present, crop hints have completed successfully.
  CropHintsAnnotation crop_hints_annotation = 11;

  // If present, web detection has completed successfully.
  WebDetection web_detection = 13;

  // If present, product search has completed successfully.
  google.cloud.vision.v1p3beta1.ProductSearchResults product_search_results =
      14;

  // If set, represents the error message for the operation.
  // Note that filled-in image annotations are guaranteed to be
  // correct, even when `error` is set.
  google.rpc.Status error = 9;

  // If present, contextual information is needed to understand where this image
  // comes from.
  ImageAnnotationContext context = 21;
}

// Response to a single file annotation request. A file may contain one or more
// images, which individually have their own responses.
message AnnotateFileResponse {
  // Information about the file for which this response is generated.
  InputConfig input_config = 1;

  // Individual responses to images found within the file.
  repeated AnnotateImageResponse responses = 2;
}

// Multiple image annotation requests are batched into a single service call.
message BatchAnnotateImagesRequest {
  // Required. Individual image annotation requests for this batch.
  repeated AnnotateImageRequest requests = 1
      [(google.api.field_behavior) = REQUIRED];
}

// Response to a batch image annotation request.
message BatchAnnotateImagesResponse {
  // Individual responses to image annotation requests within the batch.
  repeated AnnotateImageResponse responses = 1;
}

// An offline file annotation request.
message AsyncAnnotateFileRequest {
  // Required. Information about the input file.
  InputConfig input_config = 1;

  // Required. Requested features.
  repeated Feature features = 2;

  // Additional context that may accompany the image(s) in the file.
  ImageContext image_context = 3;

  // Required. The desired output location and metadata (e.g. format).
  OutputConfig output_config = 4;
}

// The response for a single offline file annotation request.
message AsyncAnnotateFileResponse {
  // The output location and metadata from AsyncAnnotateFileRequest.
  OutputConfig output_config = 1;
}

// Multiple async file annotation requests are batched into a single service
// call.
message AsyncBatchAnnotateFilesRequest {
  // Required. Individual async file annotation requests for this batch.
  repeated AsyncAnnotateFileRequest requests = 1
      [(google.api.field_behavior) = REQUIRED];
}

// Response to an async batch file annotation request.
message AsyncBatchAnnotateFilesResponse {
  // The list of file annotation responses, one for each request in
  // AsyncBatchAnnotateFilesRequest.
  repeated AsyncAnnotateFileResponse responses = 1;
}

// The desired input location and metadata.
message InputConfig {
  // The Google Cloud Storage location to read the input from.
  GcsSource gcs_source = 1;

  // The type of the file. Currently only "application/pdf" and "image/tiff"
  // are supported. Wildcards are not supported.
  string mime_type = 2;
}

// The desired output location and metadata.
message OutputConfig {
  // The Google Cloud Storage location to write the output(s) to.
  GcsDestination gcs_destination = 1;

  // The max number of response protos to put into each output JSON file on
  // Google Cloud Storage.
  // The valid range is [1, 100]. If not specified, the default value is 20.
  //
  // For example, for one pdf file with 100 pages, 100 response protos will
  // be generated. If `batch_size` = 20, then 5 json files each
  // containing 20 response protos will be written under the prefix
  // `gcs_destination`.`uri`.
  //
  // Currently, batch_size only applies to GcsDestination, with potential future
  // support for other output configurations.
  int32 batch_size = 2;
}

// The Google Cloud Storage location where the input will be read from.
message GcsSource {
  // Google Cloud Storage URI for the input file. This must only be a
  // Google Cloud Storage object. Wildcards are not currently supported.
  string uri = 1;
}

// The Google Cloud Storage location where the output will be written to.
message GcsDestination {
  // Google Cloud Storage URI where the results will be stored. Results will
  // be in JSON format and preceded by its corresponding input URI. This field
  // can either represent a single file, or a prefix for multiple outputs.
  // Prefixes must end in a `/`.
  //
  // Examples:
  //
  // *    File: gs://bucket-name/filename.json
  // *    Prefix: gs://bucket-name/prefix/here/
  // *    File: gs://bucket-name/prefix/here
  //
  // If multiple outputs, each response is still AnnotateFileResponse, each of
  // which contains some subset of the full list of AnnotateImageResponse.
  // Multiple outputs can happen if, for example, the output JSON is too large
  // and overflows into multiple sharded files.
  string uri = 1;
}

// A bucketized representation of likelihood, which is intended to give clients
// highly stable results across model upgrades.
enum Likelihood {
  // Unknown likelihood.
  UNKNOWN = 0;

  // It is very unlikely that the image belongs to the specified vertical.
  VERY_UNLIKELY = 1;

  // It is unlikely that the image belongs to the specified vertical.
  UNLIKELY = 2;

  // It is possible that the image belongs to the specified vertical.
  POSSIBLE = 3;

  // It is likely that the image belongs to the specified vertical.
  LIKELY = 4;

  // It is very likely that the image belongs to the specified vertical.
  VERY_LIKELY = 5;
}

// Contains metadata for the AsyncBatchAnnotateFiles operation; it is the
// `metadata_type` declared in that RPC's `google.longrunning.operation_info`.
message OperationMetadata {
  // Batch operation states.
  enum State {
    // Invalid.
    STATE_UNSPECIFIED = 0;

    // Request is received.
    CREATED = 1;

    // Request is actively being processed.
    RUNNING = 2;

    // The batch processing is done.
    DONE = 3;

    // The batch processing was cancelled.
    CANCELLED = 4;
  }

  // Current state of the batch operation.
  State state = 1;

  // The time when the batch request was received.
  google.protobuf.Timestamp create_time = 5;

  // The time when the operation result was last updated.
  google.protobuf.Timestamp update_time = 6;
}