# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""A Cloud Functions proxy enabling GCE VMs in a Managed Instance Group to delete themselves.

GCE Managed instance groups don't have any good way to handle autoscaling for
long-running workloads. With the autoscaler configured to scale in, instances
get only 90 seconds warning to shut down. So we set the autoscaler to only scale
out and have the VMs tear themselves down when they're done with their work.
This is the approach suggested by the managed instance group team:
https://drive.google.com/file/d/1XlwxF_0T7pUnbzhL5ePDoW-Q3GAaLO11

But anything that brings down the VM other than a delete call to the instance
group manager API makes the VM get considered "unhealthy", which means it gets
recreated in exactly the same configuration, regardless of any update or
autoscaling settings. Making the correct API call requires broad permissions on
the instance group manager, which we don't want to give the VMs. To scope
permissions to individual instances, this proxy service makes use of instance
identity tokens to allow an instance to make a call only to delete itself.
See https://cloud.google.com/compute/docs/instances/verifying-instance-identity

This makes use of the GCP Cloud Functions serverless offering. It's another
level of abstraction on top of Cloud Run, where you don't even need to create
your own docker container.

For local development:

  functions-framework --target=delete_self
  curl -X DELETE -v --header "Authorization: Bearer $(cat /tmp/token.txt)" localhost:8080

You'll need to get a token that corresponds to an actual instance though or
you'll get an error:

  gcloud compute ssh gh-runner-testing-presubmit-cpu-us-west1-h58j \
    --user-output-enabled=false \
    --command "curl -sSfL \
      -H 'Metadata-Flavor: Google' \
      'http://metadata/computeMetadata/v1/instance/service-accounts/default/identity?audience=localhost&format=full'" \
    > /tmp/token.txt

To deploy:

  # Note timeout should be greater than STABILIZE_TIMEOUT_SECONDS
  gcloud functions deploy instance-self-deleter \
    --gen2 \
    --runtime=python310 \
    --region=us-central1 \
    --source=. \
    --entry-point=delete_self \
    --trigger-http \
    --run-service-account=managed-instance-deleter@iree-oss.iam.gserviceaccount.com \
    --service-account=managed-instance-deleter@iree-oss.iam.gserviceaccount.com \
    --ingress-settings=internal-only \
    --timeout=120s \
    --set-env-vars ALLOWED_MIG_PATTERN='gh-runner-.*'

See https://cloud.google.com/functions/docs for more details.
"""

import os
import random
import re
import time
from http.client import (
    BAD_REQUEST,
    FORBIDDEN,
    GATEWAY_TIMEOUT,
    INTERNAL_SERVER_ERROR,
    NOT_FOUND,
    UNAUTHORIZED,
)

import flask
import functions_framework
import google.api_core.exceptions
import google.auth.exceptions
import requests
from google.auth import transport
from google.cloud import compute
from google.oauth2 import id_token

AUTH_HEADER_PREFIX = "Bearer "
ALLOWED_HTTP_METHODS = ["DELETE", "GET"]
MIG_METADATA_KEY = "created-by"
ALLOWED_MIG_PATTERN_ENV_VARIABLE = "ALLOWED_MIG_PATTERN"

# Must be less than timeout configuration for deployment
STABILIZE_TIMEOUT_SECONDS = 100

instances_client = compute.InstancesClient()
migs_client = compute.RegionInstanceGroupManagersClient()
autoscalers_client = compute.RegionAutoscalersClient()
session = requests.Session()

print("Server started")


def _verify_token(token: str) -> dict:
    """Verify token signature and return the token payload.

    No audience is passed to the verifier; see the comment at the call site
    in `delete_self` for the reasoning. The caller treats `ValueError` and
    `google.auth.exceptions.GoogleAuthError` as verification failures.
    """
    request = transport.requests.Request(session)
    payload = id_token.verify_oauth2_token(token, request=request)
    return payload


def _get_region(zone: str) -> str:
    """Extract region name from zone name.

    Everything after the last "-" is dropped, e.g. "us-west1-a" becomes
    "us-west1".

    Raises:
      ValueError: if `zone` contains no "-".
    """
    # Drop the trailing zone identifier to get the region. Yeah it kinda does seem
    # like there should be a better way to do this...
    region, _ = zone.rsplit("-", maxsplit=1)
    return region


def _get_name_from_resource(resource: str) -> str:
    """Extract just the final name component from a fully scoped resource name.

    A value without any "/" is returned unchanged, so a bare name is accepted
    as well as a fully scoped one.
    """
    # rpartition (unlike tuple-unpacking rsplit) does not raise when there is no
    # separator: the whole string ends up in the last element.
    return resource.rpartition("/")[2]


def _get_from_items(items: compute.Items, key: str):
    """Return the value of the first item whose key equals `key`, else None."""
    # Why would the GCP Python API return something as silly as a dictionary?
    return next((item.value for item in items if item.key == key), None)


def delete_instance_from_mig(
    mig_name: str, project: str, region: str, instance: compute.Instance
) -> str:
    """Ask the managed instance group to delete `instance`.

    Args:
      mig_name: Name of the regional managed instance group.
      project: Project containing the MIG.
      region: Region containing the MIG.
      instance: The instance to delete. Its `self_link` and `name` are used.

    Returns:
      A success message once the instance has been marked for deletion.

    Raises:
      werkzeug HTTPException (via `flask.abort`): with the API error's own code
        for Forbidden/Unauthorized/NotFound, INTERNAL_SERVER_ERROR for any other
        failure issuing the request, and BAD_REQUEST if the returned operation
        reports a client error.
    """
    try:
        operation = migs_client.delete_instances(
            instance_group_manager=mig_name,
            project=project,
            region=region,
            # For some reason we can't just use a list of instance names and need to
            # build this RhymingRythmicJavaClasses proto. Also, unlike all the other
            # parameters, the instance has to be a fully-specified URL for the
            # instance, not just its name.
            region_instance_group_managers_delete_instances_request_resource=(
                compute.RegionInstanceGroupManagersDeleteInstancesRequest(
                    instances=[instance.self_link]
                )
            ),
        )
    except (
        google.api_core.exceptions.Forbidden,
        google.api_core.exceptions.Unauthorized,
        google.api_core.exceptions.NotFound,
    ) as e:
        print(e)
        return flask.abort(
            e.code, f"Error requesting that {mig_name} delete {instance.name}."
        )
    except Exception as e:
        # We'll call any other error here a server error.
        print(e)
        return flask.abort(
            INTERNAL_SERVER_ERROR,
            f"Error requesting that {mig_name} delete {instance.name}.",
        )

    try:
        # This is actually an extended operation that you have to poll to get its
        # status, but we just check the status once because it appears that errors
        # always show up here and all we want is to return success in marking for
        # deletion. We don't need to wait for the deletion to actually take place.
        operation.result()
    except google.api_core.exceptions.ClientError as e:
        print(e)
        # Unpack the actual usable error message.
        # NOTE(review): this assumes `e.response` is the operation object carrying
        # `.error.errors` - confirm against the google-api-core version in use.
        msg = (
            f"Error requesting that {mig_name} delete {instance.name}:"
            "\n"
            + "\n".join(
                [f"{err.code}: {err.message}" for err in e.response.error.errors]
            )
        )
        print(msg)
        # We're not actually totally sure whether this is a client or server error
        # for the overall request, but let's call it a client error (the only client
        # here is our VM instances, so I think we can be a bit loose).
        return flask.abort(BAD_REQUEST, msg)

    success_msg = f"{instance.name} has been marked for deletion by {mig_name}."
    print(success_msg)
    return success_msg


def should_scale_down(mig_name: str, project: str, region: str) -> str:
    """Report whether the autoscaler recommends fewer instances than the MIG targets.

    Polls the MIG until it reports itself stable (for up to
    STABILIZE_TIMEOUT_SECONDS), then compares the autoscaler's recommended size
    with the MIG's target size.

    Args:
      mig_name: Name of the regional managed instance group.
      project: Project containing the MIG.
      region: Region containing the MIG.

    Returns:
      "true" if the recommended size is below the MIG target size, otherwise
      "false".

    Raises:
      werkzeug HTTPException (via `flask.abort`): with the API error's code if
        the MIG or its autoscaler cannot be found, GATEWAY_TIMEOUT if the MIG
        does not stabilize in time, and INTERNAL_SERVER_ERROR if the MIG reports
        no autoscaler.
    """
    start = time.time()
    print(f"Polling {mig_name} for stability")
    while time.time() - start < STABILIZE_TIMEOUT_SECONDS:
        try:
            mig = migs_client.get(
                project=project, region=region, instance_group_manager=mig_name
            )
        except google.api_core.exceptions.NotFound as e:
            print(e)
            return flask.abort(
                e.code, f"Cannot find {mig_name} in region={region}, project={project}"
            )
        if mig.status.is_stable:
            break
        # We sleep for a random amount of time here to avoid synchronizing callers
        # waiting for the MIG to be stable.
        sleep_secs = random.randint(1, 15)
        print(f"{mig_name} is not stable. Retrying in {sleep_secs} seconds")
        time.sleep(sleep_secs)
    else:
        # Only reached when the loop condition fails, i.e. we never hit `break`.
        return flask.abort(
            GATEWAY_TIMEOUT, "Timed out waiting for the MIG to become stable"
        )

    # Presumably this field is empty when no autoscaler is attached to the MIG
    # (TODO confirm). Report that explicitly rather than failing on an empty
    # resource name further down.
    autoscaler_resource = mig.status.autoscaler
    if not autoscaler_resource:
        msg = f"{mig_name} does not report an autoscaler"
        print(msg)
        return flask.abort(INTERNAL_SERVER_ERROR, msg)

    autoscaler_name = _get_name_from_resource(autoscaler_resource)
    try:
        autoscaler = autoscalers_client.get(
            project=project,
            region=region,
            autoscaler=autoscaler_name,
        )
    except google.api_core.exceptions.NotFound as e:
        print(e)
        return flask.abort(
            e.code,
            f"Cannot find autoscaler {autoscaler_name} in region={region},"
            f" project={project}",
        )

    response = "true" if autoscaler.recommended_size < mig.target_size else "false"
    print(
        f"Autoscaler recommends size {autoscaler.recommended_size} and"
        f" {mig_name} is targetting size {mig.target_size}. Sending: {response}"
    )
    return response


@functions_framework.http
def delete_self(request: flask.Request):
    """HTTP Cloud Function letting an instance delete itself from its MIG.

    The caller is identified solely by the instance identity token in the
    `Authorization: Bearer` header. A DELETE request asks the instance's managed
    instance group to delete the calling instance; a GET request asks whether
    the group should scale down (see `should_scale_down`).

    Args:
      request: The request object.
        https://flask.palletsprojects.com/en/1.1.x/api/#incoming-request-data
    Returns:
      The response text, or any set of values that can be turned into a
      Response object using `make_response`
      https://flask.palletsprojects.com/en/1.1.x/api/#flask.make_response.
    Raises:
      werkzeug HTTPException (via `flask.abort`) for every rejected request.
    Note:
      For more information on how Flask integrates with Cloud Functions, see the
      `Writing HTTP functions` page.
      https://cloud.google.com/functions/docs/writing/http#http_frameworks
    """
    if request.method not in ALLOWED_HTTP_METHODS:
        return flask.abort(
            BAD_REQUEST,
            f"Invalid method {request.method}."
            f" Allowed methods: {ALLOWED_HTTP_METHODS}",
        )

    # No path is needed, since the token and method contain all the information we
    # need. Maybe that design was a mistake, but since the resource being operated
    # on is always the instance making the call, it seemed handy.
    if request.path != "/":
        return flask.abort(
            BAD_REQUEST,
            f"Invalid request path {request.path}. Only root path is valid).",
        )

    auth_header = request.headers.get("Authorization")
    if auth_header is None:
        return flask.abort(UNAUTHORIZED, "Authorization header is missing")
    if not auth_header.startswith(AUTH_HEADER_PREFIX):
        return flask.abort(
            UNAUTHORIZED,
            "Authorization header does not start with expected string"
            f" {AUTH_HEADER_PREFIX}.",
        )

    token = auth_header[len(AUTH_HEADER_PREFIX) :]

    try:
        # We don't verify audience here because Cloud IAM will have already done so
        # and jwt's matching of audiences is exact, which means trailing slashes or
        # http vs https matters and that's pretty brittle.
        token_payload = _verify_token(token)
    except (ValueError, google.auth.exceptions.GoogleAuthError) as e:
        print(e)
        return flask.abort(UNAUTHORIZED, "Decoding bearer token failed.")

    print(f"Token payload: {token_payload}")

    try:
        compute_info = token_payload["google"]["compute_engine"]
    except (KeyError, TypeError):
        return flask.abort(
            UNAUTHORIZED,
            "Bearer token payload does not have expected field"
            " google.compute_engine",
        )

    # Pull out everything we need from the token up front so that a token lacking
    # one of these fields is rejected as unauthorized before any API call is
    # made, instead of surfacing as an unhandled exception.
    try:
        project = compute_info["project_id"]
        zone = compute_info["zone"]
        region = _get_region(zone)
        instance_name = compute_info["instance_name"]
        instance_id = int(compute_info["instance_id"])
    except (KeyError, TypeError, ValueError) as e:
        print(e)
        return flask.abort(
            UNAUTHORIZED,
            "Bearer token payload has missing or malformed fields in"
            " google.compute_engine",
        )

    if request.method == "DELETE":
        print(f"Received request to delete {instance_name}")
    else:
        assert request.method == "GET"
        print(f"Received inquiry whether to delete {instance_name}")

    try:
        instance = instances_client.get(
            instance=instance_name, project=project, zone=zone
        )
    except (
        google.api_core.exceptions.NotFound,
        google.api_core.exceptions.Forbidden,
    ) as e:
        print(e)
        return flask.abort(
            e.code, f"Cannot view {instance_name} in zone={zone}, project={project}"
        )

    # Verify it's *actually* the same instance. Names get reused, but IDs don't.
    # For some reason you can't reference anything by ID in the API.
    if instance.id != instance_id:
        return flask.abort(
            BAD_REQUEST,
            f"Existing instance of the same name {instance.name} has a different"
            f" ID {instance.id} than token specifies {instance_id}.",
        )

    mig_name = _get_from_items(instance.metadata.items, MIG_METADATA_KEY)

    if mig_name is None:
        return flask.abort(
            BAD_REQUEST,
            (
                "Instance is not part of a managed instance group."
                f" Did not find {MIG_METADATA_KEY} in metadata."
            ),
        )
    mig_name = _get_name_from_resource(mig_name)

    # General good practice would be to compile the regex once, but the only way
    # to do that is to make it a global, which makes this difficult to test and
    # compiling this regex should not be expensive.
    allowed_mig_pattern = os.environ.get(ALLOWED_MIG_PATTERN_ENV_VARIABLE)
    if allowed_mig_pattern is None:
        return flask.abort(
            INTERNAL_SERVER_ERROR,
            "Missing required environment variable"
            f" {ALLOWED_MIG_PATTERN_ENV_VARIABLE}",
        )

    if not re.fullmatch(allowed_mig_pattern, mig_name):
        return flask.abort(FORBIDDEN, f"No access to MIG {mig_name}")

    if request.method == "DELETE":
        return delete_instance_from_mig(
            mig_name=mig_name, project=project, region=region, instance=instance
        )

    assert request.method == "GET"
    return should_scale_down(mig_name=mig_name, project=project, region=region)