#!/bin/bash

# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Creates a GCE disk image for GitHub Actions runners.
#
# The script boots a temporary instance whose startup script is the
# `image_setup.sh` file next to this script, streams the instance's
# /startup.log until the startup script exits, checks the exit code recorded in
# /startup-exit.txt, stops the instance and creates a disk image from its boot
# disk.
#
# Environment variables (all optional):
#   RUNNER_TYPE    cpu (default), arm64 or gpu. Case-insensitive.
#   INSTANCE_NAME  Name of the temporary instance. An existing instance with
#                  this name is reused instead of creating a new one.
#   ZONE           GCE zone for the instance (default: us-central1-a).
#   BASE_IMAGE     Image used for the instance boot disk.

set -o errexit   # Exit if any command fails
set -o errtrace  # make ERR trap inherit
set -o pipefail  # return error if any part of a pipe errors
set -o nounset   # error if an undefined variable is used

# Whether to delete the temporary instance after a successful/failed run.
SUCCESS_DELETE_INSTANCE=1
FAILURE_DELETE_INSTANCE=0

RUNNER_TYPE="${RUNNER_TYPE:-cpu}"
RUNNER_TYPE="${RUNNER_TYPE,,}"

TIME_STRING="$(date +%Y-%m-%d-%s)"

INSTANCE_NAME="${INSTANCE_NAME:-github-runner-template-${RUNNER_TYPE}-${TIME_STRING}}"
# The image is named after the instance, minus the "-template" marker.
IMAGE_NAME="${INSTANCE_NAME/-template/}"
ZONE="${ZONE:-us-central1-a}"
PROJECT=iree-oss

case "${RUNNER_TYPE}" in
  arm64)
    BASE_IMAGE_ARCH="-arm64"
    ;;
  *)
    BASE_IMAGE_ARCH=""
    ;;
esac

BASE_IMAGE="${BASE_IMAGE:-projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy${BASE_IMAGE_ARCH}-v20231030}"

# We create the image using n1 machines with attached T4 GPUs. This image works
# for the A100 machines as well though.
GPU_MACHINE_TYPE="n1-standard-16"
X86_64_MACHINE_TYPE="e2-medium"
ARM64_MACHINE_TYPE="t2a-standard-8"

CPU_IMAGE_SIZE_GB=10
# We need enough space to fetch Docker images that we test with
# TODO(gcmn): See if we can make the image smaller, e.g. by resizing after setup
# or using a local ssd for scratch space during setup.
GPU_IMAGE_SIZE_GB=100

# It takes a little bit to bring up ssh on the instance. I haven't found a
# better way to wait for this than just polling.
MAX_IP_ATTEMPTS=5
MAX_SSH_ATTEMPTS=10
MAX_SCP_ATTEMPTS=5

DELETE_INSTANCE_CMD=(
  gcloud
  compute
  instances
  delete
  "${INSTANCE_NAME}"
  --zone="${ZONE}"
)

SSH_CMD=(
  gcloud
  compute
  ssh
  "${INSTANCE_NAME}"
  --zone="${ZONE}"
  --no-user-output-enabled
)

# Prints the commands for debugging on, and later deleting, the instance.
function cleanup_reminder() {
  echo "You can ssh in to debug with the following command:"
  echo "${SSH_CMD[@]}"
  echo "Make sure to delete ${INSTANCE_NAME} when you're done debugging:"
  echo "${DELETE_INSTANCE_CMD[@]}"
}

# Trap handler for INT, ERR and EXIT. On a non-zero status it either deletes the
# instance or reminds the user how to do so, then exits with the same status.
function failure_exit() {
  local exit_code="$?"
  # Clear the traps first so this handler runs at most once.
  trap - INT ERR EXIT
  if (( exit_code != 0 )); then
    echo "Image creation was not successful."
    if (( FAILURE_DELETE_INSTANCE == 1 )); then
      echo "Attempting to delete instance ${INSTANCE_NAME}"
      # Don't let a failed delete replace the original failure code.
      "${DELETE_INSTANCE_CMD[@]}" --quiet \
        || echo "Failed to delete instance ${INSTANCE_NAME}"
    else
      cleanup_reminder
    fi
  fi
  exit "${exit_code}"
}

trap failure_exit INT ERR EXIT

SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";

# Prints the external IP of the instance (empty if none has been assigned).
function get_ip() {
  gcloud compute instances describe \
    "${INSTANCE_NAME}" \
    --zone="${ZONE}" \
    --format='value(networkInterfaces[0].accessConfigs[0].ip)'
}

# Runs `gcloud compute ssh` against the instance, forwarding all arguments.
function instance_ssh() {
  "${SSH_CMD[@]}" "$@"
}

function ssh_ping() {
  # ssh with a no-op command
  instance_ssh --command=":"
}

# Polls once per second until the instance reports an external IP.
# Arguments: $1 - number of failed polls tolerated before aborting.
# Exits the script with status 1 if no IP shows up in time.
function wait_for_ip() {
  local -i max_attempts="$1"
  local -i failed_attempts=0
  local ip=""
  while (( failed_attempts <= max_attempts )); do
    # The lookup runs as an `if` condition so that a failing gcloud call is
    # treated like "no IP yet" and retried instead of aborting the script.
    if ip="$(get_ip)" && [[ -n "${ip}" ]]; then
      return 0
    fi
    echo -n '.'
    failed_attempts="$(( failed_attempts+1 ))"
    sleep 1
  done

  echo "Instance was never assigned an external IP. Aborting"
  exit 1
}

# Retries a no-op ssh command until it succeeds.
# Arguments: $1 - number of failed connections tolerated before aborting.
# Exits the script with status 1, printing the output of the last attempt, if
# ssh never succeeds.
function wait_for_ssh() {
  local -i max_attempts="$1"
  local -i failed_attempts=0
  local ssh_output=""
  while (( failed_attempts <= max_attempts )) && ! ssh_output="$(ssh_ping 2>&1)"; do
    echo -n '.'
    failed_attempts="$(( failed_attempts+1 ))"
  done

  if (( failed_attempts > max_attempts )); then
    echo "Failed to connect to instance via ssh. Output from ssh command:"
    echo "${ssh_output}"
    exit 1
  fi
}

# Creates (or reuses) the instance, waits for its startup script to finish and
# turns its boot disk into the image ${IMAGE_NAME}.
function create_image() {
  if gcloud compute instances describe "${INSTANCE_NAME}" --zone="${ZONE}" > /dev/null 2>&1; then
    echo "Using existing instance '${INSTANCE_NAME}'"
  else
    echo "Creating instance '${INSTANCE_NAME}' for boot disk"

    local machine_type=""
    local image_size_gb=""
    local maintenance_policy=""
    local -a extra_args=()
    case "${RUNNER_TYPE}" in
      cpu)
        machine_type="${X86_64_MACHINE_TYPE}"
        image_size_gb="${CPU_IMAGE_SIZE_GB}"
        maintenance_policy=MIGRATE
        ;;
      arm64)
        machine_type="${ARM64_MACHINE_TYPE}"
        image_size_gb="${CPU_IMAGE_SIZE_GB}"
        maintenance_policy=MIGRATE
        ;;
      gpu)
        machine_type="${GPU_MACHINE_TYPE}"
        image_size_gb="${GPU_IMAGE_SIZE_GB}"
        maintenance_policy=TERMINATE
        extra_args=("--accelerator=count=1,type=nvidia-tesla-t4")
        ;;
      *)
        echo "Unrecognized RUNNER_TYPE=${RUNNER_TYPE}"
        exit 1
        ;;
    esac

    local -a create_instance_cmd=(
      gcloud
      compute
      instances
      create
      "${INSTANCE_NAME}"
      --project="${PROJECT}"
      --zone="${ZONE}"
      # `address=''` indicates an ephemeral IP. This *shouldn't* be necessary here,
      # as the gcloud docs say that this is the default, but in fact if you leave it
      # off the VM gets no external IP and is impossible to SSH into. This knowledge
      # was hard won.
      --network-interface=network=default,address='',network-tier=PREMIUM
      --provisioning-model=STANDARD
      --no-service-account
      --no-scopes
      --no-shielded-secure-boot
      --shielded-vtpm
      --shielded-integrity-monitoring
      --reservation-affinity=any
      --metadata-from-file=startup-script="${SCRIPT_DIR}/image_setup.sh"
      --maintenance-policy="${maintenance_policy}"
      --metadata="github-runner-type=${RUNNER_TYPE}"
      --machine-type="${machine_type}"
      --create-disk="boot=yes,device-name=${INSTANCE_NAME},image=${BASE_IMAGE},mode=rw,size=${image_size_gb},type=projects/${PROJECT}/zones/${ZONE}/diskTypes/pd-balanced,auto-delete=yes"
      "${extra_args[@]}"
    )

    # Trace the exact command in a subshell so `set -x` doesn't leak out.
    (set -x; "${create_instance_cmd[@]}")
  fi

  echo "Waiting for instance to start up"

  # We could only use the ssh check below, but it's much nicer to know why
  # an instance isn't responsive and this is something we can check first.
  wait_for_ip "${MAX_IP_ATTEMPTS}"
  wait_for_ssh "${MAX_SSH_ATTEMPTS}"

  echo ""
  # Declared separately from the assignment so a failing command isn't masked
  # by the exit status of `local`.
  local log_file
  log_file="$(mktemp --tmpdir "${INSTANCE_NAME}.XXX.startup.log")"
  echo "Streaming startup logs from instance to stdout and ${log_file}"

  # Get the PID of the startup script
  local startup_pid
  startup_pid="$(instance_ssh --command='systemctl show --property=ExecMainPID --value google-startup-scripts')"

  echo ""
  echo "*******************"

  # -t forces a pseudo-tty which allows us to run tail with a follow
  instance_ssh --ssh-flag="-t" \
    --command="tail --follow=name --retry --lines=+1 --pid=${startup_pid} /startup.log" \
    | tee "${log_file}"

  echo "*******************"
  echo ""

  # A failed read leaves the value empty so that the check below reports it.
  local exit_code
  exit_code="$(instance_ssh --command="cat /startup-exit.txt")" || exit_code=""

  if [[ "${exit_code}" != +([0-9]) ]]; then
    echo "Failed to retrieve exit code from startup script (got '${exit_code}')."
    exit 1
  fi

  if (( exit_code != 0 )); then
    echo "Image setup failed with code '${exit_code}'. See logs above."
    exit "${exit_code}"
  fi

  echo "Startup finished successfully."

  echo "Deleting remote log file"
  instance_ssh --command="sudo rm /startup.log"

  echo "Shutting down instance"
  # This actually does things synchronously, so we don't need our own loop to
  # wait.
  gcloud compute instances stop "${INSTANCE_NAME}" --zone="${ZONE}"

  echo "Creating disk image"
  gcloud compute images create "${IMAGE_NAME}" \
    --source-disk="${INSTANCE_NAME}" \
    --source-disk-zone="${ZONE}"

  if (( SUCCESS_DELETE_INSTANCE == 1 )); then
    echo "Deleting instance"
    "${DELETE_INSTANCE_CMD[@]}" --quiet
  else
    echo "Not deleting instance because SUCCESS_DELETE_INSTANCE=${SUCCESS_DELETE_INSTANCE}"
    cleanup_reminder
  fi

  echo "Successfully created image: ${IMAGE_NAME}"
}

create_image