#!/usr/bin/env python3
# Copyright 2021 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Runs all matched benchmark suites on an Android device.

This script probes the Android phone via `adb` and uses the device information
to filter and run suitable benchmarks and optionally captures Tracy traces on
the Android phone.

It expects that `adb` is installed and that the IREE tools have been
cross-compiled for Android. To capture traces, a second set of tracing-enabled
IREE tools cross-compiled for Android and the Tracy `capture` tool built for
the host are also required.

Example usages:

  # Without trace generation
  python3 run_benchmarks.py \
    --normal_benchmark_tool_dir=/path/to/normal/android/target/tools/dir \
    /path/to/host/build/dir

  # With trace generation
  python3 run_benchmarks.py \
    --normal_benchmark_tool_dir=/path/to/normal/android/target/tools/dir \
    --traced_benchmark_tool_dir=/path/to/tracy/android/target/tools/dir \
    --trace_capture_tool=/path/to/host/build/tracy/capture \
    /path/to/host/build/dir
"""

import sys
import pathlib

# Add build_tools python dir to the search path.
sys.path.insert(0, str(pathlib.Path(__file__).parent.with_name("python")))

import atexit
import json
import requests
import shutil
import socket
import struct
import subprocess
import tarfile
import time
from typing import Any, Optional, Sequence, Tuple

from common import benchmark_suite as benchmark_suite_module
from common.benchmark_config import BenchmarkConfig
from common.benchmark_driver import BenchmarkDriver
from common import benchmark_definition
from common.benchmark_definition import (
    execute_cmd,
    execute_cmd_and_get_stdout,
    execute_cmd_and_get_output,
    get_git_commit_hash,
    get_iree_benchmark_module_arguments,
    wait_for_iree_benchmark_module_start,
    parse_iree_benchmark_metrics,
)
from common.benchmark_suite import BenchmarkCase, BenchmarkSuite
from common.android_device_utils import (
    get_android_device_model,
    get_android_device_info,
    get_android_gpu_name,
)
import common.common_arguments
from e2e_test_artifacts import iree_artifacts
from e2e_test_framework.definitions import iree_definitions

# Root directory to perform benchmarks in on the Android device.
ANDROID_TMPDIR = pathlib.PurePosixPath("/data/local/tmp/iree-benchmarks")
# (host, port) of the adb server that `adb_fetch_and_push_file` connects to
# directly over TCP when streaming a download to the device.
ADB_SERVER_ADDR = ("localhost", 5037)
# Device directories that receive the benchmark tools pushed from the host:
# one for the normal tools and one for the tracing-enabled tools.
ANDROID_NORMAL_TOOL_DIR = ANDROID_TMPDIR / "normal-tools"
ANDROID_TRACED_TOOL_DIR = ANDROID_TMPDIR / "traced-tools"
# TCP port forwarded through adb (same number on both ends) when trace capture
# is enabled.
ANDROID_TRACY_PORT = 8086


def adb_push_file(
    source: pathlib.Path,
    dest: pathlib.PurePosixPath,
    verbose: bool = False,
) -> pathlib.PurePosixPath:
    """Copies a host file onto the Android device with `adb push`.

    Args:
      source: host path of the file to push; it is resolved before pushing.
      dest: the full destination path on the device.
      verbose: forwarded to `execute_cmd`.

    Returns:
      `dest`, i.e. the full path to the content on the Android device.
    """
    # `adb push` reports transfer progress on stdout. Keep that only when a
    # human is watching a terminal; otherwise discard it so that log files do
    # not get bloated.
    if sys.stdout.isatty():
        progress_sink = None
    else:
        progress_sink = subprocess.DEVNULL

    push_cmd = ["adb", "push", source.resolve(), dest]
    execute_cmd(push_cmd, verbose=verbose, stdout=progress_sink)
    return dest


def adb_execute_and_get_output(
    cmd_args: Sequence[str],
    cwd: pathlib.PurePosixPath = ANDROID_TMPDIR,
    verbose: bool = False,
) -> Tuple[str, str]:
    """Runs a command through `adb shell` and captures its output.

    The command is prefixed with `cd <cwd> &&` so it runs inside `cwd` on the
    device. The call blocks until the command completes.

    Args:
      cmd_args: a list containing the command to execute and its parameters
      cwd: the directory to execute the command in
      verbose: forwarded to `execute_cmd_and_get_output`.

    Returns:
      Strings for stdout and stderr.
    """
    shell_cmd = ["adb", "shell", "cd", cwd, "&&", *cmd_args]
    return execute_cmd_and_get_output(shell_cmd, verbose=verbose)


def adb_execute(
    cmd_args: Sequence[str],
    cwd: pathlib.PurePosixPath = ANDROID_TMPDIR,
    verbose: bool = False,
) -> subprocess.CompletedProcess:
    """Runs a command through `adb shell` and waits for it to finish.

    The command is prefixed with `cd <cwd> &&` so it runs inside `cwd` on the
    device. Output is not captured here; it is streamed to the terminal.

    Args:
      cmd_args: a list containing the command to execute and its parameters
      cwd: the directory to execute the command in
      verbose: forwarded to `execute_cmd`.

    Returns:
      The completed process.
    """
    shell_cmd = ["adb", "shell", "cd", cwd, "&&", *cmd_args]
    return execute_cmd(shell_cmd, verbose=verbose)


def is_magisk_su():
    """Tells whether the device's `su` binary identifies itself as MagiskSU.

    Returns:
      True if the stdout of `su --help` on the device contains "MagiskSU".
    """
    help_text = adb_execute_and_get_output(["su", "--help"])[0]
    return "MagiskSU" in help_text


def adb_execute_as_root(cmd_args: Sequence[Any]) -> subprocess.CompletedProcess:
    """Runs the given command on the device through `su`.

    Args:
      cmd_args: the command to execute and its parameters.

    Returns:
      The completed process returned by `adb_execute`.
    """
    # The argument following `su` depends on which su binary is installed:
    # `-c` when it is Magisk's, `root` otherwise.
    su_arg = "-c" if is_magisk_su() else "root"
    return adb_execute(["su", su_arg, *cmd_args])


def adb_start_cmd(
    cmd_args: Sequence[str],
    cwd: pathlib.PurePosixPath = ANDROID_TMPDIR,
    verbose: bool = False,
) -> subprocess.Popen:
    """Launches a command through `adb shell` without waiting for it.

    The command is prefixed with `cd <cwd> &&` so it runs inside `cwd` on the
    device. The child's stdout is connected to a text-mode pipe.

    Args:
      cmd_args: a list containing the command to execute and its parameters
      cwd: the directory to execute the command in
      verbose: if True, print the full command line before launching it.

    Returns:
      A Popen object for the started command.
    """
    launch_cmd = ["adb", "shell", "cd", cwd, "&&", *cmd_args]
    if verbose:
        print(f"cmd: {launch_cmd}")
    return subprocess.Popen(launch_cmd, stdout=subprocess.PIPE, text=True)


def adb_path_exists(android_path: pathlib.PurePosixPath, verbose: bool = False):
    """Checks whether a path exists on the device by running `stat` on it.

    Args:
      android_path: the device path to check.
      verbose: forwarded to `adb_start_cmd`.

    Returns:
      True if the `stat` command exits with status 0, False otherwise.
    """
    # `adb_start_cmd` connects the child's stdout to a pipe. Drain it with
    # `communicate()` rather than calling a bare `wait()`, which can block
    # forever if the pipe fills up, and use the context manager so the pipe is
    # closed once the command has finished.
    with adb_start_cmd(["stat", str(android_path)], verbose=verbose) as proc:
        proc.communicate()
    return proc.returncode == 0


def adb_fetch_and_push_file(
    source: benchmark_definition.ResourceLocation,
    dest: pathlib.PurePosixPath,
    verbose: bool = False,
):
    """Fetch file from the path/URL and stream to the device.

    If `dest` already exists on the device nothing is transferred. A local
    source is pushed with `adb push`. A URL source is downloaded in chunks and
    streamed straight to the device over the adb sync protocol, which avoids
    the temporary file on the host and reduces the overhead when the file is
    large.

    Args:
      source: path/URL to fetch the file.
      dest: the full dest path on the device.
      verbose: output verbose message.

    Returns:
      File path on the device.

    Raises:
      RuntimeError: if the HTTP request fails, or if the adb server does not
        acknowledge a request with "OKAY".
    """

    if adb_path_exists(dest, verbose):
        return dest

    # If the source is a local file, push directly.
    local_path = source.get_local_path()
    if local_path:
        return adb_push_file(local_path, dest, verbose=verbose)

    if verbose:
        print(f"Streaming file {source} to {dest}.")

    url = source.get_url()
    assert url is not None

    # Implement the ADB sync protocol to stream file chunk to the device, since
    # the adb client tool doesn't support it.
    #
    # Alternatively we can use third-party library such as
    # https://github.com/JeffLIrion/adb_shell. But the protocol we need is
    # simple and fairly stable. This part can be replaced with other solutions
    # if needed.
    #
    # To understand the details of the protocol, see
    # https://cs.android.com/android/_/android/platform/packages/modules/adb/+/93c8e3c26e4de3a2b767a2394200bc0721bb1e24:OVERVIEW.TXT

    def wait_ack_ok(sock: socket.socket):
        """Reads a 4-byte status from `sock` and raises unless it is OKAY."""
        buf = bytearray()
        while len(buf) < 4:
            data = sock.recv(4 - len(buf))
            if not data:
                # Peer closed the connection before sending a full status.
                break
            buf += data

        # Compare raw bytes so that a garbled reply is reported as a
        # communication error instead of a UnicodeDecodeError.
        if bytes(buf) != b"OKAY":
            raise RuntimeError(f"ADB communication error: {buf}")

    # Use the response as a context manager so the streamed HTTP connection is
    # released on every exit path, including errors.
    with requests.get(url, stream=True, timeout=60) as req:
        if not req.ok:
            raise RuntimeError(
                f"Failed to fetch {source}: {req.status_code} - {req.text}"
            )

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.connect(ADB_SERVER_ADDR)
            # Connect to any device (the first 4 hexadecimals is the following
            # text command length).
            sock.sendall(b"0012host:transport-any")
            wait_ack_ok(sock)
            # Switch to sync mode.
            sock.sendall(b"0005sync:")
            wait_ack_ok(sock)
            # Binary integers in the sync protocol are little-endian, so pack
            # them with an explicit "<I" instead of the host's native order.
            # Send the dest file path and file permissions 0644 (rw-r--r--).
            file_attr = f"{dest},{0o644}".encode("utf-8")
            sock.sendall(b"SEND" + struct.pack("<I", len(file_attr)) + file_attr)
            # Stream the file chunks. 64k bytes is the max chunk size for adb.
            for data in req.iter_content(chunk_size=64 * 1024):
                if not data:
                    # Nothing to forward for an empty chunk.
                    continue
                sock.sendall(b"DATA" + struct.pack("<I", len(data)) + data)
            # End the file stream and pass the current time as the file's
            # timestamp.
            sock.sendall(b"DONE" + struct.pack("<I", int(time.time())))
            wait_ack_ok(sock)

    return dest


class AndroidBenchmarkDriver(BenchmarkDriver):
    """Android benchmark driver.

    Pushes benchmark artifacts and tools to the device with adb, then runs
    verification, benchmarking and trace capture for each benchmark case.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Maps a host tool path to the device path it was pushed to, so each
        # tool is pushed at most once per driver instance.
        self.already_pushed_files = {}

    def run_benchmark_case(
        self,
        benchmark_case: BenchmarkCase,
        benchmark_results_filename: Optional[pathlib.Path],
        capture_filename: Optional[pathlib.Path],
    ) -> None:
        """Runs a single benchmark case on the device.

        The module, and the input / expected-output archives when present, are
        placed on the device first. Verification and benchmarking run when
        `benchmark_results_filename` is given; trace capture runs when
        `capture_filename` is given.

        Args:
          benchmark_case: the case to run.
          benchmark_results_filename: host file to write benchmark metrics to,
            or None to skip benchmarking.
          capture_filename: host file to write the trace capture to, or None
            to skip capturing.

        Raises:
          ValueError: if benchmarking is requested without a normal benchmark
            tool dir, or capturing is requested without a trace capture config.
        """
        module_rel_dir = iree_artifacts.get_module_dir_path(
            benchmark_case.run_config.module_generation_config
        )
        android_case_dir = ANDROID_TMPDIR / module_rel_dir

        module_path = benchmark_case.module_dir / iree_artifacts.MODULE_FILENAME
        module_device_path = adb_fetch_and_push_file(
            source=module_path,
            dest=android_case_dir / iree_artifacts.MODULE_FILENAME,
            verbose=self.verbose,
        )

        inputs_dir = None
        if benchmark_case.input_uri:
            inputs_dir = self.__fetch_and_unpack_npy(
                url=benchmark_case.input_uri,
                device_dir=android_case_dir / "inputs_npy",
            )
        # Expected outputs are only needed when verification is enabled.
        expected_outputs_dir = None
        if self.config.verify and benchmark_case.expected_output_uri:
            expected_outputs_dir = self.__fetch_and_unpack_npy(
                url=benchmark_case.expected_output_uri,
                device_dir=android_case_dir / "expected_outputs_npy",
            )

        run_config = benchmark_case.run_config
        # TODO(#15452): Change to `--task_topology_cpu_ids` once we figure out
        # the right mapping.
        taskset = self.__deduce_taskset_from_run_config(run_config)
        run_args = run_config.materialize_run_flags(inputs_dir=inputs_dir)
        run_args.append(f"--module={module_device_path}")

        if benchmark_results_filename is not None:
            if self.config.normal_benchmark_tool_dir is None:
                raise ValueError("normal_benchmark_tool_dir can't be None.")
            if expected_outputs_dir:
                self.__run_verify(
                    host_tool_dir=self.config.normal_benchmark_tool_dir,
                    run_args=run_args,
                    expected_outputs_dir=expected_outputs_dir,
                    verify_params=benchmark_case.verify_params,
                    taskset=taskset,
                )

            self.__run_benchmark(
                host_tool_dir=self.config.normal_benchmark_tool_dir,
                benchmark_case=benchmark_case,
                run_args=run_args,
                results_filename=benchmark_results_filename,
                taskset=taskset,
            )

        if capture_filename is not None:
            capture_config = self.config.trace_capture_config
            if capture_config is None:
                raise ValueError("Trace capture config can't be None.")

            self.__run_capture(
                host_tool_dir=capture_config.traced_benchmark_tool_dir,
                trace_capture_tool=capture_config.trace_capture_tool,
                benchmark_case=benchmark_case,
                run_args=run_args,
                capture_filename=capture_filename,
                taskset=taskset,
            )

    def __run_verify(
        self,
        host_tool_dir: pathlib.Path,
        run_args: Sequence[str],
        expected_outputs_dir: pathlib.PurePosixPath,
        verify_params: Sequence[str],
        taskset: str,
    ):
        """Runs `iree-run-module` on the device against the expected output.

        Args:
          host_tool_dir: host directory containing `iree-run-module`.
          run_args: run flags shared with the benchmark invocation.
          expected_outputs_dir: device directory holding the expected outputs.
          verify_params: extra flags appended to the verification command.
          taskset: CPU mask passed to `taskset`.
        """
        device_tool = self.__check_and_push_file(
            host_tool_dir / "iree-run-module", ANDROID_NORMAL_TOOL_DIR
        )
        cmd = ["taskset", taskset, device_tool]
        cmd += run_args
        # Currently only support single output.
        cmd.append(f'--expected_output=@{expected_outputs_dir / "output_0.npy"}')
        cmd += verify_params
        adb_execute(cmd, verbose=self.verbose)

    def __run_benchmark(
        self,
        host_tool_dir: pathlib.Path,
        benchmark_case: BenchmarkCase,
        run_args: Sequence[str],
        results_filename: pathlib.Path,
        taskset: str,
    ):
        """Runs the benchmark tool on the device and stores its metrics.

        Args:
          host_tool_dir: host directory containing the benchmark tool.
          benchmark_case: the case being benchmarked; selects the tool name.
          run_args: run flags for the tool.
          results_filename: host file the metrics are written to as JSON.
          taskset: CPU mask passed to `taskset`.
        """
        tool_name = benchmark_case.benchmark_tool_name
        device_tool = self.__check_and_push_file(
            host_tool_dir / tool_name, ANDROID_NORMAL_TOOL_DIR
        )
        cmd = ["taskset", taskset, device_tool]
        cmd += run_args
        if tool_name == "iree-benchmark-module":
            cmd += get_iree_benchmark_module_arguments(
                driver_info=benchmark_case.driver_info,
                benchmark_min_time=self.config.benchmark_min_time,
            )

        benchmark_stdout, benchmark_stderr = adb_execute_and_get_output(
            cmd, verbose=self.verbose
        )
        benchmark_metrics = parse_iree_benchmark_metrics(
            benchmark_stdout, benchmark_stderr
        )
        if self.verbose:
            print(benchmark_metrics)
        results_filename.write_text(json.dumps(benchmark_metrics.to_json_object()))

    def __run_capture(
        self,
        host_tool_dir: pathlib.Path,
        trace_capture_tool: pathlib.Path,
        benchmark_case: BenchmarkCase,
        run_args: Sequence[str],
        capture_filename: pathlib.Path,
        taskset: str,
    ):
        """Runs the traced benchmark tool and captures its trace on the host.

        Args:
          host_tool_dir: host directory containing the traced benchmark tool.
          trace_capture_tool: host path of the trace capture tool.
          benchmark_case: the case being traced; selects the tool name.
          run_args: run flags for the tool.
          capture_filename: host file the capture is written to.
          taskset: CPU mask passed to `taskset`.
        """
        tool_name = benchmark_case.benchmark_tool_name
        device_tool = self.__check_and_push_file(
            host_tool_dir / tool_name, ANDROID_TRACED_TOOL_DIR
        )
        run_cmd = [
            "TRACY_NO_EXIT=1",
            f"IREE_PRESERVE_DYLIB_TEMP_FILES={ANDROID_TMPDIR}",
            "taskset",
            taskset,
            device_tool,
        ]
        run_cmd += run_args
        if tool_name == "iree-benchmark-module":
            run_cmd += get_iree_benchmark_module_arguments(
                driver_info=benchmark_case.driver_info,
                benchmark_min_time=self.config.benchmark_min_time,
                dump_results=False,
                capture_mode=True,
            )

        # Just launch the traced benchmark tool with TRACY_NO_EXIT=1 without
        # waiting for the adb command to complete as that won't happen.
        process = adb_start_cmd(run_cmd, verbose=self.verbose)

        wait_for_iree_benchmark_module_start(process, self.verbose)

        # Now it's okay to collect the trace via the capture tool. This will
        # send the signal to let the previously waiting benchmark tool to
        # complete.
        capture_cmd = [trace_capture_tool, "-f", "-o", capture_filename]
        # If verbose, just let the subprocess print its output. The subprocess
        # may need to detect if the output is a TTY to decide whether to log
        # verbose progress info and use ANSI colors, so it's better to use
        # stdout redirection than to capture the output in a string.
        stdout_redirect = None if self.verbose else subprocess.DEVNULL
        execute_cmd(capture_cmd, verbose=self.verbose, stdout=stdout_redirect)

    def __deduce_taskset_from_run_config(
        self, run_config: iree_definitions.E2EModelRunConfig
    ) -> str:
        """Deduces the CPU mask according to device and execution config.

        Args:
          run_config: the run config whose device spec and execution config
            determine the mask.

        Returns:
          The CPU mask as a hexadecimal string with at least four digits, one
          bit set per pinned core. "ffff" when the device spec has no CPU
          parameters.
        """

        cpu_params = run_config.target_device_spec.device_parameters.cpu_params
        if not cpu_params:
            # Assume the mobile CPUs have <= 16 cores.
            return "ffff"

        exec_config = run_config.module_execution_config
        pinned_cores = cpu_params.pinned_cores
        # Use the fastest cores in the spec for single-thread benchmarks.
        # NOTE(review): this keeps only the last pinned core, which presumes
        # the spec lists the fastest core last - confirm against the specs.
        if (
            exec_config.driver == iree_definitions.RuntimeDriver.LOCAL_SYNC
            or "1-thread" in exec_config.tags
        ):
            pinned_cores = pinned_cores[-1:]

        # Build the mask from the (possibly narrowed) selection, not from the
        # full list in the spec; otherwise the narrowing above has no effect.
        cpu_mask = sum(1 << core_id for core_id in pinned_cores)
        return f"{cpu_mask:04x}"

    def __check_and_push_file(
        self, host_path: pathlib.Path, device_dir: pathlib.PurePosixPath
    ):
        """Checks if the file has been pushed and pushes it if not.

        Args:
          host_path: host path of the file to push.
          device_dir: device directory to push the file into.

        Returns:
          The path of the file on the device.
        """
        android_path = self.already_pushed_files.get(host_path)
        if android_path is not None:
            return android_path

        android_path = adb_push_file(
            host_path,
            device_dir / host_path.name,
            verbose=self.verbose,
        )
        self.already_pushed_files[host_path] = android_path
        return android_path

    def __fetch_and_unpack_npy(self, url: str, device_dir: pathlib.PurePosixPath):
        """Fetches an archive from `url` and unpacks it into `device_dir`.

        Nothing is fetched when `device_dir` already exists on the device.

        Args:
          url: URL of the archive to fetch.
          device_dir: device directory to unpack the archive into.

        Returns:
          `device_dir`.
        """
        if adb_path_exists(device_dir, verbose=self.verbose):
            return device_dir

        # The archive is stored next to the target directory, with a ".tgz"
        # suffix in place of the directory's own suffix.
        archive_path = adb_fetch_and_push_file(
            source=benchmark_definition.ResourceLocation.build_url(url),
            dest=device_dir.with_suffix(".tgz"),
            verbose=self.verbose,
        )
        adb_execute(
            ["mkdir", "-p", str(device_dir)]
            + ["&&", "tar", "-xvf", str(archive_path), "-C", str(device_dir)],
            verbose=self.verbose,
        )
        return device_dir


def set_cpu_frequency_scaling_governor(governor: str):
    """Pushes the CPU scaling-governor script to the device and runs it as root.

    The script is located relative to the root of the current git checkout.

    Args:
      governor: passed to the script as its only argument.
    """
    repo_root = pathlib.Path(
        execute_cmd_and_get_stdout(["git", "rev-parse", "--show-toplevel"])
    )
    host_script = repo_root.joinpath(
        "build_tools", "benchmarks", "set_android_scaling_governor.sh"
    )
    device_script = adb_push_file(host_script, ANDROID_TMPDIR / host_script.name)
    adb_execute_as_root([device_script, governor])


def set_gpu_frequency_scaling_policy(policy: str):
    """Pushes the device-specific GPU scaling script and runs it as root.

    The script is chosen from the device model and the GPU name, and is located
    relative to the root of the current git checkout.

    Args:
      policy: passed to the script as its only argument.

    Raises:
      RuntimeError: if the device is neither a Pixel 6 / Pixel 6 Pro nor has a
        GPU whose name starts with "adreno" (case-insensitive).
    """
    repo_root = execute_cmd_and_get_stdout(["git", "rev-parse", "--show-toplevel"])
    model = get_android_device_model()
    gpu = get_android_gpu_name()
    scripts_dir = pathlib.Path(repo_root).joinpath("build_tools", "benchmarks")

    if model in ("Pixel-6", "Pixel-6-Pro"):
        script_name = "set_pixel6_gpu_scaling_policy.sh"
    elif gpu.lower().startswith("adreno"):
        script_name = "set_adreno_gpu_scaling_policy.sh"
    else:
        raise RuntimeError(
            f"Unsupported device '{model}' for setting GPU scaling policy"
        )

    host_script = scripts_dir / script_name
    device_script = adb_push_file(host_script, ANDROID_TMPDIR / host_script.name)
    adb_execute_as_root([device_script, policy])


def add_port_forwarding(port: int, verbose: bool):
    """Forwards a host TCP port to the same port on the device through adb.

    The forwarding is removed again when the interpreter exits.

    Args:
      port: the TCP port number used on both the host and the device.
      verbose: forwarded to `execute_cmd_and_get_stdout`.
    """
    endpoint = f"tcp:{port}"
    execute_cmd_and_get_stdout(["adb", "forward", endpoint, endpoint], verbose=verbose)

    # Undo the forwarding at interpreter exit.
    undo_cmd = ["adb", "forward", "--remove", endpoint]
    atexit.register(execute_cmd_and_get_stdout, undo_cmd, verbose=verbose)


def main(args):
    """Runs the matched benchmark suites on the connected Android device.

    Args:
      args: the parsed command-line arguments.

    Raises:
      RuntimeError: if the driver reports any benchmark errors.
    """
    verbose = args.verbose

    device_info = get_android_device_info(verbose)
    if verbose:
        print(device_info)

    # Build the configuration and select the run configs for this target
    # device and shard.
    head_commit = get_git_commit_hash("HEAD")
    benchmark_config = BenchmarkConfig.build_from_args(args, head_commit)
    groups = json.loads(args.execution_benchmark_config.read_text())
    selected_run_configs = benchmark_suite_module.get_run_configs_by_target_and_shard(
        groups, args.target_device_name, args.shard_index
    )
    suite = BenchmarkSuite.load_from_run_configs(
        run_configs=selected_run_configs,
        root_benchmark_dir=benchmark_config.root_benchmark_dir,
    )
    driver = AndroidBenchmarkDriver(
        device_info=device_info,
        benchmark_config=benchmark_config,
        benchmark_suite=suite,
        benchmark_grace_time=1.0,
        verbose=verbose,
    )

    # Optionally pin CPU / GPU frequencies for the run and restore them when
    # the interpreter exits.
    if args.pin_cpu_freq:
        set_cpu_frequency_scaling_governor("performance")
        atexit.register(set_cpu_frequency_scaling_governor, "schedutil")
    if args.pin_gpu_freq:
        set_gpu_frequency_scaling_policy("performance")
        atexit.register(set_gpu_frequency_scaling_policy, "default")

    # Start from an empty benchmark directory on the device in case manual or
    # failed runs left anything behind.
    device_root = str(ANDROID_TMPDIR)
    adb_execute(
        ["rm", "-rf", device_root, "&&", "mkdir", "-p", device_root],
        cwd=pathlib.PurePosixPath("/"),
        verbose=verbose,
    )

    if not args.no_clean:
        # At exit, remove the benchmark directory on the device...
        atexit.register(
            execute_cmd_and_get_stdout,
            ["adb", "shell", "rm", "-rf", ANDROID_TMPDIR],
            verbose=verbose,
        )
        # ...and the temporary directory on the host.
        atexit.register(shutil.rmtree, args.tmp_dir)

    capture_config = benchmark_config.trace_capture_config
    if capture_config:
        add_port_forwarding(port=ANDROID_TRACY_PORT, verbose=verbose)

    driver.run()

    results = driver.get_benchmark_results()
    if args.output is not None:
        with open(args.output, "w") as out_file:
            out_file.write(results.to_json_str())

    if verbose:
        print(results.commit)
        print(results.benchmarks)

    if capture_config:
        # Bundle all captures into a single gzipped tarball.
        with tarfile.open(capture_config.capture_tarball, "w:gz") as tarball:
            for capture_path in driver.get_capture_filenames():
                tarball.add(capture_path)

    errors = driver.get_benchmark_errors()
    if errors:
        print("Benchmarking completed with errors", file=sys.stderr)
        raise RuntimeError(errors)


# Script entry point: parse the common benchmark arguments and run.
if __name__ == "__main__":
    cli_args = common.common_arguments.Parser().parse_args()
    main(cli_args)