syntax = "proto3";
package tensorflow;

import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/contrib/tpu/profiler/op_profile.proto";

// The TPUProfiler service retrieves performance information about
// the programs running on connected TPUs over a period of time.
service TPUProfiler {
  // Starts a profiling session, blocks until it completes, and returns data.
  rpc Profile(ProfileRequest) returns (ProfileResponse) {
  }
  // Collects profiling data and returns user-friendly metrics.
  rpc Monitor(MonitorRequest) returns (MonitorResponse) {
  }
}

message ProfileOptions {
  // We don't collect the dataset ops by default for better trace-viewer
  // scalability. The caller can mannually set this field to include the ops.
  bool include_dataset_ops = 1;

  // next-field: 2
}

message ToolRequestOptions {
  // Required formats for the tool, it should be one of "json", "proto", "raw"
  // etc. If not specified (backward compatible), use default format, i.e. most
  // tools use json format.
  string output_formats = 2;

  // Whether save the result directly to repository or pass it back to caller.
  // Default to false for backward compatibilities.
  bool save_to_repo = 3;
}

message ProfileRequest {
  // In future, the caller will be able to customize when profiling starts and
  // stops. For now, it collects `duration_ms` milliseconds worth of data.
  uint64 duration_ms = 1;

  // The maximum number of events to return. By default (value 0), return all
  // events.
  uint64 max_events = 2;

  // Required profiling tools name such as "input_pipeline_analyzer" etc
  repeated string tools = 3;

  // Specifies the requirement for each tools.
  map<string, ToolRequestOptions> tool_options = 8;

  // Optional profiling options that control how a TF session will be profiled.
  ProfileOptions opts = 4;

  // The place where we will dump profile data. We will normally use
  // MODEL_DIR/plugin/profile/ as our repository root.
  string repository_root = 5;

  // The user provided profile session identifier.
  string session_id = 6;

  // The hostname of system where the profile should happen.
  // We use it as identifier in part of our output filename.
  string host_name = 7;

  // In future, the caller will indicate which TF session is being profiled, and
  // only data relating to that program will be returned. For now, we assume
  // all activity during the profiling period is relevant.
  // next-field: 9
}

message ProfileToolData {
  // The file name which this data is associated (e.g. "input_pipeline.json",
  // "cluster_xxx.memory_viewer.json").
  string name = 1;

  // The data payload (likely json) for the specific tool.
  bytes data = 2;
}

message ProfileResponse {
  reserved 1;  // was uint64 placeholder for returning something meaningful.
  // Graphs of programs executed on TPUs during the profiling period.
  repeated GraphDef computation_graph = 2;

  // Performance profile that can be used to annotate HLO operations in the
  // computation graph.
  RunMetadata hlo_metadata = 5;

  // Encoded Trace proto message that contains metadata about the trace captured
  // during the profiling period. Describes the devices and resources that
  // 'trace_events' refers to.
  bytes encoded_trace = 3;

  // Assembles a hierarchical performance profile based on HLOs in trace events.
  // If the trace covers multiple programs, the longest-running one is analyzed.
  // See op_profile.proto for the detailed semantics of the returned profile.
  tpu.op_profile.Profile op_profile = 4;

  // Data payload for each required tools.
  repeated ProfileToolData tool_data = 6;

  // When we write profiling data directly to repository directory, we need a
  // way to figure out whether the captured trace is empty (due to idle TPU).
  bool empty_trace = 7;

  // next-field: 8
}

message MonitorRequest {
  // Duration for which to profile between each update.
  uint64 duration_ms = 1;

  // Indicates the level at which we want to monitor. Currently, two levels are
  // supported:
  // Level 1: An ultra lightweight mode that captures only some utilization
  // metrics.
  // Level 2: More verbose than level 1. Collects utilization metrics, device
  // information, step time information, etc. Do not use this option if the TPU
  // host is being very heavily used.
  int32 monitoring_level = 2;

  // next-field: 3
}

message MonitorResponse {
  // Properly formatted string data that can be directly returned back to user.
  string data = 1;

  // next-field: 2
}