syntax = "proto3";

package tensorflow;

import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/contrib/tpu/profiler/op_profile.proto";

// The TPUProfiler service retrieves performance information about
// the programs running on connected TPUs over a period of time.
service TPUProfiler {
  // Starts a profiling session, blocks until it completes, and returns data.
  rpc Profile(ProfileRequest) returns (ProfileResponse) {}

  // Collects profiling data and returns user-friendly metrics.
  rpc Monitor(MonitorRequest) returns (MonitorResponse) {}
}

// Options that control what is collected during a profiling session.
message ProfileOptions {
  // We don't collect the dataset ops by default for better trace-viewer
  // scalability. The caller can manually set this field to include the ops.
  bool include_dataset_ops = 1;

  // next-field: 2
}

// Per-tool output options; used as the value type of
// ProfileRequest.tool_options.
//
// NOTE(review): field number 1 is not defined in this message. Confirm
// whether it was used previously and should be marked `reserved`.
message ToolRequestOptions {
  // Required formats for the tool; it should be one of "json", "proto", "raw"
  // etc. If not specified (backward compatible), the default format is used,
  // i.e. most tools use the json format.
  string output_formats = 2;

  // Whether to save the result directly to the repository or pass it back to
  // the caller. Defaults to false for backward compatibility.
  bool save_to_repo = 3;
}

// Request message for TPUProfiler.Profile.
message ProfileRequest {
  // In future, the caller will be able to customize when profiling starts and
  // stops. For now, it collects `duration_ms` milliseconds worth of data.
  uint64 duration_ms = 1;

  // The maximum number of events to return. By default (value 0), return all
  // events.
  uint64 max_events = 2;

  // Required profiling tool names, such as "input_pipeline_analyzer" etc.
  repeated string tools = 3;

  // Specifies the requirement for each tool.
  // NOTE(review): keys are presumably the tool names listed in `tools` --
  // confirm against the service implementation.
  map<string, ToolRequestOptions> tool_options = 8;

  // Optional profiling options that control how a TF session will be profiled.
  ProfileOptions opts = 4;

  // The place where we will dump profile data. We will normally use
  // MODEL_DIR/plugin/profile/ as our repository root.
  string repository_root = 5;

  // The user provided profile session identifier.
  string session_id = 6;

  // The hostname of the system where the profile should happen.
  // We use it as an identifier in part of our output filename.
  string host_name = 7;

  // In future, the caller will indicate which TF session is being profiled, and
  // only data relating to that program will be returned. For now, we assume
  // all activity during the profiling period is relevant.

  // next-field: 9
}

// A single named output produced by one profiling tool.
message ProfileToolData {
  // The file name which this data is associated with (e.g.
  // "input_pipeline.json", "cluster_xxx.memory_viewer.json").
  string name = 1;

  // The data payload (likely json) for the specific tool.
  bytes data = 2;
}

// Response message for TPUProfiler.Profile.
message ProfileResponse {
  reserved 1;  // was uint64 placeholder for returning something meaningful.

  // Graphs of programs executed on TPUs during the profiling period.
  repeated GraphDef computation_graph = 2;

  // Performance profile that can be used to annotate HLO operations in the
  // computation graph.
  RunMetadata hlo_metadata = 5;

  // Encoded Trace proto message that contains metadata about the trace captured
  // during the profiling period. Describes the devices and resources that
  // 'trace_events' refers to.
  bytes encoded_trace = 3;

  // Assembles a hierarchical performance profile based on HLOs in trace events.
  // If the trace covers multiple programs, the longest-running one is analyzed.
  // See op_profile.proto for the detailed semantics of the returned profile.
  tpu.op_profile.Profile op_profile = 4;

  // Data payload for each required tool.
  repeated ProfileToolData tool_data = 6;

  // When we write profiling data directly to the repository directory, we need
  // a way to figure out whether the captured trace is empty (due to idle TPU).
  bool empty_trace = 7;

  // next-field: 8
}

// Request message for TPUProfiler.Monitor.
message MonitorRequest {
  // Duration for which to profile between each update.
  uint64 duration_ms = 1;

  // Indicates the level at which we want to monitor. Currently, two levels are
  // supported:
  // Level 1: An ultra lightweight mode that captures only some utilization
  // metrics.
  // Level 2: More verbose than level 1. Collects utilization metrics, device
  // information, step time information, etc. Do not use this option if the TPU
  // host is being very heavily used.
  int32 monitoring_level = 2;

  // next-field: 3
}

// Response message for TPUProfiler.Monitor.
message MonitorResponse {
  // Properly formatted string data that can be directly returned back to user.
  string data = 1;

  // next-field: 2
}