syntax = "proto3";

package tensorflow.tpu.op_profile;

// Top-level container of an op profile: the data that summarizes a program.
message Profile {
  // Field numbers and names that no field of this message may use.
  reserved 2, 3;
  reserved "by_program_structure", "per_program";

  // Root of the profile tree when broken down by instruction category.
  Node by_category = 1;

  // Root of the profile tree when broken down by program.
  Node by_program = 4;
}

// One entry in the profile tree: an instruction, or a set of instructions.
message Node {
  // A category of XLA instructions.
  // For such a node, `name` is a descriptive string, like "data formatting".
  message InstructionCategory {}

  // A single XLA instruction.
  // For such a node, `name` is the unique instruction id, like "%multiply.5".
  message XLAInstruction {
    // Physical memory layout, expressed as a list of dimensions.
    message LayoutAnalysis {
      // A single dimension of the physical layout.
      message Dimension {
        // Size of the data in this dimension.
        int32 size = 1;

        // Data must be padded to a multiple of alignment.
        int32 alignment = 2;

        // What the dimension represents, e.g. "spatial".
        string semantics = 3;
      }

      // The physical data layout, from most-minor to most-major dimensions.
      repeated Dimension dimensions = 1;
    }

    // Opcode like %multiply.
    string op = 1;

    // Expression text, of the form
    // %multiply = [shape]multiply(operand1, operand2)
    string expression = 2;

    // Typically the TensorFlow operation name.
    string provenance = 3;

    // NOTE(review): undocumented in the original; presumably the name of the
    // instruction category this instruction belongs to -- confirm with the
    // producer of this field.
    string category = 4;

    // Describes the physical memory layout of the instruction's primary input.
    // e.g. for a convolution, this analyzes the image and ignores the kernel.
    LayoutAnalysis layout = 5;
  }

  // Name of the node. Its semantics depend on which `contents` case is set.
  string name = 1;

  // Measurements for this node. May be omitted, e.g. for fused instructions.
  Metrics metrics = 2;

  // Child nodes. This list is subjected to pruning; see num_children for the
  // count before pruning.
  repeated Node children = 3;

  // Details about what this node represents.
  oneof contents {
    InstructionCategory category = 4;
    XLAInstruction xla = 5;
  }

  // Total number of children before pruning.
  int32 num_children = 6;
}

// Measurements of a single operation, or of an aggregated set of operations.
// Every metric is a "total" value rather than a "self" value.
message Metrics {
  // Core-time taken by this operation, as a fraction of all operations.
  double time = 1;

  // Floating point computations performed by this operation, as a fraction of
  // peak core FLOPS * program time. This representation has useful properties:
  //  - it is proportional to the number of floating point operations performed
  //  - utilization is flops/time
  //  - wasted potential flops is proportional to time - flops
  //  - it does not reveal the peak core FLOPS of the hardware
  double flops = 2;

  // The memory bandwidth used to load operands, as a fraction of the
  // theoretical memory bandwidth on the specific hardware.
  double memory_bandwidth = 3;

  // Elapsed core-time in picoseconds.
  double raw_time = 11;

  // Total floating-point operations performed.
  double raw_flops = 12;

  // Total bytes accessed (includes both reads and writes).
  double raw_bytes_accessed = 13;
}