syntax = "proto3"; package tensorflow.tpu; import "google/protobuf/wrappers.proto"; message ClippingLimits { google.protobuf.FloatValue lower = 1; // -inf if not set google.protobuf.FloatValue upper = 2; // +inf if not set } // Get the learning rate from the parameters of the SendTPUEmbeddingGradients // op. message DynamicLearningRate { } // Source of learning rate to use. message LearningRate { oneof learning_rate { float constant = 1; DynamicLearningRate dynamic = 2; } } // Each optimizer's parameter proto has a link to its documentation and CPU // implementation (if available) for user reference. // https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151 message AdagradParameters { float initial_accumulator = 1; } // https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L423 message StochasticGradientDescentParameters { } // https://www.tensorflow.org/api_docs/python/tf/train/FtrlOptimizer // https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L192 message FtrlParameters { float l1 = 1; float l2 = 2; float lr_power = 3; float initial_accum = 4; float initial_linear = 5; } // The Adam optimizer does not implement hyper-parameter update; use the dynamic // learning rate feature instead, setting the learning rate to: // user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) // Here, t is the current timestep. // // https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer // https://github.com/tensorflow/tensorflow/blob/ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b/tensorflow/python/training/adam.py#L54 // // Note that the code by default implements the lazy version of Adam // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/LazyAdamOptimizer) // unless the use_non_lazy_adam parameter is set, in which case it implements // the normal version of Adam that updates all parameters in the embedding // table, even for entries that are not used in the current minibatch // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If // use_non_lazy_adam is enabled, use_gradient_accumulation is also required in // order to get correct results; a warning will be printed otherwise (which may // change to an error in the future). If use_sum_inside_sqrt is set, the Adam // variable update formula will be changed from m / (sqrt(v) + epsilon) to // m / sqrt(v + epsilon**2); this option improves the performance of TPU // training and is not expected to harm model quality. 
message AdamParameters {
  float beta1 = 3;
  float beta2 = 4;
  float epsilon = 5;
  float initial_m = 6;
  float initial_v = 7;
  bool use_non_lazy_adam = 8;
  bool use_sum_inside_sqrt = 10;
}

// https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L271
message MomentumParameters {
  float momentum = 1;
  bool use_nesterov = 2;
  float initial_accum = 3;
}

// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L356
message RmsPropParameters {
  float rho = 1;
  float momentum = 2;
  float epsilon = 3;
  float initial_ms = 4;
  float initial_mom = 5;
}

// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L372
message CenteredRmsPropParameters {
  float rho = 1;
  float momentum = 2;
  float epsilon = 3;
  float initial_ms = 4;
  float initial_mom = 5;
  float initial_mg = 6;
}

// Variant of algorithm in http://proceedings.mlr.press/v44/shamir15.pdf
message MdlAdagradLightParameters {
  float l2 = 1;
  float lr_power = 2;
  float min_servable_mdl_benefit = 3;
  float mdl_mix_in_margin = 4;
  float mdl_benefit_rampup_coeff = 5;
  float mdl_min_weight = 6;
  float benefit_revisit_scale = 7;
  float max_event_benefit = 8;
  float max_total_benefit = 9;
  float mdl_hard_limit = 10;
  bool hard_limit_min_benefit = 11;
  bool mdl_regularize = 12;
  float initial_accumulator = 13;
  float initial_weight = 14;
  float initial_benefit = 15;
}

// https://www.tensorflow.org/api_docs/python/tf/train/AdadeltaOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L68
message AdadeltaParameters {
  float rho = 1;
  float epsilon = 2;
  float initial_accumulator = 3;
  float initial_update = 4;
}

// https://www.tensorflow.org/api_docs/python/tf/train/ProximalAdagradOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L164
message ProximalAdagradParameters {
  float l1 = 1;
  float l2 = 2;
  float initial_accumulator = 3;
}

message OptimizationParameters {
  // Learning rate used for updating the embedding layer parameters.
  LearningRate learning_rate = 13;
  reserved 1;  // Old learning rate tag.

  // Limits to which to clip the weight values after the backward pass; not
  // present means no limits are applied.
  ClippingLimits clipping_limits = 2;

  // Limits to which to clip the backward pass gradient before using it for
  // updates; not present means no limits are applied.
  ClippingLimits gradient_clipping_limits = 7;

  // Amount of weight decay to apply; see weight_decay_optimizers.py for
  // details. Almost all optimizers are supported with this option (MDL Adagrad
  // Light does not work, and SGD does not behave as expected if it is
  // enabled). Although there is no check, users who want weight decay will
  // probably also want to enable gradient accumulation so that the decay
  // happens once per minibatch.
  float weight_decay_factor = 16;
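
  // For illustration only (not part of the schema), the clipping and weight
  // decay fields above could be set in proto text format roughly as follows;
  // the numeric values are made-up examples:
  //   clipping_limits { lower { value: -1.0 } upper { value: 1.0 } }
  //   gradient_clipping_limits { lower { value: -10.0 } upper { value: 10.0 } }
  //   weight_decay_factor: 0.01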

  // Whether to use gradient accumulation (do two passes over the input
  // gradients: one to accumulate them into a temporary array and another to
  // apply them using the actual optimization algorithm). This feature is
  // experimental -- it has not been fully verified and may cause training
  // crashes and/or failures.
  bool use_gradient_accumulation = 15;

  // Optimization algorithm parameters; which field is selected determines
  // which algorithm to use.
  oneof parameters {
    AdagradParameters adagrad = 3;
    StochasticGradientDescentParameters stochastic_gradient_descent = 4;
    FtrlParameters ftrl = 5;
    AdamParameters adam = 6;
    MomentumParameters momentum = 8;
    RmsPropParameters rms_prop = 9;
    CenteredRmsPropParameters centered_rms_prop = 10;
    MdlAdagradLightParameters mdl_adagrad_light = 11;
    AdadeltaParameters adadelta = 12;
    ProximalAdagradParameters proximal_adagrad = 14;
  }
}

// Specification of an optimization algorithm's state variables (both the main
// value vector and any extra accumulators, etc.).
message StateVariableSpecification {
  // Parameter name for the state variable.
  string name = 1;

  // A normal state variable that should be saved and restored in checkpoints
  // and used as an input or output to non-debug TensorFlow ops.
  message UserDefined {
  }

  // A state variable that should be filled with a constant and normally hidden
  // from users (used for intermediate gradients being accumulated, for
  // example).
  message FillWithConstant {
    double initial_value = 1;
  }

  // Usage type of this state variable.
  oneof usage {
    UserDefined user_defined = 2;
    FillWithConstant fill_with_constant = 3;
  }
}
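
// For illustration only (not part of the schema): an OptimizationParameters
// message that selects Adagrad with a constant learning rate and gradient
// accumulation could look roughly like this in proto text format; the values
// are made-up examples:
//   learning_rate { constant: 0.1 }
//   use_gradient_accumulation: true
//   adagrad { initial_accumulator: 0.1 }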