syntax = "proto3";

package flyteidl.plugins.kubeflow;

option go_package = "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/plugins";

import "flyteidl/core/tasks.proto";
import "flyteidl/plugins/kubeflow/common.proto";

// Custom proto for torch elastic config for distributed training using 
// https://github.com/kubeflow/training-operator/blob/master/pkg/apis/kubeflow.org/v1/pytorch_types.go
message ElasticConfig {
  string rdzv_backend = 1;
  int32 min_replicas = 2;
  int32 max_replicas = 3;
  int32 nproc_per_node = 4;
  int32 max_restarts = 5;
}

// Proto for plugin that enables distributed training using https://github.com/kubeflow/pytorch-operator
message DistributedPyTorchTrainingTask {
  // Worker replicas spec
  DistributedPyTorchTrainingReplicaSpec worker_replicas = 1;

  // Master replicas spec, master replicas can only have 1 replica
  DistributedPyTorchTrainingReplicaSpec master_replicas = 2;

  // RunPolicy encapsulates various runtime policies of the distributed training
  // job, for example how to clean up resources and how long the job can stay
  // active.
  RunPolicy run_policy = 3;

  // config for an elastic pytorch job
  ElasticConfig elastic_config = 4;
}

message DistributedPyTorchTrainingReplicaSpec {
  // Number of replicas
  int32 replicas = 1;

  // Image used for the replica group
  string image = 2;

  // Resources required for the replica group
  core.Resources resources = 3;
  
  // RestartPolicy determines whether pods will be restarted when they exit
  RestartPolicy restart_policy = 4;
}