syntax = "proto3";
option cc_enable_arenas = true;

package Ydb.Maintenance;
option java_package = "tech.ydb.draft.maintenance.v1";
option go_package = "github.com/ydb-platform/ydb-go-genproto/draft/protos/Ydb_Maintenance";

import "protos/ydb_operation.proto";
import "protos/ydb_status_codes.proto";

import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

// Describes the scope of a single action: exactly one of a physical disk,
// a node id, or a host name.
message ActionScope {
    // Identifies a physical disk by its node and its disk id.
    message PDiskId {
        uint32 node_id = 1;
        uint32 pdisk_id = 2;
    }

    oneof scope {
        PDiskId pdisk_id = 1;
        uint32 node_id = 2;
        string host_name = 3;
        // Not part of the schema (kept from the original as commented-out
        // candidates):
        // string RackName = 3;
        // string DataCenter = 4;
    }
}

// Taking an exclusive lock to perform maintenance.
message LockAction {
    ActionScope action_scope = 1;
    google.protobuf.Duration duration = 2;
}

// Will not be implemented in the 1st version.
// Switching to maintenance mode. Maintenance modes
// can overlap with each other.
message SetMaintenanceModeAction {
    ActionScope action_scope = 1;
    bool drain_tablets = 2;
    bool evict_vdisks = 3;
    google.protobuf.Duration duration = 4;
}

// A single maintenance action: exactly one of the supported action kinds.
message Action {
    oneof action {
        LockAction lock_action = 1;
        // NOTE: "maintainance" is misspelled in the published field name;
        // it is kept as is so that generated code and JSON stay compatible.
        SetMaintenanceModeAction set_maintainance_mode_action = 2;
    }
}

// Identifies an action by task, group and action id.
message ActionUid {
    string task_uid = 1;
    // Unique id within a single task. Defined by CMS.
    // NOTE(review): the original comment sits between task_uid and group_id;
    // it presumably covers both group_id and action_id - confirm.
    uint32 group_id = 2;
    uint32 action_id = 3;
}

// Current state of a single action.
message ActionState {
    enum ActionStatus {
        ACTION_STATUS_UNSPECIFIED = 0;
        ACTION_STATUS_CREATED = 1;
        ACTION_STATUS_WAITING = 2;
        ACTION_STATUS_PENDING = 3;
        // Permission has been granted for the action.
        // NOTE: "GRANDED" is misspelled in the published value name;
        // it is kept as is for compatibility.
        ACTION_STATUS_PERMIT_GRANDED = 4;
        // Permission withdrawn due to deadline.
        ACTION_STATUS_TIMEOUT_EXPIRED = 5;
        // The user marked the action as completed.
        ACTION_STATUS_FINISHED_BY_USER = 6;
    }

    // The reason why the state did not update.
    enum ActionReason {
        ACTION_REASON_UNSPECIFIED = 0;
        // Action is ok.
        ACTION_REASON_OK = 1;
        // Affected storage group has too many unavailable (locked or down)
        // vdisks. Can't grant another for this availability mode.
        ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS = 2;
        // Blob storage group is already broken.
        ACTION_REASON_STORAGE_GROUP_BROKEN = 3;
        // Too many unavailable state storage rings,
        // it is impossible to grant node from another ring.
        ACTION_REASON_TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS = 4;
        // State storage broken. Too many (more than (nToSelect - 1) / 2)
        // unavailable rings.
        ACTION_REASON_STATE_STORAGE_BROKEN = 5;
        // Issue in cluster disabled nodes limit.
        // NOTE: "RICHED" (reached) is misspelled in the published value name;
        // it is kept as is for compatibility.
        ACTION_REASON_DISABLED_NODES_LIMIT_RICHED = 6;
        // Issue in tenant limits.
        ACTION_REASON_TENANT_DISABLED_NODES_LIMIT_RICHED = 7;
        // Wrong request.
        ACTION_REASON_WRONG_REQUEST = 8;
    }

    Action action = 1;
    ActionStatus status = 2;
    ActionUid action_uid = 3;
    ActionReason reason = 4;

    // The time when the state was assigned.
    google.protobuf.Timestamp state_timestamp = 5;

    // Specified for the permission-granted state
    // (ACTION_STATUS_PERMIT_GRANDED).
    google.protobuf.Timestamp deadline = 6;
}

// A group of actions.
message ActionGroup {
    repeated Action actions = 1;
}

// States of the actions of one group.
message ActionGroupStates {
    repeated ActionState action_states = 1;
}

enum AvailabilityMode {
    AVAILABILITY_MODE_UNSPECIFIED = 0;

    // By default CMS tries to guarantee cluster availability
    // by allowing at most 1 disabled disk in each storage group.
    // For compute nodes tenant and cluster policies are followed.
    // In this mode CMS allows at most 1 disabled state storage ring.
    AVAILABILITY_MODE_STRONG = 1;

    // This mode allows to move cluster restart/update forward
    // in case some nodes are permanently down. In this mode
    // CMS allows at most 1 locked (by permission to restart
    // node or replace device) disk in a group. But total number
    // of disabled disks for a group shouldn't exceed number
    // of parity parts in that group.
    // Compute nodes are handled as in default mode.
    // In this mode CMS allows (nToSelect - 1) / 2 state storage rings.
    AVAILABILITY_MODE_WEAK = 2;

    // In this mode CMS allows to lock 1 disk in a group, but if it can't
    // it waits for 15 minutes and gives 1 more node.
    AVAILABILITY_MODE_SMART = 3;

    // In this mode CMS allows at most 1 locked disk in a group
    // ignoring its parity parts count. Allows to restart nodes
    // even if multiple disks of some group are down. Using
    // this mode might cause data unavailability.
    // For compute nodes CMS follows tenant and cluster policies
    // but allows to restart at least one node for tenant or
    // cluster.
    AVAILABILITY_MODE_FORCE = 4;
}

enum ItemState {
    // Device/node state couldn't be identified.
    ITEM_STATE_UNSPECIFIED = 0;
    // Device/node is up.
    ITEM_STATE_UP = 1;
    // Device/node is up, but a permission has been granted.
    ITEM_STATE_LOCKED = 2;
    // Device/node is down due to planned restart.
    ITEM_STATE_RESTART = 3;
    // Device/node is down off-schedule.
    ITEM_STATE_DOWN = 4;
}

message ListClusterNodesRequest {}

message ListClusterNodesResponse {
    message Node {
        uint32 node_id = 1;
        string data_center = 2;
        string rack = 3;
        string fqdn = 4;
        uint32 interconnect_port = 5;
        ItemState state = 6;
        string tenant = 7;
        bool is_storage = 8;
        bool is_dynamic = 9;
    }

    repeated Node nodes = 1;
}

message ListNodesDevicesRequest {
    repeated uint32 node_id = 1;
}

message ListNodesDevicesResponse {
    message Device {
        string name = 1;
        ItemState state = 2;
    }

    message NodeDevices {
        uint32 node_id = 1;
        repeated Device devices = 2;
    }

    repeated NodeDevices nodes_devices = 1;
}

message MaintenanceTaskOptions {
    // The maximum number of action groups in progress at a time.
    uint32 in_flight = 1;
    bool dry_run = 2;

    // Name of a task and some comment.
    // Provided for the convenience of the user.
    string name = 3;
    string comment = 4;

    // Availability mode is not preserved for scheduled events.
    AvailabilityMode availability_mode = 5;

    // User defined GUID.
    string task_uid = 6;

    // Task with largest priority blocks other tasks
    // until all actions are completed. Default is 0.
    int64 priority = 7;
}

message CreateMaintenanceTaskRequest {
    MaintenanceTaskOptions task_options = 1;
    repeated ActionGroup action_groups = 2;

    // Indicates that client is no longer interested in the task after
    // the specified duration starting from the time task arrives at the CMS.
    // If not specified then default duration from CMS config is used.
    // NOTE(review): field numbers 3 and 4 are not used in this message -
    // confirm whether they were assigned earlier and should be `reserved`.
    google.protobuf.Duration task_timeout = 5;
}

// Updates action states and tries to grant permissions.
message RefreshMaintenanceTaskRequest {
    // NOTE(review): numbering starts at 2; field number 1 is not used here -
    // confirm whether it should be `reserved`.
    string task_uid = 2;
}

message MaintenanceTaskResponse {
    StatusIds.StatusCode status = 1;

    string task_uid = 2;
    repeated ActionGroupStates actions_states = 3;

    // Try again after this deadline. Specified if there are no
    // permission-granted actions after the request.
    google.protobuf.Timestamp deadline = 4;
}

message ListMaintenanceTasksRequest {
    // If specified, it will return the tasks created by this user.
    // Otherwise all tasks will be returned.
    string user = 1;
}

message ListMaintenanceTasksResponse {
    repeated string tasks_uids = 1;
}

// Returns specified task.
message GetMaintenanceTaskRequest {
    string task_uid = 1;
}

message GetMaintenanceTaskResponse {
    MaintenanceTaskOptions task_options = 1;
    repeated ActionGroupStates actions_group_states = 2;
    google.protobuf.Timestamp task_deadline = 3;
}

// Drop maintenance task.
message DropMaintenanceTaskRequest {
    string task_uid = 1;
}

// Extends request deadline.
message ProlongateMaintenanceTaskRequest {
    string task_uid = 1;
    google.protobuf.Timestamp new_deadline = 2;
}

message ManageMaintenanceTaskResponse {
    StatusIds.StatusCode status = 1;
}

// Removes resolved permits.
message ReleasePermitRequest {
    repeated ActionUid action_uid = 1;
}

// Extends permits deadlines.
message ProlongatePermitRequest {
    message ActionDuration {
        ActionUid action_uid = 1;
        google.protobuf.Timestamp new_deadline = 2;
    }

    repeated ActionDuration action_durations = 1;
}

message PermitStatus {
    ActionUid action_uid = 1;
    StatusIds.StatusCode status = 2;
}

message ManagePermitResponse {
    repeated PermitStatus permit_statuses = 1;
}

// Getting a detailed reason why the action doesn't get a permit.
message GetReadableActionReasonRequest {
    repeated ActionUid action_ids = 1;
}

message GetReadableActionReasonResponse {
    message Reason {
        ActionState action_state = 1;
        // NOTE: the capitalized field name is non-standard (fields are
        // normally lower_snake_case) but is kept as is so that generated
        // code and JSON stay compatible.
        string Reason = 2;
    }

    repeated Reason reasons = 1;
}