// Copyright 2023 Ant Group Co., Ltd.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

syntax = "proto3";

package secretflowapis.v1.sdc.teeapps.params;

option java_package = "com.alipay.secretflow.secretflowapis.v1.sdc.teeapps.params";
option java_outer_classname = "DatasetSampleProto";

// Parameters that control how a dataset is sampled.
//
// The "Required." / "Optional." notes below are documentation conventions
// carried over from the original comments; proto3 itself does not enforce
// them.
message DatasetSampleParams {
  // Random number seed.
  // Required.
  uint32 random_state = 1;

  // Number of samples returned after sampling.
  // Note: `number` and `frac` must not be used together. `number` returns a
  // fixed count of n samples, while `frac` derives the returned sample count
  // from the size of the data.
  // Optional.
  uint32 number = 2;

  // Ratio of the sample count after sampling to the sample count before
  // sampling. Usually within 0~1; set it greater than 1 to oversample.
  // Optional.
  double frac = 3;

  // Whether to sample with replacement. If frac > 1, `replacement` must be
  // set to true.
  // Default: false.
  // Required.
  bool replacement = 4;

  // Sampling method:
  //   "RANDOM_SAMPLE":   random sampling
  //   "STRATIFY_SAMPLE": stratified sampling
  // Required.
  string subsample_method = 5;

  // Observed feature for stratified sampling. It is used to split the samples
  // into buckets; only a single feature is supported.
  // e.g. y
  // Optional.
  string observe_feature = 6;

  // Quantile points of the observed value; n quantile points may be given.
  // e.g. [0.5]
  // If `observe_feature` is not empty, `quantiles` must not be empty either.
  repeated double quantiles = 7;

  // Weights of the observed-value buckets. They must sum to 1, and there
  // should be n+1 of them (n quantile points split the range into n+1 parts).
  // When omitted, each bucket is sampled with the same weight it has in the
  // original data distribution.
  // e.g. [0.3, 0.7]
  // Optional.
  repeated double weights = 8;
}

// Report describing the result of a dataset sampling run.
message DatasetSampleReport {
  // Total number of samples (before sampling).
  uint32 num_before_sample = 1;

  // Total number of samples drawn.
  uint32 num_after_sample = 2;

  // Sampling rate.
  double sample_rate = 3;

  // Sampling result for a single bucket of a stratified sampling run.
  message StratifiedSampleBucketResult {
    // Number of samples in the bucket.
    uint32 num_before_sample = 1;

    // Number of samples drawn from the bucket.
    uint32 num_after_sample = 2;

    // Weight of this bucket's drawn samples within the total drawn samples.
    double sample_weight = 3;
  }

  // Optional. For stratified sampling the report must return the sampling
  // result of every bucket; not needed for random sampling.
  repeated StratifiedSampleBucketResult stratified_sample_results = 4;
}