#include "module.h" #include #include "replica_pool.h" namespace ctranslate2 { namespace python { class EncoderWrapper : public ReplicaPoolHelper { public: using ReplicaPoolHelper::ReplicaPoolHelper; EncoderForwardOutput forward_batch(const std::variant& inputs, const std::optional& lengths, const std::optional& token_type_ids) { std::future future; std::shared_lock lock(_mutex); assert_model_is_ready(); switch (inputs.index()) { case 0: future = _pool->forward_batch_async( std::get(inputs), token_type_ids.value_or(std::vector>())); break; case 1: future = _pool->forward_batch_async( std::get(inputs), token_type_ids.value_or(std::vector>())); break; case 2: if (!lengths) throw std::invalid_argument("lengths vector is required when passing a dense input"); future = _pool->forward_batch_async( std::get(inputs), lengths.value(), token_type_ids.value_or(std::vector>())); break; } return future.get(); } }; void register_encoder(py::module& m) { py::class_(m, "EncoderForwardOutput", "Forward output of an encoder model.") .def_readonly("last_hidden_state", &EncoderForwardOutput::last_hidden_state, "Output of the last layer.") .def_readonly("pooler_output", &EncoderForwardOutput::pooler_output, "Output of the pooling layer.") .def("__repr__", [](const EncoderForwardOutput& output) { return "EncoderForwardOutput(last_hidden_state=" + std::string(py::repr(py::cast(output.last_hidden_state))) + ", pooler_output=" + std::string(py::repr(py::cast(output.pooler_output))) + ")"; }) ; py::class_( m, "Encoder", R"pbdoc( A text encoder. Example: >>> encoder = ctranslate2.Encoder("model/", device="cpu") >>> encoder.forward_batch([["▁Hello", "▁world", "!"]]) )pbdoc") .def(py::init>&, const StringOrMap&, size_t, size_t, long, bool, bool, py::object>(), py::arg("model_path"), py::arg("device")="cpu", py::kw_only(), py::arg("device_index")=0, py::arg("compute_type")="default", py::arg("inter_threads")=1, py::arg("intra_threads")=0, py::arg("max_queued_batches")=0, py::arg("flash_attention")=false, py::arg("tensor_parallel")=false, py::arg("files")=py::none(), R"pbdoc( Initializes the encoder. Arguments: model_path: Path to the CTranslate2 model directory. device: Device to use (possible values are: cpu, cuda, auto). device_index: Device IDs where to place this encoder on. compute_type: Model computation type or a dictionary mapping a device name to the computation type (possible values are: default, auto, int8, int8_float32, int8_float16, int8_bfloat16, int16, float16, bfloat16, float32). inter_threads: Maximum number of parallel generations. intra_threads: Number of OpenMP threads per encoder (0 to use a default value). max_queued_batches: Maximum numbers of batches in the queue (-1 for unlimited, 0 for an automatic value). When the queue is full, future requests will block until a free slot is available. flash_attention: run model with flash attention 2 for self-attention layer tensor_parallel: run model with tensor parallel mode files: Load model files from the memory. This argument is a dictionary mapping file names to file contents as file-like or bytes objects. If this is set, :obj:`model_path` acts as an identifier for this model. 
)pbdoc") .def_property_readonly("device", &EncoderWrapper::device, "Device this encoder is running on.") .def_property_readonly("device_index", &EncoderWrapper::device_index, "List of device IDs where this encoder is running on.") .def_property_readonly("compute_type", &EncoderWrapper::compute_type, "Computation type used by the model.") .def_property_readonly("num_encoders", &EncoderWrapper::num_replicas, "Number of encoders backing this instance.") .def_property_readonly("num_queued_batches", &EncoderWrapper::num_queued_batches, "Number of batches waiting to be processed.") .def_property_readonly("tensor_parallel", &EncoderWrapper::tensor_parallel, "Run model with tensor parallel mode.") .def_property_readonly("num_active_batches", &EncoderWrapper::num_active_batches, "Number of batches waiting to be processed or currently processed.") .def("forward_batch", &EncoderWrapper::forward_batch, py::arg("inputs"), py::arg("lengths")=py::none(), py::arg("token_type_ids")=py::none(), py::call_guard(), R"pbdoc( Forwards a batch of sequences in the encoder. Arguments: inputs: A batch of sequences either as string tokens or token IDs. This argument can also be a dense int32 array with shape ``[batch_size, max_length]`` (e.g. created from a Numpy array or PyTorch tensor). lengths: The length of each sequence as a int32 array with shape ``[batch_size]``. Required when :obj:`inputs` is a dense array. token_type_ids: A batch of token type IDs of same shape as :obj:`inputs`. ``[batch_size, max_length]``. Returns: The encoder model output. )pbdoc") .def("unload_model", &EncoderWrapper::unload_model, py::arg("to_cpu")=false, py::call_guard(), R"pbdoc( Unloads the model attached to this encoder but keep enough runtime context to quickly resume encoder on the initial device. Arguments: to_cpu: If ``True``, the model is moved to the CPU memory and not fully unloaded. )pbdoc") .def("load_model", &EncoderWrapper::load_model, py::arg("keep_cache")=false, py::call_guard(), R"pbdoc( Loads the model back to the initial device. Arguments: keep_cache: If ``True``, the model cache in the CPU memory is not deleted if it exists. )pbdoc") .def_property_readonly("model_is_loaded", &EncoderWrapper::model_is_loaded, "Whether the model is loaded on the initial device and ready to be used.") ; } } }