#include &lt;optional&gt;

#include &lt;nlohmann/json.hpp&gt;
#include &lt;spdlog/spdlog.h&gt;

#include "backend.hpp"
#include "hardware.hpp"

namespace huggingface::tgi::backends::trtllm {
    tle::ParallelConfig backend_workspace_t::parallel_config() const {
        // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
        const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get&lt;size_t&gt;();

        auto mode = tle::CommunicationMode::kLEADER;
        std::optional&lt;tle::OrchestratorConfig&gt; orchestratorConfig = std::nullopt;

        if (world_size > 1) {
            SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
            mode = tle::CommunicationMode::kORCHESTRATOR;
            orchestratorConfig = std::make_optional&lt;tle::OrchestratorConfig&gt;(true, executor_worker_path_, nullptr, true);
        } else {
            SPDLOG_INFO("Detected single engine deployment, using leader mode");
        }

        return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
    }

    tle::ExecutorConfig backend_workspace_t::executor_config() const {
        // Retrieve the compute capabilities to enable some options at runtime
        const auto compute_capabilities = hardware::cuda::compute_capabilities_t();

        // Allocate the config
        tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);

        // Set the parallel config as inferred
        executor_config.setParallelConfig(parallel_config());

        // Define some configuration variables
        executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
        executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere()); // chunked context needs Ampere or newer
        executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
        return executor_config;
    }

    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
            : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}

    size_t backend_t::num_tokens_ready() const noexcept {
        return executor_.getNumResponsesReady();
    }

    std::expected&lt;request_id_t, backend_error_t&gt;
    backend_t::submit(std::span&lt;const token_id_t&gt; token_ids, const generation_params_t g_params,
                      const sampling_params_t s_params) noexcept {
        SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
        return executor_.enqueueRequest(tle::Request{
                {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
                static_cast&lt;tle::SizeType32&gt;(g_params.max_new_tokens),
                true,
                (tle::SamplingConfig) s_params,
                tle::OutputConfig{ /* returnLogProbs= */ true},
                std::nullopt,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                workspace.generation_config().stop_words
        });
    }

    std::vector&lt;tle::Response&gt; backend_t::pull_tokens() noexcept {
        SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
        return executor_.awaitResponses();
    }

    void backend_t::cancel(request_id_t request_id) noexcept {
        SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
        executor_.cancelRequest(request_id);
    }
}
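
// --------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original translation unit):
// shows how a caller might drive backend_t — submit a request, then drain
// ready responses. The function name `example_generate` and the arguments
// `prompt_tokens`, `g_params` and `s_params` are hypothetical placeholders;
// the real driver lives outside this file.
// --------------------------------------------------------------------------
namespace huggingface::tgi::backends::trtllm {
    [[maybe_unused]] static void example_generate(
            std::filesystem::path engines_folder,
            std::filesystem::path executor_worker_path,
            std::span&lt;const token_id_t&gt; prompt_tokens,
            const generation_params_t g_params,
            const sampling_params_t s_params) {
        backend_t backend(engines_folder, executor_worker_path);

        // submit() returns either a request id or a backend_error_t
        if (const auto request_id = backend.submit(prompt_tokens, g_params, s_params); request_id.has_value()) {
            // pull_tokens() forwards to Executor::awaitResponses(), which blocks until at
            // least one response is available; a real caller would keep pulling until the
            // final response for `request_id` has been received (or cancel() the request).
            for ([[maybe_unused]] const auto &response: backend.pull_tokens()) {
                // hand the generated tokens / log-probs back to the caller here
            }
        } else {
            SPDLOG_ERROR("Failed to submit the request to the executor");
        }
    }
}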