hf_text-generation-inference/backends/trtllm/csrc/backend.cpp

#include <ranges>

#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>

#include "backend.hpp"
#include "hardware.hpp"
namespace huggingface::tgi::backends::trtllm {
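    // Build the tle::ParallelConfig describing how the executor talks to its workers:
    // a single engine (TP = PP = 1) runs in leader mode, while a sharded engine spawns
    // worker processes (executor_worker_path_) coordinated over MPI in orchestrator mode.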
    tle::ParallelConfig backend_workspace_t::parallel_config() const {
        // Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
        const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();

        auto mode = tle::CommunicationMode::kLEADER;
        std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;

        if (world_size > 1) {
            SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
            mode = tle::CommunicationMode::kORCHESTRATOR;
            orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
        } else {
            SPDLOG_INFO("Detected single engine deployment, using leader mode");
        }

        return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
    }
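
    // Assemble the tle::ExecutorConfig used to build the executor: the parallel config
    // inferred above, the KV-cache settings, chunked context when the GPU is Ampere or
    // newer, and the kMAX_UTILIZATION capacity scheduling policy.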
    tle::ExecutorConfig backend_workspace_t::executor_config() const {
        // Retrieve the compute capabilities to enable some options at runtime
        const auto compute_capabilities = hardware::cuda::compute_capabilities_t();

        // Allocate the config
        tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);

        // Set the parallel config as inferred
        executor_config.setParallelConfig(parallel_config());

        // Define some configuration variables
        executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
        executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
        executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
        return executor_config;
    }
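
    // The backend owns the workspace (engines folder + executor worker binary) and the
    // TensorRT-LLM executor built from it by executor_factory_initializer.
    // Typical lifecycle (sketch):
    //   backend_t backend{engines_folder, executor_worker_path};
    //   auto maybe_id = backend.submit(prompt_tokens, g_params, s_params);
    //   for (const auto &response : backend.pull_tokens()) { /* stream tokens back */ }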
    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
            : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
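
    // Despite the name, this reports the number of *responses* the executor has ready to
    // be pulled, as exposed by tle::Executor::getNumResponsesReady().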
    size_t backend_t::num_tokens_ready() const noexcept {
        return executor_.getNumResponsesReady();
    }
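
    // Enqueue a generation request: the prompt tokens are copied into the request, the
    // generation/sampling parameters are converted to their TensorRT-LLM counterparts,
    // log-probs are requested and the stop words from the model's generation_config are
    // attached. On success, the returned request id correlates the streamed responses.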
    std::expected<request_id_t, backend_error_t>
    backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
                      const sampling_params_t s_params) noexcept {
        SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
        return executor_.enqueueRequest(tle::Request{
                {token_ids.begin(), token_ids.end()},  // Making actual copy of the tokens
                static_cast<tle::SizeType32>(g_params.max_new_tokens),
                true,
                (tle::SamplingConfig) s_params,
                tle::OutputConfig{ /* returnLogProbs= */ true},
                std::nullopt,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                workspace.generation_config().stop_words
        });
    }
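
    // Wait for the executor to produce at least one response and hand the whole batch of
    // ready responses back to the caller.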
    std::vector<tle::Response> backend_t::pull_tokens() noexcept {
        SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
        return executor_.awaitResponses();
    }
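
    // Ask the executor to stop generating tokens for the given in-flight request.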
    void backend_t::cancel(request_id_t request_id) noexcept {
        SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
        executor_.cancelRequest(request_id);
    }
}