From fd7e2b5bbd18ed7392a69f07a655ee64b55093b9 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 2 Dec 2024 00:05:59 +0100 Subject: [PATCH] feat(backend): more impl --- backends/trtllm/csrc/backend.cpp | 2 +- backends/trtllm/csrc/backend.hpp | 58 ++++++++++++++++++++----------- backends/trtllm/csrc/ffi.hpp | 52 ++++++++++++++++++++++++--- backends/trtllm/csrc/hardware.hpp | 1 + 4 files changed, 88 insertions(+), 25 deletions(-) diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp index bc3e33de..ce5cd851 100644 --- a/backends/trtllm/csrc/backend.cpp +++ b/backends/trtllm/csrc/backend.cpp @@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm { } std::expected - backend_t::submit(std::span token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept { + backend_t::submit(const std::span token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept { SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params); return executor_.enqueueRequest(tle::Request { {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp index 69724187..c8f8f21c 100644 --- a/backends/trtllm/csrc/backend.hpp +++ b/backends/trtllm/csrc/backend.hpp @@ -7,7 +7,9 @@ #include #include +#include #include + #include #include @@ -58,7 +60,8 @@ namespace huggingface::tgi::backends::trtllm { }; /** - * + * Represent possible values from transformers generation `generation_config.json`. + * It usually stores default sampling parameters to use, such as top_p, temperature, etc. 
 */ struct generation_config_t { float_t top_p; float_t temperature; std::list> stop_words; constexpr explicit generation_config_t(const json &config) : top_p(config.value("top_p", 1.0f)), temperature( config.value("temperature", 1.0f)), stop_words(0) { if(config.contains("eos_token_id") && config["eos_token_id"].is_array()) { const auto& eos_token_id = config["eos_token_id"]; - std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](int32_t token_id) { - stop_words.push_back({token_id}); + std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) { + stop_words.emplace_back(token_id.template get()); }); + + SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size()); } } }; /** - * + * Helper class representing various items which are stored within the TensorRT-LLM engines folder and + * can be retrieved at runtime */ class backend_workspace_t { private: @@ -111,32 +117,41 @@ namespace huggingface::tgi::backends::trtllm { [[nodiscard]] std::filesystem::path engines_folder() const { return engines_folder_; } /** - * - * @return + * Hugging Face transformers' generated `generation_config_t` mapping information stored in the + * `generation_config.json` holding default generation parameters. 
+ * @return `generation_config_t` */ [[nodiscard]] const generation_config_t& generation_config() const { return generation_config_; } - /** - * - * @return +/** + * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used + * to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information + * @return `tensorrt_llm::executor::ParallelConfig` instance */ [[nodiscard]] constexpr tle::ParallelConfig parallel_config() const; /** - * - * @return + * Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used + * to initialize `tensorrt_llm::executor::Executor` + * @return `tensorrt_llm::executor::ExecutorConfig` instance */ [[nodiscard]] constexpr tle::ExecutorConfig executor_config() const; }; - /** - * + * Error raised by the underlying backend implementation */ - class backend_exception_t: std::exception {}; + enum backend_error_t { + EXECUTOR_NOT_READY = 3, + EXECUTOR_SCHEDULING_FAILED = 4, + }; + /** - * + * Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to + * - schedule new request + * - pull status of submitted request(s) + * - cancel submitted request(s) */ class backend_t { private: @@ -156,7 +171,7 @@ namespace huggingface::tgi::backends::trtllm { * @return Either newly submitted request's id or the error why it failed to submit */ [[nodiscard("Discarded executor request_id needs to be assigned")]] - std::expected + std::expected submit(std::span token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept; /** @@ -188,15 +203,18 @@ namespace huggingface::tgi::backends::trtllm { }; } +/** + * Helper structures to define formatting strategies for various types in the backend + */ template <> struct fmt::formatter: formatter { - auto format(huggingface::tgi::backends::trtllm::generation_params_t c, format_context& ctx) const -> format_context::iterator { - return format_to(ctx.out(), "generation_params_t{{ 
max_new_tokens={:d} }}", c.max_new_tokens); + auto format(huggingface::tgi::backends::trtllm::generation_params_t const& c, format_context& ctx) const -> format_context::iterator { + return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens); } }; template <> struct fmt::formatter: formatter { - auto format(huggingface::tgi::backends::trtllm::sampling_params_t c, format_context& ctx) const -> format_context::iterator { - return format_to( + auto format(huggingface::tgi::backends::trtllm::sampling_params_t const& c, format_context& ctx) const -> format_context::iterator { + return fmt::format_to( ctx.out(), "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}", c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index b964a064..b3f20b83 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -1,5 +1,8 @@ #include +#include + #include +#include namespace rust::behavior { template @@ -17,13 +20,15 @@ namespace rust::behavior { #include namespace huggingface::tgi::backends::trtllm { + std::once_flag backend_initialized_flag; + class tensorrt_llm_backend_t { private: backend_t inner_; public: tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path) - : inner_(engine_folder) {} + : inner_(engine_folder, executor_worker_path) {} size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); @@ -64,7 +69,46 @@ namespace huggingface::tgi::backends::trtllm { } }; - std::unique_ptr create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) { - return std::make_unique(engines_folder); + void initialize_logging() { +#ifndef TGI_TRTLLM_BACKEND_DEBUG + if (const auto TRTLLM_LOG_LEVEL_CSTR = 
std::getenv("TRTLLM_LOG_LEVEL")) { + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if (log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + spdlog::set_level(spdlog::level::info); } -} \ No newline at end of file +#else + spdlog::set_level(spdlog::level::debug); +#endif + } + + void initialize_tensorrt_llm_backend() { + SPDLOG_INFO("Initializing TGI - TensorRT-LLM Backend (v{})", tle::version()); + + // Initialize everyone + initialize_logging(); + nvmlInit_v2(); + initTrtLlmPlugins(); + + const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count(); + if (numGpus.has_value()) { + SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", numGpus.value()); + } else { + SPDLOG_WARN("[FFI] Failed to detect Nvidia GPU(s) on the system"); + // todo: throw + } + } + + std::unique_ptr create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) { + std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend); + return std::make_unique( + std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format), + std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format) + ); + } +} diff --git a/backends/trtllm/csrc/hardware.hpp b/backends/trtllm/csrc/hardware.hpp index f3435544..b7000885 100644 --- a/backends/trtllm/csrc/hardware.hpp +++ b/backends/trtllm/csrc/hardware.hpp @@ -1,3 +1,4 @@ +#pragma once #include #include