From fd7e2b5bbd18ed7392a69f07a655ee64b55093b9 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 2 Dec 2024 00:05:59 +0100 Subject: [PATCH] feat(backend): more impl --- backends/trtllm/csrc/backend.cpp | 2 +- backends/trtllm/csrc/backend.hpp | 58 ++++++++++++++++++++----------- backends/trtllm/csrc/ffi.hpp | 52 ++++++++++++++++++++++++--- backends/trtllm/csrc/hardware.hpp | 1 + 4 files changed, 88 insertions(+), 25 deletions(-) diff --git a/backends/trtllm/csrc/backend.cpp b/backends/trtllm/csrc/backend.cpp index bc3e33de..ce5cd851 100644 --- a/backends/trtllm/csrc/backend.cpp +++ b/backends/trtllm/csrc/backend.cpp @@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm { } std::expected - backend_t::submit(std::span token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept { + backend_t::submit(const std::span token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept { SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params); return executor_.enqueueRequest(tle::Request { {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens diff --git a/backends/trtllm/csrc/backend.hpp b/backends/trtllm/csrc/backend.hpp index 69724187..c8f8f21c 100644 --- a/backends/trtllm/csrc/backend.hpp +++ b/backends/trtllm/csrc/backend.hpp @@ -7,7 +7,9 @@ #include #include +#include #include + #include #include @@ -58,7 +60,8 @@ namespace huggingface::tgi::backends::trtllm { }; /** - * + * Represent possible values from transformers generation `generation_config.json`. + * It usually stores default sampling parameters to use, such as top_p, temperature, etc. 
 */ struct generation_config_t { float_t top_p; float_t temperature; std::list> stop_words; constexpr explicit generation_config_t(const json &config) : top_p(config.value("top_p", 1.0f)), temperature( config.value("temperature", 1.0f)), stop_words(0) { if(config.contains("eos_token_id") && config["eos_token_id"].is_array()) { const auto& eos_token_id = config["eos_token_id"]; - std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](int32_t token_id) { - stop_words.push_back({token_id}); + std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) { + stop_words.emplace_back(token_id.template get()); }); + + SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size()); } } }; /** - * + * Helper class representing various items which are stored within the TensorRT-LLM engines folder and + * can be retrieved at runtime */ class backend_workspace_t { private: @@ -111,32 +117,41 @@ namespace huggingface::tgi::backends::trtllm { [[nodiscard]] std::filesystem::path engines_folder() const { return engines_folder_; } /** - * - * @return + * Hugging Face transformers' generated `generation_config_t` mapping information stored in the + * `generation_config.json` holding default generation parameters. 
+ * @return `generation_config_t` */ [[nodiscard]] const generation_config_t& generation_config() const { return generation_config_; } - /** - * - * @return +/** + * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used + * to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information + * @return `tensorrt_llm::executor::ParallelConfig` instance */ [[nodiscard]] constexpr tle::ParallelConfig parallel_config() const; /** - * - * @return + * Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used + * to initialize `tensorrt_llm::executor::Executor` + * @return `tensorrt_llm::executor::ExecutorConfig` instance */ [[nodiscard]] constexpr tle::ExecutorConfig executor_config() const; }; - /** - * + * Error raised by the underlying backend implementation */ - class backend_exception_t: std::exception {}; + enum backend_error_t { + EXECUTOR_NOT_READY = 3, + EXECUTOR_SCHEDULING_FAILED = 4, + }; + /** - * + * Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to + * - schedule new request + * - pull status of submitted request(s) + * - cancel submitted request(s) */ class backend_t { private: @@ -156,7 +171,7 @@ namespace huggingface::tgi::backends::trtllm { * @return Either newly submitted request's id or the error why it failed to submit */ [[nodiscard("Discarded executor request_id needs to be assigned")]] - std::expected + std::expected submit(std::span token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept; /** @@ -188,15 +203,18 @@ namespace huggingface::tgi::backends::trtllm { }; } +/** + * Helper structures to define formatting strategies for various types in the backend + */ template <> struct fmt::formatter: formatter { - auto format(huggingface::tgi::backends::trtllm::generation_params_t c, format_context& ctx) const -> format_context::iterator { - return format_to(ctx.out(), "generation_params_t{{ 
max_new_tokens={:d} }}", c.max_new_tokens); + auto format(huggingface::tgi::backends::trtllm::generation_params_t const& c, format_context& ctx) const -> format_context::iterator { + return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens); } }; template <> struct fmt::formatter: formatter { - auto format(huggingface::tgi::backends::trtllm::sampling_params_t c, format_context& ctx) const -> format_context::iterator { - return format_to( + auto format(huggingface::tgi::backends::trtllm::sampling_params_t const& c, format_context& ctx) const -> format_context::iterator { + return fmt::format_to( ctx.out(), "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}", c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed diff --git a/backends/trtllm/csrc/ffi.hpp b/backends/trtllm/csrc/ffi.hpp index b964a064..b3f20b83 100644 --- a/backends/trtllm/csrc/ffi.hpp +++ b/backends/trtllm/csrc/ffi.hpp @@ -1,5 +1,8 @@ #include +#include + #include +#include namespace rust::behavior { template @@ -17,13 +20,15 @@ namespace rust::behavior { #include namespace huggingface::tgi::backends::trtllm { + std::once_flag backend_initialized_flag; + class tensorrt_llm_backend_t { private: backend_t inner_; public: tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path) - : inner_(engine_folder) {} + : inner_(engine_folder, executor_worker_path) {} size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); @@ -64,7 +69,46 @@ namespace huggingface::tgi::backends::trtllm { } }; - std::unique_ptr create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) { - return std::make_unique(engines_folder); + void initialize_logging() { +#ifndef TGI_TRTLLM_BACKEND_DEBUG + if (const auto TRTLLM_LOG_LEVEL_CSTR = 
std::getenv("TRTLLM_LOG_LEVEL")) { + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if (log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + spdlog::set_level(spdlog::level::info); } -} \ No newline at end of file +#else + spdlog::set_level(spdlog::level::debug); +#endif + } + + void initialize_tensorrt_llm_backend() { + SPDLOG_INFO("Initializing TGI - TensorRT-LLM Backend (v{})", tle::version()); + + // Initialize everyone + initialize_logging(); + nvmlInit_v2(); + initTrtLlmPlugins(); + + const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count(); + if (numGpus.has_value()) { + SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", numGpus.value()); + } else { + SPDLOG_WARN("[FFI] Failed to detect Nvidia GPU(s) on the system"); + // todo: throw + } + } + + std::unique_ptr create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) { + std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend); + return std::make_unique( + std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format), + std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format) + ); + } +} diff --git a/backends/trtllm/csrc/hardware.hpp b/backends/trtllm/csrc/hardware.hpp index f3435544..b7000885 100644 --- a/backends/trtllm/csrc/hardware.hpp +++ b/backends/trtllm/csrc/hardware.hpp @@ -1,3 +1,4 @@ +#pragma once #include #include