From f8a1463915e4351479af7d2cb970ec3d5bba474f Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Wed, 3 Jul 2024 10:27:53 +0200
Subject: [PATCH] Enable end to end CMake build

---
 .devcontainer/Dockerfile.trtllm              |  0
 .devcontainer/devcontainer.json              |  0
 Cargo.lock                                   |  1 +
 backends/trtllm/CMakeLists.txt               | 48 +++++++------------
 backends/trtllm/Cargo.toml                   |  1 +
 backends/trtllm/build.rs                     | 42 ++++++++++++++--
 backends/trtllm/cmake/trtllm.cmake           | 15 +++++-
 .../trtllm/cmake/utils/detect_cuda_arch.cu   |  0
 backends/trtllm/include/backend.h            | 48 ++++++++++++++++---
 backends/trtllm/lib/backend.cpp              | 45 ++++++++++++++++-
 backends/trtllm/src/backend.rs               | 19 ++++++--
 backends/trtllm/src/ffi.cpp                  |  2 +-
 backends/trtllm/src/lib.rs                   |  6 +++
 backends/trtllm/src/main.rs                  | 13 ++++-
 backends/trtllm/tests/infer_test.cpp         |  9 ++++
 15 files changed, 201 insertions(+), 48 deletions(-)
 create mode 100644 .devcontainer/Dockerfile.trtllm
 create mode 100644 .devcontainer/devcontainer.json
 create mode 100644 backends/trtllm/cmake/utils/detect_cuda_arch.cu
 create mode 100644 backends/trtllm/tests/infer_test.cpp

diff --git a/.devcontainer/Dockerfile.trtllm b/.devcontainer/Dockerfile.trtllm
new file mode 100644
index 00000000..e69de29b
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 00000000..e69de29b
diff --git a/Cargo.lock b/Cargo.lock
index 74246d69..30187dff 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3466,6 +3466,7 @@ dependencies = [
  "cxx-build",
  "text-generation-router",
  "thiserror",
+ "tokenizers",
  "tokio",
  "tokio-stream",
 ]
diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index b9daeea9..77eb1ad0 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -7,21 +7,28 @@ include(FetchContent)
 include(ExternalProject)
 
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
-set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "86-real;89-real;90-real" CACHE STRING "List of CUDA architectures to support")
+set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "native" CACHE STRING "List of CUDA architectures to support")
 set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")
 
-if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
-    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
-else ()
-    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
-        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
-    endif ()
+#### Unit Tests ####
+if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
+    message(STATUS "Building tests")
+    FetchContent_Declare(
+            Catch2
+            GIT_REPOSITORY https://github.com/catchorg/Catch2
+            GIT_TAG v3.6.0
+    )
+    FetchContent_MakeAvailable(Catch2)
 
-    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
-        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
-    endif ()
+    add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
+    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain)
+
+    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
+    include(CTest)
+    include(Catch)
+    catch_discover_tests(tgi_trtllm_backend_tests)
 endif ()
 
 #### External dependencies ####
@@ -34,23 +41,4 @@ target_include_directories(tgi_trtllm_backend_impl PRIVATE
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
         $<INSTALL_INTERFACE:include>
 )
-target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
-
-#### Unit Tests ####
-if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
-    message(STATUS "Building tests")
-    FetchContent_Declare(
-            Catch2
-            GIT_REPOSITORY https://github.com/catchorg/Catch2
-            GIT_TAG v3.6.0
-    )
-    FetchContent_MakeAvailable(Catch2)
-
-    add_executable(tgi_trtllm_backend_tests)
-    target_link_libraries(tests PRIVATE Catch2::Catch2::Catch2WithMain)
-
-    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
-    include(CTest)
-    include(Catch)
-    catch_discover_tests(tests)
-endif ()
\ No newline at end of file
+target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
\ No newline at end of file
diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml
index bfe59346..49b8830d 100644
--- a/backends/trtllm/Cargo.toml
+++ b/backends/trtllm/Cargo.toml
@@ -10,6 +10,7 @@ async-trait = "0.1.74"
 async-stream = "0.3.5"
 cxx = "1.0"
 text-generation-router = { path = "../../router" }
+tokenizers = { version = "0.19", features = ["hf-hub"] }
 tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.14"
 clap = { version = "4.5.4", features = ["derive"] }
diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
index 4a671712..c040c6ab 100644
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@@ -5,17 +5,53 @@ use cxx_build::CFG;
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
 
+// fn build_tensort_llm<P: AsRef<Path>>(tensorrt_llm_root_dir: P, is_debug: bool) -> PathBuf {
+//     let build_wheel_path = tensorrt_llm_root_dir
+//         .as_ref()
+//         .join("/scripts")
+//         .join("build_wheel.py");
+//
+//     let build_wheel_path_str = build_wheel_path.display().to_string();
+//     let mut build_wheel_args = vec![
+//         build_wheel_path_str.as_ref(),
+//         "--cpp_only",
+//         "--extra-cmake-vars BUILD_TESTS=OFF",
+//         "--extra-cmake-vars BUILD_BENCHMARKS=OFF",
+//     ];
+//
+//     if is_debug {
+//         build_wheel_args.push("--fast_build");
+//     }
+//
+//     let out = Command::new("python3")
+//         .args(build_wheel_args)
+//         .output()
+//         .expect("Failed to compile TensorRT-LLM");
+//     PathBuf::new().join(tensorrt_llm_root_dir)
+// }
+
 fn main() {
+    // Misc variables
     let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
     let build_profile = env::var("PROFILE").unwrap();
+    let is_debug = match build_profile.as_ref() {
+        "debug" => true,
+        _ => false,
+    };
+
+    // Compile TensorRT-LLM (as of today, it cannot be compiled from CMake)
+    // let trtllm_path = build_tensort_llm(
+    //     backend_path.join("build").join("_deps").join("trtllm-src"),
+    //     is_debug,
+    // );
 
     // Build the backend implementation through CMake
     let backend_path = cmake::Config::new(".")
         .uses_cxx11()
         .generator("Ninja")
-        .profile(match build_profile.as_ref() {
-            "release" => "Release",
-            _ => "Debug",
+        .profile(match is_debug {
+            true => "Debug",
+            false => "Release",
         })
         .build_target("tgi_trtllm_backend_impl")
         .build();
diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
index d785f350..1003e88e 100644
--- a/backends/trtllm/cmake/trtllm.cmake
+++ b/backends/trtllm/cmake/trtllm.cmake
@@ -1,3 +1,4 @@
+set(USE_CXX11_ABI ON)
 set(NVTX_DISABLE OFF)
 set(BUILD_PYT OFF)
 set(BUILD_PYBIND OFF)
@@ -8,6 +9,18 @@ set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
 set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
 
+#if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
+#    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
+#else ()
+#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
+#        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
+#    endif ()
+#
+#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
+#        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
+#    endif ()
+#endif ()
+
 message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
@@ -24,7 +37,7 @@ fetchcontent_declare(
 )
 fetchcontent_makeavailable(trtllm)
 message(STATUS "Found TensorRT-LLM: ${trtllm_SOURCE_DIR}")
-execute_process(COMMAND git lfs init WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
+execute_process(COMMAND git lfs install WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
 execute_process(COMMAND git lfs pull WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
 add_subdirectory("${trtllm_SOURCE_DIR}/cpp")
 include_directories("${trtllm_SOURCE_DIR}/cpp/include")
diff --git a/backends/trtllm/cmake/utils/detect_cuda_arch.cu b/backends/trtllm/cmake/utils/detect_cuda_arch.cu
new file mode 100644
index 00000000..e69de29b
diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index 591e676a..de4409f3 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -6,19 +6,55 @@
 #define TGI_TRTLLM_BACKEND_H
 
 #include <filesystem>
+#include <vector>
 
-//#include <tensorrt_llm/runtime/common.h>
-//#include <tensorrt_llm/executor/executor.h>
-//
-//namespace tle = tensorrt_llm::executor;
+#include <tensorrt_llm/runtime/common.h>
+#include <tensorrt_llm/executor/executor.h>
+
+namespace tle = tensorrt_llm::executor;
 
 namespace huggingface::tgi::backends {
     class TensorRtLlmBackend {
     private:
-//        tle::Executor executor;
+        tle::Executor executor;
 
     public:
-        TensorRtLlmBackend(const std::filesystem::path &engineFolder);
+        explicit TensorRtLlmBackend(const std::filesystem::path &engineFolder);
+
+        /***
+         * Indicate if the backend is ready to accept incoming request
+         * @return true if ready, false otherwise
+         */
+        [[nodiscard]] bool IsReady() const {
+            return executor.canEnqueueRequests();
+        }
+
+        /***
+         *
+         * @param tokens
+         * @param maxNewTokens
+         * @param topK
+         * @param topP
+         * @param temperature
+         * @param minLength
+         * @param repetitionPenalty
+         * @param frequencePenalty
+         * @param seed
+         * @param nTopTokens
+         * @return
+         */
+        [[nodiscard]] tle::IdType Submit(
+                std::vector<tle::TokenIdType> &tokens,
+                int32_t maxNewTokens,
+                float_t topK,
+                float_t topP,
+                float_t temperature,
+                int32_t minLength,
+                std::optional<float_t> repetitionPenalty = std::nullopt,
+                std::optional<float_t> frequencePenalty = std::nullopt,
+                std::optional<uint64_t> seed = std::nullopt,
+                std::optional<uint32_t> nTopTokens = std::nullopt
+        );
     };
 }
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index 91df8451..dcc128a1 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -3,6 +3,49 @@
 
 #include "backend.h"
 
-huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder) {
+huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)
+        : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {
     SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);
 }
+
+tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
+        std::vector<tle::TokenIdType> &tokens,
+        const int32_t maxNewTokens,
+        const float_t topK,
+        const float_t topP,
+        const float_t temperature,
+        const int32_t minLength,
+        const std::optional<float_t> repetitionPenalty,
+        const std::optional<float_t> frequencePenalty,
+        const std::optional<uint64_t> seed,
+        const std::optional<uint32_t> nTopTokens
+) {
+    if (IsReady()) {
+        spdlog::debug(
+                "Submitting inference over {:d} tokens to the executor {:d}",
+                tokens.size(),
+                executor.getLatestIterationStats().back().numActiveRequests
+        );
+
+        const auto sampling = tle::SamplingConfig{
+                1,
+                topK,
+                topP,
+                std::nullopt,
+                std::nullopt,
+                std::nullopt,
+                seed,
+                temperature,
+                minLength,
+                std::nullopt,
+                repetitionPenalty.value_or(0.0),
+                std::nullopt,
+                frequencePenalty.value_or(1.0),
+        };
+        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
+
+        return executor.enqueueRequest(request);
+    }
+    return 0;
+}
diff --git a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs
index 6f5c960c..d7d7180d 100644
--- a/backends/trtllm/src/backend.rs
+++ b/backends/trtllm/src/backend.rs
@@ -2,6 +2,8 @@ use std::path::Path;
 
 use async_trait::async_trait;
 use cxx::UniquePtr;
+use tokenizers::Tokenizer;
+use tokio::sync::mpsc;
 use tokio_stream::wrappers::UnboundedReceiverStream;
 
 use text_generation_router::infer::{Backend, InferError, InferStreamResponse};
@@ -11,6 +13,7 @@ use crate::errors::TensorRtLlmBackendError;
 use crate::ffi::{create_trtllm_backend, TensorRtLlmBackend};
 
 pub struct TrtLLmBackend {
+    tokenizer: Tokenizer,
     inner: UniquePtr<TensorRtLlmBackend>,
 }
 
@@ -18,11 +21,14 @@ unsafe impl Sync for TrtLLmBackend {}
 unsafe impl Send for TrtLLmBackend {}
 
 impl TrtLLmBackend {
-    pub fn new<P: AsRef<Path>>(engine_folder: P) -> Result<Self, TensorRtLlmBackendError> {
+    pub fn new<P: AsRef<Path>>(
+        tokenizer: Tokenizer,
+        engine_folder: P,
+    ) -> Result<Self, TensorRtLlmBackendError> {
         let engine_folder = engine_folder.as_ref();
         let inner = create_trtllm_backend(engine_folder.to_str().unwrap());
 
-        Ok(Self { inner })
+        Ok(Self { tokenizer, inner })
     }
 }
 
@@ -30,12 +36,15 @@ impl TrtLLmBackend {
 impl Backend for TrtLLmBackend {
     fn schedule(
         &self,
-        _request: ValidGenerateRequest,
+        request: ValidGenerateRequest,
     ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
-        todo!()
+        let (sender, receiver) = mpsc::unbounded_channel();
+        let request_id = self.inner.submit();
+
+        Ok(UnboundedReceiverStream::new(receiver))
     }
 
     async fn health(&self, _current_health: bool) -> bool {
-        true
+        self.inner.is_ready()
     }
 }
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
index 215f602e..0e68c71f 100644
--- a/backends/trtllm/src/ffi.cpp
+++ b/backends/trtllm/src/ffi.cpp
@@ -14,7 +14,7 @@ namespace huggingface::tgi::backends {
      */
     std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder) {
         const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-        return std::make_unique<TensorRtLlmBackend>(enginePath);
+        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath));
     }
 }
\ No newline at end of file
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index b2c6e45b..b92fc159 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -11,5 +11,11 @@ mod ffi {
         type TensorRtLlmBackend;
 
         fn create_trtllm_backend(engine_folder: &str) -> UniquePtr<TensorRtLlmBackend>;
+
+        #[rust_name = "is_ready"]
+        fn IsReady(&self) -> bool;
+
+        #[rust_name = "submit"]
+        fn Submit(&self) -> u64;
     }
 }
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
index 0a0d28c0..3d5e6da8 100644
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@@ -1,4 +1,7 @@
+use std::collections::HashMap;
+
 use clap::Parser;
+use tokenizers::{FromPretrainedParameters, Tokenizer};
 
 use text_generation_backends_trtllm::{errors::TensorRtLlmBackendError, TrtLLmBackend};
 use text_generation_router::server;
@@ -109,7 +112,15 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
     }
 
     // Run server
-    let backend = TrtLLmBackend::new(model_id)?;
+    let tokenizer = Tokenizer::from_pretrained(
+        tokenizer_name.clone(),
+        Some(FromPretrainedParameters {
+            revision: revision.clone().unwrap_or(String::from("main")),
+            user_agent: HashMap::new(),
+            auth_token,
+        }),
+    )?;
+    let backend = TrtLLmBackend::new(tokenizer, model_id)?;
     server::run(
         backend,
         max_concurrent_requests,
diff --git a/backends/trtllm/tests/infer_test.cpp b/backends/trtllm/tests/infer_test.cpp
new file mode 100644
index 00000000..d59d0466
--- /dev/null
+++ b/backends/trtllm/tests/infer_test.cpp
@@ -0,0 +1,9 @@
+//
+// Created by mfuntowicz on 7/2/24.
+//
+#include <catch2/catch_all.hpp>
+#include "../include/backend.h"
+
+TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
+    huggingface::tgi::backends::TensorRtLlmBackend backend("fixtures/engines/llama3-8b-instruct.engine");
+}
\ No newline at end of file