From f8a1463915e4351479af7d2cb970ec3d5bba474f Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Wed, 3 Jul 2024 10:27:53 +0200
Subject: [PATCH] Enable end to end CMake build

---
 .devcontainer/Dockerfile.trtllm              |  0
 .devcontainer/devcontainer.json              |  0
 Cargo.lock                                   |  1 +
 backends/trtllm/CMakeLists.txt               | 48 +++++++------------
 backends/trtllm/Cargo.toml                   |  1 +
 backends/trtllm/build.rs                     | 42 ++++++++++++++--
 backends/trtllm/cmake/trtllm.cmake           | 15 +++++-
 .../trtllm/cmake/utils/detect_cuda_arch.cu   |  0
 backends/trtllm/include/backend.h            | 48 ++++++++++++++++---
 backends/trtllm/lib/backend.cpp              | 45 ++++++++++++++++-
 backends/trtllm/src/backend.rs               | 19 ++++++--
 backends/trtllm/src/ffi.cpp                  |  2 +-
 backends/trtllm/src/lib.rs                   |  6 +++
 backends/trtllm/src/main.rs                  | 13 ++++-
 backends/trtllm/tests/infer_test.cpp         |  9 ++++
 15 files changed, 201 insertions(+), 48 deletions(-)
 create mode 100644 .devcontainer/Dockerfile.trtllm
 create mode 100644 .devcontainer/devcontainer.json
 create mode 100644 backends/trtllm/cmake/utils/detect_cuda_arch.cu
 create mode 100644 backends/trtllm/tests/infer_test.cpp

diff --git a/.devcontainer/Dockerfile.trtllm b/.devcontainer/Dockerfile.trtllm
new file mode 100644
index 00000000..e69de29b
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 00000000..e69de29b
diff --git a/Cargo.lock b/Cargo.lock
index 74246d69..30187dff 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3466,6 +3466,7 @@ dependencies = [
  "cxx-build",
  "text-generation-router",
  "thiserror",
+ "tokenizers",
  "tokio",
  "tokio-stream",
 ]
diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
index b9daeea9..77eb1ad0 100644
--- a/backends/trtllm/CMakeLists.txt
+++ b/backends/trtllm/CMakeLists.txt
@@ -7,21 +7,28 @@ include(FetchContent)
 include(ExternalProject)
 
 option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
-set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "86-real;89-real;90-real" CACHE STRING "List of CUDA architectures to support")
+set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "native" CACHE STRING "List of CUDA architectures to support")
 set(TGI_TRTLLM_BACKEND_TRT_ROOT "/usr/local/tensorrt" CACHE PATH "Path where TensorRT libraries and headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" CACHE PATH "Path where TensorRT headers are located")
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE PATH "Path where TensorRT libraries are located")
 
-if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
-    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
-else ()
-    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
-        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
-    endif ()
+#### Unit Tests ####
+if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
+    message(STATUS "Building tests")
+    FetchContent_Declare(
+            Catch2
+            GIT_REPOSITORY https://github.com/catchorg/Catch2
+            GIT_TAG v3.6.0
+    )
+    FetchContent_MakeAvailable(Catch2)
 
-    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
-        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
-    endif ()
+    add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
+    target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain)
+
+    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
+    include(CTest)
+    include(Catch)
+    catch_discover_tests(tgi_trtllm_backend_tests)
 endif ()
 
 #### External dependencies ####
@@ -34,23 +41,4 @@ target_include_directories(tgi_trtllm_backend_impl PRIVATE
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
         $<INSTALL_INTERFACE:include>
 )
-target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
-
-#### Unit Tests ####
-if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
-    message(STATUS "Building tests")
-    FetchContent_Declare(
-            Catch2
-            GIT_REPOSITORY https://github.com/catchorg/Catch2
-            GIT_TAG v3.6.0
-    )
-    FetchContent_MakeAvailable(Catch2)
-
-    add_executable(tgi_trtllm_backend_tests)
-    target_link_libraries(tests PRIVATE Catch2::Catch2::Catch2WithMain)
-
-    list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
-    include(CTest)
-    include(Catch)
-    catch_discover_tests(tests)
-endif ()
\ No newline at end of file
+target_link_libraries(tgi_trtllm_backend_impl PUBLIC spdlog::spdlog tensorrt_llm nvinfer_plugin_tensorrt_llm)
\ No newline at end of file
diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml
index bfe59346..49b8830d 100644
--- a/backends/trtllm/Cargo.toml
+++ b/backends/trtllm/Cargo.toml
@@ -10,6 +10,7 @@ async-trait = "0.1.74"
 async-stream = "0.3.5"
 cxx = "1.0"
 text-generation-router = { path = "../../router" }
+tokenizers = { version = "0.19", features = ["hf-hub"] }
 tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.14"
 clap = { version = "4.5.4", features = ["derive"] }
diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
index 4a671712..c040c6ab 100644
--- a/backends/trtllm/build.rs
+++ b/backends/trtllm/build.rs
@@ -5,17 +5,53 @@ use cxx_build::CFG;
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
 
+// fn build_tensort_llm<P: AsRef<Path>>(tensorrt_llm_root_dir: P, is_debug: bool) -> PathBuf {
+//     let build_wheel_path = tensorrt_llm_root_dir
+//         .as_ref()
+//         .join("/scripts")
+//         .join("build_wheel.py");
+//
+//     let build_wheel_path_str = build_wheel_path.display().to_string();
+//     let mut build_wheel_args = vec![
+//         build_wheel_path_str.as_ref(),
+//         "--cpp_only",
+//         "--extra-cmake-vars BUILD_TESTS=OFF",
+//         "--extra-cmake-vars BUILD_BENCHMARKS=OFF",
+//     ];
+//
+//     if is_debug {
+//         build_wheel_args.push("--fast_build");
+//     }
+//
+//     let out = Command::new("python3")
+//         .args(build_wheel_args)
+//         .output()
+//         .expect("Failed to compile TensorRT-LLM");
+//     PathBuf::new().join(tensorrt_llm_root_dir)
+// }
+
 fn main() {
+    // Misc variables
     let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
     let build_profile = env::var("PROFILE").unwrap();
+    let is_debug = match build_profile.as_ref() {
+        "debug" => true,
+        _ => false,
+    };
+
+    // Compile TensorRT-LLM (as of today, it cannot be compiled from CMake)
+    // let trtllm_path = build_tensort_llm(
+    //     backend_path.join("build").join("_deps").join("trtllm-src"),
+    //     is_debug,
+    // );
 
     // Build the backend implementation through CMake
     let backend_path = cmake::Config::new(".")
         .uses_cxx11()
         .generator("Ninja")
-        .profile(match build_profile.as_ref() {
-            "release" => "Release",
-            _ => "Debug",
+        .profile(match is_debug {
+            true => "Debug",
+            false => "Release",
         })
         .build_target("tgi_trtllm_backend_impl")
         .build();
diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
index d785f350..1003e88e 100644
--- a/backends/trtllm/cmake/trtllm.cmake
+++ b/backends/trtllm/cmake/trtllm.cmake
@@ -1,3 +1,4 @@
+set(USE_CXX11_ABI ON)
 set(NVTX_DISABLE OFF)
 set(BUILD_PYT OFF)
 set(BUILD_PYBIND OFF)
@@ -8,6 +9,18 @@ set(TRT_INCLUDE_DIR ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 set(TRT_LIB_DIR ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
 set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
 
+#if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_ROOT})
+#    message(FATAL_ERROR "TensorRT specified location: ${TGI_TRTLLM_BACKEND_TRT_ROOT} doesn't exist")
+#else ()
+#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
+#        message(FATAL_ERROR "TensorRT headers were not found at: ${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR}")
+#    endif ()
+#
+#    if (NOT EXISTS ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR})
+#        message(FATAL_ERROR "TensorRT libraries were not found at: ${TGI_TRTLLM_BACKEND_TRT_LIB_DIR}")
+#    endif ()
+#endif ()
+
 message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
 if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
@@ -24,7 +37,7 @@ fetchcontent_declare(
 )
 fetchcontent_makeavailable(trtllm)
 message(STATUS "Found TensorRT-LLM: ${trtllm_SOURCE_DIR}")
-execute_process(COMMAND git lfs init WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
+execute_process(COMMAND git lfs install WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
 execute_process(COMMAND git lfs pull WORKING_DIRECTORY "${trtllm_SOURCE_DIR}/")
 add_subdirectory("${trtllm_SOURCE_DIR}/cpp")
 include_directories("${trtllm_SOURCE_DIR}/cpp/include")
diff --git a/backends/trtllm/cmake/utils/detect_cuda_arch.cu b/backends/trtllm/cmake/utils/detect_cuda_arch.cu
new file mode 100644
index 00000000..e69de29b
diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index 591e676a..de4409f3 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -6,19 +6,55 @@
 #define TGI_TRTLLM_BACKEND_H
 
 #include <filesystem>
+#include <vector>
 
-//#include <tensorrt_llm/runtime/common.h>
-//#include <tensorrt_llm/executor/executor.h>
-//
-//namespace tle = tensorrt_llm::executor;
+#include <tensorrt_llm/runtime/common.h>
+#include <tensorrt_llm/executor/executor.h>
+
+namespace tle = tensorrt_llm::executor;
 
 namespace huggingface::tgi::backends {
     class TensorRtLlmBackend {
     private:
-//        tle::Executor executor;
+        tle::Executor executor;
 
     public:
-        TensorRtLlmBackend(const std::filesystem::path &engineFolder);
+        explicit TensorRtLlmBackend(const std::filesystem::path &engineFolder);
+
+        /***
+         * Indicate if the backend is ready to accept incoming request
+         * @return true if ready, false otherwise
+         */
+        [[nodiscard]] bool IsReady() const {
+            return executor.canEnqueueRequests();
+        }
+
+        /***
+         *
+         * @param tokens
+         * @param maxNewTokens
+         * @param topK
+         * @param topP
+         * @param temperature
+         * @param minLength
+         * @param repetitionPenalty
+         * @param frequencePenalty
+         * @param seed
+         * @param nTopTokens
+         * @return
+         */
+        [[nodiscard]] tle::IdType Submit(
+                std::vector<tle::TokenIdType> &tokens,
+                int32_t maxNewTokens,
+                float_t topK,
+                float_t topP,
+                float_t temperature,
+                int32_t minLength,
+                std::optional<float_t> repetitionPenalty = std::nullopt,
+                std::optional<float_t> frequencePenalty = std::nullopt,
+                std::optional<uint64_t> seed = std::nullopt,
+                std::optional<uint32_t> nTopTokens = std::nullopt
+        );
     };
 }
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index 91df8451..dcc128a1 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -3,6 +3,49 @@
 
 #include "backend.h"
 
-huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder) {
+huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(const std::filesystem::path &engineFolder)
+        : executor(engineFolder, tle::ModelType::kDECODER_ONLY, tle::ExecutorConfig{}) {
     SPDLOG_INFO(FMT_STRING("Loading engines from {}"), engineFolder);
 }
+
+tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
+        std::vector<tle::TokenIdType> &tokens,
+        const int32_t maxNewTokens,
+        const float_t topK,
+        const float_t topP,
+        const float_t temperature,
+        const int32_t minLength,
+        const std::optional<float_t> repetitionPenalty,
+        const std::optional<float_t> frequencePenalty,
+        const std::optional<uint64_t> seed,
+        const std::optional<uint32_t> nTopTokens
+) {
+    if (IsReady()) {
+        spdlog::debug(
+                "Submitting inference over {:d} tokens to the executor {:d}",
+                tokens.size(),
+                executor.getLatestIterationStats().back().numActiveRequests
+        );
+
+        const auto sampling = tle::SamplingConfig{
+                1,
+                topK,
+                topP,
+                std::nullopt,
+                std::nullopt,
+                std::nullopt,
+                seed,
+                temperature,
+                minLength,
+                std::nullopt,
+                repetitionPenalty.value_or(0.0),
+                std::nullopt,
+                frequencePenalty.value_or(1.0),
+        };
+        const auto output = tle::OutputConfig{false, false, nTopTokens.value_or(1) > 1};
+        const auto request = tle::Request{std::move(tokens), maxNewTokens, true, sampling, output};
+
+        return executor.enqueueRequest(request);
+    }
+    return 0;
+}
diff --git a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs
index 6f5c960c..d7d7180d 100644
--- a/backends/trtllm/src/backend.rs
+++ b/backends/trtllm/src/backend.rs
@@ -2,6 +2,8 @@ use std::path::Path;
 
 use async_trait::async_trait;
 use cxx::UniquePtr;
+use tokenizers::Tokenizer;
+use tokio::sync::mpsc;
 use tokio_stream::wrappers::UnboundedReceiverStream;
 
 use text_generation_router::infer::{Backend, InferError, InferStreamResponse};
@@ -11,6 +13,7 @@ use crate::errors::TensorRtLlmBackendError;
 use crate::ffi::{create_trtllm_backend, TensorRtLlmBackend};
 
 pub struct TrtLLmBackend {
+    tokenizer: Tokenizer,
     inner: UniquePtr<TensorRtLlmBackend>,
 }
 
@@ -18,11 +21,14 @@ unsafe impl Sync for TrtLLmBackend {}
 unsafe impl Send for TrtLLmBackend {}
 
 impl TrtLLmBackend {
-    pub fn new<P: AsRef<Path>>(engine_folder: P) -> Result<Self, TensorRtLlmBackendError> {
+    pub fn new<P: AsRef<Path>>(
+        tokenizer: Tokenizer,
+        engine_folder: P,
+    ) -> Result<Self, TensorRtLlmBackendError> {
         let engine_folder = engine_folder.as_ref();
         let inner = create_trtllm_backend(engine_folder.to_str().unwrap());
 
-        Ok(Self { inner })
+        Ok(Self { tokenizer, inner })
     }
 }
 
@@ -30,12 +36,15 @@ impl TrtLLmBackend {
 impl Backend for TrtLLmBackend {
     fn schedule(
         &self,
-        _request: ValidGenerateRequest,
+        request: ValidGenerateRequest,
     ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
-        todo!()
+        let (sender, receiver) = mpsc::unbounded_channel();
+        let request_id = self.inner.submit();
+
+        Ok(UnboundedReceiverStream::new(receiver))
     }
 
     async fn health(&self, _current_health: bool) -> bool {
-        true
+        self.inner.is_ready()
     }
 }
diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp
index 215f602e..0e68c71f 100644
--- a/backends/trtllm/src/ffi.cpp
+++ b/backends/trtllm/src/ffi.cpp
@@ -14,7 +14,7 @@ namespace huggingface::tgi::backends {
      */
     std::unique_ptr<TensorRtLlmBackend> create_trtllm_backend(rust::Str engineFolder) {
         const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
-        return std::make_unique<TensorRtLlmBackend>(enginePath);
+        return std::make_unique<TensorRtLlmBackend>(std::move(enginePath));
     }
 }
\ No newline at end of file
diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs
index b2c6e45b..b92fc159 100644
--- a/backends/trtllm/src/lib.rs
+++ b/backends/trtllm/src/lib.rs
@@ -11,5 +11,11 @@ mod ffi {
         type TensorRtLlmBackend;
 
         fn create_trtllm_backend(engine_folder: &str) -> UniquePtr<TensorRtLlmBackend>;
+
+        #[rust_name = "is_ready"]
+        fn IsReady(&self) -> bool;
+
+        #[rust_name = "submit"]
+        fn Submit(&self) -> u64;
     }
 }
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
index 0a0d28c0..3d5e6da8 100644
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@@ -1,4 +1,7 @@
+use std::collections::HashMap;
+
 use clap::Parser;
+use tokenizers::{FromPretrainedParameters, Tokenizer};
 
 use text_generation_backends_trtllm::{errors::TensorRtLlmBackendError, TrtLLmBackend};
 use text_generation_router::server;
@@ -109,7 +112,15 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
     }
 
     // Run server
-    let backend = TrtLLmBackend::new(model_id)?;
+    let tokenizer = Tokenizer::from_pretrained(
+        tokenizer_name.clone(),
+        Some(FromPretrainedParameters {
+            revision: revision.clone().unwrap_or(String::from("main")),
+            user_agent: HashMap::new(),
+            auth_token,
+        }),
+    )?;
+    let backend = TrtLLmBackend::new(tokenizer, model_id)?;
     server::run(
         backend,
         max_concurrent_requests,
diff --git a/backends/trtllm/tests/infer_test.cpp b/backends/trtllm/tests/infer_test.cpp
new file mode 100644
index 00000000..d59d0466
--- /dev/null
+++ b/backends/trtllm/tests/infer_test.cpp
@@ -0,0 +1,9 @@
+//
+// Created by mfuntowicz on 7/2/24.
+//
+#include <catch2/catch_all.hpp>
+#include "../include/backend.h"
+
+TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
+    huggingface::tgi::backends::TensorRtLlmBackend backend("fixtures/engines/llama3-8b-instruct.engine");
+}
\ No newline at end of file