TensorRT-LLM backend bump to latest version + misc fixes (#2791)
* misc(cmake) update dependencies * feat(hardware) enable new hardware.hpp and unittests * test(ctest) enable address sanitizer * feat(backend): initial rewrite of the backend for simplicity * feat(backend): remove all the logs from hardware.hpp * feat(backend): added some logging * feat(backend): enable compiler warning if support for RVO not applying * feat(backend): missing return statement * feat(backend): introduce backend_workspace_t to store precomputed information from the engine folder * feat(backend): delete previous backend impl * feat(backend): more impl * feat(backend): use latest trtllm main version to have g++ >= 13 compatibility * feat(backend): allow overriding which Python to use * feat(backend): fix backend_exception_t -> backend_error_t naming * feat(backend): impl missing generation_step_t as return value of pull_tokens * feat(backend): make backend_workspace_t::engines_folder constexpr * feat(backend): fix main.rs retrieving the tokenizer * feat(backend): add guard to multiple header definitions * test(backend): add more unittest * feat(backend): remove constexpr from par * feat(backend): remove constexpig * test(backend): more test coverage * chore(trtllm): update dependency towards 0.15.0 * effectively cancel the request on the executor * feat(backend) fix moving backend when pulling * feat(backend): make sure we can easily cancel request on the executor * feat(backend): fix missing "0" field access * misc(backend): fix reborrowing Pin<&mut T> as described in the doc https://doc.rust-lang.org/stable/std/pin/struct.Pin.html#method.as_mut * chore: Add doc and CI for TRTLLM (#2799) * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * chore: Add doc and CI for TRTLLM * doc: Formatting * misc(backend): indent --------- Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co>
This commit is contained in:
parent
3bb3fd19ae
commit
ea7f4082c4
|
@ -8,6 +8,7 @@ on:
|
|||
description: Hardware
|
||||
# options:
|
||||
# - cuda
|
||||
# - cuda-trtllm
|
||||
# - rocm
|
||||
# - intel
|
||||
required: true
|
||||
|
@ -52,6 +53,15 @@ jobs:
|
|||
export platform=""
|
||||
export extra_pytest=""
|
||||
;;
|
||||
cuda-trtllm)
|
||||
export dockerfile="Dockerfile_trtllm"
|
||||
export label_extension="-trtllm"
|
||||
export docker_volume="/mnt/cache"
|
||||
export docker_devices=""
|
||||
export runs_on="ubuntu-latest"
|
||||
export platform=""
|
||||
export extra_pytest=""
|
||||
;;
|
||||
rocm)
|
||||
export dockerfile="Dockerfile_amd"
|
||||
export label_extension="-rocm"
|
||||
|
|
|
@ -37,7 +37,7 @@ jobs:
|
|||
# fail-fast is true by default
|
||||
fail-fast: false
|
||||
matrix:
|
||||
hardware: ["cuda", "rocm", "intel-xpu", "intel-cpu"]
|
||||
hardware: ["cuda", "cuda-trtllm", "rocm", "intel-xpu", "intel-cpu"]
|
||||
uses: ./.github/workflows/build.yaml # calls the one above ^
|
||||
permissions:
|
||||
contents: write
|
||||
|
|
|
@ -2850,20 +2850,6 @@ dependencies = [
|
|||
"urlencoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-otlp"
|
||||
version = "0.13.0"
|
||||
|
@ -2963,24 +2949,6 @@ dependencies = [
|
|||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry_sdk"
|
||||
version = "0.24.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures-channel",
|
||||
"futures-executor",
|
||||
"futures-util",
|
||||
"glob",
|
||||
"once_cell",
|
||||
"opentelemetry 0.24.0",
|
||||
"percent-encoding",
|
||||
"rand",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
|
@ -4369,7 +4337,6 @@ dependencies = [
|
|||
name = "text-generation-backends-trtllm"
|
||||
version = "3.0.2-dev0"
|
||||
dependencies = [
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"clap 4.5.21",
|
||||
"cmake",
|
||||
|
@ -4377,16 +4344,14 @@ dependencies = [
|
|||
"cxx-build",
|
||||
"hashbrown 0.14.5",
|
||||
"hf-hub",
|
||||
"log",
|
||||
"pkg-config",
|
||||
"pyo3",
|
||||
"text-generation-router",
|
||||
"thiserror",
|
||||
"tokenizers",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
"tracing-opentelemetry 0.25.0",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -5086,24 +5051,6 @@ dependencies = [
|
|||
"web-time 0.2.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry"
|
||||
version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"opentelemetry 0.24.0",
|
||||
"opentelemetry_sdk 0.24.1",
|
||||
"smallvec",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log 0.2.0",
|
||||
"tracing-subscriber",
|
||||
"web-time 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-opentelemetry-instrumentation-sdk"
|
||||
version = "0.16.0"
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
ARG CUDA_ARCH_LIST="75-real;80-real;86-real;89-real;90-real"
|
||||
ARG OMPI_VERSION="4.1.6"
|
||||
ARG OMPI_VERSION="4.1.7rc1"
|
||||
|
||||
# Build dependencies resolver stage
|
||||
FROM lukemathwalker/cargo-chef:latest AS chef
|
||||
|
@ -10,7 +10,7 @@ COPY . .
|
|||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
# CUDA dependent dependencies resolver stage
|
||||
FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
|
||||
FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
|
@ -18,18 +18,21 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
|||
build-essential \
|
||||
cmake \
|
||||
curl \
|
||||
gcc \
|
||||
g++ \
|
||||
gcc-14 \
|
||||
g++-14 \
|
||||
git \
|
||||
git-lfs \
|
||||
libssl-dev \
|
||||
libucx-dev \
|
||||
ninja-build \
|
||||
pkg-config \
|
||||
pipx \
|
||||
python3 \
|
||||
python3-dev \
|
||||
python3-setuptools \
|
||||
tar \
|
||||
wget
|
||||
wget && \
|
||||
pipx ensurepath
|
||||
|
||||
ENV TGI_INSTALL_PREFIX=/usr/local/tgi
|
||||
ENV TENSORRT_INSTALL_PREFIX=/usr/local/tensorrt
|
||||
|
@ -83,13 +86,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
|
|||
cd backends/trtllm && \
|
||||
CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
|
||||
|
||||
FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
|
||||
RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \
|
||||
FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
|
||||
RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
|
||||
rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
|
||||
python3 -m pip install transformers tokenizers
|
||||
pipx ensurepath && \
|
||||
pipx install --include-deps transformers tokenizers
|
||||
|
||||
WORKDIR /usr/local/tgi/bin
|
||||
|
||||
ENV PATH=/root/.local/share/pipx/venvs/transformers/bin/:$PATH
|
||||
ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
|
||||
ENV TOKENIZERS_PARALLELISM=false
|
||||
ENV OMPI_MCA_plm_rsh_agent=""
|
||||
|
|
|
@ -13,10 +13,11 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
|
|||
endif ()
|
||||
|
||||
project(tgi-trtllm-backend VERSION 1.0.0)
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
set(CMAKE_CXX_STANDARD 23)
|
||||
|
||||
include(FetchContent)
|
||||
include(ExternalProject)
|
||||
include(CheckCXXCompilerFlag)
|
||||
|
||||
option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF)
|
||||
option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF)
|
||||
|
@ -29,11 +30,20 @@ set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE ST
|
|||
find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
|
||||
|
||||
#### External dependencies ####
|
||||
include(cmake/fmt.cmake)
|
||||
include(cmake/json.cmake)
|
||||
include(cmake/spdlog.cmake)
|
||||
include(cmake/trtllm.cmake)
|
||||
|
||||
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
add_compile_definitions(TGI_TRTLLM_BACKEND_DEBUG=1)
|
||||
endif()
|
||||
|
||||
# This attempt to detect if the compiler can emit warning if it can't apply return value optimization from a function
|
||||
check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
|
||||
if(${COMPILER_SUPPORT_WARNING_ON_NVRO})
|
||||
set(CMAKE_CXX_FLAGS "{CMAKE_CXX_FLAGS} -Wnvro")
|
||||
endif()
|
||||
|
||||
# Let's build TRTLLM as part of CMake
|
||||
add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
|
||||
|
||||
|
@ -41,15 +51,21 @@ add_subdirectory("${trtllm_SOURCE_DIR}/cpp" "${trtllm_SOURCE_DIR}/..")
|
|||
set_target_properties(executorWorker PROPERTIES SKIP_BUILD_RPATH TRUE)
|
||||
|
||||
# TGI TRTLLM Backend definition
|
||||
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp include/hardware.h)
|
||||
add_library(tgi_trtllm_backend_impl STATIC csrc/hardware.hpp csrc/backend.hpp csrc/backend.cpp)
|
||||
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
|
||||
target_include_directories(tgi_trtllm_backend_impl PRIVATE
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
|
||||
$<INSTALL_INTERFACE:include>
|
||||
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/csrc>
|
||||
# $<INSTALL_INTERFACE:csrc>
|
||||
)
|
||||
target_include_directories(tgi_trtllm_backend_impl PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
|
||||
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapper CUDA::cudart CUDA::nvml)
|
||||
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt)
|
||||
target_link_libraries(tgi_trtllm_backend_impl PRIVATE CUDA::cudart CUDA::nvml)
|
||||
target_link_libraries(tgi_trtllm_backend_impl PUBLIC nlohmann_json::nlohmann_json spdlog::spdlog)
|
||||
|
||||
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
|
||||
else()
|
||||
target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
|
||||
endif ()
|
||||
|
||||
# This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
|
||||
install(TARGETS tgi_trtllm_backend_impl tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
|
||||
|
@ -60,16 +76,30 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS})
|
|||
message(STATUS "Building tests")
|
||||
FetchContent_Declare(
|
||||
Catch2
|
||||
GIT_REPOSITORY https://github.com/catchorg/Catch2
|
||||
GIT_TAG v3.6.0
|
||||
URL https://github.com/catchorg/Catch2/archive/refs/tags/v3.7.1.tar.gz
|
||||
)
|
||||
FetchContent_MakeAvailable(Catch2)
|
||||
|
||||
# add_executable(tgi_trtllm_backend_tests tests/infer_test.cpp)
|
||||
# target_link_libraries(tgi_trtllm_backend_tests PRIVATE tgi_trtllm_backend_impl Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog fmt::fmt CUDA::cudart CUDA::nvml)
|
||||
add_executable(tgi_trtllm_backend_tests tests/test_hardware.cpp tests/test_backend.cpp)
|
||||
target_include_directories(tgi_trtllm_backend_tests PUBLIC "${trtllm_SOURCE_DIR}/cpp/include")
|
||||
target_include_directories(tgi_trtllm_backend_tests PUBLIC "csrc/")
|
||||
target_link_libraries(tgi_trtllm_backend_tests PRIVATE ${TRTLLM_LIBS} CUDA::cudart CUDA::nvml)
|
||||
target_link_libraries(tgi_trtllm_backend_tests PUBLIC Catch2::Catch2WithMain nlohmann_json::nlohmann_json spdlog::spdlog tgi_trtllm_backend_impl)
|
||||
|
||||
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm)
|
||||
else()
|
||||
target_link_libraries(tgi_trtllm_backend_tests PRIVATE tensorrt_llm nvinfer_plugin_tensorrt_llm tensorrt_llm_nvrtc_wrapperm)
|
||||
endif ()
|
||||
|
||||
if(CMAKE_BUILD_TYPE MATCHES "Debug")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fsanitize=undefined -fsanitize=address")
|
||||
target_link_options(tgi_trtllm_backend_tests BEFORE PUBLIC -fsanitize=undefined PUBLIC -fsanitize=address)
|
||||
endif()
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
|
||||
include(CTest)
|
||||
include(Catch)
|
||||
# catch_discover_tests(tgi_trtllm_backend_tests)
|
||||
catch_discover_tests(tgi_trtllm_backend_tests)
|
||||
endif ()
|
||||
|
|
|
@ -7,20 +7,21 @@ homepage.workspace = true
|
|||
|
||||
[dependencies]
|
||||
async-trait = "0.1"
|
||||
async-stream = "0.3"
|
||||
#async-stream = "0.3"
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
cxx = "1.0"
|
||||
hashbrown = "0.14"
|
||||
hf-hub = { workspace = true }
|
||||
log = { version = "0.4", features = [] }
|
||||
#log = { version = "0.4", features = [] }
|
||||
text-generation-router = { path = "../../router" }
|
||||
tokenizers = { workspace = true }
|
||||
tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
|
||||
tokio-stream = "0.1.15"
|
||||
thiserror = "1.0.63"
|
||||
tracing = "0.1"
|
||||
tracing-opentelemetry = "0.25"
|
||||
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
|
||||
#tracing-opentelemetry = "0.25"
|
||||
#tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
|
||||
pyo3 = { workspace = true }
|
||||
|
||||
[build-dependencies]
|
||||
cmake = "0.1"
|
||||
|
|
|
@ -4,7 +4,7 @@ use std::env;
|
|||
use std::env::consts::ARCH;
|
||||
use std::path::{absolute, PathBuf};
|
||||
|
||||
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
|
||||
const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
|
||||
const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
|
||||
const CUDA_REQUIRED_VERSION: &str = "12.6";
|
||||
const MPI_REQUIRED_VERSION: &str = "4.1";
|
||||
|
@ -43,8 +43,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
|
|||
install_path = absolute(out_dir).expect("cannot happen").join(install_path);
|
||||
}
|
||||
|
||||
let _ = cmake::Config::new(".")
|
||||
.uses_cxx11()
|
||||
let mut config = cmake::Config::new(".");
|
||||
config.uses_cxx11()
|
||||
.generator("Ninja")
|
||||
.profile(match is_debug {
|
||||
true => "Debug",
|
||||
|
@ -53,9 +53,16 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
|
|||
.env("OPT_LEVEL", opt_level)
|
||||
.define("CMAKE_INSTALL_PREFIX", &install_path)
|
||||
.define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc")
|
||||
.define("Python3_ROOT_DIR", "../venv")
|
||||
.define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list)
|
||||
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path)
|
||||
.build();
|
||||
.define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path);
|
||||
|
||||
// Allow to override which Python to use ...
|
||||
if let Some(python3) = option_env!("Python3_EXECUTABLE") {
|
||||
config.define("Python3_EXECUTABLE", python3);
|
||||
}
|
||||
|
||||
config.build();
|
||||
|
||||
// Additional transitive CMake dependencies
|
||||
let deps_folder = out_dir.join("build").join("_deps");
|
||||
|
@ -90,26 +97,25 @@ fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
|
|||
CFG.include_prefix = "backends/trtllm";
|
||||
cxx_build::bridge("src/lib.rs")
|
||||
.static_flag(true)
|
||||
.include(deps_folder.join("fmt-src").join("include"))
|
||||
.std("c++23")
|
||||
.include(deps_folder.join("spdlog-src").join("include"))
|
||||
.include(deps_folder.join("json-src").join("include"))
|
||||
.include(deps_folder.join("trtllm-src").join("cpp").join("include"))
|
||||
.include("/usr/local/cuda/include")
|
||||
.include("/usr/local/tensorrt/include")
|
||||
.file("src/ffi.cpp")
|
||||
.std("c++20")
|
||||
.define("NDEBUG", ndebug)
|
||||
.include("csrc/")
|
||||
.file("csrc/ffi.hpp")
|
||||
.define("TGI_TRTLLM_BACKEND_DEBUG", ndebug)
|
||||
.compile("tgi_trtllm_backend");
|
||||
|
||||
println!("cargo:rerun-if-changed=CMakeLists.txt");
|
||||
println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
|
||||
println!("cargo:rerun-if-changed=cmake/json.cmake");
|
||||
println!("cargo:rerun-if-changed=cmake/fmt.cmake");
|
||||
println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
|
||||
println!("cargo:rerun-if-changed=include/backend.h");
|
||||
println!("cargo:rerun-if-changed=lib/backend.cpp");
|
||||
println!("cargo:rerun-if-changed=include/ffi.h");
|
||||
println!("cargo:rerun-if-changed=src/ffi.cpp");
|
||||
println!("cargo:rerun-if-changed=csrc/backend.hpp");
|
||||
println!("cargo:rerun-if-changed=csrc/backend.cpp");
|
||||
println!("cargo:rerun-if-changed=csrc/hardware.hpp");
|
||||
println!("cargo:rerun-if-changed=csrc/ffi.hpp");
|
||||
}
|
||||
|
||||
fn main() {
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
FetchContent_Declare(
|
||||
fmt
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP
|
||||
URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz
|
||||
)
|
||||
FetchContent_MakeAvailable(fmt)
|
|
@ -1,6 +1,6 @@
|
|||
fetchcontent_declare(
|
||||
json
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP
|
||||
URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
|
||||
# DOWNLOAD_EXTRACT_TIMESTAMP
|
||||
URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz
|
||||
)
|
||||
fetchcontent_makeavailable(json)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
set(SPDLOG_USE_FMT ON)
|
||||
set(SPDLOG_BUILD_SHARED OFF)
|
||||
set(SPDLOG_FMT_EXTERNAL ON)
|
||||
set(SPDLOG_FMT_EXTERNAL OFF)
|
||||
|
||||
# Define the level at which SPDLOG_ compilation level is defined
|
||||
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
|
@ -11,7 +11,7 @@ endif ()
|
|||
|
||||
fetchcontent_declare(
|
||||
spdlog
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP
|
||||
# DOWNLOAD_EXTRACT_TIMESTAMP
|
||||
URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
|
||||
)
|
||||
fetchcontent_makeavailable(spdlog)
|
||||
|
|
|
@ -11,6 +11,7 @@ set(CMAKE_CUDA_ARCHITECTURES ${TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST})
|
|||
|
||||
message(STATUS "Building for CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
set(ENABLE_UCX OFF)
|
||||
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
set(FAST_BUILD ON)
|
||||
set(NVTX_DISABLE OFF)
|
||||
|
@ -20,11 +21,13 @@ else ()
|
|||
set(NVTX_DISABLE ON)
|
||||
endif ()
|
||||
|
||||
find_package(Python3 REQUIRED Interpreter)
|
||||
|
||||
fetchcontent_declare(
|
||||
trtllm
|
||||
GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
|
||||
GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc
|
||||
GIT_SHALLOW FALSE
|
||||
GIT_REPOSITORY https://github.com/huggingface/TensorRT-LLM.git
|
||||
GIT_TAG 1bb9ca4688805444f203647674bac1d7219d0579
|
||||
GIT_SHALLOW ON
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP
|
||||
)
|
||||
fetchcontent_makeavailable(trtllm)
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
#include <ranges>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "backend.hpp"
|
||||
#include "hardware.hpp"
|
||||
|
||||
namespace huggingface::tgi::backends::trtllm {
|
||||
tle::ParallelConfig backend_workspace_t::parallel_config() const {
|
||||
// Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
|
||||
const auto world_size = config_["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
|
||||
|
||||
auto mode = tle::CommunicationMode::kLEADER;
|
||||
std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
|
||||
|
||||
if (world_size > 1) {
|
||||
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
|
||||
mode = tle::CommunicationMode::kORCHESTRATOR;
|
||||
orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, executor_worker_path_, nullptr, true);
|
||||
} else {
|
||||
SPDLOG_INFO("Detected single engine deployment, using leader mode");
|
||||
}
|
||||
|
||||
return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
|
||||
}
|
||||
|
||||
|
||||
tle::ExecutorConfig backend_workspace_t::executor_config() const {
|
||||
// Retrieve the compute capabilities to enable some options at runtime
|
||||
const auto compute_capabilities = hardware::cuda::compute_capabilities_t();
|
||||
|
||||
// Allocate the config
|
||||
tle::ExecutorConfig executor_config(/* maxBeamWidth = */ 1);
|
||||
|
||||
// Set the parallel config as inferred
|
||||
executor_config.setParallelConfig(parallel_config());
|
||||
|
||||
// Define some configuration variables
|
||||
executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
|
||||
executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
|
||||
executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
|
||||
return executor_config;
|
||||
}
|
||||
|
||||
backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
|
||||
: workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
|
||||
|
||||
size_t backend_t::num_tokens_ready() const noexcept {
|
||||
return executor_.getNumResponsesReady();
|
||||
}
|
||||
|
||||
std::expected<request_id_t, backend_error_t>
|
||||
backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
|
||||
SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
|
||||
return executor_.enqueueRequest(tle::Request {
|
||||
{token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
|
||||
static_cast<tle::SizeType32>(generation_params.max_new_tokens),
|
||||
true,
|
||||
(tle::SamplingConfig) sampling_params,
|
||||
tle::OutputConfig { /* returnLogProbs= */ true },
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
workspace.generation_config().stop_words
|
||||
});
|
||||
}
|
||||
|
||||
std::vector<tle::Response> backend_t::pull_tokens() noexcept {
|
||||
SPDLOG_TRACE(FMT_STRING("Pulling out tokens ({:d} available)"), num_tokens_ready());
|
||||
return executor_.awaitResponses();
|
||||
}
|
||||
|
||||
void backend_t::cancel(request_id_t request_id) noexcept {
|
||||
SPDLOG_TRACE(FMT_STRING("Cancelling request: {:d}"), request_id);
|
||||
executor_.cancelRequest(request_id);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,231 @@
|
|||
#ifndef TGI_BACKEND_TRTLLM
|
||||
#define TGI_BACKEND_TRTLLM
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <expected>
|
||||
#include <fstream>
|
||||
#include <list>
|
||||
#include <span>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <spdlog/fmt/fmt.h>
|
||||
|
||||
#include <tensorrt_llm/executor/executor.h>
|
||||
|
||||
namespace huggingface::tgi::backends::trtllm {
|
||||
namespace tle = tensorrt_llm::executor;
|
||||
using json = nlohmann::json;
|
||||
using request_id_t = uint64_t;
|
||||
using token_id_t = tle::TokenIdType;
|
||||
|
||||
/**
|
||||
* Represent the parameters used for generation
|
||||
*/
|
||||
struct generation_params_t {
|
||||
uint32_t max_new_tokens;
|
||||
};
|
||||
|
||||
/**
|
||||
* Represent the parameters used to sample tokens from the logit distribution
|
||||
*/
|
||||
struct sampling_params_t {
|
||||
uint32_t top_k;
|
||||
float_t top_p;
|
||||
float_t repetition_penalty;
|
||||
float_t frequency_penalty;
|
||||
float_t temperature;
|
||||
uint64_t seed;
|
||||
|
||||
constexpr explicit operator tle::SamplingConfig() const {
|
||||
return tle::SamplingConfig{
|
||||
1,
|
||||
top_k,
|
||||
top_p,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
seed,
|
||||
temperature,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
repetition_penalty,
|
||||
std::nullopt,
|
||||
frequency_penalty,
|
||||
std::nullopt
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Represent possible values from transformers generation `generation_config.json`.
|
||||
* It usually stores default sampling parameters to use, such as top_p, temperature, etc.
|
||||
*/
|
||||
struct generation_config_t {
|
||||
float_t top_p;
|
||||
float_t temperature;
|
||||
std::list<std::vector<int32_t>> stop_words;
|
||||
|
||||
constexpr explicit generation_config_t(const json &config) :
|
||||
top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
|
||||
if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
|
||||
const auto &eos_token_id = config["/eos_token_id"_json_pointer];
|
||||
std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
|
||||
stop_words.emplace_back(1, token_id.template get<int32_t>());
|
||||
});
|
||||
|
||||
SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Helper class representing various items which are stored within the TensorRT-LLM engines folder and
|
||||
* can be retrieved at runtime
|
||||
*/
|
||||
class backend_workspace_t {
|
||||
private:
|
||||
constexpr static auto as_json = [](const std::filesystem::path &path) -> json {
|
||||
std::ifstream config_f(path);
|
||||
return json::parse(config_f);
|
||||
};
|
||||
|
||||
std::filesystem::path engines_folder_;
|
||||
std::filesystem::path executor_worker_path_;
|
||||
json config_;
|
||||
generation_config_t generation_config_;
|
||||
|
||||
public:
|
||||
backend_workspace_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path) :
|
||||
engines_folder_(engines_folder),
|
||||
executor_worker_path_(executor_worker_path),
|
||||
config_(as_json(engines_folder / "config.json")),
|
||||
generation_config_(as_json(engines_folder / "generation_config.json")) {};
|
||||
|
||||
backend_workspace_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path) :
|
||||
engines_folder_(engines_folder),
|
||||
executor_worker_path_(executor_worker_path),
|
||||
config_(as_json(engines_folder / "config.json")),
|
||||
generation_config_(as_json(engines_folder / "generation_config.json")) {};
|
||||
|
||||
/**
|
||||
* Path to the folder containing the TensorRT-LLM engines
|
||||
* @return local filesystem path to the folder
|
||||
*/
|
||||
[[nodiscard]] constexpr std::filesystem::path engines_folder() const { return engines_folder_; }
|
||||
|
||||
/**
|
||||
* Hugging Face transformers' generated `generation_config_t` mapping information stored in the
|
||||
* `generation_config.json` holding default generation parameters.
|
||||
* @return `generation_config_t`
|
||||
*/
|
||||
[[nodiscard]] constexpr const generation_config_t &generation_config() const { return generation_config_; }
|
||||
|
||||
/**
|
||||
* Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
|
||||
* to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information
|
||||
* @return `tensorrt_llm::executor::ParallelConfig` instance
|
||||
*/
|
||||
[[nodiscard]] tle::ParallelConfig parallel_config() const;
|
||||
|
||||
/**
|
||||
* Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used
|
||||
* to initialize `tensorrt_llm::executor::Executor`
|
||||
* @return `tensorrt_llm::executor::ExecutorConfig` instance
|
||||
*/
|
||||
[[nodiscard]] tle::ExecutorConfig executor_config() const;
|
||||
};
|
||||
|
||||
/**
|
||||
* Error raised by the underlying backend implementation
|
||||
*/
|
||||
enum backend_error_t {
|
||||
EXECUTOR_NOT_READY = 3,
|
||||
EXECUTOR_SCHEDULING_FAILED = 4,
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to
|
||||
* - schedule new request
|
||||
* - pull status of submitted request(s)
|
||||
* - cancel submitted request(s)
|
||||
*/
|
||||
class backend_t {
|
||||
private:
|
||||
backend_workspace_t workspace;
|
||||
tle::Executor executor_;
|
||||
|
||||
public:
|
||||
backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
|
||||
|
||||
backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
|
||||
: backend_t(engines_folder, executor_worker_path) {};
|
||||
|
||||
/**
|
||||
* Submit a new request to the executor
|
||||
* @param token_ids
|
||||
* @param generation_params
|
||||
* @param sampling_params
|
||||
* @return Either newly submitted request's id or the error why it failed to submit
|
||||
*/
|
||||
[[nodiscard("Discarded executor request_id needs to be assigned")]]
|
||||
std::expected<request_id_t, backend_error_t>
|
||||
submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,
|
||||
sampling_params_t sampling_params) noexcept;
|
||||
|
||||
/**
|
||||
* Query the number of tokens available across all in-flight generations
|
||||
* @return
|
||||
*/
|
||||
[[nodiscard("Pulling out the number of tokens")]]
|
||||
size_t num_tokens_ready() const noexcept;
|
||||
|
||||
/**
|
||||
* Pull out newly generated tokens from the executor
|
||||
* @return
|
||||
*/
|
||||
[[nodiscard("")]]
|
||||
std::vector<tle::Response> pull_tokens() noexcept;
|
||||
|
||||
/**
|
||||
* Cancel the specified request on the executor' set
|
||||
* @param request_id Request's Identifier to remove from the in-flight executor
|
||||
*/
|
||||
void cancel(request_id_t) noexcept;
|
||||
};
|
||||
|
||||
/**
|
||||
* Create a TensorRT-LLM executor from a workspace
|
||||
*/
|
||||
const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
|
||||
return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,
|
||||
workspace.executor_config()};
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper structures to define formatting strategies for various types in the backend
|
||||
*/
|
||||
template<>
|
||||
struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t> : formatter<string_view> {
|
||||
auto format(huggingface::tgi::backends::trtllm::generation_params_t const &c,
|
||||
format_context &ctx) const -> format_context::iterator {
|
||||
return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens);
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t> : formatter<string_view> {
|
||||
auto format(huggingface::tgi::backends::trtllm::sampling_params_t const &c,
|
||||
format_context &ctx) const -> format_context::iterator {
|
||||
return fmt::format_to(
|
||||
ctx.out(),
|
||||
"sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
|
||||
c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.temperature, c.seed
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,162 @@
|
|||
#ifndef TGI_BACKEND_TRTLLM_FFI
|
||||
#define TGI_BACKEND_TRTLLM_FFI
|
||||
|
||||
#include <memory>
|
||||
#include <thread>
|
||||
|
||||
#include <nvml.h>
|
||||
#include <tensorrt_llm/common/tllmException.h>
|
||||
#include <tensorrt_llm/plugins/api/tllmPlugin.h>
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <backend.hpp>
|
||||
#include <hardware.hpp>
|
||||
|
||||
namespace rust::behavior {
|
||||
template<typename Try, typename Fail>
|
||||
static void trycatch(Try &&func, Fail &&fail) noexcept try {
|
||||
func();
|
||||
} catch (tensorrt_llm::common::TllmException &e) {
|
||||
fail(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
namespace huggingface::tgi::backends::trtllm {
|
||||
class tensorrt_llm_backend_t;
|
||||
}
|
||||
|
||||
#include "backends/trtllm/src/lib.rs.h"
|
||||
|
||||
namespace huggingface::tgi::backends::trtllm {
|
||||
std::once_flag backend_initialized_flag;
|
||||
|
||||
class tensorrt_llm_backend_t {
|
||||
private:
|
||||
backend_t inner_;
|
||||
|
||||
public:
|
||||
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
|
||||
: inner_(engine_folder, executor_worker_path) {}
|
||||
|
||||
size_t num_tokens_ready() const noexcept {
|
||||
return inner_.num_tokens_ready();
|
||||
}
|
||||
|
||||
request_id_t submit(
|
||||
rust::Slice<const uint32_t> tokens,
|
||||
uint32_t max_new_tokens,
|
||||
uint32_t top_k,
|
||||
float_t top_p,
|
||||
float_t temperature,
|
||||
float_t repetition_penalty,
|
||||
float_t frequency_penalty,
|
||||
uint64_t seed
|
||||
) {
|
||||
// This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
|
||||
SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));
|
||||
|
||||
// Submit the request to the executor and get back a potential request_id used to track request status
|
||||
const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
|
||||
const auto maybe_request_id = inner_.submit(
|
||||
signed_tokens,
|
||||
{max_new_tokens},
|
||||
{top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
|
||||
);
|
||||
|
||||
// If we do have a value, let's return the request_id
|
||||
if(maybe_request_id.has_value()) [[likely]] {
|
||||
return *maybe_request_id;
|
||||
} else {
|
||||
SPDLOG_WARN("[FFI] Failed to submit request to the executor");
|
||||
return maybe_request_id.error();
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
|
||||
if(num_tokens_ready() > 0) [[likely]] {
|
||||
const auto responses = inner_.pull_tokens();
|
||||
|
||||
SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
|
||||
// Transform tle::Response to GenerationStep
|
||||
auto steps = std::make_unique<std::vector<generation_step_t>>();
|
||||
std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
|
||||
const auto reqId = r.getRequestId();
|
||||
if (!r.hasError()) [[likely]] {
|
||||
const auto result = r.getResult();
|
||||
return generation_step_t{
|
||||
reqId,
|
||||
static_cast<uint32_t>(result.outputTokenIds[0][0]),
|
||||
result.logProbs.value()[0][0],
|
||||
result.isFinal,
|
||||
false,
|
||||
std::string()
|
||||
};
|
||||
} else {
|
||||
return generation_step_t{
|
||||
reqId,
|
||||
0,
|
||||
0.0,
|
||||
true,
|
||||
true,
|
||||
std::move(r.getErrorMsg())
|
||||
};
|
||||
}
|
||||
});
|
||||
return steps;
|
||||
|
||||
} else {
|
||||
return std::make_unique<std::vector<generation_step_t>>();
|
||||
}
|
||||
}
|
||||
|
||||
void cancel(request_id_t requestId) noexcept {
|
||||
SPDLOG_DEBUG("[FFI] cancelling request {:d}", requestId);
|
||||
inner_.cancel(requestId);
|
||||
}
|
||||
};
|
||||
|
||||
void initialize_logging() {
|
||||
#ifndef TGI_TRTLLM_BACKEND_DEBUG
|
||||
if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
|
||||
std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
|
||||
std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
|
||||
return std::tolower(c);
|
||||
});
|
||||
|
||||
if (log_level == "debug")
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
else
|
||||
spdlog::set_level(spdlog::level::info);
|
||||
}
|
||||
#else
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
#endif
|
||||
}
|
||||
|
||||
void initialize_tensorrt_llm_backend() {
|
||||
SPDLOG_INFO("Initializing TGI - TensoRT-LLM Backend (v{})", tle::version());
|
||||
|
||||
// Initialize everyone
|
||||
initialize_logging();
|
||||
nvmlInit_v2();
|
||||
initTrtLlmPlugins();
|
||||
|
||||
const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count();
|
||||
if (numGpus.has_value()) {
|
||||
SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", *numGpus);
|
||||
} else {
|
||||
SPDLOG_WARN("[FFI] Failed to detected Nvidia GPU(s) on the system");
|
||||
// todo: throw
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
|
||||
std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
|
||||
return std::make_unique<tensorrt_llm_backend_t>(
|
||||
std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
|
||||
std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format)
|
||||
);
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,81 @@
|
|||
#ifndef TGI_HARDWARE_CUDA
|
||||
#define TGI_HARDWARE_CUDA
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
|
||||
#include <nvml.h>
|
||||
|
||||
namespace huggingface::tgi::hardware::cuda {
|
||||
static constexpr auto VOLTA = std::make_tuple(7u, 0u);
|
||||
static constexpr auto TURING = std::make_tuple(7u, 5u);
|
||||
static constexpr auto AMPERE = std::make_tuple(8u, 0u);
|
||||
static constexpr auto HOPPER = std::make_tuple(9u, 0u);
|
||||
static constexpr auto ADA_LOVELACE = std::make_tuple(8u, 9u);
|
||||
|
||||
/**
|
||||
* Get the number of GPUs on the local machine
|
||||
* @return std::nullopt if no device is available, otherwise >= 1
|
||||
*/
|
||||
inline std::optional<size_t> get_device_count() {
|
||||
uint32_t numGpus = 0;
|
||||
if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
|
||||
return numGpus;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
/**
|
||||
* Store information about the version of the CUDA Compute Capabilities detected on the device
|
||||
*/
|
||||
struct compute_capabilities_t {
|
||||
int32_t major;
|
||||
int32_t minor;
|
||||
|
||||
compute_capabilities_t(): compute_capabilities_t(0) {}
|
||||
explicit compute_capabilities_t(size_t device_idx): major(-1), minor(-1) {
|
||||
nvmlDevice_t device;
|
||||
if (nvmlDeviceGetHandleByIndex_v2(device_idx, &device) == NVML_SUCCESS) {
|
||||
nvmlDeviceGetCudaComputeCapability(device, &major, &minor);
|
||||
}
|
||||
};
|
||||
compute_capabilities_t(int32_t major, int32_t minor): major(major), minor(minor) {}
|
||||
|
||||
/**
|
||||
* Evaluate if the underlying capabilities is at least greater or equals to the provided 2-tuple (major, minor)
|
||||
* @param sm Architecture version (major, minor)
|
||||
* @return True if greater or equals to the underlying compute capabilities
|
||||
*/
|
||||
[[nodiscard]] constexpr auto is_at_least(std::tuple<uint32_t, uint32_t> sm) const -> decltype(auto) { return std::tie(major, minor) >= sm; }
|
||||
|
||||
/**
|
||||
* Check if the capabilities match at least Volta architecture (sm_70)
|
||||
* @return true if at least Volta (>= sm_70), false otherwise
|
||||
*/
|
||||
[[nodiscard]] constexpr bool is_at_least_volta() const { return is_at_least(VOLTA); }
|
||||
|
||||
/**
|
||||
* Check if the capabilities match at least Turing architecture (sm_75)
|
||||
* @return true if at least Turing (>= sm_75), false otherwise
|
||||
*/
|
||||
[[nodiscard]] constexpr bool is_at_least_turing() const { return is_at_least(TURING); }
|
||||
|
||||
/**
|
||||
* Check if the capabilities match at least Ampere architecture (sm_80)
|
||||
* @return true if at least Ampere (>= sm_80), false otherwise
|
||||
*/
|
||||
[[nodiscard]] constexpr bool is_at_least_ampere() const { return is_at_least(AMPERE); }
|
||||
|
||||
/**
|
||||
* Check if the capabilities match at least Ada Lovelace architecture (sm_89)
|
||||
* @return true if at least Ada Lovelace (>= sm_89), false otherwise
|
||||
*/
|
||||
[[nodiscard]] constexpr bool is_at_least_ada_lovelace() const { return is_at_least(ADA_LOVELACE); }
|
||||
|
||||
/**
|
||||
* Check if the capabilities match at least Hopper architecture (sm_90)
|
||||
* @return true if at least Hopper (>= sm_90), false otherwise
|
||||
*/
|
||||
[[nodiscard]] constexpr bool is_at_least_hopper() const { return is_at_least(HOPPER); }
|
||||
};
|
||||
}
|
||||
#endif
|
|
@ -1,144 +0,0 @@
|
|||
//
|
||||
// Created by Morgan Funtowicz on 6/30/24.
|
||||
//
|
||||
|
||||
#ifndef TGI_TRTLLM_BACKEND_H
|
||||
#define TGI_TRTLLM_BACKEND_H
|
||||
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
#include <filesystem>
|
||||
#include <span>
|
||||
#include <vector>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
#include <tensorrt_llm/runtime/common.h>
|
||||
#include <tensorrt_llm/executor/executor.h>
|
||||
#include <tensorrt_llm/plugins/api/tllmPlugin.h>
|
||||
|
||||
using json = nlohmann::json;
|
||||
namespace tle = tensorrt_llm::executor;
|
||||
|
||||
|
||||
#define CAST_SIZETYPE(x) static_cast<tle::SizeType32>(x)
|
||||
|
||||
namespace huggingface::tgi::backends {
|
||||
using RequestId = tle::IdType;
|
||||
using TokenId = tle::TokenIdType;
|
||||
|
||||
const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false);
|
||||
constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING(
|
||||
"Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})");
|
||||
constexpr auto FMT_EXECUTOR_STATS = FMT_STRING(
|
||||
"Submitting inference [{}] to the executor ({:d} already in-flight)");
|
||||
constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING(
|
||||
"Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}");
|
||||
|
||||
/**
|
||||
* Initialize all the components required by TRTLLM.
|
||||
* It is required to call this function before attempting to load any engine
|
||||
*/
|
||||
void InitializeBackend();
|
||||
|
||||
/**
|
||||
* Initialize logging mechanism
|
||||
*/
|
||||
void InitializeLogging();
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param config TensorRT-LLM configuration object
|
||||
* @param workerPath Path to the "executorWorker" provided by TensorRT-LLM when using orchestrator mode
|
||||
* @return
|
||||
*/
|
||||
tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath);
|
||||
|
||||
/**
|
||||
*
|
||||
* @param worldSize
|
||||
* @param workerPath
|
||||
* @return
|
||||
*/
|
||||
tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept;
|
||||
|
||||
/**
|
||||
* Get the sampling configuration from the parameters provided by TGI
|
||||
* @param topK
|
||||
* @param topP
|
||||
* @param temperature
|
||||
* @param repetition_penalty
|
||||
* @param frequency_penalty
|
||||
* @param seed
|
||||
* @return
|
||||
*/
|
||||
tle::SamplingConfig GetSamplingConfig(
|
||||
uint32_t topK,
|
||||
float_t topP,
|
||||
float_t temperature,
|
||||
float_t repetition_penalty,
|
||||
float_t frequency_penalty,
|
||||
uint64_t seed
|
||||
) noexcept;
|
||||
|
||||
/**
|
||||
* Attempt to retrieve the
|
||||
* @param generationConfigPath
|
||||
* @return
|
||||
*/
|
||||
std::optional<std::list<std::vector<TokenId>>>
|
||||
GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
class TensorRtLlmBackend {
|
||||
private:
|
||||
const json config;
|
||||
tle::Executor executor;
|
||||
|
||||
/** Frequently accessed variables cached here **/
|
||||
uint32_t maxNumTokens;
|
||||
std::list<std::vector<TokenId>> stopWords;
|
||||
|
||||
public:
|
||||
explicit TensorRtLlmBackend(
|
||||
const std::filesystem::path &engineFolder,
|
||||
const std::filesystem::path &executorWorker
|
||||
);
|
||||
|
||||
/**
|
||||
* Query the executor for the number of token available for pulling
|
||||
* @return
|
||||
*/
|
||||
[[nodiscard]] size_t NumResponsesReady() const;
|
||||
|
||||
/**
|
||||
* Submit a new generation task to the executor
|
||||
* @param tokens
|
||||
* @param topK
|
||||
* @param topP
|
||||
* @param temperature
|
||||
* @param repetitionPenalty
|
||||
* @param frequencyPenalty
|
||||
* @param seed
|
||||
* @return Request id related to this generation for reference
|
||||
*/
|
||||
[[nodiscard]] RequestId Submit(
|
||||
const std::vector<TokenId> &tokens,
|
||||
uint32_t maxNewTokens,
|
||||
int32_t topK,
|
||||
float_t topP,
|
||||
float_t temperature,
|
||||
float_t repetitionPenalty,
|
||||
float_t frequencyPenalty,
|
||||
uint64_t seed
|
||||
);
|
||||
|
||||
[[nodiscard]] std::vector<tle::Response> PullNewTokens();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
#endif //TGI_TRTLLM_BACKEND_H
|
|
@ -1,75 +0,0 @@
|
|||
//
|
||||
// Created by mfuntowicz on 7/11/24.
|
||||
//
|
||||
|
||||
#ifndef TGI_TRTLLM_BACKEND_FFI_H
|
||||
#define TGI_TRTLLM_BACKEND_FFI_H
|
||||
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include "backend.h"
|
||||
|
||||
namespace huggingface::tgi::backends {
|
||||
class TensorRtLlmBackendImpl;
|
||||
}
|
||||
|
||||
// Template to support returning error from TllmException back to Rust in a Result<>
|
||||
#include <tensorrt_llm/common/tllmException.h>
|
||||
|
||||
namespace rust::behavior {
|
||||
template<typename Try, typename Fail>
|
||||
static void trycatch(Try &&func, Fail &&fail) noexcept try {
|
||||
func();
|
||||
} catch (tensorrt_llm::common::TllmException &e) {
|
||||
fail(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
#include "backends/trtllm/src/lib.rs.h"
|
||||
|
||||
namespace huggingface::tgi::backends {
|
||||
|
||||
class TensorRtLlmBackendImpl : public TensorRtLlmBackend {
|
||||
public:
|
||||
/***
|
||||
*
|
||||
* @param engineFolder
|
||||
* @param executorWorker
|
||||
*/
|
||||
TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker);
|
||||
|
||||
/***
|
||||
*
|
||||
* @param tokens
|
||||
* @param maxNewTokens
|
||||
* @param topK
|
||||
* @param topP
|
||||
* @param temperature
|
||||
* @param repetition_penalty
|
||||
* @param frequency_penalty
|
||||
* @param seed
|
||||
* @return
|
||||
*/
|
||||
[[nodiscard("returned request id should be used to refer to the request's generation result later on")]]
|
||||
uint64_t
|
||||
Submit(rust::Slice<const uint32_t> tokens, uint32_t maxNewTokens,
|
||||
int32_t topK, float_t topP, float_t temperature,
|
||||
float_t repetition_penalty, float_t frequency_penalty, uint64_t seed);
|
||||
|
||||
/***
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
std::unique_ptr<std::vector<GenerationStep>> PullTokens();
|
||||
};
|
||||
|
||||
/***
|
||||
*
|
||||
* @param engineFolder
|
||||
* @return
|
||||
*/
|
||||
std::unique_ptr<TensorRtLlmBackendImpl> CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker);
|
||||
}
|
||||
|
||||
#endif //TGI_TRTLLM_BACKEND_FFI_H
|
|
@ -1,59 +0,0 @@
|
|||
//
|
||||
// Created by mfuntowicz on 7/23/24.
|
||||
//
|
||||
|
||||
#ifndef TGI_TRTLLM_BACKEND_HARDWARE_H
|
||||
#define TGI_TRTLLM_BACKEND_HARDWARE_H
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <fmt/base.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <nvml.h>
|
||||
|
||||
namespace huggingface::hardware::cuda {
|
||||
|
||||
#define AMPERE_SM_MAJOR 8
|
||||
#define HOPPER_SM_MAJOR 9
|
||||
|
||||
/**
|
||||
* Store information about the version of the CUDA Compute Capabilities detected on the device
|
||||
*/
|
||||
struct CudaComputeCapabilities {
|
||||
int32_t major;
|
||||
int32_t minor;
|
||||
|
||||
[[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; }
|
||||
|
||||
[[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; }
|
||||
};
|
||||
|
||||
CudaComputeCapabilities GetCudaComputeCapabilities() {
|
||||
// Get the compute capabilities of the current hardware
|
||||
nvmlDevice_t device;
|
||||
CudaComputeCapabilities capabilities{0, 0};
|
||||
if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
|
||||
SPDLOG_DEBUG("Successfully acquired nvmlDevice_t = 0");
|
||||
if (nvmlDeviceGetCudaComputeCapability(device, &capabilities.major, &capabilities.minor) == NVML_SUCCESS) {
|
||||
SPDLOG_INFO("Detected sm_{:d}{:d} compute capabilities", capabilities.major, capabilities.minor);
|
||||
}
|
||||
}
|
||||
|
||||
return capabilities;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of GPU detected. If no GPU is detected, return size_t::max()
|
||||
* @return
|
||||
*/
|
||||
std::optional<size_t> GetNumDevices() {
|
||||
uint32_t numGpus = 0;
|
||||
if (nvmlDeviceGetCount_v2(&numGpus) == NVML_SUCCESS) {
|
||||
return std::optional(numGpus);
|
||||
} else {
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif //TGI_TRTLLM_BACKEND_HARDWARE_H
|
|
@ -1,203 +0,0 @@
|
|||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
|
||||
#include <fmt/ranges.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <nvml.h>
|
||||
|
||||
#include "backend.h"
|
||||
#include "hardware.h"
|
||||
|
||||
|
||||
void huggingface::tgi::backends::InitializeLogging() {
|
||||
#ifdef NDEBUG
|
||||
if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
|
||||
std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
|
||||
std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
|
||||
return std::tolower(c);
|
||||
});
|
||||
|
||||
if (log_level == "debug")
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
else
|
||||
spdlog::set_level(spdlog::level::info);
|
||||
}
|
||||
#else
|
||||
spdlog::set_level(spdlog::level::debug);
|
||||
#endif
|
||||
}
|
||||
|
||||
void huggingface::tgi::backends::InitializeBackend() {
|
||||
SPDLOG_INFO("Initializing Backend...");
|
||||
nvmlInit_v2();
|
||||
initTrtLlmPlugins();
|
||||
|
||||
InitializeLogging();
|
||||
|
||||
SPDLOG_INFO("Backend Executor Version: {}", tle::version());
|
||||
const auto numGpus = huggingface::hardware::cuda::GetNumDevices();
|
||||
if (numGpus.has_value()) {
|
||||
SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value());
|
||||
} else {
|
||||
SPDLOG_WARN("Failed to detected Nvidia GPU(s) on the system");
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]]
|
||||
tle::ParallelConfig
|
||||
huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept {
|
||||
auto mode = tle::CommunicationMode::kLEADER;
|
||||
std::optional<tle::OrchestratorConfig> orchestratorConfig = std::nullopt;
|
||||
|
||||
if (worldSize > 1) {
|
||||
SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode");
|
||||
mode = tle::CommunicationMode::kORCHESTRATOR;
|
||||
orchestratorConfig = std::make_optional<tle::OrchestratorConfig>(true, workerPath, nullptr, true);
|
||||
} else {
|
||||
SPDLOG_INFO("Detected single engine deployment, using leader mode");
|
||||
}
|
||||
|
||||
return tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig);
|
||||
}
|
||||
|
||||
[[nodiscard]]
|
||||
tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) {
|
||||
tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1);
|
||||
|
||||
// Retrieve the compute capabilities to enable some options at runtime
|
||||
const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities();
|
||||
|
||||
// Single engine (TP = PP = 1) -> using leader mode (no MPI involved)
|
||||
const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
|
||||
execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath));
|
||||
|
||||
// Define some configuration variables
|
||||
execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
|
||||
execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere());
|
||||
execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
|
||||
return execConfig;
|
||||
}
|
||||
|
||||
tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig(
|
||||
const uint32_t topK,
|
||||
const float_t topP,
|
||||
const float_t temperature,
|
||||
const float_t repetition_penalty,
|
||||
const float_t frequency_penalty,
|
||||
const uint64_t seed) noexcept {
|
||||
|
||||
return tle::SamplingConfig(
|
||||
1, // TGI only use a single beam
|
||||
topK,
|
||||
topP,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
std::nullopt,
|
||||
seed,
|
||||
temperature,
|
||||
temperature,
|
||||
std::nullopt,
|
            repetition_penalty,
            std::nullopt,
            frequency_penalty
    );
}

std::optional<std::list<std::vector<huggingface::tgi::backends::TokenId>>>
huggingface::tgi::backends::GetStopWordsFromConfig(
        const std::filesystem::path &generationConfigPath) noexcept {
    if (exists(generationConfigPath)) {
        const auto generationConfig = json::parse(std::ifstream(generationConfigPath));
        if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) {
            SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size());
            std::list<std::vector<huggingface::tgi::backends::TokenId>> stopWords(eosTokenIds.size());

            const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type {
                return {tokenIdObj.template get<tle::TokenIdType>()};
            };

            std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token);
            return stopWords;
        } else {
            SPDLOG_INFO("Invalid EOS tokens entry found (not an array)");
        }
    } else {
        SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist");
    }

    return std::nullopt;
}

huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
        const std::filesystem::path &enginesFolder,
        const std::filesystem::path &executorWorker
) :
        config(json::parse(std::ifstream(enginesFolder / "config.json"))),
        executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY,
                 GetExecutorConfig(config, executorWorker.string())) {

    SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get<std::string_view>());

    // Ensure we have enough GPUs on the system
    const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get<size_t>();
    const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0);
    if (numGpus < worldSize) {
        SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize);
        // todo: raise exception to catch on rust side
    }

    // Cache variables
    maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get<uint32_t>();

    // Attempt to discover stopWords from the generation_config.json
    const auto generationConfigPath = enginesFolder / "generation_config.json";
    stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list<std::vector<TokenId>>());
}

[[nodiscard("Returned number of requests needs to be consumed")]]
size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
#ifdef NDEBUG
    return executor.getNumResponsesReady();
#else
    const auto numResponses = executor.getNumResponsesReady();
    if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
    return numResponses;
#endif
}

[[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
        const std::vector<tle::TokenIdType> &tokens,
        const uint32_t maxNewTokens,
        const int32_t topK,
        const float_t topP,
        const float_t temperature,
        const float_t repetitionPenalty,
        const float_t frequencyPenalty,
        const uint64_t seed
) {
    const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast<uint32_t>(maxNumTokens - tokens.size()));
#ifndef NDEBUG
    {
        const auto &iterations = executor.getLatestIterationStats();
        const auto &lastIteration = iterations.front();

        SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests);
        SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);
        SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
    }
#endif

    const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed);

    // Build the request
    auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG};
    request.setStopWords(stopWords);

    // Submit to the executor for batching
    return executor.enqueueRequest(request);
}

std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() {
    return executor.awaitResponses();
}
@@ -2,7 +2,7 @@

set -ex

TRT_VER_BASE="10.4.0"
TRT_VER_BASE="10.6.0"
TRT_VER_FULL="${TRT_VER_BASE}.26"
CUDA_VER="12.6"
CUDNN_VER="9.5.0.50-1"

@@ -1,89 +0,0 @@
//
// Created by mfuntowicz on 6/30/24.
//
#pragma once

#include <algorithm>
#include <exception>
#include <filesystem>
#include <functional>
#include <limits>
#include <iterator>
#include <ranges>
#include <vector>

#include <spdlog/spdlog.h>
#include "backends/trtllm/include/ffi.h"


huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl(
        const std::string_view &engineFolder,
        const std::string_view &executorWorker
) : TensorRtLlmBackend(engineFolder, executorWorker) {}


uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit(
        rust::Slice<const uint32_t> tokens,
        uint32_t maxNewTokens,
        int32_t topK,
        float_t topP,
        float_t temperature,
        float_t repetition_penalty,
        float_t frequency_penalty,
        uint64_t seed) {

    // This will copy all the items from the initial slice
    std::vector<int32_t> tokens_(tokens.begin(), tokens.end());
    return TensorRtLlmBackend::Submit(
            std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed);
}

std::unique_ptr<std::vector<huggingface::tgi::backends::GenerationStep>>
huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() {
    const auto responses = TensorRtLlmBackend::PullNewTokens();

    auto steps = std::make_unique<std::vector<GenerationStep>>();
    steps->reserve(responses.size());

#ifndef NDEBUG
    SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses.size());
#endif

    // Transform tle::Response to GenerationStep
    std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {
        const auto reqId = r.getRequestId();
        if (!r.hasError()) {
            const auto result = r.getResult();
            return GenerationStep{
                    reqId,
                    static_cast<uint32_t>(result.outputTokenIds[0][0]),
                    result.logProbs.value()[0][0],
                    result.isFinal,
                    false,
                    std::string()
            };
        } else {
            return GenerationStep{
                    reqId,
                    0,
                    0.0,
                    true,
                    true,
                    std::move(r.getErrorMsg())
            };
        }
    });

    return steps;
}

std::unique_ptr<huggingface::tgi::backends::TensorRtLlmBackendImpl>
huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) {
    SPDLOG_INFO("Creating TensorRT-LLM Backend");
    // Unconditionally call this to initialize and discover TRTLLM plugins
    InitializeBackend();

    const auto enginePath = std::string_view(engineFolder.begin(), engineFolder.end());
    const auto executorPath = std::string_view(executorWorker.begin(), executorWorker.end());
    return std::make_unique<TensorRtLlmBackendImpl>(std::move(enginePath), std::move(executorPath));
}

@@ -4,10 +4,11 @@ pub mod errors;
mod looper;
mod utils;

#[cxx::bridge(namespace = "huggingface::tgi::backends")]
#[cxx::bridge(namespace = "huggingface::tgi::backends::trtllm")]
mod ffi {
    /// Struct used as shared type between rust and C++ to represent the result
    /// of a single decoding iteration
    #[cxx_name = "generation_step_t"]
    #[derive(Debug, Clone)]
    pub struct GenerationStep {
        request_id: u64,

@@ -19,9 +20,10 @@ mod ffi {
    }

    unsafe extern "C++" {
        include!("backends/trtllm/src/ffi.cpp");
        include!("backends/trtllm/csrc/ffi.hpp");

        /// Represent an instance of the underlying TensorRT-LLM backend
        #[cxx_name = "tensorrt_llm_backend_t"]
        type TensorRtLlmBackendImpl;

        /// Create an instance backed behind a std::unique_ptr to manage the lifespan of the backend

@@ -38,21 +40,18 @@ mod ffi {
        /// ```
        ///
        /// ```
        #[rust_name = "create_tensorrt_llm_backend"]
        fn CreateTensorRtLlmBackend(
        fn create_backend_from_engine_folder(
            engine_folder: &str,
            executor_worker: &str,
        ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;

        #[rust_name = "num_responses_ready"]
        fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize;
        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;

        #[rust_name = "submit"]
        fn Submit(
        fn submit(
            self: Pin<&mut TensorRtLlmBackendImpl>,
            tokens: &[u32],
            max_new_tokens: u32,
            top_k: i32,
            top_k: u32,
            top_p: f32,
            temperature: f32,
            repetition_penalty: f32,

@@ -60,9 +59,10 @@ mod ffi {
            seed: u64,
        ) -> Result<u64>;

        #[rust_name = "pull_tokens"]
        fn PullTokens(
        fn pull_tokens(
            self: Pin<&mut TensorRtLlmBackendImpl>,
        ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;

        fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64);
    }
}

@@ -1,14 +1,13 @@
use std::hint;
use std::ops::Deref;
use std::path::Path;

use async_trait::async_trait;
use cxx::UniquePtr;
use hashbrown::HashMap;
use std::hint;
use std::ops::Deref;
use std::path::Path;
use tokenizers::Tokenizer;
use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
use tokio::sync::TryAcquireError;
use tokio::task::{spawn_blocking, JoinHandle};
use tokio::task::spawn_blocking;
use tokio::time::Instant;
use tokio_stream::wrappers::UnboundedReceiverStream;
use tracing::{debug, error, warn};

@@ -22,7 +21,7 @@ use text_generation_router::validation::{Chunk, ValidGenerateRequest};
use text_generation_router::{FinishReason, Token};

use crate::errors::TensorRtLlmBackendError;
use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
use crate::ffi::{create_backend_from_engine_folder, GenerationStep, TensorRtLlmBackendImpl};
use crate::utils::first_line;

type InferResult<T> = Result<T, InferError>;

@@ -30,9 +29,10 @@ type InferResult<T> = Result<T, InferError>;
/// Wrap the requests along with the channel used to stream back to the client the decoded tokens
struct GenerationContext {
    request: ValidGenerateRequest,
    streamer: UnboundedSender<InferResult<InferStreamResponse>>,
    tokens: Vec<u32>,
    start: Option<Instant>,
    queued: Instant,
    streamer: UnboundedSender<InferResult<InferStreamResponse>>,
}

#[derive(Debug, Copy, Clone)]

@@ -58,31 +58,22 @@ impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
    }
}

/// Wraps the decoded token with the channel used to stream back to the client the decoded tokens
struct DecodedTokenContext {
    token: DecodedToken,
    start: Option<Instant>,
    queued: Instant,
    channel: UnboundedSender<InferResult<InferStreamResponse>>,
}

fn executor_status_looper(
    mut backend: UniquePtr<TensorRtLlmBackendImpl>,
    max_inflight_requests: usize,
    mut waiting_requests: UnboundedReceiver<GenerationContext>,
    post_processor_sender: UnboundedSender<(u64, InferResult<DecodedTokenContext>)>,
    tokenizer: Tokenizer,
    mut backend: UniquePtr<TensorRtLlmBackendImpl>,
    mut backlog: UnboundedReceiver<GenerationContext>,
) {
    // Track the tuple (request_id, stream) for each request
    let mut in_flights =
        HashMap::<u64, GenerationContext>::with_capacity(max_inflight_requests * 2);

    // TODO: Does it need a spin-loop?
    'scheduler: loop {
        // Is there any request pending to be scheduled?
        let awaiting_requests = waiting_requests.len();
        let awaiting_requests = backlog.len();
        for _ in 0..awaiting_requests {
            // Retrieve all the requests
            if let Some(mut ctx) = waiting_requests.blocking_recv() {
            if let Some(ctx) = backlog.blocking_recv() {
                // Submit all the request to the executor and move the context to the in-flight tracker
                let request = &ctx.request;
                let generation_params = &request.parameters;

@@ -93,7 +84,7 @@ fn executor_status_looper(
                match backend.pin_mut().submit(
                    &input_ids.unwrap(), // This is checked beforehand in validate()
                    stopping_params.max_new_tokens,
                    generation_params.top_k as i32,
                    generation_params.top_k,
                    generation_params.top_p,
                    generation_params.temperature,
                    generation_params.repetition_penalty,

@@ -103,7 +94,6 @@ fn executor_status_looper(
                    Ok(request_id) => {
                        // Insert the context linked to the generated request id in the tracker
                        debug!("[in-flight] Added {}", request_id);
                        ctx.start = Some(Instant::now());
                        in_flights.insert(request_id, ctx);
                    }
                    Err(e) => {

@@ -117,29 +107,40 @@ fn executor_status_looper(
                    }
                }
            };
            } else {
                break 'scheduler;
            }
        }

        if backend.num_responses_ready() > 0 {
            match backend.pin_mut().pull_tokens() {
        if backend.num_tokens_ready() > 0 {
            let mut backend = backend.pin_mut();
            match backend.as_mut().pull_tokens() {
                Ok(responses) => {
                    // Iterate through all the decoded token
                    for step in responses.deref() {
                        if let Some(ctx) = in_flights.get(&step.request_id) {
                            // Remove from tracked requests
                            let parcel =
                                DecodedToken::try_from(step).map(|dt| DecodedTokenContext {
                                    token: dt,
                                    start: ctx.start,
                                    queued: ctx.queued,
                                    channel: ctx.streamer.clone(),
                                });
                        if let Some(ctx) = in_flights.get_mut(&step.request_id) {
                            // Update the starting timestamp if not set
                            // This value might not be the actual real starting time of the request
                            // on the executor side - Need to expose more info from the executor to
                            // retrieve this value
                            // TODO : Expose actual real starting time for a request on FFI layer
                            if ctx.start.is_none() {
                                ctx.start = Some(Instant::now());
                            }

                            // Submit the work to the post_processor
                            let posted = post_processor_sender.send((step.request_id, parcel));
                            // Try to map the generation step to a DecodedToken
                            let response = match DecodedToken::try_from(step) {
                                Ok(decoded_token) => {
                                    post_process_decoded_token(&tokenizer, ctx, decoded_token)
                                }
                                Err(err) => Err(err)
                            };

                            if posted.is_err() || step.is_final {
                                debug!("Removing {}", step.request_id);
                            // Attempt to send back the response to the client
                            if let Err(_) = ctx.streamer.send(response) {
                                // Client has dropped, remove from tracked requests
                                debug!("Client dropped - removing request {} from tracked requests", step.request_id);
                                backend.as_mut().cancel(step.request_id);
                                let _ = in_flights.remove(&step.request_id);
                            }
                        } else {

@@ -159,80 +160,48 @@
        }
    }
}

fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
    tokenizer: Tokenizer,
    max_inflight_requests: usize,
    mut decoded_tokens: UnboundedReceiver<(u64, InferResult<DecodedTokenContext>)>,
) {
    let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(max_inflight_requests * 2);
fn post_process_decoded_token(tokenizer: &Tokenizer, ctx: &mut GenerationContext, decoded_token: DecodedToken) -> InferResult<InferStreamResponse> {
    match tokenizer.decode(&[decoded_token.id], false) {
        Ok(text) => {
            let is_special =
                tokenizer.get_added_vocabulary().is_special_token(&text);
            let token = Token {
                id: decoded_token.id,
                text,
                logprob: decoded_token.log_prob,
                special: is_special,
            };

    'post_processor: loop {
        if decoded_tokens.is_closed() {
            warn!("Post processor IPC is closed, loop will exit now.");
            break 'post_processor;
        }
            // Append the token to the tracked generated tokens
            ctx.tokens.push(token.id);

        if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() {
            match decoded {
                Ok(ctx) => {
                    states
                        .entry(request_id)
                        .and_modify(|s| s.push(*&ctx.token.id))
                        .or_insert_with(|| {
                            let mut state = Vec::with_capacity(MAX_NUM_TOKENS);
                            state.push(*&ctx.token.id);
                            state
                        });

                    let out = match tokenizer.decode(&[ctx.token.id], false) {
                        Ok(text) => {
                            let is_special =
                                tokenizer.get_added_vocabulary().is_special_token(&text);
                            let token = Token {
                                id: ctx.token.id,
                                text,
                                logprob: ctx.token.log_prob,
                                special: is_special,
                            };

                            let out = if !ctx.token.is_final {
                                InferStreamResponse::Intermediate {
                                    token,
                                    top_tokens: vec![],
                                }
                            } else {
                                let tokens = states.remove(&request_id).unwrap();
                                let text = tokenizer.decode(&tokens, true);
                                let generated_text = GeneratedText {
                                    text: text.unwrap(),
                                    generated_tokens: tokens.len() as u32,
                                    finish_reason: FinishReason::EndOfSequenceToken,
                                    seed: None,
                                };

                                InferStreamResponse::End {
                                    token,
                                    top_tokens: vec![],
                                    generated_text,
                                    start: ctx.start.unwrap(),
                                    queued: ctx.queued,
                                }
                            };

                            Ok(out)
                        }
                        Err(err) => Err(GenerationError(err.to_string())),
                    };

                    if let Err(_) = ctx.channel.send(out) {
                        warn!("Failed to send decoded token back to the user")
                    }
            // Map the correct response depending on whether the step is final or not
            let out = if !decoded_token.is_final {
                InferStreamResponse::Intermediate {
                    token,
                    top_tokens: vec![],
                }
                Err(_err) => {
                    todo!("what do we do?")
            } else {
                let text = tokenizer.decode(&ctx.tokens, true);
                let generated_text = GeneratedText {
                    text: text.unwrap(),
                    generated_tokens: ctx.tokens.len() as u32,
                    finish_reason: FinishReason::EndOfSequenceToken, // TODO : Map FinishReason
                    seed: None,
                };

                InferStreamResponse::End {
                    token,
                    top_tokens: vec![],
                    generated_text,
                    start: ctx.start.unwrap(),
                    queued: ctx.queued,
                }
            }
        };

            Ok(out)
        }
        Err(err) => Err(GenerationError(err.to_string())),
    }
}

@@ -277,11 +246,8 @@ fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(

unsafe impl Send for TensorRtLlmBackendImpl {}

pub struct TensorRtLlmBackendV2 {
    executor_looper: JoinHandle<()>,
    post_processor_looper: JoinHandle<()>,
    executor: UnboundedSender<GenerationContext>,
}
pub struct TensorRtLlmBackendV2(UnboundedSender<GenerationContext>);


impl TensorRtLlmBackendV2 {
    pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(

@@ -295,32 +261,22 @@ impl TensorRtLlmBackendV2 {

        // Allocate the IPC layer to communicate with the backend
        let (executor_sender, executor_receiver) = unbounded_channel();
        let (post_processor_sender, post_processor_receiver) = unbounded_channel();

        // Create the FFI backend
        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;

        // Executor looper is responsible for scheduling and pulling requests state at regular interval
        let executor_looper = spawn_blocking(move || {
        spawn_blocking(move || {
            executor_status_looper(
                backend,
                max_inflight_requests,
                tokenizer,
                backend,
                executor_receiver,
                post_processor_sender,
            )
        });

        // Post processor looper is responsible from receiving a bunch of tokens, decoding them and sending them back to the user
        let post_processor_looper = spawn_blocking(move || {
            post_processor_looper::<256>(tokenizer, max_inflight_requests, post_processor_receiver)
        });

        Ok(TensorRtLlmBackendV2 {
            executor_looper,
            post_processor_looper,
            executor: executor_sender,
        })
        Ok(TensorRtLlmBackendV2(executor_sender))
    }

    fn validate(request: &ValidGenerateRequest) -> InferResult<()> {

@@ -354,20 +310,21 @@ impl TensorRtLlmBackendV2 {
impl Backend for TensorRtLlmBackendV2 {
    fn schedule(
        &self,
        inner: ValidGenerateRequest,
        request: ValidGenerateRequest,
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        Self::validate(&inner)?;
        Self::validate(&request)?;

        // Open-up the stream to send tokens
        let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();

        // Send the context to the executor for scheduling
        let queued = Instant::now();
        match self.executor.send(GenerationContext {
            request: inner,
        match self.0.send(GenerationContext {
            request,
            streamer,
            tokens: Vec::with_capacity(256),
            start: None,
            queued,
            streamer,
        }) {
            Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
            Err(_) => Err(GenerationError(

@@ -377,6 +334,6 @@ impl Backend for TensorRtLlmBackendV2 {
    }

    async fn health(&self, _: bool) -> bool {
        !self.executor_looper.is_finished() & !self.post_processor_looper.is_finished()
        true
    }
}

@@ -3,14 +3,13 @@ use std::path::{Path, PathBuf};
use clap::Parser;
use hf_hub::api::tokio::{Api, ApiBuilder};
use hf_hub::{Cache, Repo, RepoType};
use tokenizers::Tokenizer;
use tracing::info;

use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
use text_generation_backends_trtllm::TensorRtLlmBackendV2;
use text_generation_router::server::get_base_tokenizer;
use text_generation_router::usage_stats::UsageStatsLevel;
use text_generation_router::{server, HubTokenizerConfig};
use text_generation_router::{server, HubTokenizerConfig, Tokenizer};
use text_generation_router::server::{get_hub_model_info, legacy_tokenizer_handle, py_resolve_tokenizer};

/// App Configuration
#[derive(Parser, Debug)]

@@ -61,7 +60,7 @@ struct Args {
    #[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
    executor_worker: PathBuf,
    #[clap(default_value = "on", long, env)]
    usage_stats: usage_stats::UsageStatsLevel,
    usage_stats: UsageStatsLevel,
    #[clap(default_value = "2000000", long, env)]
    payload_limit: usize,
}

@@ -126,18 +125,18 @@ async fn get_tokenizer(

    // Load tokenizer and model info
    let (
        tokenizer_filename,
        _config_filename,
        tokenizer_config_filename,
        config_filename,
        _tokenizer_config_filename,
        _preprocessor_config_filename,
        _processor_config_filename,
        _model_info
    ) = match api {
        Type::None => (
            Some(local_path.join("tokenizer.json")),
            Some(local_path.join("config.json")),
            Some(local_path.join("tokenizer_config.json")),
            Some(local_path.join("preprocessor_config.json")),
            Some(local_path.join("processor_config.json")),
            None
        ),
        Type::Api(api) => {
            let api_repo = api.repo(Repo::with_revision(

@@ -146,21 +145,24 @@ async fn get_tokenizer(
                revision.unwrap_or_else(|| "main").to_string(),
            ));

            let tokenizer_filename = match api_repo.get("tokenizer.json").await {
                Ok(tokenizer_filename) => Some(tokenizer_filename),
                Err(_) => get_base_tokenizer(&api, &api_repo).await,
            };

            let config_filename = api_repo.get("config.json").await.ok();
            let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok();
            let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok();
            let processor_config_filename = api_repo.get("processor_config.json").await.ok();

            let model_info = if let Some(model_info) = get_hub_model_info(&api_repo).await {
                Some(model_info)
            } else {
                tracing::warn!("Could not retrieve model info from the Hugging Face hub.");
                None
            };
            (
                tokenizer_filename,
                config_filename,
                tokenizer_config_filename,
                preprocessor_config_filename,
                processor_config_filename,
                model_info,
            )
        }
        Type::Cache(cache) => {

@@ -170,24 +172,55 @@ async fn get_tokenizer(
                revision.clone().unwrap_or_else(|| "main").to_string(),
            ));
            (
                repo.get("tokenizer.json"),
                repo.get("config.json"),
                repo.get("tokenizer_config.json"),
                repo.get("preprocessor_config.json"),
                repo.get("processor_config.json"),
                None
            )
        }
    };

    // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'.
    let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
    {
        HubTokenizerConfig::from_file(filename)
    } else {
        tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
    // let tokenizer_config: Option<HubTokenizerConfig> = if let Some(filename) = tokenizer_config_path
    // {
    //     HubTokenizerConfig::from_file(filename)
    // } else {
    //     tokenizer_config_filename.and_then(HubTokenizerConfig::from_file)
    // };

    // let tokenizer_config = tokenizer_config.unwrap_or_else(|| {
    //     tracing::warn!("Could not find tokenizer config locally and no API specified");
    //     HubTokenizerConfig::default()
    // });

    let tokenizer: Tokenizer = {
        use pyo3::prelude::*;
        pyo3::Python::with_gil(|py| -> PyResult<()> {
            py_resolve_tokenizer(py, &tokenizer_name, revision.as_deref(), false)?;
            Ok(())
        })
        .inspect_err(|err| {
            tracing::error!("Failed to import python tokenizer {err}");
        })
        .or_else(|err| {
            let out = legacy_tokenizer_handle(config_filename.as_ref());
            out.ok_or(err)
        })
        .expect("We cannot load a tokenizer");
        let filename = "out/tokenizer.json";
        if let Ok(tok) = tokenizers::Tokenizer::from_file(filename) {
            Tokenizer::Rust(tok)
        } else {
            Tokenizer::Python {
                tokenizer_name: tokenizer_name.to_string(),
                revision: revision.map(|revision| revision.to_string()),
                trust_remote_code: false,
            }
        }
    };

    tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok())
    Some(tokenizer)
}

#[tokio::main]

@@ -258,50 +291,55 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
    }

    // Create the backend
    let tokenizer = get_tokenizer(
    match get_tokenizer(
        &tokenizer_name,
        tokenizer_config_path.as_deref(),
        revision.as_deref(),
    )
    .await
    .expect("Failed to retrieve tokenizer implementation");
    .expect("Failed to retrieve tokenizer implementation") {
        Tokenizer::Python { .. } => {
            Err(TensorRtLlmBackendError::Tokenizer("Failed to retrieve Rust based tokenizer".to_string()))
        }
        Tokenizer::Rust(tokenizer) => {
            info!("Successfully retrieved tokenizer {}", &tokenizer_name);
            let backend = TensorRtLlmBackendV2::new(
                tokenizer,
                model_id,
                executor_worker,
                max_concurrent_requests,
            )?;

    info!("Successfully retrieved tokenizer {}", &tokenizer_name);
    let backend = TensorRtLlmBackendV2::new(
        tokenizer,
        model_id,
        executor_worker,
        max_concurrent_requests,
    )?;
            info!("Successfully created backend");

    info!("Successfully created backend");
            // Run server
            server::run(
                backend,
                max_concurrent_requests,
                max_best_of,
                max_stop_sequences,
                max_top_n_tokens,
                max_input_tokens,
                max_total_tokens,
                validation_workers,
                auth_token,
                tokenizer_name,
                tokenizer_config_path,
                revision,
                false,
                hostname,
                port,
                cors_allow_origin,
                false,
                None,
                None,
                true,
                max_client_batch_size,
                usage_stats,
                payload_limit,
            ).await?;
            Ok(())
        }
    }

    // Run server
    server::run(
        backend,
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        validation_workers,
        auth_token,
        tokenizer_name,
        tokenizer_config_path,
        revision,
        false,
        hostname,
        port,
        cors_allow_origin,
        false,
        None,
        None,
        true,
        max_client_batch_size,
        usage_stats,
        payload_limit,
    )
    .await?;
    Ok(())
}

@@ -1,14 +0,0 @@
//
// Created by mfuntowicz on 7/2/24.
//
#include <catch2/catch_all.hpp>
#include <spdlog/spdlog.h>
#include "../include/backend.h"

TEST_CASE("Load TRTLLM Engine on the TGI Backend", "[trtllm][engine][load]") {
    const auto engines = std::filesystem::path("/home/mfuntowicz/.cache/huggingface/assets/trtllm/0.11.0.dev2024062500/meta-llama--Meta-Llama-3-8B-Instruct/4090/engines/");
    const auto executor = std::filesystem::path("/home/mfuntowicz/Workspace/text-generation-inference/backends/trtllm/cmake-build-debug/cmake-build-debug/_deps/trtllm-src/cpp/tensorrt_llm/executor_worker/executorWorker");

    spdlog::info("Loading config from: {}", absolute(engines).string());
    huggingface::tgi::backends::TensorRtLlmBackend backend(engines, executor);
}

@@ -0,0 +1,152 @@
//
// Created by mfuntowicz on 12/3/24.
//

#include <catch2/catch_all.hpp>
#include <nlohmann/json.hpp>
#include <tensorrt_llm/executor/executor.h>

#include "backend.hpp"


using namespace huggingface::tgi::backends::trtllm;

TEST_CASE("parse generation_config.json all set", "[generation_config_t]")
{
    const json config_j = {{"temperature", 0.6}, {"top_p", 0.95}, {"eos_token_id", {1,2,3}}};
    const auto generation_config = generation_config_t(config_j);

    REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(0.6, 1e-6));
    REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(0.95, 1e-6));

    // Stop words
    REQUIRE_FALSE(generation_config.stop_words.empty());
    REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());

    for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1}, {2}, {3}}))
    {
        // Currently we do not support multi-tokens stop words
        REQUIRE(lhs.size() == 1);
        REQUIRE(rhs.size() == 1);
        REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs));
    }
}

TEST_CASE("parse generation_config.json default", "[generation_config_t]")
{
    const json config_j = {{"eos_token_id", {1,2,3}}};
    const auto generation_config = generation_config_t(config_j);

    REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
    REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));

    REQUIRE_FALSE(generation_config.stop_words.empty());
    REQUIRE(generation_config.stop_words.size() == config_j["/eos_token_id"_json_pointer].size());

    for (auto [lhs, rhs] : std::views::zip(generation_config.stop_words, std::list<std::vector<int32_t>>{{1}, {2}, {3}}))
    {
        // Currently we do not support multi-tokens stop words
        REQUIRE(lhs.size() == 1);
        REQUIRE(rhs.size() == 1);
        REQUIRE_THAT(lhs, Catch::Matchers::UnorderedEquals(rhs));
    }
}

TEST_CASE("parse generation_config.json empty", "[generation_config_t]")
{
    const json config_j = {{"eos_token_id", {}}};
    const auto generation_config = generation_config_t(config_j);

    REQUIRE_THAT(generation_config.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
    REQUIRE_THAT(generation_config.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));

    REQUIRE(generation_config.stop_words.empty());

    const json config_j2 = {};
    const auto generation_config2 = generation_config_t(config_j2);

    REQUIRE_THAT(generation_config2.temperature, Catch::Matchers::WithinAbs(1.0, 1e-6));
    REQUIRE_THAT(generation_config2.top_p, Catch::Matchers::WithinAbs(1.0, 1e-6));

    REQUIRE(generation_config2.stop_words.empty());
}

TEST_CASE("parallel_config single", "[backend_workspace_t]")
{
    // Generate temporary folder
    const auto tmp_p = std::filesystem::temp_directory_path();
    const auto config_p = tmp_p / "config.json";
    const auto generation_config_p = tmp_p / "generation_config.json";

    // Generate content
    std::ofstream o_config(config_p);
    o_config << R"({"pretrained_config": {"mapping": {"world_size": 2}}})"_json;
    o_config.close();

    std::ofstream o_generation_config(generation_config_p);
    o_generation_config << R"({"eos_token_id": []})"_json;
    o_generation_config.close();

    const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string());
    const auto parallel = workspace.parallel_config();
    REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kORCHESTRATOR);
    REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI);

    std::filesystem::remove(config_p);
    std::filesystem::remove(generation_config_p);
}

TEST_CASE("parallel_config multi", "[backend_workspace_t]")
{
    // Generate temporary folder
    const auto tmp_p = std::filesystem::temp_directory_path();
    const auto config_p = tmp_p / "config.json";
    const auto generation_config_p = tmp_p / "generation_config.json";

    // Generate content
    std::ofstream o_config(config_p);
    o_config << R"({"pretrained_config": {"mapping": {"world_size": 1}}})"_json;
    o_config.close();

    std::ofstream o_generation_config(generation_config_p);
    o_generation_config << R"({"eos_token_id": []})"_json;
    o_generation_config.close();

    const auto workspace = backend_workspace_t(tmp_p.generic_string(), tmp_p.generic_string());
    const auto parallel = workspace.parallel_config();
    REQUIRE(parallel.getCommunicationMode() == tle::CommunicationMode::kLEADER);
    REQUIRE(parallel.getCommunicationType() == tle::CommunicationType::kMPI);

    std::filesystem::remove(config_p);
    std::filesystem::remove(generation_config_p);
}

TEST_CASE("executor_config", "[backend_workspace_t]")
{

}

TEST_CASE("sampling_params_t to tle::SamplingConfig", "[backend_t]")
{
    const sampling_params_t params = {40, 0.95, 0.9, 1.0, 0.6, 2014};
    const auto config = static_cast<tle::SamplingConfig>(params);

    REQUIRE(config.getTopK().has_value());
    REQUIRE(config.getTopK().value() == params.top_k);

    REQUIRE(config.getSeed().has_value());
    REQUIRE(config.getSeed().value() == params.seed);

    REQUIRE(config.getTopP().has_value());
    REQUIRE_THAT(*config.getTopP(), Catch::Matchers::WithinAbs(params.top_p, 1e-6f));

    REQUIRE(config.getRepetitionPenalty().has_value());
    REQUIRE_THAT(*config.getRepetitionPenalty(), Catch::Matchers::WithinAbs(params.repetition_penalty, 1e-6f));

    REQUIRE(config.getFrequencyPenalty().has_value());
    REQUIRE_THAT(*config.getFrequencyPenalty(), Catch::Matchers::WithinAbs(params.frequency_penalty, 1e-6f));

    REQUIRE(config.getTemperature().has_value());
    REQUIRE_THAT(*config.getTemperature(), Catch::Matchers::WithinAbs(params.temperature, 1e-6f));
}

@@ -0,0 +1,82 @@
//
// Created by mfuntowicz on 11/16/24.
//

#include <catch2/catch_all.hpp>
#include "../csrc/hardware.hpp"

using namespace huggingface::tgi::hardware::cuda;

TEST_CASE("is_at_least_<arch>") {
    const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0);
    REQUIRE(VOLTA_CAPABILITIES.is_at_least_volta());
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_turing());
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ampere());
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least_hopper());

    const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5);
    REQUIRE(TURING_CAPABILITIES.is_at_least_volta());
    REQUIRE(TURING_CAPABILITIES.is_at_least_turing());
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ampere());
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least_hopper());

    const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0);
    REQUIRE(AMPERE_CAPABILITIES.is_at_least_volta());
    REQUIRE(AMPERE_CAPABILITIES.is_at_least_turing());
    REQUIRE(AMPERE_CAPABILITIES.is_at_least_ampere());
    REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least_hopper());

    const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9);
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_volta());
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_turing());
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ampere());
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least_hopper());

    const static auto HOPPER_CAPABILITIES = compute_capabilities_t(9, 0);
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_volta());
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_turing());
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_ampere());
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_ada_lovelace());
    REQUIRE(HOPPER_CAPABILITIES.is_at_least_hopper());
}

TEST_CASE("is_at_least") {
    const static auto VOLTA_CAPABILITIES = compute_capabilities_t(7, 0);
    REQUIRE(VOLTA_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(TURING));
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE_FALSE(VOLTA_CAPABILITIES.is_at_least(HOPPER));

    const static auto TURING_CAPABILITIES = compute_capabilities_t(7, 5);
    REQUIRE(TURING_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE(TURING_CAPABILITIES.is_at_least(TURING));
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE_FALSE(TURING_CAPABILITIES.is_at_least(HOPPER));

    const static auto AMPERE_CAPABILITIES = compute_capabilities_t(8, 0);
    REQUIRE(AMPERE_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE(AMPERE_CAPABILITIES.is_at_least(TURING));
    REQUIRE(AMPERE_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE_FALSE(AMPERE_CAPABILITIES.is_at_least(HOPPER));

    const static auto ADA_LOVELACE_CAPABILITIES = compute_capabilities_t(8, 9);
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(TURING));
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE(ADA_LOVELACE_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE_FALSE(ADA_LOVELACE_CAPABILITIES.is_at_least(HOPPER));

    const static auto HOPPER_CAPABILITIES = compute_capabilities_t(9, 0);
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(VOLTA));
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(TURING));
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(AMPERE));
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(ADA_LOVELACE));
    REQUIRE(HOPPER_CAPABILITIES.is_at_least(HOPPER));
}

@@ -17,6 +17,8 @@
    title: Using TGI with Intel GPUs
  - local: installation
    title: Installation from source
  - local: multi_backend_support
    title: Multi-backend support

  - local: architecture
    title: Internal Architecture

@@ -45,6 +47,10 @@
  - local: basic_tutorials/train_medusa
    title: Train Medusa
  title: Tutorials
- sections:
  - local: backends/trtllm
    title: TensorRT-LLM
  title: Backends
- sections:
  - local: reference/launcher
    title: All TGI CLI options

@@ -9,8 +9,10 @@ A high-level architecture diagram can be seen here:
This diagram shows well there are these separate components:

- **The router**, also named `webserver`, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.
- **The model server**, responsible of receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.
- **The launcher** is a helper that will be able to launch one or several model servers (if model is sharded), and it launches the router with the compatible arguments.
- **The model server**, responsible for receiving the gRPC requests and to process the inference on the model. If the model is sharded across multiple accelerators (e.g.: multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.

Note that for other backends (eg. TRTLLM) the model server and launcher are specific to the backend.

The router and the model server can be two different machines, they do not need to be deployed together.

@@ -0,0 +1,81 @@
# TensorRT-LLM backend

The NVIDIA TensorRT-LLM (TRTLLM) backend is a high-performance backend for LLMs
that uses NVIDIA's TensorRT library for inference acceleration.
It makes use of specific optimizations for NVIDIA GPUs, such as custom kernels.

To use the TRTLLM backend, you need to compile `engines` for the models you want to use.
Each `engine` must be compiled for the same GPU architecture that you will use for inference.

## Supported models

Check the [support matrix](https://nvidia.github.io/TensorRT-LLM/reference/support-matrix.html) to see which models are
supported.

## Compiling engines

You can use [Optimum-NVIDIA](https://github.com/huggingface/optimum-nvidia) to compile engines for the models you
want to use.

```bash
MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"

# Install the Hugging Face Hub CLI (provides the huggingface-cli command)
python -m pip install "huggingface_hub[hf_transfer]"

# Login to the Hugging Face Hub
huggingface-cli login

# Create a directory to store the model
mkdir -p /tmp/models/$MODEL_NAME

# Create a directory to store the compiled engine
mkdir -p /tmp/engines/$MODEL_NAME

# Download the model
HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir /tmp/models/$MODEL_NAME $MODEL_NAME

# Compile the engine using Optimum-NVIDIA
docker run \
    --rm \
    -it \
    --gpus=1 \
    -v /tmp/models/$MODEL_NAME:/model \
    -v /tmp/engines/$MODEL_NAME:/engine \
    huggingface/optimum-nvidia \
    optimum-cli export trtllm \
    --tp=1 \
    --pp=1 \
    --max-batch-size=128 \
    --max-input-length 4096 \
    --max-output-length 8192 \
    --max-beams-width=1 \
    --destination /engine \
    $MODEL_NAME
```

Your compiled engine will be saved in the `/tmp/engines/$MODEL_NAME` directory.
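
As a quick, optional sanity check you can list the output directory to confirm the engine artifacts were produced; the exact file names vary with the model and the Optimum-NVIDIA version used:

```bash
# Inspect the compiled engine folder created above
ls -lh /tmp/engines/$MODEL_NAME
```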

## Using the TRTLLM backend

Run the TGI-TRTLLM Docker image with the compiled engine:

```bash
docker run \
    --gpus 1 \
    -it \
    --rm \
    -p 3000:3000 \
    -e MODEL=$MODEL_NAME \
    -e PORT=3000 \
    -e HF_TOKEN='hf_XXX' \
    -v /tmp/engines/$MODEL_NAME:/data \
    ghcr.io/huggingface/text-generation-inference:latest-trtllm \
    --executor-worker executorWorker \
    --model-id /data/$MODEL_NAME
```
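
Once the container is up, you can smoke-test it with a request against TGI's standard `/generate` route on the port mapped above (3000 here); the prompt and parameters below are only an example:

```bash
# Send a simple generation request to the running server
curl http://localhost:3000/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}}'
```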

## Development

To develop the TRTLLM backend, you can use the [dev containers](https://containers.dev/) located in
the `.devcontainer` directory.

@@ -0,0 +1,13 @@
# Multi-backend support

TGI (Text Generation Inference) offers flexibility by supporting multiple backends for serving large language models (LLMs).
With multi-backend support, you can choose the backend that best suits your needs,
whether you prioritize performance, ease of use, or compatibility with specific hardware. API interaction with
TGI remains consistent across backends, allowing you to switch between them seamlessly.

**Supported backends:**
* **TGI CUDA backend**: This high-performance backend is optimized for NVIDIA GPUs and serves as the default option
  within TGI. Developed in-house, it boasts numerous optimizations and is used in production by various projects, including those by Hugging Face.
* **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference.
  It utilizes specialized optimizations and custom kernels for enhanced performance.
  However, it requires a model-specific compilation step for each GPU architecture.

@@ -1593,7 +1593,7 @@ pub fn schema() -> ApiDoc {
    ApiDoc
}

fn py_resolve_tokenizer(
pub fn py_resolve_tokenizer(
    py: pyo3::Python,
    tokenizer_name: &str,
    revision: Option<&str>,

@@ -1619,7 +1619,7 @@ fn py_resolve_tokenizer(
    Ok(())
}

fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> {
pub fn legacy_tokenizer_handle(config_filename: Option<&PathBuf>) -> Option<()> {
    // XXX Legacy case for FasterDecoding/medusa-vicuna-7b-v1.3
    // and state-spaces/mamba-130m
    tracing::warn!("Odd tokenizer detected, falling back on legacy tokenization");