[Backend] Bump TRTLLM to v.0.17.0 (#2991)
* backend(trtllm): bump TRTLLM to v0.17.0
* backend(trtllm): forgot to bump the Dockerfile
* backend(trtllm): use an ARG instead of an ENV
* backend(trtllm): use the correct library reference, decoder_attention_src
* backend(trtllm): link against decoder_attention_{0|1}
* backend(trtllm): build with gcc-14 against CUDA 12.8
* backend(trtllm): treat the return value optimization warning as an error when available
* backend(trtllm): escalate all warnings to errors on the backend impl in debug mode
* backend(trtllm): link against CUDA 12.8
Parent: 36223f834e
Commit: 856709d5c3

Dockerfile:

@@ -1,12 +1,14 @@
-ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real"
+ARG cuda_arch_list="75-real;80-real;86-real;89-real;90-real;100-real;120-real"
+ARG cuda_base=12.8.0
 ARG build_type=release
 ARG ompi_version=4.1.7
 ARG sccache_gha_enabled=off
 ARG actions_cache_url=""
 ARG actions_runtime_token=""
 
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS cuda-builder
+FROM nvidia/cuda:${cuda_base}-cudnn-devel-ubuntu24.04 AS cuda-builder
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
     build-essential \
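
These two changes work together: the arch list gains 100-real and 120-real (Blackwell SM100/SM120, which require CUDA 12.8), and the new cuda_base ARG threads the CUDA version through every FROM line so future bumps touch a single value. A hypothetical local build overriding both (standard docker build flags; the image tag is illustrative):

    # Override the CUDA base and trim the arch list for a Hopper-only build.
    docker build \
        --build-arg cuda_base=12.8.0 \
        --build-arg cuda_arch_list="90-real" \
        -t tgi-trtllm:local .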

@@ -98,14 +100,16 @@ COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 
 ENV RUSTC_WRAPPER=sccache
 ENV CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX
-RUN export CMAKE_C_COMPILER_LAUNCHER=sccache && \
+RUN export CC=gcc-14 \
+    export CXX=g++-14 \
+    export CMAKE_C_COMPILER_LAUNCHER=sccache && \
     export CMAKE_CXX_COMPILER_LAUNCHER=sccache && \
     export CMAKE_CUDA_COMPILER_LAUNCHER=sccache && \
     mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
     cargo build --profile ${build_type} --package text-generation-backends-trtllm --bin text-generation-backends-trtllm && \
     sccache --show-stats
 
-FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS runtime
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS runtime
 RUN apt update && apt install -y libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
     pipx ensurepath && \
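
The rewritten RUN block pins GCC 14 as the host compiler and keeps routing C, C++, and CUDA compiles through sccache via CMake's compiler-launcher variables. As a rough sketch, the same configuration outside Docker would look like this (the variables are standard CMake cache variables; the build directory name is illustrative):

    # gcc-14 as host compiler, sccache wrapping every compile invocation.
    CC=gcc-14 CXX=g++-14 cmake -B build \
        -DCMAKE_C_COMPILER_LAUNCHER=sccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
        -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache
    cmake --build build
    sccache --show-stats   # verify cache hits after the build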

@@ -124,7 +128,7 @@ COPY --from=tgi-builder /usr/local/tgi /usr/local/tgi
 COPY --from=tgi-builder /usr/src/text-generation-inference/target/release/text-generation-backends-trtllm /usr/local/tgi/bin/text-generation-launcher
 
 # This is used only for the CI/CD
-FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 AS ci-runtime
+FROM nvidia/cuda:${cuda_base}-cudnn-runtime-ubuntu24.04 AS ci-runtime
 RUN apt update && apt install -y libasan8 libubsan1 libucx0 pipx python3-minimal python3-dev python3-pip python3-venv && \
     rm -rf /var/lib/{apt,dpkg,cache,log}/ && \
     pipx ensurepath && \
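
Note that the ci-runtime stage additionally installs libasan8 and libubsan1, the shared runtime libraries that AddressSanitizer- and UBSan-instrumented debug builds load at startup. A hypothetical CI invocation of such an image (the tag is illustrative; ASAN_OPTIONS and UBSAN_OPTIONS are the sanitizers' standard environment variables):

    docker run --rm \
        -e ASAN_OPTIONS=detect_leaks=0 \
        -e UBSAN_OPTIONS=print_stacktrace=1 \
        tgi-trtllm:ci-runtime text-generation-launcher --help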

Backend CMake configuration (install rules):

@@ -59,7 +59,9 @@ target_link_libraries(tgi_trtllm_backend_impl PRIVATE tensorrt_llm nvinfer_plugi
 
 # This install all the artifacts in CMAKE_INSTALL_PREFIX under include/ lib/ bin/ to make easy to link / find it back
 install(TARGETS tgi_trtllm_backend_impl)
-install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention executorWorker)
-#install(TARGETS cutlass_src fb_gemm_src fpA_intB_gemm_src gemm_swiglu_sm90_src kernels_src)
+install(TARGETS decoder_attention_0 decoder_attention_1)
+install(TARGETS tensorrt_llm nvinfer_plugin_tensorrt_llm decoder_attention_src executorWorker)
 install(FILES ${TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH} TYPE LIB)
 if (NOT ${TGI_TRTLLM_BACKEND_DEBUG})
     install(FILES ${TRTLLM_EXECUTOR_STATIC_LIBRARY_PATH} TYPE LIB)
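
TRT-LLM v0.17 evidently ships the decoder attention kernels as two shared libraries, hence the new decoder_attention_0/decoder_attention_1 targets, while decoder_attention_src is the corrected name of the source-level target. install(TARGETS ...) without an explicit DESTINATION relies on CMake's per-type defaults (available since CMake 3.14); a sketch of the equivalent explicit form:

    # Equivalent explicit destinations, assuming the default GNU-style layout.
    include(GNUInstallDirs)
    install(TARGETS decoder_attention_0 decoder_attention_1
            LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
    install(TARGETS executorWorker
            RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})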

@@ -82,8 +84,9 @@ if (${TGI_TRTLLM_BACKEND_BUILD_TESTS} AND CMAKE_BUILD_TYPE MATCHES "Debug")
     check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
     if (${COMPILER_SUPPORT_WARNING_ON_NVRO})
         message(STATUS "Enabling non-NVRO detection")
-        target_compile_options(tgi_trtllm_backend_impl "-Wnvro")
+        target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wnrvo)
     endif ()
+    target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wall)
 
     cmake_path(GET TRTLLM_NVRTC_WRAPPER_LIBRARY_PATH PARENT_PATH TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH)
     message(STATUS "Adding linking path: ${TRTLLM_NVRTC_WRAPPER_PARENT_LIBRARY_PATH}")
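
The replaced line fixes two bugs at once: the flag was misspelled (-Wnvro) and the PRIVATE scope keyword was missing. -Wnrvo, introduced in GCC 14, warns when a function fails to apply named return value optimization; probing for it first keeps older compilers working. The general pattern, as a sketch:

    # Probe a warning flag before applying it, so configuration still
    # succeeds with compilers that do not know the flag.
    include(CheckCXXCompilerFlag)
    check_cxx_compiler_flag("-Wnrvo" COMPILER_SUPPORT_WARNING_ON_NVRO)
    if (COMPILER_SUPPORT_WARNING_ON_NVRO)
        target_compile_options(tgi_trtllm_backend_impl PRIVATE -Wnrvo)
    endif ()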

Rust build script (build.rs):

@@ -7,7 +7,7 @@ use std::sync::LazyLock;
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 1] = ["spdlog"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
-const CUDA_REQUIRED_VERSION: &str = "12.6";
+const CUDA_REQUIRED_VERSION: &str = "12.8";
 const MPI_REQUIRED_VERSION: &str = "4.1";
 const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
 const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
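
These constants are resolved when the build script itself is compiled: option_env! yields an Option<&'static str> that is None when the variable was unset. A sketch of the usual consumption pattern (the fallback value mirrors the Dockerfile default and is illustrative):

    // Sketch: compile-time capture with a runtime fallback.
    const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");

    fn cuda_arch_list() -> &'static str {
        CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real;100-real;120-real")
    }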

@@ -25,11 +25,12 @@ const IS_GHA_BUILD: LazyLock<bool> = LazyLock::new(|| {
 // Dependencies
 const BACKEND_DEPS: &str = "tgi_trtllm_backend_impl";
 const CUDA_TRANSITIVE_DEPS: [&str; 4] = ["cuda", "cudart", "cublas", "nvidia-ml"];
-const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 4] = [
+const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
     ("dylib", "tensorrt_llm"),
     ("dylib", "tensorrt_llm_nvrtc_wrapper"),
     ("dylib", "nvinfer_plugin_tensorrt_llm"),
-    ("dylib", "decoder_attention"),
+    ("dylib", "decoder_attention_0"),
+    ("dylib", "decoder_attention_1"),
 ];
 
 macro_rules! probe {
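
Each (kind, name) pair feeds a cargo linker directive; splitting decoder_attention into _0 and _1 matches the pair of shared objects the new TRT-LLM release ships. A minimal sketch of how a build script typically emits these directives (the loop is illustrative, not necessarily this build.rs's exact code):

    const TENSORRT_LLM_TRANSITIVE_DEPS: [(&str, &str); 5] = [
        ("dylib", "tensorrt_llm"),
        ("dylib", "tensorrt_llm_nvrtc_wrapper"),
        ("dylib", "nvinfer_plugin_tensorrt_llm"),
        ("dylib", "decoder_attention_0"),
        ("dylib", "decoder_attention_1"),
    ];

    fn main() {
        // Each tuple becomes a `cargo:rustc-link-lib=<kind>=<name>` line,
        // which cargo forwards to the linker.
        for (kind, name) in TENSORRT_LLM_TRANSITIVE_DEPS {
            println!("cargo:rustc-link-lib={kind}={name}");
        }
    }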

CMake dependency declaration for TensorRT-LLM:

@@ -28,7 +28,7 @@ find_package(Python3 REQUIRED Interpreter)
 fetchcontent_declare(
     trtllm
     GIT_REPOSITORY https://github.com/nvidia/TensorRT-LLM.git
-    GIT_TAG v0.16.0
+    GIT_TAG v0.17.0
     GIT_SHALLOW ON
     DOWNLOAD_EXTRACT_TIMESTAMP
 )
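
fetchcontent_declare only records the recipe; the shallow clone of the v0.17.0 tag happens when the content is later made available, presumably elsewhere in this file:

    # Sketch: materialize the declared content and add it to the build.
    fetchcontent_makeavailable(trtllm)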

TensorRT install script:

@@ -2,13 +2,13 @@
 
 set -ex
 
-TRT_VER_BASE="10.7.0"
-TRT_VER_FULL="${TRT_VER_BASE}.23"
-CUDA_VER="12.6"
-CUDNN_VER="9.5.0.50-1"
-NCCL_VER="2.22.3-1+cuda12.6"
-CUBLAS_VER="12.6.3.3-1"
-NVRTC_VER="12.6.77-1"
+TRT_VER_BASE="10.8.0"
+TRT_VER_FULL="${TRT_VER_BASE}.43"
+CUDA_VER="12.8"
+CUDNN_VER="9.7.0.66-1"
+NCCL_VER="2.25.1-1+cuda${CUDA_VER}"
+CUBLAS_VER="${CUDA_VER}.3.14-1"
+NVRTC_VER="${CUDA_VER}.61-1"
 
 for i in "$@"; do
     case $i in
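
Beyond the version bump, the rewritten pins derive their CUDA suffix from CUDA_VER, so the next toolkit bump is a one-variable edit. A quick sketch of how the derived values expand:

    CUDA_VER="12.8"
    NCCL_VER="2.25.1-1+cuda${CUDA_VER}"   # -> 2.25.1-1+cuda12.8
    NVRTC_VER="${CUDA_VER}.61-1"          # -> 12.8.61-1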

@@ -73,7 +73,7 @@ install_centos_requirements() {
 install_tensorrt() {
     #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
     #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
-    TRT_CUDA_VERSION="12.6"
+    TRT_CUDA_VERSION="12.8"
 
     if [ -z "$RELEASE_URL_TRT" ];then
         ARCH=${TRT_TARGETARCH}
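
The surrounding logic honors an externally supplied RELEASE_URL_TRT and otherwise composes a default download URL from the pinned versions. A sketch of that pattern (the URL below is a placeholder, not NVIDIA's actual layout):

    if [ -z "$RELEASE_URL_TRT" ]; then
        ARCH=${TRT_TARGETARCH}
        # Placeholder URL composed from the pins above.
        RELEASE_URL_TRT="https://example.com/tensorrt/${TRT_VER_FULL}/cuda-${TRT_CUDA_VERSION}/tensorrt-${ARCH}.tar.gz"
    fi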