bind to CUDA::nvml to retrieve compute capabilities at runtime
commit bec188ff73
parent 68a0247a2c
CMakeLists.txt

@@ -17,6 +17,8 @@ include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
+
+find_package(CUDAToolkit REQUIRED COMPONENTS CUDA::nvml)
 
 # TGI TRTLLM Backend definition
 add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
@@ -25,7 +27,7 @@ target_include_directories(tgi_trtllm_backend_impl PRIVATE
 )
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
-target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog CUDA::nvml)
 
 
 #### Unit Tests ####
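find_package(CUDAToolkit) exposes NVML through the imported target CUDA::nvml, and the PRIVATE link below attaches it to tgi_trtllm_backend_impl without exporting the dependency to consumers of the library. A minimal, hypothetical smoke test (not part of this commit) for a translation unit that this wiring enables:

    // Hypothetical smoke test: if this compiles and links against
    // CUDA::nvml, the dependency added above is wired correctly.
    #include <cstdio>
    #include <nvml.h>

    int main() {
        const nvmlReturn_t status = nvmlInit_v2();      // load and initialize NVML
        if (status != NVML_SUCCESS) {
            std::fprintf(stderr, "NVML init failed: %s\n", nvmlErrorString(status));
            return 1;
        }
        unsigned int count = 0;
        nvmlDeviceGetCount_v2(&count);                  // GPUs visible to NVML
        std::printf("NVML sees %u device(s)\n", count);
        nvmlShutdown();                                 // release NVML state
        return 0;
    }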
lib/backend.cpp

@@ -1,11 +1,12 @@
 #include <fmt/std.h>
-
+#include <nvml.h>
 #include <spdlog/spdlog.h>
 
 #include "backend.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
+    nvmlInit_v2();
     initTrtLlmPlugins();
 }
 
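InitializeBackend() now brings NVML up via nvmlInit_v2() before the TensorRT-LLM plugins load, and the library stays initialized for the lifetime of the process; the diff adds no matching nvmlShutdown(). One possible RAII pairing, purely illustrative and not part of this change:

    // Illustrative only: a guard that would pair nvmlInit_v2() with
    // nvmlShutdown() automatically if scoped teardown were ever needed.
    #include <nvml.h>

    struct NvmlGuard {
        bool ok;
        NvmlGuard() : ok(nvmlInit_v2() == NVML_SUCCESS) {}
        ~NvmlGuard() { if (ok) nvmlShutdown(); }
        NvmlGuard(const NvmlGuard&) = delete;             // one init per scope
        NvmlGuard& operator=(const NvmlGuard&) = delete;
    };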
@@ -13,7 +14,15 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
     tle::ExecutorConfig execConfig(1);
 
     // TODO : Need to check for >= sm_80 (ampere)
-    // execConfig.setEnableChunkedContext(true)
+    nvmlDevice_t device;
+    int32_t cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor;
+
+    if(nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+        if(nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) == NVML_SUCCESS) {
+            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor);
+            execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+        }
+    }
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
 
     if(config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1){
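This is the core of the commit: instead of the commented-out hard-coding, chunked context is enabled only when device 0 reports a compute capability major version >= 8 (Ampere or newer), and both NVML calls are checked so a machine without a visible GPU silently keeps the executor default. The same probe, factored into a free function as a sketch (the helper name is hypothetical):

    // Sketch of the probe above as a reusable helper; assumes nvmlInit_v2()
    // has already succeeded, as InitializeBackend() guarantees here.
    #include <optional>
    #include <utility>
    #include <nvml.h>

    std::optional<std::pair<int, int>> GetCudaComputeCapability(unsigned int index) {
        nvmlDevice_t device;
        if (nvmlDeviceGetHandleByIndex_v2(index, &device) != NVML_SUCCESS)
            return std::nullopt;                 // no device at this index / no driver

        int major = 0, minor = 0;
        if (nvmlDeviceGetCudaComputeCapability(device, &major, &minor) != NVML_SUCCESS)
            return std::nullopt;

        return std::make_pair(major, minor);     // e.g. (8, 0) => sm_80 (Ampere)
    }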
@@ -65,7 +74,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         std::optional<uint32_t> nTopTokens
 ) {
     spdlog::debug(
-            "Submitting inference over {:d} tokens to the executor {:d}",
+            FMT_STRING("Submitting inference over {:d} tokens to the executor {:d}"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
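Wrapping the literal in FMT_STRING turns a runtime-parsed format string into one that {fmt} validates at compile time, so a placeholder/argument mismatch becomes a build error instead of a fmt::format_error thrown mid-request. A minimal sketch of the difference:

    #include <cstddef>
    #include <fmt/format.h>

    int main() {
        // Checked when compiling: "{:d}" must receive an integral argument.
        fmt::print(FMT_STRING("Submitting inference over {:d} tokens\n"), std::size_t{128});
        // fmt::print(FMT_STRING("{:d}\n"), "not a number");  // would not compile
        return 0;
    }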
@@ -92,7 +101,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
 }
 
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
-    SPDLOG_DEBUG("Polling request {:d}", reqId);
+    SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), reqId);
     const auto responses = executor.awaitResponses(reqId);
     return responses;
 }
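The same FMT_STRING treatment applies to SPDLOG_DEBUG, which additionally compiles out entirely when SPDLOG_ACTIVE_LEVEL is above debug, keeping the polling hot path free of logging overhead in release builds. A sketch, assuming spdlog's bundled {fmt}:

    // SPDLOG_ACTIVE_LEVEL must be defined before including spdlog.h for the
    // macro body to survive preprocessing; set_level() is the runtime filter.
    #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
    #include <spdlog/spdlog.h>

    int main() {
        spdlog::set_level(spdlog::level::debug);
        SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), 42u);
        return 0;
    }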