bind to CUDA::nvml to retrieve compute capabilities at runtime

Author: Morgan Funtowicz
Date:   2024-07-08 22:32:41 +00:00
parent 68a0247a2c
commit bec188ff73
2 changed files with 16 additions and 5 deletions

CMakeLists.txt

@@ -17,6 +17,8 @@ include(cmake/json.cmake)
include(cmake/spdlog.cmake)
include(cmake/trtllm.cmake)
find_package(CUDAToolkit REQUIRED COMPONENTS CUDA::nvml)
# TGI TRTLLM Backend definition
add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
target_include_directories(tgi_trtllm_backend_impl PRIVATE
@@ -25,7 +27,7 @@ target_include_directories(tgi_trtllm_backend_impl PRIVATE
)
include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog CUDA::nvml)
#### Unit Tests ####

lib/backend.cpp

@@ -1,11 +1,12 @@
#include <fmt/std.h>
#include <nvml.h>
#include <spdlog/spdlog.h>
#include "backend.h"
void huggingface::tgi::backends::InitializeBackend() {
SPDLOG_INFO("Initializing Backend...");
nvmlInit_v2();
initTrtLlmPlugins();
}
@@ -13,7 +14,15 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
tle::ExecutorConfig execConfig(1);
// TODO : Need to check for >= sm_80 (ampere)
// execConfig.setEnableChunkedContext(true)
nvmlDevice_t device;
int32_t cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor;
if(nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
if(nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) == NVML_SUCCESS) {
SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor);
execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
}
}
execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
if(config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1){
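
The hunk above queries NVML for device 0's compute capability and only enables chunked context on Ampere (sm_80) or newer; the CMakeLists.txt changes link CUDA::nvml into tgi_trtllm_backend_impl so that <nvml.h> and the NVML library resolve. Below is a minimal, self-contained sketch of the same query outside the backend (not part of this commit); it adds an explicit nvmlShutdown(), which the backend code leaves out:

    // Standalone sketch (not from this commit): probe device 0's CUDA compute
    // capability through NVML and report whether chunked context could be enabled.
    #include <cstdio>
    #include <nvml.h>

    int main() {
        // NVML must be initialized before any device query; the backend does this
        // in InitializeBackend() via nvmlInit_v2().
        if (nvmlInit_v2() != NVML_SUCCESS) {
            std::fprintf(stderr, "Failed to initialize NVML\n");
            return 1;
        }

        nvmlDevice_t device;
        int major = 0, minor = 0;
        if (nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS &&
            nvmlDeviceGetCudaComputeCapability(device, &major, &minor) == NVML_SUCCESS) {
            std::printf("Detected sm_%d%d compute capabilities\n", major, minor);
            // Chunked context requires Ampere or newer, hence the major >= 8 check.
            std::printf("Chunked context %s\n", major >= 8 ? "supported" : "not supported");
        }

        nvmlShutdown();
        return 0;
    }

Build it against the CUDA toolkit headers and link the NVML library (CUDA::nvml with CMake, or -lnvidia-ml directly).
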
@@ -65,7 +74,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
std::optional<uint32_t> nTopTokens
) {
spdlog::debug(
"Submitting inference over {:d} tokens to the executor {:d}",
FMT_STRING("Submitting inference over {:d} tokens to the executor {:d}"),
tokens.size(),
executor.getLatestIterationStats().back().numActiveRequests
);
@@ -92,7 +101,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
}
std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
SPDLOG_DEBUG("Polling request {:d}", reqId);
SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), reqId);
const auto responses = executor.awaitResponses(reqId);
return responses;
}
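
The last two hunks wrap the spdlog format strings in FMT_STRING. spdlog forwards its format strings to the fmt library, and FMT_STRING lets fmt check the format string against the argument types at compile time instead of only detecting a mismatch at runtime. A minimal sketch of the same pattern outside the backend (the values are illustrative, not from the commit):

    // Sketch: with FMT_STRING, a mismatch between "{:d}" and a non-integral
    // argument becomes a compile-time error rather than a runtime format error.
    #include <fmt/core.h>
    #include <fmt/format.h>

    int main() {
        const std::size_t nTokens = 128;
        const std::size_t numActiveRequests = 3;
        fmt::print(FMT_STRING("Submitting inference over {:d} tokens to the executor {:d}\n"),
                   nTokens, numActiveRequests);
        return 0;
    }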