bind to CUDA::nvml to retrieve compute capabilities at runtime
commit bec188ff73
parent 68a0247a2c
CMakeLists.txt
@@ -17,6 +17,8 @@ include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)

+find_package(CUDAToolkit REQUIRED COMPONENTS CUDA::nvml)
+
 # TGI TRTLLM Backend definition
 add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
@@ -25,7 +27,7 @@ target_include_directories(tgi_trtllm_backend_impl PRIVATE
 )
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
-target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog CUDA::nvml)


 #### Unit Tests ####
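The find_package(CUDAToolkit) call makes the NVML management library available through the imported CUDA::nvml target, which the second hunk adds to the backend's private link dependencies. As a quick way to see what that linkage provides, here is a minimal smoke test that initializes NVML and prints the driver version; the file name and build line are hypothetical, not part of this commit:

    // nvml_smoke.cpp -- hypothetical standalone check, not part of this commit.
    // Build: g++ nvml_smoke.cpp -lnvidia-ml (or link the CUDA::nvml target in CMake).
    #include <cstdio>
    #include <nvml.h>

    int main() {
        // Every other NVML call requires a successful nvmlInit_v2 first.
        if (nvmlInit_v2() != NVML_SUCCESS) {
            std::fprintf(stderr, "NVML initialization failed\n");
            return 1;
        }
        char version[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE];
        if (nvmlSystemGetDriverVersion(version, sizeof(version)) == NVML_SUCCESS)
            std::printf("Driver version: %s\n", version);
        nvmlShutdown();  // balance the nvmlInit_v2 call
        return 0;
    }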
lib/backend.cpp
@@ -1,11 +1,12 @@
 #include <fmt/std.h>
+#include <nvml.h>
 #include <spdlog/spdlog.h>

 #include "backend.h"

 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
+    nvmlInit_v2();
     initTrtLlmPlugins();
 }
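InitializeBackend brings NVML up once for the lifetime of the process and leaves shutdown to process exit, which is reasonable for a long-running server. For contexts where the lifetime should be scoped, a common alternative, shown here as a hypothetical sketch rather than anything in this commit, is an RAII guard:

    #include <nvml.h>
    #include <stdexcept>

    // Hypothetical RAII guard: each successful nvmlInit_v2 is balanced by an
    // nvmlShutdown when the guard leaves scope.
    struct NvmlGuard {
        NvmlGuard() {
            if (nvmlInit_v2() != NVML_SUCCESS)
                throw std::runtime_error("failed to initialize NVML");
        }
        ~NvmlGuard() { nvmlShutdown(); }
        NvmlGuard(const NvmlGuard &) = delete;
        NvmlGuard &operator=(const NvmlGuard &) = delete;
    };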
@@ -13,7 +14,15 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config)
     tle::ExecutorConfig execConfig(1);

-    // TODO : Need to check for >= sm_80 (ampere)
-    // execConfig.setEnableChunkedContext(true)
+    nvmlDevice_t device;
+    int32_t cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor;
+
+    if(nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+        if(nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) == NVML_SUCCESS) {
+            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor);
+            execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+        }
+    }
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));

     if(config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1){
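This block resolves the sm_80 TODO at runtime: a compute capability major version of 8 corresponds to Ampere, so chunked context is enabled only on Ampere or newer, based on device 0 alone. The world_size check just below it uses nlohmann::json's JSON Pointer literal. A minimal, self-contained sketch of that access pattern, with sample data invented for illustration:

    #include <cstdint>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;
    using namespace nlohmann::literals;  // enables the ""_json and ""_json_pointer literals

    int main() {
        // Shape mirrors the fragment of the engine config the hunk reads.
        const json config = R"({"pretrained_config": {"mapping": {"world_size": 1}}})"_json;
        // A JSON Pointer (RFC 6901) walks the nested objects in a single lookup.
        const auto worldSize =
            config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>();
        return worldSize == 1 ? 0 : 1;
    }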
@@ -65,7 +74,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         std::optional<uint32_t> nTopTokens
 ) {
     spdlog::debug(
-        "Submitting inference over {:d} tokens to the executor {:d}",
+        FMT_STRING("Submitting inference over {:d} tokens to the executor {:d}"),
         tokens.size(),
         executor.getLatestIterationStats().back().numActiveRequests
     );
@@ -92,7 +101,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
 }

 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
-    SPDLOG_DEBUG("Polling request {:d}", reqId);
+    SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), reqId);
     const auto responses = executor.awaitResponses(reqId);
     return responses;
 }
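Both logging changes have the same motivation: wrapping the format string in FMT_STRING lets fmt validate it against the argument types at compile time, and the SPDLOG_DEBUG macro, unlike a plain spdlog::debug call, is compiled out entirely when SPDLOG_ACTIVE_LEVEL is raised above debug. A hypothetical sketch of both behaviors, assuming spdlog built against an external fmt as this backend uses:

    // logging_sketch.cpp -- hypothetical, not part of this commit.
    // SPDLOG_ACTIVE_LEVEL must be defined before spdlog is included so the
    // macros are kept (or discarded) at compile time.
    #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
    #include <fmt/format.h>      // provides FMT_STRING
    #include <spdlog/spdlog.h>

    int main() {
        spdlog::set_level(spdlog::level::debug);  // runtime filter must allow debug too

        // FMT_STRING checks "{:d}" against the argument at compile time;
        // passing a non-integer here would be a build error, not a runtime one.
        SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), 42);

        // If SPDLOG_ACTIVE_LEVEL were SPDLOG_LEVEL_INFO instead, the statement
        // above would compile to nothing, whereas a spdlog::debug(...) call
        // would still execute and be filtered only at runtime.
        return 0;
    }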