bind to CUDA::nvml to retrieve compute capabilities at runtime
commit bec188ff73
parent 68a0247a2c
CMakeLists.txt

@@ -17,6 +17,8 @@ include(cmake/json.cmake)
 include(cmake/spdlog.cmake)
 include(cmake/trtllm.cmake)
+
+find_package(CUDAToolkit REQUIRED COMPONENTS CUDA::nvml)
 
 # TGI TRTLLM Backend definition
 add_library(tgi_trtllm_backend_impl STATIC include/backend.h lib/backend.cpp)
 target_include_directories(tgi_trtllm_backend_impl PRIVATE
@@ -25,7 +27,7 @@ target_include_directories(tgi_trtllm_backend_impl PRIVATE
 )
 include_directories(${TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR})
 target_link_libraries(tgi_trtllm_backend_impl PUBLIC tensorrt_llm nvinfer_plugin_tensorrt_llm)
-target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog)
+target_link_libraries(tgi_trtllm_backend_impl PRIVATE nlohmann_json::nlohmann_json spdlog::spdlog CUDA::nvml)
 
 
 #### Unit Tests ####
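find_package(CUDAToolkit) exposes NVML through the imported target CUDA::nvml, and the PRIVATE link below attaches it to tgi_trtllm_backend_impl without exporting the dependency to consumers of the library. A minimal, hypothetical smoke test (not part of this commit) for a translation unit that this wiring enables:

    // Hypothetical smoke test: if this compiles and links against
    // CUDA::nvml, the dependency added above is wired correctly.
    #include <cstdio>
    #include <nvml.h>

    int main() {
        const nvmlReturn_t status = nvmlInit_v2();      // load and initialize NVML
        if (status != NVML_SUCCESS) {
            std::fprintf(stderr, "NVML init failed: %s\n", nvmlErrorString(status));
            return 1;
        }
        unsigned int count = 0;
        nvmlDeviceGetCount_v2(&count);                  // GPUs visible to NVML
        std::printf("NVML sees %u device(s)\n", count);
        nvmlShutdown();                                 // release NVML state
        return 0;
    }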
lib/backend.cpp

@@ -1,11 +1,12 @@
 #include <fmt/std.h>
-
+#include <nvml.h>
 #include <spdlog/spdlog.h>
 
 #include "backend.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
     SPDLOG_INFO("Initializing Backend...");
+    nvmlInit_v2();
     initTrtLlmPlugins();
 }
 
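InitializeBackend() now brings NVML up via nvmlInit_v2() before the TensorRT-LLM plugins load, and the library stays initialized for the lifetime of the process; the diff adds no matching nvmlShutdown(). One possible RAII pairing, purely illustrative and not part of this change:

    // Illustrative only: a guard that would pair nvmlInit_v2() with
    // nvmlShutdown() automatically if scoped teardown were ever needed.
    #include <nvml.h>

    struct NvmlGuard {
        bool ok;
        NvmlGuard() : ok(nvmlInit_v2() == NVML_SUCCESS) {}
        ~NvmlGuard() { if (ok) nvmlShutdown(); }
        NvmlGuard(const NvmlGuard&) = delete;             // one init per scope
        NvmlGuard& operator=(const NvmlGuard&) = delete;
    };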
@@ -13,7 +14,15 @@ tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &co
     tle::ExecutorConfig execConfig(1);
 
     // TODO : Need to check for >= sm_80 (ampere)
-    // execConfig.setEnableChunkedContext(true)
+    nvmlDevice_t device;
+    int32_t cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor;
+
+    if(nvmlDeviceGetHandleByIndex_v2(0, &device) == NVML_SUCCESS) {
+        if(nvmlDeviceGetCudaComputeCapability(device, &cudaComputeCapabilitiesMajor, &cudaComputeCapabilitiesMinor) == NVML_SUCCESS) {
+            SPDLOG_INFO(FMT_STRING("Detected sm_{:d}{:d} compute capabilities"), cudaComputeCapabilitiesMajor, cudaComputeCapabilitiesMinor);
+            execConfig.setEnableChunkedContext(cudaComputeCapabilitiesMajor >= 8);
+        }
+    }
     execConfig.setKvCacheConfig(tle::KvCacheConfig(true));
 
     if(config["/pretrained_config/mapping/world_size"_json_pointer].get<uint8_t>() == 1){
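This is the core of the commit: instead of the commented-out hard-coding, chunked context is enabled only when device 0 reports a compute capability major version >= 8 (Ampere or newer), and both NVML calls are checked so a machine without a visible GPU silently keeps the executor default. The same probe, factored into a free function as a sketch (the helper name is hypothetical):

    // Sketch of the probe above as a reusable helper; assumes nvmlInit_v2()
    // has already succeeded, as InitializeBackend() guarantees here.
    #include <optional>
    #include <utility>
    #include <nvml.h>

    std::optional<std::pair<int, int>> GetCudaComputeCapability(unsigned int index) {
        nvmlDevice_t device;
        if (nvmlDeviceGetHandleByIndex_v2(index, &device) != NVML_SUCCESS)
            return std::nullopt;                 // no device at this index / no driver

        int major = 0, minor = 0;
        if (nvmlDeviceGetCudaComputeCapability(device, &major, &minor) != NVML_SUCCESS)
            return std::nullopt;

        return std::make_pair(major, minor);     // e.g. (8, 0) => sm_80 (Ampere)
    }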
@@ -65,7 +74,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         std::optional<uint32_t> nTopTokens
 ) {
     spdlog::debug(
-            "Submitting inference over {:d} tokens to the executor {:d}",
+            FMT_STRING("Submitting inference over {:d} tokens to the executor {:d}"),
             tokens.size(),
             executor.getLatestIterationStats().back().numActiveRequests
     );
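Wrapping the literal in FMT_STRING turns a runtime-parsed format string into one that {fmt} validates at compile time, so a placeholder/argument mismatch becomes a build error instead of a fmt::format_error thrown mid-request. A minimal sketch of the difference:

    #include <cstddef>
    #include <fmt/format.h>

    int main() {
        // Checked when compiling: "{:d}" must receive an integral argument.
        fmt::print(FMT_STRING("Submitting inference over {:d} tokens\n"), std::size_t{128});
        // fmt::print(FMT_STRING("{:d}\n"), "not a number");  // would not compile
        return 0;
    }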
@@ -92,7 +101,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
 }
 
 std::vector<tle::Response> huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType reqId) {
-    SPDLOG_DEBUG("Polling request {:d}", reqId);
+    SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), reqId);
     const auto responses = executor.awaitResponses(reqId);
     return responses;
 }
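The same FMT_STRING treatment applies to SPDLOG_DEBUG, which additionally compiles out entirely when SPDLOG_ACTIVE_LEVEL is above debug, keeping the polling hot path free of logging overhead in release builds. A sketch, assuming spdlog's bundled {fmt}:

    // SPDLOG_ACTIVE_LEVEL must be defined before including spdlog.h for the
    // macro body to survive preprocessing; set_level() is the runtime filter.
    #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
    #include <spdlog/spdlog.h>

    int main() {
        spdlog::set_level(spdlog::level::debug);
        SPDLOG_DEBUG(FMT_STRING("Polling request {:d}"), 42u);
        return 0;
    }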