diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h
index 83e862c5..2864021e 100644
--- a/backends/trtllm/include/backend.h
+++ b/backends/trtllm/include/backend.h
@@ -72,12 +72,6 @@ namespace huggingface::tgi::backends {
                 const std::filesystem::path &executorWorker
         );
 
-        /**
-         * Query the executor for the number of token available for pulling
-         * @return
-         */
-        [[nodiscard]] size_t NumResponsesReady() const;
-
         /**
          * Submit a new generation task to the executor
          * @param tokens
diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp
index 96f5f9f4..e2e0cbea 100644
--- a/backends/trtllm/lib/backend.cpp
+++ b/backends/trtllm/lib/backend.cpp
@@ -9,13 +9,13 @@
 #include "hardware.h"
 
 void huggingface::tgi::backends::InitializeBackend() {
-    if(const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")){
+    if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
         std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
-        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+        std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
             return std::tolower(c);
         });
 
-        if(log_level == "debug")
+        if (log_level == "debug")
             spdlog::set_level(spdlog::level::debug);
         else
             spdlog::set_level(spdlog::level::info);
@@ -102,17 +102,6 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend(
     SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref<const std::string &>());
 }
 
-[[nodiscard("Returned number of requests needs to be consumed")]]
-size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const {
-    const auto numResponses = executor.getNumResponsesReady();
-
-#ifndef NDEBUG
-    if(numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses);
-#endif
-
-    return numResponses;
-}
-
 [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]]
 tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
         const std::vector<tle::TokenIdType> &tokens,
@@ -138,10 +127,11 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit(
 
 #ifndef NDEBUG
     SPDLOG_INFO(
-            FMT_STRING("Sampling config: topK={:d}, topP={:d}, temperature={:d}, repetition_penalty={:d}, frequency_penalty={:d}, seed={:d}"),
-            topK, topP, temperature, repetition_penalty, frequency_penalty, seed
+            FMT_STRING(
+                    "Sampling config: topK={:d}, topP={:d}, temperature={:d}, repetition_penalty={:d}, frequency_penalty={:d}, seed={:d}"),
+            topK, topP, temperature, repetition_penalty, frequency_penalty, seed
     )
-    SPDLOG_INFO(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
+    SPDLOG_INFO(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked);
 #endif
 
     const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed);