diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index 16f45f5d..96f5f9f4 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -106,7 +106,7 @@ huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { const auto numResponses = executor.getNumResponsesReady(); -#ifdef NDEBUG +#ifndef NDEBUG if(numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses); #endif @@ -124,13 +124,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const float_t frequency_penalty, const uint64_t seed ) { -#ifdef NDEBUG - SPDLOG_DEBUG( - FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"), - tokens.size(), - executor.getLatestIterationStats().back().numActiveRequests - ); -#else +#ifndef NDEBUG SPDLOG_DEBUG( FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"), fmt::join(tokens, ", "), @@ -142,7 +136,7 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const auto maxNewTokensChecked = static_cast( std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size()))); -#ifdef NDEBUG +#ifndef NDEBUG SPDLOG_INFO( FMT_STRING("Sampling config: topK={:d}, topP={:d}, temperature={:d}, repetition_penalty={:d}, frequency_penalty={:d}, seed={:d}"), topK, topP, temperature, repetition_penalty, frequency_penalty, seed @@ -156,4 +150,4 @@ tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { return executor.awaitResponses(); -} +} \ No newline at end of file diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index 54c17bc4..80e74cf7 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -40,7 +40,9 @@ huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { auto steps = 
std::make_unique>(); steps->reserve(responses.size()); +#ifndef NDEBUG SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses.size()); +#endif // Transform tle::Response to GenerationStep std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) {