feat(backend): more impl
parent df99164dc1
commit fd7e2b5bbd
@@ -48,7 +48,7 @@ namespace huggingface::tgi::backends::trtllm {
    }

    std::expected<request_id_t, backend_exception_t>
-   backend_t::submit(std::span<tle::TokenIdType> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept {
+   backend_t::submit(const std::span<tle::TokenIdType> token_ids, const generation_params_t generation_params, const sampling_params_t sampling_params) noexcept {
        SPDLOG_DEBUG("Submitting {:d} tokens to the executor for scheduling ({}, {})", token_ids.size(), generation_params, sampling_params);
        return executor_.enqueueRequest(tle::Request {
                {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
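For illustration only (not part of the commit): `submit` hands back a `std::expected`, so a caller has to check for a value before touching the request id. A minimal caller sketch, assuming `backend.hpp` from this diff is on the include path; the function name `schedule_or_log` is purely illustrative:

#include <span>
#include <vector>

#include <spdlog/spdlog.h>

#include <backend.hpp>

namespace trtllm = huggingface::tgi::backends::trtllm;

// Submit a prompt and log either the assigned request id or the failure.
void schedule_or_log(trtllm::backend_t &backend,
                     std::vector<tle::TokenIdType> &prompt,
                     trtllm::generation_params_t generation_params,
                     trtllm::sampling_params_t sampling_params) {
    const auto maybe_request_id = backend.submit(prompt, generation_params, sampling_params);
    if (maybe_request_id.has_value()) {
        SPDLOG_INFO("Request {:d} scheduled on the executor", *maybe_request_id);
    } else {
        SPDLOG_ERROR("Executor refused the request");  // error payload type is backend-defined
    }
}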
@@ -7,7 +7,9 @@
#include <span>

#include <nlohmann/json.hpp>
#include <spdlog/spdlog.h>
#include <spdlog/fmt/fmt.h>

#include <tensorrt_llm/executor/executor.h>

#include <hardware.hpp>
@@ -58,7 +60,8 @@ namespace huggingface::tgi::backends::trtllm {
    };

    /**
     *
     * Represent possible values from transformers generation `generation_config.json`.
     * It usually stores default sampling parameters to use, such as top_p, temperature, etc.
     */
    struct generation_config_t {
        float_t top_p;
@@ -69,15 +72,18 @@ namespace huggingface::tgi::backends::trtllm {
            top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
            if (config.contains("/eos_token_id"_json_pointer) && config["/eos_token_id"_json_pointer].is_array()) {
                const auto& eos_token_id = config["eos_token_id"];
-               std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](int32_t token_id) {
-                   stop_words.push_back({token_id});
+               std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
+                   stop_words.emplace_back(token_id.template get<int32_t>());
                });

                SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
            }
        }
    };

    /**
     *
     * Helper class representing various items which are stored within the TensorRT-LLM engines folder and
     * can be retrieved at runtime
     */
    class backend_workspace_t {
    private:
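For illustration only (not part of the commit): the `generation_config_t` constructor above reads default sampling values and the `eos_token_id` stop words out of `generation_config.json`. A standalone nlohmann::json sketch of the same traversal, with made-up JSON content and a plain key lookup in place of the `_json_pointer` form used above:

#include <cstdint>
#include <vector>

#include <nlohmann/json.hpp>

int main() {
    // Minimal stand-in for the contents of generation_config.json
    const nlohmann::json config = nlohmann::json::parse(
        R"({"top_p": 0.9, "temperature": 0.6, "eos_token_id": [1, 2]})");

    const auto top_p = config.value("top_p", 1.0f);              // 0.9f
    const auto temperature = config.value("temperature", 1.0f);  // 0.6f

    std::vector<int32_t> stop_words;
    if (config.contains("eos_token_id") && config["eos_token_id"].is_array()) {
        for (const auto &token_id : config["eos_token_id"])
            stop_words.push_back(token_id.get<int32_t>());       // stop_words == {1, 2}
    }
    return (top_p < 1.0f && temperature < 1.0f && stop_words.size() == 2) ? 0 : 1;
}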
@@ -111,32 +117,41 @@ namespace huggingface::tgi::backends::trtllm {
        [[nodiscard]] std::filesystem::path engines_folder() const { return engines_folder_; }

        /**
-        *
-        * @return
+        * Hugging Face transformers' generated `generation_config_t` mapping information stored in the
+        * `generation_config.json` holding default generation parameters.
+        * @return `generation_config_t`
         */
        [[nodiscard]] const generation_config_t& generation_config() const { return generation_config_; }

-       /**
-        *
-        * @return
+       /**
+        * Factory method returning new `tensorrt_llm::executor::ParallelConfig` instance used
+        * to initialize `tensorrt_llm::executor::Executor` with multi-instance communication information
+        * @return `tensorrt_llm::executor::ParallelConfig` instance
         */
        [[nodiscard]] constexpr tle::ParallelConfig parallel_config() const;

        /**
-        *
-        * @return
+        * Factory method returning new `tensorrt_llm::executor::ExecutorConfig` instance used
+        * to initialize `tensorrt_llm::executor::Executor`
+        * @return `tensorrt_llm::executor::ExecutorConfig` instance
         */
        [[nodiscard]] constexpr tle::ExecutorConfig executor_config() const;
    };


    /**
-    *
+    * Error raised by the underlying backend implementation
     */
-   class backend_exception_t: std::exception {};
+   enum backend_error_t {
+       EXECUTOR_NOT_READY = 3,
+       EXECUTOR_SCHEDULING_FAILED = 4,
+   };


    /**
-    *
+    * Actual TensorRT-LLM backend implementation interacting with TensorRT-LLM Executor service to
+    * - schedule new request
+    * - pull status of submitted request(s)
+    * - cancel submitted request(s)
     */
    class backend_t {
    private:
@@ -156,7 +171,7 @@ namespace huggingface::tgi::backends::trtllm {
         * @return Either newly submitted request's id or the error why it failed to submit
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
-       std::expected<request_id_t, backend_exception_t>
+       std::expected<request_id_t, backend_error_t>
        submit(std::span<token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;

        /**
@@ -188,15 +203,18 @@ namespace huggingface::tgi::backends::trtllm {
    };
}

+/**
+ * Helper structures to define formatting strategies for various types in the backend
+ */
template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::generation_params_t>: formatter<string_view> {
-   auto format(huggingface::tgi::backends::trtllm::generation_params_t c, format_context& ctx) const -> format_context::iterator {
-       return format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens);
+   auto format(huggingface::tgi::backends::trtllm::generation_params_t const& c, format_context& ctx) const -> format_context::iterator {
+       return fmt::format_to(ctx.out(), "generation_params_t{{ max_new_tokens={:d} }}", c.max_new_tokens);
    }
};

template <> struct fmt::formatter<huggingface::tgi::backends::trtllm::sampling_params_t>: formatter<string_view> {
-   auto format(huggingface::tgi::backends::trtllm::sampling_params_t c, format_context& ctx) const -> format_context::iterator {
-       return format_to(
+   auto format(huggingface::tgi::backends::trtllm::sampling_params_t const& c, format_context& ctx) const -> format_context::iterator {
+       return fmt::format_to(
            ctx.out(),
            "sampling_params_t{{ top_k={:d}, top_p={:.3f}, repetition_penalty={:.3f}, frequency_penalty={:.3f}, length_penalty={:.3f}, temperature={:.3f}, seed={:d} }}",
            c.top_k, c.top_p, c.repetition_penalty, c.frequency_penalty, c.length_penalty, c.temperature, c.seed
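For illustration only (not part of the commit): with the `fmt::formatter` specializations above, both parameter structs can be handed straight to fmt/spdlog calls. A usage sketch, assuming this header is included; the function name and the log messages are illustrative:

#include <spdlog/spdlog.h>
#include <spdlog/fmt/fmt.h>

#include <backend.hpp>

namespace trtllm = huggingface::tgi::backends::trtllm;

void log_request_parameters(const trtllm::generation_params_t &generation_params,
                            const trtllm::sampling_params_t &sampling_params) {
    // Renders through the specializations above, e.g. "generation_params_t{ max_new_tokens=128 }"
    SPDLOG_INFO("Submitting request with {} and {}", generation_params, sampling_params);

    // Plain fmt::format goes through the same formatter
    const auto rendered = fmt::format("{}", sampling_params);
    SPDLOG_DEBUG("{}", rendered);
}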
@@ -1,5 +1,8 @@
#include <memory>
#include <thread>

#include <tensorrt_llm/common/tllmException.h>
#include <tensorrt_llm/plugins/api/tllmPlugin.h>

namespace rust::behavior {
    template<typename Try, typename Fail>
@@ -17,13 +20,15 @@ namespace rust::behavior {
#include <backend.hpp>

namespace huggingface::tgi::backends::trtllm {
+   std::once_flag backend_initialized_flag;
+
    class tensorrt_llm_backend_t {
    private:
        backend_t inner_;

    public:
        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
-           : inner_(engine_folder) {}
+           : inner_(engine_folder, executor_worker_path) {}

        size_t num_tokens_ready() const noexcept {
            return inner_.num_tokens_ready();
@@ -64,7 +69,46 @@ namespace huggingface::tgi::backends::trtllm {
        }
    };

-   std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
-       return std::make_unique<tensorrt_llm_backend_t>(engines_folder);
+   void initialize_logging() {
+#ifndef TGI_TRTLLM_BACKEND_DEBUG
+       if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) {
+           std::string log_level(TRTLLM_LOG_LEVEL_CSTR);
+           std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) {
+               return std::tolower(c);
+           });
+
+           if (log_level == "debug")
+               spdlog::set_level(spdlog::level::debug);
+           else
+               spdlog::set_level(spdlog::level::info);
+       }
+#else
+       spdlog::set_level(spdlog::level::debug);
+#endif
+   }
+
+   void initialize_tensorrt_llm_backend() {
+       SPDLOG_INFO("Initializing TGI - TensorRT-LLM Backend (v{})", tle::version());
+
+       // Initialize everyone
+       initialize_logging();
+       nvmlInit_v2();
+       initTrtLlmPlugins();
+
+       const auto numGpus = huggingface::tgi::hardware::cuda::get_device_count();
+       if (numGpus.has_value()) {
+           SPDLOG_INFO("[FFI] Detected {:d} Nvidia GPU(s)", numGpus.value());
+       } else {
+           SPDLOG_WARN("[FFI] Failed to detect Nvidia GPU(s) on the system");
+           // todo: throw
+       }
+   }
+
+   std::unique_ptr<tensorrt_llm_backend_t> create_backend_from_engine_folder(rust::Str engines_folder, rust::Str executor_worker_path) {
+       std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
+       return std::make_unique<tensorrt_llm_backend_t>(
+           std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()), std::filesystem::path::format::auto_format),
+           std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()), std::filesystem::path::format::auto_format)
+       );
    }
}
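For illustration only (not part of the commit): `create_backend_from_engine_folder` guards global setup with `std::call_once`, so logging, NVML and the TensorRT-LLM plugins are initialized exactly once no matter how many backends are created. A self-contained sketch of that pattern, with purely illustrative names:

#include <cstdio>
#include <mutex>
#include <thread>

static std::once_flag example_initialized_flag;

static void initialize_once() {
    std::puts("initialized");  // printed a single time overall
}

static void create_example_backend() {
    // Every caller funnels through call_once; only the first one runs initialize_once.
    std::call_once(example_initialized_flag, initialize_once);
}

int main() {
    std::thread a(create_example_backend), b(create_example_backend);
    a.join();
    b.join();
    create_example_backend();  // already initialized, nothing happens
    return 0;
}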
@@ -1,3 +1,4 @@
#pragma once
#include <cstdint>
#include <optional>