feat(backend): initial rewrite of the backend for simplicity

2024-11-19 00:17:35 +01:00 · 2024-11-19 00:17:35 +01:00 · f24e9fa2b9
parent a80c346f72
commit f24e9fa2b9
3 changed files with 198 additions and 0 deletions
--- a/backends/trtllm/csrc/backend.cpp
+++ b/backends/trtllm/csrc/backend.cpp
@ -0,0 +1,38 @@
 #include <ranges>
 #include <utility>
 #include "backend.hpp"
 #include <spdlog/spdlog.h>
 namespace huggingface::tgi::backends::trtllm {
    size_t backend_t::num_tokens_ready() const noexcept {
        return executor_.getNumResponsesReady();
    }
    std::expected<request_id_t, backend_exception_t>
    backend_t::submit(std::span<tle::TokenIdType> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept {
        SPDLOG_DEBUG(FMT_STRING("Submitting {:d} tokens to the executor for scheduling"), token_ids.size());
        return executor_.enqueueRequest(tle::Request {
                {token_ids.begin(), token_ids.end()},  // Making actual copy of the tokens
                static_cast<tle::SizeType32>(generation_params.max_new_tokens),
                true,
                (tle::SamplingConfig) sampling_params,
                tle::OutputConfig { /* returnLogProbs= */ true },
                std::nullopt,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                stop_words_
        });
    }
    std::vector<tle::Response> backend_t::pull_tokens() noexcept {
        return executor_.awaitResponses();
    }
    void backend_t::cancel(request_id_t request_id) noexcept {
        SPDLOG_INFO(FMT_STRING("Cancelling request: {:d}"), request_id);
        executor_.cancelRequest(request_id);
    }
 }
--- a/backends/trtllm/csrc/backend.hpp
+++ b/backends/trtllm/csrc/backend.hpp
@ -0,0 +1,100 @@
 #include <cmath>
 #include <cstdint>
 #include <exception>
 #include <expected>
 #include <list>
 #include <span>
 #include <tensorrt_llm/executor/executor.h>
 namespace huggingface::tgi::backends::trtllm {
    namespace tle = tensorrt_llm::executor;
    using request_id_t = uint32_t;
    using token_id_t = tle::TokenIdType;
    /**
     * Represent the parameters used for generation
     */
    struct generation_params_t {
        uint32_t max_new_tokens;
    };
    /**
     * Represent the parameters used to sample tokens from the logit distribution
     */
    struct sampling_params_t {
        uint32_t top_k;
        float_t top_p;
        float_t repetition_penalty;
        float_t frequency_penalty;
        float_t length_penalty;
        float_t temperature;
        uint64_t seed;
        explicit operator tle::SamplingConfig() const {
            return tle::SamplingConfig {
                1,
                top_k,
                top_p,
                std::nullopt,
                std::nullopt,
                std::nullopt,
                seed,
                temperature,
                std::nullopt,
                std::nullopt,
                repetition_penalty,
                std::nullopt,
                frequency_penalty,
                length_penalty
            };
        }
    };
    /**
     *
     */
    class backend_exception_t: std::exception  {};
    /**
     *
     */
    class backend_t {
    private:
        tle::Executor executor_;
        std::list<std::vector<int32_t>> stop_words_;
    public:
        /**
         * Submit a new request to the executor
         * @param token_ids
         * @param generation_params
         * @param sampling_params
         * @return Either newly submitted request's id or the error why it failed to submit
         */
        [[nodiscard("Discarded executor request_id needs to be assigned")]]
        std::expected<request_id_t, backend_exception_t>
        submit(std::span<token_id_t> token_ids, generation_params_t generation_params, sampling_params_t sampling_params) noexcept;
        /**
         * Query the number of tokens available across all in-flight generations
         * @return
         */
        [[nodiscard("Pulling out the number of tokens")]]
        size_t num_tokens_ready() const noexcept;
        /**
         * Pull out newly generated tokens from the executor
         * @return
         */
        [[nodiscard("")]]
        std::vector<tle::Response> pull_tokens() noexcept;
        /**
         * Cancel the specified request on the executor' set
         * @param request_id Request's Identifier to remove from the in-flight executor
         */
        void cancel(request_id_t) noexcept;
    };
 }
--- a/backends/trtllm/csrc/ffi.hpp
+++ b/backends/trtllm/csrc/ffi.hpp
@ -0,0 +1,60 @@
 #include <tensorrt_llm/common/tllmException.h>
 namespace rust::behavior {
    template<typename Try, typename Fail>
    static void trycatch(Try &&func, Fail &&fail) noexcept try {
        func();
    } catch (tensorrt_llm::common::TllmException &e) {
        fail(e.what());
    }
 }
 #include <backend.hpp>
 namespace huggingface::tgi::backends::trtllm {
    class tensorrt_llm_backend_t {
    private:
        backend_t inner_;
    public:
        tensorrt_llm_backend_t(std::filesystem::path &engine_folder): inner_(engine_folder) {}
        size_t num_tokens_ready() const noexcept {
            return inner_.num_tokens_ready();
        }
        request_id_t submit(
                rust::Slice<const uint32_t> tokens,
                uint32_t max_new_tokens,
                uint32_t top_k,
                float_t top_p,
                float_t temperature,
                float_t repetition_penalty,
                float_t frequency_penalty,
                uint64_t seed
        ) {
            // Submit the request to the executor and get back a potential request_id used to track request status
            const auto maybe_request_id = inner_.submit(
                {tokens_.data(), tokens.size()},
                {max_new_tokens},
                {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
            );
            // If we do have a value, let's return the request_id
            if(maybe_request_id.has_value()) [[likely]] {
                return *maybe_request_id;
            } else {
            }
        }
        void cancel(request_id_t requestId) noexcept {
            SPDLOG
            inner_.cancel(requestId);
        }
    };
 }