hf_text-generation-inference/backends/llamacpp/csrc/backend.hpp

//
// Created by Morgan Funtowicz on 9/28/2024.
//
#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#include <atomic>
#include <cmath>
#include <expected>
#include <filesystem>
#include <functional>
#include <queue>
#include <memory>
#include <optional>
#include <span>
#include <stop_token>
#include <vector>
#include <llama.h>
#include <thread>
#define LLAMA_SUCCESS(x) ((x) == 0)
namespace huggingface::tgi::backends::llamacpp {
static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); };
typedef std::unique_ptr<llama_context, decltype(llama_context_deleter)> llama_context_ptr;
static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); };
typedef std::unique_ptr<llama_sampler, decltype(llama_sampler_deleter)> llama_sampler_ptr;
typedef std::function<bool(llama_token, float_t, bool, size_t)> llama_decode_callback;
static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; };
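
    // Illustrative sketch (not part of the original header): a llama_decode_callback that logs every
    // generated token and requests an early stop after 32 of them. The parameter meanings are assumed
    // from the typedef above (token id, its log-probability, end-of-sequence flag, number of tokens
    // generated so far), and the boolean return value is assumed to act as a stop request, mirroring
    // llama_void_callback which always returns false.
    //
    //     const llama_decode_callback log_and_stop_after_32 =
    //         [](llama_token token, float_t logprob, bool is_final, size_t n_generated) -> bool {
    //             std::printf("token=%d logprob=%f final=%d n=%zu\n", token, logprob, is_final, n_generated);
    //             return is_final || n_generated >= 32;
    //         };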
    /**
     * Error codes the backend can report back to the caller
     */
enum backend_error_t : uint8_t {
MODEL_FILE_DOESNT_EXIST = 1
};
    /**
     * Sampling parameters controlling how the next token is drawn from the logits
     */
struct sampling_params_t {
uint32_t top_k = std::numeric_limits<decltype(top_k)>::max();
float_t top_p = 1.0f;
float_t frequency_penalty = 0.0f;
float_t repetition_penalty = 0.0f;
uint64_t seed = 2014;
        /**
         * Convert these sampling parameters to the respective llama_sampler structure
         * @param pModel Pointer to the model data
         * @return Owning pointer to a llama_sampler configured from these parameters
         */
        llama_sampler_ptr into_llama_sampler(const llama_model *pModel) const;
};
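
    // Usage sketch (illustrative): populate the sampling parameters and turn them into an owning
    // llama_sampler. `model` is assumed to be a valid llama_model* loaded elsewhere through the
    // llama.cpp C API.
    //
    //     sampling_params_t sampling;
    //     sampling.top_k = 40;
    //     sampling.top_p = 0.95f;
    //     sampling.repetition_penalty = 1.1f;
    //     sampling.seed = 42;
    //     llama_sampler_ptr sampler = sampling.into_llama_sampler(model);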
    /**
     * Generation parameters bounding how long a generation request runs
     */
struct generation_params_t {
uint32_t max_new_tokens = std::numeric_limits<uint32_t>::max();
bool ignore_eos_token = false;
};
struct generation_context_t {
generation_params_t generation_params;
sampling_params_t sampling_params;
std::span<const llama_token> input_tokens;
};
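
    // Illustrative sketch: generation_context_t only bundles the two parameter structs with a
    // non-owning std::span over the prompt, so the caller must keep the token buffer alive for the
    // duration of the request. `tokenize` is a hypothetical tokenization step, not part of this header.
    //
    //     std::vector<llama_token> prompt = tokenize(text);  // hypothetical helper
    //     const generation_context_t request{
    //         generation_params_t{.max_new_tokens = 128},
    //         sampling_params_t{},
    //         std::span<const llama_token>(prompt)
    //     };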
    /**
     * Worker holding a shared reference to the model and running the actual decoding loop
     */
class worker_t {
private:
const std::shared_ptr<llama_model> mModel_;
const llama_context_params mParams_;
public:
        /**
         * Create a worker bound to a model and the parameters used to create its contexts
         * @param model Shared pointer to the model used for decoding
         * @param params Parameters used when creating a llama_context for this worker
         */
worker_t(std::shared_ptr<llama_model> model, const llama_context_params &params);
        /**
         * Run a single generation request to completion on the provided context
         * @param context llama_context to decode with
         * @param generation_context Prompt tokens plus generation and sampling parameters
         * @param callback Optional callback invoked for every generated token
         * @return Number of tokens generated
         */
size_t
generate(llama_context *, const generation_context_t &, const std::optional<llama_decode_callback> &) const;
        /**
         * Continuously pull requests from the backlog and generate until a stop is requested
         * @param driver Stop source used to terminate the loop
         * @param backlog Queue of pending generation requests
         */
void loop(std::stop_source &driver, std::queue<generation_context_t> &backlog) const;
};
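
    // Illustrative sketch: loop() is meant to be driven from its own thread, draining the backlog
    // until the stop source is triggered. `worker` is assumed to be an already constructed worker_t;
    // synchronization around the shared queue is elided here and left to the caller.
    //
    //     std::queue<generation_context_t> backlog;
    //     std::stop_source driver;
    //     std::thread runner([&] { worker.loop(driver, backlog); });
    //     // ... push generation_context_t items into backlog ...
    //     driver.request_stop();
    //     runner.join();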
class backend_base_t {
protected:
std::shared_ptr<llama_model> mModel_;
public:
        /**
         * Create a backend from an already loaded llama.cpp model
         * @param model Pointer to the llama_model to generate from
         */
explicit backend_base_t(llama_model *model);
/**
* Destructor
*/
        virtual ~backend_base_t();
        /**
         * Generate a full sequence of tokens for the provided prompt, blocking until done
         * @param tokens Prompt tokens to start generation from
         * @param generation_params Parameters bounding the generation (max new tokens, EOS handling)
         * @param sampling_params Parameters controlling how each new token is sampled
         * @param callback Optional callback invoked for every generated token
         * @return Generated tokens on success, a backend_error_t otherwise
         */
[[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]]
std::expected<std::vector<llama_token>, backend_error_t> generate(
std::span<const llama_token> tokens,
const generation_params_t &generation_params,
const sampling_params_t &sampling_params,
const std::optional<llama_decode_callback> &callback = std::nullopt
);
        /**
         * Generate tokens for the provided prompt, streaming each one back through the callback
         * @param tokens Prompt tokens to start generation from
         * @param generation_params Parameters bounding the generation (max new tokens, EOS handling)
         * @param sampling_params Parameters controlling how each new token is sampled
         * @param callback Callback invoked for every generated token
         * @return Number of generated tokens on success, a backend_error_t otherwise
         */
[[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]]
virtual std::expected<size_t, backend_error_t> stream(
std::span<const llama_token> tokens,
const generation_params_t &generation_params,
const sampling_params_t &sampling_params,
const llama_decode_callback &callback
) = 0;
};
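
    // Usage sketch (illustrative): the blocking generate() overload returns the generated tokens
    // wrapped in std::expected, so failures surface as a backend_error_t value rather than an
    // exception. `backend` is assumed to be a concrete backend instance and `prompt` a tokenized input.
    //
    //     const auto result = backend.generate(prompt, generation_params_t{.max_new_tokens = 64},
    //                                          sampling_params_t{});
    //     if (result.has_value()) {
    //         for (const llama_token token : *result) { /* detokenize / forward downstream */ }
    //     } else {
    //         /* handle result.error(), e.g. backend_error_t::MODEL_FILE_DOESNT_EXIST */
    //     }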
class single_worker_backend_t : backend_base_t {
private:
constexpr static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr {
auto llParams = llama_context_default_params();
llParams.flash_attn = true;
llParams.n_batch = 1;
llParams.n_threads = 1;
llParams.no_perf = true;
llParams.attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL;
return {llama_new_context_with_model(pModel, llParams), llama_context_deleter};
};
llama_context_ptr mContext_;
worker_t mWorker_;
public:
explicit single_worker_backend_t(llama_model *pModel, const std::optional<llama_context_params> &);
using backend_base_t::generate;
std::expected<size_t, backend_error_t> stream(
std::span<const llama_token> tokens,
const generation_params_t &generation_params,
const sampling_params_t &sampling_params,
const llama_decode_callback &callback) override;
};
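
    // Usage sketch (illustrative): build a single-worker backend from a model loaded through the
    // llama.cpp C API, then stream tokens through a callback. Error handling around the model load
    // is elided, the file path is a placeholder, and `prompt` is assumed to be a tokenized input.
    //
    //     llama_model *model = llama_load_model_from_file("/path/to/model.gguf",
    //                                                     llama_model_default_params());
    //     single_worker_backend_t backend(model, std::nullopt);
    //     const auto n_streamed = backend.stream(
    //         prompt, generation_params_t{}, sampling_params_t{},
    //         [](llama_token token, float_t, bool, size_t) -> bool {
    //             /* forward the token to the client */
    //             return false;  // mirrors llama_void_callback
    //         });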
class multi_worker_backend_t : backend_base_t {
private:
llama_context_ptr mContext_;
public:
using backend_base_t::generate;
std::expected<size_t, backend_error_t> stream(
std::span<const llama_token> tokens,
const generation_params_t &generation_params,
const sampling_params_t &sampling_params,
const llama_decode_callback &callback) override;
};
}
#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP