//
// Created by Morgan Funtowicz on 9/28/2024.
//

#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

#include <atomic>
#include <cmath>
#include <expected>
#include <filesystem>
#include <functional>
#include <memory>
#include <optional>
#include <queue>
#include <span>
#include <stop_token>
#include <thread>
#include <vector>

#include <llama.h>

#define LLAMA_SUCCESS(x) ((x) == 0)

namespace huggingface::tgi::backends::llamacpp {

    static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); };
    typedef std::unique_ptr<llama_context, decltype(llama_context_deleter)> llama_context_ptr;

    static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); };
    typedef std::unique_ptr<llama_sampler, decltype(llama_sampler_deleter)> llama_sampler_ptr;
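
    // Illustrative sketch only, not part of the backend API: the RAII typedefs above can take
    // ownership of a freshly allocated llama.cpp handle so that llama_free() is guaranteed to run.
    // llama_new_context_with_model() and llama_context_default_params() come from llama.h;
    // error handling is intentionally elided.
    inline llama_context_ptr make_example_context(llama_model *model) {
        auto params = llama_context_default_params();
        return llama_context_ptr(llama_new_context_with_model(model, params));
    }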

    typedef std::function<bool(llama_token, float_t, bool, size_t)> llama_decode_callback;
    static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; };
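
    // Illustrative sketch only, not part of the backend API: a callback matching
    // llama_decode_callback. The arguments are assumed to be the newly sampled token id, its
    // log-probability, an end-of-generation flag and the number of tokens generated so far;
    // returning true is assumed to request an early stop (llama_void_callback above returns false).
    static constexpr auto llama_example_callback =
            [](llama_token, float_t, bool, size_t n_generated) -> bool {
                // Arbitrary illustrative policy: stop once 32 tokens have been produced.
                return n_generated >= 32;
            };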

    /**
     * Represents an error which can be returned as part of an std::expected
     */
    enum backend_error_t : uint8_t {
        // Provided model filepath doesn't exist
        MODEL_FILE_DOESNT_EXIST = 1
    };

    /**
     * Holds all the parameters provided by TGI to sample from the final distribution of tokens
     */
    struct sampling_params_t {
        uint32_t top_k = std::numeric_limits<decltype(top_k)>::max();
        float_t top_p = 1.0f;
        float_t frequency_penalty = 0.0f;
        float_t repetition_penalty = 0.0f;
        uint64_t seed = 2014;

        /**
         * Convert these sampling parameters to the corresponding llama_sampler structure
         * @param pModel Pointer to the model data
         * @return Owning pointer to the configured llama_sampler
         */
        llama_sampler_ptr into_llama_sampler(const llama_model *pModel) const;
    };
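
    // Illustrative usage sketch only: fill the sampling parameters as TGI would from an incoming
    // request, then build the owned llama.cpp sampler through into_llama_sampler(). The values
    // below are arbitrary and `model` is assumed to be a previously loaded llama_model.
    inline llama_sampler_ptr make_example_sampler(const llama_model *model) {
        sampling_params_t params;
        params.top_k = 40;
        params.top_p = 0.95f;
        params.repetition_penalty = 1.1f;
        params.seed = 42;
        return params.into_llama_sampler(model);
    }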

    /**
     * Holds all the parameters provided by TGI to control the generation process
     */
    struct generation_params_t {
        uint32_t max_new_tokens = std::numeric_limits<uint32_t>::max();
        bool ignore_eos_token = false;
    };

    /**
     * Container structure wrapping the current generation context, composed of:
     * - a non-owning view over the prompt tokens
     * - the sampling parameters
     * - the generation parameters
     */
    struct generation_context_t {
        generation_params_t generation_params;
        sampling_params_t sampling_params;
        std::span<const llama_token> input_tokens;
    };
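
    // Illustrative usage sketch only: assemble a generation_context_t over an already tokenized
    // prompt. input_tokens is a non-owning std::span, so `prompt` must outlive the generation call.
    // The parameter values are arbitrary.
    inline generation_context_t make_example_generation_context(const std::vector<llama_token> &prompt) {
        generation_params_t generation_params;
        generation_params.max_new_tokens = 128;

        sampling_params_t sampling_params;
        sampling_params.top_p = 0.9f;

        return {generation_params, sampling_params, std::span<const llama_token>(prompt)};
    }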

    /**
     * Represents the actual model execution (i.e. "forward") and generation loop for llama.cpp
     */
    class worker_t {
    private:
        std::shared_ptr<llama_model> model_;
        llama_context_ptr context_;

    public:
        /**
         * Create a new llama.cpp worker from the provided llama_model and the context parameters
         * @param model Previously allocated `llama_model` holding the weights of the neural network
         * @param params Parameters used to allocate the execution context of the model
         */
        worker_t(std::shared_ptr<llama_model> model, const llama_context_params &&params);

        /**
         * Generate multiple successive tokens, sampled from the distribution produced by executing a forward pass
         * over the neural network operations and matrices
         * @param generation_context The generation context holding sampling and generation parameters along with the prompt tokens
         * @param callback An optional callback function invoked every time a new token is sampled
         * @return The number of generated tokens on success, or a backend_error_t describing the failure
         */
        [[nodiscard]] std::expected<size_t, backend_error_t>
        generate(const generation_context_t &generation_context,
                 const std::optional<llama_decode_callback> &callback) const;
    };
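
    // Illustrative usage sketch only (model loading and error handling elided): create a worker
    // over an already loaded model and run a single generation, streaming token ids through a
    // callback. llama_context_default_params() comes from llama.h; the helper name is hypothetical.
    inline std::expected<size_t, backend_error_t>
    run_example_generation(std::shared_ptr<llama_model> model, const std::vector<llama_token> &prompt) {
        const worker_t worker(std::move(model), llama_context_default_params());

        const generation_context_t context{generation_params_t{}, sampling_params_t{},
                                           std::span<const llama_token>(prompt)};

        // Returning false from the callback keeps the generation loop running.
        const llama_decode_callback on_new_token =
                [](llama_token, float_t, bool, size_t) -> bool { return false; };

        return worker.generate(context, on_new_token);
    }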
}

#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP