//
// Created by Morgan Funtowicz on 9/28/2024.
//
#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

#include <cmath>
#include <cstdint>
#include <expected>
#include <functional>
#include <limits>
#include <memory>
#include <optional>
#include <span>

#include <llama.h>

#define LLAMA_SUCCESS(x) x == 0

namespace huggingface::tgi::backends::llamacpp {

    static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); };
    typedef std::unique_ptr<llama_context, decltype(llama_context_deleter)> llama_context_ptr;

    static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); };
    typedef std::unique_ptr<llama_sampler, decltype(llama_sampler_deleter)> llama_sampler_ptr;

    typedef std::function<bool(llama_token, float_t, bool, size_t)> llama_decode_callback;
    static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; };

    /**
     * Represent an error which can be returned as part of an std::expected
     */
    enum backend_error_t : uint8_t {
        // Provided model filepath doesn't exist
        MODEL_FILE_DOESNT_EXIST = 1
    };

    /**
     * Hold all the parameters provided by TGI to sample from the final distribution of tokens
     */
    struct sampling_params_t {
        uint32_t top_k = std::numeric_limits<uint32_t>::max();
        float_t top_p = 1.0f;
        float_t frequency_penalty = 0.0f;
        float_t repetition_penalty = 0.0f;
        uint64_t seed = 2014;

        /**
         * Convert these sampling parameters to the corresponding llama_sampler structure
         * @param pModel Pointer to the model data
         * @return Owning pointer to the configured llama_sampler
         */
        llama_sampler_ptr into_llama_sampler(const llama_model *pModel) const;
    };

    /**
     * Hold all the parameters provided by TGI to control the generation process
     */
    struct generation_params_t {
        uint32_t max_new_tokens = std::numeric_limits<uint32_t>::max();
        bool ignore_eos_token = false;
    };

    /**
     * Container structure wrapping up the current generation context, composed of:
     * - a non-owning view over the prompt tokens
     * - the sampling parameters
     * - the generation parameters
     */
    struct generation_context_t {
        generation_params_t generation_params;
        sampling_params_t sampling_params;
        std::span<const llama_token> input_tokens;
    };

    /**
     * Represent the actual model execution (i.e. "forward") and generation loop for llama.cpp
     */
    class worker_t {
    private:
        std::shared_ptr<llama_model> model_;
        llama_context_ptr context_;

    public:
        /**
         * Create a new llama.cpp worker from the provided llama_model and the context parameters
         * @param model Previously allocated `llama_model` holding the weights of the neural network
         * @param params Parameters to allocate the execution context of the model
         */
        worker_t(std::shared_ptr<llama_model>, const llama_context_params &&);

        /**
         * Generate multiple successive tokens, sampled from the distribution produced by executing a forward pass
         * over the neural network operations and matrices
         * @param generation_context The generation context holding sampling and generation parameters along with the prompt tokens
         * @param callback An optional callback function called every time a new token is sampled
         * @return Number of generated tokens on success, or a backend_error_t describing the failure
         */
        [[nodiscard]] std::expected<size_t, backend_error_t>
        generate(const generation_context_t &, const std::optional<llama_decode_callback> &) const;
    };
}

#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
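
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the public API): how a caller
// might drive worker_t, assuming the weights were loaded beforehand through
// the llama.cpp C API and the prompt was already tokenized. The variable
// `prompt_tokens` and the early-stopping meaning of the callback's boolean
// return value are assumptions made for the sake of the example.
//
//   using namespace huggingface::tgi::backends::llamacpp;
//
//   // Load the weights once and share ownership with the worker
//   auto *raw_model = llama_load_model_from_file("model.gguf", llama_model_default_params());
//   auto model = std::shared_ptr<llama_model>(raw_model, llama_free_model);
//
//   worker_t worker(model, llama_context_default_params());
//
//   std::vector<llama_token> prompt_tokens = /* previously tokenized prompt */;
//   generation_context_t ctx{generation_params_t{}, sampling_params_t{}, prompt_tokens};
//
//   const auto n_generated = worker.generate(ctx, [](llama_token, float_t, bool is_eos, size_t n_decoded) {
//       return is_eos || n_decoded >= 32; // assumed: returning true asks the loop to stop
//   });
// ---------------------------------------------------------------------------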