//
// Created by Morgan Funtowicz on 9/28/2024.
//
#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

#include <cmath>
#include <cstdint>
#include <expected>
#include <functional>
#include <limits>
#include <memory>
#include <optional>
#include <span>

#include <llama.h>

#define LLAMA_SUCCESS(x) x == 0

namespace huggingface::tgi::backends::llamacpp {

    static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); };
    typedef std::unique_ptr<llama_context, decltype(llama_context_deleter)> llama_context_ptr;

    static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); };
    typedef std::unique_ptr<llama_sampler, decltype(llama_sampler_deleter)> llama_sampler_ptr;

    typedef std::function<bool(llama_token, float_t, bool, size_t)> llama_decode_callback;
    static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; };

    /**
     * Represent an error which can be returned as part of an std::expected
     */
    enum backend_error_t : uint8_t {
        // Provided model filepath doesn't exist
        MODEL_FILE_DOESNT_EXIST = 1
    };

    /**
     * Hold all the parameters provided by TGI to sample from the final distribution of tokens
     */
    struct sampling_params_t {
        uint32_t top_k = std::numeric_limits<uint32_t>::max();
        float_t top_p = 1.0f;
        float_t frequency_penalty = 0.0f;
        float_t repetition_penalty = 0.0f;
        uint64_t seed = 2014;

        /**
         * Convert these sampling parameters to the corresponding llama_sampler structure
         * @param pModel Pointer to the model data
         * @return Owning pointer to the configured llama_sampler
         */
        llama_sampler_ptr into_llama_sampler(const llama_model *pModel) const;
    };

    /**
     * Hold all the parameters provided by TGI to control the generation process
     */
    struct generation_params_t {
        uint32_t max_new_tokens = std::numeric_limits<uint32_t>::max();
        bool ignore_eos_token = false;
    };

    /**
     * Container structure wrapping up the current generation context, composed of:
     * - a non-owning view over the prompt tokens
     * - the sampling parameters
     * - the generation parameters
     */
    struct generation_context_t {
        generation_params_t generation_params;
        sampling_params_t sampling_params;
        std::span<const llama_token> input_tokens;
    };

    /**
     * Represent the actual model execution (i.e. "forward") and generation loop for llama.cpp
     */
    class worker_t {
    private:
        std::shared_ptr<llama_model> model_;
        llama_context_ptr context_;

    public:
        /**
         * Create a new llama.cpp worker from the provided llama_model and the context parameters
         * @param model Previously allocated `llama_model` holding the weights of the neural network
         * @param params Parameters to allocate the execution context of the model
         */
        worker_t(std::shared_ptr<llama_model>, const llama_context_params &&);

        /**
         * Generate multiple successive tokens, sampled from the distribution produced by executing a forward pass
         * over the neural network operations and matrices
         * @param generation_context The generation context holding sampling and generation parameters along with the prompt tokens
         * @param callback An optional callback function called every time a new token is sampled
         * @return Number of generated tokens on success, or a backend_error_t describing the failure
         */
        [[nodiscard]] std::expected<size_t, backend_error_t>
        generate(const generation_context_t &, const std::optional<llama_decode_callback> &) const;
    };
}

#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
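
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the public API): how a caller
// might drive worker_t, assuming the weights were loaded beforehand through
// the llama.cpp C API and the prompt was already tokenized. The variable
// `prompt_tokens` and the early-stopping meaning of the callback's boolean
// return value are assumptions made for the sake of the example.
//
//   using namespace huggingface::tgi::backends::llamacpp;
//
//   // Load the weights once and share ownership with the worker
//   auto *raw_model = llama_load_model_from_file("model.gguf", llama_model_default_params());
//   auto model = std::shared_ptr<llama_model>(raw_model, llama_free_model);
//
//   worker_t worker(model, llama_context_default_params());
//
//   std::vector<llama_token> prompt_tokens = /* previously tokenized prompt */;
//   generation_context_t ctx{generation_params_t{}, sampling_params_t{}, prompt_tokens};
//
//   const auto n_generated = worker.generate(ctx, [](llama_token, float_t, bool is_eos, size_t n_decoded) {
//       return is_eos || n_decoded >= 32; // assumed: returning true asks the loop to stop
//   });
// ---------------------------------------------------------------------------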