//
// Created by Morgan Funtowicz on 9/28/2024.
//
#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

#include <cmath>
#include <cstdint>
#include <expected>
#include <functional>
#include <limits>
#include <memory>
#include <optional>
#include <span>

#include <llama.h>

#define LLAMA_SUCCESS(x) ((x) == 0)

namespace huggingface::tgi::backends::llamacpp {
    static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); };
    typedef std::unique_ptr<llama_context, decltype(llama_context_deleter)> llama_context_ptr;

    static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); };
    typedef std::unique_ptr<llama_sampler, decltype(llama_sampler_deleter)> llama_sampler_ptr;

    typedef std::function<bool(llama_token, float_t, bool, size_t)> llama_decode_callback;
    static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; };

    /**
     * Errors the backend can surface to its callers.
     */
    enum backend_error_t : uint8_t {
        MODEL_FILE_DOESNT_EXIST = 1
    };

    /**
     * Sampling parameters forwarded to the llama.cpp sampler chain.
     */
    struct sampling_params_t {
        uint32_t top_k = std::numeric_limits<uint32_t>::max();
        float_t top_p = 1.0f;
        float_t frequency_penalty = 0.0f;
        float_t repetition_penalty = 0.0f;
        uint64_t seed = 2014;

        /**
         * Convert this sampling_params_t to the corresponding llama_sampler chain.
         * @param pModel Pointer to the model data
         * @return Owned pointer to the configured llama_sampler
         */
        llama_sampler_ptr into_llama_sampler(const llama_model *pModel) const;
    };

    /**
     * Parameters controlling the generation loop.
     */
    struct generation_params_t {
        uint32_t max_new_tokens = std::numeric_limits<uint32_t>::max();
        bool ignore_eos_token = false;
    };

    /**
     * Everything required to run a single generation: loop parameters,
     * sampling parameters and the prompt tokens.
     */
    struct generation_context_t {
        generation_params_t generation_params;
        sampling_params_t sampling_params;
        std::span<const llama_token> input_tokens;
    };

    /**
     * Worker holding a llama_context bound to a shared llama_model and
     * running the decoding loop.
     */
    class worker_t {
    private:
        std::shared_ptr<llama_model> model_;
        llama_context_ptr context_;

    public:
        /**
         * Create a worker from a shared model and the context parameters to use.
         * @param model Shared pointer to the loaded llama_model
         * @param params llama.cpp context parameters for this worker's llama_context
         */
        worker_t(std::shared_ptr<llama_model>, const llama_context_params &&);

        /**
         * Run the generation loop for the provided context.
         * @param generation_context Generation and sampling parameters plus input tokens
         * @param callback Optional callback invoked for every generated token
         * @return Number of generated tokens, or a backend_error_t on failure
         */
        [[nodiscard]] std::expected<size_t, backend_error_t>
        generate(const generation_context_t &, const std::optional<llama_decode_callback> &) const;
    };
}

#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
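
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the header). The llama.h entry points
// referenced below (llama_backend_init, llama_load_model_from_file,
// llama_model_default_params, llama_context_default_params, llama_free_model,
// llama_backend_free) are assumed and may be named differently across
// llama.cpp revisions; worker_t's definition lives in the backend's .cpp file.
//
//   #include "backend.hpp"
//   using namespace huggingface::tgi::backends::llamacpp;
//
//   int main() {
//       llama_backend_init();
//
//       // Load the GGUF model once and share it across workers.
//       auto *raw_model = llama_load_model_from_file("model.gguf", llama_model_default_params());
//       auto model = std::shared_ptr<llama_model>(raw_model, llama_free_model);
//
//       // One worker owns one llama_context bound to the shared model.
//       worker_t worker(model, llama_context_default_params());
//
//       const std::vector<llama_token> prompt = {/* tokenized prompt */};
//       const generation_context_t ctx{
//           generation_params_t{.max_new_tokens = 32},
//           sampling_params_t{.top_k = 40, .top_p = 0.95f, .seed = 42},
//           std::span<const llama_token>(prompt),
//       };
//
//       // The callback sees (token, logprob, is_eos, n_generated);
//       // returning true stops the generation loop early.
//       const auto n_generated = worker.generate(ctx, [](llama_token, float_t, bool is_eos, size_t) {
//           return is_eos;
//       });
//
//       llama_backend_free();
//       return n_generated.has_value() ? 0 : 1;
//   }
// ---------------------------------------------------------------------------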