//
// Created by Morgan Funtowicz on 9/28/2024.
//

#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

#include <atomic>
#include <cmath>
#include <expected>
#include <filesystem>
#include <functional>
#include <memory>
#include <optional>
#include <queue>
#include <span>
#include <stop_token>
#include <thread>
#include <vector>

#include <llama.h>

#define LLAMA_SUCCESS(x) ((x) == 0)

namespace huggingface::tgi::backends::llamacpp {

    static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); };
    typedef std::unique_ptr<llama_context, decltype(llama_context_deleter)> llama_context_ptr;

    static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); };
    typedef std::unique_ptr<llama_sampler, decltype(llama_sampler_deleter)> llama_sampler_ptr;
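
    // Illustrative sketch only, not part of the backend API: the RAII typedefs above can take
    // ownership of a freshly allocated llama.cpp handle so that llama_free() is guaranteed to run.
    // llama_new_context_with_model() and llama_context_default_params() come from llama.h;
    // error handling is intentionally elided.
    inline llama_context_ptr make_example_context(llama_model *model) {
        auto params = llama_context_default_params();
        return llama_context_ptr(llama_new_context_with_model(model, params));
    }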

    typedef std::function<bool(llama_token, float_t, bool, size_t)> llama_decode_callback;
    static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; };
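
    // Illustrative sketch only, not part of the backend API: a callback matching
    // llama_decode_callback. The arguments are assumed to be the newly sampled token id, its
    // log-probability, an end-of-generation flag and the number of tokens generated so far;
    // returning true is assumed to request an early stop (llama_void_callback above returns false).
    static constexpr auto llama_example_callback =
            [](llama_token, float_t, bool, size_t n_generated) -> bool {
                // Arbitrary illustrative policy: stop once 32 tokens have been produced.
                return n_generated >= 32;
            };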

    /**
     * Represents an error which can be returned as part of an std::expected
     */
    enum backend_error_t : uint8_t {
        // Provided model filepath doesn't exist
        MODEL_FILE_DOESNT_EXIST = 1
    };

    /**
     * Holds all the parameters provided by TGI to sample from the final distribution of tokens
     */
    struct sampling_params_t {
        uint32_t top_k = std::numeric_limits<decltype(top_k)>::max();
        float_t top_p = 1.0f;
        float_t frequency_penalty = 0.0f;
        float_t repetition_penalty = 0.0f;
        uint64_t seed = 2014;

        /**
         * Convert these sampling parameters to the corresponding llama_sampler structure
         * @param pModel Pointer to the model data
         * @return Owning pointer to the configured llama_sampler
         */
        llama_sampler_ptr into_llama_sampler(const llama_model *pModel) const;
    };
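
    // Illustrative usage sketch only: fill the sampling parameters as TGI would from an incoming
    // request, then build the owned llama.cpp sampler through into_llama_sampler(). The values
    // below are arbitrary and `model` is assumed to be a previously loaded llama_model.
    inline llama_sampler_ptr make_example_sampler(const llama_model *model) {
        sampling_params_t params;
        params.top_k = 40;
        params.top_p = 0.95f;
        params.repetition_penalty = 1.1f;
        params.seed = 42;
        return params.into_llama_sampler(model);
    }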

    /**
     * Holds all the parameters provided by TGI to control the generation process
     */
    struct generation_params_t {
        uint32_t max_new_tokens = std::numeric_limits<uint32_t>::max();
        bool ignore_eos_token = false;
    };

    /**
     * Container structure wrapping the current generation context, composed of:
     * - a non-owning view over the prompt tokens
     * - the sampling parameters
     * - the generation parameters
     */
    struct generation_context_t {
        generation_params_t generation_params;
        sampling_params_t sampling_params;
        std::span<const llama_token> input_tokens;
    };
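
    // Illustrative usage sketch only: assemble a generation_context_t over an already tokenized
    // prompt. input_tokens is a non-owning std::span, so `prompt` must outlive the generation call.
    // The parameter values are arbitrary.
    inline generation_context_t make_example_generation_context(const std::vector<llama_token> &prompt) {
        generation_params_t generation_params;
        generation_params.max_new_tokens = 128;

        sampling_params_t sampling_params;
        sampling_params.top_p = 0.9f;

        return {generation_params, sampling_params, std::span<const llama_token>(prompt)};
    }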

    /**
     * Represents the actual model execution (i.e. "forward") and generation loop for llama.cpp
     */
    class worker_t {
    private:
        std::shared_ptr<llama_model> model_;
        llama_context_ptr context_;

    public:
        /**
         * Create a new llama.cpp worker from the provided llama_model and the context parameters
         * @param model Previously allocated `llama_model` holding the weights of the neural network
         * @param params Parameters used to allocate the execution context of the model
         */
        worker_t(std::shared_ptr<llama_model> model, const llama_context_params &&params);

        /**
         * Generate multiple successive tokens, sampled from the distribution produced by executing a forward pass
         * over the neural network operations and matrices
         * @param generation_context The generation context holding sampling and generation parameters along with the prompt tokens
         * @param callback An optional callback function invoked every time a new token is sampled
         * @return The number of generated tokens on success, or a backend_error_t describing the failure
         */
        [[nodiscard]] std::expected<size_t, backend_error_t>
        generate(const generation_context_t &generation_context,
                 const std::optional<llama_decode_callback> &callback) const;
    };
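
    // Illustrative usage sketch only (model loading and error handling elided): create a worker
    // over an already loaded model and run a single generation, streaming token ids through a
    // callback. llama_context_default_params() comes from llama.h; the helper name is hypothetical.
    inline std::expected<size_t, backend_error_t>
    run_example_generation(std::shared_ptr<llama_model> model, const std::vector<llama_token> &prompt) {
        const worker_t worker(std::move(model), llama_context_default_params());

        const generation_context_t context{generation_params_t{}, sampling_params_t{},
                                           std::span<const llama_token>(prompt)};

        // Returning false from the callback keeps the generation loop running.
        const llama_decode_callback on_new_token =
                [](llama_token, float_t, bool, size_t) -> bool { return false; };

        return worker.generate(context, on_new_token);
    }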
}

#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP