misc(doc): c++ documentation
parent 4ee2ee58c9
commit b9c04b9c07
@@ -33,14 +33,15 @@ namespace huggingface::tgi::backends::llamacpp {
     static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; };
 
     /**
-     *
+     * Represent an error which can be returned as part of an std::expected
      */
     enum backend_error_t : uint8_t {
+        // Provided model filepath doesn't exist
         MODEL_FILE_DOESNT_EXIST = 1
     };
 
     /**
-     *
+     * Hold all the parameters provided by TGI to sample from the final distribution of tokens
      */
     struct sampling_params_t {
         uint32_t top_k = std::numeric_limits<decltype(top_k)>::max();
@@ -58,13 +59,19 @@ namespace huggingface::tgi::backends::llamacpp {
     };
 
     /**
-     *
+     * Hold all the parameters provided by TGI to control the generation process
      */
     struct generation_params_t {
         uint32_t max_new_tokens = std::numeric_limits<uint32_t>::max();
         bool ignore_eos_token = false;
     };
 
+    /**
+     * Container structure wrapping up the current generation context, composed of:
+     * - a non-owning view over the prompt tokens
+     * - the sampling parameters
+     * - the generation parameters
+     */
     struct generation_context_t {
         generation_params_t generation_params;
         sampling_params_t sampling_params;
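Although not part of the diff, here is a minimal sketch of how these parameter structures might be filled in; the member of `generation_context_t` that holds the non-owning token view is elided from this excerpt, so it is only mentioned in a comment:

    // Illustrative only; the values are placeholders, not defaults used by TGI.
    generation_params_t generation_params;
    generation_params.max_new_tokens = 128;      // override the default of uint32_t max
    generation_params.ignore_eos_token = false;  // stop when the model emits its end-of-sequence token

    sampling_params_t sampling_params;
    sampling_params.top_k = 40;                  // override the default of std::numeric_limits<uint32_t>::max()

    generation_context_t ctx{generation_params, sampling_params};
    // The remaining member (the non-owning view over the prompt tokens) is not shown in this excerpt,
    // so it is left value-initialized here.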
@@ -72,7 +79,7 @@ namespace huggingface::tgi::backends::llamacpp {
     };
 
     /**
-     *
+     * Represent the actual model execution (i.e. "forward") and generation loop for llama.cpp
      */
     class worker_t {
     private:
@@ -81,17 +88,17 @@ namespace huggingface::tgi::backends::llamacpp {
 
     public:
         /**
-         *
-         * @param model
-         * @param params
+         * Create a new llama.cpp worker from the provided llama_model and the context parameters
+         * @param model Previously allocated `llama_model` holding the weights of the neural network
+         * @param params Parameters to allocate the execution context of the model
          */
         worker_t(std::shared_ptr<llama_model>, const llama_context_params &&);
 
         /**
-         *
-         * @param context
-         * @param generation_context
-         * @param callback
+         * Generate multiple successive tokens, sampled from the distribution generated by executing a forward pass
+         * over the neural network operations and matrices
+         * @param generation_context The generation context holding sampling and generation parameters along with prompt tokens
+         * @param callback An optional callback function which would be called every time a new token is sampled
          */
         [[nodiscard]] std::expected<size_t, backend_error_t>
         generate(const generation_context_t &, const std::optional<llama_decode_callback> &) const;
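A hedged usage sketch for `worker_t` (not taken from the repository); it assumes `llama_decode_callback` is a `std::function` with the same call shape as `llama_void_callback` above, and that `generation_context_t` was populated as in the earlier sketch:

    // Illustrative only: load a model through the llama.cpp C API and run one generation.
    std::shared_ptr<llama_model> model(
        llama_load_model_from_file("/path/to/model.gguf", llama_model_default_params()),
        llama_free_model);

    llama_context_params params = llama_context_default_params();
    params.n_threads = 4;
    const worker_t worker(model, std::move(params));

    // Optional per-token hook; returning true requests early termination of the generation loop.
    const llama_decode_callback on_token =
        [](llama_token token, float_t logprob, bool is_eos, size_t n_generated) -> bool {
            SPDLOG_DEBUG("token={} logprob={} eos={} generated={}", token, logprob, is_eos, n_generated);
            return false;
        };

    generation_context_t ctx{};  // populated as in the previous sketch
    if (const auto n_tokens = worker.generate(ctx, on_token); n_tokens.has_value()) {
        SPDLOG_INFO("generated {} token(s)", *n_tokens);
    } else if (n_tokens.error() == backend_error_t::MODEL_FILE_DOESNT_EXIST) {
        SPDLOG_ERROR("model file was not found");
    }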
@@ -35,11 +35,18 @@ namespace huggingface::tgi::backends::llamacpp {
 
 namespace huggingface::tgi::backends::llamacpp {
 
+    /**
+     * Smart pointer to drop a llama_model when going out of scope
+     */
     auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); };
     auto make_shared_llama_model = [](llama_model *model) {
         return std::shared_ptr<llama_model>(model, llama_model_deleter);
     };
 
+    /**
+     * llama.cpp backend specific exception mapped from `backend_exception_t` to throw at the FFI level and
+     * allow automatic implementation of Result<_, Exception> from C++ to Rust
+     */
     class llama_cpp_backend_exception_t : std::exception {};
 
     /**
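A small illustration (not from the repository) of the ownership pattern `make_shared_llama_model` enables: every copy of the returned `shared_ptr` shares the custom deleter, so `llama_free_model` runs exactly once when the last owner goes away:

    // Illustrative only; the model path is a placeholder.
    llama_model *raw = llama_load_model_from_file("/path/to/model.gguf", llama_model_default_params());
    if (raw != nullptr) {
        auto model = make_shared_llama_model(raw);  // ref-count = 1, deleter = llama_model_deleter
        auto alias = model;                         // ref-count = 2, still one underlying llama_model
    }  // both owners out of scope -> llama_free_model(raw) is invoked exactly once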
@@ -51,9 +58,29 @@ namespace huggingface::tgi::backends::llamacpp {
         worker_t worker_;
 
     public:
+        /**
+         * Create a new llama.cpp worker frontend allowing custom Rust FFI types from the CXX crate to be mapped across the C++ boundary
+         * @param model The `llama_model` to use on the worker
+         * @param num_threads The number of threads the worker is allowed to spawn across for its threadpool
+         */
         explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads):
             model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {}
 
+        /**
+         * Generate a new set of tokens from the provided `input_tokens`, streaming each individual token generated
+         * through the `callback`.
+         * Individual tokens are generated using the sampling parameters provided through `sampling_params` and the
+         * generation parameters, provided through `generation_params`, which define the behaviour of the generation loop.
+         * `ctx` is an opaque structure defined on the Rust side which holds stream information to send tokens back to the originating client.
+         * @param input_tokens Prompt input tokens originating from the tokenization of the request's text input
+         * @param generation_params Parameters controlling the generation loop such as ignoring the end of sentence token or
+         * the maximum number of tokens to generate
+         * @param sampling_params Parameters controlling the sampling process on the final token distribution
+         * @param ctx Opaque structure from Rust holding the HTTP channel to stream the response back to the client
+         * @param callback Function pointer called every time a new token is generated during the generation loop.
+         * If this callback returns `true` it signals an early termination request on the Rust side.
+         * @return Number of generated tokens
+         */
         size_t stream(
             rust::Slice<const uint32_t> input_tokens,
             const generation_params_t generation_params,
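The early-termination contract documented for `callback` can be sketched as follows (assuming its call shape matches `llama_void_callback` from the backend header; the exact FFI callback type is not shown in this diff):

    // Sketch of an early-stopping token callback: returning true asks the generation loop to stop.
    auto stop_early = [](llama_token token, float_t logprob, bool is_eos, size_t n_generated) -> bool {
        (void) token; (void) logprob;
        return is_eos || n_generated >= 32;  // stop on end-of-sequence or after 32 sampled tokens
    };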
@@ -88,6 +115,12 @@ namespace huggingface::tgi::backends::llamacpp {
         }
     };
 
+    /**
+     * Utility method to allocate a new worker frontend from Rust
+     * @param modelPath The GGUF model path as a UTF-8 string from Rust
+     * @param num_threads Integer greater than zero representing the number of threads the worker is allowed to use for computations
+     * @return unique ownership of the `llama_cpp_worker_frontend_t` pointer
+     */
     std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath, uint32_t num_threads) {
 #ifdef TGI_LLAMACPP_BACKEND_DEBUG
         spdlog::set_level(spdlog::level::debug);
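A hedged sketch of calling this factory directly from C++, e.g. in a test (in TGI it is normally reached from Rust through the CXX bridge); constructing `rust::Str` from a C string is part of the CXX C++ API:

    // Illustrative only; the model path is a placeholder.
    rust::Str path("/path/to/model.gguf");
    auto frontend = create_worker_frontend(path, /* num_threads = */ 8);
    // `frontend` uniquely owns the worker frontend; destroying it releases the wrapped llama_model
    // through the deleter installed by make_shared_llama_model.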
@@ -108,9 +141,16 @@ namespace huggingface::tgi::backends::llamacpp {
         return std::make_unique<llama_cpp_worker_frontend_t>(model, static_cast<int32_t>(num_threads));
     }
 
+    /**
+     * Smart pointer to automatically destroy the underlying `bitmask *` cpumask when going out of scope
+     */
     struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }};
     typedef std::unique_ptr<struct bitmask, numa_cpumask_deleter> unique_cpumask_ptr;
 
+    /**
+     * Define the NUMA core and memory affinity for the current thread by binding cores and memory to respective NUMA node(s)
+     * @param affinity The set of allowed execution cores to inform the scheduler for the current thread
+     */
     void set_numa_core_affinity(rust::Slice<const size_t> affinity) {
         // void set_numactl_core_affinity(std::vector<size_t> affinity) {
 #ifdef NUMA_AVAILABLE
@@ -175,7 +215,7 @@ namespace huggingface::tgi::backends::llamacpp {
     }
 
     /**
-     *
+     * Force an update of the llama.cpp/ggml threadpool, reading from the NUMA core affinity
      */
     void update_numa_affinity() {
         SPDLOG_INFO("Rebinding NUMA affinity for current worker on thread: {}", std::this_thread::get_id());
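A hedged sketch of how a worker thread might combine the two NUMA helpers above (the core indices are illustrative, not taken from the repository):

    // Pin the current worker thread to two cores, then refresh the llama.cpp/ggml threadpool binding.
    const std::array<size_t, 2> cores = {0, 1};
    set_numa_core_affinity(rust::Slice<const size_t>(cores.data(), cores.size()));
    update_numa_affinity();  // re-reads the core affinity so the ggml threads follow it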