diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 039d4eac..0e1a13ac 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -33,14 +33,15 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; }; /** - * + * Represent an error which can be returned as part of an std::expected */ enum backend_error_t : uint8_t { + // Provided model filepath doesn't exist MODEL_FILE_DOESNT_EXIST = 1 }; /** - * + * Hold all the parameters provided by TGI to sample from the final distribution of tokens */ struct sampling_params_t { uint32_t top_k = std::numeric_limits::max(); @@ -58,13 +59,19 @@ namespace huggingface::tgi::backends::llamacpp { }; /** - * + * Hold all the parameters provided by TGI to control the generation process */ struct generation_params_t { uint32_t max_new_tokens = std::numeric_limits::max(); bool ignore_eos_token = false; }; + /** + * Container structure wrapping up the current generation context composed of: + * - a non-owning view over the prompt tokens + * - the sampling parameters + * - the generation parameters + */ struct generation_context_t { generation_params_t generation_params; sampling_params_t sampling_params; @@ -72,7 +79,7 @@ namespace huggingface::tgi::backends::llamacpp { }; /** - * + * Represent the actual model execution (i.e. 
"forward") and generation loop for llama.cpp */ class worker_t { private: @@ -81,17 +88,17 @@ namespace huggingface::tgi::backends::llamacpp { public: /** - * - * @param model - * @param params + * Create a new llama.cpp worker from the provided llama_model and the context parameters + * @param model Previously allocated `llama_model` holding the weights of the neural network + * @param params Parameters to allocate the execution context of the model */ worker_t(std::shared_ptr, const llama_context_params &&); /** - * - * @param context - * @param generation_context - * @param callback + * Generate multiple successive tokens, sampled from the distribution generated by executing a forward pass + * over the neural network operations and matrices + * @param generation_context The generation context holding sampling and generation parameters along with prompt tokens + * @param callback An optional callback function which is called every time a new token is sampled */ [[nodiscard]] std::expected generate(const generation_context_t &, const std::optional &) const; diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index d33a4c7b..36455263 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -35,11 +35,18 @@ namespace huggingface::tgi::backends::llamacpp { namespace huggingface::tgi::backends::llamacpp { + /** + * Custom deleter releasing a `llama_model` through `llama_free_model` once its owning smart pointer goes out of scope + */ auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); }; auto make_shared_llama_model = [](llama_model *model) { return std::shared_ptr(model, llama_model_deleter); }; + /** + * llama.cpp backend specific exception mapped from `backend_error_t` to throw at the FFI level and + * allow automatic implementation of Result<_, Exception> from C++ to Rust + */ class llama_cpp_backend_exception_t : std::exception {}; /** @@ -51,9 +58,29 @@ namespace huggingface::tgi::backends::llamacpp { worker_t worker_; public: + /** 
+ * Create a new llama.cpp worker frontend which maps custom Rust FFI types from the CXX crate across the C++ boundary + * @param model The `llama_model` to use on the worker + * @param num_threads The number of threads the worker is allowed to spawn for its threadpool + */ explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads): model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {} + /** + * Generate a new set of tokens from the provided `input_tokens`, streaming each individual token generated + * through the `callback`. + * Individual tokens are generated using the sampling parameters provided through `sampling_params` and the + * generation parameters provided through `generation_params`, which define the behaviour of the generation loop. + * `ctx` is an opaque structure defined on the Rust side which holds stream information to send tokens back to the originating client. + * @param input_tokens Prompt input tokens originating from the tokenization of the request's text input + * @param generation_params Parameters controlling the generation loop such as ignoring the end of sentence token or + * the maximum number of tokens to generate + * @param sampling_params Parameters controlling the sampling process on the final token distribution + * @param ctx Opaque structure from Rust holding the HTTP channel used to stream the response back to the client + * @param callback Function pointer called every time a new token is generated during the generation loop. + * If this callback returns `true` it signals an early termination request on the Rust side. 
+ * @return Number of generated tokens + */ size_t stream( rust::Slice input_tokens, const generation_params_t generation_params, @@ -88,6 +115,12 @@ namespace huggingface::tgi::backends::llamacpp { } }; + /** + * Utility function to allocate a new worker frontend from Rust + * @param modelPath The GGUF model path as a UTF-8 string from Rust + * @param num_threads Integer greater than zero representing the number of threads the worker is allowed to use for computations + * @return Uniquely-owned pointer to the newly created `llama_cpp_worker_frontend_t` + */ std::unique_ptr create_worker_frontend(rust::Str modelPath, uint32_t num_threads) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG spdlog::set_level(spdlog::level::debug); @@ -108,9 +141,16 @@ namespace huggingface::tgi::backends::llamacpp { return std::make_unique(model, static_cast(num_threads)); } + /** + * Deleter and smart pointer alias automatically freeing the underlying NUMA `struct bitmask *` when going out of scope + */ struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; typedef std::unique_ptr unique_cpumask_ptr; + /** + * Define the NUMA core and memory affinity for the current thread by binding cores and memory to respective NUMA node(s) + * @param affinity The set of allowed execution cores to inform the scheduler for the current thread + */ void set_numa_core_affinity(rust::Slice affinity) { // void set_numactl_core_affinity(std::vector affinity) { #ifdef NUMA_AVAILABLE @@ -175,7 +215,7 @@ namespace huggingface::tgi::backends::llamacpp { } /** - * + * Force an update of the llama.cpp/ggml threadpool, reading from the NUMA cores affinity */ void update_numa_affinity() { SPDLOG_INFO("Rebinding NUMA affinity for current worker on thread: {}", std::this_thread::get_id());