hf_text-generation-inference/backends/llamacpp/csrc/ffi.hpp

//
// Created by mfuntowicz on 10/23/24.
//

#ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP
#define TGI_LLAMA_CPP_BACKEND_FFI_HPP

#include <cstdint>
#include <exception>
#include <filesystem>
#include <memory>
#include <ranges>
#include <string_view>
#include <thread>

#include <spdlog/spdlog.h>
#include <spdlog/fmt/ranges.h>
#include <spdlog/fmt/std.h>

#ifdef NUMA_AVAILABLE
#define CURRENT_THREAD 0
#include <algorithm>
#include <unordered_set>
#include <numa.h>
#endif

// Forward declaration of the frontend type, made before the project headers below are included;
// the full definition follows later in this file.
// NOTE(review): presumably required so the cxx-generated "lib.rs.h" can name the type before it
// is defined -- confirm against the generated bridge header.
namespace huggingface::tgi::backends::llamacpp {
    class llama_cpp_worker_frontend_t;
}

#include "backend.hpp"
#include "backends/llamacpp/src/lib.rs.h"
#include "rust/cxx.h"


namespace huggingface::tgi::backends::llamacpp {

    // Deleter handing a llama_model back to llama.cpp through llama_free_model().
    // Declared `inline const` because this is a header: a plain namespace-scope variable would be
    // defined (with external linkage) in every translation unit including this file.
    inline const auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); };

    // Wraps a raw llama_model pointer into a std::shared_ptr which releases the model with
    // llama_model_deleter once the last owner goes away.
    inline const auto make_shared_llama_model = [](llama_model *model) {
        return std::shared_ptr<llama_model>(model, llama_model_deleter);
    };

    /**
     * Exception thrown by the llama.cpp backend when an operation fails.
     *
     * The base class is inherited *publicly*: with the `class` keyword an omitted access specifier
     * means private inheritance, in which case a `catch (const std::exception &)` handler does not
     * match this type and the exception escapes generic handlers.
     */
    class llama_cpp_backend_exception_t : public std::exception {
    public:
        /** @return a static, human readable description of the failure */
        [[nodiscard]] const char *what() const noexcept override {
            return "llama.cpp backend error";
        }
    };

    /**
     * Llama.cpp frontend over the worker interfacing with Rust FFI layer
     */
    class llama_cpp_worker_frontend_t {
    private:
        std::shared_ptr<llama_model> model_;
        worker_t worker_;

    public:
        explicit llama_cpp_worker_frontend_t(llama_model *model):
            model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {}

        size_t stream(
                rust::Slice<const uint32_t> input_tokens,
                const generation_params_t generation_params,
                const sampling_params_t &sampling_params,
                InferContext *ctx,
                rust::Fn<bool(InferContext *, uint32_t, float_t, bool, size_t)> callback
        ) {
            // Wrapper around the provided Rust callback to inject the InferContext when returning from the C++ FFI boundaries
            // It captures the context (ctx) using reference and will automatically call the Rust callback forwarding the InferContext
            auto context_forwarding_callback =
                    [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool {
                return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens);
            };

            // Ask the compiler to create view over Rust slice transmuting from uint32_t* to llama_token*
            static auto as_llama_token = [](const uint32_t x){ return static_cast<llama_token>(x); };

#ifdef __cpp_lib_ranges_to_container
            auto input_tokens_v = input_tokens | std::views::transform(as_llama_token) | std::ranges::to<std::vector>();
#else
            auto input_tokens_ = input_tokens | std::views::transform(as_llama_token);
            auto input_tokens_v = std::vector<llama_token>(input_tokens_.begin(), input_tokens_.end());
#endif

            // Defer the generation to the actual worker_t
            const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v};
            if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] {
                return *result;
            } else {
                throw llama_cpp_backend_exception_t {};
            }
        }
    };

    /**
     * Load a llama.cpp model from `modelPath` and wrap it into a worker frontend.
     *
     * The first call also initialises llama.cpp's NUMA handling with the numactl strategy.
     * When TGI_LLAMACPP_BACKEND_DEBUG is defined the spdlog level is switched to debug.
     *
     * @param modelPath Path of the model file, as provided by Rust
     * @return The frontend owning the freshly loaded model
     * @throws llama_cpp_backend_exception_t if llama.cpp fails to load the model
     *         (NOTE(review): the Rust declaration must return a Result for this to surface as an
     *         error rather than terminating -- confirm in the cxx bridge)
     */
    std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {
#ifdef TGI_LLAMACPP_BACKEND_DEBUG
        spdlog::set_level(spdlog::level::debug);
#endif

        // Initialize the numa context from numactl.
        // A function-local static is initialised only once, whatever the number of calls; the flag
        // itself is never read, hence [[maybe_unused]].
        [[maybe_unused]] static const bool INITIALIZED_NUMA_CONTEXT_ONCE = [](){
            llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL);
            return true;
        }();

        // Allocate model weights parameters
        auto params = llama_model_default_params();
        params.use_mmap = true;

        // Allocate the model from the Rust provided, string path.
        // The path is copied into a std::string to obtain the null-terminated buffer llama.cpp expects.
        const auto model_path = static_cast<std::string>(modelPath);
        auto *model = llama_load_model_from_file(model_path.c_str(), params);

        // A null pointer means the load failed: never hand it over to the frontend
        if(model == nullptr) [[unlikely]] {
            SPDLOG_ERROR("Failed to load model from {}", model_path);
            throw llama_cpp_backend_exception_t {};
        }

        return std::make_unique<llama_cpp_worker_frontend_t>(model);
    }

    // libnuma helpers, only compiled when NUMA support is enabled: numa_free_cpumask() is declared
    // by <numa.h>, which this header includes under NUMA_AVAILABLE only.
#ifdef NUMA_AVAILABLE
    /** Deleter releasing a cpumask with numa_free_cpumask() */
    struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask) const { numa_free_cpumask(cpumask); }};

    /** Owning pointer over a libnuma cpumask, freed automatically when going out of scope */
    using unique_cpumask_ptr = std::unique_ptr<struct bitmask, numa_cpumask_deleter>;
#endif

    /**
     * Pin the calling thread on the provided CPU cores and, when all these cores belong to a single
     * NUMA node, set this node as the preferred one for memory allocations.
     *
     * Without NUMA_AVAILABLE at compile time, or when libnuma reports NUMA as unavailable at
     * runtime, no affinity is changed (a warning is logged in the compile-time case).
     *
     * @param affinity Identifiers of the CPU cores the current thread is allowed to run on
     */
    void set_numactl_core_affinity(rust::Slice<const size_t> affinity) {
#ifdef NUMA_AVAILABLE
        // numa_available() returns -1 when the NUMA API cannot be used, in which case every other
        // libnuma function is undefined: only proceed on any other value.
        if(numa_available() != -1) {
            SPDLOG_INFO("Setting numactl cores affinity to {} for thread {}", affinity, std::this_thread::get_id());

            // Build the cpumask out of the requested cores and apply it to the current thread
            auto cpumask = unique_cpumask_ptr(numa_allocate_cpumask());
            std::ranges::for_each(affinity, [&cpumask](size_t cpu) { numa_bitmask_setbit(cpumask.get(), cpu); });
            if(numa_sched_setaffinity(CURRENT_THREAD, cpumask.get()) < 0) {
                SPDLOG_WARN("Failed to set numactl cores affinity for thread {}", std::this_thread::get_id());
            }

            // Retrieve some information about the current setup
            if(const auto numa_num_nodes = numa_num_configured_nodes(); numa_num_nodes > 1) {
                const auto *numa_all_cpus = numa_all_cpus_ptr;
                SPDLOG_INFO(FMT_STRING("All CPUs: {:b} (# Nodes: {:d}"), *numa_all_cpus->maskp, numa_num_nodes);

                // Scratch cpumask receiving the cores of the node being inspected
                auto cpus_per_node = unique_cpumask_ptr(numa_allocate_cpumask());

                // Set keeping track of the nodes hosting at least one of the requested cores
                auto numa_spawning_nodes = std::unordered_set<size_t>();
                for(auto node = 0; node < numa_num_nodes; ++node) {
                    // Retrieve the cpumask for the target node
                    numa_node_to_cpus(node, cpus_per_node.get());

                    // Check every requested core against the node's mask. Going through
                    // numa_bitmask_isbitset covers the whole mask, whereas AND-ing `*maskp` would
                    // only look at the first word of each mask.
                    const bool has_cpus_on_node = std::ranges::any_of(affinity, [&cpus_per_node](size_t cpu) {
                        return numa_bitmask_isbitset(cpus_per_node.get(), static_cast<unsigned int>(cpu)) != 0;
                    });

                    if(has_cpus_on_node) {
                        // If we have some cores on the node, attempt to insert in the set of targeted node
                        if(const auto [_, was_inserted] = numa_spawning_nodes.emplace(node); was_inserted) {
                            SPDLOG_DEBUG("Allocated thread spawning node: {:d}", node);
                        }
                    }

                    // Clear all the bits relative to the current node
                    numa_bitmask_clearall(cpus_per_node.get());
                }

                // Bind the memory if we spawn a single node, otherwise, let's display a warning
                if(numa_spawning_nodes.size() == 1) {
                    SPDLOG_INFO(FMT_STRING("Setting memory affinity to node: {:d}"), *numa_spawning_nodes.begin());
                    numa_set_preferred(static_cast<int>(*numa_spawning_nodes.begin()));
                } else {
                    SPDLOG_WARN(FMT_STRING("Specified thread affinity spawn multiple NUMA nodes: {}"), numa_spawning_nodes);
                }
            }

#ifdef TGI_LLAMACPP_BACKEND_DEBUG
            // Sanity check in the logs: read back the affinity actually applied to the thread
            auto cpumask_check = unique_cpumask_ptr(numa_allocate_cpumask());
            numa_sched_getaffinity(CURRENT_THREAD, cpumask_check.get());
            SPDLOG_DEBUG(
                    FMT_STRING("numa_sched_affinity for thread {} -> {:b}"),
                    std::this_thread::get_id(), *cpumask_check->maskp);
#endif
        }
#else
        SPDLOG_WARN("TGI's llama.cpp backend was compiled without NUMA support");
#endif
    }
}


#endif //TGI_LLAMA_CPP_BACKEND_FFI_HPP
feat(backend): wip Rust binding 2024-10-24 01:56:40 -06:00			`//`
			`// Created by mfuntowicz on 10/23/24.`
			`//`

			`#ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP`
			`#define TGI_LLAMA_CPP_BACKEND_FFI_HPP`

feat(backend): multistream inference on CPU 2024-11-20 16:03:05 -07:00			`#include <cstdint>`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00			`#include <exception>`
			`#include <filesystem>`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`#include <memory>`
feat(backend): use std::ranges to map uint32_t to llama_token 2024-11-12 16:07:59 -07:00			`#include <ranges>`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00			`#include <string_view>`
feat(backend): multistream inference on CPU 2024-11-20 16:03:05 -07:00			`#include <thread>`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00
			`#include <spdlog/spdlog.h>`
feat(backend): multistream inference on CPU 2024-11-20 16:03:05 -07:00			`#include <spdlog/fmt/ranges.h>`
			`#include <spdlog/fmt/std.h>`

feat(backend): bind thread and memory affinity for thread 2024-11-21 05:52:38 -07:00			`#ifdef NUMA_AVAILABLE`
			`#define CURRENT_THREAD 0`
			`#include <algorithm>`
			`#include <unordered_set>`
feat(backend): multistream inference on CPU 2024-11-20 16:03:05 -07:00			`#include <numa.h>`
feat(backend): bind thread and memory affinity for thread 2024-11-21 05:52:38 -07:00			`#endif`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`namespace huggingface::tgi::backends::llamacpp {`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`class llama_cpp_worker_frontend_t;`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00			`}`

feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`#include "backend.hpp"`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00			`#include "backends/llamacpp/src/lib.rs.h"`
feat(backend): avoid dropping the boxed stream at the end of the callback 2024-11-02 17:36:32 -06:00			`#include "rust/cxx.h"`
feat(backend): wip Rust binding 2024-10-24 01:56:40 -06:00

feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`namespace huggingface::tgi::backends::llamacpp {`

feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); };`
			`auto make_shared_llama_model = [](llama_model *model) {`
			`return std::shared_ptr<llama_model>(model, llama_model_deleter);`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`};`

feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`class llama_cpp_backend_exception_t : std::exception {};`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`/**`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`* Llama.cpp frontend over the worker interfacing with Rust FFI layer`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`*/`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`class llama_cpp_worker_frontend_t {`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00			`private:`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`std::shared_ptr<llama_model> model_;`
			`worker_t worker_;`
feat(backend): wip Rust binding 2024-10-24 01:56:40 -06:00
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00			`public:`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`explicit llama_cpp_worker_frontend_t(llama_model *model):`
			`model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {}`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00
feat(backend): avoid dropping the boxed stream at the end of the callback 2024-11-02 17:36:32 -06:00			`size_t stream(`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`rust::Slice<const uint32_t> input_tokens,`
feat(backend): avoid dropping the boxed stream at the end of the callback 2024-11-02 17:36:32 -06:00			`const generation_params_t generation_params,`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`const sampling_params_t &sampling_params,`
feat(backend): refactor the callback to handle intermediate and end inference message 2024-11-04 08:17:43 -07:00			`InferContext *ctx,`
feat(backend): add early stopping criteria from TGI stream callback 2024-11-04 09:01:22 -07:00			`rust::Fn<bool(InferContext *, uint32_t, float_t, bool, size_t)> callback`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`) {`
chore(backend): minor improvements 2024-11-12 16:08:26 -07:00			`// Wrapper around the provided Rust callback to inject the InferContext when returning from the C++ FFI boundaries`
			`// It captures the context (ctx) using reference and will automatically call the Rust callback forwarding the InferContext`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`auto context_forwarding_callback =`
			`[=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool {`
			`return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens);`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`};`

feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`// Ask the compiler to create view over Rust slice transmuting from uint32_t* to llama_token*`
feat(backend): use std::ranges to map uint32_t to llama_token 2024-11-12 16:07:59 -07:00			`static auto as_llama_token = [](const uint32_t x){ return static_cast<llama_token>(x); };`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00
feat(backend): use std::ranges to map uint32_t to llama_token 2024-11-12 16:07:59 -07:00			`#ifdef __cpp_lib_ranges_to_container`
			`auto input_tokens_v = input_tokens \| std::views::transform(as_llama_token) \| std::ranges::to<std::vector>();`
			`#else`
			`auto input_tokens_ = input_tokens \| std::views::transform(as_llama_token);`
			`auto input_tokens_v = std::vector<llama_token>(input_tokens_.begin(), input_tokens_.end());`
			`#endif`

			`// Defer the generation to the actual worker_t`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v};`
			`if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] {`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`return *result;`
			`} else {`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`throw llama_cpp_backend_exception_t {};`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`}`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00			`}`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`};`

feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {`
feat(backend): bind thread and memory affinity for thread 2024-11-21 05:52:38 -07:00			`#ifdef TGI_LLAMACPP_BACKEND_DEBUG`
			`spdlog::set_level(spdlog::level::debug);`
			`#endif`

chore(backend): minor improvements 2024-11-12 16:08:26 -07:00			`// Initialize the numa context from numactl`
			`static const bool INITIALIZED_NUMA_CONTEXT_ONCE = [](){`
			`llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL);`
			`return true;`
			`}();`

			`// Allocate model weights parameters`
feat(backend): full rework of the backend internal to safer c++ 2024-10-31 10:51:57 -06:00			`auto params = llama_model_default_params();`
			`params.use_mmap = true;`

chore(backend): minor improvements 2024-11-12 16:08:26 -07:00			`// Allocate the model from the Rust provided, string path`
			`auto *model = (llama_load_model_from_file(static_cast<std::string>(modelPath).c_str(), params));`
feat(backend): simplify overall cpp structure 2024-11-09 14:10:33 -07:00			`return std::make_unique<llama_cpp_worker_frontend_t>(model);`
feat(backend): build and link through build.rs 2024-10-24 08:42:50 -06:00			`}`
feat(backend): multistream inference on CPU 2024-11-20 16:03:05 -07:00
feat(backend): bind thread and memory affinity for thread 2024-11-21 05:52:38 -07:00			`struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }};`
			`typedef std::unique_ptr<struct bitmask, numa_cpumask_deleter> unique_cpumask_ptr;`
feat(backend): multistream inference on CPU 2024-11-20 16:03:05 -07:00
feat(backend): bind thread and memory affinity for thread 2024-11-21 05:52:38 -07:00			`void set_numactl_core_affinity(rust::Slice<const size_t> affinity) {`
			`// void set_numactl_core_affinity(std::vector<size_t> affinity) {`
			`#ifdef NUMA_AVAILABLE`
			`if(numa_available()) {`
			`SPDLOG_INFO("Setting numactl cores affinity to {} for thread {}", affinity, std::this_thread::get_id());`

			`auto cpumask = unique_cpumask_ptr(numa_allocate_cpumask());`
			`std::ranges::for_each(affinity, [&cpumask](size_t cpu) { numa_bitmask_setbit(cpumask.get(), cpu); });`
			`numa_sched_setaffinity(CURRENT_THREAD, cpumask.get());`

			`// Retrieve some information about the current setup`
			`if(const auto numa_num_nodes = numa_num_configured_nodes(); numa_num_nodes > 1) {`
			`const auto *numa_all_cpus = numa_all_cpus_ptr;`
			`SPDLOG_INFO(FMT_STRING("All CPUs: {:b} (# Nodes: {:d}"), *numa_all_cpus->maskp, numa_num_nodes);`

			`// Retrieve the cpumask specific for the current node`
			`auto cpus_per_node = unique_cpumask_ptr(numa_allocate_cpumask());`

			`// Allocate a set which keeps track of which nodes is being targeted`
			`auto numa_spawning_nodes = std::unordered_set<size_t>();`
			`for(auto node = 0; node < numa_num_nodes; ++node) {`
			`// Retrieve the cpumask for the target node`
			`numa_node_to_cpus(node, cpus_per_node.get());`

			`// intersect which cores on the nodes are targeted, in no one on that specific node`
			`// the value of allocated_cpus_on_node will be 0 as the result of the AND operation.`
			`const auto allocated_cpus_on_node = cpus_per_node->maskp & cpumask->maskp;`
			`if(allocated_cpus_on_node > 0) {`

			`// If we have some cores on the node, attempt to insert in the set of targeted node`
			`if(const auto [_, was_inserted] = numa_spawning_nodes.emplace(node); was_inserted) {`
			`SPDLOG_DEBUG("Allocated thread spawning node: {:d}", node);`
			`}`
			`}`

			`// Clear all the bits relative to the current node`
			`numa_bitmask_clearall(cpus_per_node.get());`
			`}`

			`// Bind the memory if we spawn a single node, otherwise, let's display a warning`
			`if(numa_spawning_nodes.size() == 1) {`
			`SPDLOG_INFO(FMT_STRING("Setting memory affinity to node: {:d}"), *numa_spawning_nodes.begin());`
			`numa_set_preferred(*numa_spawning_nodes.begin());`
			`} else {`
			`SPDLOG_WARN(FMT_STRING("Specified thread affinity spawn multiple NUMA nodes: {}"), numa_spawning_nodes);`
			`}`
			`}`
feat(backend): multistream inference on CPU 2024-11-20 16:03:05 -07:00
feat(backend): bind thread and memory affinity for thread 2024-11-21 05:52:38 -07:00			`#ifdef TGI_LLAMACPP_BACKEND_DEBUG`
			`// Sanity check in the logs...`
			`auto *cpumask_check = numa_allocate_cpumask();`
			`numa_sched_getaffinity(CURRENT_THREAD, cpumask_check);`
			`SPDLOG_DEBUG(`
			`FMT_STRING("numa_sched_affinity for thread {} -> {:b}"),`
			`std::this_thread::get_id(), *cpumask_check->maskp);`
			`numa_free_cpumask(cpumask_check);`
			`#endif`
			`}`
			`#else`
			`SPDLOG_WARN("TGI's llama.cpp backend was compiled without NUMA support");`
			`#endif`
feat(backend): multistream inference on CPU 2024-11-20 16:03:05 -07:00			`}`
feat(backend): wip Rust binding 2024-10-24 01:56:40 -06:00			`}`


			`#endif //TGI_LLAMA_CPP_BACKEND_FFI_HPP`