hf_text-generation-inference/backends/llamacpp/csrc/ffi.hpp

//
// Created by mfuntowicz on 10/23/24.
//
#ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP
#define TGI_LLAMA_CPP_BACKEND_FFI_HPP
#include <cmath>
#include <cstdint>
#include <cstring>
#include <exception>
#include <filesystem>
#include <memory>
#include <string_view>
#include <variant>
#include <vector>

#include <spdlog/spdlog.h>
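
// Forward-declare the frontend type so the cxx-generated bridge header (lib.rs.h), included below,
// can reference it before the full definition.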
namespace huggingface::tgi::backends::llamacpp {
    class llama_cpp_worker_frontend_t;
}
#include "backend.hpp"
#include "backends/llamacpp/src/lib.rs.h"
#include "rust/cxx.h"
namespace huggingface::tgi::backends::llamacpp {
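    // Wrap a raw llama_model* in a shared_ptr whose deleter calls llama_free_model() once the last
    // owner releases it.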
    auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); };
    auto make_shared_llama_model = [](llama_model *model) {
        return std::shared_ptr<llama_model>(model, llama_model_deleter);
    };
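
    // Thrown when the llama.cpp worker cannot complete a request.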
    class llama_cpp_backend_exception_t : public std::exception {};
    /**
     * Llama.cpp frontend over the worker, interfacing with the Rust FFI layer
     */
    class llama_cpp_worker_frontend_t {
    private:
        std::shared_ptr<llama_model> model_;
        worker_t worker_;

    public:
        explicit llama_cpp_worker_frontend_t(llama_model *model):
            model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {}
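
        /**
         * Run generation over the prompt tokens, invoking `callback` for every produced token with
         * (ctx, new_token_id, logits, is_eos, n_generated_tokens). Returns the number of generated
         * tokens, or throws llama_cpp_backend_exception_t if the worker reports an error.
         */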
        size_t stream(
            rust::Slice<const uint32_t> input_tokens,
            const generation_params_t generation_params,
            const sampling_params_t &sampling_params,
            InferContext *ctx,
            rust::Fn<bool(InferContext *, uint32_t, float_t, bool, size_t)> callback
        ) {
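            // Forward each generated token to the Rust callback, re-attaching the InferContext
            // pointer that the worker-side callback signature does not carry.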
            auto context_forwarding_callback =
                [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool {
                    return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens);
                };

            // Copy the prompt tokens out of the Rust slice into a std::vector<llama_token>
            // (llama_token is a 32-bit integer, so the byte-for-byte copy is size-compatible).
            auto input_tokens_v = std::vector<llama_token>(input_tokens.size());
            std::memcpy(input_tokens_v.data(), input_tokens.data(), input_tokens.size() * sizeof(uint32_t));

            const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v};
            if (const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] {
                return *result;
            } else {
                throw llama_cpp_backend_exception_t {};
            }
        }
    };
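
    /**
     * Factory exposed to the Rust side through the cxx bridge: loads the model located at
     * `modelPath` and wraps it into a worker frontend.
     */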
    std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {
        const auto cxxPath = std::string(modelPath);
        auto params = llama_model_default_params();
        params.use_mmap = true;

        // llama_load_model_from_file() returns nullptr on failure; fail loudly instead of handing a null model to the worker.
        auto *model = llama_load_model_from_file(cxxPath.c_str(), params);
        if (model == nullptr) {
            SPDLOG_ERROR("Failed to load model from {}", cxxPath);
            throw llama_cpp_backend_exception_t {};
        }
        return std::make_unique<llama_cpp_worker_frontend_t>(model);
    }
}
#endif //TGI_LLAMA_CPP_BACKEND_FFI_HPP