feat(llamacpp): initial commit

# Conflicts: # Cargo.lock
2024-10-03 14:00:17 +02:00 · 2024-10-03 14:00:17 +02:00 · aa1fcba59f
parent a785000842
commit aa1fcba59f
8 changed files with 157 additions and 1 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -7,7 +7,7 @@ members = [
  "backends/trtllm",
  "launcher",
  "router"
-]
+, "backends/llamacpp"]
 default-members = [
  "benchmark",
  "backends/v2",
--- a/backends/llamacpp/CMakeLists.txt
+++ b/backends/llamacpp/CMakeLists.txt
@ -0,0 +1,28 @@
 # Build script for the TGI llama.cpp backend: fetches llama.cpp (plus fmt and
 # spdlog through the included cmake/ scripts) and builds the C++ shim as a
 # static library.
 cmake_minimum_required(VERSION 3.20)
 project(tgi-llama-cpp-backend VERSION 1.0.0)
 set(CMAKE_CXX_STANDARD 20)
 include(FetchContent)
 # llama.cpp tag to build against. Declared as a cache entry so it can be
 # overridden at configure time (-DLLAMA_CPP_TARGET_VERSION=<tag>); without
 # CACHE, the trailing STRING/docstring arguments would simply be appended to
 # the variable's value as extra list elements.
 set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against")
 # Add dependencies
 include(cmake/fmt.cmake)
 include(cmake/spdlog.cmake)
 # Download llama.cpp repo at the specific version
 fetchcontent_declare(
    llama
 #    DOWNLOAD_EXTRACT_TIMESTAMP TRUE
    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
    GIT_TAG ${LLAMA_CPP_TARGET_VERSION}
    GIT_SHALLOW FALSE
 )
 fetchcontent_makeavailable(llama)
 add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp)
 # backend.hpp uses nested namespace definitions and std::string_view, so both
 # this target and anything including its header need at least C++17.
 target_compile_features(tgi_llama_cpp_backend_impl PUBLIC cxx_std_17)
 # PUBLIC keeps the transitive linking behaviour of the keyword-less signature.
 target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common)
--- a/backends/llamacpp/Cargo.toml
+++ b/backends/llamacpp/Cargo.toml
@ -0,0 +1,8 @@
 # Manifest for the llama.cpp backend crate.
 # Every field written as `<field>.workspace = true` takes its value from the
 # workspace root manifest instead of being set here.
 [package]
 name = "text-generation-backend-llamacpp"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
 homepage.workspace = true
 # No dependencies are declared for this crate yet.
 [dependencies]
--- a/backends/llamacpp/cmake/fmt.cmake
+++ b/backends/llamacpp/cmake/fmt.cmake
@ -0,0 +1,6 @@
 # Fetch the fmt library from GitHub at tag 11.0.1 and add it to the build.
 # The including script is expected to have run include(FetchContent) already.
 fetchcontent_declare(
    fmt
    GIT_REPOSITORY https://github.com/fmtlib/fmt
    GIT_TAG 11.0.1
 )
 fetchcontent_makeavailable(fmt)
--- a/backends/llamacpp/cmake/spdlog.cmake
+++ b/backends/llamacpp/cmake/spdlog.cmake
@ -0,0 +1,17 @@
 # Fetch spdlog v1.14.1 and configure it as a static library that uses the
 # externally provided fmt target.
 # NOTE(review): SPDLOG_USE_FMT does not look like an option spdlog's own
 # CMakeLists reads (SPDLOG_FMT_EXTERNAL is) - confirm whether it is needed.
 set(SPDLOG_USE_FMT ON)
 set(SPDLOG_BUILD_SHARED OFF)
 set(SPDLOG_FMT_EXTERNAL ON)
 # Define the level at which SPDLOG_ compilation level is defined.
 # The variable is named without ${} so an empty/unset CMAKE_BUILD_TYPE does not
 # expand to `if ( STREQUAL "Debug")`, which is a configure-time error.
 # The definition must be written NAME=VALUE: passing the two names as separate
 # arguments defines two independent value-less macros instead.
 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG)
 else ()
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO)
 endif ()
 fetchcontent_declare(
        spdlog
        GIT_REPOSITORY https://github.com/gabime/spdlog.git
        GIT_TAG v1.14.1
 )
 fetchcontent_makeavailable(spdlog)
--- a/backends/llamacpp/csrc/backend.cpp
+++ b/backends/llamacpp/csrc/backend.cpp
@ -0,0 +1,66 @@
 //
 // Created by Morgan Funtowicz on 9/28/2024.
 //
 #include <stdexcept>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <arg.h>
 #include <common.h>
 #include <fmt/format.h>
 #include <spdlog/spdlog.h>
 #include "backend.hpp"
 namespace huggingface::tgi::backends::llama {
    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root) {
        SPDLOG_INFO(FMT_STRING("Loading model from {}"), root);
        gpt_init();
        // Fake argv
        std::vector<std::string_view> args = {"tgi_llama_cpp_backend", "--model", root};
        std::vector<char*> argv;
        for(const auto& arg : args) {
            argv.push_back(const_cast<char *>(arg.data()));
        }
        argv.push_back(nullptr);
        // Create the GPT parameters
        gpt_params params;
        if (!gpt_params_parse(args.size(), argv.data(), params, LLAMA_EXAMPLE_SERVER)) {
            throw std::runtime_error("Failed to create GPT Params from model");
        }
        // Create the inference engine
        SPDLOG_INFO("Allocating llama.cpp model from gpt_params");
        auto result = llama_init_from_gpt_params(params);
        // Unpack all the inference engine components
        auto model = result.model;
        auto context = result.context;
        auto loras = result.lora_adapters;
        // Make sure everything is correctly initialized
        if(model == nullptr)
            throw std::runtime_error(fmt::format("Failed to load model from {}", root));
        return std::make_unique<TgiLlamaCppBackend>(model, context);
    }
    TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
        : model(model), ctx(ctx), batch() {
    }
    TgiLlamaCppBackend::~TgiLlamaCppBackend() {
        if(model)
        {
            SPDLOG_DEBUG("Freeing llama.cpp model");
            llama_free_model(model);
        }
        if(ctx)
        {
            SPDLOG_DEBUG("Freeing llama.cpp context");
            llama_free(ctx);
        }
    }
 }
--- a/backends/llamacpp/csrc/backend.hpp
+++ b/backends/llamacpp/csrc/backend.hpp
@ -0,0 +1,28 @@
 //
 // Created by Morgan Funtowicz on 9/28/2024.
 //
 #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
 #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
 #include <memory>
 #include <llama.h>
 namespace huggingface::tgi::backends::llama {
    /// Name of this backend. Declared `inline constexpr` so that including this
    /// header from several translation units still yields a single definition.
    inline constexpr const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";
    /**
     * Wrapper around a llama.cpp model and context pair.
     *
     * The destructor frees both handles, so an instance is their sole owner;
     * copying and moving are disabled to rule out a double free.
     */
    class TgiLlamaCppBackend {
    private:
        llama_model* model;
        llama_context* ctx;
        llama_batch batch;
    public:
        /// Take ownership of an already created model and context.
        TgiLlamaCppBackend(llama_model* const model, llama_context* const);
        /// Free the owned model and context (when non-null).
        ~TgiLlamaCppBackend();
        TgiLlamaCppBackend(const TgiLlamaCppBackend&) = delete;
        TgiLlamaCppBackend& operator=(const TgiLlamaCppBackend&) = delete;
        TgiLlamaCppBackend(TgiLlamaCppBackend&&) = delete;
        TgiLlamaCppBackend& operator=(TgiLlamaCppBackend&&) = delete;
    };
    /**
     * Create a backend for the model designated by `root`.
     *
     * @param root Model location forwarded to llama.cpp.
     * @return Owning pointer to the created backend.
     * @throws std::runtime_error if the model cannot be loaded.
     */
    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root);
 }
 #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
--- a/backends/llamacpp/src/main.rs
+++ b/backends/llamacpp/src/main.rs
@ -0,0 +1,3 @@
 /// Placeholder entry point: writes a fixed greeting line to standard output.
 fn main() {
    const GREETING: &str = "Hello, world!";
    println!("{}", GREETING);
 }