From aa1fcba59fef8f3685f2851ac1de4b4420c69cd1 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz
Date: Thu, 3 Oct 2024 14:00:17 +0200
Subject: [PATCH] feat(llamacpp): initial commit

# Conflicts:
#	Cargo.lock
---
 Cargo.toml                           |  2 +-
 backends/llamacpp/CMakeLists.txt     | 28 ++++++++++++
 backends/llamacpp/Cargo.toml         |  8 ++++
 backends/llamacpp/cmake/fmt.cmake    |  6 +++
 backends/llamacpp/cmake/spdlog.cmake | 17 +++++++
 backends/llamacpp/csrc/backend.cpp   | 66 ++++++++++++++++++++++++++++
 backends/llamacpp/csrc/backend.hpp   | 28 ++++++++++++
 backends/llamacpp/src/main.rs        |  3 ++
 8 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 backends/llamacpp/CMakeLists.txt
 create mode 100644 backends/llamacpp/Cargo.toml
 create mode 100644 backends/llamacpp/cmake/fmt.cmake
 create mode 100644 backends/llamacpp/cmake/spdlog.cmake
 create mode 100644 backends/llamacpp/csrc/backend.cpp
 create mode 100644 backends/llamacpp/csrc/backend.hpp
 create mode 100644 backends/llamacpp/src/main.rs

diff --git a/Cargo.toml b/Cargo.toml
index 9a7e76c4..f3ab5ee5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,7 +7,7 @@ members = [
     "backends/trtllm",
     "launcher",
     "router"
-]
+, "backends/llamacpp"]
 default-members = [
     "benchmark",
     "backends/v2",
diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt
new file mode 100644
index 00000000..2f9026f1
--- /dev/null
+++ b/backends/llamacpp/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.20)
+
+project(tgi-llama-cpp-backend VERSION 1.0.0)
+set(CMAKE_CXX_STANDARD 20)
+
+include(FetchContent)
+
+set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against")
+
+
+# Add dependencies
+include(cmake/fmt.cmake)
+include(cmake/spdlog.cmake)
+
+# Download llama.cpp repo at the specific version
+fetchcontent_declare(
+        llama
+#        DOWNLOAD_EXTRACT_TIMESTAMP TRUE
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
+        GIT_TAG b3837
+        GIT_SHALLOW FALSE
+)
+
+fetchcontent_makeavailable(llama)
+
+add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp)
+target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_20)
+target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common)
diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml
new file mode 100644
index 00000000..2e8ed7dd
--- /dev/null
+++ b/backends/llamacpp/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "text-generation-backend-llamacpp"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+homepage.workspace = true
+
+[dependencies]
diff --git a/backends/llamacpp/cmake/fmt.cmake b/backends/llamacpp/cmake/fmt.cmake
new file mode 100644
index 00000000..f94a9c56
--- /dev/null
+++ b/backends/llamacpp/cmake/fmt.cmake
@@ -0,0 +1,6 @@
+FetchContent_Declare(
+        fmt
+        GIT_REPOSITORY https://github.com/fmtlib/fmt
+        GIT_TAG 11.0.1
+)
+FetchContent_MakeAvailable(fmt)
diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake
new file mode 100644
index 00000000..c4ee5c97
--- /dev/null
+++ b/backends/llamacpp/cmake/spdlog.cmake
@@ -0,0 +1,17 @@
+set(SPDLOG_USE_FMT ON)
+set(SPDLOG_BUILD_SHARED OFF)
+set(SPDLOG_FMT_EXTERNAL ON)
+
+# Define the compile-time log level for the SPDLOG_* macros
+if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG)
+else ()
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO)
+endif ()
+
+fetchcontent_declare(
+        spdlog
+        GIT_REPOSITORY https://github.com/gabime/spdlog.git
+        GIT_TAG v1.14.1
+)
+fetchcontent_makeavailable(spdlog)
diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp
new file mode 100644
index 00000000..9ce1dbc9
--- /dev/null
+++ b/backends/llamacpp/csrc/backend.cpp
@@ -0,0 +1,66 @@
+//
+// Created by Morgan Funtowicz on 9/28/2024.
+//
+
+#include <fmt/format.h>
+#include <spdlog/spdlog.h>
+#include <common.h>
+#include "backend.hpp"
+
+namespace huggingface::tgi::backends::llama {
+
+    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root) {
+        SPDLOG_INFO(FMT_STRING("Loading model from {}"), root);
+        gpt_init();
+
+        // Fake argv so we can reuse llama.cpp's CLI argument parser
+        std::vector<std::string_view> args = {"tgi_llama_cpp_backend", "--model", root};
+        std::vector<char*> argv;
+        for(const auto& arg : args) {
+            argv.push_back(const_cast<char*>(arg.data()));
+        }
+        argv.push_back(nullptr);
+
+        // Create the GPT parameters
+        gpt_params params;
+        if (!gpt_params_parse(args.size(), argv.data(), params, LLAMA_EXAMPLE_SERVER)) {
+            throw std::runtime_error("Failed to create GPT Params from model");
+        }
+
+
+        // Create the inference engine
+        SPDLOG_INFO("Allocating llama.cpp model from gpt_params");
+        auto result = llama_init_from_gpt_params(params);
+
+        // Unpack all the inference engine components
+        auto model = result.model;
+        auto context = result.context;
+        auto loras = result.lora_adapters;
+
+        // Make sure everything is correctly initialized
+        if(model == nullptr)
+            throw std::runtime_error(fmt::format("Failed to load model from {}", root));
+
+        return std::make_unique<TgiLlamaCppBackend>(model, context);
+    }
+
+    TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
+    : model(model), ctx(ctx), batch() {
+
+    }
+
+    TgiLlamaCppBackend::~TgiLlamaCppBackend() {
+        // Free the context before the model: the context borrows from the model
+        if(ctx)
+        {
+            SPDLOG_DEBUG("Freeing llama.cpp context");
+            llama_free(ctx);
+        }
+
+        if(model)
+        {
+            SPDLOG_DEBUG("Freeing llama.cpp model");
+            llama_free_model(model);
+        }
+    }
+}
\ No newline at end of file
diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp
new file mode 100644
index 00000000..a643454e
--- /dev/null
+++ b/backends/llamacpp/csrc/backend.hpp
@@ -0,0 +1,28 @@
+//
+// Created by Morgan Funtowicz on 9/28/2024.
+//
+
+#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
+#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
+
+#include <memory>
+#include <llama.h>
+
+namespace huggingface::tgi::backends::llama {
+    constexpr const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";
+
+
+    class TgiLlamaCppBackend {
+    private:
+        llama_model* model;
+        llama_context* ctx;
+        llama_batch batch;
+    public:
+        TgiLlamaCppBackend(llama_model* const model, llama_context* const ctx);
+        ~TgiLlamaCppBackend();
+    };
+
+    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root);
+}
+
+#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
new file mode 100644
index 00000000..e7a11a96
--- /dev/null
+++ b/backends/llamacpp/src/main.rs
@@ -0,0 +1,3 @@
+fn main() {
+    println!("Hello, world!");
+}
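
---

The snippet below is a minimal usage sketch, not part of the patch: it shows how the factory introduced in csrc/backend.hpp might be exercised once the Rust side is wired to it. The model path is a hypothetical placeholder, and error handling is reduced to an exit code.

    #include <cstdlib>
    #include <stdexcept>
    #include "backend.hpp"

    int main() {
        try {
            // CreateLlamaCppBackend feeds the path through llama.cpp's
            // gpt_params_parse, then loads the model and its context.
            const auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(
                    "/opt/models/llama/model.gguf");  // hypothetical path
            // The llama_context and llama_model are released by
            // ~TgiLlamaCppBackend when `backend` goes out of scope (RAII).
            return EXIT_SUCCESS;
        } catch (const std::runtime_error&) {
            return EXIT_FAILURE;
        }
    }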