diff --git a/Cargo.toml b/Cargo.toml
index 9a7e76c4..f3ab5ee5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -7,7 +7,7 @@ members = [
     "backends/trtllm",
     "launcher",
     "router"
-]
+, "backends/llamacpp"]
 default-members = [
     "benchmark",
     "backends/v2",
diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt
new file mode 100644
index 00000000..2f9026f1
--- /dev/null
+++ b/backends/llamacpp/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.20)
+
+project(tgi-llama-cpp-backend VERSION 1.0.0)
+set(CMAKE_CXX_STANDARD 20)
+
+include(FetchContent)
+
+set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against")
+
+
+# Add dependencies
+include(cmake/fmt.cmake)
+include(cmake/spdlog.cmake)
+
+# Download the llama.cpp repository at the pinned version
+fetchcontent_declare(
+        llama
+        # DOWNLOAD_EXTRACT_TIMESTAMP TRUE
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
+        GIT_TAG ${LLAMA_CPP_TARGET_VERSION}
+        GIT_SHALLOW FALSE
+)
+
+fetchcontent_makeavailable(llama)
+
+add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp)
+target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_20)
+target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common)
diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml
new file mode 100644
index 00000000..2e8ed7dd
--- /dev/null
+++ b/backends/llamacpp/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "text-generation-backend-llamacpp"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+homepage.workspace = true
+
+[dependencies]
diff --git a/backends/llamacpp/cmake/fmt.cmake b/backends/llamacpp/cmake/fmt.cmake
new file mode 100644
index 00000000..f94a9c56
--- /dev/null
+++ b/backends/llamacpp/cmake/fmt.cmake
@@ -0,0 +1,6 @@
+FetchContent_Declare(
+        fmt
+        GIT_REPOSITORY https://github.com/fmtlib/fmt
+        GIT_TAG 11.0.1
+)
+FetchContent_MakeAvailable(fmt)
diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake
new file mode 100644
index 00000000..c4ee5c97
--- /dev/null
+++ b/backends/llamacpp/cmake/spdlog.cmake
@@ -0,0 +1,17 @@
+set(SPDLOG_USE_FMT ON)
+set(SPDLOG_BUILD_SHARED OFF)
+set(SPDLOG_FMT_EXTERNAL ON)
+
+# Define the compile-time log level used by the SPDLOG_* macros
+if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG)
+else ()
+    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO)
+endif ()
+
+fetchcontent_declare(
+        spdlog
+        GIT_REPOSITORY https://github.com/gabime/spdlog.git
+        GIT_TAG v1.14.1
+)
+fetchcontent_makeavailable(spdlog)
diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp
new file mode 100644
index 00000000..9ce1dbc9
--- /dev/null
+++ b/backends/llamacpp/csrc/backend.cpp
@@ -0,0 +1,66 @@
+//
+// Created by Morgan Funtowicz on 9/28/2024.
+//
+
+#include <stdexcept>
+#include <vector>
+#include <fmt/format.h>
+#include <spdlog/spdlog.h>
+#include <common.h>
+#include "backend.hpp"
+
+namespace huggingface::tgi::backends::llama {
+
+    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root) {
+        SPDLOG_INFO(FMT_STRING("Loading model from {}"), root);
+        gpt_init();
+
+        // Fake argv so the model path can be fed through llama.cpp's own CLI parser
+        std::vector<std::string_view> args = {"tgi_llama_cpp_backend", "--model", root};
+        std::vector<char*> argv;
+        for(const auto& arg : args) {
+            argv.push_back(const_cast<char*>(arg.data()));
+        }
+        argv.push_back(nullptr);
+
+        // Create the GPT parameters
+        gpt_params params;
+        if (!gpt_params_parse(args.size(), argv.data(), params, LLAMA_EXAMPLE_SERVER)) {
+            throw std::runtime_error("Failed to create GPT Params from model");
+        }
+
+        // Create the inference engine
+        SPDLOG_INFO("Allocating llama.cpp model from gpt_params");
+        auto result = llama_init_from_gpt_params(params);
+
+        // Unpack all the inference engine components
+        auto model = result.model;
+        auto context = result.context;
+        auto loras = result.lora_adapters;
+
+        // Make sure everything is correctly initialized
+        if(model == nullptr)
+            throw std::runtime_error(fmt::format("Failed to load model from {}", root));
+
+        return std::make_unique<TgiLlamaCppBackend>(model, context);
+    }
+
+    TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
+    : model(model), ctx(ctx), batch() {
+
+    }
+
+    TgiLlamaCppBackend::~TgiLlamaCppBackend() {
+        if(model)
+        {
+            SPDLOG_DEBUG("Freeing llama.cpp model");
+            llama_free_model(model);
+        }
+
+        if(ctx)
+        {
+            SPDLOG_DEBUG("Freeing llama.cpp context");
+            llama_free(ctx);
+        }
+    }
+}
\ No newline at end of file
diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp
new file mode 100644
index 00000000..a643454e
--- /dev/null
+++ b/backends/llamacpp/csrc/backend.hpp
@@ -0,0 +1,28 @@
+//
+// Created by Morgan Funtowicz on 9/28/2024.
+//
+
+#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
+#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
+
+#include <memory>
+#include <string_view>
+#include <llama.h>
+
+namespace huggingface::tgi::backends::llama {
+    const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";
+
+    class TgiLlamaCppBackend {
+    private:
+        llama_model* model;
+        llama_context* ctx;
+        llama_batch batch;
+    public:
+        TgiLlamaCppBackend(llama_model* const model, llama_context* const ctx);
+        ~TgiLlamaCppBackend();
+    };
+
+    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root);
+}
+
+#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs
new file mode 100644
index 00000000..e7a11a96
--- /dev/null
+++ b/backends/llamacpp/src/main.rs
@@ -0,0 +1,3 @@
+fn main() {
+    println!("Hello, world!");
+}
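Note (not part of the patch): the sketch below shows roughly how the new C++ entry point could be exercised before the Rust side is wired up. It is a minimal, hypothetical smoke-test harness; the main.cpp file, its usage string, and the model path argument are assumptions and do not exist in this changeset. The only thing it relies on from the diff is the CreateLlamaCppBackend(std::string_view) factory declared in backends/llamacpp/csrc/backend.hpp.

// Hypothetical smoke-test harness -- not part of this changeset.
// Assumes only the factory declared in backends/llamacpp/csrc/backend.hpp.
#include <cstdlib>
#include <exception>
#include <iostream>

#include "backend.hpp"

int main(int argc, char **argv) {
    if (argc < 2) {
        std::cerr << "usage: smoke_test <path-to-gguf-model>" << std::endl;
        return EXIT_FAILURE;
    }

    try {
        // CreateLlamaCppBackend builds a fake argv internally, parses it with
        // llama.cpp's gpt_params_parse and loads the model at the given path.
        const auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(argv[1]);
        std::cout << "Backend initialized for " << argv[1] << std::endl;
    } catch (const std::exception &e) {
        // Parsing or model-loading failures surface as std::runtime_error.
        std::cerr << "Initialization failed: " << e.what() << std::endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}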