feat(llamacpp): initial commit

# Conflicts:
#	Cargo.lock
Morgan Funtowicz 2024-10-03 14:00:17 +02:00
parent a785000842
commit aa1fcba59f
8 changed files with 157 additions and 1 deletion


@@ -7,7 +7,7 @@ members = [
    "backends/trtllm",
    "launcher",
    "router"
-]
+, "backends/llamacpp"]
default-members = [
    "benchmark",
    "backends/v2",


@@ -0,0 +1,28 @@
cmake_minimum_required(VERSION 3.20)

project(tgi-llama-cpp-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 20)

include(FetchContent)

set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against")

# Add dependencies
include(cmake/fmt.cmake)
include(cmake/spdlog.cmake)

# Download the llama.cpp repository at the pinned version
fetchcontent_declare(
        llama
        # DOWNLOAD_EXTRACT_TIMESTAMP TRUE
        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
        GIT_TAG ${LLAMA_CPP_TARGET_VERSION}
        GIT_SHALLOW FALSE
)

fetchcontent_makeavailable(llama)

add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp)
target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_20)
target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common)


@@ -0,0 +1,8 @@
[package]
name = "text-generation-backend-llamacpp"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[dependencies]


@@ -0,0 +1,6 @@
FetchContent_Declare(
        fmt
        GIT_REPOSITORY https://github.com/fmtlib/fmt
        GIT_TAG 11.0.1
)
FetchContent_MakeAvailable(fmt)


@@ -0,0 +1,17 @@
set(SPDLOG_USE_FMT ON)
set(SPDLOG_BUILD_SHARED OFF)
set(SPDLOG_FMT_EXTERNAL ON)

# Define the compile-time level below which SPDLOG_* macro calls are compiled out
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG)
else ()
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO)
endif ()

fetchcontent_declare(
        spdlog
        GIT_REPOSITORY https://github.com/gabime/spdlog.git
        GIT_TAG v1.14.1
)
fetchcontent_makeavailable(spdlog)
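
Note on the SPDLOG_ACTIVE_LEVEL definition above: it controls compile-time filtering of the SPDLOG_* macros, so calls below the configured level are compiled out entirely, while the remaining ones still honor the runtime level. A minimal standalone sketch of that behavior (not part of this commit; it only assumes spdlog is on the include path):

#include <spdlog/spdlog.h>

int main() {
    // Compiled to a no-op whenever SPDLOG_ACTIVE_LEVEL is above SPDLOG_LEVEL_DEBUG,
    // e.g. with the non-Debug branch above (SPDLOG_LEVEL_INFO).
    SPDLOG_DEBUG("verbose details: {}", 42);

    // Kept under both definitions; still subject to the runtime level (info by default).
    SPDLOG_INFO("backend logging ready");
    return 0;
}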


@@ -0,0 +1,66 @@
//
// Created by Morgan Funtowicz on 9/28/2024.
//

#include <stdexcept>
#include <string_view>
#include <vector>

#include <arg.h>
#include <common.h>
#include <fmt/format.h>
#include <spdlog/spdlog.h>

#include "backend.hpp"

namespace huggingface::tgi::backends::llama {
    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root) {
        SPDLOG_INFO(FMT_STRING("Loading model from {}"), root);
        gpt_init();

        // Fake argv so the model path can be routed through llama.cpp's own argument parser.
        // Note: arg.data() is only safe here because every entry (including root) is expected
        // to point to a null-terminated string.
        std::vector<std::string_view> args = {"tgi_llama_cpp_backend", "--model", root};
        std::vector<char *> argv;
        for (const auto &arg: args) {
            argv.push_back(const_cast<char *>(arg.data()));
        }
        argv.push_back(nullptr);

        // Create the GPT parameters
        gpt_params params;
        if (!gpt_params_parse(static_cast<int>(args.size()), argv.data(), params, LLAMA_EXAMPLE_SERVER)) {
            throw std::runtime_error("Failed to create GPT Params from model");
        }

        // Create the inference engine
        SPDLOG_INFO("Allocating llama.cpp model from gpt_params");
        auto result = llama_init_from_gpt_params(params);

        // Unpack all the inference engine components
        auto model = result.model;
        auto context = result.context;
        auto loras = result.lora_adapters;  // currently unused

        // Make sure everything is correctly initialized
        if (model == nullptr)
            throw std::runtime_error(fmt::format("Failed to load model from {}", root));

        return std::make_unique<TgiLlamaCppBackend>(model, context);
    }

    TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
            : model(model), ctx(ctx), batch() {
    }

    TgiLlamaCppBackend::~TgiLlamaCppBackend() {
        if (model) {
            SPDLOG_DEBUG("Freeing llama.cpp model");
            llama_free_model(model);
        }

        if (ctx) {
            SPDLOG_DEBUG("Freeing llama.cpp context");
            llama_free(ctx);
        }
    }
}


@@ -0,0 +1,28 @@
//
// Created by Morgan Funtowicz on 9/28/2024.
//

#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

#include <memory>
#include <string_view>

#include <llama.h>

namespace huggingface::tgi::backends::llama {
    // constexpr gives the constant internal linkage, so including this header from
    // several translation units does not violate the one-definition rule
    constexpr const char *TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";

    class TgiLlamaCppBackend {
    private:
        llama_model *model;
        llama_context *ctx;
        llama_batch batch;

    public:
        TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx);
        ~TgiLlamaCppBackend();
    };

    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root);
}

#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
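
For illustration, the factory declared in this header could be exercised as sketched below; the snippet is not part of the commit, it assumes the target links against tgi_llama_cpp_backend_impl (and therefore llama.cpp, fmt, and spdlog), and the GGUF path is hypothetical:

#include <exception>
#include <iostream>

#include "backend.hpp"

int main() {
    try {
        // Loads the model and creates the llama.cpp context; throws on failure.
        const auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(
                "/models/model-q4_k_m.gguf");

        // Generation entry points are not part of this commit yet; the backend only
        // owns the model/context and releases them in its destructor.
        std::cout << "llama.cpp backend initialized" << std::endl;
    } catch (const std::exception &e) {
        std::cerr << "failed to initialize backend: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}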


@@ -0,0 +1,3 @@
fn main() {
    println!("Hello, world!");
}