parent a785000842
commit aa1fcba59f
Cargo.toml
@@ -7,7 +7,7 @@ members = [
     "backends/trtllm",
     "launcher",
     "router"
-]
+, "backends/llamacpp"]
 default-members = [
     "benchmark",
     "backends/v2",
backends/llamacpp/CMakeLists.txt
@@ -0,0 +1,28 @@
cmake_minimum_required(VERSION 3.20)

project(tgi-llama-cpp-backend VERSION 1.0.0)
set(CMAKE_CXX_STANDARD 20)

include(FetchContent)

set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against")

# Add dependencies
include(cmake/fmt.cmake)
include(cmake/spdlog.cmake)

# Download the llama.cpp repo at the pinned version
fetchcontent_declare(
        llama
        # DOWNLOAD_EXTRACT_TIMESTAMP TRUE
        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
        GIT_TAG b3837
        GIT_SHALLOW FALSE
)

fetchcontent_makeavailable(llama)

add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp)
target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_20)
target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common)
backends/llamacpp/Cargo.toml
@@ -0,0 +1,8 @@
[package]
name = "text-generation-backend-llamacpp"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[dependencies]
backends/llamacpp/cmake/fmt.cmake
@@ -0,0 +1,6 @@
FetchContent_Declare(
        fmt
        GIT_REPOSITORY https://github.com/fmtlib/fmt
        GIT_TAG 11.0.1
)
FetchContent_MakeAvailable(fmt)
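For reference, a minimal sketch (not part of this commit; the model path is a placeholder) of how the pinned fmt dependency is consumed by the C++ sources: FMT_STRING checks the format string at compile time, the same pattern backend.cpp uses with SPDLOG_INFO(FMT_STRING(...)).

// Sketch only: compile-time-checked formatting with fmt, mirroring the style used in backend.cpp.
#include <fmt/format.h>

int main() {
    // FMT_STRING validates the format string against its arguments during compilation.
    const auto message = fmt::format(FMT_STRING("Loading model from {}"), "/models/llama.gguf");
    fmt::print("{}\n", message);
    return 0;
}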
backends/llamacpp/cmake/spdlog.cmake
@@ -0,0 +1,17 @@
set(SPDLOG_USE_FMT ON)
set(SPDLOG_BUILD_SHARED OFF)
set(SPDLOG_FMT_EXTERNAL ON)

# Define the compile-time level below which SPDLOG_* macros are stripped out
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG)
else ()
    add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO)
endif ()

fetchcontent_declare(
        spdlog
        GIT_REPOSITORY https://github.com/gabime/spdlog.git
        GIT_TAG v1.14.1
)
fetchcontent_makeavailable(spdlog)
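As an aside, a small sketch (not from this commit) of what the SPDLOG_ACTIVE_LEVEL definition above controls: SPDLOG_DEBUG expands to a no-op unless the compile-time level is at or below debug, so Release builds drop those statements entirely.

// Sketch only: the compile-time level gates which SPDLOG_* calls survive compilation.
#include <spdlog/spdlog.h>

int main() {
    // The runtime level still applies on top of the compile-time SPDLOG_ACTIVE_LEVEL.
    spdlog::set_level(spdlog::level::debug);

    SPDLOG_DEBUG("emitted only when SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG at compile time");
    SPDLOG_INFO("emitted for both the Debug and Release definitions above");
    return 0;
}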
backends/llamacpp/csrc/backend.cpp
@@ -0,0 +1,66 @@
//
// Created by Morgan Funtowicz on 9/28/2024.
//

#include <stdexcept>
#include <string>
#include <vector>

#include <arg.h>
#include <common.h>
#include <fmt/format.h>
#include <spdlog/spdlog.h>
#include "backend.hpp"

namespace huggingface::tgi::backends::llama {

    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root) {
        SPDLOG_INFO(FMT_STRING("Loading model from {}"), root);
        gpt_init();

        // Fake argv: keep the strings in owned std::string storage so the pointers handed to the
        // C-style parser remain valid and NUL-terminated.
        std::vector<std::string> args = {"tgi_llama_cpp_backend", "--model", std::string(root)};
        std::vector<char*> argv;
        for (auto& arg : args) {
            argv.push_back(arg.data());
        }
        argv.push_back(nullptr);

        // Create the GPT parameters
        gpt_params params;
        if (!gpt_params_parse(static_cast<int>(args.size()), argv.data(), params, LLAMA_EXAMPLE_SERVER)) {
            throw std::runtime_error("Failed to create GPT Params from model");
        }

        // Create the inference engine
        SPDLOG_INFO("Allocating llama.cpp model from gpt_params");
        auto result = llama_init_from_gpt_params(params);

        // Unpack all the inference engine components
        auto model = result.model;
        auto context = result.context;
        auto loras = result.lora_adapters; // not used yet

        // Make sure everything is correctly initialized
        if (model == nullptr)
            throw std::runtime_error(fmt::format("Failed to load model from {}", root));

        return std::make_unique<TgiLlamaCppBackend>(model, context);
    }

    TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx)
        : model(model), ctx(ctx), batch() {
    }

    TgiLlamaCppBackend::~TgiLlamaCppBackend() {
        if (model) {
            SPDLOG_DEBUG("Freeing llama.cpp model");
            llama_free_model(model);
        }

        if (ctx) {
            SPDLOG_DEBUG("Freeing llama.cpp context");
            llama_free(ctx);
        }
    }
}
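The backend bootstraps llama.cpp by synthesising a command line for gpt_params_parse. A standalone sketch of that "fake argv" pattern follows (the model path and the printing are illustrative only): owning the strings as std::string guarantees the pointers given to a C-style parser stay valid and NUL-terminated.

// Sketch only: build a mutable char*[] over owned std::string storage for a C-style argv parser.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> args = {"tgi_llama_cpp_backend", "--model", "/models/llama.gguf"};

    std::vector<char*> argv;
    argv.reserve(args.size() + 1);
    for (auto& arg : args) {
        argv.push_back(arg.data());  // std::string::data() is mutable and NUL-terminated since C++17
    }
    argv.push_back(nullptr);         // conventional argv terminator, not counted in argc

    const int argc = static_cast<int>(args.size());
    for (int i = 0; i < argc; ++i) {
        std::printf("argv[%d] = %s\n", i, argv[i]);
    }
    return 0;
}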
backends/llamacpp/csrc/backend.hpp
@@ -0,0 +1,28 @@
//
// Created by Morgan Funtowicz on 9/28/2024.
//

#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP

#include <memory>
#include <string_view>
#include <llama.h>

namespace huggingface::tgi::backends::llama {
    constexpr const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp";

    // Thin RAII wrapper owning the llama.cpp model, context and batch.
    class TgiLlamaCppBackend {
    private:
        llama_model* model;
        llama_context* ctx;
        llama_batch batch;

    public:
        TgiLlamaCppBackend(llama_model* model, llama_context* ctx);
        ~TgiLlamaCppBackend();
    };

    // Load the model located at `root` and wrap it in a backend instance.
    std::unique_ptr<TgiLlamaCppBackend> CreateLlamaCppBackend(std::string_view root);
}

#endif // TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
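For context, a hypothetical caller of this header could look like the sketch below (not part of the commit; the command-line handling and logging are assumptions), simply creating the backend and reporting the failures CreateLlamaCppBackend raises.

// Sketch only: hypothetical driver creating the backend from a model path given on the command line.
#include <cstdlib>
#include <stdexcept>
#include <spdlog/spdlog.h>
#include "backend.hpp"

int main(int argc, char** argv) {
    if (argc < 2) {
        SPDLOG_ERROR("Usage: {} <path-to-gguf-model>", argv[0]);
        return EXIT_FAILURE;
    }

    try {
        const auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(argv[1]);
        SPDLOG_INFO("llama.cpp backend initialised");
    } catch (const std::runtime_error& error) {
        SPDLOG_ERROR("Failed to initialise the backend: {}", error.what());
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}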
backends/llamacpp/src/main.rs
@@ -0,0 +1,3 @@
fn main() {
    println!("Hello, world!");
}