//
// Created by Morgan Funtowicz on 9/28/2024.
//
#include <chrono>
#include <cmath>
#include <expected>
#include <filesystem>
#include <memory>
#include <span>
#include <vector>

#include <fmt/chrono.h>
#include <fmt/std.h>
#include <llama.h>
#include <spdlog/spdlog.h>

#include "backend.hpp"

namespace huggingface::tgi::backends::llamacpp {

    [[nodiscard]]
    std::expected<std::pair<llama_model *, llama_context *>, TgiLlamaCppBackendError>
    TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath) noexcept {
        SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath);

        llama_backend_init();
        llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL);

        // Load the model
        if (!exists(modelPath)) {
            return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST);
        }

        auto params = llama_model_default_params();
        auto *model = llama_load_model_from_file(modelPath.c_str(), params);

        // Unspecified context parameters are value-initialized (0 / false)
        auto *context = llama_new_context_with_model(model, {
                .n_batch = 1,
                .n_threads = 16,
                .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL,
                .flash_attn = false,
        });

        return std::make_pair(model, context);
    }

    huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model,
                                                                                 llama_context *const ctx)
            : model(model), ctx(ctx) {
#ifndef NDEBUG
        char modelName[256];
        llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName));
        SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName));
#endif
    }

    huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::~TgiLlamaCppBackend() {
        if (ctx) {
            SPDLOG_DEBUG("Freeing llama.cpp context");
            llama_free(ctx);
        }

        if (model) {
            SPDLOG_DEBUG("Freeing llama.cpp model");
            llama_free_model(model);
        }
    }

    std::vector<llama_token> TgiLlamaCppBackend::Tokenize(const std::string &text) const {
        // Size the buffer from the context length; if it is still too small, llama_tokenize
        // reports the required size as a negative count and we retry with a resized buffer.
        std::vector<llama_token> tokens(llama_n_ctx(ctx));

        if (auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true);
                nTokens < 0) {
            tokens.resize(-nTokens);
            llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true);
        } else {
            tokens.resize(nTokens);
        }

        SPDLOG_DEBUG(FMT_STRING("Tokenized input with {:d} tokens"), tokens.size());
        return tokens;
    }

    std::unique_ptr<llama_sampler *> TgiLlamaCppBackend::GetSamplerFromArgs(
            const uint32_t topK, const float_t topP, const float_t frequencyPenalty,
            const float_t repetitionPenalty, const uint64_t seed) {
        auto *sampler = llama_sampler_chain_init({.no_perf = false});

        // Penalties
        llama_sampler_chain_add(sampler, llama_sampler_init_penalties(
                llama_n_vocab(model),
                llama_token_eos(model),
                llama_token_nl(model),
                0.0f,
                repetitionPenalty,
                frequencyPenalty,
                0.0f,
                false,
                false
        ));
        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast<int32_t>(topK)));

        if (0 < topP && topP < 1) {
            llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1));
        }

        llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed));

        // NB: wraps the raw chain pointer; llama_sampler_free is not called automatically
        return std::make_unique<llama_sampler *>(sampler);
    }

    std::expected<std::vector<llama_token>, TgiLlamaCppBackendError>
    huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::Generate(
            std::span<const llama_token> tokens,
            const uint32_t topK,
            const float_t topP,
            const float_t frequencyPenalty,
            const float_t repetitionPenalty,
            const uint32_t maxNewTokens,
            const uint64_t seed
    ) {
        SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size());

        // Allocate generation result
        std::vector<llama_token> generated;
        generated.reserve(llama_n_ctx(ctx) - tokens.size());

        // Retrieve decoding context
        auto batch = llama_batch_get_one(const_cast<llama_token *>(tokens.data()),
                                         static_cast<int32_t>(tokens.size()));
        auto sampler = GetSamplerFromArgs(topK, topP, frequencyPenalty, repetitionPenalty, seed);

        // Decode loop; the sampled token lives outside the loop body because `batch`
        // keeps pointing to it across iterations.
        llama_token new_token_id = 0;
        for (auto [generating, nDecoded] = std::pair{true, 0uz};
             generating && nDecoded < maxNewTokens; ++nDecoded) {
#ifndef NDEBUG
            const auto start = std::chrono::steady_clock::now();
            const auto status = llama_decode(ctx, batch);
            const auto end = std::chrono::steady_clock::now();

            const auto latency = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
            SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency);
#else
            const auto status = llama_decode(ctx, batch);
#endif
            if (LLAMA_SUCCESS(status)) {
                // Sample the next token from the logits of the last decoded position
                new_token_id = llama_sampler_sample(*sampler, ctx, -1);
                generated.emplace_back(new_token_id);
                generating = !llama_token_is_eog(model, new_token_id);

                // Next iteration: feed the freshly sampled token back as a single-token batch
                batch = llama_batch_get_one(&new_token_id, 1);
            }
        }

        return generated;
    }
}
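
// Illustrative usage sketch (kept commented out, not part of the backend itself): a minimal
// caller built on the API defined above. The model path, prompt, and sampling parameters
// below are placeholder values chosen for the example.
//
//     using huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend;
//
//     if (const auto resources = TgiLlamaCppBackend::FromGGUF("/path/to/model.gguf"); resources.has_value()) {
//         const auto [model, context] = *resources;
//         auto backend = TgiLlamaCppBackend(model, context);
//
//         const auto prompt = backend.Tokenize("What is the capital of France?");
//         const auto generated = backend.Generate(
//                 prompt, /* topK = */ 40, /* topP = */ 0.95f, /* frequencyPenalty = */ 0.0f,
//                 /* repetitionPenalty = */ 1.1f, /* maxNewTokens = */ 64, /* seed = */ 2014);
//
//         if (generated.has_value())
//             SPDLOG_INFO(FMT_STRING("Generated {:d} token(s)"), generated->size());
//     }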