hf_text-generation-inference/backends/llamacpp/csrc/backend.hpp


//
// Created by Morgan Funtowicz on 9/28/2024.
//
#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP
#include <cmath>
#include <cstdint>
#include <expected>
#include <filesystem>
#include <limits>
#include <memory>
#include <span>
#include <string>
#include <vector>

#include <llama.h>
#define LLAMA_SUCCESS(x) ((x) == 0)
namespace huggingface::tgi::backends::llama {
    enum TgiLlamaCppBackendError {
        MODEL_FILE_DOESNT_EXIST = 1
    };

    class TgiLlamaCppBackend {
        using TokenId = llama_token;

    private:
        llama_model* model;
        llama_context* ctx;

        /**
         * Build a llama.cpp sampler from the provided sampling parameters.
         * @param topK Number of highest-probability tokens to keep when sampling
         * @param topP Cumulative-probability (nucleus) threshold to keep when sampling
         * @param frequencyPenalty Penalty applied to tokens proportionally to their frequency in the output
         * @param repetitionPenalty Penalty applied to tokens already present in the output
         * @param seed Seed for the pseudo-random number generator used while sampling
         * @return Owned pointer to the configured sampler
         */
        std::unique_ptr<llama_sampler *> GetSamplerFromArgs(
                uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed);

    public:
        /**
         * Create a backend wrapping an already loaded llama.cpp model and context.
         * @param model Loaded llama.cpp model
         * @param ctx llama.cpp context bound to the model
         */
        TgiLlamaCppBackend(llama_model *model, llama_context *ctx);
        ~TgiLlamaCppBackend();
        /**
         * Tokenize the provided text with the model's tokenizer.
         * @param text UTF-8 encoded text to tokenize
         * @return Token ids corresponding to the input text
         */
        [[nodiscard]] std::vector<TgiLlamaCppBackend::TokenId> Tokenize(const std::string& text) const;

        /**
         * Generate new tokens from the provided prompt tokens.
         * @param tokens Prompt token ids to condition the generation on
         * @param topK Number of highest-probability tokens to keep when sampling
         * @param topP Cumulative-probability (nucleus) threshold to keep when sampling
         * @param maxNewTokens Maximum number of tokens to generate
         * @return Generated token ids
         */
        [[nodiscard]] std::vector<TgiLlamaCppBackend::TokenId> Generate(
                std::span<const TokenId> tokens,
                uint32_t topK,
                float_t topP = 1.0f,
                uint32_t maxNewTokens = std::numeric_limits<uint32_t>::max()
        );
    };
    /**
     * Load the model located at the provided path and create a backend from it.
     * @param root Path to the model file on disk
     * @return The created backend, or a TgiLlamaCppBackendError on failure
     */
    std::expected<std::unique_ptr<TgiLlamaCppBackend>, TgiLlamaCppBackendError>
    CreateLlamaCppBackend(const std::filesystem::path& root);
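    // Minimal usage sketch (illustrative only, not part of the interface): the model path
    // below is a placeholder, and error handling plus detokenization are elided.
    //
    // @code
    //     auto maybeBackend = huggingface::tgi::backends::llama::CreateLlamaCppBackend("/path/to/model.gguf");
    //     if (maybeBackend.has_value()) {
    //         const auto& backend = *maybeBackend;
    //         const auto promptTokens = backend->Tokenize("Hello, my name is");
    //         const auto generated = backend->Generate(promptTokens, /* topK = */ 40, /* topP = */ 0.95f, /* maxNewTokens = */ 128);
    //     }
    // @endcode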
}
#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP