chore(backend): minor improvements

Morgan Funtowicz 2024-11-13 00:08:26 +01:00
parent 363d5e45de
commit 02cd6fe427
1 changed file with 11 additions and 2 deletions

@@ -50,6 +50,8 @@ namespace huggingface::tgi::backends::llamacpp {
InferContext *ctx,
rust::Fn<bool(InferContext *, uint32_t, float_t, bool, size_t)> callback
) {
// Wrapper around the provided Rust callback, injecting the InferContext when crossing back over the C++ FFI boundary.
// It captures the context (ctx) by reference and automatically calls the Rust callback, forwarding it the InferContext.
auto context_forwarding_callback =
[=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool {
return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens);
@@ -76,11 +78,18 @@ namespace huggingface::tgi::backends::llamacpp {
};
std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {
const auto cxxPath = std::string(modelPath);
// Initialize the NUMA context (numactl strategy), guaranteed to run only once per process
static const bool INITIALIZED_NUMA_CONTEXT_ONCE = [](){
llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL);
return true;
}();
// Set up the default parameters for loading the model weights
auto params = llama_model_default_params();
params.use_mmap = true;
auto *model = (llama_load_model_from_file(cxxPath.c_str(), params));
// Allocate the model from the Rust-provided string path
auto *model = (llama_load_model_from_file(static_cast<std::string>(modelPath).c_str(), params));
return std::make_unique<llama_cpp_worker_frontend_t>(model);
}
}
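
For context, a minimal sketch of the two C++ idioms this commit relies on: a function-local static initialized by an immediately-invoked lambda (the initializer runs exactly once and is thread-safe since C++11), and a capturing lambda that forwards a context pointer to a plain callback, mirroring how the Rust callback receives the InferContext across the FFI boundary. The names used here (FakeContext, init_once_demo, forward_with_context, print_token) are illustrative only and are not part of the TGI backend.

#include <cstdint>
#include <cstdio>
#include <functional>

// Illustrative stand-in for the InferContext forwarded across the FFI boundary.
struct FakeContext {
    uint32_t tokens_seen = 0;
};

// Plain callback shape, analogous to rust::Fn<bool(InferContext*, uint32_t, float_t, bool, size_t)>.
using token_callback_t = bool (*)(FakeContext *, uint32_t, float, bool, size_t);

static bool print_token(FakeContext *ctx, uint32_t token_id, float logits, bool is_eos, size_t n_generated) {
    ctx->tokens_seen += 1;
    std::printf("token=%u logits=%.2f eos=%d generated=%zu\n", token_id, logits, is_eos, n_generated);
    return is_eos; // stop once the end-of-sequence token is produced
}

// Idiom 1: one-time initialization via a static local set by an immediately-invoked lambda.
// The lambda body runs exactly once, no matter how often the function is called.
static void init_once_demo() {
    static const bool initialized = []() {
        std::puts("expensive one-time setup (e.g. llama_numa_init)");
        return true;
    }();
    (void) initialized;
}

// Idiom 2: wrap the plain callback in a lambda that captures the context,
// so downstream code only has to supply the per-token arguments.
static std::function<bool(uint32_t, float, bool, size_t)>
forward_with_context(FakeContext *ctx, token_callback_t callback) {
    return [=](uint32_t token_id, float logits, bool is_eos, size_t n_generated) -> bool {
        return callback(ctx, token_id, logits, is_eos, n_generated);
    };
}

int main() {
    init_once_demo();
    init_once_demo(); // the setup message is printed only the first time

    FakeContext ctx;
    auto on_token = forward_with_context(&ctx, &print_token);
    on_token(42, 0.5f, false, 1);
    on_token(2, 0.1f, true, 2);
    return 0;
}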