chore(backend): minor improvements
commit 02cd6fe427
parent 363d5e45de
@@ -50,6 +50,8 @@ namespace huggingface::tgi::backends::llamacpp {
             InferContext *ctx,
             rust::Fn<bool(InferContext *, uint32_t, float_t, bool, size_t)> callback
     ) {
+        // Wrapper around the provided Rust callback, injecting the InferContext when crossing back over the C++ FFI boundary.
+        // It captures the context (ctx) by reference and automatically invokes the Rust callback, forwarding the InferContext.
         auto context_forwarding_callback =
                 [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool {
                     return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens);
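For context: the two added comments describe a small adapter lambda that closes over the per-request state so the inner generation loop never has to know about it. Below is a minimal, self-contained sketch of the same pattern; InferContext's field, rust_callback_t, and rust_side_callback are hypothetical stand-ins for the real cxx-generated bridge types, not the backend's actual definitions.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for the backend's per-request context.
    struct InferContext { const char *request_id; };

    // Hypothetical stand-in for the rust::Fn<...> callback type from the cxx bridge.
    using rust_callback_t = bool (*)(InferContext *, uint32_t, float, bool, std::size_t);

    // What the Rust side might do with each token; returns false to stop generation.
    static bool rust_side_callback(InferContext *ctx, uint32_t token, float logits,
                                   bool is_eos, std::size_t n_generated) {
        std::printf("[%s] token=%u eos=%d generated=%zu\n",
                    ctx->request_id, token, is_eos, n_generated);
        return !is_eos;
    }

    int main() {
        InferContext context{"req-0"};
        InferContext *ctx = &context;
        rust_callback_t callback = rust_side_callback;

        // As in the commit: capture ctx by reference, forward it on every call, and
        // expose only a (token, logits, eos, count) signature to the generation loop.
        auto context_forwarding_callback =
                [=, &ctx](uint32_t new_token_id, float logits, bool is_eos,
                          std::size_t n_generated_tokens) -> bool {
                    return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens);
                };

        context_forwarding_callback(42, 0.5f, false, 1); // keeps going (returns true)
        context_forwarding_callback(2, 0.1f, true, 2);   // EOS: returns false
    }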
@@ -76,11 +78,18 @@ namespace huggingface::tgi::backends::llamacpp {
     };
 
     std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {
-        const auto cxxPath = std::string(modelPath);
+        // Initialize the NUMA context from numactl
+        static const bool INITIALIZED_NUMA_CONTEXT_ONCE = [](){
+            llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL);
+            return true;
+        }();
+
+        // Allocate model weights parameters
         auto params = llama_model_default_params();
         params.use_mmap = true;
 
-        auto *model = (llama_load_model_from_file(cxxPath.c_str(), params));
+        // Allocate the model from the Rust-provided string path
+        auto *model = (llama_load_model_from_file(static_cast<std::string>(modelPath).c_str(), params));
         return std::make_unique<llama_cpp_worker_frontend_t>(model);
     }
 }
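The NUMA hunk relies on the C++11 "magic statics" guarantee: a function-local static is initialized exactly once, and that initialization is thread-safe, so wrapping llama_numa_init in an immediately invoked lambda makes create_worker_frontend safe to call repeatedly (and concurrently) while paying the init cost only on the first call. A minimal sketch of the idiom, with a hypothetical one_time_init standing in for llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL):

    #include <cstdio>

    // Hypothetical stand-in for the real one-shot initialization call.
    static void one_time_init() { std::puts("initialized once"); }

    static void create_worker_frontend() {
        // C++11 magic statics: the initializer of a function-local static runs
        // exactly once, even if several threads reach this line concurrently.
        static const bool INITIALIZED_ONCE = []() {
            one_time_init();
            return true;
        }();
        (void) INITIALIZED_ONCE; // silence unused-variable warnings
    }

    int main() {
        create_worker_frontend(); // prints "initialized once"
        create_worker_frontend(); // prints nothing: the initializer already ran
    }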
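The same hunk also drops the named cxxPath variable in favor of converting inline. That is safe because the std::string temporary produced by the cast lives until the end of the full expression, so the pointer returned by c_str() stays valid for the entire llama_load_model_from_file call. A sketch under the assumption that rust::Str behaves like the hypothetical Str below, i.e. a borrowed slice that is not NUL-terminated and therefore must be materialized as a std::string before handing it to a C API:

    #include <cstddef>
    #include <cstdio>
    #include <string>

    // Hypothetical stand-in for a C API that expects a NUL-terminated path,
    // as llama_load_model_from_file does.
    static void load_from_file(const char *path) { std::printf("loading %s\n", path); }

    // Hypothetical stand-in for cxx's rust::Str: a borrowed (pointer, length)
    // slice with a conversion that copies it into an owned std::string.
    struct Str {
        const char *ptr;
        std::size_t len;
        explicit operator std::string() const { return std::string(ptr, len); }
    };

    int main() {
        Str modelPath{"/models/llama.gguf", 18};
        // The temporary std::string outlives the call, so c_str() is safe here.
        load_from_file(static_cast<std::string>(modelPath).c_str());
    }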