feat(backend): correctly setup llama_context providing n_threads and n_ubatch

Morgan Funtowicz 2024-11-21 21:43:50 +01:00
parent 50c376612c
commit 84eead219a
5 changed files with 24 additions and 18 deletions

@@ -39,7 +39,7 @@ namespace huggingface::tgi::backends::llamacpp {
         return {pSampler, llama_sampler_deleter};
     }
 
-    worker_t::worker_t(std::shared_ptr<llama_model> model, const llama_context_params &params)
+    worker_t::worker_t(std::shared_ptr<llama_model> model, const llama_context_params &&params)
         : model_(model), context_(llama_new_context_with_model(model_.get(), params)) {
 #ifdef TGI_LLAMACPP_BACKEND_DEBUG

@@ -85,7 +85,7 @@ namespace huggingface::tgi::backends::llamacpp {
          * @param model
          * @param params
          */
-        worker_t(std::shared_ptr<llama_model>, const llama_context_params &);
+        worker_t(std::shared_ptr<llama_model>, const llama_context_params &&);
 
         /**
          *

@@ -51,8 +51,8 @@ namespace huggingface::tgi::backends::llamacpp {
         worker_t worker_;
 
     public:
-        explicit llama_cpp_worker_frontend_t(llama_model *model):
-            model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {}
+        explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads):
+            model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {}
 
         size_t stream(
             rust::Slice<const uint32_t> input_tokens,
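
Note that the brace-initializer above value-initializes every llama_context_params field it does not name. For context, here is a minimal sketch of the equivalent setup written against upstream llama.h, starting from llama_context_default_params() instead; make_context is an illustrative helper, not part of this change:

    // Sketch only: equivalent context setup using llama.cpp defaults.
    // n_ubatch / n_threads / no_perf mirror the designated initializers above;
    // every other field keeps its llama_context_default_params() value.
    #include <llama.h>

    llama_context *make_context(llama_model *model, int32_t num_threads) {
        llama_context_params params = llama_context_default_params();
        params.n_ubatch = 1;            // single-token micro-batches for decoding
        params.n_threads = num_threads; // generation threads
        params.no_perf = true;          // disable internal perf counters
        return llama_new_context_with_model(model, params);
    }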
@@ -88,7 +88,7 @@ namespace huggingface::tgi::backends::llamacpp {
         }
     };
 
-    std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {
+    std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath, uint32_t num_threads) {
 #ifdef TGI_LLAMACPP_BACKEND_DEBUG
         spdlog::set_level(spdlog::level::debug);
 #endif
@@ -105,7 +105,7 @@ namespace huggingface::tgi::backends::llamacpp {
 
         // Allocate the model from the Rust provided, string path
         auto *model = (llama_load_model_from_file(static_cast<std::string>(modelPath).c_str(), params));
-        return std::make_unique<llama_cpp_worker_frontend_t>(model);
+        return std::make_unique<llama_cpp_worker_frontend_t>(model, static_cast<int32_t>(num_threads));
     }
 
     struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }};

@@ -122,8 +122,9 @@ pub struct LlamaCppBackend {
 impl LlamaCppBackend {
     fn allocate_worker(
         path: &Path,
+        num_threads: u32,
     ) -> Result<UniquePtr<LlamaCppWorkerFrontend>, LlamaCppBackendError> {
-        create_worker_frontend(&path.display().to_string()).map_err(|ref err| {
+        create_worker_frontend(&path.display().to_string(), num_threads).map_err(|ref err| {
             LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string())
         })
     }
@@ -145,17 +146,19 @@ impl LlamaCppBackend {
         // Allocate all the workers
         let streams = cores_allocation
             .iter()
-            .map(|affinity| match Self::allocate_worker(path) {
-                Ok(worker) => {
-                    let tokenizer = Arc::clone(&tokenizer);
-                    let (sender, receiver) = channel();
-                    let affinity = affinity.clone().collect::<Vec<_>>();
-                    spawn(move || worker_loop(worker, affinity, tokenizer, receiver));
+            .map(
+                |affinity| match Self::allocate_worker(path, num_cores_per_instance as u32) {
+                    Ok(worker) => {
+                        let tokenizer = Arc::clone(&tokenizer);
+                        let (sender, receiver) = channel();
+                        let affinity = affinity.clone().collect::<Vec<_>>();
+                        spawn(move || worker_loop(worker, affinity, tokenizer, receiver));
 
-                    Ok(LlamaCppWorker { sender })
-                }
-                Err(e) => Err(e),
-            })
+                        Ok(LlamaCppWorker { sender })
+                    }
+                    Err(e) => Err(e),
+                },
+            )
             .collect::<Result<Vec<_>, _>>()?;
 
         // Start the scheduler loop

@@ -49,7 +49,10 @@ mod ffi {
         #[cxx_name = "llama_cpp_worker_frontend_t"]
         type LlamaCppWorkerFrontend;
 
-        fn create_worker_frontend(modelPath: &str) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
+        fn create_worker_frontend(
+            modelPath: &str,
+            num_threads: u32,
+        ) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
 
         fn set_numactl_core_affinity(affinity: &[usize]);