feat(backend): correctly set up llama_context, providing n_threads and n_ubatch
parent 50c376612c
commit 84eead219a
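This change threads an explicit thread count from the Rust frontend down to the C++ worker_t, and pins the context's micro-batch size (n_ubatch) to 1 while keeping performance timing disabled. For orientation, here is a minimal standalone sketch of the underlying llama.cpp pattern (not code from this repository; llama_context_default_params, n_threads, n_ubatch, no_perf, and llama_new_context_with_model are the upstream llama.cpp API):

    // Minimal sketch: build a llama_context with explicit threading and
    // micro-batch settings, starting from the upstream library defaults.
    #include <llama.h>

    llama_context *make_context(llama_model *model, int32_t num_threads) {
        llama_context_params params = llama_context_default_params();
        params.n_threads = num_threads; // threads used for token generation
        params.n_ubatch = 1;            // physical (micro) batch size
        params.no_perf = true;          // skip perf timing collection
        return llama_new_context_with_model(model, params);
    }

Note that llama.cpp also exposes n_threads_batch for prompt processing; the diff below sets only n_threads.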
@@ -39,7 +39,7 @@ namespace huggingface::tgi::backends::llamacpp {
         return {pSampler, llama_sampler_deleter};
     }
 
-    worker_t::worker_t(std::shared_ptr<llama_model> model, const llama_context_params &params)
+    worker_t::worker_t(std::shared_ptr<llama_model> model, const llama_context_params &&params)
             : model_(model), context_(llama_new_context_with_model(model_.get(), params)) {
 
 #ifdef TGI_LLAMACPP_BACKEND_DEBUG
@@ -85,7 +85,7 @@ namespace huggingface::tgi::backends::llamacpp {
          * @param model
          * @param params
          */
-        worker_t(std::shared_ptr<llama_model>, const llama_context_params &);
+        worker_t(std::shared_ptr<llama_model>, const llama_context_params &&);
 
         /**
          *
@@ -51,8 +51,8 @@ namespace huggingface::tgi::backends::llamacpp {
         worker_t worker_;
 
     public:
-        explicit llama_cpp_worker_frontend_t(llama_model *model):
-            model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {}
+        explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads):
+            model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {}
 
         size_t stream(
             rust::Slice<const uint32_t> input_tokens,
@@ -88,7 +88,7 @@ namespace huggingface::tgi::backends::llamacpp {
         }
     };
 
-    std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath) {
+    std::unique_ptr<llama_cpp_worker_frontend_t> create_worker_frontend(rust::Str modelPath, uint32_t num_threads) {
 #ifdef TGI_LLAMACPP_BACKEND_DEBUG
         spdlog::set_level(spdlog::level::debug);
 #endif
@@ -105,7 +105,7 @@ namespace huggingface::tgi::backends::llamacpp {
 
         // Allocate the model from the Rust provided, string path
         auto *model = (llama_load_model_from_file(static_cast<std::string>(modelPath).c_str(), params));
-        return std::make_unique<llama_cpp_worker_frontend_t>(model);
+        return std::make_unique<llama_cpp_worker_frontend_t>(model, static_cast<int32_t>(num_threads));
     }
 
     struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }};
@@ -122,8 +122,9 @@ pub struct LlamaCppBackend {
 impl LlamaCppBackend {
     fn allocate_worker(
         path: &Path,
+        num_threads: u32,
     ) -> Result<UniquePtr<LlamaCppWorkerFrontend>, LlamaCppBackendError> {
-        create_worker_frontend(&path.display().to_string()).map_err(|ref err| {
+        create_worker_frontend(&path.display().to_string(), num_threads).map_err(|ref err| {
             LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string())
         })
     }
@@ -145,7 +146,8 @@ impl LlamaCppBackend {
         // Allocate all the workers
         let streams = cores_allocation
             .iter()
-            .map(|affinity| match Self::allocate_worker(path) {
+            .map(
+                |affinity| match Self::allocate_worker(path, num_cores_per_instance as u32) {
                 Ok(worker) => {
                     let tokenizer = Arc::clone(&tokenizer);
                     let (sender, receiver) = channel();
@@ -155,7 +157,8 @@ impl LlamaCppBackend {
                     Ok(LlamaCppWorker { sender })
                 }
                 Err(e) => Err(e),
-            })
+                },
+            )
             .collect::<Result<Vec<_>, _>>()?;
 
         // Start the scheduler loop
@@ -49,7 +49,10 @@ mod ffi {
         #[cxx_name = "llama_cpp_worker_frontend_t"]
         type LlamaCppWorkerFrontend;
 
-        fn create_worker_frontend(modelPath: &str) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
+        fn create_worker_frontend(
+            modelPath: &str,
+            num_threads: u32,
+        ) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
 
         fn set_numactl_core_affinity(affinity: &[usize]);
 
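A note on the constructor hunk: {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true} uses C++20 designated initializers, which value-initialize (zero) every llama_context_params member that is not named, rather than applying the llama_context_default_params() defaults. A self-contained sketch of that semantics, using a hypothetical stand-in struct rather than the real llama.cpp one:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in aggregate, for illustration only.
    struct ctx_params_t {
        uint32_t n_ctx;     // imagine the library default is 4096
        int32_t n_threads;
        bool no_perf;
    };

    int main() {
        // Members not named in the initializer are value-initialized,
        // so n_ctx ends up 0 here, not any library-provided default.
        ctx_params_t p{.n_threads = 8, .no_perf = true};
        assert(p.n_ctx == 0);
        return 0;
    }

In llama.cpp a zero n_ctx is documented as "take the value from the model"; whether the other zeroed fields are acceptable depends on the llama.cpp revision in use.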