misc(backend): allow rebinding numa core affinity
This commit is contained in:
parent
30ae99631c
commit
2d9465d181
|
@ -3,7 +3,6 @@
|
||||||
//
|
//
|
||||||
|
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
#include <span>
|
|
||||||
|
|
||||||
#include <ggml.h>
|
#include <ggml.h>
|
||||||
#include <llama.h>
|
#include <llama.h>
|
||||||
|
|
|
@ -111,7 +111,7 @@ namespace huggingface::tgi::backends::llamacpp {
|
||||||
struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }};
|
struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }};
|
||||||
typedef std::unique_ptr<struct bitmask, numa_cpumask_deleter> unique_cpumask_ptr;
|
typedef std::unique_ptr<struct bitmask, numa_cpumask_deleter> unique_cpumask_ptr;
|
||||||
|
|
||||||
void set_numactl_core_affinity(rust::Slice<const size_t> affinity) {
|
void set_numa_core_affinity(rust::Slice<const size_t> affinity) {
|
||||||
// void set_numactl_core_affinity(std::vector<size_t> affinity) {
|
// void set_numactl_core_affinity(std::vector<size_t> affinity) {
|
||||||
#ifdef NUMA_AVAILABLE
|
#ifdef NUMA_AVAILABLE
|
||||||
if(numa_available()) {
|
if(numa_available()) {
|
||||||
|
@ -173,6 +173,14 @@ namespace huggingface::tgi::backends::llamacpp {
|
||||||
SPDLOG_WARN("TGI's llama.cpp backend was compiled without NUMA support");
|
SPDLOG_WARN("TGI's llama.cpp backend was compiled without NUMA support");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Logs the id of the calling thread, then calls llama_numa_init with the GGML_NUMA_STRATEGY_NUMACTL strategy. Takes no arguments and returns nothing.
|
||||||
|
*/
|
||||||
|
void update_numa_affinity() {
|
||||||
|
SPDLOG_INFO("Rebinding NUMA affinity for current worker on thread: {}", std::this_thread::get_id());
|
||||||
|
llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
use crate::ffi::{
|
use crate::ffi::{
|
||||||
create_worker_frontend, set_numactl_core_affinity, GenerationParams, LlamaCppWorkerFrontend,
|
create_worker_frontend, set_numa_core_affinity, update_numa_affinity, GenerationParams,
|
||||||
SamplingParams,
|
LlamaCppWorkerFrontend, SamplingParams,
|
||||||
};
|
};
|
||||||
use async_channel::{unbounded as mpmc_unbounded, Receiver as MpmcReceiver, Sender as MpmcSender};
|
use async_channel::{unbounded as mpmc_unbounded, Receiver as MpmcReceiver, Sender as MpmcSender};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
|
@ -8,7 +8,6 @@ use cxx::UniquePtr;
|
||||||
use log::warn;
|
use log::warn;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::sync::mpsc::{channel, Receiver, Sender};
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::thread::spawn;
|
use std::thread::spawn;
|
||||||
use text_generation_router::infer::InferError::GenerationError;
|
use text_generation_router::infer::InferError::GenerationError;
|
||||||
|
@ -25,17 +24,6 @@ use tokio::time::Instant;
|
||||||
use tokio_stream::wrappers::UnboundedReceiverStream;
|
use tokio_stream::wrappers::UnboundedReceiverStream;
|
||||||
use tracing::{debug, error, info};
|
use tracing::{debug, error, info};
|
||||||
|
|
||||||
// Calls `.send(..)` on `$send` and emits a `warn!` when that call returns `Err`.
// Takes two expression arguments: `$send` and `$err`.
macro_rules! send_or_warn {
|
|
||||||
($send: expr, $err: expr) => {
|
|
||||||
// NOTE(review): `err` below is a plain identifier, not the `$err` macro parameter,
// which is never referenced in this body — looks like a typo; confirm.
if let Err(se) = $send.send(err) {
|
|
||||||
warn!(
|
|
||||||
"Failed to send message back to the user: {}. Originating error: {}",
|
|
||||||
// NOTE(review): `e` is not bound anywhere inside this macro — confirm which value
// was meant to be reported as the originating error.
se, e
|
|
||||||
);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
fn get_num_cores() -> usize {
|
fn get_num_cores() -> usize {
|
||||||
match option_env!("TGI_USE_PHYSICAL_CORES")
|
match option_env!("TGI_USE_PHYSICAL_CORES")
|
||||||
.unwrap_or("OFF")
|
.unwrap_or("OFF")
|
||||||
|
@ -272,8 +260,9 @@ fn worker_loop(
|
||||||
// This loop will mostly decode single token at every step, so no need to rely on parallelism
|
// This loop will mostly decode single token at every step, so no need to rely on parallelism
|
||||||
tokenizers::utils::parallelism::set_parallelism(false);
|
tokenizers::utils::parallelism::set_parallelism(false);
|
||||||
|
|
||||||
// Bind cores for the current thread
|
// Bind cores for the current thread and make sure it's taken into account
|
||||||
set_numactl_core_affinity(&affinity);
|
set_numa_core_affinity(&affinity);
|
||||||
|
update_numa_affinity();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if let Ok((generation, stream)) = backlog.recv_blocking() {
|
if let Ok((generation, stream)) = backlog.recv_blocking() {
|
||||||
|
|
|
@ -54,7 +54,8 @@ mod ffi {
|
||||||
num_threads: u32,
|
num_threads: u32,
|
||||||
) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
|
) -> Result<UniquePtr<LlamaCppWorkerFrontend>>;
|
||||||
|
|
||||||
fn set_numactl_core_affinity(affinity: &[usize]);
|
fn set_numa_core_affinity(affinity: &[usize]);
|
||||||
|
fn update_numa_affinity();
|
||||||
|
|
||||||
unsafe fn stream(
|
unsafe fn stream(
|
||||||
self: Pin<&mut LlamaCppWorkerFrontend>,
|
self: Pin<&mut LlamaCppWorkerFrontend>,
|
||||||
|
|
Loading…
Reference in New Issue