diff --git a/Cargo.lock b/Cargo.lock index ffc98baa..923b5cbe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -801,6 +801,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "ctrlc" version = "3.4.4" @@ -3402,9 +3423,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.118" +version = "1.0.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4" +checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" dependencies = [ "itoa", "ryu", @@ -3650,15 +3671,16 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "sysinfo" -version = "0.30.12" +version = "0.30.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "732ffa00f53e6b2af46208fba5718d9662a421049204e156328b66791ffa15ae" +checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3" dependencies = [ "cfg-if", "core-foundation-sys", "libc", "ntapi", "once_cell", + "rayon", "windows", ] @@ -3805,6 +3827,7 @@ dependencies = [ "axum-tracing-opentelemetry", "base64 0.22.1", "clap", + "csv", "futures", "futures-util", "hf-hub", @@ -3826,6 +3849,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "sysinfo", "text-generation-client", "thiserror", "tokenizers", @@ -3837,6 +3861,7 @@ dependencies = [ "tracing-subscriber", "utoipa", "utoipa-swagger-ui", + "uuid", "vergen", ] @@ -4508,9 +4533,25 @@ dependencies = [ [[package]] name = "uuid" -version = "1.9.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +dependencies = [ + "getrandom", + "rand", + "uuid-macro-internal", +] + +[[package]] +name = "uuid-macro-internal" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee1cd046f83ea2c4e920d6ee9f7c3537ef928d75dce5d84a87c2c5d6b3999a3a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] [[package]] name = "v_frame" diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/basic_tutorials/launcher.md index 5e40146f..77f88490 100644 --- a/docs/source/basic_tutorials/launcher.md +++ b/docs/source/basic_tutorials/launcher.md @@ -424,6 +424,22 @@ Options: [env: LORA_ADAPTERS=] +``` +## DISABLE_USAGE_STATS +```shell + --disable-usage-stats + Disable sending of all usage statistics + + [env: DISABLE_USAGE_STATS=] + +``` +## DISABLE_CRASH_REPORTS +```shell + --disable-crash-reports + Disable sending of crash reports, but allow anonymous usage statistics + + [env: DISABLE_CRASH_REPORTS=] + ``` ## HELP ```shell diff --git a/docs/source/usage_statistics.md b/docs/source/usage_statistics.md new file mode 100644 index 00000000..adf0d70f --- /dev/null +++ b/docs/source/usage_statistics.md @@ -0,0 +1,73 @@ + +# Collection of Usage Statistics + +Text Generation Inference collects anonymous usage statistics to help us improve the service. The collected data is used to improve TGI and to understand what causes failures. The data is collected transparently and any sensitive information is omitted. + +Data is sent twice, once on server startup and once when server stops. Also, usage statistics are only enabled when TGI is running in docker to avoid collecting data then TGI runs directly on the host machine. + +## What data is collected + +The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/router/src/usage_stats.rs). +As of release 2.1.2 this is an example of the data collected: + +- From the TGI configuration: +```json +{ + "event_type": "start", + "disable_grammar_support": false, + "max_batch_prefill_tokens": 4096, + "max_batch_size": null, + "max_batch_total_tokens": null, + "max_best_of": 2, + "max_client_batch_size": 4, + "max_concurrent_requests": 128, + "max_input_tokens": 1024, + "max_stop_sequences": 4, + "max_top_n_tokens": 5, + "max_total_tokens": 2048, + "max_waiting_tokens": 20, + "messages_api_enabled": false, + "model_config": { + "model_type": "Bloom" + }, + "revision": null, + "tokenizer_class": "BloomTokenizerFast", + "validation_workers": 2, + "waiting_served_ratio": 1.2, + "docker_label": "latest", + "git_sha": "cfc118704880453d29bcbe4fbbd91dda501cf5fe", + "nvidia_env": { + "name": "NVIDIA A10G", + "pci_bus_id": "00000000:00:1E.0", + "driver_version": "535.183.01", + "pstate": "P8", + "pcie_link_gen_max": "4", + "pcie_link_gen_current": "1", + "temperature_gpu": "31", + "utilization_gpu": "0 %", + "utilization_memory": "0 %", + "memory_total": "23028 MiB", + "memory_free": "22515 MiB", + "memory_used": "0 MiB", + "reset_status_reset_required": "No", + "reset_status_drain_and_reset_recommended": "No", + "compute_cap": "8.6", + "ecc_errors_corrected_volatile_total": "0", + "mig_mode_current": "[N/A]", + "power_draw_instant": "10.86 W", + "power_limit": "300.00 W" + }, + "system_env": { + "cpu_count": 16, + "cpu_type": "AMD EPYC 7R32", + "total_memory": 66681196544, + "architecture": "x86_64", + "platform": "linux-unix-x86_64" + } +} + +``` + +## How to opt-out + +You can easily opt out by passing the `--disable-usage-stats` to the text-generation-launcher command. This will disable all usage statistics. You can also pass `--disable-crash-reports` which disables sending specific crash reports, but allows anonymous usage statistics. diff --git a/launcher/src/main.rs b/launcher/src/main.rs index d2ca38e5..ef7b4712 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -457,6 +457,14 @@ struct Args { /// startup that will be available to callers via the `adapter_id` field in a request. #[clap(long, env)] lora_adapters: Option, + + /// Disable sending of all usage statistics + #[clap(default_value = "false", long, env)] + disable_usage_stats: bool, + + /// Disable sending of crash reports, but allow anonymous usage statistics + #[clap(default_value = "false", long, env)] + disable_crash_reports: bool, } #[derive(Debug)] @@ -1201,6 +1209,14 @@ fn spawn_webserver( args.model_id, ]; + // Pass usage stats flags to router + if args.disable_usage_stats { + router_args.push("--disable-usage-stats".to_string()); + } + if args.disable_crash_reports { + router_args.push("--disable-crash-reports".to_string()); + } + // Grammar support if args.disable_grammar_support { router_args.push("--disable-grammar-support".to_string()); diff --git a/router/Cargo.toml b/router/Cargo.toml index 60fb5c9d..0fc700a0 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -52,6 +52,10 @@ regex = "1.10.3" once_cell = "1.19.0" image = "0.25.1" base64 = { workspace = true } +sysinfo = "0.30.13" +uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] } +csv = "1.3.0" + [build-dependencies] vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] } diff --git a/router/src/lib.rs b/router/src/lib.rs index f856406d..52926c6c 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -7,6 +7,8 @@ mod validation; #[cfg(feature = "kserve")] mod kserve; +pub mod usage_stats; + use serde::{Deserialize, Serialize}; use tracing::warn; use utoipa::ToSchema; @@ -40,13 +42,13 @@ pub struct HubModelInfo { pub pipeline_tag: Option, } -#[derive(Debug, Clone, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct ChatTemplate { name: String, template: String, } -#[derive(Debug, Clone, Deserialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(untagged)] pub enum ChatTemplateVersions { Single(String), @@ -55,7 +57,7 @@ pub enum ChatTemplateVersions { use std::path::Path; -#[derive(Debug, Clone, Deserialize, Default)] +#[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct HubTokenizerConfig { pub chat_template: Option, pub completion_template: Option, diff --git a/router/src/main.rs b/router/src/main.rs index b060d73c..bfc77913 100644 --- a/router/src/main.rs +++ b/router/src/main.rs @@ -14,6 +14,7 @@ use std::io::BufReader; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::path::{Path, PathBuf}; use text_generation_router::config::Config; +use text_generation_router::usage_stats; use text_generation_router::{ server, HubModelInfo, HubPreprocessorConfig, HubProcessorConfig, HubTokenizerConfig, }; @@ -87,6 +88,10 @@ struct Args { disable_grammar_support: bool, #[clap(default_value = "4", long, env)] max_client_batch_size: usize, + #[clap(long, env, default_value_t)] + disable_usage_stats: bool, + #[clap(long, env, default_value_t)] + disable_crash_reports: bool, } #[derive(Debug, Subcommand)] @@ -128,6 +133,8 @@ async fn main() -> Result<(), RouterError> { messages_api_enabled, disable_grammar_support, max_client_batch_size, + disable_usage_stats, + disable_crash_reports, command, } = args; @@ -324,6 +331,7 @@ async fn main() -> Result<(), RouterError> { tracing::warn!("Could not find tokenizer config locally and no API specified"); HubTokenizerConfig::default() }); + let tokenizer_class = tokenizer_config.tokenizer_class.clone(); let tokenizer: Option = tokenizer_filename.and_then(|filename| { let mut tokenizer = Tokenizer::from_file(filename).ok(); @@ -378,8 +386,47 @@ async fn main() -> Result<(), RouterError> { } }; + // Only send usage stats when TGI is run in container and the function returns Some + let is_container = matches!(usage_stats::is_container(), Ok(true)); + + let user_agent = if !disable_usage_stats && is_container { + let reduced_args = usage_stats::Args::new( + config.clone(), + tokenizer_class, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_top_n_tokens, + max_input_tokens, + max_total_tokens, + waiting_served_ratio, + max_batch_prefill_tokens, + max_batch_total_tokens, + max_waiting_tokens, + max_batch_size, + revision, + validation_workers, + messages_api_enabled, + disable_grammar_support, + max_client_batch_size, + disable_usage_stats, + disable_crash_reports, + ); + Some(usage_stats::UserAgent::new(reduced_args)) + } else { + None + }; + + if let Some(ref ua) = user_agent { + let start_event = + usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start, None); + tokio::spawn(async move { + start_event.send().await; + }); + }; + // Run server - server::run( + let result = server::run( master_shard_uds_path, model_info, compat_return_full_text, @@ -410,8 +457,41 @@ async fn main() -> Result<(), RouterError> { max_client_batch_size, print_schema_command, ) - .await?; - Ok(()) + .await; + + match result { + Ok(_) => { + if let Some(ref ua) = user_agent { + let stop_event = usage_stats::UsageStatsEvent::new( + ua.clone(), + usage_stats::EventType::Stop, + None, + ); + stop_event.send().await; + }; + Ok(()) + } + Err(e) => { + if let Some(ref ua) = user_agent { + if !disable_crash_reports { + let error_event = usage_stats::UsageStatsEvent::new( + ua.clone(), + usage_stats::EventType::Error, + Some(e.to_string()), + ); + error_event.send().await; + } else { + let unknow_error_event = usage_stats::UsageStatsEvent::new( + ua.clone(), + usage_stats::EventType::Error, + Some("unknow_error".to_string()), + ); + unknow_error_event.send().await; + } + }; + Err(RouterError::WebServer(e)) + } + } } /// Init logging using env variables LOG_LEVEL and LOG_FORMAT: diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs new file mode 100644 index 00000000..8559ae90 --- /dev/null +++ b/router/src/usage_stats.rs @@ -0,0 +1,355 @@ +use crate::config::Config; +use csv::ReaderBuilder; +use reqwest::header::HeaderMap; +use serde::Serialize; +use std::{ + fs::File, + io::{self, BufRead}, + path::Path, + process::Command, + time::Duration, +}; +use uuid::Uuid; + +const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi"; + +#[derive(Debug, Clone, Serialize)] +pub struct UserAgent { + pub uid: String, + pub args: Args, + pub env: Env, +} + +impl UserAgent { + pub fn new(reduced_args: Args) -> Self { + Self { + uid: Uuid::new_v4().to_string(), + args: reduced_args, + env: Env::new(), + } + } +} + +#[derive(Serialize, Debug)] +pub enum EventType { + Start, + Stop, + Error, +} + +#[derive(Debug, Serialize)] +pub struct UsageStatsEvent { + user_agent: UserAgent, + event_type: EventType, + #[serde(skip_serializing_if = "Option::is_none")] + error_reason: Option, +} + +impl UsageStatsEvent { + pub fn new(user_agent: UserAgent, event_type: EventType, error_reason: Option) -> Self { + Self { + user_agent, + event_type, + error_reason, + } + } + pub async fn send(&self) { + let mut headers = HeaderMap::new(); + headers.insert("Content-Type", "application/json".parse().unwrap()); + let body = serde_json::to_string(&self).unwrap(); + let client = reqwest::Client::new(); + let _ = client + .post(TELEMETRY_URL) + .headers(headers) + .body(body) + .timeout(Duration::from_secs(5)) + .send() + .await; + } +} + +#[derive(Debug, Clone, Serialize)] +pub struct Args { + model_config: Option, + tokenizer_config: Option, + max_concurrent_requests: usize, + max_best_of: usize, + max_stop_sequences: usize, + max_top_n_tokens: u32, + max_input_tokens: usize, + max_total_tokens: usize, + waiting_served_ratio: f32, + max_batch_prefill_tokens: u32, + max_batch_total_tokens: Option, + max_waiting_tokens: usize, + max_batch_size: Option, + revision: Option, + validation_workers: usize, + messages_api_enabled: bool, + disable_grammar_support: bool, + max_client_batch_size: usize, + disable_usage_stats: bool, + disable_crash_reports: bool, +} + +impl Args { + #[allow(clippy::too_many_arguments)] + pub fn new( + model_config: Option, + tokenizer_config: Option, + max_concurrent_requests: usize, + max_best_of: usize, + max_stop_sequences: usize, + max_top_n_tokens: u32, + max_input_tokens: usize, + max_total_tokens: usize, + waiting_served_ratio: f32, + max_batch_prefill_tokens: u32, + max_batch_total_tokens: Option, + max_waiting_tokens: usize, + max_batch_size: Option, + revision: Option, + validation_workers: usize, + messages_api_enabled: bool, + disable_grammar_support: bool, + max_client_batch_size: usize, + disable_usage_stats: bool, + disable_crash_reports: bool, + ) -> Self { + Self { + model_config, + tokenizer_config, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_top_n_tokens, + max_input_tokens, + max_total_tokens, + waiting_served_ratio, + max_batch_prefill_tokens, + max_batch_total_tokens, + max_waiting_tokens, + max_batch_size, + revision, + validation_workers, + messages_api_enabled, + disable_grammar_support, + max_client_batch_size, + disable_usage_stats, + disable_crash_reports, + } + } +} + +/// This is more or less a copy of the code from the `text-generation-launcher` crate to avoid a dependency +#[derive(Serialize, Debug, Clone)] +pub struct Env { + git_sha: &'static str, + docker_label: &'static str, + nvidia_info: Option>, + xpu_info: Option>, + system_env: SystemInfo, +} + +#[derive(Debug, Serialize, Clone)] +struct NvidiaSmiInfo { + name: String, + pci_bus_id: String, + driver_version: String, + pstate: String, + pcie_link_gen_max: String, + pcie_link_gen_current: String, + temperature_gpu: String, + utilization_gpu: String, + utilization_memory: String, + memory_total: String, + memory_free: String, + memory_used: String, + reset_status_reset_required: String, + reset_status_drain_and_reset_recommended: String, + compute_cap: String, + ecc_errors_corrected_volatile_total: String, + mig_mode_current: String, + power_draw_instant: String, + power_limit: String, +} + +impl NvidiaSmiInfo { + fn new() -> Option> { + let output = Command::new("nvidia-smi") + .args([ + "--query-gpu=name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.gpucurrent,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,reset_status.reset_required,reset_status.drain_and_reset_recommended,compute_cap,ecc.errors.corrected.volatile.total,mig.mode.current,power.draw.instant,power.limit", + "--format=csv" + ]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let stdout = String::from_utf8(output.stdout).ok()?; + + let mut rdr = ReaderBuilder::new() + .has_headers(true) + .from_reader(stdout.as_bytes()); + + let mut infos = Vec::new(); + + for result in rdr.records() { + let record = result.ok()?; + infos.push(NvidiaSmiInfo { + name: record[0].to_string(), + pci_bus_id: record[1].to_string(), + driver_version: record[2].to_string(), + pstate: record[3].to_string(), + pcie_link_gen_max: record[4].to_string(), + pcie_link_gen_current: record[5].to_string(), + temperature_gpu: record[6].to_string(), + utilization_gpu: record[7].to_string(), + utilization_memory: record[8].to_string(), + memory_total: record[9].to_string(), + memory_free: record[10].to_string(), + memory_used: record[11].to_string(), + reset_status_reset_required: record[12].to_string(), + reset_status_drain_and_reset_recommended: record[13].to_string(), + compute_cap: record[14].to_string(), + ecc_errors_corrected_volatile_total: record[15].to_string(), + mig_mode_current: record[16].to_string(), + power_draw_instant: record[17].to_string(), + power_limit: record[18].to_string(), + }); + } + + Some(infos) + } +} + +#[derive(Debug, Serialize, Clone)] +struct XpuSmiInfo { + device_id: usize, + gpu_utilization: f32, + gpu_power: f32, + gpu_core_temperature: f32, + gpu_memory_bandwidth_utilization: f32, +} + +impl XpuSmiInfo { + /// based on this https://github.com/intel/xpumanager/blob/master/doc/smi_user_guide.md#dump-the-device-statistics-in-csv-format + fn new() -> Option> { + let output = Command::new("xpu-smi") + .args([ + "dump", "-d", "-1", "-m", + "0,1,3,17", // Metrics IDs: GPU Utilization, GPU Power, GPU Core Temperature, GPU Memory Bandwidth Utilization + "-n", "1", "-j", + ]) + .output() + .ok()?; + + if !output.status.success() { + return None; + } + + let stdout = String::from_utf8(output.stdout).ok()?; + let mut infos = Vec::new(); + + let json_data: serde_json::Value = match serde_json::from_str(&stdout) { + Ok(data) => data, + Err(_) => return None, + }; + + if let Some(metrics_data) = json_data.as_array() { + for entry in metrics_data { + let device_id = entry["deviceId"].as_u64()? as usize; + let gpu_utilization = entry["metrics"][0].as_f64()? as f32; + let gpu_power = entry["metrics"][1].as_f64()? as f32; + let gpu_core_temperature = entry["metrics"][2].as_f64()? as f32; + let gpu_memory_bandwidth_utilization = entry["metrics"][3].as_f64()? as f32; + + infos.push(XpuSmiInfo { + device_id, + gpu_utilization, + gpu_power, + gpu_core_temperature, + gpu_memory_bandwidth_utilization, + }); + } + } + + Some(infos) + } +} + +#[derive(Serialize, Debug, Clone)] +pub struct SystemInfo { + cpu_count: usize, + cpu_type: String, + total_memory: u64, + architecture: String, + platform: String, +} + +impl SystemInfo { + fn new() -> Self { + let mut system = sysinfo::System::new_all(); + system.refresh_all(); + + let cpu_count = system.cpus().len(); + let cpu_type = system.cpus()[0].brand().to_string(); + let total_memory = system.total_memory(); + let architecture = std::env::consts::ARCH.to_string(); + let platform = format!( + "{}-{}-{}", + std::env::consts::OS, + std::env::consts::FAMILY, + std::env::consts::ARCH + ); + Self { + cpu_count, + cpu_type, + total_memory, + architecture, + platform, + } + } +} + +impl Default for Env { + fn default() -> Self { + Self::new() + } +} + +impl Env { + pub fn new() -> Self { + Self { + system_env: SystemInfo::new(), + nvidia_info: NvidiaSmiInfo::new(), + xpu_info: XpuSmiInfo::new(), + git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"), + docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"), + } + } +} + +pub fn is_container() -> io::Result { + let path = Path::new("/proc/self/cgroup"); + let file = File::open(path)?; + let reader = io::BufReader::new(file); + + for line in reader.lines() { + let line = line?; + // Check for common container runtimes + if line.contains("/docker/") + || line.contains("/docker-") + || line.contains("/kubepods/") + || line.contains("/kubepods-") + || line.contains("containerd") + || line.contains("crio") + || line.contains("podman") + { + return Ok(true); + } + } + Ok(false) +}