usage stats and crash reports (#2220)

* draft of usage stats

* fix wrong link

* launcher doesn't need sysinfo dep

* only tokenizer class instead of hole struct

* unused import

* fix clippy errors

* update openAPI doc

* cargo fmt

* fix error in passing flags to router

* try again to update docs

* run pre-commit locally

* Update router/src/main.rs

Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co>

* Update router/src/main.rs

Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co>

* on crash use anonymous error event

* delete json_output and ngrok

* more robust way of checking if is in container

* more robust nvidia smi

* parse xpu more robustly

* fix errors

* add nvidia-smi details in docs

* cargo fmt

* fix clippy

* should make docs check pass

* Update router/src/usage_stats.rs

Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co>

* error reason can't be in nested json

* cargo fmt

---------

Co-authored-by: Hugo Larcher <hugo.larcher@huggingface.co>
Co-authored-by: Erik Kaunismäki <erikkaum@Eriks-MacBook-Pro.local>
This commit is contained in:
Erik Kaunismäki 2024-07-19 16:17:56 +02:00 committed by GitHub
parent 3f37a66774
commit 4c19593a90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 599 additions and 12 deletions

53
Cargo.lock generated
View File

@ -801,6 +801,27 @@ dependencies = [
"typenum",
]
[[package]]
name = "csv"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe"
dependencies = [
"csv-core",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
dependencies = [
"memchr",
]
[[package]]
name = "ctrlc"
version = "3.4.4"
@ -3402,9 +3423,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.118"
version = "1.0.120"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d947f6b3163d8857ea16c4fa0dd4840d52f3041039a85decd46867eb1abef2e4"
checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5"
dependencies = [
"itoa",
"ryu",
@ -3650,15 +3671,16 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394"
[[package]]
name = "sysinfo"
version = "0.30.12"
version = "0.30.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "732ffa00f53e6b2af46208fba5718d9662a421049204e156328b66791ffa15ae"
checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3"
dependencies = [
"cfg-if",
"core-foundation-sys",
"libc",
"ntapi",
"once_cell",
"rayon",
"windows",
]
@ -3805,6 +3827,7 @@ dependencies = [
"axum-tracing-opentelemetry",
"base64 0.22.1",
"clap",
"csv",
"futures",
"futures-util",
"hf-hub",
@ -3826,6 +3849,7 @@ dependencies = [
"reqwest",
"serde",
"serde_json",
"sysinfo",
"text-generation-client",
"thiserror",
"tokenizers",
@ -3837,6 +3861,7 @@ dependencies = [
"tracing-subscriber",
"utoipa",
"utoipa-swagger-ui",
"uuid",
"vergen",
]
@ -4508,9 +4533,25 @@ dependencies = [
[[package]]
name = "uuid"
version = "1.9.1"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439"
checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
dependencies = [
"getrandom",
"rand",
"uuid-macro-internal",
]
[[package]]
name = "uuid-macro-internal"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee1cd046f83ea2c4e920d6ee9f7c3537ef928d75dce5d84a87c2c5d6b3999a3a"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.68",
]
[[package]]
name = "v_frame"

View File

@ -424,6 +424,22 @@ Options:
[env: LORA_ADAPTERS=]
```
## DISABLE_USAGE_STATS
```shell
--disable-usage-stats
Disable sending of all usage statistics
[env: DISABLE_USAGE_STATS=]
```
## DISABLE_CRASH_REPORTS
```shell
--disable-crash-reports
Disable sending of crash reports, but allow anonymous usage statistics
[env: DISABLE_CRASH_REPORTS=]
```
## HELP
```shell

View File

@ -0,0 +1,73 @@
# Collection of Usage Statistics
Text Generation Inference collects anonymous usage statistics to help us improve the service. The collected data is used to improve TGI and to understand what causes failures. The data is collected transparently and any sensitive information is omitted.
Data is sent twice, once on server startup and once when the server stops. Also, usage statistics are only enabled when TGI is running in docker to avoid collecting data when TGI runs directly on the host machine.
## What data is collected
The code that collects the data is available [here](https://github.com/huggingface/text-generation-inference/blob/main/router/src/usage_stats.rs).
As of release 2.1.2 this is an example of the data collected:
- From the TGI configuration:
```json
{
"event_type": "start",
"disable_grammar_support": false,
"max_batch_prefill_tokens": 4096,
"max_batch_size": null,
"max_batch_total_tokens": null,
"max_best_of": 2,
"max_client_batch_size": 4,
"max_concurrent_requests": 128,
"max_input_tokens": 1024,
"max_stop_sequences": 4,
"max_top_n_tokens": 5,
"max_total_tokens": 2048,
"max_waiting_tokens": 20,
"messages_api_enabled": false,
"model_config": {
"model_type": "Bloom"
},
"revision": null,
"tokenizer_class": "BloomTokenizerFast",
"validation_workers": 2,
"waiting_served_ratio": 1.2,
"docker_label": "latest",
"git_sha": "cfc118704880453d29bcbe4fbbd91dda501cf5fe",
"nvidia_env": {
"name": "NVIDIA A10G",
"pci_bus_id": "00000000:00:1E.0",
"driver_version": "535.183.01",
"pstate": "P8",
"pcie_link_gen_max": "4",
"pcie_link_gen_current": "1",
"temperature_gpu": "31",
"utilization_gpu": "0 %",
"utilization_memory": "0 %",
"memory_total": "23028 MiB",
"memory_free": "22515 MiB",
"memory_used": "0 MiB",
"reset_status_reset_required": "No",
"reset_status_drain_and_reset_recommended": "No",
"compute_cap": "8.6",
"ecc_errors_corrected_volatile_total": "0",
"mig_mode_current": "[N/A]",
"power_draw_instant": "10.86 W",
"power_limit": "300.00 W"
},
"system_env": {
"cpu_count": 16,
"cpu_type": "AMD EPYC 7R32",
"total_memory": 66681196544,
"architecture": "x86_64",
"platform": "linux-unix-x86_64"
}
}
```
## How to opt-out
You can easily opt out by passing the `--disable-usage-stats` to the text-generation-launcher command. This will disable all usage statistics. You can also pass `--disable-crash-reports` which disables sending specific crash reports, but allows anonymous usage statistics.

View File

@ -457,6 +457,14 @@ struct Args {
/// startup that will be available to callers via the `adapter_id` field in a request.
#[clap(long, env)]
lora_adapters: Option<String>,
/// Disable sending of all usage statistics
#[clap(default_value = "false", long, env)]
disable_usage_stats: bool,
/// Disable sending of crash reports, but allow anonymous usage statistics
#[clap(default_value = "false", long, env)]
disable_crash_reports: bool,
}
#[derive(Debug)]
@ -1201,6 +1209,14 @@ fn spawn_webserver(
args.model_id,
];
// Pass usage stats flags to router
if args.disable_usage_stats {
router_args.push("--disable-usage-stats".to_string());
}
if args.disable_crash_reports {
router_args.push("--disable-crash-reports".to_string());
}
// Grammar support
if args.disable_grammar_support {
router_args.push("--disable-grammar-support".to_string());

View File

@ -52,6 +52,10 @@ regex = "1.10.3"
once_cell = "1.19.0"
image = "0.25.1"
base64 = { workspace = true }
sysinfo = "0.30.13"
uuid = { version = "1.9.1", default-features = false, features = ["v4", "fast-rng", "macro-diagnostics"] }
csv = "1.3.0"
[build-dependencies]
vergen = { version = "8.2.5", features = ["build", "git", "gitcl"] }

View File

@ -7,6 +7,8 @@ mod validation;
#[cfg(feature = "kserve")]
mod kserve;
pub mod usage_stats;
use serde::{Deserialize, Serialize};
use tracing::warn;
use utoipa::ToSchema;
@ -40,13 +42,13 @@ pub struct HubModelInfo {
pub pipeline_tag: Option<String>,
}
#[derive(Debug, Clone, Deserialize, PartialEq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ChatTemplate {
name: String,
template: String,
}
#[derive(Debug, Clone, Deserialize, PartialEq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(untagged)]
pub enum ChatTemplateVersions {
Single(String),
@ -55,7 +57,7 @@ pub enum ChatTemplateVersions {
use std::path::Path;
#[derive(Debug, Clone, Deserialize, Default)]
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct HubTokenizerConfig {
pub chat_template: Option<ChatTemplateVersions>,
pub completion_template: Option<String>,

View File

@ -14,6 +14,7 @@ use std::io::BufReader;
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use std::path::{Path, PathBuf};
use text_generation_router::config::Config;
use text_generation_router::usage_stats;
use text_generation_router::{
server, HubModelInfo, HubPreprocessorConfig, HubProcessorConfig, HubTokenizerConfig,
};
@ -87,6 +88,10 @@ struct Args {
disable_grammar_support: bool,
#[clap(default_value = "4", long, env)]
max_client_batch_size: usize,
#[clap(long, env, default_value_t)]
disable_usage_stats: bool,
#[clap(long, env, default_value_t)]
disable_crash_reports: bool,
}
#[derive(Debug, Subcommand)]
@ -128,6 +133,8 @@ async fn main() -> Result<(), RouterError> {
messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
disable_usage_stats,
disable_crash_reports,
command,
} = args;
@ -324,6 +331,7 @@ async fn main() -> Result<(), RouterError> {
tracing::warn!("Could not find tokenizer config locally and no API specified");
HubTokenizerConfig::default()
});
let tokenizer_class = tokenizer_config.tokenizer_class.clone();
let tokenizer: Option<Tokenizer> = tokenizer_filename.and_then(|filename| {
let mut tokenizer = Tokenizer::from_file(filename).ok();
@ -378,8 +386,47 @@ async fn main() -> Result<(), RouterError> {
}
};
// Only send usage stats when TGI is run in container and the function returns Some
let is_container = matches!(usage_stats::is_container(), Ok(true));
let user_agent = if !disable_usage_stats && is_container {
let reduced_args = usage_stats::Args::new(
config.clone(),
tokenizer_class,
max_concurrent_requests,
max_best_of,
max_stop_sequences,
max_top_n_tokens,
max_input_tokens,
max_total_tokens,
waiting_served_ratio,
max_batch_prefill_tokens,
max_batch_total_tokens,
max_waiting_tokens,
max_batch_size,
revision,
validation_workers,
messages_api_enabled,
disable_grammar_support,
max_client_batch_size,
disable_usage_stats,
disable_crash_reports,
);
Some(usage_stats::UserAgent::new(reduced_args))
} else {
None
};
if let Some(ref ua) = user_agent {
let start_event =
usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start, None);
tokio::spawn(async move {
start_event.send().await;
});
};
// Run server
server::run(
let result = server::run(
master_shard_uds_path,
model_info,
compat_return_full_text,
@ -410,8 +457,41 @@ async fn main() -> Result<(), RouterError> {
max_client_batch_size,
print_schema_command,
)
.await?;
Ok(())
.await;
match result {
Ok(_) => {
if let Some(ref ua) = user_agent {
let stop_event = usage_stats::UsageStatsEvent::new(
ua.clone(),
usage_stats::EventType::Stop,
None,
);
stop_event.send().await;
};
Ok(())
}
Err(e) => {
if let Some(ref ua) = user_agent {
if !disable_crash_reports {
let error_event = usage_stats::UsageStatsEvent::new(
ua.clone(),
usage_stats::EventType::Error,
Some(e.to_string()),
);
error_event.send().await;
} else {
let unknow_error_event = usage_stats::UsageStatsEvent::new(
ua.clone(),
usage_stats::EventType::Error,
Some("unknow_error".to_string()),
);
unknow_error_event.send().await;
}
};
Err(RouterError::WebServer(e))
}
}
}
/// Init logging using env variables LOG_LEVEL and LOG_FORMAT:

355
router/src/usage_stats.rs Normal file
View File

@ -0,0 +1,355 @@
use crate::config::Config;
use csv::ReaderBuilder;
use reqwest::header::HeaderMap;
use serde::Serialize;
use std::{
fs::File,
io::{self, BufRead},
path::Path,
process::Command,
time::Duration,
};
use uuid::Uuid;
const TELEMETRY_URL: &str = "https://huggingface.co/api/telemetry/tgi";
/// Identifies a single running TGI instance in telemetry payloads.
///
/// The `uid` is a random v4 UUID generated once per process, so events from
/// the same run can be correlated without identifying the host or the user.
#[derive(Debug, Clone, Serialize)]
pub struct UserAgent {
    // Random per-process identifier (UUID v4); not derived from hardware.
    pub uid: String,
    // Reduced router configuration captured at startup.
    pub args: Args,
    // Environment snapshot: build metadata, GPU info, system info.
    pub env: Env,
}

impl UserAgent {
    /// Build a user agent with a fresh random `uid` and a freshly collected
    /// environment snapshot (runs `nvidia-smi` / `xpu-smi` probes).
    pub fn new(reduced_args: Args) -> Self {
        Self {
            uid: Uuid::new_v4().to_string(),
            args: reduced_args,
            env: Env::new(),
        }
    }
}
/// Lifecycle stage reported by a usage-stats event.
#[derive(Serialize, Debug)]
pub enum EventType {
    /// Sent once when the server has started.
    Start,
    /// Sent when the server shuts down cleanly.
    Stop,
    /// Sent when the server exits with an error (crash report).
    Error,
}
/// One telemetry event posted as JSON to the Hugging Face telemetry endpoint.
#[derive(Debug, Serialize)]
pub struct UsageStatsEvent {
    user_agent: UserAgent,
    event_type: EventType,
    #[serde(skip_serializing_if = "Option::is_none")]
    error_reason: Option<String>,
}

impl UsageStatsEvent {
    /// Assemble an event; `error_reason` is omitted from the JSON when `None`.
    pub fn new(user_agent: UserAgent, event_type: EventType, error_reason: Option<String>) -> Self {
        Self {
            user_agent,
            event_type,
            error_reason,
        }
    }

    /// Fire-and-forget POST of this event.
    ///
    /// Any serialization-transport or network failure is deliberately
    /// swallowed: telemetry must never take the server down. A 5 second
    /// timeout bounds the time spent talking to the endpoint.
    pub async fn send(&self) {
        let payload = serde_json::to_string(&self).unwrap();
        let mut headers = HeaderMap::new();
        headers.insert("Content-Type", "application/json".parse().unwrap());
        let request = reqwest::Client::new()
            .post(TELEMETRY_URL)
            .headers(headers)
            .body(payload)
            .timeout(Duration::from_secs(5));
        // Result intentionally discarded: best-effort delivery only.
        let _ = request.send().await;
    }
}
/// Reduced, anonymized copy of the router configuration included in
/// telemetry payloads. Only tuning parameters and feature flags are
/// captured — no prompts, weights, or user data.
#[derive(Debug, Clone, Serialize)]
pub struct Args {
    model_config: Option<Config>,
    // Only the tokenizer *class* name (e.g. "BloomTokenizerFast"), not the
    // full tokenizer configuration.
    tokenizer_config: Option<String>,
    max_concurrent_requests: usize,
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
    max_input_tokens: usize,
    max_total_tokens: usize,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: Option<u32>,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
    revision: Option<String>,
    validation_workers: usize,
    messages_api_enabled: bool,
    disable_grammar_support: bool,
    max_client_batch_size: usize,
    disable_usage_stats: bool,
    disable_crash_reports: bool,
}

impl Args {
    /// Plain field-for-field constructor. The long parameter list mirrors
    /// the router CLI flags one-to-one, hence the clippy allowance.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        model_config: Option<Config>,
        tokenizer_config: Option<String>,
        max_concurrent_requests: usize,
        max_best_of: usize,
        max_stop_sequences: usize,
        max_top_n_tokens: u32,
        max_input_tokens: usize,
        max_total_tokens: usize,
        waiting_served_ratio: f32,
        max_batch_prefill_tokens: u32,
        max_batch_total_tokens: Option<u32>,
        max_waiting_tokens: usize,
        max_batch_size: Option<usize>,
        revision: Option<String>,
        validation_workers: usize,
        messages_api_enabled: bool,
        disable_grammar_support: bool,
        max_client_batch_size: usize,
        disable_usage_stats: bool,
        disable_crash_reports: bool,
    ) -> Self {
        Self {
            model_config,
            tokenizer_config,
            max_concurrent_requests,
            max_best_of,
            max_stop_sequences,
            max_top_n_tokens,
            max_input_tokens,
            max_total_tokens,
            waiting_served_ratio,
            max_batch_prefill_tokens,
            max_batch_total_tokens,
            max_waiting_tokens,
            max_batch_size,
            revision,
            validation_workers,
            messages_api_enabled,
            disable_grammar_support,
            max_client_batch_size,
            disable_usage_stats,
            disable_crash_reports,
        }
    }
}
/// This is more or less a copy of the code from the `text-generation-launcher` crate to avoid a dependency
///
/// Snapshot of the build and hardware environment, attached to every
/// telemetry event via [`UserAgent`].
#[derive(Serialize, Debug, Clone)]
pub struct Env {
    // Git SHA baked in at compile time ("N/A" when unavailable).
    git_sha: &'static str,
    // Docker image label baked in at compile time ("N/A" when unavailable).
    docker_label: &'static str,
    // One entry per NVIDIA GPU; `None` when `nvidia-smi` is absent or fails.
    nvidia_info: Option<Vec<NvidiaSmiInfo>>,
    // One entry per Intel XPU; `None` when `xpu-smi` is absent or fails.
    xpu_info: Option<Vec<XpuSmiInfo>>,
    // CPU / memory / OS details collected via `sysinfo`.
    system_env: SystemInfo,
}
/// Per-GPU details as reported by `nvidia-smi --query-gpu=... --format=csv`.
/// Values are kept as the raw strings nvidia-smi prints (including units
/// such as "MiB" or "W").
#[derive(Debug, Serialize, Clone)]
struct NvidiaSmiInfo {
    name: String,
    pci_bus_id: String,
    driver_version: String,
    pstate: String,
    pcie_link_gen_max: String,
    pcie_link_gen_current: String,
    temperature_gpu: String,
    utilization_gpu: String,
    utilization_memory: String,
    memory_total: String,
    memory_free: String,
    memory_used: String,
    reset_status_reset_required: String,
    reset_status_drain_and_reset_recommended: String,
    compute_cap: String,
    ecc_errors_corrected_volatile_total: String,
    mig_mode_current: String,
    power_draw_instant: String,
    power_limit: String,
}

impl NvidiaSmiInfo {
    /// Run `nvidia-smi` once and parse one record per GPU.
    ///
    /// Returns `None` when the binary is missing, exits non-zero, or emits
    /// output that is not valid UTF-8/CSV. Rows that do not contain all 19
    /// requested columns (e.g. an nvidia-smi version that does not know some
    /// fields) are skipped instead of panicking: the original `record[i]`
    /// indexing would panic on out-of-range access and crash the router.
    fn new() -> Option<Vec<NvidiaSmiInfo>> {
        let output = Command::new("nvidia-smi")
            .args([
                "--query-gpu=name,pci.bus_id,driver_version,pstate,pcie.link.gen.max,pcie.link.gen.gpucurrent,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used,reset_status.reset_required,reset_status.drain_and_reset_recommended,compute_cap,ecc.errors.corrected.volatile.total,mig.mode.current,power.draw.instant,power.limit",
                "--format=csv"
            ])
            .output()
            .ok()?;

        if !output.status.success() {
            return None;
        }

        let stdout = String::from_utf8(output.stdout).ok()?;

        let mut rdr = ReaderBuilder::new()
            .has_headers(true)
            .from_reader(stdout.as_bytes());

        let mut infos = Vec::new();

        for result in rdr.records() {
            let record = result.ok()?;
            // Guard: `record[i]` panics when the index is out of range, so a
            // short/unexpected row must degrade gracefully, not crash.
            if record.len() < 19 {
                continue;
            }
            let field = |i: usize| record[i].to_string();
            infos.push(NvidiaSmiInfo {
                name: field(0),
                pci_bus_id: field(1),
                driver_version: field(2),
                pstate: field(3),
                pcie_link_gen_max: field(4),
                pcie_link_gen_current: field(5),
                temperature_gpu: field(6),
                utilization_gpu: field(7),
                utilization_memory: field(8),
                memory_total: field(9),
                memory_free: field(10),
                memory_used: field(11),
                reset_status_reset_required: field(12),
                reset_status_drain_and_reset_recommended: field(13),
                compute_cap: field(14),
                ecc_errors_corrected_volatile_total: field(15),
                mig_mode_current: field(16),
                power_draw_instant: field(17),
                power_limit: field(18),
            });
        }

        Some(infos)
    }
}
/// Metrics for a single Intel XPU, as reported by `xpu-smi dump`.
#[derive(Debug, Serialize, Clone)]
struct XpuSmiInfo {
    device_id: usize,
    gpu_utilization: f32,
    gpu_power: f32,
    gpu_core_temperature: f32,
    gpu_memory_bandwidth_utilization: f32,
}

impl XpuSmiInfo {
    /// based on this https://github.com/intel/xpumanager/blob/master/doc/smi_user_guide.md#dump-the-device-statistics-in-csv-format
    ///
    /// Runs `xpu-smi` once (`-n 1`) with JSON output (`-j`) and parses the
    /// result. Returns `None` when the binary is missing, exits non-zero, or
    /// the output does not match the expected schema.
    fn new() -> Option<Vec<XpuSmiInfo>> {
        let output = Command::new("xpu-smi")
            .args([
                "dump", "-d", "-1", "-m",
                "0,1,3,17", // Metrics IDs: GPU Utilization, GPU Power, GPU Core Temperature, GPU Memory Bandwidth Utilization
                "-n", "1", "-j",
            ])
            .output()
            .ok()?;

        if !output.status.success() {
            return None;
        }

        let stdout = String::from_utf8(output.stdout).ok()?;

        let mut infos = Vec::new();

        let json_data: serde_json::Value = match serde_json::from_str(&stdout) {
            Ok(data) => data,
            Err(_) => return None,
        };

        if let Some(metrics_data) = json_data.as_array() {
            for entry in metrics_data {
                // NOTE(review): the `?`s below abort the whole function (and
                // discard already-parsed devices) on the first entry missing
                // a field — presumably acceptable for best-effort telemetry.
                let device_id = entry["deviceId"].as_u64()? as usize;
                // Assumes `metrics` comes back in the same order as the
                // requested IDs (0,1,3,17) — TODO confirm against xpu-smi.
                let gpu_utilization = entry["metrics"][0].as_f64()? as f32;
                let gpu_power = entry["metrics"][1].as_f64()? as f32;
                let gpu_core_temperature = entry["metrics"][2].as_f64()? as f32;
                let gpu_memory_bandwidth_utilization = entry["metrics"][3].as_f64()? as f32;

                infos.push(XpuSmiInfo {
                    device_id,
                    gpu_utilization,
                    gpu_power,
                    gpu_core_temperature,
                    gpu_memory_bandwidth_utilization,
                });
            }
        }

        Some(infos)
    }
}
/// Static host information (CPU, memory, OS) included in telemetry payloads.
#[derive(Serialize, Debug, Clone)]
pub struct SystemInfo {
    cpu_count: usize,
    cpu_type: String,
    total_memory: u64,
    architecture: String,
    platform: String,
}

impl SystemInfo {
    /// Collect a one-shot snapshot of the host via `sysinfo`.
    fn new() -> Self {
        let mut system = sysinfo::System::new_all();
        system.refresh_all();

        let cpus = system.cpus();
        let cpu_count = cpus.len();
        // Guard against platforms where sysinfo reports no CPUs: indexing
        // `cpus[0]` would panic and crash the router at startup.
        let cpu_type = cpus
            .first()
            .map(|cpu| cpu.brand().to_string())
            .unwrap_or_default();

        let total_memory = system.total_memory();
        let architecture = std::env::consts::ARCH.to_string();
        // e.g. "linux-unix-x86_64"
        let platform = format!(
            "{}-{}-{}",
            std::env::consts::OS,
            std::env::consts::FAMILY,
            std::env::consts::ARCH
        );

        Self {
            cpu_count,
            cpu_type,
            total_memory,
            architecture,
            platform,
        }
    }
}
impl Default for Env {
    // `Default` simply delegates to `new` so `Env` can be constructed via
    // `Env::default()` / struct-update syntax.
    fn default() -> Self {
        Self::new()
    }
}

impl Env {
    /// Collect the full environment snapshot: system info, GPU probes
    /// (each `None` when the corresponding CLI tool is absent or fails),
    /// and build-time metadata baked in via `option_env!`.
    pub fn new() -> Self {
        Self {
            system_env: SystemInfo::new(),
            nvidia_info: NvidiaSmiInfo::new(),
            xpu_info: XpuSmiInfo::new(),
            // Compile-time values; "N/A" when not set during the build.
            git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
            docker_label: option_env!("DOCKER_LABEL").unwrap_or("N/A"),
        }
    }
}
/// Best-effort detection of whether the current process runs inside a
/// container, by scanning `/proc/self/cgroup` for well-known runtime markers.
///
/// # Errors
///
/// Returns an `io::Error` when `/proc/self/cgroup` cannot be opened or read
/// (e.g. on non-Linux hosts where the file does not exist).
pub fn is_container() -> io::Result<bool> {
    let path = Path::new("/proc/self/cgroup");
    let file = File::open(path)?;
    let reader = io::BufReader::new(file);

    for line in reader.lines() {
        if has_container_marker(&line?) {
            return Ok(true);
        }
    }
    Ok(false)
}

/// Whether a single cgroup line mentions a known container runtime
/// (docker, kubernetes pods, containerd, cri-o, podman).
///
/// Extracted from `is_container` so the matching logic is unit-testable
/// without touching `/proc`.
fn has_container_marker(line: &str) -> bool {
    const MARKERS: &[&str] = &[
        "/docker/",
        "/docker-",
        "/kubepods/",
        "/kubepods-",
        "containerd",
        "crio",
        "podman",
    ];
    MARKERS.iter().any(|marker| line.contains(marker))
}