Add OTLP Service Name Environment Variable (#2076)

* Add Service Name environment variable for https://github.com/huggingface/text-generation-inference/issues/2069

* Update Docs

* Update README.md

* Update Launcher Docs

* Update Launcher Docs (removing option)
Authored by KevinDuffy94 on 2024-06-25 08:33:01 +01:00; committed by GitHub
parent 3447c722fd
commit 1869ee2f57
7 changed files with 41 additions and 10 deletions


@@ -153,7 +153,8 @@ this will impact performance.
 
 ### Distributed Tracing
 
 `text-generation-inference` is instrumented with distributed tracing using OpenTelemetry. You can use this feature
-by setting the address to an OTLP collector with the `--otlp-endpoint` argument.
+by setting the address to an OTLP collector with the `--otlp-endpoint` argument. The default service name can be
+overridden with the `--otlp-service-name` argument.
 
 ### Architecture
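
For illustration, the two flags referenced in the README hunk above compose like this on the launcher command line. This is only a sketch: the model id and the collector address (the conventional OTLP gRPC port on localhost) are placeholder assumptions; only `--otlp-endpoint` and `--otlp-service-name` come from this change.

```shell
# Sketch: send traces to a collector and override the default service name
# ("text-generation-inference.router"). <model-id> and the endpoint URL are placeholders.
text-generation-launcher \
    --model-id <model-id> \
    --otlp-endpoint http://localhost:4317 \
    --otlp-service-name my-tgi-router
```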


@@ -70,6 +70,8 @@ Options:
       [env: JSON_OUTPUT=]
   --otlp-endpoint <OTLP_ENDPOINT>
       [env: OTLP_ENDPOINT=]
+  --otlp-service-name <OTLP_SERVICE_NAME>
+      [env: OTLP_SERVICE_NAME=]
   --cors-allow-origin <CORS_ALLOW_ORIGIN>
       [env: CORS_ALLOW_ORIGIN=]
   --ngrok
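
Since the option is also exposed as an environment variable (the `[env: OTLP_SERVICE_NAME=]` line above), it can be set without editing the command line, for example when running the container image. A hedged sketch: the image tag, port mapping, model id, and collector address are illustrative; only the two `OTLP_*` variables come from this feature.

```shell
# Sketch: configure tracing through the environment when running the container.
docker run --gpus all -p 8080:80 \
    -e OTLP_ENDPOINT=http://otel-collector:4317 \
    -e OTLP_SERVICE_NAME=tgi-production \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id <model-id>
```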
@@ -138,6 +140,8 @@ Serve's command line parameters on the TGI repository are these:
 │ --logger-level                          TEXT      [default: INFO]               │
 │ --json-output        --no-json-output             [default: no-json-output]     │
 │ --otlp-endpoint                         TEXT      [default: None]               │
+│ --otlp-service-name                     TEXT      [default:                     │
+│                                                   text-generation-inference...  │
 │ --help                                  Show this message and exit.             │
 ╰──────────────────────────────────────────────────────────────────────────────────╯
 ```


@@ -336,6 +336,13 @@ Options:
 --otlp-endpoint <OTLP_ENDPOINT>
     [env: OTLP_ENDPOINT=]
 
 ```
+## OTLP_SERVICE_NAME
+```shell
+--otlp-service-name <OTLP_SERVICE_NAME>
+    [env: OTLP_SERVICE_NAME=]
+    [default: text-generation-inference.router]
+
+```
 ## CORS_ALLOW_ORIGIN
 ```shell
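
Because the launcher argument is declared with both a default value and an env binding, the usual clap resolution order should apply: an explicit `--otlp-service-name` flag overrides `OTLP_SERVICE_NAME`, which overrides the `text-generation-inference.router` default. A minimal sketch of that ordering, using placeholder values:

```shell
# Sketch: expected precedence for the new option (flag > env var > default).
OTLP_SERVICE_NAME=from-env text-generation-launcher --model-id <model-id>
# service name resolves to "from-env"

OTLP_SERVICE_NAME=from-env text-generation-launcher --model-id <model-id> \
    --otlp-service-name from-flag
# service name resolves to "from-flag"
```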


@@ -413,6 +413,9 @@ struct Args {
     #[clap(long, env)]
     otlp_endpoint: Option<String>,
 
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
+
     #[clap(long, env)]
     cors_allow_origin: Vec<String>,
     #[clap(long, env)]
@@ -483,6 +486,7 @@ fn shard_manager(
     max_batch_size: Option<usize>,
     max_input_tokens: usize,
     otlp_endpoint: Option<String>,
+    otlp_service_name: String,
     log_level: LevelFilter,
     status_sender: mpsc::Sender<ShardStatus>,
     shutdown: Arc<AtomicBool>,
@@ -548,12 +552,16 @@ fn shard_manager(
         (None, Some(factor)) => Some((RopeScaling::Linear, factor)),
     };
 
-    // OpenTelemetry
+    // OpenTelemetry Endpoint
     if let Some(otlp_endpoint) = otlp_endpoint {
         shard_args.push("--otlp-endpoint".to_string());
         shard_args.push(otlp_endpoint);
     }
 
+    // OpenTelemetry Service Name
+    shard_args.push("--otlp-service-name".to_string());
+    shard_args.push(otlp_service_name);
+
     // In case we use sliding window, we may ignore the sliding in flash for some backends depending on the parameter.
     shard_args.push("--max-input-tokens".to_string());
     shard_args.push(max_input_tokens.to_string());
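
Unlike the endpoint, the service name is pushed unconditionally, so each shard is started with whatever value the launcher resolved (its `text-generation-inference.router` default unless overridden) rather than the Python server's own `text-generation-inference.server` default. A rough, abridged sketch of the resulting shard invocation; the surrounding arguments are elided and the endpoint appears only when one was configured:

```shell
# Sketch: shape of the argv the launcher now builds for each shard (abridged).
text-generation-server serve <model-id> ... \
    --otlp-endpoint http://localhost:4317 \
    --otlp-service-name text-generation-inference.router \
    --max-input-tokens <n> ...
```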
@@ -1035,6 +1043,7 @@ fn spawn_shards(
         let shutdown = shutdown.clone();
         let shutdown_sender = shutdown_sender.clone();
         let otlp_endpoint = args.otlp_endpoint.clone();
+        let otlp_service_name = args.otlp_service_name.clone();
         let quantize = args.quantize;
         let speculate = args.speculate;
         let dtype = args.dtype;
@@ -1074,6 +1083,7 @@ fn spawn_shards(
                 max_batch_size,
                 max_input_tokens,
                 otlp_endpoint,
+                otlp_service_name,
                 max_log_level,
                 status_sender,
                 shutdown,
@@ -1207,6 +1217,12 @@ fn spawn_webserver(
         router_args.push(otlp_endpoint);
     }
 
+    // OpenTelemetry
+    let otlp_service_name = args.otlp_service_name;
+    router_args.push("--otlp-service-name".to_string());
+    router_args.push(otlp_service_name);
+
     // CORS origins
     for origin in args.cors_allow_origin.into_iter() {
         router_args.push("--cors-allow-origin".to_string());


@@ -65,6 +65,8 @@ struct Args {
     json_output: bool,
     #[clap(long, env)]
     otlp_endpoint: Option<String>,
+    #[clap(default_value = "text-generation-inference.router", long, env)]
+    otlp_service_name: String,
     #[clap(long, env)]
     cors_allow_origin: Option<Vec<String>>,
     #[clap(long, env)]
@@ -107,6 +109,7 @@ async fn main() -> Result<(), RouterError> {
         validation_workers,
         json_output,
         otlp_endpoint,
+        otlp_service_name,
         cors_allow_origin,
         ngrok,
         ngrok_authtoken,
@@ -117,7 +120,7 @@ async fn main() -> Result<(), RouterError> {
     } = args;
 
     // Launch Tokio runtime
-    init_logging(otlp_endpoint, json_output);
+    init_logging(otlp_endpoint, otlp_service_name, json_output);
 
     // Validate args
     if max_input_tokens >= max_total_tokens {
@@ -367,10 +370,11 @@ async fn main() -> Result<(), RouterError> {
 /// Init logging using env variables LOG_LEVEL and LOG_FORMAT:
 ///     - otlp_endpoint is an optional URL to an Open Telemetry collector
+///     - otlp_service_name is the service name to appear in APM
 ///     - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO)
 ///     - LOG_FORMAT may be TEXT or JSON (default to TEXT)
 ///     - LOG_COLORIZE may be "false" or "true" (default to "true" or ansi supported platforms)
-fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
+fn init_logging(otlp_endpoint: Option<String>, otlp_service_name: String, json_output: bool) {
     let mut layers = Vec::new();
 
     // STDOUT/STDERR layer
@@ -401,7 +405,7 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
                 trace::config()
                     .with_resource(Resource::new(vec![KeyValue::new(
                         "service.name",
-                        "text-generation-inference.router",
+                        otlp_service_name,
                     )]))
                     .with_sampler(Sampler::AlwaysOn),
             )


@@ -42,6 +42,7 @@ def serve(
     logger_level: str = "INFO",
     json_output: bool = False,
     otlp_endpoint: Optional[str] = None,
+    otlp_service_name: str = "text-generation-inference.server",
     max_input_tokens: Optional[int] = None,
 ):
     if sharded:
@@ -76,7 +77,7 @@ def serve(
         # Setup OpenTelemetry distributed tracing
         if otlp_endpoint is not None:
-            setup_tracing(shard=os.getenv("RANK", 0), otlp_endpoint=otlp_endpoint)
+            setup_tracing(otlp_service_name=otlp_service_name, otlp_endpoint=otlp_endpoint)
 
         # Downgrade enum into str for easier management later on
         quantize = None if quantize is None else quantize.value


@@ -54,10 +54,8 @@ class UDSOpenTelemetryAioServerInterceptor(OpenTelemetryAioServerInterceptor):
     )
 
 
-def setup_tracing(shard: int, otlp_endpoint: str):
-    resource = Resource.create(
-        attributes={"service.name": f"text-generation-inference.server-{shard}"}
-    )
+def setup_tracing(otlp_service_name: str, otlp_endpoint: str):
+    resource = Resource.create(attributes={"service.name": otlp_service_name})
+
     span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True)
     span_processor = BatchSpanProcessor(span_exporter)