doc(launcher): add more docs to the `launcher` itself and link in the README (#257)
@@ -84,6 +84,11 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
```
**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
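
As a quick sanity check that your containers can actually see the GPUs, you can run the standard `nvidia-smi` test from a CUDA base image; this is only an illustrative snippet and the image tag is an example, not a requirement:

```shell
# Optional check: confirm Docker can access the GPUs and report the driver/CUDA version.
# The image tag is only an example; any recent CUDA base image works.
docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi
```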
To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the CLI):
```
text-generation-launcher --help
```
You can then query the model using either the `/generate` or `/generate_stream` routes:
```shell
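# Example request to the /generate route (illustrative payload; adjust host, port, and parameters to your deployment):
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'

# Example request to the /generate_stream route (same payload, streamed token-by-token response):
curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
```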
@@ -18,52 +18,182 @@ use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection};
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    /// The name of the model to load.
    /// Can be a MODEL_ID as listed on <https://hf.co/models> like
    /// `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`.
    /// Or it can be a local directory containing the necessary files
    /// as saved by `save_pretrained(...)` methods of transformers
    #[clap(default_value = "bigscience/bloom-560m", long, env)]
    model_id: String,

    /// The actual revision of the model if you're referring to a model
    /// on the hub. You can use a specific commit id or a branch like `refs/pr/2`.
    #[clap(long, env)]
    revision: Option<String>,

    /// Whether to shard the model across multiple GPUs.
    /// By default text-generation-inference will use all available GPUs to run
    /// the model. Setting it to `false` deactivates `num_shard`.
    #[clap(long, env)]
    sharded: Option<bool>,

    /// The number of shards to use if you don't want to use all GPUs on a given machine.
    /// You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num_shard 2`
    /// and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num_shard 2` to
    /// launch 2 copies with 2 shards each on a given machine with 4 GPUs for instance.
    #[clap(long, env)]
    num_shard: Option<usize>,

    /// Whether you want the model to be quantized. This will use bitsandbytes for
    /// quantization on the fly.
    #[clap(long, env)]
    quantize: bool,

    /// The maximum amount of concurrent requests for this particular deployment.
    /// Having a low limit will refuse client requests instead of having them
    /// wait for too long and is usually good to handle backpressure correctly.
    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,

    /// This is the maximum allowed value for clients to set `best_of`.
    /// Best of makes `n` generations at the same time, and returns the best
    /// in terms of overall log probability over the entire generated sequence.
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,

    /// This is the maximum allowed value for clients to set `stop_sequences`.
    /// Stop sequences are used to allow the model to stop on more than just
    /// the EOS token, and enable more complex "prompting" where users can preprompt
    /// the model in a specific way and define their "own" stop token aligned with
    /// their prompt.
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,

    /// This is the maximum allowed input length (expressed in number of tokens)
    /// for users. The larger this value, the longer prompts users can send, which
    /// can impact the overall memory required to handle the load.
    /// Please note that some models have a finite range of sequence lengths they can handle.
    #[clap(default_value = "1000", long, env)]
    max_input_length: usize,

    /// This is the most important value to set as it defines the "memory budget"
    /// of running client requests.
    /// Clients will send input sequences and ask to generate `max_new_tokens`
    /// on top. With a value of `1512`, users can send either a prompt of
    /// `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for
    /// `1511` max_new_tokens.
    /// The larger this value, the more RAM each request will use
    /// and the less effective batching can be.
    #[clap(default_value = "1512", long, env)]
    max_total_tokens: usize,

    /// The maximum allowed batch size during dynamic batching.
    /// Using `max_batch_total_tokens` should be favored in general
    /// as it's a finer way to control RAM usage.
    #[clap(long, env)]
    max_batch_size: Option<usize>,

    /// This represents the ratio of waiting queries vs running queries where
    /// you want to start considering pausing the running queries to include the waiting
    /// ones into the same batch.
    /// `waiting_served_ratio=1.2` means that when 12 queries are waiting and there are
    /// only 10 queries left in the current batch, we check if we can fit those 12
    /// waiting queries into the batching strategy, and if yes, then batching happens,
    /// delaying the 10 running queries by a `prefill` run.
    ///
    /// This setting is only applied if there is room in the batch
    /// as defined by `max_batch_total_tokens`.
    #[clap(default_value = "1.2", long, env)]
    waiting_served_ratio: f32,

    /// **IMPORTANT** This is one critical control to allow maximum usage
    /// of the available hardware.
    ///
    /// This represents the total amount of potential tokens within a batch.
    /// When using padding (not recommended) this would be equivalent to
    /// `batch_size` * `max_total_tokens`.
    ///
    /// However in the non-padded (flash attention) version this can be much finer.
    ///
    /// For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100`
    /// or a single query of `1000` tokens.
    ///
    /// So you don't have to control `max_batch_size`
    /// or `max_total_tokens` that finely. In fact you could mostly relax them if you
    /// want maximum flexibility. However, if your users ask for the full amount of
    /// total tokens, they are likely to wait for a very long time to get a spot
    /// in the batch (since they are going to be alone), so setting `max_batch_size`
    /// and `max_total_tokens` can still be useful to prevent those long waiting times.
    ///
    /// Overall this number should be the largest possible amount that fits the
    /// remaining memory (after the model is loaded). Since the actual memory overhead
    /// depends on other parameters like whether you're using quantization, flash attention,
    /// or the model implementation, text-generation-inference cannot infer this number
    /// automatically.
    #[clap(default_value = "32000", long, env)]
    max_batch_total_tokens: u32,

    /// This setting defines how many tokens can be passed before forcing the waiting
    /// queries to be put on the batch (if the size of the batch allows for it).
    /// New queries require 1 `prefill` forward, which is different from `decode`,
    /// and therefore you need to pause the running batch in order to run `prefill`
    /// to create the correct values for the waiting queries to be able to join the batch.
    ///
    /// With a value too small, queries will always "steal" the compute to run `prefill`,
    /// and running queries will be delayed by a lot.
    ///
    /// With a value too big, waiting queries could wait for a very long time
    /// before being allowed a slot in the running batch. If your server is busy,
    /// that means that requests that could run in ~2s on an empty server could
    /// end up running in ~20s because the query had to wait for 18s.
    ///
    /// This number is expressed in number of tokens to make it a bit more
    /// "model" agnostic, but what should really matter is the overall latency
    /// for end users.
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,
#[clap(default_value = "3000", long, short, env)]
|
#[clap(default_value = "3000", long, short, env)]
|
||||||
|
|
||||||
|
/// The port to listen on.
|
||||||
port: u16,
|
port: u16,
|
||||||
|
|
||||||
|
/// The name of the socket for gRPC communication between the webserver
|
||||||
|
/// and the shards.
|
||||||
#[clap(default_value = "/tmp/text-generation-server", long, env)]
|
#[clap(default_value = "/tmp/text-generation-server", long, env)]
|
||||||
shard_uds_path: String,
|
shard_uds_path: String,
|
||||||

    /// The address the master shard will listen on. (setting used by torch distributed)
    #[clap(default_value = "localhost", long, env)]
    master_addr: String,

    /// The port the master shard will listen on. (setting used by torch distributed)
    #[clap(default_value = "29500", long, env)]
    master_port: usize,

    /// The location of the huggingface hub cache.
    /// Used to override the location if you want to provide a mounted disk for instance
    #[clap(long, env)]
    huggingface_hub_cache: Option<String>,

    /// The location of the huggingface hub cache.
    /// Used to override the location if you want to provide a mounted disk for instance
    #[clap(long, env)]
    weights_cache_override: Option<String>,

    /// For some models (like bloom), text-generation-inference implemented custom
    /// cuda kernels to speed up inference. Those kernels were only tested on A100.
    /// Use this flag to disable them if you're running on different hardware and
    /// encounter issues.
    #[clap(long, env)]
    disable_custom_kernels: bool,

    /// Outputs the logs in JSON format (useful for telemetry)
    #[clap(long, env)]
    json_output: bool,

    #[clap(long, env)]
    otlp_endpoint: Option<String>,

    #[clap(long, env)]
    cors_allow_origin: Vec<String>,

    #[clap(long, env)]
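
For illustration only, here is a rough sketch of how several of the arguments documented above might be combined on a 2-GPU machine. Flag names assume clap's default kebab-case rendering of the struct fields, and every value is an arbitrary example (mostly mirroring the documented defaults), not a recommendation:

```shell
# Hypothetical launch on GPUs 0 and 1, sharding the default model across both.
# Adjust the model, port, and token budgets to your own hardware and workload.
CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher \
    --model-id bigscience/bloom-560m \
    --num-shard 2 \
    --port 3000 \
    --max-input-length 1000 \
    --max-total-tokens 1512 \
    --max-batch-total-tokens 32000
```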