doc(launcher): add more docs to the `launcher` itself and link in the README (#257)
commit b0b97fd9a7 (parent 593a563414)

@@ -84,6 +84,11 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
```

**Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.
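
As a quick sanity check that Docker can see the GPUs once the toolkit is installed, running `nvidia-smi` inside a CUDA base image should report the host driver and the CUDA version it supports (the image tag below is only an example):

```shell
# The image tag is illustrative; any recent CUDA base image works.
docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi
```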

To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the CLI):

```
text-generation-launcher --help
```

You can then query the model using either the `/generate` or `/generate_stream` routes:

```shell
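# Illustrative request to the `/generate` route, assuming the server started with
# the docker command above is listening on port 8080:
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'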

@@ -18,52 +18,182 @@ use subprocess::{ExitStatus, Popen, PopenConfig, PopenError, Redirection};

#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    /// The name of the model to load.
    /// Can be a MODEL_ID as listed on <https://hf.co/models> like
    /// `gpt2` or `OpenAssistant/oasst-sft-1-pythia-12b`.
    /// Or it can be a local directory containing the necessary files
    /// as saved by `save_pretrained(...)` methods of transformers
    #[clap(default_value = "bigscience/bloom-560m", long, env)]
    model_id: String,

    /// The actual revision of the model if you're referring to a model
    /// on the hub. You can use a specific commit id or a branch like `refs/pr/2`.
    #[clap(long, env)]
    revision: Option<String>,

    /// Whether to shard the model across multiple GPUs.
    /// By default text-generation-inference will use all available GPUs to run
    /// the model. Setting it to `false` deactivates `num_shard`.
    #[clap(long, env)]
    sharded: Option<bool>,

    /// The number of shards to use if you don't want to use all GPUs on a given machine.
    /// You can use `CUDA_VISIBLE_DEVICES=0,1 text-generation-launcher... --num-shard 2`
    /// and `CUDA_VISIBLE_DEVICES=2,3 text-generation-launcher... --num-shard 2` to
    /// launch 2 copies with 2 shards each on a given machine with 4 GPUs for instance.
    #[clap(long, env)]
    num_shard: Option<usize>,

    /// Whether you want the model to be quantized or not. This will use bitsandbytes for
    /// quantization on the fly.
    #[clap(long, env)]
    quantize: bool,

    /// The maximum number of concurrent requests for this particular deployment.
    /// A low limit will reject client requests instead of having them
    /// wait for too long, and is usually good to handle backpressure correctly.
    #[clap(default_value = "128", long, env)]
    max_concurrent_requests: usize,

    /// This is the maximum allowed value for clients to set `best_of`.
    /// Best of makes `n` generations at the same time, and returns the best
    /// in terms of overall log probability over the entire generated sequence.
    #[clap(default_value = "2", long, env)]
    max_best_of: usize,

    /// This is the maximum allowed value for clients to set `stop_sequences`.
    /// Stop sequences are used to allow the model to stop on more than just
    /// the EOS token, and enable more complex "prompting" where users can preprompt
    /// the model in a specific way and define their "own" stop token aligned with
    /// their prompt.
    #[clap(default_value = "4", long, env)]
    max_stop_sequences: usize,

    /// This is the maximum allowed input length (expressed in number of tokens)
    /// for users. The larger this value, the longer prompts users can send, which
    /// can impact the overall memory required to handle the load.
    /// Please note that some models have a finite sequence length they can handle.
    #[clap(default_value = "1000", long, env)]
    max_input_length: usize,

    /// This is the most important value to set as it defines the "memory budget"
    /// of running client requests.
    /// Clients will send input sequences and ask to generate `max_new_tokens`
    /// on top. With a value of `1512`, users can send either a prompt of
    /// `1000` and ask for `512` new tokens, or send a prompt of `1` and ask for
    /// `1511` max_new_tokens.
    /// The larger this value, the larger each request will be in your RAM
    /// and the less effective batching can be.
    #[clap(default_value = "1512", long, env)]
    max_total_tokens: usize,

    /// The maximum allowed batch size during dynamic batching.
    /// Using `max_batch_total_tokens` should be favored in general
    /// as it's a finer way to control RAM usage.
    #[clap(long, env)]
    max_batch_size: Option<usize>,

    /// This represents the ratio of waiting queries vs running queries where
    /// you want to start considering pausing the running queries to include the waiting
    /// ones into the same batch.
    /// `waiting_served_ratio=1.2` means that when 12 queries are waiting and there are
    /// only 10 queries left in the current batch, we check whether we can fit those 12
    /// waiting queries into the batching strategy, and if so, batching happens,
    /// delaying the 10 running queries by a `prefill` run.
    ///
    /// This setting is only applied if there is room in the batch
    /// as defined by `max_batch_total_tokens`.
    #[clap(default_value = "1.2", long, env)]
    waiting_served_ratio: f32,

    /// **IMPORTANT** This is one critical control to allow maximum usage
    /// of the available hardware.
    ///
    /// This represents the total amount of potential tokens within a batch.
    /// When using padding (not recommended) this would be equivalent to
    /// `batch_size` * `max_total_tokens`.
    ///
    /// However in the non-padded (flash attention) version this can be much finer.
    ///
    /// For `max_batch_total_tokens=1000`, you could fit `10` queries of `total_tokens=100`
    /// or a single query of `1000` tokens.
    ///
    /// So you don't have to control `max_batch_size` or `max_total_tokens` that finely.
    /// In fact you could mostly relax them if you want maximum flexibility.
    /// However, if your users ask for the full amount of total tokens, they are likely
    /// to wait for a very long time to get a spot in the batch (since they will be
    /// alone), so setting `max_batch_size` and `max_total_tokens` can still be useful
    /// to prevent those long waiting times.
    ///
    /// Overall this number should be the largest possible amount that fits the
    /// remaining memory (after the model is loaded). Since the actual memory overhead
    /// depends on other parameters, like whether you're using quantization, flash attention,
    /// or the model implementation, text-generation-inference cannot infer this number
    /// automatically.
    #[clap(default_value = "32000", long, env)]
    max_batch_total_tokens: u32,

    /// This setting defines how many tokens can be passed before forcing the waiting
    /// queries to be put on the batch (if the size of the batch allows for it).
    /// New queries require 1 `prefill` forward, which is different from `decode`,
    /// and therefore you need to pause the running batch in order to run `prefill`
    /// to create the correct values for the waiting queries to be able to join the batch.
    ///
    /// With a value too small, queries will always "steal" the compute to run `prefill`,
    /// and running queries will be delayed by a lot.
    ///
    /// With a value too big, waiting queries could wait for a very long time
    /// before being allowed a slot in the running batch. If your server is busy,
    /// that means that requests that could run in ~2s on an empty server could
    /// end up running in ~20s because the query had to wait for 18s.
    ///
    /// This number is expressed in number of tokens to make it a bit more
    /// "model" agnostic, but what should really matter is the overall latency
    /// for end users.
    #[clap(default_value = "20", long, env)]
    max_waiting_tokens: usize,

    /// The port to listen on.
    #[clap(default_value = "3000", long, short, env)]
    port: u16,

    /// The name of the socket for gRPC communication between the webserver
    /// and the shards.
    #[clap(default_value = "/tmp/text-generation-server", long, env)]
    shard_uds_path: String,

    /// The address the master shard will listen on. (setting used by torch distributed)
    #[clap(default_value = "localhost", long, env)]
    master_addr: String,

    /// The port the master shard will listen on. (setting used by torch distributed)
    #[clap(default_value = "29500", long, env)]
    master_port: usize,

    /// The location of the huggingface hub cache.
    /// Used to override the location if you want to provide a mounted disk for instance
    #[clap(long, env)]
    huggingface_hub_cache: Option<String>,

    /// The location of the model weights cache.
    /// Used to override the location if you want to provide a mounted disk for instance
    #[clap(long, env)]
    weights_cache_override: Option<String>,

    /// For some models (like bloom), text-generation-inference implemented custom
    /// cuda kernels to speed up inference. Those kernels were only tested on A100.
    /// Use this flag to disable them if you're running on different hardware and
    /// encounter issues.
    #[clap(long, env)]
    disable_custom_kernels: bool,

    /// Outputs the logs in JSON format (useful for telemetry)
    #[clap(long, env)]
    json_output: bool,

    #[clap(long, env)]
    otlp_endpoint: Option<String>,

    #[clap(long, env)]
    cors_allow_origin: Vec<String>,

    #[clap(long, env)]
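
Every option in the excerpt above is declared with `#[clap(long, env)]` (or a variant with a default value), so each one can be supplied either as a command-line flag (clap turns the snake_case field names into `--kebab-case` flags) or as an environment variable (assuming clap's default upper-snake-case env names). A sketch of a possible invocation, with illustrative values only:

```shell
# Illustrative values only; flags mirror the snake_case fields documented above.
text-generation-launcher \
    --model-id bigscience/bloom-560m \
    --num-shard 2 \
    --max-input-length 1000 \
    --max-total-tokens 1512 \
    --max-batch-total-tokens 32000 \
    --port 3000

# The same options via environment variables (assuming clap's default env-var names):
MODEL_ID=bigscience/bloom-560m NUM_SHARD=2 PORT=3000 text-generation-launcher
```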