From 1ad3250b8997576fdc52aa99d777c56b9dfed254 Mon Sep 17 00:00:00 2001
From: OlivierDehaene
Date: Wed, 8 Feb 2023 17:53:33 +0100
Subject: [PATCH] fix(docker): increase shm size (#60)

---
 Dockerfile                      |  2 --
 README.md                       | 37 ++++++++++++++++++++++++++++-----
 launcher/src/main.rs            | 11 ++++++++--
 server/text_generation/utils.py | 14 ++++++-------
 4 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 93846d77..03e7e5dc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,9 +30,7 @@ ENV LANG=C.UTF-8 \
     MODEL_ID=bigscience/bloom-560m \
     QUANTIZE=false \
     NUM_SHARD=1 \
-    SAFETENSORS_FAST_GPU=1 \
     PORT=80 \
-    NCCL_ASYNC_ERROR_HANDLING=1 \
     CUDA_HOME=/usr/local/cuda \
     LD_LIBRARY_PATH="/opt/miniconda/envs/text-generation/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \
     CONDA_DEFAULT_ENV=text-generation \
diff --git a/README.md b/README.md
index 54851c2a..9763578c 100644
--- a/README.md
+++ b/README.md
@@ -25,8 +25,9 @@ to power LLMs api-inference widgets.
 - [Officially Supported Models](#officially-supported-models)
 - [Get Started](#get-started)
   - [Docker](#docker)
+  - [API Documentation](#api-documentation)
+  - [A note on Shared Memory](#a-note-on-shared-memory-shm)
   - [Local Install](#local-install)
-  - [OpenAPI](#api-documentation)
   - [CUDA Kernels](#cuda-kernels)
 - [Run BLOOM](#run-bloom)
   - [Download](#download)
@@ -54,7 +55,7 @@ to power LLMs api-inference widgets.
 - ~~[Galactica](https://huggingface.co/facebook/galactica-120b)~~ (deactivated)
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
 - [GPT-Neox 20B](https://huggingface.co/EleutherAI/gpt-neox-20b)
-- [FLAN-T5-XXL](https://huggingface.co/google/flan-t5-xxl): use `--revision pr/26`
+- [FLAN-T5-XXL](https://huggingface.co/google/flan-t5-xxl)
 
 Other models are supported on a best effort basis using:
@@ -75,7 +76,7 @@ model=bigscience/bloom-560m
 num_shard=2
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
 
-docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --num-shard $num_shard
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --num-shard $num_shard
 ```
 
 You can then query the model using either the `/generate` or `/generate_stream` routes:
@@ -101,6 +102,32 @@ curl 127.0.0.1:8080/generate_stream \
 You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route.
 The Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).
 
+### A note on Shared Memory (shm)
+
+[`NCCL`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) is a communication framework used by
+`PyTorch` to do distributed training/inference. `text-generation-inference` makes
+use of `NCCL` to enable Tensor Parallelism to dramatically speed up inference for large language models.
+
+In order to share data between the different devices of an `NCCL` group, `NCCL` might fall back to using the host memory if
+peer-to-peer communication using NVLink or PCI is not possible.
+
+To allow the container to use 1G of Shared Memory and support SHM sharing, we add `--shm-size 1g` to the command above.
+
+If you are running `text-generation-inference` inside `Kubernetes`, you can also add Shared Memory to the container by
+creating a volume with:
+
+```yaml
+- name: shm
+  emptyDir:
+   medium: Memory
+   sizeLimit: 1Gi
+```
+
+and mounting it to `/dev/shm`.
+
+Finally, you can also disable SHM sharing by using the `NCCL_SHM_DISABLE=1` environment variable. However, note that
+this will impact performance.
+
 ### Local install
 
 You can also opt to install `text-generation-inference` locally.
@@ -122,10 +149,10 @@ BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork
 make run-bloom-560m
 ```
 
-**Note:** on some machines, you may also need the OpenSSL libraries. On Linux machines, run:
+**Note:** on some machines, you may also need the OpenSSL libraries and gcc. On Linux machines, run:
 
 ```shell
-sudo apt-get install libssl-dev
+sudo apt-get install libssl-dev gcc -y
 ```
 
 ### CUDA Kernels
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 6684c3de..49175570 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -38,9 +38,9 @@ struct Args {
     port: u16,
     #[clap(default_value = "/tmp/text-generation-server", long, env)]
     shard_uds_path: String,
-    #[clap(default_value = "0.0.0.0", long, env)]
+    #[clap(default_value = "localhost", long, env)]
     master_addr: String,
-    #[clap(default_value = "6000", long, env)]
+    #[clap(default_value = "29500", long, env)]
     master_port: usize,
     #[clap(long, env)]
     json_output: bool,
@@ -305,6 +305,7 @@ fn shard_manager(
         ("MASTER_ADDR".into(), master_addr.into()),
         ("MASTER_PORT".into(), master_port.to_string().into()),
         ("SAFETENSORS_FAST_GPU".into(), "1".into()),
+        ("NCCL_ASYNC_ERROR_HANDLING".into(), "1".into()),
     ];
 
     // If the HUGGINGFACE_HUB_CACHE env var is set, pass it to the shard
@@ -322,6 +323,12 @@ fn shard_manager(
         ));
     };
 
+    // If the NCCL_SHM_DISABLE env var is set, pass it to the shard
+    // needed when running NCCL inside a docker container and when you can't increase shm size
+    if let Ok(nccl_shm_disable) = env::var("NCCL_SHM_DISABLE") {
+        env.push(("NCCL_SHM_DISABLE".into(), nccl_shm_disable.into()));
+    };
+
     // If the CUDA_VISIBLE_DEVICES env var is set, pass it to the shard
     if let Ok(cuda_visible_devices) = env::var("CUDA_VISIBLE_DEVICES") {
         env.push(("CUDA_VISIBLE_DEVICES".into(), cuda_visible_devices.into()));
diff --git a/server/text_generation/utils.py b/server/text_generation/utils.py
index 2821a124..5f0a2119 100644
--- a/server/text_generation/utils.py
+++ b/server/text_generation/utils.py
@@ -162,29 +162,29 @@ def initialize_torch_distributed():
     world_size = int(os.getenv("WORLD_SIZE", "1"))
 
     if torch.cuda.is_available():
-        # initialized `torch.distributed`
+        from torch.distributed import ProcessGroupNCCL
         # Set the device id.
         assert world_size <= torch.cuda.device_count(), "Each process is one gpu"
         device = rank % torch.cuda.device_count()
         torch.cuda.set_device(device)
         backend = "nccl"
+        options = ProcessGroupNCCL.Options()
+        options.is_high_priority_stream = True
+        options._timeout = timedelta(seconds=60)
     else:
         backend = "gloo"
-
-    master_ip = os.getenv("MASTER_ADDR", "0.0.0.0")
-    master_port = os.getenv("MASTER_PORT", "6000")
-    init_method = f"tcp://{master_ip}:{master_port}"
+        options = None
 
     # Call the init process.
     torch.distributed.init_process_group(
         backend=backend,
-        init_method=init_method,
         world_size=world_size,
         rank=rank,
         timeout=timedelta(seconds=60),
+        pg_options=options
     )
 
-    return torch.distributed.distributed_c10d._get_default_group(), rank, world_size
+    return torch.distributed.group.WORLD, rank, world_size
 
 
 def weight_hub_files(model_id, revision=None, extension=".safetensors"):
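
Note: with the explicit `tcp://` `init_method` removed, `torch.distributed.init_process_group` falls back to its default `env://` rendezvous, which reads `MASTER_ADDR` and `MASTER_PORT` (and, if not passed as arguments, `RANK` and `WORLD_SIZE`) from the environment. That is why the launcher now exports `MASTER_ADDR`/`MASTER_PORT` to every shard and defaults them to `localhost:29500`. A minimal sketch of that behaviour, using the `gloo` backend and a single process so it runs on a CPU-only machine:

```python
import os
from datetime import timedelta

import torch.distributed as dist

# The default env:// rendezvous reads these coordinates instead of an explicit
# init_method="tcp://host:port" string; the values mirror the launcher defaults.
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

dist.init_process_group(
    backend="gloo",  # NCCL needs a GPU; gloo keeps the sketch runnable anywhere
    timeout=timedelta(seconds=60),
)
print(dist.get_rank(), dist.get_world_size())  # -> 0 1
dist.destroy_process_group()
```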
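
Note: `--shm-size 1g` (or the Kubernetes `emptyDir` volume mounted at `/dev/shm`) only matters when the container's shared-memory mount is too small for NCCL, and `NCCL_SHM_DISABLE=1` trades that requirement for slower host-memory transport. A rough sketch of how one might check what a container actually received; the 1 GiB threshold simply mirrors the value recommended in the README change:

```python
import os


def shm_size_bytes(path: str = "/dev/shm") -> int:
    """Total size of the shared-memory mount, in bytes."""
    stats = os.statvfs(path)
    return stats.f_frsize * stats.f_blocks


if __name__ == "__main__":
    size = shm_size_bytes()
    print(f"/dev/shm: {size / 2**30:.2f} GiB")
    if size < 2**30:
        # Matches the README guidance: either grow /dev/shm (--shm-size 1g on
        # Docker, an emptyDir volume on Kubernetes) or set NCCL_SHM_DISABLE=1.
        print("Shared memory is below 1 GiB; NCCL may need NCCL_SHM_DISABLE=1.")
```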