Add cuda graph sizes and make them the default. (#1703)

# What does this PR do?

```
text-generation-launcher --model-id XXX # Uses cuda graphs by default
text-generation-launcher --model-id XXX --cuda-graphs "1,2"  # Restrict the cuda graph batch sizes, which saves VRAM
text-generation-launcher --model-id XXX --cuda-graphs "0"  # Disable cuda graphs entirely
```
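
Cuda graphs are now captured by default for the batch sizes `1,2,4,8,16,32,64,96,128`; the same list can also be supplied through the `CUDA_GRAPHS` environment variable. This replaces the previous opt-in `--enable-cuda-graphs` flag and its `ENABLE_CUDA_GRAPHS` environment variable.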

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.

Commit `99874eae74` (parent `4ee0a0c401`) by Nicolas Patry, 2024-04-04 23:01:56 +02:00, committed by GitHub.
6 changed files with 48 additions and 22 deletions.

**`docs/source/basic_tutorials/launcher.md`**

````diff
@@ -206,12 +206,13 @@ Options:
           [env: MAX_BATCH_SIZE=]
 
 ```
-## ENABLE_CUDA_GRAPHS
+## CUDA_GRAPHS
 ```shell
-      --enable-cuda-graphs
-          Enable experimental support for cuda graphs
+      --cuda-graphs <CUDA_GRAPHS>
+          Specify the batch sizes to compute cuda graphs for. Use "0" to disable
 
-          [env: ENABLE_CUDA_GRAPHS=]
+          [env: CUDA_GRAPHS=]
+          [default: 1,2,4,8,16,32,64,96,128]
 
 ```
 ## HOSTNAME
````

**`integration-tests/conftest.py`**

```diff
@@ -383,7 +383,6 @@ def launcher(event_loop):
         env = {
             "LOG_LEVEL": "info,text_generation_router=debug",
-            "ENABLE_CUDA_GRAPHS": "true",
         }
         if not use_flash_attention:
             env["USE_FLASH_ATTENTION"] = "false"
```

**`launcher/src/main.rs`**

```diff
@@ -284,9 +284,15 @@ struct Args {
     #[clap(long, env)]
     max_batch_size: Option<usize>,
 
-    /// Enable experimental support for cuda graphs
-    #[clap(long, env)]
-    enable_cuda_graphs: bool,
+    /// Specify the batch sizes to compute cuda graphs for.
+    /// Use "0" to disable.
+    #[clap(
+        long,
+        env,
+        value_delimiter = ',',
+        default_value = "1,2,4,8,16,32,64,96,128"
+    )]
+    cuda_graphs: Vec<usize>,
 
     /// The IP address to listen on
     #[clap(default_value = "0.0.0.0", long, env)]
@@ -416,7 +422,7 @@ fn shard_manager(
     disable_custom_kernels: bool,
     watermark_gamma: Option<f32>,
     watermark_delta: Option<f32>,
-    enable_cuda_graphs: bool,
+    cuda_graphs: Vec<usize>,
     cuda_memory_fraction: f32,
     rope_scaling: Option<RopeScaling>,
     rope_factor: Option<f32>,
@@ -549,8 +555,16 @@ fn shard_manager(
     };
 
     // Enable experimental support for cuda graphs
-    if enable_cuda_graphs {
-        envs.push(("ENABLE_CUDA_GRAPHS".into(), "True".into()))
+    if !cuda_graphs.is_empty() {
+        envs.push((
+            "CUDA_GRAPHS".into(),
+            cuda_graphs
+                .into_iter()
+                .map(|c| c.to_string())
+                .collect::<Vec<_>>()
+                .join(",")
+                .into(),
+        ));
     }
 
     // If disable_custom_kernels is true, pass it to the shard as an env var
@@ -941,7 +955,11 @@ fn spawn_shards(
     let disable_custom_kernels = args.disable_custom_kernels;
     let watermark_gamma = args.watermark_gamma;
     let watermark_delta = args.watermark_delta;
-    let enable_cuda_graphs = args.enable_cuda_graphs;
+    let cuda_graphs: Vec<usize> = args
+        .cuda_graphs
+        .iter()
+        .filter_map(|&c| if c > 0 { Some(c) } else { None })
+        .collect();
     let cuda_memory_fraction = args.cuda_memory_fraction;
     let rope_scaling = args.rope_scaling;
     let rope_factor = args.rope_factor;
@@ -963,7 +981,7 @@ fn spawn_shards(
             disable_custom_kernels,
             watermark_gamma,
             watermark_delta,
-            enable_cuda_graphs,
+            cuda_graphs,
             cuda_memory_fraction,
             rope_scaling,
             rope_factor,
```
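
On the launcher side, zero entries are filtered out of `--cuda-graphs`, and the `CUDA_GRAPHS` environment variable is only exported to the shards when the remaining list is non-empty; this is how `--cuda-graphs "0"` disables graphs end to end.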

**`server/text_generation_server/models/flash_causal_lm.py`**

```diff
@@ -28,7 +28,7 @@ from text_generation_server.models.cache_manager import (
     BLOCK_SIZE,
 )
 from text_generation_server.pb import generate_pb2
-from text_generation_server.models.globals import MEM_POOL, ENABLE_CUDA_GRAPHS
+from text_generation_server.models.globals import MEM_POOL, CUDA_GRAPHS
 from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser
 from text_generation_server.utils.dist import MEMORY_FRACTION
@@ -798,11 +798,11 @@ class FlashCausalLM(Model):
                 self.device,
             )
 
-        if ENABLE_CUDA_GRAPHS:
+        if CUDA_GRAPHS:
             try:
-                logger.info("Experimental support for Cuda Graphs is enabled")
+                logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}")
                 # Warmup cuda graphs
-                for bs in [1, 2, 4] + [8 * i for i in range(1, 9)]:
+                for bs in CUDA_GRAPHS:
                     if self.speculate is None or self.speculate + 1 <= bs:
                         self.cuda_graph_warmup(bs, max_s, max_bt)
             except Exception:
```
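
To make the guard in the warmup loop concrete, here is a minimal standalone sketch (the helper name is invented for illustration and is not part of the PR) of which configured sizes actually get a graph captured for a given speculation depth:

```python
def eligible_graph_sizes(cuda_graphs, speculate=None):
    # Keep a configured size `bs` only when speculation is off or
    # `speculate + 1 <= bs`, mirroring the check in the warmup loop above.
    return [bs for bs in cuda_graphs if speculate is None or speculate + 1 <= bs]


print(eligible_graph_sizes([1, 2, 4, 8, 16], speculate=None))  # [1, 2, 4, 8, 16]
print(eligible_graph_sizes([1, 2, 4, 8, 16], speculate=3))     # [4, 8, 16]
```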

**`server/text_generation_server/models/globals.py`**

```diff
@@ -3,4 +3,12 @@ import os
 MEM_POOL = torch.cuda.graph_pool_handle()
 
 # This is overridden by the cli
-ENABLE_CUDA_GRAPHS = os.getenv("ENABLE_CUDA_GRAPHS", "false").lower() in {"1", "true"}
+cuda_graphs = os.getenv("CUDA_GRAPHS")
+if cuda_graphs is not None:
+    try:
+        cuda_graphs = [int(item) for item in cuda_graphs.split(",")]
+    except Exception as e:
+        raise RuntimeError(
+            f"Could not parse cuda graphs {cuda_graphs}, expected comma separated list for batch sizes to run on: {e}"
+        )
+CUDA_GRAPHS = cuda_graphs
```
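
For reference, a standalone sketch (re-implemented here without the error handling, not imported from the module) of what the module-level `CUDA_GRAPHS` value ends up as for a few launcher invocations:

```python
def parse_cuda_graphs(env_value):
    # `None` means the launcher did not export CUDA_GRAPHS at all
    # (e.g. --cuda-graphs "0"), so graphs stay disabled.
    if env_value is None:
        return None
    return [int(item) for item in env_value.split(",")]


print(parse_cuda_graphs("1,2,4,8,16,32,64,96,128"))  # the launcher default
print(parse_cuda_graphs("1,2"))                      # --cuda-graphs "1,2"
print(parse_cuda_graphs(None))                       # --cuda-graphs "0"
```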

**`server/text_generation_server/models/mamba.py`**

```diff
@@ -13,7 +13,7 @@ from text_generation_server.utils import (
     weight_files,
     Weights,
 )
-from text_generation_server.models.globals import ENABLE_CUDA_GRAPHS, MEM_POOL
+from text_generation_server.models.globals import CUDA_GRAPHS, MEM_POOL
 import time
 from text_generation_server.models.custom_modeling.mamba_modeling import (
     MambaModel,
@@ -465,12 +465,12 @@ class Mamba(Model):
 
     def warmup(self, batch) -> Optional[int]:
         # TODO: implement warmup for Mamba if needed
-        if ENABLE_CUDA_GRAPHS:
+        if CUDA_GRAPHS:
             if self.speculate is None or self.speculate == 0:
                 try:
-                    logger.info("Experimental support for Cuda Graphs is enabled")
+                    logger.info(f"Cuda Graphs are enabled for sizes {CUDA_GRAPHS}")
                     # Warmup cuda graphs
-                    for bs in [1, 2, 4] + [8 * i for i in range(1, 9)]:
+                    for bs in CUDA_GRAPHS:
                         self.cuda_graph_warmup(bs)
                 except Exception:
                     logger.exception(f"Decode cuda graph warmup failed")
```
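
For Mamba, cuda graphs are additionally gated on speculation being disabled (`self.speculate` unset or `0`), whereas the flash path only skips the individual sizes that are too small for the speculation depth.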