fix: adjust test to only run on cuda
commit c396c54231
parent 541c476492
@@ -1,32 +1,30 @@
-import pytest
 import torch
 from transformers import AutoTokenizer
 from text_generation_server.pb import generate_pb2
-from text_generation_server.models.flash_causal_lm import (
-    FlashCausalLMBatch,
-    FlashCausalLM,
-)
-from text_generation_server.models.custom_modeling.flash_llama_modeling import (
-    FlashLlamaForCausalLM,
-)
 from text_generation_server.models.globals import set_adapter_to_index
 from text_generation_server.utils.import_utils import SYSTEM, empty_cache, synchronize
 from unittest.mock import Mock
 import base64
 
+if SYSTEM == "cuda":
+    from text_generation_server.models.flash_causal_lm import (
+        FlashCausalLMBatch,
+        FlashCausalLM,
+    )
+    from text_generation_server.models.custom_modeling.flash_llama_modeling import (
+        FlashLlamaForCausalLM,
+    )
 
 model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 
 set_adapter_to_index({})
 
 
-def test_flash_causal_lm_warmup():
-    if SYSTEM == "cuda":
-        flash_causal_lm_warmup()
-    else:
-        pytest.skip("Test only runs on CUDA")
+if SYSTEM == "cuda":
+    def test_flash_causal_lm_warmup():
+        flash_causal_lm_warmup()
 
 
 def flash_causal_lm_warmup():
     revision = "main"
 
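Read as a whole, this hunk moves the flash-model imports and the test definition behind a CUDA check, so the test module can at least be imported on other systems. Below is a condensed sketch of the resulting module layout, reconstructed from the added side of the hunk above; the file path is not shown in this view, a few unrelated imports are omitted, and the body of flash_causal_lm_warmup is truncated in the hunk, so it is elided here.

import torch
from text_generation_server.models.globals import set_adapter_to_index
from text_generation_server.utils.import_utils import SYSTEM

# Both the flash model imports and the test definition sit behind the SYSTEM
# check (presumably because these classes only import cleanly with CUDA).
# On any other system the test function is never defined, so pytest simply
# has nothing to collect.
if SYSTEM == "cuda":
    from text_generation_server.models.flash_causal_lm import (
        FlashCausalLMBatch,
        FlashCausalLM,
    )
    from text_generation_server.models.custom_modeling.flash_llama_modeling import (
        FlashLlamaForCausalLM,
    )

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

set_adapter_to_index({})


if SYSTEM == "cuda":
    def test_flash_causal_lm_warmup():
        flash_causal_lm_warmup()


def flash_causal_lm_warmup():
    revision = "main"
    ...  # body continues beyond this hunk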
@@ -398,7 +398,7 @@ class HeterogeneousNextTokenChooser:
 
         next_logprobs = torch.gather(logprobs, 1, next_ids.view(-1, 1)).view(-1)
 
-        if speculate > 0:
+        if speculate and speculate > 0:
             if speculative_scores is not None:
                 # Medusa provided some scores
                 speculative_ids = Greedy()(speculative_scores)
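The only change in this hunk is the extra truthiness check on speculate. In Python 3 an expression like None > 0 raises a TypeError, so the new guard lets a speculate value of None (or 0) fall through to the non-speculative path instead of erroring; presumably that is the case being guarded against here. A minimal standalone sketch of the difference, using a hypothetical helper that is not part of the TGI code:

def wants_speculation(speculate):
    # Old guard: `speculate > 0` raises TypeError when speculate is None.
    # New guard: `speculate and speculate > 0` short-circuits on None/0.
    return bool(speculate and speculate > 0)


assert wants_speculation(2) is True
assert wants_speculation(0) is False
assert wants_speculation(None) is False  # the old guard would raise here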