diff --git a/llm_server/llm/vllm/tokenize.py b/llm_server/llm/vllm/tokenize.py
index 1e3c2a1..2df7789 100644
--- a/llm_server/llm/vllm/tokenize.py
+++ b/llm_server/llm/vllm/tokenize.py
@@ -4,6 +4,8 @@ import requests
 import tiktoken
 
 from llm_server import opts
+from llm_server.cluster.backend import get_a_cluster_backend
+from llm_server.cluster.cluster_config import cluster_config
 
 
 def tokenize(prompt: str, backend_url: str) -> int:
@@ -11,6 +13,16 @@ def tokenize(prompt: str, backend_url: str) -> int:
     assert isinstance(prompt, str)
     assert isinstance(backend_url, str)
 
+    # TODO: put this in a shared function
+    # The backend could have died between when the request was
+    # submitted and now, so let's double check it's still online.
+    backend_info = cluster_config.get_backend(backend_url)
+    if not backend_info['online']:
+        old = backend_url
+        backend_url = get_a_cluster_backend()
+        print(f'Backend {old} offline. Request was redirected to {backend_url}')
+        del old  # gc
+
     if not prompt:
         # The tokenizers have issues when the prompt is None.
         return 0
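
The TODO in the hunk above notes that this online-check-and-redirect logic belongs in a shared function. A minimal sketch of what that helper might look like, reusing the same cluster_config.get_backend() and get_a_cluster_backend() calls from the diff; the helper name ensure_online_backend is hypothetical and the sketch assumes get_backend() returns a dict with an 'online' key, as implied by the added code:

    # Hypothetical shared helper, not part of this diff.
    from llm_server.cluster.backend import get_a_cluster_backend
    from llm_server.cluster.cluster_config import cluster_config


    def ensure_online_backend(backend_url: str) -> str:
        """Return backend_url if it is still online, otherwise pick a new backend.

        The backend could have died between when the request was submitted
        and now, so callers re-check it before using it.
        """
        backend_info = cluster_config.get_backend(backend_url)
        if not backend_info['online']:
            new_url = get_a_cluster_backend()
            print(f'Backend {backend_url} offline. Request was redirected to {new_url}')
            return new_url
        return backend_url

With a helper like this, tokenize() (and any other call site that duplicates the check) would reduce to `backend_url = ensure_online_backend(backend_url)`.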