Merge cluster to master #3
|
@ -3,13 +3,7 @@ from llm_server.llm import oobabooga, vllm
|
||||||
|
|
||||||
|
|
||||||
def get_token_count(prompt: str, backend_url: str):
|
def get_token_count(prompt: str, backend_url: str):
|
||||||
assert isinstance(backend_url, str)
|
backend_url = cluster_config.validate_backend(backend_url)
|
||||||
|
|
||||||
if not prompt:
|
|
||||||
# The tokenizers have issues when the prompt is None.
|
|
||||||
return 0
|
|
||||||
assert isinstance(prompt, str)
|
|
||||||
|
|
||||||
backend_mode = cluster_config.get_backend(backend_url)['mode']
|
backend_mode = cluster_config.get_backend(backend_url)['mode']
|
||||||
if backend_mode == 'vllm':
|
if backend_mode == 'vllm':
|
||||||
return vllm.tokenize(prompt, backend_url)
|
return vllm.tokenize(prompt, backend_url)
|
||||||
|
|
|
@ -150,7 +150,6 @@ def openai_chat_completions():
|
||||||
# The worker incremented it, we'll decrement it.
|
# The worker incremented it, we'll decrement it.
|
||||||
decrement_ip_count(handler.client_ip, 'processing_ips')
|
decrement_ip_count(handler.client_ip, 'processing_ips')
|
||||||
decr_active_workers(handler.selected_model, handler.backend_url)
|
decr_active_workers(handler.selected_model, handler.backend_url)
|
||||||
print(len(generated_text))
|
|
||||||
|
|
||||||
return Response(generate(), mimetype='text/event-stream')
|
return Response(generate(), mimetype='text/event-stream')
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
Reference in New Issue