Merge cluster to master #3
|
@ -12,17 +12,10 @@ def worker():
|
|||
(request_json_body, client_ip, token, parameters, backend_url), event_id, selected_model = priority_queue.get()
|
||||
if not backend_url:
|
||||
backend_url = get_a_cluster_backend(selected_model)
|
||||
else:
|
||||
backend_url = cluster_config.validate_backend(backend_url)
|
||||
backend_info = cluster_config.get_backend(backend_url)
|
||||
|
||||
# The backend could have died between when the request was
|
||||
# submitted and now, so let's double check it's still online.
|
||||
if not backend_info['online']:
|
||||
old = backend_url
|
||||
backend_url = get_a_cluster_backend()
|
||||
backend_info = cluster_config.get_backend(backend_url)
|
||||
print(f'Backend {old} offline. Request was redirected to {backend_url}')
|
||||
del old # gc
|
||||
|
||||
if not selected_model:
|
||||
selected_model = backend_info['model']
|
||||
|
||||
|
@ -67,7 +60,7 @@ def need_to_wait(backend_url: str):
|
|||
active_workers = redis.get(f'active_gen_workers:{backend_url}', 0, dtype=int)
|
||||
concurrent_gens = cluster_config.get_backend(backend_url).get('concurrent_gens', 1)
|
||||
s = time.time()
|
||||
print(active_workers >= concurrent_gens, active_workers, concurrent_gens)
|
||||
print(active_workers)
|
||||
while active_workers >= concurrent_gens:
|
||||
time.sleep(0.01)
|
||||
e = time.time()
|
||||
|
|
|
@ -30,6 +30,7 @@ from llm_server.routes.v1 import bp
|
|||
from llm_server.routes.v1.generate_stats import generate_stats
|
||||
from llm_server.sock import init_socketio
|
||||
|
||||
# TODO: queue item timeout
|
||||
# TODO: return an `error: True`, error code, and error message rather than just a formatted message
|
||||
# TODO: what happens when all backends are offline? What about the "online" key in the stats page?
|
||||
# TODO: decide whether to use redis SCAN instead of KEYS (KEYS walks the whole keyspace in one blocking call)
|
||||
|
|
Reference in New Issue