119 lines
4.6 KiB
Python
119 lines
4.6 KiB
Python
import json
|
|
import threading
|
|
import traceback
|
|
from uuid import uuid4
|
|
|
|
import ujson
|
|
from redis import Redis
|
|
|
|
from llm_server.cluster.cluster_config import cluster_config
|
|
from llm_server.custom_redis import RedisCustom, redis
|
|
from llm_server.llm.generator import generator
|
|
from llm_server.routes.queue import DataEvent, RedisPriorityQueue, decr_active_workers, decrement_ip_count, incr_active_workers, increment_ip_count
|
|
|
|
# Redis client bound to database 8. In this module it holds the per-worker
# streams that streamed generations are written to (see the xadd/delete calls).
stream_redis = Redis(db=8)
|
|
|
|
# Namespace placed in front of every stream key built by get_stream_name().
STREAM_NAME_PREFIX = 'stream'


def get_stream_name(name: str):
    """Build the Redis stream key for ``name``.

    The key is ``STREAM_NAME_PREFIX`` and ``name`` joined by a colon. The
    function does not check whether ``name`` already carries the prefix, so
    calling it on its own output prefixes the key a second time.
    """
    return '{}:{}'.format(STREAM_NAME_PREFIX, name)
|
|
|
|
|
|
def inference_do_stream(stream_name: str, msg_to_backend: dict, backend_url: str, event_id: str) -> None:
    """Run a streaming generation and publish its output to a Redis stream.

    ``msg_to_backend`` is sent to ``backend_url`` through ``generator`` and the
    response body is consumed one byte at a time. The body is handled as a
    sequence of JSON documents, each terminated by a NUL byte (``b'\\x00'``).
    For every document, ``json_obj['text'][0]`` is split on the prompt plus the
    text generated so far, and the remainder is published as the newly
    generated piece.

    Every entry added to the stream has a single ``data`` field containing a
    JSON object with the keys ``new`` (the new text or ``None``),
    ``completed`` (bool) and ``error`` (message or ``None``).

    Args:
        stream_name: Un-prefixed stream identifier; the actual Redis key is
            ``get_stream_name(stream_name)``.
        msg_to_backend: Payload for the backend. Must contain ``'prompt'``.
        backend_url: Backend address, passed through to ``generator``.
        event_id: Request identifier; only used to clear
            ``notifications:<event_id>`` before generation starts.

    Exceptions raised while generating are not propagated: they are printed
    and reported on the stream as an entry with ``completed`` set and
    ``error`` filled in. A ``KeyError`` for a missing ``'prompt'`` is raised
    before any of that handling is in place.
    """
    prompt = msg_to_backend['prompt']
    stream_key = get_stream_name(stream_name)
    redis.delete(f'notifications:{event_id}')
    # stream_key already carries the prefix. Passing it through
    # get_stream_name() again would target "stream:stream:<id>" and leave the
    # real stream untouched.
    stream_redis.delete(stream_key)  # be extra sure
    try:
        response = generator(msg_to_backend, backend_url)
        generated_text = ''
        partial_response = b''
        # NOTE: client-initiated cancellation (polling notifications:<event_id>
        # for a 'canceled' message and closing the response) is not implemented.
        for chunk in response.iter_content(chunk_size=1):
            # If there is no more data, break the loop
            if not chunk:
                break

            partial_response += chunk
            if not partial_response.endswith(b'\x00'):
                # Still in the middle of a JSON document.
                continue

            json_strs = partial_response.split(b'\x00')
            # Start a fresh buffer so documents that were already handled are
            # not parsed again when the next one arrives. Re-parsing them cost
            # quadratic work and re-published the previous document as an
            # empty 'new' entry.
            partial_response = b''
            for json_str in json_strs:
                if not json_str:
                    continue
                try:
                    json_obj = json.loads(json_str.decode())
                    new = json_obj['text'][0].split(prompt + generated_text)[1]
                except IndexError:
                    # The text did not contain prompt + generated_text (or
                    # 'text' was empty), so there is nothing new to publish.
                    continue
                generated_text = generated_text + new
                stream_redis.xadd(stream_key, {'data': ujson.dumps({'new': new, 'completed': False, 'error': None})})
    except Exception as e:
        stream_redis.xadd(stream_key, {'data': ujson.dumps({'new': None, 'completed': True, 'error': f'{e.__class__.__name__}: {e}'})})
        traceback.print_exc()
    finally:
        # Publish final message to Redis stream. This also runs after a
        # failure, so in that case the stream holds the error entry followed
        # by this generic completion entry.
        stream_redis.xadd(stream_key, {'data': ujson.dumps({'new': None, 'completed': True, 'error': None})})
|
|
|
|
|
|
def worker(backend_url) -> None:
    """Process queued inference requests for a single backend.

    Intended to run as a thread target (see ``start_workers``). The worker
    registers itself in the ``worker_status`` Redis namespace under a random
    UUID, then loops forever taking items from
    ``RedisPriorityQueue(backend_url)``.

    For each item:
      * streaming requests: the stream name is handed to
        ``DataEvent(event_id).set(...)`` and the generation is written to that
        stream by ``inference_do_stream``;
      * other requests: ``generator`` is called and its
        ``(success, response, error_msg)`` tuple is handed to
        ``DataEvent(event_id).set(...)``.

    Args:
        backend_url: Address of the backend this worker serves.

    The function only returns when the backend is reported as not online.
    Errors raised while handling a request are printed and the loop continues.
    """
    status_redis = RedisCustom('worker_status')
    worker_id = str(uuid4())
    status_redis.setp(worker_id, None)
    redis_queue = RedisPriorityQueue(backend_url)
    while True:
        # The token and timestamp fields of the queue item are not used here.
        (request_json_body, client_ip, _token, parameters), event_id, selected_model, _timestamp, do_stream = redis_queue.get()
        backend_info = cluster_config.get_backend(backend_url)

        if not backend_info['online']:
            # TODO: communicate to caller
            # redis.publish(event_id, 'offline')
            # NOTE(review): returning ends this worker for good and the item
            # that was just dequeued is dropped without any reply.
            return

        if not selected_model:
            selected_model = backend_info['model']

        stream_redis.delete(get_stream_name(worker_id))  # clean up any old streams
        increment_ip_count(client_ip, 'processing_ips')
        incr_active_workers(selected_model, backend_url)
        status_redis.setp(worker_id, ('generating', client_ip))

        try:
            if do_stream:
                # Return the name of the stream that the slave should connect to.
                event = DataEvent(event_id)
                event.set(get_stream_name(worker_id))

                msg_to_backend = {
                    **parameters,
                    'prompt': request_json_body['prompt'],
                    'stream': True,
                }
                inference_do_stream(worker_id, msg_to_backend, backend_url, event_id)
            else:
                # Normal inference (not streaming).
                success, response, error_msg = generator(request_json_body, backend_url)
                event = DataEvent(event_id)
                event.set((success, response, error_msg))
        except Exception:
            # Keep the worker alive after a failed request. Only Exception is
            # caught so interpreter-exit signals (SystemExit, KeyboardInterrupt
            # and other BaseException subclasses) are not swallowed.
            traceback.print_exc()
        finally:
            # Always undo the bookkeeping done before the request started.
            decrement_ip_count(client_ip, 'processing_ips')
            decr_active_workers(selected_model, backend_url)
            status_redis.setp(worker_id, None)
|
|
|
|
|
|
def start_workers(cluster: dict):
    """Spawn the inference worker threads for every entry in ``cluster``.

    ``cluster`` is iterated and each element is indexed with
    ``'concurrent_gens'`` (how many threads to start for it) and
    ``'backend_url'`` (the argument passed to ``worker``). All threads are
    daemon threads. The total number started is printed once at the end.
    """
    launched = []
    for backend in cluster:
        for _ in range(backend['concurrent_gens']):
            thread = threading.Thread(
                target=worker,
                args=(backend['backend_url'],),
                daemon=True,
            )
            thread.start()
            launched.append(thread)
    print(f'Started {len(launched)} inference workers.')
|