import json
import threading
import time
import traceback
from uuid import uuid4

import ujson
from redis import Redis

from llm_server.cluster.cluster_config import cluster_config
from llm_server.custom_redis import RedisCustom, redis
from llm_server.llm.generator import generator
from llm_server.logging import create_logger
from llm_server.routes.queue import DataEvent, RedisPriorityQueue, decr_active_workers, decrement_ip_count, incr_active_workers, increment_ip_count

# Dedicated Redis database used for the token streams.
stream_redis = Redis(db=8)

STREAM_NAME_PREFIX = 'stream'


def check_cancellation(event, event_id):
    """
    This thread checks the pub/sub channel in the background so the main process isn't bogged down with Redis calls.
    Otherwise, the main process slows down to 1 token/sec.

    The loop polls the `notifications:<event_id>` channel until `event` is set,
    either by this function (a `canceled` message arrived) or by the caller
    (generation finished).

    :param event: `threading.Event` shared with the streaming loop; set when the client cancels.
    :param event_id: ID of the request, used to build the channel name.
    :return: None
    """
    pubsub = redis.pubsub()
    try:
        pubsub.subscribe(f'notifications:{event_id}')
        while not event.is_set():
            message = pubsub.get_message()
            if message and message['data'] == b'canceled':
                event.set()
            time.sleep(0.5)  # check every half second
    finally:
        # Release the subscription/connection; one of these is opened per
        # streamed request, so leaving it open leaks connections.
        pubsub.close()


def get_stream_name(name: str):
    """Return the Redis stream key for `name` (prefixed with `STREAM_NAME_PREFIX`)."""
    return f'{STREAM_NAME_PREFIX}:{name}'


def inference_do_stream(stream_name: str, msg_to_backend: dict, backend_url: str, event_id: str):
    """
    Stream a generation from the backend into a Redis stream.

    Every new piece of text is appended to the stream as
    `{'new': <text>, 'completed': False, 'error': None}`. A final
    `{'new': None, 'completed': True, 'error': None}` entry is always appended
    when the function exits. If an unexpected exception occurs, an entry with
    `completed: True` and the error description is appended first and the
    exception is re-raised.

    :param stream_name: Un-prefixed stream name; the prefix is added here.
    :param msg_to_backend: Request body for the backend. Must contain `prompt`.
    :param backend_url: URL of the backend to generate with.
    :param event_id: ID of the request, used to listen for cancellation.
    :return: None
    """
    logger = create_logger('inferencer')
    prompt = msg_to_backend['prompt']
    stream_name = get_stream_name(stream_name)
    # `stream_name` is already prefixed at this point, so it must not be passed
    # through get_stream_name() again or the wrong key would be deleted.
    stream_redis.delete(stream_name)  # be extra sure

    event = threading.Event()
    # Daemon thread, like the workers themselves, so it can never hold up
    # interpreter shutdown.
    threading.Thread(target=check_cancellation, args=(event, event_id), daemon=True).start()
    try:
        response = generator(msg_to_backend, backend_url)
        generated_text = ''
        partial_response = b''
        for chunk in response.iter_content(chunk_size=1):
            # If there is no more data, break the loop
            if not chunk:
                break
            if event.is_set():
                logger.debug('Client canceled generation')
                response.close()
                return

            partial_response += chunk
            if not partial_response.endswith(b'\x00'):
                continue

            # Messages are NUL-terminated. The buffer ends on a terminator here,
            # so everything in it is complete and it can be reset. Without the
            # reset every earlier message would be re-parsed for each new token.
            json_strs = partial_response.split(b'\x00')
            partial_response = b''
            for json_str in json_strs:
                if not json_str:
                    continue
                try:
                    json_obj = json.loads(json_str.decode())
                    # The code takes whatever follows prompt + text-so-far as the
                    # newly generated part.
                    new = json_obj['text'][0].split(prompt + generated_text)[1]
                    generated_text = generated_text + new
                except IndexError:
                    # The returned text did not contain prompt + text-so-far,
                    # so there is nothing new to extract from this message.
                    continue
                stream_redis.xadd(stream_name, {'data': ujson.dumps({'new': new, 'completed': False, 'error': None})})
    except AttributeError as e:
        if str(e) == "'bool' object has no attribute 'iter_content'":
            # We don't care about these errors.
            logger.debug('failed to stream from backend - no response')
        else:
            raise
    except Exception as e:
        stream_redis.xadd(stream_name, {'data': ujson.dumps({'new': None, 'completed': True, 'error': f'{e.__class__.__name__}: {e}'})})
        raise  # We won't handle the exception here.
    finally:
        # Publish final message to Redis stream
        stream_redis.xadd(stream_name, {'data': ujson.dumps({'new': None, 'completed': True, 'error': None})})
        event.set()  # stop the cancellation checking thread


def worker(backend_url):
    """
    Run forever, pulling requests for `backend_url` off its priority queue and
    executing them (streaming or non-streaming).

    The result of each request is delivered through a `DataEvent` keyed by the
    request's event ID as a `(success, response, error_msg)` tuple. For
    streaming requests the `response` is the name of the Redis stream to read.

    :param backend_url: URL of the backend this worker serves.
    :return: Never returns.
    """
    logger = create_logger('inferencer')
    status_redis = RedisCustom('worker_status')
    worker_id = str(uuid4())
    status_redis.setp(str(worker_id), None)
    redis_queue = RedisPriorityQueue(backend_url)
    while True:
        status_redis.setp(str(worker_id), 'waiting...')
        (request_json_body, client_ip, token, parameters), event_id, selected_model, timestamp, do_stream = redis_queue.get()
        event = DataEvent(event_id)

        try:
            backend_info = cluster_config.get_backend(backend_url)
        except Exception:
            # This is not a critical error because it usually means that the backend is
            # offline and this backend is in a state of transition from online to offline.
            # The traceback is interpolated into the message: passing it as an
            # extra positional argument without a placeholder breaks log formatting.
            logger.debug(f'got an exception while getting info for backend {backend_url} - {traceback.format_exc()}')
            event.set((False, None, 'exception'))
            continue

        if not backend_info['online']:
            event.set((False, None, 'canceled'))
            continue

        if not selected_model:
            selected_model = backend_info['model']

        logger.debug(f"Starting using {backend_url} and {selected_model}. Online: {backend_info['online']}. Streaming: {do_stream}")

        try:
            stream_redis.delete(get_stream_name(worker_id))  # clean up any old streams
            increment_ip_count(client_ip, 'processing_ips')
            incr_active_workers(selected_model, backend_url)

            if do_stream:
                status_redis.setp(str(worker_id), ('streaming', client_ip))

                # Return the name of the stream that the slave should connect to.
                event.set((True, get_stream_name(worker_id), None))

                msg_to_backend = {
                    **parameters,
                    'prompt': request_json_body['prompt'],
                    'stream': True,
                }
                inference_do_stream(worker_id, msg_to_backend, backend_url, event_id)
            else:
                # Normal inference (not streaming).
                status_redis.setp(str(worker_id), ('generating', client_ip))
                success, response, error_msg = generator(request_json_body, backend_url)
                event.set((success, response, error_msg))
        except Exception:
            # Keep the worker alive no matter what a single request does.
            logger.error(traceback.format_exc())
            event.set((False, None, 'exception'))
        finally:
            decrement_ip_count(client_ip, 'processing_ips')
            decr_active_workers(selected_model, backend_url)
            status_redis.setp(str(worker_id), None)


def start_workers(cluster: dict):
    """
    Start the inference worker threads.

    For every item in `cluster`, `item['concurrent_gens']` daemon threads are
    started, each running `worker(item['backend_url'])`.

    :param cluster: Iterable of backend configs, each with `backend_url` and
        `concurrent_gens` keys.
    :return: None
    """
    logger = create_logger('inferencer')
    i = 0
    for item in cluster:
        for _ in range(item['concurrent_gens']):
            t = threading.Thread(target=worker, args=(item['backend_url'],))
            t.daemon = True
            t.start()
            i += 1
    logger.info(f'Started {i} inference workers.')