local-llm-server/llm_server/workers/inferencer.py

import json
import threading
import time
import traceback
from uuid import uuid4

import ujson
from redis import Redis

from llm_server.cluster.cluster_config import cluster_config
from llm_server.custom_redis import RedisCustom, redis
from llm_server.llm.generator import generator
from llm_server.routes.queue import DataEvent, RedisPriorityQueue, decr_active_workers, decrement_ip_count, incr_active_workers, increment_ip_count

# Dedicated Redis client (database 8, redis-py default host/port) that holds the
# per-worker token streams written by inference_do_stream() and cleared by worker().
stream_redis = Redis(db=8)

# Key prefix for those streams; get_stream_name() builds keys as '<prefix>:<name>'.
STREAM_NAME_PREFIX = 'stream'


def check_cancellation(event, event_id):
    """
    Watch the `notifications:<event_id>` pub/sub channel and set `event` once a
    `canceled` message arrives.

    This runs in a background thread so the main process isn't bogged down with
    Redis calls. Otherwise, the main process slows down to 1 token/sec.

    The loop also ends when another thread sets `event` (inference_do_stream()
    does this once generation is over), and the pub/sub connection is always
    closed on the way out.

    :param event: threading.Event shared with the generation loop.
    :param event_id: ID of the request; selects the notification channel.
    :return: None
    """
    pubsub = redis.pubsub()
    try:
        pubsub.subscribe(f'notifications:{event_id}')
        while not event.is_set():
            message = pubsub.get_message()
            if message and message['data'] == b'canceled':
                event.set()
                break
            # Poll every half second, but wake up immediately if the event gets
            # set elsewhere so this thread does not linger after generation ends.
            event.wait(0.5)
    finally:
        # Release the pub/sub connection deterministically instead of waiting
        # for garbage collection.
        pubsub.close()


def get_stream_name(name: str):
    """
    Build the Redis stream key for `name`.

    :param name: identifier to namespace (worker() passes its worker ID).
    :return: `name` prefixed with STREAM_NAME_PREFIX and a colon.
    """
    return '{}:{}'.format(STREAM_NAME_PREFIX, name)


def _extract_new_text(full_text, already_seen):
    """Return the part of `full_text` after the first `already_seen`, or None if it is absent."""
    if not already_seen:
        # str.partition() rejects an empty separator, and there is nothing to strip anyway.
        return full_text
    _, found, remainder = full_text.partition(already_seen)
    return remainder if found else None


def inference_do_stream(stream_name: str, msg_to_backend: dict, backend_url: str, event_id: str):
    """
    Stream a generation from the backend into a Redis stream.

    The backend response is read byte by byte. Frames are delimited by NUL
    bytes and each frame is a JSON object; the code treats `text[0]` of that
    object as the prompt followed by everything generated so far, so only the
    not-yet-seen suffix is forwarded. Every forwarded piece is added to the
    Redis stream as `{'new': <text>, 'completed': False, 'error': None}`.

    A background thread watches for a client cancellation; when it fires the
    loop stops early. On any exception an entry with `completed: True` and the
    error text is published. In every case (success, cancel, error) a final
    `{'new': None, 'completed': True, 'error': None}` entry is published last,
    so after an error the consumer sees the error entry first and the generic
    one second.

    :param stream_name: un-prefixed stream identifier (the worker ID).
    :param msg_to_backend: request body for the backend; must contain 'prompt'.
    :param backend_url: backend to send the request to.
    :param event_id: request ID used for the cancellation channel.
    :return: None
    """
    prompt = msg_to_backend['prompt']
    stream_name = get_stream_name(stream_name)
    # Be extra sure no stale entries are left. The name is already prefixed at
    # this point, so it must not be passed through get_stream_name() again.
    stream_redis.delete(stream_name)
    response = None
    event = threading.Event()
    threading.Thread(target=check_cancellation, args=(event, event_id)).start()
    try:
        response = generator(msg_to_backend, backend_url)
        generated_text = ''
        partial_response = b''
        for chunk in response.iter_content(chunk_size=1):
            # If there is no more data, break the loop
            if not chunk:
                break
            if event.is_set():
                print('Client canceled generation')
                return  # the finally clause closes the response

            partial_response += chunk
            if not partial_response.endswith(b'\x00'):
                continue

            # A complete frame has arrived. Reset the buffer so frames that
            # were already handled are not parsed again on the next one.
            json_strs = partial_response.split(b'\x00')
            partial_response = b''
            for json_str in json_strs:
                if not json_str:
                    continue
                json_obj = json.loads(json_str.decode())
                try:
                    full_text = json_obj['text'][0]
                except IndexError:
                    # Empty 'text' list: nothing to forward.
                    continue
                new = _extract_new_text(full_text, prompt + generated_text)
                if new is None:
                    # The frame does not contain what has been seen so far; skip it.
                    continue
                generated_text += new
                stream_redis.xadd(stream_name, {'data': ujson.dumps({'new': new, 'completed': False, 'error': None})})
    except Exception as e:
        stream_redis.xadd(stream_name, {'data': ujson.dumps({'new': None, 'completed': True, 'error': f'{e.__class__.__name__}: {e}'})})
        traceback.print_exc()
    finally:
        try:
            # Publish final message to Redis stream
            stream_redis.xadd(stream_name, {'data': ujson.dumps({'new': None, 'completed': True, 'error': None})})
        finally:
            # Even if the final publish fails, stop the cancellation checking
            # thread and release the backend connection.
            event.set()
            if response is not None:
                response.close()
        event.set()  # stop the cancellation checking thread


def worker(backend_url):
    """
    Worker loop: take queued requests for `backend_url` and run them.

    Each queue item is unpacked into the request body, client IP, token,
    generation parameters, event ID, selected model, timestamp and a streaming
    flag. For a streaming request the caller is first told (through a
    DataEvent) which Redis stream to read, then inference_do_stream() fills
    that stream. For a normal request the `(success, response, error_msg)`
    tuple returned by generator() is delivered through the DataEvent; if
    generator() raises, a failure tuple is delivered instead so the DataEvent
    is still set.

    The per-IP counter, the active-worker counter and this worker's status
    entry are updated before the request and reset afterwards, whatever the
    outcome.

    NOTE: when the backend is reported offline the function returns, which
    ends this worker thread; the request that was just dequeued receives no
    reply.

    :param backend_url: backend this worker serves; also selects the queue.
    :return: None (only returns when the backend is offline).
    """
    status_redis = RedisCustom('worker_status')
    worker_id = str(uuid4())
    status_redis.setp(str(worker_id), None)
    redis_queue = RedisPriorityQueue(backend_url)
    while True:
        (request_json_body, client_ip, token, parameters), event_id, selected_model, timestamp, do_stream = redis_queue.get()
        backend_info = cluster_config.get_backend(backend_url)

        if not backend_info['online']:
            # TODO: communicate to caller
            # redis.publish(event_id, 'offline')
            return

        if not selected_model:
            selected_model = backend_info['model']

        stream_redis.delete(get_stream_name(worker_id))  # clean up any old streams
        increment_ip_count(client_ip, 'processing_ips')
        incr_active_workers(selected_model, backend_url)
        status_redis.setp(str(worker_id), ('generating', client_ip))

        try:
            if do_stream:
                # Tell the caller which stream to connect to before generating.
                event = DataEvent(event_id)
                event.set((True, get_stream_name(worker_id), None))

                msg_to_backend = {
                    **parameters,
                    'prompt': request_json_body['prompt'],
                    'stream': True,
                }
                inference_do_stream(worker_id, msg_to_backend, backend_url, event_id)
            else:
                # Normal inference (not streaming).
                try:
                    success, response, error_msg = generator(request_json_body, backend_url)
                except Exception as e:
                    # Still answer the caller; otherwise the DataEvent is never set.
                    traceback.print_exc()
                    success, response, error_msg = False, None, f'{e.__class__.__name__}: {e}'
                event = DataEvent(event_id)
                event.set((success, response, error_msg))
        except Exception:
            # Keep the worker alive; the failure is only logged.
            traceback.print_exc()
        finally:
            decrement_ip_count(client_ip, 'processing_ips')
            decr_active_workers(selected_model, backend_url)
            status_redis.setp(str(worker_id), None)


def start_workers(cluster: dict):
    """
    Spawn the daemon inference threads for every backend entry in `cluster`.

    `cluster` is iterated, and each element is indexed with 'concurrent_gens'
    (how many worker threads to start) and 'backend_url' (passed to worker()).
    The total number of started threads is printed at the end.

    :param cluster: iterable of backend configuration mappings.
    :return: None
    """
    launched = []
    for backend in cluster:
        for _slot in range(backend['concurrent_gens']):
            thread = threading.Thread(target=worker, args=(backend['backend_url'],), daemon=True)
            thread.start()
            launched.append(thread)
    print(f'Started {len(launched)} inference workers.')