local-llm-server/llm_server/routes/queue.py

import heapq
import threading
import time

from llm_server import opts
from llm_server.llm.generator import generator
from llm_server.routes.cache import redis
from llm_server.routes.stats import generation_elapsed, generation_elapsed_lock
processing_ips = set()
processing_ips_lock = threading.Lock()


class PriorityQueue:
    def __init__(self):
        self._queue = []
        self._index = 0  # monotonic counter: breaks ties so equal priorities pop FIFO
        self._cv = threading.Condition()
        self._ip_count = {}  # number of queued requests per client IP

    def put(self, item, priority):
        # item is (request_json_body, client_ip, token, parameters); item[1] is the client IP
        event = DataEvent()
        with self._cv:
            # Reject the request if this IP has already reached the queue limit.
            # Priority 0 requests bypass the limit.
            if item[1] in self._ip_count and self._ip_count[item[1]] >= opts.ip_in_queue_max and priority != 0:
                return None  # reject the request
            # heapq is a min-heap, so the priority is negated: higher priority pops first
            heapq.heappush(self._queue, (-priority, self._index, item, event))
            self._index += 1
            # Increment the count for this IP
            self._ip_count[item[1]] = self._ip_count.get(item[1], 0) + 1
            self._cv.notify()
        return event
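
    # A sketch of put()'s contract (illustrative only, assuming
    # opts.ip_in_queue_max == 1; item_from_ip stands for any
    # (body, ip, token, params) tuple coming from a single client):
    #
    #     e1 = priority_queue.put(item_from_ip, 1)  # -> DataEvent, queued
    #     e2 = priority_queue.put(item_from_ip, 1)  # -> None, IP over the limit
    #     e3 = priority_queue.put(item_from_ip, 0)  # -> DataEvent, priority 0 bypasses the limit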

    def get(self):
        with self._cv:
            # Block until a request is queued
            while len(self._queue) == 0:
                self._cv.wait()
            _, _, item, event = heapq.heappop(self._queue)
            # Decrement the count for this IP
            self._ip_count[item[1]] -= 1
            if self._ip_count[item[1]] == 0:
                del self._ip_count[item[1]]  # Remove the IP from the dictionary if count is 0
            return item, event

    def __len__(self):
        return len(self._queue)
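
    # Illustration of the heap ordering (a sketch, not executed code): with
    # entries of the form (-priority, _index, item, event), the min-heap pops
    # the highest priority first and breaks ties in arrival order:
    #
    #     (-2, 0, item_a, event_a)   # priority 2: served first
    #     (-1, 1, item_b, event_b)   # priority 1: arrived before item_c
    #     (-1, 2, item_c, event_c)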


priority_queue = PriorityQueue()


class DataEvent(threading.Event):
    """An Event that also carries the worker's result back to the waiting request thread."""

    def __init__(self):
        super().__init__()
        self.data = None


def worker():
    while True:
        # priority_queue.get() blocks until a request is available
        (request_json_body, client_ip, token, parameters), event = priority_queue.get()
        redis.sadd('processing_ips', client_ip)
        redis.incr('active_gen_workers')
        start_time = time.time()

        success, response, error_msg = generator(request_json_body)
        end_time = time.time()
        elapsed_time = end_time - start_time
        # Record (timestamp, duration) for the stats endpoints
        with generation_elapsed_lock:
            generation_elapsed.append((end_time, elapsed_time))

        # Publish the result and wake the request thread waiting on this event
        event.data = (success, response, error_msg)
        event.set()
        redis.srem('processing_ips', client_ip)
        redis.decr('active_gen_workers')


def start_workers(num_workers: int):
    for _ in range(num_workers):
        threading.Thread(target=worker).start()
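

# A minimal usage sketch (hypothetical caller; the real route handlers live
# elsewhere in llm_server.routes): enqueue a request, then block on the
# returned DataEvent until a worker publishes the result into event.data.
#
#     start_workers(2)
#     event = priority_queue.put((request_json_body, client_ip, token, parameters), 1)
#     if event is None:
#         pass  # this IP has too many queued requests -> e.g. return HTTP 429
#     else:
#         event.wait()
#         success, response, error_msg = event.data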