diff --git a/llm_server/routes/queue.py b/llm_server/routes/queue.py
index 4f98323..d38635f 100644
--- a/llm_server/routes/queue.py
+++ b/llm_server/routes/queue.py
@@ -22,7 +22,7 @@ class PriorityQueue:
         event = DataEvent()
         with self._cv:
             # Check if the IP is already in the dictionary and if it has reached the limit
-            if item[1] in self._ip_count and self._ip_count[item[1]] >= opts.ip_in_queue_max:
+            if item[1] in self._ip_count and self._ip_count[item[1]] >= opts.ip_in_queue_max and priority != 0:
                 return None  # reject the request
             heapq.heappush(self._queue, (-priority, self._index, item, event))
             self._index += 1
diff --git a/llm_server/routes/v1/generate.py b/llm_server/routes/v1/generate.py
index aef4fbe..f1526c0 100644
--- a/llm_server/routes/v1/generate.py
+++ b/llm_server/routes/v1/generate.py
@@ -51,7 +51,7 @@ def generate():
         else:
             print(f'Token {token} was given priority {priority}.')

-    if not redis.sismember('processing_ips', client_ip):
+    if not redis.sismember('processing_ips', client_ip) or priority == 0:
         event = priority_queue.put((request_json_body, client_ip, token, parameters), priority)
     else:
         event = None
@@ -69,8 +69,6 @@ def generate():
     else:
         raise Exception
     return jsonify({
-        # 'code': 429,
-        # 'error': f'no more than {opts.ip_in_queue_max} simultaneous requests per IP',
         **response_json_body
     }), 200

diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py
index eb3e9d1..fd445e3 100644
--- a/llm_server/routes/v1/generate_stats.py
+++ b/llm_server/routes/v1/generate_stats.py
@@ -95,6 +95,7 @@ def generate_stats():
             'queue_size': opts.concurrent_gens,
             'model': model_name,
             'mode': opts.mode,
+            'simultaneous_requests': opts.ip_in_queue_max,
         },
         'keys': {
             'openaiKeys': '∞',
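
Taken together, these changes let priority-0 requests bypass the per-IP simultaneous-request cap: the queue no longer rejects them when opts.ip_in_queue_max is reached, generate() enqueues them even if the client IP is already in processing_ips, and the cap is now reported in the stats payload as simultaneous_requests. Below is a minimal, self-contained sketch of the queue-side behavior; it is an illustration only, with the cap hard-coded and the repo's DataEvent/opts/redis plumbing left out.

# Sketch only: a simplified put() showing the priority-0 bypass.
# The real implementation stores a DataEvent per entry and reads the cap from
# opts.ip_in_queue_max; here both are replaced with stand-ins.
import heapq
import threading

IP_IN_QUEUE_MAX = 2  # stand-in for opts.ip_in_queue_max


class SketchPriorityQueue:
    def __init__(self):
        self._queue = []
        self._index = 0
        self._ip_count = {}
        self._cv = threading.Condition()

    def put(self, item, priority):
        client_ip = item[1]
        with self._cv:
            # Reject only when the IP is at the cap AND the request is not priority 0.
            if self._ip_count.get(client_ip, 0) >= IP_IN_QUEUE_MAX and priority != 0:
                return None
            heapq.heappush(self._queue, (-priority, self._index, item))
            self._index += 1
            self._ip_count[client_ip] = self._ip_count.get(client_ip, 0) + 1
            self._cv.notify()
            return True  # the real put() returns the DataEvent instead


q = SketchPriorityQueue()
req = ({'prompt': 'hi'}, '203.0.113.5', None, {})
print(q.put(req, 1))  # True
print(q.put(req, 1))  # True
print(q.put(req, 1))  # None: third queued request from this IP exceeds the cap
print(q.put(req, 0))  # True: priority 0 bypasses the per-IP cap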