Use the default model rather than the default backend, adjust moderation endpoint logic and add a timeout, exclude system tokens from recent proompters, derive the number of moderation workers from endpoint concurrent gens, adjust homepage
This commit is contained in: parent 63c12ea830, commit 32ad97e57c
@@ -23,12 +23,14 @@ def get_model_choices(regen: bool = False):
context_size = []
avg_gen_per_worker = []
concurrent_gens = 0
for backend_url in b:
backend_info = cluster_config.get_backend(backend_url)
if backend_info.get('model_config'):
context_size.append(backend_info['model_config']['max_position_embeddings'])
if backend_info.get('average_generation_elapsed_sec'):
avg_gen_per_worker.append(backend_info['average_generation_elapsed_sec'])
concurrent_gens += backend_info['concurrent_gens']

active_gen_workers = get_active_gen_workers_model(model)
proompters_in_queue = priority_queue.len(model)
@@ -37,16 +39,10 @@ def get_model_choices(regen: bool = False):
average_generation_elapsed_sec = np.average(avg_gen_per_worker)
else:
average_generation_elapsed_sec = 0
estimated_wait_sec = calculate_wait_time(average_generation_elapsed_sec, proompters_in_queue, opts.concurrent_gens, active_gen_workers)

if proompters_in_queue == 0 and active_gen_workers >= opts.concurrent_gens:
# There will be a wait if the queue is empty but prompts are processing, but we don't
# know how long.
estimated_wait_sec = f"less than {estimated_wait_sec} seconds"
else:
estimated_wait_sec = f"{estimated_wait_sec} seconds"
estimated_wait_sec = calculate_wait_time(average_generation_elapsed_sec, proompters_in_queue, concurrent_gens, active_gen_workers)

model_choices[model] = {
'model': model,
'client_api': f'https://{base_client_api}/{model}',
'ws_client_api': f'wss://{base_client_api}/{model}/v1/stream' if opts.enable_streaming else None,
'openai_client_api': f'https://{base_client_api}/openai/{model}' if opts.enable_openi_compatible_backend else 'disabled',
@@ -55,6 +51,7 @@ def get_model_choices(regen: bool = False):
'queued': proompters_in_queue,
'processing': active_gen_workers,
'avg_generation_time': average_generation_elapsed_sec,
'concurrent_gens': concurrent_gens
}

if len(context_size):
@@ -63,30 +60,8 @@ def get_model_choices(regen: bool = False):
# Python wants to sort lowercase vs. uppercase letters differently.
model_choices = dict(sorted(model_choices.items(), key=lambda item: item[0].upper()))

default_backend = get_a_cluster_backend()
default_backend_dict = {}
if default_backend:
default_backend_info = cluster_config.get_backend(default_backend)
default_context_size = default_backend_info['model_config']['max_position_embeddings']
default_average_generation_elapsed_sec = default_backend_info.get('average_generation_elapsed_sec')
default_active_gen_workers = redis.get(f'active_gen_workers:{default_backend}', dtype=int, default=0)
default_proompters_in_queue = priority_queue.len(default_backend_info['model'])
default_estimated_wait_sec = calculate_wait_time(default_average_generation_elapsed_sec, default_proompters_in_queue, default_backend_info['concurrent_gens'], default_active_gen_workers)
default_backend_url = get_a_cluster_backend()
default_model = cluster_config.get_backend(default_backend_url)['model']

default_backend_dict = {
'client_api': f'https://{base_client_api}',
'ws_client_api': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
'openai_client_api': f'https://{base_client_api}/openai' if opts.enable_openi_compatible_backend else 'disabled',
'estimated_wait': default_estimated_wait_sec,
'queued': default_proompters_in_queue,
'processing': default_active_gen_workers,
'context_size': default_context_size,
'hash': default_backend_info['hash'],
'model': default_backend_info['model'],
'avg_generation_time': default_average_generation_elapsed_sec,
'online': True
}

redis.setp('model_choices', (model_choices, default_backend_dict))

return model_choices, default_backend_dict
redis.setp('model_choices', (model_choices, default_model))
return model_choices, default_model
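
Note on the wait-time call above: the per-model total of concurrent_gens (summed from each backend's config) now replaces the global opts.concurrent_gens in the estimate. calculate_wait_time() itself is not part of this diff, so the sketch below is only an assumption of how such an estimate might combine the four arguments, not the project's actual implementation.

# Hypothetical sketch -- not the project's real calculate_wait_time().
def calculate_wait_time(avg_gen_sec, queued, concurrent_gens, active_workers):
    if concurrent_gens <= 0 or avg_gen_sec <= 0:
        return 0
    free_slots = max(concurrent_gens - active_workers, 0)
    if queued <= free_slots:
        # New requests can start immediately and only wait on their own generation.
        return int(avg_gen_sec)
    # Otherwise queued requests drain through the model's slots in waves.
    waves = (queued - free_slots) / concurrent_gens
    return int(avg_gen_sec * (waves + 1))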
@@ -1,6 +1,7 @@
import yaml

config_default_vars = {
'frontend_api_mode': 'ooba',
'log_prompts': False,
'database_path': './proxy-server.db',
'auth_required': False,
@@ -28,19 +29,19 @@ config_default_vars = {
'openai_force_no_hashes': True,
'include_system_tokens_in_stats': True,
'openai_moderation_scan_last_n': 5,
'openai_moderation_workers': 10,
'openai_org_name': 'OpenAI',
'openai_silent_trim': False,
'openai_moderation_enabled': True,
'netdata_root': None,
'show_backends': True,
'cluster_workers': 30,
'background_homepage_cacher': True
'background_homepage_cacher': True,
'openai_moderation_timeout': 5
}
config_required_vars = ['cluster', 'mode', 'llm_middleware_name']
config_required_vars = ['cluster', 'llm_middleware_name']

mode_ui_names = {
'oobabooga': ('Text Gen WebUI (ooba)', 'Blocking API url', 'Streaming API url'),
'ooba': ('Text Gen WebUI (ooba)', 'Blocking API url', 'Streaming API url'),
'vllm': ('Text Gen WebUI (ooba)', 'Blocking API url', 'Streaming API url'),
}
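
The hunk above drops 'mode' from config_required_vars, adds the openai_moderation_timeout default, and fixes the missing comma after background_homepage_cacher. As a rough, assumed illustration of how these two structures are usually consumed (the project's real logic is in load_config() further down, and the helper name here is hypothetical):

# Illustrative only; see the actual load_config() later in this commit.
import yaml

def read_config(path):
    with open(path) as f:
        user_config = yaml.safe_load(f) or {}
    missing = [key for key in config_required_vars if key not in user_config]
    if missing:
        raise ValueError(f'Missing required config keys: {missing}')
    # Defaults (e.g. openai_moderation_timeout = 5) fill anything the user omitted.
    return {**config_default_vars, **user_config}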
@@ -16,15 +16,9 @@ def load_config(config_path):
if not success:
return success, config, msg

if config['mode'] not in ['oobabooga', 'vllm']:
print('Unknown mode:', config['mode'])
sys.exit(1)

# TODO: this is atrocious
opts.mode = config['mode']
opts.auth_required = config['auth_required']
opts.log_prompts = config['log_prompts']
opts.concurrent_gens = config['concurrent_gens']
opts.frontend_api_client = config['frontend_api_client']
opts.show_num_prompts = config['show_num_prompts']
opts.show_uptime = config['show_uptime']
@@ -47,13 +41,14 @@ def load_config(config_path):
opts.openai_force_no_hashes = config['openai_force_no_hashes']
opts.include_system_tokens_in_stats = config['include_system_tokens_in_stats']
opts.openai_moderation_scan_last_n = config['openai_moderation_scan_last_n']
opts.openai_moderation_workers = config['openai_moderation_workers']
opts.openai_org_name = config['openai_org_name']
opts.openai_silent_trim = config['openai_silent_trim']
opts.openai_moderation_enabled = config['openai_moderation_enabled']
opts.show_backends = config['show_backends']
opts.cluster_workers = config['cluster_workers']
opts.background_homepage_cacher = config['background_homepage_cacher']
opts.openai_moderation_timeout = config['openai_moderation_timeout']
opts.frontend_api_mode = config['frontend_api_mode']

if opts.openai_expose_our_model and not opts.openai_api_key:
print('If you set openai_epose_our_model to false, you must set your OpenAI key in openai_api_key.')
@@ -75,8 +70,6 @@ def load_config(config_path):
if config['load_num_prompts']:
redis.set('proompts', get_number_of_rows('prompts'))

redis.set('backend_mode', opts.mode)

return success, config, msg
@@ -2,9 +2,7 @@
# TODO: rewrite the config system so I don't have to add every single config default here

concurrent_gens = 3
mode = 'oobabooga'
backend_url = None
frontend_api_mode = 'ooba'
max_new_tokens = 500
auth_required = False
log_prompts = False
@@ -31,7 +29,6 @@ openai_expose_our_model = False
openai_force_no_hashes = True
include_system_tokens_in_stats = True
openai_moderation_scan_last_n = 5
openai_moderation_workers = 10
openai_org_name = 'OpenAI'
openai_silent_trim = False
openai_moderation_enabled = True
@@ -39,3 +36,4 @@ cluster = {}
show_backends = True
cluster_workers = 30
background_homepage_cacher = True
openai_moderation_timeout = 5
@@ -41,7 +41,7 @@ class OpenAIRequestHandler(RequestHandler):
if opts.openai_moderation_enabled and opts.openai_api_key and is_api_key_moderated(self.token):
print('moderating', self.token)
try:
# Gather the last message from the user and all preceeding system messages
# Gather the last message from the user and all preceding system messages
msg_l = self.request.json['messages'].copy()
msg_l.reverse()
tag = uuid4()
@@ -78,8 +78,8 @@ class OpenAIRequestHandler(RequestHandler):
def handle_ratelimited(self, do_log: bool = True):
print('OAI ratelimited:', self.client_ip, self.request.headers)
_, default_backend_info = get_model_choices()
w = int(default_backend_info['estimated_wait']) if default_backend_info['estimated_wait'] > 0 else 2
_, default_model = get_model_choices()
w = int(default_model['estimated_wait']) if default_model['estimated_wait'] > 0 else 2
response = jsonify({
"error": {
"message": "Rate limit reached on tokens per min. Limit: 10000 / min. Please try again in 6s. Contact us through our help center at help.openai.com if you continue to have issues.",
@@ -44,6 +44,9 @@ class RequestHandler:
self.backend = get_backend_handler(self.cluster_backend_info['mode'], self.backend_url)
self.parameters = None
self.used = False

if not self.token.startswith('SYSTEM__'):
# "recent_prompters" is only used for stats.
redis.zadd('recent_prompters', {self.client_ip: time.time()})

def get_auth_token(self):
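
With the change above, requests authenticated with a SYSTEM__ token no longer land in the recent_prompters sorted set, so internal traffic stops inflating the proompter stats. The set is keyed by client IP with the current timestamp as score; below is a minimal, assumed sketch of how a stats reader could count entries over a sliding window (the project's own stats code is not shown here and may differ).

# Assumed reader for the "recent_prompters" sorted set written above.
import time
import redis

r = redis.Redis()

def count_recent_prompters(window_sec=5 * 60):
    cutoff = time.time() - window_sec
    # Drop entries older than the window, then count what is left.
    r.zremrangebyscore('recent_prompters', '-inf', cutoff)
    return r.zcard('recent_prompters')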
@@ -47,8 +47,8 @@ def generate_stats(regen: bool = False):
'timestamp': int(time.time()),
'config': {
'gatekeeper': 'none' if opts.auth_required is False else 'token',
'concurrent': opts.concurrent_gens,
'simultaneous_requests_per_ip': opts.simultaneous_requests_per_ip,
'api_mode': opts.frontend_api_mode
},
'keys': {
'openaiKeys': '∞',
@@ -76,7 +76,7 @@ def generate_stats(regen: bool = False):
else:
output['backend_info'] = {}

output['default'] = get_model_choices(regen=True)[1]
output['default_model'] = get_model_choices(regen=True)[1]

result = deep_sort(output)
@@ -1,9 +1,11 @@
import json
import threading
import time
import traceback

import redis as redis_redis

from llm_server import opts
from llm_server.llm.openai.moderation import check_moderation_endpoint

redis_moderation = redis_redis.Redis()
@@ -19,10 +21,32 @@ def start_moderation_workers(num_workers):
print(f'Started {i} moderation workers.')


# TODO: don't use UUID tags to identify items. Use native redis

def get_results(tag, num_tasks):
tag = str(tag)  # Cast a UUID4 to a string.
flagged_categories = set()
num_results = 0
start_time = time.time()
while num_results < num_tasks:
result = redis_moderation.blpop(['queue:flagged_categories'], timeout=opts.openai_moderation_timeout)
if result is None:
break  # Timeout occurred, break the loop.
result_tag, categories = json.loads(result[1])
if result_tag == tag:
if categories:
for item in categories:
flagged_categories.add(item)
num_results += 1
if time.time() - start_time > opts.openai_moderation_timeout:
print('Timed out waiting for result from moderator.')
break
return list(flagged_categories)


def moderation_worker():
print('moderator started')
while True:
result = redis_moderation.blpop('queue:msgs_to_check')
result = redis_moderation.blpop(['queue:msgs_to_check'])
try:
msg, tag = json.loads(result[1])
print(tag)
@@ -36,18 +60,3 @@ def moderation_worker():
def add_moderation_task(msg, tag):
redis_moderation.rpush('queue:msgs_to_check', json.dumps((msg, str(tag))))


def get_results(tag, num_tasks):
tag = str(tag)  # Required for comparison with Redis results.
flagged_categories = set()
num_results = 0
while num_results < num_tasks:
result = redis_moderation.blpop('queue:flagged_categories')
result_tag, categories = json.loads(result[1])
if result_tag == tag:
if categories:
for item in categories:
flagged_categories.add(item)
num_results += 1
return list(flagged_categories)
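
The reworked get_results() above bounds both the BLPOP call and the overall loop with opts.openai_moderation_timeout, so a stalled moderation worker can no longer hang a request indefinitely. Below is a hedged sketch of the producer side, modeled loosely on the OpenAIRequestHandler hunk earlier in this commit; the helper name, the message shape, and the use of openai_moderation_scan_last_n are assumptions.

# Illustrative caller only; the real wiring in the request handler may differ.
from uuid import uuid4

def moderate_last_messages(messages, scan_last_n):
    tag = uuid4()
    to_check = messages[-scan_last_n:]
    for msg in to_check:
        # add_moderation_task() and get_results() are the functions defined above.
        add_moderation_task(msg['content'], tag)
    # Blocks for at most roughly openai_moderation_timeout seconds, even if workers stall.
    return get_results(tag, len(to_check))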
@@ -2,6 +2,7 @@ import time
from threading import Thread

from llm_server import opts
from llm_server.cluster.cluster_config import cluster_config
from llm_server.cluster.stores import redis_running_models
from llm_server.cluster.worker import cluster_worker
from llm_server.routes.v1.generate_stats import generate_stats
@@ -26,7 +27,7 @@ def start_background():
t.start()
print('Started the main background thread.')

start_moderation_workers(opts.openai_moderation_workers)
start_moderation_workers(opts.cluster_workers * 3)

t = Thread(target=cache_stats)
t.daemon = True
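
The worker count above changes from opts.openai_moderation_workers to opts.cluster_workers * 3, which the commit message describes as deriving the number of moderators from the endpoints' concurrent generations. start_moderation_workers() itself is only partially visible in this diff; the following sketch of its thread fan-out is an assumption based on the print() shown in its hunk and the threading import.

# Assumed shape of start_moderation_workers(); the real function may differ.
import threading

def start_moderation_workers(num_workers):
    i = 0
    for _ in range(num_workers):
        # moderation_worker() is the loop defined in the moderation module above.
        t = threading.Thread(target=moderation_worker, daemon=True)
        t.start()
        i += 1
    print(f'Started {i} moderation workers.')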
server.py
@@ -24,12 +24,14 @@ from llm_server.routes.server_error import handle_server_error
from llm_server.routes.v1 import bp
from llm_server.sock import init_socketio

# TODO: make sure system tokens are excluded from 5/24 hr proompters
# TODO: implement blind RRD controlled via header and only used when there is a queue on the primary backend(s)
# TODO: is frequency penalty the same as ooba repetition penalty???
# TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
# TODO: if a backend is at its limit of concurrent requests, choose a different one

# Lower priority
# TODO: fix moderation freezing after a while
# TODO: support logit_bias on OpenAI and Ooba endpoints.
# TODO: add a way to cancel VLLM gens. Maybe use websockets?
# TODO: validate openai_silent_trim works as expected and only when enabled
@@ -94,14 +96,15 @@ create_db()
def home():
base_client_api = redis.get('base_client_api', dtype=str)
stats = generate_stats()
model_choices, default_backend_info = get_model_choices()
model_choices, default_model = get_model_choices()
default_model_info = model_choices[default_model]

if default_backend_info['queued'] == 0 and default_backend_info['queued'] >= opts.concurrent_gens:
if default_model_info['queued'] == 0 and default_model_info['queued'] >= default_model_info['concurrent_gens']:
# There will be a wait if the queue is empty but prompts are processing, but we don't
# know how long.
default_estimated_wait_sec = f"less than {int(default_backend_info['estimated_wait'])} seconds"
default_estimated_wait_sec = f"less than {int(default_model_info['estimated_wait'])} seconds"
else:
default_estimated_wait_sec = f"{int(default_backend_info['estimated_wait'])} seconds"
default_estimated_wait_sec = f"{int(default_model_info['estimated_wait'])} seconds"

if len(config['analytics_tracking_code']):
analytics_tracking_code = f"<script>\n{config['analytics_tracking_code']}\n</script>"
@@ -123,17 +126,17 @@ def home():
llm_middleware_name=opts.llm_middleware_name,
analytics_tracking_code=analytics_tracking_code,
info_html=info_html,
default_model=default_backend_info['model'],
default_active_gen_workers=default_backend_info['processing'],
default_proompters_in_queue=default_backend_info['queued'],
default_model=default_model_info['model'],
default_active_gen_workers=default_model_info['processing'],
default_proompters_in_queue=default_model_info['queued'],
current_model=opts.manual_model_name if opts.manual_model_name else None,  # else running_model,
client_api=f'https://{base_client_api}',
ws_client_api=f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else 'disabled',
default_estimated_wait=default_estimated_wait_sec,
mode_name=mode_ui_names[opts.mode][0],
api_input_textbox=mode_ui_names[opts.mode][1],
streaming_input_textbox=mode_ui_names[opts.mode][2],
default_context_size=default_backend_info['context_size'],
mode_name=mode_ui_names[opts.frontend_api_mode][0],
api_input_textbox=mode_ui_names[opts.frontend_api_mode][1],
streaming_input_textbox=mode_ui_names[opts.frontend_api_mode][2],
default_context_size=default_model_info['context_size'],
stats_json=json.dumps(stats, indent=4, ensure_ascii=False),
extra_info=mode_info,
openai_client_api=f'https://{base_client_api}/openai/v1' if opts.enable_openi_compatible_backend else 'disabled',
@@ -152,8 +152,16 @@
{% for key, value in model_choices.items() %}
<div class="info-box">
<h3>{{ key }} <span class="header-workers">- {{ value.backend_count }} {% if value.backend_count == 1 %}worker{% else %}workers{% endif %}</span></h3>

{% if value.estimated_wait == 0 and value.estimated_wait >= value.concurrent_gens %}
{# There will be a wait if the queue is empty but prompts are processing, but we don't know how long. #}
{% set estimated_wait_sec = "less than " + value.estimated_wait|int|string + " seconds" %}
{% else %}
{% set estimated_wait_sec = value.estimated_wait|int|string + " seconds" %}
{% endif %}

<p>
<strong>Estimated Wait Time:</strong> {{ value.estimated_wait }}<br>
<strong>Estimated Wait Time:</strong> {{ estimated_wait_sec }}<br>
Processing: {{ value.processing }}<br>
Queued: {{ value.queued }}<br>
</p>