diff --git a/llm_server/llm/hf_textgen/generate.py b/llm_server/llm/hf_textgen/generate.py
index 9b7dfdc..a864580 100644
--- a/llm_server/llm/hf_textgen/generate.py
+++ b/llm_server/llm/hf_textgen/generate.py
@@ -18,7 +18,7 @@ def prepare_json(json_data: dict):
     return {
         'inputs': json_data.get('prompt', ''),
         'parameters': {
-            'max_new_tokens': opts.token_limit - token_count,
+            'max_new_tokens': opts.context_size - token_count,
             'repetition_penalty': json_data.get('repetition_penalty', None),
             'seed': seed,
             'stop': json_data.get('stopping_strings', []),
diff --git a/llm_server/opts.py b/llm_server/opts.py
index f5be76a..a20a364 100644
--- a/llm_server/opts.py
+++ b/llm_server/opts.py
@@ -4,7 +4,7 @@ running_model = 'none'
 concurrent_gens = 3
 mode = 'oobabooga'
 backend_url = None
-token_limit = 5555
+context_size = 5555
 database_path = './proxy-server.db'
 auth_required = False
 log_prompts = False
diff --git a/llm_server/routes/v1/generate.py b/llm_server/routes/v1/generate.py
index 58f56a8..14c67b5 100644
--- a/llm_server/routes/v1/generate.py
+++ b/llm_server/routes/v1/generate.py
@@ -64,7 +64,7 @@ def generate():
     backend_response = safe_list_get(response_json_body.get('results', []), 0, {}).get('text')
     if not backend_response:
         if opts.mode == 'oobabooga':
-            backend_response = format_sillytavern_err(f'Backend returned an empty string. This can happen when your parameters are incorrect. Make sure your context size is no greater than {opts.token_limit}.', 'error')
+            backend_response = format_sillytavern_err(f'Backend returned an empty string. This can happen when your parameters are incorrect. Make sure your context size is no greater than {opts.context_size}.', 'error')
             response_json_body['results'][0]['text'] = backend_response
         else:
             raise Exception
diff --git a/llm_server/routes/v1/proxy.py b/llm_server/routes/v1/proxy.py
index ec2b8f7..b5e69c4 100644
--- a/llm_server/routes/v1/proxy.py
+++ b/llm_server/routes/v1/proxy.py
@@ -17,21 +17,30 @@ from ...llm.info import get_running_model
 @cache.cached(timeout=60, query_string=True)
 @cache_control(60)
 def get_stats():
-    model_list = get_running_model()
+    model_list = get_running_model()  # will return False when the fetch fails
     if isinstance(model_list, bool):
-        # get_running_model() will return False when the fetch fails
         online = False
     else:
         online = True
 
     return jsonify({
-        'proompters_now': opts.concurrent_gens - concurrent_semaphore._value,
-        'proompters_1_min': proompters_1_min,
-        'total_proompts': stats.proompts.value,
-        'uptime': int((datetime.now() - stats.start_time).total_seconds()),
+        'stats': {
+            'proompters_now': opts.concurrent_gens - concurrent_semaphore._value,
+            'proompters_1_min': proompters_1_min,
+            'total_proompts': stats.proompts.value,
+            'uptime': int((datetime.now() - stats.start_time).total_seconds()),
+        },
         'online': online,
         'mode': opts.mode,
         'model': get_running_model(),
-        'client': f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}',
-        'timestamp': int(time.time())
+        'endpoints': {
+            'blocking': f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}',
+        },
+        'timestamp': int(time.time()),
+        'openaiKeys': '∞',
+        'anthropicKeys': '∞',
+        'config': {
+            'gatekeeper': 'none' if opts.auth_required is False else 'token',
+            'context_size': opts.context_size,
+        }
     }), 200
diff --git a/server.py b/server.py
index 820d6af..fd17771 100644
--- a/server.py
+++ b/server.py
@@ -46,7 +46,7 @@ opts.auth_required = config['auth_required']
 opts.log_prompts = config['log_prompts']
 opts.concurrent_gens = config['concurrent_gens']
 opts.frontend_api_client = config['frontend_api_client']
-opts.token_limit = config['token_limit']
+opts.context_size = config['token_limit']
 
 app = Flask(__name__)
 cache.init_app(app)
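
For reference, a minimal sketch of the response body the reworked get_stats() handler produces after this change. All values below are illustrative placeholders and the hostname is hypothetical; at runtime they come from opts, the concurrency semaphore, the stats counters, and the request's Host header.

# Python sketch of the reshaped /stats payload; placeholder values only.
example_stats_response = {
    'stats': {
        'proompters_now': 1,        # opts.concurrent_gens - concurrent_semaphore._value
        'proompters_1_min': 4,
        'total_proompts': 1024,
        'uptime': 3600,             # seconds since stats.start_time
    },
    'online': True,
    'mode': 'oobabooga',
    'model': 'some-model',          # whatever get_running_model() reports
    'endpoints': {
        'blocking': 'https://example.com/api/v1',  # hypothetical Host + opts.frontend_api_client
    },
    'timestamp': 1693000000,
    'openaiKeys': '∞',
    'anthropicKeys': '∞',
    'config': {
        'gatekeeper': 'none',       # 'token' when opts.auth_required is True
        'context_size': 5555,       # opts.context_size (formerly opts.token_limit)
    },
}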