diff --git a/llm_server/database.py b/llm_server/database.py
index f5c2537..41c3d0d 100644
--- a/llm_server/database.py
+++ b/llm_server/database.py
@@ -160,7 +160,11 @@ def weighted_average_column_for_model(table_name, column_name, model_name, backe
 
         # if total_weight == 0:
         #     continue
 
-    calculated_avg = weighted_sum / total_weight
+    if total_weight > 0:
+        # Avoid division by zero
+        calculated_avg = weighted_sum / total_weight
+    else:
+        calculated_avg = 0
 
     conn.close()
diff --git a/llm_server/helpers.py b/llm_server/helpers.py
index 40dd81c..55df351 100644
--- a/llm_server/helpers.py
+++ b/llm_server/helpers.py
@@ -2,6 +2,8 @@ import json
 from collections import OrderedDict
 from pathlib import Path
 
+from flask import make_response
+
 
 def resolve_path(*p: str):
     return Path(*p).expanduser().resolve().absolute()
@@ -47,3 +49,11 @@ def indefinite_article(word):
         return 'an'
     else:
         return 'a'
+
+
+def jsonify_pretty(json_dict: dict, status=200, indent=4, sort_keys=True):
+    response = make_response(json.dumps(json_dict, indent=indent, sort_keys=sort_keys))
+    response.headers['Content-Type'] = 'application/json; charset=utf-8'
+    response.headers['mimetype'] = 'application/json'
+    response.status_code = status
+    return response
diff --git a/llm_server/llm/llm_backend.py b/llm_server/llm/llm_backend.py
index 53f43c9..d6a1f25 100644
--- a/llm_server/llm/llm_backend.py
+++ b/llm_server/llm/llm_backend.py
@@ -33,7 +33,6 @@ class LLMBackend:
     @staticmethod
     def validate_prompt(prompt: str) -> Tuple[bool, Union[str, None]]:
         prompt_len = len(tokenizer.encode(prompt))
-        print(prompt_len, opts.context_size)
         if prompt_len > opts.context_size - 10:  # Our tokenizer isn't 100% accurate so we cut it down a bit. TODO: add a tokenizer endpoint to VLLM
             return False, f'Token indices sequence length is longer than the specified maximum sequence length for this model ({prompt_len} > {opts.context_size}). Please lower your context size'
         return True, None
diff --git a/llm_server/routes/v1/proxy.py b/llm_server/routes/v1/proxy.py
index acd0797..05bb534 100644
--- a/llm_server/routes/v1/proxy.py
+++ b/llm_server/routes/v1/proxy.py
@@ -3,9 +3,10 @@ from flask import jsonify
 from . import bp
 from .generate_stats import generate_stats
 from ..cache import cache
+from ...helpers import jsonify_pretty
 
 
 @bp.route('/stats', methods=['GET'])
 @cache.cached(timeout=5, query_string=True)
 def get_stats():
-    return jsonify(generate_stats()), 200
+    return jsonify_pretty(generate_stats())
diff --git a/other/vllm/README.md b/other/vllm/README.md
index eb59ae8..ce50a19 100644
--- a/other/vllm/README.md
+++ b/other/vllm/README.md
@@ -1,9 +1,14 @@
 ### Nginx
 
-1. Make sure your proxies all have a long timeout:
+Make sure your proxies all have a long timeout:
 ```
 proxy_read_timeout 300;
 proxy_connect_timeout 300;
 proxy_send_timeout 300;
 ```
-The LLM middleware has a request timeout of 120 so this longer timeout is to avoid any issues.
\ No newline at end of file
+
+The LLM middleware has a request timeout of 95 seconds, so this longer timeout avoids any issues.
+
+### Model Preparation
+
+Make sure the maximum sequence length in your model's `tokenizer_config.json` is set equal to or greater than your token limit (e.g. `4096`).
diff --git a/templates/home.html b/templates/home.html
index ffebc11..4b9c153 100644
--- a/templates/home.html
+++ b/templates/home.html
@@ -69,6 +69,9 @@
+
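For the Model Preparation step added in the README hunk above, a minimal sketch (not part of this diff) of how the `tokenizer_config.json` check could be scripted. The `model_max_length` key, the example path, and the `4096` limit are assumptions for illustration, not values confirmed by this repository.

```python
import json
from pathlib import Path


def tokenizer_covers_limit(model_dir: str, token_limit: int) -> bool:
    """Check that the tokenizer's configured max length is >= the proxy's token limit."""
    config = json.loads((Path(model_dir) / 'tokenizer_config.json').read_text())
    # `model_max_length` is assumed here; adjust if your tokenizer config uses a different key.
    return config.get('model_max_length', 0) >= token_limit


if __name__ == '__main__':
    # Hypothetical path and limit, for illustration only.
    print(tokenizer_covers_limit('/path/to/model', 4096))
```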