adjust logging, add more vllm stuff
parent e053f48fdc
commit bcedd2ab3d
@@ -0,0 +1,4 @@
+```bash
+wget https://git.evulid.cc/attachments/6e7bfc04-cad4-4494-a98d-1391fbb402d3 -O /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl && pip install /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl && rm /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl
+pip install auto_gptq
+```
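A quick sanity check after following the new README (the version string is an assumption based on the wheel filename, not something stated in the diff):

```python
# Verify the wheel from the README installed; 0.1.3 is assumed from the filename.
import vllm

print(vllm.__version__)
```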
@@ -18,14 +18,16 @@ def init_db():
 CREATE TABLE prompts (
     ip TEXT,
     token TEXT DEFAULT NULL,
-    backend TEXT,
+    model TEXT,
+    backend_mode TEXT,
+    backend_url TEXT,
+    request_url TEXT,
+    generation_time FLOAT,
     prompt TEXT,
     prompt_tokens INTEGER,
     response TEXT,
     response_tokens INTEGER,
     response_status INTEGER,
-    generation_time FLOAT,
-    model TEXT,
     parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)),
     headers TEXT CHECK (headers IS NULL OR json_valid(headers)),
     timestamp INTEGER
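Read straight through, the hunk above leaves `prompts` with 15 columns. A consolidated sketch of the resulting DDL, pulled out of the interleaved `-`/`+` lines (the standalone execution below is illustrative only; the surrounding `init_db()` code is not part of this hunk):

```python
# Consolidated view of the post-commit prompts table.
import sqlite3

NEW_PROMPTS_TABLE = """
CREATE TABLE prompts (
    ip TEXT,
    token TEXT DEFAULT NULL,
    model TEXT,
    backend_mode TEXT,
    backend_url TEXT,
    request_url TEXT,
    generation_time FLOAT,
    prompt TEXT,
    prompt_tokens INTEGER,
    response TEXT,
    response_tokens INTEGER,
    response_status INTEGER,
    parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)),
    headers TEXT CHECK (headers IS NULL OR json_valid(headers)),
    timestamp INTEGER
)
"""

conn = sqlite3.connect(':memory:')
conn.execute(NEW_PROMPTS_TABLE)  # json_valid() needs SQLite's JSON1 extension (standard in modern builds)
conn.close()
```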
@@ -46,7 +48,7 @@ def init_db():
     conn.close()
 
 
-def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, response_tokens: int = None, is_error: bool = False):
+def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, request_url, response_tokens: int = None, is_error: bool = False):
     prompt_tokens = len(tokenizer.encode(prompt))
 
     if not is_error:
@@ -73,7 +75,7 @@ def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backe
     conn = sqlite3.connect(opts.database_path)
     c = conn.cursor()
     c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
-              (ip, token, opts.mode, prompt, prompt_tokens, response, response_tokens, backend_response_code, gen_time, opts.running_model, json.dumps(parameters), json.dumps(headers), timestamp))
+              (ip, token, opts.running_model, opts.mode, opts.backend_url, request_url, gen_time, prompt, prompt_tokens, response, response_tokens, backend_response_code, json.dumps(parameters), json.dumps(headers), timestamp))
     conn.commit()
     conn.close()
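Note that the unchanged `c.execute` context line above still lists 13 `?` placeholders, while the new value tuple and the new table both have 15 entries. A minimal sketch with the counts matched, following the column order from the schema hunk (the helper name, standalone form, and argument list are mine, not the project's):

```python
# Sketch only: 15 placeholders to match the 15-column prompts table.
import json
import sqlite3
import time


def insert_prompt_row(db_path, ip, token, model, backend_mode, backend_url, request_url,
                      gen_time, prompt, prompt_tokens, response, response_tokens,
                      response_status, parameters, headers):
    conn = sqlite3.connect(db_path)
    conn.execute(
        "INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        (ip, token, model, backend_mode, backend_url, request_url, gen_time,
         prompt, prompt_tokens, response, response_tokens, response_status,
         json.dumps(parameters), json.dumps(headers), int(time.time())),
    )
    conn.commit()
    conn.close()
```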
@@ -1,10 +1,12 @@
 from typing import Tuple, Union
 
+import flask
+
 
 class LLMBackend:
     default_params: dict
 
-    def handle_response(self, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+    def handle_response(self, request: flask.Request, success, response: flask.Response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
         raise NotImplementedError
 
     def validate_params(self, params_dict: dict) -> Tuple[bool, str | None]:
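The backend hunks below all implement this widened signature. As orientation, a stripped-down subclass might look like the following; `EchoBackend`, the import paths, and the trivial response are illustrative, not taken from this commit:

```python
# Illustrative sketch only; the llm_server import paths are assumptions.
import flask
from flask import jsonify

from llm_server.database import log_prompt          # assumed module path
from llm_server.llm.llm_backend import LLMBackend   # assumed module path


class EchoBackend(LLMBackend):
    default_params = {}

    def handle_response(self, request: flask.Request, success, response: flask.Response,
                        error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
        # The new `request` parameter is what lets backends pass request.url through to log_prompt().
        log_prompt(client_ip, token, prompt, prompt, elapsed_time, parameters, headers,
                   200, request.url, is_error=False)
        return jsonify({'results': [{'text': prompt}]}), 200
```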
@@ -9,7 +9,7 @@ from ...routes.helpers.http import validate_json
 
 
 class OobaboogaBackend(LLMBackend):
-    def handle_response(self, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+    def handle_response(self, request, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
         backend_err = False
         response_valid_json, response_json_body = validate_json(response)
         try:
@@ -22,7 +22,7 @@ class OobaboogaBackend(LLMBackend):
         # We encountered an error
         if not success or not response:
             backend_response = format_sillytavern_err(f'Failed to reach the backend (oobabooga): {error_msg}', 'error')
-            log_prompt(client_ip, token, prompt, backend_response, None, parameters, headers, response if response else 0, is_error=True)
+            log_prompt(client_ip, token, prompt, backend_response, None, parameters, headers, response if response else 0, request.url, is_error=True)
             return jsonify({
                 'code': 500,
                 'msg': 'failed to reach backend',
@@ -43,13 +43,13 @@ class OobaboogaBackend(LLMBackend):
             if not backend_err:
                 redis.incr('proompts')
 
-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
             return jsonify({
                 **response_json_body
             }), 200
         else:
             backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code, is_error=True)
+            log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code, request.url, is_error=True)
             return jsonify({
                 'code': 500,
                 'msg': 'the backend did not return valid JSON',
@@ -17,7 +17,7 @@ from llm_server.routes.helpers.http import validate_json
 class VLLMBackend(LLMBackend):
     default_params = vars(SamplingParams())
 
-    def handle_response(self, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
+    def handle_response(self, request, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
         response_valid_json, response_json_body = validate_json(response)
         backend_err = False
         try:
@@ -41,11 +41,11 @@ class VLLMBackend(LLMBackend):
             # f'HTTP CODE {response_status_code}'
             # )
 
-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
             return jsonify({'results': [{'text': backend_response}]}), 200
         else:
             backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code if response else None, is_error=True)
+            log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code if response else None, request.url, is_error=True)
             return jsonify({
                 'code': 500,
                 'msg': 'the backend did not return valid JSON',
@@ -27,7 +27,7 @@ class OobaRequestHandler(RequestHandler):
             error_messages = [msg for valid, msg in [request_valid, params_valid] if not valid and msg]
             combined_error_message = ', '.join(error_messages)
             err = format_sillytavern_err(f'Validation Error: {combined_error_message}.', 'error')
-            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, is_error=True)
+            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, self.request.url, is_error=True)
             # TODO: add a method to LLMBackend to return a formatted response string, since we have both Ooba and OpenAI response types
             return jsonify({
                 'code': 400,
@@ -54,11 +54,11 @@ class OobaRequestHandler(RequestHandler):
         elapsed_time = end_time - self.start_time
 
         self.used = True
-        return self.backend.handle_response(success, response, error_msg, self.client_ip, self.token, prompt, elapsed_time, self.parameters, dict(self.request.headers))
+        return self.backend.handle_response(self.request, success, response, error_msg, self.client_ip, self.token, prompt, elapsed_time, self.parameters, dict(self.request.headers))
 
     def handle_ratelimited(self):
         backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, is_error=True)
+        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
         return jsonify({
             'results': [{'text': backend_response}]
         }), 200
@@ -35,7 +35,7 @@ class OpenAIRequestHandler(RequestHandler):
             error_messages = [msg for valid, msg in [request_valid, params_valid] if not valid and msg]
             combined_error_message = ', '.join(error_messages)
             err = format_sillytavern_err(f'Validation Error: {combined_error_message}.', 'error')
-            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, is_error=True)
+            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, self.request.url, is_error=True)
             # TODO: add a method to LLMBackend to return a formatted response string, since we have both Ooba and OpenAI response types
             return jsonify({
                 'code': 400,
@@ -67,7 +67,7 @@ class OpenAIRequestHandler(RequestHandler):
 
     def handle_ratelimited(self):
         backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, is_error=True)
+        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
         return build_openai_response(self.prompt, backend_response), 200
 
     def transform_messages_to_prompt(self):
@@ -2,6 +2,8 @@ import sqlite3
 import time
 from typing import Union
 
+import flask
+
 from llm_server import opts
 from llm_server.llm.oobabooga.ooba_backend import OobaboogaBackend
 from llm_server.llm.vllm.vllm_backend import VLLMBackend
@@ -12,7 +14,7 @@ DEFAULT_PRIORITY = 9999
 
 
 class RequestHandler:
-    def __init__(self, incoming_request):
+    def __init__(self, incoming_request: flask.Request):
         self.request_json_body = None
         self.request = incoming_request
         self.start_time = time.time()
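The new type hint is satisfied by Flask's request proxy inside a view function. A self-contained sketch of that wiring; the stand-in handler class and the route path are illustrative, not part of this commit:

```python
# Illustrative only: how a flask.Request reaches the handler's constructor.
import time

import flask

app = flask.Flask(__name__)


class EchoRequestHandler:
    """Stand-in mimicking RequestHandler.__init__(incoming_request: flask.Request) above."""

    def __init__(self, incoming_request: flask.Request):
        self.request_json_body = None
        self.request = incoming_request
        self.start_time = time.time()


@app.route('/api/v1/generate', methods=['POST'])
def generate():
    handler = EchoRequestHandler(flask.request)  # flask.request is a context-local flask.Request
    return flask.jsonify({'request_url': handler.request.url}), 200
```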
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Expected to be run as root in some sort of container
+
+cd /tmp || exit
+
+if [ ! -d /tmp/vllm-gptq ]; then
+  git clone https://github.com/chu-tianxiang/vllm-gptq.git
+  cd vllm-gptq || exit
+else
+  cd vllm-gptq || exit
+  git pull
+fi
+
+if [ ! -d /root/miniconda3 ]; then
+  wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3-latest-Linux-x86_64.sh
+  bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b
+  rm /tmp/Miniconda3-latest-Linux-x86_64.sh
+fi
+
+eval "$(/root/miniconda3/bin/conda shell.bash hook)"
+
+if [ ! -d /root/miniconda3/envs/vllm-gptq ]; then
+  conda create --name vllm-gptq -c conda-forge python=3.11 -y
+  conda activate vllm-gptq
+  pip install ninja
+  conda install -y -c "nvidia/label/cuda-11.8.0" cuda==11.8
+  conda install -y cudatoolkit cudnn
+else
+  conda activate vllm-gptq
+fi
+
+pip install -r requirements.txt
+
+CUDA_HOME=/root/miniconda3/envs/vllm-gptq python setup.py bdist_wheel
+
+echo -e "\n\n===\nOUTPUT:"
+find /tmp/vllm-gptq -name '*.whl'
@@ -44,7 +44,7 @@ def get_requirements() -> List[str]:
 
 setuptools.setup(
     name="vllm-gptq",
-    version=find_version(get_path("vllm", "__init__.py")),
+    version=find_version(get_path("", "__init__.py")),
     author="vLLM Team",
     license="Apache 2.0",
     description="A high-throughput and memory-efficient inference and serving engine for LLMs",
@@ -12,6 +12,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 
+# python api_server.py --model /storage/oobabooga/one-click-installers/text-generation-webui/models/TheBloke_MythoMax-L2-13B-GPTQ/ --host 0.0.0.0 --port 7000 --max-num-batched-tokens 24576
+
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.
 app = FastAPI()
@@ -1,4 +0,0 @@
-```bash
-wget https://git.evulid.cc/attachments/6e7bfc04-cad4-4494-a98d-1391fbb402d3 -O vllm-0.1.3-cp311-cp311-linux_x86_64.whl && pip install vllm-0.1.3-cp311-cp311-linux_x86_64.whl
-pip install auto_gptq
-```