adjust logging, add more vllm stuff

Cyberes 2023-09-13 11:22:33 -06:00
parent e053f48fdc
commit bcedd2ab3d
12 changed files with 70 additions and 24 deletions

VLLM INSTALL.md (new file, 4 additions)
View File

@@ -0,0 +1,4 @@
```bash
wget https://git.evulid.cc/attachments/6e7bfc04-cad4-4494-a98d-1391fbb402d3 -O /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl && pip install /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl && rm /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl
pip install auto_gptq
```
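
A quick, hedged sanity check after installing the wheel (assumes both packages expose `__version__`, which the snippet above does not guarantee):

```python
# Post-install smoke test: confirm the wheel and auto_gptq import cleanly.
# The __version__ attributes are an assumption for illustration.
import vllm
import auto_gptq

print("vllm:", vllm.__version__)
print("auto_gptq:", auto_gptq.__version__)
```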

View File

@@ -18,14 +18,16 @@ def init_db():
CREATE TABLE prompts (
ip TEXT,
token TEXT DEFAULT NULL,
-backend TEXT,
+model TEXT,
+backend_mode TEXT,
+backend_url TEXT,
+request_url TEXT,
+generation_time FLOAT,
prompt TEXT,
prompt_tokens INTEGER,
response TEXT,
response_tokens INTEGER,
response_status INTEGER,
-generation_time FLOAT,
-model TEXT,
parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)),
headers TEXT CHECK (headers IS NULL OR json_valid(headers)),
timestamp INTEGER
@@ -46,7 +48,7 @@ def init_db():
conn.close()
-def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, response_tokens: int = None, is_error: bool = False):
+def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, request_url, response_tokens: int = None, is_error: bool = False):
prompt_tokens = len(tokenizer.encode(prompt))
if not is_error:
@@ -73,7 +75,7 @@ def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backe
conn = sqlite3.connect(opts.database_path)
c = conn.cursor()
c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
-(ip, token, opts.mode, prompt, prompt_tokens, response, response_tokens, backend_response_code, gen_time, opts.running_model, json.dumps(parameters), json.dumps(headers), timestamp))
+(ip, token, opts.running_model, opts.mode, opts.backend_url, request_url, gen_time, prompt, prompt_tokens, response, response_tokens, backend_response_code, json.dumps(parameters), json.dumps(headers), timestamp))
conn.commit()
conn.close()
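
For reference, a hedged sketch of a call against the updated signature (variable names such as `client_ip` and `backend_response` come from the handler code further down; the real call sites appear in the backend diffs below):

```python
# Illustrative only: invoking the updated log_prompt with the new request_url field.
# client_ip, token, prompt, backend_response, elapsed_time, parameters and request
# are assumed to come from the surrounding request handler.
log_prompt(
    client_ip,
    token,
    prompt,
    backend_response,
    elapsed_time,          # gen_time; None when logging an error
    parameters,
    dict(request.headers),
    response.status_code,  # backend_response_code
    request.url,           # new: the URL the client hit on the proxy
    response_tokens=None,  # optional; filled in when the backend reports it
    is_error=False,
)
```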

View File

@@ -1,10 +1,12 @@
from typing import Tuple, Union
+import flask
class LLMBackend:
default_params: dict
-def handle_response(self, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+def handle_response(self, request: flask.Request, success, response: flask.Response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
raise NotImplementedError
def validate_params(self, params_dict: dict) -> Tuple[bool, str | None]:
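
A minimal sketch of a subclass against the updated interface, illustrating why the Flask request object is now threaded through (the import path for `LLMBackend` is an assumption; `EchoBackend` is hypothetical):

```python
# Hypothetical backend illustrating the new handle_response() signature: the Flask
# request is passed in so implementations can forward request.url to log_prompt().
import flask
from flask import jsonify

from llm_server.llm.llm_backend import LLMBackend  # assumed module path


class EchoBackend(LLMBackend):
    default_params: dict = {}

    def handle_response(self, request: flask.Request, success, response: flask.Response,
                        error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
        # A real implementation parses the upstream response and logs it with request.url,
        # as the Oobabooga and vLLM backends below do; here we simply echo the prompt.
        return jsonify({'results': [{'text': prompt}]}), 200
```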

View File

@@ -9,7 +9,7 @@ from ...routes.helpers.http import validate_json
class OobaboogaBackend(LLMBackend):
-def handle_response(self, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+def handle_response(self, request, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
backend_err = False
response_valid_json, response_json_body = validate_json(response)
try:
@@ -22,7 +22,7 @@ class OobaboogaBackend(LLMBackend):
# We encountered an error
if not success or not response:
backend_response = format_sillytavern_err(f'Failed to reach the backend (oobabooga): {error_msg}', 'error')
-log_prompt(client_ip, token, prompt, backend_response, None, parameters, headers, response if response else 0, is_error=True)
+log_prompt(client_ip, token, prompt, backend_response, None, parameters, headers, response if response else 0, request.url, is_error=True)
return jsonify({
'code': 500,
'msg': 'failed to reach backend',
@@ -43,13 +43,13 @@ class OobaboogaBackend(LLMBackend):
if not backend_err:
redis.incr('proompts')
-log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
return jsonify({
**response_json_body
}), 200
else:
backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
-log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code, is_error=True)
+log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code, request.url, is_error=True)
return jsonify({
'code': 500,
'msg': 'the backend did not return valid JSON',

View File

@@ -17,7 +17,7 @@ from llm_server.routes.helpers.http import validate_json
class VLLMBackend(LLMBackend):
default_params = vars(SamplingParams())
-def handle_response(self, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
+def handle_response(self, request, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
response_valid_json, response_json_body = validate_json(response)
backend_err = False
try:
@@ -41,11 +41,11 @@ class VLLMBackend(LLMBackend):
# f'HTTP CODE {response_status_code}'
# )
-log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
return jsonify({'results': [{'text': backend_response}]}), 200
else:
backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
-log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code if response else None, is_error=True)
+log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code if response else None, request.url, is_error=True)
return jsonify({
'code': 500,
'msg': 'the backend did not return valid JSON',

View File

@@ -27,7 +27,7 @@ class OobaRequestHandler(RequestHandler):
error_messages = [msg for valid, msg in [request_valid, params_valid] if not valid and msg]
combined_error_message = ', '.join(error_messages)
err = format_sillytavern_err(f'Validation Error: {combined_error_message}.', 'error')
-log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, is_error=True)
+log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, self.request.url, is_error=True)
# TODO: add a method to LLMBackend to return a formatted response string, since we have both Ooba and OpenAI response types
return jsonify({
'code': 400,
@@ -54,11 +54,11 @@ class OobaRequestHandler(RequestHandler):
elapsed_time = end_time - self.start_time
self.used = True
-return self.backend.handle_response(success, response, error_msg, self.client_ip, self.token, prompt, elapsed_time, self.parameters, dict(self.request.headers))
+return self.backend.handle_response(self.request, success, response, error_msg, self.client_ip, self.token, prompt, elapsed_time, self.parameters, dict(self.request.headers))
def handle_ratelimited(self):
backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, is_error=True)
+log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
return jsonify({
'results': [{'text': backend_response}]
}), 200

View File

@@ -35,7 +35,7 @@ class OpenAIRequestHandler(RequestHandler):
error_messages = [msg for valid, msg in [request_valid, params_valid] if not valid and msg]
combined_error_message = ', '.join(error_messages)
err = format_sillytavern_err(f'Validation Error: {combined_error_message}.', 'error')
-log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, is_error=True)
+log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, self.request.url, is_error=True)
# TODO: add a method to LLMBackend to return a formatted response string, since we have both Ooba and OpenAI response types
return jsonify({
'code': 400,
@@ -67,7 +67,7 @@ class OpenAIRequestHandler(RequestHandler):
def handle_ratelimited(self):
backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, is_error=True)
+log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
return build_openai_response(self.prompt, backend_response), 200
def transform_messages_to_prompt(self):

View File

@@ -2,6 +2,8 @@ import sqlite3
import time
from typing import Union
+import flask
from llm_server import opts
from llm_server.llm.oobabooga.ooba_backend import OobaboogaBackend
from llm_server.llm.vllm.vllm_backend import VLLMBackend
@@ -12,7 +14,7 @@ DEFAULT_PRIORITY = 9999
class RequestHandler:
-def __init__(self, incoming_request):
+def __init__(self, incoming_request: flask.Request):
self.request_json_body = None
self.request = incoming_request
self.start_time = time.time()

other/vllm/build-vllm.sh (new file, 38 additions)
View File

@@ -0,0 +1,38 @@
#!/bin/bash
# Expected to be run as root in some sort of container
cd /tmp || exit
if [ ! -d /tmp/vllm-gptq ]; then
git clone https://github.com/chu-tianxiang/vllm-gptq.git
cd vllm-gptq || exit
else
cd vllm-gptq || exit
git pull
fi
if [ ! -d /root/miniconda3 ]; then
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3-latest-Linux-x86_64.sh
bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b
rm /tmp/Miniconda3-latest-Linux-x86_64.sh
fi
eval "$(/root/miniconda3/bin/conda shell.bash hook)"
if [ ! -d /root/miniconda3/envs/vllm-gptq ]; then
conda create --name vllm-gptq -c conda-forge python=3.11 -y
conda activate vllm-gptq
pip install ninja
conda install -y -c "nvidia/label/cuda-11.8.0" cuda==11.8
conda install -y cudatoolkit cudnn
else
conda activate vllm-gptq
fi
pip install -r requirements.txt
CUDA_HOME=/root/miniconda3/envs/vllm-gptq python setup.py bdist_wheel
echo -e "\n\n===\nOUTPUT:"
find /tmp/vllm-gptq -name '*.whl'
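
After installing the wheel that the script prints at the end, a hedged smoke test (ties back to `default_params = vars(SamplingParams())` in the VLLMBackend diff above; assumes this fork keeps the stock `SamplingParams` class):

```python
# Verify the vllm-gptq build imports and expose its default sampler settings,
# the same values VLLMBackend seeds default_params with.
import vllm
from vllm import SamplingParams

print("vllm version:", vllm.__version__)
print("default sampling params:", vars(SamplingParams()))
```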

View File

@@ -44,7 +44,7 @@ def get_requirements() -> List[str]:
setuptools.setup(
name="vllm-gptq",
-version=find_version(get_path("vllm", "__init__.py")),
+version=find_version(get_path("", "__init__.py")),
author="vLLM Team",
license="Apache 2.0",
description="A high-throughput and memory-efficient inference and serving engine for LLMs",

View File

@@ -12,6 +12,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
+# python api_server.py --model /storage/oobabooga/one-click-installers/text-generation-webui/models/TheBloke_MythoMax-L2-13B-GPTQ/ --host 0.0.0.0 --port 7000 --max-num-batched-tokens 24576
TIMEOUT_KEEP_ALIVE = 5 # seconds.
TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds.
app = FastAPI()
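
Given the launch command in the comment above, a hedged client-side check (assumes this fork keeps the stock vLLM `/generate` route and its `{"text": [...]}` response shape; host and port follow the example invocation):

```python
# Illustrative request against the api_server started with the command above.
# Endpoint path, payload fields and response shape are assumptions based on the
# stock vLLM api_server that this fork appears to extend.
import json
import urllib.request

payload = {
    "prompt": "Once upon a time",
    "max_tokens": 32,
    "temperature": 0.7,
    "stream": False,
}
req = urllib.request.Request(
    "http://127.0.0.1:7000/generate",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["text"])
```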

View File

@@ -1,4 +0,0 @@
```bash
wget https://git.evulid.cc/attachments/6e7bfc04-cad4-4494-a98d-1391fbb402d3 -O vllm-0.1.3-cp311-cp311-linux_x86_64.whl && pip install vllm-0.1.3-cp311-cp311-linux_x86_64.whl
pip install auto_gptq
```