adjust logging, add more vllm stuff

Author: Cyberes, 2023-09-13 11:22:33 -06:00
Parent: e053f48fdc
Commit: bcedd2ab3d
12 changed files with 70 additions and 24 deletions

VLLM INSTALL.md (new file, 4 additions)

@@ -0,0 +1,4 @@
```bash
wget https://git.evulid.cc/attachments/6e7bfc04-cad4-4494-a98d-1391fbb402d3 -O /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl && pip install /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl && rm /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl
pip install auto_gptq
```
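A quick sanity check after installing the wheel (a minimal sketch; it assumes the wheel exposes the usual `vllm` package with a `__version__` attribute, and the expected version string is taken from the wheel's filename):

```python
# Post-install check: confirm the wheel imports and report its version.
import vllm
from vllm import SamplingParams  # also used by the backend code in this commit

print(vllm.__version__)   # expected to print "0.1.3" for this wheel
print(SamplingParams())   # default sampling parameters, for reference
```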


@@ -18,14 +18,16 @@ def init_db():
         CREATE TABLE prompts (
             ip TEXT,
             token TEXT DEFAULT NULL,
-            backend TEXT,
+            model TEXT,
+            backend_mode TEXT,
+            backend_url TEXT,
+            request_url TEXT,
+            generation_time FLOAT,
             prompt TEXT,
             prompt_tokens INTEGER,
             response TEXT,
             response_tokens INTEGER,
             response_status INTEGER,
-            generation_time FLOAT,
-            model TEXT,
             parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)),
             headers TEXT CHECK (headers IS NULL OR json_valid(headers)),
             timestamp INTEGER
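For reference, a hedged sketch of reading back the columns this hunk introduces; `opts.database_path` is taken from the surrounding code, while the query itself is purely illustrative:

```python
# Illustrative only: list the most recent prompts using the new columns.
import sqlite3

from llm_server import opts

conn = sqlite3.connect(opts.database_path)
rows = conn.execute(
    'SELECT model, backend_mode, backend_url, request_url, generation_time '
    'FROM prompts ORDER BY timestamp DESC LIMIT 5'
).fetchall()
conn.close()

for row in rows:
    print(row)
```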
@@ -46,7 +48,7 @@ def init_db():
     conn.close()


-def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, response_tokens: int = None, is_error: bool = False):
+def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, request_url, response_tokens: int = None, is_error: bool = False):
     prompt_tokens = len(tokenizer.encode(prompt))

     if not is_error:
@@ -73,7 +75,7 @@ def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, response_tokens: int = None, is_error: bool = False):
     conn = sqlite3.connect(opts.database_path)
     c = conn.cursor()
-    c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
-              (ip, token, opts.mode, prompt, prompt_tokens, response, response_tokens, backend_response_code, gen_time, opts.running_model, json.dumps(parameters), json.dumps(headers), timestamp))
+    c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+              (ip, token, opts.running_model, opts.mode, opts.backend_url, request_url, gen_time, prompt, prompt_tokens, response, response_tokens, backend_response_code, json.dumps(parameters), json.dumps(headers), timestamp))
     conn.commit()
     conn.close()
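The new `request_url` argument sits between `backend_response_code` and the keyword arguments, which is how the call sites later in this commit pass it; a minimal sketch of a call (variable names are placeholders):

```python
# Placeholder values, purely to illustrate the argument order after this change.
log_prompt(
    client_ip, token, prompt, backend_response,
    elapsed_time,            # gen_time
    parameters, headers,
    response_status_code,    # backend_response_code
    request.url,             # new: stored in the request_url column
    response_tokens=generated_tokens,
    is_error=False,
)
```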


@@ -1,10 +1,12 @@
 from typing import Tuple, Union

+import flask
+

 class LLMBackend:
     default_params: dict

-    def handle_response(self, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+    def handle_response(self, request: flask.Request, success, response: flask.Response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
         raise NotImplementedError

     def validate_params(self, params_dict: dict) -> Tuple[bool, str | None]:
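A minimal sketch of what a concrete backend now has to implement, assuming only what the signature above shows (the class name and body are illustrative, not part of this commit):

```python
import flask
from flask import jsonify


class EchoBackend(LLMBackend):  # hypothetical subclass for illustration
    def handle_response(self, request: flask.Request, success, response: flask.Response,
                        error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
        # Backends now receive the incoming flask.Request so they can pass
        # request.url through to log_prompt() as the new request_url value.
        return jsonify({'results': [{'text': prompt}]}), 200
```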


@@ -9,7 +9,7 @@ from ...routes.helpers.http import validate_json


 class OobaboogaBackend(LLMBackend):
-    def handle_response(self, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+    def handle_response(self, request, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
         backend_err = False
         response_valid_json, response_json_body = validate_json(response)
         try:
@@ -22,7 +22,7 @@ class OobaboogaBackend(LLMBackend):
         # We encountered an error
         if not success or not response:
             backend_response = format_sillytavern_err(f'Failed to reach the backend (oobabooga): {error_msg}', 'error')
-            log_prompt(client_ip, token, prompt, backend_response, None, parameters, headers, response if response else 0, is_error=True)
+            log_prompt(client_ip, token, prompt, backend_response, None, parameters, headers, response if response else 0, request.url, is_error=True)
             return jsonify({
                 'code': 500,
                 'msg': 'failed to reach backend',
@@ -43,13 +43,13 @@ class OobaboogaBackend(LLMBackend):
             if not backend_err:
                 redis.incr('proompts')

-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
             return jsonify({
                 **response_json_body
             }), 200
         else:
             backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code, is_error=True)
+            log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code, request.url, is_error=True)
             return jsonify({
                 'code': 500,
                 'msg': 'the backend did not return valid JSON',


@@ -17,7 +17,7 @@ from llm_server.routes.helpers.http import validate_json

 class VLLMBackend(LLMBackend):
     default_params = vars(SamplingParams())

-    def handle_response(self, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
+    def handle_response(self, request, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
         response_valid_json, response_json_body = validate_json(response)
         backend_err = False
         try:
@@ -41,11 +41,11 @@ class VLLMBackend(LLMBackend):
             # f'HTTP CODE {response_status_code}'
             # )

-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
             return jsonify({'results': [{'text': backend_response}]}), 200
         else:
             backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code if response else None, is_error=True)
+            log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code if response else None, request.url, is_error=True)
             return jsonify({
                 'code': 500,
                 'msg': 'the backend did not return valid JSON',


@@ -27,7 +27,7 @@ class OobaRequestHandler(RequestHandler):
             error_messages = [msg for valid, msg in [request_valid, params_valid] if not valid and msg]
             combined_error_message = ', '.join(error_messages)
             err = format_sillytavern_err(f'Validation Error: {combined_error_message}.', 'error')
-            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, is_error=True)
+            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, self.request.url, is_error=True)
             # TODO: add a method to LLMBackend to return a formatted response string, since we have both Ooba and OpenAI response types
             return jsonify({
                 'code': 400,
@@ -54,11 +54,11 @@ class OobaRequestHandler(RequestHandler):
         elapsed_time = end_time - self.start_time
         self.used = True

-        return self.backend.handle_response(success, response, error_msg, self.client_ip, self.token, prompt, elapsed_time, self.parameters, dict(self.request.headers))
+        return self.backend.handle_response(self.request, success, response, error_msg, self.client_ip, self.token, prompt, elapsed_time, self.parameters, dict(self.request.headers))

     def handle_ratelimited(self):
         backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, is_error=True)
+        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
         return jsonify({
             'results': [{'text': backend_response}]
         }), 200


@@ -35,7 +35,7 @@ class OpenAIRequestHandler(RequestHandler):
             error_messages = [msg for valid, msg in [request_valid, params_valid] if not valid and msg]
             combined_error_message = ', '.join(error_messages)
             err = format_sillytavern_err(f'Validation Error: {combined_error_message}.', 'error')
-            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, is_error=True)
+            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, self.request.url, is_error=True)
             # TODO: add a method to LLMBackend to return a formatted response string, since we have both Ooba and OpenAI response types
             return jsonify({
                 'code': 400,
@@ -67,7 +67,7 @@ class OpenAIRequestHandler(RequestHandler):

     def handle_ratelimited(self):
         backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, is_error=True)
+        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
         return build_openai_response(self.prompt, backend_response), 200

     def transform_messages_to_prompt(self):


@@ -2,6 +2,8 @@ import sqlite3
 import time
 from typing import Union

+import flask
+
 from llm_server import opts
 from llm_server.llm.oobabooga.ooba_backend import OobaboogaBackend
 from llm_server.llm.vllm.vllm_backend import VLLMBackend
@@ -12,7 +14,7 @@ DEFAULT_PRIORITY = 9999


 class RequestHandler:
-    def __init__(self, incoming_request):
+    def __init__(self, incoming_request: flask.Request):
         self.request_json_body = None
         self.request = incoming_request
         self.start_time = time.time()
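Hypothetical usage from a Flask view, only to show what the typed constructor receives; the route path and the `handle_request()` driver method are assumptions for illustration, not part of this diff:

```python
from flask import Flask, request

app = Flask(__name__)


@app.route('/api/v1/generate', methods=['POST'])
def generate():
    handler = OobaRequestHandler(request)  # incoming_request is the flask.Request
    return handler.handle_request()        # assumed entry point, named for illustration
```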

other/vllm/build-vllm.sh (new file, 38 additions)

@@ -0,0 +1,38 @@
#!/bin/bash
# Build the vllm-gptq wheel. Expected to be run as root inside a throwaway container.

# Fetch (or update) the vllm-gptq source.
cd /tmp || exit
if [ ! -d /tmp/vllm-gptq ]; then
    git clone https://github.com/chu-tianxiang/vllm-gptq.git
    cd vllm-gptq || exit
else
    cd vllm-gptq || exit
    git pull
fi

# Install Miniconda if it isn't present yet.
if [ ! -d /root/miniconda3 ]; then
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/Miniconda3-latest-Linux-x86_64.sh
    bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b
    rm /tmp/Miniconda3-latest-Linux-x86_64.sh
fi
eval "$(/root/miniconda3/bin/conda shell.bash hook)"

# Create (or reuse) the build environment with CUDA 11.8.
if [ ! -d /root/miniconda3/envs/vllm-gptq ]; then
    conda create --name vllm-gptq -c conda-forge python=3.11 -y
    conda activate vllm-gptq
    pip install ninja
    conda install -y -c "nvidia/label/cuda-11.8.0" cuda==11.8
    conda install -y cudatoolkit cudnn
else
    conda activate vllm-gptq
fi

# Build the wheel and print its location.
pip install -r requirements.txt
CUDA_HOME=/root/miniconda3/envs/vllm-gptq python setup.py bdist_wheel
echo -e "\n\n===\nOUTPUT:"
find /tmp/vllm-gptq -name '*.whl'


@@ -44,7 +44,7 @@ def get_requirements() -> List[str]:

 setuptools.setup(
     name="vllm-gptq",
-    version=find_version(get_path("vllm", "__init__.py")),
+    version=find_version(get_path("", "__init__.py")),
     author="vLLM Team",
     license="Apache 2.0",
     description="A high-throughput and memory-efficient inference and serving engine for LLMs",
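For context, the changed line depends on two small helpers that are conventional in vLLM's `setup.py`; a sketch of how they typically look (reconstructed from the common pattern, not copied from this repository):

```python
import os
import re

ROOT_DIR = os.path.dirname(__file__)


def get_path(*filepath) -> str:
    # Join path components relative to the directory containing setup.py.
    return os.path.join(ROOT_DIR, *filepath)


def find_version(filepath: str) -> str:
    # Pull __version__ out of a file without importing the package.
    with open(filepath) as fp:
        match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", fp.read(), re.M)
    if match:
        return match.group(1)
    raise RuntimeError('Unable to find version string.')
```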


@@ -12,6 +12,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid

+# python api_server.py --model /storage/oobabooga/one-click-installers/text-generation-webui/models/TheBloke_MythoMax-L2-13B-GPTQ/ --host 0.0.0.0 --port 7000 --max-num-batched-tokens 24576
+
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.

 app = FastAPI()
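A hedged sketch of querying a server started with the command in that comment; the endpoint and field names follow vLLM's example `api_server.py` (prompt plus sampling parameters in, a `text` list out), so adjust if the local copy diverges:

```python
# Illustrative client for the vLLM example API server on port 7000.
import requests

payload = {
    'prompt': 'Once upon a time',
    'max_tokens': 64,
    'temperature': 0.7,
}
resp = requests.post('http://127.0.0.1:7000/generate', json=payload)
resp.raise_for_status()
print(resp.json()['text'])  # list containing the prompt plus the generated continuation
```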


@@ -1,4 +0,0 @@
```bash
wget https://git.evulid.cc/attachments/6e7bfc04-cad4-4494-a98d-1391fbb402d3 -O vllm-0.1.3-cp311-cp311-linux_x86_64.whl && pip install vllm-0.1.3-cp311-cp311-linux_x86_64.whl
pip install auto_gptq
```