adjust logging, add more vllm stuff

parent e053f48fdc
commit bcedd2ab3d
@@ -0,0 +1,4 @@
+```bash
+wget https://git.evulid.cc/attachments/6e7bfc04-cad4-4494-a98d-1391fbb402d3 -O /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl && pip install /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl && rm /tmp/vllm-0.1.3-cp311-cp311-linux_x86_64.whl
+pip install auto_gptq
+```
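A quick sanity check after installing the wheel; this is an illustrative sketch that assumes the wheel installs the standard `vllm` package with a `__version__` attribute.

```python
# Post-install sanity check; assumes the wheel installs the standard `vllm` package.
import vllm
import auto_gptq  # noqa: F401  (imported only to confirm the install succeeded)

print(f"vllm {vllm.__version__} imported successfully")
```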
@@ -18,14 +18,16 @@ def init_db():
 CREATE TABLE prompts (
 ip TEXT,
 token TEXT DEFAULT NULL,
-backend TEXT,
+model TEXT,
+backend_mode TEXT,
+backend_url TEXT,
+request_url TEXT,
+generation_time FLOAT,
 prompt TEXT,
 prompt_tokens INTEGER,
 response TEXT,
 response_tokens INTEGER,
 response_status INTEGER,
-generation_time FLOAT,
-model TEXT,
 parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)),
 headers TEXT CHECK (headers IS NULL OR json_valid(headers)),
 timestamp INTEGER
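The reworked schema records which backend served each prompt, so per-backend statistics become a simple query. A minimal sketch assuming the table above; the database path is a placeholder for whatever `opts.database_path` points at.

```python
import sqlite3

# Hypothetical path -- substitute whatever opts.database_path points at.
conn = sqlite3.connect("llm-server.db")
c = conn.cursor()

# Average generation time and request count per backend URL, which is only
# possible now that backend_url, request_url and generation_time are logged per prompt.
c.execute("""
    SELECT backend_url, COUNT(*) AS requests, AVG(generation_time) AS avg_gen_time
    FROM prompts
    WHERE response_status = 200
    GROUP BY backend_url
""")
for backend_url, requests, avg_gen_time in c.fetchall():
    print(backend_url, requests, avg_gen_time)
conn.close()
```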
@@ -46,7 +48,7 @@ def init_db():
 conn.close()


-def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, response_tokens: int = None, is_error: bool = False):
+def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, request_url, response_tokens: int = None, is_error: bool = False):
 prompt_tokens = len(tokenizer.encode(prompt))

 if not is_error:
@@ -73,7 +75,7 @@ def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backe
 conn = sqlite3.connect(opts.database_path)
 c = conn.cursor()
-c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
-          (ip, token, opts.mode, prompt, prompt_tokens, response, response_tokens, backend_response_code, gen_time, opts.running_model, json.dumps(parameters), json.dumps(headers), timestamp))
+c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+          (ip, token, opts.running_model, opts.mode, opts.backend_url, request_url, gen_time, prompt, prompt_tokens, response, response_tokens, backend_response_code, json.dumps(parameters), json.dumps(headers), timestamp))
 conn.commit()
 conn.close()

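Under the new signature, `request_url` sits between `backend_response_code` and `response_tokens`. A hypothetical call for illustration; none of the values below come from the repo.

```python
# Hypothetical call under the new log_prompt signature; every value is a placeholder.
log_prompt(
    ip='203.0.113.7',
    token=None,
    prompt='Write a haiku about GPUs.',
    response='Silicon whispers...',
    gen_time=1.42,
    parameters={'temperature': 0.7, 'max_new_tokens': 100},
    headers={'User-Agent': 'example-client'},
    backend_response_code=200,
    request_url='https://proxy.example/api/v1/generate',
    response_tokens=23,
)
```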
@@ -1,10 +1,12 @@
 from typing import Tuple, Union

+import flask
+

 class LLMBackend:
 default_params: dict

-def handle_response(self, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+def handle_response(self, request: flask.Request, success, response: flask.Response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
 raise NotImplementedError

 def validate_params(self, params_dict: dict) -> Tuple[bool, str | None]:
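Every concrete backend now takes the Flask request as its first argument so `request.url` can be logged. A minimal sketch of a subclass against the new signature; the class body and the import paths are illustrative, not code from the repo.

```python
import flask
from flask import jsonify

# Import paths below are assumed for illustration; adjust to the repo's layout.
from llm_server.llm.llm_backend import LLMBackend   # hypothetical module path
from llm_server.database import log_prompt          # hypothetical module path


class EchoBackend(LLMBackend):
    # Illustrative subclass only -- the real backends (OobaboogaBackend, VLLMBackend)
    # do considerably more validation and error handling.
    default_params = {}

    def handle_response(self, request: flask.Request, success, response: flask.Response,
                        error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
        if not success or response is None:
            # request.url is what this commit threads into log_prompt().
            log_prompt(client_ip, token, prompt, error_msg or 'backend failure', None,
                       parameters, headers, 0, request.url, is_error=True)
            return jsonify({'code': 500, 'msg': 'failed to reach backend'}), 500
        text = response.get_data(as_text=True)
        log_prompt(client_ip, token, prompt, text, elapsed_time,
                   parameters, headers, response.status_code, request.url)
        return jsonify({'results': [{'text': text}]}), 200
```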
@@ -9,7 +9,7 @@ from ...routes.helpers.http import validate_json


 class OobaboogaBackend(LLMBackend):
-def handle_response(self, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+def handle_response(self, request, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
 backend_err = False
 response_valid_json, response_json_body = validate_json(response)
 try:
@@ -22,7 +22,7 @@ class OobaboogaBackend(LLMBackend):
 # We encountered an error
 if not success or not response:
 backend_response = format_sillytavern_err(f'Failed to reach the backend (oobabooga): {error_msg}', 'error')
-log_prompt(client_ip, token, prompt, backend_response, None, parameters, headers, response if response else 0, is_error=True)
+log_prompt(client_ip, token, prompt, backend_response, None, parameters, headers, response if response else 0, request.url, is_error=True)
 return jsonify({
 'code': 500,
 'msg': 'failed to reach backend',
@@ -43,13 +43,13 @@ class OobaboogaBackend(LLMBackend):
 if not backend_err:
 redis.incr('proompts')

-log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
 return jsonify({
 **response_json_body
 }), 200
 else:
 backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
-log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code, is_error=True)
+log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code, request.url, is_error=True)
 return jsonify({
 'code': 500,
 'msg': 'the backend did not return valid JSON',
@@ -17,7 +17,7 @@ from llm_server.routes.helpers.http import validate_json
 class VLLMBackend(LLMBackend):
 default_params = vars(SamplingParams())

-def handle_response(self, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
+def handle_response(self, request, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
 response_valid_json, response_json_body = validate_json(response)
 backend_err = False
 try:
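`vars(SamplingParams())` captures vLLM's default sampling settings as a plain dict, which can then be overlaid with client-supplied parameters. A rough sketch of that pattern, assuming a vLLM build of roughly this vintage (exact field names vary by version):

```python
from vllm.sampling_params import SamplingParams

# Defaults captured as a plain dict, e.g. temperature, top_p, top_k, max_tokens, ...
# (the exact keys depend on the vLLM version).
default_params = vars(SamplingParams())

# Hypothetical client-supplied parameters overriding a subset of the defaults.
client_params = {'temperature': 0.8, 'max_tokens': 200}
merged = {**default_params, **client_params}
print(merged['temperature'], merged.get('top_p'))
```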
@@ -41,11 +41,11 @@ class VLLMBackend(LLMBackend):
 # f'HTTP CODE {response_status_code}'
 # )

-log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
 return jsonify({'results': [{'text': backend_response}]}), 200
 else:
 backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
-log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code if response else None, is_error=True)
+log_prompt(client_ip, token, prompt, backend_response, elapsed_time, parameters, headers, response.status_code if response else None, request.url, is_error=True)
 return jsonify({
 'code': 500,
 'msg': 'the backend did not return valid JSON',
@@ -27,7 +27,7 @@ class OobaRequestHandler(RequestHandler):
 error_messages = [msg for valid, msg in [request_valid, params_valid] if not valid and msg]
 combined_error_message = ', '.join(error_messages)
 err = format_sillytavern_err(f'Validation Error: {combined_error_message}.', 'error')
-log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, is_error=True)
+log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, self.request.url, is_error=True)
 # TODO: add a method to LLMBackend to return a formatted response string, since we have both Ooba and OpenAI response types
 return jsonify({
 'code': 400,
@@ -54,11 +54,11 @@ class OobaRequestHandler(RequestHandler):
 elapsed_time = end_time - self.start_time

 self.used = True
-return self.backend.handle_response(success, response, error_msg, self.client_ip, self.token, prompt, elapsed_time, self.parameters, dict(self.request.headers))
+return self.backend.handle_response(self.request, success, response, error_msg, self.client_ip, self.token, prompt, elapsed_time, self.parameters, dict(self.request.headers))

 def handle_ratelimited(self):
 backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, is_error=True)
+log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
 return jsonify({
 'results': [{'text': backend_response}]
 }), 200
@@ -35,7 +35,7 @@ class OpenAIRequestHandler(RequestHandler):
 error_messages = [msg for valid, msg in [request_valid, params_valid] if not valid and msg]
 combined_error_message = ', '.join(error_messages)
 err = format_sillytavern_err(f'Validation Error: {combined_error_message}.', 'error')
-log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, is_error=True)
+log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), err, 0, self.parameters, dict(self.request.headers), 0, self.request.url, is_error=True)
 # TODO: add a method to LLMBackend to return a formatted response string, since we have both Ooba and OpenAI response types
 return jsonify({
 'code': 400,
@@ -67,7 +67,7 @@ class OpenAIRequestHandler(RequestHandler):

 def handle_ratelimited(self):
 backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, is_error=True)
+log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
 return build_openai_response(self.prompt, backend_response), 200

 def transform_messages_to_prompt(self):
@@ -2,6 +2,8 @@ import sqlite3
 import time
 from typing import Union

+import flask
+
 from llm_server import opts
 from llm_server.llm.oobabooga.ooba_backend import OobaboogaBackend
 from llm_server.llm.vllm.vllm_backend import VLLMBackend
@@ -12,7 +14,7 @@ DEFAULT_PRIORITY = 9999


 class RequestHandler:
-def __init__(self, incoming_request):
+def __init__(self, incoming_request: flask.Request):
 self.request_json_body = None
 self.request = incoming_request
 self.start_time = time.time()
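Taken together, the Flask request object now flows from the route into the handler and on to the backend, which is what lets every `log_prompt()` call record `request.url`. A hypothetical route showing that flow; the import path and the `handle_request()` method name are assumptions, not taken from the repo.

```python
import flask

app = flask.Flask(__name__)


@app.route('/api/v1/generate', methods=['POST'])
def generate():
    # flask.request is the incoming request; the handler stores it as self.request
    # and later passes it to LLMBackend.handle_response(), which forwards
    # request.url into log_prompt().
    handler = OobaRequestHandler(flask.request)  # assumes OobaRequestHandler is importable
    return handler.handle_request()              # method name is hypothetical
```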
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Expected to be run as root in some sort of container
+
+cd /tmp || exit
+
+if [ ! -d /tmp/vllm-gptq ]; then
+git clone https://github.com/chu-tianxiang/vllm-gptq.git
+cd vllm-gptq || exit
+else
+cd vllm-gptq || exit
+git pull
+fi
+
+if [ ! -d /root/miniconda3 ]; then
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O Miniconda3-latest-Linux-x86_64.sh
+bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b
+rm /tmp/Miniconda3-latest-Linux-x86_64.sh
+fi
+
+eval "$(/root/miniconda3/bin/conda shell.bash hook)"
+
+if [ ! -d /root/miniconda3/envs/vllm-gptq ]; then
+conda create --name vllm-gptq -c conda-forge python=3.11 -y
+conda activate vllm-gptq
+pip install ninja
+conda install -y -c "nvidia/label/cuda-11.8.0" cuda==11.8
+conda install -y cudatoolkit cudnn
+else
+conda activate vllm-gptq
+fi
+
+pip install -r requirements.txt
+
+CUDA_HOME=/root/miniconda3/envs/vllm-gptq python setup.py bdist_wheel
+
+echo -e "\n\n===\nOUTPUT:"
+find /tmp/vllm-gptq -name '*.whl'
@@ -44,7 +44,7 @@ def get_requirements() -> List[str]:

 setuptools.setup(
 name="vllm-gptq",
-version=find_version(get_path("vllm", "__init__.py")),
+version=find_version(get_path("", "__init__.py")),
 author="vLLM Team",
 license="Apache 2.0",
 description="A high-throughput and memory-efficient inference and serving engine for LLMs",
@@ -12,6 +12,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid

+# python api_server.py --model /storage/oobabooga/one-click-installers/text-generation-webui/models/TheBloke_MythoMax-L2-13B-GPTQ/ --host 0.0.0.0 --port 7000 --max-num-batched-tokens 24576
+
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.
 app = FastAPI()
@@ -1,4 +0,0 @@
-```bash
-wget https://git.evulid.cc/attachments/6e7bfc04-cad4-4494-a98d-1391fbb402d3 -O vllm-0.1.3-cp311-cp311-linux_x86_64.whl && pip install vllm-0.1.3-cp311-cp311-linux_x86_64.whl
-pip install auto_gptq
-```