get functional again
This commit is contained in:
parent
2ab2e6eed1
commit
9a1d41a9b7
|
@ -155,3 +155,9 @@ show_uptime: true
|
||||||
|
|
||||||
# Display the total number of tokens generated.
|
# Display the total number of tokens generated.
|
||||||
show_total_output_tokens: true
|
show_total_output_tokens: true
|
||||||
|
|
||||||
|
# Timeout for generate requests sent to the backend, in seconds.
|
||||||
|
backend_generate_request_timeout: 95
|
||||||
|
|
||||||
|
# How long to block waiting for a new message on the Redis stream before closing it, in milliseconds.
|
||||||
|
redis_stream_timeout: 25000
|
|
@ -73,3 +73,4 @@ class ConfigModel(BaseModel):
|
||||||
manual_model_name: Union[str, None]
|
manual_model_name: Union[str, None]
|
||||||
backend_request_timeout: int
|
backend_request_timeout: int
|
||||||
backend_generate_request_timeout: int
|
backend_generate_request_timeout: int
|
||||||
|
redis_stream_timeout: int
|
||||||
|
|
|
@ -55,5 +55,6 @@ config_scheme = bison.Scheme(
|
||||||
bison.Option('load_num_prompts', default=True, field_type=bool),
|
bison.Option('load_num_prompts', default=True, field_type=bool),
|
||||||
bison.Option('manual_model_name', default=None, field_type=Union[str, None]),
|
bison.Option('manual_model_name', default=None, field_type=Union[str, None]),
|
||||||
bison.Option('backend_request_timeout', default=30, field_type=int),
|
bison.Option('backend_request_timeout', default=30, field_type=int),
|
||||||
bison.Option('backend_generate_request_timeout', default=95, field_type=int)
|
bison.Option('backend_generate_request_timeout', default=95, field_type=int),
|
||||||
|
bison.Option('redis_stream_timeout', default=25000, field_type=int)
|
||||||
)
|
)
|
||||||
|
|
|
@ -30,7 +30,7 @@ def do_db_log(ip: str, token: str, prompt: str, response: Union[str, None], gen_
|
||||||
if not response_tokens:
|
if not response_tokens:
|
||||||
response_tokens = get_token_count(response, backend_url)
|
response_tokens = get_token_count(response, backend_url)
|
||||||
else:
|
else:
|
||||||
response_tokens = None
|
response_tokens = 0
|
||||||
|
|
||||||
# Sometimes we may want to insert null into the DB, but
|
# Sometimes we may want to insert null into the DB, but
|
||||||
# usually we want to insert a float.
|
# usually we want to insert a float.
|
||||||
|
|
|
@ -5,6 +5,5 @@ DEFAULT_OPENAI_SYSTEM_PROMPT = ("You are an assistant chatbot. Your main functio
|
||||||
"apologize and suggest the user seek help elsewhere.")
|
"apologize and suggest the user seek help elsewhere.")
|
||||||
OPENAI_FORMATTING_PROMPT = """Lines that start with "### ASSISTANT" were messages you sent previously.\nLines that start with "### USER" were messages sent by the user you are chatting with.\nYou will respond to the "### RESPONSE:" prompt as the assistant and follow the instructions given by the user."""
|
OPENAI_FORMATTING_PROMPT = """Lines that start with "### ASSISTANT" were messages you sent previously.\nLines that start with "### USER" were messages sent by the user you are chatting with.\nYou will respond to the "### RESPONSE:" prompt as the assistant and follow the instructions given by the user."""
|
||||||
|
|
||||||
REDIS_STREAM_TIMEOUT = 25000
|
|
||||||
LOGGING_FORMAT = "%(asctime)s: %(levelname)s:%(name)s - %(message)s"
|
LOGGING_FORMAT = "%(asctime)s: %(levelname)s:%(name)s - %(message)s"
|
||||||
BACKEND_OFFLINE = 'The model you requested is not a valid choice. Please retry your query.'
|
BACKEND_OFFLINE = 'The model you requested is not a valid choice. Please retry your query.'
|
||||||
|
|
|
@ -93,7 +93,8 @@ def return_invalid_model_err(requested_model: str):
|
||||||
return_oai_invalid_request_error(msg)
|
return_oai_invalid_request_error(msg)
|
||||||
|
|
||||||
|
|
||||||
def return_oai_internal_server_error():
|
def return_oai_internal_server_error(error: str):
|
||||||
|
_logger.error(f'OAI Error: {error}')
|
||||||
return jsonify({
|
return jsonify({
|
||||||
"error": {
|
"error": {
|
||||||
"message": "Internal server error",
|
"message": "Internal server error",
|
||||||
|
|
|
@ -20,7 +20,6 @@ class OobaRequestHandler(RequestHandler):
|
||||||
def handle_request(self, return_ok: bool = True):
|
def handle_request(self, return_ok: bool = True):
|
||||||
assert not self.used
|
assert not self.used
|
||||||
if self.offline:
|
if self.offline:
|
||||||
# _logger.debug(f'This backend is offline.')
|
|
||||||
return self.handle_error(llm_server.globals.BACKEND_OFFLINE)
|
return self.handle_error(llm_server.globals.BACKEND_OFFLINE)
|
||||||
|
|
||||||
request_valid, invalid_response = self.validate_request()
|
request_valid, invalid_response = self.validate_request()
|
||||||
|
|
|
@ -27,8 +27,7 @@ def handle_error(e):
|
||||||
"auth_subrequest_error"
|
"auth_subrequest_error"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_logger.error(f'OAI returning error: {e}')
|
return return_oai_internal_server_error(e)
|
||||||
return return_oai_internal_server_error()
|
|
||||||
|
|
||||||
|
|
||||||
from .models import openai_list_models
|
from .models import openai_list_models
|
||||||
|
|
|
@ -32,7 +32,7 @@ def openai_chat_completions(model_name=None):
|
||||||
else:
|
else:
|
||||||
handler = OpenAIRequestHandler(incoming_request=request, incoming_json=request_json_body, selected_model=model_name)
|
handler = OpenAIRequestHandler(incoming_request=request, incoming_json=request_json_body, selected_model=model_name)
|
||||||
if handler.offline:
|
if handler.offline:
|
||||||
return return_oai_internal_server_error()
|
return return_oai_internal_server_error(f'backend {handler.backend_url} is offline')
|
||||||
|
|
||||||
if not request_json_body.get('stream'):
|
if not request_json_body.get('stream'):
|
||||||
try:
|
try:
|
||||||
|
@ -112,9 +112,9 @@ def openai_chat_completions(model_name=None):
|
||||||
try:
|
try:
|
||||||
last_id = '0-0'
|
last_id = '0-0'
|
||||||
while True:
|
while True:
|
||||||
stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
|
stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
|
||||||
if not stream_data:
|
if not stream_data:
|
||||||
_logger.debug(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
|
_logger.debug(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
|
||||||
yield 'data: [DONE]\n\n'
|
yield 'data: [DONE]\n\n'
|
||||||
else:
|
else:
|
||||||
for stream_index, item in stream_data[0][1]:
|
for stream_index, item in stream_data[0][1]:
|
||||||
|
|
|
@ -157,9 +157,9 @@ def openai_completions(model_name=None):
|
||||||
try:
|
try:
|
||||||
last_id = '0-0'
|
last_id = '0-0'
|
||||||
while True:
|
while True:
|
||||||
stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
|
stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
|
||||||
if not stream_data:
|
if not stream_data:
|
||||||
_logger.debug(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
|
_logger.debug(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
|
||||||
yield 'data: [DONE]\n\n'
|
yield 'data: [DONE]\n\n'
|
||||||
else:
|
else:
|
||||||
for stream_index, item in stream_data[0][1]:
|
for stream_index, item in stream_data[0][1]:
|
||||||
|
|
|
@ -32,7 +32,7 @@ class OpenAIRequestHandler(RequestHandler):
|
||||||
def handle_request(self) -> Tuple[flask.Response, int]:
|
def handle_request(self) -> Tuple[flask.Response, int]:
|
||||||
assert not self.used
|
assert not self.used
|
||||||
if self.offline:
|
if self.offline:
|
||||||
return return_oai_internal_server_error()
|
return return_oai_internal_server_error(f'backend {self.backend_url} is offline.')
|
||||||
|
|
||||||
disable_openai_handling = request.headers.get('Llm-Disable-Openai', False) == 'true' \
|
disable_openai_handling = request.headers.get('Llm-Disable-Openai', False) == 'true' \
|
||||||
and is_valid_api_key(parse_token(request.headers.get('Authorization', ''))) \
|
and is_valid_api_key(parse_token(request.headers.get('Authorization', ''))) \
|
||||||
|
|
|
@ -144,9 +144,9 @@ def do_stream(ws, model_name):
|
||||||
try:
|
try:
|
||||||
last_id = '0-0' # The ID of the last entry we read.
|
last_id = '0-0' # The ID of the last entry we read.
|
||||||
while True:
|
while True:
|
||||||
stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
|
stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
|
||||||
if not stream_data:
|
if not stream_data:
|
||||||
_logger.error(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
|
_logger.error(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
for stream_index, item in stream_data[0][1]:
|
for stream_index, item in stream_data[0][1]:
|
||||||
|
|
|
@ -1,11 +1,8 @@
|
||||||
HOST="proxy.chub-archive.evulid.cc"
|
HOST="http://localhost:5000"
|
||||||
|
|
||||||
AUTH_KEY="TEST_1df979f0-6df1-41bd-814a-e99b1680e727"
|
AUTH_KEY="TEST_1df979f0-6df1-41bd-814a-e99b1680e727"
|
||||||
|
|
||||||
PROXY_SERVERS=(
|
PROXY_SERVERS=(
|
||||||
"http://172.0.4.7:3128"
|
"http://172.0.4.7:3128"
|
||||||
"http://172.0.4.8:3128"
|
"http://172.0.4.8:3128"
|
||||||
"http://172.0.4.10:3128"
|
|
||||||
"http://172.0.4.12:3128"
|
|
||||||
"http://172.0.4.13:3128"
|
|
||||||
)
|
)
|
||||||
|
|
|
@ -49,7 +49,7 @@ while true; do
|
||||||
EOF
|
EOF
|
||||||
)
|
)
|
||||||
|
|
||||||
curl "https://$HOST/api/v1/generate" -m 100 -x "$our_proxy_server" \
|
curl "$HOST/api/v1/generate" -m 100 -x "$our_proxy_server" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-H "Authorization: Bearer $AUTH_KEY" \
|
-H "Authorization: Bearer $AUTH_KEY" \
|
||||||
-d "$DATA"
|
-d "$DATA"
|
||||||
|
|
|
@ -43,7 +43,7 @@ while true; do
|
||||||
EOF
|
EOF
|
||||||
)
|
)
|
||||||
|
|
||||||
curl "https://$HOST/api/openai/v1/chat/completions" -m 100 -x "$our_proxy_server" \
|
curl "$HOST/api/openai/v1/chat/completions" -m 100 -x "$our_proxy_server" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-H "Authorization: Bearer $AUTH_KEY" \
|
-H "Authorization: Bearer $AUTH_KEY" \
|
||||||
-d "$DATA"
|
-d "$DATA"
|
||||||
|
|
|
@ -43,7 +43,7 @@ while true; do
|
||||||
EOF
|
EOF
|
||||||
)
|
)
|
||||||
|
|
||||||
curl "https://$HOST/api/openai/v1/completions" -m 100 -x "$our_proxy_server" \
|
curl "$HOST/api/openai/v1/completions" -m 100 -x "$our_proxy_server" \
|
||||||
-H "Content-Type: application/json" \
|
-H "Content-Type: application/json" \
|
||||||
-H "Authorization: Bearer $AUTH_KEY" \
|
-H "Authorization: Bearer $AUTH_KEY" \
|
||||||
-d "$DATA"
|
-d "$DATA"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import sys
|
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
@ -70,7 +70,14 @@ async def run(context):
|
||||||
'stopping_strings': []
|
'stopping_strings': []
|
||||||
}
|
}
|
||||||
|
|
||||||
async with websockets.connect(f'wss://{config["HOST"]}/api/v1/stream', ping_interval=None) as websocket:
|
socket_type = 'ws://'
|
||||||
|
if config['HOST'].startswith('https://'):
|
||||||
|
socket_type = 'wss://'
|
||||||
|
config['HOST'] = config['HOST'].strip('http://')
|
||||||
|
config['HOST'] = config['HOST'].strip('https://')
|
||||||
|
|
||||||
|
print('Connecting to', f'{socket_type}{config["HOST"]}/api/v1/stream')
|
||||||
|
async with websockets.connect(f'{socket_type}{config["HOST"]}/api/v1/stream', ping_interval=None) as websocket:
|
||||||
await websocket.send(json.dumps(request))
|
await websocket.send(json.dumps(request))
|
||||||
|
|
||||||
yield context # Remove this if you just want to see the reply
|
yield context # Remove this if you just want to see the reply
|
||||||
|
@ -89,12 +96,12 @@ async def run(context):
|
||||||
|
|
||||||
|
|
||||||
async def print_response_stream(prompt):
|
async def print_response_stream(prompt):
|
||||||
try:
|
# try:
|
||||||
async for response in run(prompt):
|
async for response in run(prompt):
|
||||||
print(response, end='')
|
print(response, end='')
|
||||||
sys.stdout.flush() # If we don't flush, we won't see tokens in realtime.
|
sys.stdout.flush() # If we don't flush, we won't see tokens in realtime.
|
||||||
except Exception as e:
|
# except Exception as e:
|
||||||
print(e)
|
# print(e)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -14,12 +14,13 @@ from llm_server.routes.v1 import bp
|
||||||
from llm_server.routes.v1.generate_stats import generate_stats
|
from llm_server.routes.v1.generate_stats import generate_stats
|
||||||
from llm_server.sock import init_wssocket
|
from llm_server.sock import init_wssocket
|
||||||
|
|
||||||
# TODO: separate queue item timeout for websockets (make longer, like 5 minutes)
|
# TODO: detect blocking disconnect
|
||||||
# TODO: return an `error: True`, error code, and error message rather than just a formatted message
|
# TODO: return an `error: True`, error code, and error message rather than just a formatted message
|
||||||
# TODO: what happens when all backends are offline? What about the "online" key in the stats page?
|
# TODO: what happens when all backends are offline? What about the "online" key in the stats page?
|
||||||
# TODO: redis SCAN vs KEYS??
|
# TODO: redis SCAN vs KEYS??
|
||||||
# TODO: is frequency penalty the same as ooba repetition penalty???
|
# TODO: is frequency penalty the same as ooba repetition penalty???
|
||||||
# TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
|
# TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
|
||||||
|
# TODO: insert pydantic object into database
|
||||||
|
|
||||||
# Lower priority
|
# Lower priority
|
||||||
# TODO: if a backend is at its limit of concurrent requests, choose a different one
|
# TODO: if a backend is at its limit of concurrent requests, choose a different one
|
||||||
|
|
Reference in New Issue