2023-10-27 19:19:22 -06:00
3 changed files with 63 additions and 34 deletions
--- a/llm_server/routes/openai_request_handler.py
+++ b/llm_server/routes/openai_request_handler.py
@ -67,7 +67,6 @@ class OpenAIRequestHandler(RequestHandler):
        llm_request = {**self.parameters, 'prompt': self.prompt}
        (success, _, _, _), (backend_response, backend_response_status_code) = self.generate_response(llm_request)
        model = self.request_json_body.get('model')
        if success:
@ -98,6 +97,7 @@ class OpenAIRequestHandler(RequestHandler):
        return response, 429
    def handle_error(self, error_msg: str, error_type: str = 'error') -> Tuple[flask.Response, int]:
        print(error_msg)
        return jsonify({
            "error": {
                "message": "Invalid request, check your parameters and try again.",
--- a/other/gradio/gradio_chat.py
+++ b/other/gradio/gradio_chat.py
@ -0,0 +1,62 @@
 import os
 import sys
 import warnings
 import gradio as gr
 import openai
 # Ignore every warning category so library noise stays out of the demo's output.
 warnings.filterwarnings("ignore")

 # Base URL of the OpenAI-compatible API; the demo refuses to start without it.
 API_BASE = os.environ.get('API_BASE')
 if not API_BASE:
    print('Must set the secret variable API_BASE to your https://your-site/api/openai/v1')
    raise SystemExit(1)

 # Optional system-prompt injection, driven by two environment variables:
 #   CONTEXT_TRIGGER_PHRASE    - text to look for in the user's messages.
 #   CONTEXT_TRIGGER_INJECTION - system message inserted at the very first
 #                               position of the context once the phrase is seen.
 # When CONTEXT_TRIGGER_PHRASE is set it is also added to the selectable examples.
 CONTEXT_TRIGGER_INJECTION = os.environ.get('CONTEXT_TRIGGER_INJECTION')
 CONTEXT_TRIGGER_PHRASE = os.environ.get('CONTEXT_TRIGGER_PHRASE')

 # Point the openai client at the configured endpoint; the API key is the
 # literal string 'null'.
 openai.api_base = API_BASE
 openai.api_key = 'null'
 def stream_response(prompt, history):
    """Stream a chat completion for ``prompt`` given the earlier ``history``.

    Args:
        prompt: The newest user message.
        history: Iterable of ``(user_message, assistant_message)`` pairs from
            the previous turns of the conversation.

    Yields:
        The assistant reply accumulated so far, once for every streamed chunk
        whose delta carries a ``content`` field.

    Raises:
        gr.Error: If the completion request cannot be started.
    """
    # The trigger needs both settings. Testing the phrase against None avoids
    # the TypeError that ``None in <str>`` raises when only the injection
    # text is configured.
    trigger_armed = bool(CONTEXT_TRIGGER_INJECTION) and CONTEXT_TRIGGER_PHRASE is not None

    def mentions_trigger(text):
        # Only string messages are searched, so a non-string history entry
        # (e.g. None) cannot raise here.
        return trigger_armed and isinstance(text, str) and CONTEXT_TRIGGER_PHRASE in text

    messages = []
    do_injection = mentions_trigger(prompt)
    for human, assistant in history:
        messages.append({'role': 'user', 'content': str(human)})
        messages.append({'role': 'assistant', 'content': str(assistant)})
        do_injection = do_injection or mentions_trigger(human)
    messages.append({'role': 'user', 'content': prompt})

    if do_injection:
        # The system prompt always goes in the very first spot of the context.
        messages.insert(0, {'role': 'system', 'content': CONTEXT_TRIGGER_INJECTION})

    try:
        response = openai.ChatCompletion.create(
            model='0',
            messages=messages,
            temperature=0,
            max_tokens=300,
            stream=True
        )
    except Exception as exc:
        # Show a short message in the UI while keeping the original exception
        # chained as the cause.
        raise gr.Error("Failed to reach inference endpoint.") from exc

    message = ''
    for chunk in response:
        # A delta may be empty or carry only non-text fields (such as the
        # role); skip those instead of raising KeyError on 'content'.
        content = chunk['choices'][0]['delta'].get('content')
        if content is None:
            continue
        message += content
        yield message
 # Example prompts offered in the UI; a configured trigger phrase is listed first.
 examples = ([CONTEXT_TRIGGER_PHRASE] if CONTEXT_TRIGGER_PHRASE else []) + ["hello", "hola", "merhaba"]

 # Build the chat interface around stream_response, enable the request queue
 # with concurrency_count=3, and start serving.
 gr.ChatInterface(
     stream_response,
     examples=examples,
     title="Chatbot Demo",
     analytics_enabled=False,
     cache_examples=False,
     css='#component-0{height:100%!important}',
 ).queue(
     concurrency_count=3,
 ).launch()
--- a/other/gradio_chat.py
+++ b/other/gradio_chat.py
@ -1,33 +0,0 @@
 import warnings
 import gradio as gr
 import openai
 # Ignore every warning category for this demo script.
 warnings.filterwarnings("ignore")
 # Configure the openai client: the API key is the literal string 'null' and
 # requests go to the OpenAI-style API path on localhost port 5000.
 openai.api_key = 'null'
 openai.api_base = 'http://localhost:5000/api/openai/v1'
 def stream_response(prompt, history):
    """Stream a chat completion for ``prompt`` given the earlier ``history``.

    Args:
        prompt: The newest user message.
        history: Iterable of ``(user_message, assistant_message)`` pairs from
            the previous turns of the conversation.

    Yields:
        The assistant reply accumulated so far, once for every streamed chunk
        whose delta carries a ``content`` field.
    """
    messages = []
    for human, assistant in history:
        messages.append({'role': 'user', 'content': human})
        messages.append({'role': 'assistant', 'content': assistant})
    messages.append({'role': 'user', 'content': prompt})

    response = openai.ChatCompletion.create(
        model='0',
        messages=messages,
        temperature=0,
        max_tokens=300,
        stream=True
    )

    message = ''
    for chunk in response:
        # A delta may be empty or carry only non-text fields (such as the
        # role); skip those instead of raising KeyError on 'content'.
        content = chunk['choices'][0]['delta'].get('content')
        if content is None:
            continue
        message += content
        yield message
 # Build the chat interface around stream_response, enable the request queue
 # with its default settings, and start serving.
 gr.ChatInterface(
     stream_response,
     examples=["hello", "hola", "merhaba"],
     title="Chatbot Demo",
     analytics_enabled=False,
     cache_examples=False,
     css='#component-0{height:100%!important}',
 ).queue().launch()