# local-llm-server/other/gradio/gradio_chat.py

import os
import sys
import time
import traceback
import warnings
from threading import Thread

import gradio as gr
import openai
import requests

# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings("ignore")

# Required: root URL of the backend API. Leading/trailing slashes are removed
# so that path segments can be appended with plain string concatenation.
API_BASE = os.environ.get('API_BASE')
if API_BASE:
    API_BASE = API_BASE.strip('/')
else:
    print('Must set the secret variable API_BASE to your https://your-site/api')
    sys.exit(1)

# Optional settings; each one is None when its environment variable is unset.
APP_TITLE = os.environ.get('APP_TITLE')
PRIMARY_MODEL_CHOICE = os.environ.get('PRIMARY_MODEL_CHOICE')
TRACKING_CODE = os.environ.get('TRACKING_CODE')


def _resolve_api_base():
    """Return the OpenAI-compatible base URL that should be used right now.

    Queries ``API_BASE + '/stats'`` and, when ``PRIMARY_MODEL_CHOICE`` is listed
    under ``['models']['choices']`` in the JSON reply, returns the
    model-specific endpoint. In every other case -- including any request or
    parsing failure -- the generic ``/openai/v1`` endpoint is returned.
    """
    try:
        # Without a timeout a single stalled connection would block the
        # polling thread forever.
        stats = requests.get(API_BASE + '/stats', timeout=10).json()
        if PRIMARY_MODEL_CHOICE in stats['models']['choices']:
            return API_BASE + '/openai/' + PRIMARY_MODEL_CHOICE + '/v1'
    except Exception:
        # Best effort: report the problem and fall back to the generic endpoint.
        traceback.print_exc()
    return API_BASE + '/openai/v1'


def background():
    """Poll the backend every 10 seconds and keep ``openai.api_base`` current.

    Runs forever; intended to be the target of a daemon thread. A line is
    printed whenever the selected endpoint changes.
    """
    while True:
        previous = openai.api_base
        openai.api_base = _resolve_api_base()
        if openai.api_base != previous:
            print('Set primary model to', openai.api_base)
        time.sleep(10)


# Configure the OpenAI client *before* the background thread is started, so
# that this default assignment can never overwrite an endpoint the thread has
# already selected.
openai.api_key = 'null'
openai.api_base = API_BASE + '/openai/v1'

if PRIMARY_MODEL_CHOICE:
    # Daemon thread: it must not keep the process alive on shutdown.
    t = Thread(target=background, daemon=True)
    t.start()
    print('Started the background thread.')

# A system prompt can be injected into the very first spot in the context.
# If the user sends a message that contains the CONTEXT_TRIGGER_PHRASE,
# the content in CONTEXT_TRIGGER_INJECTION will be injected.
# Setting CONTEXT_TRIGGER_PHRASE will also add it to the selectable examples.
CONTEXT_TRIGGER_PHRASE = os.getenv('CONTEXT_TRIGGER_PHRASE')
CONTEXT_TRIGGER_INJECTION = os.getenv('CONTEXT_TRIGGER_INJECTION')


def stream_response(prompt, history):
    """Stream the assistant's reply to *prompt*, yielding the text so far.

    The conversation is rebuilt from *history* plus the new *prompt*. When
    context injection is configured and the trigger phrase occurs in any user
    message (past or current), ``CONTEXT_TRIGGER_INJECTION`` is placed first
    in the context as a system message.

    Args:
        prompt: The new user message.
        history: Iterable of ``(user_message, assistant_message)`` pairs from
            earlier turns.

    Yields:
        The accumulated reply text after each streamed chunk that carries
        content.

    Raises:
        gr.Error: If the completion request cannot be created.
    """
    # Injection needs both settings. Guarding on the phrase avoids the
    # TypeError that ``None in some_string`` raises when only the injection
    # text is configured.
    injection_enabled = bool(CONTEXT_TRIGGER_INJECTION) and CONTEXT_TRIGGER_PHRASE is not None

    messages = []
    do_injection = injection_enabled and CONTEXT_TRIGGER_PHRASE in str(prompt)
    for human, assistant in history:
        human_text = str(human)
        messages.append({'role': 'user', 'content': human_text})
        messages.append({'role': 'assistant', 'content': str(assistant)})

        # Check the same stringified text that is sent to the model.
        if injection_enabled and not do_injection and CONTEXT_TRIGGER_PHRASE in human_text:
            do_injection = True
    messages.append({'role': 'user', 'content': prompt})

    if do_injection:
        messages.insert(0, {'role': 'system', 'content': CONTEXT_TRIGGER_INJECTION})

    try:
        response = openai.ChatCompletion.create(
            model='0',
            messages=messages,
            temperature=0,
            max_tokens=300,
            stream=True,
            headers={'LLM-Source': 'huggingface-demo'}
        )
    except Exception as exc:
        # Keep the original exception attached for server-side debugging.
        raise gr.Error("Failed to reach inference endpoint.") from exc

    message = ''
    for chunk in response:
        delta = chunk['choices'][0]['delta']
        if not delta:
            continue
        # A non-empty delta may carry no text (e.g. only a role); skip it
        # instead of failing with KeyError.
        piece = delta.get('content')
        if piece is None:
            continue
        message += piece
        yield message


# Example prompts offered in the chat UI; the trigger phrase, when configured,
# is listed first.
examples = [CONTEXT_TRIGGER_PHRASE, "hello"] if CONTEXT_TRIGGER_PHRASE else ["hello"]

with gr.Blocks(analytics_enabled=False) as demo:
    gr.ChatInterface(
        stream_response,
        examples=examples,
        title=APP_TITLE,
        analytics_enabled=False,
        cache_examples=False,
        css='#component-0{height:100%!important}',
    )

    # Optional HTML snippet appended below the chat interface.
    if TRACKING_CODE:
        print('Inserting tracking code')
        gr.HTML(TRACKING_CODE)

# Enable the request queue, then start the web server.
(
    demo
    .queue(concurrency_count=1, api_open=False)
    .launch(show_api=False)
)