This repository has been archived on 2024-10-27. You can view files and clone it, but cannot push or open issues or pull requests.
local-llm-server/other/gradio/gradio_chat.py

104 lines
3.2 KiB
Python
Raw Normal View History

import os
import sys
import time
import traceback
import warnings
from threading import Thread
import gradio as gr
import openai
import requests
# Silence noisy third-party deprecation warnings from the client libraries.
warnings.filterwarnings("ignore")

# Base URL of the local-llm-server instance, e.g. https://your-site/api
API_BASE = os.getenv('API_BASE')
if not API_BASE:
    # Misconfiguration is fatal: report on stderr (not stdout) and exit non-zero.
    print('Must set the secret variable API_BASE to your https://your-site/api', file=sys.stderr)
    sys.exit(1)
# Only a *trailing* slash should be removed; strip('/') would also eat a
# leading slash, which is never wanted in an absolute URL.
API_BASE = API_BASE.rstrip('/')

# Optional knobs, all read from the environment.
APP_TITLE = os.getenv('APP_TITLE')                        # Gradio page title
PRIMARY_MODEL_CHOICE = os.getenv('PRIMARY_MODEL_CHOICE')  # preferred backend model name
TRACKING_CODE = os.getenv('TRACKING_CODE')                # optional analytics HTML snippet
def background():
    """Poll the server's /stats endpoint forever and repoint the OpenAI
    client base URL at PRIMARY_MODEL_CHOICE's dedicated endpoint whenever
    that model is listed as available, falling back to the generic
    /openai/v1 endpoint otherwise. Runs as a daemon thread.
    """
    while True:
        previous = openai.api_base
        try:
            # Bounded timeout so a hung server cannot stall the poll loop forever.
            r = requests.get(API_BASE + '/stats', timeout=10).json()
            # Membership on the dict itself is equivalent to .keys().
            if PRIMARY_MODEL_CHOICE in r['models']['choices']:
                openai.api_base = API_BASE + '/openai/' + PRIMARY_MODEL_CHOICE + '/v1'
            else:
                openai.api_base = API_BASE + '/openai/v1'
        except Exception:
            # A bare `except:` would also swallow SystemExit/KeyboardInterrupt;
            # catch Exception, log it, and fall back to the default endpoint.
            traceback.print_exc()
            openai.api_base = API_BASE + '/openai/v1'
        if openai.api_base != previous:
            print('Set primary model to', openai.api_base)
        time.sleep(10)
# Launch the endpoint-selection poller only when a preferred model is configured.
if PRIMARY_MODEL_CHOICE:
    poller = Thread(target=background, daemon=True)
    poller.start()
    print('Started the background thread.')

# A system prompt can be injected into the very first spot in the context.
# When the user sends a message containing CONTEXT_TRIGGER_PHRASE, the text
# held in CONTEXT_TRIGGER_INJECTION is inserted as a leading system message.
# Setting CONTEXT_TRIGGER_PHRASE also adds it to the selectable examples.
CONTEXT_TRIGGER_PHRASE = os.getenv('CONTEXT_TRIGGER_PHRASE')
CONTEXT_TRIGGER_INJECTION = os.getenv('CONTEXT_TRIGGER_INJECTION')

# The server ignores the API key; point the client at the generic endpoint
# until the background poller (if running) selects a model-specific one.
openai.api_key = 'null'
openai.api_base = API_BASE + '/openai/v1'
def stream_response(prompt, history):
    """Gradio ChatInterface callback: stream the assistant's reply.

    Rebuilds an OpenAI-style message list from ``history`` (a list of
    ``(user, assistant)`` pairs), optionally inserts the configured system
    prompt when the trigger phrase appears anywhere in the conversation,
    then yields the accumulated reply text as streaming chunks arrive.

    Raises gr.Error if the inference endpoint cannot be reached.
    """
    def _triggered(text):
        # Both env vars must be set before scanning for the phrase; without
        # the PHRASE check, `None in text` would raise TypeError when only
        # CONTEXT_TRIGGER_INJECTION is configured.
        return bool(CONTEXT_TRIGGER_INJECTION and CONTEXT_TRIGGER_PHRASE
                    and CONTEXT_TRIGGER_PHRASE in text)

    messages = []
    do_injection = False
    for human, assistant in history:
        messages.append({'role': 'user', 'content': str(human)})
        messages.append({'role': 'assistant', 'content': str(assistant)})
        if _triggered(str(human)):
            do_injection = True
    messages.append({'role': 'user', 'content': prompt})
    if do_injection or _triggered(prompt):
        messages.insert(0, {'role': 'system', 'content': CONTEXT_TRIGGER_INJECTION})

    try:
        response = openai.ChatCompletion.create(
            model='0',
            messages=messages,
            temperature=0,
            max_tokens=300,
            stream=True,
            headers={'LLM-Source': 'huggingface-demo'}
        )
    except Exception:
        raise gr.Error("Failed to reach inference endpoint.")

    message = ''
    for chunk in response:
        # Some stream deltas carry only role metadata (a non-empty dict with
        # no 'content' key); .get avoids the KeyError the original hit there.
        content = chunk['choices'][0]['delta'].get('content')
        if content:
            message += content
            yield message
# Seed the example prompts; the trigger phrase (when configured) goes first.
examples = ["hello"]
if CONTEXT_TRIGGER_PHRASE:
    examples = [CONTEXT_TRIGGER_PHRASE] + examples

with gr.Blocks(analytics_enabled=False) as demo:
    gr.ChatInterface(
        stream_response,
        examples=examples,
        title=APP_TITLE,
        analytics_enabled=False,
        cache_examples=False,
        css='#component-0{height:100%!important}',
    )
    if TRACKING_CODE:
        print('Inserting tracking code')
        gr.HTML(TRACKING_CODE)

demo.queue(concurrency_count=1, api_open=False).launch(show_api=False)