"""Gradio chat front-end that proxies requests to an OpenAI-compatible API."""

import os
import sys
import time
import traceback
import warnings
from threading import Thread

import gradio as gr
import openai
import requests

warnings.filterwarnings("ignore")

# Base URL of the inference API, e.g. https://your-site/api (required secret).
API_BASE = os.getenv('API_BASE')
if not API_BASE:
    print('Must set the secret variable API_BASE to your https://your-site/api')
    sys.exit(1)
API_BASE = API_BASE.strip('/')

APP_TITLE = os.getenv('APP_TITLE')
PRIMARY_MODEL_CHOICE = os.getenv('PRIMARY_MODEL_CHOICE')
TRACKING_CODE = os.getenv('TRACKING_CODE')


def background():
    """Poll ``API_BASE/stats`` every 10 seconds and repoint ``openai.api_base``.

    If PRIMARY_MODEL_CHOICE is currently offered by the server, route requests
    to that model's dedicated endpoint; otherwise (or on any error) fall back
    to the generic ``/openai/v1`` endpoint. Logs whenever the base URL changes.
    Runs forever; intended to be started as a daemon thread.
    """
    while True:
        previous = openai.api_base
        try:
            # Timeout keeps a hung request from stalling the poll loop forever.
            r = requests.get(API_BASE + '/stats', timeout=10).json()
            if PRIMARY_MODEL_CHOICE in r['models']['choices']:
                openai.api_base = API_BASE + '/openai/' + PRIMARY_MODEL_CHOICE + '/v1'
            else:
                openai.api_base = API_BASE + '/openai/v1'
        except Exception:
            # Narrowed from bare `except:` so SystemExit/KeyboardInterrupt
            # still propagate. Network/JSON/schema errors: log and fall back.
            traceback.print_exc()
            openai.api_base = API_BASE + '/openai/v1'
        if openai.api_base != previous:
            print('Set primary model to', openai.api_base)
        time.sleep(10)


if PRIMARY_MODEL_CHOICE:
    t = Thread(target=background)
    t.daemon = True
    t.start()
    print('Started the background thread.')

# A system prompt can be injected into the very first spot in the context.
# If the user sends a message that contains the CONTEXT_TRIGGER_PHRASE,
# the content in CONTEXT_TRIGGER_INJECTION will be injected.
# Setting CONTEXT_TRIGGER_PHRASE will also add it to the selectable examples.
CONTEXT_TRIGGER_PHRASE = os.getenv('CONTEXT_TRIGGER_PHRASE')
CONTEXT_TRIGGER_INJECTION = os.getenv('CONTEXT_TRIGGER_INJECTION')

openai.api_key = 'null'
openai.api_base = API_BASE + '/openai/v1'


def _triggers_injection(text):
    """Return True when injection is fully configured and *text* contains the
    trigger phrase.

    Checks CONTEXT_TRIGGER_PHRASE is set before the `in` test: the original
    raised TypeError (`None in str`) when only CONTEXT_TRIGGER_INJECTION
    was configured.
    """
    return bool(
        CONTEXT_TRIGGER_INJECTION
        and CONTEXT_TRIGGER_PHRASE
        and CONTEXT_TRIGGER_PHRASE in text
    )


def stream_response(prompt, history):
    """Stream a chat completion for *prompt* given the gradio chat *history*.

    *history* is a list of (human, assistant) message pairs. Yields the
    accumulated assistant reply after each streamed chunk. Raises gr.Error
    when the inference endpoint cannot be reached.
    """
    messages = []
    do_injection = False
    for human, assistant in history:
        messages.append({'role': 'user', 'content': str(human)})
        messages.append({'role': 'assistant', 'content': str(assistant)})
        if _triggers_injection(str(human)):
            do_injection = True
    messages.append({'role': 'user', 'content': prompt})

    # Inject the system prompt at position 0 if any message held the trigger.
    if do_injection or _triggers_injection(prompt):
        messages.insert(0, {'role': 'system', 'content': CONTEXT_TRIGGER_INJECTION})

    try:
        response = openai.ChatCompletion.create(
            model='0',
            messages=messages,
            temperature=0,
            max_tokens=300,
            stream=True,
            headers={'LLM-Source': 'huggingface-demo'},
        )
    except Exception:
        raise gr.Error("Failed to reach inference endpoint.")

    message = ''
    for chunk in response:
        delta = chunk['choices'][0]['delta']
        # The first streamed delta typically carries only a 'role' key; the
        # original indexed delta['content'] and could KeyError. Skip chunks
        # with no text instead.
        if delta.get('content'):
            message += delta['content']
            yield message


examples = ["hello"]
if CONTEXT_TRIGGER_PHRASE:
    examples.insert(0, CONTEXT_TRIGGER_PHRASE)

with gr.Blocks(analytics_enabled=False) as demo:
    gr.ChatInterface(
        stream_response,
        examples=examples,
        title=APP_TITLE,
        analytics_enabled=False,
        cache_examples=False,
        css='#component-0{height:100%!important}',
    )
    if TRACKING_CODE:
        print('Inserting tracking code')
        gr.HTML(TRACKING_CODE)

demo.queue(concurrency_count=1, api_open=False).launch(show_api=False)