From f4e5b5275dd41bce0ef595746047dc9faff90dad Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Wed, 11 Oct 2023 09:09:41 -0600
Subject: [PATCH] test

---
 daemon.py                        |  4 ++--
 llm_server/workers/inferencer.py |  2 ++
 llm_server/workers/threader.py   |  1 -
 other/gradio/gradio_chat.py      | 36 +++++++++++++++++++++-----------
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/daemon.py b/daemon.py
index 35c1d59..9e0d5a9 100644
--- a/daemon.py
+++ b/daemon.py
@@ -20,8 +20,8 @@ else:
     config_path = Path(script_path, 'config', 'config.yml')
 
 if __name__ == "__main__":
-    Redis().flushall()
-    print('Flushed Redis.')
+    # Redis().flushall()
+    # print('Flushed Redis.')
 
     success, config, msg = load_config(config_path)
     if not success:
diff --git a/llm_server/workers/inferencer.py b/llm_server/workers/inferencer.py
index 324c13a..f06ff8c 100644
--- a/llm_server/workers/inferencer.py
+++ b/llm_server/workers/inferencer.py
@@ -19,6 +19,8 @@ def worker(backend_url):
         increment_ip_count(client_ip, 'processing_ips')
         incr_active_workers(selected_model, backend_url)
 
+        print('Worker starting processing for', client_ip)
+
         try:
             if not request_json_body:
                 # This was a dummy request from the streaming handlers.
diff --git a/llm_server/workers/threader.py b/llm_server/workers/threader.py
index 1f5266f..f19ce1c 100644
--- a/llm_server/workers/threader.py
+++ b/llm_server/workers/threader.py
@@ -44,7 +44,6 @@ def start_background():
     t.start()
     print('Started the console printer.')
 
-    redis_running_models.flush()
     t = Thread(target=cluster_worker)
     t.daemon = True
     t.start()
diff --git a/other/gradio/gradio_chat.py b/other/gradio/gradio_chat.py
index e8e54b2..fa1b892 100644
--- a/other/gradio/gradio_chat.py
+++ b/other/gradio/gradio_chat.py
@@ -12,6 +12,12 @@ if not API_BASE:
     print('Must set the secret variable API_BASE to your https://your-site/api/openai/v1')
     sys.exit(1)
 
+BACKUP_API_BASE = os.getenv('BACKUP_API_BASE')
+if BACKUP_API_BASE:
+    print('Using BACKUP_API_BASE:', BACKUP_API_BASE)
+
+APP_TITLE = os.getenv('APP_TITLE')
+
 # A system prompt can be injected into the very first spot in the context.
 # If the user sends a message that contains the CONTEXT_TRIGGER_PHRASE,
 # the content in CONTEXT_TRIGGER_INJECTION will be injected.
@@ -37,16 +43,22 @@ def stream_response(prompt, history):
     if do_injection or (CONTEXT_TRIGGER_INJECTION and CONTEXT_TRIGGER_PHRASE in prompt):
         messages.insert(0, {'role': 'system', 'content': CONTEXT_TRIGGER_INJECTION})
 
-    try:
-        response = openai.ChatCompletion.create(
-            model='0',
-            messages=messages,
-            temperature=0,
-            max_tokens=300,
-            stream=True
-        )
-    except Exception:
-        raise gr.Error("Failed to reach inference endpoint.")
+    for _ in range(2):
+        try:
+            response = openai.ChatCompletion.create(
+                model='0',
+                messages=messages,
+                temperature=0,
+                max_tokens=300,
+                stream=True
+            )
+            break
+        except Exception:
+            openai.api_base = BACKUP_API_BASE
+            raise gr.Error("Failed to reach inference endpoint.")
+
+    # Go back to the default endpoint
+    openai.api_base = API_BASE
 
     message = ''
     for chunk in response:
@@ -55,8 +67,8 @@
             yield message
 
 
-examples = ["hello", "hola", "merhaba"]
+examples = ["hello"]
 if CONTEXT_TRIGGER_PHRASE:
     examples.insert(0, CONTEXT_TRIGGER_PHRASE)
 
-gr.ChatInterface(stream_response, examples=examples, title="Chatbot Demo", analytics_enabled=False, cache_examples=False, css='#component-0{height:100%!important}').queue(concurrency_count=3).launch()
+gr.ChatInterface(stream_response, examples=examples, title=APP_TITLE, analytics_enabled=False, cache_examples=False, css='#component-0{height:100%!important}').queue(concurrency_count=1, api_open=False).launch(show_api=False)