# local-llm-server/config/config.yml.sample

## Main ##
frontend_api_mode: ooba
cluster:
  - backend_url: http://1.2.3.4:7000
    concurrent_gens: 3
    mode: vllm
    # A higher priority number marks a backend as a fallback: if the backends with
    # lower priority numbers fail, the proxy falls back to the backends with
    # greater priority numbers.
    priority: 16
  - backend_url: http://4.5.6.7:9107
    concurrent_gens: 3
    mode: vllm
    priority: 10
  - backend_url: http://7.8.9.0:9208
    concurrent_gens: 3
    mode: vllm
    priority: 10

# If enabled, the "priority" values of the backends are ignored and the backends
# are prioritized by their estimated parameter count instead. For example, a 70b
# model will have a higher priority than a 13b.
prioritize_by_size: true
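# As a rough illustration of the priority settings above (assuming
# prioritize_by_size is disabled): requests would be routed to the two
# priority-10 backends first, and the priority-16 backend would only be
# used as a fallback if they fail.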
# The token used to access various administration endpoints.
admin_token: password1234567

# How many requests a single IP is allowed to have in the queue at once.
# If an IP tries to queue more than this, the new request will be rejected
# until the other(s) have completed.
simultaneous_requests_per_ip: 1
# The connection details for your MySQL database.
mysql:
  host: 127.0.0.1
  username: localllm
  password: 'password1234'
  database: localllm
# Manually set the HTTP host shown to the clients.
# Comment out to auto-detect.
# http_host: https://example.com
# Where the server will write its logs to.
webserver_log_directory: /var/log/localllm

## Optional ##

# Include SYSTEM tokens in the stats calculation.
# Applies to average_generation_elapsed_sec and estimated_avg_tps.
include_system_tokens_in_stats: true

# Run a background thread to cache the homepage. The homepage has to load a lot
# of data, so it's good to keep it cached. The thread will call whatever is set
# as the base API URL.
background_homepage_cacher: true
# The maximum amount of tokens a client is allowed to generate.
max_new_tokens: 500
# Enable/disable streaming.
enable_streaming: true
# Show the backends that the server is configured to use. Disable this to hide them on the public homepage.
show_backends: true
# Log all prompt inputs and outputs.
log_prompts: false
# Whether to verify SSL certificates in HTTP requests made by the server.
# Setting this to false disables verification.
verify_ssl: false
# Require a valid API key for all inference requests.
auth_required: false

# Name of your proxy, shown to clients.
llm_middleware_name: proxy.example.co

# Override the name of the model shown to clients. Comment out to auto-detect.
# manual_model_name: testing123

# JS tracking code to add to the home page.
# analytics_tracking_code: |
#   var test = 123;
#   alert(test);

# HTML to add under the "Estimated Wait Time" line.
info_html: |
  If you are having issues with ratelimiting, try using streaming.
# Enable/disable the OpenAI-compatible endpoint.
enable_openi_compatible_backend: true
# Your OpenAI API key. Only used for the moderation API and fetching data.
openai_api_key: sk-123456
# Enable/disable the endpoint that shows the system prompt sent to the AI when calling the OpenAI-compatible endpoint.
expose_openai_system_prompt: true
# Should we show our model in the OpenAI-compatible API, or simulate OpenAI's models?
# If false, make sure you set openai_api_key, since the real OpenAI models response
# will be cloned.
openai_expose_our_model: false
# Add the string "###" to the stop string to prevent the AI from trying to speak as other characters.
openai_force_no_hashes: true
# Enable moderating requests via OpenAI's moderation endpoint.
openai_moderation_enabled: true
# Don't wait longer than this many seconds for the moderation request
# to OpenAI to complete.
openai_moderation_timeout: 5
# Send the last N messages in an OpenAI request to the moderation endpoint.
openai_moderation_scan_last_n: 5
# The organization name to tell the LLM on the OpenAI endpoint so it can better simulate OpenAI's response.
openai_org_name: OpenAI
# Silently trim prompts sent to the OpenAI endpoint so they fit within the model's context length.
openai_silent_trim: true
# Set the system prompt for the OpenAI-compatible endpoint. Comment out to use the default.
#openai_system_prompt: |
# You are an assistant chatbot. Your main function is to provide accurate and helpful responses to the user's queries. You should always be polite, respectful, and patient. You should not provide any personal opinions or advice unless specifically asked by the user. You should not make any assumptions about the user's knowledge or abilities. You should always strive to provide clear and concise answers. If you do not understand a user's query, ask for clarification. If you cannot provide an answer, apologize and suggest the user seek help elsewhere.\nLines that start with "### ASSISTANT" were messages you sent previously.\nLines that start with "### USER" were messages sent by the user you are chatting with.\nYou will respond to the "### RESPONSE:" prompt as the assistant and follow the instructions given by the user.\n\n

## Tuneables ##

# Path that is shown to users for them to connect to.
frontend_api_client: /api
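# As an illustration: assuming the proxy is served at https://proxy.example.co
# (the llm_middleware_name above), clients would be pointed at
# https://proxy.example.co/api.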

# How to calculate the average generation time.
# Valid options: database, minute
# "database" calculates the average from historical data in the database, with more recent data weighted more heavily.
# "minute" calculates it from the last minute of data.
average_generation_time_mode: database

## STATS ##
# These options control what is shown on the stats endpoint.

# Display the total_proompts item on the stats screen.
show_num_prompts: true

# Load the number of prompts from the database to display on the stats page.
# If enabled, count all prompts in the database. If disabled, only count the prompts since the server started.
load_num_prompts: true
# Display the uptime item on the stats screen.
show_uptime: true
# Display the total number of tokens generated.
show_total_output_tokens: true
# Timeout for backend generate requests, in seconds.
backend_generate_request_timeout: 95
# Redis stream timeout, in milliseconds.
redis_stream_timeout: 25000