# local-llm-server/config/config.yml.sample

## Main ##
frontend_api_mode: ooba
cluster:
  - backend_url: http://1.2.3.4:7000
    concurrent_gens: 3
    mode: vllm
    # A higher priority number marks a backend as a fallback: if the
    # lower-numbered backends fail, the proxy falls back to backends
    # with greater priority numbers.
    priority: 16
  - backend_url: http://4.5.6.7:9107
    concurrent_gens: 3
    mode: vllm
    priority: 10
  - backend_url: http://7.8.9.0:9208
    concurrent_gens: 3
    mode: vllm
    priority: 10
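# With the example priorities above, requests are routed to the two
# priority-10 backends first; the priority-16 backend is only used as a
# fallback if they fail.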
# If enabled, the "priority" values of the backends are ignored and the
# backends are instead prioritized by their estimated parameter count.
# For example, a 70b model will have a higher priority than a 13b.
prioritize_by_size: true
# The token used to access various administration endpoints.
admin_token: password1234567
# How many requests a single IP is allowed to put in the queue.
# If an IP tries to put more than this their request will be rejected
# until the other(s) are completed.
simultaneous_requests_per_ip: 1
# The connection details for your MySQL database.
mysql:
  host: 127.0.0.1
  username: localllm
  password: 'password1234'
  database: localllm
# Manually set the HTTP host shown to the clients.
# Comment out to auto-detect.
# http_host: https://example.com
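# For example, you may want to set this if the server runs behind a reverse
# proxy and auto-detection would otherwise report the internal address.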
# Where the server will write its logs to.
webserver_log_directory: /var/log/localllm
## Optional ##
# Include SYSTEM tokens in the stats calculation.
# Applies to average_generation_elapsed_sec and estimated_avg_tps.
include_system_tokens_in_stats: true
# Run a background thread to cache the homepage. The homepage has to load
# a lot of data, so it's good to keep it cached. The thread will call
# whatever is set as the base API URL.
background_homepage_cacher: true
# The maximum amount of tokens a client is allowed to generate.
max_new_tokens: 500
# Enable/disable streaming.
enable_streaming: true
# Show the backends that the server is configured to use. Disable this to hide them on the public homepage.
show_backends: true
# Log all prompt inputs and outputs.
log_prompts: false
# Whether to verify SSL certificates in all HTTP requests made by the server.
# Set to false to disable verification.
verify_ssl: false
# Require a valid API key for all inference requests.
auth_required: false
# Name of your proxy, shown to clients.
llm_middleware_name: proxy.example.co
# Override the name of the model shown to clients. Comment out to auto-detect.
# manual_model_name: testing123
# JS tracking code to add to the home page.
# analytics_tracking_code: |
#   var test = 123;
#   alert(test);
# HTML to add under the "Estimated Wait Time" line.
info_html: |
  If you are having issues with ratelimiting, try using streaming.
# Enable/disable the OpenAI-compatible endpoint.
enable_openi_compatible_backend: true
# Your OpenAI API key. Only used for the moderation API and fetching data.
openai_api_key: sk-123456
# Enable/disable the endpoint that shows the system prompt sent to the AI when calling the OpenAI-compatible endpoint.
expose_openai_system_prompt: true
# Should we show our model in the OpenAI API, or simulate OpenAI's model list?
# If false, make sure you set openai_api_key since the actual OpenAI models
# response will be cloned.
openai_expose_our_model: false
# Add the string "###" to the stop string to prevent the AI from trying to speak as other characters.
openai_force_no_hashes: true
# Enable moderating requests via OpenAI's moderation endpoint.
openai_moderation_enabled: true
# Don't wait longer than this many seconds for the moderation request
# to OpenAI to complete.
openai_moderation_timeout: 5
# Send the last N messages in an OpenAI request to the moderation endpoint.
openai_moderation_scan_last_n: 5
# The organization name to tell the LLM on the OpenAI endpoint so it can better simulate OpenAI's response.
openai_org_name: OpenAI
# Silently trim prompts sent to the OpenAI-compatible endpoint to fit the model's context length.
openai_silent_trim: true
# Set the system prompt for the OpenAI-compatible endpoint. Comment out to use the default.
# openai_system_prompt: |
#   You are an assistant chatbot. Your main function is to provide accurate and helpful responses to the user's queries. You should always be polite, respectful, and patient. You should not provide any personal opinions or advice unless specifically asked by the user. You should not make any assumptions about the user's knowledge or abilities. You should always strive to provide clear and concise answers. If you do not understand a user's query, ask for clarification. If you cannot provide an answer, apologize and suggest the user seek help elsewhere.\nLines that start with "### ASSISTANT" were messages you sent previously.\nLines that start with "### USER" were messages sent by the user you are chatting with.\nYou will respond to the "### RESPONSE:" prompt as the assistant and follow the instructions given by the user.\n\n
## Tuneables ##
# The API path that clients are told to connect to.
frontend_api_client: /api
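# For example, with http_host set to https://example.com, clients would
# connect to https://example.com/api.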
# How to calculate the average generation time.
# Valid options: database, minute
# "database" calculates average from historical data in the database, with the more recent data weighted more.
# "minute" calculates it from the last minute of data.
average_generation_time_mode: database
## STATS ##
# These options control what is shown on the stats endpoint.
# Display the total_proompts item on the stats screen.
show_num_prompts: true
# Load the number of prompts from the database to display on the stats page.
# If enabled, count all prompts in the database. If disabled, only count the prompts since the server started.
load_num_prompts: true
# Display the uptime item on the stats screen.
show_uptime: true
# Display the total number of tokens generated.
show_total_output_tokens: true