From b13fda722a019b5f786fa8d5b45a4814ca86048c Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Sun, 27 Oct 2024 12:13:26 -0600
Subject: [PATCH] archive project

---
 README.md | 4 +++-
 server.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e2089d4..d3e0f84 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@
 
 _An HTTP API to serve local LLM Models._
 
+**ARCHIVED PROJECT:** this project was created before any good solution existed for managing LLM endpoints and has now been superseded by many good options. [LiteLLM](https://github.com/BerriAI/litellm) is the best replacement. If a need for an un-authenticated public model arises, check out [cyberes/litellm-public](https://git.evulid.cc/cyberes/litellm-public).
+
 The purpose of this server is to abstract your LLM backend from your frontend API. This enables you to
 switch your backend while providing a stable frontend clients.
 
@@ -111,4 +113,4 @@ Then, update the VLLM version in `requirements.txt`.
 - [ ] Make sure stats work when starting from an empty database
 - [ ] Make sure we're correctly canceling requests when the client cancels. The blocking endpoints can't detect when a client cancels generation.
 - [ ] Add test to verify the OpenAI endpoint works as expected
-- [ ] Document the `Llm-Disable-Openai` header
\ No newline at end of file
+- [ ] Document the `Llm-Disable-Openai` header
diff --git a/server.py b/server.py
index 70c7333..74c26ad 100644
--- a/server.py
+++ b/server.py
@@ -21,6 +21,7 @@ from llm_server.sock import init_wssocket
 # TODO: is frequency penalty the same as ooba repetition penalty???
 # TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
 # TODO: insert pydantic object into database
+# TODO: figure out blocking API disconnect https://news.ycombinator.com/item?id=41168033
 
 # Lower priority
 # TODO: if a backend is at its limit of concurrent requests, choose a different one
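
A note on the new TODO in `server.py` ("figure out blocking API disconnect"): the linked Hacker News thread concerns detecting client disconnects during long-running requests, the same problem the README TODO describes ("The blocking endpoints can't detect when a client cancels generation"). The sketch below is one possible approach, not this project's actual implementation: it assumes an ASGI stack (FastAPI/Starlette), which may differ from this repo's real framework, and polls `Request.is_disconnected()` while generation runs so the backend work can be cancelled when the client goes away.

```python
# Minimal sketch, assuming FastAPI/Starlette (an assumption; this repo's
# actual stack may differ). The route path and fake_generate() helper are
# hypothetical, for illustration only.
import asyncio

from fastapi import FastAPI, Request

app = FastAPI()


async def fake_generate(prompt: str) -> str:
    # Stand-in for a slow LLM backend call.
    await asyncio.sleep(30)
    return f"response to: {prompt}"


@app.post("/v1/generate")
async def generate(request: Request):
    body = await request.json()
    task = asyncio.create_task(fake_generate(body.get("prompt", "")))
    # Poll for client disconnect while generation runs, so a cancelled
    # client frees the backend's concurrent-request slot instead of
    # wasting compute on a response nobody will read.
    while not task.done():
        if await request.is_disconnected():
            task.cancel()
            return {"error": "client disconnected"}
        await asyncio.sleep(0.5)
    return {"text": task.result()}
```

Under plain WSGI there is no equivalent mid-handler disconnect signal, which is presumably why the blocking endpoints here cannot detect cancellation.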