41 lines
1.3 KiB
Python
41 lines
1.3 KiB
Python
import time
|
|
|
|
from flask import jsonify, request
|
|
|
|
from llm_server.custom_redis import flask_cache
|
|
from . import bp
|
|
from ... import opts
|
|
from ...cluster.backend import get_backends_from_model, is_valid_model
|
|
from ...cluster.cluster_config import cluster_config, get_a_cluster_backend
|
|
|
|
|
|
@bp.route('/v1/model', methods=['GET'])
@bp.route('/<model_name>/v1/model', methods=['GET'])
def get_model(model_name=None):
    """Report the served model name and how many backends provide it.

    Caching is handled by hand here rather than by a decorator: only the
    successful (200) answer is stored, so an error response is never
    replayed from the cache. Also, Cloudflare won't cache 500 errors.

    Args:
        model_name: Optional model taken from the URL. When omitted, the
            model of the backend returned by ``get_a_cluster_backend()``
            is used instead.

    Returns:
        A ``(response, status)`` tuple: 200 with ``result``,
        ``model_backend_count`` and ``timestamp`` on success, or 400 with
        ``code`` and ``msg`` when the model is not valid.
    """
    # The full request URL is part of the key, so each route variant and
    # query string gets its own cache entry.
    key = f'model_cache::{request.url}'

    hit = flask_cache.get(key)
    if hit:
        return hit

    if not model_name:
        fallback_backend = get_a_cluster_backend()
        backend_info = cluster_config.get_backend(fallback_backend)
        model_name = backend_info.get('model')

    # Error path: answer immediately and deliberately skip the cache.
    if not is_valid_model(model_name):
        error_body = {
            'code': 400,
            'msg': 'Model does not exist.',
        }
        return jsonify(error_body), 400

    backend_count = len(get_backends_from_model(model_name))
    payload = {
        # An operator-configured display name takes precedence when set.
        'result': opts.manual_model_name or model_name,
        'model_backend_count': backend_count,
        'timestamp': int(time.time()),
    }
    answer = (jsonify(payload), 200)

    # Keep the good answer for one minute.
    flask_cache.set(key, answer, timeout=60)
    return answer
|