fix(docker): fix api-inference deployment (#30)
This commit is contained in:
parent
f9d0ec376a
commit
ab2ad91da3
|
@ -31,6 +31,7 @@ ENV LANG=C.UTF-8 \
|
||||||
QUANTIZE=false \
|
QUANTIZE=false \
|
||||||
NUM_GPUS=1 \
|
NUM_GPUS=1 \
|
||||||
SAFETENSORS_FAST_GPU=1 \
|
SAFETENSORS_FAST_GPU=1 \
|
||||||
|
PORT=80 \
|
||||||
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
|
||||||
NCCL_ASYNC_ERROR_HANDLING=1 \
|
NCCL_ASYNC_ERROR_HANDLING=1 \
|
||||||
CUDA_HOME=/usr/local/cuda \
|
CUDA_HOME=/usr/local/cuda \
|
||||||
|
@ -70,4 +71,4 @@ COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/loca
|
||||||
# Install launcher
|
# Install launcher
|
||||||
COPY --from=launcher-builder /usr/local/cargo/bin/text-generation-launcher /usr/local/bin/text-generation-launcher
|
COPY --from=launcher-builder /usr/local/cargo/bin/text-generation-launcher /usr/local/bin/text-generation-launcher
|
||||||
|
|
||||||
CMD HUGGINGFACE_HUB_CACHE=$MODEL_BASE_PATH text-generation-launcher --num-shard $NUM_GPUS --json-output
|
CMD HUGGINGFACE_HUB_CACHE=$MODEL_BASE_PATH text-generation-launcher --num-shard $NUM_GPUS --model-name $MODEL_ID --json-output
|
|
@ -11,13 +11,13 @@ environment:
|
||||||
image: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference:0.3.1
|
image: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference:0.3.1
|
||||||
inference_config:
|
inference_config:
|
||||||
liveness_route:
|
liveness_route:
|
||||||
port: 3000
|
port: 80
|
||||||
path: /health
|
path: /health
|
||||||
readiness_route:
|
readiness_route:
|
||||||
port: 3000
|
port: 80
|
||||||
path: /health
|
path: /health
|
||||||
scoring_route:
|
scoring_route:
|
||||||
port: 3000
|
port: 80
|
||||||
path: /generate
|
path: /generate
|
||||||
instance_type: Standard_ND96amsr_A100_v4
|
instance_type: Standard_ND96amsr_A100_v4
|
||||||
request_settings:
|
request_settings:
|
||||||
|
|
Loading…
Reference in New Issue