Support `HF_TOKEN` environment variable (#2066)
* Support HF_TOKEN environment variable * Load test. --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
parent
405765b18c
commit
3447c722fd
|
@ -178,6 +178,6 @@ jobs:
|
||||||
export DOCKER_VOLUME=/mnt/cache
|
export DOCKER_VOLUME=/mnt/cache
|
||||||
export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
|
export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
|
||||||
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
|
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
|
||||||
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||||
echo $DOCKER_IMAGE
|
echo $DOCKER_IMAGE
|
||||||
pytest -s -vv integration-tests
|
pytest -s -vv integration-tests
|
||||||
|
|
|
@ -11,66 +11,24 @@ on:
|
||||||
- 'main'
|
- 'main'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
start-runner:
|
|
||||||
name: Start self-hosted EC2 runner
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
env:
|
|
||||||
AWS_REGION: eu-central-1
|
|
||||||
EC2_AMI_ID: ami-0ab09c07cfd194259
|
|
||||||
EC2_INSTANCE_TYPE: g5.12xlarge
|
|
||||||
EC2_SUBNET_ID: subnet-988fd9f2,subnet-6f56db13,subnet-6a039326
|
|
||||||
EC2_SECURITY_GROUP: sg-072f92ae3082936c6
|
|
||||||
outputs:
|
|
||||||
label: ${{ steps.start-ec2-runner.outputs.label }}
|
|
||||||
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
|
|
||||||
steps:
|
|
||||||
- name: Configure AWS credentials
|
|
||||||
uses: aws-actions/configure-aws-credentials@v1
|
|
||||||
with:
|
|
||||||
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
|
||||||
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
|
||||||
aws-region: ${{ env.AWS_REGION }}
|
|
||||||
- name: Start EC2 runner
|
|
||||||
id: start-ec2-runner
|
|
||||||
uses: philschmid/philschmid-ec2-github-runner@main
|
|
||||||
with:
|
|
||||||
mode: start
|
|
||||||
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
|
||||||
ec2-image-id: ${{ env.EC2_AMI_ID }}
|
|
||||||
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
|
|
||||||
subnet-id: ${{ env.EC2_SUBNET_ID }}
|
|
||||||
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
|
|
||||||
aws-resource-tags: > # optional, requires additional permissions
|
|
||||||
[
|
|
||||||
{"Key": "Name", "Value": "ec2-tgi-github-runner"},
|
|
||||||
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
|
|
||||||
]
|
|
||||||
|
|
||||||
load-tests:
|
load-tests:
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
needs: start-runner # required to start the main job when the runner is ready
|
runs-on: [self-hosted, nvidia-gpu , multi-gpu, 4-a10, ci]
|
||||||
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
|
|
||||||
env:
|
env:
|
||||||
DOCKER_VOLUME: /cache
|
DOCKER_VOLUME: /cache
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
- name: Prepare disks
|
|
||||||
run: |
|
|
||||||
sudo mkfs -t ext4 /dev/nvme1n1
|
|
||||||
sudo mkdir ${{ env.DOCKER_VOLUME }}
|
|
||||||
sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }}
|
|
||||||
|
|
||||||
- name: Install k6
|
- name: Install k6
|
||||||
run: |
|
run: |
|
||||||
curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
|
curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1
|
||||||
|
|
||||||
- name: Start starcoder
|
- name: Start starcoder
|
||||||
run: |
|
run: |
|
||||||
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
|
docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v /mnt/cache:/data -e HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768
|
||||||
sleep 10
|
sleep 10
|
||||||
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
|
wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health
|
||||||
|
|
||||||
|
@ -82,27 +40,3 @@ jobs:
|
||||||
if: ${{ always() }}
|
if: ${{ always() }}
|
||||||
run: |
|
run: |
|
||||||
docker stop tgi-starcoder || true
|
docker stop tgi-starcoder || true
|
||||||
|
|
||||||
stop-runner:
|
|
||||||
name: Stop self-hosted EC2 runner
|
|
||||||
needs:
|
|
||||||
- start-runner
|
|
||||||
- load-tests
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
env:
|
|
||||||
AWS_REGION: eu-central-1
|
|
||||||
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
|
|
||||||
steps:
|
|
||||||
- name: Configure AWS credentials
|
|
||||||
uses: aws-actions/configure-aws-credentials@v1
|
|
||||||
with:
|
|
||||||
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
|
||||||
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
|
||||||
aws-region: ${{ env.AWS_REGION }}
|
|
||||||
- name: Stop EC2 runner
|
|
||||||
uses: philschmid/philschmid-ec2-github-runner@main
|
|
||||||
with:
|
|
||||||
mode: stop
|
|
||||||
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
|
|
||||||
label: ${{ needs.start-runner.outputs.label }}
|
|
||||||
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
|
|
||||||
|
|
|
@ -72,7 +72,7 @@ jobs:
|
||||||
- name: Run server tests
|
- name: Run server tests
|
||||||
run: |
|
run: |
|
||||||
pip install pytest
|
pip install pytest
|
||||||
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
export HF_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||||
pytest -s -vv server/tests
|
pytest -s -vv server/tests
|
||||||
- name: Pre-commit checks
|
- name: Pre-commit checks
|
||||||
run: |
|
run: |
|
||||||
|
|
|
@ -105,14 +105,14 @@ The Swagger UI is also available at: [https://huggingface.github.io/text-generat
|
||||||
|
|
||||||
### Using a private or gated model
|
### Using a private or gated model
|
||||||
|
|
||||||
You have the option to utilize the `HUGGING_FACE_HUB_TOKEN` environment variable for configuring the token employed by
|
You have the option to utilize the `HF_TOKEN` environment variable for configuring the token employed by
|
||||||
`text-generation-inference`. This allows you to gain access to protected resources.
|
`text-generation-inference`. This allows you to gain access to protected resources.
|
||||||
|
|
||||||
For example, if you want to serve the gated Llama V2 model variants:
|
For example, if you want to serve the gated Llama V2 model variants:
|
||||||
|
|
||||||
1. Go to https://huggingface.co/settings/tokens
|
1. Go to https://huggingface.co/settings/tokens
|
||||||
2. Copy your cli READ token
|
2. Copy your cli READ token
|
||||||
3. Export `HUGGING_FACE_HUB_TOKEN=<your cli READ token>`
|
3. Export `HF_TOKEN=<your cli READ token>`
|
||||||
|
|
||||||
or with Docker:
|
or with Docker:
|
||||||
|
|
||||||
|
@ -121,7 +121,7 @@ model=meta-llama/Llama-2-7b-chat-hf
|
||||||
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
|
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
|
||||||
token=<your cli READ token>
|
token=<your cli READ token>
|
||||||
|
|
||||||
docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
|
docker run --gpus all --shm-size 1g -e HF_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model
|
||||||
```
|
```
|
||||||
|
|
||||||
### A note on Shared Memory (shm)
|
### A note on Shared Memory (shm)
|
||||||
|
|
|
@ -147,7 +147,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
tracing::info!("Downloading tokenizer");
|
tracing::info!("Downloading tokenizer");
|
||||||
|
|
||||||
// Parse Huggingface hub token
|
// Parse Huggingface hub token
|
||||||
let auth_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
|
let auth_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();
|
||||||
|
|
||||||
// Download and instantiate tokenizer
|
// Download and instantiate tokenizer
|
||||||
// We need to download it outside of the Tokio runtime
|
// We need to download it outside of the Tokio runtime
|
||||||
|
|
|
@ -2,13 +2,13 @@
|
||||||
|
|
||||||
If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)
|
If the model you wish to serve is behind gated access or the model repository on Hugging Face Hub is private, and you have access to the model, you can provide your Hugging Face Hub access token. You can generate and copy a read token from [Hugging Face Hub tokens page](https://huggingface.co/settings/tokens)
|
||||||
|
|
||||||
If you're using the CLI, set the `HUGGING_FACE_HUB_TOKEN` environment variable. For example:
|
If you're using the CLI, set the `HF_TOKEN` environment variable. For example:
|
||||||
|
|
||||||
```
|
```
|
||||||
export HUGGING_FACE_HUB_TOKEN=<YOUR READ TOKEN>
|
export HF_TOKEN=<YOUR READ TOKEN>
|
||||||
```
|
```
|
||||||
|
|
||||||
If you would like to do it through Docker, you can provide your token by specifying `HUGGING_FACE_HUB_TOKEN` as shown below.
|
If you would like to do it through Docker, you can provide your token by specifying `HF_TOKEN` as shown below.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
model=meta-llama/Llama-2-7b-chat-hf
|
model=meta-llama/Llama-2-7b-chat-hf
|
||||||
|
@ -17,7 +17,7 @@ token=<your READ token>
|
||||||
|
|
||||||
docker run --gpus all \
|
docker run --gpus all \
|
||||||
--shm-size 1g \
|
--shm-size 1g \
|
||||||
-e HUGGING_FACE_HUB_TOKEN=$token \
|
-e HF_TOKEN=$token \
|
||||||
-p 8080:80 \
|
-p 8080:80 \
|
||||||
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
|
-v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0.4 \
|
||||||
--model-id $model
|
--model-id $model
|
||||||
|
|
|
@ -1,38 +1,38 @@
|
||||||
import sys
|
|
||||||
import subprocess
|
|
||||||
import contextlib
|
|
||||||
import pytest
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import contextlib
|
||||||
import docker
|
|
||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import random
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from docker.errors import NotFound
|
import docker
|
||||||
from typing import Optional, List, Dict
|
import pytest
|
||||||
from syrupy.extensions.json import JSONSnapshotExtension
|
|
||||||
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
|
from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError
|
||||||
|
from docker.errors import NotFound
|
||||||
|
from syrupy.extensions.json import JSONSnapshotExtension
|
||||||
from text_generation import AsyncClient
|
from text_generation import AsyncClient
|
||||||
from text_generation.types import (
|
from text_generation.types import (
|
||||||
Response,
|
|
||||||
Details,
|
|
||||||
InputToken,
|
|
||||||
Token,
|
|
||||||
BestOfSequence,
|
BestOfSequence,
|
||||||
Grammar,
|
|
||||||
ChatComplete,
|
ChatComplete,
|
||||||
ChatCompletionChunk,
|
ChatCompletionChunk,
|
||||||
ChatCompletionComplete,
|
ChatCompletionComplete,
|
||||||
Completion,
|
Completion,
|
||||||
|
Details,
|
||||||
|
Grammar,
|
||||||
|
InputToken,
|
||||||
|
Response,
|
||||||
|
Token,
|
||||||
)
|
)
|
||||||
|
|
||||||
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
|
DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", None)
|
||||||
HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None)
|
HF_TOKEN = os.getenv("HF_TOKEN", None)
|
||||||
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
|
DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data")
|
||||||
DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
|
DOCKER_DEVICES = os.getenv("DOCKER_DEVICES")
|
||||||
|
|
||||||
|
@ -447,8 +447,8 @@ def launcher(event_loop):
|
||||||
if not use_flash_attention:
|
if not use_flash_attention:
|
||||||
env["USE_FLASH_ATTENTION"] = "false"
|
env["USE_FLASH_ATTENTION"] = "false"
|
||||||
|
|
||||||
if HUGGING_FACE_HUB_TOKEN is not None:
|
if HF_TOKEN is not None:
|
||||||
env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN
|
env["HF_TOKEN"] = HF_TOKEN
|
||||||
|
|
||||||
volumes = []
|
volumes = []
|
||||||
if DOCKER_VOLUME:
|
if DOCKER_VOLUME:
|
||||||
|
|
|
@ -592,7 +592,7 @@ fn shard_manager(
|
||||||
|
|
||||||
// Parse Inference API token
|
// Parse Inference API token
|
||||||
if let Ok(api_token) = env::var("HF_API_TOKEN") {
|
if let Ok(api_token) = env::var("HF_API_TOKEN") {
|
||||||
envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
|
envs.push(("HF_TOKEN".into(), api_token.into()))
|
||||||
};
|
};
|
||||||
|
|
||||||
// Detect rope scaling
|
// Detect rope scaling
|
||||||
|
@ -925,7 +925,7 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
|
||||||
|
|
||||||
// Parse Inference API token
|
// Parse Inference API token
|
||||||
if let Ok(api_token) = env::var("HF_API_TOKEN") {
|
if let Ok(api_token) = env::var("HF_API_TOKEN") {
|
||||||
envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
|
envs.push(("HF_TOKEN".into(), api_token.into()))
|
||||||
};
|
};
|
||||||
|
|
||||||
// If args.weights_cache_override is some, pass it to the download process
|
// If args.weights_cache_override is some, pass it to the download process
|
||||||
|
@ -1227,7 +1227,7 @@ fn spawn_webserver(
|
||||||
|
|
||||||
// Parse Inference API token
|
// Parse Inference API token
|
||||||
if let Ok(api_token) = env::var("HF_API_TOKEN") {
|
if let Ok(api_token) = env::var("HF_API_TOKEN") {
|
||||||
envs.push(("HUGGING_FACE_HUB_TOKEN".into(), api_token.into()))
|
envs.push(("HF_TOKEN".into(), api_token.into()))
|
||||||
};
|
};
|
||||||
|
|
||||||
// Parse Compute type
|
// Parse Compute type
|
||||||
|
|
|
@ -156,7 +156,7 @@ async fn main() -> Result<(), RouterError> {
|
||||||
});
|
});
|
||||||
|
|
||||||
// Parse Huggingface hub token
|
// Parse Huggingface hub token
|
||||||
let authorization_token = std::env::var("HUGGING_FACE_HUB_TOKEN").ok();
|
let authorization_token = std::env::var("HF_TOKEN").or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")).ok();
|
||||||
|
|
||||||
// Tokenizer instance
|
// Tokenizer instance
|
||||||
// This will only be used to validate payloads
|
// This will only be used to validate payloads
|
||||||
|
|
Loading…
Reference in New Issue