diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 61f7b59..9992e0a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -148,8 +148,8 @@ jobs: DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }} tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} - cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max - cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=max + cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min + cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache,mode=min # Sign the resulting Docker image digest except on PRs. # This will only write to the public Rekor transparency log when the Docker # repository is public to avoid leaking data. diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml new file mode 100644 index 0000000..10e248e --- /dev/null +++ b/.github/workflows/load_test.yaml @@ -0,0 +1,108 @@ +name: Nightly load test + +on: + schedule: + - cron: '0 0 * * 1-5' + + pull_request: + paths: + - ".github/workflows/load_test.yaml" + branches: + - 'main' + +jobs: + start-runner: + name: Start self-hosted EC2 runner + runs-on: ubuntu-latest + env: + AWS_REGION: us-east-1 + EC2_AMI_ID: ami-03cfed9ea28f4b002 + EC2_INSTANCE_TYPE: g5.12xlarge + EC2_SUBNET_ID: subnet-931b34f5,subnet-ecb993cd,subnet-943dc2d8,subnet-45371f1a,subnet-ee93e0df,subnet-fddc3dfc + EC2_SECURITY_GROUP: sg-04d472c808f365022 + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ env.AWS_REGION }} + - name: Start EC2 runner + id: start-ec2-runner + uses: philschmid/philschmid-ec2-github-runner@main + with: + mode: start + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + ec2-image-id: ${{ env.EC2_AMI_ID }} + ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }} + subnet-id: ${{ env.EC2_SUBNET_ID }} + security-group-id: ${{ env.EC2_SECURITY_GROUP }} + aws-resource-tags: > # optional, requires additional permissions + [ + {"Key": "Name", "Value": "ec2-tgi-github-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"} + ] + + load-tests: + concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + needs: start-runner # required to start the main job when the runner is ready + runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner + env: + DOCKER_VOLUME: /cache + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Prepare disks + run: | + sudo mkfs -t ext4 /dev/nvme1n1 + sudo mkdir ${{ env.DOCKER_VOLUME }} + sudo mount /dev/nvme1n1 ${{ env.DOCKER_VOLUME }} + + - name: Install k6 + run: | + curl https://github.com/grafana/k6/releases/download/v0.44.0/k6-v0.44.0-linux-amd64.tar.gz -L | tar xvz --strip-components 1 + + - name: Start starcoder + run: | + docker run --name tgi-starcoder --rm --gpus all -p 3000:80 -v ${{ env.DOCKER_VOLUME }}:/data -e HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }} --pull always -d ghcr.io/huggingface/text-generation-inference:latest --model-id bigcode/starcoder --num-shard 2 --max-batch-total-tokens 32768 + sleep 10 + wget --timeout 10 --retry-on-http-error --waitretry=1 --tries=240 http://localhost:3000/health + + - name: Run k6 + run: | + ./k6 run load_tests/starcoder_load.js + + - name: Stop starcoder + if: ${{ always() }} + run: | + docker stop tgi-starcoder || true + + stop-runner: + name: Stop self-hosted EC2 runner + needs: + - start-runner + - load-tests + runs-on: ubuntu-latest + env: + AWS_REGION: us-east-1 + if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ env.AWS_REGION }} + - name: Stop EC2 runner + uses: philschmid/philschmid-ec2-github-runner@main + with: + mode: stop + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + label: ${{ needs.start-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }} \ No newline at end of file diff --git a/k6/load_test.js b/k6/load_test.js deleted file mode 100644 index 516b566..0000000 --- a/k6/load_test.js +++ /dev/null @@ -1,98 +0,0 @@ -import http from 'k6/http'; -import {check, sleep} from 'k6'; - -export const options = { - stages: [ - {duration: '1m', target: 50}, - {duration: '2m', target: 100}, - {duration: '1m', target: 0}, - ], - hosts: { - 'text-generation-inference.huggingface.co': '127.0.0.1:3000', - }, -}; -const SLEEP_DURATION = 1; - -function greedy_example(inputs, max_new_tokens, name) { - let body = JSON.stringify({ - inputs: inputs, - parameters: { - max_new_tokens: max_new_tokens, - do_sample: false, - } - }); - let params = { - headers: { - 'Content-Type': 'application/json', - }, - tags: { - name: name - } - }; - return http.post('http://text-generation-inference.huggingface.co/generate', body, params); -} - -function sample_example(inputs, max_new_tokens, name) { - let body = JSON.stringify({ - inputs: inputs, - parameters: { - max_new_tokens: max_new_tokens, - do_sample: true, - top_p: 0.9, - seed: 0 - } - }); - let params = { - headers: { - 'Content-Type': 'application/json', - }, - tags: { - name: name - } - }; - return http.post('http://text-generation-inference.huggingface.co/generate', body, params); -} - -export default function () { - const response_1 = sample_example('A "whatpu" is a small, furry animal native to Tanzania. An example of a sentence that uses the word whatpu is: We were traveling in Africa and we saw these very cute whatpus. To do a "farduddle" means to jump up and down really fast. An example of a sentence that uses the word farduddle is:', 32, 'example-1'); - check(response_1, { - 'is status 200': (r) => r.status === 200, - }); - sleep(SLEEP_DURATION); - - const response_2 = sample_example("A poem about the beauty of science by Alfred Edgar Brittle\\nTitle: The Magic Craft\\nIn the old times", 50, "example-2"); - check(response_2, { - 'is status 200': (r) => r.status === 200, - }); - sleep(SLEEP_DURATION); - - const response_3 = greedy_example("استخراج العدد العاملي في لغة بايثون: ", 30, "example-3"); - check(response_3, { - 'is status 200': (r) => r.status === 200, - }); - sleep(SLEEP_DURATION); - - const response_4 = sample_example("Pour déguster un ortolan, il faut tout d'abord", 32, "example-4"); - check(response_4, { - 'is status 200': (r) => r.status === 200, - }); - sleep(SLEEP_DURATION); - - const response_5 = sample_example("Traduce español de España a español de Argentina\nEl coche es rojo - el auto es rojo\nEl ordenador es nuevo - la computadora es nueva\nel boligrafo es negro -", 16, "example-5"); - check(response_5, { - 'is status 200': (r) => r.status === 200, - }); - sleep(SLEEP_DURATION); - - const response_6 = sample_example("Question: If I put cheese into the fridge, will it melt?\nAnswer:", 32, "example-6"); - check(response_6, { - 'is status 200': (r) => r.status === 200, - }); - sleep(SLEEP_DURATION); - - const response_7 = greedy_example("Question: Where does the Greek Goddess Persephone spend half of the year when she is not with her mother?\nAnswer:", 24, "example-7"); - check(response_7, { - 'is status 200': (r) => r.status === 200, - }); - sleep(SLEEP_DURATION); -} \ No newline at end of file diff --git a/load_tests/starcoder_load.js b/load_tests/starcoder_load.js new file mode 100644 index 0000000..76316b6 --- /dev/null +++ b/load_tests/starcoder_load.js @@ -0,0 +1,63 @@ +import {check} from 'k6'; +import http from 'k6/http'; +import {Trend} from 'k6/metrics'; + +const host = __ENV.HOST || '127.0.0.1:3000'; + +const totalTime = new Trend('total_time', true); +const validationTime = new Trend('validation_time', true); +const queueTime = new Trend('queue_time', true); +const inferenceTime = new Trend('inference_time', true); +const timePerToken = new Trend('time_per_token', true); + +const example = { + payload: JSON.stringify({ + inputs: '# This is a fibonacci function written in the Python programming language.' + + 'def fibonacci', + parameters: { + details: true, + max_new_tokens: 60, + temperature: 0.2, + top_p: 0.95, + seed: 0, + }, + }), + generated_tokens: 60 +}; + +export const options = { + thresholds: { + http_req_failed: ['rate==0'], + time_per_token: ['p(95)<90'], + queue_time: ['p(95)<1500'], + }, + scenarios: { + load_test: { + executor: 'constant-arrival-rate', + duration: '60s', + preAllocatedVUs: 100, + rate: 10, + timeUnit: '1s', + }, + }, +}; + +export default function () { + const headers = {'Content-Type': 'application/json'}; + const res = http.post(`http://${host}/generate`, example.payload, { + headers, + }); + + check(res, { + 'Post status is 200': (r) => res.status === 200, + 'Post response generated tokens': (r) => res.status === 200 && res.json().details.generated_tokens === example.generated_tokens, + }); + + if (res.status === 200) { + totalTime.add(res.headers["X-Total-Time"]); + validationTime.add(res.headers["X-Validation-Time"]); + queueTime.add(res.headers["X-Queue-Time"]); + inferenceTime.add(res.headers["X-Inference-Time"]); + timePerToken.add(res.headers["X-Time-Per-Token"]); + } +} \ No newline at end of file