debug

2024-06-26 10:43:57 +00:00 · 2024-06-26 10:43:57 +00:00 · 2330052aa2
parent 227f78f3fe
commit 2330052aa2
3 changed files with 19 additions and 5 deletions
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -168,13 +168,28 @@ jobs:
          fi


-  prepare_integration_tests:
+  login_tailscale:
    runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
    needs: build-and-push
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
+    steps:
+      - name: Tailscale
+        uses: huggingface/tailscale-action@main
+        if: needs.build-and-push.outputs.runs_on != 'amd-gpu-tgi'
+        with:
+          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
+
+
+  prepare_integration_tests:
+    runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
+    needs: [build-and-push, login_tailscale]
+    concurrency:
+      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
+      cancel-in-progress: true
+    if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
    container:
      image: ${{ needs.build-and-push.outputs.docker_image }}
      credentials:
--- a/integration-tests/clean_cache_and_download.py
+++ b/integration-tests/clean_cache_and_download.py
@@ -112,6 +112,8 @@ def cleanup_cache(token: str, cache_dir: str):
    total_required_cached_size = sum(cached_required_size_per_model.values())
    total_other_cached_size = sum(cache_size_per_model.values())

+    print("total_required_size", total_required_size)
+    print("total_required_cached_size", total_required_cached_size)
    total_non_cached_required_size = total_required_size - total_required_cached_size
    assert total_non_cached_required_size >= 0

--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -394,7 +394,7 @@ def launcher(event_loop):
        with tempfile.TemporaryFile("w+") as tmp:
            # We'll output stdout/stderr to a temporary file. Using a pipe
            # cause the process to block until stdout is read.
-            print("call subprocess.Popen, with args", args)
+            print("subprocess.Popen:", args)
            with subprocess.Popen(
                args,
                stdout=tmp,
@@ -426,7 +426,6 @@ def launcher(event_loop):
        max_batch_prefill_tokens: Optional[int] = None,
        max_total_tokens: Optional[int] = None,
    ):
-        print("call docker launcher")
        port = random.randint(8000, 10_000)

        args = ["--model-id", model_id, "--env"]
@@ -494,8 +493,6 @@ def launcher(event_loop):
                docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
            ]

-        print("call client.containers.run")
-        print("container_name", container_name)
        container = client.containers.run(
            DOCKER_IMAGE,
            command=args,