AMD CI (#2589)
* Only run 1 valid test. * Trying the tailscale action quickly. * ? * bash spaces. * Remove tailscale. * More quotes. * mnt2 ? * Other name to avoid recursive directories. * Good old tmate. * Remove tmate. * Trying a few things. * Remove some stuff. * Sleep ? * Tmp * busybox * Launcher tgi * Starting hello * Busybox in python * No device. * Removing all variables ? * A un moment donné. * Tmp * Tmp2 * Device request, no container name * No device requests * Without pytest. * No pytest. * from env * Start with devices * Attempt #1 * Remove stdin messing * Only 1 test, no container name * Raw tgi * Sending args. * Show pip freeze. * Start downloading with token * Giving HIP devices. * Mount volume + port forward * Without pytest. * No token * Repeated arguments * Wrong kwarg. * On 2 GPUs * Fallback to single shard CI test. * Testing * yaml * Common cache ? * Trailing slash ? * Docker volume split. * Fix docker volume * Fixing ? * ? * Try no devices ? * Flash llama on intel CPU ? * Fix nvidia ? * Temp deactivate intel, activate nvidia ?
This commit is contained in:
parent
9ed0c85fe1
commit
43f39f6894
|
@ -21,9 +21,11 @@ jobs:
|
||||||
build-and-push:
|
build-and-push:
|
||||||
outputs:
|
outputs:
|
||||||
docker_image: ${{ steps.final.outputs.docker_image }}
|
docker_image: ${{ steps.final.outputs.docker_image }}
|
||||||
|
docker_volume: ${{ steps.final.outputs.docker_volume }}
|
||||||
docker_devices: ${{ steps.final.outputs.docker_devices }}
|
docker_devices: ${{ steps.final.outputs.docker_devices }}
|
||||||
runs_on: ${{ steps.final.outputs.runs_on }}
|
runs_on: ${{ steps.final.outputs.runs_on }}
|
||||||
label: ${{ steps.final.outputs.label }}
|
label: ${{ steps.final.outputs.label }}
|
||||||
|
extra_pytest: ${{ steps.final.outputs.extra_pytest }}
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
|
group: ${{ github.workflow }}-build-and-push-image-${{ inputs.hardware }}-${{ github.head_ref || github.run_id }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
@ -44,32 +46,39 @@ jobs:
|
||||||
cuda)
|
cuda)
|
||||||
export dockerfile="Dockerfile"
|
export dockerfile="Dockerfile"
|
||||||
export label_extension=""
|
export label_extension=""
|
||||||
|
export docker_volume="/mnt/cache"
|
||||||
export docker_devices=""
|
export docker_devices=""
|
||||||
export runs_on="aws-g6-12xl-plus-priv-cache"
|
export runs_on="aws-g6-12xl-plus-priv-cache"
|
||||||
export platform=""
|
export platform=""
|
||||||
|
export extra_pytest=""
|
||||||
;;
|
;;
|
||||||
rocm)
|
rocm)
|
||||||
export dockerfile="Dockerfile_amd"
|
export dockerfile="Dockerfile_amd"
|
||||||
export label_extension="-rocm"
|
export label_extension="-rocm"
|
||||||
export docker_devices="/dev/kfd,/dev/dri"
|
export docker_devices="/dev/kfd,/dev/dri"
|
||||||
# TODO Re-enable when they pass.
|
export docker_volume="/mnt"
|
||||||
# export runs_on="amd-gpu-tgi"
|
export runs_on="amd-gpu-runners"
|
||||||
export runs_on="ubuntu-latest"
|
|
||||||
export platform=""
|
export platform=""
|
||||||
|
export extra_pytest="-k test_flash_gemma_gptq_load"
|
||||||
;;
|
;;
|
||||||
intel-xpu)
|
intel-xpu)
|
||||||
export dockerfile="Dockerfile_intel"
|
export dockerfile="Dockerfile_intel"
|
||||||
export label_extension="-intel-xpu"
|
export label_extension="-intel-xpu"
|
||||||
export docker_devices=""
|
export docker_devices=""
|
||||||
|
export docker_volume="/mnt/cache"
|
||||||
export runs_on="ubuntu-latest"
|
export runs_on="ubuntu-latest"
|
||||||
export platform="xpu"
|
export platform="xpu"
|
||||||
|
export extra_pytest=""
|
||||||
;;
|
;;
|
||||||
intel-cpu)
|
intel-cpu)
|
||||||
export dockerfile="Dockerfile_intel"
|
export dockerfile="Dockerfile_intel"
|
||||||
export label_extension="-intel-cpu"
|
export label_extension="-intel-cpu"
|
||||||
export docker_devices=""
|
export docker_devices="none"
|
||||||
|
export docker_volume="/mnt/cache"
|
||||||
export runs_on="ubuntu-latest"
|
export runs_on="ubuntu-latest"
|
||||||
|
# export runs_on="aws-highmemory-32-plus-priv"
|
||||||
export platform="cpu"
|
export platform="cpu"
|
||||||
|
export extra_pytest="-k test_flash_llama_load"
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
echo $dockerfile
|
echo $dockerfile
|
||||||
|
@ -81,8 +90,10 @@ jobs:
|
||||||
echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
|
echo "DOCKERFILE=${dockerfile}" >> $GITHUB_ENV
|
||||||
echo "LABEL=${label_extension}" >> $GITHUB_ENV
|
echo "LABEL=${label_extension}" >> $GITHUB_ENV
|
||||||
echo "PLATFORM=${platform}" >> $GITHUB_ENV
|
echo "PLATFORM=${platform}" >> $GITHUB_ENV
|
||||||
|
echo "DOCKER_VOLUME=${docker_volume}" >> $GITHUB_ENV
|
||||||
echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
|
echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
|
||||||
echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
|
echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV
|
||||||
|
echo "EXTRA_PYTEST=${extra_pytest}" >> $GITHUB_ENV
|
||||||
echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV
|
echo REGISTRY_MIRROR=$REGISTRY_MIRROR >> $GITHUB_ENV
|
||||||
- name: Initialize Docker Buildx
|
- name: Initialize Docker Buildx
|
||||||
uses: docker/setup-buildx-action@v3
|
uses: docker/setup-buildx-action@v3
|
||||||
|
@ -157,16 +168,18 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
echo "docker_image=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
|
echo "docker_image=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT}}${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
|
||||||
echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
|
echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "docker_volume=${{ env.DOCKER_VOLUME }}" >> "$GITHUB_OUTPUT"
|
||||||
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
|
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
|
||||||
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
|
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "extra_pytest=${{ env.EXTRA_PYTEST }}" >> "$GITHUB_OUTPUT"
|
||||||
integration_tests:
|
integration_tests:
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
needs: build-and-push
|
needs: build-and-push
|
||||||
|
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
|
||||||
runs-on:
|
runs-on:
|
||||||
group: ${{ needs.build-and-push.outputs.runs_on }}
|
group: ${{ needs.build-and-push.outputs.runs_on }}
|
||||||
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
|
|
||||||
env:
|
env:
|
||||||
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}
|
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '--release' }}
|
||||||
steps:
|
steps:
|
||||||
|
@ -177,15 +190,16 @@ jobs:
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v4
|
||||||
with:
|
with:
|
||||||
python-version: "3.10"
|
python-version: "3.11"
|
||||||
- name: Install
|
- name: Install
|
||||||
run: |
|
run: |
|
||||||
make install-integration-tests
|
make install-integration-tests
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: |
|
||||||
export DOCKER_VOLUME=/mnt/cache
|
export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
|
||||||
export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
|
export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
|
||||||
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
|
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
|
||||||
|
export EXTRA_PYTEST="${{ needs.build-and-push.outputs.extra_pytest }}"
|
||||||
export HF_TOKEN=${{ secrets.HF_TOKEN }}
|
export HF_TOKEN=${{ secrets.HF_TOKEN }}
|
||||||
echo $DOCKER_IMAGE
|
echo $DOCKER_IMAGE
|
||||||
pytest -s -vv integration-tests ${PYTEST_FLAGS}
|
pytest -s -vv integration-tests ${PYTEST_FLAGS} ${EXTRA_PYTEST}
|
||||||
|
|
|
@ -492,6 +492,7 @@ def launcher(event_loop):
|
||||||
try:
|
try:
|
||||||
container = client.containers.get(container_name)
|
container = client.containers.get(container_name)
|
||||||
container.stop()
|
container.stop()
|
||||||
|
container.remove()
|
||||||
container.wait()
|
container.wait()
|
||||||
except NotFound:
|
except NotFound:
|
||||||
pass
|
pass
|
||||||
|
@ -514,13 +515,28 @@ def launcher(event_loop):
|
||||||
volumes = [f"{DOCKER_VOLUME}:/data"]
|
volumes = [f"{DOCKER_VOLUME}:/data"]
|
||||||
|
|
||||||
if DOCKER_DEVICES:
|
if DOCKER_DEVICES:
|
||||||
devices = DOCKER_DEVICES.split(",")
|
if DOCKER_DEVICES.lower() == "none":
|
||||||
|
devices = []
|
||||||
|
else:
|
||||||
|
devices = DOCKER_DEVICES.strip().split(",")
|
||||||
visible = os.getenv("ROCR_VISIBLE_DEVICES")
|
visible = os.getenv("ROCR_VISIBLE_DEVICES")
|
||||||
if visible:
|
if visible:
|
||||||
env["ROCR_VISIBLE_DEVICES"] = visible
|
env["ROCR_VISIBLE_DEVICES"] = visible
|
||||||
device_requests = []
|
device_requests = []
|
||||||
|
if not devices:
|
||||||
|
devices = None
|
||||||
|
elif devices == ["nvidia.com/gpu=all"]:
|
||||||
|
devices = None
|
||||||
|
device_requests = [
|
||||||
|
docker.types.DeviceRequest(
|
||||||
|
driver="cdi",
|
||||||
|
# count=gpu_count,
|
||||||
|
device_ids=[f"nvidia.com/gpu={i}"],
|
||||||
|
)
|
||||||
|
for i in range(gpu_count)
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
devices = []
|
devices = None
|
||||||
device_requests = [
|
device_requests = [
|
||||||
docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
|
docker.types.DeviceRequest(count=gpu_count, capabilities=[["gpu"]])
|
||||||
]
|
]
|
||||||
|
@ -540,6 +556,7 @@ def launcher(event_loop):
|
||||||
shm_size="1G",
|
shm_size="1G",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
yield ContainerLauncherHandle(client, container.name, port)
|
yield ContainerLauncherHandle(client, container.name, port)
|
||||||
|
|
||||||
if not use_flash_attention:
|
if not use_flash_attention:
|
||||||
|
@ -554,6 +571,7 @@ def launcher(event_loop):
|
||||||
container_output = container.logs().decode("utf-8")
|
container_output = container.logs().decode("utf-8")
|
||||||
print(container_output, file=sys.stderr)
|
print(container_output, file=sys.stderr)
|
||||||
|
|
||||||
|
finally:
|
||||||
container.remove()
|
container.remove()
|
||||||
|
|
||||||
if DOCKER_IMAGE is not None:
|
if DOCKER_IMAGE is not None:
|
||||||
|
|
Loading…
Reference in New Issue