Fixing AMD CI #2109

Closed

fxmarty wants to merge 51 commits into main from ci_amd3

Commits
49db30a
disable marlin tests on rocm/xpu
fxmarty Jun 10, 2024
5a4b798
fix gptq tests, LLMM1 matrix bound
fxmarty Jun 11, 2024
4068856
skip exl2 tests on rocm
fxmarty Jun 11, 2024
1e10597
update
fxmarty Jun 11, 2024
1846c1c
fix tests
fxmarty Jun 11, 2024
9e50c11
fix idefics2 tests
fxmarty Jun 13, 2024
5b6b257
fix gpt2 tests - some weights were not contiguous
fxmarty Jun 13, 2024
4616c62
style
fxmarty Jun 13, 2024
3de8f36
fix decorators
fxmarty Jun 14, 2024
40b342a
fix space
fxmarty Jun 17, 2024
df7bb11
dial tcp: lookup registry-1.docker.io: i/o timeout
fxmarty Jun 17, 2024
e62ac4d
trigger
fxmarty Jun 20, 2024
5fb8c27
fix style & typo
fxmarty Jun 20, 2024
6799977
fix workflow
fxmarty Jun 20, 2024
393234d
hopefully fix ci
fxmarty Jun 21, 2024
7e0f4f2
renamed file
fxmarty Jun 21, 2024
2848943
remove require_backend decorators on handles, for some reasons fails …
fxmarty Jun 21, 2024
3464d60
The handshake operation timed out & hanging
fxmarty Jun 21, 2024
bc2b9b2
trigger ci
fxmarty Jun 21, 2024
1bb1a34
retry
fxmarty Jun 21, 2024
ee62872
test tailscale independently
fxmarty Jun 21, 2024
13bbf6c
does ci pass without tailscale?
fxmarty Jun 21, 2024
f16f0ad
do not login to internal registry
fxmarty Jun 24, 2024
09a41f2
do not skip workflow on cuda, fix no space left on device
fxmarty Jun 24, 2024
dc53846
Merge branch 'main' into ci_amd3
fxmarty Jun 25, 2024
04298e5
add back credentials
fxmarty Jun 25, 2024
b44097a
fix cache cleanup
fxmarty Jun 26, 2024
227f78f
Merge branch 'main' into ci_amd3
fxmarty Jun 26, 2024
2330052
debug
fxmarty Jun 26, 2024
4067fc8
login to registry
fxmarty Jun 26, 2024
60a96a9
do not use private registry in cleanup cache step
fxmarty Jun 26, 2024
bbc949f
trigger ci
fxmarty Jun 27, 2024
0a5485d
avoid permissions issues
fxmarty Jun 27, 2024
eaa6890
remove hidden
fxmarty Jun 27, 2024
87db820
fix rm
fxmarty Jun 28, 2024
3d50ff7
bump torch to more recent version
fxmarty Jun 28, 2024
68583d3
working memory leak fix in tunableop
fxmarty Jun 28, 2024
05d1011
fix xpu build
fxmarty Jun 28, 2024
9fd395f
fix tests
fxmarty Jul 1, 2024
5984977
Merge branch 'main' into ci_amd3
fxmarty Jul 1, 2024
00cc73b
fix post merge
fxmarty Jul 1, 2024
750ef7b
Merge branch 'ci_amd3' of github.com:huggingface/text-generation-infe…
fxmarty Jul 1, 2024
e0bfe4e
fix
fxmarty Jul 1, 2024
c2f4b7f
add vicuna
fxmarty Jul 2, 2024
add4d42
do not use tunableop for non flash-causal-lm models
fxmarty Jul 2, 2024
29a4160
Merge branch 'main' into ci_amd3
fxmarty Jul 2, 2024
8c590be
Merge branch 'main' into ci_amd3
fxmarty Jul 8, 2024
4e3f687
use base docker image
fxmarty Jul 8, 2024
d7c6061
missing lib
fxmarty Jul 8, 2024
291453f
Merge branch 'main' into ci_amd3
fxmarty Jul 16, 2024
2967b81
fix post refactor
fxmarty Jul 16, 2024
Files changed
101 changes: 92 additions & 9 deletions .github/workflows/build.yaml
@@ -9,7 +9,7 @@ on:
# options:
# - cuda
# - rocm
-# - intel
+# - xpu
required: true
release-tests:
description: "Run release integration tests"
@@ -21,7 +21,9 @@ jobs:
build-and-push:
outputs:
docker_image: ${{ steps.final.outputs.docker_image }}
base_docker_image: ${{ steps.final.outputs.base_docker_image }}
docker_devices: ${{ steps.final.outputs.docker_devices }}
docker_volume: ${{ steps.final.outputs.docker_volume }}
runs_on: ${{ steps.final.outputs.runs_on }}
label: ${{ steps.final.outputs.label }}
concurrency:
@@ -36,11 +38,13 @@ jobs:
# with sigstore/fulcio when running outside of PRs.
id-token: write
security-events: write

steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Inject slug/short variables
uses: rlespinasse/[email protected]

- name: Construct hardware variables
shell: bash
run: |
@@ -55,11 +59,9 @@
export dockerfile="Dockerfile_amd"
export label_extension="-rocm"
export docker_devices="/dev/kfd,/dev/dri"
-# TODO Re-enable when they pass.
-# export runs_on="amd-gpu-tgi"
-export runs_on="ubuntu-latest"
+export runs_on="amd-gpu-tgi"
;;
-intel)
+xpu)
export dockerfile="Dockerfile_intel"
export label_extension="-intel"
export docker_devices=""
@@ -75,27 +77,31 @@
echo "LABEL=${label_extension}" >> $GITHUB_ENV
echo "DOCKER_DEVICES=${docker_devices}" >> $GITHUB_ENV
echo "RUNS_ON=${runs_on}" >> $GITHUB_ENV

- name: Initialize Docker Buildx
uses: docker/setup-buildx-action@v3
with:
install: true
config-inline: |
[registry."docker.io"]
mirrors = ["registry.github-runners.huggingface.tech"]

- name: Login to GitHub Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Login to Azure Container Registry
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
username: ${{ secrets.AZURE_DOCKER_USERNAME }}
password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io

# If pull request
- name: Extract metadata (tags, labels) for Docker
if: ${{ github.event_name == 'pull_request' }}
@@ -145,32 +151,109 @@ jobs:
echo "docker_devices=${{ env.DOCKER_DEVICES }}" >> "$GITHUB_OUTPUT"
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"

if [[ ${{ inputs.hardware }} == "rocm" ]]
then
echo "base_docker_image=rocm/dev-ubuntu-22.04:6.1.1_hip_update" >> "$GITHUB_OUTPUT"
elif [[ ${{ inputs.hardware }} == "cuda" ]]
then
echo "base_docker_image=nvidia/cuda:12.1.0-base-ubuntu22.04" >> "$GITHUB_OUTPUT"
elif [[ ${{ inputs.hardware }} == "xpu" ]]
then
echo "base_docker_image=intel/intel-extension-for-pytorch:2.1.30-xpu" >> "$GITHUB_OUTPUT"
else
exit 1
fi

if [[ ${{ inputs.hardware }} == "rocm" ]]
then
echo "docker_volume=/data/cache/.cache/huggingface/hub" >> "$GITHUB_OUTPUT"
else
echo "docker_volume=/mnt/cache" >> "$GITHUB_OUTPUT"
fi

prepare_integration_tests:
runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
needs: [build-and-push]
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
# Ideally, we would use the image from registry.internal.huggingface.tech, but we cannot log in to the private registry outside of tailscale,
# and even adding a previous job with tailscale login still results in `Docker login for 'registry.internal.huggingface.tech' failed with exit code 1`.
container:
image: ${{ needs.build-and-push.outputs.base_docker_image }}
options: --shm-size "16gb" --ipc host -v ${{ needs.build-and-push.outputs.docker_volume }}:/data
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Clean Hugging Face cache
shell: bash
run: |
if [[ ${{ inputs.hardware }} == "rocm" ]]
then
echo "pwd:"
pwd
echo "ls:"
ls

pip3 install -U huggingface_hub

python3 integration-tests/clean_cache_and_download.py --token ${{ secrets.HF_TOKEN }} --cache-dir /data

# Avoid permissions issues in the next step, which does not run within docker (File was unable to be removed Error: EACCES).
if [[ $PWD == *"text-generation-inference"* ]]; then
rm -rf -- ..?* .[!.]* *
fi
fi

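(Aside: the cleanup script itself is not part of this diff. Below is a minimal sketch of what `integration-tests/clean_cache_and_download.py` might do with `huggingface_hub`'s cache-scanning API. Only the `--token` and `--cache-dir` flags are confirmed by the invocation above; the pruning strategy is an assumption, and the download half of the script is omitted.)

```python
# Hypothetical sketch of integration-tests/clean_cache_and_download.py.
# Only the --token/--cache-dir arguments appear in the workflow above;
# everything else is an assumption.
import argparse

from huggingface_hub import scan_cache_dir

parser = argparse.ArgumentParser()
parser.add_argument("--token", required=True)  # would be used for the (omitted) download step
parser.add_argument("--cache-dir", required=True)
args = parser.parse_args()

cache_info = scan_cache_dir(cache_dir=args.cache_dir)

# Prune cached revisions that no branch or tag points to anymore.
stale = [
    rev.commit_hash
    for repo in cache_info.repos
    for rev in repo.revisions
    if not rev.refs  # detached revisions are safe to delete
]
if stale:
    strategy = cache_info.delete_revisions(*stale)
    print(f"Freeing {strategy.expected_freed_size_str} from {args.cache_dir}")
    strategy.execute()
```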
integration_tests:
concurrency:
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
-needs: build-and-push
+needs: [build-and-push, prepare_integration_tests]
runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
env:
PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '' }}
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Inject slug/short variables
uses: rlespinasse/[email protected]

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.10"

- name: Install
run: |
make install-integration-tests

- name: Run tests
run: |
-export DOCKER_VOLUME=/mnt/cache
-export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
export HF_TOKEN=${{ secrets.HF_TOKEN }}

+export DOCKER_IMAGE=${{ needs.build-and-push.outputs.docker_image }}
+echo "DOCKER_IMAGE:"
+echo $DOCKER_IMAGE
-pytest -s -vv integration-tests ${PYTEST_FLAGS}

+export SYSTEM=${{ inputs.hardware }}
+echo "SYSTEM:"
+echo $SYSTEM

+export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
+echo "DOCKER_VOLUME:"
+echo $DOCKER_VOLUME

+# TunableOp warmup is rather slow, do it only for a few seqlens.
+if [[ ${{ inputs.hardware }} == "rocm" ]]
+then
+PYTORCH_TUNABLEOP_SEQLENS=2,4
+fi

+pytest -s -vvvvv integration-tests ${PYTEST_FLAGS}
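(Aside: `PYTORCH_TUNABLEOP_SEQLENS` appears to be consumed during model warmup. TunableOp benchmarks GEMMs per problem shape, so one forward pass per sequence length pre-tunes the kernels the tests will hit. A rough illustration of the idea, not TGI's actual warmup code; it assumes a ROCm or CUDA GPU and a PyTorch build exposing `torch.cuda.tunable`.)

```python
# Illustration only: how a warmup driven by PYTORCH_TUNABLEOP_SEQLENS could
# look. The variable name comes from the workflow above; the loop and the
# default value are assumptions.
import os

import torch

seqlens = [int(s) for s in os.environ.get("PYTORCH_TUNABLEOP_SEQLENS", "2,4").split(",")]

# Toy stand-in for a transformer projection; TunableOp tunes GEMMs lazily,
# keyed on problem shape, so one forward pass per sequence length suffices.
layer = torch.nn.Linear(4096, 4096, device="cuda", dtype=torch.float16)
torch.cuda.tunable.enable(True)

for seqlen in seqlens:
    x = torch.randn(seqlen, 4096, device="cuda", dtype=torch.float16)
    layer(x)  # the first call with a new (seqlen, 4096) shape triggers tuning
```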
.github/workflows/ci_build.yaml
@@ -36,7 +36,7 @@ jobs:
# fail-fast is true by default
fail-fast: false
matrix:
hardware: ["cuda", "rocm", "intel"]
hardware: ["cuda", "rocm", "xpu"]
uses: ./.github/workflows/build.yaml # calls the one above ^
with:
hardware: ${{ matrix.hardware }}
9 changes: 8 additions & 1 deletion Dockerfile_amd
@@ -96,7 +96,10 @@
RUN pip uninstall -y triton && \
cd triton/python && \
pip install .

-RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir
+RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && \
+cd pytorch && \
+git checkout ceaa1e4a7b66e818ea4e56925bb4a5dff8c56055 && \
+pip install -r requirements.txt --no-cache-dir

ARG _GLIBCXX_USE_CXX11_ABI="1"
ARG CMAKE_PREFIX_PATH="/opt/conda"
@@ -122,6 +125,10 @@
ENV HIP_FORCE_DEV_KERNARG=1
# However, Triton requires tuning for each prompt length, which is prohibitive.
ENV ROCM_USE_FLASH_ATTN_V2_TRITON=0

# Although TunableOp can be enabled at runtime with `torch.cuda.tunable.enable()`, a bug in TunableOp causes the GEMM
# benchmark to be run again unless this variable is set. TODO: probably remove once we bump to PyTorch 2.4
ENV PYTORCH_TUNABLEOP_ENABLED=1

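(Aside: a minimal sketch of the runtime analogue of `PYTORCH_TUNABLEOP_ENABLED=1`, assuming a PyTorch build that exposes the `torch.cuda.tunable` Python bindings, i.e. 2.4+ or a patched 2.3 like the one built above. It shows enabling TunableOp and persisting its results so the GEMM benchmark is not redone on the next run.)

```python
import torch

# Runtime equivalent of PYTORCH_TUNABLEOP_ENABLED=1: route GEMMs through
# TunableOp so each new problem shape is benchmarked once, then cached.
torch.cuda.tunable.enable(True)
torch.cuda.tunable.set_filename("tunableop_results.csv")

a = torch.randn(8, 512, device="cuda", dtype=torch.float16)
b = torch.randn(512, 512, device="cuda", dtype=torch.float16)
c = a @ b  # first GEMM with this shape triggers the benchmark

# Persist tuned results; a later process can pick them up via read_file(),
# avoiding the slow re-benchmarking the comment above describes.
torch.cuda.tunable.write_file()
```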
FROM base AS kernel-builder

# # Build vllm kernels