Skip to content

Commit 7998a2b

Browse files
committed
do not skip workflow on cuda, fix "no space left on device"
1 parent 2f8df5b commit 7998a2b

File tree

2 files changed

+12
-6
lines changed

2 files changed

+12
-6
lines changed

.github/workflows/build.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ jobs:
176176
concurrency:
177177
group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
178178
cancel-in-progress: true
179-
if: needs.build-and-push.outputs.runs_on == 'amd-gpu-tgi'
179+
if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
180180
container:
181181
image: ${{ needs.build-and-push.outputs.docker_image }}
182182
options: --shm-size "16gb" --ipc host -v ${{ needs.build-and-push.outputs.docker_volume }}:/data
@@ -193,7 +193,7 @@ jobs:
193193
pwd
194194
echo "ls:"
195195
ls
196-
python integration-tests/clean_cache_and_download.py --token ${{ secrets.HF_TOKEN }}
196+
python integration-tests/clean_cache_and_download.py --token ${{ secrets.HF_TOKEN }} --cache-dir /data
197197
fi
198198
199199
integration_tests:
@@ -239,6 +239,10 @@ jobs:
239239
echo "SYSTEM:"
240240
echo $SYSTEM
241241
242+
export DOCKER_VOLUME=${{ needs.build-and-push.outputs.docker_volume }}
243+
echo "DOCKER_VOLUME:"
244+
echo $DOCKER_VOLUME
245+
242246
pytest -s -vvvvv integration-tests
243247
244248
# - name: Tailscale Wait

integration-tests/clean_cache_and_download.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
}
3636

3737

38-
def cleanup_cache(token: str):
38+
def cleanup_cache(token: str, cache_dir: str):
3939
# Retrieve the size per model for all models used in the CI.
4040
size_per_model = {}
4141
extension_per_model = {}
@@ -74,7 +74,7 @@ def cleanup_cache(token: str):
7474
total_required_size = sum(size_per_model.values())
7575
print(f"Total required disk: {total_required_size:.2f} GB")
7676

77-
cached_dir = huggingface_hub.scan_cache_dir()
77+
cached_dir = huggingface_hub.scan_cache_dir(cache_dir)
7878

7979
cache_size_per_model = {}
8080
cached_required_size_per_model = {}
@@ -121,7 +121,7 @@ def cleanup_cache(token: str):
121121

122122
print("Removing", largest_model_id)
123123
for sha in cached_shas_per_model[largest_model_id]:
124-
huggingface_hub.scan_cache_dir().delete_revisions(sha).execute()
124+
huggingface_hub.scan_cache_dir(cache_dir).delete_revisions(sha).execute()
125125

126126
del cache_size_per_model[largest_model_id]
127127

@@ -135,10 +135,11 @@ def cleanup_cache(token: str):
135135
parser.add_argument(
136136
"--token", help="Hugging Face Hub token.", required=True, type=str
137137
)
138+
parser.add_argument("--cache-dir", help="Hub cache path.", required=True, type=str)
138139
args = parser.parse_args()
139140

140141
start = time.time()
141-
extension_per_model = cleanup_cache(args.token)
142+
extension_per_model = cleanup_cache(args.token, args.cache_dir)
142143
end = time.time()
143144

144145
print(f"Cache cleanup done in {end - start:.2f} s")
@@ -153,6 +154,7 @@ def cleanup_cache(token: str):
153154
revision=revision,
154155
token=args.token,
155156
allow_patterns=f"*{extension_per_model[model_id]}",
157+
cache_dir=args.cache_dir,
156158
)
157159
end = time.time()
158160

0 commit comments

Comments
 (0)