Avoid downloading HF models in CI pipelines (#1263)
baijumeswani authored Feb 20, 2025 · 1 parent 16fb079 · commit fba80c1
Showing 11 changed files with 67 additions and 62 deletions.
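The thrust of the change: CI no longer needs a dummy `HF_TOKEN`, because Hugging Face downloads in the Python test harness are now attempted only when `huggingface_hub` is installed and the target repo is actually reachable, and tests skip cleanly when a model is absent. A minimal sketch of that guard pattern, lifted from the `_test_utils.py` diff below (the helper name is hypothetical — the diff inlines this logic in `download_models` — and the logger is assumed to be a standard `logging.Logger`):

```python
import logging

log = logging.getLogger(__name__)

def is_hf_model_reachable(hf_name: str) -> bool:
    # Hypothetical helper; the commit inlines this logic in download_models().
    try:
        from huggingface_hub import model_info  # optional dependency in CI
        model_info(hf_name)  # raises if the repo is missing, gated, or unreachable
        return True
    except ImportError:
        log.warning("huggingface_hub is not installed. Skipping downloading hugging face models.")
    except Exception as e:
        log.warning(f"Error: {e}. Skipping downloading hugging face models")
    return False
```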
4 changes: 0 additions & 4 deletions .github/workflows/linux-cpu-x64-build.yml
@@ -84,10 +84,6 @@ jobs:
           python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
           python3 -m pip install --user --no-index --no-deps --find-links build/cpu/wheel onnxruntime_genai
-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
       - name: Verify Build Artifacts
         if: always()
         continue-on-error: true
4 changes: 0 additions & 4 deletions .github/workflows/linux-cpu-x64-nightly-build.yml
@@ -55,10 +55,6 @@ jobs:
           python3 -m pip install -r test/python/cpu/ort/requirements.txt --user
           python3 -m pip install build/cpu/wheel/onnxruntime_genai*.whl --no-deps
-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
       - name: Run the python tests
         run: |
           python3 test/python/test_onnxruntime_genai.py --cwd test/python --test_models test/test_models --e2e
4 changes: 0 additions & 4 deletions .github/workflows/linux-gpu-x64-build.yml
@@ -109,10 +109,6 @@ jobs:
           bash -c " \
             /usr/bin/cmake --build --preset linux_gcc_cuda_release"
-      - name: Use Dummy HuggingFace Token
-        run: |
-          echo "HF_TOKEN=12345" >> $GITHUB_ENV
       - name: Install the onnxruntime-genai Python wheel and run python test
         run: |
           echo "Installing the onnxruntime-genai Python wheel and running the Python tests"
4 changes: 0 additions & 4 deletions .github/workflows/win-cpu-x64-build.yml
@@ -91,10 +91,6 @@ jobs:
           python3 -m pip install -r test\python\cpu\ort\requirements.txt --user
           python3 -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
-      - name: Use Dummy HuggingFace Token
-        run: |
-          Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models"
4 changes: 0 additions & 4 deletions .github/workflows/win-cuda-x64-build.yml
@@ -80,10 +80,6 @@ jobs:
           python -m pip install -r test\python\cuda\ort\requirements.txt
           python -m pip install (Get-ChildItem ("$env:binaryDir\wheel\*.whl")) --no-deps
-      - name: Use Dummy HuggingFace Token
-        run: |
-          Add-Content -Path $env:GITHUB_ENV -Value "HF_TOKEN=12345"
       - name: Run the Python Tests
         run: |
           python test/python/test_onnxruntime_genai.py --cwd "test\python" --test_models "test\test_models" --e2e
6 changes: 3 additions & 3 deletions examples/csharp/HelloPhi/HelloPhi.csproj
@@ -10,9 +10,9 @@
   </PropertyGroup>

   <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.7.0-dev" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.7.0-dev" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.7.0-dev" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
   </ItemGroup>

   <ItemGroup>
6 changes: 3 additions & 3 deletions examples/csharp/HelloPhi3V/HelloPhi3V.csproj
@@ -9,9 +9,9 @@
   </PropertyGroup>

   <ItemGroup>
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.7.0-dev" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.7.0-dev" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
-    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.7.0-dev" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug' OR '$(Configuration)' == 'Release' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.Cuda" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_Cuda' OR '$(Configuration)' == 'Release_Cuda' " />
+    <PackageReference Include="Microsoft.ML.OnnxRuntimeGenAI.DirectML" Version="0.6.0" Condition=" '$(Configuration)' == 'Debug_DirectML' OR '$(Configuration)' == 'Release_DirectML' " />
   </ItemGroup>

 </Project>
56 changes: 41 additions & 15 deletions test/python/_test_utils.py
@@ -53,33 +53,40 @@ def run_subprocess(


 def get_model_paths():
+    # TODO: Uncomment the following models as needed in the CI pipeline.
+
     hf_paths = {
-        "phi-2": "microsoft/phi-2",
-        "olmo": "amd/AMD-OLMo-1B-SFT-DPO",
-        "qwen": "Qwen/Qwen2.5-0.5B",
-        "phi-3.5": "microsoft/Phi-3.5-mini-instruct",
+        # "olmo": "amd/AMD-OLMo-1B-SFT-DPO",
+        "qwen-2.5": "Qwen/Qwen2.5-0.5B",
+        # "phi-3.5": "microsoft/Phi-3.5-mini-instruct",
+        # "llama-3.2": "meta-llama/Llama-3.2-1B-instruct",
-        "granite-3.0": "ibm-granite/granite-3.0-2b-instruct",
+        # "granite-3.0": "ibm-granite/granite-3.0-2b-instruct",
     }

-    ci_data_path = os.path.join("/", "data", "ortgenai", "pytorch")
+    ci_data_path = None
+    if is_windows():
+        ci_data_path = os.path.join(R"C:\\", "data", "models", "ortgenai", "pytorch")
+    else:
+        ci_data_path = os.path.join(os.path.abspath(os.sep), "data", "ortgenai", "pytorch")
+
+    if not os.path.exists(ci_data_path):
+        return {}, hf_paths
+
+    # Note: If a model has over 4B parameters, please add a quantized version
+    # to `ci_paths` instead of `hf_paths` to reduce file size and testing time.
     ci_paths = {
-        "llama-2": os.path.join(ci_data_path, "Llama-2-7B-Chat-GPTQ"),
-        "llama-3": os.path.join(ci_data_path, "Meta-Llama-3-8B-AWQ"),
-        "mistral-v0.2": os.path.join(ci_data_path, "Mistral-7B-Instruct-v0.2-GPTQ"),
+        # "llama-2": os.path.join(ci_data_path, "Llama-2-7B-Chat-GPTQ"),
+        # "llama-3": os.path.join(ci_data_path, "Meta-Llama-3-8B-AWQ"),
+        # "mistral-v0.2": os.path.join(ci_data_path, "Mistral-7B-Instruct-v0.2-GPTQ"),
         "phi-2": os.path.join(ci_data_path, "phi2"),
-        "gemma-2b": os.path.join(ci_data_path, "gemma-1.1-2b-it"),
-        "gemma-7b": os.path.join(ci_data_path, "gemma-7b-it-awq"),
-        "phi-3-mini": os.path.join(ci_data_path, "phi3-mini-128k-instruct"),
-        "gemma-2-2b": os.path.join(ci_data_path, "gemma-2-2b-it"),
-        "llama-3.2": os.path.join(ci_data_path, "llama-3.2b-1b-instruct"),
+        # "gemma-2b": os.path.join(ci_data_path, "gemma-1.1-2b-it"),
+        # "gemma-7b": os.path.join(ci_data_path, "gemma-7b-it-awq"),
+        # "phi-3-mini": os.path.join(ci_data_path, "phi3-mini-128k-instruct"),
+        # "gemma-2-2b": os.path.join(ci_data_path, "gemma-2-2b-it"),
+        # "llama-3.2": os.path.join(ci_data_path, "llama-3.2b-1b-instruct"),
         "qwen-2.5": os.path.join(ci_data_path, "qwen2.5-0.5b-instruct"),
-        "nemotron-mini": os.path.join(ci_data_path, "nemotron-mini-4b"),
+        # "nemotron-mini": os.path.join(ci_data_path, "nemotron-mini-4b"),
     }

     return ci_paths, hf_paths
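With the rework above, `get_model_paths` degrades gracefully on machines without the CI data mount: it returns an empty `ci_paths` and only the trimmed Hugging Face list. A small illustrative check (hypothetical driver code, not part of the repo):

```python
ci_paths, hf_paths = get_model_paths()

# On a runner without /data/ortgenai/pytorch (or C:\data\models\ortgenai\pytorch
# on Windows), ci_paths is {} and only hosted models remain candidates.
if not ci_paths:
    print(f"CI model cache not mounted; {len(hf_paths)} Hugging Face candidates remain.")
```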
@@ -123,22 +123,41 @@ def download_model(model_name, input_path, output_path, precision, device, one_l
     run_subprocess(command).check_returncode()


-def download_models(download_path, precision, device):
+def download_models(download_path, precision, device, log):
+    log.debug(f"Downloading models to {download_path} with precision {precision} and device {device}")
+
     ci_paths, hf_paths = get_model_paths()
     output_paths = []

+    log.debug(f"Downloading {len(ci_paths)} PyTorch models and {len(hf_paths)} Hugging Face models")
+
     # python -m onnxruntime_genai.models.builder -i <input_path> -o <output_path> -p <precision> -e <device>
     for model_name, input_path in ci_paths.items():
         output_path = os.path.join(download_path, model_name, precision, device)
+        log.debug(f"Downloading {model_name} from {input_path} to {output_path}")
         if not os.path.exists(output_path):
             download_model(None, input_path, output_path, precision, device)
         output_paths.append(output_path)

     # python -m onnxruntime_genai.models.builder -m <model_name> -o <output_path> -p <precision> -e <device>
     for model_name, hf_name in hf_paths.items():
+        try:
+            from huggingface_hub import model_info
+            model_info(hf_name)
+        except ImportError:
+            log.warning("huggingface_hub is not installed. Skipping downloading hugging face models.")
+            continue
+        except Exception as e:
+            log.warning(f"Error: {e}. Skipping downloading hugging face models")
+            continue
         output_path = os.path.join(download_path, model_name, precision, device)
+
+        log.debug(f"Downloading {model_name} from {hf_name} to {output_path}")
+
         if not os.path.exists(output_path):
             download_model(hf_name, "", output_path, precision, device)
         output_paths.append(output_path)

+    log.info(f"Successfully downloaded {len(output_paths)} models")
+
     return output_paths
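A sketch of driving the new signature, assuming a standard `logging` logger (the added `log` parameter is the only interface change; the callers in `test_onnxruntime_genai.py` below pass their existing logger):

```python
import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger("ortgenai-tests")

# Models that cannot be reached (no huggingface_hub, offline runner, gated
# repo) are skipped with a warning instead of failing the CI job.
output_paths = download_models("/tmp/test_models", "int4", "cpu", log)
log.info(f"Models available for testing: {output_paths}")
```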
15 changes: 14 additions & 1 deletion test/python/conftest.py
@@ -19,7 +19,10 @@ def pytest_addoption(parser):


 def get_path_for_model(data_path, model_name, precision, device):
-    return os.path.join(data_path, model_name, precision, device)
+    model_path = os.path.join(data_path, model_name, precision, device)
+    if not os.path.exists(model_path):
+        pytest.skip(f"Model {model_name} not found at {model_path}")
+    return model_path


 @pytest.fixture
@@ -52,6 +55,16 @@ def llama_for(request):
     )


+@pytest.fixture
+def qwen_for(request):
+    return functools.partial(
+        get_path_for_model,
+        request.config.getoption("--test_models"),
+        "qwen-2.5",
+        "int4",
+    )
+
+
 @pytest.fixture
 def path_for_model(request):
     return functools.partial(
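Taken together, the `pytest.skip` guard in `get_path_for_model` and the new `qwen_for` fixture let a test request a model path and be skipped automatically when that model was never downloaded. A hypothetical test using the fixture (assuming `onnxruntime_genai` is imported as `og`, as elsewhere in this suite):

```python
def test_loads_qwen(qwen_for):
    # qwen_for("cpu") resolves to <--test_models>/qwen-2.5/int4/cpu,
    # or skips this test if that directory does not exist.
    model = og.Model(qwen_for("cpu"))
    assert model is not None
```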
6 changes: 3 additions & 3 deletions test/python/test_onnxruntime_genai.py
@@ -84,11 +84,11 @@ def main():
     if not (
         sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8
     ):
-        output_paths += download_models(os.path.abspath(args.test_models), "int4", "cpu")
+        output_paths += download_models(os.path.abspath(args.test_models), "int4", "cpu", log)
     if og.is_cuda_available():
-        output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda")
+        output_paths += download_models(os.path.abspath(args.test_models), "int4", "cuda", log)
     if og.is_dml_available():
-        output_paths += download_models(os.path.abspath(args.test_models), "int4", "dml")
+        output_paths += download_models(os.path.abspath(args.test_models), "int4", "dml", log)

     # Run ONNX Runtime GenAI tests
     run_onnxruntime_genai_api_tests(os.path.abspath(args.cwd), log, os.path.abspath(args.test_models))
20 changes: 3 additions & 17 deletions test/python/test_onnxruntime_genai_api.py
@@ -391,23 +391,9 @@ def test_get_output(test_data_path, relative_model_path):
     sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8,
     reason="Python 3.8 is required for downloading models.",
 )
-@pytest.mark.parametrize(
-    "relative_model_path",
-    (
-        [
-            Path("qwen/int4/cpu"),
-            Path("qwen/int4/cuda"),
-        ]
-        if og.is_cuda_available()
-        else [
-            Path("qwen/int4/cpu"),
-        ]
-    ),
-)
-def test_hidden_states(test_data_path, relative_model_path):
-    model_path = os.fspath(Path(test_data_path) / relative_model_path)
-
-    model = og.Model(model_path)
+@pytest.mark.parametrize("device", devices)
+def test_hidden_states(qwen_for, device):
+    model = og.Model(qwen_for(device))

     search_params = og.GeneratorParams(model)
     input_ids = np.array(
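The `devices` list used in the new parametrization is defined outside this hunk; presumably it enumerates the available execution providers. A hedged sketch of one way it could be built (an assumption, not the file's actual definition — though `og.is_cuda_available()` and `og.is_dml_available()` do appear elsewhere in this commit):

```python
import onnxruntime_genai as og

# Hypothetical: always test CPU, add accelerators when present.
devices = ["cpu"]
if og.is_cuda_available():
    devices.append("cuda")
if og.is_dml_available():
    devices.append("dml")
```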
