diff --git a/.github/workflows/promptflow-evals-e2e-test.yml b/.github/workflows/promptflow-evals-e2e-test-azure.yml
similarity index 90%
rename from .github/workflows/promptflow-evals-e2e-test.yml
rename to .github/workflows/promptflow-evals-e2e-test-azure.yml
index 606d827723f..75cb2c5d422 100644
--- a/.github/workflows/promptflow-evals-e2e-test.yml
+++ b/.github/workflows/promptflow-evals-e2e-test-azure.yml
@@ -1,4 +1,4 @@
-name: promptflow-evals-e2e-test
+name: promptflow-evals-e2e-test-azure
 
 on:
   schedule:
@@ -6,7 +6,7 @@ on:
   pull_request:
     paths:
       - src/promptflow-evals/**
-      - .github/workflows/promptflow-evals-e2e-test.yml
+      - .github/workflows/promptflow-evals-e2e-test-azure.yml
   workflow_dispatch:
 
 env:
@@ -83,10 +83,10 @@ jobs:
           creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }}
           enable-AzPSSession: true
       - name: run e2e tests
-        id: run_all_e2e_tests
+        id: run_e2e_tests_azure
         run: |
-          poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
-          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity all_e2e_tests_run_times --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
+          poetry run pytest -m azuretest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_azure --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
         working-directory: ${{ env.WORKING_DIRECTORY }}
       - name: upload coverage report
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml
new file mode 100644
index 00000000000..f5cef2aa4d2
--- /dev/null
+++ b/.github/workflows/promptflow-evals-e2e-test-local.yml
@@ -0,0 +1,98 @@
+name: promptflow-evals-e2e-test-local
+
+on:
+  schedule:
+    - cron: "40 10 * * *"  # 2:40 PST every day
+  pull_request:
+    paths:
+      - src/promptflow-evals/**
+      - .github/workflows/promptflow-evals-e2e-test-local.yml
+  workflow_dispatch:
+
+env:
+  IS_IN_CI_PIPELINE: "true"
+  WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-13]
+        # TODO: We encountered a hash mismatch for the ubuntu-latest and 3.9 combination while installing the promptflow-evals package
+        # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158
+        # Add 3.9 back after we figure out the issue
+        python-version: ['3.8', '3.9', '3.10', '3.11']
+      fail-fast: false
+    # snok/install-poetry needs this to support Windows
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: set test mode
+        # Always run in replay mode for now, until we figure out the test resources needed to run in live mode
+        run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
+        #run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: snok/install-poetry@v1
+      - name: install test dependency group
+        run: poetry install --only test
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: install promptflow packages in editable mode
+        run: |
+          poetry run pip install -e ../promptflow
+          poetry run pip install -e ../promptflow-core
+          poetry run pip install -e ../promptflow-devkit
+          poetry run pip install -e ../promptflow-tracing
+          poetry run pip install -e ../promptflow-tools
+          poetry run pip install -e ../promptflow-evals
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: generate end-to-end test config from secret
+        run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: check azure is not installed
+        run: poetry run pytest ../../scripts/code_qa/assert_local_install.py
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: run e2e tests
+        id: run_e2e_tests_local
+        run: |
+          poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: upload coverage report
+        uses: actions/upload-artifact@v4
+        with:
+          name: report-${{ matrix.os }}-py${{ matrix.python-version }}
+          path: |
+            ${{ env.WORKING_DIRECTORY }}/*.xml
+            ${{ env.WORKING_DIRECTORY }}/htmlcov/
+
+  report:
+    needs: test
+    runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      pull-requests: write
+      contents: read
+      issues: read
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+      - uses: EnricoMi/publish-unit-test-result-action@v2
+        with:
+          check_name: promptflow-evals test result
+          comment_title: promptflow-evals test result
+          files: "artifacts/**/test-results.xml"  # align with `--junit-xml` in pyproject.toml
+      - uses: irongut/CodeCoverageSummary@v1.3.0
+        with:
+          filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
+          badge: true
+          fail_below_min: false
+          format: markdown
+          hide_complexity: true
+          output: both
+          thresholds: 40 80
diff --git a/.gitignore b/.gitignore
index 9ef59b176df..5489d5b6745 100644
--- a/.gitignore
+++ b/.gitignore
@@ -197,6 +197,7 @@ src/promptflow-*/promptflow/__init__.py
 # Eclipse project files
 **/.project
 **/.pydevproject
+**/.settings
 
 # benchmark results
 benchmark/promptflow-serve/test_runner/locust-results/
\ No newline at end of file
diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py
new file mode 100644
index 00000000000..3c9f56bd6d5
--- /dev/null
+++ b/scripts/code_qa/assert_local_install.py
@@ -0,0 +1,21 @@
+"""Tests checking that azure packages are NOT installed."""
+import importlib
+import pytest
+
+
+class TestPackagesNotInstalled():
+    """Test imports."""
+
+    @pytest.mark.parametrize('package', [
+        'promptflow.azure',
+        'azure.ai.ml',
+        'azure.identity',
+        'azure.storage.blob'
+    ])
+    def test_promptflow_azure(self, package):
+        """Test that the azure package is not installed."""
+        try:
+            importlib.import_module(package)
+            assert False, f'Package {package} must be uninstalled for local test.'
+        except (ModuleNotFoundError, ImportError):
+            pass
diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
index 6142747def4..01898087ebd 100644
--- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
+++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
@@ -21,10 +21,21 @@
 
 LOGGER = logging.getLogger(__name__)
 
+
+# Handle optional import. The azure libraries are only present if
+# promptflow-azure is installed.
 try:
-    from azure.storage.blob import BlobServiceClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration
-except ImportError:
+    from azure.ai.ml.entities._datastore.datastore import Datastore
+    from azure.storage.blob import BlobServiceClient
+except (ModuleNotFoundError, ImportError):
+    # If the modules above cannot be imported, we are running
+    # in local mode and the MLClient in the constructor will be None, so
+    # we will never reach the Azure-dependent code.
+
+    # We log the import failure only at the debug level because:
+    # - If the project configuration was not provided, this import is not needed.
+    # - If the project configuration was provided, the error will be raised by PFClient.
     LOGGER.debug("promptflow.azure is not installed.")
 
 
@@ -413,7 +424,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART
         if response.status_code != 200:
             self._log_warning('register artifact', response)
 
-    def _get_datastore_credential(self, datastore: 'Datastore'):
+    def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
         credential = datastore.credentials
diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py
index 4d263152e7b..e184b334628 100644
--- a/src/promptflow-evals/tests/evals/conftest.py
+++ b/src/promptflow-evals/tests/evals/conftest.py
@@ -4,9 +4,8 @@
 from typing import Dict
 from unittest.mock import patch
 
-import jwt
 import pytest
-from azure.ai.ml._ml_client import MLClient
+
 from pytest_mock import MockerFixture
 
 from promptflow.client import PFClient
@@ -20,8 +19,8 @@
     from promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay
 except ImportError as e:
     print(f"Failed to import promptflow-recording: {e}")
-    # Run test in empty mode if promptflow-recording is not installed
 
+
     def recording_array_reset():
         pass
 
@@ -37,6 +36,13 @@ def is_record():
     def is_replay():
         return False
 
+# Import of optional packages
+AZURE_INSTALLED = True
+try:
+    import jwt
+    from azure.ai.ml._ml_client import MLClient
+except ImportError:
+    AZURE_INSTALLED = False
 
 PROMPTFLOW_ROOT = Path(__file__) / "../../../.."
 CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix()
@@ -147,12 +153,15 @@ def mock_validate_trace_destination():
 @pytest.fixture
 def azure_ml_client(project_scope: Dict):
     """The fixture, returning MLClient"""
-    return MLClient(
-        subscription_id=project_scope["subscription_id"],
-        resource_group_name=project_scope["resource_group_name"],
-        workspace_name=project_scope["project_name"],
-        credential=get_cred(),
-    )
+    if AZURE_INSTALLED:
+        return MLClient(
+            subscription_id=project_scope["subscription_id"],
+            resource_group_name=project_scope["resource_group_name"],
+            workspace_name=project_scope["project_name"],
+            credential=get_cred(),
+        )
+    else:
+        return None
 
 
 @pytest.fixture
@@ -293,6 +302,8 @@ def azure_cred():
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def user_object_id() -> str:
+    if not AZURE_INSTALLED:
+        return ""
     if pytest.is_replay:
         from promptflow.recording.azure import SanitizedValues
 
@@ -305,6 +316,8 @@ def user_object_id() -> str:
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def tenant_id() -> str:
+    if not AZURE_INSTALLED:
+        return ""
     if pytest.is_replay:
         from promptflow.recording.azure import SanitizedValues
 
@@ -317,9 +330,12 @@ def tenant_id() -> str:
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def variable_recorder():
-    from promptflow.recording.azure import VariableRecorder
+    if pytest.is_record or pytest.is_replay:
+        from promptflow.recording.azure import VariableRecorder
 
-    yield VariableRecorder()
+        yield VariableRecorder()
+    else:
+        yield None
 
 
 @pytest.fixture(scope=package_scope_in_live_mode())
@@ -346,3 +362,18 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id
         yield recording
     else:
         yield None
+
+
+def pytest_collection_modifyitems(items):
+    parents = {}
+    for item in items:
+        # If the parent (e.g. the test class) carries the 'localtest' marker, remove it from the parent.
+        if any(mark.name == 'localtest' for mark in item.parent.own_markers) or id(item.parent) in parents:
+            if id(item.parent) not in parents:
+                item.parent.own_markers = [
+                    marker for marker in item.own_markers if getattr(marker, 'name', None) != 'localtest']
+                parents[id(item.parent)] = item.parent
+            if not item.get_closest_marker('azuretest'):
+                # If the item's parent was marked as 'localtest', mark the child as such, but not if
+                # the item itself is marked as 'azuretest'.
+                item.add_marker(pytest.mark.localtest)
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
index 1faef92a46a..16cd0bab1df 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
@@ -6,7 +6,7 @@
 
 
 @pytest.mark.usefixtures("recording_injection")
-@pytest.mark.e2etest
+@pytest.mark.azuretest
 class TestAdvSimulator:
     @pytest.mark.usefixtures("vcr_recording")
     def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope):
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
index 17bfb5029cf..e1a305ca388 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
@@ -11,7 +11,7 @@
 
 
 @pytest.mark.usefixtures("recording_injection", "vcr_recording")
-@pytest.mark.e2etest
+@pytest.mark.localtest
 class TestBuiltInEvaluators:
     def test_individual_evaluator_prompt_based(self, model_config):
         eval_fn = FluencyEvaluator(model_config)
@@ -31,6 +31,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config):
         assert score is not None
         assert score["gpt_fluency"] > 0.0
 
+    @pytest.mark.azuretest
     def test_individual_evaluator_service_based(self, project_scope, azure_cred):
         eval_fn = ViolenceEvaluator(project_scope, azure_cred)
         score = eval_fn(
@@ -73,6 +74,7 @@ def test_composite_evaluator_qa(self, model_config, parallel):
         assert score["gpt_similarity"] > 0.0
         assert score["f1_score"] > 0.0
 
+    @pytest.mark.azuretest
     def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred)
         score = safety_eval(
@@ -156,6 +158,7 @@ def test_composite_evaluator_chat(self, model_config, eval_last_turn, parallel):
         assert score["evaluation_per_turn"]["gpt_retrieval"] is not None
         assert len(score["evaluation_per_turn"]["gpt_retrieval"]["score"]) == turn_count
 
+    @pytest.mark.azuretest
     @pytest.mark.parametrize(
         "eval_last_turn, parallel",
         [
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
index 16b06daf85f..f57c05e35ce 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -6,7 +6,6 @@
 import pandas as pd
 import pytest
 import requests
-from azure.identity import DefaultAzureCredential
 
 from promptflow.evals.evaluate import evaluate
 from promptflow.evals.evaluators import ContentSafetyEvaluator, F1ScoreEvaluator, GroundednessEvaluator
@@ -46,6 +45,7 @@ def question_evaluator(question):
 
 def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
     """Get run info from run history"""
+    from azure.identity import DefaultAzureCredential
     token = "Bearer " + DefaultAzureCredential().get_token("https://management.azure.com/.default").token
     headers = {
         "Authorization": token,
@@ -80,7 +80,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
 
 
 @pytest.mark.usefixtures("recording_injection")
-@pytest.mark.e2etest
+@pytest.mark.localtest
 class TestEvaluate:
     def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         # data
@@ -118,6 +118,7 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         assert row_result_df["outputs.f1_score.f1_score"][2] == 1
         assert result["studio_url"] is None
 
+    @pytest.mark.azuretest
     @pytest.mark.skip(reason="Failed in CI pipeline. Pending for investigation.")
     def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred):
         input_data = pd.read_json(data_file, lines=True)
@@ -301,6 +302,7 @@ def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config):
         assert "answer.length" in metrics.keys()
         assert "f1_score.f1_score" in metrics.keys()
 
+    @pytest.mark.azuretest
     def test_evaluate_track_in_cloud(
         self,
         questions_file,
@@ -344,6 +346,7 @@ def test_evaluate_track_in_cloud(
         assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run"
         assert remote_run["runMetadata"]["displayName"] == evaluation_name
 
+    @pytest.mark.azuretest
     def test_evaluate_track_in_cloud_no_target(
         self,
         data_file,
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
index b7dcaf07326..c027ac98a33 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
@@ -10,8 +10,12 @@
 from promptflow.evals.evaluate._eval_run import EvalRun
 from promptflow.evals.evaluate._evaluate import evaluate
 from promptflow.evals.evaluators._f1_score._f1_score import F1ScoreEvaluator
-from promptflow.recording.record_mode import is_live
 from promptflow.tracing import _start_trace
+try:
+    from promptflow.recording.record_mode import is_live
+except ModuleNotFoundError:
+    # promptflow-recording is not installed when this file is collected by the local tests
+    pass
 
 
 @pytest.fixture
@@ -38,7 +42,7 @@ def tracking_uri(azure_ml_client, project_scope):
 
 
 @pytest.mark.usefixtures("model_config", "recording_injection", "project_scope")
-@pytest.mark.e2etest
+@pytest.mark.azuretest
 class TestMetricsUpload(object):
     """End to end tests to check how the metrics were uploaded to cloud."""
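
Note (not part of the patch): the split above hinges on an optional-import guard. The azure SDKs are imported inside a try/except, a module-level flag records whether they are available, and Azure-only code paths fall back to None so the local test run never touches them. A minimal standalone sketch of that pattern follows; the get_ml_client helper and the public azure.ai.ml import path are illustrative assumptions here (conftest.py itself imports the private azure.ai.ml._ml_client module from inside a pytest fixture).

# Illustrative sketch only: optional-dependency guard mirroring the conftest.py approach.
AZURE_INSTALLED = True
try:
    from azure.ai.ml import MLClient  # optional; only present when the azure extras are installed
except (ModuleNotFoundError, ImportError):
    AZURE_INSTALLED = False


def get_ml_client(subscription_id, resource_group_name, workspace_name, credential):
    """Hypothetical helper: return an MLClient when azure is installed, otherwise None."""
    if not AZURE_INSTALLED:
        return None
    return MLClient(
        subscription_id=subscription_id,
        resource_group_name=resource_group_name,
        workspace_name=workspace_name,
        credential=credential,
    )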