 Feature: Pipeline tests using the movies dataset
   Tests for the processing framework which use the movies dataset.

   This tests submissions in JSON format, with configuration in JSON config files.
   Complex types are tested (arrays, nested structs)

   Some validation of entity attributes is performed: SQL expressions and Python filter
   functions are used, and templatable business rules feature in the transformations.

   Scenario: Validate and filter movies (spark)
     Given I submit the movies file movies.json for processing
     And A spark pipeline is configured
     And I create the following reference data tables in the database movies_refdata
       | table_name | parquet_path                                          |
       | sequels    | tests/testdata/movies/refdata/movies_sequels.parquet |
     And I add initial audit entries for the submission
     Then the latest audit record for the submission is marked with processing status file_transformation
     When I run the file transformation phase
     Then the movies entity is stored as a parquet after the file_transformation phase
     And the latest audit record for the submission is marked with processing status data_contract
     When I run the data contract phase
     Then there are 3 record rejections from the data_contract phase
     And there are errors with the following details and associated error_count from the data_contract phase
-      | ErrorCode | ErrorMessage                              | error_count |
-      | BLANKYEAR | year not provided                         | 1           |
-      | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
-      | DODGYDATE | date_joined value is not valid: daft_date | 1           |
+      | Entity             | ErrorCode | ErrorMessage                              | error_count |
+      | movies             | BLANKYEAR | year not provided                         | 1           |
+      | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
+      | movies             | DODGYDATE | date_joined value is not valid: daft_date | 1           |
     And the movies entity is stored as a parquet after the data_contract phase
     And the latest audit record for the submission is marked with processing status business_rules
     When I run the business rules phase
     Then The rules restrict "movies" to 4 qualifying records
     And there are errors with the following details and associated error_count from the business_rules phase
       | ErrorCode       | ErrorMessage                                           | error_count |
       | LIMITED_RATINGS | Movie has too few ratings ([6.1])                      | 1           |
       | RUBBISH_SEQUEL  | The movie The Greatest Movie Ever has a rubbish sequel | 1           |
     And the latest audit record for the submission is marked with processing status error_report
     When I run the error report phase
     Then An error report is produced
     And The statistics entry for the submission shows the following information
       | parameter                | value |
       | record_count             | 5     |
       | number_record_rejections | 4     |
       | number_warnings          | 1     |
     And the error aggregates are persisted

   Scenario: Validate and filter movies (duckdb)
     Given I submit the movies file movies.json for processing
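
The substance of this change, in both hunks, is the new Entity column in the expected data_contract errors, so the step that checks this table now has to key rejections by entity as well as by error code. The repository's step definitions aren't shown here; below is a minimal sketch assuming a behave step, where get_phase_errors is a hypothetical helper returning one dict per distinct error with the same column names the Gherkin table uses.

# Sketch only: assumes behave; get_phase_errors() is hypothetical.
from behave import then

@then("there are errors with the following details and associated "
      "error_count from the {phase} phase")
def check_phase_errors(context, phase):
    # Key on whichever columns the scenario's table declares: the
    # business_rules table has no Entity column, the data_contract
    # table now does, so the keys adapt to the table headings.
    key_cols = [h for h in context.table.headings if h != "error_count"]
    expected = {tuple(r[c] for c in key_cols): int(r["error_count"])
                for r in context.table}
    actual = {tuple(e[c] for c in key_cols): e["error_count"]
              for e in get_phase_errors(context, phase)}
    assert actual == expected, f"{phase} errors differed: {actual} != {expected}"

Keying on the declared headings lets the same step serve both tables, which is presumably why the business_rules expectations did not need to change.
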
@@ -57,10 +57,10 @@ Feature: Pipeline tests using the movies dataset
     When I run the data contract phase
     Then there are 3 record rejections from the data_contract phase
     And there are errors with the following details and associated error_count from the data_contract phase
-      | ErrorCode | ErrorMessage                              | error_count |
-      | BLANKYEAR | year not provided                         | 1           |
-      | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
-      | DODGYDATE | date_joined value is not valid: daft_date | 1           |
+      | Entity             | ErrorCode | ErrorMessage                              | error_count |
+      | movies             | BLANKYEAR | year not provided                         | 1           |
+      | movies_rename_test | DODGYYEAR | year value (NOT_A_NUMBER) is invalid      | 1           |
+      | movies             | DODGYDATE | date_joined value is not valid: daft_date | 1           |
     And the movies entity is stored as a parquet after the data_contract phase
     And the latest audit record for the submission is marked with processing status business_rules
     When I run the business rules phase
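
Both scenarios run the same steps against a spark and a duckdb pipeline, and each needs the sequels parquet registered as movies_refdata.sequels before the business rules phase can join against it. The following is a sketch of how the "I create the following reference data tables" step could register the file under either engine; create_refdata_table and the engine switch are assumptions, and only the schema name, table name, and parquet path come from the feature.

# Sketch only: register a parquet file as a reference data table under
# whichever engine the scenario configured.
def create_refdata_table(engine, table_name, parquet_path, spark=None, con=None):
    if engine == "spark":
        spark.sql("CREATE DATABASE IF NOT EXISTS movies_refdata")
        (spark.read.parquet(parquet_path)
              .write.mode("overwrite")
              .saveAsTable(f"movies_refdata.{table_name}"))
    else:  # duckdb: con is a duckdb.DuckDBPyConnection
        con.execute("CREATE SCHEMA IF NOT EXISTS movies_refdata")
        con.execute(f"CREATE OR REPLACE TABLE movies_refdata.{table_name} AS "
                    f"SELECT * FROM read_parquet('{parquet_path}')")

# e.g. create_refdata_table("duckdb", "sequels",
#          "tests/testdata/movies/refdata/movies_sequels.parquet", con=con)

Keeping the registration behind a single parameterised step is what lets the two scenarios stay textually identical apart from the pipeline they configure.
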