Merged
31 commits
5639642
Applying jsonata
gerrycampion Aug 19, 2025
8ffd326
Merge remote-tracking branch 'origin/main' into 1104-dataservice-and-…
gerrycampion Aug 19, 2025
edd1255
Need the builder
gerrycampion Aug 19, 2025
7a401b4
reporting works
gerrycampion Aug 21, 2025
3eef17c
jsonata can now be direct value of the Check property
gerrycampion Aug 22, 2025
a52cf9f
Ability to load custom functions
gerrycampion Aug 26, 2025
3d1439f
fix unit test missing arg
gerrycampion Aug 26, 2025
ed8da7a
Fix unit test missing arg
gerrycampion Aug 26, 2025
667b125
Merge branch 'main' into 1104-dataservice-and-datareader-for-arbitrar…
gerrycampion Aug 26, 2025
20fbfe1
jsonata unit test
gerrycampion Sep 3, 2025
cb81c6d
Merge branch 'main' into 1104-dataservice-and-datareader-for-arbitrar…
gerrycampion Sep 3, 2025
fe20f97
Added docs and fixed report message
gerrycampion Sep 4, 2025
0f5dcc7
Merge branch 'main' into 1104-dataservice-and-datareader-for-arbitrar…
gerrycampion Sep 4, 2025
fddb998
process rule once for jsonata
gerrycampion Sep 15, 2025
5a06310
Update result property names
gerrycampion Sep 16, 2025
84ee8fa
fixed json for utf8, added error handling
gerrycampion Sep 18, 2025
b89c812
error handling for jsonata loading
gerrycampion Sep 21, 2025
dacf981
Schema fix
gerrycampion Sep 21, 2025
c5a2de9
Added builtin jsonata functions, cli multi function paths, better err…
gerrycampion Sep 23, 2025
01c3d84
Merge branch 'main' into 1104-dataservice-and-datareader-for-arbitrar…
gerrycampion Sep 23, 2025
3ed3311
removed unneeded jsonata files. fixed md indentation
gerrycampion Sep 29, 2025
3de61a4
jsonata ignore scope
gerrycampion Sep 30, 2025
69bb7a2
tweak output variables mapping
gerrycampion Sep 30, 2025
d755646
jsonata scope readme update
gerrycampion Sep 30, 2025
b59660b
Merge branch 'main' into 1104-dataservice-and-datareader-for-arbitrar…
gerrycampion Sep 30, 2025
d1d0160
Merge branch 'main' into 1104-dataservice-and-datareader-for-arbitrar…
gerrycampion Oct 10, 2025
17295c5
Add _path preprocessing
gerrycampion Oct 10, 2025
2413bd1
Merge branch 'main' into 1104-dataservice-and-datareader-for-arbitrar…
gerrycampion Oct 13, 2025
07fa834
format fix
gerrycampion Oct 13, 2025
6ab5015
fix merge bug
gerrycampion Oct 13, 2025
770f8fa
missing test new args
gerrycampion Oct 13, 2025
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -5,7 +5,7 @@ repos:
       - id: black
         language_version: python3
   - repo: https://github.com/pycqa/flake8
-    rev: 5.0.4
+    rev: 6.1.0
     hooks:
       - id: flake8
         language_version: python3
1 change: 1 addition & 0 deletions README.md
@@ -194,6 +194,7 @@ Run `python core.py validate --help` to see the list of validation options.
                        progress. By default a progress bar like
                        "[████████████████████████████--------]
                        78%" is printed.
+  -jcf, --jsonata-custom-functions  Pair containing a variable name and a path to a directory containing a set of custom JSONata functions. Can be specified multiple times
   --help               Show this message and exit.
 ```

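For context, a hypothetical invocation of the new option (the variable names `myfns`/`extra` and the directories are placeholders, not paths from this PR; the other flags follow the README's existing `core.py validate` usage):

```
python core.py validate -s sdtmig -v 3-4 -d ./data \
    -jcf myfns ./jsonata/custom_functions \
    -jcf extra ./jsonata/more_functions
```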
4 changes: 4 additions & 0 deletions cdisc_rules_engine/dataset_builders/dataset_builder_factory.py
@@ -1,6 +1,9 @@
 # flake8: noqa
 from typing import Type

+from cdisc_rules_engine.dataset_builders.jsonata_dataset_builder import (
+    JSONataDatasetBuilder,
+)
 from cdisc_rules_engine.interfaces import FactoryInterface
 from cdisc_rules_engine.dataset_builders.contents_dataset_builder import (
     ContentsDatasetBuilder,
@@ -73,6 +76,7 @@ class DatasetBuilderFactory(FactoryInterface):
         RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE_XML_AND_LIBRARY.value: VariablesMetadataWithDefineAndLibraryDatasetBuilder,
         RuleTypes.VALUE_CHECK_WITH_DATASET_METADATA.value: ValueCheckDatasetMetadataDatasetBuilder,
         RuleTypes.VALUE_CHECK_WITH_VARIABLE_METADATA.value: ValueCheckVariableMetadataDatasetBuilder,
+        RuleTypes.JSONATA.value: JSONataDatasetBuilder,
     }

     @classmethod
35 changes: 35 additions & 0 deletions cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py
@@ -0,0 +1,35 @@
from json import load
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder


def add_json_pointer_paths(node, path=""):
    """
    Recursively adds a '_path' attribute to each dict node in the JSON structure,
    using JSON Pointer syntax.
    """
    if isinstance(node, dict):
        node["_path"] = path
        for key, value in node.items():
            if key != "_path":
                add_json_pointer_paths(value, f"{path}/{key}")
    elif isinstance(node, list):
        for idx, item in enumerate(node):
            add_json_pointer_paths(item, f"{path}/{idx}")


class JSONataDatasetBuilder(BaseDatasetBuilder):

    def get_dataset(self, **kwargs):
        if hasattr(self.data_service, "dataset_path"):
            dataset_path = self.data_service.dataset_path
        elif (
            hasattr(self.data_service, "dataset_paths")
            and len(self.data_service.dataset_paths) == 1
        ):
            dataset_path = self.data_service.dataset_paths[0]
        else:
            return None
        with self.data_service.read_data(dataset_path) as fp:
            json = load(fp)
        add_json_pointer_paths(json)
        return json
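To make the `_path` preprocessing concrete, a quick sketch with invented sample data:

```python
doc = {"study": {"datasets": [{"name": "DM"}, {"name": "AE"}]}}
add_json_pointer_paths(doc)

# Every dict now carries its own JSON Pointer, so a JSONata rule can report
# exactly where a failing node lives in the document:
assert doc["_path"] == ""
assert doc["study"]["_path"] == "/study"
assert doc["study"]["datasets"][0]["_path"] == "/study/datasets/0"
assert doc["study"]["datasets"][1]["_path"] == "/study/datasets/1"
```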
6 changes: 4 additions & 2 deletions cdisc_rules_engine/enums/default_file_paths.py
@@ -1,9 +1,11 @@
+from os.path import join
 from cdisc_rules_engine.enums.base_enum import BaseEnum


 class DefaultFilePaths(BaseEnum):
-    CACHE = "resources/cache"
-    EXCEL_TEMPLATE_FILE = "resources/templates/report-template.xlsx"
+    CACHE = join("resources", "cache")
+    EXCEL_TEMPLATE_FILE = join("resources", "templates", "report-template.xlsx")
+    JSONATA_UTILS = join("resources", "jsonata")
     RULES_CACHE_FILE = "rules.pkl"
     RULES_DICTIONARY = "rules_dictionary.pkl"
     STANDARD_DETAILS_CACHE_FILE = "standards_details.pkl"
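The switch to `os.path.join` makes these defaults portable across operating systems; a one-line sketch of the difference:

```python
from os.path import join

# Produces "resources/cache" on Linux/macOS and "resources\\cache" on Windows,
# instead of hard-coding the forward slash.
join("resources", "cache")
```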
1 change: 1 addition & 0 deletions cdisc_rules_engine/enums/rule_types.py
@@ -11,6 +11,7 @@ class RuleTypes(BaseEnum):
     DEFINE_ITEM_GROUP_METADATA_CHECK = "Define Item Group Metadata Check"
     DEFINE_ITEM_METADATA_CHECK = "Define Item Metadata Check"
     DOMAIN_PRESENCE_CHECK = "Domain Presence Check"
+    JSONATA = "JSONata"
     VALUE_LEVEL_METADATA_CHECK_AGAINST_DEFINE = (
         "Value Level Metadata Check against Define XML"
     )
5 changes: 5 additions & 0 deletions cdisc_rules_engine/exceptions/custom_exceptions.py
@@ -57,6 +57,11 @@ class InvalidDatasetFormat(EngineError):
     description = "Dataset data is malformed."


+class InvalidJSONFormat(EngineError):
+    code = 400
+    description = "JSON data is malformed."
+
+
 class NumberOfAttemptsExceeded(EngineError):
     pass

16 changes: 15 additions & 1 deletion cdisc_rules_engine/models/rule.py
@@ -37,6 +37,7 @@ def __init__(self, record_params: dict):
     @classmethod
     def from_cdisc_metadata(cls, rule_metadata: dict) -> dict:
         if cls.is_cdisc_rule_metadata(rule_metadata):
+            rule_metadata = cls.spaces_to_underscores(rule_metadata)
             authorities = rule_metadata.get("Authorities", [])
             executable_rule = {
                 "core_id": rule_metadata.get("Core", {}).get("Id"),
@@ -72,6 +73,17 @@ def from_cdisc_metadata(cls, rule_metadata: dict) -> dict:
         else:
             return rule_metadata

+    @classmethod
+    def spaces_to_underscores(cls, obj):
+        if isinstance(obj, dict):
+            return {
+                key.replace(" ", "_"): cls.spaces_to_underscores(value)
+                for key, value in obj.items()
+            }
+        if isinstance(obj, list):
+            return [cls.spaces_to_underscores(item) for item in obj]
+        return obj
+
     @classmethod
     def parse_standards(cls, authorities: List[dict]) -> List[dict]:
         standards = []
@@ -99,9 +111,11 @@ def is_cdisc_rule_metadata(cls, rule_metadata: dict) -> bool:
         return "Core" in rule_metadata

     @classmethod
-    def parse_conditions(cls, conditions: dict) -> dict:
+    def parse_conditions(cls, conditions: dict | str) -> dict | str:
         if not conditions:
             raise ValueError("No check data provided")
+        if isinstance(conditions, str):
+            return conditions
         all_conditions = conditions.get("all")
         any_conditions = conditions.get("any")
         not_condition = conditions.get("not")
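Taken together, these changes let a rule's `Check` carry a raw JSONata expression. A minimal sketch of the two new behaviors (metadata and expression invented for illustration):

```python
# Keys with spaces in CDISC rule metadata are normalized recursively:
meta = {"Rule Type": "JSONata", "Core": {"Id": "EXAMPLE.1"}}
assert Rule.spaces_to_underscores(meta) == {
    "Rule_Type": "JSONata",
    "Core": {"Id": "EXAMPLE.1"},
}

# ...and a string Check (a JSONata expression) now passes through
# parse_conditions untouched instead of being parsed into all/any/not lists:
expr = "$count(study.datasets) = 0"
assert Rule.parse_conditions(expr) == expr
```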
6 changes: 5 additions & 1 deletion cdisc_rules_engine/models/rule_conditions/condition_composite_factory.py
@@ -14,7 +14,11 @@ class ConditionCompositeFactory:
     """

     @classmethod
-    def get_condition_composite(cls, conditions: dict) -> ConditionInterface:
+    def get_condition_composite(
+        cls, conditions: dict | str
+    ) -> ConditionInterface | str:
+        if isinstance(conditions, str):
+            return conditions
         composite = ConditionComposite()
         for key, condition_list in conditions.items():
             # validate the rule structure
1 change: 1 addition & 0 deletions cdisc_rules_engine/models/validation_args.py
@@ -24,6 +24,7 @@
         "progress",
         "define_xml_path",
         "validate_xml",
+        "jsonata_custom_functions",
         "max_report_rows",
         "max_errors_per_rule",
     ],
82 changes: 54 additions & 28 deletions cdisc_rules_engine/rules_engine.py
@@ -10,6 +10,7 @@
 from cdisc_rules_engine.exceptions.custom_exceptions import (
     DatasetNotFoundError,
     DomainNotFoundInDefineXMLError,
+    InvalidJSONFormat,
     RuleFormatError,
     VariableMetadataNotFoundError,
     FailedSchemaValidation,
@@ -36,6 +37,7 @@
 from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import (
     DefineXMLReaderFactory,
 )
+from cdisc_rules_engine.utilities.jsonata_processor import JSONataProcessor
 from cdisc_rules_engine.utilities.data_processor import DataProcessor
 from cdisc_rules_engine.utilities.dataset_preprocessor import DatasetPreprocessor
 from cdisc_rules_engine.utilities.rule_processor import RuleProcessor
@@ -92,6 +94,9 @@ def __init__(
         self.external_dictionaries = external_dictionaries
         self.define_xml_path: str = kwargs.get("define_xml_path")
         self.validate_xml: bool = kwargs.get("validate_xml")
+        self.jsonata_custom_functions: tuple[()] | tuple[tuple[str, str], ...] = (
+            kwargs.get("jsonata_custom_functions", ())
+        )
         self.max_errors_per_rule: int = kwargs.get("max_errors_per_rule")

     def get_schema(self):
@@ -102,37 +107,47 @@ def validate_single_rule(self, rule: dict, datasets: Iterable[SDTMDatasetMetadat
         rule["conditions"] = ConditionCompositeFactory.get_condition_composite(
             rule["conditions"]
         )
-        total_errors = 0
-        for dataset_metadata in datasets:
-            if self.max_errors_per_rule and total_errors >= self.max_errors_per_rule:
-                logger.info(
-                    f"Rule {rule.get('core_id')}: Error limit ({self.max_errors_per_rule}) reached. "
-                    f"Skipping remaining datasets."
-                )
-                break
-            if dataset_metadata.unsplit_name in results and "domains" in rule:
-                include_split = rule["domains"].get("include_split_datasets", False)
-                if not include_split:
-                    continue  # handling split datasets
-            dataset_results = self.validate_single_dataset(
+        if rule.get("rule_type") == RuleTypes.JSONATA.value:
+            results["json"] = self.validate_single_dataset(
                 rule,
                 datasets,
-                dataset_metadata,
+                SDTMDatasetMetadata(name="json"),
             )
-            results[dataset_metadata.unsplit_name] = dataset_results
-            for result in dataset_results:
-                if result.get("executionStatus") == "success":
-                    total_errors += len(result.get("errors"))
-                    if (
-                        self.max_errors_per_rule
-                        and total_errors >= self.max_errors_per_rule
-                    ):
-                        logger.info(
-                            f"Rule {rule.get('core_id')}: Error limit ({self.max_errors_per_rule}) "
-                            f"reached after processing {dataset_metadata.name}. "
-                            f"Execution halted at {total_errors} total errors."
-                        )
-                        break
+        else:
+            total_errors = 0
+            for dataset_metadata in datasets:
+                if (
+                    self.max_errors_per_rule
+                    and total_errors >= self.max_errors_per_rule
+                ):
+                    logger.info(
+                        f"Rule {rule.get('core_id')}: Error limit ({self.max_errors_per_rule}) reached. "
+                        f"Skipping remaining datasets."
+                    )
+                    break
+                if dataset_metadata.unsplit_name in results and "domains" in rule:
+                    include_split = rule["domains"].get("include_split_datasets", False)
+                    if not include_split:
+                        continue  # handling split datasets
+                dataset_results = self.validate_single_dataset(
+                    rule,
+                    datasets,
+                    dataset_metadata,
+                )
+                results[dataset_metadata.unsplit_name] = dataset_results
+                for result in dataset_results:
+                    if result.get("executionStatus") == "success":
+                        total_errors += len(result.get("errors"))
+                        if (
+                            self.max_errors_per_rule
+                            and total_errors >= self.max_errors_per_rule
+                        ):
+                            logger.info(
+                                f"Rule {rule.get('core_id')}: Error limit ({self.max_errors_per_rule}) "
+                                f"reached after processing {dataset_metadata.name}. "
+                                f"Execution halted at {total_errors} total errors."
+                            )
+                            break
         return results

def validate_single_dataset(
Expand Down Expand Up @@ -304,6 +319,10 @@ def validate_rule(
return self.execute_rule(
rule_copy, dataset, datasets, dataset_metadata, **kwargs
)
elif rule.get("rule_type") == RuleTypes.JSONATA.value:
return JSONataProcessor.execute_jsonata_rule(
rule, dataset, self.jsonata_custom_functions
)

kwargs["ct_packages"] = list(self.ct_packages)

Expand Down Expand Up @@ -433,6 +452,13 @@ def handle_validation_exceptions( # noqa
message=exception.args[0],
)
message = "rule execution error"
elif isinstance(exception, InvalidJSONFormat):
error_obj = FailedValidationEntity(
dataset=os.path.basename(dataset_path),
error=InvalidJSONFormat.description,
message=exception.args[0],
)
message = "rule execution error"
elif isinstance(exception, FailedSchemaValidation):
if self.validate_xml:
error_obj: ValidationErrorContainer = ValidationErrorContainer(
Expand Down
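For orientation, the repeated `-jcf` pairs reach the engine as a tuple of two-item tuples, matching the `tuple[()] | tuple[tuple[str, str], ...]` annotation in `__init__`; a minimal sketch of the shape (values are placeholders, not from this PR):

```python
# Each repeated -jcf option contributes one (variable_name, directory) pair;
# supplying none leaves the empty-tuple default seen in __init__ above.
jsonata_custom_functions = (
    ("myfns", "./jsonata/custom_functions"),
    ("extra", "./jsonata/more_functions"),
)
```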
4 changes: 2 additions & 2 deletions cdisc_rules_engine/services/data_readers/data_reader_factory.py
@@ -12,7 +12,7 @@
     DatasetNDJSONReader,
 )
 from cdisc_rules_engine.services.data_readers.parquet_reader import ParquetReader
-from cdisc_rules_engine.services.data_readers.usdm_json_reader import USDMJSONReader
+from cdisc_rules_engine.services.data_readers.json_reader import JSONReader
 from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
 from cdisc_rules_engine.models.dataset import PandasDataset

@@ -23,7 +23,7 @@ class DataReaderFactory(FactoryInterface):
         DataFormatTypes.PARQUET.value: ParquetReader,
         DataFormatTypes.JSON.value: DatasetJSONReader,
         DataFormatTypes.NDJSON.value: DatasetNDJSONReader,
-        DataFormatTypes.USDM.value: USDMJSONReader,
+        DataFormatTypes.USDM.value: JSONReader,
     }

     def __init__(self, service_name: str = None, dataset_implementation=PandasDataset):
14 changes: 6 additions & 8 deletions cdisc_rules_engine/services/data_readers/dataset_json_reader.py
@@ -1,7 +1,6 @@
 import pandas as pd
 import dask.dataframe as dd
 import os
-import json
 import jsonschema

 from cdisc_rules_engine.interfaces import (
@@ -12,19 +11,18 @@
 from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset
 import tempfile

+from cdisc_rules_engine.services.data_readers.json_reader import JSONReader
+

 class DatasetJSONReader(DataReaderInterface):
     def get_schema(self) -> dict:
-        with open(
+        schema = JSONReader().from_file(
             os.path.join("resources", "schema", "dataset.schema.json")
-        ) as schemajson:
-            schema = schemajson.read()
-        return json.loads(schema)
+        )
+        return schema

     def read_json_file(self, file_path: str) -> dict:
-        with open(file_path, "r") as file:
-            datasetjson = json.load(file)
-        return datasetjson
+        return JSONReader().from_file(file_path)

     def _raw_dataset_from_file(self, file_path) -> pd.DataFrame:
         # Load Dataset-JSON Schema
9 changes: 5 additions & 4 deletions cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py
@@ -12,14 +12,15 @@
 from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset
 import tempfile

+from cdisc_rules_engine.services.data_readers.json_reader import JSONReader
+

 class DatasetNDJSONReader(DataReaderInterface):
     def get_schema(self) -> dict:
-        with open(
+        schema = JSONReader().from_file(
             os.path.join("resources", "schema", "dataset-ndjson-schema.json")
-        ) as schemandjson:
-            schema = schemandjson.read()
-        return json.loads(schema)
+        )
+        return schema

     def read_json_file(self, file_path: str) -> dict:
         with open(file_path, "r") as file:
21 changes: 21 additions & 0 deletions cdisc_rules_engine/services/data_readers/json_reader.py
@@ -0,0 +1,21 @@
from json import load
from cdisc_rules_engine.exceptions.custom_exceptions import InvalidJSONFormat
from cdisc_rules_engine.interfaces import (
    DataReaderInterface,
)


class JSONReader(DataReaderInterface):
    def from_file(self, file_path):
        try:
            with open(file_path, "rb") as fp:
                json = load(fp)
                return json
        except Exception as e:
            raise InvalidJSONFormat(
                f"\n Error reading JSON from: {file_path}"
                f"\n {type(e).__name__}: {e}"
            )

    def read(self, data):
        pass
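A small usage sketch (file names invented): the reader returns the parsed document on success and wraps any parse or I/O failure in `InvalidJSONFormat`:

```python
reader = JSONReader()
data = reader.from_file("study.json")   # parsed dict/list on success

try:
    reader.from_file("not-json.txt")
except InvalidJSONFormat as err:
    print(err)   # names the offending file plus the underlying exception type
```

Opening the file in binary mode lets `json.load` apply its own encoding detection for UTF-8/16/32 input (including a UTF-8 BOM), which lines up with the "fixed json for utf8" commit above.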