diff --git a/.github/workflows/lint_format.yml b/.github/workflows/lint_format.yml index 7582e7511..644be249d 100644 --- a/.github/workflows/lint_format.yml +++ b/.github/workflows/lint_format.yml @@ -34,8 +34,7 @@ jobs: python-version: "3.12" - name: Install linters run: | - pip install flake8==6.1.0 - pip install black==24.10.0 + pip install black flake8 -c requirements-dev.txt - name: Run flake8 run: | flake8 ${{needs.get_changed_files.outputs.py}} --count --select=E9,F63,F7,F82 --show-source --statistics diff --git a/.github/workflows/test_unit.yml b/.github/workflows/test_unit.yml index 54bc21e1a..358be8b09 100644 --- a/.github/workflows/test_unit.yml +++ b/.github/workflows/test_unit.yml @@ -13,7 +13,7 @@ jobs: python-version: "3.12" - name: Install requirements run: | - pip install -r requirements.txt + pip install -r requirements-dev.txt - name: Running Tests run: | python -m pytest tests/unit/ --cov=cdisc_rules_engine --cov-fail-under=75 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d7d98fa12..fb6213a38 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: black language_version: python3 - repo: https://github.com/pycqa/flake8 - rev: 5.0.4 + rev: 6.1.0 hooks: - id: flake8 language_version: python3 diff --git a/README.md b/README.md index 708ea1579..6c2fd4cb5 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ In the terminal, navigate to the directory you intend to install CORE rules engi This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD. It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit. -Both dependencies are added to _requirements.txt_. +Both dependencies are added to _requirements-dev.txt_. **Required** @@ -101,7 +101,7 @@ NOTE: if you have multiple versions of python on your machine, you can call pyth - Install the requirements. 
-`python -m pip install -r requirements.txt` # From the root directory +`python -m pip install -r requirements-dev.txt` # From the root directory ### **Running The Tests** @@ -194,6 +194,7 @@ Run `python core.py validate --help` to see the list of validation options. progress. By default a progress bar like "[████████████████████████████--------] 78%"is printed. + -jcf, --jsonata-custom-functions Pair containing a variable name and a Path to directory containing a set of custom JSONata functions. Can be specified multiple times --help Show this message and exit. ``` diff --git a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py index 2831d26fe..84bea6f9b 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py +++ b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py @@ -1,6 +1,9 @@ # flake8: noqa from typing import Type +from cdisc_rules_engine.dataset_builders.jsonata_dataset_builder import ( + JSONataDatasetBuilder, +) from cdisc_rules_engine.interfaces import FactoryInterface from cdisc_rules_engine.dataset_builders.contents_dataset_builder import ( ContentsDatasetBuilder, @@ -73,6 +76,7 @@ class DatasetBuilderFactory(FactoryInterface): RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE_XML_AND_LIBRARY.value: VariablesMetadataWithDefineAndLibraryDatasetBuilder, RuleTypes.VALUE_CHECK_WITH_DATASET_METADATA.value: ValueCheckDatasetMetadataDatasetBuilder, RuleTypes.VALUE_CHECK_WITH_VARIABLE_METADATA.value: ValueCheckVariableMetadataDatasetBuilder, + RuleTypes.JSONATA.value: JSONataDatasetBuilder, } @classmethod diff --git a/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py new file mode 100644 index 000000000..9802b59fc --- /dev/null +++ b/cdisc_rules_engine/dataset_builders/jsonata_dataset_builder.py @@ -0,0 +1,35 @@ +from json import load +from 
cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder + + +def add_json_pointer_paths(node, path=""): + """ + Recursively adds a '_path' attribute to each dict node in the JSON structure, + using JSON Pointer syntax. + """ + if isinstance(node, dict): + node["_path"] = path + for key, value in node.items(): + if key != "_path": + add_json_pointer_paths(value, f"{path}/{key}") + elif isinstance(node, list): + for idx, item in enumerate(node): + add_json_pointer_paths(item, f"{path}/{idx}") + + +class JSONataDatasetBuilder(BaseDatasetBuilder): + + def get_dataset(self, **kwargs): + if hasattr(self.data_service, "dataset_path"): + dataset_path = self.data_service.dataset_path + elif ( + hasattr(self.data_service, "dataset_paths") + and len(self.data_service.dataset_paths) == 1 + ): + dataset_path = self.data_service.dataset_paths[0] + else: + return None + with self.data_service.read_data(dataset_path) as fp: + json = load(fp) + add_json_pointer_paths(json) + return json diff --git a/cdisc_rules_engine/enums/default_file_paths.py b/cdisc_rules_engine/enums/default_file_paths.py index ecebbf2fb..053698870 100644 --- a/cdisc_rules_engine/enums/default_file_paths.py +++ b/cdisc_rules_engine/enums/default_file_paths.py @@ -1,9 +1,11 @@ +from os.path import join from cdisc_rules_engine.enums.base_enum import BaseEnum class DefaultFilePaths(BaseEnum): - CACHE = "resources/cache" - EXCEL_TEMPLATE_FILE = "resources/templates/report-template.xlsx" + CACHE = join("resources", "cache") + EXCEL_TEMPLATE_FILE = join("resources", "templates", "report-template.xlsx") + JSONATA_UTILS = join("resources", "jsonata") RULES_CACHE_FILE = "rules.pkl" RULES_DICTIONARY = "rules_dictionary.pkl" STANDARD_DETAILS_CACHE_FILE = "standards_details.pkl" diff --git a/cdisc_rules_engine/enums/rule_types.py b/cdisc_rules_engine/enums/rule_types.py index 1c50193af..0680659d5 100644 --- a/cdisc_rules_engine/enums/rule_types.py +++ b/cdisc_rules_engine/enums/rule_types.py @@ 
-11,6 +11,7 @@ class RuleTypes(BaseEnum): DEFINE_ITEM_GROUP_METADATA_CHECK = "Define Item Group Metadata Check" DEFINE_ITEM_METADATA_CHECK = "Define Item Metadata Check" DOMAIN_PRESENCE_CHECK = "Domain Presence Check" + JSONATA = "JSONata" VALUE_LEVEL_METADATA_CHECK_AGAINST_DEFINE = ( "Value Level Metadata Check against Define XML" ) diff --git a/cdisc_rules_engine/exceptions/custom_exceptions.py b/cdisc_rules_engine/exceptions/custom_exceptions.py index 07bc5e3a2..ccb88c236 100644 --- a/cdisc_rules_engine/exceptions/custom_exceptions.py +++ b/cdisc_rules_engine/exceptions/custom_exceptions.py @@ -57,6 +57,11 @@ class InvalidDatasetFormat(EngineError): description = "Dataset data is malformed." +class InvalidJSONFormat(EngineError): + code = 400 + description = "JSON data is malformed." + + class NumberOfAttemptsExceeded(EngineError): pass diff --git a/cdisc_rules_engine/models/actions.py b/cdisc_rules_engine/models/actions.py index e1a642d9c..bbf73b5f0 100644 --- a/cdisc_rules_engine/models/actions.py +++ b/cdisc_rules_engine/models/actions.py @@ -95,22 +95,22 @@ def generate_targeted_error_object( "dataset": "ae.xpt", "row": 0, "value": {"STUDYID": "Not in dataset"}, - "uSubjId": "2", - "seq": 1, + "USUBJID": "2", + "SEQ": 1, }, { "dataset": "ae.xpt", "row": 1, "value": {"AESTDY": "test", "DOMAIN": "test"}, - "uSubjId": 7, - "seq": 2, + "USUBJID": 7, + "SEQ": 2, }, { "dataset": "ae.xpt", "row": 9, "value": {"AESTDY": "test", "DOMAIN": "test"}, - "uSubjId": 12, - "seq": 10, + "USUBJID": 12, + "SEQ": 10, }, ], "message": "AESTDY and DOMAIN are equal to test", @@ -170,13 +170,11 @@ def generate_targeted_error_object( self.rule.get("sensitivity") is not None ): # rule sensitivity is incorrectly defined error_entity = ValidationErrorEntity( - { - "dataset": "N/A", - "row": 0, - "value": {"ERROR": "Invalid or undefined sensitivity in the rule"}, - "uSubjId": "N/A", - "SEQ": 0, - } + dataset="N/A", + row=0, + value={"ERROR": "Invalid or undefined sensitivity in the 
rule"}, + USUBJID="N/A", + SEQ=0, ) return ValidationErrorContainer( domain=( @@ -193,19 +191,17 @@ def generate_targeted_error_object( data, targets_not_in_dataset, all_targets_missing, errors_df ) return ValidationErrorContainer( - **{ - "domain": ( - f"SUPP{self.dataset_metadata.rdomain}" - if self.dataset_metadata.is_supp - else (self.dataset_metadata.domain or self.dataset_metadata.name) - ), - "dataset": ", ".join( - sorted(set(error._dataset or "" for error in errors_list)) - ), - "targets": sorted(targets), - "errors": errors_list, - "message": message.replace("--", self.dataset_metadata.domain or ""), - } + domain=( + f"SUPP{self.dataset_metadata.rdomain}" + if self.dataset_metadata.is_supp + else (self.dataset_metadata.domain or self.dataset_metadata.name) + ), + dataset=", ".join( + sorted(set(error.dataset or "" for error in errors_list)) + ), + targets=sorted(targets), + errors=errors_list, + message=message.replace("--", self.dataset_metadata.domain or ""), ) def _generate_errors_by_target_presence( @@ -239,12 +235,12 @@ def _generate_errors_by_target_presence( }, dataset=self._get_dataset_name(pd.DataFrame([row])), row=int(row.get(SOURCE_ROW_NUMBER, idx + 1)), - usubjid=( + USUBJID=( str(row.get("USUBJID")) if "USUBJID" in row and not pd.isna(row["USUBJID"]) else None ), - sequence=( + SEQ=( int(row.get(f"{self.dataset_metadata.domain or ''}SEQ")) if f"{self.dataset_metadata.domain or ''}SEQ" in row and self._sequence_exists( @@ -318,10 +314,10 @@ def _create_error_object( ) ), # record number should start at 1, not 0 value=filtered_dict, - usubjid=( + USUBJID=( str(usubjid[df_row.name]) if isinstance(usubjid, pd.Series) else None ), - sequence=( + SEQ=( int(sequence[df_row.name]) if self._sequence_exists(sequence, df_row.name) else None diff --git a/cdisc_rules_engine/models/base_validation_entity.py b/cdisc_rules_engine/models/base_validation_entity.py index 33e6b2026..cae1d9ff6 100644 --- a/cdisc_rules_engine/models/base_validation_entity.py +++ 
b/cdisc_rules_engine/models/base_validation_entity.py @@ -1,8 +1,10 @@ from abc import ABC +from dataclasses import dataclass +from cdisc_rules_engine.enums.execution_status import ExecutionStatus from cdisc_rules_engine.interfaces import RepresentationInterface +@dataclass class BaseValidationEntity(RepresentationInterface, ABC): - def __init__(self): - self.status = None + status: ExecutionStatus diff --git a/cdisc_rules_engine/models/failed_validation_entity.py b/cdisc_rules_engine/models/failed_validation_entity.py index cb1689cf8..72ae2d778 100644 --- a/cdisc_rules_engine/models/failed_validation_entity.py +++ b/cdisc_rules_engine/models/failed_validation_entity.py @@ -1,23 +1,23 @@ +from dataclasses import dataclass from cdisc_rules_engine.enums.execution_status import ExecutionStatus - from .base_validation_entity import BaseValidationEntity +@dataclass(kw_only=True) class FailedValidationEntity(BaseValidationEntity): """ The entity describes an error that occurred during validation indicating that the process has finished its execution with error. 
""" - def __init__(self, error: str, message: str, dataset: str): - self.dataset = dataset - self._error = error - self._message = message - self.status = ExecutionStatus.EXECUTION_ERROR + dataset: str + error: str + message: str + status: ExecutionStatus = ExecutionStatus.EXECUTION_ERROR def to_representation(self) -> dict: return { "dataset": self.dataset, - "error": self._error, - "message": self._message, + "error": self.error, + "message": self.message, } diff --git a/cdisc_rules_engine/models/rule.py b/cdisc_rules_engine/models/rule.py index 163daf294..0140c8151 100644 --- a/cdisc_rules_engine/models/rule.py +++ b/cdisc_rules_engine/models/rule.py @@ -37,6 +37,7 @@ def __init__(self, record_params: dict): @classmethod def from_cdisc_metadata(cls, rule_metadata: dict) -> dict: if cls.is_cdisc_rule_metadata(rule_metadata): + rule_metadata = cls.spaces_to_underscores(rule_metadata) authorities = rule_metadata.get("Authorities", []) executable_rule = { "core_id": rule_metadata.get("Core", {}).get("Id"), @@ -72,6 +73,17 @@ def from_cdisc_metadata(cls, rule_metadata: dict) -> dict: else: return rule_metadata + @classmethod + def spaces_to_underscores(cls, obj): + if isinstance(obj, dict): + return { + key.replace(" ", "_"): cls.spaces_to_underscores(value) + for key, value in obj.items() + } + if isinstance(obj, list): + return [cls.spaces_to_underscores(item) for item in obj] + return obj + @classmethod def parse_standards(cls, authorities: List[dict]) -> List[dict]: standards = [] @@ -99,9 +111,11 @@ def is_cdisc_rule_metadata(cls, rule_metadata: dict) -> bool: return "Core" in rule_metadata @classmethod - def parse_conditions(cls, conditions: dict) -> dict: + def parse_conditions(cls, conditions: dict | str) -> dict | str: if not conditions: raise ValueError("No check data provided") + if isinstance(conditions, str): + return conditions all_conditions = conditions.get("all") any_conditions = conditions.get("any") not_condition = conditions.get("not") diff --git 
a/cdisc_rules_engine/models/rule_conditions/condition_composite_factory.py b/cdisc_rules_engine/models/rule_conditions/condition_composite_factory.py index 1b9a46ae7..60880941c 100644 --- a/cdisc_rules_engine/models/rule_conditions/condition_composite_factory.py +++ b/cdisc_rules_engine/models/rule_conditions/condition_composite_factory.py @@ -14,7 +14,11 @@ class ConditionCompositeFactory: """ @classmethod - def get_condition_composite(cls, conditions: dict) -> ConditionInterface: + def get_condition_composite( + cls, conditions: dict | str + ) -> ConditionInterface | str: + if isinstance(conditions, str): + return conditions composite = ConditionComposite() for key, condition_list in conditions.items(): # validate the rule structure diff --git a/cdisc_rules_engine/models/rule_validation_result.py b/cdisc_rules_engine/models/rule_validation_result.py index 49b664796..033aef4c4 100644 --- a/cdisc_rules_engine/models/rule_validation_result.py +++ b/cdisc_rules_engine/models/rule_validation_result.py @@ -1,21 +1,30 @@ -from typing import List, Union +from typing import List +from dataclasses import dataclass, field from cdisc_rules_engine.interfaces import RepresentationInterface from cdisc_rules_engine.utilities.utils import get_execution_status from cdisc_rules_engine.models.rule import Rule +@dataclass class RuleValidationResult(RepresentationInterface): - def __init__(self, rule: Rule, results: List[Union[dict, str]]): - self.id: str = rule.get("core_id") - self.cdisc_rule_id: str = self._get_rule_ids(rule, "CDISC") - self.fda_rule_id: str = self._get_rule_ids(rule, "FDA") - self.executability: str = rule.get("executability") + id: str | None = None + cdisc_rule_id: str | None = None + fda_rule_id: str | None = None + executability: str | None = None + message: str | None = None + execution_status: str | None = None + results: List[dict | str] = field(default_factory=list) + + def __init__(self, rule: Rule, results: List[dict | str]): + self.id = 
rule.get("core_id") + self.cdisc_rule_id = self._get_rule_ids(rule, "CDISC") + self.fda_rule_id = self._get_rule_ids(rule, "FDA") + self.executability = rule.get("executability") actions = rule.get("actions") - self.message: str = None if actions and len(actions) == 1: self.message = actions[0].get("params", {}).get("message") - self.execution_status: str = get_execution_status(results) + self.execution_status = get_execution_status(results) self.results = results def _get_rule_ids(self, rule: Rule, org: str) -> str: diff --git a/cdisc_rules_engine/models/validation_args.py b/cdisc_rules_engine/models/validation_args.py index 31a16edad..961ac2bf4 100644 --- a/cdisc_rules_engine/models/validation_args.py +++ b/cdisc_rules_engine/models/validation_args.py @@ -24,6 +24,7 @@ "progress", "define_xml_path", "validate_xml", + "jsonata_custom_functions", "max_report_rows", "max_errors_per_rule", ], diff --git a/cdisc_rules_engine/models/validation_error_container.py b/cdisc_rules_engine/models/validation_error_container.py index 3670febfd..dfdece607 100644 --- a/cdisc_rules_engine/models/validation_error_container.py +++ b/cdisc_rules_engine/models/validation_error_container.py @@ -1,6 +1,5 @@ -from typing import List, Union - -from cdisc_rules_engine.enums.execution_status import ExecutionStatus +from typing import List +from dataclasses import dataclass, field from cdisc_rules_engine.utilities.utils import get_execution_status from .base_validation_entity import BaseValidationEntity @@ -8,22 +7,24 @@ from .validation_error_entity import ValidationErrorEntity +@dataclass class ValidationErrorContainer(BaseValidationEntity): - def __init__(self, **params): - self.dataset: str = params.get("dataset") - self.domain: str = params.get("domain") - self.targets: List[str] = params.get("targets", []) - self.errors: List[Union[ValidationErrorEntity, FailedValidationEntity]] = ( - params.get("errors", []) - ) - self.message: str = params.get("message") - self.status: 
ExecutionStatus = params.get("status") or get_execution_status( - self.errors - ) + dataset: str | None = None + domain: str | None = None + targets: List[str] = field(default_factory=list) + errors: List[ValidationErrorEntity | FailedValidationEntity] = field( + default_factory=list + ) + message: str | None = None + status: str | None = None + + @property + def executionStatus(self): + return self.status or get_execution_status(self.errors) def to_representation(self) -> dict: return { - "executionStatus": self.status, + "executionStatus": self.executionStatus, "dataset": self.dataset, "domain": self.domain, "variables": sorted(self.targets), diff --git a/cdisc_rules_engine/models/validation_error_entity.py b/cdisc_rules_engine/models/validation_error_entity.py index 9390193d5..146734918 100644 --- a/cdisc_rules_engine/models/validation_error_entity.py +++ b/cdisc_rules_engine/models/validation_error_entity.py @@ -1,28 +1,23 @@ +from dataclasses import dataclass, field + from cdisc_rules_engine.enums.execution_status import ExecutionStatus from cdisc_rules_engine.enums.base_enum import BaseEnum from .base_validation_entity import BaseValidationEntity +@dataclass class ValidationErrorEntity(BaseValidationEntity): """ The entity describes an error that been flagged because a dataset violates a rule in a certain row. 
""" - def __init__( - self, - value: dict, - dataset: str = None, - row: int = None, - usubjid: str = None, - sequence: int = None, - ): - self._dataset: str = dataset - self._row: int = row - self.value: dict = value - self._usubjid: str = usubjid - self._sequence: int = sequence - self.status: ExecutionStatus = ExecutionStatus.SUCCESS + value: dict = field(default_factory=dict) + dataset: str | None = None + row: int | None = None + USUBJID: str | None = None + SEQ: int | None = None + status: ExecutionStatus = ExecutionStatus.SUCCESS def _format_values(self) -> dict: """ @@ -42,12 +37,12 @@ def to_representation(self) -> dict: representation: dict = { "value": self._format_values(), } - if self._dataset is not None: - representation["dataset"] = self._dataset - if self._row is not None: - representation["row"] = self._row - if self._usubjid: - representation["USUBJID"] = self._usubjid - if self._sequence: - representation["SEQ"] = self._sequence + if self.dataset is not None: + representation["dataset"] = self.dataset + if self.row is not None: + representation["row"] = self.row + if self.USUBJID is not None: + representation["USUBJID"] = self.USUBJID + if self.SEQ is not None: + representation["SEQ"] = self.SEQ return representation diff --git a/cdisc_rules_engine/models/variable_metadata_container.py b/cdisc_rules_engine/models/variable_metadata_container.py index 06bbfdb0c..39ecd8ec9 100644 --- a/cdisc_rules_engine/models/variable_metadata_container.py +++ b/cdisc_rules_engine/models/variable_metadata_container.py @@ -1,6 +1,9 @@ +from dataclasses import dataclass + from cdisc_rules_engine.interfaces import RepresentationInterface +@dataclass class VariableMetadataContainer(RepresentationInterface): def __init__(self, contents_metadata: dict): variable_names = contents_metadata["variable_names"] diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 96a6d2346..ccde2537f 100644 --- a/cdisc_rules_engine/rules_engine.py +++ 
b/cdisc_rules_engine/rules_engine.py @@ -10,6 +10,7 @@ from cdisc_rules_engine.exceptions.custom_exceptions import ( DatasetNotFoundError, DomainNotFoundInDefineXMLError, + InvalidJSONFormat, RuleFormatError, VariableMetadataNotFoundError, FailedSchemaValidation, @@ -36,6 +37,7 @@ from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( DefineXMLReaderFactory, ) +from cdisc_rules_engine.utilities.jsonata_processor import JSONataProcessor from cdisc_rules_engine.utilities.data_processor import DataProcessor from cdisc_rules_engine.utilities.dataset_preprocessor import DatasetPreprocessor from cdisc_rules_engine.utilities.rule_processor import RuleProcessor @@ -92,6 +94,9 @@ def __init__( self.external_dictionaries = external_dictionaries self.define_xml_path: str = kwargs.get("define_xml_path") self.validate_xml: bool = kwargs.get("validate_xml") + self.jsonata_custom_functions: tuple[()] | tuple[tuple[str, str], ...] = ( + kwargs.get("jsonata_custom_functions", ()) + ) self.max_errors_per_rule: int = kwargs.get("max_errors_per_rule") def get_schema(self): @@ -102,37 +107,47 @@ def validate_single_rule(self, rule: dict, datasets: Iterable[SDTMDatasetMetadat rule["conditions"] = ConditionCompositeFactory.get_condition_composite( rule["conditions"] ) - total_errors = 0 - for dataset_metadata in datasets: - if self.max_errors_per_rule and total_errors >= self.max_errors_per_rule: - logger.info( - f"Rule {rule.get('core_id')}: Error limit ({self.max_errors_per_rule}) reached. " - f"Skipping remaining datasets." 
- ) - break - if dataset_metadata.unsplit_name in results and "domains" in rule: - include_split = rule["domains"].get("include_split_datasets", False) - if not include_split: - continue # handling split datasets - dataset_results = self.validate_single_dataset( + if rule.get("rule_type") == RuleTypes.JSONATA.value: + results["json"] = self.validate_single_dataset( rule, datasets, - dataset_metadata, + SDTMDatasetMetadata(name="json"), ) - results[dataset_metadata.unsplit_name] = dataset_results - for result in dataset_results: - if result.get("executionStatus") == "success": - total_errors += len(result.get("errors")) - if ( - self.max_errors_per_rule - and total_errors >= self.max_errors_per_rule - ): - logger.info( - f"Rule {rule.get('core_id')}: Error limit ({self.max_errors_per_rule}) " - f"reached after processing {dataset_metadata.name}. " - f"Execution halted at {total_errors} total errors." - ) - break + else: + total_errors = 0 + for dataset_metadata in datasets: + if ( + self.max_errors_per_rule + and total_errors >= self.max_errors_per_rule + ): + logger.info( + f"Rule {rule.get('core_id')}: Error limit ({self.max_errors_per_rule}) reached. " + f"Skipping remaining datasets." + ) + break + if dataset_metadata.unsplit_name in results and "domains" in rule: + include_split = rule["domains"].get("include_split_datasets", False) + if not include_split: + continue # handling split datasets + dataset_results = self.validate_single_dataset( + rule, + datasets, + dataset_metadata, + ) + results[dataset_metadata.unsplit_name] = dataset_results + for result in dataset_results: + if result.get("executionStatus") == "success": + total_errors += len(result.get("errors")) + if ( + self.max_errors_per_rule + and total_errors >= self.max_errors_per_rule + ): + logger.info( + f"Rule {rule.get('core_id')}: Error limit ({self.max_errors_per_rule}) " + f"reached after processing {dataset_metadata.name}. " + f"Execution halted at {total_errors} total errors." 
+ ) + break return results def validate_single_dataset( @@ -170,19 +185,16 @@ def validate_single_dataset( # No errors were generated, create success error container return [ ValidationErrorContainer( - **{ - "dataset": dataset_metadata.filename, - "domain": dataset_metadata.domain - or dataset_metadata.rdomain, - "errors": [], - } + dataset=dataset_metadata.filename, + domain=dataset_metadata.domain or dataset_metadata.rdomain, + errors=[], ).to_representation() ] else: logger.info( f"Skipped dataset {dataset_metadata.name}. Reason: {reason}" ) - error_obj: ValidationErrorContainer = ValidationErrorContainer( + error_obj = ValidationErrorContainer( status=ExecutionStatus.SKIPPED.value, message=reason, dataset=dataset_metadata.filename, @@ -304,6 +316,10 @@ def validate_rule( return self.execute_rule( rule_copy, dataset, datasets, dataset_metadata, **kwargs ) + elif rule.get("rule_type") == RuleTypes.JSONATA.value: + return JSONataProcessor.execute_jsonata_rule( + rule, dataset, self.jsonata_custom_functions + ) kwargs["ct_packages"] = list(self.ct_packages) @@ -433,12 +449,19 @@ def handle_validation_exceptions( # noqa message=exception.args[0], ) message = "rule execution error" + elif isinstance(exception, InvalidJSONFormat): + error_obj = FailedValidationEntity( + dataset=os.path.basename(dataset_path), + error=InvalidJSONFormat.description, + message=exception.args[0], + ) + message = "rule execution error" elif isinstance(exception, FailedSchemaValidation): if self.validate_xml: - error_obj: ValidationErrorContainer = ValidationErrorContainer( - status=ExecutionStatus.SKIPPED.value, + error_obj = FailedValidationEntity( error=FailedSchemaValidation.description, message=exception.args[0], + dataset=os.path.basename(dataset_path), ) message = "Schema Validation Error" errors = [error_obj] @@ -449,11 +472,12 @@ def handle_validation_exceptions( # noqa dataset=os.path.basename(dataset_path), ) else: - error_obj: ValidationErrorContainer = 
ValidationErrorContainer( - status=ExecutionStatus.SKIPPED.value, + message = "Skipped because schema validation is off" + error_obj = FailedValidationEntity( + error="Schema validation is off", + message=message, dataset=os.path.basename(dataset_path), ) - message = "Skipped because schema validation is off" errors = [error_obj] return ValidationErrorContainer( dataset=os.path.basename(dataset_path), @@ -462,10 +486,10 @@ def handle_validation_exceptions( # noqa status=ExecutionStatus.SKIPPED.value, ) elif isinstance(exception, DomainNotFoundError): - error_obj = ValidationErrorContainer( + error_obj = FailedValidationEntity( dataset=os.path.basename(dataset_path), + error="Domain not found", message=str(exception), - status=ExecutionStatus.SKIPPED.value, ) message = "rule evaluation skipped - operation domain not found" errors = [error_obj] @@ -478,10 +502,10 @@ def handle_validation_exceptions( # noqa elif isinstance( exception, AttributeError ) and "'NoneType' object has no attribute" in str(exception): - error_obj = ValidationErrorContainer( + error_obj = FailedValidationEntity( dataset=os.path.basename(dataset_path), + error="Missing field during execution", message="Missing field during execution, rule may not be applicable- unable to process dataset", - status=ExecutionStatus.SKIPPED.value, ) message = "rule evaluation skipped - missing metadata" errors = [error_obj] diff --git a/cdisc_rules_engine/services/data_readers/data_reader_factory.py b/cdisc_rules_engine/services/data_readers/data_reader_factory.py index 66d177770..5fb718975 100644 --- a/cdisc_rules_engine/services/data_readers/data_reader_factory.py +++ b/cdisc_rules_engine/services/data_readers/data_reader_factory.py @@ -12,7 +12,7 @@ DatasetNDJSONReader, ) from cdisc_rules_engine.services.data_readers.parquet_reader import ParquetReader -from cdisc_rules_engine.services.data_readers.usdm_json_reader import USDMJSONReader +from cdisc_rules_engine.services.data_readers.json_reader import 
JSONReader from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.dataset import PandasDataset @@ -23,7 +23,7 @@ class DataReaderFactory(FactoryInterface): DataFormatTypes.PARQUET.value: ParquetReader, DataFormatTypes.JSON.value: DatasetJSONReader, DataFormatTypes.NDJSON.value: DatasetNDJSONReader, - DataFormatTypes.USDM.value: USDMJSONReader, + DataFormatTypes.USDM.value: JSONReader, } def __init__(self, service_name: str = None, dataset_implementation=PandasDataset): diff --git a/cdisc_rules_engine/services/data_readers/dataset_json_reader.py b/cdisc_rules_engine/services/data_readers/dataset_json_reader.py index 4ea2dc695..937b7bf51 100644 --- a/cdisc_rules_engine/services/data_readers/dataset_json_reader.py +++ b/cdisc_rules_engine/services/data_readers/dataset_json_reader.py @@ -1,7 +1,6 @@ import pandas as pd import dask.dataframe as dd import os -import json import jsonschema from cdisc_rules_engine.interfaces import ( @@ -12,19 +11,18 @@ from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset import tempfile +from cdisc_rules_engine.services.data_readers.json_reader import JSONReader + class DatasetJSONReader(DataReaderInterface): def get_schema(self) -> dict: - with open( + schema = JSONReader().from_file( os.path.join("resources", "schema", "dataset.schema.json") - ) as schemajson: - schema = schemajson.read() - return json.loads(schema) + ) + return schema def read_json_file(self, file_path: str) -> dict: - with open(file_path, "r") as file: - datasetjson = json.load(file) - return datasetjson + return JSONReader().from_file(file_path) def _raw_dataset_from_file(self, file_path) -> pd.DataFrame: # Load Dataset-JSON Schema diff --git a/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py b/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py index 04a7af66b..89f0c663b 100644 --- a/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py +++ 
b/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py @@ -12,14 +12,15 @@ from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset import tempfile +from cdisc_rules_engine.services.data_readers.json_reader import JSONReader + class DatasetNDJSONReader(DataReaderInterface): def get_schema(self) -> dict: - with open( + schema = JSONReader().from_file( os.path.join("resources", "schema", "dataset-ndjson-schema.json") - ) as schemandjson: - schema = schemandjson.read() - return json.loads(schema) + ) + return schema def read_json_file(self, file_path: str) -> dict: with open(file_path, "r") as file: diff --git a/cdisc_rules_engine/services/data_readers/json_reader.py b/cdisc_rules_engine/services/data_readers/json_reader.py new file mode 100644 index 000000000..f7928ae07 --- /dev/null +++ b/cdisc_rules_engine/services/data_readers/json_reader.py @@ -0,0 +1,21 @@ +from json import load +from cdisc_rules_engine.exceptions.custom_exceptions import InvalidJSONFormat +from cdisc_rules_engine.interfaces import ( + DataReaderInterface, +) + + +class JSONReader(DataReaderInterface): + def from_file(self, file_path): + try: + with open(file_path, "rb") as fp: + json = load(fp) + return json + except Exception as e: + raise InvalidJSONFormat( + f"\n Error reading JSON from: {file_path}" + f"\n {type(e).__name__}: {e}" + ) + + def read(self, data): + pass diff --git a/cdisc_rules_engine/services/data_readers/usdm_json_reader.py b/cdisc_rules_engine/services/data_readers/usdm_json_reader.py deleted file mode 100644 index e4ebf4f75..000000000 --- a/cdisc_rules_engine/services/data_readers/usdm_json_reader.py +++ /dev/null @@ -1,14 +0,0 @@ -from json import load -from cdisc_rules_engine.interfaces import ( - DataReaderInterface, -) - - -class USDMJSONReader(DataReaderInterface): - def from_file(self, file_path): - with open(file_path) as fp: - json = load(fp) - return json - - def read(self, data): - pass diff --git 
a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index f6e0e27e3..f22247f97 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -1,17 +1,19 @@ from datetime import datetime from io import IOBase -from json import load from typing import List, Optional, Iterable, Sequence import os import pandas as pd from cdisc_rules_engine.dummy_models.dummy_dataset import DummyDataset -from cdisc_rules_engine.exceptions.custom_exceptions import DatasetNotFoundError +from cdisc_rules_engine.exceptions.custom_exceptions import ( + DatasetNotFoundError, +) from cdisc_rules_engine.interfaces import CacheServiceInterface, ConfigInterface from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.dataset_types import DatasetTypes from cdisc_rules_engine.services.data_readers import DataReaderFactory +from cdisc_rules_engine.services.data_readers.json_reader import JSONReader from cdisc_rules_engine.services.data_services import BaseDataService from cdisc_rules_engine.models.dataset import PandasDataset @@ -161,9 +163,8 @@ def get_datasets(self) -> Iterable[SDTMDatasetMetadata]: @staticmethod def get_data(dataset_paths: Sequence[str]): - with open(dataset_paths[0]) as fp: - json = load(fp) - return [DummyDataset(data) for data in json.get("datasets", [])] + json = JSONReader().from_file(dataset_paths[0]) + return [DummyDataset(data) for data in json.get("datasets", [])] @staticmethod def is_valid_data(dataset_paths: Sequence[str]): @@ -172,7 +173,6 @@ def is_valid_data(dataset_paths: Sequence[str]): and len(dataset_paths) == 1 and dataset_paths[0].lower().endswith(".json") ): - with open(dataset_paths[0]) as fp: - json = load(fp) - return "datasets" in json + json = JSONReader().from_file(dataset_paths[0]) + return "datasets" in json return False diff --git 
a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 0f02e2c5a..004fb3a6c 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -2,7 +2,6 @@ from io import IOBase from typing import List, Sequence, Any from dataclasses import dataclass -from json import load from jsonpath_ng import DatumInContext from jsonpath_ng.ext import parse from datetime import datetime @@ -18,9 +17,11 @@ from cdisc_rules_engine.models.variable_metadata_container import ( VariableMetadataContainer, ) + from cdisc_rules_engine.services.data_readers.data_reader_factory import ( DataReaderFactory, ) +from cdisc_rules_engine.services.data_readers.json_reader import JSONReader from cdisc_rules_engine.utilities.utils import ( extract_file_name_from_path_string, ) @@ -475,7 +476,6 @@ def is_valid_data(dataset_paths: Sequence[str]): and len(dataset_paths) == 1 and dataset_paths[0].lower().endswith(".json") ): - with open(dataset_paths[0]) as fp: - json = load(fp) - return "study" in json and "datasetJSONVersion" not in json + json = JSONReader().from_file(dataset_paths[0]) + return "study" in json and "datasetJSONVersion" not in json return False diff --git a/cdisc_rules_engine/services/datasetjson_metadata_reader.py b/cdisc_rules_engine/services/datasetjson_metadata_reader.py index 832f0d0b1..f77856977 100644 --- a/cdisc_rules_engine/services/datasetjson_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetjson_metadata_reader.py @@ -1,11 +1,11 @@ import os -import json import jsonschema import pandas as pd from cdisc_rules_engine.services import logger from cdisc_rules_engine.services.adam_variable_reader import AdamVariableReader +from cdisc_rules_engine.services.data_readers.json_reader import JSONReader class DatasetJSONMetadataReader: @@ -25,14 +25,11 @@ def read(self) -> dict: Extracts metadata from .json file. 
""" # Load Dataset-JSON Schema - with open( + schema = JSONReader().from_file( os.path.join("resources", "schema", "dataset.schema.json") - ) as schemajson: - schema = schemajson.read() - schema = json.loads(schema) + ) - with open(self._file_path, "r") as file: - datasetjson = json.load(file) + datasetjson = JSONReader().from_file(self._file_path) try: jsonschema.validate(datasetjson, schema) diff --git a/cdisc_rules_engine/services/datasetndjson_metadata_reader.py b/cdisc_rules_engine/services/datasetndjson_metadata_reader.py index 51c27644c..ded014fa5 100644 --- a/cdisc_rules_engine/services/datasetndjson_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetndjson_metadata_reader.py @@ -6,6 +6,7 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.services.adam_variable_reader import AdamVariableReader +from cdisc_rules_engine.services.data_readers.json_reader import JSONReader class DatasetNDJSONMetadataReader: @@ -25,11 +26,9 @@ def read(self) -> dict: Extracts metadata from .ndjson file. 
""" # Load Dataset-NDJSON Schema - with open( + schema = JSONReader().from_file( os.path.join("resources", "schema", "dataset-ndjson-schema.json") - ) as schemandjson: - schema = schemandjson.read() - schema = json.loads(schema) + ) with open(self._file_path, "r") as file: lines = file.readlines() diff --git a/cdisc_rules_engine/utilities/jsonata_processor.py b/cdisc_rules_engine/utilities/jsonata_processor.py new file mode 100644 index 000000000..6ae73ebf1 --- /dev/null +++ b/cdisc_rules_engine/utilities/jsonata_processor.py @@ -0,0 +1,119 @@ +from collections import defaultdict +from functools import cache +from glob import glob +from jsonata import Jsonata + +from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths +from cdisc_rules_engine.enums.execution_status import ExecutionStatus +from cdisc_rules_engine.exceptions.custom_exceptions import ( + MissingDataError, + RuleExecutionError, + RuleFormatError, +) +from cdisc_rules_engine.models.validation_error_container import ( + ValidationErrorContainer, +) +from cdisc_rules_engine.models.validation_error_entity import ( + ValidationErrorEntity, +) + + +class JSONataProcessor: + + @staticmethod + def execute_jsonata_rule( + rule: dict, + dataset: dict, + jsonata_custom_functions: tuple[()] | tuple[tuple[str, str], ...], + ): + custom_functions = JSONataProcessor.get_all_custom_functions( + jsonata_custom_functions + ) + check = rule.get("conditions") + full_string = f"(\n{custom_functions}{check}\n)" + try: + expr = Jsonata(full_string) + except Exception as e: + raise RuleFormatError( + f"\n Error parsing JSONata Rule for Core Id: {rule.get("core_id")}" + f"\n {type(e).__name__}: {e}" + ) + try: + results = expr.evaluate(dataset) + except Exception as e: + raise RuleExecutionError( + f"\n Error evaluating JSONata Rule with Core Id: {rule.get("core_id")}" + f"\n {type(e).__name__}: {e}" + ) + errors = defaultdict(list) + if results: + if not isinstance(results, list): + raise RuleFormatError( + 
f"\n Error in return type of JSONata Rule with Core Id: {rule.get('core_id')}" + f"\n Expected a list, but got: {results}" + ) + for result in results: + error_entity = ValidationErrorEntity( + value=result, + dataset=result.get("dataset") or "", + row=result.get("row"), + usubjid=result.get("USUBJID"), + sequence=result.get("SEQ"), + ) + errors[result.get("dataset")].append(error_entity) + validation_error_container = [ + ValidationErrorContainer( + dataset=dataset, + domain=dataset, + targets=rule.get("output_variables"), + errors=error, + message=next(iter(rule.get("actions", [])), {}) + .get("params", {}) + .get("message"), + status=( + ExecutionStatus.SUCCESS.value + if results + else ExecutionStatus.EXECUTION_ERROR.value + ), + ).to_representation() + for dataset, error in errors.items() + ] + return validation_error_container + + @staticmethod + @cache + def get_all_custom_functions( + jsonata_custom_functions: tuple[()] | tuple[tuple[str, str], ...] + ): + builtins_and_customs = [ + ("utils", DefaultFilePaths.JSONATA_UTILS.value), + *jsonata_custom_functions, + ] + functions = [ + JSONataProcessor.get_custom_functions(name, path) + for name, path in builtins_and_customs + ] + return "\n".join(functions) + + @staticmethod + def get_custom_functions(jsonata_functions_name: str, jsonata_functions_path: str): + functions = [] + filepaths = glob(f"{jsonata_functions_path}/*.jsonata") + if not filepaths: + raise MissingDataError( + f"\n No JSONata custom functions found at path: {jsonata_functions_path}" + ) + for filepath in filepaths: + try: + with open(filepath, "r") as file: + function_definition = file.read() + function_definition = function_definition.replace("{", "", 1) + function_definition = "".join(function_definition.rsplit("}", 1)) + functions.append(function_definition) + except Exception as e: + raise RuleFormatError( + f"\n Error loading JSONata custom functions at path: {filepath}" + f"\n {type(e).__name__}: {e}" + ) + functions_str = 
",\n".join(functions) + return f"${jsonata_functions_name}:={{\n{functions_str}\n}};\n" diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 42fee7aac..18ad570c9 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -1,5 +1,6 @@ import re from typing import Iterable, List, Optional, Set, Union, Tuple +from cdisc_rules_engine.enums.rule_types import RuleTypes from cdisc_rules_engine.interfaces.cache_service_interface import ( CacheServiceInterface, ) @@ -588,6 +589,14 @@ def duplicate_conditions_for_all_targets( new_conditions_dict[key] = new_conditions_list return new_conditions_dict + @staticmethod + def log_suitable_for_validation(rule_id: str, dataset_name: str): + logger.info( + f"is_suitable_for_validation. rule id={rule_id}, " + f"dataset={dataset_name}, result=True" + ) + return True, "" + def is_suitable_for_validation( self, rule: dict, @@ -603,6 +612,11 @@ def is_suitable_for_validation( reason = f"Rule skipped - invalid rule structure for rule id={rule_id}" logger.info(f"is_suitable_for_validation. {reason}, result=False") return False, reason + if ( + rule.get("rule_type") == RuleTypes.JSONATA.value + and dataset_metadata.name == "json" + ): + return self.log_suitable_for_validation(rule_id, dataset_name) if not self.rule_applies_to_use_case( dataset_metadata, rule, standard, standard_substandard ): @@ -633,11 +647,7 @@ def is_suitable_for_validation( ) logger.info(f"is_suitable_for_validation. {reason}, result=False") return False, reason - logger.info( - f"is_suitable_for_validation. 
rule id={rule_id}, " - f"dataset={dataset_name}, result=True" - ) - return True, "" + return self.log_suitable_for_validation(rule_id, dataset_name) @staticmethod def extract_target_names_from_rule( diff --git a/core.py b/core.py index eb1690125..f005f5eff 100644 --- a/core.py +++ b/core.py @@ -6,7 +6,6 @@ import tempfile from datetime import datetime from multiprocessing import freeze_support -from typing import Tuple from dotenv import load_dotenv import click @@ -33,7 +32,7 @@ from version import __version__ -def valid_data_file(data_path: list) -> Tuple[list, set]: +def valid_data_file(data_path: list) -> tuple[list, set]: allowed_formats = [format.value for format in DataFormatTypes] found_formats = set() file_list = [] @@ -211,6 +210,18 @@ def cli(): is_flag=True, help="This flag enables XML validation against a Define-XML schema.", ) +@click.option( + "-jcf", + "--jsonata-custom-functions", + default=[], + multiple=True, + required=False, + type=( + str, + click.Path(exists=True, file_okay=False, readable=True, resolve_path=True), + ), + help="Variable Name and Path to directory containing a set of custom JSONata functions.", +) @click.option( "-mr", "--max-report-rows", @@ -231,15 +242,15 @@ def validate( cache: str, pool_size: int, data: str, - dataset_path: Tuple[str], + dataset_path: tuple[str], log_level: str, report_template: str, standard: str, version: str, substandard: str, - controlled_terminology_package: Tuple[str], + controlled_terminology_package: tuple[str], output: str, - output_format: Tuple[str], + output_format: tuple[str], raw_report: bool, define_version: str, whodrug: str, @@ -250,13 +261,14 @@ def validate( snomed_version: str, snomed_edition: str, snomed_url: str, - rules: Tuple[str], - exclude_rules: Tuple[str], + rules: tuple[str], + exclude_rules: tuple[str], local_rules: str, custom_standard: bool, progress: str, define_xml_path: str, validate_xml: bool, + jsonata_custom_functions: tuple[()] | tuple[tuple[str, str], ...], 
max_report_rows: int, max_errors_per_rule: int, ): @@ -350,6 +362,7 @@ def validate( progress, define_xml_path, validate_xml, + jsonata_custom_functions, max_report_rows, max_errors_per_rule, ) @@ -590,7 +603,7 @@ def list_rule_sets(ctx: click.Context, cache_path: str, custom: bool): multiple=True, ) @click.pass_context -def list_dataset_metadata(ctx: click.Context, dataset_path: Tuple[str]): +def list_dataset_metadata(ctx: click.Context, dataset_path: tuple[str]): """ Command that lists metadata of given datasets. @@ -639,7 +652,7 @@ def version(): required=False, multiple=True, ) -def list_ct(cache_path: str, subsets: Tuple[str]): +def list_ct(cache_path: str, subsets: tuple[str]): """ Command to list the ct packages available in the cache. """ @@ -701,6 +714,7 @@ def test_validate(): max_report_rows = None max_report_errors = None json_output = os.path.join(temp_dir, "json_validation_output") + jsonata_custom_functions = () run_validation( Validation_args( cache_path, @@ -724,6 +738,7 @@ def test_validate(): progress, define_xml_path, validate_xml, + jsonata_custom_functions, max_report_rows, max_report_errors, ) @@ -753,6 +768,7 @@ def test_validate(): progress, define_xml_path, validate_xml, + jsonata_custom_functions, max_report_rows, max_report_errors, ) diff --git a/pyproject.toml b/pyproject.toml index 7b122c9ca..d0da6d416 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,34 +4,12 @@ build-backend = "setuptools.build_meta" [project] name = "cdisc-rules-engine" -dynamic = ["version"] +dynamic = ["version", "dependencies"] description = "Open source offering of the cdisc rules engine" readme = "PYPI.md" requires-python = ">=3.12, <3.13" license = { text = "MIT" } authors = [{ name = "cdisc-org", email = "info@cdisc.org" }] -dependencies = [ - "business_rules_enhanced==1.4.8", - "cdisc-library-client==0.1.6", - "importlib-metadata==8.5.0", - "jsonpath-ng==1.6.1", - "jsonschema==4.18.5", - "numpy~=1.23.2", - "odmlib==0.1.4", - "openpyxl==3.1.5", - 
"pandas==1.5.2", - "python-dotenv==0.20.0", - "pyyaml==6.0.2", - "redis==4.0.2", - "requests~=2.32.3", - "cachetools==6.1.0", - "Pympler==1.1", - "psutil==6.1.1", - "pyreadstat==1.2.7", - "fastparquet==2024.2.0", - "dask[dataframe]==2024.2.0", - "dask[array]==2024.2.0", -] [project.urls] "Homepage" = "https://github.com/cdisc-org/cdisc-rules-engine" @@ -49,3 +27,4 @@ py-modules = ["version"] [tool.setuptools.dynamic] version = { attr = "version.__version__" } +dependencies = {file = ["requirements.txt"]} \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..ac709f651 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +-r requirements.txt +black==24.10.0 +flake8==6.1.0 +pre-commit==2.20.0 +pytest==7.4.0 +pytest-asyncio==0.21.0 +pytest-cov==6.0.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 54a15b968..31ec8e820 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,15 @@ -black==24.10.0 business_rules_enhanced==1.4.8 cdisc-library-client==0.1.6 click==8.1.7 -flake8==6.1.0 importlib-metadata==8.5.0 +jsonata-python==0.6.0 jsonpath-ng==1.6.1 jsonschema==4.18.5 numpy~=1.26.0 odmlib==0.1.4 openpyxl==3.1.5 pandas==2.1.4 -pre-commit==2.20.0 pyinstaller==6.11.0 -pytest==7.4.0 -pytest-asyncio==0.21.0 -pytest-cov==6.0.0 python-dotenv==1.0.0 pyyaml==6.0.2 redis==4.5.0 @@ -26,4 +21,4 @@ psutil==6.1.1 dask[dataframe]==2024.6.0 dask[array]==2024.6.0 pyreadstat==1.2.7 -fastparquet==2024.2.0 +fastparquet==2024.2.0 \ No newline at end of file diff --git a/resources/jsonata/del_reps.jsonata b/resources/jsonata/del_reps.jsonata new file mode 100644 index 000000000..58ac0cb4b --- /dev/null +++ b/resources/jsonata/del_reps.jsonata @@ -0,0 +1,11 @@ +{ + /* + Delete repeated sequential elements from an array. 
+ - Input: [1, 1, 2, 2, 2, 1, 3, 3] + - Output: [1, 2, 1, 3] + */ + "del_reps": function($lst) + { + $lst ~> $filter(function($v,$i,$a){$i = 0 or ($i > 0 and $v != $a[$i-1])}) + } +} \ No newline at end of file diff --git a/resources/jsonata/get_ref_value.jsonata b/resources/jsonata/get_ref_value.jsonata new file mode 100644 index 000000000..4c8f7d495 --- /dev/null +++ b/resources/jsonata/get_ref_value.jsonata @@ -0,0 +1,16 @@ +{ + "get_ref_value": function($usdm_ref,$within,$not_found_value) + { + ( + $found_obj := $within.** + [ + id=$usdm_ref.id and + instanceType=$usdm_ref.klass and + $usdm_ref.attribute in $keys($) + ]; + $found_obj + ? $found_obj.$lookup($,$usdm_ref.attribute) + : $not_found_value + ) + } +} \ No newline at end of file diff --git a/resources/jsonata/parse_refs.jsonata b/resources/jsonata/parse_refs.jsonata new file mode 100644 index 000000000..8ccc2f9c8 --- /dev/null +++ b/resources/jsonata/parse_refs.jsonata @@ -0,0 +1,30 @@ +{ + "parse_refs": function($from,$to,$within) + { + ( + $iter := function($dict) + { + $count($dict.**[$from in $keys($)]) > 0 + ? ( + $d := $dict ~> |**[$from in $keys($)]| + { + $to: + ( + $refs:=$lookup($,$from); + $map($refs,function($v) + { + $v in $dict[id=$v].**.$lookup($,$from) + ? {"circularReference": $v} + : $dict[id=$v] + } + ) + ) + },[$from]|; + $iter($d) + ) + : $dict + }; + $iter($within) + ) + } +} \ No newline at end of file diff --git a/resources/jsonata/sift_tree.jsonata b/resources/jsonata/sift_tree.jsonata new file mode 100644 index 000000000..e0faeeefa --- /dev/null +++ b/resources/jsonata/sift_tree.jsonata @@ -0,0 +1,34 @@ +{ + "sift_tree" : function($tree,$find_val,$include,$prefix) + { + ( + $iter := function($t) + { + $type($t) = "array" + ? [ + $map($t,function($v) + { + $v.**.[$find_val in $] + ? $iter($v) + } + ) + ] + : $type($t) = "object" + ? $t ~> + $sift(function($v,$k) + { + $k in $include or $v.**.[$find_val in $] + } + ) ~> + $each(function($v, $k) + { + $k in $include + ? 
{$join([$prefix ? $t.instanceType,$k],"."): $v} + : {$join([$prefix ? $t.instanceType,$k],"."): $iter($v)} + }) ~> $merge + : $t + }; + $iter($tree) + ) + } +} \ No newline at end of file diff --git a/resources/schema/CORE-base.json b/resources/schema/CORE-base.json index 6bcbc9d53..e28dd6de3 100644 --- a/resources/schema/CORE-base.json +++ b/resources/schema/CORE-base.json @@ -360,7 +360,14 @@ "type": "array" }, "Check": { - "$ref": "#/$defs/Boolean" + "anyOf": [ + { + "$ref": "#/$defs/Boolean" + }, + { + "type": "string" + } + ] }, "Core": { "properties": { diff --git a/resources/schema/Rule_Type.json b/resources/schema/Rule_Type.json index e8f98a2a6..ca1270b90 100644 --- a/resources/schema/Rule_Type.json +++ b/resources/schema/Rule_Type.json @@ -34,6 +34,10 @@ "const": "Domain Presence Check", "title": "Content domain presence at study level" }, + { + "const": "JSONata", + "title": "Apply a JSONata query to a JSON file" + }, { "const": "Record Data", "title": "Content data at record level. Most common Rule Type" @@ -50,7 +54,6 @@ "const": "Value Level Metadata Check against Define XML", "title": "Content data at record level and define xml metadata at value level" }, - { "const": "Variable Metadata Check", "title": "Content metadata at variable level" diff --git a/resources/schema/Rule_Type.md b/resources/schema/Rule_Type.md index 5cc64dd10..c40ad9a84 100644 --- a/resources/schema/Rule_Type.md +++ b/resources/schema/Rule_Type.md @@ -179,6 +179,110 @@ all: operator: not_exists ``` +## JSONata + +Apply a JSONata query to a JSON file. 
[JSONata documentation](https://docs.jsonata.org) + +### Example + +#### Rule + +```yaml +Check: | + **.$filter($, $myutils.equals).{"row":_path, "A":A, "B":B} +Core: + Id: JSONATA Test + Status: Draft +Outcome: + Message: "A equals B" + Output Variables: + - row + - A + - B +Rule Type: JSONata +Scope: + Entities: + Include: + - ALL +Sensitivity: Record +``` + +#### Custom user function contained in external file "equals.jsonata" + +\* Note that in the CLI, you can pass a variable name and directory of such files using `-jcf` or `--jsonata-custom-functions`. The engine's built-in JSONata functions are accessible from the `$utils` variable. For example to load two more directories containing functions into `$myutils` and `$yourutils`, add the options: +`-jcf myutils path/to/myutils -jcf yourutils path/to/yourutils` + +```yaml +{ + "equals": function($v){ $v.A=$v.B } +} +``` + +#### JSON Data + +```json +{ + "A": "same value 1", + "B": "same value 1", + "C": { + "A": "different value 1", + "B": "different value 2", + "C": { "A": "same value 2", "B": "same value 2" } + } +} +``` + +#### Result + +```json +[ + { + "executionStatus": "success", + "dataset": "", + "domain": "", + "variables": ["A", "B", "row"], + "message": "A equals B", + "errors": [ + { + "value": { "row": "", "A": "same value 1", "B": "same value 1" }, + "dataset": "", + "row": "" + }, + { + "value": { + "row": "/C/C", + "A": "same value 2", + "B": "same value 2" + }, + "dataset": "", + "row": "/C/C" + } + ] + } +] +``` + +### Preprocessing + +When the JSONata Rule Type is used, the input JSON file will be preprocessed to assign a `_path` attribute to each node in the JSON tree. The syntax for this path value will use the [JSON Pointer](https://datatracker.ietf.org/doc/html/rfc6901) syntax. This `_path` attribute can be referenced throughout the JSONata query. 
+ +### Output Variables and Report column mapping + +You can use `Outcome.Output Variables` to specify which properties to display from the result JSON. The following result property names will map to the column names in the Excel output report. + +Mapping of Result property names to Report Issue Details Column Names: + +| JSONata Result Name | JSON report property | Excel Column | +| ------------------- | -------------------- | ------------ | +| dataset | dataset | Dataset | +| row | row | Record | +| SEQ | SEQ | Sequence | +| USUBJID | USUBJID | USUBJID | + +### Scope + +A JSONata rule will always run once for the entire JSON file, regardless of the Scope. The `Dataset` determination must come from the rule's JSONata result property. + ## Record Data #### Columns diff --git a/scripts/run_validation.py b/scripts/run_validation.py index ab53ded3a..8fae7228c 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -91,6 +91,7 @@ def validate_single_rule( max_dataset_size=max_dataset_size, dataset_paths=args.dataset_paths, validate_xml=args.validate_xml, + jsonata_custom_functions=args.jsonata_custom_functions, max_errors_per_rule=max_errors_per_rule, ) results = engine.validate_single_rule(rule, datasets) diff --git a/tests/unit/test_rule_tester/test_rule_tester.py b/tests/unit/test_rule_tester/test_rule_tester.py index b2b15556f..c2e46b6d5 100644 --- a/tests/unit/test_rule_tester/test_rule_tester.py +++ b/tests/unit/test_rule_tester/test_rule_tester.py @@ -74,10 +74,10 @@ def patched_init(self, *args, **kwargs): assert "LB" in data assert len(data["LB"]) == 1 assert len(data["LB"][0]["errors"]) == 1 - error = data["LB"][0]["errors"][0]["value"] + error = data["LB"][0]["errors"][0] assert error["row"] == 0 assert error["SEQ"] == 0 - assert error["uSubjId"] == "N/A" + assert error["USUBJID"] == "N/A" assert error["value"] == {"ERROR": "Invalid or undefined sensitivity in the rule"} diff --git 
class TestJSONataProcessor(TestCase):
    """Unit tests for JSONataProcessor.execute_jsonata_rule.

    Shared fixtures: a YAML rule whose Check finds nodes where A equals B,
    a stubbed custom-function library, a nested input document, and the
    expected validation-container output.

    NOTE(review): ``get_all_custom_functions`` is wrapped in ``@cache`` keyed
    on the custom-functions tuple; tests that patch ``get_custom_functions``
    with the same tuple key may serve each other's cached value depending on
    execution order — presumably benign here since they all stub the same
    return value, but worth confirming.
    """

    # Rule under test: walk all nodes, keep those where A == B (via the
    # stubbed $utils.equals), and project row/_path, A and B.
    rule = """
    Check: |
      **.$filter($, $utils.equals).{"row":_path, "A":A, "B":B}
    Core:
      Id: JSONATA Test
      Status: Draft
    Outcome:
      Message: "A equals B"
      Output Variables:
        - row
        - A
        - B
    Rule Type: JSONata
    Scope:
      Entities:
        Include:
          - ALL
    Sensitivity: Record
    """
    # Stubbed replacement for the built-in JSONata utils library.
    get_custom_functions = """
    $utils:={
        "equals": function($v){ $v.A=$v.B }
    };
    """
    # Input document: root and the innermost C.C node have A == B; the
    # middle C node does not.
    # NOTE(review): mutated in place by add_json_pointer_paths during
    # test_jsonata_processor (class-level shared state).
    dataset = {
        "A": "same value 1",
        "B": "same value 1",
        "C": {
            "A": "different value 1",
            "B": "different value 2",
            "C": {"A": "same value 2", "B": "same value 2"},
        },
    }
    # Expected container output: two matches (root "" and "/C/C").
    expected = [
        {
            "executionStatus": "success",
            "dataset": None,
            "domain": None,
            "variables": ["A", "B", "row"],
            "message": "A equals B",
            "errors": [
                {
                    "value": {"row": "", "A": "same value 1", "B": "same value 1"},
                    "dataset": "",
                    "row": "",
                },
                {
                    "value": {
                        "row": "/C/C",
                        "A": "same value 2",
                        "B": "same value 2",
                    },
                    "dataset": "",
                    "row": "/C/C",
                },
            ],
        }
    ]

    @patch(
        "cdisc_rules_engine.utilities.jsonata_processor.JSONataProcessor.get_custom_functions"
    )
    def test_jsonata_processor(self, mock_get_custom_functions: MagicMock):
        """Happy path: a valid rule yields the expected containers."""
        mock_get_custom_functions.return_value = self.get_custom_functions
        rule = Rule.from_cdisc_metadata(safe_load(self.rule))
        add_json_pointer_paths(self.dataset)
        result = JSONataProcessor.execute_jsonata_rule(
            rule=rule,
            dataset=self.dataset,
            jsonata_custom_functions=(),
        )
        assert result == self.expected

    @patch(
        "cdisc_rules_engine.utilities.jsonata_processor.JSONataProcessor.get_custom_functions"
    )
    def test_jsonata_rule_parsing_error(self, mock_get_custom_functions: MagicMock):
        """A syntactically invalid Check raises RuleFormatError."""
        rule = """
        Check: |
          Bad jsonata rule
        Core:
          Id: JSONATA Test
          Status: Draft
        Outcome:
          Message: "A equals B"
          Output Variables:
            - row
            - A
            - B
        Rule Type: JSONata
        Scope:
          Entities:
            Include:
              - ALL
        Sensitivity: Record
        """
        mock_get_custom_functions.return_value = self.get_custom_functions
        rule = Rule.from_cdisc_metadata(safe_load(rule))
        with self.assertRaises(RuleFormatError):
            JSONataProcessor.execute_jsonata_rule(
                rule=rule,
                dataset=self.dataset,
                jsonata_custom_functions=(),
            )

    @patch(
        "cdisc_rules_engine.utilities.jsonata_processor.JSONataProcessor.get_custom_functions"
    )
    def test_jsonata_rule_execution_error(self, mock_get_custom_functions: MagicMock):
        """Referencing an undefined variable ($missing_utils) raises
        RuleExecutionError at evaluation time."""
        rule = """
        Check: |
          **.$filter($, $missing_utils.equals).{"row":path, "A":A, "B":B}
        Core:
          Id: JSONATA Test
          Status: Draft
        Outcome:
          Message: "A equals B"
          Output Variables:
            - row
            - A
            - B
        Rule Type: JSONata
        Scope:
          Entities:
            Include:
              - ALL
        Sensitivity: Record
        """
        mock_get_custom_functions.return_value = self.get_custom_functions
        rule = Rule.from_cdisc_metadata(safe_load(rule))
        with self.assertRaises(RuleExecutionError):
            JSONataProcessor.execute_jsonata_rule(
                rule=rule,
                dataset=self.dataset,
                jsonata_custom_functions=(),
            )

    def test_jsonata_rule_custom_load_error(self):
        """A custom-functions path with no *.jsonata files raises
        MissingDataError (get_custom_functions is NOT mocked here)."""
        rule = Rule.from_cdisc_metadata(safe_load(self.rule))
        with self.assertRaises(MissingDataError):
            JSONataProcessor.execute_jsonata_rule(
                rule=rule,
                dataset=self.dataset,
                jsonata_custom_functions=(("utils_name", "bad_path"),),
            )