Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6ce5647
add get xhtml errors
alexfurmenkov Sep 25, 2025
afad2bd
add get xhtml errors
alexfurmenkov Sep 26, 2025
6ae7bfc
validator for XHTML
alexfurmenkov Sep 30, 2025
1a573f7
refactor validator for XHTML
alexfurmenkov Sep 30, 2025
93c19d4
add test resources
alexfurmenkov Sep 30, 2025
573e5a7
add tests
alexfurmenkov Oct 2, 2025
e500bec
delete unused files
alexfurmenkov Oct 2, 2025
dcb3129
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engin…
alexfurmenkov Oct 8, 2025
6e55589
897: add html5lib to requirements and pyproject
alexfurmenkov Oct 8, 2025
840cf98
897: minor code improvements
alexfurmenkov Oct 8, 2025
19fa5de
Merge branch 'main' into 897-validate-XHTML-values
alexfurmenkov Oct 9, 2025
ee435b0
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engin…
alexfurmenkov Oct 14, 2025
6ed10ea
report all validation errors at once
alexfurmenkov Oct 14, 2025
b7d7d8d
adjust regression tests
alexfurmenkov Oct 15, 2025
6a58d9d
update unit test
alexfurmenkov Oct 15, 2025
0f25471
delete unit test
alexfurmenkov Oct 15, 2025
c261062
Merge branch 'main' of https://github.com/cdisc-org/cdisc-rules-engin…
alexfurmenkov Oct 15, 2025
9848659
update the operation to use xsd schema
alexfurmenkov Oct 22, 2025
cad2e54
delete html5lib from requirements.txt
alexfurmenkov Oct 22, 2025
f26ec14
xsd definitions
ASL-rmarshall Oct 22, 2025
883f0b1
Merge branch '897-validate-XHTML-values' of https://github.com/cdisc-…
ASL-rmarshall Oct 22, 2025
f4d8131
Updated operation
ASL-rmarshall Oct 22, 2025
845c658
add unit tests for XHTML validation operation
alexfurmenkov Oct 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions cdisc_rules_engine/operations/get_xhtml_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import os
from lxml import etree
import re

from cdisc_rules_engine.operations.base_operation import BaseOperation


class GetXhtmlErrors(BaseOperation):
    """Validate XHTML fragments in the target column.

    Steps:
    1. Parse each value as XML with a recovering parser -> collect the parser's errors
    2. Validate the parsed tree against the XMLSchema -> collect the schema errors
    3. Return all validation errors in one go

    Empty / None (and any other non-string) values return an empty list.
    """

    def _execute_operation(self):
        """Load the XHTML schema and validate every value of the target column.

        Returns:
            The result of applying ``_ensure_dataset_is_valid_xhtml`` to each
            value of ``dataset[target]`` (one list of error strings per value).

        Raises:
            KeyError: if the target column is not in the evaluation dataset.
            Exception: if the XSD cannot be loaded or parsed.
        """
        dataset = self.evaluation_dataset
        target = self.params.target
        if target not in dataset:
            raise KeyError(target)
        # TODO: It would be good to make the XSD path configurable via config and the XSD itself an operation parameter.
        #       The XSD should be referenced as specified in DOCTYPE (so that external entities can be resolved correctly).
        #       For example:
        #
        #       namespaces:
        #         - uri: http://www.w3.org/1999/xhtml
        #         - uri: http://www.cdisc.org/ns/usdm/xhtml/v1.0
        #           prefix: usdm
        #         - uri: http://www.w3.org/2000/svg
        #           prefix: svg
        #         - uri: http://www.w3.org/1998/Math/MathML
        #           prefix: math
        #
        #       The schemaLocation values would probably need to be configurable as well.
        # NOTE: the path is relative, so it is resolved against the current working directory.
        schema_path = os.path.join(
            "resources",
            "schema",
            "xml",
            "cdisc-usdm-xhtml-1.0",
            "usdm-xhtml-1.0.xsd",
        )
        try:
            self.schema = etree.XMLSchema(file=schema_path)
        except Exception as e:
            # TODO: Raise a custom exception or handle etree exceptions
            #       specifically in RulesEngine.handle_validation_exceptions
            # Only lxml errors carry an ``error_log``; e.g. an OSError for a
            # missing file does not, so fall back to the exception itself
            # instead of masking the real cause with an AttributeError.
            details = getattr(e, "error_log", None) or e
            raise Exception(f"Failed to parse XMLSchema: {details}") from e

        # TODO: Generate from namespaces provided in config / operation parameters
        # NOTE(review): validation below always uses ``self.schema``; the
        # xsi:schemaLocation values in this string are presumably only hints
        # -- confirm the referenced file names are still the intended ones.
        self.nsdec = (
            'xmlns="http://www.w3.org/1999/xhtml" xmlns:usdm="http://www.cdisc.org/ns/usdm/xhtml/v1.0" '
            + 'xmlns:svg="http://www.w3.org/2000/svg" xmlns:math="http://www.w3.org/1998/Math/MathML" '
            + 'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="'
            + "http://www.w3.org/1999/xhtml ../resources/schema/xml/xhtml-1.1/xhtml1-loose.xsd "
            + "http://www.w3.org/1998/Math/MathML ../resources/schema/xml/mathml2/mathml2.xsd "
            + "http://www.w3.org/2000/svg ../resources/schema/xml/svg-1.1/svg.xsd "
            + "http://www.cdisc.org/ns/usdm/xhtml/v1.0 "
            + "../resources/schema/xml/cdisc-usdm-xhtml-1.0/usdm-xhtml-extension.xsd "
            + 'http://www.w3.org/1999/xlink ../resources/schema/xml/xlink/xlink.xsd"'
        )
        # Matches "line <n>" references embedded in error messages.
        self.line_pattern = re.compile(r"line (\d+)")

        return dataset[target].apply(self._ensure_dataset_is_valid_xhtml)

    def _ensure_dataset_is_valid_xhtml(self, value: str) -> list[str]:
        """Return every XML / schema validation error found in one cell value.

        Args:
            value: The cell content; expected to be an XHTML document or fragment.

        Returns:
            A list of formatted error strings; empty when the value is
            missing, blank, or produced no errors.
        """
        # Missing cells (None, NaN, ...) are not strings and have nothing to validate.
        if not isinstance(value, str):
            return []
        text = value.strip()
        if not text:
            return []

        errors: list[str] = []

        xhtml_mod, text = self._wrap_xhtml(text)

        # When wrapper lines were added, the first and last line of the
        # wrapped text belong to the wrapper rather than to the user's value.
        line_labels = (
            {
                1: "(wrapper start)",
                text.count("\n") + 1: "(wrapper end)",
            }
            if xhtml_mod
            else {}
        )

        parser = etree.XMLParser(recover=True, ns_clean=True)
        try:
            xhtml_to_validate = etree.XML(text.encode("utf-8"), parser)
        except etree.XMLSyntaxError:
            # Nothing could be recovered; the details are in parser.error_log.
            xhtml_to_validate = None

        self._report_errors(
            xhtml_to_validate, parser.error_log, errors, xhtml_mod, line_labels
        )

        if xhtml_to_validate is None:
            # No tree to validate against the schema. Make sure the value is
            # still reported as invalid even if the parser logged nothing.
            if not errors:
                errors.append(
                    "Invalid XHTML unknown pos [FATAL]: document could not be parsed"
                )
            return errors

        if not self.schema.validate(xhtml_to_validate):
            self._report_errors(
                xhtml_to_validate,
                self.schema.error_log,
                errors,
                xhtml_mod,
                line_labels,
            )
        return errors

    def _wrap_xhtml(self, text: str) -> tuple[bool, str]:
        """Wrap the input in <html><head><title></title></head><body>...</body></html> if not already present.

        Returns:
            A ``(wrapped, text)`` tuple. ``wrapped`` is True only when wrapper
            lines were added around the value, i.e. when reported line numbers
            are shifted by one and need adjusting. Inserting a missing
            ``<head>`` in-line adds no lines, so that case returns False.
        """
        wrapper_open = f"<html {self.nsdec}><head><title></title></head><body>"
        wrapper_close = "</body></html>"

        if not text.startswith("<"):
            # Plain text: additionally needs a block-level container.
            return True, f"{wrapper_open}<div>\n{text}\n</div>{wrapper_close}"
        if "<body>" not in text:
            return True, f"{wrapper_open}\n{text}\n{wrapper_close}"
        if "<head>" not in text:
            if text.startswith("<html"):
                # The document keeps its own lines, so no line adjustment applies.
                return False, text.replace(
                    "<body>", "<head><title></title></head><body>"
                )
            return True, f"{wrapper_open}\n{text}\n{wrapper_close}"
        return False, text

    def _report_errors(
        self,
        xhtml: etree._Element | None,
        error_log: etree._ErrorLog,
        errors: list[str],
        xhtml_mod: bool = False,
        line_lbls: dict | None = None,
    ) -> list[str]:
        """Format each entry of ``error_log`` and append it to ``errors``.

        Args:
            xhtml: Parsed root element (may be None when parsing failed); its
                ``nsmap`` is used to turn ``{namespace-uri}`` into prefixes.
            error_log: The lxml error log to read entries from.
            errors: The list the formatted messages are appended to (mutated).
            xhtml_mod: Whether wrapper lines were added, so line numbers
                must be mapped back to the original value.
            line_lbls: Labels for specific line numbers of the wrapped text.

        Returns:
            The same ``errors`` list, for convenience.
        """
        labels = {} if line_lbls is None else line_lbls
        nsmap = getattr(xhtml, "nsmap", None) or {}

        for error in error_log:
            msg = error.message.strip()
            if xhtml_mod:
                # Adjust line numbers in message
                msg = self.line_pattern.sub(
                    lambda m: self._get_line_name(labels, int(m.group(1))),
                    msg,
                )

            if not error.line:
                line_col = "unknown pos"
            elif xhtml_mod:
                line_col = self._get_line_name(labels, error.line, error.column)
            else:
                line_col = f"line {error.line}"

            # Replace "{namespace-uri}" with the prefix declared on the root
            # element (nothing for the default namespace).
            for prefix, uri in nsmap.items():
                if uri in msg:
                    msg = msg.replace(
                        "{" + uri + "}", f"{prefix}:" if prefix else ""
                    )

            errors.append(f"Invalid XHTML {line_col} [{error.level_name}]: {msg}")
        return errors

    def _get_line_name(self, line_labels, line: int, col: int | None = None) -> str:
        """Return the label for a line of the wrapped text.

        Lines present in ``line_labels`` use their label; any other line is
        reported as ``line - 1`` (the wrapper adds one leading line), followed
        by the column when one is given.
        """
        return line_labels.get(
            line, f"line {line - 1}" + (f", col {col}" if col else "")
        )
4 changes: 3 additions & 1 deletion cdisc_rules_engine/operations/operations_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
)
from cdisc_rules_engine.operations.distinct import Distinct
from cdisc_rules_engine.operations.extract_metadata import ExtractMetadata
from cdisc_rules_engine.operations.get_xhtml_errors import GetXhtmlErrors
from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder
from cdisc_rules_engine.operations.library_model_column_order import (
LibraryModelColumnOrder,
Expand Down Expand Up @@ -132,6 +133,7 @@ class OperationsFactory(FactoryInterface):
"valid_external_dictionary_code_term_pair": ValidExternalDictionaryCodeTermPair,
"valid_define_external_dictionary_version": DefineDictionaryVersionValidator,
"get_dataset_filtered_variables": GetDatasetFilteredVariables,
"get_xhtml_errors": GetXhtmlErrors,
}

@classmethod
Expand Down Expand Up @@ -172,6 +174,6 @@ def get_service(
kwargs.get("library_metadata"),
)
raise ValueError(
f"Operation name must be in {list(self._operations_map.keys())}, "
f"Operation name must be in {list(self._operations_map.keys())}, "
f"given operation name is {name}"
)
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ psutil==6.1.1
dask[dataframe]==2024.6.0
dask[array]==2024.6.0
pyreadstat==1.2.7
fastparquet==2024.2.0
fastparquet==2024.2.0
lxml==5.2.1
7 changes: 7 additions & 0 deletions resources/schema/Operations.json
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,13 @@
},
"required": ["id", "operator"],
"type": "object"
},
{
"properties": {
"operator": { "const": "get_xhtml_errors" }
},
"required": ["id", "operator", "name"],
"type": "object"
}
],
"properties": {
Expand Down
18 changes: 18 additions & 0 deletions resources/schema/xml/cdisc-usdm-xhtml-1.0/usdm-xhtml-1.0.xsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Top-level USDM-XHTML 1.0 schema for the XHTML namespace.
It declares no components of its own: everything comes from the included
usdm-xhtml-extension.xsd, whose schemaLocation is relative to this file.
-->
<xs:schema targetNamespace="http://www.w3.org/1999/xhtml"
xmlns="http://www.w3.org/1999/xhtml"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:usdm="http://www.cdisc.org/ns/usdm/xhtml/v1.0"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns:math="http://www.w3.org/1998/Math/MathML"
elementFormDefault="qualified" attributeFormDefault="unqualified"
version="1.0">

<xs:annotation>
<xs:documentation>USDM-XHTML 1.0 schema as developed by the CDISC USDM Team</xs:documentation>
</xs:annotation>

<!-- include USDM-XHTML extensions to core XHTML (same target namespace, hence xs:include) -->
<xs:include schemaLocation="usdm-xhtml-extension.xsd"/>

</xs:schema>
38 changes: 38 additions & 0 deletions resources/schema/xml/cdisc-usdm-xhtml-1.0/usdm-xhtml-extension.xsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<xs:schema targetNamespace="http://www.w3.org/1999/xhtml"
xmlns="http://www.w3.org/1999/xhtml"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:usdm="http://www.cdisc.org/ns/usdm/xhtml/v1.0"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns:math="http://www.w3.org/1998/Math/MathML"
elementFormDefault="qualified" attributeFormDefault="unqualified"
version="1.0">


<!--
USDM-XHTML 1.0 usdm-xhtml-extension schema draft
as developed by the CDISC USDM Team

Redefines the xhtml.Misc.extra group of xhtml11.xsd so that it also
accepts svg:svg, math:math, usdm:ref and usdm:tag elements.
-->

<xs:annotation>
<xs:documentation>USDM-XHTML 1.0 usdm-xhtml-extension schema as developed by the CDISC USDM
Team</xs:documentation>
</xs:annotation>

<!-- Foreign namespaces whose elements are referenced in the redefined group below -->
<xs:import namespace="http://www.w3.org/2000/svg" schemaLocation="../svg-1.1/svg.xsd"/>
<xs:import namespace="http://www.w3.org/1998/Math/MathML"
schemaLocation="../mathml2/mathml2.xsd" />
<xs:import namespace="http://www.cdisc.org/ns/usdm/xhtml/v1.0"
schemaLocation="usdm-xhtml-ns.xsd" />
<!-- Pull in XHTML 1.1 and extend one of its groups in place -->
<xs:redefine schemaLocation="../xhtml-1.1/xhtml11.xsd">
<xs:group name="xhtml.Misc.extra">
<xs:choice>
<!-- self-reference keeps the original content of the group being redefined -->
<xs:group ref="xhtml.Misc.extra" />
<xs:element ref="svg:svg" />
<xs:element ref="math:math" />
<xs:element ref="usdm:ref" />
<xs:element ref="usdm:tag" />
</xs:choice>
</xs:group>
</xs:redefine>
</xs:schema>
48 changes: 48 additions & 0 deletions resources/schema/xml/cdisc-usdm-xhtml-1.0/usdm-xhtml-ns.xsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<?xml version="1.0" encoding="UTF-8"?>
<xs:schema targetNamespace="http://www.cdisc.org/ns/usdm/xhtml/v1.0"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xh11d="http://www.w3.org/1999/xhtml/datatypes/"
xmlns:usdm="http://www.cdisc.org/ns/usdm/xhtml/v1.0"
elementFormDefault="qualified" attributeFormDefault="unqualified"
version="1.0">

<!--
USDM-XHTML 1.0 usdm-xhtml-ns schema draft
as developed by the CDISC USDM Team

Declares the two elements of the USDM namespace: usdm:ref and usdm:tag.
Both are attribute-only (their complex types declare no child content).
-->

<xs:annotation>
<xs:documentation>USDM-XHTML 1.0 usdm-xhtml-ns schema as developed by the CDISC USDM Team</xs:documentation>
</xs:annotation>

<!--
Supplies the xh11d:CDATA type used by every attribute below.
NOTE(review): this schemaLocation is an absolute http URL, unlike the relative
paths used by the other imports; loading this schema may therefore depend on
network access. Confirm whether a local copy should be referenced instead.
-->
<xs:import namespace="http://www.w3.org/1999/xhtml/datatypes/" schemaLocation="http://www.w3.org/MarkUp/SCHEMA/xhtml-datatypes-1.xsd"/>

<!--
Ref: element with three required attributes (klass, id, attribute)
-->
<xs:element name="ref" type="usdm:USDMcomplexTypeDefinition-ref"/>
<xs:complexType name="USDMcomplexTypeDefinition-ref">
<xs:annotation><xs:documentation>A reference to content held within the remainder of the model.</xs:documentation></xs:annotation>
<xs:attribute name="klass" type="xh11d:CDATA" use="required">
<xs:annotation><xs:documentation>The name of the class that holds the referenced data element.</xs:documentation></xs:annotation>
</xs:attribute>
<xs:attribute name="id" type="xh11d:CDATA" use="required">
<xs:annotation><xs:documentation>The id value of the referenced instance of the referenced class.</xs:documentation></xs:annotation>
</xs:attribute>
<xs:attribute name="attribute" type="xh11d:CDATA" use="required">
<xs:annotation><xs:documentation>The attribute name of the referenced data element within the referenced instance of the referenced class.</xs:documentation></xs:annotation>
</xs:attribute>
</xs:complexType>

<!--
Tag: element with a single required attribute (name)
-->
<xs:element name="tag" type="usdm:USDMcomplexTypeDefinition-tag"/>
<xs:complexType name="USDMcomplexTypeDefinition-tag">
<xs:annotation><xs:documentation>A reference to a parameter that is mapped to a value.</xs:documentation></xs:annotation>
<xs:attribute name="name" type="xh11d:CDATA" use="required">
<xs:annotation><xs:documentation>The name of the referenced parameter.</xs:documentation></xs:annotation>
</xs:attribute>
</xs:complexType>

</xs:schema>
41 changes: 41 additions & 0 deletions resources/schema/xml/mathml2/common/common-attribs.xsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>

<xs:schema
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns="http://www.w3.org/1998/Math/MathML"
xmlns:xlink="http://www.w3.org/1999/xlink"
targetNamespace="http://www.w3.org/1998/Math/MathML"
elementFormDefault="qualified"
>

<xs:annotation>
<xs:documentation>
This is the common attributes module for MathML.
Author: St&#233;phane Dalmas, INRIA.
</xs:documentation>
</xs:annotation>


<!-- xlink namespace import; the xlink:href attribute is referenced in Common.attrib below -->
<xs:import namespace="http://www.w3.org/1999/xlink" schemaLocation="xlink-href.xsd"/>
<xs:import/> <!-- import any foreign namespace -->


<!-- The type of "class" is from the XHTML modularization with Schema
document -->
<!-- Common.attrib: class, style, xref, id and xlink:href, plus any attribute from another namespace -->
<xs:attributeGroup name="Common.attrib">
<xs:attribute name="class" type="xs:NMTOKENS"/>
<xs:attribute name="style" type="xs:string"/>
<xs:attribute name="xref" type="xs:IDREF"/>
<xs:attribute name="id" type="xs:ID"/>
<xs:attribute ref="xlink:href"/>
<!-- allow attributes from foreign namespaces, and don't check them -->
<xs:anyAttribute namespace="##other" processContents="skip"/>
</xs:attributeGroup>

</xs:schema>
<!--
Copyright © 2002 World Wide Web Consortium, (Massachusetts Institute
of Technology, Institut National de Recherche en Informatique et en
Automatique, Keio University). All Rights Reserved. See
http://www.w3.org/Consortium/Legal/.
-->
Loading
Loading