diff --git a/test_unstructured/partition/html/test_partition.py b/test_unstructured/partition/html/test_partition.py
index ca0e14b01b..c1e0b834c7 100644
--- a/test_unstructured/partition/html/test_partition.py
+++ b/test_unstructured/partition/html/test_partition.py
@@ -1517,3 +1517,101 @@ def test_partition_html_with_empty_content_raises_error(test_case, content):
         assert len(elements) == 0
     finally:
         os.unlink(temp_filename)
+
+
+# ================================================================================================
+# DETAILS/SUMMARY (ACCORDION) ELEMENTS
+# ================================================================================================
+
+
+def test_partition_html_extracts_details_element_content():
+    """<details> content should not be silently discarded."""
+    html_text = "<details><p>This is inside a details block.</p></details>"
+
+    elements = partition_html(text=html_text)
+
+    assert len(elements) == 1
+    assert elements[0].text == "This is inside a details block."
+
+
+def test_partition_html_extracts_summary_element_text():
+    """<summary> text (the accordion heading) should appear as an element."""
+    html_text = "<details><summary>Click to expand</summary><p>Hidden content here.</p></details>"
+
+    elements = partition_html(text=html_text)
+
+    texts = [e.text for e in elements]
+    assert "Click to expand" in texts
+    assert "Hidden content here." in texts
+
+
+def test_partition_html_extracts_accordion_faq():
+    """A typical FAQ accordion should produce elements for every question and answer."""
+    html_text = """
+    <div>
+      <details>
+        <summary>What is unstructured?</summary>
+        <p>A library for document processing.</p>
+      </details>
+      <details>
+        <summary>How do I install it?</summary>
+        <p>Use pip install unstructured.</p>
+      </details>
+    </div>
+    """
+
+    elements = partition_html(text=html_text)
+
+    texts = [e.text for e in elements]
+    assert "What is unstructured?" in texts
+    assert "A library for document processing." in texts
+    assert "How do I install it?" in texts
+    assert "Use pip install unstructured." in texts
+
+
+def test_partition_html_extracts_nested_details():
+    """Nested <details> elements should all be extracted."""
+    html_text = """
+    <details>
+      <summary>Outer</summary>
+      <p>Outer content</p>
+      <details>
+        <summary>Inner</summary>
+        <p>Inner content</p>
+      </details>
+    </details>
+    """
+
+    elements = partition_html(text=html_text)
+
+    texts = [e.text for e in elements]
+    assert "Outer" in texts
+    assert "Outer content" in texts
+    assert "Inner" in texts
+    assert "Inner content" in texts
+
+
+def test_partition_html_details_without_summary():
+    """<details> without a <summary> should still extract body content."""
+    html_text = "<details><p>No summary here, just content.</p></details>"
+
+    elements = partition_html(text=html_text)
+
+    assert len(elements) == 1
+    assert elements[0].text == "No summary here, just content."
+
+
+def test_partition_html_summary_with_inline_markup():
+    """<summary> containing inline markup should preserve the text."""
+    html_text = (
+        "<details>"
+        "<summary>This has <b>bold</b> and <i>italic</i> text</summary>"
+        "<p>Body text.</p>"
+        "</details>"
+    )
+
+    elements = partition_html(text=html_text)
+
+    texts = [e.text for e in elements]
+    assert any("bold" in t and "italic" in t for t in texts)
+    assert "Body text." in texts
diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py
index f3df2b8a15..5aa0ba2be1 100644
--- a/unstructured/partition/html/parser.py
+++ b/unstructured/partition/html/parser.py
@@ -1002,7 +1002,7 @@ def derive_element_type_from_text(text: str) -> type[Text] | None:
         "button": RemovedPhrasing,
         "label": RemovedPhrasing,
         # -- removed block --
-        "details": RemovedBlock,  # -- likely boilerplate --
+        "details": Flow,  # -- collapsible section container (accordion) --
         "dl": RemovedBlock,
         "dd": RemovedBlock,
         "dt": RemovedBlock,
@@ -1013,6 +1013,6 @@ def derive_element_type_from_text(text: str) -> type[Text] | None:
         # -- removed form-related --
         "form": RemovedBlock,
         "input": RemovedBlock,
-        "summary": RemovedBlock,  # -- child of `details`
+        "summary": Flow,  # -- visible heading of a `<details>` accordion --
     }
 )