Skip to content

Commit 8687acf

Browse files
committed
Fix hasSingleTagInsideElement method
It would fail for e.g. `<div> <p>foo</p> </div>`. mozilla/readability uses `children` for the tag lookup, which returns only elements. PHP does not have a `children` property, so b580cf2 mistakenly used `childNodes` instead, but that can return any node type. Let’s filter the children ourselves. Also add comments from mozilla/readability’s `_hasSingleTagInsideElement`.
1 parent 38870cd commit 8687acf

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

src/Readability.php

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1469,13 +1469,23 @@ private function isPhrasingContent($node): bool
14691469
}, iterator_to_array($node->childNodes)), true));
14701470
}
14711471

1472+
/**
1473+
* Checks if `$node` has only whitespace and a single element with `$tag` for the tag name.
1474+
* Returns false if `$node` contains non-empty text nodes
1475+
* or if it contains no element with given tag or more than 1 element.
1476+
*/
14721477
private function hasSingleTagInsideElement(\DOMElement $node, string $tag): bool
14731478
{
1474-
if (1 !== $node->childNodes->length || $node->childNodes->item(0)->nodeName !== $tag) {
1479+
$childNodes = iterator_to_array($node->childNodes);
1480+
$children = array_filter($childNodes, fn ($childNode) => $childNode instanceof \DOMElement);
1481+
1482+
// There should be exactly 1 element child with given tag
1483+
if (1 !== \count($children) || $children[0]->nodeName !== $tag) {
14751484
return false;
14761485
}
14771486

1478-
$a = array_filter(iterator_to_array($node->childNodes), function ($childNode) {
1487+
// And there should be no text nodes with real content
1488+
$a = array_filter($childNodes, function ($childNode) {
14791489
return $childNode instanceof \DOMText &&
14801490
preg_match($this->regexps['hasContent'], $this->getInnerText($childNode));
14811491
});

0 commit comments

Comments
 (0)