diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 6287d654..26325d58 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -81,7 +81,7 @@ code coverage of the fixed bugs and the new features. To run the existing PHPUnit tests, run this command: ```shell -composer ci:tests:unit +composer ci:tests ``` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 64ea04be..18f59dae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -144,4 +144,4 @@ jobs: composer show; - name: Run Tests - run: composer ci:tests:unit + run: composer ci:tests diff --git a/composer.json b/composer.json index 4af333b4..a5d91d6d 100644 --- a/composer.json +++ b/composer.json @@ -43,10 +43,14 @@ "symfony/css-selector": "^3.4.32 || ^4.3.5 || ^5.0" }, "require-dev": { + "masterminds/html5": "^2.7", "php-parallel-lint/php-parallel-lint": "^1.2.0", "slevomat/coding-standard": "^4.0.0", "squizlabs/php_codesniffer": "^3.5.1" }, + "suggest": { + "masterminds/html5": "Use instead of PHP's built-in DOMDocument for HTML5 support." 
+ }, "autoload": { "psr-4": { "Pelago\\Emogrifier\\": "src/" @@ -73,9 +77,12 @@ "ci:php:md": "\"./tools/phpmd.phar\" src text config/phpmd.xml", "ci:php:psalm": "\"./tools/psalm.phar\" --show-info=false", "ci:tests:unit": "\"./tools/phpunit.phar\"", + "ci:tests:html5:unit": "EMOGRIFIER_HTML5=true \"./tools/phpunit.phar\"", "ci:tests:sof": "\"./tools/phpunit.phar\" --stop-on-failure", + "ci:tests:html5:sof": "EMOGRIFIER_HTML5=true \"./tools/phpunit.phar\" --stop-on-failure", "ci:tests": [ - "@ci:tests:unit" + "@ci:tests:unit", + "@ci:tests:html5:unit" ], "ci:dynamic": [ "@ci:tests" diff --git a/src/HtmlProcessor/AbstractHtmlProcessor.php b/src/HtmlProcessor/AbstractHtmlProcessor.php index 479e6e34..0c73695e 100644 --- a/src/HtmlProcessor/AbstractHtmlProcessor.php +++ b/src/HtmlProcessor/AbstractHtmlProcessor.php @@ -4,6 +4,9 @@ namespace Pelago\Emogrifier\HtmlProcessor; +use DOMNode; +use Masterminds\HTML5; + /** * Base class for HTML processor that e.g., can remove, add or modify nodes or attributes. * @@ -37,6 +40,11 @@ abstract class AbstractHtmlProcessor */ protected $domDocument = null; + /** + * @var HTML5|null + */ + protected $html5 = null; + /** * @var \DOMXPath */ @@ -55,19 +63,20 @@ private function __construct() * Builds a new instance from the given HTML. * * @param string $unprocessedHtml raw HTML, must be UTF-encoded, must not be empty + * @param bool|null $html5 whether to parse with masterminds/html5 instead of DOMDocument; null defers to EMOGRIFIER_HTML5 
* * @return static * * @throws \InvalidArgumentException if $unprocessedHtml is anything other than a non-empty string */ - public static function fromHtml(string $unprocessedHtml): self + public static function fromHtml(string $unprocessedHtml, ?bool $html5 = null): self { if ($unprocessedHtml === '') { throw new \InvalidArgumentException('The provided HTML must not be empty.', 1515763647); } $instance = new static(); - $instance->setHtml($unprocessedHtml); + $instance->setHtml($unprocessedHtml, $html5); return $instance; } @@ -91,10 +100,14 @@ public static function fromDomDocument(\DOMDocument $document): self * Sets the HTML to process. * * @param string $html the HTML to process, must be UTF-8-encoded + * @param bool|null $html5 whether to parse with masterminds/html5 instead of DOMDocument; null defers to EMOGRIFIER_HTML5 */ - private function setHtml(string $html): void + private function setHtml(string $html, ?bool $html5): void { - $this->createUnifiedDomDocument($html); + // If $html5 is null, fall back to the EMOGRIFIER_HTML5 environment flag. + $html5 = $html5 ?? 
$this->isHtml5Env(); + + $this->createUnifiedDomDocument($html, $html5); } /** @@ -136,7 +149,7 @@ private function setDomDocument(\DOMDocument $domDocument): void */ public function render(): string { - $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML(); + $htmlWithPossibleErroneousClosingTags = $this->saveHTML(); return $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags); } @@ -148,7 +161,7 @@ public function render(): string */ public function renderBodyContent(): string { - $htmlWithPossibleErroneousClosingTags = $this->getDomDocument()->saveHTML($this->getBodyElement()); + $htmlWithPossibleErroneousClosingTags = $this->saveHTML($this->getBodyElement()); $bodyNodeHtml = $this->removeSelfClosingTagsClosingTags($htmlWithPossibleErroneousClosingTags); return \preg_replace('%]*+)?+>%', '', $bodyNodeHtml); @@ -184,13 +197,36 @@ private function getBodyElement(): \DOMElement * The DOM document will always have a BODY element and a document type. * * @param string $html + * @param bool $html5 whether to parse with masterminds/html5 instead of DOMDocument */ - private function createUnifiedDomDocument(string $html): void + private function createUnifiedDomDocument(string $html, bool $html5): void { - $this->createRawDomDocument($html); + $html = $this->prepareHtmlForDomConversion($html); + + $html5 ? $this->createHtml5Document($html) : $this->createRawDomDocument($html); + $this->ensureExistenceOfBodyElement(); } + /** + * Parses the given HTML with masterminds/html5 and stores the parser and the resulting DOM document. + * + * @param string $html + * + * @throws \RuntimeException if the masterminds/html5 library is not installed + */ + private function createHtml5Document(string $html): void + { + if (!\class_exists(HTML5::class)) { + throw new \RuntimeException('Class ' . HTML5::class . ' not found. 
Install the masterminds/html5 library.'); + } + + $this->html5 = new HTML5(['disable_html_ns' => true]); + $domDocument = $this->html5->parse($html); + + $this->setDomDocument($domDocument); + } + /** * Creates a DOMDocument instance from the given HTML and stores it in $this->domDocument. * @@ -202,7 +238,7 @@ private function createRawDomDocument(string $html): void $domDocument->strictErrorChecking = false; $domDocument->formatOutput = true; $libXmlState = \libxml_use_internal_errors(true); - $domDocument->loadHTML($this->prepareHtmlForDomConversion($html)); + $domDocument->loadHTML($html); \libxml_clear_errors(); \libxml_use_internal_errors($libXmlState); @@ -334,4 +370,40 @@ private function ensureExistenceOfBodyElement(): void } $htmlElement->appendChild($this->getDomDocument()->createElement('body')); } + + /** + * Dumps the internal document into a string using HTML formatting. + * + * @param DOMNode|null $dom node to output a subset of the document, or null for the whole document + * + * @return string the rendered HTML + */ + private function saveHTML(?DOMNode $dom = null): string + { + if (isset($this->html5)) { + if ($dom === null) { + $dom = $this->domDocument; + } + + return $this->html5->saveHTML($dom); + } + + // Fall back to DOMDocument. + return $this->getDomDocument()->saveHTML($dom); + } + + /** + * Checks whether the EMOGRIFIER_HTML5 environment variable holds a truthy value (1, true, on, yes). + * + * @return bool false if the variable is unset or holds any other value + */ + private function isHtml5Env(): bool + { + $env = \getenv('EMOGRIFIER_HTML5'); + if ($env === false) { + return false; + } + + return \filter_var($env, \FILTER_VALIDATE_BOOLEAN); + } }