Skip to content

Commit ab5dcb7

Browse files
committed
[HtmlSanitizer] Introduce HtmlSanitizer component
0 parents  commit ab5dcb7

31 files changed

+4340
-0
lines changed

.gitattributes

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/Tests export-ignore
2+
/phpunit.xml.dist export-ignore
3+
/.gitattributes export-ignore
4+
/.gitignore export-ignore

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
composer.lock
2+
phpunit.xml
3+
vendor/

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
CHANGELOG
2+
=========
3+
4+
6.1
5+
---
6+
7+
* Add the component as experimental

HtmlSanitizer.php

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
<?php
2+
3+
/*
4+
* This file is part of the Symfony package.
5+
*
6+
* (c) Fabien Potencier <[email protected]>
7+
*
8+
* For the full copyright and license information, please view the LICENSE
9+
* file that was distributed with this source code.
10+
*/
11+
12+
namespace Symfony\Component\HtmlSanitizer;
13+
14+
use Symfony\Component\HtmlSanitizer\Parser\MastermindsParser;
15+
use Symfony\Component\HtmlSanitizer\Parser\ParserInterface;
16+
use Symfony\Component\HtmlSanitizer\Reference\W3CReference;
17+
use Symfony\Component\HtmlSanitizer\TextSanitizer\StringSanitizer;
18+
use Symfony\Component\HtmlSanitizer\Visitor\DomVisitor;
19+
20+
/**
21+
* @author Titouan Galopin <[email protected]>
22+
*
23+
* @experimental
24+
*/
25+
final class HtmlSanitizer implements HtmlSanitizerInterface
{
    private HtmlSanitizerConfig $config;
    private int $maxInputLength;
    private ParserInterface $parser;

    /**
     * DOM visitors built so far, indexed by the context they were created for.
     *
     * @var array<string, DomVisitor>
     */
    private array $domVisitors = [];

    /**
     * @param HtmlSanitizerConfig  $config         Configuration read for the allowed and blocked elements
     *                                             and handed to every DOM visitor
     * @param int                  $maxInputLength Maximum number of bytes of input that get parsed;
     *                                             longer inputs are truncated before parsing
     * @param ParserInterface|null $parser         Parser used to turn the input into a DOM tree;
     *                                             a MastermindsParser is created when null is given
     */
    public function __construct(HtmlSanitizerConfig $config, int $maxInputLength = 20000, ?ParserInterface $parser = null)
    {
        $this->config = $config;
        $this->maxInputLength = $maxInputLength;
        $this->parser = $parser ?? new MastermindsParser();
    }

    /**
     * Sanitizes the given input using the body context.
     */
    public function sanitize(string $input): string
    {
        return $this->sanitizeWithContext(W3CReference::CONTEXT_BODY, $input);
    }

    /**
     * Sanitizes the given input using the context associated with the given element.
     *
     * The element name is normalized with StringSanitizer::htmlLower() and looked up in
     * W3CReference::CONTEXTS_MAP; elements missing from the map use the body context.
     */
    public function sanitizeFor(string $element, string $input): string
    {
        return $this->sanitizeWithContext(
            W3CReference::CONTEXTS_MAP[StringSanitizer::htmlLower($element)] ?? W3CReference::CONTEXT_BODY,
            $input
        );
    }

    /**
     * Runs the sanitization pipeline for one context.
     *
     * Returns an empty string when the (possibly truncated) input is not valid UTF-8,
     * when the parser returns a falsy value or when the visitor produces no node.
     */
    private function sanitizeWithContext(string $context, string $input): string
    {
        // Text context: early return with HTML encoding
        if (W3CReference::CONTEXT_TEXT === $context) {
            return StringSanitizer::encodeHtmlEntities($input);
        }

        // Other context: build a DOM visitor (once per context)
        $this->domVisitors[$context] ??= $this->createDomVisitorForContext($context);

        // Prevent DOS attack induced by extremely long HTML strings
        $input = $this->truncateToMaxInputLength($input);

        // Only operate on valid UTF-8 strings. This is necessary to prevent cross
        // site scripting issues on Internet Explorer 6. Idea from Drupal (filter_xss).
        if (!$this->isValidUtf8($input)) {
            return '';
        }

        // Remove NULL character
        $input = str_replace(\chr(0), '', $input);

        // Parse as HTML
        if (!$parsed = $this->parser->parse($input)) {
            return '';
        }

        // Visit the DOM tree and render the sanitized nodes
        return $this->domVisitors[$context]->visit($parsed)?->render() ?? '';
    }

    /**
     * Limits the input to the configured maximum number of bytes.
     *
     * A plain byte-wise cut can land in the middle of a multi-byte UTF-8 character;
     * the truncated string would then fail the UTF-8 validation and the whole input
     * would be discarded. To avoid that, the cut point is moved back (by at most
     * 3 bytes, the maximum number of continuation bytes of one UTF-8 character)
     * while the first dropped byte is a continuation byte (10xxxxxx).
     *
     * The result is not guaranteed to be valid UTF-8: callers must still validate it.
     */
    private function truncateToMaxInputLength(string $input): string
    {
        $length = $this->maxInputLength;

        if (\strlen($input) <= $length) {
            return $input;
        }

        // $input[$length] always exists here because the input is longer than $length
        for ($steps = 0; $steps < 3 && $length > 0 && 0x80 === (\ord($input[$length]) & 0xC0); ++$steps) {
            --$length;
        }

        return substr($input, 0, $length);
    }

    /**
     * Tells whether the given string is empty or valid UTF-8.
     */
    private function isValidUtf8(string $html): bool
    {
        // preg_match() fails silently (returns false) on strings containing invalid UTF-8.
        return '' === $html || 1 === preg_match('//u', $html);
    }

    /**
     * Builds the DOM visitor for the given context.
     *
     * In the head context only the configured elements listed in W3CReference::HEAD_ELEMENTS
     * are kept; in any other context only the configured elements NOT listed there are kept.
     * Allowed elements map to their allowed attributes, blocked elements map to false.
     */
    private function createDomVisitorForContext(string $context): DomVisitor
    {
        $isHeadContext = W3CReference::CONTEXT_HEAD === $context;
        $elementsConfig = [];

        foreach ($this->config->getAllowedElements() as $allowedElement => $allowedAttributes) {
            // Keep the element when its "is a head element" status matches the context
            if ($isHeadContext === \array_key_exists($allowedElement, W3CReference::HEAD_ELEMENTS)) {
                $elementsConfig[$allowedElement] = $allowedAttributes;
            }
        }

        // Blocked elements are processed last so that they override allowed ones
        foreach ($this->config->getBlockedElements() as $blockedElement => $v) {
            if ($isHeadContext === \array_key_exists($blockedElement, W3CReference::HEAD_ELEMENTS)) {
                $elementsConfig[$blockedElement] = false;
            }
        }

        return new DomVisitor($this->config, $elementsConfig);
    }
}

0 commit comments

Comments
 (0)