diff --git a/documentation/docs/html-rewriter/Element/prototype/after.mdx b/documentation/docs/html-rewriter/Element/prototype/after.mdx new file mode 100644 index 0000000000..9286acdc79 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/after.mdx @@ -0,0 +1,35 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# after + +The `after` method inserts content after the closing tag of the element. + +## Syntax + +```js +element.after(content); +element.after(content, options); +``` + +### Parameters + +- `content` _: string_ + - The content to insert after the element's closing tag. + +- `options` _: ElementRewriterOptions_ + - An optional object that can have the following properties: + - `escapeHTML` _: boolean_ + - If `true`, any HTML markup in `content` will be escaped so it is safe to insert as text. + +### Examples + +```js +// Assuming `e` is an Element representing
Hello
+e.after("World"); +// Result:
Hello
World +``` diff --git a/documentation/docs/html-rewriter/Element/prototype/append.mdx b/documentation/docs/html-rewriter/Element/prototype/append.mdx new file mode 100644 index 0000000000..0a6ef7ba63 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/append.mdx @@ -0,0 +1,35 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# append + +The `append` method inserts content at the end of the element's content. + +## Syntax + +```js +element.append(content); +element.append(content, options); +``` + +### Parameters + +- `content` _: string_ + - The content to insert at the end of the element's content. + +- `options` _: ElementRewriterOptions_ + - An optional object that can have the following properties: + - `escapeHTML` _: boolean_ + - If `true`, any HTML markup in `content` will be escaped so it is safe to insert as text. + +### Examples + +```js +// Assuming `e` is an Element representing
Hello
+e.append(", World"); +// Result:
Hello, World
+``` diff --git a/documentation/docs/html-rewriter/Element/prototype/before.mdx b/documentation/docs/html-rewriter/Element/prototype/before.mdx new file mode 100644 index 0000000000..a4fe975e0f --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/before.mdx @@ -0,0 +1,35 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# before + +The `before` method inserts content before the opening tag of the element. + +## Syntax + +```js +element.before(content); +element.before(content, options); +``` + +### Parameters + +- `content` _: string_ + - The content to insert before the element's opening tag. + +- `options` _: ElementRewriterOptions_ + - An optional object that can have the following properties: + - `escapeHTML` _: boolean_ + - If `true`, any HTML markup in `content` will be escaped so it is safe to insert as text. + +### Examples + +```js +// Assuming `e` is an Element representing
Hello
+e.before("Well"); +// Result: Well
Hello
+``` diff --git a/documentation/docs/html-rewriter/Element/prototype/hasAttribute.mdx b/documentation/docs/html-rewriter/Element/prototype/hasAttribute.mdx new file mode 100644 index 0000000000..56b93ed343 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/hasAttribute.mdx @@ -0,0 +1,26 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# hasAttribute + +The `hasAttribute` method returns a `boolean` value indicating whether the specified attribute is present on the element. + +## Syntax + +```js +element.hasAttribute(attributeName); +``` + +### Parameters + +- `attributeName` _: string_ + - The name of the attribute to check for. + +### Return value + +A boolean value indicating whether the attribute is present. + diff --git a/documentation/docs/html-rewriter/Element/prototype/prepend.mdx b/documentation/docs/html-rewriter/Element/prototype/prepend.mdx new file mode 100644 index 0000000000..c73e44efe4 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/prepend.mdx @@ -0,0 +1,35 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# prepend + +The `prepend` method inserts content at the beginning of the element's content. + +## Syntax + +```js +element.prepend(content); +element.prepend(content, options); +``` + +### Parameters + +- `content` _: string_ + - The content to insert at the beginning of the element's content. + +- `options` _: ElementRewriterOptions_ + - An optional object that can have the following properties: + - `escapeHTML` _: boolean_ + - If `true`, any HTML markup in `content` will be escaped so it is safe to insert as text. + +### Examples + +```js +// Assuming `e` is an Element representing
Hello
+e.prepend("Well, "); +// Result:
Well, Hello
+``` diff --git a/documentation/docs/html-rewriter/Element/prototype/removeAttribute.mdx b/documentation/docs/html-rewriter/Element/prototype/removeAttribute.mdx new file mode 100644 index 0000000000..04fd24ee72 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/removeAttribute.mdx @@ -0,0 +1,21 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# removeAttribute + +The `removeAttribute` method removes the specified attribute from the element. + +## Syntax + +```js +element.removeAttribute(attributeName); +``` + +### Parameters + +- `attributeName` _: string_ + - The name of the attribute to remove. diff --git a/documentation/docs/html-rewriter/Element/prototype/replaceChildren.mdx b/documentation/docs/html-rewriter/Element/prototype/replaceChildren.mdx new file mode 100644 index 0000000000..afa02b5620 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/replaceChildren.mdx @@ -0,0 +1,35 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# replaceChildren + +The `replaceChildren` method replaces the element's children with new content. + +## Syntax + +```js +element.replaceChildren(content); +element.replaceChildren(content, options); +``` + +### Parameters + +- `content` _: string_ + - The content to replace the element's children with. + +- `options` _: ElementRewriterOptions_ + - An optional object that can have the following properties: + - `escapeHTML` _: boolean_ + - If `true`, any HTML markup in `content` will be escaped so it is safe to insert as text. + +### Examples + +```js +// Assuming `e` is an Element representing
Hello
+e.replaceChildren("Greetings!"); +// Result:
Greetings!
+``` diff --git a/documentation/docs/html-rewriter/Element/prototype/replaceWith.mdx b/documentation/docs/html-rewriter/Element/prototype/replaceWith.mdx new file mode 100644 index 0000000000..a2984231c0 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/replaceWith.mdx @@ -0,0 +1,35 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# replaceWith + +The `replaceWith` method replaces the element with new content. + +## Syntax + +```js +element.replaceWith(content); +element.replaceWith(content, options); +``` + +### Parameters + +- `content` _: string_ + - The content to replace the element with. + +- `options` _: ElementRewriterOptions_ + - An optional object that can have the following properties: + - `escapeHTML` _: boolean_ + - If `true`, any HTML markup in `content` will be escaped so it is safe to insert as text. + +### Examples + +```js +// Assuming `e` is an Element representing
Hello
+e.replaceWith("

Greetings!

"); +// Result:

Greetings!

+``` diff --git a/documentation/docs/html-rewriter/Element/prototype/selector.mdx b/documentation/docs/html-rewriter/Element/prototype/selector.mdx new file mode 100644 index 0000000000..67d0ead852 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/selector.mdx @@ -0,0 +1,11 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# selector + +The `selector` read-only property is a `string` representing the [CSS selector](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors) that matches the element. + diff --git a/documentation/docs/html-rewriter/Element/prototype/setAttribute.mdx b/documentation/docs/html-rewriter/Element/prototype/setAttribute.mdx new file mode 100644 index 0000000000..da031ce410 --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/setAttribute.mdx @@ -0,0 +1,24 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# setAttribute + +The `setAttribute` method sets the value of the specified attribute on the element. If the attribute already exists, its value will be updated; otherwise, a new attribute with the specified name and value will be added to the element. + +## Syntax + +```js +element.setAttribute(attributeName, value); +``` + +### Parameters + +- `attributeName` _: string_ + - The name of the attribute to set. +- `value` _: string_ + - The value to assign to the attribute. + diff --git a/documentation/docs/html-rewriter/Element/prototype/tag.mdx b/documentation/docs/html-rewriter/Element/prototype/tag.mdx new file mode 100644 index 0000000000..7a6f95f47a --- /dev/null +++ b/documentation/docs/html-rewriter/Element/prototype/tag.mdx @@ -0,0 +1,10 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- + +# tag + +The `tag` read-only property is a `string` representing the tag name of the element. 
diff --git a/documentation/docs/html-rewriter/HTMLRewritingStream/HTMLRewritingStream.mdx b/documentation/docs/html-rewriter/HTMLRewritingStream/HTMLRewritingStream.mdx new file mode 100644 index 0000000000..af4387c22d --- /dev/null +++ b/documentation/docs/html-rewriter/HTMLRewritingStream/HTMLRewritingStream.mdx @@ -0,0 +1,45 @@ +--- +hide_title: false +hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- +# `HTMLRewritingStream()` + +The **`HTMLRewritingStream`** lets you rewrite HTML by registering callbacks on CSS selectors. When an element matching the selector is encountered, the rewriter calls your callback. This callback can manipulate the attributes of the element, and add or remove content from the immediate context. + +## Syntax + +```js +new HTMLRewritingStream() +``` + +### Return value + +A new `HTMLRewritingStream` object. + +## Examples + +In this example, we fetch an HTML page and use the HTML rewriter to add an attribute to all `div` tags and prepend the text `Header:` to all `h1` tags: + +```js +/// <reference types="@fastly/js-compute" /> + +import { HTMLRewritingStream } from 'fastly:html-rewriter'; + +async function handleRequest(event) { + let transformer = new HTMLRewritingStream() + .onElement("h1", e => e.prepend("Header: ")) + .onElement("div", e => e.setAttribute("special-attribute", "top-secret")); + let body = (await fetch("https://example.com/")).body.pipeThrough(transformer); + + return new Response(body, { + status: 200, + headers: new Headers({ + "content-type": "text/html" + }) + }) +} + +addEventListener("fetch", (event) => event.respondWith(handleRequest(event))); +``` diff --git a/documentation/docs/html-rewriter/HTMLRewritingStream/prototype/onElement.mdx b/documentation/docs/html-rewriter/HTMLRewritingStream/prototype/onElement.mdx new file mode 100644 index 0000000000..c4b5982525 --- /dev/null +++ b/documentation/docs/html-rewriter/HTMLRewritingStream/prototype/onElement.mdx @@ -0,0 +1,90 @@ +--- +hide_title: false 
+hide_table_of_contents: false +pagination_next: null +pagination_prev: null +--- +# onElement + +▸ **onElement**`(selector: string, handler: (element: Element) => void): this` + +Registers an element handler with the [`HTMLRewritingStream`] that will be called for each [`Element`] that matches the [CSS selector](https://developer.mozilla.org/en-US/docs/Web/CSS/CSS_selectors) `selector`. + +Elements added by handlers will not be processed by other handlers. + +## Syntax + +```js +.onElement(selector, handler) +``` + +### Parameters + +- `selector` _: string_ + - A CSS selector that determines the elements for which `handler` will be called + - The following types of CSS selector are supported: + +Currently the rewriter supports the following CSS selectors: + +| Pattern | Description | +|----------------------|----------------------------------------------------------------------------| +| `*` | Any element | +| `E` | All elements of type `E` | +| `E F` | `F` elements inside `E` elements | +| `E > F` | `F` elements directly inside `E` elements | +| `E:nth-child(n)` | The n-th child of type `E` | +| `E:first-child` | First child of type `E` | +| `E:nth-of-type(n)` | The n-th sibling of type `E` | +| `E:first-of-type` | First sibling of type `E` | +| `E:not(s)` | Type `E` elements not matching selector `s` | +| `E.myclass` | Type `E` elements with class `"myclass"` | +| `E#myid` | Type `E` elements with ID `"myid"` | +| `E[attr]` | Type `E` elements with attribute `attr` | +| `E[attr="val"]` | Type `E` elements where `attr` is `"val"` | +| `E[attr="val" i]` | Type `E` elements where `attr` is `"val"`, case-insensitive | +| `E[attr="val" s]` | Type `E` elements where `attr` is `"val"`, case-sensitive | +| `E[attr~="val"]` | Type `E` elements where `attr` contains `"val"` in a space-separated list | +| `E[attr`&#124;`="val"]` | Type `E` elements where `attr` is hyphen-separated and starts with `"val"` | +| `E[attr^="val"]` | Type `E` elements where `attr` starts with `"val"` 
| +| `E[attr$="val"]` | Type `E` elements where `attr` ends with `"val"` | +| `E[attr*="val"]` | Type `E` elements where `attr` contains `"val"` | + +- `handler` _: (element: Element) => void_ + - A callback function that will be called once for each element that matches `selector` + +### Return value + +The `HTMLRewritingStream`, so multiple calls to `onElement` can be chained. + +### Exceptions + +- `Error` + - If the provided `selector` is not a valid CSS selector. + - If the provided `handler` is not a function. + +## Examples + + +In this example, we fetch an HTML page and use the HTML rewriter to add an attribute to all `div` tags and prepend the text `Header:` to all `h1` tags: + +```js +/// <reference types="@fastly/js-compute" /> + +import { HTMLRewritingStream } from 'fastly:html-rewriter'; + +async function handleRequest(event) { + let transformer = new HTMLRewritingStream() + .onElement("h1", e => e.prepend("Header: ")) + .onElement("div", e => e.setAttribute("special-attribute", "top-secret")); + let body = (await fetch("https://example.com/")).body.pipeThrough(transformer); + + return new Response(body, { + status: 200, + headers: new Headers({ + "content-type": "text/html" + }) + }) +} + +addEventListener("fetch", (event) => event.respondWith(handleRequest(event))); +``` \ No newline at end of file diff --git a/documentation/rename-docs.mjs b/documentation/rename-docs.mjs index 78bf9c49ff..ed2951a3d0 100644 --- a/documentation/rename-docs.mjs +++ b/documentation/rename-docs.mjs @@ -25,7 +25,8 @@ const subsystems = [ 'kv-store', 'logger', 'object-store', - 'secret-store' + 'secret-store', + 'html-rewriter' ]; const files = readdirSync('docs'); diff --git a/integration-tests/js-compute/fixtures/app/src/html-rewriter.js b/integration-tests/js-compute/fixtures/app/src/html-rewriter.js new file mode 100644 index 0000000000..08e2079d52 --- /dev/null +++ b/integration-tests/js-compute/fixtures/app/src/html-rewriter.js @@ -0,0 +1,297 @@ +/* eslint-env serviceworker */ + +import { routes } from 
'./routes.js'; +import { HTMLRewritingStream } from 'fastly:html-rewriter'; +import { + assert, + assertThrows, + assertRejects, + strictEqual, +} from './assertions.js'; + +routes.set('/html-rewriter/set-attribute', async () => { + const toRewrite = + 'Test

Hello, World!

'; + const expected = + 'Test

Hello, World!

'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('h1', (e) => { + e.setAttribute('class', 'a-rewritten-class'); + e.setAttribute('id', 'a-rewritten-id'); + e.setAttribute('custom-attr', 'custom-value'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/get-attribute', async () => { + const toRewrite = + 'Test

Hello, World!

'; + let classAttr, idAttr; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('h1', (e) => { + classAttr = e.getAttribute('class'); + idAttr = e.getAttribute('id'); + }), + ); + await new Response(body, { headers: { 'Content-Type': 'text/html' } }).text(); + strictEqual(classAttr, 'a-class'); + strictEqual(idAttr, 'an-id'); +}); + +routes.set('/html-rewriter/remove-attribute', async () => { + const toRewrite = + 'Test

Hello, World!

'; + const expected = + 'Test

Hello, World!

'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('h1', (e) => { + e.removeAttribute('class'); + e.removeAttribute('id'); + e.removeAttribute('custom-attr'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/replace-with', async () => { + const toRewrite = + 'Test

Hello, World!

'; + const expected = + 'Test

Goodbye, World!

'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('h1', (e) => { + e.replaceWith('

Goodbye, World!

'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/replace-children', async () => { + const toRewrite = + 'Test

Hello, World!

'; + const expected = + 'Test

Goodbye, World!

'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('h1', (e) => { + e.replaceChildren('Goodbye, World!'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/insert', async () => { + const toRewrite = + 'Test

Hello, World!

'; + const expected = + 'TestBefore -

Prefix - Hello, World! - Suffix

- After'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('h1', (e) => { + e.before('Before - '); + e.prepend('Prefix - '); + e.append(' - Suffix'); + e.after(' - After'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/complex-selector', async () => { + const toRewrite = + "Test

Hello, World!

Hello again, World!

"; + const expected = + "Test

Hello, World!

Hello again, World!

"; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('div.a-class > h1[id^="an"]', (e) => { + e.setAttribute('custom-attr', 'custom-value'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/no-match-rewritten-content', async () => { + const toRewrite = + 'Test
'; + const expected = + 'Test

Hello, World!

'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream() + .onElement('div', (e) => { + e.setAttribute('class', 'a-class'); + e.append('

Hello, World!

'); + }) + .onElement('h1', (e) => { + // should not be called, as h1 does not exist in original content + e.setAttribute('custom-attr', 'custom-value'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/multiple-handlers', async () => { + const toRewrite = + "Test

Hello, World!

Hello again, World!

"; + const expected = + 'Test

Hello, World!

Hello again, World!

'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream() + .onElement('div.a-class', (e) => { + e.setAttribute('custom-attr', 'custom-value'); + }) + .onElement('h1', (e) => { + e.setAttribute('another-attr', 'another-value'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/invalid-selector', async () => { + assertThrows(() => { + new HTMLRewritingStream().onElement('div..a-class', (e) => { + e.setAttribute('custom-attr', 'custom-value'); + }); + }, Error); +}); + +routes.set('/html-rewriter/invalid-handler', async () => { + assertThrows(() => { + new HTMLRewritingStream().onElement( + 'div.a-class', + 'this is not a function', + ); + }, Error); +}); + +routes.set('/html-rewriter/throw-in-handler', async () => { + const toRewrite = + "Test

Hello, World!

"; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('div.a-class', (e) => { + throw new Error('This is an error from the handler'); + }), + ); + assertRejects(async () => { + await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + }, Error); +}); + +routes.set('/html-rewriter/invalid-html', async () => { + const toRewrite = 'This is not HTML content'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('div.a-class', (e) => { + e.setAttribute('custom-attr', 'custom-value'); + }), + ); + assertRejects(async () => { + await new Response(body, { + headers: { 'Content-Type': 'text/plain' }, + }).text(); + }, Error); +}); + +routes.set('/html-rewriter/insertion-order', async () => { + const toRewrite = + 'Test

Hello, World!

'; + const expected = + 'TestFirst - Before -

Prefix - Other Prefix - Hello, World! - Suffix - Other Suffix

- After - Last'; + let body = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('h1', (e) => { + e.before('First - '); + e.before('Before - '); + // The insertion position is maintained, so prepends are inserted in reverse order + e.prepend('Other Prefix - '); + e.prepend('Prefix - '); + e.append(' - Suffix'); + e.append(' - Other Suffix'); + // The insertion position is maintained, so afters are inserted in reverse order + e.after(' - Last'); + e.after(' - After'); + }), + ); + let text = await new Response(body, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(text, expected); +}); + +routes.set('/html-rewriter/escape-html', async () => { + const toRewrite = + 'Test

Hello, World!

'; + const expectedNoEscape = + 'Test

Hello, Beautiful World!

'; + const expectedEscape = + 'Test

Hello, <strong>Beautiful</strong> World!

'; + let bodyNoEscape = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('em', (e) => { + e.before('Beautiful ', { escapeHTML: false }); + }), + ); + let textNoEscape = await new Response(bodyNoEscape, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(textNoEscape, expectedNoEscape); + + let bodyEscape = new Response(toRewrite, { + headers: { 'Content-Type': 'text/html' }, + }).body.pipeThrough( + new HTMLRewritingStream().onElement('em', (e) => { + e.before('Beautiful ', { escapeHTML: true }); + }), + ); + let textEscape = await new Response(bodyEscape, { + headers: { 'Content-Type': 'text/html' }, + }).text(); + strictEqual(textEscape, expectedEscape); +}); diff --git a/integration-tests/js-compute/fixtures/app/src/index.js b/integration-tests/js-compute/fixtures/app/src/index.js index 1ecbd89ebc..68b67a3fc2 100644 --- a/integration-tests/js-compute/fixtures/app/src/index.js +++ b/integration-tests/js-compute/fixtures/app/src/index.js @@ -26,6 +26,7 @@ import './fastly-global.js'; import './fetch-errors.js'; import './geoip.js'; import './headers.js'; +import './html-rewriter.js'; import './include-bytes.js'; import './logger.js'; import './manual-framing-headers.js'; diff --git a/integration-tests/js-compute/fixtures/app/tests.json b/integration-tests/js-compute/fixtures/app/tests.json index f7fe5d5505..d9710e2ffd 100644 --- a/integration-tests/js-compute/fixtures/app/tests.json +++ b/integration-tests/js-compute/fixtures/app/tests.json @@ -2998,5 +2998,20 @@ }, "GET /compute/purge-surrogate-key-soft": { "environments": ["compute"] - } + }, + "GET /html-rewriter/set-attribute": {}, + "GET /html-rewriter/get-attribute": {}, + "GET /html-rewriter/remove-attribute": {}, + "GET /html-rewriter/replace-with": {}, + "GET /html-rewriter/replace-children": {}, + "GET /html-rewriter/insert": {}, + "GET /html-rewriter/complex-selector": {}, + "GET 
/html-rewriter/no-match-rewritten-content": {}, + "GET /html-rewriter/multiple-handlers": {}, + "GET /html-rewriter/invalid-selector": {}, + "GET /html-rewriter/invalid-handler": {}, + "GET /html-rewriter/throw-in-handler": {}, + "GET /html-rewriter/invalid-html": {}, + "GET /html-rewriter/insertion-order": {}, + "GET /html-rewriter/escape-html": {} } diff --git a/runtime/fastly/CMakeLists.txt b/runtime/fastly/CMakeLists.txt index e12e6c4b27..3bed3fc4c5 100644 --- a/runtime/fastly/CMakeLists.txt +++ b/runtime/fastly/CMakeLists.txt @@ -47,6 +47,15 @@ add_builtin(fastly::fetch_event DEPENDENCIES OpenSSL) +add_rust_lib(lol_html_c_api "${CMAKE_CURRENT_SOURCE_DIR}/crates/rust-lol-html" "\"capi\"") + +add_builtin(fastly::html_rewriter + SRC + builtins/html-rewriter.cpp + INCLUDE_DIRS + ${CMAKE_CURRENT_SOURCE_DIR}/crates/rust-lol-html/include +) + add_compile_definitions(PUBLIC RUNTIME_VERSION=${RUNTIME_VERSION}) project(FastlyJS) diff --git a/runtime/fastly/builtins/html-rewriter.cpp b/runtime/fastly/builtins/html-rewriter.cpp new file mode 100644 index 0000000000..7404a088f5 --- /dev/null +++ b/runtime/fastly/builtins/html-rewriter.cpp @@ -0,0 +1,589 @@ +#include "html-rewriter.h" +#include "../../../StarlingMonkey/builtins/web/streams/transform-stream-default-controller.h" +#include "../../../StarlingMonkey/builtins/web/streams/transform-stream.h" +#include "../../../StarlingMonkey/runtime/encode.h" +#include "../host-api/host_api_fastly.h" +#include + +using builtins::web::streams::TransformStream; +using builtins::web::streams::TransformStreamDefaultController; + +namespace fastly::html_rewriter { +const JSFunctionSpec Element::static_methods[] = {JS_FS_END}; +const JSPropertySpec Element::static_properties[] = {JS_PS_END}; +const JSFunctionSpec Element::methods[] = { + JS_FN("before", before, 1, JSPROP_ENUMERATE), + JS_FN("prepend", prepend, 1, JSPROP_ENUMERATE), + JS_FN("append", append, 1, JSPROP_ENUMERATE), + JS_FN("after", after, 1, JSPROP_ENUMERATE), + 
JS_FN("getAttribute", getAttribute, 1, JSPROP_ENUMERATE), + JS_FN("setAttribute", setAttribute, 2, JSPROP_ENUMERATE), + JS_FN("removeAttribute", removeAttribute, 1, JSPROP_ENUMERATE), + JS_FN("replaceChildren", replaceChildren, 1, JSPROP_ENUMERATE), + JS_FN("replaceWith", replaceWith, 1, JSPROP_ENUMERATE), + JS_FS_END}; +const JSPropertySpec Element::properties[] = { + JS_PSG("selector", Element::selector_get, JSPROP_ENUMERATE), + JS_PSG("tag", Element::tag_get, JSPROP_ENUMERATE), JS_PS_END}; + +lol_html_element_t *raw_element(JSObject *self) { + MOZ_ASSERT(Element::is_instance(self)); + return static_cast( + JS::GetReservedSlot(self, static_cast(Element::Slots::Raw)).toPrivate()); +} + +bool Element::selector_get(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(0) + auto selector = JS::GetReservedSlot(self, static_cast(Slots::Selector)).toString(); + args.rval().setString(selector); + return true; +} + +bool Element::tag_get(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(0) + auto element = raw_element(self); + MOZ_ASSERT(element); + auto str = lol_html_element_tag_name_get(element); + args.rval().setString(JS_NewStringCopyN(cx, str.data, str.len)); + return true; +} + +// We should escape only if the user supplies { escapeHTML: true } in the options object +static bool should_escape_html(JSContext *cx, JS::HandleValue options_arg) { + if (!options_arg.isObject()) { + return false; + } + + JS::RootedObject options_obj(cx, &options_arg.toObject()); + JS::RootedValue escape_html_val(cx); + if (!JS_GetProperty(cx, options_obj, "escapeHTML", &escape_html_val)) { + return false; + } + return escape_html_val.isBoolean() && escape_html_val.toBoolean(); +} + +bool Element::before(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(1) + + JS::HandleValue content_arg = args.get(0); + auto content = core::encode(cx, content_arg); + if (!content) { + return false; + } + + auto element = raw_element(self); + MOZ_ASSERT(element); + 
return lol_html_element_before(element, content.begin(), content.size(), + !should_escape_html(cx, args.get(1))) == 0; +} + +bool Element::prepend(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(1) + + JS::HandleValue content_arg = args.get(0); + auto content = core::encode(cx, content_arg); + if (!content) { + return false; + } + + auto element = raw_element(self); + MOZ_ASSERT(element); + return lol_html_element_prepend(element, content.begin(), content.size(), + !should_escape_html(cx, args.get(1))) == 0; +} + +bool Element::append(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(1) + + JS::HandleValue content_arg = args.get(0); + auto content = core::encode(cx, content_arg); + if (!content) { + return false; + } + + auto element = raw_element(self); + MOZ_ASSERT(element); + return lol_html_element_append(element, content.begin(), content.size(), + !should_escape_html(cx, args.get(1))) == 0; +} + +bool Element::after(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(1) + + JS::HandleValue content_arg = args.get(0); + auto content = core::encode(cx, content_arg); + if (!content) { + return false; + } + auto element = raw_element(self); + MOZ_ASSERT(element); + return lol_html_element_after(element, content.begin(), content.size(), + !should_escape_html(cx, args.get(1))) == 0; +} + +bool Element::getAttribute(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(1) + + JS::HandleValue name_arg = args.get(0); + auto name = core::encode(cx, name_arg); + if (!name) { + return false; + } + + auto element = raw_element(self); + MOZ_ASSERT(element); + auto attr = lol_html_element_get_attribute(element, name.begin(), name.size()); + if (!attr.data) { + args.rval().setNull(); + } else { + args.rval().setString(JS_NewStringCopyN(cx, attr.data, attr.len)); + } + + return true; +} + +bool Element::setAttribute(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(2) + + JS::HandleValue name_arg = args.get(0); + auto 
name = core::encode(cx, name_arg); + if (!name) { + return false; + } + + JS::HandleValue value_arg = args.get(1); + auto value = core::encode(cx, value_arg); + if (!value) { + return false; + } + + auto element = raw_element(self); + MOZ_ASSERT(element); + return lol_html_element_set_attribute(element, name.begin(), name.size(), value.begin(), + value.size()) == 0; +} + +bool Element::removeAttribute(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(1) + + JS::HandleValue name_arg = args.get(0); + auto name = core::encode(cx, name_arg); + if (!name) { + return false; + } + + auto element = raw_element(self); + MOZ_ASSERT(element); + lol_html_element_remove_attribute(element, name.begin(), name.size()); + + return true; +} + +bool Element::replaceChildren(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(1) + + JS::HandleValue content_arg = args.get(0); + auto content = core::encode(cx, content_arg); + if (!content) { + return false; + } + + auto element = raw_element(self); + MOZ_ASSERT(element); + return lol_html_element_set_inner_content(element, content.begin(), content.size(), + !should_escape_html(cx, args.get(1))) == 0; +} + +bool Element::replaceWith(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(1) + + JS::HandleValue content_arg = args.get(0); + auto content = core::encode(cx, content_arg); + if (!content) { + return false; + } + + auto element = raw_element(self); + MOZ_ASSERT(element); + return lol_html_element_replace(element, content.begin(), content.size(), + !should_escape_html(cx, args.get(1))) == 0; +} + +static JSObject *create_element(JSContext *cx, lol_html_element_t *element, + JS::HandleString selector) { + JS::RootedObject obj(cx, JS_NewObjectWithGivenProto(cx, &Element::class_, Element::proto_obj)); + if (!obj) { + return nullptr; + } + JS::SetReservedSlot(obj, static_cast(Element::Slots::Raw), JS::PrivateValue(element)); + JS::SetReservedSlot(obj, static_cast(Element::Slots::Selector), + 
JS::StringValue(selector)); + return obj; +} + +const JSFunctionSpec HTMLRewritingStream::static_methods[] = {JS_FS_END}; +const JSPropertySpec HTMLRewritingStream::static_properties[] = {JS_PS_END}; +const JSFunctionSpec HTMLRewritingStream::methods[] = { + JS_FN("onElement", onElement, 2, JSPROP_ENUMERATE), JS_FS_END}; +const JSPropertySpec HTMLRewritingStream::properties[] = { + JS_PSG("readable", HTMLRewritingStream::readable_get, JSPROP_ENUMERATE), + JS_PSG("writable", HTMLRewritingStream::writable_get, JSPROP_ENUMERATE), JS_PS_END}; + +static lol_html_rewriter_builder_t *builder(JSObject *self) { + MOZ_ASSERT(HTMLRewritingStream::is_instance(self)); + return static_cast( + JS::GetReservedSlot(self, static_cast(HTMLRewritingStream::Slots::RawBuilder)) + .toPrivate()); +} + +static void set_builder(JSObject *self, lol_html_rewriter_builder_t *builder) { + MOZ_ASSERT(HTMLRewritingStream::is_instance(self)); + JS::SetReservedSlot(self, static_cast(HTMLRewritingStream::Slots::RawBuilder), + JS::PrivateValue(builder)); +} + +static JSObject *transform(JSObject *self) { + MOZ_ASSERT(HTMLRewritingStream::is_instance(self)); + return &JS::GetReservedSlot(self, HTMLRewritingStream::Slots::Transform).toObject(); +} + +static lol_html_rewriter_t *raw_rewriter(JSObject *self) { + MOZ_ASSERT(HTMLRewritingStream::is_instance(self)); + return static_cast( + JS::GetReservedSlot(self, static_cast(HTMLRewritingStream::Slots::RawRewriter)) + .toPrivate()); +} +static void set_raw_rewriter(JSObject *self, lol_html_rewriter_t *rewriter) { + MOZ_ASSERT(HTMLRewritingStream::is_instance(self)); + JS::SetReservedSlot(self, HTMLRewritingStream::Slots::RawRewriter, JS::PrivateValue(rewriter)); +} + +// Data needed to call an element handler from lol_html +// This also manages the lifetime of the lol-html selector +// There should be exactly one of these per element handler registered on the rewriter +// They should all be deleted when the HTMLRewritingStream is finalized +class 
ElementHandlerData { +public: + ElementHandlerData(JSContext *cx, JSObject *handler, JSString *js_selector, + lol_html_selector_t *raw_selector) + : cx_(cx), handler_(handler), js_selector_(js_selector), raw_selector_(raw_selector) {} + + ~ElementHandlerData() { lol_html_selector_free(raw_selector_); } + + JSContext *cx() const { return cx_; } + JSObject *handler() const { return handler_; } + JSString *selector() const { return js_selector_; } + +private: + JSContext *cx_; + JS::Heap handler_; + JS::Heap js_selector_; + lol_html_selector_t *raw_selector_; +}; + +// Called by lol_html when an element matching a registered selector is found +static lol_html_rewriter_directive_t handle_element(lol_html_element_t *element, void *user_data) { + auto *data = static_cast(user_data); + JS::RootedString selector(data->cx(), data->selector()); + JS::RootedObject jsElement(data->cx(), create_element(data->cx(), element, selector)); + if (!jsElement) { + return LOL_HTML_STOP; + } + JS::RootedValue jsElementVal(data->cx(), JS::ObjectValue(*jsElement)); + JS::RootedValue handlerVal(data->cx(), JS::ObjectValue(*data->handler())); + JS::HandleValueArray arg(jsElementVal); + JS::RootedValue rval(data->cx()); + if (!JS_CallFunctionValue(data->cx(), nullptr, handlerVal, arg, &rval)) { + return LOL_HTML_STOP; + } + return LOL_HTML_CONTINUE; +} + +bool HTMLRewritingStream::onElement(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER(2) + + if (!builder(self)) { + JS_ReportErrorASCII(cx, "HTMLRewriter: cannot add handlers after the rewriter has been used"); + return false; + } + + JS::HandleValue selector_arg = args.get(0); + auto selector_str = core::encode(cx, selector_arg); + if (!selector_str) { + return false; + } + + JS::HandleValue handler = args.get(1); + if (!handler.isObject() || !JS_ObjectIsFunction(&handler.toObject())) { + JS_ReportErrorASCII(cx, "HTMLRewriter: element handler must be a function"); + return false; + } + + auto raw_selector = 
lol_html_selector_parse(selector_str.begin(), selector_str.size()); + if (!raw_selector) { + auto error = lol_html_take_last_error(); + if (error.data) { + // Error may not be null-terminated + std::string msg(error.data, error.len); + JS_ReportErrorASCII(cx, "HTMLRewriter: invalid selector - %s", msg.c_str()); + } else { + JS_ReportErrorASCII(cx, "HTMLRewriter: invalid selector"); + } + return false; + } + // Create a unique_ptr so we don't leak if we error out below + auto handler_data = std::make_unique(cx, &handler.toObject(), + selector_arg.toString(), raw_selector); + + if (lol_html_rewriter_builder_add_element_content_handlers( + builder(self), raw_selector, handle_element, handler_data.get(), nullptr, nullptr, + nullptr, nullptr) != 0) { + return false; + } + + // This slot holds all element handlers so we can free them when the stream is finalized. + // This will also free the lol-html selectors. + auto element_handlers = static_cast *>( + JS::GetReservedSlot(self, static_cast(Slots::ElementHandlers)).toPrivate()); + element_handlers->push_back(handler_data.release()); + + args.rval().setObject(*self); + return true; +} + +struct OutputContextData { + JSContext *cx; + JSObject *self; +}; + +void HTMLRewritingStream::finalize(JS::GCContext *gcx, JSObject *self) { + MOZ_ASSERT(is_instance(self)); + auto build = builder(self); + if (build) { + lol_html_rewriter_builder_free(static_cast(build)); + } + + auto element_handlers = static_cast *>( + JS::GetReservedSlot(self, static_cast(Slots::ElementHandlers)).toPrivate()); + if (element_handlers) { + for (auto handler : *element_handlers) { + delete handler; + } + delete element_handlers; + } + + auto output_context = static_cast( + JS::GetReservedSlot(self, HTMLRewritingStream::Slots::OutputContext).toPrivate()); + if (output_context) { + delete output_context; + } + + auto rewriter = raw_rewriter(self); + if (rewriter) { + lol_html_rewriter_free(static_cast(rewriter)); + } +} + +static void 
output_callback(const char *chunk, size_t chunk_len, void *user_data) { + auto *ctx = static_cast(user_data); + JSContext *cx = ctx->cx; + JSObject *self = ctx->self; + + JS::RootedObject out_obj(cx, JS_NewUint8Array(cx, chunk_len)); + if (!out_obj) { + return; + } + + { + bool is_shared; + JS::AutoCheckCannotGC nogc(cx); + uint8_t *out_buffer = JS_GetUint8ArrayData(out_obj, &is_shared, nogc); + memcpy(out_buffer, chunk, chunk_len); + } + + JS::RootedObject controller(cx, TransformStream::controller(transform(self))); + JS::RootedValue out_chunk(cx, JS::ObjectValue(*out_obj)); + + if (!TransformStreamDefaultController::Enqueue(cx, controller, out_chunk)) { + return; + } +} + +// lol-html doesn't support registering handlers after any input has been processed, +// so we finalize the builder and create the rewriter on the first chunk submitted to transform +bool HTMLRewritingStream::finish_building(JSContext *cx, JS::HandleObject stream) { + MOZ_ASSERT(is_instance(stream)); + + // The output callback needs the JSContext and the stream object so it can enqueue into output + // stream. We use a unique_ptr to ensure we don't leak if something fails. 
+ auto output_context = std::make_unique(cx, stream); + // Same defaults as Rust + lol_html_memory_settings_t memory_settings = {1024, std::numeric_limits::max()}; + auto encoding_string_length = 5; // "utf-8" + auto rewriter = + lol_html_rewriter_build(builder(stream), "utf-8", encoding_string_length, memory_settings, + output_callback, output_context.get(), true); + if (!rewriter) { + return false; + } + + JS_SetReservedSlot(stream, HTMLRewritingStream::Slots::OutputContext, + JS::PrivateValue(output_context.release())); + set_raw_rewriter(stream, rewriter); + + // The builder is no longer needed after building the rewriter and can be safely freed + lol_html_rewriter_builder_free(builder(stream)); + set_builder(stream, nullptr); // Ensure we don't try to free it again in finalize + + return true; +} + +bool HTMLRewritingStream::transformAlgorithm(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER_WITH_NAME(1, "HTML rewriter transform algorithm") + + if (!raw_rewriter(self)) { + HTMLRewritingStream::finish_building(cx, self); + } + + auto rewriter = raw_rewriter(self); + MOZ_ASSERT(rewriter); + + auto chunk = args.get(0); + auto data = value_to_buffer(cx, chunk, "HTMLRewritingStream transform: chunks"); + if (!data.has_value()) { + return false; + } + + if (data->size() == 0) { + return true; + } + + lol_html_take_last_error(); // Clear any previous error + // lol-html will call output_callback with the processed data + lol_html_rewriter_write(rewriter, reinterpret_cast(data->data()), data->size()); + auto err = lol_html_take_last_error(); + if (err.data) { + // Error may not be null-terminated + JS_ReportErrorASCII(cx, "Error processing HTML: %s", std::string(err.data, err.len).c_str()); + return false; + } + + args.rval().setUndefined(); + return true; +} + +bool HTMLRewritingStream::flushAlgorithm(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER_WITH_NAME(0, "HTML rewriter flush algorithm") + + // Just in case the stream is flushed 
immediately + if (!raw_rewriter(self)) { + HTMLRewritingStream::finish_building(cx, self); + } + + auto rewriter = raw_rewriter(self); + MOZ_ASSERT(rewriter); + + lol_html_take_last_error(); // Clear any previous error + // lol-html will call output_callback with any remaining data + lol_html_rewriter_end(rewriter); + auto err = lol_html_take_last_error(); + if (err.data) { + // Error may not be null-terminated + JS_ReportErrorASCII(cx, "Error processing HTML: %s", std::string(err.data, err.len).c_str()); + return false; + } + + args.rval().setUndefined(); + return true; +} + +bool HTMLRewritingStream::readable_get(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER_WITH_NAME(0, "get readable") + args.rval().setObject(*TransformStream::readable(transform(self))); + return true; +} + +bool HTMLRewritingStream::writable_get(JSContext *cx, unsigned argc, JS::Value *vp) { + METHOD_HEADER_WITH_NAME(0, "get writable") + args.rval().setObject(*TransformStream::writable(transform(self))); + return true; +} + +JS::PersistentRooted transformAlgo; +JS::PersistentRooted flushAlgo; + +bool HTMLRewritingStream::constructor(JSContext *cx, unsigned argc, JS::Value *vp) { + CTOR_HEADER("HTMLRewritingStream", 0) + + JS::RootedObject instance(cx, JS_NewObjectForConstructor(cx, &class_, args)); + if (!instance) { + return false; + } + auto builder = lol_html_rewriter_builder_new(); + if (!builder) { + return false; + } + set_builder(instance, builder); + // We have no rewriter initially; it will be created on the first chunk processed + set_raw_rewriter(instance, nullptr); + JS::RootedValue stream_val(cx, JS::ObjectValue(*instance)); + JS::RootedObject transform(cx, TransformStream::create(cx, 1, nullptr, 0, nullptr, stream_val, + nullptr, transformAlgo, flushAlgo)); + if (!transform) { + return false; + } + + TransformStream::set_used_as_mixin(transform); + JS::SetReservedSlot(instance, HTMLRewritingStream::Slots::Transform, JS::ObjectValue(*transform)); + + 
JS::SetReservedSlot(instance, static_cast(Slots::ElementHandlers), + JS::PrivateValue(new std::vector())); + + args.rval().setObject(*instance); + return true; +} + +bool HTMLRewritingStream::init_class(JSContext *cx, JS::HandleObject global) { + if (!init_class_impl(cx, global)) { + return false; + } + + JSFunction *transformFun = + JS_NewFunction(cx, transformAlgorithm, 1, 0, "HTML Rewriter Transform"); + if (!transformFun) + return false; + transformAlgo.init(cx, JS_GetFunctionObject(transformFun)); + + JSFunction *flushFun = JS_NewFunction(cx, flushAlgorithm, 1, 0, "HTML Rewriter Flush"); + if (!flushFun) + return false; + flushAlgo.init(cx, JS_GetFunctionObject(flushFun)); + + return true; +} + +bool install(api::Engine *engine) { + if (!HTMLRewritingStream::init_class(engine->cx(), engine->global())) { + return false; + } + + if (!Element::init_class(engine->cx(), engine->global())) { + return false; + } + + RootedObject html_rewriter_obj( + engine->cx(), + JS_GetConstructor(engine->cx(), builtins::BuiltinImpl::proto_obj)); + RootedValue html_rewriter_val(engine->cx(), ObjectValue(*html_rewriter_obj)); + RootedObject html_rewriter_ns(engine->cx(), JS_NewObject(engine->cx(), nullptr)); + if (!JS_SetProperty(engine->cx(), html_rewriter_ns, "HTMLRewritingStream", html_rewriter_val)) { + return false; + } + RootedValue html_rewriter_ns_val(engine->cx(), JS::ObjectValue(*html_rewriter_ns)); + if (!engine->define_builtin_module("fastly:html-rewriter", html_rewriter_ns_val)) { + return false; + } + + return true; +} +} // namespace fastly::html_rewriter \ No newline at end of file diff --git a/runtime/fastly/builtins/html-rewriter.h b/runtime/fastly/builtins/html-rewriter.h new file mode 100644 index 0000000000..43794b8df1 --- /dev/null +++ b/runtime/fastly/builtins/html-rewriter.h @@ -0,0 +1,62 @@ +// An HTML rewriter implementation based on TransformStreams. 
+ +#ifndef FASTLY_HTML_REWRITER_H +#define FASTLY_HTML_REWRITER_H + +#include "../host-api/host_api_fastly.h" +#include "builtin.h" +#include "extension-api.h" + +namespace fastly::html_rewriter { +class Element : public builtins::BuiltinNoConstructor { +private: + static bool selector_get(JSContext *cx, unsigned argc, JS::Value *vp); + static bool tag_get(JSContext *cx, unsigned argc, JS::Value *vp); + +public: + static constexpr const char *class_name = "Element"; + static const int ctor_length = 0; + enum Slots { Raw, Selector, Count }; + static const JSFunctionSpec static_methods[]; + static const JSPropertySpec static_properties[]; + static const JSFunctionSpec methods[]; + static const JSPropertySpec properties[]; + + static bool before(JSContext *cx, unsigned argc, JS::Value *vp); + static bool prepend(JSContext *cx, unsigned argc, JS::Value *vp); + static bool append(JSContext *cx, unsigned argc, JS::Value *vp); + static bool after(JSContext *cx, unsigned argc, JS::Value *vp); + static bool getAttribute(JSContext *cx, unsigned argc, JS::Value *vp); + static bool setAttribute(JSContext *cx, unsigned argc, JS::Value *vp); + static bool removeAttribute(JSContext *cx, unsigned argc, JS::Value *vp); + static bool replaceChildren(JSContext *cx, unsigned argc, JS::Value *vp); + static bool replaceWith(JSContext *cx, unsigned argc, JS::Value *vp); +}; + +class HTMLRewritingStream : public builtins::BuiltinImpl { +private: + static bool transformAlgorithm(JSContext *cx, unsigned argc, JS::Value *vp); + static bool flushAlgorithm(JSContext *cx, unsigned argc, JS::Value *vp); + static bool readable_get(JSContext *cx, unsigned argc, JS::Value *vp); + static bool writable_get(JSContext *cx, unsigned argc, JS::Value *vp); + +public: + static constexpr const char *class_name = "HTMLRewritingStream"; + static const int ctor_length = 0; + enum Slots { RawBuilder, RawRewriter, Buffer, OutputContext, ElementHandlers, Transform, Count }; + static const JSFunctionSpec 
static_methods[]; + static const JSPropertySpec static_properties[]; + static const JSFunctionSpec methods[]; + static const JSPropertySpec properties[]; + + static bool init_class(JSContext *cx, JS::HandleObject global); + static bool constructor(JSContext *cx, unsigned argc, JS::Value *vp); + + static bool onElement(JSContext *cx, unsigned argc, JS::Value *vp); + + static bool finish_building(JSContext *cx, JS::HandleObject stream); + static void finalize(JS::GCContext *gcx, JSObject *self); +}; +} // namespace fastly::html_rewriter + +#endif \ No newline at end of file diff --git a/runtime/fastly/crates/rust-lol-html/Cargo.toml b/runtime/fastly/crates/rust-lol-html/Cargo.toml new file mode 100644 index 0000000000..3406bbbf11 --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/Cargo.toml @@ -0,0 +1,41 @@ +[package] +name = "lol_html_c_api" +version = "1.3.0" +description = "Low output latency streaming HTML parser/rewriter" +authors = [ + "Ivan Nikulin ", + "Joshua Nelson ", +] +edition = "2021" +links = "lolhtml" +publish = false + +[features] +default = ["capi"] +# Required to exist for cargo-c to work +capi = [] + +[dependencies] +encoding_rs = "0.8.35" +lol_html = "2.6.0" +libc = "0" +thiserror = "2" + +[profile.release] +panic = "abort" +lto = true + +[lib] +crate-type = ["staticlib", "cdylib", "rlib"] + +[package.metadata.capi.header] +name = "lol_html" +subdirectory = "" +generation = false + +[package.metadata.capi.install.include] +asset = [{ from = "include/lol_html.h" }] + +[package.metadata.capi.pkg_config] +name = "lol-html" +filename = "lol-html" diff --git a/runtime/fastly/crates/rust-lol-html/LICENSE b/runtime/fastly/crates/rust-lol-html/LICENSE new file mode 100644 index 0000000000..cd274ca9e5 --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/LICENSE @@ -0,0 +1,27 @@ +Copyright (C) 2019, Cloudflare, Inc. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/runtime/fastly/crates/rust-lol-html/build.rs b/runtime/fastly/crates/rust-lol-html/build.rs new file mode 100644 index 0000000000..30b0c5330f --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/build.rs @@ -0,0 +1,2 @@ +// Required for the links attribute +fn main() {} diff --git a/runtime/fastly/crates/rust-lol-html/include/lol_html.h b/runtime/fastly/crates/rust-lol-html/include/lol_html.h new file mode 100644 index 0000000000..bd60cc4909 --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/include/lol_html.h @@ -0,0 +1,949 @@ +#ifndef LOL_HTML_H +#define LOL_HTML_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include + +// NOTE: all functions that accept pointers will panic abort the thread +// if NULL pointer is passed (with an exception for the cases where +// explicitly stated that function can accept NULL pointers). + +// NOTE: all UTF8-strings passed to the API functions allow interior '\0's +// and their length determined by the corresponding length parameter only. + +// Opaque structures used by the rewriter. +// WARNING: these structures should never be deallocated by the C code. +// There are appropriate methods exposed that take care of these structures +// deallocation. 
+typedef struct lol_html_HtmlRewriterBuilder lol_html_rewriter_builder_t; +typedef struct lol_html_HtmlRewriter lol_html_rewriter_t; +typedef struct lol_html_Doctype lol_html_doctype_t; +typedef struct lol_html_DocumentEnd lol_html_doc_end_t; +typedef struct lol_html_EndTag lol_html_end_tag_t; +typedef struct lol_html_Comment lol_html_comment_t; +typedef struct lol_html_TextChunk lol_html_text_chunk_t; +typedef struct lol_html_Element lol_html_element_t; +typedef struct lol_html_AttributesIterator lol_html_attributes_iterator_t; +typedef struct lol_html_Attribute lol_html_attribute_t; +typedef struct lol_html_Selector lol_html_selector_t; +typedef struct lol_html_CStreamingHandlerSink lol_html_streaming_sink_t; + +// Library-allocated UTF8 string fat pointer. +// +// The string is not NULL-terminated. +// +// Should NEVER be deallocated in the C code. Use special `lol_html_str_free` +// function instead. +typedef struct { + // String data pointer. + const char *data; + + // The length of the string in bytes. + size_t len; +} lol_html_str_t; + +// A fat pointer to text chunk content. +// +// The difference between this struct and `lol_html_str_t` is +// that text chunk content shouldn't be deallocated manually via +// `lol_html_str_free` method call. Instead the pointer becomes +// invalid ones related `lol_html_text_chunk_t` struct goes out +// of scope. +typedef struct { + // String data pointer. + const char *data; + + // The length of the string in bytes. + size_t len; +} lol_html_text_chunk_content_t; + +// Utilities +//--------------------------------------------------------------------- + +// Frees the memory held by the library-allocated string. +// +// This is valid to call even if `str.data == NULL` (it does nothing, like `free(NULL)`). +void lol_html_str_free(lol_html_str_t str); + +// Returns the last error message and resets last error to NULL. +// +// The `data` field will be NULL if there was no error. 
+lol_html_str_t lol_html_take_last_error(); + +// Creates new HTML rewriter builder. +lol_html_rewriter_builder_t *lol_html_rewriter_builder_new(); + +// Content handlers +//--------------------------------------------------------------------- +// Rewriter directive that should be returned from each content handler. +// If LOL_HTML_STOP directive is returned then rewriting stops immediately +// and `write()` or `end()` methods of the rewriter return an error code. +typedef enum { LOL_HTML_CONTINUE, LOL_HTML_STOP } lol_html_rewriter_directive_t; + +typedef lol_html_rewriter_directive_t (*lol_html_doctype_handler_t)(lol_html_doctype_t *doctype, + void *user_data); + +typedef lol_html_rewriter_directive_t (*lol_html_comment_handler_t)(lol_html_comment_t *comment, + void *user_data); + +typedef lol_html_rewriter_directive_t (*lol_html_text_handler_handler_t)( + lol_html_text_chunk_t *chunk, void *user_data); + +typedef lol_html_rewriter_directive_t (*lol_html_element_handler_t)(lol_html_element_t *element, + void *user_data); + +typedef lol_html_rewriter_directive_t (*lol_html_doc_end_handler_t)(lol_html_doc_end_t *doc_end, + void *user_data); + +typedef lol_html_rewriter_directive_t (*lol_html_end_tag_handler_t)(lol_html_end_tag_t *end_tag, + void *user_data); + +// `size_t` byte offsets from the start of the input document +typedef struct lol_html_SourceLocationBytes { + size_t start; + size_t end; +} lol_html_source_location_bytes_t; + +// For use with streaming content handlers. +// +// Safety: the user data and the callbacks must be safe to use from a different thread (e.g. can't +// rely on thread-local storage). It doesn't have to be `Sync`, it will be used only by one thread +// at a time. +// +// Handler functions copy this struct. It can (and should) be created on the stack. +typedef struct lol_html_CStreamingHandler { + // Anything you like + void *user_data; + // Called when the handler is supposed to produce its output. Return `0` for success. 
+ // The `sink` argument is guaranteed non-`NULL`. It is valid only for the duration of this call, + // and can only be used on the same thread. The sink is for [`lol_html_streaming_sink_write_str`] + // and [`lol_html_streaming_sink_write_utf8_chunk`]. `user_data` comes from this struct. + // `write_all_callback` must not be `NULL`. + int (*write_all_callback)(lol_html_streaming_sink_t *sink, void *user_data); + // Called exactly once, after the last use of this handler. + // `user_data` comes from this struct. + // May be `NULL`. + void (*drop_callback)(void *user_data); + // *Always* initialize to `NULL`. + void *reserved; +} lol_html_streaming_handler_t; + +// Selector +//--------------------------------------------------------------------- + +// Parses given CSS selector string. +// +// Returns NULL if parsing error occurs. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// WARNING: Selector SHOULD NOT be deallocated if there are any active rewriter +// builders that accepted it as an argument to +// `lol_html_rewriter_builder_add_element_content_handlers()` method. Deallocate all dependant +// rewriter builders first and then use `lol_html_selector_free` function to free the selector. +lol_html_selector_t *lol_html_selector_parse(const char *selector, size_t selector_len); + +// Frees the memory held by the parsed selector object. +void lol_html_selector_free(lol_html_selector_t *selector); + +// Rewriter builder +//--------------------------------------------------------------------- + +// Adds document-level content handlers to the builder. +// +// If a particular handler is not required then NULL can be passed +// instead. Don't use stub handlers in this case as this affects +// performance - rewriter skips parsing of the content that doesn't +// need to be processed. 
+// +// Each handler can optionally have associated user data which will be +// passed to the handler on each invocation along with the rewritable +// unit argument. +// +// If any of handlers return LOL_HTML_STOP directive then rewriting +// stops immediately and `write()` or `end()` of the rewriter methods +// return an error code. +// +// WARNING: Pointers passed to handlers are valid only during the +// handler execution. So they should never be leaked outside of handlers. +void lol_html_rewriter_builder_add_document_content_handlers( + lol_html_rewriter_builder_t *builder, lol_html_doctype_handler_t doctype_handler, + void *doctype_handler_user_data, lol_html_comment_handler_t comment_handler, + void *comment_handler_user_data, lol_html_text_handler_handler_t text_handler, + void *text_handler_user_data, lol_html_doc_end_handler_t doc_end_handler, + void *doc_end_user_data); + +// Adds element content handlers to the builder for the +// given CSS selector. +// +// Selector should be a valid UTF8-string. +// +// If a particular handler is not required then NULL can be passed +// instead. Don't use stub handlers in this case as this affects +// performance - rewriter skips parsing of the content that doesn't +// need to be processed. +// +// Each handler can optionally have associated user data which will be +// passed to the handler on each invocation along with the rewritable +// unit argument. +// +// If any of handlers return LOL_HTML_STOP directive then rewriting +// stops immediately and `write()` or `end()` of the rewriter methods +// return an error code. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// WARNING: Pointers passed to handlers are valid only during the +// handler execution. So they should never be leaked outside of handlers. 
+int lol_html_rewriter_builder_add_element_content_handlers( + lol_html_rewriter_builder_t *builder, const lol_html_selector_t *selector, + lol_html_element_handler_t element_handler, void *element_handler_user_data, + lol_html_comment_handler_t comment_handler, void *comment_handler_user_data, + lol_html_text_handler_handler_t text_handler, void *text_handler_user_data); + +// Frees the memory held by the builder. +// +// Note that builder can be freed before any rewriters constructed from +// it if it's not intended to be used anymore. +void lol_html_rewriter_builder_free(lol_html_rewriter_builder_t *builder); + +// Rewriter +//--------------------------------------------------------------------- + +// Memory management settings for the rewriter. +typedef struct { + // Preallocated size of the parsing buffer. + // + // Can be set to 0. In this case rewriter won't consume any memory initially, + // though there might be a performance penalty due to later reallocations. + size_t preallocated_parsing_buffer_size; + // Maximum amount of memory to be used by a rewriter. + // + // `lol_html_rewriter_write` and `lol_html_rewriter_end` will return an error + // if this limit is exceeded. + size_t max_allowed_memory_usage; +} lol_html_memory_settings_t; + +// Builds HTML-rewriter out of the provided builder. Can be called +// multiple times to construct different rewriters from the same +// builder. +// +// `output_sink` receives a zero-length chunk on the end of the output. +// +// `output_sink` can optionally have associated user data that will +// be passed to handler on each invocation along with other arguments. +// +// `strict` mode will bail out from tokenization process in cases when +// there is no way to determine correct parsing context. Recommended +// setting for safety reasons. +// +// In case of an error the function returns a NULL pointer. 
+lol_html_rewriter_t * +lol_html_rewriter_build(lol_html_rewriter_builder_t *builder, const char *encoding, + size_t encoding_len, lol_html_memory_settings_t memory_settings, + void (*output_sink)(const char *chunk, size_t chunk_len, void *user_data), + void *output_sink_user_data, bool strict); + +lol_html_rewriter_t *unstable_lol_html_rewriter_build_with_esi_tags( + lol_html_rewriter_builder_t *builder, const char *encoding, size_t encoding_len, + lol_html_memory_settings_t memory_settings, + void (*output_sink)(const char *chunk, size_t chunk_len, void *user_data), + void *output_sink_user_data, bool strict); + +// Write HTML chunk to rewriter. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// WARNING: if this function errors the rewriter gets into the unrecoverable state, +// so any further attempts to use the rewriter will cause a thread panic. +int lol_html_rewriter_write(lol_html_rewriter_t *rewriter, const char *chunk, size_t chunk_len); + +// Completes rewriting and flushes the remaining output. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// WARNING: after calling this function, further attempts to use the rewriter +// (other than `lol_html_rewriter_free`) will cause a thread panic. +int lol_html_rewriter_end(lol_html_rewriter_t *rewriter); + +// Frees the memory held by the rewriter. +void lol_html_rewriter_free(lol_html_rewriter_t *rewriter); + +// Doctype +//--------------------------------------------------------------------- + +// Returns doctype's name. +// +// The `data` field will be NULL if the doctype doesn't have a name. +lol_html_str_t lol_html_doctype_name_get(const lol_html_doctype_t *doctype); + +// Returns doctype's PUBLIC identifier. +// +// The `data` field will be NULL if the doctype doesn't have a PUBLIC identifier. 
+lol_html_str_t lol_html_doctype_public_id_get(const lol_html_doctype_t *doctype); + +// Returns doctype's SYSTEM identifier. +// +// The `data` field will be NULL if the doctype doesn't have a SYSTEM identifier. +lol_html_str_t lol_html_doctype_system_id_get(const lol_html_doctype_t *doctype); + +// Attaches custom user data to the doctype. +// +// The same doctype can be passed to multiple handlers if it has been +// captured by multiple selectors. It might be handy to store some processing +// state on the doctype, so it can be shared between handlers. +void lol_html_doctype_user_data_set(const lol_html_doctype_t *doctype, void *user_data); + +// Returns user data attached to the doctype. +void *lol_html_doctype_user_data_get(const lol_html_doctype_t *doctype); + +// Removes the doctype. +void lol_html_doctype_remove(lol_html_doctype_t *doctype); + +// Returns `true` if the doctype has been removed. +bool lol_html_doctype_is_removed(const lol_html_doctype_t *doctype); + +// Returns [`SourceLocationBytes`]. +// +// `doctype` must be valid and non-`NULL`. +lol_html_source_location_bytes_t +lol_html_doctype_source_location_bytes(lol_html_doctype_t *doctype); + +// Comment +//--------------------------------------------------------------------- + +// Returns comment text. +lol_html_str_t lol_html_comment_text_get(const lol_html_comment_t *comment); + +// Sets comment text. +// +// Text should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_comment_text_set(lol_html_comment_t *comment, const char *text, size_t text_len); + +// Inserts the content string before the comment either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. 
+int lol_html_comment_before(lol_html_comment_t *comment, const char *content, size_t content_len, + bool is_html); + +// Inserts the content string after the comment either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_comment_after(lol_html_comment_t *comment, const char *content, size_t content_len, + bool is_html); + +// Replace the comment with the content of the string which is interpreted +// either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_comment_replace(lol_html_comment_t *comment, const char *content, size_t content_len, + bool is_html); + +// Removes the comment. +// `comment` must be valid and non-`NULL`. +// +// Calls [`Comment::remove`]. +void lol_html_comment_remove(lol_html_comment_t *comment); + +// Returns `true` if the comment has been removed. +bool lol_html_comment_is_removed(const lol_html_comment_t *comment); + +// Attaches custom user data to the comment. +// +// The same comment can be passed to multiple handlers if it has been +// captured by multiple selectors. It might be handy to store some +// processing state on the comment, so it can be shared between handlers. +void lol_html_comment_user_data_set(const lol_html_comment_t *comment, void *user_data); + +// Returns user data attached to the comment. +void *lol_html_comment_user_data_get(const lol_html_comment_t *comment); + +// Returns [`SourceLocationBytes`]. +// +// `comment` must be valid and non-`NULL`. 
+lol_html_source_location_bytes_t +lol_html_comment_source_location_bytes(lol_html_comment_t *comment); + +// Element +//--------------------------------------------------------------------- + +// Returns the tag name of the element. +lol_html_str_t lol_html_element_tag_name_get(const lol_html_element_t *element); + +// Returns the tag name of the element, preserving its case. +lol_html_str_t lol_html_element_tag_name_get_preserve_case(const lol_html_element_t *element); + +// Sets the tag name of the element. +// +// Name should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_element_tag_name_set(lol_html_element_t *element, const char *name, size_t name_len); + +// Whether the tag syntactically ends with `/>`. In HTML content this is purely decorative, +// unnecessary, and has no effect of any kind. +// +// The `/>` syntax only affects parsing of elements in foreign content (SVG and MathML). +// It will never close any HTML tags that aren't already defined as void in HTML. +// +// This function only reports the parsed syntax, and will not report which elements are actually +// void in HTML. Use `lol_html_element_can_have_content` to check if the element is non-void. +// +// If the `/` is part of an unquoted attribute, it's not parsed as the self-closing syntax. +bool lol_html_element_is_self_closing(lol_html_element_t *element); + +// Whether the element can have inner content. Returns `true` unless the element is an [HTML void +// element](https://html.spec.whatwg.org/multipage/syntax.html#void-elements) or has a +// self-closing tag (eg, `<foo />`). +bool lol_html_element_can_have_content(lol_html_element_t *element); + +// Returns the namespace URI of the element. +// +// NOTE: This method returns a static zero-terminated C string, so it doesn't +// need to be freed. 
+const char *lol_html_element_namespace_uri_get(const lol_html_element_t *element); + +// Returns the iterator over the element attributes. +// +// WARNING: The iterator is valid only during the handler execution and +// should never be leaked outside of it. +// +// Use `lol_html_attributes_iterator_free` function to deallocate +// returned iterator. +lol_html_attributes_iterator_t *lol_html_attributes_iterator_get(const lol_html_element_t *element); + +// Frees the memory held by the attribute iterator. +void lol_html_attributes_iterator_free(lol_html_attributes_iterator_t *iterator); + +// Advances the iterator and returns next attribute. +// +// Returns NULL if iterator has been exhausted. +// +// WARNING: Returned attribute is valid only during the handler +// execution and should never be leaked outside of it. +const lol_html_attribute_t * +lol_html_attributes_iterator_next(lol_html_attributes_iterator_t *iterator); + +// Returns the attribute name. +lol_html_str_t lol_html_attribute_name_get(const lol_html_attribute_t *attribute); + +// Returns the attribute name, preserving its case. +lol_html_str_t lol_html_attribute_name_get_preserve_case(const lol_html_attribute_t *attribute); + +// Returns the attribute value. +lol_html_str_t lol_html_attribute_value_get(const lol_html_attribute_t *attribute); + +// Returns [`SourceLocationBytes`]. +// +// `element` must be valid and non-`NULL`. +lol_html_source_location_bytes_t +lol_html_element_source_location_bytes(lol_html_element_t *element); + +// Returns the attribute value. The `data` field will be NULL if an attribute with the given name +// doesn't exist on the element. +// +// Name should be a valid UTF8-string. +// +// If the provided name is invalid UTF8-string the function returns NULL as well. +// Therefore one should always check `lol_html_take_last_error` result after the call. 
+lol_html_str_t lol_html_element_get_attribute(const lol_html_element_t *element, const char *name, + size_t name_len); + +// Returns 1 if element has attribute with the given name, and 0 otherwise. +// Returns -1 in case of an error. +// +// Name should be a valid UTF8-string. +int lol_html_element_has_attribute(const lol_html_element_t *element, const char *name, + size_t name_len); + +// Updates the attribute value if attribute with the given name already exists on +// the element, or adds a new attribute with the given name and value otherwise. +// +// Name and value should be valid UTF8-strings. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_element_set_attribute(lol_html_element_t *element, const char *name, size_t name_len, + const char *value, size_t value_len); + +// Removes the attribute with the given name from the element. +// +// Name should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_element_remove_attribute(lol_html_element_t *element, const char *name, + size_t name_len); + +// Inserts the content string before the element either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// Calls [`Element::before`]. +int lol_html_element_before(lol_html_element_t *element, const char *content, size_t content_len, + bool is_html); + +// Inserts the content string right after the element's start tag +// either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. 
+int lol_html_element_prepend(lol_html_element_t *element, const char *content, size_t content_len, + bool is_html); + +// Inserts the content string right before the element's end tag +// either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// Calls [`Element::append`]. +int lol_html_element_append(lol_html_element_t *element, const char *content, size_t content_len, + bool is_html); + +// Inserts the content string right after the element's end tag as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_element_after(lol_html_element_t *element, const char *content, size_t content_len, + bool is_html); + +// Sets either text or HTML inner content of the element. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_element_set_inner_content(lol_html_element_t *element, const char *content, + size_t content_len, bool is_html); + +// Replaces the element with the provided text or HTML content. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_element_replace(lol_html_element_t *element, const char *content, size_t content_len, + bool is_html); + +// Removes the element. +void lol_html_element_remove(const lol_html_element_t *element); + +// Removes the element, but leaves its inner content intact. 
+void lol_html_element_remove_and_keep_content(const lol_html_element_t *element); + +// Returns `true` if the element has been removed. +bool lol_html_element_is_removed(const lol_html_element_t *element); + +// Attaches custom user data to the element. +// +// The same element can be passed to multiple handlers if it has been +// captured by multiple selectors. It might be handy to store some processing +// state on the element, so it can be shared between handlers. +void lol_html_element_user_data_set(const lol_html_element_t *element, void *user_data); + +// Returns user data attached to the element. +void *lol_html_element_user_data_get(const lol_html_element_t *element); + +// Adds content handlers to the builder for the end tag of the given element. +// +// Subsequent calls to the method on the same element adds new handler. +// They will run in the order in which they were registered. +// +// The handler can optionally have associated user data which will be +// passed to the handler on each invocation along with the rewritable +// unit argument. +// +// If the handler returns LOL_HTML_STOP directive then rewriting +// stops immediately and `write()` or `end()` of the rewriter methods +// return an error code. +// +// Not all elements (for example, `<br>`) support end tags. If this function is +// called on such an element, this function returns an error code as described +// below. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// WARNING: Pointers passed to handlers are valid only during the +// handler execution. So they should never be leaked outside of handlers. +int lol_html_element_add_end_tag_handler(lol_html_element_t *element, + lol_html_end_tag_handler_t end_tag_handler, + void *user_data); + +// Clears the handlers that would run on the end tag of the given element. +void lol_html_element_clear_end_tag_handlers(lol_html_element_t *element); + +// Inserts the content string before the element's end tag either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +// +// Calls [`EndTag::before`]. +int lol_html_end_tag_before(lol_html_end_tag_t *end_tag, const char *content, size_t content_len, + bool is_html); + +// Inserts the content string right after the element's end tag as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_end_tag_after(lol_html_end_tag_t *end_tag, const char *content, size_t content_len, + bool is_html); + +// Removes the end tag. +// `end_tag` must be valid and non-`NULL`. +// +// Calls [`EndTag::remove`]. +void lol_html_end_tag_remove(lol_html_end_tag_t *end_tag); + +// Returns the end tag name. +lol_html_str_t lol_html_end_tag_name_get(const lol_html_end_tag_t *end_tag); + +// Returns the end tag name, preserving its case. 
+lol_html_str_t lol_html_end_tag_name_get_preserve_case(const lol_html_end_tag_t *end_tag); + +// Sets the tag name of the end tag. +// +// Name should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_end_tag_name_set(lol_html_end_tag_t *end_tag, const char *name, size_t name_len); + +// Inserts the content at the end of the document, either as raw text or as HTML. +// +// The content should be a valid UTF-8 string. +// +// Returns 0 if successful, and -1 otherwise. The actual error message +// can be obtained using the `lol_html_take_last_error` function. +int lol_html_doc_end_append(lol_html_doc_end_t *doc_end, const char *content, size_t content_len, + bool is_html); + +// [`Element::streaming_prepend`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `element` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_prepend(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +// [`Element::streaming_append`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `element` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. 
+int lol_html_element_streaming_append(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +// [`Element::streaming_before`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `element` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_before(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +// [`Element::streaming_after`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `element` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_after(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +// [`Element::streaming_set_inner_content`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `element` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. 
+int lol_html_element_streaming_set_inner_content(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +// [`Element::streaming_replace`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `element` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +int lol_html_element_streaming_replace(lol_html_element_t *element, + lol_html_streaming_handler_t *streaming_writer); + +// Returns [`SourceLocationBytes`]. +// +// `end_tag` must be valid and non-`NULL`. +lol_html_source_location_bytes_t +lol_html_end_tag_source_location_bytes(lol_html_end_tag_t *end_tag); + +// [`EndTag::streaming_before`] +// +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `end_tag` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +// +// Calls [`EndTag::streaming_before`]. +int lol_html_end_tag_streaming_before(lol_html_end_tag_t *end_tag, + lol_html_streaming_handler_t *streaming_writer); + +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). 
+// +// `end_tag` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +// +// Calls [`EndTag::streaming_after`]. +int lol_html_end_tag_streaming_after(lol_html_end_tag_t *end_tag, + lol_html_streaming_handler_t *streaming_writer); + +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `end_tag` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +// +// Calls [`EndTag::streaming_replace`]. +int lol_html_end_tag_streaming_replace(lol_html_end_tag_t *end_tag, + lol_html_streaming_handler_t *streaming_writer); + +// Write another piece of UTF-8 data to the output. Returns `0` on success, and `-1` if it wasn't +// valid UTF-8. All pointers must be non-NULL. +int lol_html_streaming_sink_write_str(lol_html_streaming_sink_t *sink, const char *string_utf8, + size_t string_utf8_len, bool is_html); + +// [`StreamingHandlerSink::write_utf8_chunk`] +// +// Writes as much of the given UTF-8 fragment as possible, converting the encoding and HTML-escaping +// if `is_html` is `false`. +// +// The `bytes_utf8` doesn't need to be a complete UTF-8 string, as long as consecutive calls to this +// function create a valid UTF-8 string. Any incomplete UTF-8 sequence at the end of the content is +// buffered and flushed as soon as it's completed. +// +// Other functions like [`lol_html_streaming_sink_write_str`] should not be called after a +// `lol_html_streaming_sink_write_utf8_chunk` call with an incomplete UTF-8 sequence. +// +// Returns `0` on success, and `-1` if it wasn't valid UTF-8. +// All pointers must be non-`NULL`. 
+int lol_html_streaming_sink_write_utf8_chunk(lol_html_streaming_sink_t *sink, + const char *bytes_utf8, size_t bytes_utf8_len, + bool is_html); + +// Text chunk +//--------------------------------------------------------------------- + +// Returns a fat pointer to the UTF8 representation of content of the chunk. +// +// If the chunk is last in the current text node then content can be an empty string. +// +// WARNING: The pointer is valid only during the handler execution and +// should never be leaked outside of handlers. +lol_html_text_chunk_content_t lol_html_text_chunk_content_get(const lol_html_text_chunk_t *chunk); + +// Inserts the content string before the text chunk either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_text_chunk_before(lol_html_text_chunk_t *chunk, const char *content, + size_t content_len, bool is_html); + +// Inserts the content string after the text chunk either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_text_chunk_after(lol_html_text_chunk_t *chunk, const char *content, size_t content_len, + bool is_html); + +// Replace the text chunk with the content of the string which is interpreted +// either as raw text or as HTML. +// +// Content should be a valid UTF8-string. +// +// Returns 0 in case of success and -1 otherwise. The actual error message +// can be obtained using `lol_html_take_last_error` function. +int lol_html_text_chunk_replace(lol_html_text_chunk_t *chunk, const char *content, + size_t content_len, bool is_html); + +// Removes the text chunk. +void lol_html_text_chunk_remove(lol_html_text_chunk_t *chunk); + +// Returns `true` if the text chunk has been removed. 
+bool lol_html_text_chunk_is_removed(const lol_html_text_chunk_t *chunk); + +// Returns `true` if the chunk is last in the current text node. +// `text_chunk` must be valid and non-`NULL`. +// Returns `_Bool`. +// +// Calls [`TextChunk::last_in_text_node`]. +bool lol_html_text_chunk_is_last_in_text_node(lol_html_text_chunk_t *text_chunk); + +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `text_chunk` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +// +// Calls [`TextChunk::streaming_before`]. +int lol_html_text_chunk_streaming_before(lol_html_text_chunk_t *text_chunk, + lol_html_streaming_handler_t *streaming_writer); + +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). +// +// `text_chunk` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +// +// Calls [`TextChunk::streaming_after`]. +int lol_html_text_chunk_streaming_after(lol_html_text_chunk_t *text_chunk, + lol_html_streaming_handler_t *streaming_writer); + +// The [`CStreamingHandler`] contains callbacks that will be called +// when the content needs to be written. +// +// `streaming_writer` is copied immediately, and doesn't have a stable address. +// `streaming_writer` may be used from another thread (`Send`), but it's only going +// to be used by one thread at a time (`!Sync`). 
+// +// `text_chunk` must be valid and non-`NULL`. +// If `streaming_writer` is `NULL`, an error will be reported. +// +// Returns 0 on success. +// +// Calls [`TextChunk::streaming_replace`]. +int lol_html_text_chunk_streaming_replace(lol_html_text_chunk_t *text_chunk, + lol_html_streaming_handler_t *streaming_writer); + +// Returns [`SourceLocationBytes`]. +// +// `text_chunk` must be valid and non-`NULL`. +lol_html_source_location_bytes_t +lol_html_text_chunk_source_location_bytes(lol_html_text_chunk_t *text_chunk); + +// Attaches custom user data to the text chunk. +// +// The same text chunk can be passed to multiple handlers if it has been +// captured by multiple selectors. It might be handy to store some processing +// state on the chunk, so it can be shared between handlers. +void lol_html_text_chunk_user_data_set(lol_html_text_chunk_t *chunk, void *user_data); + +// Returns user data attached to the text chunk. +void *lol_html_text_chunk_user_data_get(const lol_html_text_chunk_t *chunk); + +#if defined(__cplusplus) +} // extern C +#endif + +#endif // LOL_HTML_H diff --git a/runtime/fastly/crates/rust-lol-html/src/comment.rs b/runtime/fastly/crates/rust-lol-html/src/comment.rs new file mode 100644 index 0000000000..8c5c9e1564 --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/src/comment.rs @@ -0,0 +1,71 @@ +use super::*; + +#[no_mangle] +pub unsafe extern "C" fn lol_html_comment_text_get(comment: *const Comment) -> Str { + Str::new(to_ref!(comment).text()) +} + +#[no_mangle] +pub unsafe extern "C" fn lol_html_comment_text_set( + comment: *mut Comment, + text: *const c_char, + text_len: size_t, +) -> c_int { + let comment = to_ref_mut!(comment); + let text = unwrap_or_ret_err_code! { to_str!(text, text_len) }; + + unwrap_or_ret_err_code! { comment.set_text(text) }; + + 0 +} + +impl_content_mutation_handlers! { comment: Comment [ + /// Inserts the content string before the comment either as raw text or as HTML. 
+ /// + /// Content should be a valid UTF8-string. + /// + /// Returns 0 in case of success and -1 otherwise. The actual error message + /// can be obtained using `lol_html_take_last_error` function. + lol_html_comment_before => before, + /// Inserts the content string after the comment either as raw text or as HTML. + /// + /// Content should be a valid UTF8-string. + /// + /// Returns 0 in case of success and -1 otherwise. The actual error message + /// can be obtained using `lol_html_take_last_error` function. + lol_html_comment_after => after, + /// Replace the comment with the content of the string which is interpreted + /// either as raw text or as HTML. + /// + /// Content should be a valid UTF8-string. + /// + /// Returns 0 in case of success and -1 otherwise. The actual error message + /// can be obtained using `lol_html_take_last_error` function. + lol_html_comment_replace => replace, + /// Removes the comment. + @VOID lol_html_comment_remove => remove, + /// Returns `true` if the comment has been removed. + @BOOL lol_html_comment_is_removed => removed, + @STREAM lol_html_comment_streaming_before => streaming_before, + @STREAM lol_html_comment_streaming_after => streaming_after, + @STREAM lol_html_comment_streaming_replace => streaming_replace, + lol_html_comment_source_location_bytes => source_location_bytes, +] } + +/// Attaches custom user data to the comment. +/// +/// The same comment can be passed to multiple handlers if it has been +/// captured by multiple selectors. It might be handy to store some +/// processing state on the comment, so it can be shared between handlers. 
+#[no_mangle] +pub unsafe extern "C" fn lol_html_comment_user_data_set( + comment: *mut Comment, + user_data: *mut c_void, +) { + to_ref_mut!(comment).set_user_data(user_data); +} + +#[no_mangle] +pub unsafe extern "C" fn lol_html_comment_user_data_get(comment: *const Comment) -> *mut c_void { + get_user_data!(comment) +} diff --git a/runtime/fastly/crates/rust-lol-html/src/doctype.rs b/runtime/fastly/crates/rust-lol-html/src/doctype.rs new file mode 100644 index 0000000000..c41f870bda --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/src/doctype.rs @@ -0,0 +1,37 @@ +use super::*; + +#[no_mangle] +pub unsafe extern "C" fn lol_html_doctype_name_get(doctype: *const Doctype) -> Str { + Str::from_opt(to_ref!(doctype).name()) +} + +#[no_mangle] +pub unsafe extern "C" fn lol_html_doctype_public_id_get(doctype: *const Doctype) -> Str { + Str::from_opt(to_ref!(doctype).public_id()) +} + +#[no_mangle] +pub unsafe extern "C" fn lol_html_doctype_system_id_get(doctype: *const Doctype) -> Str { + Str::from_opt(to_ref!(doctype).system_id()) +} + +#[no_mangle] +pub unsafe extern "C" fn lol_html_doctype_user_data_set( + doctype: *mut Doctype, + user_data: *mut c_void, +) { + to_ref_mut!(doctype).set_user_data(user_data); +} + +#[no_mangle] +pub unsafe extern "C" fn lol_html_doctype_user_data_get(doctype: *const Doctype) -> *mut c_void { + get_user_data!(doctype) +} + +impl_content_mutation_handlers! { doctype: Doctype [ + /// Removes the doctype. + @VOID lol_html_doctype_remove => remove, + /// Returns `true` if the doctype has been removed. + @BOOL lol_html_doctype_is_removed => removed, + lol_html_doctype_source_location_bytes => source_location_bytes, +] } diff --git a/runtime/fastly/crates/rust-lol-html/src/document_end.rs b/runtime/fastly/crates/rust-lol-html/src/document_end.rs new file mode 100644 index 0000000000..2b1059e913 --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/src/document_end.rs @@ -0,0 +1,11 @@ +use super::*; + +impl_content_mutation_handlers! 
{ doc_end: DocumentEnd [ + /// Inserts the content at the end of the document, either as raw text or as HTML. + /// + /// The content should be a valid UTF-8 string. + /// + /// Returns 0 if successful, and -1 otherwise. The actual error message + /// can be obtained using the `lol_html_take_last_error` function. + lol_html_doc_end_append => append, +] } diff --git a/runtime/fastly/crates/rust-lol-html/src/element.rs b/runtime/fastly/crates/rust-lol-html/src/element.rs new file mode 100644 index 0000000000..8d65e5db2c --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/src/element.rs @@ -0,0 +1,411 @@ +use super::*; +use std::slice::Iter; + +/// Returns the tag name of the element. +#[no_mangle] +pub unsafe extern "C" fn lol_html_element_tag_name_get(element: *const Element) -> Str { + let element = to_ref!(element); + + Str::new(element.tag_name()) +} + +/// Returns the tag name of the element, preserving its case. +#[no_mangle] +pub unsafe extern "C" fn lol_html_element_tag_name_get_preserve_case( + element: *const Element, +) -> Str { + let element = to_ref!(element); + + Str::new(element.tag_name_preserve_case()) +} + +/// Sets the tag name of the element. +/// +/// Name should be a valid UTF8-string. +/// +/// Returns 0 in case of success and -1 otherwise. The actual error message +/// can be obtained using `lol_html_take_last_error` function. +#[no_mangle] +pub unsafe extern "C" fn lol_html_element_tag_name_set( + element: *mut Element, + name: *const c_char, + name_len: size_t, +) -> c_int { + let element = to_ref_mut!(element); + let name = unwrap_or_ret_err_code! { to_str!(name, name_len) }; + + unwrap_or_ret_err_code! { element.set_tag_name(name) }; + + 0 +} + +/// Returns the namespace URI of the element. +/// +/// NOTE: This method returns a static zero-terminated C string, so it doesn't +/// need to be freed. 
+#[no_mangle] +pub unsafe extern "C" fn lol_html_element_namespace_uri_get( + element: *mut Element, +) -> *const c_char { + let element = to_ref!(element); + + match element.namespace_uri() { + "http://www.w3.org/1999/xhtml" => static_c_str!("http://www.w3.org/1999/xhtml"), + "http://www.w3.org/2000/svg" => static_c_str!("http://www.w3.org/2000/svg"), + "http://www.w3.org/1998/Math/MathML" => static_c_str!("http://www.w3.org/1998/Math/MathML"), + _ => unreachable!("Unknown namespace URI"), + } +} + +/// Returns the iterator over the element attributes. +/// +/// WARNING: The iterator is valid only during the handler execution and +/// should never be leaked outside of it. +/// +/// Use `lol_html_attributes_iterator_free` function to deallocate +/// returned iterator. +#[no_mangle] +pub unsafe extern "C" fn lol_html_attributes_iterator_get<'r, 't>( + element: *const Element<'r, 't>, +) -> *mut Iter<'r, Attribute<'t>> { + let attributes = to_ref!(element).attributes(); + + to_ptr_mut(attributes.iter()) +} + +// Advances the iterator and returns next attribute. +// +// Returns NULL if iterator has been exhausted. +// +// WARNING: Returned attribute is valid only during the handler +// execution and should never be leaked outside of it. +#[no_mangle] +pub unsafe extern "C" fn lol_html_attributes_iterator_next<'t>( + iterator: *mut Iter<'_, Attribute<'t>>, +) -> *const Attribute<'t> { + let iterator = to_ref_mut!(iterator); + + match iterator.next() { + Some(attr) => attr, + None => ptr::null(), + } +} + +// Frees the memory held by the attribute iterator. +#[no_mangle] +pub unsafe extern "C" fn lol_html_attributes_iterator_free(iterator: *mut Iter) { + drop(to_box!(iterator)); +} + +/// Returns the attribute name. +#[no_mangle] +pub unsafe extern "C" fn lol_html_attribute_name_get(attribute: *const Attribute) -> Str { + let attribute = to_ref!(attribute); + + Str::new(attribute.name()) +} + +/// Returns the attribute name, preserving its case. 
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_attribute_name_get_preserve_case(
+    attribute: *const Attribute,
+) -> Str {
+    let attribute = to_ref!(attribute);
+
+    Str::new(attribute.name_preserve_case())
+}
+
+/// Returns the attribute value.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_attribute_value_get(attribute: *const Attribute) -> Str {
+    let attribute = to_ref!(attribute);
+
+    Str::new(attribute.value())
+}
+
+/// Returns the attribute value. The `data` field will be NULL if an attribute with the given name
+/// doesn't exist on the element.
+///
+/// Name should be a valid UTF8-string.
+///
+/// If the provided name is invalid UTF8-string the function returns NULL as well.
+/// Therefore one should always check `lol_html_take_last_error` result after the call.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_element_get_attribute(
+    element: *const Element,
+    name: *const c_char,
+    name_len: size_t,
+) -> Str {
+    let element = to_ref!(element);
+    let name = unwrap_or_ret!(to_str!(name, name_len), Str::from_opt(None));
+
+    Str::from_opt(element.get_attribute(name))
+}
+
+/// Returns 1 if element has attribute with the given name, and 0 otherwise.
+/// Returns -1 in case of an error.
+///
+/// Name should be a valid UTF8-string.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_element_has_attribute(
+    element: *const Element,
+    name: *const c_char,
+    name_len: size_t,
+) -> c_int {
+    let element = to_ref!(element);
+    let name = unwrap_or_ret_err_code! { to_str!(name, name_len) };
+
+    if element.has_attribute(name) {
+        1
+    } else {
+        0
+    }
+}
+
+/// Updates the attribute value if an attribute with the given name already exists on
+/// the element, or adds a new attribute with the given name and value otherwise.
+///
+/// Name and value should be valid UTF8-strings.
+///
+/// Returns 0 in case of success and -1 otherwise. The actual error message
+/// can be obtained using `lol_html_take_last_error` function.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_element_set_attribute(
+    element: *mut Element,
+    name: *const c_char,
+    name_len: size_t,
+    value: *const c_char,
+    value_len: size_t,
+) -> c_int {
+    let element = to_ref_mut!(element);
+    let name = unwrap_or_ret_err_code! { to_str!(name, name_len) };
+    let value = unwrap_or_ret_err_code! { to_str!(value, value_len) };
+
+    unwrap_or_ret_err_code! { element.set_attribute(name, value) };
+
+    0
+}
+
+/// Removes the attribute with the given name from the element.
+///
+/// Name should be a valid UTF8-string.
+///
+/// Returns 0 in case of success and -1 otherwise. The actual error message
+/// can be obtained using `lol_html_take_last_error` function.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_element_remove_attribute(
+    element: *mut Element,
+    name: *const c_char,
+    name_len: size_t,
+) -> c_int {
+    let element = to_ref_mut!(element);
+    let name = unwrap_or_ret_err_code! { to_str!(name, name_len) };
+
+    element.remove_attribute(name); // result is not checked: only the UTF-8 decoding above can return -1
+
+    0
+}
+
+impl_content_mutation_handlers! { element: Element [
+    /// Inserts the content string right after the element's start tag
+    /// either as raw text or as HTML.
+    ///
+    /// Content should be a valid UTF8-string.
+    ///
+    /// Returns 0 in case of success and -1 otherwise. The actual error message
+    /// can be obtained using `lol_html_take_last_error` function.
+    lol_html_element_prepend => prepend,
+    /// Inserts the content string right before the element's end tag
+    /// either as raw text or as HTML.
+    ///
+    /// Content should be a valid UTF8-string.
+    ///
+    /// Returns 0 in case of success and -1 otherwise. The actual error message
+    /// can be obtained using `lol_html_take_last_error` function.
+    lol_html_element_append => append,
+    /// Inserts the content string before the element either as raw text or as HTML.
+    ///
+    /// Content should be a valid UTF8-string.
+    ///
+    /// Returns 0 in case of success and -1 otherwise. 
The actual error message
+    /// can be obtained using `lol_html_take_last_error` function.
+    lol_html_element_before => before,
+    /// Inserts the content string right after the element's end tag as raw text or as HTML.
+    ///
+    /// Content should be a valid UTF8-string.
+    ///
+    /// Returns 0 in case of success and -1 otherwise. The actual error message
+    /// can be obtained using `lol_html_take_last_error` function.
+    lol_html_element_after => after,
+    /// Sets either text or HTML inner content of the element.
+    ///
+    /// Content should be a valid UTF8-string.
+    ///
+    /// Returns 0 in case of success and -1 otherwise. The actual error message
+    /// can be obtained using `lol_html_take_last_error` function.
+    lol_html_element_set_inner_content => set_inner_content,
+    /// Replaces the element with the provided text or HTML content.
+    ///
+    /// Content should be a valid UTF8-string.
+    ///
+    /// Returns 0 in case of success and -1 otherwise. The actual error message
+    /// can be obtained using `lol_html_take_last_error` function.
+    lol_html_element_replace => replace,
+    /// Removes the element.
+    @VOID lol_html_element_remove => remove,
+    /// Removes the element, but leaves its inner content intact.
+    @VOID lol_html_element_remove_and_keep_content => remove_and_keep_content,
+    /// Returns `true` if the element has been removed.
+    @BOOL lol_html_element_is_removed => removed,
+    /// Whether the tag syntactically ends with `/>`. In HTML content this is purely decorative, unnecessary, and has no effect of any kind.
+    ///
+    /// The `/>` syntax only affects parsing of elements in foreign content (SVG and MathML).
+    /// It will never close any HTML tags that aren't already defined as void in HTML.
+    ///
+    /// This function only reports the parsed syntax, and will not report which elements are actually void in HTML.
+    /// Use `lol_html_element_can_have_content` to check if the element is non-void.
+    ///
+    /// If the `/` is part of an unquoted attribute, it's not parsed as the self-closing syntax.
+    @BOOL lol_html_element_is_self_closing => is_self_closing,
+    /// Whether the element can have inner content. Returns `true` unless the element is an [HTML void
+    /// element](https://html.spec.whatwg.org/multipage/syntax.html#void-elements) or has a
+    /// self-closing tag (eg, `<foo />`).
+    @BOOL lol_html_element_can_have_content => can_have_content,
+    @STREAM lol_html_element_streaming_prepend => streaming_prepend,
+    @STREAM lol_html_element_streaming_append => streaming_append,
+    @STREAM lol_html_element_streaming_before => streaming_before,
+    @STREAM lol_html_element_streaming_after => streaming_after,
+    @STREAM lol_html_element_streaming_set_inner_content => streaming_set_inner_content,
+    @STREAM lol_html_element_streaming_replace => streaming_replace,
+    lol_html_element_source_location_bytes => source_location_bytes,
+] }
+
+/// Attaches custom user data to the element.
+///
+/// The same element can be passed to multiple handlers if it has been
+/// captured by multiple selectors. It might be handy to store some processing
+/// state on the element, so it can be shared between handlers.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_element_user_data_set(
+    element: *mut Element,
+    user_data: *mut c_void,
+) {
+    to_ref_mut!(element).set_user_data(user_data);
+}
+
+/// Returns user data attached to the element.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_element_user_data_get(element: *mut Element) -> *mut c_void {
+    get_user_data!(element)
+}
+
+type EndTagHandler = unsafe extern "C" fn(*mut EndTag, *mut c_void) -> RewriterDirective;
+
+/// Adds content handlers to the builder for the end tag of the given element.
+///
+/// Subsequent calls to the method on the same element add a new handler.
+/// They will run in the order in which they were registered.
+///
+/// The handler can optionally have associated user data which will be
+/// passed to the handler on each invocation along with the rewritable
+/// unit argument.
+///
+/// If the handler returns LOL_HTML_STOP directive then rewriting
+/// stops immediately and `write()` or `end()` of the rewriter methods
+/// return an error code.
+///
+/// Not all elements (for example, `<br>`) support end tags. If this function is
+/// called on such an element, this function returns an error code as described
+/// below.
+///
+/// Returns 0 in case of success and -1 otherwise. The actual error message
+/// can be obtained using `lol_html_take_last_error` function.
+///
+/// WARNING: Pointers passed to handlers are valid only during the
+/// handler execution. So they should never be leaked outside of handlers.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_element_add_end_tag_handler(
+    element: *mut Element,
+    handler: EndTagHandler,
+    user_data: *mut c_void,
+) -> c_int {
+    let element = to_ref_mut!(element);
+
+    let handlers = unwrap_or_ret_err_code! {
+        element.end_tag_handlers().ok_or("No end tag.")
+    };
+
+    handlers.push(Box::new(move |end_tag| {
+        match unsafe { handler(end_tag, user_data) } {
+            RewriterDirective::Continue => Ok(()),
+            RewriterDirective::Stop => Err("The rewriter has been stopped.".into()),
+        }
+    }));
+
+    0
+}
+
+/// Clears the handlers that would run on the end tag of the given element.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_element_clear_end_tag_handlers(element: *mut Element) {
+    let element = to_ref_mut!(element);
+    if let Some(handlers) = element.end_tag_handlers() {
+        handlers.clear();
+    }
+}
+
+impl_content_mutation_handlers! { end_tag: EndTag [
+    /// Inserts the content string before the element's end tag either as raw text or as HTML.
+    ///
+    /// Content should be a valid UTF8-string.
+    ///
+    /// Returns 0 in case of success and -1 otherwise. The actual error message
+    /// can be obtained using `lol_html_take_last_error` function.
+    lol_html_end_tag_before => before,
+    /// Inserts the content string right after the element's end tag as raw text or as HTML.
+    ///
+    /// Content should be a valid UTF8-string.
+    ///
+    /// Returns 0 in case of success and -1 otherwise. The actual error message
+    /// can be obtained using `lol_html_take_last_error` function.
+    lol_html_end_tag_after => after,
+    lol_html_end_tag_replace => replace,
+    /// Removes the end tag.
+    @VOID lol_html_end_tag_remove => remove,
+    @STREAM lol_html_end_tag_streaming_before => streaming_before,
+    @STREAM lol_html_end_tag_streaming_after => streaming_after,
+    @STREAM lol_html_end_tag_streaming_replace => streaming_replace,
+    lol_html_end_tag_source_location_bytes => source_location_bytes,
+] }
+
+/// Returns the end tag name.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_end_tag_name_get(end_tag: *mut EndTag) -> Str {
+    let tag = to_ref_mut!(end_tag);
+    Str::new(tag.name())
+}
+
+/// Returns the end tag name, preserving its case.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_end_tag_name_get_preserve_case(end_tag: *mut EndTag) -> Str {
+    let tag = to_ref_mut!(end_tag);
+    Str::new(tag.name_preserve_case())
+}
+
+/// Sets the tag name of the end tag.
+///
+/// Name should be a valid UTF8-string.
+///
+/// Returns 0 in case of success and -1 otherwise. The actual error message
+/// can be obtained using `lol_html_take_last_error` function.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_end_tag_name_set(
+    end_tag: *mut EndTag,
+    name: *const c_char,
+    len: size_t,
+) -> c_int {
+    let tag = to_ref_mut!(end_tag);
+    let name = unwrap_or_ret_err_code! { to_str!(name, len) };
+    tag.set_name_str(name.to_string()); // passes an owned copy of `name`
+    0
+}
diff --git a/runtime/fastly/crates/rust-lol-html/src/errors.rs b/runtime/fastly/crates/rust-lol-html/src/errors.rs
new file mode 100644
index 0000000000..27413e8cbf
--- /dev/null
+++ b/runtime/fastly/crates/rust-lol-html/src/errors.rs
@@ -0,0 +1,22 @@
+use super::*;
+use std::error::Error;
+
+thread_local! 
{
+    pub static LAST_ERROR: RefCell<Option<Box<dyn Error>>> = RefCell::new(None);
+}
+
+#[no_mangle]
+pub extern "C" fn lol_html_take_last_error() -> Str {
+    let err = LAST_ERROR.with(|e| e.borrow_mut().take());
+
+    Str::from_opt(err.map(|e| e.to_string()))
+}
+
+#[derive(Error, Debug, Eq, PartialEq, Copy, Clone)]
+pub enum CStreamingHandlerError {
+    #[error("Not all fields of the struct were initialized")]
+    Uninitialized,
+
+    #[error("write_all_callback reported error: {0}")]
+    HandlerError(c_int),
+}
diff --git a/runtime/fastly/crates/rust-lol-html/src/lib.rs b/runtime/fastly/crates/rust-lol-html/src/lib.rs
new file mode 100644
index 0000000000..1c87ce3569
--- /dev/null
+++ b/runtime/fastly/crates/rust-lol-html/src/lib.rs
@@ -0,0 +1,280 @@
+#![allow(clippy::missing_safety_doc)]
+
+pub use crate::streaming::CStreamingHandler;
+use libc::{c_char, c_int, c_void, size_t};
+use lol_html::html_content::*;
+use lol_html::*;
+use std::cell::RefCell;
+use std::{ptr, slice, str};
+use thiserror::Error;
+
+#[inline]
+fn to_ptr_mut<T>(val: T) -> *mut T {
+    Box::into_raw(Box::new(val))
+}
+
+// NOTE: abort the thread if we receive NULL where unexpected
+macro_rules! assert_not_null {
+    ($var:ident) => {
+        assert!(!$var.is_null(), "{} is NULL", stringify!($var));
+    };
+}
+
+// NOTE: all these utilities are macros so we can propagate the variable
+// name to the null pointer assertion.
+macro_rules! to_ref {
+    ($ptr:ident) => {{
+        unsafe { $ptr.as_ref().expect(concat!(stringify!($ptr), " is NULL")) }
+    }};
+}
+
+macro_rules! to_ref_mut {
+    ($ptr:ident) => {{
+        unsafe { $ptr.as_mut().expect(concat!(stringify!($ptr), " is NULL")) }
+    }};
+}
+
+macro_rules! to_box {
+    ($ptr:ident) => {{
+        assert_not_null!($ptr);
+        unsafe { Box::from_raw($ptr) }
+    }};
+}
+
+macro_rules! to_bytes {
+    ($data:ident, $len:ident) => {{
+        assert_not_null!($data);
+        unsafe { slice::from_raw_parts($data as *const u8, $len) }
+    }};
+}
+
+macro_rules! 
to_str {
+    ($data:ident, $len:ident) => {
+        str::from_utf8(to_bytes!($data, $len)).into()
+    };
+}
+
+macro_rules! static_c_str {
+    ($s:expr) => {
+        concat!($s, "\0").as_ptr() as *const c_char // NUL terminator is appended at compile time
+    };
+}
+
+macro_rules! unwrap_or_ret {
+    ($expr:expr, $ret_val:expr) => {
+        match $expr {
+            Ok(v) => v,
+            Err(err) => {
+                crate::errors::LAST_ERROR.with(|e| *e.borrow_mut() = Some(err.into())); // read back by `lol_html_take_last_error`
+                return $ret_val;
+            }
+        }
+    };
+}
+
+macro_rules! unwrap_or_ret_err_code {
+    ($expr:expr) => {
+        unwrap_or_ret!($expr, -1)
+    };
+}
+
+macro_rules! unwrap_or_ret_null {
+    ($expr:expr) => {
+        unwrap_or_ret!($expr, ptr::null_mut())
+    };
+}
+
+macro_rules! impl_content_mutation_handlers {
+    ($name:ident: $typ:ty [ $($(#[$meta:meta])* $(@$kind:ident)? $fn_name:ident => $method:ident),+$(,)? ]) => {
+        $(
+            // stable Rust can't concatenate idents, so fn_name must be written out manually,
+            // but it is possible to compare concatenated strings.
+            #[cfg(debug_assertions)]
+            const _: () = {
+                let expected_fn_name_prefix = concat!("lol_html_", stringify!($name), "_").as_bytes();
+                let fn_name = stringify!($fn_name).as_bytes();
+                // removed vs is_removed prevents exact comparison
+                assert!(fn_name.len() >= expected_fn_name_prefix.len() + (stringify!($method).len()), stringify!($fn_name));
+                let mut i = 0;
+                while i < expected_fn_name_prefix.len() {
+                    assert!(expected_fn_name_prefix[i] == fn_name[i], stringify!($fn_name));
+                    i += 1;
+                }
+            };
+            impl_content_mutation_handlers! { IMPL $($kind)? $name: $typ, $(#[$meta])* $fn_name => $method }
+        )+
+    };
+    (IMPL $name:ident: $typ:ty, $fn_name:ident => source_location_bytes) => {
+        /// Returns [`SourceLocationBytes`].
+        ///
+        #[doc = concat!(" `", stringify!($name), "` must be valid and non-`NULL`.")]
+        #[no_mangle]
+        pub unsafe extern "C" fn $fn_name($name: *mut $typ) -> SourceLocationBytes {
+            let loc = to_ref_mut!($name).source_location().bytes();
+            SourceLocationBytes { // copy the range into the `#[repr(C)]` struct declared in this crate
+                start: loc.start,
+                end: loc.end,
+            }
+        }
+    };
+    (IMPL $name:ident: $typ:ty, $(#[$meta:meta])* $fn_name:ident => $method:ident) => {
+        $(#[$meta])*
+        /// The `content` must be a valid UTF-8 string. It's copied immediately.
+        /// If `is_html` is `true`, then the `content` will be written without HTML-escaping.
+        ///
+        #[doc = concat!(" `", stringify!($name), "` must be valid and non-`NULL`.")]
+        /// If `content` is `NULL`, an error will be reported.
+        ///
+        /// Returns 0 on success.
+        ///
+        #[doc = concat!(" Calls [`", stringify!($typ), "::", stringify!($method), "`].")]
+        #[no_mangle]
+        pub unsafe extern "C" fn $fn_name(
+            $name: *mut $typ,
+            content: *const c_char,
+            content_len: size_t,
+            is_html: bool,
+        ) -> c_int {
+            content_insertion_fn_body! { $name.$method(content, content_len, is_html) }
+        }
+    };
+    (IMPL STREAM $name:ident: $typ:ty, $(#[$meta:meta])* $fn_name:ident => $method:ident) => {
+        $(#[$meta])*
+        /// The [`CStreamingHandler`] contains callbacks that will be called
+        /// when the content needs to be written.
+        ///
+        /// `streaming_writer` is copied immediately, and doesn't have a stable address.
+        /// `streaming_writer` may be used from another thread (`Send`), but it's only going
+        /// to be used by one thread at a time (`!Sync`).
+        ///
+        #[doc = concat!(" `", stringify!($name), "` must be valid and non-`NULL`.")]
+        /// If `streaming_writer` is `NULL`, an error will be reported.
+        ///
+        /// Returns 0 on success.
+        ///
+        #[doc = concat!(" Calls [`", stringify!($typ), "::", stringify!($method), "`].")]
+        #[no_mangle]
+        pub unsafe extern "C" fn $fn_name(
+            $name: *mut $typ,
+            streaming_writer: *mut CStreamingHandler,
+        ) -> c_int {
+            content_insertion_fn_body! 
{ $name.$method(streaming_writer) }
+        }
+    };
+    (IMPL VOID $name:ident: $typ:ty, $(#[$meta:meta])* $fn_name:ident => $method:ident) => {
+        $(#[$meta])*
+        #[doc = concat!(" `", stringify!($name), "` must be valid and non-`NULL`.")]
+        ///
+        #[doc = concat!(" Calls [`", stringify!($typ), "::", stringify!($method), "`].")]
+        #[no_mangle]
+        pub unsafe extern "C" fn $fn_name(
+            $name: *mut $typ,
+        ) {
+            to_ref_mut!($name).$method();
+        }
+    };
+    (IMPL BOOL $name:ident: $typ:ty, $(#[$meta:meta])* $fn_name:ident => $method:ident) => {
+        $(#[$meta])*
+        #[doc = concat!(" `", stringify!($name), "` must be valid and non-`NULL`.")]
+        /// Returns `_Bool`.
+        ///
+        #[doc = concat!(" Calls [`", stringify!($typ), "::", stringify!($method), "`].")]
+        #[no_mangle]
+        pub unsafe extern "C" fn $fn_name(
+            $name: *mut $typ,
+        ) -> bool {
+            to_ref_mut!($name).$method()
+        }
+    };
+}
+
+macro_rules! content_insertion_fn_body {
+    ($target:ident.$method:ident($content:ident, $content_len:ident, $is_html:ident)) => {{
+        let target = to_ref_mut!($target);
+        let content = unwrap_or_ret_err_code! { to_str!($content, $content_len) };
+
+        target.$method(
+            content,
+            if $is_html {
+                ContentType::Html
+            } else {
+                ContentType::Text
+            },
+        );
+
+        0
+    }};
+    ($target:ident.$method:ident($handler:expr)) => {{
+        let handler_ptr: *mut CStreamingHandler = $handler;
+        if unsafe { handler_ptr.as_ref() }.is_none()
+            || !handler_ptr.as_ref().unwrap().reserved.is_null()
+        {
+            // we can't even safely call drop callback on this
+            return -1;
+        }
+        // Taking ownership of the CStreamingHandler
+        let handler: Box<CStreamingHandler> = Box::new(unsafe { handler_ptr.read() });
+        if handler.write_all_callback.is_none() {
+            return -1;
+        }
+        if let Some(target) = unsafe { $target.as_mut() } {
+            target.$method(handler);
+            0
+        } else {
+            -1
+        }
+    }};
+}
+
+macro_rules! 
get_user_data {
+    ($unit:ident) => {
+        to_ref!($unit)
+            .user_data()
+            .downcast_ref::<*mut c_void>()
+            .map(|d| *d)
+            .unwrap_or(ptr::null_mut()) // NULL when the stored user data is not a `*mut c_void`
+    };
+}
+
+pub mod comment;
+pub mod doctype;
+pub mod document_end;
+pub mod element;
+pub mod errors;
+pub mod rewriter;
+pub mod rewriter_builder;
+pub mod selector;
+pub mod streaming;
+pub mod string;
+pub mod text_chunk;
+
+pub use self::string::Str;
+
+/// `size_t` byte offsets from the start of the input document
+#[repr(C)]
+pub struct SourceLocationBytes {
+    pub start: usize,
+    pub end: usize,
+}
+
+// NOTE: prevent dead code from complaining about enum
+// never being constructed in the Rust code.
+pub use self::rewriter_builder::RewriterDirective;
+
+/// An error that occurs if incorrect [`encoding`] label was provided in [`Settings`].
+///
+/// [`encoding`]: ../struct.Settings.html#structfield.encoding
+/// [`Settings`]: ../struct.Settings.html
+#[derive(Error, Debug, PartialEq, Copy, Clone)]
+pub enum EncodingError {
+    /// The provided value doesn't match any of the [labels specified in the standard].
+    ///
+    /// [labels specified in the standard]: https://encoding.spec.whatwg.org/#names-and-labels
+    #[error("Unknown character encoding has been provided.")]
+    UnknownEncoding,
+
+    /// The provided label is for one of the non-ASCII-compatible encodings (`UTF-16LE`, `UTF-16BE`,
+    /// `ISO-2022-JP` and `replacement`). These encodings are not supported.
+    #[error("Expected ASCII-compatible encoding.")]
+    NonAsciiCompatibleEncoding,
+}
diff --git a/runtime/fastly/crates/rust-lol-html/src/rewriter.rs b/runtime/fastly/crates/rust-lol-html/src/rewriter.rs
new file mode 100644
index 0000000000..18c174d192
--- /dev/null
+++ b/runtime/fastly/crates/rust-lol-html/src/rewriter.rs
@@ -0,0 +1,136 @@
+use super::rewriter_builder::HtmlRewriterBuilder;
+use super::*;
+use libc::c_void;
+
+// NOTE: we use `ExternOutputSink` proxy type, because we need an
+// existential type parameter for the `HtmlRewriter` and FnMut can't
+// be used as such since it's a trait.
+pub struct ExternOutputSink {
+    handler: unsafe extern "C" fn(*const c_char, size_t, *mut c_void),
+    user_data: *mut c_void,
+}
+
+/// This is a wrapper around `lol_html::HtmlRewriter` which allows
+/// use after the rewriter itself is dropped.
+pub struct HtmlRewriter(Option<lol_html::HtmlRewriter<'static, ExternOutputSink>>);
+
+impl ExternOutputSink {
+    #[inline]
+    fn new(
+        handler: unsafe extern "C" fn(*const c_char, size_t, *mut c_void),
+        user_data: *mut c_void,
+    ) -> Self {
+        Self { handler, user_data }
+    }
+}
+
+impl OutputSink for ExternOutputSink {
+    #[inline]
+    fn handle_chunk(&mut self, chunk: &[u8]) {
+        let chunk_len = chunk.len();
+        let chunk = chunk.as_ptr().cast::<c_char>();
+
+        unsafe { (self.handler)(chunk, chunk_len, self.user_data) };
+    }
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_rewriter_build(
+    builder: *mut HtmlRewriterBuilder,
+    encoding: *const c_char,
+    encoding_len: size_t,
+    memory_settings: MemorySettings,
+    output_sink: unsafe extern "C" fn(*const c_char, size_t, *mut c_void),
+    output_sink_user_data: *mut c_void,
+    strict: bool,
+) -> *mut HtmlRewriter {
+    let builder = to_ref!(builder);
+    let handlers = builder.get_safe_handlers();
+
+    let maybe_encoding =
+        encoding_rs::Encoding::for_label_no_replacement(to_bytes!(encoding, encoding_len));
+    let encoding = unwrap_or_ret_null! 
{ maybe_encoding.ok_or(EncodingError::UnknownEncoding) };
+    let settings = Settings {
+        element_content_handlers: handlers.element,
+        document_content_handlers: handlers.document,
+        encoding: unwrap_or_ret_null! { encoding.try_into().or(Err(EncodingError::NonAsciiCompatibleEncoding)) },
+        memory_settings,
+        strict,
+        enable_esi_tags: false, // set to `true` only by `unstable_lol_html_rewriter_build_with_esi_tags`
+        adjust_charset_on_meta_tag: false,
+    };
+
+    let output_sink = ExternOutputSink::new(output_sink, output_sink_user_data);
+    let rewriter = lol_html::HtmlRewriter::new(settings, output_sink);
+
+    to_ptr_mut(HtmlRewriter(Some(rewriter)))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn unstable_lol_html_rewriter_build_with_esi_tags(
+    builder: *mut HtmlRewriterBuilder,
+    encoding: *const c_char,
+    encoding_len: size_t,
+    memory_settings: MemorySettings,
+    output_sink: unsafe extern "C" fn(*const c_char, size_t, *mut c_void),
+    output_sink_user_data: *mut c_void,
+    strict: bool,
+) -> *mut HtmlRewriter {
+    let builder = to_ref!(builder);
+    let handlers = builder.get_safe_handlers();
+
+    let maybe_encoding =
+        encoding_rs::Encoding::for_label_no_replacement(to_bytes!(encoding, encoding_len));
+    let encoding = unwrap_or_ret_null! { maybe_encoding.ok_or(EncodingError::UnknownEncoding) };
+    let settings = Settings {
+        element_content_handlers: handlers.element,
+        document_content_handlers: handlers.document,
+        encoding: unwrap_or_ret_null! 
{ encoding.try_into().or(Err(EncodingError::NonAsciiCompatibleEncoding)) },
+        memory_settings,
+        strict,
+        enable_esi_tags: true, // the only setting that differs from `lol_html_rewriter_build`
+        adjust_charset_on_meta_tag: false,
+    };
+
+    let output_sink = ExternOutputSink::new(output_sink, output_sink_user_data);
+    let rewriter = lol_html::HtmlRewriter::new(settings, output_sink);
+
+    to_ptr_mut(HtmlRewriter(Some(rewriter)))
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_rewriter_write(
+    rewriter: *mut HtmlRewriter,
+    chunk: *const c_char,
+    chunk_len: size_t,
+) -> c_int {
+    let chunk = to_bytes!(chunk, chunk_len);
+    let rewriter = to_ref_mut!(rewriter)
+        .0
+        .as_mut()
+        .expect("cannot call `lol_html_rewriter_write` after calling `end()`");
+
+    unwrap_or_ret_err_code! { rewriter.write(chunk) };
+
+    0
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_rewriter_end(rewriter: *mut HtmlRewriter) -> c_int {
+    let rewriter = to_ref_mut!(rewriter)
+        .0
+        .take() // Using `take()` allows calling `free()` afterwards (it will be a no-op).
+        .expect("cannot call `lol_html_rewriter_end` after calling `end()`");
+
+    unwrap_or_ret_err_code! { rewriter.end() };
+
+    0
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_rewriter_free(rewriter: *mut HtmlRewriter) {
+    // SAFETY: `to_box` includes a check that `rewriter` is non-null.
+    // The caller is required to ensure that `rewriter` is aligned and that `free` has not been called before.
+    // NOTE(review): `end()` only takes the inner rewriter out of the wrapper; the wrapper box is still rebuilt and dropped here, so `free()` should be called exactly once even after `end()`.
+    drop(to_box!(rewriter));
+}
diff --git a/runtime/fastly/crates/rust-lol-html/src/rewriter_builder.rs b/runtime/fastly/crates/rust-lol-html/src/rewriter_builder.rs
new file mode 100644
index 0000000000..037878923f
--- /dev/null
+++ b/runtime/fastly/crates/rust-lol-html/src/rewriter_builder.rs
@@ -0,0 +1,179 @@
+use super::*;
+use libc::c_void;
+use std::borrow::Cow;
+
+#[repr(C)]
+pub enum RewriterDirective {
+    Continue,
+    Stop,
+}
+
+type ElementHandler = unsafe extern "C" fn(*mut Element, *mut c_void) -> RewriterDirective;
+type DoctypeHandler = unsafe extern "C" fn(*mut Doctype, *mut c_void) -> RewriterDirective;
+type CommentsHandler = unsafe extern "C" fn(*mut Comment, *mut c_void) -> RewriterDirective;
+type TextHandler = unsafe extern "C" fn(*mut TextChunk, *mut c_void) -> RewriterDirective;
+type DocumentEndHandler = unsafe extern "C" fn(*mut DocumentEnd, *mut c_void) -> RewriterDirective;
+
+struct ExternHandler<F> {
+    func: Option<F>,
+    user_data: *mut c_void,
+}
+
+impl<F> ExternHandler<F> {
+    const fn new(func: Option<F>, user_data: *mut c_void) -> Self {
+        Self { func, user_data }
+    }
+}
+
+macro_rules! add_handler {
+    ($handlers:ident, $el_ty:ident, $self:ident.$ty:ident) => {{
+        if let Some(handler) = $self.$ty.func {
+            // NOTE: the closure actually holds a reference to the content
+            // handler object, but since we pass the object to the C side this
+            // ownership information gets erased.
+            // It's not a problem since handler is an extern static function that
+            // will remain intact even if Rust-side builder object gets freed.
+            // However, it's not a case for the user data pointer, it might become
+            // invalid if content handlers object that holds it gets freed before
+            // a handler invocation. Therefore, we close on a local variable instead
+            // of structure field.
+            let user_data = $self.$ty.user_data;
+
+            $handlers =
+                $handlers.$ty(
+                    move |arg: &mut $el_ty| match unsafe { handler(arg, user_data) } {
+                        RewriterDirective::Continue => Ok(()),
+                        RewriterDirective::Stop => Err("The rewriter has been stopped.".into()),
+                    },
+                );
+        }
+    }};
+}
+
+pub struct ExternDocumentContentHandlers {
+    doctype: ExternHandler<DoctypeHandler>,
+    comments: ExternHandler<CommentsHandler>,
+    text: ExternHandler<TextHandler>,
+    end: ExternHandler<DocumentEndHandler>,
+}
+
+impl ExternDocumentContentHandlers {
+    #[must_use]
+    pub fn as_safe_document_content_handlers(&self) -> DocumentContentHandlers<'_> {
+        let mut handlers = DocumentContentHandlers::default();
+
+        add_handler!(handlers, Doctype, self.doctype);
+        add_handler!(handlers, Comment, self.comments);
+        add_handler!(handlers, TextChunk, self.text);
+        add_handler!(handlers, DocumentEnd, self.end);
+
+        handlers
+    }
+}
+
+pub struct ExternElementContentHandlers {
+    element: ExternHandler<ElementHandler>,
+    comments: ExternHandler<CommentsHandler>,
+    text: ExternHandler<TextHandler>,
+}
+
+impl ExternElementContentHandlers {
+    #[must_use]
+    pub fn as_safe_element_content_handlers(&self) -> ElementContentHandlers<'_> {
+        let mut handlers = ElementContentHandlers::default();
+
+        add_handler!(handlers, Element, self.element);
+        add_handler!(handlers, Comment, self.comments);
+        add_handler!(handlers, TextChunk, self.text);
+
+        handlers
+    }
+}
+
+pub struct SafeContentHandlers<'b> {
+    pub document: Vec<DocumentContentHandlers<'b>>,
+    pub element: Vec<(Cow<'b, Selector>, ElementContentHandlers<'b>)>,
+}
+
+#[derive(Default)]
+pub struct HtmlRewriterBuilder {
+    document_content_handlers: Vec<ExternDocumentContentHandlers>,
+    element_content_handlers: Vec<(&'static Selector, ExternElementContentHandlers)>,
+}
+
+impl HtmlRewriterBuilder {
+    #[must_use]
+    pub fn get_safe_handlers(&self) -> SafeContentHandlers<'_> {
+        SafeContentHandlers {
+            document: self
+                .document_content_handlers
+                .iter()
+                .map(|h| h.as_safe_document_content_handlers())
+                .collect(),
+            element: self
+                .element_content_handlers
+                .iter()
+                .map(|(s, h)| (Cow::Borrowed(*s), 
h.as_safe_element_content_handlers()))
+                .collect(),
+        }
+    }
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_rewriter_builder_new() -> *mut HtmlRewriterBuilder {
+    to_ptr_mut(HtmlRewriterBuilder::default())
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_rewriter_builder_add_document_content_handlers(
+    builder: *mut HtmlRewriterBuilder,
+    doctype_handler: Option<DoctypeHandler>,
+    doctype_handler_user_data: *mut c_void,
+    comments_handler: Option<CommentsHandler>,
+    comments_handler_user_data: *mut c_void,
+    text_handler: Option<TextHandler>,
+    text_handler_user_data: *mut c_void,
+    document_end_handler: Option<DocumentEndHandler>,
+    document_end_handler_user_data: *mut c_void,
+) {
+    let builder = to_ref_mut!(builder);
+
+    let handlers = ExternDocumentContentHandlers {
+        doctype: ExternHandler::new(doctype_handler, doctype_handler_user_data),
+        comments: ExternHandler::new(comments_handler, comments_handler_user_data),
+        text: ExternHandler::new(text_handler, text_handler_user_data),
+        end: ExternHandler::new(document_end_handler, document_end_handler_user_data),
+    };
+
+    builder.document_content_handlers.push(handlers);
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_rewriter_builder_add_element_content_handlers(
+    builder: *mut HtmlRewriterBuilder,
+    selector: *const Selector,
+    element_handler: Option<ElementHandler>,
+    element_handler_user_data: *mut c_void,
+    comments_handler: Option<CommentsHandler>,
+    comments_handler_user_data: *mut c_void,
+    text_handler: Option<TextHandler>,
+    text_handler_user_data: *mut c_void,
+) -> c_int {
+    let selector = to_ref!(selector);
+    let builder = to_ref_mut!(builder);
+
+    let handlers = ExternElementContentHandlers {
+        element: ExternHandler::new(element_handler, element_handler_user_data),
+        comments: ExternHandler::new(comments_handler, comments_handler_user_data),
+        text: ExternHandler::new(text_handler, text_handler_user_data),
+    };
+
+    builder.element_content_handlers.push((selector, handlers));
+
+    0
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_rewriter_builder_free(builder: *mut 
HtmlRewriterBuilder) {
+    drop(to_box!(builder));
+}
diff --git a/runtime/fastly/crates/rust-lol-html/src/selector.rs b/runtime/fastly/crates/rust-lol-html/src/selector.rs
new file mode 100644
index 0000000000..9e39963f37
--- /dev/null
+++ b/runtime/fastly/crates/rust-lol-html/src/selector.rs
@@ -0,0 +1,17 @@
+use super::*;
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_selector_parse(
+    selector: *const c_char,
+    selector_len: size_t,
+) -> *mut Selector {
+    let selector = unwrap_or_ret_null! { to_str!(selector, selector_len) };
+    let selector = unwrap_or_ret_null! { selector.parse::<Selector>() };
+
+    to_ptr_mut(selector)
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_selector_free(selector: *mut Selector) {
+    drop(to_box!(selector));
+}
diff --git a/runtime/fastly/crates/rust-lol-html/src/streaming.rs b/runtime/fastly/crates/rust-lol-html/src/streaming.rs
new file mode 100644
index 0000000000..f827c96e37
--- /dev/null
+++ b/runtime/fastly/crates/rust-lol-html/src/streaming.rs
@@ -0,0 +1,117 @@
+use super::*;
+use crate::errors::CStreamingHandlerError;
+use lol_html::html_content::StreamingHandler;
+use lol_html::html_content::StreamingHandlerSink;
+
+/// Opaque type from C's perspective
+pub type CStreamingHandlerSink<'tmp> = StreamingHandlerSink<'tmp>;
+
+/// Write another piece of UTF-8 data to the output. Returns `0` on success, and `-1` if it wasn't valid UTF-8.
+/// All pointers must be non-NULL.
+#[no_mangle]
+pub unsafe extern "C" fn lol_html_streaming_sink_write_str(
+    sink: *mut CStreamingHandlerSink<'_>,
+    string_utf8: *const c_char,
+    string_utf8_len: size_t,
+    is_html: bool,
+) -> c_int {
+    let sink = to_ref_mut!(sink);
+    let content = unwrap_or_ret_err_code! 
{ to_str!(string_utf8, string_utf8_len) }; + let is_html = if is_html { + ContentType::Html + } else { + ContentType::Text + }; + + sink.write_str(content, is_html); + 0 +} + +/// [`StreamingHandlerSink::write_utf8_chunk`] +/// +/// Writes as much of the given UTF-8 fragment as possible, converting the encoding and HTML-escaping if `is_html` is `false`. +/// +/// The `bytes_utf8` doesn't need to be a complete UTF-8 string, as long as consecutive calls to this function create a valid UTF-8 string. +/// Any incomplete UTF-8 sequence at the end of the content is buffered and flushed as soon as it's completed. +/// +/// Other functions like [`lol_html_streaming_sink_write_str`] should not be called after a +/// `lol_html_streaming_sink_write_utf8_chunk` call with an incomplete UTF-8 sequence. +/// +/// Returns `0` on success, and `-1` if it wasn't valid UTF-8. +/// All pointers must be non-`NULL`. +#[no_mangle] +pub unsafe extern "C" fn lol_html_streaming_sink_write_utf8_chunk( + sink: *mut CStreamingHandlerSink<'_>, + bytes_utf8: *const c_char, + bytes_utf8_len: size_t, + is_html: bool, +) -> c_int { + let sink = to_ref_mut!(sink); + let content = to_bytes!(bytes_utf8, bytes_utf8_len); + let is_html = if is_html { + ContentType::Html + } else { + ContentType::Text + }; + + unwrap_or_ret_err_code! { sink.write_utf8_chunk(content, is_html) }; + 0 +} + +/// Safety: the user data and the callbacks must be safe to use from a different thread (e.g. can't rely on thread-local storage). +/// +/// It doesn't have to be `Sync`, it will be used only by one thread at a time. +/// +/// Handler functions copy this struct. It can (and should) be created on the stack. +#[repr(C)] +pub struct CStreamingHandler { + /// Anything you like + pub user_data: *mut c_void, + /// Called when the handler is supposed to produce its output. Return `0` for success. + /// The `sink` argument is guaranteed non-`NULL`. 
It is valid only for the duration of this call, and can only be used on the same thread. + /// The sink is for [`lol_html_streaming_sink_write_str`] and [`lol_html_streaming_sink_write_utf8_chunk`]. + /// `user_data` comes from this struct. + /// `write_all_callback` must not be `NULL`. + pub write_all_callback: Option< + unsafe extern "C" fn(sink: &mut CStreamingHandlerSink<'_>, user_data: *mut c_void) -> c_int, + >, + /// Called exactly once, after the last use of this handler. + /// `user_data` comes from this struct. + /// May be `NULL`. + pub drop_callback: Option, + /// *Always* initialize to `NULL`. + pub reserved: *mut c_void, +} + +// It's up to C to obey this +unsafe impl Send for CStreamingHandler {} + +impl StreamingHandler for CStreamingHandler { + fn write_all( + self: Box, + sink: &mut StreamingHandlerSink<'_>, + ) -> Result<(), Box<(dyn std::error::Error + Send + Sync)>> { + if !self.reserved.is_null() { + return Err(CStreamingHandlerError::Uninitialized.into()); + } + let cb = self + .write_all_callback + .ok_or(CStreamingHandlerError::Uninitialized)?; + let res = unsafe { (cb)(sink, self.user_data) }; + if res == 0 { + Ok(()) + } else { + Err(CStreamingHandlerError::HandlerError(res).into()) + } + } +} + +impl Drop for CStreamingHandler { + fn drop(&mut self) { + if let Some(cb) = self.drop_callback { + unsafe { + cb(self.user_data); + } + } + } +} diff --git a/runtime/fastly/crates/rust-lol-html/src/string.rs b/runtime/fastly/crates/rust-lol-html/src/string.rs new file mode 100644 index 0000000000..1794d037c9 --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/src/string.rs @@ -0,0 +1,50 @@ +use super::*; + +// NOTE: we don't use CStr and CString as the transfer type because UTF8 +// strings coming from both sides can contain interior NULLs. 
+#[repr(C)] +pub struct Str { + data: *const c_char, + len: size_t, +} + +impl Str { + #[must_use] + pub fn new(string: String) -> Self { + Self { + len: string.len(), + data: Box::into_raw(string.into_boxed_str()) as *const c_char, + } + } + + /// Convert an `Option` to a C-style string. + /// + /// If `string` is `None`, `data` will be set to `NULL`. + #[inline] + #[must_use] + pub fn from_opt(string: Option) -> Self { + match string { + Some(string) => Self::new(string), + None => Self { + data: ptr::null(), + len: 0, + }, + } + } +} + +impl Drop for Str { + fn drop(&mut self) { + if self.data.is_null() { + return; + } + let bytes = unsafe { slice::from_raw_parts_mut(self.data.cast_mut(), self.len) }; + + drop(unsafe { Box::from_raw(bytes) }); + } +} + +#[no_mangle] +pub unsafe extern "C" fn lol_html_str_free(string: Str) { + drop(string); +} diff --git a/runtime/fastly/crates/rust-lol-html/src/text_chunk.rs b/runtime/fastly/crates/rust-lol-html/src/text_chunk.rs new file mode 100644 index 0000000000..2b9475e701 --- /dev/null +++ b/runtime/fastly/crates/rust-lol-html/src/text_chunk.rs @@ -0,0 +1,85 @@ +use super::*; + +#[repr(C)] +pub struct TextChunkContent { + data: *const c_char, + len: size_t, +} + +impl TextChunkContent { + fn new(chunk: &TextChunk) -> Self { + let content = chunk.as_str(); + + Self { + data: content.as_ptr().cast::(), + len: content.len(), + } + } +} + +/// Returns a fat pointer to the UTF8 representation of content of the chunk. +/// +/// If the chunk is last in the current text node then content can be an empty string. +/// +/// WARNING: The pointer is valid only during the handler execution and +/// should never be leaked outside of handlers. +#[no_mangle] +pub unsafe extern "C" fn lol_html_text_chunk_content_get( + chunk: *mut TextChunk, +) -> TextChunkContent { + TextChunkContent::new(to_ref!(chunk)) +} + +impl_content_mutation_handlers! 
{ text_chunk: TextChunk [ + /// Inserts the content string before the text chunk either as raw text or as HTML. + /// + /// Content should be a valid UTF8-string. + /// + /// Returns 0 in case of success and -1 otherwise. The actual error message + /// can be obtained using `lol_html_take_last_error` function. + lol_html_text_chunk_before => before, + /// Inserts the content string after the text chunk either as raw text or as HTML. + /// + /// Content should be a valid UTF8-string. + /// + /// Returns 0 in case of success and -1 otherwise. The actual error message + /// can be obtained using `lol_html_take_last_error` function. + lol_html_text_chunk_after => after, + /// Replace the text chunk with the content of the string which is interpreted + /// either as raw text or as HTML. + /// + /// Content should be a valid UTF8-string. + /// + /// Returns 0 in case of success and -1 otherwise. The actual error message + /// can be obtained using `lol_html_take_last_error` function. + lol_html_text_chunk_replace => replace, + /// Removes the text chunk. + @VOID lol_html_text_chunk_remove => remove, + /// Returns `true` if the text chunk has been removed. + @BOOL lol_html_text_chunk_is_removed => removed, + /// Returns `true` if the chunk is last in the current text node. + @BOOL lol_html_text_chunk_is_last_in_text_node => last_in_text_node, + @STREAM lol_html_text_chunk_streaming_before => streaming_before, + @STREAM lol_html_text_chunk_streaming_after => streaming_after, + @STREAM lol_html_text_chunk_streaming_replace => streaming_replace, + lol_html_text_chunk_source_location_bytes => source_location_bytes, +] } + +/// Attaches custom user data to the text chunk. +/// +/// The same text chunk can be passed to multiple handlers if it has been +/// captured by multiple selectors. It might be handy to store some processing +/// state on the chunk, so it can be shared between handlers. 
+#[no_mangle] +pub unsafe extern "C" fn lol_html_text_chunk_user_data_set( + chunk: *mut TextChunk, + user_data: *mut c_void, +) { + to_ref_mut!(chunk).set_user_data(user_data); +} + +/// Returns user data attached to the text chunk. +#[no_mangle] +pub unsafe extern "C" fn lol_html_text_chunk_user_data_get(chunk: *const TextChunk) -> *mut c_void { + get_user_data!(chunk) +} diff --git a/src/bundle.js b/src/bundle.js index 9720395a1a..3050b17760 100644 --- a/src/bundle.js +++ b/src/bundle.js @@ -125,6 +125,11 @@ export const TransactionCacheEntry = globalThis.TransactionCacheEntry; contents: `export const { purgeSurrogateKey, vCpuTime } = globalThis.fastly;`, }; } + case 'html-rewriter': { + return { + contents: `export const HTMLRewritingStream = globalThis.HTMLRewritingStream;`, + }; + } } }); }, diff --git a/types/html-rewriter.d.ts b/types/html-rewriter.d.ts new file mode 100644 index 0000000000..d8cc3fa247 --- /dev/null +++ b/types/html-rewriter.d.ts @@ -0,0 +1,95 @@ +declare module 'fastly:html-rewriter' { + + /** + * Stream for rewriting HTML content. + */ + export class HTMLRewritingStream implements TransformStream { + constructor(); + /** + * Registers a callback for elements matching the selector. + * @param selector CSS selector string + * @param handler Function called with each matching Element + * @returns The HTMLRewritingStream instance for chaining + * @throws {Error} If the selector or handler is invalid + */ + onElement(selector: string, handler: (element: Element) => void): this; + + /** + * The writable stream to which HTML content should be written. + */ + writable: WritableStream; + /** + * The readable stream from which transformed HTML content can be read. + */ + readable: ReadableStream; + } + + /** + * Options for rewriting HTML elements. + */ + export interface ElementRewriterOptions { + /** + * Whether to escape HTML in rewritten content. + */ + escapeHTML?: boolean; + } + + /** + * Represents an HTML element in the rewriting stream. 
+ */ + export class Element { + /** + * Sets an attribute on the element. + * @param name Attribute name + * @param value Attribute value + */ + setAttribute(name: string, value: string): void; + /** + * Gets the value of an attribute. + * @param name Attribute name + * @returns Attribute value or null if not present + */ + getAttribute(name: string): string | null; + /** + * Removes an attribute from the element. + * @param name Attribute name + */ + removeAttribute(name: string): void; + /** + * Replaces the element with new content. + * @param content Replacement HTML or text + * @param options Optional rewriting options + */ + replaceWith(content: string, options?: ElementRewriterOptions): void; + /** + * Replaces the element's children with new content. + * @param content Replacement HTML or text + * @param options Optional rewriting options + */ + replaceChildren(content: string, options?: ElementRewriterOptions): void; + /** + * Inserts content before the element. + * @param content HTML or text to insert + * @param options Optional rewriting options + */ + before(content: string, options?: ElementRewriterOptions): void; + /** + * Inserts content after the element. + * @param content HTML or text to insert + * @param options Optional rewriting options + */ + after(content: string, options?: ElementRewriterOptions): void; + /** + * Prepends content to the element's children. + * @param content HTML or text to prepend + * @param options Optional rewriting options + */ + prepend(content: string, options?: ElementRewriterOptions): void; + /** + * Appends content to the element's children. + * @param content HTML or text to append + * @param options Optional rewriting options + */ + append(content: string, options?: ElementRewriterOptions): void; + } +} \ No newline at end of file diff --git a/types/index.d.ts b/types/index.d.ts index 2ab7aa7046..4bdd2bc169 100644 --- a/types/index.d.ts +++ b/types/index.d.ts @@ -17,3 +17,4 @@ /// /// /// +///