Skip to content

Commit d5405e4

Browse files
committed
fix: Normalize line breaks according to spec
> XML parsed entities are often stored in computer files which, for editing convenience, are organized into lines. These lines are typically separated by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
>
> To simplify the tasks of applications, the XML processor must behave as if it normalized all line breaks in external parsed entities (including the document entity) on input, before parsing, by translating both the two-character sequence #xD #xA and any #xD that is not followed by #xA to a single #xA character.

Where `#xD` == `\r` and `#xA` == `\n`, so:

- `\r\n` => `\n`
- `\n\r` => `\n\n`
- `\n` => `\n`
- `\r` => `\n`

BREAKING CHANGE: Certain combinations of line break characters are normalized before parsing takes place and will no longer be preserved. For details see https://www.w3.org/TR/xml/#sec-line-ends

fixes #303
1 parent 64647d1 commit d5405e4

File tree

5 files changed

+70
-10
lines changed

5 files changed

+70
-10
lines changed

lib/dom-parser.js

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,14 @@ DOMParser.prototype.parseFromString = function(source,mimeType){
3333
defaultNSMap[''] = NAMESPACE.HTML;
3434
}
3535
defaultNSMap.xml = defaultNSMap.xml || NAMESPACE.XML;
36-
if(source && typeof source === 'string'){
37-
sax.parse(source,defaultNSMap,entityMap);
38-
}else{
39-
sax.errorHandler.error("invalid doc source");
36+
if (source && typeof source === 'string') {
37+
sax.parse(
38+
source.replace(/\r\n/g, '\n').replace(/\r/g, '\n'), // https://www.w3.org/TR/xml/#sec-line-ends
39+
defaultNSMap,
40+
entityMap
41+
)
42+
} else {
43+
sax.errorHandler.error('invalid doc source')
4044
}
4145
return domBuilder.doc;
4246
}

test/parse/__snapshots__/test-doc-whitespace.test.js.snap

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Object {
88
}
99
`;
1010

11-
exports[`errorHandle test 1`] = `
11+
exports[`errorHandle should encode < literal when not part of a tag 1`] = `
1212
Object {
1313
"actual": "<p xmlns=\\"http://www.w3.org/1999/xhtml\\">populaciji (&lt; 0.1%), te se</p>",
1414
"error": Array [

test/parse/test-doc-whitespace.test.js

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ describe('errorHandle', () => {
2020
expect({ actual, ...errors }).toMatchSnapshot()
2121
})
2222

23-
it('test', () => {
23+
it('should encode < literal when not part of a tag', () => {
2424
const description = '<p>populaciji (< 0.1%), te se</p>'
2525
const { errors, parser } = getTestParser()
2626

@@ -29,3 +29,62 @@ describe('errorHandle', () => {
2929
expect({ actual, ...errors }).toMatchSnapshot()
3030
})
3131
})
32+
33+
describe('whitespace', () => {
34+
const whitespaceToHex = (str) =>
35+
str.replace(/\s/g, (c) => `#x${c.charCodeAt(0).toString(16)}`)
36+
it.each([
37+
[
38+
'in text node before first element',
39+
'\r\n<xml/>',
40+
(dom) => dom.firstChild.nodeValue,
41+
'#xa',
42+
],
43+
[
44+
'in attributes',
45+
'<xml attr="\r\n"/>',
46+
(dom) => dom.documentElement.getAttribute('attr'),
47+
'#xa',
48+
],
49+
[
50+
'in firstChild text node',
51+
'<xml>\x0D\x0A</xml>',
52+
(dom) => dom.documentElement.firstChild.nodeValue,
53+
'#xa',
54+
],
55+
])('should normalize "\\r\\n" %s', (_, xml, resolveNode, expected) => {
56+
const { parser } = getTestParser()
57+
58+
const dom = parser.parseFromString(xml, 'text/html')
59+
60+
expect(whitespaceToHex(resolveNode(dom))).toBe(expected)
61+
})
62+
it.each([
63+
[
64+
'before first node',
65+
'\r \n<xml/>',
66+
(dom) => dom.firstChild.nodeValue,
67+
'#xa#x20#xa',
68+
],
69+
[
70+
'in attributes',
71+
'<xml attr=" \r\r"/>',
72+
(dom) => dom.documentElement.getAttribute('attr'),
73+
'#x20#xa#xa',
74+
],
75+
[
76+
'in firstChild text node',
77+
'<xml>\x0A\x0D</xml>',
78+
(dom) => dom.documentElement.firstChild.nodeValue,
79+
// only the "inner" combination is replaced, which results in a new combination
80+
// (which would be normalized on the next roundtrip)
81+
'#xa#xa',
82+
],
83+
])('should normalize "\\r" not followed by "\\n" %s', (_, xml, resolveNode, expected) => {
84+
const { parser } = getTestParser()
85+
86+
const dom = parser.parseFromString(xml, 'text/html')
87+
88+
expect(whitespaceToHex(resolveNode(dom))).toBe(expected)
89+
})
90+
})

test/xmltest/__snapshots__/not-wf.test.js.snap

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,7 @@ exports[`xmltest/not-wellformed standalone should match 019.xml with snapshot 1`
152152
Object {
153153
"actual": "<doc/>",
154154
"error": Array [
155-
"[xmldom error] end tag name: >
156-
is not complete:undefined
155+
"[xmldom error] end tag name: > is not complete:undefined
157156
@#[line:1,col:1]",
158157
],
159158
"warning": Array [

test/xmltest/__snapshots__/valid.test.js.snap

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -782,8 +782,6 @@ Object {
782782
exports[`xmltest/valid standalone should match 098.xml with snapshot 1`] = `
783783
Object {
784784
"actual": "<doc><?pi x
785-
y?></doc>",
786-
"expected": "<doc><?pi x
787785
y?></doc>",
788786
}
789787
`;

0 commit comments

Comments
 (0)