fix: Normalize line breaks according to spec

karfau · karfau · commit d5405e48a56e · 2021-08-28T05:22:48.000+02:00
> XML parsed entities are often stored in computer files which, for editing convenience, are organized into lines. These lines are typically separated by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
>
> To simplify the tasks of applications, the XML processor must behave as if it normalized all line breaks in external parsed entities (including the document entity) on input, before parsing, by translating both the two-character sequence #xD #xA and any #xD that is not followed by #xA to a single #xA character.

Where `#xD` == `\r` and `#xA` == `\n`, so:

- `\r\n` => `\n`
- `\n\r` => `\n\n`
- `\n` => `\n`
- `\r` => `\n`

BREAKING CHANGE: Certain combinations of line break characters are normalized before parsing takes place and will no longer be preserved.

For details see https://www.w3.org/TR/xml/#sec-line-ends

fixes #303
diff --git a/lib/dom-parser.js b/lib/dom-parser.js
@@ -33,10 +33,14 @@ DOMParser.prototype.parseFromString = function(source,mimeType){
 		defaultNSMap[''] = NAMESPACE.HTML;
 	}
 	defaultNSMap.xml = defaultNSMap.xml || NAMESPACE.XML;
-	if(source && typeof source === 'string'){
-		sax.parse(source,defaultNSMap,entityMap);
-	}else{
-		sax.errorHandler.error("invalid doc source");
+	if (source && typeof source === 'string') {
+		sax.parse(
+			source.replace(/\r\n/g, '\n').replace(/\r/g, '\n'), // https://www.w3.org/TR/xml/#sec-line-ends
+			defaultNSMap,
+			entityMap
+		)
+	} else {
+		sax.errorHandler.error('invalid doc source')
 	}
 	return domBuilder.doc;
 }
diff --git a/test/parse/__snapshots__/test-doc-whitespace.test.js.snap b/test/parse/__snapshots__/test-doc-whitespace.test.js.snap
@@ -8,7 +8,7 @@ Object {
 }
 `;
 
-exports[`errorHandle test 1`] = `
+exports[`errorHandle should encode < literal when not part of a tag 1`] = `
 Object {
   "actual": "<p xmlns=\\"http://www.w3.org/1999/xhtml\\">populaciji (&lt; 0.1%), te se</p>",
   "error": Array [
diff --git a/test/parse/test-doc-whitespace.test.js b/test/parse/test-doc-whitespace.test.js
@@ -20,7 +20,7 @@ describe('errorHandle', () => {
 		expect({ actual, ...errors }).toMatchSnapshot()
 	})
 
-	it('test', () => {
+	it('should encode < literal when not part of a tag', () => {
 		const description = '<p>populaciji (< 0.1%), te se</p>'
 		const { errors, parser } = getTestParser()
 
@@ -29,3 +29,62 @@ describe('errorHandle', () => {
 		expect({ actual, ...errors }).toMatchSnapshot()
 	})
 })
+
+describe('whitespace', () => {
+	const whitespaceToHex = (str) =>
+		str.replace(/\s/g, (c) => `#x${c.charCodeAt(0).toString(16)}`)
+	it.each([
+		[
+			'in text node before first element',
+			'\r\n<xml/>',
+			(dom) => dom.firstChild.nodeValue,
+			'#xa',
+		],
+		[
+			'in attributes',
+			'<xml attr="\r\n"/>',
+			(dom) => dom.documentElement.getAttribute('attr'),
+			'#xa',
+		],
+		[
+			'in firstChild text node',
+			'<xml>\x0D\x0A</xml>',
+			(dom) => dom.documentElement.firstChild.nodeValue,
+			'#xa',
+		],
+	])('should normalize "\\r\\n" %s', (_, xml, resolveNode, expected) => {
+		const { parser } = getTestParser()
+
+		const dom = parser.parseFromString(xml, 'text/html')
+
+		expect(whitespaceToHex(resolveNode(dom))).toBe(expected)
+	})
+	it.each([
+		[
+			'before first node',
+			'\r \n<xml/>',
+			(dom) => dom.firstChild.nodeValue,
+			'#xa#x20#xa',
+		],
+		[
+			'in attributes',
+			'<xml attr=" \r\r"/>',
+			(dom) => dom.documentElement.getAttribute('attr'),
+			'#x20#xa#xa',
+		],
+		[
+			'in firstChild text node',
+			'<xml>\x0A\x0D</xml>',
+			(dom) => dom.documentElement.firstChild.nodeValue,
+			// only the "inner" combination is replaced, which results in a new combination
+			// (which would be normalized on the next roundtrip)
+			'#xa#xa',
+		],
+	])('should normalize "\\r" not followed by "\\n" %s', (_, xml, resolveNode, expected) => {
+		const { parser } = getTestParser()
+
+		const dom = parser.parseFromString(xml, 'text/html')
+
+		expect(whitespaceToHex(resolveNode(dom))).toBe(expected)
+	})
+})
diff --git a/test/xmltest/__snapshots__/not-wf.test.js.snap b/test/xmltest/__snapshots__/not-wf.test.js.snap
@@ -152,8 +152,7 @@ exports[`xmltest/not-wellformed standalone should match 019.xml with snapshot 1`
 Object {
   "actual": "<doc/>",
   "error": Array [
-    "[xmldom error]	end tag name: >
- is not complete:undefined
+    "[xmldom error]	end tag name: > is not complete:undefined
 @#[line:1,col:1]",
   ],
   "warning": Array [
diff --git a/test/xmltest/__snapshots__/valid.test.js.snap b/test/xmltest/__snapshots__/valid.test.js.snap
@@ -782,8 +782,6 @@ Object {
 exports[`xmltest/valid standalone should match 098.xml with snapshot 1`] = `
 Object {
   "actual": "<doc><?pi x
-y?></doc>",
-  "expected": "<doc><?pi x
 y?></doc>",
 }
 `;

Original file line number	Diff line number	Diff line change
`@@ -33,10 +33,14 @@ DOMParser.prototype.parseFromString = function(source,mimeType){`
`33`	`33`	`defaultNSMap[''] = NAMESPACE.HTML;`
`34`	`34`	`}`
`35`	`35`	`defaultNSMap.xml = defaultNSMap.xml \|\| NAMESPACE.XML;`
`36`		`- if(source && typeof source === 'string'){`
`37`		`- sax.parse(source,defaultNSMap,entityMap);`
`38`		`- }else{`
`39`		`- sax.errorHandler.error("invalid doc source");`
	`36`	`+ if (source && typeof source === 'string') {`
	`37`	`+ sax.parse(`
	`38`	`+ source.replace(/\r\n/g, '\n').replace(/\r/g, '\n'), // https://www.w3.org/TR/xml/#sec-line-ends`
	`39`	`+ defaultNSMap,`
	`40`	`+ entityMap`
	`41`	`+ )`
	`42`	`+ } else {`
	`43`	`+ sax.errorHandler.error('invalid doc source')`
`40`	`44`	`}`
`41`	`45`	`return domBuilder.doc;`
`42`	`46`	`}`
Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@ Object {`
`8`	`8`	`}`
`9`	`9`	`;
`10`	`10`
`11`		-exports[`errorHandle test 1`] = `
	`11`	+exports[`errorHandle should encode < literal when not part of a tag 1`] = `
`12`	`12`	`Object {`
`13`	`13`	`"actual": "<p xmlns=\\"http://www.w3.org/1999/xhtml\\">populaciji (&lt; 0.1%), te se</p>",`
`14`	`14`	`"error": Array [`
Original file line number	Diff line number	Diff line change
`@@ -782,8 +782,6 @@ Object {`
`782`	`782`	exports[`xmltest/valid standalone should match 098.xml with snapshot 1`] = `
`783`	`783`	`Object {`
`784`	`784`	`"actual": "<doc><?pi x`
`785`		`-y?></doc>",`
`786`		`- "expected": "<doc><?pi x`
`787`	`785`	`y?></doc>",`
`788`	`786`	`}`
`789`	`787`	`;