From 6bcd609e62e48a1e7efabb80abfc67a1f88422ab Mon Sep 17 00:00:00 2001 From: Dean Mao Date: Tue, 1 May 2012 20:26:30 -0700 Subject: [PATCH 1/3] added doctype --- lib/htmlparser.js | 37 +++++++++++++++++++++++++++++++++++++ tests/parser.js | 5 +++++ 2 files changed, 42 insertions(+) diff --git a/lib/htmlparser.js b/lib/htmlparser.js index fbaa31d..7f8f8b1 100644 --- a/lib/htmlparser.js +++ b/lib/htmlparser.js @@ -49,6 +49,7 @@ var Mode = { Tag: 'tag', Attr: 'attr', CData: 'cdata', + Doctype: 'doctype', Comment: 'comment' }; @@ -136,6 +137,8 @@ function Parser (builder, options) { return this._parseAttr(this._state); case Mode.CData: return this._parseCData(this._state); + case Mode.Doctype: + return this._parseDoctype(this._state); case Mode.Comment: return this._parseComment(this._state); } @@ -224,6 +227,11 @@ function Parser (builder, options) { state.pos += 8; return; } + if (!match[1] && match[2].substr(0, 8) === '!DOCTYPE') { + state.mode = Mode.Doctype; + state.pos += 8; + return; + } if (!state.done && (state.pos + match[0].length) === state.data.length) { //We're at the and of the data, might be incomplete state.needData = true; @@ -400,6 +408,35 @@ function Parser (builder, options) { } }; + Parser.prototype._parseDoctype = function Parser$_parseDoctype () { + var state = this._state; + var foundPos = state.data.indexOf('>', state.pos); + if (foundPos < 0 && state.done) { + foundPos = state.data.length; + } + if (foundPos < 0) { + Parser.re_parseCData_findEnding.lastIndex = state.pos; + if (!state.pendingText) { + state.pendingText = []; + } + state.pendingText.push(state.data.substr(state.pos, state.data.length)); + state.pos = state.data.length; + state.needData = true; + } else { + var text; + if (state.pendingText) { + state.pendingText.push(state.data.substring(state.pos, foundPos)); + text = state.pendingText.join(''); + state.pendingText = null; + } else { + text = state.data.substring(state.pos, foundPos); + } + this._write({ type: Mode.Doctype, data: text }); + state.mode = Mode.Text; + state.pos = foundPos + 1; + } + }; + Parser.re_parseComment_findEnding = /\-{1,2}$/; Parser.prototype._parseComment = function Parser$_parseComment () { var state = this._state; diff --git a/tests/parser.js b/tests/parser.js index bf70ac1..b2df208 100644 --- a/tests/parser.js +++ b/tests/parser.js @@ -367,6 +367,11 @@ exports['html inside comment'] = { , expected: [{ type: 'comment', data: '
foo
'}] }; +exports['transitional doctype'] = { + data: [''] + , expected: [{ type: 'doctype', data: ' HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"'}] +}; + exports['html inside cdata'] = { data: ['foo ]]>'] , expected: [{ type: 'cdata', data: '
foo
'}] From 2267f32a02ef63bfd9ba241dfea74283c128cad7 Mon Sep 17 00:00:00 2001 From: Dean Mao Date: Tue, 1 May 2012 20:33:25 -0700 Subject: [PATCH 2/3] make test a little better --- tests/parser.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/parser.js b/tests/parser.js index b2df208..d00ee22 100644 --- a/tests/parser.js +++ b/tests/parser.js @@ -368,8 +368,12 @@ exports['html inside comment'] = { }; exports['transitional doctype'] = { - data: [''] - , expected: [{ type: 'doctype', data: ' HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"'}] + data: [''] + , expected: [ + { type: 'doctype', data: ' HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"'}, + { type: 'tag', name: 'html', raw: 'html' }, + { type: 'tag', name: '/html', raw: '/html' } + ] }; exports['html inside cdata'] = { From fdcdd0a3f421e7ca28aa10816b79e5d78b1cfba8 Mon Sep 17 00:00:00 2001 From: Dean Mao Date: Fri, 4 May 2012 10:23:56 -0700 Subject: [PATCH 3/3] fix attribute with no value --- lib/htmlparser.js | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/lib/htmlparser.js b/lib/htmlparser.js index 7f8f8b1..27eb8ba 100644 --- a/lib/htmlparser.js +++ b/lib/htmlparser.js @@ -349,6 +349,7 @@ function Parser (builder, options) { } state.pos += name_data.match.length; var value_data = this._parseAttr_findValue(state); + var end = state.data.indexOf(' ', state.pos); if (value_data) { if (!state.done && state.pos + value_data.match.length === state.data.length) { state.needData = true; @@ -357,16 +358,24 @@ function Parser (builder, options) { } state.pos += value_data.match.length; } else { - Parser.re_parseAttr_splitValue.lastIndex = state.pos; - if (Parser.re_parseAttr_splitValue.exec(state.data)) { - state.needData = true; - state.pos -= name_data.match.length; - return; + if (state.data.indexOf(' ', state.pos-1)) { + value_data = { + match: '' + , value: name_data.name + }; + + } else { + Parser.re_parseAttr_splitValue.lastIndex = state.pos; + if (Parser.re_parseAttr_splitValue.exec(state.data)) { + state.needData = true; + state.pos -= name_data.match.length; + return; + } + value_data = { + match: '' + , value: null + }; } - value_data = { - match: '' - , value: null - }; } state.lastTag.raw += name_data.match + value_data.match;