Skip to content

Commit 7e36b2e

Browse files
dojsudheerhebbalemanoelcampos
authored
Fix #79 :: Dom handler (PR #86)
* Correctly set a ROOT node as the first node of the dom object. Set the DECL node as the first child node of ROOT. Fix indent in dom:toXml(). * Fix detection of an empty ELEMENT node. * XmlParser: fix parsing of a DTD element. dom handler: fix handling of a DTD element. * Fix dom.lua docs. * Rename people.xml to people1.xml. Move complex tags (DOCTYPE and CDATA) to people2.xml to make people1 a basic XML. * Restructure example5.lua to parse the XML files whose names are defined by an array inside the example, instead of receiving them from STDIN. This way, the entry to run this example inside the Makefile was removed. Running a specific example inside the Makefile is too specific. If that was for test purposes, it should be inside the test files. --------- Signed-off-by: Manoel Campos <[email protected]> Co-authored-by: Sudheer Hebbale <[email protected]> Co-authored-by: Manoel Campos <[email protected]>
1 parent f6cf04b commit 7e36b2e

File tree

10 files changed

+204
-34
lines changed

10 files changed

+204
-34
lines changed

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,7 @@ test:
77
lint:
88
docker-compose run --rm lint
99

10-
.PHONY: lint test all
10+
clean:
11+
find . -name '*~' -delete
12+
13+
.PHONY: lint test all example5 clean

README.adoc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ local xml = [[
8080
<person type="legal">
8181
<name>University of Brasília</name>
8282
<city>Brasília-DF</city>
83-
</person>
84-
</people>
83+
</person>
84+
</people>
8585
]]
8686
8787
--Instantiates the XML parser
@@ -121,7 +121,7 @@ Execute `lua testxml.lua -help` on the terminal for more details.
121121

122122
== Running tests
123123

124-
=== Requeriments
124+
=== Requirements
125125

126126
You must have https://docs.docker.com/compose/install/[installed docker and docker compose].
127127

XmlParser.lua

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ local function hexadecimalToHtmlChar(code)
4343
end
4444

4545
local XmlParser = {
46-
-- Private attribures/functions
46+
-- Private attributes/functions
4747
_XML = '^([^<]*)<(%/?)([^>]-)(%/?)>',
4848
_ATTR1 = '([%w-:_]+)%s*=%s*"(.-)"',
4949
_ATTR2 = '([%w-:_]+)%s*=%s*\'(.-)\'',
@@ -56,10 +56,10 @@ local XmlParser = {
5656
_WS = '^%s*$',
5757
_DTD1 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*(%b[])%s*>',
5858
_DTD2 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*(%b[])%s*>',
59-
--_DTD3 = '<!DOCTYPE%s+(.-)%s*(%b[])%s*>',
60-
_DTD3 = '<!DOCTYPE%s.->',
59+
_DTD3 = '<!DOCTYPE%s+(.-)%s+%[%s+.-%]>', -- Inline DTD Schema
6160
_DTD4 = '<!DOCTYPE%s+(.-)%s+(SYSTEM)%s+["\'](.-)["\']%s*>',
6261
_DTD5 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s+["\'](.-)["\']%s*>',
62+
_DTD6 = '<!DOCTYPE%s+(.-)%s+(PUBLIC)%s+["\'](.-)["\']%s*>',
6363

6464
--Matches an attribute with non-closing double quotes (The equal sign is matched non-greedly by using =+?)
6565
_ATTRERR1 = '=+?%s*"[^"]*$',
@@ -246,7 +246,7 @@ end
246246

247247
local function _parseDtd(self, xml, pos)
248248
-- match,endMatch,root,type,name,uri,internal
249-
local dtdPatterns = {self._DTD1, self._DTD2, self._DTD3, self._DTD4, self._DTD5}
249+
local dtdPatterns = {self._DTD1, self._DTD2, self._DTD3, self._DTD4, self._DTD5, self._DTD6}
250250

251251
for _, dtd in pairs(dtdPatterns) do
252252
local m,e,r,t,n,u,i = string.find(xml, dtd, pos)

books.xml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
<?xml version="1.0"?>
22
<!-- Source: https://msdn.microsoft.com/en-us/library/ms762271(v=vs.85).aspx -->
3+
<!DOCTYPE name PUBLIC "-//Beginning XML//DTD Address Example//EN">
34
<catalog>
45
<book>
56
<author>Gambardella, Matthew</author>
@@ -30,4 +31,4 @@
3031
society in England, the young survivors lay the
3132
foundation for a new society.</description>
3233
</book>
33-
</catalog>
34+
</catalog>

example1.lua

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ print("xml2lua v" .. xml2lua._VERSION.."\n")
1010
local handler = require("xmlhandler.tree")
1111

1212

13-
local xml = xml2lua.loadFile("people.xml")
13+
local xml = xml2lua.loadFile("people1.xml")
1414

1515
--Instantiates the XML parser
1616
local parser = xml2lua.parser(handler)

example2.lua

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ print("xml2lua v" .. xml2lua._VERSION.."\n")
99
--Uses a handler that converts the XML to a Lua table
1010
local handler = require("xmlhandler.tree")
1111

12-
----------------------- people.xml parse code -----------------------
13-
print("people.xml")
12+
----------------------- people1.xml parse code -----------------------
13+
print("people1.xml")
1414
local peopleHandler = handler:new()
1515
local peopleParser = xml2lua.parser(peopleHandler)
16-
peopleParser:parse(xml2lua.loadFile("people.xml"))
16+
peopleParser:parse(xml2lua.loadFile("people1.xml"))
1717
xml2lua.printable(peopleHandler.root)
1818

1919
----------------------- books.xml parse code -----------------------

example5.lua

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/usr/bin/env lua
2+
-- Example 5: round-trips XML documents through the dom handler.
-- For every file listed below (documents containing DOCTYPE and CDATA tags),
-- the file is loaded, parsed with the dom handler and the resulting
-- tree is serialized back to XML and written to STDOUT.
local xml2lua = require("xml2lua")
local xmlhandler = require("xmlhandler.dom")

-- Names of the XML files to process, in this order.
local files = {"books.xml", "people2.xml"}

for index = 1, #files do
    local file = files[index]

    -- Header line separating the output of each document.
    print(file, "-----------------------------------------------------------")

    local xml = xml2lua.loadFile(file)

    -- A fresh handler/parser pair per document, so no state is shared.
    local dom = xmlhandler:new()
    local parser = xml2lua.parser(dom)
    parser:parse(xml)

    if dom.root then
        print(dom:toXml(dom.root))
    else
        -- The handler never produced a root node for this document.
        print("parsing ", file , " as XML failed")
    end
end

people1.xml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<people>
3+
<person type="natural">
4+
<!-- Just an example comment that will be ignored by the tree
5+
handler and processed by the other ones. -->
6+
7+
<name>Manoel</name>
8+
<city>Palmas-TO</city>
9+
</person>
10+
<person type="natural">
11+
<name>Breno</name>
12+
<city>Palmas-TO</city>
13+
</person>
14+
<person type="legal">
15+
<name>University of Brasília</name>
16+
<city>Brasília-DF</city>
17+
<empty></empty>
18+
<void/>
19+
</person>
20+
</people>

people.xml renamed to people2.xml

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,13 @@
11
<?xml version="1.0" encoding="UTF-8"?>
2+
<!-- A more complex people XML with DOCTYPE and CDATA tags -->
3+
4+
<!DOCTYPE person [
5+
<!ELEMENT person (name,city,empty,void)>
6+
<!ELEMENT name (#PCDATA)>
7+
<!ELEMENT city (#PCDATA)>
8+
<!ELEMENT void (#PCDATA)>
9+
<!ELEMENT empty (#PCDATA)>
10+
]>
211
<people>
312
<person type="natural">
413
<![CDATA[
@@ -7,8 +16,6 @@
716
Its content is extracted but not processed.
817
]]>
918

10-
<!-- Just an example comment that will be ignored by the tree handler and processed by the other ones. -->
11-
1219
<name>Manoel</name>
1320
<city>Palmas-TO</city>
1421
</person>
@@ -19,5 +26,7 @@
1926
<person type="legal">
2027
<name>University of Brasília</name>
2128
<city>Brasília-DF</city>
22-
</person>
29+
<empty></empty>
30+
<void/>
31+
</person>
2332
</people>

xmlhandler/dom.lua

Lines changed: 135 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ function dom:starttag(tag)
6262
_children = {}
6363
}
6464

65-
if self.root == nil then
65+
if not self.root then
6666
self.root = node
6767
end
6868

@@ -76,7 +76,7 @@ end
7676
-- @param tag a {name, attrs} table
7777
-- where name is the name of the tag and attrs
7878
-- is a table containing the attributes of the tag
79-
function dom:endtag(tag, s)
79+
function dom:endtag(tag)
8080
--Table representing the containing tag of the current tag
8181
local prev = self._stack[#self._stack]
8282

@@ -86,6 +86,22 @@ function dom:endtag(tag, s)
8686

8787
table.remove(self._stack)
8888
self.current = self._stack[#self._stack]
89+
if not self.current then
90+
local node = { _children = {}, _type = "ROOT" }
91+
if self.decl then
92+
table.insert(node._children, self.decl)
93+
self.decl = nil
94+
end
95+
if self.dtd then
96+
table.insert(node._children, self.dtd)
97+
self.dtd = nil
98+
end
99+
if self.root then
100+
table.insert(node._children, self.root)
101+
self.root = node
102+
end
103+
self.current = node
104+
end
89105
end
90106

91107
---Parses a tag content.
@@ -127,27 +143,128 @@ end
127143
-- where name is the name of the tag and attrs
128144
-- is a table containing the attributes of the tag
129145
function dom:decl(tag)
130-
if self.options.declNode then
131-
local node = { _type = "DECL",
132-
_name = tag.name,
133-
_attr = tag.attrs,
134-
}
135-
table.insert(self.current._children, node)
136-
end
146+
if self.options.declNode then
147+
self.decl = { _type = "DECL",
148+
_name = tag.name,
149+
_attr = tag.attrs,
150+
}
151+
end
137152
end
138153

139154
---Parses a DTD tag.
140-
-- @param tag a {name, attrs} table
141-
-- where name is the name of the tag and attrs
155+
-- @param tag a {name, value} table
156+
-- where name is the name of the tag and value
142157
-- is the DTD content, stored as the text (`_text`) of the DTD node
143158
function dom:dtd(tag)
144-
if self.options.dtdNode then
145-
local node = { _type = "DTD",
146-
_name = tag.name,
147-
_attr = tag.attrs,
148-
}
149-
table.insert(self.current._children, node)
150-
end
159+
if self.options.dtdNode then
160+
self.dtd = { _type = "DTD",
161+
_name = tag.name,
162+
_text = tag.value
163+
}
164+
end
165+
end
166+
167+
--- XML escape characters for a TEXT node.
-- Replaces `&`, `<` and `>` by their predefined XML entities. The ampersand
-- is replaced first, so the `&` of the entities produced by the following
-- substitutions is not escaped a second time.
-- @param s a string
-- @return @p s XML escaped. Exactly one value is returned: the substitution
-- count that string.gsub also yields is discarded.
local function xmlEscape(s)
    s = string.gsub(s, '&', '&amp;')
    s = string.gsub(s, '<', '&lt;')
    -- The extra parentheses truncate gsub's (string, count) result to the
    -- string, so the count cannot leak into a caller's argument list.
    return (string.gsub(s, '>', '&gt;'))
end
175+
176+
--- Return a string of XML attributes.
-- Every attribute is rendered as ` name="value"` (with a leading space) and
-- all attributes are concatenated. Attributes are emitted sorted by name,
-- because the traversal order of `pairs` is unspecified and the output
-- should be the same on every run.
-- @param tab table with XML attribute pairs. key and value are supposed to be strings.
-- @return a string: '' when @p tab is nil/false or has no entries, and a
-- 'BUG:unknown type:<type>' marker when @p tab is not a table.
local function attrsToStr(tab)
    if not tab then
        return ''
    end
    if type(tab) ~= 'table' then
        return 'BUG:unknown type:' .. type(tab)
    end

    -- Collect the attribute names and put them into a stable order.
    local names = {}
    for name in pairs(tab) do
        names[#names + 1] = name
    end
    table.sort(names, function(a, b)
        return tostring(a) < tostring(b)
    end)

    local parts = {}
    for i, name in ipairs(names) do
        -- determine a safe quote character
        local val = tostring(tab[name])
        local found_single_quote = string.find(val, "'", 1, true)
        local found_double_quote = string.find(val, '"', 1, true)
        local quot = '"'
        if found_single_quote and found_double_quote then
            -- XML escape both quote characters
            val = string.gsub(val, '"', '&quot;')
            val = string.gsub(val, "'", '&apos;')
        elseif found_double_quote then
            -- the value holds only double quotes: delimit with single quotes
            quot = "'"
        end
        -- Append to the result instead of overwriting it, so that every
        -- attribute is part of the output and not only the last one visited.
        parts[i] = ' ' .. tostring(name) .. '=' .. quot .. val .. quot
    end
    return table.concat(parts)
end
204+
205+
--- Return a XML formatted string of @p node.
-- Every rendered line is prefixed by `indentLevel + 2` spaces (no prefix when
-- that amount is not positive) and the children of a node are rendered with
-- `indentLevel + 2`. A ROOT node produces no line of its own, only the lines
-- of its children. Children are visited in sequence order (1..n), so the
-- document order of the nodes is kept.
-- @param node a Node object (table) of the xml2lua DOM tree structure.
-- @param indentLevel a number controlling the indentation of @p node.
-- @return a string. Problems are reported in-band, as a string starting with 'BUG:'.
local function toXmlStr(node, indentLevel)
    if not node then
        return 'BUG:node==nil'
    end
    if not node._type then
        return 'BUG:node._type==nil'
    end

    local nodeType = node._type
    -- string.rep yields '' for a count that is zero or negative.
    local indent = string.rep(' ', indentLevel + 2)

    if nodeType == 'ROOT' then
        local parts = {}
        for i, child in ipairs(node._children) do
            parts[i] = toXmlStr(child, indentLevel + 2)
        end
        return table.concat(parts)
    elseif nodeType == 'ELEMENT' then
        local open = indent .. '<' .. node._name .. attrsToStr(node._attr)

        -- an ELEMENT without children is written as a self-closing tag
        local children = node._children
        if not children or #children == 0 then
            return open .. '/>\n'
        end

        local parts = { open, '>\n' }
        for _, child in ipairs(children) do
            parts[#parts + 1] = toXmlStr(child, indentLevel + 2)
        end
        parts[#parts + 1] = indent .. '</' .. node._name .. '>\n'
        return table.concat(parts)
    elseif nodeType == 'TEXT' then
        return indent .. xmlEscape(node._text) .. '\n'
    elseif nodeType == 'COMMENT' then
        return indent .. '<!--' .. node._text .. '-->\n'
    elseif nodeType == 'PI' then
        return indent .. '<?' .. node._name .. ' ' .. node._attr._text .. '?>\n'
    elseif nodeType == 'DECL' then
        return indent .. '<?' .. node._name .. attrsToStr(node._attr) .. '?>\n'
    elseif nodeType == 'DTD' then
        return indent .. '<!' .. node._name .. ' ' .. node._text .. '>\n'
    end
    return 'BUG:unknown type:' .. tostring(nodeType)
end
262+
263+
--- Serializes a dom tree into a string in XML format.
-- Delegates to the recursive toXmlStr helper, starting at level -4.
-- @param node a root object, typically created with `dom` XML parser handler.
-- @return a string, XML formatted.
function dom:toXml(node)
    -- toXmlStr prefixes a line with (level + 2) spaces and renders children
    -- at (level + 2); starting at -4 therefore leaves the direct children of
    -- a ROOT node without indentation.
    local startLevel = -4
    return toXmlStr(node, startLevel)
end
152269

153270
---Parses CDATA tag content.

0 commit comments

Comments
 (0)