Skip to content

Commit 96166be

Browse files
committed
Improved summarize() - nested elements are handled better, and more tags are kept, for example.
1 parent 962f88a commit 96166be

File tree

5 files changed

+140
-85
lines changed

5 files changed

+140
-85
lines changed

build/meta.lua

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
NOSPACE
2424
PUSH_CONTEXT, POP_CONTEXT
2525
readFile, writeFile, writeTextFile
26+
Set
2627
templateToLua
2728
templateToString, templateToStringUtf16
2829
toWindowsPath
@@ -461,3 +462,13 @@ function _G.unindent(s)
461462
end
462463

463464

465+
466+
-- set = Set( values )
-- Turn an array into a lookup table: each array item becomes a key whose value is true.
-- Iteration uses ipairs(), so it stops at the first nil item.
function _G.Set(values)
	local lookup = {}

	for _, member in ipairs(values) do lookup[member] = true end

	return lookup
end
473+
474+

src/app.lua2p

Lines changed: 116 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -820,122 +820,154 @@ local function setup()
820820
return t
821821
end,
822822

823+
--
823824
-- html = summarize( html, maxCharacters [, keepSomeElements=false ] ) -- @Doc
824-
-- keepSomeElements includes <a>, <code>, <img>, <math> and <svg>.
825+
--
826+
-- Limit the amount of text and restrict allowed tags in an HTML string
827+
-- to be used e.g. as the description for an item in an RSS feed.
828+
--
829+
-- 'html' must be valid HTML code.
830+
-- 'keepSomeElements' includes elements like <a>, <em>, <code>, <img> and <svg>.
831+
-- Elements like <br> and <bdo>, and any occurrences of the 'dir' attribute, are always preserved.
832+
--
825833
summarize = function(html, maxChars, keepSomeElements)
826-
html = "<div>" .. html .. "</div>"
827-
local doc = assert(xmlLib.parseHtml(html))
828-
local protected
834+
local doc = assert(xmlLib.parseHtml("<div>"..html.."</div>"))
829835

830836
-- Remove (probably) unwanted elements.
831-
local elementsToRemove = {
837+
local elementsToRemove = !(Set{
832838
"dialog",
833839
"menu", -- (Non-standard.)
834840
"nav",
835841
"script",
836842
"style",
837843
"template",
838-
}
844+
})
839845
if not keepSomeElements then
840-
table.insert(elementsToRemove, "math")
841-
table.insert(elementsToRemove, "svg")
846+
elementsToRemove.math = true
847+
elementsToRemove.svg = true
842848
end
843849

844850
xmlLib.walk(doc, false, function(tag, el)
845-
for i, childNode in ipairsr(el) do
846-
if xmlLib.isElement(childNode) and isAny(childNode.tag, unpack(elementsToRemove)) then
851+
for i, child in ipairsr(el) do
852+
if xmlLib.isElement(child) and elementsToRemove[child.tag] then
847853
table.remove(el, i)
848854
end
849855
end
850856
end)
851857

852-
-- Protect certain things, like links.
853-
-- @Incomplete: Always protect things like <br>, <wbr>, <bdi> and ruby tags. (Also <sub>?)
854-
-- @Incomplete: Always protect attributes like 'dir'.
855-
-- @Incomplete: Protect things like <i> and <strong>.
856-
-- @Incomplete: Handle <a><img></a>.
857-
if keepSomeElements then
858-
protected = {--[[ element1, ... ]]}
859-
860-
xmlLib.walk(doc, false, function(tag, el)
861-
for i, childNode in ipairs(el) do
862-
-- @Robustness: Make sure $$PLACEHOLDER#$$ strings don't exist in the original content.
863-
if xmlLib.isText(childNode) then
864-
-- void
865-
866-
elseif childNode.tag == "a" and childNode.attr.href and not childNode.attr.href:find"^javascript:" then
867-
table.insert(protected, xmlLib.element("a", {href=childNode.attr.href, childNode:getHtmlText()}))
868-
el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
869-
870-
elseif childNode.tag == "code" then
871-
table.insert(protected, xmlLib.element("code", childNode:getHtmlText()))
872-
el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
873-
874-
elseif childNode.tag == "img" and childNode.attr.src then
875-
table.insert(protected, xmlLib.element("img", {src=childNode.attr.src, alt=(childNode.attr.alt or "")}))
876-
el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
877-
878-
elseif childNode.tag == "math" or childNode.tag == "svg" then
879-
table.insert(protected, childNode)
880-
el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
881-
end
882-
end
883-
end)
884-
end
858+
-- Filter the HTML.
859+
-- processChildren( buffer, textIndices, element )
--
-- Append a filtered HTML serialization of the children of 'el' to 'buffer'.
--
-- buffer:      Array of string pieces that the caller later concatenates.
-- textIndices: Array that receives the buffer index of every text piece
--              appended here (text nodes and <img> alt text used as text).
-- el:          Element whose children are visited in order with ipairs().
--
-- Reads the 'keepSomeElements' argument of the enclosing summarize() call.
-- Recurses into child elements, so nested markup is handled at any depth.
-- Returns nothing.
local function processChildren(buffer, textIndices, el)
	for _, child in ipairs(el) do
		if xmlLib.isText(child) then
			-- Collapse whitespace runs. The extra parentheses drop gsub()'s second return value.
			insertMultiple(buffer, (encodeHtmlEntities(child):gsub("%s+", " ")))
			table.insert(textIndices, #buffer)

		-- Elements to always keep.

		elseif isAny(child.tag, "br","wbr") then
			-- No closing tag and no children are written for these.
			insertMultiple(buffer, "<",child.tag,">")

		elseif isAny(child.tag, "ruby","rt","rp","bdi","bdo") then
			insertMultiple(buffer, "<",child.tag)
			if child.attr.dir then insertMultiple(buffer, " dir='",encodeHtmlEntities(child.attr.dir),"'") end
			insertMultiple(buffer, ">")
			processChildren(buffer, textIndices, child)
			insertMultiple(buffer, "</",child.tag,">")

		-- Elements to sometimes keep.

		-- Links with a javascript: URL (or no href) fall through to the last branch, which only keeps their contents.
		elseif keepSomeElements and child.tag == "a" and child.attr.href and not child.attr.href:find"^javascript:" then
			insertMultiple(buffer, "<a href='",encodeHtmlEntities(child.attr.href),"'")
			if child.attr.dir then insertMultiple(buffer, " dir='",encodeHtmlEntities(child.attr.dir),"'") end
			insertMultiple(buffer, ">")
			processChildren(buffer, textIndices, child)
			insertMultiple(buffer, "</a>")

		elseif keepSomeElements and isAny(child.tag, "em","strong","b","i","code","sub","sup") then
			insertMultiple(buffer, "<",child.tag)
			if child.attr.dir then insertMultiple(buffer, " dir='",encodeHtmlEntities(child.attr.dir),"'") end
			insertMultiple(buffer, ">")
			processChildren(buffer, textIndices, child)
			insertMultiple(buffer, "</",child.tag,">")

		elseif keepSomeElements and child.tag == "img" and child.attr.src then
			insertMultiple(buffer, "<img src='",encodeHtmlEntities(child.attr.src),"' alt='",encodeHtmlEntities(child.attr.alt or ""),"'>")
		elseif not keepSomeElements and child.tag == "img" and (child.attr.alt or "") ~= "" then
			-- The image itself is dropped; its alt text is kept and counted as text.
			insertMultiple(buffer, (encodeHtmlEntities(child.attr.alt):gsub("%s+", " ")))
			table.insert(textIndices, #buffer)

		-- The parentheses are required: 'and' binds tighter than 'or', so without
		-- them <svg> would be emitted verbatim even when keepSomeElements is false.
		elseif keepSomeElements and (child.tag == "math" or child.tag == "svg") then
			insertMultiple(buffer, child:toHtml()) -- (We could call toXml() instead since these elements will be encoded as XML anyway.)

		-- For anything else we just care about the text contents.

		else
			-- The element itself is dropped, but a 'dir' attribute is preserved by wrapping the contents in a <span>.
			if child.attr.dir then insertMultiple(buffer, "<span dir='",encodeHtmlEntities(child.attr.dir),"'>") end
			processChildren(buffer, textIndices, child)
			if child.attr.dir then insertMultiple(buffer, "</span>") end
		end
	end
end
911+
912+
-- trimText( buffer, textIndices, fromIndex, toIndex, direction )
--
-- Remove whitespace from one edge of a run of text pieces.
--
-- Visits textIndices[i1] through textIndices[i2] in steps of iDir. Every
-- whitespace-only piece met on the way is replaced with an empty string. The
-- first piece with other content gets its leading (iDir > 0) or trailing
-- (otherwise) whitespace removed, after which the function stops.
-- Only 'buffer' is modified. Returns nothing.
local function trimText(buffer, textIndices, i1, i2, iDir)
	for i = i1, i2, iDir do
		local bufferIndex = textIndices[i]
		local piece       = buffer[bufferIndex]

		if piece:find"^%s*$" then
			-- Nothing but whitespace (or nothing at all): blank it and keep looking.
			buffer[bufferIndex] = ""

		else
			local edgePattern
			if iDir > 0 then
				edgePattern = "^%s+"
			else
				edgePattern = "%s+$"
			end

			-- Single-target assignment keeps only gsub()'s first result.
			buffer[bufferIndex] = piece:gsub(edgePattern, "")
			return
		end
	end
end
913925

914-
-- Remove protections.
915-
if keepSomeElements then
916-
xmlLib.walk(doc, true, function(tag, el)
917-
for i, childNode in ipairsr(el) do
918-
if xmlLib.isText(childNode) and childNode:find("$$PLACEHOLDER", 1, true) then
919-
local newNodes = {}
920-
921-
for pos, isMatch, textOrProtIndex in gmatchAndBetween(childNode, "%$%$PLACEHOLDER(%d+)%$%$") do
922-
if isMatch then
923-
table.insert(newNodes, protected[tonumber(textOrProtIndex)])
924-
else
925-
table.insert(newNodes, textOrProtIndex)
926-
end
927-
end
926+
local buffer = {"<div>"}
927+
local textIndices = {}
928928

929-
table.remove(el, i)
929+
for _, child in ipairs(doc) do
930+
-- @Incomplete: Handle tables better?
930931

931-
for _, node in ipairsr(newNodes) do
932-
table.insert(el, i, node)
933-
end
932+
if xmlLib.isText(child) then
933+
-- void
934+
935+
else
936+
local textStartIndex = #textIndices + 1
937+
938+
if child.tag == "ul" or child.tag == "ol" then
939+
insertMultiple(buffer, "<",child.tag,">")
940+
941+
for li in child:eachChildElement() do
942+
insertMultiple(buffer, "<li>")
943+
processChildren(buffer, textIndices, li)
944+
insertMultiple(buffer, "</li>")
934945
end
946+
947+
insertMultiple(buffer, "</",child.tag,">\n")
948+
949+
elseif child.tag == "pre" then
950+
insertMultiple(buffer, "<",child.tag,">")
951+
processChildren(buffer, textIndices, child)
952+
insertMultiple(buffer, "</",child.tag,">\n")
953+
954+
else
955+
insertMultiple(buffer, "<p>")
956+
processChildren(buffer, textIndices, child)
957+
insertMultiple(buffer, "</p>\n")
935958
end
936-
end)
959+
960+
trimText(buffer, textIndices, textStartIndex, #textIndices, 1)
961+
trimText(buffer, textIndices, #textIndices, textStartIndex, -1)
962+
end
937963
end
938964

965+
table.insert(buffer, "</div>")
966+
967+
local len = #buffer
968+
buffer[len] = buffer[len]:gsub("\n$", "")
969+
doc = assert(xmlLib.parseHtml(table.concat(buffer))) -- @Speed: So much back and forth between HTML strings and parsed documents!
970+
939971
-- Limit text length.
940972
local charsRemaining = maxChars
941973
local hasText = {}
@@ -966,8 +998,8 @@ local function setup()
966998
end
967999

9681000
else
969-
for i, childNode in ipairs(node) do
970-
limit(childNode, node, i)
1001+
for i, child in ipairs(node) do
1002+
limit(child, node, i)
9711003
end
9721004

9731005
if hasText[node] and parentEl then
@@ -978,6 +1010,8 @@ local function setup()
9781010

9791011
limit(doc, nil, nil)
9801012

1013+
-- @Polish: Remove empty elements, like <p></p> (but not <br>).
1014+
9811015
return (doc:contentsToHtml())
9821016
end,
9831017

src/functions.lua2p

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
gsubPlainSub
3636
htaccessRewriteEscapeTestString, htaccessRewriteEscapeCondPattern, htaccessRewriteEscapeRuleSubstitution
3737
indexOf, itemWith, itemWithAll
38+
insertMultiple
3839
ipairsr, iprev
3940
isAny
4041
isArgs
@@ -2889,3 +2890,12 @@ do
28892890
end
28902891

28912892

2893+
2894+
-- insertMultiple( array, value1, ... )
-- Append each extra argument, in the order given, to the end of the array.
-- Calling it with no extra arguments does nothing.
function insertMultiple(t, ...)
	local count  = select("#", ...)
	local values = {...}

	for i = 1, count do
		table.insert(t, values[i])
	end
end
2900+
2901+

src/xml.lua2p

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1684,12 +1684,12 @@ local CHAR_TO_ENTITY = {
16841684
}
16851685

16861686
-- Encode special characters.
1687-
function xml.encodeRequiredEntities(s)
1687+
function xml.encodeRequiredEntities(s) -- @Doc
	-- Every &, <, >, " and ' is looked up in CHAR_TO_ENTITY and replaced.
	-- Assigning to a single local keeps the string and drops gsub()'s match count.
	local encoded = s:gsub("[&<>\"']", CHAR_TO_ENTITY)
	return encoded
end
16901690

16911691
-- Encode special characters and some additional characters (like non-breaking space).
1692-
function xml.encodeMoreEntities(s)
1692+
function xml.encodeMoreEntities(s) -- @Doc
16931693
s = xml.encodeRequiredEntities(s)
16941694

16951695
-- Note: We encode HTML entities in XML documents too. It's probably fine, but who knows!

testsite/content/tests.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ local localVar = function()end
4242
}}
4343

4444
<p style="text-align: center;">
45-
Centered text in HTML.
45+
Centered text in HTML. The following is not a list!
4646
- 1
4747
- 2
4848
</p>

0 commit comments

Comments
 (0)