@@ -820,122 +820,154 @@ local function setup()
820820 return t
821821 end,
822822
823+ --
823824 -- html = summarize( html, maxCharacters [, keepSomeElements=false ] ) -- @Doc
824- -- keepSomeElements includes <a>, <code>, <img>, <math> and <svg>.
825+ --
826+ -- Limit the amount of text and restrict allowed tags in an HTML string
827+ -- to be used e.g. as the description for an item in an RSS feed.
828+ --
829+ -- 'html' must be valid HTML code.
830+ -- If 'keepSomeElements' is set then elements like <a>, <em>, <code>, <img>, <math> and <svg> are kept too.
831+ -- Elements like <br> and <bdo>, and any occurrences of the 'dir' attribute, are always preserved.
832+ --
825833 summarize = function(html, maxChars, keepSomeElements)
826- html = "<div>" .. html .. "</div>"
827- local doc = assert(xmlLib.parseHtml(html))
828- local protected
-- Wrap the input in a single root element before parsing.
local doc = assert(xmlLib.parseHtml("<div>" .. html .. "</div>"))

-- Remove (probably) unwanted elements.
-- The tag list is turned into a lookup table at preprocessing time
-- (presumably tag name -> true, given how it is indexed below).
local elementsToRemove = !(Set {
	"dialog",
	"menu", -- (Non-standard.)
	"nav",
	"script",
	"style",
	"template",
})
if not keepSomeElements then
	elementsToRemove["math"] = true
	elementsToRemove["svg"]  = true
end

xmlLib.walk(doc, false, function(_, parent)
	-- Children are visited with ipairsr() because matching ones are removed from 'parent' during the loop.
	for index, node in ipairsr(parent) do
		local isUnwanted = xmlLib.isElement(node) and elementsToRemove[node.tag]

		if isUnwanted then
			-- Removing the node drops its whole subtree.
			table.remove(parent, index)
		end
	end
end)
851857
852- -- Protect certain things, like links.
853- -- @Incomplete: Always protect things like <br>, <wbr>, <bdi> and ruby tags. (Also <sub>?)
854- -- @Incomplete: Always protect attributes like 'dir'.
855- -- @Incomplete: Protect things like <i> and <strong>.
856- -- @Incomplete: Handle <a><img></a>.
857- if keepSomeElements then
858- protected = {--[[ element1, ... ]]}
859-
860- xmlLib.walk(doc, false, function(tag, el)
861- for i, childNode in ipairs(el) do
862- -- @Robustness: Make sure $$PLACEHOLDER#$$ strings don't exist in the original content.
863- if xmlLib.isText(childNode) then
864- -- void
865-
866- elseif childNode.tag == "a" and childNode.attr.href and not childNode.attr.href:find"^javascript:" then
867- table.insert(protected, xmlLib.element("a", {href=childNode.attr.href, childNode:getHtmlText()}))
868- el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
869-
870- elseif childNode.tag == "code" then
871- table.insert(protected, xmlLib.element("code", childNode:getHtmlText()))
872- el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
873-
874- elseif childNode.tag == "img" and childNode.attr.src then
875- table.insert(protected, xmlLib.element("img", {src=childNode.attr.src, alt=(childNode.attr.alt or "")}))
876- el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
877-
878- elseif childNode.tag == "math" or childNode.tag == "svg" then
879- table.insert(protected, childNode)
880- el[i] = "$$PLACEHOLDER" .. #protected .. "$$"
881- end
882- end
883- end)
884- end
858+ -- Filter the HTML.
-- insertDirAttribute( buffer, el )
-- Append " dir='...'" to 'buffer' if 'el' has a 'dir' attribute.
-- NOTE(review): The value is wrapped in single quotes, so this assumes
-- encodeHtmlEntities() also encodes "'" - confirm.
local function insertDirAttribute(buffer, el)
	if el.attr.dir then
		insertMultiple(buffer, " dir='", encodeHtmlEntities(el.attr.dir), "'")
	end
end

-- processChildren( buffer, textIndices, el )
--
-- Append filtered HTML for the children of 'el' to 'buffer' (an array of
-- string fragments), recursing into child elements.
--
-- Whenever a text fragment is added, its index in 'buffer' is appended to
-- 'textIndices' so the text can be trimmed afterwards. (This assumes
-- insertMultiple() appends its arguments at the end of 'buffer'.)
--
-- Reads 'keepSomeElements' from the enclosing function.
--
local function processChildren(buffer, textIndices, el)
	for _, child in ipairs(el) do
		if xmlLib.isText(child) then
			-- Collapse every run of whitespace into a single space.
			insertMultiple(buffer, (encodeHtmlEntities(child):gsub("%s+", " ")))
			table.insert(textIndices, #buffer)

		-- Elements to always keep.

		elseif isAny(child.tag, "br","wbr") then
			insertMultiple(buffer, "<",child.tag,">")

		elseif isAny(child.tag, "ruby","rt","rp","bdi","bdo") then
			insertMultiple(buffer, "<",child.tag)
			insertDirAttribute(buffer, child)
			insertMultiple(buffer, ">")
			processChildren(buffer, textIndices, child)
			insertMultiple(buffer, "</",child.tag,">")

		-- Elements to sometimes keep.

		-- URL schemes are case-insensitive and may be preceded by whitespace,
		-- so normalize before looking for "javascript:". Rejected links fall
		-- through to the last branch where only their text is kept.
		elseif keepSomeElements and child.tag == "a" and child.attr.href and not child.attr.href:lower():find"^%s*javascript:" then
			insertMultiple(buffer, "<a href='",encodeHtmlEntities(child.attr.href),"'")
			insertDirAttribute(buffer, child)
			insertMultiple(buffer, ">")
			processChildren(buffer, textIndices, child)
			insertMultiple(buffer, "</a>")

		elseif keepSomeElements and isAny(child.tag, "em","strong","b","i","code","sub","sup") then
			insertMultiple(buffer, "<",child.tag)
			insertDirAttribute(buffer, child)
			insertMultiple(buffer, ">")
			processChildren(buffer, textIndices, child)
			insertMultiple(buffer, "</",child.tag,">")

		elseif keepSomeElements and child.tag == "img" and child.attr.src then
			insertMultiple(buffer, "<img src='",encodeHtmlEntities(child.attr.src),"' alt='",encodeHtmlEntities(child.attr.alt or ""),"'>")

		elseif not keepSomeElements and child.tag == "img" and (child.attr.alt or "") ~= "" then
			-- Images are replaced by their alternative text, which counts as text for trimming.
			insertMultiple(buffer, (encodeHtmlEntities(child.attr.alt):gsub("%s+", " ")))
			table.insert(textIndices, #buffer)

		-- The parentheses matter: 'and' binds tighter than 'or', so without
		-- them <svg> would be kept regardless of 'keepSomeElements'.
		elseif keepSomeElements and (child.tag == "math" or child.tag == "svg") then
			insertMultiple(buffer, child:toHtml()) -- (We could call toXml() instead since these elements will be encoded as XML anyway.)

		-- For anything else we just care about the text contents.

		else
			-- The 'dir' attribute is preserved by wrapping the contents in a <span>.
			local dir = child.attr.dir

			if dir then insertMultiple(buffer, "<span dir='",encodeHtmlEntities(dir),"'>") end
			processChildren(buffer, textIndices, child)
			if dir then insertMultiple(buffer, "</span>") end
		end
	end
end
911+
-- trimText( buffer, textIndices, i1, i2, iDir )
--
-- Trim whitespace at one edge of a run of text fragments.
--
-- Walks textIndices[i1..i2] in steps of 'iDir'. Fragments in 'buffer' that
-- are empty or only whitespace are replaced by "". The first fragment with
-- other content gets its leading (iDir > 0) or trailing (otherwise)
-- whitespace removed, and then the walk stops.
--
local function trimText(buffer, textIndices, i1, i2, iDir)
	for i = i1, i2, iDir do
		local bufferIndex = textIndices[i]
		local text        = buffer[bufferIndex]

		if text:find("%S") then
			-- Found the edge of the actual text.
			local edgePattern
			if iDir > 0 then
				edgePattern = "^%s+"
			else
				edgePattern = "%s+$"
			end

			buffer[bufferIndex] = (text:gsub(edgePattern, ""))
			return
		end

		-- Nothing but whitespace (or nothing at all) in this fragment.
		buffer[bufferIndex] = ""
	end
end
913925
914- -- Remove protections.
915- if keepSomeElements then
916- xmlLib.walk(doc, true, function(tag, el)
917- for i, childNode in ipairsr(el) do
918- if xmlLib.isText(childNode) and childNode:find("$$PLACEHOLDER", 1, true) then
919- local newNodes = {}
920-
921- for pos, isMatch, textOrProtIndex in gmatchAndBetween(childNode, "%$%$PLACEHOLDER(%d+)%$%$") do
922- if isMatch then
923- table.insert(newNodes, protected[tonumber(textOrProtIndex)])
924- else
925- table.insert(newNodes, textOrProtIndex)
926- end
927- end
-- Build the filtered HTML as an array of string fragments. Each child
-- element of 'doc' becomes one block (a list, a <pre> or a <p>) followed by
-- a newline.
local buffer      = {"<div>"}
local textIndices = {} -- Indices in 'buffer' of text fragments. Filled in by processChildren().

for _, child in ipairs(doc) do
	-- @Incomplete: Handle tables better?

	-- Text directly in 'doc' is dropped. Only elements become blocks.
	if not xmlLib.isText(child) then
		local textStartIndex = #textIndices + 1 -- First text fragment belonging to this block.

		if child.tag == "ul" or child.tag == "ol" then
			insertMultiple(buffer, "<",child.tag,">")

			for li in child:eachChildElement() do
				insertMultiple(buffer, "<li>")
				processChildren(buffer, textIndices, li)
				insertMultiple(buffer, "</li>")
			end

			insertMultiple(buffer, "</",child.tag,">\n")

		elseif child.tag == "pre" then
			insertMultiple(buffer, "<",child.tag,">")
			processChildren(buffer, textIndices, child)
			insertMultiple(buffer, "</",child.tag,">\n")

		else
			insertMultiple(buffer, "<p>")
			processChildren(buffer, textIndices, child)
			insertMultiple(buffer, "</p>\n")
		end

		-- Trim leading and trailing whitespace of the text in this block.
		trimText(buffer, textIndices, textStartIndex, #textIndices, 1)
		trimText(buffer, textIndices, #textIndices, textStartIndex, -1)
	end
end

-- Remove the newline after the last block. This has to happen before the
-- closing tag is appended, otherwise the last fragment is "</div>" and
-- there is nothing to strip.
local len   = #buffer
buffer[len] = (buffer[len]:gsub("\n$", ""))

table.insert(buffer, "</div>")

doc = assert(xmlLib.parseHtml(table.concat(buffer))) -- @Speed: So much back and forth between HTML strings and parsed documents!
970+
939971 -- Limit text length.
940972 local charsRemaining = maxChars
941973 local hasText = {}
@@ -966,8 +998,8 @@ local function setup()
966998 end
967999
9681000 else
969- for i, childNode in ipairs(node) do
970- limit(childNode , node, i)
1001+ for i, child in ipairs(node) do
1002+ limit(child , node, i)
9711003 end
9721004
9731005 if hasText[node] and parentEl then
@@ -978,6 +1010,8 @@ local function setup()
9781010
9791011 limit(doc, nil, nil)
9801012
1013+ -- @Polish: Remove empty elements, like <p></p> (but not <br>).
1014+
9811015 return (doc:contentsToHtml())
9821016 end,
9831017
0 commit comments