Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions html-parse.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ cabal-version: >=1.10
tested-with: GHC==8.4.*, GHC==8.6.*, GHC==8.8.*, GHC==8.10.*, GHC==9.0.*, GHC==9.2.*, GHC==9.4.*
extra-source-files: changelog.md

flag case-insensitive
Description: Implement case insensitive HTML tags.
Default: True
Manual: True

source-repository head
type: git
Expand All @@ -71,13 +75,20 @@ library
attoparsec >=0.13 && <0.15,
text >=1.2 && <2.1,
containers >=0.5 && <0.7
if flag(case-insensitive)
build-depends: case-insensitive
cpp-options: -DCASE_INSENSITIVE=1
default-language: Haskell2010

benchmark bench
type: exitcode-stdio-1.0
main-is: Benchmark.hs
other-extensions: OverloadedStrings, DeriveGeneric
if flag(case-insensitive)
build-depends: case-insensitive
cpp-options: -DCASE_INSENSITIVE=1
build-depends: base,
bytestring,
deepseq,
attoparsec,
text,
Expand All @@ -92,6 +103,9 @@ test-suite spec
main-is: Spec.hs
other-modules: Text.HTML.ParserSpec, Text.HTML.TreeSpec
ghc-options: -Wall -with-rtsopts=-T
if flag(case-insensitive)
build-depends: case-insensitive
cpp-options: -DCASE_INSENSITIVE=1
build-tool-depends: hspec-discover:hspec-discover
build-depends: base,
containers,
Expand Down
25 changes: 19 additions & 6 deletions src/Text/HTML/Parser.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE LambdaCase #-}
Expand Down Expand Up @@ -32,6 +33,9 @@ import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq
#if CASE_INSENSITIVE
import Data.CaseInsensitive (CI, mk, original)
#endif

import Data.Attoparsec.Text
import qualified Data.Attoparsec.Text.Lazy as AL
Expand All @@ -48,7 +52,15 @@ import qualified Data.Trie as Trie
-- Section numbers refer to W3C HTML 5.2 specification.

-- | A tag name (e.g. @body@)
#if CASE_INSENSITIVE
type TagName = CI Text
#else
type TagName = Text
original :: TagName -> Text
original = id
mk :: Text -> TagName
mk = id
#endif

-- | An attribute name (e.g. @href@)
type AttrName = Text
Expand Down Expand Up @@ -139,6 +151,7 @@ isWhitespace :: Char -> Bool
isWhitespace '\x09' = True
isWhitespace '\x0a' = True
isWhitespace '\x0c' = True
isWhitespace '\x0d' = True
isWhitespace ' ' = True
isWhitespace _ = False

Expand Down Expand Up @@ -169,8 +182,8 @@ tagNameClose = do
-- | /§8.2.4.10/: Tag name state: common code
--
-- deviation: no lower-casing, don't handle NULL characters
tagName' :: Parser Text
tagName' = do
tagName' :: Parser TagName
tagName' = mk <$> do
c <- peekChar'
guard $ isAsciiUpper c || isAsciiLower c
takeWhile $ not . (isWhitespace `orC` isC '/' `orC` isC '<' `orC` isC '>')
Expand Down Expand Up @@ -395,10 +408,10 @@ renderTokens = mconcat . fmap renderToken
-- | (Somewhat) canonical string representation of 'Token'.
renderToken :: Token -> TL.Text
renderToken = TL.fromStrict . mconcat . \case
(TagOpen n []) -> ["<", n, ">"]
(TagOpen n attrs) -> ["<", n, " ", renderAttrs attrs, ">"]
(TagSelfClose n attrs) -> ["<", n, " ", renderAttrs attrs, " />"]
(TagClose n) -> ["</", n, ">"]
(TagOpen n []) -> ["<", original n, ">"]
(TagOpen n attrs) -> ["<", original n, " ", renderAttrs attrs, ">"]
(TagSelfClose n attrs) -> ["<", original n, " ", renderAttrs attrs, " />"]
(TagClose n) -> ["</", original n, ">"]
(ContentChar c) -> [T.singleton c]
(ContentText t) -> [t]
(Comment builder) -> ["<!--", TL.toStrict $ B.toLazyText builder, "-->"]
Expand Down
2 changes: 1 addition & 1 deletion src/Text/HTML/Tree.hs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ tokensToForest = f (PStack [] [])
-- @ nonClosing = ["br", "hr", "img", "meta", "area", "base", "col", "embed", "input", "link", "param", "source", "track", "wbr"] @
--
-- <https:\/\/www.w3.org\/TR\/html52\/syntax.html#void-elements>
nonClosing :: [Text]
nonClosing :: [TagName]
nonClosing = ["br", "hr", "img", "meta", "area", "base", "col", "embed", "input", "link", "param", "source", "track", "wbr"]

data ParseTokenForestError =
Expand Down
10 changes: 8 additions & 2 deletions tests/Text/HTML/ParserSpec.hs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{-# LANGUAGE CPP #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings #-}
Expand All @@ -20,6 +21,11 @@ import Test.QuickCheck
import Test.QuickCheck.Instances ()
import Text.HTML.Parser

#if CASE_INSENSITIVE
import Data.CaseInsensitive
#else
mk = id
#endif

instance Arbitrary Token where
arbitrary = oneof [validOpen, validClose, validFlat]
Expand Down Expand Up @@ -60,11 +66,11 @@ validXmlChar = elements (['\x20'..'\x7E'] \\ "\x09\x0a\x0c &/<>")
validXmlText :: Gen T.Text
validXmlText = T.pack <$> sized (`maxListOf` validXmlChar)

validXmlTagName :: Gen T.Text
validXmlTagName :: Gen TagName
validXmlTagName = do
initchar <- elements $ ['a'..'z'] <> ['A'..'Z']
thenchars <- sized (`maxListOf` elements (['\x20'..'\x7E'] \\ "\x09\x0a\x0c &/<>"))
pure . T.pack $ initchar : thenchars
pure . mk . T.pack $ initchar : thenchars

validXmlAttrName :: Gen T.Text
validXmlAttrName = do
Expand Down