diff --git a/html-parse.cabal b/html-parse.cabal
index 6c8b655..2871d9f 100644
--- a/html-parse.cabal
+++ b/html-parse.cabal
@@ -55,6 +55,10 @@ cabal-version: >=1.10
tested-with: GHC==8.4.*, GHC==8.6.*, GHC==8.8.*, GHC==8.10.*, GHC==9.0.*, GHC==9.2.*, GHC==9.4.*
extra-source-files: changelog.md
+flag case-insensitive
+ Description: Implement case insensitive HTML tags.
+ Default: True
+ Manual: True
source-repository head
type: git
@@ -71,13 +75,20 @@ library
attoparsec >=0.13 && <0.15,
text >=1.2 && <2.1,
containers >=0.5 && <0.7
+ if flag(case-insensitive)
+ build-depends: case-insensitive
+ cpp-options: -DCASE_INSENSITIVE=1
default-language: Haskell2010
benchmark bench
type: exitcode-stdio-1.0
main-is: Benchmark.hs
other-extensions: OverloadedStrings, DeriveGeneric
+ if flag(case-insensitive)
+ build-depends: case-insensitive
+ cpp-options: -DCASE_INSENSITIVE=1
build-depends: base,
+ bytestring,
deepseq,
attoparsec,
text,
@@ -92,6 +103,9 @@ test-suite spec
main-is: Spec.hs
other-modules: Text.HTML.ParserSpec, Text.HTML.TreeSpec
ghc-options: -Wall -with-rtsopts=-T
+ if flag(case-insensitive)
+ build-depends: case-insensitive
+ cpp-options: -DCASE_INSENSITIVE=1
build-tool-depends: hspec-discover:hspec-discover
build-depends: base,
containers,
diff --git a/src/Text/HTML/Parser.hs b/src/Text/HTML/Parser.hs
index 0776d45..435ba84 100644
--- a/src/Text/HTML/Parser.hs
+++ b/src/Text/HTML/Parser.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE LambdaCase #-}
@@ -32,6 +33,9 @@ import Control.Applicative
import Data.Monoid
import Control.Monad (guard)
import Control.DeepSeq
+#if CASE_INSENSITIVE
+import Data.CaseInsensitive (CI, mk, original)
+#endif
import Data.Attoparsec.Text
import qualified Data.Attoparsec.Text.Lazy as AL
@@ -48,7 +52,15 @@ import qualified Data.Trie as Trie
-- Section numbers refer to W3C HTML 5.2 specification.
-- | A tag name (e.g. @body@)
+#if CASE_INSENSITIVE
+type TagName = CI Text
+#else
type TagName = Text
+original :: TagName -> Text
+original = id
+mk :: Text -> TagName
+mk = id
+#endif
-- | An attribute name (e.g. @href@)
type AttrName = Text
@@ -139,6 +151,7 @@ isWhitespace :: Char -> Bool
isWhitespace '\x09' = True
isWhitespace '\x0a' = True
isWhitespace '\x0c' = True
+isWhitespace '\x0d' = True
isWhitespace ' ' = True
isWhitespace _ = False
@@ -169,8 +182,8 @@ tagNameClose = do
-- | /ยง8.2.4.10/: Tag name state: common code
--
-- deviation: no lower-casing, don't handle NULL characters
-tagName' :: Parser Text
-tagName' = do
+tagName' :: Parser TagName
+tagName' = mk <$> do
c <- peekChar'
guard $ isAsciiUpper c || isAsciiLower c
takeWhile $ not . (isWhitespace `orC` isC '/' `orC` isC '<' `orC` isC '>')
@@ -395,10 +408,10 @@ renderTokens = mconcat . fmap renderToken
-- | (Somewhat) canonical string representation of 'Token'.
renderToken :: Token -> TL.Text
renderToken = TL.fromStrict . mconcat . \case
- (TagOpen n []) -> ["<", n, ">"]
- (TagOpen n attrs) -> ["<", n, " ", renderAttrs attrs, ">"]
- (TagSelfClose n attrs) -> ["<", n, " ", renderAttrs attrs, " />"]
- (TagClose n) -> ["", n, ">"]
+ (TagOpen n []) -> ["<", original n, ">"]
+ (TagOpen n attrs) -> ["<", original n, " ", renderAttrs attrs, ">"]
+ (TagSelfClose n attrs) -> ["<", original n, " ", renderAttrs attrs, " />"]
+ (TagClose n) -> ["", original n, ">"]
(ContentChar c) -> [T.singleton c]
(ContentText t) -> [t]
(Comment builder) -> [""]
diff --git a/src/Text/HTML/Tree.hs b/src/Text/HTML/Tree.hs
index b1bfa85..2024a3d 100644
--- a/src/Text/HTML/Tree.hs
+++ b/src/Text/HTML/Tree.hs
@@ -48,7 +48,7 @@ tokensToForest = f (PStack [] [])
-- @ nonClosing = ["br", "hr", "img", "meta", "area", "base", "col", "embed", "input", "link", "param", "source", "track", "wbr"] @
--
--
-nonClosing :: [Text]
+nonClosing :: [TagName]
nonClosing = ["br", "hr", "img", "meta", "area", "base", "col", "embed", "input", "link", "param", "source", "track", "wbr"]
data ParseTokenForestError =
diff --git a/tests/Text/HTML/ParserSpec.hs b/tests/Text/HTML/ParserSpec.hs
index 9f5a64a..c47772b 100644
--- a/tests/Text/HTML/ParserSpec.hs
+++ b/tests/Text/HTML/ParserSpec.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE CPP #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings #-}
@@ -20,6 +21,11 @@ import Test.QuickCheck
import Test.QuickCheck.Instances ()
import Text.HTML.Parser
+#if CASE_INSENSITIVE
+import Data.CaseInsensitive
+#else
+mk = id
+#endif
instance Arbitrary Token where
arbitrary = oneof [validOpen, validClose, validFlat]
@@ -60,11 +66,11 @@ validXmlChar = elements (['\x20'..'\x7E'] \\ "\x09\x0a\x0c &/<>")
validXmlText :: Gen T.Text
validXmlText = T.pack <$> sized (`maxListOf` validXmlChar)
-validXmlTagName :: Gen T.Text
+validXmlTagName :: Gen TagName
validXmlTagName = do
initchar <- elements $ ['a'..'z'] <> ['A'..'Z']
thenchars <- sized (`maxListOf` elements (['\x20'..'\x7E'] \\ "\x09\x0a\x0c &/<>"))
- pure . T.pack $ initchar : thenchars
+ pure . mk . T.pack $ initchar : thenchars
validXmlAttrName :: Gen T.Text
validXmlAttrName = do