diff --git a/html-parse.cabal b/html-parse.cabal index 6c8b655..2871d9f 100644 --- a/html-parse.cabal +++ b/html-parse.cabal @@ -55,6 +55,10 @@ cabal-version: >=1.10 tested-with: GHC==8.4.*, GHC==8.6.*, GHC==8.8.*, GHC==8.10.*, GHC==9.0.*, GHC==9.2.*, GHC==9.4.* extra-source-files: changelog.md +flag case-insensitive + Description: Implement case insensitive HTML tags. + Default: True + Manual: True source-repository head type: git @@ -71,13 +75,20 @@ library attoparsec >=0.13 && <0.15, text >=1.2 && <2.1, containers >=0.5 && <0.7 + if flag(case-insensitive) + build-depends: case-insensitive + cpp-options: -DCASE_INSENSITIVE=1 default-language: Haskell2010 benchmark bench type: exitcode-stdio-1.0 main-is: Benchmark.hs other-extensions: OverloadedStrings, DeriveGeneric + if flag(case-insensitive) + build-depends: case-insensitive + cpp-options: -DCASE_INSENSITIVE=1 build-depends: base, + bytestring, deepseq, attoparsec, text, @@ -92,6 +103,9 @@ test-suite spec main-is: Spec.hs other-modules: Text.HTML.ParserSpec, Text.HTML.TreeSpec ghc-options: -Wall -with-rtsopts=-T + if flag(case-insensitive) + build-depends: case-insensitive + cpp-options: -DCASE_INSENSITIVE=1 build-tool-depends: hspec-discover:hspec-discover build-depends: base, containers, diff --git a/src/Text/HTML/Parser.hs b/src/Text/HTML/Parser.hs index 0776d45..435ba84 100644 --- a/src/Text/HTML/Parser.hs +++ b/src/Text/HTML/Parser.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE LambdaCase #-} @@ -32,6 +33,9 @@ import Control.Applicative import Data.Monoid import Control.Monad (guard) import Control.DeepSeq +#if CASE_INSENSITIVE +import Data.CaseInsensitive (CI, mk, original) +#endif import Data.Attoparsec.Text import qualified Data.Attoparsec.Text.Lazy as AL @@ -48,7 +52,15 @@ import qualified Data.Trie as Trie -- Section numbers refer to W3C HTML 5.2 specification. -- | A tag name (e.g. @body@) +#if CASE_INSENSITIVE +type TagName = CI Text +#else type TagName = Text +original :: TagName -> Text +original = id +mk :: Text -> TagName +mk = id +#endif -- | An attribute name (e.g. @href@) type AttrName = Text @@ -139,6 +151,7 @@ isWhitespace :: Char -> Bool isWhitespace '\x09' = True isWhitespace '\x0a' = True isWhitespace '\x0c' = True +isWhitespace '\x0d' = True isWhitespace ' ' = True isWhitespace _ = False @@ -169,8 +182,8 @@ tagNameClose = do -- | /ยง8.2.4.10/: Tag name state: common code -- -- deviation: no lower-casing, don't handle NULL characters -tagName' :: Parser Text -tagName' = do +tagName' :: Parser TagName +tagName' = mk <$> do c <- peekChar' guard $ isAsciiUpper c || isAsciiLower c takeWhile $ not . (isWhitespace `orC` isC '/' `orC` isC '<' `orC` isC '>') @@ -395,10 +408,10 @@ renderTokens = mconcat . fmap renderToken -- | (Somewhat) canonical string representation of 'Token'. renderToken :: Token -> TL.Text renderToken = TL.fromStrict . mconcat . \case - (TagOpen n []) -> ["<", n, ">"] - (TagOpen n attrs) -> ["<", n, " ", renderAttrs attrs, ">"] - (TagSelfClose n attrs) -> ["<", n, " ", renderAttrs attrs, " />"] - (TagClose n) -> [""] + (TagOpen n []) -> ["<", original n, ">"] + (TagOpen n attrs) -> ["<", original n, " ", renderAttrs attrs, ">"] + (TagSelfClose n attrs) -> ["<", original n, " ", renderAttrs attrs, " />"] + (TagClose n) -> [""] (ContentChar c) -> [T.singleton c] (ContentText t) -> [t] (Comment builder) -> [""] diff --git a/src/Text/HTML/Tree.hs b/src/Text/HTML/Tree.hs index b1bfa85..2024a3d 100644 --- a/src/Text/HTML/Tree.hs +++ b/src/Text/HTML/Tree.hs @@ -48,7 +48,7 @@ tokensToForest = f (PStack [] []) -- @ nonClosing = ["br", "hr", "img", "meta", "area", "base", "col", "embed", "input", "link", "param", "source", "track", "wbr"] @ -- -- -nonClosing :: [Text] +nonClosing :: [TagName] nonClosing = ["br", "hr", "img", "meta", "area", "base", "col", "embed", "input", "link", "param", "source", "track", "wbr"] data ParseTokenForestError = diff --git a/tests/Text/HTML/ParserSpec.hs b/tests/Text/HTML/ParserSpec.hs index 9f5a64a..c47772b 100644 --- a/tests/Text/HTML/ParserSpec.hs +++ b/tests/Text/HTML/ParserSpec.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE CPP #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} @@ -20,6 +21,11 @@ import Test.QuickCheck import Test.QuickCheck.Instances () import Text.HTML.Parser +#if CASE_INSENSITIVE +import Data.CaseInsensitive +#else +mk = id +#endif instance Arbitrary Token where arbitrary = oneof [validOpen, validClose, validFlat] @@ -60,11 +66,11 @@ validXmlChar = elements (['\x20'..'\x7E'] \\ "\x09\x0a\x0c &/<>") validXmlText :: Gen T.Text validXmlText = T.pack <$> sized (`maxListOf` validXmlChar) -validXmlTagName :: Gen T.Text +validXmlTagName :: Gen TagName validXmlTagName = do initchar <- elements $ ['a'..'z'] <> ['A'..'Z'] thenchars <- sized (`maxListOf` elements (['\x20'..'\x7E'] \\ "\x09\x0a\x0c &/<>")) - pure . T.pack $ initchar : thenchars + pure . mk . T.pack $ initchar : thenchars validXmlAttrName :: Gen T.Text validXmlAttrName = do