1
- from __future__ import absolute_import
1
+ from __future__ import absolute_import , unicode_literals
2
2
from future .utils import PY3
3
3
__future_module__ = True
4
4
13
13
General functions for HTML manipulation.
14
14
"""
15
15
16
+ import re as _re
17
+ from future .moves .html .entities import html5 as _html5
18
+
19
_chr = chr  # keep the builtin reachable; the def below shadows the name

try:
    _unichr = unichr  # Python 2: text-returning counterpart of chr()
except NameError:
    _unichr = _chr  # Python 3: the builtin chr() already returns text


def chr(num):
    """
    Return the text (unicode) character for the code point ``num``.

    Unlike the Python 2 builtin, this always returns text, including for
    code points below 256, so the result can be combined with other
    unicode strings without an implicit ASCII decode.

    Raises ValueError (or a subclass of it) when ``num`` is not a valid
    code point.
    """
    try:
        return _unichr(num)
    except ValueError:
        if _unichr is _chr:
            # Python 3: there is no narrow-build fallback to try.
            raise
        # Python 2: unichr() refused the value (narrow builds reject code
        # points above 0xFFFF), so decode a \UXXXXXXXX escape instead.
        return str('\\U%08x' % num).decode('unicode-escape')
27
+
16
28
def escape (s , quote = True ):
17
29
"""
18
30
Replace special characters "&", "<" and ">" to HTML-safe sequences.
@@ -28,4 +40,111 @@ def escape(s, quote=True):
28
40
s = s .replace ('\' ' , "'" )
29
41
return s
30
42
31
- __all__ = ['escape' ]
43
+
44
+ # see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
45
+
46
# Numeric character references that are not emitted as-is: each key is the
# referenced number and each value is the exact text substituted for it.
# Every value is a single character (no padding), because it is inserted
# verbatim into the unescaped output.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
    0x80: '\u20ac',  # EURO SIGN
    0x81: '\x81',    # <control>
    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
    0x86: '\u2020',  # DAGGER
    0x87: '\u2021',  # DOUBLE DAGGER
    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: '\u2030',  # PER MILLE SIGN
    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
    0x8d: '\x8d',    # <control>
    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
    0x8f: '\x8f',    # <control>
    0x90: '\x90',    # <control>
    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
    0x95: '\u2022',  # BULLET
    0x96: '\u2013',  # EN DASH
    0x97: '\u2014',  # EM DASH
    0x98: '\u02dc',  # SMALL TILDE
    0x99: '\u2122',  # TRADE MARK SIGN
    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
    0x9d: '\x9d',    # <control>
    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
82
+
83
# Code points for which a numeric character reference is replaced by the
# empty string.  Built from ranges instead of an enumerated literal.
_invalid_codepoints = set()
_invalid_codepoints.update(range(0x1, 0x9))        # 0x0001 to 0x0008
_invalid_codepoints.update(range(0xe, 0x20))       # 0x000E to 0x001F
_invalid_codepoints.update(range(0x7f, 0xa0))      # 0x007F to 0x009F
_invalid_codepoints.update(range(0xfdd0, 0xfdf0))  # 0xFDD0 to 0xFDEF
# others: 0xB plus the last two code points (xFFFE, xFFFF) of planes 0-16
_invalid_codepoints.add(0xb)
_invalid_codepoints.update(
    (plane << 16) | low
    for plane in range(0x11)
    for low in (0xfffe, 0xffff)
)
105
+
106
+
107
def _replace_charref(s):
    """
    Return the replacement text for one matched character reference.

    ``s`` is a regex match object whose group 1 holds the reference
    without its leading '&' (for example '#62;', '#x3e' or 'gt;').

    Numeric references are mapped through _invalid_charrefs, replaced by
    U+FFFD for surrogates and out-of-range values, dropped when listed in
    _invalid_codepoints, and otherwise converted with chr().  Named
    references are looked up in _html5; when the full name is unknown the
    longest known prefix is substituted and the rest kept verbatim.  A
    name with no known prefix is returned unchanged (with its '&').
    """
    ref = s.group(1)
    if ref[0] == '#':
        # numeric charref
        if ref[1] in 'xX':
            num = int(ref[2:].rstrip(';'), 16)
        else:
            num = int(ref[1:].rstrip(';'))
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            # Exactly one REPLACEMENT CHARACTER, with no padding.
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)

    # named charref
    if ref in _html5:
        return _html5[ref]
    # find the longest matching name (as defined by the standard)
    for x in range(len(ref) - 1, 1, -1):
        if ref[:x] in _html5:
            return _html5[ref[:x]] + ref[x:]
    # No known name is a prefix of the reference: leave the text as it was.
    return '&' + ref
132
+
133
+
134
+ _charref = _re .compile (r'&(#[0-9]+;?'
135
+ r'|#[xX][0-9a-fA-F]+;?'
136
+ r'|[^\t\n\f <&#;]{1,32};?)' )
137
+
138
+ def unescape (s ):
139
+ """
140
+ Convert all named and numeric character references (e.g. >, >,
141
+ &x3e;) in the string s to the corresponding unicode characters.
142
+ This function uses the rules defined by the HTML 5 standard
143
+ for both valid and invalid character references, and the list of
144
+ HTML 5 named character references defined in html.entities.html5.
145
+ """
146
+ if '&' not in s :
147
+ return s
148
+ return _charref .sub (_replace_charref , s )
149
+
150
# Public names exported by ``from ... import *``.  str() yields the native
# string type on both Python 2 (a byte string, despite unicode_literals) and
# Python 3 (text); b'...' entries make a star-import fail on Python 3.
__all__ = [str('escape'), str('unescape')]
0 commit comments