Skip to content

Commit 821865d

Browse files
committed
change: Translate WSGI strings to utf8 immediately, not only on demand.
1 parent 7912616 commit 821865d

6 files changed

+54
-113
lines changed

bottle.py

+34-68
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ def _cli_patch(cli_args): # pragma: no coverage
8888
import _thread as thread
8989
from urllib.parse import urljoin, SplitResult as UrlSplitResult
9090
from urllib.parse import urlencode, quote as urlquote, unquote as urlunquote
91-
urlunquote = functools.partial(urlunquote, encoding='latin1')
9291
from http.cookies import SimpleCookie, Morsel, CookieError
9392
from collections.abc import MutableMapping as DictMixin
9493
from types import ModuleType as new_module
@@ -112,6 +111,10 @@ def getargspec(func):
112111
callable = lambda x: hasattr(x, '__call__')
113112
imap = map
114113

114+
def _wsgi_recode(src, target='utf8'):
115+
return src.encode('latin1').decode(target)
116+
117+
115118
def _raise(*a):
116119
raise a[0](a[1]).with_traceback(a[2])
117120

@@ -679,11 +682,8 @@ def mountpoint_wrapper():
679682
def start_response(status, headerlist, exc_info=None):
680683
if exc_info:
681684
_raise(*exc_info)
682-
# Errors here mean that the mounted WSGI app did not
683-
# follow PEP-3333 (which requires latin1) or used a
684-
# pre-encoding other than utf8 :/
685-
status = status.encode('latin1').decode('utf8')
686-
headerlist = [(k, v.encode('latin1').decode('utf8'))
685+
status = _wsgi_recode(status)
686+
headerlist = [(k, _wsgi_recode(v))
687687
for (k, v) in headerlist]
688688
rs.status = status
689689
for name, value in headerlist:
@@ -934,7 +934,7 @@ def default_error_handler(self, res):
934934

935935
def _handle(self, environ):
936936
path = environ['bottle.raw_path'] = environ['PATH_INFO']
937-
environ['PATH_INFO'] = path.encode('latin1').decode('utf8', 'ignore')
937+
environ['PATH_INFO'] = _wsgi_recode(path)
938938

939939
environ['bottle.app'] = self
940940
request.bind(environ)
@@ -1158,7 +1158,8 @@ def get_header(self, name, default=None):
11581158
def cookies(self):
11591159
""" Cookies parsed into a :class:`FormsDict`. Signed cookies are NOT
11601160
decoded. Use :meth:`get_cookie` if you expect signed cookies. """
1161-
cookies = SimpleCookie(self.environ.get('HTTP_COOKIE', '')).values()
1161+
cookie_header = _wsgi_recode(self.environ.get('HTTP_COOKIE', ''))
1162+
cookies = SimpleCookie(cookie_header).values()
11621163
return FormsDict((c.key, c.value) for c in cookies)
11631164

11641165
def get_cookie(self, key, default=None, secret=None, digestmod=hashlib.sha256):
@@ -1186,7 +1187,7 @@ def query(self):
11861187
not to be confused with "URL wildcards" as they are provided by the
11871188
:class:`Router`. """
11881189
get = self.environ['bottle.get'] = FormsDict()
1189-
pairs = _parse_qsl(self.environ.get('QUERY_STRING', ''))
1190+
pairs = _parse_qsl(self.environ.get('QUERY_STRING', ''), 'utf8')
11901191
for key, value in pairs:
11911192
get[key] = value
11921193
return get
@@ -1198,7 +1199,6 @@ def forms(self):
11981199
:class:`FormsDict`. All keys and values are strings. File uploads
11991200
are stored separately in :attr:`files`. """
12001201
forms = FormsDict()
1201-
forms.recode_unicode = self.POST.recode_unicode
12021202
for name, item in self.POST.allitems():
12031203
if not isinstance(item, FileUpload):
12041204
forms[name] = item
@@ -1222,7 +1222,6 @@ def files(self):
12221222
12231223
"""
12241224
files = FormsDict()
1225-
files.recode_unicode = self.POST.recode_unicode
12261225
for name, item in self.POST.allitems():
12271226
if isinstance(item, FileUpload):
12281227
files[name] = item
@@ -1345,12 +1344,11 @@ def POST(self):
13451344
# We default to application/x-www-form-urlencoded for everything that
13461345
# is not multipart and take the fast path (also: 3.1 workaround)
13471346
if not content_type.startswith('multipart/'):
1348-
body = self._get_body_string(self.MEMFILE_MAX).decode('latin1')
1349-
for key, value in _parse_qsl(body):
1347+
body = self._get_body_string(self.MEMFILE_MAX).decode('utf8')
1348+
for key, value in _parse_qsl(body, 'utf8'):
13501349
post[key] = value
13511350
return post
13521351

1353-
post.recode_unicode = False
13541352
charset = options.get("charset", "utf8")
13551353
boundary = options.get("boundary")
13561354
if not boundary:
@@ -2134,49 +2132,32 @@ def getall(self, key):
21342132

21352133
class FormsDict(MultiDict):
21362134
""" This :class:`MultiDict` subclass is used to store request form data.
2137-
Additionally to the normal dict-like item access methods (which return
2138-
unmodified data as native strings), this container also supports
2139-
attribute-like access to its values. Attributes are automatically de-
2140-
or recoded to match :attr:`input_encoding` (default: 'utf8'). Missing
2141-
attributes default to an empty string. """
2142-
2143-
#: Encoding used for attribute values.
2144-
input_encoding = 'utf8'
2145-
#: If true (default), unicode strings are first encoded with `latin1`
2146-
#: and then decoded to match :attr:`input_encoding`.
2147-
recode_unicode = True
2148-
2149-
def _fix(self, s, encoding=None):
2150-
if isinstance(s, unicode) and self.recode_unicode: # Python 3 WSGI
2151-
return s.encode('latin1').decode(encoding or self.input_encoding)
2152-
elif isinstance(s, bytes): # Python 2 WSGI
2153-
return s.decode(encoding or self.input_encoding)
2154-
else:
2155-
return s
2135+
Additionally to the normal dict-like item access methods, this container
2136+
also supports attribute-like access to its values. Missing attributes
2137+
default to an empty string.
2138+
2139+
.. versionchanged:: 0.14
2140+
All keys and values are now decoded as utf8 by default, item and
2141+
attribute access will return the same string.
2142+
"""
21562143

21572144
def decode(self, encoding=None):
2158-
""" Returns a copy with all keys and values de- or recoded to match
2159-
:attr:`input_encoding`. Some libraries (e.g. WTForms) want a
2160-
unicode dictionary. """
2145+
""" (deprecated) Starting with 0.13 all keys and values are already
2146+
correctly decoded. """
21612147
copy = FormsDict()
2162-
enc = copy.input_encoding = encoding or self.input_encoding
2163-
copy.recode_unicode = False
21642148
for key, value in self.allitems():
2165-
copy.append(self._fix(key, enc), self._fix(value, enc))
2149+
copy[key] = value
21662150
return copy
21672151

21682152
def getunicode(self, name, default=None, encoding=None):
2169-
""" Return the value as a unicode string, or the default. """
2170-
try:
2171-
return self._fix(self[name], encoding)
2172-
except (UnicodeError, KeyError):
2173-
return default
2153+
""" (deprecated) Return the value as a unicode string, or the default. """
2154+
return self.get(name, default)
21742155

21752156
def __getattr__(self, name, default=unicode()):
21762157
# Without this guard, pickle generates a cryptic TypeError:
21772158
if name.startswith('__') and name.endswith('__'):
21782159
return super(FormsDict, self).__getattr__(name)
2179-
return self.getunicode(name, default=default)
2160+
return self.get(name, default=default)
21802161

21812162
class HeaderDict(MultiDict):
21822163
""" A case-insensitive version of :class:`MultiDict` that defaults to
@@ -2218,14 +2199,7 @@ def filter(self, names):
22182199

22192200
class WSGIHeaderDict(DictMixin):
22202201
""" This dict-like class wraps a WSGI environ dict and provides convenient
2221-
access to HTTP_* fields. Keys and values are native strings
2222-
(2.x bytes or 3.x unicode) and keys are case-insensitive. If the WSGI
2223-
environment contains non-native string values, these are de- or encoded
2224-
using a lossless 'latin1' character set.
2225-
2226-
The API will remain stable even on changes to the relevant PEPs.
2227-
Currently PEP 333, 444 and 3333 are supported. (PEP 444 is the only one
2228-
that uses non-native strings.)
2202+
access to HTTP_* fields. Header names are case-insensitive and titled by default.
22292203
"""
22302204
#: List of keys that do not have a ``HTTP_`` prefix.
22312205
cgikeys = ('CONTENT_TYPE', 'CONTENT_LENGTH')
@@ -2241,16 +2215,11 @@ def _ekey(self, key):
22412215
return 'HTTP_' + key
22422216

22432217
def raw(self, key, default=None):
2244-
""" Return the header value as is (may be bytes or unicode). """
2218+
""" Return the header value as is (not utf8-translated). """
22452219
return self.environ.get(self._ekey(key), default)
22462220

22472221
def __getitem__(self, key):
2248-
val = self.environ[self._ekey(key)]
2249-
if isinstance(val, unicode):
2250-
val = val.encode('latin1').decode('utf8')
2251-
else:
2252-
val = val.decode('utf8')
2253-
return val
2222+
return _wsgi_recode(self.environ[self._ekey(key)])
22542223

22552224
def __setitem__(self, key, value):
22562225
raise TypeError("%s is read-only." % self.__class__)
@@ -2684,8 +2653,6 @@ def filename(self):
26842653
or dashes are removed. The filename is limited to 255 characters.
26852654
"""
26862655
fname = self.raw_filename
2687-
if not isinstance(fname, unicode):
2688-
fname = fname.decode('utf8', 'ignore')
26892656
fname = normalize('NFKD', fname)
26902657
fname = fname.encode('ASCII', 'ignore').decode('ASCII')
26912658
fname = os.path.basename(fname.replace('\\', os.path.sep))
@@ -2966,14 +2933,14 @@ def _parse_http_header(h):
29662933
return values
29672934

29682935

2969-
def _parse_qsl(qs):
2936+
def _parse_qsl(qs, encoding="utf8"):
29702937
r = []
29712938
for pair in qs.split('&'):
29722939
if not pair: continue
29732940
nv = pair.split('=', 1)
29742941
if len(nv) != 2: nv.append('')
2975-
key = urlunquote(nv[0].replace('+', ' '))
2976-
value = urlunquote(nv[1].replace('+', ' '))
2942+
key = urlunquote(nv[0].replace('+', ' '), encoding)
2943+
value = urlunquote(nv[1].replace('+', ' '), encoding)
29772944
r.append((key, value))
29782945
return r
29792946

@@ -3283,7 +3250,7 @@ def feed(self, line, nl=""):
32833250
return self.write_header(line, nl)
32843251

32853252
def write_header(self, line, nl):
3286-
line = line.decode(self.charset)
3253+
line = str(line, self.charset)
32873254

32883255
if not nl:
32893256
raise MultipartError("Unexpected end of line in header.")
@@ -3355,8 +3322,7 @@ def is_buffered(self):
33553322
@property
33563323
def value(self):
33573324
""" Data decoded with the specified charset """
3358-
3359-
return self.raw.decode(self.charset)
3325+
return str(self.raw, self.charset)
33603326

33613327
@property
33623328
def raw(self):

docs/changelog.rst

+8-4
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,19 @@ Release Notes
88
Release 0.14 (in development)
99
=============================
1010

11-
.. rubric:: Removed APIs (deprecated since 0.13)
11+
.. rubric:: Removed APIs
1212

13-
* Dropped support for Python 2 and removed helpers and workarounds that only make sense in a Python 2/3 dual codebase (e.g. ``tonat()`` or the ``py3k`` flag).
13+
* Dropped support for Python 2 and removed workarounds or helpers that only make sense in a Python 2/3 dual codebase.
1414
* Removed the ``RouteReset`` exception and associated logic.
1515
* Removed the `bottle.py` console script entrypoint in favour of the new `bottle` script. You can still execute `bottle.py` directly or via `python -m bottle`. The only change is that the command installed by pip or similar tools into the bin/Scripts folder of the (virtual) environment is now called `bottle` to avoid circular import errors.
1616

17-
.. rubric:: Changes
17+
.. rubric:: Changed APIs
18+
19+
* ``bottle.FormsDict`` no longer translates between PEP-3333 `latin1` and the correct `utf8` encoding on demand. The `getunicode()` and `decode()` methods are deprecated and do nothing, as all values are already decoded correctly.
20+
21+
.. rubric:: New features
1822

19-
* ``bottle.HTTPError`` raised on Invalid JSON now include the underlying exception in their ``exception`` field.
23+
* ``bottle.HTTPError`` raised on Invalid JSON now include the underlying exception in the ``exception`` field.
2024

2125

2226
Release 0.13

docs/tutorial.rst

+3-13
Original file line numberDiff line numberDiff line change
@@ -552,28 +552,18 @@ Property Data source
552552

553553
Bottle uses a special type of dictionary to store those parameters. :class:`FormsDict` behaves like a normal dictionary, but has some additional features to make your life easier.
554554

555-
First of all, :class:`FormsDict` is a subclass of :class:`MultiDict` and can store more than one value per key. The standard dictionary access methods will only return the first of many values, but the :meth:`MultiDict.getall` method returns a (possibly empty) list of all values for a specific key::
555+
First of all, :class:`FormsDict` is a subclass of :class:`MultiDict` and can store more than one value per key. Only the first value is returned by default, but :meth:`MultiDict.getall` can be used to get a (possibly empty) list of all values for a specific key::
556556

557557
for choice in request.forms.getall('multiple_choice'):
558558
do_something(choice)
559559

560-
To simplify dealing with lots of unreliable user input, :class:`FormsDict` exposes all its values as attributes, but with a twist: These virtual attributes always return properly encoded unicode strings, even if the value is missing or character decoding fails. They never return ``None`` or throw an exception, but return an empty string instead::
560+
Attribute-like access is also supported, returning empty strings for missing values. This simplifies code a lot when dealing with lots of optional attributes::
561561

562562
name = request.query.name # may be an empty string
563563

564564
.. rubric:: A word on unicode and character encodings
565565

566-
HTTP is a byte-based wire protocol. The server has to decode byte strings somehow before they are passed to the application. To be on the safe side, WSGI suggests ISO-8859-1 (aka latin1), a reversible single-byte codec that can be re-encoded with a different encoding later. Bottle does that for :meth:`FormsDict.getunicode` and attribute access, but not for :meth:`FormsDict.get` or item-access. These return the unchanged values as provided by the server implementation, which is probably not what you want.
567-
568-
::
569-
570-
>>> request.query['city']
571-
'Göttingen' # An utf8 string provisionally decoded as ISO-8859-1 by the server
572-
>>> request.query.city
573-
'Göttingen' # The same string correctly re-encoded as utf8 by bottle
574-
575-
If you need the whole dictionary with correctly decoded values (e.g. for WTForms), you can call :meth:`FormsDict.decode` to get a fully re-encoded copy.
576-
566+
Unicode characters in the request path, query parameters or cookies are a bit tricky. HTTP is a very old byte-based protocol that predates unicode and lacks explicit encoding information. This is why WSGI servers have to fall back on `ISO-8859-1` (aka `latin1`, a reversible input encoding) for those strings. Modern browsers default to `utf8`, though. It's a bit much to ask application developers to translate every single user input string to the correct encoding manually. Bottle makes this easy and just assumes `utf8` for everything. All strings returned by Bottle APIs support the full range of unicode characters, as long as the webpage or HTTP client follows best practices and does not break with established standards.
577567

578568
Query Parameters
579569
--------------------------------------------------------------------------------

test/test_environ.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,8 @@ def test_get(self):
168168
self.assertEqual(['b'], request.query.getall('b'))
169169
self.assertEqual('1', request.query['a'])
170170
self.assertEqual('b', request.query['b'])
171-
self.assertEqual(touni(tob('瓶'), 'latin1'), request.query['cn'])
172-
self.assertEqual(touni('瓶'), request.query.cn)
171+
self.assertEqual('瓶', request.query['cn'])
172+
self.assertEqual('瓶', request.query.cn)
173173

174174
def test_post(self):
175175
""" Environ: POST data """
@@ -189,8 +189,8 @@ def test_post(self):
189189
self.assertEqual('b', request.POST['b'])
190190
self.assertEqual('', request.POST['c'])
191191
self.assertEqual('', request.POST['d'])
192-
self.assertEqual(touni(tob('瓶'), 'latin1'), request.POST['cn'])
193-
self.assertEqual(touni('瓶'), request.POST.cn)
192+
self.assertEqual('瓶', request.POST['cn'])
193+
self.assertEqual('瓶', request.POST.cn)
194194

195195
def test_bodypost(self):
196196
sq = tob('foobar')
@@ -890,10 +890,6 @@ def test_native(self):
890890
self.env['HTTP_TEST_HEADER'] = 'foobar'
891891
self.assertEqual(self.headers['Test-header'], 'foobar')
892892

893-
def test_bytes(self):
894-
self.env['HTTP_TEST_HEADER'] = tob('foobar')
895-
self.assertEqual(self.headers['Test-Header'], 'foobar')
896-
897893
def test_unicode(self):
898894
self.env['HTTP_TEST_HEADER'] = touni('foobar')
899895
self.assertEqual(self.headers['Test-Header'], 'foobar')

test/test_fileupload.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,7 @@ def test_filename(self):
3333
self.assertFilename('.name.cfg', 'name.cfg')
3434
self.assertFilename(' . na me . ', 'na-me')
3535
self.assertFilename('path/', 'empty')
36-
self.assertFilename(bottle.tob('ümläüts$'), 'umlauts')
37-
self.assertFilename(bottle.touni('ümläüts$'), 'umlauts')
36+
self.assertFilename('ümläüts$', 'umlauts')
3837
self.assertFilename('', 'empty')
3938
self.assertFilename('a'+'b'*1337+'c', 'a'+'b'*254)
4039

test/test_formsdict.py

+4-18
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,11 @@
77
class TestFormsDict(unittest.TestCase):
88
def test_attr_access(self):
99
""" FomsDict.attribute returs string values as unicode. """
10-
d = FormsDict(py2=tob('瓶'), py3=tob('瓶').decode('latin1'))
11-
self.assertEqual(touni('瓶'), d.py2)
12-
self.assertEqual(touni('瓶'), d.py3)
10+
d = FormsDict(py3='瓶')
11+
self.assertEqual('瓶', d.py3)
12+
self.assertEqual('瓶', d["py3"])
1313

1414
def test_attr_missing(self):
1515
""" FomsDict.attribute returs u'' on missing keys. """
1616
d = FormsDict()
17-
self.assertEqual(touni(''), d.missing)
18-
19-
def test_attr_unicode_error(self):
20-
""" FomsDict.attribute returs u'' on UnicodeError. """
21-
d = FormsDict(latin=touni('öäüß').encode('latin1'))
22-
self.assertEqual(touni(''), d.latin)
23-
d.input_encoding = 'latin1'
24-
self.assertEqual(touni('öäüß'), d.latin)
25-
26-
def test_decode_method(self):
27-
d = FormsDict(py2=tob('瓶'), py3=tob('瓶').decode('latin1'))
28-
d = d.decode()
29-
self.assertFalse(d.recode_unicode)
30-
self.assertTrue(hasattr(list(d.keys())[0], 'encode'))
31-
self.assertTrue(hasattr(list(d.values())[0], 'encode'))
17+
self.assertEqual('', d.missing)

0 commit comments

Comments
 (0)