Skip to content

Commit 13b17e2

Browse files
authored
gh-91156: Fix encoding="locale" in UTF-8 mode (GH-70056)
1 parent 7b87e8a commit 13b17e2

File tree

11 files changed

+35
-24
lines changed

11 files changed

+35
-24
lines changed

Doc/library/io.rst

+2-3
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ Text Encoding
112112
-------------
113113

114114
The default encoding of :class:`TextIOWrapper` and :func:`open` is
115-
locale-specific (:func:`locale.getpreferredencoding(False) <locale.getpreferredencoding>`).
115+
locale-specific (:func:`locale.getencoding`).
116116

117117
However, many developers forget to specify the encoding when opening text files
118118
encoded in UTF-8 (e.g. JSON, TOML, Markdown, etc...) since most Unix
@@ -948,8 +948,7 @@ Text I/O
948948
:class:`TextIOBase`.
949949

950950
*encoding* gives the name of the encoding that the stream will be decoded or
951-
encoded with. It defaults to
952-
:func:`locale.getpreferredencoding(False) <locale.getpreferredencoding>`.
951+
encoded with. It defaults to :func:`locale.getencoding()`.
953952
``encoding="locale"`` can be used to specify the current locale's encoding
954953
explicitly. See :ref:`io-text-encoding` for more information.
955954

Doc/using/windows.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -618,7 +618,7 @@ UTF-8 mode
618618

619619
Windows still uses legacy encodings for the system encoding (the ANSI Code
620620
Page). Python uses it for the default encoding of text files (e.g.
621-
:func:`locale.getpreferredencoding`).
621+
:func:`locale.getencoding`).
622622

623623
This may cause issues because UTF-8 is widely used on the internet
624624
and most Unix systems, including WSL (Windows Subsystem for Linux).

Lib/_pyio.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1988,7 +1988,7 @@ class TextIOWrapper(TextIOBase):
19881988
r"""Character and line based layer over a BufferedIOBase object, buffer.
19891989
19901990
encoding gives the name of the encoding that the stream will be
1991-
decoded or encoded with. It defaults to locale.getpreferredencoding(False).
1991+
decoded or encoded with. It defaults to locale.getencoding().
19921992
19931993
errors determines the strictness of encoding and decoding (see the
19941994
codecs.register) and defaults to "strict".
@@ -2021,7 +2021,9 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
20212021
self._check_newline(newline)
20222022
encoding = text_encoding(encoding)
20232023

2024-
if encoding == "locale":
2024+
if encoding == "locale" and sys.platform == "win32":
2025+
# On Unix, os.device_encoding() returns "utf-8" instead of the locale encoding
2026+
# in UTF-8 mode, so we use os.device_encoding() only on Windows.
20252027
try:
20262028
encoding = os.device_encoding(buffer.fileno()) or "locale"
20272029
except (AttributeError, UnsupportedOperation):
@@ -2034,7 +2036,7 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None,
20342036
# Importing locale may fail if Python is being built
20352037
encoding = "utf-8"
20362038
else:
2037-
encoding = locale.getpreferredencoding(False)
2039+
encoding = locale.getencoding()
20382040

20392041
if not isinstance(encoding, str):
20402042
raise ValueError("invalid encoding: %r" % encoding)

Lib/locale.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ def getdefaultlocale(envvars=('LC_ALL', 'LC_CTYPE', 'LANG', 'LANGUAGE')):
557557

558558
import warnings
559559
warnings.warn(
560-
"Use setlocale(), getpreferredencoding(False) and getlocale() instead",
560+
"Use setlocale(), getencoding() and getlocale() instead",
561561
DeprecationWarning, stacklevel=2
562562
)
563563

Lib/test/test_io.py

+1
Original file line numberDiff line numberDiff line change
@@ -2737,6 +2737,7 @@ def test_default_encoding(self):
27372737
os.environ.update(old_environ)
27382738

27392739
@support.cpython_only
2740+
@unittest.skipIf(sys.platform != "win32", "Windows-only test")
27402741
@unittest.skipIf(sys.flags.utf8_mode, "utf-8 mode is enabled")
27412742
def test_device_encoding(self):
27422743
# Issue 15989
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Make :class:`TextIOWrapper` use the locale encoding when ``encoding="locale"``
2+
is specified, even in UTF-8 mode.

Modules/_io/_iomodule.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,9 @@ it already exists), 'x' for creating and writing to a new file, and
9292
'a' for appending (which on some Unix systems, means that all writes
9393
append to the end of the file regardless of the current seek position).
9494
In text mode, if encoding is not specified the encoding used is platform
95-
dependent: locale.getpreferredencoding(False) is called to get the
96-
current locale encoding. (For reading and writing raw bytes use binary
97-
mode and leave encoding unspecified.) The available modes are:
95+
dependent: locale.getencoding() is called to get the current locale encoding.
96+
(For reading and writing raw bytes use binary mode and leave encoding
97+
unspecified.) The available modes are:
9898
9999
========= ===============================================================
100100
Character Meaning
@@ -196,7 +196,7 @@ static PyObject *
196196
_io_open_impl(PyObject *module, PyObject *file, const char *mode,
197197
int buffering, const char *encoding, const char *errors,
198198
const char *newline, int closefd, PyObject *opener)
199-
/*[clinic end generated code: output=aefafc4ce2b46dc0 input=1543f4511d2356a5]*/
199+
/*[clinic end generated code: output=aefafc4ce2b46dc0 input=5bb37f174cb2fb11]*/
200200
{
201201
unsigned i;
202202

Modules/_io/clinic/_iomodule.c.h

+4-4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/_io/clinic/textio.c.h

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/_io/textio.c

+13-5
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,7 @@ _io.TextIOWrapper.__init__
10231023
Character and line based layer over a BufferedIOBase object, buffer.
10241024
10251025
encoding gives the name of the encoding that the stream will be
1026-
decoded or encoded with. It defaults to locale.getpreferredencoding(False).
1026+
decoded or encoded with. It defaults to locale.getencoding().
10271027
10281028
errors determines the strictness of encoding and decoding (see
10291029
help(codecs.Codec) or the documentation for codecs.register) and
@@ -1055,12 +1055,12 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
10551055
const char *encoding, PyObject *errors,
10561056
const char *newline, int line_buffering,
10571057
int write_through)
1058-
/*[clinic end generated code: output=72267c0c01032ed2 input=77d8696d1a1f460b]*/
1058+
/*[clinic end generated code: output=72267c0c01032ed2 input=72590963698f289b]*/
10591059
{
10601060
PyObject *raw, *codec_info = NULL;
1061-
_PyIO_State *state = NULL;
10621061
PyObject *res;
10631062
int r;
1063+
int use_locale_encoding = 0; // Use locale encoding even in UTF-8 mode.
10641064

10651065
self->ok = 0;
10661066
self->detached = 0;
@@ -1076,6 +1076,7 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
10761076
}
10771077
else if (strcmp(encoding, "locale") == 0) {
10781078
encoding = NULL;
1079+
use_locale_encoding = 1;
10791080
}
10801081

10811082
if (errors == Py_None) {
@@ -1113,10 +1114,15 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
11131114
self->encodefunc = NULL;
11141115
self->b2cratio = 0.0;
11151116

1117+
#ifdef MS_WINDOWS
1118+
// On Unix, os.device_encoding() returns either the locale encoding or
1119+
// UTF-8, depending on whether UTF-8 mode is enabled.
1120+
// Since UTF-8 mode shouldn't affect `encoding="locale"`, we call
1121+
// os.device_encoding() only on Windows.
11161122
if (encoding == NULL) {
11171123
/* Try os.device_encoding(fileno) */
11181124
PyObject *fileno;
1119-
state = IO_STATE();
1125+
_PyIO_State *state = IO_STATE();
11201126
if (state == NULL)
11211127
goto error;
11221128
fileno = PyObject_CallMethodNoArgs(buffer, &_Py_ID(fileno));
@@ -1144,8 +1150,10 @@ _io_TextIOWrapper___init___impl(textio *self, PyObject *buffer,
11441150
Py_CLEAR(self->encoding);
11451151
}
11461152
}
1153+
#endif
1154+
11471155
if (encoding == NULL && self->encoding == NULL) {
1148-
if (_PyRuntime.preconfig.utf8_mode) {
1156+
if (_PyRuntime.preconfig.utf8_mode && !use_locale_encoding) {
11491157
_Py_DECLARE_STR(utf_8, "utf-8");
11501158
self->encoding = Py_NewRef(&_Py_STR(utf_8));
11511159
}

Tools/c-analyzer/TODO

-1
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,6 @@ Modules/_io/textio.c:PyId_close _Py_IDENTIFIER(
251251
Modules/_io/textio.c:PyId_decode _Py_IDENTIFIER(decode)
252252
Modules/_io/textio.c:PyId_fileno _Py_IDENTIFIER(fileno)
253253
Modules/_io/textio.c:PyId_flush _Py_IDENTIFIER(flush)
254-
Modules/_io/textio.c:PyId_getpreferredencoding _Py_IDENTIFIER(getpreferredencoding)
255254
Modules/_io/textio.c:PyId_isatty _Py_IDENTIFIER(isatty)
256255
Modules/_io/textio.c:PyId_mode _Py_IDENTIFIER(mode)
257256
Modules/_io/textio.c:PyId_name _Py_IDENTIFIER(name)

0 commit comments

Comments
 (0)