"""
Test the implementation of the PEP 540: the UTF-8 Mode.
"""

import locale
import subprocess
import sys
import textwrap
import unittest
from test import support
from test.support.script_helper import assert_python_ok, assert_python_failure
from test.support import os_helper


MS_WINDOWS = (sys.platform == 'win32')
POSIX_LOCALES = ('C', 'POSIX')
VXWORKS = (sys.platform == "vxworks")


class UTF8ModeTests(unittest.TestCase):
    # Environment applied to every child interpreter unless a test overrides
    # a key through get_output(**kw): PYTHONUTF8 and
    # PYTHONLEGACYWINDOWSFSENCODING are blanked and PYTHONCOERCECLOCALE is
    # set to '0', so the parent environment cannot influence the result.
    DEFAULT_ENV = {
        'PYTHONUTF8': '',
        'PYTHONLEGACYWINDOWSFSENCODING': '',
        'PYTHONCOERCECLOCALE': '0',
    }

    def posix_locale(self):
        """Return True if the current LC_CTYPE locale is "C" or "POSIX"."""
        # Passing None queries the locale without changing it.
        loc = locale.setlocale(locale.LC_CTYPE, None)
        return (loc in POSIX_LOCALES)

    def get_output(self, *args, failure=False, **kw):
        """Run a child Python with *args* and return its decoded output.

        Keyword arguments are merged over DEFAULT_ENV and forwarded to the
        script_helper runner (which treats them as environment variables).

        If *failure* is false, the child must succeed and element 1 of the
        runner's result (stdout) is returned.  If *failure* is true, the
        child must fail and element 2 (stderr) is returned.  In both cases
        the bytes are decoded with the default codec and trailing newline
        characters are stripped.
        """
        kw = dict(self.DEFAULT_ENV, **kw)
        if failure:
            out = assert_python_failure(*args, **kw)
            out = out[2]
        else:
            out = assert_python_ok(*args, **kw)
            out = out[1]
        return out.decode().rstrip("\n\r")

    @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
    def test_posix_locale(self):
        # The POSIX locale enables the UTF-8 Mode
        code = 'import sys; print(sys.flags.utf8_mode)'

        for loc in POSIX_LOCALES:
            with self.subTest(LC_ALL=loc):
                out = self.get_output('-c', code, LC_ALL=loc)
                self.assertEqual(out, '1')

    def test_xoption(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        out = self.get_output('-X', 'utf8', '-c', code)
        self.assertEqual(out, '1')

        # undocumented but accepted syntax: -X utf8=1
        out = self.get_output('-X', 'utf8=1', '-c', code)
        self.assertEqual(out, '1')

        out = self.get_output('-X', 'utf8=0', '-c', code)
        self.assertEqual(out, '0')

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 Mode
            # and has the priority over -X utf8
            out = self.get_output('-X', 'utf8', '-c', code,
                                  PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(out, '0')

    def test_env_var(self):
        code = 'import sys; print(sys.flags.utf8_mode)'

        out = self.get_output('-c', code, PYTHONUTF8='1')
        self.assertEqual(out, '1')

        out = self.get_output('-c', code, PYTHONUTF8='0')
        self.assertEqual(out, '0')

        # -X utf8 has the priority over PYTHONUTF8
        out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
        self.assertEqual(out, '0')

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over PYTHONUTF8
            out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1',
                                  PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(out, '0')

        # Cannot test with the POSIX locale, since the POSIX locale enables
        # the UTF-8 mode
        if not self.posix_locale():
            # PYTHONUTF8 should be ignored if -E is used
            out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
            self.assertEqual(out, '0')

        # invalid mode
        out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
        self.assertIn('invalid PYTHONUTF8 environment variable value',
                      out.rstrip())

    def test_filesystemencoding(self):
        code = textwrap.dedent('''
            import sys
            print("{}/{}".format(sys.getfilesystemencoding(),
                                 sys.getfilesystemencodeerrors()))
        ''')

        # The expected error handler differs between Windows and the rest
        if MS_WINDOWS:
            expected = 'utf-8/surrogatepass'
        else:
            expected = 'utf-8/surrogateescape'

        out = self.get_output('-X', 'utf8', '-c', code)
        self.assertEqual(out, expected)

        if MS_WINDOWS:
            # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
            # and has the priority over -X utf8 and PYTHONUTF8
            out = self.get_output('-X', 'utf8', '-c', code,
                                  PYTHONUTF8='strict',
                                  PYTHONLEGACYWINDOWSFSENCODING='1')
            self.assertEqual(out, 'mbcs/replace')

    def test_stdio(self):
        code = textwrap.dedent('''
            import sys
            print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
            print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
            print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
        ''')

        out = self.get_output('-X', 'utf8', '-c', code,
                              PYTHONIOENCODING='')
        self.assertEqual(out.splitlines(),
                         ['stdin: utf-8/surrogateescape',
                          'stdout: utf-8/surrogateescape',
                          'stderr: utf-8/backslashreplace'])

        # PYTHONIOENCODING has the priority over the UTF-8 Mode
        # (enabled here with -X utf8)
        out = self.get_output('-X', 'utf8', '-c', code,
                              PYTHONIOENCODING="latin1")
        self.assertEqual(out.splitlines(),
                         ['stdin: iso8859-1/strict',
                          'stdout: iso8859-1/strict',
                          'stderr: iso8859-1/backslashreplace'])

        # Only the error handler is given: the encoding stays utf-8
        out = self.get_output('-X', 'utf8', '-c', code,
                              PYTHONIOENCODING=":namereplace")
        self.assertEqual(out.splitlines(),
                         ['stdin: utf-8/namereplace',
                          'stdout: utf-8/namereplace',
                          'stderr: utf-8/backslashreplace'])

    def test_io(self):
        # open() without an explicit encoding reports utf-8/strict when
        # PYTHONUTF8=1
        code = textwrap.dedent('''
            import sys
            filename = sys.argv[1]
            with open(filename) as fp:
                print(f"{fp.encoding}/{fp.errors}")
        ''')
        filename = __file__

        out = self.get_output('-c', code, filename, PYTHONUTF8='1')
        self.assertEqual(out.lower(), 'utf-8/strict')

    def _check_io_encoding(self, module, encoding=None, errors=None):
        """Check that explicit open() arguments are kept under PYTHONUTF8=1.

        A child process imports open() from *module*, opens this test file
        with the given *encoding* and/or *errors* (each omitted from the call
        when falsy) and prints "encoding/errors" of the resulting file
        object.  An omitted value is expected to be reported as 'utf-8' or
        'strict' respectively.
        """
        filename = __file__

        # Encoding explicitly set
        args = []
        if encoding:
            args.append(f'encoding={encoding!r}')
        if errors:
            args.append(f'errors={errors!r}')
        code = textwrap.dedent('''
            import sys
            from %s import open
            filename = sys.argv[1]
            with open(filename, %s) as fp:
                print(f"{fp.encoding}/{fp.errors}")
        ''') % (module, ', '.join(args))
        out = self.get_output('-c', code, filename,
                              PYTHONUTF8='1')

        if not encoding:
            encoding = 'utf-8'
        if not errors:
            errors = 'strict'
        self.assertEqual(out.lower(), f'{encoding}/{errors}')

    def check_io_encoding(self, module):
        """Run _check_io_encoding() for encoding only, errors only, and both."""
        self._check_io_encoding(module, encoding="latin1")
        self._check_io_encoding(module, errors="namereplace")
        self._check_io_encoding(module,
                                encoding="latin1", errors="namereplace")

    def test_io_encoding(self):
        self.check_io_encoding('io')

    def test_pyio_encoding(self):
        self.check_io_encoding('_pyio')

    def test_locale_getpreferredencoding(self):
        code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
        out = self.get_output('-X', 'utf8', '-c', code)
        self.assertEqual(out, 'utf-8 utf-8')

        for loc in POSIX_LOCALES:
            with self.subTest(LC_ALL=loc):
                out = self.get_output('-X', 'utf8', '-c', code, LC_ALL=loc)
                self.assertEqual(out, 'utf-8 utf-8')

    @unittest.skipIf(MS_WINDOWS, 'test specific to Unix')
    def test_cmd_line(self):
        # The same non-ASCII argument, as raw UTF-8 bytes and as the two
        # possible decodings the child may report in sys.argv
        arg = 'h\xe9\u20ac'.encode('utf-8')
        arg_utf8 = arg.decode('utf-8')
        arg_ascii = arg.decode('ascii', 'surrogateescape')
        code = 'import locale, sys; print("%s:%s" % (locale.getpreferredencoding(), ascii(sys.argv[1:])))'

        def check(utf8_opt, expected, **kw):
            out = self.get_output('-X', utf8_opt, '-c', code, arg, **kw)
            # Only compare the part after "encoding:"; the full output is
            # used as the failure message.
            args = out.partition(':')[2].rstrip()
            self.assertEqual(args, ascii(expected), out)

        check('utf8', [arg_utf8])
        for loc in POSIX_LOCALES:
            with self.subTest(LC_ALL=loc):
                check('utf8', [arg_utf8], LC_ALL=loc)

        # With the UTF-8 Mode explicitly disabled, the expected decoding of
        # the argument in a POSIX locale depends on the platform
        if sys.platform == 'darwin' or support.is_android or VXWORKS:
            c_arg = arg_utf8
        elif sys.platform.startswith("aix"):
            c_arg = arg.decode('iso-8859-1')
        else:
            c_arg = arg_ascii
        for loc in POSIX_LOCALES:
            with self.subTest(LC_ALL=loc):
                check('utf8=0', [c_arg], LC_ALL=loc)

    def test_optim_level(self):
        # CPython: check that Py_Main() doesn't increment Py_OptimizeFlag
        # twice when -X utf8 requires to parse the configuration twice (when
        # the encoding changes after reading the configuration, the
        # configuration is read again with the new encoding).
        code = 'import sys; print(sys.flags.optimize)'
        out = self.get_output('-X', 'utf8', '-O', '-c', code)
        self.assertEqual(out, '1')
        out = self.get_output('-X', 'utf8', '-OO', '-c', code)
        self.assertEqual(out, '2')

        code = 'import sys; print(sys.flags.ignore_environment)'
        out = self.get_output('-X', 'utf8', '-E', '-c', code)
        self.assertEqual(out, '1')

    @unittest.skipIf(MS_WINDOWS,
                     "os.device_encoding() doesn't implement "
                     "the UTF-8 Mode on Windows")
    @support.requires_subprocess()
    def test_device_encoding(self):
        # Use stdout as TTY
        if not sys.stdout.isatty():
            self.skipTest("sys.stdout is not a TTY")

        # Use the per-process temporary name rather than a fixed 'out.txt',
        # so that an unrelated file in the current directory is never
        # overwritten or removed.  The ASCII variant is used because the
        # name is embedded in the child's -c source code.
        filename = os_helper.TESTFN_ASCII
        self.addCleanup(os_helper.unlink, filename)

        # The child writes its result to a file: its stdout must remain
        # the inherited TTY so that os.device_encoding() can be checked.
        code = (f'import os, sys; fd = sys.stdout.fileno(); '
                f'out = open({filename!r}, "w", encoding="utf-8"); '
                f'print(os.isatty(fd), os.device_encoding(fd), file=out); '
                f'out.close()')
        cmd = [sys.executable, '-X', 'utf8', '-c', code]
        # The stdout TTY is inherited to the child process
        proc = subprocess.run(cmd, text=True)
        self.assertEqual(proc.returncode, 0, proc)

        # In UTF-8 Mode, device_encoding(fd) returns "UTF-8" if fd is a TTY
        with open(filename, encoding="utf-8") as fp:
            out = fp.read().rstrip()
        self.assertEqual(out, 'True utf-8')


if __name__ == "__main__":
    unittest.main()