diff --git a/Lib/test/test_json/test_unicode.py b/Lib/test/test_json/test_unicode.py
index 1aa9546dc46306..33aa78fd1b72c7 100644
--- a/Lib/test/test_json/test_unicode.py
+++ b/Lib/test/test_json/test_unicode.py
@@ -39,6 +39,41 @@ def test_ascii_non_printable_encode(self):
         self.assertEqual(self.dumps(u, ensure_ascii=False),
                          '"\\b\\t\\n\\f\\r\\u0000\\u001f\x7f"')
 
+    def test_ensure_ascii_false_long_string_paths(self):
+        # Exercise the encoder's escape-size scan for ensure_ascii=False.
+        # The C encoder scans 1-byte strings of 16+ characters eight bytes
+        # at a time and checks the remainder byte by byte; shorter and wider
+        # (BMP, astral) strings take the per-character loop.  Put a special
+        # character at every offset and vary what follows it: a 16-character
+        # suffix places it in each byte of a full 8-byte window, while a
+        # short or empty suffix also reaches the byte-by-byte remainder and
+        # strings under the length guard.
+        dumps, loads = self.dumps, self.loads
+        specials = ('"', "\\", "\n", "\x00", "\x1f", "\x7f", "\xe9",
+                    "中", "\U0001f600")
+        for n in range(40):
+            for suffix in ("", "tail", "t" * 16):
+                for special in specials:
+                    s = "a" * n + special + suffix
+                    with self.subTest(n=n, suffix=suffix, special=special):
+                        self.assertEqual(
+                            loads(dumps(s, ensure_ascii=False)), s)
+        # The no-escape fast path returns the string verbatim between quotes,
+        # including kept-as-is Latin-1 and 0x7f.
+        for s in ("x" * 20, "\xe9" * 20, "kept latin1 \xe9\xff \x7f text " * 3):
+            with self.subTest(s=s):
+                self.assertEqual(dumps(s, ensure_ascii=False), '"' + s + '"')
+        # The structural escapes and control chars are still escaped whether
+        # they are found by the 8-byte scan (leading position) or by the
+        # byte-by-byte remainder (after a long no-escape run).
+        for char, escaped in (('"', '\\"'), ("\\", "\\\\"),
+                              ("\x01", "\\u0001")):
+            for s, body in ((char + "a" * 20, escaped + "a" * 20),
+                            ("a" * 20 + char, "a" * 20 + escaped)):
+                with self.subTest(s=s):
+                    self.assertEqual(dumps(s, ensure_ascii=False),
+                                     '"' + body + '"')
+
     def test_ascii_non_printable_decode(self):
         self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'),
                          '\b\t\n\f\r')
diff --git a/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst b/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst
new file mode 100644
index 00000000000000..7da2b9f515b9c5
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst
@@ -0,0 +1,4 @@
+Speed up :func:`json.dumps` with ``ensure_ascii=False`` for strings made up of
+long runs of characters that need no escaping, by scanning eight bytes at a
+time. Short strings, strings that need escaping, and strings with characters
+above U+00FF are unaffected. Patch by Bernát Gábor.
diff --git a/Modules/_json.c b/Modules/_json.c
index 6c4f38834631d3..be44538558872f 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -281,6 +281,36 @@ escape_size(const void *input, int kind, Py_ssize_t input_chars)
     Py_ssize_t i;
     Py_ssize_t output_size;
 
+    /* SWAR no-escape fast path (1-byte): needs-escape is c == '"' || c == '\\'
+       || c < 0x20; non-ASCII (Latin-1 >= 0x80) is kept verbatim here. A length
+       guard keeps short strings on the original per-character loop.
*/
+    if (kind == PyUnicode_1BYTE_KIND && input_chars >= 16
+        && input_chars < PY_SSIZE_T_MAX - 2) {  /* upper bound: presumably so input_chars + 2 below cannot overflow */
+        const Py_UCS1 *p = (const Py_UCS1 *)input;
+        const uint64_t ones = 0x0101010101010101ULL;  /* 0x01 in every byte */
+        const uint64_t high = 0x8080808080808080ULL;  /* 0x80 in every byte */
+        const uint64_t bq = 0x22ULL * ones, bs = 0x5cULL * ones, bc = 0xE0ULL * ones;  /* '"', '\\', and the 0xE0 mask (all clear iff byte < 0x20) in every byte */
+        Py_ssize_t j = 0;
+        int needs_escape = 0;
+        for (; j + 8 <= input_chars; j += 8) {  /* complete 8-byte words only */
+            uint64_t w;
+            memcpy(&w, p + j, 8);  /* copy instead of a pointer cast: p + j need not be 8-byte aligned */
+            uint64_t mq = w ^ bq; mq = (mq - ones) & ~mq & high;  /* XOR zeroes matching bytes; result is nonzero iff some byte of w is '"' */
+            uint64_t ms = w ^ bs; ms = (ms - ones) & ~ms & high;  /* nonzero iff some byte of w is '\\' */
+            uint64_t vc = w & bc; uint64_t mlo = (vc - ones) & ~vc & high;  /* nonzero iff some byte of w is < 0x20 */
+            if (mq | ms | mlo) { needs_escape = 1; break; }
+        }
+        if (!needs_escape) {
+            for (; j < input_chars; j++) {  /* remaining 0-7 bytes, one at a time */
+                Py_UCS1 c = p[j];
+                if (c == '"' || c == '\\' || c < 0x20) { needs_escape = 1; break; }
+            }
+        }
+        if (!needs_escape) {
+            return input_chars + 2;  /* every character is copied as-is; + 2 matches the initial output_size of the loop below */
+        }
+    }  /* otherwise fall through to the per-character size computation */
+
     /* Compute the output size */
     for (i = 0, output_size = 2; i < input_chars; i++) {
         Py_UCS4 c = PyUnicode_READ(kind, input, i);