From 0d5d951251ec89a40cbb409341a303f8a68f8daf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bern=C3=A1t=20G=C3=A1bor?= <gaborjbernat@gmail.com>
Date: Wed, 3 Jun 2026 11:49:35 -0700
Subject: [PATCH 1/2] gh-150878: Speed up json.dumps(ensure_ascii=False) for
 long strings

escape_size() sizes the ensure_ascii=False encoder output one character at a
time; a character needs escaping only when c == '"' || c == '\\' || c < 0x20,
and non-ASCII is kept verbatim. For the one-byte representation, detect the
no-escape case eight bytes at a time and return the verbatim size directly; a
length guard keeps short strings on the original per-character loop. Strings
with characters above U+00FF keep the current path.

Output is byte-identical, verified against test_json and a 199-case dumps
differential in both ensure_ascii modes. json.dumps() on long 1-byte strings
runs up to 5.8x faster (4.2x for Latin-1 text); short keys and strings with
characters above U+00FF are unaffected.
---
 ...-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst |  4 +++
 Modules/_json.c                               | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst

diff --git a/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst b/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst
new file mode 100644
index 000000000000000..7da2b9f515b9c54
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-06-03-11-49-35.gh-issue-150878.ZCL1T0.rst
@@ -0,0 +1,4 @@
+Speed up :func:`json.dumps` with ``ensure_ascii=False`` for long strings that
+need no escaping, by scanning eight bytes at a time. Strings shorter than 16
+characters, strings that need escaping, and strings with characters above
+U+00FF are unaffected. Patch by Bernát Gábor.
diff --git a/Modules/_json.c b/Modules/_json.c
index 6c4f38834631d30..be44538558872fa 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -281,6 +281,36 @@ escape_size(const void *input, int kind, Py_ssize_t input_chars)
     Py_ssize_t i;
     Py_ssize_t output_size;
 
+    /* SWAR no-escape fast path (1-byte): needs-escape is c == '"' || c == '\\'
+       || c < 0x20; non-ASCII (Latin-1 >= 0x80) is kept verbatim here.  A length
+       guard keeps short strings on the original per-character loop. */
+    if (kind == PyUnicode_1BYTE_KIND && input_chars >= 16
+            && input_chars < PY_SSIZE_T_MAX - 2) {
+        const Py_UCS1 *p = (const Py_UCS1 *)input;
+        const uint64_t ones = 0x0101010101010101ULL;
+        const uint64_t high = 0x8080808080808080ULL;
+        const uint64_t bq = 0x22ULL * ones, bs = 0x5cULL * ones, bc = 0xE0ULL * ones;
+        Py_ssize_t j = 0;
+        int needs_escape = 0;
+        for (; j + 8 <= input_chars; j += 8) {
+            uint64_t w;
+            memcpy(&w, p + j, 8);
+            uint64_t mq = w ^ bq; mq = (mq - ones) & ~mq & high;
+            uint64_t ms = w ^ bs; ms = (ms - ones) & ~ms & high;
+            uint64_t vc = w & bc; uint64_t mlo = (vc - ones) & ~vc & high;
+            if (mq | ms | mlo) { needs_escape = 1; break; }
+        }
+        if (!needs_escape) {
+            for (; j < input_chars; j++) {
+                Py_UCS1 c = p[j];
+                if (c == '"' || c == '\\' || c < 0x20) { needs_escape = 1; break; }
+            }
+        }
+        if (!needs_escape) {
+            return input_chars + 2;
+        }
+    }
+
     /* Compute the output size */
     for (i = 0, output_size = 2; i < input_chars; i++) {
         Py_UCS4 c = PyUnicode_READ(kind, input, i);

From 27a63b9fe9b18355c40455da4ac28364f98dd746 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bern=C3=A1t=20G=C3=A1bor?= <gaborjbernat@gmail.com>
Date: Wed, 3 Jun 2026 14:59:35 -0700
Subject: [PATCH 2/2] Add tests exercising the ensure_ascii=False encoder paths

Cover long runs that cross the scan windows and the short-string guard, with
a special character at every offset in 1-byte and wider strings, plus the
no-escape verbatim fast path and the escaped fallback.
---
 Lib/test/test_json/test_unicode.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/Lib/test/test_json/test_unicode.py b/Lib/test/test_json/test_unicode.py
index 1aa9546dc463061..33aa78fd1b72c75 100644
--- a/Lib/test/test_json/test_unicode.py
+++ b/Lib/test/test_json/test_unicode.py
@@ -39,6 +39,31 @@ def test_ascii_non_printable_encode(self):
         self.assertEqual(self.dumps(u, ensure_ascii=False),
                          '"\\b\\t\\n\\f\\r\\u0000\\u001f\x7f"')
 
+    def test_ensure_ascii_false_long_string_paths(self):
+        # Exercise the encoder's escape-size scan for ensure_ascii=False over
+        # long runs that cross the 8-byte scan windows and the short-string
+        # guard: a special character at every offset, in 1-byte (ASCII and
+        # Latin-1) and wider (BMP, astral) strings.
+        dumps, loads = self.dumps, self.loads
+        for n in range(40):
+            for tail in ('"', "\\", "\n", "\x00", "\x1f", "\x7f", "\xe9",
+                         "中", "\U0001f600"):
+                s = "a" * n + tail + "tail"
+                with self.subTest(n=n, tail=tail):
+                    self.assertEqual(loads(dumps(s, ensure_ascii=False)), s)
+        # The no-escape fast path returns the string verbatim between quotes,
+        # including kept-as-is Latin-1 and 0x7f.
+        for s in ("x" * 20, "\xe9" * 20, "kept latin1 \xe9\xff \x7f text " * 3):
+            self.assertEqual(dumps(s, ensure_ascii=False), '"' + s + '"')
+        # The structural escapes and control chars are still escaped after a
+        # long no-escape run.
+        self.assertEqual(dumps("a" * 20 + '"', ensure_ascii=False),
+                         '"' + "a" * 20 + '\\""')
+        self.assertEqual(dumps("a" * 20 + "\\", ensure_ascii=False),
+                         '"' + "a" * 20 + '\\\\"')
+        self.assertEqual(dumps("a" * 20 + "\x01", ensure_ascii=False),
+                         '"' + "a" * 20 + '\\u0001"')
+
     def test_ascii_non_printable_decode(self):
         self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'),
                          '\b\t\n\f\r')