diff --git a/Lib/test/test_json/test_decode.py b/Lib/test/test_json/test_decode.py
index 1d51fb2de0e69e4..c881fe7f13aeba8 100644
--- a/Lib/test/test_json/test_decode.py
+++ b/Lib/test/test_json/test_decode.py
@@ -155,6 +155,35 @@ def test_limit_int(self):
             with self.assertRaises(ValueError):
                 self.loads('1' * (maxdigits + 1))
 
+    def test_long_string_scan_paths(self):
+        # Exercise the string scan over long runs that cross the 8-byte scan
+        # windows: a terminator, a backslash escape and a \uXXXX escape at every
+        # offset, in 1-byte (ASCII and high-bit Latin-1) and wider (BMP, astral)
+        # strings.
+        loads = self.loads
+        for n in range(40):
+            run = "a" * n
+            self.assertEqual(loads('"' + run + '"'), run)
+            self.assertEqual(loads('"' + run + '\\nz"'), run + "\nz")
+            self.assertEqual(loads('"' + run + '\\u00e9z"'), run + "\xe9z")
+            # Characters U+0080-U+00FF are still stored one byte each; they must
+            # not be mistaken for a terminator or a control character.
+            latin1 = "\xff" * n
+            self.assertEqual(loads('"' + latin1 + '"'), latin1)
+            self.assertEqual(loads('"' + latin1 + '\\n"'), latin1 + "\n")
+            self.assertEqual(loads('"' + "中" * n + '\\n"'), "中" * n + "\n")
+            self.assertEqual(loads('"' + "\U0001f600" * n + '"'), "\U0001f600" * n)
+        # Strict control-character detection at the window boundaries, and the
+        # non-strict path that keeps them.  U+001F is the last character that
+        # strict mode rejects; space (U+0020) and DEL (U+007F) are accepted.
+        for n in (7, 8, 15, 16, 17, 23, 24):
+            prefix = "a" * n
+            for ctrl in ("\x00", "\x01", "\x1f"):
+                doc = '"' + prefix + ctrl + '"'
+                self.assertRaises(self.JSONDecodeError, loads, doc)
+                self.assertEqual(loads(doc, strict=False), prefix + ctrl)
+            self.assertEqual(loads('"' + prefix + ' \x7f"'), prefix + " \x7f")
+
 
 class TestPyDecode(TestDecode, PyTest): pass
 class TestCDecode(TestDecode, CTest): pass
diff --git a/Misc/NEWS.d/next/Library/2026-06-03-09-23-45.gh-issue-150871.aEM9sM.rst b/Misc/NEWS.d/next/Library/2026-06-03-09-23-45.gh-issue-150871.aEM9sM.rst
new file mode 100644
index 000000000000000..3243e20374400ab
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-06-03-09-23-45.gh-issue-150871.aEM9sM.rst
@@ -0,0 +1,4 @@
+Speed up :func:`json.loads` decoding of strings that contain long runs of
+ordinary characters by scanning eight bytes at a time for the closing quote, a
+backslash, or a control character. Strings containing non-Latin-1 characters
+and short strings are unaffected. Patch by Bernát Gábor.
diff --git a/Modules/_json.c b/Modules/_json.c
index 6c4f38834631d30..bc953520824ee45 100644
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -501,7 +501,35 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
         {
             // Use tight scope variable to help register allocation.
             Py_UCS4 d = 0;
-            for (next = end; next < len; next++) {
+            next = end;
+            /* For the 1-byte representation, skip 8 bytes at a time while none
+               is '"', '\\', or (strict) a control char < 0x20.  For each mask,
+               (x - ones) & ~x & high is nonzero iff some byte of x is zero, so
+               none is missed; the scalar loop below pins the char and acts. */
+            if (kind == PyUnicode_1BYTE_KIND) {
+                const Py_UCS1 *p = (const Py_UCS1 *)buf;
+                const uint64_t ones = 0x0101010101010101ULL;
+                const uint64_t high = 0x8080808080808080ULL;
+                const uint64_t bq = 0x22ULL * ones; /* '"' */
+                const uint64_t bs = 0x5cULL * ones; /* '\\' */
+                const uint64_t bc = 0xE0ULL * ones; /* (b & 0xE0)==0 iff b<0x20 */
+                while (next + 8 <= len) {  /* a full 8-byte window remains */
+                    uint64_t w;  /* memcpy: alignment-safe 8-byte load */
+                    memcpy(&w, p + next, 8);
+                    uint64_t mq = w ^ bq; mq = (mq - ones) & ~mq & high;
+                    uint64_t ms = w ^ bs; ms = (ms - ones) & ~ms & high;
+                    uint64_t mc = 0;  /* stays 0 unless strict */
+                    if (strict) {
+                        uint64_t v = w & bc;
+                        mc = (v - ones) & ~v & high;
+                    }
+                    if (mq | ms | mc) {
+                        break;  /* window holds a special byte */
+                    }
+                    next += 8;
+                }
+            }
+            for (; next < len; next++) {  /* exact scan from here */
                 d = PyUnicode_READ(kind, buf, next);
                 if (d == '"' || d == '\\') {
                     break;