Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions Lib/test/test_json/test_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,25 @@ def test_limit_int(self):
with self.assertRaises(ValueError):
self.loads('1' * (maxdigits + 1))

def test_long_string_scan_paths(self):
# Exercise the string scan over long runs that cross the 8-byte scan
# windows: a terminator, a backslash escape and a \uXXXX escape at every
# offset, in 1-byte and wider (BMP, astral) strings.
loads = self.loads
for n in range(40):
run = "a" * n
self.assertEqual(loads('"' + run + '"'), run)
self.assertEqual(loads('"' + run + '\\nz"'), run + "\nz")
self.assertEqual(loads('"' + run + '\\u00e9z"'), run + "\xe9z")
self.assertEqual(loads('"' + "中" * n + '\\n"'), "中" * n + "\n")
self.assertEqual(loads('"' + "\U0001f600" * n + '"'), "\U0001f600" * n)
# Strict control-character detection at the window boundaries, and the
# non-strict path that keeps them.
for n in (7, 8, 15, 16, 17, 23, 24):
self.assertRaises(self.JSONDecodeError, loads, '"' + "a" * n + '\x01"')
self.assertEqual(loads('"' + "a" * n + '\x01"', strict=False),
"a" * n + "\x01")


# Concrete test class: runs every TestDecode case through the PyTest mixin.
# NOTE(review): PyTest presumably binds the pure-Python json implementation;
# it is defined outside this view -- confirm.
class TestPyDecode(TestDecode, PyTest): pass
# Concrete test class: runs every TestDecode case through the CTest mixin.
# NOTE(review): CTest presumably binds the C-accelerated json implementation;
# it is defined outside this view -- confirm.
class TestCDecode(TestDecode, CTest): pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Speed up :func:`json.loads` decoding of strings that contain long runs of
ordinary characters by scanning eight bytes at a time for the closing quote, a
backslash, or a control character. Strings containing non-Latin-1 characters
and short strings are unaffected. Patch by Bernát Gábor.
30 changes: 29 additions & 1 deletion Modules/_json.c
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,35 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
{
// Use tight scope variable to help register allocation.
Py_UCS4 d = 0;
for (next = end; next < len; next++) {
next = end;
/* For the 1-byte representation, skip 8 bytes at a time while none
is '"', '\\', or (strict) a control char < 0x20. The masks are
exact (no false negatives); the scalar loop below pins the exact
first special char and does the work. */
if (kind == PyUnicode_1BYTE_KIND) {
const Py_UCS1 *p = (const Py_UCS1 *)buf;
const uint64_t ones = 0x0101010101010101ULL;
const uint64_t high = 0x8080808080808080ULL;
const uint64_t bq = 0x22ULL * ones; /* '"' */
const uint64_t bs = 0x5cULL * ones; /* '\\' */
const uint64_t bc = 0xE0ULL * ones; /* (b & 0xE0)==0 iff b<0x20 */
while (next + 8 <= len) {
uint64_t w;
memcpy(&w, p + next, 8);
uint64_t mq = w ^ bq; mq = (mq - ones) & ~mq & high;
uint64_t ms = w ^ bs; ms = (ms - ones) & ~ms & high;
uint64_t mc = 0;
if (strict) {
uint64_t v = w & bc;
mc = (v - ones) & ~v & high;
}
if (mq | ms | mc) {
break;
}
next += 8;
}
}
for (; next < len; next++) {
d = PyUnicode_READ(kind, buf, next);
if (d == '"' || d == '\\') {
break;
Expand Down
Loading