Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions Lib/test/test_json/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,31 @@ def test_ascii_non_printable_encode(self):
self.assertEqual(self.dumps(u, ensure_ascii=False),
'"\\b\\t\\n\\f\\r\\u0000\\u001f\x7f"')

def test_ensure_ascii_false_long_string_paths(self):
    # Drive the ensure_ascii=False escape-size scan with strings of 5 to 44
    # characters, so inputs fall on both sides of the short-string guard
    # and the interesting character lands at every offset relative to the
    # 8-byte scan windows.
    encode = self.dumps
    decode = self.loads

    # Characters that must be escaped (quote, backslash, controls), ones
    # that are kept as-is in 1-byte strings (0x7f, Latin-1), and ones that
    # force a wider representation (BMP, astral).
    specials = ('"', "\\", "\n", "\x00", "\x1f", "\x7f", "\xe9",
                "中", "\U0001f600")
    for offset in range(40):
        prefix = "a" * offset
        for special in specials:
            text = prefix + special + "tail"
            encoded = encode(text, ensure_ascii=False)
            # Whatever path the encoder took, decoding must give back
            # the original string.
            self.assertEqual(decode(encoded), text)

    # With nothing to escape, the output is the input wrapped in quotes;
    # Latin-1 characters and 0x7f pass through unchanged.
    verbatim = ("x" * 20, "\xe9" * 20,
                "kept latin1 \xe9\xff \x7f text " * 3)
    for text in verbatim:
        self.assertEqual(encode(text, ensure_ascii=False),
                         '"' + text + '"')

    # A long clean run must not hide a trailing quote, backslash or
    # control character: each one is still escaped.
    long_run = "a" * 20
    escaped_tails = (
        ('"', '\\""'),
        ("\\", '\\\\"'),
        ("\x01", '\\u0001"'),
    )
    for raw, escaped in escaped_tails:
        self.assertEqual(encode(long_run + raw, ensure_ascii=False),
                         '"' + long_run + escaped)

def test_ascii_non_printable_decode(self):
self.assertEqual(self.loads('"\\b\\t\\n\\f\\r"'),
'\b\t\n\f\r')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Speed up :func:`json.dumps` with ``ensure_ascii=False`` for strings made up of
long runs of characters that need no escaping, by scanning eight bytes at a
time. Strings shorter than 16 characters and strings containing characters
above U+00FF skip the new scan, and strings that need escaping fall back to
the existing per-character loop. Patch by Bernát Gábor.
30 changes: 30 additions & 0 deletions Modules/_json.c
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,36 @@ escape_size(const void *input, int kind, Py_ssize_t input_chars)
Py_ssize_t i;
Py_ssize_t output_size;

/* SWAR no-escape fast path (1-byte): needs-escape is c == '"' || c == '\\'
|| c < 0x20; non-ASCII (Latin-1 >= 0x80) is kept verbatim here. A length
guard keeps short strings on the original per-character loop. */
if (kind == PyUnicode_1BYTE_KIND && input_chars >= 16
&& input_chars < PY_SSIZE_T_MAX - 2) {
const Py_UCS1 *p = (const Py_UCS1 *)input;
const uint64_t ones = 0x0101010101010101ULL;
const uint64_t high = 0x8080808080808080ULL;
const uint64_t bq = 0x22ULL * ones, bs = 0x5cULL * ones, bc = 0xE0ULL * ones;
Py_ssize_t j = 0;
int needs_escape = 0;
for (; j + 8 <= input_chars; j += 8) {
uint64_t w;
memcpy(&w, p + j, 8);
uint64_t mq = w ^ bq; mq = (mq - ones) & ~mq & high;
uint64_t ms = w ^ bs; ms = (ms - ones) & ~ms & high;
uint64_t vc = w & bc; uint64_t mlo = (vc - ones) & ~vc & high;
if (mq | ms | mlo) { needs_escape = 1; break; }
}
if (!needs_escape) {
for (; j < input_chars; j++) {
Py_UCS1 c = p[j];
if (c == '"' || c == '\\' || c < 0x20) { needs_escape = 1; break; }
}
}
if (!needs_escape) {
return input_chars + 2;
}
}

/* Compute the output size */
for (i = 0, output_size = 2; i < input_chars; i++) {
Py_UCS4 c = PyUnicode_READ(kind, input, i);
Expand Down
Loading