diff --git a/Misc/NEWS.d/next/Library/2026-06-04-10-44-36.gh-issue-150889.UYNLR_.rst b/Misc/NEWS.d/next/Library/2026-06-04-10-44-36.gh-issue-150889.UYNLR_.rst new file mode 100644 index 000000000000000..a5cc1da758637c5 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-04-10-44-36.gh-issue-150889.UYNLR_.rst @@ -0,0 +1 @@ +Speed up :func:`unicodedata.normalize` for the NFC and NFKC forms of non-ASCII text by up to a factor of 2. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 60df68216938134..bcdcc624e66f930 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -785,15 +785,19 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) static int find_nfc_index(const struct reindex* nfc, Py_UCS4 code) { - unsigned int index; - for (index = 0; nfc[index].start; index++) { - unsigned int start = nfc[index].start; - if (code < start) - return -1; - if (code <= start + nfc[index].count) { - unsigned int delta = code - start; - return nfc[index].index + delta; - } + /* The table is sorted by .start ascending with disjoint [start, start+count] + ranges and ends with a sentinel whose .start exceeds every codepoint, so + a single .start <= code test per entry also stops at the sentinel. Find + the first entry past code, then range-check the candidate (entry i - 1). 
*/ + unsigned int i; + for (i = 0; (Py_UCS4)nfc[i].start <= code; i++) { + } + if (i == 0) { + return -1; + } + unsigned int start = nfc[i - 1].start; + if (code <= start + nfc[i - 1].count) { + return nfc[i - 1].index + (code - start); } return -1; } diff --git a/Modules/unicodedata_db.h b/Modules/unicodedata_db.h index 9e88f5cca7115b0..1c961c5e329341c 100644 --- a/Modules/unicodedata_db.h +++ b/Modules/unicodedata_db.h @@ -629,7 +629,7 @@ static struct reindex nfc_first[] = { { 93539, 0, 388}, { 93543, 0, 389}, { 93545, 0, 390}, - {0,0,0} + {0x7fffffff, 0, 0} }; static struct reindex nfc_last[] = { @@ -680,7 +680,7 @@ static struct reindex nfc_last[] = { { 90398, 2, 67}, { 90409, 0, 70}, { 93543, 0, 71}, - {0,0,0} + {0x7fffffff, 0, 0} }; /* string literals */ diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 5db850ca2d1f0c0..76283d6b794a0b0 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -342,15 +342,21 @@ def makeunicodedata(unicode, trace): fprint("#define TOTAL_FIRST",total_first) fprint("#define TOTAL_LAST",total_last) fprint("struct reindex{int start;short count,index;};") + # The reindex tables are read only by find_nfc_index(), which scans + # forward while .start <= code. The trailing sentinel's .start must + # exceed every codepoint (so the scan stops with a single comparison) + # and fit the signed int .start field. 
+ nfc_sentinel = 0x7fffffff + assert sys.maxunicode < nfc_sentinel <= 0x7fffffff fprint("static struct reindex nfc_first[] = {") for start,end in comp_first_ranges: fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start])) - fprint(" {0,0,0}") + fprint(" {0x%x, 0, 0}" % nfc_sentinel) fprint("};\n") fprint("static struct reindex nfc_last[] = {") for start,end in comp_last_ranges: fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start])) - fprint(" {0,0,0}") + fprint(" {0x%x, 0, 0}" % nfc_sentinel) fprint("};\n") # FIXME: the following tables could be made static, and