Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 6 additions & 15 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ _getrecord_ex(Py_UCS4 code)
if (code >= 0x110000)
index = 0;
else {
index = index1[(code>>SHIFT)];
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
index = unicodedata_record_get_record_index(code);
}

return &_PyUnicode_Database_Records[index];
Expand Down Expand Up @@ -493,9 +492,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
if (code < 0 || code >= 0x110000)
index = 0;
else {
index = decomp_index1[(code>>DECOMP_SHIFT)];
index = decomp_index2[(index<<DECOMP_SHIFT)+
(code&((1<<DECOMP_SHIFT)-1))];
index = unicodedata_decomp_get_decomp_index(code);
}

/* high byte is number of hex bytes (usually one or two), low byte
Expand Down Expand Up @@ -539,9 +536,7 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
*index = 0;
}
else {
*index = decomp_index1[(code>>DECOMP_SHIFT)];
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
(code&((1<<DECOMP_SHIFT)-1))];
*index = unicodedata_decomp_get_decomp_index(code);
}

/* high byte is number of hex bytes (usually one or two), low byte
Expand Down Expand Up @@ -711,7 +706,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
const void *data;
Py_UCS4 *output;
Py_ssize_t i, i1, o, len;
int f,l,index,index1,comb;
int f,l,index,comb;
Py_UCS4 code;
Py_ssize_t skipped[20];
int cskipped = 0;
Expand Down Expand Up @@ -810,9 +805,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
continue;
}
index = f*TOTAL_LAST + l;
index1 = comp_index[index >> COMP_SHIFT];
code = comp_data[(index1<<COMP_SHIFT)+
(index&((1<<COMP_SHIFT)-1))];
code = unicodedata_comp_get_comp_data(index);
if (code == 0)
goto not_combinable;

Expand Down Expand Up @@ -1396,9 +1389,7 @@ _getucname(PyObject *self,
}

/* get position of codepoint in order of names in the dawg */
offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
(code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
offset = unicodename_get_dawg_codepoint_pos(code);
if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
return 0;

Expand Down
10,323 changes: 3,518 additions & 6,805 deletions Modules/unicodedata_db.h

Large diffs are not rendered by default.

7,845 changes: 3,147 additions & 4,698 deletions Modules/unicodename_db.h

Large diffs are not rendered by default.

7 changes: 2 additions & 5 deletions Objects/unicodectype.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,8 @@ gettyperecord(Py_UCS4 code)

if (code >= 0x110000)
index = 0;
else
{
index = index1[(code>>SHIFT)];
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
else {
index = unicodetype_get_type_index(code);
}

return &_PyUnicode_TypeRecords[index];
Expand Down Expand Up @@ -285,4 +283,3 @@ int _PyUnicode_IsAlpha(Py_UCS4 ch)

return (ctype->flags & ALPHA_MASK) != 0;
}

3,635 changes: 1,038 additions & 2,597 deletions Objects/unicodetype_db.h

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions Tools/unicode/benchmark_unicodedata_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""Benchmark Python-level unicodedata.category() lookups.

Runs three fixed workloads:
- all Unicode code points
- BMP only
- ASCII only
"""

from __future__ import annotations

import statistics
import time
import unicodedata


# Inner repetitions per timed sample; amortizes timer overhead.
LOOPS = 5
# Independent timed samples per dataset; min/median/mean are reported.
SAMPLES = 7
# Fixed workloads: each value is one string containing every code point
# in the range, so a single pass touches the whole lookup table.
DATASETS = {
    "all": "".join(map(chr, range(0x110000))),
    "bmp": "".join(map(chr, range(0x10000))),
    "ascii": "".join(map(chr, range(0x80))),
}


def run_once(chars: str) -> tuple[float, int]:
    """Time LOOPS passes of unicodedata.category() over *chars*.

    Returns ``(elapsed_seconds, checksum)``.  The checksum folds the first
    two characters of every returned category string into an integer so
    the lookups cannot be optimized away and wrong results are visible.
    """
    lookup = unicodedata.category  # hoist the attribute lookup out of the hot loop
    total = 0
    start = time.perf_counter()
    remaining = LOOPS
    while remaining:
        for symbol in chars:
            cat = lookup(symbol)
            total += ord(cat[0]) + ord(cat[1])
        remaining -= 1
    duration = time.perf_counter() - start
    return duration, total


def benchmark(name: str, chars: str) -> None:
    """Measure *chars* SAMPLES times via run_once() and print a report.

    Output fields include the per-sample lookup count, the checksum of the
    last sample, and min/median/mean wall-clock times plus the derived
    nanoseconds-per-lookup figures.
    """
    lookups = LOOPS * len(chars)

    # One untimed pass first, so bytecode specialization and CPU caches
    # are warm before the measured samples.
    run_once(chars)

    timings: list[float] = []
    checksum = None
    for _ in range(SAMPLES):
        duration, checksum = run_once(chars)
        timings.append(duration)

    fastest = min(timings)
    mid = statistics.median(timings)
    avg = statistics.fmean(timings)

    report = [
        f"dataset: {name}",
        f"codepoints: {len(chars)}",
        f"lookups/sample: {lookups}",
        f"checksum: {checksum}",
        f"best_s: {fastest:.6f}",
        f"median_s: {mid:.6f}",
        f"mean_s: {avg:.6f}",
        f"best_ns_per_lookup: {fastest * 1e9 / lookups:.2f}",
        f"median_ns_per_lookup: {mid * 1e9 / lookups:.2f}",
        "",  # trailing blank line separates dataset reports
    ]
    print("\n".join(report))


def main() -> None:
    """Print run parameters, then benchmark every dataset in DATASETS."""
    # The previous label was misleading: it printed
    # "python: unicodedata.unidata_version='…'" (debug-form f-string with a
    # "python:" prefix).  Report the Unicode database version plainly.
    print(f"unidata_version: {unicodedata.unidata_version}")
    print(f"samples: {SAMPLES}")
    print(f"loops: {LOOPS}")
    print()

    # Iterate the dataset table instead of hard-coding each name, so adding
    # a new workload to DATASETS needs no change here.
    for name, chars in DATASETS.items():
        benchmark(name, chars)


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading
Loading