Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 6 additions & 15 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ _getrecord_ex(Py_UCS4 code)
if (code >= 0x110000)
index = 0;
else {
index = index1[(code>>SHIFT)];
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
index = unicodedata_record_get_record_index(code);
}

return &_PyUnicode_Database_Records[index];
Expand Down Expand Up @@ -493,9 +492,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
if (code < 0 || code >= 0x110000)
index = 0;
else {
index = decomp_index1[(code>>DECOMP_SHIFT)];
index = decomp_index2[(index<<DECOMP_SHIFT)+
(code&((1<<DECOMP_SHIFT)-1))];
index = unicodedata_decomp_get_decomp_index(code);
}

/* high byte is number of hex bytes (usually one or two), low byte
Expand Down Expand Up @@ -539,9 +536,7 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
*index = 0;
}
else {
*index = decomp_index1[(code>>DECOMP_SHIFT)];
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
(code&((1<<DECOMP_SHIFT)-1))];
*index = unicodedata_decomp_get_decomp_index(code);
}

/* high byte is number of hex bytes (usually one or two), low byte
Expand Down Expand Up @@ -711,7 +706,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
const void *data;
Py_UCS4 *output;
Py_ssize_t i, i1, o, len;
int f,l,index,index1,comb;
int f,l,index,comb;
Py_UCS4 code;
Py_ssize_t skipped[20];
int cskipped = 0;
Expand Down Expand Up @@ -810,9 +805,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
continue;
}
index = f*TOTAL_LAST + l;
index1 = comp_index[index >> COMP_SHIFT];
code = comp_data[(index1<<COMP_SHIFT)+
(index&((1<<COMP_SHIFT)-1))];
code = unicodedata_comp_get_comp_data(index);
if (code == 0)
goto not_combinable;

Expand Down Expand Up @@ -1396,9 +1389,7 @@ _getucname(PyObject *self,
}

/* get position of codepoint in order of names in the dawg */
offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
(code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
offset = unicodename_get_dawg_codepoint_pos(code);
if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
return 0;

Expand Down
10,323 changes: 3,518 additions & 6,805 deletions Modules/unicodedata_db.h

Large diffs are not rendered by default.

7,845 changes: 3,147 additions & 4,698 deletions Modules/unicodename_db.h

Large diffs are not rendered by default.

7 changes: 2 additions & 5 deletions Objects/unicodectype.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,8 @@ gettyperecord(Py_UCS4 code)

if (code >= 0x110000)
index = 0;
else
{
index = index1[(code>>SHIFT)];
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
else {
index = unicodetype_get_type_index(code);
}

return &_PyUnicode_TypeRecords[index];
Expand Down Expand Up @@ -285,4 +283,3 @@ int _PyUnicode_IsAlpha(Py_UCS4 ch)

return (ctype->flags & ALPHA_MASK) != 0;
}

3,635 changes: 1,038 additions & 2,597 deletions Objects/unicodetype_db.h

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions Tools/unicode/benchmark_unicodedata_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""Benchmark Python-level unicodedata.category() lookups.

Runs three fixed workloads:
- all Unicode code points
- BMP only
- ASCII only
"""

from __future__ import annotations

import statistics
import time
import unicodedata


# Inner repetitions per timed sample; amortizes timer overhead.
LOOPS = 5
# Independent timed samples per dataset; min/median/mean are reported.
SAMPLES = 7
# Fixed workloads: each value is one string containing every code point
# in the range, so a single pass touches the whole lookup table.
DATASETS = {
    "all": "".join(map(chr, range(0x110000))),
    "bmp": "".join(map(chr, range(0x10000))),
    "ascii": "".join(map(chr, range(0x80))),
}


def run_once(chars: str) -> tuple[float, int]:
    """Time LOOPS passes of unicodedata.category() over *chars*.

    Returns ``(elapsed_seconds, checksum)``.  The checksum folds the first
    two characters of every returned category string into an integer so
    the lookups cannot be optimized away and wrong results are visible.
    """
    lookup = unicodedata.category  # hoist the attribute lookup out of the hot loop
    total = 0
    start = time.perf_counter()
    remaining = LOOPS
    while remaining:
        for symbol in chars:
            cat = lookup(symbol)
            total += ord(cat[0]) + ord(cat[1])
        remaining -= 1
    duration = time.perf_counter() - start
    return duration, total


def benchmark(name: str, chars: str) -> None:
    """Measure *chars* SAMPLES times via run_once() and print a report.

    Output fields include the per-sample lookup count, the checksum of the
    last sample, and min/median/mean wall-clock times plus the derived
    nanoseconds-per-lookup figures.
    """
    lookups = LOOPS * len(chars)

    # One untimed pass first, so bytecode specialization and CPU caches
    # are warm before the measured samples.
    run_once(chars)

    timings: list[float] = []
    checksum = None
    for _ in range(SAMPLES):
        duration, checksum = run_once(chars)
        timings.append(duration)

    fastest = min(timings)
    mid = statistics.median(timings)
    avg = statistics.fmean(timings)

    report = [
        f"dataset: {name}",
        f"codepoints: {len(chars)}",
        f"lookups/sample: {lookups}",
        f"checksum: {checksum}",
        f"best_s: {fastest:.6f}",
        f"median_s: {mid:.6f}",
        f"mean_s: {avg:.6f}",
        f"best_ns_per_lookup: {fastest * 1e9 / lookups:.2f}",
        f"median_ns_per_lookup: {mid * 1e9 / lookups:.2f}",
        "",  # trailing blank line separates dataset reports
    ]
    print("\n".join(report))


def main() -> None:
    """Print run parameters, then benchmark every dataset in DATASETS."""
    # The previous label was misleading: it printed
    # "python: unicodedata.unidata_version='…'" (debug-form f-string with a
    # "python:" prefix).  Report the Unicode database version plainly.
    print(f"unidata_version: {unicodedata.unidata_version}")
    print(f"samples: {SAMPLES}")
    print(f"loops: {LOOPS}")
    print()

    # Iterate the dataset table instead of hard-coding each name, so adding
    # a new workload to DATASETS needs no change here.
    for name, chars in DATASETS.items():
        benchmark(name, chars)


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading
Loading