Skip to content

Commit bbf6059

Browse files
committed
Add unicodedata.category benchmark script
Add a small Python-level benchmark under Tools/unicode for comparing unicodedata.category() lookup speed across builds on three fixed workloads: all code points, BMP only, and ASCII only.

Current results from optimized non-debug builds (-O3 -DNDEBUG), comparing clean HEAD vs the packtab branch:

- all: baseline 98.98 ns median, packtab 108.44 ns median
- bmp: baseline 97.44 ns median, packtab 105.01 ns median
- ascii: baseline 83.80 ns median, packtab 82.53 ns median
1 parent 777e4e9 commit bbf6059

1 file changed

Lines changed: 78 additions & 0 deletions

File tree

Lines changed: 78 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env python3
2+
"""Benchmark Python-level unicodedata.category() lookups.
3+
4+
Runs three fixed workloads:
5+
- all Unicode code points
6+
- BMP only
7+
- ASCII only
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import statistics
13+
import time
14+
import unicodedata
15+
16+
17+
# Number of passes over a dataset inside one timed sample.
LOOPS = 5
# Number of timed samples collected per dataset.
SAMPLES = 7
# Workload name -> string holding every code point of the range exactly once,
# in ascending order.  "all" covers range(0x110000), so it also contains the
# surrogate code points; "bmp" stops at U+FFFF and "ascii" at U+007F.
DATASETS = {
    "all": "".join(map(chr, range(0x110000))),
    "bmp": "".join(map(chr, range(0x10000))),
    "ascii": "".join(map(chr, range(0x80))),
}
24+
25+
26+
def run_once(chars: str, loops: int | None = None) -> tuple[float, int]:
    """Time ``unicodedata.category()`` over every character of *chars*.

    Args:
        chars: Characters to look up.  Each one is passed to
            ``unicodedata.category()`` once per pass.
        loops: Number of passes over *chars*.  When omitted (``None``) the
            module-level ``LOOPS`` value is used.

    Returns:
        A ``(elapsed, checksum)`` tuple.  ``elapsed`` is the wall-clock time
        in seconds, measured with ``time.perf_counter()``, for all passes.
        ``checksum`` is the sum of ``ord()`` of the first two characters of
        every category string returned.
    """
    if loops is None:
        loops = LOOPS
    # Bind the function to a local name so the timed loop does not repeat the
    # module attribute lookup on every iteration.
    category = unicodedata.category
    checksum = 0
    t0 = time.perf_counter()
    for _ in range(loops):
        for ch in chars:
            gc = category(ch)
            # Fold the result into the checksum so each lookup's value is
            # actually consumed.
            checksum += ord(gc[0]) + ord(gc[1])
    elapsed = time.perf_counter() - t0
    return elapsed, checksum
36+
37+
38+
def benchmark(name: str, chars: str) -> None:
    """Benchmark one dataset and print its timing report to stdout.

    Calls ``run_once(chars)`` once as a warm-up, then ``SAMPLES`` more times,
    recording the elapsed time of each of those calls.  The report contains
    the dataset size, the checksum of the last sample, the best, median and
    mean sample times in seconds, and the best and median time per lookup in
    nanoseconds.

    Args:
        name: Label printed on the ``dataset:`` line.
        chars: Characters to look up in every sample.

    Raises:
        ValueError: If ``len(chars) * LOOPS`` is not positive, i.e. there is
            nothing to time.
    """
    lookups = len(chars) * LOOPS
    if lookups <= 0:
        # Fail before any timing work.  Without this check the per-lookup
        # divisions below raise ZeroDivisionError only after all samples have
        # run and most of the report has already been printed.
        raise ValueError(f"dataset {name!r} yields no lookups to time")

    # Warm up specialization and caches before timing.
    run_once(chars)

    samples = []
    checksum = None
    for _ in range(SAMPLES):
        elapsed, checksum = run_once(chars)
        samples.append(elapsed)

    best = min(samples)
    median = statistics.median(samples)
    mean = statistics.fmean(samples)

    print(f"dataset: {name}")
    print(f"codepoints: {len(chars)}")
    print(f"lookups/sample: {lookups}")
    print(f"checksum: {checksum}")
    print(f"best_s: {best:.6f}")
    print(f"median_s: {median:.6f}")
    print(f"mean_s: {mean:.6f}")
    # Convert seconds per sample into nanoseconds per individual lookup.
    print(f"best_ns_per_lookup: {best * 1e9 / lookups:.2f}")
    print(f"median_ns_per_lookup: {median * 1e9 / lookups:.2f}")
    print()
64+
65+
66+
def main() -> None:
    """Print the run configuration, then benchmark every dataset.

    The header reports the interpreter version, the Unicode database version
    exposed by ``unicodedata``, and the ``SAMPLES`` / ``LOOPS`` settings.
    Datasets are benchmarked in the insertion order of ``DATASETS``.
    """
    # Imported here because only this header needs it.
    import sys

    # First whitespace-separated field of sys.version is the version number.
    print(f"python: {sys.version.split()[0]}")
    print(f"unidata_version: {unicodedata.unidata_version}")
    print(f"samples: {SAMPLES}")
    print(f"loops: {LOOPS}")
    print()

    for name, chars in DATASETS.items():
        benchmark(name, chars)


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)