Skip to content

Commit 4f3d60d

Browse files
gpshead and claude committed
pystrhex: Add SSE2 SIMD optimization for hex conversion
Add SSE2 vectorized implementation that processes 16 bytes per iteration. SSE2 is always available on x86-64 (part of AMD64 baseline), so no runtime detection is needed. This provides SIMD acceleration for all x86-64 machines, even those without AVX2.

The dispatch now cascades: AVX-512 (64+ bytes) → AVX2 (32+ bytes) → SSE2 (16+ bytes) → scalar.

Benchmarks show ~5-6% improvement for 16-20 byte inputs, which is useful for common hash digest sizes (MD5=16 bytes, SHA1=20 bytes).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent bdceb9c commit 4f3d60d

2 files changed

Lines changed: 196 additions & 1 deletion

File tree

Python/pystrhex.c

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,55 @@ _Py_hexlify_get_simd_level(void)
6868
return _Py_hexlify_simd_level;
6969
}
7070

71+
/* SSE2-accelerated hexlify: converts 16 bytes to 32 hex chars per iteration.
72+
SSE2 is always available on x86-64 (part of AMD64 baseline). */
73+
static void
74+
_Py_hexlify_sse2(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
75+
{
76+
const __m128i mask_0f = _mm_set1_epi8(0x0f);
77+
const __m128i ascii_0 = _mm_set1_epi8('0');
78+
const __m128i offset = _mm_set1_epi8('a' - '0' - 10); /* 0x27 */
79+
const __m128i nine = _mm_set1_epi8(9);
80+
81+
Py_ssize_t i = 0;
82+
83+
/* Process 16 bytes at a time */
84+
for (; i + 16 <= len; i += 16, dst += 32) {
85+
/* Load 16 input bytes */
86+
__m128i data = _mm_loadu_si128((const __m128i *)(src + i));
87+
88+
/* Extract high and low nibbles */
89+
__m128i hi = _mm_and_si128(_mm_srli_epi16(data, 4), mask_0f);
90+
__m128i lo = _mm_and_si128(data, mask_0f);
91+
92+
/* Convert nibbles to hex: add '0', then add 0x27 where nibble > 9 */
93+
__m128i hi_gt9 = _mm_cmpgt_epi8(hi, nine);
94+
__m128i lo_gt9 = _mm_cmpgt_epi8(lo, nine);
95+
96+
hi = _mm_add_epi8(hi, ascii_0);
97+
lo = _mm_add_epi8(lo, ascii_0);
98+
hi = _mm_add_epi8(hi, _mm_and_si128(hi_gt9, offset));
99+
lo = _mm_add_epi8(lo, _mm_and_si128(lo_gt9, offset));
100+
101+
/* Interleave hi/lo nibbles to get correct output order */
102+
__m128i result0 = _mm_unpacklo_epi8(hi, lo); /* First 16 hex chars */
103+
__m128i result1 = _mm_unpackhi_epi8(hi, lo); /* Second 16 hex chars */
104+
105+
/* Store 32 hex characters */
106+
_mm_storeu_si128((__m128i *)dst, result0);
107+
_mm_storeu_si128((__m128i *)(dst + 16), result1);
108+
}
109+
110+
/* Scalar fallback for remaining 0-15 bytes */
111+
for (; i < len; i++, dst += 2) {
112+
unsigned int c = src[i];
113+
unsigned int hi = c >> 4;
114+
unsigned int lo = c & 0x0f;
115+
dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10));
116+
dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10));
117+
}
118+
}
119+
71120
/* AVX2-accelerated hexlify: converts 32 bytes to 64 hex chars per iteration.
72121
Uses arithmetic nibble-to-hex conversion instead of table lookup. */
73122
__attribute__((target("avx2")))
@@ -367,13 +416,17 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
367416
if (bytes_per_sep_group == 0) {
368417
#if PY_HEXLIFY_CAN_COMPILE_X86_SIMD
369418
int simd_level = _Py_hexlify_get_simd_level();
370-
/* Use AVX-512 for inputs >= 64 bytes, AVX2 for >= 32 bytes */
419+
/* Use AVX-512 for inputs >= 64 bytes, AVX2 for >= 32 bytes,
420+
SSE2 for >= 16 bytes (SSE2 always available on x86-64) */
371421
if (arglen >= 64 && simd_level >= PY_HEXLIFY_SIMD_AVX512) {
372422
_Py_hexlify_avx512((const unsigned char *)argbuf, retbuf, arglen);
373423
}
374424
else if (arglen >= 32 && simd_level >= PY_HEXLIFY_SIMD_AVX2) {
375425
_Py_hexlify_avx2((const unsigned char *)argbuf, retbuf, arglen);
376426
}
427+
else if (arglen >= 16) {
428+
_Py_hexlify_sse2((const unsigned char *)argbuf, retbuf, arglen);
429+
}
377430
else
378431
#elif PY_HEXLIFY_CAN_COMPILE_NEON
379432
/* Use NEON for inputs >= 16 bytes (always available on AArch64) */

Tools/scripts/benchmark-sse2.txt

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
pystrhex.c benchmark
2+
==================================================
3+
Timing: best of 7 runs, 50000 iterations each
4+
5+
bytes.hex() by size:
6+
33.8 ns 0 bytes
7+
48.0 ns 1 byte
8+
49.4 ns 3 bytes
9+
49.3 ns 4 bytes
10+
50.1 ns 7 bytes
11+
50.5 ns 8 bytes
12+
54.7 ns 15 bytes
13+
54.9 ns 16 bytes
14+
58.7 ns 20 bytes
15+
49.3 ns 32 bytes
16+
49.9 ns 33 bytes
17+
50.3 ns 64 bytes
18+
51.5 ns 128 bytes
19+
67.9 ns 256 bytes
20+
114.4 ns 512 bytes
21+
299.5 ns 4096 bytes
22+
23+
bytes.hex(':') with separator (every byte):
24+
77.0 ns 0 bytes
25+
95.1 ns 1 byte
26+
99.4 ns 3 bytes
27+
101.7 ns 4 bytes
28+
107.2 ns 7 bytes
29+
107.8 ns 8 bytes
30+
119.5 ns 15 bytes
31+
119.1 ns 16 bytes
32+
127.7 ns 20 bytes
33+
145.6 ns 32 bytes
34+
145.5 ns 33 bytes
35+
201.5 ns 64 bytes
36+
318.7 ns 128 bytes
37+
551.5 ns 256 bytes
38+
1047.4 ns 512 bytes
39+
7313.9 ns 4096 bytes
40+
41+
bytes.hex(':', 2) with separator (every 2 bytes):
42+
101.6 ns 3 bytes
43+
101.0 ns 4 bytes
44+
106.8 ns 7 bytes
45+
108.3 ns 8 bytes
46+
116.5 ns 15 bytes
47+
115.9 ns 16 bytes
48+
119.8 ns 20 bytes
49+
137.5 ns 32 bytes
50+
139.3 ns 33 bytes
51+
171.4 ns 64 bytes
52+
254.3 ns 128 bytes
53+
410.7 ns 256 bytes
54+
768.2 ns 512 bytes
55+
5006.9 ns 4096 bytes
56+
57+
bytearray.hex() by size:
58+
33.8 ns 0 bytes
59+
49.3 ns 1 byte
60+
49.7 ns 3 bytes
61+
49.8 ns 4 bytes
62+
50.7 ns 7 bytes
63+
51.3 ns 8 bytes
64+
55.6 ns 15 bytes
65+
56.3 ns 16 bytes
66+
58.6 ns 20 bytes
67+
49.0 ns 32 bytes
68+
49.6 ns 33 bytes
69+
50.6 ns 64 bytes
70+
50.9 ns 128 bytes
71+
67.3 ns 256 bytes
72+
114.2 ns 512 bytes
73+
300.6 ns 4096 bytes
74+
75+
memoryview.hex() by size:
76+
34.6 ns 0 bytes
77+
49.7 ns 1 byte
78+
50.3 ns 3 bytes
79+
50.3 ns 4 bytes
80+
50.7 ns 7 bytes
81+
51.2 ns 8 bytes
82+
55.5 ns 15 bytes
83+
56.5 ns 16 bytes
84+
60.2 ns 20 bytes
85+
51.1 ns 32 bytes
86+
50.7 ns 33 bytes
87+
50.9 ns 64 bytes
88+
52.1 ns 128 bytes
89+
68.8 ns 256 bytes
90+
120.8 ns 512 bytes
91+
299.3 ns 4096 bytes
92+
93+
binascii.hexlify() by size:
94+
84.8 ns 0 bytes
95+
100.2 ns 1 byte
96+
100.5 ns 3 bytes
97+
100.7 ns 4 bytes
98+
102.8 ns 7 bytes
99+
101.3 ns 8 bytes
100+
109.0 ns 15 bytes
101+
111.4 ns 16 bytes
102+
112.1 ns 20 bytes
103+
100.0 ns 32 bytes
104+
102.1 ns 33 bytes
105+
100.8 ns 64 bytes
106+
101.6 ns 128 bytes
107+
119.4 ns 256 bytes
108+
175.8 ns 512 bytes
109+
334.9 ns 4096 bytes
110+
111+
binascii.hexlify(sep=':') with separator:
112+
93.3 ns 0 bytes
113+
107.0 ns 1 byte
114+
109.0 ns 3 bytes
115+
112.8 ns 4 bytes
116+
117.8 ns 7 bytes
117+
121.0 ns 8 bytes
118+
131.4 ns 15 bytes
119+
131.5 ns 16 bytes
120+
138.6 ns 20 bytes
121+
156.8 ns 32 bytes
122+
155.9 ns 33 bytes
123+
212.2 ns 64 bytes
124+
332.4 ns 128 bytes
125+
575.0 ns 256 bytes
126+
1061.9 ns 512 bytes
127+
8117.4 ns 4096 bytes
128+
129+
hashlib hexdigest (hash + hex conversion):
130+
565.9 ns md5 (16 byte digest)
131+
530.8 ns sha1 (20 byte digest)
132+
532.2 ns sha256 (32 byte digest)
133+
730.7 ns sha512 (64 byte digest)
134+
135+
hashlib hexdigest only (hex conversion, pre-computed hash):
136+
315.9 ns md5 (16 byte digest)
137+
277.2 ns sha1 (20 byte digest)
138+
279.5 ns sha256 (32 byte digest)
139+
450.9 ns sha512 (64 byte digest)
140+
141+
==================================================
142+
Done.

0 commit comments

Comments
 (0)