Skip to content

Commit bbb4a8a

Browse files
gpshead and claude committed
pystrhex: Add AVX2 SIMD optimization for hex conversion
Add AVX2-accelerated hexlify for the no-separator path when converting bytes to hexadecimal strings. This processes 32 bytes per iteration instead of 1, using: - SIMD nibble extraction (shift + mask) - Arithmetic nibble-to-hex conversion (branchless) - Interleave operations for correct output ordering Runtime CPU detection via CPUID ensures AVX2 is only used when available. Falls back to scalar code for inputs < 32 bytes or when AVX2 is not supported. Performance improvement (bytes.hex() no separator): - 32 bytes: 1.3x faster - 64 bytes: 1.7x faster - 128 bytes: 3.0x faster - 256 bytes: 4.0x faster - 512 bytes: 4.9x faster - 4096 bytes: 11.9x faster Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 63cc125 commit bbb4a8a

4 files changed

Lines changed: 581 additions & 6 deletions

File tree

Python/pystrhex.c

Lines changed: 107 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,100 @@
44
#include "pycore_strhex.h" // _Py_strhex_with_sep()
55
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
66

7+
/* AVX2 SIMD optimization for hexlify.
8+
Only available on x86-64 with GCC/Clang. */
9+
#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
10+
# define PY_HEXLIFY_CAN_COMPILE_AVX2 1
11+
# include <cpuid.h>
12+
# include <immintrin.h>
13+
#else
14+
# define PY_HEXLIFY_CAN_COMPILE_AVX2 0
15+
#endif
16+
17+
#if PY_HEXLIFY_CAN_COMPILE_AVX2
18+
19+
/* Runtime CPU feature detection (lazy initialization) */
static int _Py_hexlify_avx2_available = -1;  /* -1 = not checked yet */

/* Probe the CPU *and* the OS for usable AVX2 and cache the answer in
   _Py_hexlify_avx2_available (1 = usable, 0 = not usable).

   The AVX2 feature bit alone (CPUID.(EAX=7,ECX=0):EBX bit 5) only says that
   the CPU implements the instructions.  Executing them still faults unless
   the OS has enabled saving/restoring of the YMM registers, so all of the
   following are required:
     - CPUID.1:ECX bit 27 (OSXSAVE): the OS uses XSAVE, so XGETBV is legal;
     - CPUID.1:ECX bit 28 (AVX): the CPU has the YMM register file;
     - XCR0 bits 1 and 2: the OS manages both XMM and YMM state;
     - CPUID.(7,0):EBX bit 5 (AVX2). */
static void
_Py_hexlify_detect_cpu_features(void)
{
    const unsigned int osxsave_and_avx = (1u << 27) | (1u << 28);
    const unsigned int xmm_and_ymm_state = (1u << 1) | (1u << 2);
    unsigned int eax, ebx, ecx, edx;
    int usable = 0;

    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)
        && (ecx & osxsave_and_avx) == osxsave_and_avx)
    {
        /* XGETBV with ECX=0 reads XCR0 into EDX:EAX.  It is only legal to
           execute once OSXSAVE has been confirmed above. */
        unsigned int xcr0_lo, xcr0_hi;
        __asm__ __volatile__ ("xgetbv"
                              : "=a" (xcr0_lo), "=d" (xcr0_hi)
                              : "c" (0));
        (void)xcr0_hi;

        if ((xcr0_lo & xmm_and_ymm_state) == xmm_and_ymm_state
            && __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        {
            usable = (ebx & (1u << 5)) != 0;
        }
    }

    /* Single store of the final value: the cache never holds a transient
       intermediate result. */
    _Py_hexlify_avx2_available = usable;
}
34+
35+
static inline int
36+
_Py_hexlify_can_use_avx2(void)
37+
{
38+
if (_Py_hexlify_avx2_available < 0) {
39+
_Py_hexlify_detect_cpu_features();
40+
}
41+
return _Py_hexlify_avx2_available;
42+
}
43+
44+
/* AVX2-accelerated hexlify: converts 32 bytes to 64 hex chars per iteration.
45+
Uses arithmetic nibble-to-hex conversion instead of table lookup. */
46+
__attribute__((target("avx2")))
47+
static void
48+
_Py_hexlify_avx2(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
49+
{
50+
const __m256i mask_0f = _mm256_set1_epi8(0x0f);
51+
const __m256i ascii_0 = _mm256_set1_epi8('0');
52+
const __m256i offset = _mm256_set1_epi8('a' - '0' - 10); /* 0x27 */
53+
const __m256i nine = _mm256_set1_epi8(9);
54+
55+
Py_ssize_t i = 0;
56+
57+
/* Process 32 bytes at a time */
58+
for (; i + 32 <= len; i += 32, dst += 64) {
59+
/* Load 32 input bytes */
60+
__m256i data = _mm256_loadu_si256((const __m256i *)(src + i));
61+
62+
/* Extract high and low nibbles */
63+
__m256i hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), mask_0f);
64+
__m256i lo = _mm256_and_si256(data, mask_0f);
65+
66+
/* Convert nibbles to hex: add '0', then add 0x27 where nibble > 9 */
67+
__m256i hi_gt9 = _mm256_cmpgt_epi8(hi, nine);
68+
__m256i lo_gt9 = _mm256_cmpgt_epi8(lo, nine);
69+
70+
hi = _mm256_add_epi8(hi, ascii_0);
71+
lo = _mm256_add_epi8(lo, ascii_0);
72+
hi = _mm256_add_epi8(hi, _mm256_and_si256(hi_gt9, offset));
73+
lo = _mm256_add_epi8(lo, _mm256_and_si256(lo_gt9, offset));
74+
75+
/* Interleave hi/lo nibbles to get correct output order.
76+
unpacklo/hi work within 128-bit lanes, so we need permute to fix. */
77+
__m256i mixed_lo = _mm256_unpacklo_epi8(hi, lo);
78+
__m256i mixed_hi = _mm256_unpackhi_epi8(hi, lo);
79+
80+
/* Fix cross-lane ordering */
81+
__m256i result0 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x20);
82+
__m256i result1 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x31);
83+
84+
/* Store 64 hex characters */
85+
_mm256_storeu_si256((__m256i *)dst, result0);
86+
_mm256_storeu_si256((__m256i *)(dst + 32), result1);
87+
}
88+
89+
/* Scalar fallback for remaining 0-31 bytes */
90+
for (; i < len; i++, dst += 2) {
91+
unsigned int c = src[i];
92+
unsigned int hi = c >> 4;
93+
unsigned int lo = c & 0x0f;
94+
dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10));
95+
dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10));
96+
}
97+
}
98+
99+
#endif /* PY_HEXLIFY_CAN_COMPILE_AVX2 */
100+
7101
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
8102
PyObject* sep, int bytes_per_sep_group,
9103
const int return_bytes)
@@ -82,13 +176,20 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
82176
unsigned char c;
83177

84178
if (bytes_per_sep_group == 0) {
85-
for (i = j = 0; i < arglen; ++i) {
86-
assert((j + 1) < resultlen);
87-
c = argbuf[i];
88-
retbuf[j++] = Py_hexdigits[c >> 4];
89-
retbuf[j++] = Py_hexdigits[c & 0x0f];
179+
#if PY_HEXLIFY_CAN_COMPILE_AVX2
180+
/* Use AVX2 for inputs >= 32 bytes when available */
181+
if (arglen >= 32 && _Py_hexlify_can_use_avx2()) {
182+
_Py_hexlify_avx2((const unsigned char *)argbuf, retbuf, arglen);
183+
}
184+
else
185+
#endif
186+
{
187+
for (i = j = 0; i < arglen; ++i) {
188+
c = argbuf[i];
189+
retbuf[j++] = Py_hexdigits[c >> 4];
190+
retbuf[j++] = Py_hexdigits[c & 0x0f];
191+
}
90192
}
91-
assert(j == resultlen);
92193
}
93194
else {
94195
/* The number of complete chunk+sep periods */

Tools/scripts/benchmark-avx2.txt

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
pystrhex.c benchmark
2+
==================================================
3+
Timing: best of 7 runs, 50000 iterations each
4+
5+
bytes.hex() by size:
6+
32.2 ns 0 bytes
7+
49.7 ns 1 byte
8+
50.2 ns 3 bytes
9+
49.8 ns 4 bytes
10+
53.0 ns 7 bytes
11+
53.0 ns 8 bytes
12+
58.7 ns 15 bytes
13+
60.6 ns 16 bytes
14+
63.6 ns 20 bytes
15+
49.4 ns 32 bytes
16+
51.9 ns 33 bytes
17+
51.8 ns 64 bytes
18+
51.2 ns 128 bytes
19+
66.9 ns 256 bytes
20+
105.7 ns 512 bytes
21+
286.8 ns 4096 bytes
22+
23+
bytes.hex(':') with separator (every byte):
24+
77.3 ns 0 bytes
25+
96.2 ns 1 byte
26+
102.0 ns 3 bytes
27+
103.2 ns 4 bytes
28+
107.6 ns 7 bytes
29+
109.9 ns 8 bytes
30+
120.3 ns 15 bytes
31+
120.8 ns 16 bytes
32+
126.8 ns 20 bytes
33+
145.4 ns 32 bytes
34+
148.0 ns 33 bytes
35+
199.8 ns 64 bytes
36+
319.5 ns 128 bytes
37+
546.6 ns 256 bytes
38+
1043.9 ns 512 bytes
39+
7212.9 ns 4096 bytes
40+
41+
bytes.hex(':', 2) with separator (every 2 bytes):
42+
101.4 ns 3 bytes
43+
101.6 ns 4 bytes
44+
105.9 ns 7 bytes
45+
106.7 ns 8 bytes
46+
113.6 ns 15 bytes
47+
114.2 ns 16 bytes
48+
119.6 ns 20 bytes
49+
136.4 ns 32 bytes
50+
139.1 ns 33 bytes
51+
169.9 ns 64 bytes
52+
251.4 ns 128 bytes
53+
407.6 ns 256 bytes
54+
752.9 ns 512 bytes
55+
4869.7 ns 4096 bytes
56+
57+
bytearray.hex() by size:
58+
31.8 ns 0 bytes
59+
47.4 ns 1 byte
60+
48.3 ns 3 bytes
61+
47.9 ns 4 bytes
62+
52.2 ns 7 bytes
63+
52.6 ns 8 bytes
64+
58.8 ns 15 bytes
65+
61.8 ns 16 bytes
66+
62.3 ns 20 bytes
67+
52.1 ns 32 bytes
68+
51.3 ns 33 bytes
69+
52.0 ns 64 bytes
70+
49.3 ns 128 bytes
71+
66.0 ns 256 bytes
72+
107.8 ns 512 bytes
73+
289.4 ns 4096 bytes
74+
75+
memoryview.hex() by size:
76+
34.0 ns 0 bytes
77+
50.0 ns 1 byte
78+
48.6 ns 3 bytes
79+
49.0 ns 4 bytes
80+
54.0 ns 7 bytes
81+
55.1 ns 8 bytes
82+
61.0 ns 15 bytes
83+
62.2 ns 16 bytes
84+
65.9 ns 20 bytes
85+
52.2 ns 32 bytes
86+
52.5 ns 33 bytes
87+
51.3 ns 64 bytes
88+
52.6 ns 128 bytes
89+
68.4 ns 256 bytes
90+
105.4 ns 512 bytes
91+
286.1 ns 4096 bytes
92+
93+
binascii.hexlify() by size:
94+
86.3 ns 0 bytes
95+
100.3 ns 1 byte
96+
102.3 ns 3 bytes
97+
101.3 ns 4 bytes
98+
104.0 ns 7 bytes
99+
102.7 ns 8 bytes
100+
112.7 ns 15 bytes
101+
114.1 ns 16 bytes
102+
117.1 ns 20 bytes
103+
103.2 ns 32 bytes
104+
102.5 ns 33 bytes
105+
103.4 ns 64 bytes
106+
105.0 ns 128 bytes
107+
119.2 ns 256 bytes
108+
181.2 ns 512 bytes
109+
337.9 ns 4096 bytes
110+
111+
binascii.hexlify(sep=':') with separator:
112+
95.0 ns 0 bytes
113+
110.5 ns 1 byte
114+
111.1 ns 3 bytes
115+
114.9 ns 4 bytes
116+
118.2 ns 7 bytes
117+
119.7 ns 8 bytes
118+
131.8 ns 15 bytes
119+
133.4 ns 16 bytes
120+
139.3 ns 20 bytes
121+
159.1 ns 32 bytes
122+
161.3 ns 33 bytes
123+
209.4 ns 64 bytes
124+
328.5 ns 128 bytes
125+
558.1 ns 256 bytes
126+
1059.8 ns 512 bytes
127+
7239.6 ns 4096 bytes
128+
129+
hashlib hexdigest (hash + hex conversion):
130+
552.0 ns md5 (16 byte digest)
131+
520.1 ns sha1 (20 byte digest)
132+
506.4 ns sha256 (32 byte digest)
133+
690.8 ns sha512 (64 byte digest)
134+
135+
hashlib hexdigest only (hex conversion, pre-computed hash):
136+
310.5 ns md5 (16 byte digest)
137+
275.0 ns sha1 (20 byte digest)
138+
274.5 ns sha256 (32 byte digest)
139+
445.9 ns sha512 (64 byte digest)
140+
141+
==================================================
142+
Done.

0 commit comments

Comments
 (0)