Skip to content

Commit 015cc55

Browse files
gpshead authored and claude committed
pystrhex: Remove AVX2/AVX-512, keep SSE2-only for simplicity
Benchmarks showed SSE2 performs nearly as well as AVX2 for most input sizes (within 5% up to 256 bytes, within 8% at 512+ bytes). Since SSE2 is always available on x86-64 (part of the baseline), this eliminates:

- Runtime CPU feature detection via CPUID
- ~200 lines of AVX2/AVX-512 intrinsics code
- Maintenance burden of multiple SIMD implementations

The simpler SSE2-only approach provides most of the performance benefit with significantly less code complexity.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 4f3d60d commit 015cc55

1 file changed

Lines changed: 9 additions & 225 deletions

File tree

Python/pystrhex.c

Lines changed: 9 additions & 225 deletions
Original file line number | Diff line number | Diff line change
@@ -5,14 +5,13 @@
55
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
66

77
/* SIMD optimization for hexlify.
8-
x86-64: AVX2/AVX-512 with runtime detection
8+
x86-64: SSE2 (always available, part of x86-64 baseline)
99
ARM64: NEON (always available on AArch64) */
1010
#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
11-
# define PY_HEXLIFY_CAN_COMPILE_X86_SIMD 1
12-
# include <cpuid.h>
13-
# include <immintrin.h>
11+
# define PY_HEXLIFY_CAN_COMPILE_SSE2 1
12+
# include <emmintrin.h>
1413
#else
15-
# define PY_HEXLIFY_CAN_COMPILE_X86_SIMD 0
14+
# define PY_HEXLIFY_CAN_COMPILE_SSE2 0
1615
#endif
1716

1817
#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
@@ -22,51 +21,7 @@
2221
# define PY_HEXLIFY_CAN_COMPILE_NEON 0
2322
#endif
2423

25-
#if PY_HEXLIFY_CAN_COMPILE_X86_SIMD
26-
27-
/* Runtime CPU feature detection (lazy initialization)
28-
-1 = not checked, 0 = no SIMD, 1 = AVX2, 2 = AVX-512 */
29-
static int _Py_hexlify_simd_level = -1;
30-
31-
#define PY_HEXLIFY_SIMD_NONE 0
32-
#define PY_HEXLIFY_SIMD_AVX2 1
33-
#define PY_HEXLIFY_SIMD_AVX512 2
34-
35-
static void
36-
_Py_hexlify_detect_cpu_features(void)
37-
{
38-
unsigned int eax, ebx, ecx, edx;
39-
40-
_Py_hexlify_simd_level = PY_HEXLIFY_SIMD_NONE;
41-
42-
if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
43-
return;
44-
}
45-
46-
/* Check for AVX2: CPUID.7H:EBX bit 5 */
47-
int has_avx2 = (ebx & (1 << 5)) != 0;
48-
49-
/* Check for AVX-512F + AVX-512BW + AVX-512VBMI:
50-
CPUID.7H:EBX bits 16 and 30, ECX bit 1 */
51-
int has_avx512f = (ebx & (1 << 16)) != 0;
52-
int has_avx512bw = (ebx & (1 << 30)) != 0;
53-
int has_avx512vbmi = (ecx & (1 << 1)) != 0;
54-
55-
if (has_avx512f && has_avx512bw && has_avx512vbmi) {
56-
_Py_hexlify_simd_level = PY_HEXLIFY_SIMD_AVX512;
57-
} else if (has_avx2) {
58-
_Py_hexlify_simd_level = PY_HEXLIFY_SIMD_AVX2;
59-
}
60-
}
61-
62-
static inline int
63-
_Py_hexlify_get_simd_level(void)
64-
{
65-
if (_Py_hexlify_simd_level < 0) {
66-
_Py_hexlify_detect_cpu_features();
67-
}
68-
return _Py_hexlify_simd_level;
69-
}
24+
#if PY_HEXLIFY_CAN_COMPILE_SSE2
7025

7126
/* SSE2-accelerated hexlify: converts 16 bytes to 32 hex chars per iteration.
7227
SSE2 is always available on x86-64 (part of AMD64 baseline). */
@@ -117,170 +72,7 @@ _Py_hexlify_sse2(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
11772
}
11873
}
11974

120-
/* AVX2-accelerated hexlify: converts 32 bytes to 64 hex chars per iteration.
121-
Uses arithmetic nibble-to-hex conversion instead of table lookup. */
122-
__attribute__((target("avx2")))
123-
static void
124-
_Py_hexlify_avx2(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
125-
{
126-
const __m256i mask_0f = _mm256_set1_epi8(0x0f);
127-
const __m256i ascii_0 = _mm256_set1_epi8('0');
128-
const __m256i offset = _mm256_set1_epi8('a' - '0' - 10); /* 0x27 */
129-
const __m256i nine = _mm256_set1_epi8(9);
130-
131-
Py_ssize_t i = 0;
132-
133-
/* Process 32 bytes at a time */
134-
for (; i + 32 <= len; i += 32, dst += 64) {
135-
/* Load 32 input bytes */
136-
__m256i data = _mm256_loadu_si256((const __m256i *)(src + i));
137-
138-
/* Extract high and low nibbles */
139-
__m256i hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), mask_0f);
140-
__m256i lo = _mm256_and_si256(data, mask_0f);
141-
142-
/* Convert nibbles to hex: add '0', then add 0x27 where nibble > 9 */
143-
__m256i hi_gt9 = _mm256_cmpgt_epi8(hi, nine);
144-
__m256i lo_gt9 = _mm256_cmpgt_epi8(lo, nine);
145-
146-
hi = _mm256_add_epi8(hi, ascii_0);
147-
lo = _mm256_add_epi8(lo, ascii_0);
148-
hi = _mm256_add_epi8(hi, _mm256_and_si256(hi_gt9, offset));
149-
lo = _mm256_add_epi8(lo, _mm256_and_si256(lo_gt9, offset));
150-
151-
/* Interleave hi/lo nibbles to get correct output order.
152-
unpacklo/hi work within 128-bit lanes, so we need permute to fix. */
153-
__m256i mixed_lo = _mm256_unpacklo_epi8(hi, lo);
154-
__m256i mixed_hi = _mm256_unpackhi_epi8(hi, lo);
155-
156-
/* Fix cross-lane ordering */
157-
__m256i result0 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x20);
158-
__m256i result1 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x31);
159-
160-
/* Store 64 hex characters */
161-
_mm256_storeu_si256((__m256i *)dst, result0);
162-
_mm256_storeu_si256((__m256i *)(dst + 32), result1);
163-
}
164-
165-
/* Scalar fallback for remaining 0-31 bytes */
166-
for (; i < len; i++, dst += 2) {
167-
unsigned int c = src[i];
168-
unsigned int hi = c >> 4;
169-
unsigned int lo = c & 0x0f;
170-
dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10));
171-
dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10));
172-
}
173-
}
174-
175-
/* AVX-512 accelerated hexlify: converts 64 bytes to 128 hex chars per iteration.
176-
Requires AVX-512F, AVX-512BW, and AVX-512VBMI for byte-level permutation. */
177-
__attribute__((target("avx512f,avx512bw,avx512vbmi")))
178-
static void
179-
_Py_hexlify_avx512(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
180-
{
181-
const __m512i mask_0f = _mm512_set1_epi8(0x0f);
182-
const __m512i ascii_0 = _mm512_set1_epi8('0');
183-
const __m512i ascii_a = _mm512_set1_epi8('a' - 10);
184-
const __m512i nine = _mm512_set1_epi8(9);
185-
186-
/* Permutation indices for interleaving hi/lo nibbles.
187-
We need to transform:
188-
hi: H0 H1 H2 ... H63
189-
lo: L0 L1 L2 ... L63
190-
into:
191-
out0: H0 L0 H1 L1 ... H31 L31
192-
out1: H32 L32 H33 L33 ... H63 L63
193-
*/
194-
const __m512i interleave_lo = _mm512_set_epi8(
195-
39, 103, 38, 102, 37, 101, 36, 100, 35, 99, 34, 98, 33, 97, 32, 96,
196-
47, 111, 46, 110, 45, 109, 44, 108, 43, 107, 42, 106, 41, 105, 40, 104,
197-
55, 119, 54, 118, 53, 117, 52, 116, 51, 115, 50, 114, 49, 113, 48, 112,
198-
63, 127, 62, 126, 61, 125, 60, 124, 59, 123, 58, 122, 57, 121, 56, 120
199-
);
200-
const __m512i interleave_hi = _mm512_set_epi8(
201-
7, 71, 6, 70, 5, 69, 4, 68, 3, 67, 2, 66, 1, 65, 0, 64,
202-
15, 79, 14, 78, 13, 77, 12, 76, 11, 75, 10, 74, 9, 73, 8, 72,
203-
23, 87, 22, 86, 21, 85, 20, 84, 19, 83, 18, 82, 17, 81, 16, 80,
204-
31, 95, 30, 94, 29, 93, 28, 92, 27, 91, 26, 90, 25, 89, 24, 88
205-
);
206-
207-
Py_ssize_t i = 0;
208-
209-
/* Process 64 bytes at a time */
210-
for (; i + 64 <= len; i += 64, dst += 128) {
211-
/* Load 64 input bytes */
212-
__m512i data = _mm512_loadu_si512((const __m512i *)(src + i));
213-
214-
/* Extract high and low nibbles */
215-
__m512i hi = _mm512_and_si512(_mm512_srli_epi16(data, 4), mask_0f);
216-
__m512i lo = _mm512_and_si512(data, mask_0f);
217-
218-
/* Convert nibbles to hex using masked blend:
219-
if nibble > 9: use 'a' + (nibble - 10) = nibble + ('a' - 10)
220-
else: use '0' + nibble */
221-
__mmask64 hi_alpha = _mm512_cmpgt_epi8_mask(hi, nine);
222-
__mmask64 lo_alpha = _mm512_cmpgt_epi8_mask(lo, nine);
223-
224-
__m512i hi_digit = _mm512_add_epi8(hi, ascii_0);
225-
__m512i hi_letter = _mm512_add_epi8(hi, ascii_a);
226-
hi = _mm512_mask_blend_epi8(hi_alpha, hi_digit, hi_letter);
227-
228-
__m512i lo_digit = _mm512_add_epi8(lo, ascii_0);
229-
__m512i lo_letter = _mm512_add_epi8(lo, ascii_a);
230-
lo = _mm512_mask_blend_epi8(lo_alpha, lo_digit, lo_letter);
231-
232-
/* Interleave hi/lo to get correct output order using permutex2var */
233-
__m512i result0 = _mm512_permutex2var_epi8(hi, interleave_hi, lo);
234-
__m512i result1 = _mm512_permutex2var_epi8(hi, interleave_lo, lo);
235-
236-
/* Store 128 hex characters */
237-
_mm512_storeu_si512((__m512i *)dst, result0);
238-
_mm512_storeu_si512((__m512i *)(dst + 64), result1);
239-
}
240-
241-
/* Use AVX2 for remaining 32-63 bytes */
242-
if (i + 32 <= len) {
243-
const __m256i mask_0f_256 = _mm256_set1_epi8(0x0f);
244-
const __m256i ascii_0_256 = _mm256_set1_epi8('0');
245-
const __m256i offset_256 = _mm256_set1_epi8('a' - '0' - 10);
246-
const __m256i nine_256 = _mm256_set1_epi8(9);
247-
248-
__m256i data = _mm256_loadu_si256((const __m256i *)(src + i));
249-
__m256i hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), mask_0f_256);
250-
__m256i lo = _mm256_and_si256(data, mask_0f_256);
251-
252-
__m256i hi_gt9 = _mm256_cmpgt_epi8(hi, nine_256);
253-
__m256i lo_gt9 = _mm256_cmpgt_epi8(lo, nine_256);
254-
255-
hi = _mm256_add_epi8(hi, ascii_0_256);
256-
lo = _mm256_add_epi8(lo, ascii_0_256);
257-
hi = _mm256_add_epi8(hi, _mm256_and_si256(hi_gt9, offset_256));
258-
lo = _mm256_add_epi8(lo, _mm256_and_si256(lo_gt9, offset_256));
259-
260-
__m256i mixed_lo = _mm256_unpacklo_epi8(hi, lo);
261-
__m256i mixed_hi = _mm256_unpackhi_epi8(hi, lo);
262-
263-
__m256i r0 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x20);
264-
__m256i r1 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x31);
265-
266-
_mm256_storeu_si256((__m256i *)dst, r0);
267-
_mm256_storeu_si256((__m256i *)(dst + 32), r1);
268-
269-
i += 32;
270-
dst += 64;
271-
}
272-
273-
/* Scalar fallback for remaining 0-31 bytes */
274-
for (; i < len; i++, dst += 2) {
275-
unsigned int c = src[i];
276-
unsigned int hi = c >> 4;
277-
unsigned int lo = c & 0x0f;
278-
dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10));
279-
dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10));
280-
}
281-
}
282-
283-
#endif /* PY_HEXLIFY_CAN_COMPILE_X86_SIMD */
75+
#endif /* PY_HEXLIFY_CAN_COMPILE_SSE2 */
28476

28577
#if PY_HEXLIFY_CAN_COMPILE_NEON
28678

@@ -414,17 +206,9 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
414206
unsigned char c;
415207

416208
if (bytes_per_sep_group == 0) {
417-
#if PY_HEXLIFY_CAN_COMPILE_X86_SIMD
418-
int simd_level = _Py_hexlify_get_simd_level();
419-
/* Use AVX-512 for inputs >= 64 bytes, AVX2 for >= 32 bytes,
420-
SSE2 for >= 16 bytes (SSE2 always available on x86-64) */
421-
if (arglen >= 64 && simd_level >= PY_HEXLIFY_SIMD_AVX512) {
422-
_Py_hexlify_avx512((const unsigned char *)argbuf, retbuf, arglen);
423-
}
424-
else if (arglen >= 32 && simd_level >= PY_HEXLIFY_SIMD_AVX2) {
425-
_Py_hexlify_avx2((const unsigned char *)argbuf, retbuf, arglen);
426-
}
427-
else if (arglen >= 16) {
209+
#if PY_HEXLIFY_CAN_COMPILE_SSE2
210+
/* Use SSE2 for inputs >= 16 bytes (always available on x86-64) */
211+
if (arglen >= 16) {
428212
_Py_hexlify_sse2((const unsigned char *)argbuf, retbuf, arglen);
429213
}
430214
else

0 commit comments

Comments (0)