Skip to content

Commit ae3d7be

Browse files
gpshead authored and claude committed
pystrhex: Replace SSE2/NEON with portable SIMD using GCC vector extensions
Replace separate platform-specific SSE2 and NEON implementations with a single unified implementation using GCC/Clang vector extensions. The portable code uses __builtin_shufflevector for interleave operations, which compiles to native SIMD instructions: - x86-64: punpcklbw/punpckhbw (SSE2) - ARM64: zip1/zip2 (NEON) This eliminates code duplication while maintaining SIMD performance. Requires GCC 12+ or Clang 3.0+ on x86-64 or ARM64. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 015cc55 commit ae3d7be

1 file changed

Lines changed: 54 additions & 103 deletions

File tree

Python/pystrhex.c

Lines changed: 54 additions & 103 deletions
Original file line number | Diff line number | Diff line change
@@ -4,129 +4,86 @@
44
#include "pycore_strhex.h" // _Py_strhex_with_sep()
55
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
66

/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.

   Uses __builtin_shufflevector for portable interleave that compiles to
   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64,
   NEON zip1/zip2 on ARM64).

   Requirements:
   - GCC 12+ or Clang 3.0+ (for __builtin_shufflevector)
   - x86-64 or ARM64 architecture */
#if (defined(__x86_64__) || defined(__aarch64__)) && \
    (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12))
#  define PY_HEXLIFY_CAN_COMPILE_SIMD 1
#else
#  define PY_HEXLIFY_CAN_COMPILE_SIMD 0
#endif

#if PY_HEXLIFY_CAN_COMPILE_SIMD

/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));

/* Splat a byte value across all 16 lanes */
static inline v16u8
v16u8_splat(unsigned char x)
{
    return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}

/* Portable SIMD hexlify: converts 16 bytes to 32 lowercase hex chars per
   iteration, falling back to scalar code for the 0-15 byte tail.
   Compiles to native SSE2 on x86-64, NEON on ARM64.

   src: input bytes to encode.
   dst: output buffer; must hold at least 2*len bytes.
   len: number of input bytes (any non-negative count is handled). */
static void
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    const v16u8 mask_0f = v16u8_splat(0x0f);
    const v16u8 ascii_0 = v16u8_splat('0');
    const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */
    const v16u8 nine = v16u8_splat(9);

    Py_ssize_t i = 0;

    /* Process 16 bytes at a time */
    for (; i + 16 <= len; i += 16, dst += 32) {
        /* Load 16 bytes (memcpy for safe unaligned access) */
        v16u8 data;
        memcpy(&data, src + i, 16);

        /* Extract high and low nibbles using vector operators */
        v16u8 hi = (data >> 4) & mask_0f;
        v16u8 lo = data & mask_0f;

        /* Compare > 9 produces an all-ones lane mask where true.
           The comparison yields a signed integer vector; cast it back to
           v16u8 explicitly so the code does not depend on lax vector
           conversions (-flax-vector-conversions) to compile. */
        v16u8 hi_gt9 = (v16u8)(hi > nine);
        v16u8 lo_gt9 = (v16u8)(lo > nine);

        /* Convert nibbles to hex ASCII: add '0', plus 0x27 where nibble > 9 */
        hi = hi + ascii_0 + (hi_gt9 & offset);
        lo = lo + ascii_0 + (lo_gt9 & offset);

        /* Interleave hi/lo nibbles using portable shufflevector.
           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64. */
        v16u8 result0 = __builtin_shufflevector(hi, lo,
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        v16u8 result1 = __builtin_shufflevector(hi, lo,
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

        /* Store 32 hex characters (memcpy for safe unaligned access) */
        memcpy(dst, &result0, 16);
        memcpy(dst + 16, &result1, 16);
    }

    /* Scalar fallback for remaining 0-15 bytes */
    for (; i < len; i++, dst += 2) {
        unsigned int c = src[i];
        unsigned int h = c >> 4;
        unsigned int l = c & 0x0f;
        dst[0] = (Py_UCS1)(h + '0' + (h > 9) * ('a' - '0' - 10));
        dst[1] = (Py_UCS1)(l + '0' + (l > 9) * ('a' - '0' - 10));
    }
}

#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
13087

13188
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
13289
PyObject* sep, int bytes_per_sep_group,
@@ -206,16 +163,10 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
206163
unsigned char c;
207164

208165
if (bytes_per_sep_group == 0) {
209-
#if PY_HEXLIFY_CAN_COMPILE_SSE2
210-
/* Use SSE2 for inputs >= 16 bytes (always available on x86-64) */
211-
if (arglen >= 16) {
212-
_Py_hexlify_sse2((const unsigned char *)argbuf, retbuf, arglen);
213-
}
214-
else
215-
#elif PY_HEXLIFY_CAN_COMPILE_NEON
216-
/* Use NEON for inputs >= 16 bytes (always available on AArch64) */
166+
#if PY_HEXLIFY_CAN_COMPILE_SIMD
167+
/* Use portable SIMD for inputs >= 16 bytes */
217168
if (arglen >= 16) {
218-
_Py_hexlify_neon((const unsigned char *)argbuf, retbuf, arglen);
169+
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
219170
}
220171
else
221172
#endif

0 commit comments

Comments
 (0)