Skip to content

Commit bdceb9c

Browse files
gpshead authored and Claude committed
pystrhex: Add ARM NEON SIMD optimization for hex conversion
Add NEON vectorized implementation for AArch64 that processes 16 bytes per iteration using 128-bit NEON registers. Uses the same nibble-to-hex arithmetic approach as AVX2/AVX-512 versions. NEON is always available on AArch64, so no runtime detection is needed. The implementation uses vzip1q_u8/vzip2q_u8 for interleaving high/low nibbles into the correct output order. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 90da084 commit bdceb9c

1 file changed

Lines changed: 74 additions & 6 deletions

File tree

Python/pystrhex.c

Lines changed: 74 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,24 @@
55
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
66

77
/* SIMD optimization for hexlify.
8-
Only available on x86-64 with GCC/Clang. */
8+
x86-64: AVX2/AVX-512 with runtime detection
9+
ARM64: NEON (always available on AArch64) */
910
#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
10-
# define PY_HEXLIFY_CAN_COMPILE_SIMD 1
11+
# define PY_HEXLIFY_CAN_COMPILE_X86_SIMD 1
1112
# include <cpuid.h>
1213
# include <immintrin.h>
1314
#else
14-
# define PY_HEXLIFY_CAN_COMPILE_SIMD 0
15+
# define PY_HEXLIFY_CAN_COMPILE_X86_SIMD 0
1516
#endif
1617

17-
#if PY_HEXLIFY_CAN_COMPILE_SIMD
18+
#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
19+
# define PY_HEXLIFY_CAN_COMPILE_NEON 1
20+
# include <arm_neon.h>
21+
#else
22+
# define PY_HEXLIFY_CAN_COMPILE_NEON 0
23+
#endif
24+
25+
#if PY_HEXLIFY_CAN_COMPILE_X86_SIMD
1826

1927
/* Runtime CPU feature detection (lazy initialization)
2028
-1 = not checked, 0 = no SIMD, 1 = AVX2, 2 = AVX-512 */
@@ -223,7 +231,61 @@ _Py_hexlify_avx512(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
223231
}
224232
}
225233

226-
#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
234+
#endif /* PY_HEXLIFY_CAN_COMPILE_X86_SIMD */
235+
236+
#if PY_HEXLIFY_CAN_COMPILE_NEON
237+
238+
/* ARM NEON accelerated hexlify: converts 16 bytes to 32 hex chars per iteration.
   NEON is always available on AArch64, no runtime detection needed.

   src: input bytes; dst: output buffer (must hold 2*len bytes); len: input length.
   Output is lowercase hex, high nibble first, matching the scalar path. */
static void
_Py_hexlify_neon(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    /* Per-lane constants, broadcast across all 16 byte lanes. */
    const uint8x16_t mask_0f = vdupq_n_u8(0x0f);            /* low-nibble mask */
    const uint8x16_t ascii_0 = vdupq_n_u8('0');             /* base for digits 0-9 */
    const uint8x16_t offset = vdupq_n_u8('a' - '0' - 10);   /* 0x27: extra offset for a-f */
    const uint8x16_t nine = vdupq_n_u8(9);                  /* digit/letter threshold */

    Py_ssize_t i = 0;

    /* Process 16 bytes at a time */
    for (; i + 16 <= len; i += 16, dst += 32) {
        /* Load 16 input bytes */
        uint8x16_t data = vld1q_u8(src + i);

        /* Extract high and low nibbles.  vshrq_n_u8 zero-fills, so the mask on
           `hi` is defensive; the compare below needs values in 0..15. */
        uint8x16_t hi = vandq_u8(vshrq_n_u8(data, 4), mask_0f);
        uint8x16_t lo = vandq_u8(data, mask_0f);

        /* Convert nibbles to hex: add '0', then add 0x27 where nibble > 9.
           The compare runs on the raw nibble (before adding '0'); vcgtq_u8
           yields all-ones lanes, so AND-ing with `offset` selects 0x27 or 0. */
        uint8x16_t hi_gt9 = vcgtq_u8(hi, nine);
        uint8x16_t lo_gt9 = vcgtq_u8(lo, nine);

        hi = vaddq_u8(hi, ascii_0);
        lo = vaddq_u8(lo, ascii_0);
        hi = vaddq_u8(hi, vandq_u8(hi_gt9, offset));
        lo = vaddq_u8(lo, vandq_u8(lo_gt9, offset));

        /* Interleave hi/lo nibbles to get correct output order (high hex digit
           of each byte first).  vzip1/vzip2 interleave the low/high halves
           respectively: vzip1q_u8(hi, lo) = hi[0], lo[0], hi[1], lo[1], ... */
        uint8x16_t result0 = vzip1q_u8(hi, lo); /* First 16 hex chars */
        uint8x16_t result1 = vzip2q_u8(hi, lo); /* Second 16 hex chars */

        /* Store 32 hex characters */
        vst1q_u8(dst, result0);
        vst1q_u8(dst + 16, result1);
    }

    /* Scalar fallback for remaining 0-15 bytes; same arithmetic as the vector
       path: (nibble > 9) evaluates to 0 or 1, scaling the a-f offset. */
    for (; i < len; i++, dst += 2) {
        unsigned int c = src[i];
        unsigned int hi = c >> 4;
        unsigned int lo = c & 0x0f;
        dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10));
        dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10));
    }
}
287+
288+
#endif /* PY_HEXLIFY_CAN_COMPILE_NEON */
227289

228290
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
229291
PyObject* sep, int bytes_per_sep_group,
@@ -303,7 +365,7 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
303365
unsigned char c;
304366

305367
if (bytes_per_sep_group == 0) {
306-
#if PY_HEXLIFY_CAN_COMPILE_SIMD
368+
#if PY_HEXLIFY_CAN_COMPILE_X86_SIMD
307369
int simd_level = _Py_hexlify_get_simd_level();
308370
/* Use AVX-512 for inputs >= 64 bytes, AVX2 for >= 32 bytes */
309371
if (arglen >= 64 && simd_level >= PY_HEXLIFY_SIMD_AVX512) {
@@ -313,6 +375,12 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
313375
_Py_hexlify_avx2((const unsigned char *)argbuf, retbuf, arglen);
314376
}
315377
else
378+
#elif PY_HEXLIFY_CAN_COMPILE_NEON
379+
/* Use NEON for inputs >= 16 bytes (always available on AArch64) */
380+
if (arglen >= 16) {
381+
_Py_hexlify_neon((const unsigned char *)argbuf, retbuf, arglen);
382+
}
383+
else
316384
#endif
317385
{
318386
for (i = j = 0; i < arglen; ++i) {

0 commit comments

Comments
 (0)