55#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
66
77/* SIMD optimization for hexlify.
8- Only available on x86-64 with GCC/Clang. */
8+ x86-64: AVX2/AVX-512 with runtime detection
9+ ARM64: NEON (always available on AArch64) */
910#if defined(__x86_64__ ) && (defined(__GNUC__ ) || defined(__clang__ ))
10- # define PY_HEXLIFY_CAN_COMPILE_SIMD 1
11+ # define PY_HEXLIFY_CAN_COMPILE_X86_SIMD 1
1112# include <cpuid.h>
1213# include <immintrin.h>
1314#else
14- # define PY_HEXLIFY_CAN_COMPILE_SIMD 0
15+ # define PY_HEXLIFY_CAN_COMPILE_X86_SIMD 0
1516#endif
1617
17- #if PY_HEXLIFY_CAN_COMPILE_SIMD
18+ #if defined(__aarch64__ ) && (defined(__GNUC__ ) || defined(__clang__ ))
19+ # define PY_HEXLIFY_CAN_COMPILE_NEON 1
20+ # include <arm_neon.h>
21+ #else
22+ # define PY_HEXLIFY_CAN_COMPILE_NEON 0
23+ #endif
24+
25+ #if PY_HEXLIFY_CAN_COMPILE_X86_SIMD
1826
1927/* Runtime CPU feature detection (lazy initialization)
2028 -1 = not checked, 0 = no SIMD, 1 = AVX2, 2 = AVX-512 */
@@ -223,7 +231,61 @@ _Py_hexlify_avx512(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
223231 }
224232}
225233
226- #endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
234+ #endif /* PY_HEXLIFY_CAN_COMPILE_X86_SIMD */
235+
236+ #if PY_HEXLIFY_CAN_COMPILE_NEON
237+
/* ARM NEON accelerated hexlify: converts 16 input bytes into 32 lowercase
   hex characters per loop iteration.  NEON is a mandatory part of AArch64,
   so no runtime feature detection is needed (unlike the x86 paths above).

   src: raw input bytes (len of them)
   dst: output buffer; must have room for 2*len Py_UCS1 characters
   len: number of input bytes (may be any non-negative value; a scalar
        tail loop handles the final len % 16 bytes) */
static void
_Py_hexlify_neon(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    const uint8x16_t low_nibble_mask = vdupq_n_u8(0x0f);
    const uint8x16_t ascii_0 = vdupq_n_u8('0');
    /* Distance from the '0'..'9' range to the 'a'..'f' range for digit
       values 10..15: 'a' - '0' - 10 == 0x27. */
    const uint8x16_t letter_offset = vdupq_n_u8('a' - '0' - 10);
    const uint8x16_t nine = vdupq_n_u8(9);

    Py_ssize_t i = 0;

    /* Vector loop: 16 input bytes -> 32 hex chars per iteration. */
    for (; i + 16 <= len; i += 16, dst += 32) {
        uint8x16_t data = vld1q_u8(src + i);

        /* Split each byte into its high and low nibble.  A logical right
           shift on unsigned 8-bit lanes zero-fills the top bits, so the
           high nibble needs no additional masking. */
        uint8x16_t hi = vshrq_n_u8(data, 4);
        uint8x16_t lo = vandq_u8(data, low_nibble_mask);

        /* vcgtq_u8 yields 0xFF in lanes where nibble > 9, 0x00 otherwise;
           ANDing with letter_offset selects the extra 0x27 only for the
           lanes that must map to 'a'..'f'. */
        uint8x16_t hi_gt9 = vcgtq_u8(hi, nine);
        uint8x16_t lo_gt9 = vcgtq_u8(lo, nine);

        hi = vaddq_u8(hi, ascii_0);
        lo = vaddq_u8(lo, ascii_0);
        hi = vaddq_u8(hi, vandq_u8(hi_gt9, letter_offset));
        lo = vaddq_u8(lo, vandq_u8(lo_gt9, letter_offset));

        /* Interleave so each output pair is high digit then low digit:
           vzip1q interleaves the low halves (hi[0],lo[0],...,hi[7],lo[7]),
           vzip2q the high halves. */
        uint8x16_t result0 = vzip1q_u8(hi, lo);  /* First 16 hex chars */
        uint8x16_t result1 = vzip2q_u8(hi, lo);  /* Second 16 hex chars */

        vst1q_u8(dst, result0);
        vst1q_u8(dst + 16, result1);
    }

    /* Scalar tail for the remaining 0-15 bytes. */
    for (; i < len; i++, dst += 2) {
        unsigned int c = src[i];
        unsigned int hi = c >> 4;
        unsigned int lo = c & 0x0f;
        dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10));
        dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10));
    }
}
287+
288+ #endif /* PY_HEXLIFY_CAN_COMPILE_NEON */
227289
228290static PyObject * _Py_strhex_impl (const char * argbuf , const Py_ssize_t arglen ,
229291 PyObject * sep , int bytes_per_sep_group ,
@@ -303,7 +365,7 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
303365 unsigned char c ;
304366
305367 if (bytes_per_sep_group == 0 ) {
306- #if PY_HEXLIFY_CAN_COMPILE_SIMD
368+ #if PY_HEXLIFY_CAN_COMPILE_X86_SIMD
307369 int simd_level = _Py_hexlify_get_simd_level ();
308370 /* Use AVX-512 for inputs >= 64 bytes, AVX2 for >= 32 bytes */
309371 if (arglen >= 64 && simd_level >= PY_HEXLIFY_SIMD_AVX512 ) {
@@ -313,6 +375,12 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
313375 _Py_hexlify_avx2 ((const unsigned char * )argbuf , retbuf , arglen );
314376 }
315377 else
378+ #elif PY_HEXLIFY_CAN_COMPILE_NEON
379+ /* Use NEON for inputs >= 16 bytes (always available on AArch64) */
380+ if (arglen >= 16 ) {
381+ _Py_hexlify_neon ((const unsigned char * )argbuf , retbuf , arglen );
382+ }
383+ else
316384#endif
317385 {
318386 for (i = j = 0 ; i < arglen ; ++ i ) {
0 commit comments