|
4 | 4 | #include "pycore_strhex.h" // _Py_strhex_with_sep() |
5 | 5 | #include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency() |
6 | 6 |
|
/* Portable SIMD optimization for hexlify, built on the GCC/Clang generic
   vector extensions.  __builtin_shufflevector gives a portable byte
   interleave that lowers to native instructions: punpcklbw/punpckhbw
   (SSE2) on x86-64 and zip1/zip2 (NEON) on AArch64.

   Enabled only when both conditions hold:
     - the target is x86-64 or AArch64, and
     - the compiler is Clang (which has had __builtin_shufflevector since
       3.0) or GCC 12+, the first GCC release to support it. */
#if (defined(__x86_64__) || defined(__aarch64__)) && \
    (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12))
#  define PY_HEXLIFY_CAN_COMPILE_SIMD 1
#else
#  define PY_HEXLIFY_CAN_COMPILE_SIMD 0
#endif
| 22 | +#if PY_HEXLIFY_CAN_COMPILE_SIMD |
23 | 23 |
|
24 | | -#if PY_HEXLIFY_CAN_COMPILE_SSE2 |
/* 128-bit vector of 16 unsigned bytes (GCC/Clang vector extension). */
typedef unsigned char v16u8 __attribute__((vector_size(16)));

/* Broadcast a single byte value into every lane of a 16-byte vector. */
static inline v16u8
v16u8_splat(unsigned char b)
{
    v16u8 v = {b, b, b, b, b, b, b, b,
               b, b, b, b, b, b, b, b};
    return v;
}
74 | 33 |
|
75 | | -#endif /* PY_HEXLIFY_CAN_COMPILE_SSE2 */ |
76 | | - |
77 | | -#if PY_HEXLIFY_CAN_COMPILE_NEON |
78 | | - |
79 | | -/* ARM NEON accelerated hexlify: converts 16 bytes to 32 hex chars per iteration. |
80 | | - NEON is always available on AArch64, no runtime detection needed. */ |
| 34 | +/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration. |
| 35 | + Compiles to native SSE2 on x86-64, NEON on ARM64. */ |
81 | 36 | static void |
82 | | -_Py_hexlify_neon(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
| 37 | +_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
83 | 38 | { |
84 | | - const uint8x16_t mask_0f = vdupq_n_u8(0x0f); |
85 | | - const uint8x16_t ascii_0 = vdupq_n_u8('0'); |
86 | | - const uint8x16_t offset = vdupq_n_u8('a' - '0' - 10); /* 0x27 */ |
87 | | - const uint8x16_t nine = vdupq_n_u8(9); |
| 39 | + const v16u8 mask_0f = v16u8_splat(0x0f); |
| 40 | + const v16u8 ascii_0 = v16u8_splat('0'); |
| 41 | + const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */ |
| 42 | + const v16u8 nine = v16u8_splat(9); |
88 | 43 |
|
89 | 44 | Py_ssize_t i = 0; |
90 | 45 |
|
91 | 46 | /* Process 16 bytes at a time */ |
92 | 47 | for (; i + 16 <= len; i += 16, dst += 32) { |
93 | | - /* Load 16 input bytes */ |
94 | | - uint8x16_t data = vld1q_u8(src + i); |
| 48 | + /* Load 16 bytes (memcpy for safe unaligned access) */ |
| 49 | + v16u8 data; |
| 50 | + memcpy(&data, src + i, 16); |
95 | 51 |
|
96 | | - /* Extract high and low nibbles */ |
97 | | - uint8x16_t hi = vandq_u8(vshrq_n_u8(data, 4), mask_0f); |
98 | | - uint8x16_t lo = vandq_u8(data, mask_0f); |
| 52 | + /* Extract high and low nibbles using vector operators */ |
| 53 | + v16u8 hi = (data >> 4) & mask_0f; |
| 54 | + v16u8 lo = data & mask_0f; |
99 | 55 |
|
100 | | - /* Convert nibbles to hex: add '0', then add 0x27 where nibble > 9 */ |
101 | | - uint8x16_t hi_gt9 = vcgtq_u8(hi, nine); |
102 | | - uint8x16_t lo_gt9 = vcgtq_u8(lo, nine); |
| 56 | + /* Compare > 9 produces all-ones mask where true */ |
| 57 | + v16u8 hi_gt9 = hi > nine; |
| 58 | + v16u8 lo_gt9 = lo > nine; |
103 | 59 |
|
104 | | - hi = vaddq_u8(hi, ascii_0); |
105 | | - lo = vaddq_u8(lo, ascii_0); |
106 | | - hi = vaddq_u8(hi, vandq_u8(hi_gt9, offset)); |
107 | | - lo = vaddq_u8(lo, vandq_u8(lo_gt9, offset)); |
| 60 | + /* Convert nibbles to hex ASCII */ |
| 61 | + hi = hi + ascii_0 + (hi_gt9 & offset); |
| 62 | + lo = lo + ascii_0 + (lo_gt9 & offset); |
108 | 63 |
|
109 | | - /* Interleave hi/lo nibbles to get correct output order. |
110 | | - vzip1/vzip2 interleave the low/high halves respectively. */ |
111 | | - uint8x16_t result0 = vzip1q_u8(hi, lo); /* First 16 hex chars */ |
112 | | - uint8x16_t result1 = vzip2q_u8(hi, lo); /* Second 16 hex chars */ |
| 64 | + /* Interleave hi/lo nibbles using portable shufflevector. |
| 65 | + This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64. */ |
| 66 | + v16u8 result0 = __builtin_shufflevector(hi, lo, |
| 67 | + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); |
| 68 | + v16u8 result1 = __builtin_shufflevector(hi, lo, |
| 69 | + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); |
113 | 70 |
|
114 | 71 | /* Store 32 hex characters */ |
115 | | - vst1q_u8(dst, result0); |
116 | | - vst1q_u8(dst + 16, result1); |
| 72 | + memcpy(dst, &result0, 16); |
| 73 | + memcpy(dst + 16, &result1, 16); |
117 | 74 | } |
118 | 75 |
|
119 | 76 | /* Scalar fallback for remaining 0-15 bytes */ |
120 | 77 | for (; i < len; i++, dst += 2) { |
121 | 78 | unsigned int c = src[i]; |
122 | | - unsigned int hi = c >> 4; |
123 | | - unsigned int lo = c & 0x0f; |
124 | | - dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10)); |
125 | | - dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10)); |
| 79 | + unsigned int h = c >> 4; |
| 80 | + unsigned int l = c & 0x0f; |
| 81 | + dst[0] = (Py_UCS1)(h + '0' + (h > 9) * ('a' - '0' - 10)); |
| 82 | + dst[1] = (Py_UCS1)(l + '0' + (l > 9) * ('a' - '0' - 10)); |
126 | 83 | } |
127 | 84 | } |
128 | 85 |
|
129 | | -#endif /* PY_HEXLIFY_CAN_COMPILE_NEON */ |
| 86 | +#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */ |
130 | 87 |
|
131 | 88 | static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen, |
132 | 89 | PyObject* sep, int bytes_per_sep_group, |
@@ -206,16 +163,10 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen, |
206 | 163 | unsigned char c; |
207 | 164 |
|
208 | 165 | if (bytes_per_sep_group == 0) { |
209 | | -#if PY_HEXLIFY_CAN_COMPILE_SSE2 |
210 | | - /* Use SSE2 for inputs >= 16 bytes (always available on x86-64) */ |
211 | | - if (arglen >= 16) { |
212 | | - _Py_hexlify_sse2((const unsigned char *)argbuf, retbuf, arglen); |
213 | | - } |
214 | | - else |
215 | | -#elif PY_HEXLIFY_CAN_COMPILE_NEON |
216 | | - /* Use NEON for inputs >= 16 bytes (always available on AArch64) */ |
| 166 | +#if PY_HEXLIFY_CAN_COMPILE_SIMD |
| 167 | + /* Use portable SIMD for inputs >= 16 bytes */ |
217 | 168 | if (arglen >= 16) { |
218 | | - _Py_hexlify_neon((const unsigned char *)argbuf, retbuf, arglen); |
| 169 | + _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen); |
219 | 170 | } |
220 | 171 | else |
221 | 172 | #endif |
|
0 commit comments