5 | 5 | #include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency() |
6 | 6 |
7 | 7 | /* SIMD optimization for hexlify. |
8 | | - x86-64: AVX2/AVX-512 with runtime detection |
| 8 | + x86-64: SSE2 (always available, part of x86-64 baseline) |
9 | 9 | ARM64: NEON (always available on AArch64) */ |
10 | 10 | #if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__)) |
11 | | -# define PY_HEXLIFY_CAN_COMPILE_X86_SIMD 1 |
12 | | -# include <cpuid.h> |
13 | | -# include <immintrin.h> |
| 11 | +# define PY_HEXLIFY_CAN_COMPILE_SSE2 1 |
| 12 | +# include <emmintrin.h> |
14 | 13 | #else |
15 | | -# define PY_HEXLIFY_CAN_COMPILE_X86_SIMD 0 |
| 14 | +# define PY_HEXLIFY_CAN_COMPILE_SSE2 0 |
16 | 15 | #endif |
17 | 16 |
18 | 17 | #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) |
19 | 18 | # define PY_HEXLIFY_CAN_COMPILE_NEON 1 |
20 | 19 | # include <arm_neon.h> |
21 | 20 | #else |
22 | 21 | # define PY_HEXLIFY_CAN_COMPILE_NEON 0 |
23 | 22 | #endif |
24 | 23 |
25 | | -#if PY_HEXLIFY_CAN_COMPILE_X86_SIMD |
26 | | - |
27 | | -/* Runtime CPU feature detection (lazy initialization) |
28 | | - -1 = not checked, 0 = no SIMD, 1 = AVX2, 2 = AVX-512 */ |
29 | | -static int _Py_hexlify_simd_level = -1; |
30 | | - |
31 | | -#define PY_HEXLIFY_SIMD_NONE 0 |
32 | | -#define PY_HEXLIFY_SIMD_AVX2 1 |
33 | | -#define PY_HEXLIFY_SIMD_AVX512 2 |
34 | | - |
35 | | -static void |
36 | | -_Py_hexlify_detect_cpu_features(void) |
37 | | -{ |
38 | | - unsigned int eax, ebx, ecx, edx; |
39 | | - |
40 | | - _Py_hexlify_simd_level = PY_HEXLIFY_SIMD_NONE; |
41 | | - |
42 | | - if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) { |
43 | | - return; |
44 | | - } |
45 | | - |
46 | | - /* Check for AVX2: CPUID.7H:EBX bit 5 */ |
47 | | - int has_avx2 = (ebx & (1 << 5)) != 0; |
48 | | - |
49 | | - /* Check for AVX-512F + AVX-512BW + AVX-512VBMI: |
50 | | - CPUID.7H:EBX bits 16 and 30, ECX bit 1 */ |
51 | | - int has_avx512f = (ebx & (1 << 16)) != 0; |
52 | | - int has_avx512bw = (ebx & (1 << 30)) != 0; |
53 | | - int has_avx512vbmi = (ecx & (1 << 1)) != 0; |
54 | | - |
55 | | - if (has_avx512f && has_avx512bw && has_avx512vbmi) { |
56 | | - _Py_hexlify_simd_level = PY_HEXLIFY_SIMD_AVX512; |
57 | | - } else if (has_avx2) { |
58 | | - _Py_hexlify_simd_level = PY_HEXLIFY_SIMD_AVX2; |
59 | | - } |
60 | | -} |
61 | | - |
62 | | -static inline int |
63 | | -_Py_hexlify_get_simd_level(void) |
64 | | -{ |
65 | | - if (_Py_hexlify_simd_level < 0) { |
66 | | - _Py_hexlify_detect_cpu_features(); |
67 | | - } |
68 | | - return _Py_hexlify_simd_level; |
69 | | -} |
| 24 | +#if PY_HEXLIFY_CAN_COMPILE_SSE2 |
70 | 25 |
71 | 26 | /* SSE2-accelerated hexlify: converts 16 bytes to 32 hex chars per iteration. |
72 | 27 | SSE2 is always available on x86-64 (part of AMD64 baseline). */ |
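The body of _Py_hexlify_sse2 is collapsed in this hunk (only its closing braces appear below). For orientation, a minimal sketch of a 16-bytes-per-iteration SSE2 kernel of this shape — assuming it uses the same arithmetic nibble-to-hex trick as the AVX2 code being removed; the function name and all details here are illustrative, not the patch's actual body:

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stddef.h>

/* Illustrative only: convert len bytes to 2*len lowercase hex chars. */
static void
hexlify_sse2_sketch(const unsigned char *src, unsigned char *dst, size_t len)
{
    const __m128i mask_0f = _mm_set1_epi8(0x0f);
    const __m128i ascii_0 = _mm_set1_epi8('0');
    const __m128i offset  = _mm_set1_epi8('a' - '0' - 10);  /* 0x27 */
    const __m128i nine    = _mm_set1_epi8(9);

    size_t i = 0;
    /* 16 input bytes -> 32 hex chars per iteration */
    for (; i + 16 <= len; i += 16, dst += 32) {
        __m128i data = _mm_loadu_si128((const __m128i *)(src + i));

        /* SSE2 has no 8-bit shift, so shift 16-bit lanes and mask. */
        __m128i hi = _mm_and_si128(_mm_srli_epi16(data, 4), mask_0f);
        __m128i lo = _mm_and_si128(data, mask_0f);

        /* nibble + '0', plus 0x27 where nibble > 9 (maps to 'a'..'f') */
        __m128i hi_gt9 = _mm_cmpgt_epi8(hi, nine);
        __m128i lo_gt9 = _mm_cmpgt_epi8(lo, nine);
        hi = _mm_add_epi8(_mm_add_epi8(hi, ascii_0),
                          _mm_and_si128(hi_gt9, offset));
        lo = _mm_add_epi8(_mm_add_epi8(lo, ascii_0),
                          _mm_and_si128(lo_gt9, offset));

        /* unpacklo/unpackhi interleave to H0 L0 H1 L1 ... directly */
        _mm_storeu_si128((__m128i *)dst,        _mm_unpacklo_epi8(hi, lo));
        _mm_storeu_si128((__m128i *)(dst + 16), _mm_unpackhi_epi8(hi, lo));
    }

    /* Scalar tail for the remaining 0-15 bytes */
    for (; i < len; i++, dst += 2) {
        unsigned int c = src[i], h = c >> 4, l = c & 0x0f;
        dst[0] = (unsigned char)(h + '0' + (h > 9) * ('a' - '0' - 10));
        dst[1] = (unsigned char)(l + '0' + (l > 9) * ('a' - '0' - 10));
    }
}

One nicety of the 128-bit version over the removed AVX2 one: _mm_unpacklo_epi8/_mm_unpackhi_epi8 already produce the final byte order, so the cross-lane _mm256_permute2x128_si256 fixup disappears.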
@@ -117,170 +72,7 @@ _Py_hexlify_sse2(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
117 | 72 | } |
118 | 73 | } |
119 | 74 |
120 | | -/* AVX2-accelerated hexlify: converts 32 bytes to 64 hex chars per iteration. |
121 | | - Uses arithmetic nibble-to-hex conversion instead of table lookup. */ |
122 | | -__attribute__((target("avx2"))) |
123 | | -static void |
124 | | -_Py_hexlify_avx2(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
125 | | -{ |
126 | | - const __m256i mask_0f = _mm256_set1_epi8(0x0f); |
127 | | - const __m256i ascii_0 = _mm256_set1_epi8('0'); |
128 | | - const __m256i offset = _mm256_set1_epi8('a' - '0' - 10); /* 0x27 */ |
129 | | - const __m256i nine = _mm256_set1_epi8(9); |
130 | | - |
131 | | - Py_ssize_t i = 0; |
132 | | - |
133 | | - /* Process 32 bytes at a time */ |
134 | | - for (; i + 32 <= len; i += 32, dst += 64) { |
135 | | - /* Load 32 input bytes */ |
136 | | - __m256i data = _mm256_loadu_si256((const __m256i *)(src + i)); |
137 | | - |
138 | | - /* Extract high and low nibbles */ |
139 | | - __m256i hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), mask_0f); |
140 | | - __m256i lo = _mm256_and_si256(data, mask_0f); |
141 | | - |
142 | | - /* Convert nibbles to hex: add '0', then add 0x27 where nibble > 9 */ |
143 | | - __m256i hi_gt9 = _mm256_cmpgt_epi8(hi, nine); |
144 | | - __m256i lo_gt9 = _mm256_cmpgt_epi8(lo, nine); |
145 | | - |
146 | | - hi = _mm256_add_epi8(hi, ascii_0); |
147 | | - lo = _mm256_add_epi8(lo, ascii_0); |
148 | | - hi = _mm256_add_epi8(hi, _mm256_and_si256(hi_gt9, offset)); |
149 | | - lo = _mm256_add_epi8(lo, _mm256_and_si256(lo_gt9, offset)); |
150 | | - |
151 | | - /* Interleave hi/lo nibbles to get correct output order. |
152 | | - unpacklo/hi work within 128-bit lanes, so we need permute to fix. */ |
153 | | - __m256i mixed_lo = _mm256_unpacklo_epi8(hi, lo); |
154 | | - __m256i mixed_hi = _mm256_unpackhi_epi8(hi, lo); |
155 | | - |
156 | | - /* Fix cross-lane ordering */ |
157 | | - __m256i result0 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x20); |
158 | | - __m256i result1 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x31); |
159 | | - |
160 | | - /* Store 64 hex characters */ |
161 | | - _mm256_storeu_si256((__m256i *)dst, result0); |
162 | | - _mm256_storeu_si256((__m256i *)(dst + 32), result1); |
163 | | - } |
164 | | - |
165 | | - /* Scalar fallback for remaining 0-31 bytes */ |
166 | | - for (; i < len; i++, dst += 2) { |
167 | | - unsigned int c = src[i]; |
168 | | - unsigned int hi = c >> 4; |
169 | | - unsigned int lo = c & 0x0f; |
170 | | - dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10)); |
171 | | - dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10)); |
172 | | - } |
173 | | -} |
174 | | - |
175 | | -/* AVX-512 accelerated hexlify: converts 64 bytes to 128 hex chars per iteration. |
176 | | - Requires AVX-512F, AVX-512BW, and AVX-512VBMI for byte-level permutation. */ |
177 | | -__attribute__((target("avx512f,avx512bw,avx512vbmi"))) |
178 | | -static void |
179 | | -_Py_hexlify_avx512(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
180 | | -{ |
181 | | - const __m512i mask_0f = _mm512_set1_epi8(0x0f); |
182 | | - const __m512i ascii_0 = _mm512_set1_epi8('0'); |
183 | | - const __m512i ascii_a = _mm512_set1_epi8('a' - 10); |
184 | | - const __m512i nine = _mm512_set1_epi8(9); |
185 | | - |
186 | | - /* Permutation indices for interleaving hi/lo nibbles. |
187 | | - We need to transform: |
188 | | - hi: H0 H1 H2 ... H63 |
189 | | - lo: L0 L1 L2 ... L63 |
190 | | - into: |
191 | | - out0: H0 L0 H1 L1 ... H31 L31 |
192 | | - out1: H32 L32 H33 L33 ... H63 L63 |
193 | | - */ |
194 | | - const __m512i interleave_lo = _mm512_set_epi8( |
195 | | - 39, 103, 38, 102, 37, 101, 36, 100, 35, 99, 34, 98, 33, 97, 32, 96, |
196 | | - 47, 111, 46, 110, 45, 109, 44, 108, 43, 107, 42, 106, 41, 105, 40, 104, |
197 | | - 55, 119, 54, 118, 53, 117, 52, 116, 51, 115, 50, 114, 49, 113, 48, 112, |
198 | | - 63, 127, 62, 126, 61, 125, 60, 124, 59, 123, 58, 122, 57, 121, 56, 120 |
199 | | - ); |
200 | | - const __m512i interleave_hi = _mm512_set_epi8( |
201 | | - 7, 71, 6, 70, 5, 69, 4, 68, 3, 67, 2, 66, 1, 65, 0, 64, |
202 | | - 15, 79, 14, 78, 13, 77, 12, 76, 11, 75, 10, 74, 9, 73, 8, 72, |
203 | | - 23, 87, 22, 86, 21, 85, 20, 84, 19, 83, 18, 82, 17, 81, 16, 80, |
204 | | - 31, 95, 30, 94, 29, 93, 28, 92, 27, 91, 26, 90, 25, 89, 24, 88 |
205 | | - ); |
206 | | - |
207 | | - Py_ssize_t i = 0; |
208 | | - |
209 | | - /* Process 64 bytes at a time */ |
210 | | - for (; i + 64 <= len; i += 64, dst += 128) { |
211 | | - /* Load 64 input bytes */ |
212 | | - __m512i data = _mm512_loadu_si512((const __m512i *)(src + i)); |
213 | | - |
214 | | - /* Extract high and low nibbles */ |
215 | | - __m512i hi = _mm512_and_si512(_mm512_srli_epi16(data, 4), mask_0f); |
216 | | - __m512i lo = _mm512_and_si512(data, mask_0f); |
217 | | - |
218 | | - /* Convert nibbles to hex using masked blend: |
219 | | - if nibble > 9: use 'a' + (nibble - 10) = nibble + ('a' - 10) |
220 | | - else: use '0' + nibble */ |
221 | | - __mmask64 hi_alpha = _mm512_cmpgt_epi8_mask(hi, nine); |
222 | | - __mmask64 lo_alpha = _mm512_cmpgt_epi8_mask(lo, nine); |
223 | | - |
224 | | - __m512i hi_digit = _mm512_add_epi8(hi, ascii_0); |
225 | | - __m512i hi_letter = _mm512_add_epi8(hi, ascii_a); |
226 | | - hi = _mm512_mask_blend_epi8(hi_alpha, hi_digit, hi_letter); |
227 | | - |
228 | | - __m512i lo_digit = _mm512_add_epi8(lo, ascii_0); |
229 | | - __m512i lo_letter = _mm512_add_epi8(lo, ascii_a); |
230 | | - lo = _mm512_mask_blend_epi8(lo_alpha, lo_digit, lo_letter); |
231 | | - |
232 | | - /* Interleave hi/lo to get correct output order using permutex2var */ |
233 | | - __m512i result0 = _mm512_permutex2var_epi8(hi, interleave_hi, lo); |
234 | | - __m512i result1 = _mm512_permutex2var_epi8(hi, interleave_lo, lo); |
235 | | - |
236 | | - /* Store 128 hex characters */ |
237 | | - _mm512_storeu_si512((__m512i *)dst, result0); |
238 | | - _mm512_storeu_si512((__m512i *)(dst + 64), result1); |
239 | | - } |
240 | | - |
241 | | - /* Use AVX2 for remaining 32-63 bytes */ |
242 | | - if (i + 32 <= len) { |
243 | | - const __m256i mask_0f_256 = _mm256_set1_epi8(0x0f); |
244 | | - const __m256i ascii_0_256 = _mm256_set1_epi8('0'); |
245 | | - const __m256i offset_256 = _mm256_set1_epi8('a' - '0' - 10); |
246 | | - const __m256i nine_256 = _mm256_set1_epi8(9); |
247 | | - |
248 | | - __m256i data = _mm256_loadu_si256((const __m256i *)(src + i)); |
249 | | - __m256i hi = _mm256_and_si256(_mm256_srli_epi16(data, 4), mask_0f_256); |
250 | | - __m256i lo = _mm256_and_si256(data, mask_0f_256); |
251 | | - |
252 | | - __m256i hi_gt9 = _mm256_cmpgt_epi8(hi, nine_256); |
253 | | - __m256i lo_gt9 = _mm256_cmpgt_epi8(lo, nine_256); |
254 | | - |
255 | | - hi = _mm256_add_epi8(hi, ascii_0_256); |
256 | | - lo = _mm256_add_epi8(lo, ascii_0_256); |
257 | | - hi = _mm256_add_epi8(hi, _mm256_and_si256(hi_gt9, offset_256)); |
258 | | - lo = _mm256_add_epi8(lo, _mm256_and_si256(lo_gt9, offset_256)); |
259 | | - |
260 | | - __m256i mixed_lo = _mm256_unpacklo_epi8(hi, lo); |
261 | | - __m256i mixed_hi = _mm256_unpackhi_epi8(hi, lo); |
262 | | - |
263 | | - __m256i r0 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x20); |
264 | | - __m256i r1 = _mm256_permute2x128_si256(mixed_lo, mixed_hi, 0x31); |
265 | | - |
266 | | - _mm256_storeu_si256((__m256i *)dst, r0); |
267 | | - _mm256_storeu_si256((__m256i *)(dst + 32), r1); |
268 | | - |
269 | | - i += 32; |
270 | | - dst += 64; |
271 | | - } |
272 | | - |
273 | | - /* Scalar fallback for remaining 0-31 bytes */ |
274 | | - for (; i < len; i++, dst += 2) { |
275 | | - unsigned int c = src[i]; |
276 | | - unsigned int hi = c >> 4; |
277 | | - unsigned int lo = c & 0x0f; |
278 | | - dst[0] = (Py_UCS1)(hi + '0' + (hi > 9) * ('a' - '0' - 10)); |
279 | | - dst[1] = (Py_UCS1)(lo + '0' + (lo > 9) * ('a' - '0' - 10)); |
280 | | - } |
281 | | -} |
282 | | - |
283 | | -#endif /* PY_HEXLIFY_CAN_COMPILE_X86_SIMD */ |
| 75 | +#endif /* PY_HEXLIFY_CAN_COMPILE_SSE2 */ |
284 | 76 |
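All of the scalar tails in this file share the same branchless nibble-to-hex expression; a worked instance (the helper name is illustrative, not from the patch):

/* Branchless nibble -> lowercase hex digit, as used by the scalar tails.
   For n in 0..9:   n + '0'                      -> '0'..'9'
   For n in 10..15: n + '0' + ('a' - '0' - 10)   -> 'a'..'f'
   (the (n > 9) multiply selects the 0x27 correction without a branch) */
static unsigned char
nibble_to_hex(unsigned int n)   /* illustrative helper, n in 0..15 */
{
    return (unsigned char)(n + '0' + (n > 9) * ('a' - '0' - 10));
}
/* Example: byte 0xAB -> hi nibble 10 -> 10 + 48 + 39 = 97 = 'a',
                         lo nibble 11 -> 11 + 48 + 39 = 98 = 'b' */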
285 | 77 | #if PY_HEXLIFY_CAN_COMPILE_NEON |
286 | 78 |
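The NEON implementation is collapsed in this view. For comparison with the SSE2 kernel, here is a minimal sketch of what a 16-bytes-per-iteration NEON version typically looks like (again illustrative, not the file's actual code); vst2q_u8 interleaves the two registers on store, so no explicit unpack step is needed:

#include <arm_neon.h>
#include <stddef.h>

/* Illustrative only: NEON nibble-to-hex, 16 bytes -> 32 hex chars. */
static void
hexlify_neon_sketch(const unsigned char *src, unsigned char *dst, size_t len)
{
    const uint8x16_t mask_0f = vdupq_n_u8(0x0f);
    const uint8x16_t ascii_0 = vdupq_n_u8('0');
    const uint8x16_t offset  = vdupq_n_u8('a' - '0' - 10);  /* 0x27 */
    const uint8x16_t nine    = vdupq_n_u8(9);

    size_t i = 0;
    for (; i + 16 <= len; i += 16, dst += 32) {
        uint8x16_t data = vld1q_u8(src + i);
        uint8x16_t hi = vshrq_n_u8(data, 4);       /* high nibbles */
        uint8x16_t lo = vandq_u8(data, mask_0f);   /* low nibbles */

        /* vcgtq_u8 yields 0xFF where nibble > 9, selecting the 0x27 add */
        hi = vaddq_u8(vaddq_u8(hi, ascii_0),
                      vandq_u8(vcgtq_u8(hi, nine), offset));
        lo = vaddq_u8(vaddq_u8(lo, ascii_0),
                      vandq_u8(vcgtq_u8(lo, nine), offset));

        /* vst2q_u8 stores hi[0] lo[0] hi[1] lo[1] ... in one instruction */
        uint8x16x2_t out = { { hi, lo } };
        vst2q_u8(dst, out);
    }

    /* Scalar tail for the remaining 0-15 bytes */
    for (; i < len; i++, dst += 2) {
        unsigned int c = src[i], h = c >> 4, l = c & 0x0f;
        dst[0] = (unsigned char)(h + '0' + (h > 9) * ('a' - '0' - 10));
        dst[1] = (unsigned char)(l + '0' + (l > 9) * ('a' - '0' - 10));
    }
}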
@@ -414,17 +206,9 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen, |
414 | 206 | unsigned char c; |
415 | 207 |
|
416 | 208 | if (bytes_per_sep_group == 0) { |
417 | | -#if PY_HEXLIFY_CAN_COMPILE_X86_SIMD |
418 | | - int simd_level = _Py_hexlify_get_simd_level(); |
419 | | - /* Use AVX-512 for inputs >= 64 bytes, AVX2 for >= 32 bytes, |
420 | | - SSE2 for >= 16 bytes (SSE2 always available on x86-64) */ |
421 | | - if (arglen >= 64 && simd_level >= PY_HEXLIFY_SIMD_AVX512) { |
422 | | - _Py_hexlify_avx512((const unsigned char *)argbuf, retbuf, arglen); |
423 | | - } |
424 | | - else if (arglen >= 32 && simd_level >= PY_HEXLIFY_SIMD_AVX2) { |
425 | | - _Py_hexlify_avx2((const unsigned char *)argbuf, retbuf, arglen); |
426 | | - } |
427 | | - else if (arglen >= 16) { |
| 209 | +#if PY_HEXLIFY_CAN_COMPILE_SSE2 |
| 210 | + /* Use SSE2 for inputs >= 16 bytes (always available on x86-64) */ |
| 211 | + if (arglen >= 16) { |
428 | 212 | _Py_hexlify_sse2((const unsigned char *)argbuf, retbuf, arglen); |
429 | 213 | } |
430 | 214 | else |
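One way a reviewer might sanity-check the simplified dispatch: compare the vector kernel against a plain scalar reference across lengths straddling the 16-byte threshold, so both the SIMD loop and the scalar tail get exercised. A throwaway harness built on the illustrative hexlify_sse2_sketch above (not on CPython internals; hexlify_scalar_ref is likewise mine) could be:

#include <stdio.h>
#include <string.h>

/* Scalar reference, same formula as the tails above. */
static void
hexlify_scalar_ref(const unsigned char *src, unsigned char *dst, size_t len)
{
    for (size_t i = 0; i < len; i++, dst += 2) {
        unsigned int c = src[i], h = c >> 4, l = c & 0x0f;
        dst[0] = (unsigned char)(h + '0' + (h > 9) * ('a' - '0' - 10));
        dst[1] = (unsigned char)(l + '0' + (l > 9) * ('a' - '0' - 10));
    }
}

int main(void)
{
    unsigned char src[64], got[128], want[128];
    for (size_t i = 0; i < sizeof src; i++) {
        src[i] = (unsigned char)(i * 37 + 11);   /* arbitrary pattern */
    }
    /* Lengths 0..64 cover: scalar-only (<16), one vector block + tail, ... */
    for (size_t len = 0; len <= sizeof src; len++) {
        hexlify_scalar_ref(src, want, len);
        hexlify_sse2_sketch(src, got, len);      /* sketch defined earlier */
        if (memcmp(got, want, 2 * len) != 0) {
            printf("mismatch at len=%zu\n", len);
            return 1;
        }
    }
    puts("ok");
    return 0;
}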