Skip to content

Commit 123045b

Browse files
jbrockmendel and claude authored
PERF: use khash set types in ismember to avoid allocating unused vals array (#64434)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 38caf90 commit 123045b

4 files changed

Lines changed: 166 additions & 20 deletions

File tree

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 71 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,77 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
66

77
{{py:
88

9-
# name, dtype, ttype, c_type, to_c_type
9+
# name, dtype, ttype, c_type, to_c_type, set_ttype
1010
dtypes = [('Complex128', 'complex128', 'complex128',
11-
'khcomplex128_t', 'to_khcomplex128_t'),
11+
'khcomplex128_t', 'to_khcomplex128_t', 'set_complex128'),
1212
('Complex64', 'complex64', 'complex64',
13-
'khcomplex64_t', 'to_khcomplex64_t'),
14-
('Float64', 'float64', 'float64', 'float64_t', ''),
15-
('Float32', 'float32', 'float32', 'float32_t', ''),
16-
('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
17-
('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
18-
('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
19-
('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
20-
('Object', 'object', 'pymap', 'object', '<PyObject*>'),
21-
('Int64', 'int64', 'int64', 'int64_t', ''),
22-
('Int32', 'int32', 'int32', 'int32_t', ''),
23-
('Int16', 'int16', 'int16', 'int16_t', ''),
24-
('Int8', 'int8', 'int8', 'int8_t', '')]
13+
'khcomplex64_t', 'to_khcomplex64_t', 'set_complex64'),
14+
('Float64', 'float64', 'float64', 'float64_t', '', 'set_float64'),
15+
('Float32', 'float32', 'float32', 'float32_t', '', 'set_float32'),
16+
('UInt64', 'uint64', 'uint64', 'uint64_t', '', 'set_uint64'),
17+
('UInt32', 'uint32', 'uint32', 'uint32_t', '', 'set_uint32'),
18+
('UInt16', 'uint16', 'uint16', 'uint16_t', '', 'set_uint16'),
19+
('UInt8', 'uint8', 'uint8', 'uint8_t', '', 'set_uint8'),
20+
('Object', 'object', 'pymap', 'object', '<PyObject*>', 'pyset'),
21+
('Int64', 'int64', 'int64', 'int64_t', '', 'set_int64'),
22+
('Int32', 'int32', 'int32', 'int32_t', '', 'set_int32'),
23+
('Int16', 'int16', 'int16', 'int16_t', '', 'set_int16'),
24+
('Int8', 'int8', 'int8', 'int8_t', '', 'set_int8')]
25+
26+
# Primitive set types used by ismember. Declared here (not via cimport) so
27+
# that their companion functions (kh_init_*, kh_put_*, etc.) are also in
28+
# scope. kh_pyset_t (object) is not templated here; it is declared separately below.
29+
# name, c_type
30+
primitive_set_types_for_ismember = [
31+
('set_complex128', 'khcomplex128_t'),
32+
('set_complex64', 'khcomplex64_t'),
33+
('set_float64', 'float64_t'),
34+
('set_float32', 'float32_t'),
35+
('set_uint64', 'uint64_t'),
36+
('set_uint32', 'uint32_t'),
37+
('set_uint16', 'uint16_t'),
38+
('set_uint8', 'uint8_t'),
39+
('set_int64', 'int64_t'),
40+
('set_int32', 'int32_t'),
41+
('set_int16', 'int16_t'),
42+
('set_int8', 'int8_t'),
43+
]
2544

2645
}}
2746

28-
{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
47+
{{for set_name, set_c_type in primitive_set_types_for_ismember}}
48+
cdef extern from "pandas/vendored/klib/khash_python.h":
49+
ctypedef struct kh_{{set_name}}_t:
50+
uint32_t n_buckets, size, n_occupied, upper_bound
51+
uint32_t *flags
52+
{{set_c_type}} *keys
53+
char *vals
54+
55+
kh_{{set_name}}_t* kh_init_{{set_name}}() nogil
56+
void kh_destroy_{{set_name}}(kh_{{set_name}}_t*) nogil
57+
uint32_t kh_get_{{set_name}}(kh_{{set_name}}_t*, {{set_c_type}}) nogil
58+
void kh_resize_{{set_name}}(kh_{{set_name}}_t*, uint32_t) nogil
59+
uint32_t kh_put_{{set_name}}(kh_{{set_name}}_t*, {{set_c_type}}, int*) nogil
60+
61+
{{endfor}}
62+
63+
# kh_pyset_t is the set counterpart to kh_pymap_t (object dtype).
64+
# Redeclared here so kh_init_pyset / kh_put_pyset / kh_get_pyset are in scope.
65+
from cpython.object cimport PyObject
66+
cdef extern from "pandas/vendored/klib/khash_python.h":
67+
ctypedef struct kh_pyset_t:
68+
uint32_t n_buckets, size, n_occupied, upper_bound
69+
uint32_t *flags
70+
PyObject **keys
71+
char *vals
72+
73+
kh_pyset_t* kh_init_pyset()
74+
void kh_destroy_pyset(kh_pyset_t*)
75+
uint32_t kh_get_pyset(kh_pyset_t*, PyObject*)
76+
void kh_resize_pyset(kh_pyset_t*, uint32_t)
77+
uint32_t kh_put_pyset(kh_pyset_t*, PyObject*, int*)
78+
79+
{{for name, dtype, ttype, c_type, to_c_type, set_ttype in dtypes}}
2980

3081

3182
@cython.wraparound(False)
@@ -243,11 +294,11 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
243294
{{c_type}} val
244295
{{endif}}
245296

246-
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
297+
kh_{{set_ttype}}_t *table = kh_init_{{set_ttype}}()
247298

248299
# construct the table
249300
n = len(values)
250-
kh_resize_{{ttype}}(table, n)
301+
kh_resize_{{set_ttype}}(table, n)
251302

252303
{{if dtype == 'object'}}
253304
if True:
@@ -256,7 +307,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
256307
{{endif}}
257308
for i in range(n):
258309
val = {{to_c_type}}(values[i])
259-
kh_put_{{ttype}}(table, val, &ret)
310+
kh_put_{{set_ttype}}(table, val, &ret)
260311

261312
# test membership
262313
n = len(arr)
@@ -269,10 +320,10 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
269320
{{endif}}
270321
for i in range(n):
271322
val = {{to_c_type}}(arr[i])
272-
k = kh_get_{{ttype}}(table, val)
323+
k = kh_get_{{set_ttype}}(table, val)
273324
result[i] = (k != table.n_buckets)
274325

275-
kh_destroy_{{ttype}}(table)
326+
kh_destroy_{{set_ttype}}(table)
276327
return result.view(np.bool_)
277328

278329
# ----------------------------------------------------------------------

pandas/_libs/include/pandas/vendored/klib/khash.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,9 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
642642
#define KHASH_SET_INIT_INT(name) \
643643
KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
644644

645+
#define KHASH_SET_INIT_UINT(name) \
646+
KHASH_INIT(name, khuint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
647+
645648
/*! @function
646649
@abstract Instantiate a hash map containing integer keys
647650
@param name Name of the hash table [symbol]
@@ -687,6 +690,12 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
687690
#define KHASH_MAP_INIT_UINT16(name, khval_t) \
688691
KHASH_INIT(name, khuint16_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
689692

693+
#define KHASH_SET_INIT_INT16(name) \
694+
KHASH_INIT(name, khint16_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
695+
696+
#define KHASH_SET_INIT_UINT16(name) \
697+
KHASH_INIT(name, khuint16_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
698+
690699
/*! @function
691700
@abstract Instantiate a hash map containing 8bit-integer keys
692701
@param name Name of the hash table [symbol]
@@ -698,6 +707,12 @@ static inline khuint_t __ac_Wang_hash(khuint_t key) {
698707
#define KHASH_MAP_INIT_UINT8(name, khval_t) \
699708
KHASH_INIT(name, khuint8_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
700709

710+
#define KHASH_SET_INIT_INT8(name) \
711+
KHASH_INIT(name, khint8_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
712+
713+
#define KHASH_SET_INIT_UINT8(name) \
714+
KHASH_INIT(name, khuint8_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
715+
701716
typedef const char *kh_cstr_t;
702717
/*! @function
703718
@abstract Instantiate a hash map containing const char* keys
@@ -725,6 +740,14 @@ typedef const char *kh_cstr_t;
725740
#define kh_exist_uint16(h, k) (kh_exist(h, k))
726741
#define kh_exist_int8(h, k) (kh_exist(h, k))
727742
#define kh_exist_uint8(h, k) (kh_exist(h, k))
743+
#define kh_exist_set_int64(h, k) (kh_exist(h, k))
744+
#define kh_exist_set_uint64(h, k) (kh_exist(h, k))
745+
#define kh_exist_set_int32(h, k) (kh_exist(h, k))
746+
#define kh_exist_set_uint32(h, k) (kh_exist(h, k))
747+
#define kh_exist_set_int16(h, k) (kh_exist(h, k))
748+
#define kh_exist_set_uint16(h, k) (kh_exist(h, k))
749+
#define kh_exist_set_int8(h, k) (kh_exist(h, k))
750+
#define kh_exist_set_uint8(h, k) (kh_exist(h, k))
728751

729752
KHASH_MAP_INIT_STR(str, size_t)
730753
KHASH_MAP_INIT_INT(int32, size_t)
@@ -735,5 +758,13 @@ KHASH_MAP_INIT_INT16(int16, size_t)
735758
KHASH_MAP_INIT_UINT16(uint16, size_t)
736759
KHASH_MAP_INIT_INT8(int8, size_t)
737760
KHASH_MAP_INIT_UINT8(uint8, size_t)
761+
KHASH_SET_INIT_INT64(set_int64)
762+
KHASH_SET_INIT_UINT64(set_uint64)
763+
KHASH_SET_INIT_INT(set_int32)
764+
KHASH_SET_INIT_UINT(set_uint32)
765+
KHASH_SET_INIT_INT16(set_int16)
766+
KHASH_SET_INIT_UINT16(set_uint16)
767+
KHASH_SET_INIT_INT8(set_int8)
768+
KHASH_SET_INIT_UINT8(set_uint8)
738769

739770
#endif /* __AC_KHASH_H */

pandas/_libs/include/pandas/vendored/klib/khash_python.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,12 +122,24 @@ static inline khuint32_t kh_float32_hash_func(float val) {
122122

123123
KHASH_MAP_INIT_FLOAT64(float64, size_t)
124124

125+
#define KHASH_SET_INIT_FLOAT64(name) \
126+
KHASH_INIT(name, khfloat64_t, char, 0, kh_float64_hash_func, \
127+
kh_floats_hash_equal)
128+
129+
KHASH_SET_INIT_FLOAT64(set_float64)
130+
125131
#define KHASH_MAP_INIT_FLOAT32(name, khval_t) \
126132
KHASH_INIT(name, khfloat32_t, khval_t, 1, kh_float32_hash_func, \
127133
kh_floats_hash_equal)
128134

129135
KHASH_MAP_INIT_FLOAT32(float32, size_t)
130136

137+
#define KHASH_SET_INIT_FLOAT32(name) \
138+
KHASH_INIT(name, khfloat32_t, char, 0, kh_float32_hash_func, \
139+
kh_floats_hash_equal)
140+
141+
KHASH_SET_INIT_FLOAT32(set_float32)
142+
131143
static inline khint32_t kh_complex128_hash_func(khcomplex128_t val) {
132144
return kh_float64_hash_func(val.real) ^ kh_float64_hash_func(val.imag);
133145
}
@@ -144,14 +156,30 @@ static inline khint32_t kh_complex64_hash_func(khcomplex64_t val) {
144156

145157
KHASH_MAP_INIT_COMPLEX64(complex64, size_t)
146158

159+
#define KHASH_SET_INIT_COMPLEX64(name) \
160+
KHASH_INIT(name, khcomplex64_t, char, 0, kh_complex64_hash_func, \
161+
kh_complex_hash_equal)
162+
163+
KHASH_SET_INIT_COMPLEX64(set_complex64)
164+
147165
#define KHASH_MAP_INIT_COMPLEX128(name, khval_t) \
148166
KHASH_INIT(name, khcomplex128_t, khval_t, 1, kh_complex128_hash_func, \
149167
kh_complex_hash_equal)
150168

151169
KHASH_MAP_INIT_COMPLEX128(complex128, size_t)
152170

171+
#define KHASH_SET_INIT_COMPLEX128(name) \
172+
KHASH_INIT(name, khcomplex128_t, char, 0, kh_complex128_hash_func, \
173+
kh_complex_hash_equal)
174+
175+
KHASH_SET_INIT_COMPLEX128(set_complex128)
176+
153177
#define kh_exist_complex64(h, k) (kh_exist(h, k))
154178
#define kh_exist_complex128(h, k) (kh_exist(h, k))
179+
#define kh_exist_set_float64(h, k) (kh_exist(h, k))
180+
#define kh_exist_set_float32(h, k) (kh_exist(h, k))
181+
#define kh_exist_set_complex64(h, k) (kh_exist(h, k))
182+
#define kh_exist_set_complex128(h, k) (kh_exist(h, k))
155183

156184
// NaN-floats should be in the same equivalency class, see GH 22119
157185
static inline int floatobject_cmp(PyFloatObject *a, PyFloatObject *b) {

pandas/_libs/khash_for_primitive_helper.pxi.in

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,21 @@ primitive_types = [('int64', 'int64_t'),
2020
('complex64', 'khcomplex64_t'),
2121
('complex128', 'khcomplex128_t'),
2222
]
23+
24+
# name, c_type — set (membership-only) counterparts to the map types above
25+
primitive_set_types = [('set_int64', 'int64_t'),
26+
('set_uint64', 'uint64_t'),
27+
('set_float64', 'float64_t'),
28+
('set_int32', 'int32_t'),
29+
('set_uint32', 'uint32_t'),
30+
('set_float32', 'float32_t'),
31+
('set_int16', 'int16_t'),
32+
('set_uint16', 'uint16_t'),
33+
('set_int8', 'int8_t'),
34+
('set_uint8', 'uint8_t'),
35+
('set_complex64', 'khcomplex64_t'),
36+
('set_complex128', 'khcomplex128_t'),
37+
]
2338
}}
2439

2540
{{for name, c_type in primitive_types}}
@@ -42,3 +57,24 @@ cdef extern from "pandas/vendored/klib/khash_python.h":
4257
bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil
4358

4459
{{endfor}}
60+
61+
{{for name, c_type in primitive_set_types}}
62+
63+
cdef extern from "pandas/vendored/klib/khash_python.h":
64+
ctypedef struct kh_{{name}}_t:
65+
khuint_t n_buckets, size, n_occupied, upper_bound
66+
uint32_t *flags
67+
{{c_type}} *keys
68+
char *vals
69+
70+
kh_{{name}}_t* kh_init_{{name}}() nogil
71+
void kh_destroy_{{name}}(kh_{{name}}_t*) nogil
72+
void kh_clear_{{name}}(kh_{{name}}_t*) nogil
73+
khuint_t kh_get_{{name}}(kh_{{name}}_t*, {{c_type}}) nogil
74+
void kh_resize_{{name}}(kh_{{name}}_t*, khuint_t) nogil
75+
khuint_t kh_put_{{name}}(kh_{{name}}_t*, {{c_type}}, int*) nogil
76+
void kh_del_{{name}}(kh_{{name}}_t*, khuint_t) nogil
77+
78+
bint kh_exist_{{name}}(kh_{{name}}_t*, khiter_t) nogil
79+
80+
{{endfor}}

0 commit comments

Comments
 (0)