Skip to content

Commit 33e3402

Browse files
committed
BUG: Categorical.map() now sorts categories for unordered categoricals (GH#58153)
Categorical.map() preserved the positional order of categories from the original (pre-mapped) categorical. For unordered categoricals, this meant the mapped values inherited an arbitrary category ordering, causing sort_values(key=...) to ignore custom sort orders. Fix: In Categorical.map(), when the categorical is unordered, sort the mapped categories and remap the codes accordingly. For ordered categoricals, preserve the existing category order (since the ordering is user-defined). When the mapped categories are of mixed types that cannot be compared (e.g. str and float), sorting is skipped and the original category order is preserved. Closes #58153 Generated-by: GitHub Copilot
1 parent 11c5d0e commit 33e3402

File tree

4 files changed

+38
-7
lines changed

4 files changed

+38
-7
lines changed

doc/source/whatsnew/v3.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ Categorical
199199
^^^^^^^^^^^
200200
- Bug in :meth:`Categorical.__repr__` where the values and categories lines could exceed ``display.width`` (:issue:`12066`)
201201
- Bug in :meth:`CategoricalIndex.union` and :meth:`CategoricalIndex.intersection` giving incorrect results when the two indexes have the same unordered categories in different orders (:issue:`55335`)
202+
- Bug in :meth:`Categorical.map` where unordered categoricals preserved the positional category order from the original categories instead of sorting the mapped values, causing :meth:`DataFrame.sort_values` with ``key`` to ignore custom sort orders (:issue:`58153`)
202203
- Bug in :meth:`Index.fillna` raising ``TypeError`` when filling with a tuple value (e.g. on object-dtype or :class:`CategoricalIndex` with tuple categories) (:issue:`37681`)
203204
-
204205

pandas/core/arrays/categorical.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1641,8 +1641,26 @@ def map(
16411641
na_val = mapper(np.nan) if callable(mapper) else mapper.get(np.nan, np.nan)
16421642

16431643
if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan:
1644+
codes = self._codes.copy()
1645+
if not self.ordered:
1646+
# GH#58153: For unordered categoricals, sort the mapped
1647+
# categories so that category order reflects the natural
1648+
# ordering of the new values, not the positional order
1649+
# inherited from the original categories.
1650+
try:
1651+
indexer = new_categories.argsort()
1652+
except TypeError:
1653+
# Mixed types (e.g. str and float) can't be compared;
1654+
# skip sorting and keep original category order.
1655+
pass
1656+
else:
1657+
new_categories = new_categories.take(indexer)
1658+
reverse_indexer = np.empty(len(indexer), dtype=np.intp)
1659+
reverse_indexer[indexer] = np.arange(len(indexer))
1660+
mask = codes >= 0
1661+
codes[mask] = reverse_indexer[codes[mask]]
16441662
new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)
1645-
return self.from_codes(self._codes.copy(), dtype=new_dtype, validate=False)
1663+
return self.from_codes(codes, dtype=new_dtype, validate=False)
16461664

16471665
if has_nans:
16481666
new_categories = new_categories.insert(len(new_categories), na_val)

pandas/tests/arrays/categorical/test_map.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,12 @@ def test_map_str(data, categories, ordered, na_action):
2222
# GH 31202 - override base class since we want to maintain categorical/ordered
2323
cat = Categorical(data, categories=categories, ordered=ordered)
2424
result = cat.map(str, na_action=na_action)
25+
expected_categories = list(map(str, categories))
26+
if not ordered:
27+
# GH#58153: Unordered categoricals sort categories after map
28+
expected_categories = sorted(expected_categories)
2529
expected = Categorical(
26-
map(str, data), categories=map(str, categories), ordered=ordered
30+
map(str, data), categories=expected_categories, ordered=ordered
2731
)
2832
tm.assert_categorical_equal(result, expected)
2933

@@ -36,7 +40,8 @@ def test_map(na_action):
3640

3741
cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
3842
result = cat.map(lambda x: x.lower(), na_action=na_action)
39-
exp = Categorical(list("ababc"), categories=list("bac"), ordered=False)
43+
# GH#58153: Unordered categoricals sort categories after map
44+
exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
4045
tm.assert_categorical_equal(result, exp)
4146

4247
# GH 12766: Return an index not an array
@@ -51,7 +56,8 @@ def f(x):
5156
return {"A": 10, "B": 20, "C": 30}.get(x)
5257

5358
result = cat.map(f, na_action=na_action)
54-
exp = Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False)
59+
# GH#58153: Unordered categoricals sort categories after map
60+
exp = Categorical([10, 20, 10, 20, 30], categories=[10, 20, 30], ordered=False)
5561
tm.assert_categorical_equal(result, exp)
5662

5763
mapper = Series([10, 20, 30], index=["A", "B", "C"])

pandas/tests/indexes/categorical/test_map.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,12 @@ def test_map_str(data, categories, ordered):
2222
# GH 31202 - override base class since we want to maintain categorical/ordered
2323
index = CategoricalIndex(data, categories=categories, ordered=ordered)
2424
result = index.map(str)
25+
expected_categories = list(map(str, categories))
26+
if not ordered:
27+
# GH#58153: Unordered categoricals sort categories after map
28+
expected_categories = sorted(expected_categories)
2529
expected = CategoricalIndex(
26-
map(str, data), categories=map(str, categories), ordered=ordered
30+
map(str, data), categories=expected_categories, ordered=ordered
2731
)
2832
tm.assert_index_equal(result, expected)
2933

@@ -38,8 +42,9 @@ def test_map():
3842
list("ABABC"), categories=list("BAC"), ordered=False, name="XXX"
3943
)
4044
result = ci.map(lambda x: x.lower())
45+
# GH#58153: Unordered categoricals sort categories after map
4146
exp = CategoricalIndex(
42-
list("ababc"), categories=list("bac"), ordered=False, name="XXX"
47+
list("ababc"), categories=list("abc"), ordered=False, name="XXX"
4348
)
4449
tm.assert_index_equal(result, exp)
4550

@@ -55,7 +60,8 @@ def f(x):
5560
return {"A": 10, "B": 20, "C": 30}.get(x)
5661

5762
result = ci.map(f)
58-
exp = CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False)
63+
# GH#58153: Unordered categoricals sort categories after map
64+
exp = CategoricalIndex([10, 20, 10, 20, 30], categories=[10, 20, 30], ordered=False)
5965
tm.assert_index_equal(result, exp)
6066

6167
result = ci.map(Series([10, 20, 30], index=["A", "B", "C"]))

0 commit comments

Comments
 (0)