
Commit 8667459

Fix pivot_table corruption with large datasets in Python 3.14
This commit addresses issue GH#63314, where pivot_table operations on large datasets produce corrupted output with duplicate index values when running on Python 3.14. The root cause appears to be changes in Python 3.14's hashtable implementation or dictionary behavior: the compress_group_index function relied on Int64HashTable.get_labels_groupby(), which produces incorrect results for large datasets on Python 3.14.

The fix uses a numpy-based approach for Python 3.14+ that:

- explicitly sorts the group_index when needed
- uses numpy operations to identify unique values
- maps compressed IDs back to the original order
- preserves the existing hashtable-based path for older Python versions

Added a regression test to ensure pivot_table correctly handles large datasets without producing duplicate indices.
1 parent 499c5d4 commit 8667459
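
To make the numpy-based approach concrete, here is a minimal standalone sketch of the same technique (plain numpy, outside pandas; compress_demo and the sample input are illustrative names, not part of the commit):

import numpy as np

def compress_demo(group_index):
    # Compress an int64 array of group keys into dense ids plus the
    # array of unique observed keys, mirroring the numpy path in the fix.
    if len(group_index) == 0:
        empty = np.array([], dtype=np.int64)
        return empty, empty

    # Stable sort, then remember how to undo it.
    sorted_idx = np.argsort(group_index, kind="stable")
    sorted_gi = group_index[sorted_idx]
    unsort_idx = np.empty_like(sorted_idx)
    unsort_idx[sorted_idx] = np.arange(len(sorted_idx))

    # True at the first occurrence of each unique non-null (> -1) key.
    unique_mask = np.concatenate(
        [sorted_gi[:1] > -1, sorted_gi[1:] != sorted_gi[:-1]]
    )
    comp_ids_sorted = unique_mask.cumsum() - 1
    obs_group_ids = sorted_gi[unique_mask]

    # Map the dense ids back to the original row order.
    return comp_ids_sorted[unsort_idx], obs_group_ids

comp_ids, obs = compress_demo(np.array([30, 10, 30, 20, 10], dtype=np.int64))
print(comp_ids)  # [2 0 2 1 0]
print(obs)       # [10 20 30]

Each input row gets the offset of its key in obs, so duplicate keys share one compressed id instead of spawning duplicate groups.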

File tree: 2 files changed, +67 −5 lines changed


pandas/core/sorting.py

Lines changed: 29 additions & 5 deletions
@@ -680,14 +680,38 @@ def compress_group_index(
     space can be huge, so this function compresses it, by computing offsets
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
-    if len(group_index) and np.all(group_index[1:] >= group_index[:-1]):
+    import sys
+
+    # Use numpy-based approach for Python 3.14+ to avoid hashtable issues
+    if sys.version_info >= (3, 14) or (len(group_index) and np.all(group_index[1:] >= group_index[:-1])):
         # GH 53806: fast path for sorted group_index
+        # GH 63314: also use for Python 3.14+ due to hashtable behavior changes
+        if len(group_index) == 0:
+            return ensure_int64(np.array([], dtype=np.int64)), ensure_int64(np.array([], dtype=np.int64))
+
+        # Sort if needed
+        if not np.all(group_index[1:] >= group_index[:-1]):
+            sorted_idx = np.argsort(group_index, kind='stable')
+            sorted_group_index = group_index[sorted_idx]
+            unsort_idx = np.empty_like(sorted_idx)
+            unsort_idx[sorted_idx] = np.arange(len(sorted_idx))
+        else:
+            sorted_group_index = group_index
+            unsort_idx = None
+
         unique_mask = np.concatenate(
-            [group_index[:1] > -1, group_index[1:] != group_index[:-1]]
+            [sorted_group_index[:1] > -1, sorted_group_index[1:] != sorted_group_index[:-1]]
         )
-        comp_ids = unique_mask.cumsum()
-        comp_ids -= 1
-        obs_group_ids = group_index[unique_mask]
+        comp_ids_sorted = unique_mask.cumsum() - 1
+        obs_group_ids = sorted_group_index[unique_mask]
+
+        if unsort_idx is not None:
+            comp_ids = comp_ids_sorted[unsort_idx]
+        else:
+            comp_ids = comp_ids_sorted
+
+        if sort and not np.all(obs_group_ids[1:] >= obs_group_ids[:-1]):
+            obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
     else:
         size_hint = len(group_index)
         table = hashtable.Int64HashTable(size_hint)
pandas/tests/reshape/test_pivot.py

Lines changed: 38 additions & 0 deletions
@@ -2959,3 +2959,41 @@ def test_pivot_empty_dataframe_period_dtype(self, freq):
         )
 
         tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_large_dataset_no_duplicates(self):
+        # GH 63314: pivot_table with large datasets should not produce duplicate indices
+        # This test ensures that the fix for Python 3.14 hashtable issues works correctly
+        n_indices = 10000
+        metrics = ["apple", "banana", "coconut"]
+
+        data = []
+        for i in range(n_indices):
+            for metric in metrics:
+                data.append({
+                    "idx": f"id_{i}",
+                    "metric": metric,
+                    "value": i * 10 + len(metric)
+                })
+
+        df = DataFrame(data)
+
+        result = df.pivot_table(
+            index=["idx"],
+            columns="metric",
+            values="value",
+            aggfunc="first",
+        )
+
+        # Verify no duplicate indices in the result
+        assert len(result.index) == len(result.index.unique()), \
+            f"Expected {len(result.index.unique())} unique indices, got {len(result.index)}"
+
+        # Verify we have the expected number of rows
+        assert len(result) == n_indices, \
+            f"Expected {n_indices} rows, got {len(result)}"
+
+        # Verify all expected indices are present
+        expected_indices = {f"id_{i}" for i in range(n_indices)}
+        actual_indices = set(result.index)
+        assert expected_indices == actual_indices, \
+            "Result indices don't match expected indices"
