
Commit 8667459

Fix pivot_table corruption with large datasets in Python 3.14
This commit addresses issue GH#63314, where pivot_table operations on large datasets produce corrupted output with duplicate index values when running on Python 3.14. The root cause appears to be changes in Python 3.14's hashtable implementation or dictionary behavior: the compress_group_index function relied on Int64HashTable.get_labels_groupby(), which produces incorrect results for large datasets on Python 3.14.

The fix uses a numpy-based approach for Python 3.14+ that:

- explicitly sorts the group_index when needed
- uses numpy operations to identify unique values
- maps compressed IDs back to the original order
- preserves the existing hashtable-based path for older Python versions

Added a regression test to ensure pivot_table correctly handles large datasets without producing duplicate indices.
1 parent 499c5d4 commit 8667459
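
To make the numpy-based approach concrete, here is a minimal standalone sketch of the same technique (plain numpy, outside pandas; compress_demo and the sample input are illustrative names, not part of the commit):

import numpy as np

def compress_demo(group_index):
    # Compress an int64 array of group keys into dense ids plus the
    # array of unique observed keys, mirroring the numpy path in the fix.
    if len(group_index) == 0:
        empty = np.array([], dtype=np.int64)
        return empty, empty

    # Stable sort, then remember how to undo it.
    sorted_idx = np.argsort(group_index, kind="stable")
    sorted_gi = group_index[sorted_idx]
    unsort_idx = np.empty_like(sorted_idx)
    unsort_idx[sorted_idx] = np.arange(len(sorted_idx))

    # True at the first occurrence of each unique non-null (> -1) key.
    unique_mask = np.concatenate(
        [sorted_gi[:1] > -1, sorted_gi[1:] != sorted_gi[:-1]]
    )
    comp_ids_sorted = unique_mask.cumsum() - 1
    obs_group_ids = sorted_gi[unique_mask]

    # Map the dense ids back to the original row order.
    return comp_ids_sorted[unsort_idx], obs_group_ids

comp_ids, obs = compress_demo(np.array([30, 10, 30, 20, 10], dtype=np.int64))
print(comp_ids)  # [2 0 2 1 0]
print(obs)       # [10 20 30]

Each input row gets the offset of its key in obs, so duplicate keys share one compressed id instead of spawning duplicate groups.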

File tree: 2 files changed, +67 −5 lines changed


pandas/core/sorting.py

Lines changed: 29 additions & 5 deletions
@@ -680,14 +680,38 @@ def compress_group_index(
     space can be huge, so this function compresses it, by computing offsets
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
-    if len(group_index) and np.all(group_index[1:] >= group_index[:-1]):
+    import sys
+
+    # Use numpy-based approach for Python 3.14+ to avoid hashtable issues
+    if sys.version_info >= (3, 14) or (len(group_index) and np.all(group_index[1:] >= group_index[:-1])):
         # GH 53806: fast path for sorted group_index
+        # GH 63314: also use for Python 3.14+ due to hashtable behavior changes
+        if len(group_index) == 0:
+            return ensure_int64(np.array([], dtype=np.int64)), ensure_int64(np.array([], dtype=np.int64))
+
+        # Sort if needed
+        if not np.all(group_index[1:] >= group_index[:-1]):
+            sorted_idx = np.argsort(group_index, kind='stable')
+            sorted_group_index = group_index[sorted_idx]
+            unsort_idx = np.empty_like(sorted_idx)
+            unsort_idx[sorted_idx] = np.arange(len(sorted_idx))
+        else:
+            sorted_group_index = group_index
+            unsort_idx = None
+
         unique_mask = np.concatenate(
-            [group_index[:1] > -1, group_index[1:] != group_index[:-1]]
+            [sorted_group_index[:1] > -1, sorted_group_index[1:] != sorted_group_index[:-1]]
         )
-        comp_ids = unique_mask.cumsum()
-        comp_ids -= 1
-        obs_group_ids = group_index[unique_mask]
+        comp_ids_sorted = unique_mask.cumsum() - 1
+        obs_group_ids = sorted_group_index[unique_mask]
+
+        if unsort_idx is not None:
+            comp_ids = comp_ids_sorted[unsort_idx]
+        else:
+            comp_ids = comp_ids_sorted
+
+        if sort and not np.all(obs_group_ids[1:] >= obs_group_ids[:-1]):
+            obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
     else:
         size_hint = len(group_index)
         table = hashtable.Int64HashTable(size_hint)
pandas/tests/reshape/test_pivot.py

Lines changed: 38 additions & 0 deletions
@@ -2959,3 +2959,41 @@ def test_pivot_empty_dataframe_period_dtype(self, freq):
         )
 
         tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_large_dataset_no_duplicates(self):
+        # GH 63314: pivot_table with large datasets should not produce duplicate indices
+        # This test ensures that the fix for Python 3.14 hashtable issues works correctly
+        n_indices = 10000
+        metrics = ["apple", "banana", "coconut"]
+
+        data = []
+        for i in range(n_indices):
+            for metric in metrics:
+                data.append({
+                    "idx": f"id_{i}",
+                    "metric": metric,
+                    "value": i * 10 + len(metric)
+                })
+
+        df = DataFrame(data)
+
+        result = df.pivot_table(
+            index=["idx"],
+            columns="metric",
+            values="value",
+            aggfunc="first",
+        )
+
+        # Verify no duplicate indices in the result
+        assert len(result.index) == len(result.index.unique()), \
+            f"Expected {len(result.index.unique())} unique indices, got {len(result.index)}"
+
+        # Verify we have the expected number of rows
+        assert len(result) == n_indices, \
+            f"Expected {n_indices} rows, got {len(result)}"
+
+        # Verify all expected indices are present
+        expected_indices = {f"id_{i}" for i in range(n_indices)}
+        actual_indices = set(result.index)
+        assert expected_indices == actual_indices, \
+            "Result indices don't match expected indices"
