Skip to content

Commit 1f9c46b

Browse files
authored
perf: Avoid copying metadata for each data file in summary (#2674)
<!-- Thanks for opening a pull request! --> <!-- In the case this PR will resolve an issue, please replace ${GITHUB_ISSUE_ID} below with the actual GitHub issue id. --> <!-- Closes #${GITHUB_ISSUE_ID} --> Resolves #2673 # Rationale for this change `_SnapshotProducer._summary()` copies the metadata for _every_ added / deleted DataFile. This is pretty expensive. Instead, we copy it just once at the beginning of the function and use the same value for each DataFile. On my data, which overwrites a few million rows at a time, I saw the time for `table.overwrite` go from ~20 seconds to ~6 seconds. ## Are these changes tested? Yes, existing unit / integration tests ## Are there any user-facing changes? Just faster writes :) <!-- In the case of user-facing changes, please add the changelog label. -->
1 parent d3eb149 commit 1f9c46b

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

pyiceberg/table/update/snapshot.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -240,8 +240,11 @@ def _write_delete_manifest() -> List[ManifestFile]:
240240
def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
241241
from pyiceberg.table import TableProperties
242242

243+
# avoid copying metadata for each data file
244+
table_metadata = self._transaction.table_metadata
245+
243246
partition_summary_limit = int(
244-
self._transaction.table_metadata.properties.get(
247+
table_metadata.properties.get(
245248
TableProperties.WRITE_PARTITION_SUMMARY_LIMIT, TableProperties.WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT
246249
)
247250
)
@@ -250,23 +253,21 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary:
250253
for data_file in self._added_data_files:
251254
ssc.add_file(
252255
data_file=data_file,
253-
partition_spec=self._transaction.table_metadata.spec(),
254-
schema=self._transaction.table_metadata.schema(),
256+
partition_spec=table_metadata.spec(),
257+
schema=table_metadata.schema(),
255258
)
256259

257260
if len(self._deleted_data_files) > 0:
258-
specs = self._transaction.table_metadata.specs()
261+
specs = table_metadata.specs()
259262
for data_file in self._deleted_data_files:
260263
ssc.remove_file(
261264
data_file=data_file,
262265
partition_spec=specs[data_file.spec_id],
263-
schema=self._transaction.table_metadata.schema(),
266+
schema=table_metadata.schema(),
264267
)
265268

266269
previous_snapshot = (
267-
self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id)
268-
if self._parent_snapshot_id is not None
269-
else None
270+
table_metadata.snapshot_by_id(self._parent_snapshot_id) if self._parent_snapshot_id is not None else None
270271
)
271272

272273
return update_snapshot_summaries(

0 commit comments

Comments
 (0)