
Commit 153a726

refactor: add fragment_group_size to reduce lance scan task (#5261)
## Changes Made

When a dataset contains many fragments, the current implementation assigns one scan task per fragment, which leads to long planning times. This change adds fragment filtering and fragment grouping so that multiple fragments can be combined into a single scan task, reducing the total number of tasks.

## Checklist

- [ ] Documented in API Docs (if applicable)
- [ ] Documented in User Guide (if applicable)
- [ ] If adding a new documentation page, doc is added to `docs/mkdocs.yml` navigation
- [ ] Documentation builds and is formatted properly (tag @/ccmao1130 for docs review)
1 parent 93a0dcd commit 153a726
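
The planning win is easy to quantify: with one task per fragment, a scan over N fragments plans N tasks; with grouping, it plans ceil(N / fragment_group_size). A minimal sketch of that arithmetic (the fragment counts below are illustrative, not from this PR):

```python
import math

num_fragments = 10_000  # illustrative fragment count

# One task per fragment (the old behavior) -> 10_000 scan tasks.
# With grouping, the planner emits ceil(num_fragments / group_size) tasks.
for group_size in (1, 8, 64):
    num_tasks = math.ceil(num_fragments / group_size)
    print(f"fragment_group_size={group_size:>2} -> {num_tasks} scan tasks")
```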

File tree

3 files changed: +79 −27 lines changed

daft/io/lance/_lance.py

Lines changed: 9 additions & 3 deletions

```diff
@@ -36,6 +36,7 @@ def read_lance(
     index_cache_size: Optional[int] = None,
     default_scan_options: Optional[dict[str, str]] = None,
     metadata_cache_size_bytes: Optional[int] = None,
+    fragment_group_size: Optional[int] = None,
 ) -> DataFrame:
     """Create a DataFrame from a LanceDB table.

@@ -60,7 +61,6 @@ def read_lance(

         Roughly, for an ``IVF_PQ`` partition with ``n`` rows, the size of each index
         page equals the combination of the pq code (``np.array([n,pq], dtype=uint8))``
-        and the row ids (``np.array([n], dtype=uint64)``).
         Approximately, ``n = Total Rows / number of IVF partitions``.
         ``pq = number of PQ sub-vectors``.
     storage_options : optional, dict
@@ -82,11 +82,13 @@ def read_lance(
         Size of the metadata cache in bytes. This cache is used to store metadata
         information about the dataset, such as schema and statistics. If not specified,
         a default size will be used.
+    fragment_group_size : optional, int
+        Number of fragments to group together in a single scan task. If None or <= 1,
+        each fragment will be processed individually (default behavior).

     Returns:
         DataFrame: a DataFrame with the schema converted from the specified LanceDB table

-    Note:
     This function requires the use of [LanceDB](https://lancedb.github.io/lancedb/), which is the Python library for the LanceDB project.
     To ensure that this is installed with Daft, you may install: `pip install daft[lance]`

@@ -104,6 +106,10 @@ def read_lance(
     Read a local LanceDB table and specify a version:
     >>> df = daft.read_lance("s3://my-lancedb-bucket/data/", version=1)
     >>> df.show()
+
+    Read a local LanceDB table with fragment grouping:
+    >>> df = daft.read_lance("s3://my-lancedb-bucket/data/", fragment_group_size=5)
+    >>> df.show()
     """
     try:
         import lance
@@ -126,7 +132,7 @@ def read_lance(
         default_scan_options=default_scan_options,
         metadata_cache_size_bytes=metadata_cache_size_bytes,
     )
-    lance_operator = LanceDBScanOperator(ds)
+    lance_operator = LanceDBScanOperator(ds, fragment_group_size=fragment_group_size)

     handle = ScanOperatorHandle.from_python_scan_operator(lance_operator)
     builder = LogicalPlanBuilder.from_tabular_scan(scan_operator=handle)
```
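
As a usage sketch of the new parameter (the dataset path is illustrative and assumed to point at a Lance dataset with many fragments; `num_partitions()` is used the same way in the test further below):

```python
import daft

# Default: one scan task per non-empty fragment.
df_default = daft.read_lance("data/my_table.lance")

# Grouped: up to 5 fragments are batched into each scan task,
# shrinking the planned task count for fragment-heavy datasets.
df_grouped = daft.read_lance("data/my_table.lance", fragment_group_size=5)

print(df_default.num_partitions(), df_grouped.num_partitions())
```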

daft/io/lance/lance_scan.py

Lines changed: 56 additions & 24 deletions

```diff
@@ -52,9 +52,10 @@ def _lancedb_count_result_function(


 class LanceDBScanOperator(ScanOperator, SupportsPushdownFilters):
-    def __init__(self, ds: "lance.LanceDataset"):
+    def __init__(self, ds: "lance.LanceDataset", fragment_group_size: Optional[int] = None):
         self._ds = ds
         self._pushed_filters: Union[list[PyExpr], None] = None
+        self._fragment_group_size = fragment_group_size

     def name(self) -> str:
         return "LanceDBScanOperator"
@@ -203,30 +204,61 @@ def _create_regular_scan_tasks(
     ) -> Iterator[ScanTask]:
         """Create regular scan tasks without count pushdown."""
         fragments = self._ds.get_fragments()
-        for fragment in fragments:
-            # TODO: figure out how if we can get this metadata from LanceDB fragments cheaply
-            size_bytes = None
-            stats = None
-
-            # NOTE: `fragment.count_rows()` should result in 1 IO call for the data file
-            # (1 fragment = 1 data file) and 1 more IO call for the deletion file (if present).
-            # This could potentially be expensive to perform serially if there are thousands of files.
-            # Given that num_rows isn't leveraged for much at the moment, and without statistics
-            # we will probably end up materializing the data anyways for any operations, we leave this
-            # as None.
-            num_rows = None
-            pushed_expr = self._combine_filters_to_arrow()
+        pushed_expr = self._combine_filters_to_arrow()

-            yield ScanTask.python_factory_func_scan_task(
-                module=_lancedb_table_factory_function.__module__,
-                func_name=_lancedb_table_factory_function.__name__,
-                func_args=(self._ds, [fragment.fragment_id], required_columns, pushed_expr, pushdowns.limit),
-                schema=self.schema()._schema,
-                num_rows=num_rows,
-                size_bytes=size_bytes,
-                pushdowns=pushdowns,
-                stats=stats,
-            )
+        if self._fragment_group_size is None or self._fragment_group_size <= 1:
+            # Default behavior: one fragment per task
+            for fragment in fragments:
+                size_bytes = None
+                stats = None
+                num_rows = None
+                if fragment.count_rows(pushed_expr) == 0:
+                    continue
+
+                yield ScanTask.python_factory_func_scan_task(
+                    module=_lancedb_table_factory_function.__module__,
+                    func_name=_lancedb_table_factory_function.__name__,
+                    func_args=(self._ds, [fragment.fragment_id], required_columns, pushed_expr, pushdowns.limit),
+                    schema=self.schema()._schema,
+                    num_rows=num_rows,
+                    size_bytes=size_bytes,
+                    pushdowns=pushdowns,
+                    stats=stats,
+                )
+        else:
+            # Group fragments
+            fragment_groups = []
+            current_group = []
+
+            for fragment in fragments:
+                if fragment.count_rows(pushed_expr) == 0:
+                    continue
+                current_group.append(fragment)
+                if len(current_group) >= self._fragment_group_size:
+                    fragment_groups.append(current_group)
+                    current_group = []
+
+            # Add the last group if it has any fragments
+            if current_group:
+                fragment_groups.append(current_group)
+
+            # Create scan tasks for each fragment group
+            for fragment_group in fragment_groups:
+                fragment_ids = [fragment.fragment_id for fragment in fragment_group]
+                size_bytes = None
+                stats = None
+                num_rows = None
+
+                yield ScanTask.python_factory_func_scan_task(
+                    module=_lancedb_table_factory_function.__module__,
+                    func_name=_lancedb_table_factory_function.__name__,
+                    func_args=(self._ds, fragment_ids, required_columns, pushed_expr, pushdowns.limit),
+                    schema=self.schema()._schema,
+                    num_rows=num_rows,
+                    size_bytes=size_bytes,
+                    pushdowns=pushdowns,
+                    stats=stats,
+                )

     def _combine_filters_to_arrow(self) -> Optional["pa.compute.Expression"]:
         if self._pushed_filters is not None:
```
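
Setting aside the ScanTask plumbing, the else-branch is a fixed-size chunking of the fragments that survive the `count_rows` filter. A standalone sketch of the same logic (the `group_fragments` helper is hypothetical, not part of this PR):

```python
from typing import Iterator, Optional

def group_fragments(fragment_ids: list[int], group_size: Optional[int]) -> Iterator[list[int]]:
    """Chunk fragment ids into batches of at most `group_size`.

    Mirrors the branch above: a group_size of None or <= 1 falls back to
    one fragment per batch, i.e. the default one-task-per-fragment behavior.
    """
    if group_size is None or group_size <= 1:
        for fid in fragment_ids:
            yield [fid]
        return

    current: list[int] = []
    for fid in fragment_ids:
        current.append(fid)
        if len(current) >= group_size:
            yield current
            current = []
    if current:  # the trailing partial batch becomes its own task
        yield current

# 10 fragments with group_size=3 -> 4 batches: [0,1,2], [3,4,5], [6,7,8], [9]
assert len(list(group_fragments(list(range(10)), 3))) == 4
```

The trailing partial batch is why the test below expects 4 tasks for 10 fragments with `fragment_group_size=3`.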

tests/io/lancedb/test_lancedb_reads.py

Lines changed: 14 additions & 0 deletions

```diff
@@ -142,6 +142,20 @@ def test_lancedb_read_pushdown(lance_dataset_path, capsys):
     ), f"Physical plan contains {filter_count} Filter nodes and {scan_source_count} ScanTaskSource nodes, which is not expected"


+def test_lancedb_read_parallelism_fragment_merging(large_lance_dataset_path):
+    """Test that fragment_group_size reduces the number of scan tasks by merging fragments."""
+    df_no_fragment_group = daft.read_lance(large_lance_dataset_path)
+    assert len(lance.dataset(large_lance_dataset_path).get_fragments()) == df_no_fragment_group.num_partitions()
+
+    df = daft.read_lance(large_lance_dataset_path, fragment_group_size=3)
+    df.explain(show_all=True)
+    assert df.num_partitions() == 4  # 10 fragments, group size 3 -> 4 scan tasks
+
+    result = df.to_pydict()
+    assert len(result["vector"]) == 10000
+    assert len(result["big_int"]) == 10000
+
+
 class TestLanceDBCountPushdown:
     tmp_data = {
         "a": ["a", "b", "c", "d", "e", None],
```
