Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
aab78d6
Add REE file column helpers
gbrgr Nov 4, 2025
ee21cab
Add helper tests
gbrgr Nov 4, 2025
37b52e2
Add constants
gbrgr Nov 4, 2025
44463a0
Add support for _file constant
gbrgr Nov 4, 2025
b5449f6
Merge branch 'main' into feature/gb/file-column
gbrgr Nov 4, 2025
e034009
Update tests
gbrgr Nov 4, 2025
4f0a4f1
Fix clippy warning
gbrgr Nov 4, 2025
51f76d3
Fix doc test
gbrgr Nov 4, 2025
d84e16b
Track in field ids
gbrgr Nov 4, 2025
984dacd
Merge branch 'main' into feature/gb/file-column
gbrgr Nov 4, 2025
bd478cb
Add test
gbrgr Nov 4, 2025
8593db0
Merge branch 'feature/gb/file-column' of github.com:RelationalAI/iceb…
gbrgr Nov 4, 2025
9b186c7
Allow repeated virtual file column selection
gbrgr Nov 4, 2025
30ae5fb
Merge branch 'main' into feature/gb/file-column
gbrgr Nov 5, 2025
adf0da0
Refactor into own transformer step
gbrgr Nov 7, 2025
f4336a8
Merge branch 'feature/gb/file-column' of github.com:RelationalAI/iceb…
gbrgr Nov 7, 2025
ef3a965
Revert "Refactor into own transformer step"
gbrgr Nov 7, 2025
534490b
Avoid special casing in batch creation
gbrgr Nov 7, 2025
04bf463
.
gbrgr Nov 7, 2025
9e88edf
Modify record batch transformer to support reserved fields
gbrgr Nov 12, 2025
060b45d
Add metadata column helper functions
gbrgr Nov 12, 2025
8572dae
Store fields instead of constants
gbrgr Nov 12, 2025
f273add
Add comment
gbrgr Nov 12, 2025
5aa92ae
Adapt comment
gbrgr Nov 12, 2025
c05b886
.
gbrgr Nov 12, 2025
33bb0ad
Adapt error message
gbrgr Nov 12, 2025
42167ff
Consider field_id range
gbrgr Nov 12, 2025
cbc6b17
Merge remote-tracking branch 'upstream/main' into feature/gb/file-column
gbrgr Nov 13, 2025
977c813
Merge remote-tracking branch 'upstream/main' into feature/gb/file-column
gbrgr Nov 13, 2025
83443aa
Use REE encoding in record batch transformer
gbrgr Nov 14, 2025
35aba12
Fix clippy errors
gbrgr Nov 14, 2025
830e462
Format
gbrgr Nov 14, 2025
4eb8a63
Add `with_file_path_column` helper
gbrgr Nov 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 76 additions & 28 deletions crates/iceberg/src/arrow/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,9 @@ use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator;
use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator;
use crate::expr::{BoundPredicate, BoundReference};
use crate::io::{FileIO, FileMetadata, FileRead};
use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field};
use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream};
use crate::spec::{Datum, NameMapping, NestedField, PrimitiveType, Schema, Type};
use crate::spec::{Datum, NameMapping, NestedField, PrimitiveLiteral, PrimitiveType, Schema, Type};
use crate::utils::available_parallelism;
use crate::{Error, ErrorKind};

Expand Down Expand Up @@ -250,12 +251,20 @@ impl ArrowReader {
initial_stream_builder
};

// Filter out metadata fields for Parquet projection (they don't exist in files)
let project_field_ids_without_metadata: Vec<i32> = task
.project_field_ids
.iter()
.filter(|&&id| !is_metadata_field(id))
.copied()
.collect();

// Create projection mask based on field IDs
// - If file has embedded IDs: field-ID-based projection (missing_field_ids=false)
// - If name mapping applied: field-ID-based projection (missing_field_ids=true but IDs now match)
// - If fallback IDs: position-based projection (missing_field_ids=true)
let projection_mask = Self::get_arrow_projection_mask(
&task.project_field_ids,
&project_field_ids_without_metadata,
&task.schema,
record_batch_stream_builder.parquet_schema(),
record_batch_stream_builder.schema(),
Expand All @@ -266,16 +275,20 @@ impl ArrowReader {
record_batch_stream_builder.with_projection(projection_mask.clone());

// RecordBatchTransformer performs any transformations required on the RecordBatches
// that come back from the file, such as type promotion, default column insertion
// and column re-ordering.
// that come back from the file, such as type promotion, default column insertion,
// column re-ordering, partition constants, and virtual field addition (like _file)
let mut record_batch_transformer_builder =
RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids());
RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids())
.with_constant(
RESERVED_FIELD_ID_FILE,
PrimitiveLiteral::String(task.data_file_path.clone()),
)?;

if let (Some(partition_spec), Some(partition_data)) =
(task.partition_spec.clone(), task.partition.clone())
{
record_batch_transformer_builder =
record_batch_transformer_builder.with_partition(partition_spec, partition_data);
record_batch_transformer_builder.with_partition(partition_spec, partition_data)?;
}

let mut record_batch_transformer = record_batch_transformer_builder.build();
Expand Down Expand Up @@ -416,7 +429,10 @@ impl ArrowReader {
record_batch_stream_builder
.build()?
.map(move |batch| match batch {
Ok(batch) => record_batch_transformer.process_record_batch(batch),
Ok(batch) => {
// Process the record batch (type promotion, column reordering, virtual fields, etc.)
record_batch_transformer.process_record_batch(batch)
}
Err(err) => Err(err.into()),
});

Expand Down Expand Up @@ -1737,7 +1753,7 @@ mod tests {
use std::sync::Arc;

use arrow_array::cast::AsArray;
use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray};
use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, RunArray, StringArray};
use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit};
use futures::TryStreamExt;
use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
Expand Down Expand Up @@ -2553,14 +2569,24 @@ message schema {
.as_primitive::<arrow_array::types::Int32Type>();
assert_eq!(col_a.values(), &[1, 2, 3]);

// Column 'b' should be all NULLs (it didn't exist in the old file)
let col_b = batch
.column(1)
.as_primitive::<arrow_array::types::Int32Type>();
assert_eq!(col_b.null_count(), 3);
assert!(col_b.is_null(0));
assert!(col_b.is_null(1));
assert!(col_b.is_null(2));
// Column 'b' should be all NULLs (it didn't exist in the old file, added with REE)
let col_b = batch.column(1);
// For REE array with null, check the values array
if let Some(run_array) = col_b
.as_any()
.downcast_ref::<RunArray<arrow_array::types::Int32Type>>()
{
let values = run_array.values();
assert!(
values.is_null(0),
"REE values should contain null for added column"
);
} else {
// Fallback to direct null check for simple arrays
assert!(col_b.is_null(0));
assert!(col_b.is_null(1));
assert!(col_b.is_null(2));
}
}

/// Test for bug where position deletes in later row groups are not applied correctly.
Expand Down Expand Up @@ -3440,11 +3466,23 @@ message schema {
assert_eq!(age_array.value(0), 30);
assert_eq!(age_array.value(1), 25);

// Verify missing column filled with NULLs
let city_array = batch.column(2).as_string::<i32>();
assert_eq!(city_array.null_count(), 2);
assert!(city_array.is_null(0));
assert!(city_array.is_null(1));
// Verify missing column filled with NULLs (will be REE with null)
let city_col = batch.column(2);
if let Some(run_array) = city_col
.as_any()
.downcast_ref::<RunArray<arrow_array::types::Int32Type>>()
{
let values = run_array.values();
assert!(
values.is_null(0),
"REE values should contain null for added column"
);
} else {
let city_array = city_col.as_string::<i32>();
assert_eq!(city_array.null_count(), 2);
assert!(city_array.is_null(0));
assert!(city_array.is_null(1));
}
}

/// Test reading Parquet files without field IDs that have multiple row groups.
Expand Down Expand Up @@ -3761,13 +3799,23 @@ message schema {
assert_eq!(result_col0.value(0), 1);
assert_eq!(result_col0.value(1), 2);

// New column should be NULL (doesn't exist in old file)
let result_newcol = batch
.column(1)
.as_primitive::<arrow_array::types::Int32Type>();
assert_eq!(result_newcol.null_count(), 2);
assert!(result_newcol.is_null(0));
assert!(result_newcol.is_null(1));
// New column should be NULL (doesn't exist in old file, added with REE)
let newcol = batch.column(1);
if let Some(run_array) = newcol
.as_any()
.downcast_ref::<RunArray<arrow_array::types::Int32Type>>()
{
let values = run_array.values();
assert!(
values.is_null(0),
"REE values should contain null for added column"
);
} else {
let result_newcol = newcol.as_primitive::<arrow_array::types::Int32Type>();
assert_eq!(result_newcol.null_count(), 2);
assert!(result_newcol.is_null(0));
assert!(result_newcol.is_null(1));
}

let result_col1 = batch
.column(2)
Expand Down
Loading
Loading