apache · gbrgr · Nov 4, 2025 · Nov 4, 2025 · Nov 4, 2025 · Nov 4, 2025
diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs
@@ -54,8 +54,9 @@ use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator;
 use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator;
 use crate::expr::{BoundPredicate, BoundReference};
 use crate::io::{FileIO, FileMetadata, FileRead};
+use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field};
 use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream};
-use crate::spec::{Datum, NameMapping, NestedField, PrimitiveType, Schema, Type};
+use crate::spec::{Datum, NameMapping, NestedField, PrimitiveLiteral, PrimitiveType, Schema, Type};
 use crate::utils::available_parallelism;
 use crate::{Error, ErrorKind};
 
@@ -250,12 +251,20 @@ impl ArrowReader {
             initial_stream_builder
         };
 
+        // Filter out metadata fields for Parquet projection (they don't exist in files)
+        let project_field_ids_without_metadata: Vec<i32> = task
+            .project_field_ids
+            .iter()
+            .filter(|&&id| !is_metadata_field(id))
+            .copied()
+            .collect();
+
         // Create projection mask based on field IDs
         // - If file has embedded IDs: field-ID-based projection (missing_field_ids=false)
         // - If name mapping applied: field-ID-based projection (missing_field_ids=true but IDs now match)
         // - If fallback IDs: position-based projection (missing_field_ids=true)
         let projection_mask = Self::get_arrow_projection_mask(
-            &task.project_field_ids,
+            &project_field_ids_without_metadata,
             &task.schema,
             record_batch_stream_builder.parquet_schema(),
             record_batch_stream_builder.schema(),
@@ -266,16 +275,20 @@ impl ArrowReader {
             record_batch_stream_builder.with_projection(projection_mask.clone());
 
         // RecordBatchTransformer performs any transformations required on the RecordBatches
-        // that come back from the file, such as type promotion, default column insertion
-        // and column re-ordering.
+        // that come back from the file, such as type promotion, default column insertion,
+        // column re-ordering, partition constants, and virtual field addition (like _file)
         let mut record_batch_transformer_builder =
-            RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids());
+            RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids())
+                .with_constant(
+                    RESERVED_FIELD_ID_FILE,
+                    PrimitiveLiteral::String(task.data_file_path.clone()),
+                )?;
 
         if let (Some(partition_spec), Some(partition_data)) =
             (task.partition_spec.clone(), task.partition.clone())
         {
             record_batch_transformer_builder =
-                record_batch_transformer_builder.with_partition(partition_spec, partition_data);
+                record_batch_transformer_builder.with_partition(partition_spec, partition_data)?;
         }
 
         let mut record_batch_transformer = record_batch_transformer_builder.build();
@@ -416,7 +429,10 @@ impl ArrowReader {
             record_batch_stream_builder
                 .build()?
                 .map(move |batch| match batch {
-                    Ok(batch) => record_batch_transformer.process_record_batch(batch),
+                    Ok(batch) => {
+                        // Process the record batch (type promotion, column reordering, virtual fields, etc.)
+                        record_batch_transformer.process_record_batch(batch)
+                    }
                     Err(err) => Err(err.into()),
                 });
 
@@ -1737,7 +1753,7 @@ mod tests {
     use std::sync::Arc;
 
     use arrow_array::cast::AsArray;
-    use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray};
+    use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, RunArray, StringArray};
     use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit};
     use futures::TryStreamExt;
     use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
@@ -2553,14 +2569,24 @@ message schema {
             .as_primitive::<arrow_array::types::Int32Type>();
         assert_eq!(col_a.values(), &[1, 2, 3]);
 
-        // Column 'b' should be all NULLs (it didn't exist in the old file)
-        let col_b = batch
-            .column(1)
-            .as_primitive::<arrow_array::types::Int32Type>();
-        assert_eq!(col_b.null_count(), 3);
-        assert!(col_b.is_null(0));
-        assert!(col_b.is_null(1));
-        assert!(col_b.is_null(2));
+        // Column 'b' should be all NULLs (it didn't exist in the old file, added with REE)
+        let col_b = batch.column(1);
+        // For REE array with null, check the values array
+        if let Some(run_array) = col_b
+            .as_any()
+            .downcast_ref::<RunArray<arrow_array::types::Int32Type>>()
+        {
+            let values = run_array.values();
+            assert!(
+                values.is_null(0),
+                "REE values should contain null for added column"
+            );
+        } else {
+            // Fallback to direct null check for simple arrays
+            assert!(col_b.is_null(0));
+            assert!(col_b.is_null(1));
+            assert!(col_b.is_null(2));
+        }
     }
 
     /// Test for bug where position deletes in later row groups are not applied correctly.
@@ -3440,11 +3466,23 @@ message schema {
         assert_eq!(age_array.value(0), 30);
         assert_eq!(age_array.value(1), 25);
 
-        // Verify missing column filled with NULLs
-        let city_array = batch.column(2).as_string::<i32>();
-        assert_eq!(city_array.null_count(), 2);
-        assert!(city_array.is_null(0));
-        assert!(city_array.is_null(1));
+        // Verify missing column filled with NULLs (will be REE with null)
+        let city_col = batch.column(2);
+        if let Some(run_array) = city_col
+            .as_any()
+            .downcast_ref::<RunArray<arrow_array::types::Int32Type>>()
+        {
+            let values = run_array.values();
+            assert!(
+                values.is_null(0),
+                "REE values should contain null for added column"
+            );
+        } else {
+            let city_array = city_col.as_string::<i32>();
+            assert_eq!(city_array.null_count(), 2);
+            assert!(city_array.is_null(0));
+            assert!(city_array.is_null(1));
+        }
     }
 
     /// Test reading Parquet files without field IDs that have multiple row groups.
@@ -3761,13 +3799,23 @@ message schema {
         assert_eq!(result_col0.value(0), 1);
         assert_eq!(result_col0.value(1), 2);
 
-        // New column should be NULL (doesn't exist in old file)
-        let result_newcol = batch
-            .column(1)
-            .as_primitive::<arrow_array::types::Int32Type>();
-        assert_eq!(result_newcol.null_count(), 2);
-        assert!(result_newcol.is_null(0));
-        assert!(result_newcol.is_null(1));
+        // New column should be NULL (doesn't exist in old file, added with REE)
+        let newcol = batch.column(1);
+        if let Some(run_array) = newcol
+            .as_any()
+            .downcast_ref::<RunArray<arrow_array::types::Int32Type>>()
+        {
+            let values = run_array.values();
+            assert!(
+                values.is_null(0),
+                "REE values should contain null for added column"
+            );
+        } else {
+            let result_newcol = newcol.as_primitive::<arrow_array::types::Int32Type>();
+            assert_eq!(result_newcol.null_count(), 2);
+            assert!(result_newcol.is_null(0));
+            assert!(result_newcol.is_null(1));
+        }
 
         let result_col1 = batch
             .column(2)