From 61c3efbc8bf163aef3834e55f30d22c41c2ad6d3 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 5 Sep 2025 15:00:16 -0700 Subject: [PATCH 01/53] Define VariantBuilderExt::append_null --- .../src/variant_array_builder.rs | 24 +++++++++-- parquet-variant-json/src/from_json.rs | 26 +----------- parquet-variant/src/builder.rs | 41 +++++++++++++++++++ 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index d5f578421ed3..aa3e1dbdfcfe 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -199,9 +199,14 @@ pub struct VariantArrayVariantBuilder<'a> { metadata_offsets: &'a mut Vec, value_offsets: &'a mut Vec, nulls: &'a mut NullBufferBuilder, + is_null: bool, } impl VariantBuilderExt for VariantArrayVariantBuilder<'_> { + /// Appending NULL to a variant array produces an actual NULL value + fn append_null(&mut self) { + self.is_null = true; + } fn append_value<'m, 'v>(&mut self, value: impl Into>) { ValueBuilder::append_variant(self.parent_state(), value.into()); } @@ -228,6 +233,7 @@ impl<'a> VariantArrayVariantBuilder<'a> { metadata_offsets: &mut builder.metadata_offsets, value_offsets: &mut builder.value_offsets, nulls: &mut builder.nulls, + is_null: false, } } @@ -239,10 +245,20 @@ impl<'a> VariantArrayVariantBuilder<'a> { pub fn finish(mut self) { // Record the ending offsets after finishing metadata and finish the parent state. 
let (value_builder, metadata_builder) = self.parent_state.value_and_metadata_builders(); - self.metadata_offsets.push(metadata_builder.finish()); - self.value_offsets.push(value_builder.offset()); - self.nulls.append_non_null(); - self.parent_state.finish(); + let (metadata_offset, value_offset, not_null) = if self.is_null { + // Do not `finish`, just repeat the previous offset for a physically empty result + let metadata_offset = self.metadata_offsets.last().copied().unwrap_or(0); + let value_offset = self.value_offsets.last().copied().unwrap_or(0); + (metadata_offset, value_offset, false) + } else { + let metadata_offset = metadata_builder.finish(); + let value_offset = value_builder.offset(); + self.parent_state.finish(); + (metadata_offset, value_offset, true) + }; + self.metadata_offsets.push(metadata_offset); + self.value_offsets.push(value_offset); + self.nulls.append(not_null); } fn parent_state(&mut self) -> ParentState<'_> { diff --git a/parquet-variant-json/src/from_json.rs b/parquet-variant-json/src/from_json.rs index 90b26f7d307b..3a6e869ec1fc 100644 --- a/parquet-variant-json/src/from_json.rs +++ b/parquet-variant-json/src/from_json.rs @@ -18,7 +18,7 @@ //! 
Module for parsing JSON strings as Variant use arrow_schema::ArrowError; -use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilderExt}; +use parquet_variant::{ObjectFieldBuilder, Variant, VariantBuilderExt}; use serde_json::{Number, Value}; /// Converts a JSON string to Variant using a [`VariantBuilderExt`], such as @@ -120,10 +120,7 @@ fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), Value::Object(obj) => { let mut obj_builder = builder.try_new_object()?; for (key, value) in obj.iter() { - let mut field_builder = ObjectFieldBuilder { - key, - builder: &mut obj_builder, - }; + let mut field_builder = ObjectFieldBuilder::new(key, &mut obj_builder); append_json(value, &mut field_builder)?; } obj_builder.finish(); @@ -132,25 +129,6 @@ fn append_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), Ok(()) } -struct ObjectFieldBuilder<'o, 'v, 's> { - key: &'s str, - builder: &'o mut ObjectBuilder<'v>, -} - -impl VariantBuilderExt for ObjectFieldBuilder<'_, '_, '_> { - fn append_value<'m, 'v>(&mut self, value: impl Into>) { - self.builder.insert(self.key, value); - } - - fn try_new_list(&mut self) -> Result, ArrowError> { - self.builder.try_new_list(self.key) - } - - fn try_new_object(&mut self) -> Result, ArrowError> { - self.builder.try_new_object(self.key) - } -} - #[cfg(test)] mod test { use super::*; diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 2fa8d0981c5b..12490b35dbd5 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -1706,6 +1706,10 @@ impl<'a> ObjectBuilder<'a> { /// Allows users to append values to a [`VariantBuilder`], [`ListBuilder`] or /// [`ObjectBuilder`]. using the same interface. pub trait VariantBuilderExt { + /// Appends a NULL value to this builder. The semantics depend on the implementation, but will + /// often translate to appending a [`Variant::Null`] value. 
+ fn append_null(&mut self); + /// Appends a new variant value to this builder. See e.g. [`VariantBuilder::append_value`]. fn append_value<'m, 'v>(&mut self, value: impl Into>); @@ -1731,6 +1735,10 @@ pub trait VariantBuilderExt { } impl VariantBuilderExt for ListBuilder<'_> { + /// Variant arrays cannot encode NULL values, only `Variant::Null`. + fn append_null(&mut self) { + self.append_value(Variant::Null); + } fn append_value<'m, 'v>(&mut self, value: impl Into>) { self.append_value(value); } @@ -1745,6 +1753,11 @@ impl VariantBuilderExt for ListBuilder<'_> { } impl VariantBuilderExt for VariantBuilder { + /// Variant values cannot encode NULL, only [`Variant::Null`]. This is different from the column + /// that holds variant values being NULL at some positions. + fn append_null(&mut self) { + self.append_value(Variant::Null); + } fn append_value<'m, 'v>(&mut self, value: impl Into>) { self.append_value(value); } @@ -1758,6 +1771,34 @@ impl VariantBuilderExt for VariantBuilder { } } +/// A [`VariantBuilderExt`] that inserts a new field into a variant object. +pub struct ObjectFieldBuilder<'o, 'v, 's> { + key: &'s str, + builder: &'o mut ObjectBuilder<'v>, +} + +impl<'o, 'v, 's> ObjectFieldBuilder<'o, 'v, 's> { + pub fn new(key: &'s str, builder: &'o mut ObjectBuilder<'v>) -> Self { + Self { key, builder } + } +} + +impl VariantBuilderExt for ObjectFieldBuilder<'_, '_, '_> { + /// A NULL object field is interpreted as missing, so nothing gets inserted at all. 
+ fn append_null(&mut self) { } + fn append_value<'m, 'v>(&mut self, value: impl Into>) { + self.builder.insert(self.key, value); + } + + fn try_new_list(&mut self) -> Result, ArrowError> { + self.builder.try_new_list(self.key) + } + + fn try_new_object(&mut self) -> Result, ArrowError> { + self.builder.try_new_object(self.key) + } +} + #[cfg(test)] mod tests { use crate::VariantMetadata; From ea1147f2900f9cde471cc6f4cdf0e9ca39cf508e Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 08:01:40 -0700 Subject: [PATCH 02/53] checkpoint - primitive builders --- .../src/cast_to_variant.rs | 213 +++++++++++++++++- 1 file changed, 212 insertions(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 412f207cfe46..b66ee5998c99 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -43,9 +43,151 @@ use arrow::temporal_conversions::{ use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit, UnionFields}; use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; use parquet_variant::{ - Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, + Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; +// ============================================================================ +// Row-oriented builders for efficient Arrow-to-Variant conversion +// ============================================================================ + +/// Row builder for converting Arrow arrays to VariantArray row by row +pub(crate) trait ArrowToVariantRowBuilder { + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()>; + fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<()>; +} + +/// Generic primitive builder for all Arrow primitive types +struct PrimitiveArrowToVariantBuilder<'a, T: ArrowNativeType> { 
+ array: &'a arrow::array::PrimitiveArray, +} + +impl<'a, T: ArrowNativeType> PrimitiveArrowToVariantBuilder<'a, T> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_primitive(), + } + } +} + +impl<'a, T: ArrowNativeType> ArrowToVariantRowBuilder for PrimitiveArrowToVariantBuilder<'a, T> { + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()> { + builder.append_null(); + Ok(()) + } + + fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<()> { + let value = self.array.value(index); + builder.append_value(value); + Ok(()) + } +} + +/// Boolean builder for BooleanArray +struct BooleanArrowToVariantBuilder<'a> { + array: &'a arrow::array::BooleanArray, +} + +impl<'a> BooleanArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_boolean(), + } + } +} + +impl<'a> ArrowToVariantRowBuilder for BooleanArrowToVariantBuilder<'a> { + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()> { + builder.append_null(); + Ok(()) + } + + fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<()> { + let value = self.array.value(index); + builder.append_value(value); + Ok(()) + } +} + +/// String builder for StringArray (both Utf8 and LargeUtf8) +struct StringArrowToVariantBuilder<'a> { + array: &'a dyn Array, +} + +impl<'a> StringArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { array } + } +} + +impl<'a> ArrowToVariantRowBuilder for StringArrowToVariantBuilder<'a> { + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()> { + builder.append_null(); + Ok(()) + } + + fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<()> { + let value = match self.array.data_type() { + DataType::Utf8 => { + let string_array = self.array.as_string::(); + string_array.value(index) + } + DataType::LargeUtf8 => { + let string_array = 
self.array.as_string::(); + string_array.value(index) + } + _ => return Err(ArrowError::CastError("Expected string array".to_string())), + }; + builder.append_value(value); + Ok(()) + } +} + +/// Null builder that always appends null +struct NullArrowToVariantBuilder; + +impl ArrowToVariantRowBuilder for NullArrowToVariantBuilder { + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()> { + builder.append_null(); + Ok(()) + } + + fn append_value(&mut self, _index: usize, builder: &mut impl VariantBuilderExt) -> Result<()> { + builder.append_null(); + Ok(()) + } +} + +/// Factory function to create the appropriate row builder for a given DataType +fn make_arrow_to_variant_row_builder<'a>( + data_type: &'a DataType, + array: &'a dyn Array, +) -> Result, ArrowError> { + match data_type { + // All integer types + DataType::Int8 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int16 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int32 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int64 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt8 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt16 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt32 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt64 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + + // Float types + DataType::Float32 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Float64 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + + // Special types + DataType::Boolean => Ok(Box::new(BooleanArrowToVariantBuilder::new(array))), + DataType::Utf8 => Ok(Box::new(StringArrowToVariantBuilder::new(array))), + DataType::LargeUtf8 => Ok(Box::new(StringArrowToVariantBuilder::new(array))), + DataType::Null => Ok(Box::new(NullArrowToVariantBuilder)), + + 
// TODO: Add other types (Binary, Date, Time, Decimal, etc.) + _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), + } +} + /// Casts a typed arrow [`Array`] to a [`VariantArray`]. This is useful when you /// need to convert a specific data type /// @@ -2366,3 +2508,72 @@ mod tests { } } } + +#[cfg(test)] +mod row_builder_tests { + use super::*; + use arrow::array::{Int32Array, StringArray, BooleanArray}; + + #[test] + fn test_primitive_row_builder() { + // Test Int32Array + let int_array = Int32Array::from(vec![Some(42), None, Some(100)]); + let mut row_builder = make_arrow_to_variant_row_builder(int_array.data_type(), &int_array).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(3); + + // Test first value + row_builder.append_value(0, &mut variant_builder).unwrap(); + assert_eq!(variant_builder.len(), 1); + + // Test null value + row_builder.append_null(&mut variant_builder).unwrap(); + assert_eq!(variant_builder.len(), 2); + + // Test second value + row_builder.append_value(2, &mut variant_builder).unwrap(); + assert_eq!(variant_builder.len(), 3); + + let variant_array = variant_builder.finish(); + assert_eq!(variant_array.len(), 3); + assert_eq!(variant_array.value(0), Variant::Int32(42)); + assert!(variant_array.is_null(1)); + assert_eq!(variant_array.value(2), Variant::Int32(100)); + } + + #[test] + fn test_string_row_builder() { + let string_array = StringArray::from(vec![Some("hello"), None, Some("world")]); + let mut row_builder = make_arrow_to_variant_row_builder(string_array.data_type(), &string_array).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(3); + + row_builder.append_value(0, &mut variant_builder).unwrap(); + row_builder.append_null(&mut variant_builder).unwrap(); + row_builder.append_value(2, &mut variant_builder).unwrap(); + + let variant_array = variant_builder.finish(); + assert_eq!(variant_array.len(), 3); + assert_eq!(variant_array.value(0), 
Variant::String("hello".to_string())); + assert!(variant_array.is_null(1)); + assert_eq!(variant_array.value(2), Variant::String("world".to_string())); + } + + #[test] + fn test_boolean_row_builder() { + let bool_array = BooleanArray::from(vec![Some(true), None, Some(false)]); + let mut row_builder = make_arrow_to_variant_row_builder(bool_array.data_type(), &bool_array).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(3); + + row_builder.append_value(0, &mut variant_builder).unwrap(); + row_builder.append_null(&mut variant_builder).unwrap(); + row_builder.append_value(2, &mut variant_builder).unwrap(); + + let variant_array = variant_builder.finish(); + assert_eq!(variant_array.len(), 3); + assert_eq!(variant_array.value(0), Variant::Boolean(true)); + assert!(variant_array.is_null(1)); + assert_eq!(variant_array.value(2), Variant::Boolean(false)); + } +} From bd565cf6d74aee577c4a2c01673e3b6c45b6c825 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 11:50:29 -0700 Subject: [PATCH 03/53] manual fixup --- .../src/cast_to_variant.rs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index b66ee5998c99..cecdaff2f33b 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -30,7 +30,7 @@ use arrow::array::{ use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::kernels::cast; use arrow::datatypes::{ - i256, ArrowNativeType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, + i256, ArrowNativeType, ArrowPrimitiveType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, 
Time64NanosecondType, UInt16Type, UInt32Type, @@ -52,16 +52,16 @@ use parquet_variant::{ /// Row builder for converting Arrow arrays to VariantArray row by row pub(crate) trait ArrowToVariantRowBuilder { - fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()>; - fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<()>; + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError>; + fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError>; } /// Generic primitive builder for all Arrow primitive types -struct PrimitiveArrowToVariantBuilder<'a, T: ArrowNativeType> { +struct PrimitiveArrowToVariantBuilder<'a, T: ArrowPrimitiveType> { array: &'a arrow::array::PrimitiveArray, } -impl<'a, T: ArrowNativeType> PrimitiveArrowToVariantBuilder<'a, T> { +impl<'a, T: ArrowPrimitiveType> PrimitiveArrowToVariantBuilder<'a, T> { fn new(array: &'a dyn Array) -> Self { Self { array: array.as_primitive(), @@ -69,13 +69,13 @@ impl<'a, T: ArrowNativeType> PrimitiveArrowToVariantBuilder<'a, T> { } } -impl<'a, T: ArrowNativeType> ArrowToVariantRowBuilder for PrimitiveArrowToVariantBuilder<'a, T> { - fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()> { +impl<'a, T: ArrowPrimitiveType> ArrowToVariantRowBuilder for PrimitiveArrowToVariantBuilder<'a, T> { + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { builder.append_null(); Ok(()) } - fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<()> { + fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { let value = self.array.value(index); builder.append_value(value); Ok(()) @@ -96,12 +96,12 @@ impl<'a> BooleanArrowToVariantBuilder<'a> { } impl<'a> ArrowToVariantRowBuilder for BooleanArrowToVariantBuilder<'a> { - fn append_null(&mut self, builder: &mut 
impl VariantBuilderExt) -> Result<()> { + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { builder.append_null(); Ok(()) } - fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<()> { + fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { let value = self.array.value(index); builder.append_value(value); Ok(()) @@ -120,12 +120,12 @@ impl<'a> StringArrowToVariantBuilder<'a> { } impl<'a> ArrowToVariantRowBuilder for StringArrowToVariantBuilder<'a> { - fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()> { + fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { builder.append_null(); Ok(()) } - fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<()> { + fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { let value = match self.array.data_type() { DataType::Utf8 => { let string_array = self.array.as_string::(); From 358e2f8069b744f4210a1a2a9d4be583a5b3ddfc Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 11:57:06 -0700 Subject: [PATCH 04/53] checkpoint - change to enum instead of trait --- .../src/cast_to_variant.rs | 171 ++++++++++-------- 1 file changed, 94 insertions(+), 77 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index cecdaff2f33b..5996f2e1cbeb 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -51,9 +51,40 @@ use parquet_variant::{ // ============================================================================ /// Row builder for converting Arrow arrays to VariantArray row by row -pub(crate) trait ArrowToVariantRowBuilder { - fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError>; - fn 
append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError>; +pub(crate) enum ArrowToVariantRowBuilder<'a> { + PrimitiveInt8(PrimitiveArrowToVariantBuilder<'a, Int8Type>), + PrimitiveInt16(PrimitiveArrowToVariantBuilder<'a, Int16Type>), + PrimitiveInt32(PrimitiveArrowToVariantBuilder<'a, Int32Type>), + PrimitiveInt64(PrimitiveArrowToVariantBuilder<'a, Int64Type>), + PrimitiveUInt8(PrimitiveArrowToVariantBuilder<'a, UInt8Type>), + PrimitiveUInt16(PrimitiveArrowToVariantBuilder<'a, UInt16Type>), + PrimitiveUInt32(PrimitiveArrowToVariantBuilder<'a, UInt32Type>), + PrimitiveUInt64(PrimitiveArrowToVariantBuilder<'a, UInt64Type>), + PrimitiveFloat32(PrimitiveArrowToVariantBuilder<'a, Float32Type>), + PrimitiveFloat64(PrimitiveArrowToVariantBuilder<'a, Float64Type>), + Boolean(BooleanArrowToVariantBuilder<'a>), + String(StringArrowToVariantBuilder<'a>), + Null(NullArrowToVariantBuilder), +} + +impl<'a, B: VariantBuilderExt> ArrowToVariantRowBuilder<'a> { + pub fn append_row(&mut self, index: usize, builder: &mut B) -> Result<(), ArrowError> { + match self { + ArrowToVariantRowBuilder::PrimitiveInt8(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveInt16(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveInt32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveInt64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveUInt8(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveUInt16(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveUInt32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveUInt64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveFloat32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), + 
ArrowToVariantRowBuilder::String(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Null(b) => b.append_row(index, builder), + } + } } /// Generic primitive builder for all Arrow primitive types @@ -67,17 +98,14 @@ impl<'a, T: ArrowPrimitiveType> PrimitiveArrowToVariantBuilder<'a, T> { array: array.as_primitive(), } } -} - -impl<'a, T: ArrowPrimitiveType> ArrowToVariantRowBuilder for PrimitiveArrowToVariantBuilder<'a, T> { - fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - builder.append_null(); - Ok(()) - } - - fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - let value = self.array.value(index); - builder.append_value(value); + + fn append_row(&mut self, index: usize, builder: &mut B) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = self.array.value(index); + builder.append_value(value); + } Ok(()) } } @@ -93,17 +121,14 @@ impl<'a> BooleanArrowToVariantBuilder<'a> { array: array.as_boolean(), } } -} - -impl<'a> ArrowToVariantRowBuilder for BooleanArrowToVariantBuilder<'a> { - fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - builder.append_null(); - Ok(()) - } - - fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - let value = self.array.value(index); - builder.append_value(value); + + fn append_row(&mut self, index: usize, builder: &mut B) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = self.array.value(index); + builder.append_value(value); + } Ok(()) } } @@ -117,27 +142,24 @@ impl<'a> StringArrowToVariantBuilder<'a> { fn new(array: &'a dyn Array) -> Self { Self { array } } -} - -impl<'a> ArrowToVariantRowBuilder for StringArrowToVariantBuilder<'a> { - fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> 
Result<(), ArrowError> { - builder.append_null(); - Ok(()) - } - - fn append_value(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - let value = match self.array.data_type() { - DataType::Utf8 => { - let string_array = self.array.as_string::(); - string_array.value(index) - } - DataType::LargeUtf8 => { - let string_array = self.array.as_string::(); - string_array.value(index) - } - _ => return Err(ArrowError::CastError("Expected string array".to_string())), - }; - builder.append_value(value); + + fn append_row(&mut self, index: usize, builder: &mut B) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = match self.array.data_type() { + DataType::Utf8 => { + let string_array = self.array.as_string::(); + string_array.value(index) + } + DataType::LargeUtf8 => { + let string_array = self.array.as_string::(); + string_array.value(index) + } + _ => return Err(ArrowError::CastError("Expected string array".to_string())), + }; + builder.append_value(value); + } Ok(()) } } @@ -145,13 +167,8 @@ impl<'a> ArrowToVariantRowBuilder for StringArrowToVariantBuilder<'a> { /// Null builder that always appends null struct NullArrowToVariantBuilder; -impl ArrowToVariantRowBuilder for NullArrowToVariantBuilder { - fn append_null(&mut self, builder: &mut impl VariantBuilderExt) -> Result<()> { - builder.append_null(); - Ok(()) - } - - fn append_value(&mut self, _index: usize, builder: &mut impl VariantBuilderExt) -> Result<()> { +impl NullArrowToVariantBuilder { + fn append_row(&mut self, _index: usize, builder: &mut B) -> Result<(), ArrowError> { builder.append_null(); Ok(()) } @@ -161,27 +178,27 @@ impl ArrowToVariantRowBuilder for NullArrowToVariantBuilder { fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, array: &'a dyn Array, -) -> Result, ArrowError> { +) -> Result, ArrowError> { match data_type { // All integer types - DataType::Int8 => 
Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int16 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int32 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int64 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt8 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt16 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt32 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt64 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int32 => Ok(ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::::new(array))), // Float types - DataType::Float32 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Float64 => Ok(Box::new(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::::new(array))), // Special types - DataType::Boolean => 
Ok(Box::new(BooleanArrowToVariantBuilder::new(array))), - DataType::Utf8 => Ok(Box::new(StringArrowToVariantBuilder::new(array))), - DataType::LargeUtf8 => Ok(Box::new(StringArrowToVariantBuilder::new(array))), - DataType::Null => Ok(Box::new(NullArrowToVariantBuilder)), + DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array))), + DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), + DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), + DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder)), // TODO: Add other types (Binary, Date, Time, Decimal, etc.) _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), @@ -2523,15 +2540,15 @@ mod row_builder_tests { let mut variant_builder = VariantArrayBuilder::new(3); // Test first value - row_builder.append_value(0, &mut variant_builder).unwrap(); + row_builder.append_row(0, &mut variant_builder).unwrap(); assert_eq!(variant_builder.len(), 1); // Test null value - row_builder.append_null(&mut variant_builder).unwrap(); + row_builder.append_row(1, &mut variant_builder).unwrap(); assert_eq!(variant_builder.len(), 2); // Test second value - row_builder.append_value(2, &mut variant_builder).unwrap(); + row_builder.append_row(2, &mut variant_builder).unwrap(); assert_eq!(variant_builder.len(), 3); let variant_array = variant_builder.finish(); @@ -2548,9 +2565,9 @@ mod row_builder_tests { let mut variant_builder = VariantArrayBuilder::new(3); - row_builder.append_value(0, &mut variant_builder).unwrap(); - row_builder.append_null(&mut variant_builder).unwrap(); - row_builder.append_value(2, &mut variant_builder).unwrap(); + row_builder.append_row(0, &mut variant_builder).unwrap(); + row_builder.append_row(1, &mut variant_builder).unwrap(); + row_builder.append_row(2, &mut variant_builder).unwrap(); let variant_array = 
variant_builder.finish(); assert_eq!(variant_array.len(), 3); @@ -2566,9 +2583,9 @@ mod row_builder_tests { let mut variant_builder = VariantArrayBuilder::new(3); - row_builder.append_value(0, &mut variant_builder).unwrap(); - row_builder.append_null(&mut variant_builder).unwrap(); - row_builder.append_value(2, &mut variant_builder).unwrap(); + row_builder.append_row(0, &mut variant_builder).unwrap(); + row_builder.append_row(1, &mut variant_builder).unwrap(); + row_builder.append_row(2, &mut variant_builder).unwrap(); let variant_array = variant_builder.finish(); assert_eq!(variant_array.len(), 3); From e6caaaf2211ff667746e4c9404ef688c60bbbe04 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 12:16:05 -0700 Subject: [PATCH 05/53] manual fixup --- .../src/cast_to_variant.rs | 75 ++++++++++++------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 5996f2e1cbeb..ff631f36a5cd 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -67,8 +67,8 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { Null(NullArrowToVariantBuilder), } -impl<'a, B: VariantBuilderExt> ArrowToVariantRowBuilder<'a> { - pub fn append_row(&mut self, index: usize, builder: &mut B) -> Result<(), ArrowError> { +impl<'a> ArrowToVariantRowBuilder<'a> { + pub fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { match self { ArrowToVariantRowBuilder::PrimitiveInt8(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::PrimitiveInt16(b) => b.append_row(index, builder), @@ -88,18 +88,26 @@ impl<'a, B: VariantBuilderExt> ArrowToVariantRowBuilder<'a> { } /// Generic primitive builder for all Arrow primitive types -struct PrimitiveArrowToVariantBuilder<'a, T: ArrowPrimitiveType> { +pub(crate) struct PrimitiveArrowToVariantBuilder<'a, T> +where + T : 
ArrowPrimitiveType, + T::Native: Into>, +{ array: &'a arrow::array::PrimitiveArray, } -impl<'a, T: ArrowPrimitiveType> PrimitiveArrowToVariantBuilder<'a, T> { +impl<'a, T> PrimitiveArrowToVariantBuilder<'a, T> +where + T : ArrowPrimitiveType, + T::Native: Into>, +{ fn new(array: &'a dyn Array) -> Self { Self { array: array.as_primitive(), } } - fn append_row(&mut self, index: usize, builder: &mut B) -> Result<(), ArrowError> { + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { if self.array.is_null(index) { builder.append_null(); } else { @@ -111,7 +119,7 @@ impl<'a, T: ArrowPrimitiveType> PrimitiveArrowToVariantBuilder<'a, T> { } /// Boolean builder for BooleanArray -struct BooleanArrowToVariantBuilder<'a> { +pub(crate) struct BooleanArrowToVariantBuilder<'a> { array: &'a arrow::array::BooleanArray, } @@ -122,7 +130,7 @@ impl<'a> BooleanArrowToVariantBuilder<'a> { } } - fn append_row(&mut self, index: usize, builder: &mut B) -> Result<(), ArrowError> { + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { if self.array.is_null(index) { builder.append_null(); } else { @@ -134,7 +142,7 @@ impl<'a> BooleanArrowToVariantBuilder<'a> { } /// String builder for StringArray (both Utf8 and LargeUtf8) -struct StringArrowToVariantBuilder<'a> { +pub(crate) struct StringArrowToVariantBuilder<'a> { array: &'a dyn Array, } @@ -143,7 +151,7 @@ impl<'a> StringArrowToVariantBuilder<'a> { Self { array } } - fn append_row(&mut self, index: usize, builder: &mut B) -> Result<(), ArrowError> { + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { if self.array.is_null(index) { builder.append_null(); } else { @@ -165,10 +173,10 @@ impl<'a> StringArrowToVariantBuilder<'a> { } /// Null builder that always appends null -struct NullArrowToVariantBuilder; +pub(crate) struct NullArrowToVariantBuilder; impl NullArrowToVariantBuilder { 
- fn append_row(&mut self, _index: usize, builder: &mut B) -> Result<(), ArrowError> { + fn append_row(&mut self, _index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { builder.append_null(); Ok(()) } @@ -2537,21 +2545,24 @@ mod row_builder_tests { let int_array = Int32Array::from(vec![Some(42), None, Some(100)]); let mut row_builder = make_arrow_to_variant_row_builder(int_array.data_type(), &int_array).unwrap(); - let mut variant_builder = VariantArrayBuilder::new(3); + let mut array_builder = VariantArrayBuilder::new(3); // Test first value + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(0, &mut variant_builder).unwrap(); - assert_eq!(variant_builder.len(), 1); + variant_builder.finish(); // Test null value + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(1, &mut variant_builder).unwrap(); - assert_eq!(variant_builder.len(), 2); + variant_builder.finish(); // Test second value + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(2, &mut variant_builder).unwrap(); - assert_eq!(variant_builder.len(), 3); + variant_builder.finish(); - let variant_array = variant_builder.finish(); + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); assert_eq!(variant_array.value(0), Variant::Int32(42)); assert!(variant_array.is_null(1)); @@ -2563,17 +2574,23 @@ mod row_builder_tests { let string_array = StringArray::from(vec![Some("hello"), None, Some("world")]); let mut row_builder = make_arrow_to_variant_row_builder(string_array.data_type(), &string_array).unwrap(); - let mut variant_builder = VariantArrayBuilder::new(3); + let mut array_builder = VariantArrayBuilder::new(3); + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(0, &mut variant_builder).unwrap(); + variant_builder.finish(); + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(1, &mut 
variant_builder).unwrap(); + variant_builder.finish(); + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(2, &mut variant_builder).unwrap(); - - let variant_array = variant_builder.finish(); + variant_builder.finish(); + + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - assert_eq!(variant_array.value(0), Variant::String("hello".to_string())); + assert_eq!(variant_array.value(0), Variant::from("hello")); assert!(variant_array.is_null(1)); - assert_eq!(variant_array.value(2), Variant::String("world".to_string())); + assert_eq!(variant_array.value(2), Variant::from("world")); } #[test] @@ -2581,16 +2598,22 @@ mod row_builder_tests { let bool_array = BooleanArray::from(vec![Some(true), None, Some(false)]); let mut row_builder = make_arrow_to_variant_row_builder(bool_array.data_type(), &bool_array).unwrap(); - let mut variant_builder = VariantArrayBuilder::new(3); + let mut array_builder = VariantArrayBuilder::new(3); + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(0, &mut variant_builder).unwrap(); + variant_builder.finish(); + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(1, &mut variant_builder).unwrap(); + variant_builder.finish(); + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(2, &mut variant_builder).unwrap(); - - let variant_array = variant_builder.finish(); + variant_builder.finish(); + + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - assert_eq!(variant_array.value(0), Variant::Boolean(true)); + assert_eq!(variant_array.value(0), Variant::from(true)); assert!(variant_array.is_null(1)); - assert_eq!(variant_array.value(2), Variant::Boolean(false)); + assert_eq!(variant_array.value(2), Variant::from(false)); } } From 470fffb68ffafea62db498eb75eb3af76574ccc8 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 12:48:33 -0700 Subject: [PATCH 06/53] 
manual fixup --- .../src/cast_to_variant.rs | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index ff631f36a5cd..9e971c89de29 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -64,6 +64,7 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { PrimitiveFloat64(PrimitiveArrowToVariantBuilder<'a, Float64Type>), Boolean(BooleanArrowToVariantBuilder<'a>), String(StringArrowToVariantBuilder<'a>), + Struct(StructArrowToVariantBuilder<'a>), Null(NullArrowToVariantBuilder), } @@ -82,6 +83,7 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::String(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Struct(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Null(b) => b.append_row(index, builder), } } @@ -172,6 +174,52 @@ impl<'a> StringArrowToVariantBuilder<'a> { } } +/// Struct builder for StructArray +pub(crate) struct StructArrowToVariantBuilder<'a> { + struct_array: &'a arrow::array::StructArray, + field_builders: Vec<(&'a str, ArrowToVariantRowBuilder<'a>)>, +} + +impl<'a> StructArrowToVariantBuilder<'a> { + fn new(struct_array: &'a arrow::array::StructArray) -> Result { + let mut field_builders = Vec::new(); + + // Create a row builder for each field + for (field_name, field_array) in struct_array.column_names().iter() + .zip(struct_array.columns().iter()) + { + let field_builder = make_arrow_to_variant_row_builder( + field_array.data_type(), + field_array.as_ref(), + )?; + field_builders.push((*field_name, field_builder)); + } + + Ok(Self { + struct_array, + field_builders, + }) + } + + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if 
self.struct_array.is_null(index) { + builder.append_null(); + } else { + // Create object builder for this struct row + let mut obj_builder = builder.try_new_object()?; + + // Process each field + for (field_name, row_builder) in &mut self.field_builders { + let mut field_builder = parquet_variant::ObjectFieldBuilder::new(field_name, &mut obj_builder); + row_builder.append_row(index, &mut field_builder)?; + } + + obj_builder.finish(); + } + Ok(()) + } +} + /// Null builder that always appends null pub(crate) struct NullArrowToVariantBuilder; @@ -206,6 +254,7 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array))), DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), + DataType::Struct(_) => Ok(ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?)), DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder)), // TODO: Add other types (Binary, Date, Time, Decimal, etc.) 
@@ -2616,4 +2665,63 @@ mod row_builder_tests { assert!(variant_array.is_null(1)); assert_eq!(variant_array.value(2), Variant::from(false)); } + + #[test] + fn test_struct_row_builder() { + use arrow::array::{StructArray, Int32Array, StringArray, ArrayRef}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + // Create a struct array with int and string fields + let int_field = Field::new("id", DataType::Int32, true); + let string_field = Field::new("name", DataType::Utf8, true); + let struct_field = Field::new("person", DataType::Struct(vec![int_field.clone(), string_field.clone()].into()), false); + + let int_array = Int32Array::from(vec![Some(1), None, Some(3)]); + let string_array = StringArray::from(vec![Some("Alice"), Some("Bob"), None]); + + let struct_array = StructArray::try_new( + vec![int_field, string_field].into(), + vec![Arc::new(int_array) as ArrayRef, Arc::new(string_array) as ArrayRef], + None, + ) + .unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder(struct_array.data_type(), &struct_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + // Test first row + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(0, &mut variant_builder).unwrap(); + variant_builder.finish(); + + // Test second row (with null int field) + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(1, &mut variant_builder).unwrap(); + variant_builder.finish(); + + // Test third row (with null string field) + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(2, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Check first row - should have both fields + let first_variant = variant_array.value(0); + assert_eq!(first_variant.get_object_field("id"), Some(Variant::from(1))); + assert_eq!(first_variant.get_object_field("name"), 
Some(Variant::from("Alice"))); + + // Check second row - should have name field but not id (null field omitted) + let second_variant = variant_array.value(1); + assert_eq!(second_variant.get_object_field("id"), None); // null field omitted + assert_eq!(second_variant.get_object_field("name"), Some(Variant::from("Bob"))); + + // Check third row - should have id field but not name (null field omitted) + let third_variant = variant_array.value(2); + assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(3))); + assert_eq!(third_variant.get_object_field("name"), None); // null field omitted + } } From f129824db0da7734368328eb914b96496eb026d6 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 14:56:50 -0700 Subject: [PATCH 07/53] checkpoint - run end --- .../src/cast_to_variant.rs | 196 ++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 9e971c89de29..c771ad85350f 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -66,6 +66,9 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { String(StringArrowToVariantBuilder<'a>), Struct(StructArrowToVariantBuilder<'a>), Null(NullArrowToVariantBuilder), + RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder<'a, Int16Type>), + RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder<'a, Int32Type>), + RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder<'a, Int64Type>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -85,6 +88,9 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::String(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Struct(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Null(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::RunEndEncodedInt16(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::RunEndEncodedInt32(b) => b.append_row(index, builder), 
+ ArrowToVariantRowBuilder::RunEndEncodedInt64(b) => b.append_row(index, builder), } } } @@ -230,6 +236,94 @@ impl NullArrowToVariantBuilder { } } +/// Run-end encoded array builder with efficient sequential access +pub(crate) struct RunEndEncodedArrowToVariantBuilder<'a, R: RunEndIndexType> { + run_array: &'a arrow::array::RunEndEncodedArray, + values_builder: ArrowToVariantRowBuilder<'a>, + + run_ends: &'a [R::Native], + run_number: usize, // Physical index into run_ends and values + run_start: usize, // Logical start index of current run +} + +impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { + fn new(array: &'a dyn Array) -> Result { + let run_array = array.as_run_end_encoded().ok_or_else(|| { + ArrowError::CastError("Expected RunEndEncodedArray".to_string()) + })?; + + let run_ends = run_array.run_ends().values(); + let values_builder = make_arrow_to_variant_row_builder( + run_array.values().data_type(), + run_array.values().as_ref(), + )?; + + Ok(Self { + run_array, + values_builder, + run_ends, + run_number: 0, + run_start: 0, // First run starts at logical index 0 + }) + } + + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + // Three cases for run tracking + if self.run_number < self.run_ends.len() && + self.run_start <= index && + index < self.run_ends[self.run_number].as_usize() { + // Case 1: Still in same run - O(1) + // No need to update run_number or run_start + } else if self.run_number < self.run_ends.len() && + index == self.run_ends[self.run_number].as_usize() { + // Case 2: Advanced to next run - O(1) + self.advance_to_next_run(); + } else { + // Case 3: Binary search for any other case - O(log n) + self.find_run_containing(index)?; + } + + // Verify we have a valid run + if self.run_number >= self.run_ends.len() { + return Err(ArrowError::CastError(format!("Index {} beyond run array", index))); + } + + // Handle null values + if 
self.run_array.values().is_null(self.run_number) { + builder.append_null(); + return Ok(()); + } + + // Re-encode the value + self.values_builder.append_row(self.run_number, builder)?; + + Ok(()) + } + + fn advance_to_next_run(&mut self) { + self.run_start = self.run_ends[self.run_number].as_usize(); + self.run_number += 1; + } + + fn find_run_containing(&mut self, index: usize) -> Result<(), ArrowError> { + // Use partition_point for all non-sequential cases + self.run_number = self.run_ends.partition_point(|&run_end| run_end.as_usize() <= index); + + if self.run_number >= self.run_ends.len() { + return Err(ArrowError::CastError(format!("Index {} beyond run array", index))); + } + + // Set run_start + self.run_start = if self.run_number == 0 { + 0 + } else { + self.run_ends[self.run_number - 1].as_usize() + }; + + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -257,6 +351,16 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Struct(_) => Ok(ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?)), DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder)), + // Run-end encoded types + DataType::RunEndEncoded(run_ends, _) => { + match run_ends.data_type() { + DataType::Int16 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder::new(array)?)), + DataType::Int32 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder::new(array)?)), + DataType::Int64 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder::new(array)?)), + _ => Err(ArrowError::CastError(format!("Unsupported run-end type: {run_ends:?}"))), + } + } + // TODO: Add other types (Binary, Date, Time, Decimal, etc.) 
_ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } @@ -2724,4 +2828,96 @@ mod row_builder_tests { assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(3))); assert_eq!(third_variant.get_object_field("name"), None); // null field omitted } + + #[test] + fn test_run_end_encoded_row_builder() { + use arrow::array::{RunEndEncodedArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a run-end encoded array: [A, A, B, B, B, C] + // run_ends: [2, 5, 6] + // values: ["A", "B", "C"] + let values = StringArray::from(vec!["A", "B", "C"]); + let run_ends = Int32Array::from(vec![2, 5, 6]); + let run_array = RunEndEncodedArray::::try_new(&run_ends, &values).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(6); + + // Test sequential access (most common case) + for i in 0..6 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 6); + + // Verify the values + assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 + assert_eq!(variant_array.value(1), Variant::from("A")); // Run 0 + assert_eq!(variant_array.value(2), Variant::from("B")); // Run 1 + assert_eq!(variant_array.value(3), Variant::from("B")); // Run 1 + assert_eq!(variant_array.value(4), Variant::from("B")); // Run 1 + assert_eq!(variant_array.value(5), Variant::from("C")); // Run 2 + } + + #[test] + fn test_run_end_encoded_random_access() { + use arrow::array::{RunEndEncodedArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a run-end encoded array: [A, A, B, B, B, C] + let values = StringArray::from(vec!["A", "B", "C"]); + let run_ends = Int32Array::from(vec![2, 5, 6]); + let run_array = RunEndEncodedArray::::try_new(&run_ends, 
&values).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + + // Test random access pattern (backward jumps, forward jumps) + let access_pattern = [0, 5, 2, 4, 1, 3]; // Mix of all cases + let expected_values = ["A", "C", "B", "B", "A", "B"]; + + for (i, &index) in access_pattern.iter().enumerate() { + let mut array_builder = VariantArrayBuilder::new(1); + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(index, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.value(0), Variant::from(expected_values[i])); + } + } + + #[test] + fn test_run_end_encoded_with_nulls() { + use arrow::array::{RunEndEncodedArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a run-end encoded array with null values: [A, A, null, null, B] + let values = StringArray::from(vec![Some("A"), None, Some("B")]); + let run_ends = Int32Array::from(vec![2, 4, 5]); + let run_array = RunEndEncodedArray::::try_new(&run_ends, &values).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(5); + + // Test sequential access + for i in 0..5 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 5); + + // Verify the values + assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 + assert_eq!(variant_array.value(1), Variant::from("A")); // Run 0 + assert!(variant_array.is_null(2)); // Run 1 (null) + assert!(variant_array.is_null(3)); // Run 1 (null) + assert_eq!(variant_array.value(4), Variant::from("B")); // Run 2 + } } From 74d7da1d263168b621282da97a4b634660055f9d Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: 
Sat, 6 Sep 2025 15:04:34 -0700 Subject: [PATCH 08/53] manual cleanup --- .../src/cast_to_variant.rs | 46 +++++++++---------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index c771ad85350f..7f3f3571cac4 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -238,8 +238,8 @@ impl NullArrowToVariantBuilder { /// Run-end encoded array builder with efficient sequential access pub(crate) struct RunEndEncodedArrowToVariantBuilder<'a, R: RunEndIndexType> { - run_array: &'a arrow::array::RunEndEncodedArray, - values_builder: ArrowToVariantRowBuilder<'a>, + run_array: &'a arrow::array::RunArray, + values_builder: Box>, run_ends: &'a [R::Native], run_number: usize, // Physical index into run_ends and values @@ -248,22 +248,22 @@ pub(crate) struct RunEndEncodedArrowToVariantBuilder<'a, R: RunEndIndexType> { impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { fn new(array: &'a dyn Array) -> Result { - let run_array = array.as_run_end_encoded().ok_or_else(|| { - ArrowError::CastError("Expected RunEndEncodedArray".to_string()) - })?; + let Some(run_array) = array.as_run_opt() else { + return Err(ArrowError::CastError("Expected RunArray".to_string())); + }; - let run_ends = run_array.run_ends().values(); + let values_array = run_array.values(); let values_builder = make_arrow_to_variant_row_builder( - run_array.values().data_type(), - run_array.values().as_ref(), + values_array.data_type(), + values_array.as_ref(), )?; Ok(Self { run_array, - values_builder, - run_ends, + values_builder: Box::new(values_builder), + run_ends: run_array.run_ends().values(), run_number: 0, - run_start: 0, // First run starts at logical index 0 + run_start: 0, }) } @@ -275,9 +275,11 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { // Case 1: Still in same run - O(1) // No need to 
update run_number or run_start } else if self.run_number < self.run_ends.len() && - index == self.run_ends[self.run_number].as_usize() { + index == self.run_ends[self.run_number].as_usize() + { // Case 2: Advanced to next run - O(1) - self.advance_to_next_run(); + self.run_start = self.run_ends[self.run_number].as_usize(); + self.run_number += 1; } else { // Case 3: Binary search for any other case - O(log n) self.find_run_containing(index)?; @@ -300,11 +302,6 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { Ok(()) } - fn advance_to_next_run(&mut self) { - self.run_start = self.run_ends[self.run_number].as_usize(); - self.run_number += 1; - } - fn find_run_containing(&mut self, index: usize) -> Result<(), ArrowError> { // Use partition_point for all non-sequential cases self.run_number = self.run_ends.partition_point(|&run_end| run_end.as_usize() <= index); @@ -2779,7 +2776,6 @@ mod row_builder_tests { // Create a struct array with int and string fields let int_field = Field::new("id", DataType::Int32, true); let string_field = Field::new("name", DataType::Utf8, true); - let struct_field = Field::new("person", DataType::Struct(vec![int_field.clone(), string_field.clone()].into()), false); let int_array = Int32Array::from(vec![Some(1), None, Some(3)]); let string_array = StringArray::from(vec![Some("Alice"), Some("Bob"), None]); @@ -2831,7 +2827,7 @@ mod row_builder_tests { #[test] fn test_run_end_encoded_row_builder() { - use arrow::array::{RunEndEncodedArray, Int32Array}; + use arrow::array::{RunArray, Int32Array}; use arrow::datatypes::Int32Type; // Create a run-end encoded array: [A, A, B, B, B, C] @@ -2839,7 +2835,7 @@ mod row_builder_tests { // values: ["A", "B", "C"] let values = StringArray::from(vec!["A", "B", "C"]); let run_ends = Int32Array::from(vec![2, 5, 6]); - let run_array = RunEndEncodedArray::::try_new(&run_ends, &values).unwrap(); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); let mut row_builder = 
make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(6); @@ -2865,13 +2861,13 @@ mod row_builder_tests { #[test] fn test_run_end_encoded_random_access() { - use arrow::array::{RunEndEncodedArray, Int32Array}; + use arrow::array::{RunArray, Int32Array}; use arrow::datatypes::Int32Type; // Create a run-end encoded array: [A, A, B, B, B, C] let values = StringArray::from(vec!["A", "B", "C"]); let run_ends = Int32Array::from(vec![2, 5, 6]); - let run_array = RunEndEncodedArray::::try_new(&run_ends, &values).unwrap(); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); @@ -2892,13 +2888,13 @@ mod row_builder_tests { #[test] fn test_run_end_encoded_with_nulls() { - use arrow::array::{RunEndEncodedArray, Int32Array}; + use arrow::array::{RunArray, Int32Array}; use arrow::datatypes::Int32Type; // Create a run-end encoded array with null values: [A, A, null, null, B] let values = StringArray::from(vec![Some("A"), None, Some("B")]); let run_ends = Int32Array::from(vec![2, 4, 5]); - let run_array = RunEndEncodedArray::::try_new(&run_ends, &values).unwrap(); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(5); From 9f361fd0bee22c098702753f75b9fd746cde9bdd Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 15:57:36 -0700 Subject: [PATCH 09/53] simplify run end management --- .../src/cast_to_variant.rs | 54 ++++++++----------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 7f3f3571cac4..47c1b1d61ab4 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ 
b/parquet-variant-compute/src/cast_to_variant.rs @@ -268,27 +268,7 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { } fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - // Three cases for run tracking - if self.run_number < self.run_ends.len() && - self.run_start <= index && - index < self.run_ends[self.run_number].as_usize() { - // Case 1: Still in same run - O(1) - // No need to update run_number or run_start - } else if self.run_number < self.run_ends.len() && - index == self.run_ends[self.run_number].as_usize() - { - // Case 2: Advanced to next run - O(1) - self.run_start = self.run_ends[self.run_number].as_usize(); - self.run_number += 1; - } else { - // Case 3: Binary search for any other case - O(log n) - self.find_run_containing(index)?; - } - - // Verify we have a valid run - if self.run_number >= self.run_ends.len() { - return Err(ArrowError::CastError(format!("Index {} beyond run array", index))); - } + self.set_run_for_index(index)?; // Handle null values if self.run_array.values().is_null(self.run_number) { @@ -302,21 +282,31 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { Ok(()) } - fn find_run_containing(&mut self, index: usize) -> Result<(), ArrowError> { + fn set_run_for_index(&mut self, index: usize) -> Result<(), ArrowError> { + if index >= self.run_start { + let Some(run_end) = self.run_ends.get(self.run_number) else { + return Err(ArrowError::CastError(format!("Index {} beyond run array", index))); + }; + if index < run_end.as_usize() { + return Ok(()); + } + if index == run_end.as_usize() { + self.run_number += 1; + self.run_start = run_end.as_usize(); + return Ok(()); + } + } + // Use partition_point for all non-sequential cases - self.run_number = self.run_ends.partition_point(|&run_end| run_end.as_usize() <= index); - - if self.run_number >= self.run_ends.len() { + let run_number = self.run_ends.partition_point(|&run_end| 
run_end.as_usize() <= index); + if run_number >= self.run_ends.len() { return Err(ArrowError::CastError(format!("Index {} beyond run array", index))); } - - // Set run_start - self.run_start = if self.run_number == 0 { - 0 - } else { - self.run_ends[self.run_number - 1].as_usize() + self.run_number = run_number; + self.run_start = match run_number { + 0 => 0, + _ => self.run_ends[run_number - 1].as_usize(), }; - Ok(()) } } From 471a8aba3c7b8e31acd7d211b7ac9ed00eb23e1a Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 16:37:20 -0700 Subject: [PATCH 10/53] checkpoint - dictionary array --- .../src/cast_to_variant.rs | 184 +++++++++++++++++- 1 file changed, 181 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 47c1b1d61ab4..41923eed20da 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -24,13 +24,13 @@ use crate::type_conversion::{ }; use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ - Array, AsArray, OffsetSizeTrait, TimestampMicrosecondArray, TimestampMillisecondArray, + AnyDictionaryArray, Array, AsArray, OffsetSizeTrait, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, }; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::kernels::cast; use arrow::datatypes::{ - i256, ArrowNativeType, ArrowPrimitiveType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, + i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowPrimitiveType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, @@ -69,6 +69,7 @@ pub(crate) 
enum ArrowToVariantRowBuilder<'a> { RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder<'a, Int16Type>), RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder<'a, Int32Type>), RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder<'a, Int64Type>), + Dictionary(DictionaryArrowToVariantBuilder<'a>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -91,6 +92,7 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::RunEndEncodedInt16(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::RunEndEncodedInt32(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::RunEndEncodedInt64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Dictionary(b) => b.append_row(index, builder), } } } @@ -311,6 +313,44 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { } } +/// Dictionary array builder with simple O(1) indexing +pub(crate) struct DictionaryArrowToVariantBuilder<'a> { + dict_array: &'a dyn arrow::array::AnyDictionaryArray, + normalized_keys: Vec, + values_builder: Box>, +} + +impl<'a> DictionaryArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Result { + let dict_array = array.as_any_dictionary(); + let normalized_keys = dict_array.normalized_keys().to_vec(); + + let values_builder = make_arrow_to_variant_row_builder( + dict_array.values().data_type(), + dict_array.values().as_ref(), + )?; + + Ok(Self { + dict_array, + normalized_keys, + values_builder: Box::new(values_builder), + }) + } + + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + // Dictionary indexing is trivial - just a direct lookup using normalized keys! 
+ let keys = self.dict_array.keys(); + + if keys.is_null(index) { + builder.append_null(); + } else { + let normalized_key = self.normalized_keys[index]; + self.values_builder.append_row(normalized_key, builder)?; + } + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -348,6 +388,11 @@ fn make_arrow_to_variant_row_builder<'a>( } } + // Dictionary types + DataType::Dictionary(_, _) => { + Ok(ArrowToVariantRowBuilder::Dictionary(DictionaryArrowToVariantBuilder::new(array)?)) + } + // TODO: Add other types (Binary, Date, Time, Decimal, etc.) _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } @@ -2677,7 +2722,7 @@ mod tests { #[cfg(test)] mod row_builder_tests { use super::*; - use arrow::array::{Int32Array, StringArray, BooleanArray}; + use arrow::array::{ArrayRef, Int32Array, StringArray, BooleanArray}; #[test] fn test_primitive_row_builder() { @@ -2906,4 +2951,137 @@ mod row_builder_tests { assert!(variant_array.is_null(3)); // Run 1 (null) assert_eq!(variant_array.value(4), Variant::from("B")); // Run 2 } + + #[test] + fn test_dictionary_row_builder() { + use arrow::array::{DictionaryArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a dictionary array: keys=[0, 1, 0, 2, 1], values=["apple", "banana", "cherry"] + let values = StringArray::from(vec!["apple", "banana", "cherry"]); + let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); + let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(5); + + // Test sequential access + for i in 0..5 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = 
array_builder.build(); + assert_eq!(variant_array.len(), 5); + + // Verify the values match the dictionary lookup + assert_eq!(variant_array.value(0), Variant::from("apple")); // keys[0] = 0 -> values[0] = "apple" + assert_eq!(variant_array.value(1), Variant::from("banana")); // keys[1] = 1 -> values[1] = "banana" + assert_eq!(variant_array.value(2), Variant::from("apple")); // keys[2] = 0 -> values[0] = "apple" + assert_eq!(variant_array.value(3), Variant::from("cherry")); // keys[3] = 2 -> values[2] = "cherry" + assert_eq!(variant_array.value(4), Variant::from("banana")); // keys[4] = 1 -> values[1] = "banana" + } + + #[test] + fn test_dictionary_with_nulls() { + use arrow::array::{DictionaryArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a dictionary array with null keys: keys=[0, null, 1, null, 2], values=["x", "y", "z"] + let values = StringArray::from(vec!["x", "y", "z"]); + let keys = Int32Array::from(vec![Some(0), None, Some(1), None, Some(2)]); + let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(5); + + // Test sequential access + for i in 0..5 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 5); + + // Verify the values and nulls + assert_eq!(variant_array.value(0), Variant::from("x")); // keys[0] = 0 -> values[0] = "x" + assert!(variant_array.is_null(1)); // keys[1] = null + assert_eq!(variant_array.value(2), Variant::from("y")); // keys[2] = 1 -> values[1] = "y" + assert!(variant_array.is_null(3)); // keys[3] = null + assert_eq!(variant_array.value(4), Variant::from("z")); // keys[4] = 2 -> values[2] = "z" + } + + #[test] + fn test_dictionary_random_access() { + use 
arrow::array::{DictionaryArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a dictionary array: keys=[0, 1, 2, 0, 1, 2], values=["red", "green", "blue"] + let values = StringArray::from(vec!["red", "green", "blue"]); + let keys = Int32Array::from(vec![0, 1, 2, 0, 1, 2]); + let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + + // Test random access pattern + let access_pattern = [5, 0, 3, 1, 4, 2]; // Random order + let expected_values = ["blue", "red", "red", "green", "green", "blue"]; + + for (i, &index) in access_pattern.iter().enumerate() { + let mut array_builder = VariantArrayBuilder::new(1); + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(index, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.value(0), Variant::from(expected_values[i])); + } + } + + #[test] + fn test_nested_dictionary() { + use arrow::array::{DictionaryArray, Int32Array, StructArray}; + use arrow::datatypes::{Int32Type, Field}; + + // Create a dictionary with struct values + let id_array = Int32Array::from(vec![1, 2, 3]); + let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie"]); + let struct_array = StructArray::from(vec![ + (Arc::new(Field::new("id", DataType::Int32, false)), Arc::new(id_array) as ArrayRef), + (Arc::new(Field::new("name", DataType::Utf8, false)), Arc::new(name_array) as ArrayRef), + ]); + + let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); + let dict_array = DictionaryArray::::try_new(keys, Arc::new(struct_array)).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(5); + + // Test sequential access + for i in 0..5 { + let mut variant_builder = array_builder.variant_builder(); 
+ row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 5); + + // Verify the nested struct values + let first_variant = variant_array.value(0); + assert_eq!(first_variant.get_object_field("id"), Some(Variant::from(1))); + assert_eq!(first_variant.get_object_field("name"), Some(Variant::from("Alice"))); + + let second_variant = variant_array.value(1); + assert_eq!(second_variant.get_object_field("id"), Some(Variant::from(2))); + assert_eq!(second_variant.get_object_field("name"), Some(Variant::from("Bob"))); + + // Test that repeated keys give same values + let third_variant = variant_array.value(2); + assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(1))); + assert_eq!(third_variant.get_object_field("name"), Some(Variant::from("Alice"))); + } } From 82ccc23e0540cd11dffe98424e6cbddcce019bea Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 16:46:11 -0700 Subject: [PATCH 11/53] manual cleanup --- .../src/cast_to_variant.rs | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 41923eed20da..4cd3fb1eeaf4 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -24,13 +24,13 @@ use crate::type_conversion::{ }; use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ - AnyDictionaryArray, Array, AsArray, OffsetSizeTrait, TimestampMicrosecondArray, TimestampMillisecondArray, + Array, AsArray, OffsetSizeTrait, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, }; use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::kernels::cast; use arrow::datatypes::{ - i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowPrimitiveType, BinaryType, BinaryViewType, 
Date32Type, Date64Type, Decimal128Type, + i256, ArrowNativeType, ArrowPrimitiveType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, @@ -315,7 +315,7 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { /// Dictionary array builder with simple O(1) indexing pub(crate) struct DictionaryArrowToVariantBuilder<'a> { - dict_array: &'a dyn arrow::array::AnyDictionaryArray, + keys: &'a dyn Array, // only needed for null checks normalized_keys: Vec, values_builder: Box>, } @@ -323,25 +323,27 @@ pub(crate) struct DictionaryArrowToVariantBuilder<'a> { impl<'a> DictionaryArrowToVariantBuilder<'a> { fn new(array: &'a dyn Array) -> Result { let dict_array = array.as_any_dictionary(); - let normalized_keys = dict_array.normalized_keys().to_vec(); - + let values = dict_array.values(); let values_builder = make_arrow_to_variant_row_builder( - dict_array.values().data_type(), - dict_array.values().as_ref(), + values.data_type(), + values.as_ref(), )?; + // WARNING: normalized_keys panics if values is empty + let normalized_keys = match values.len() { + 0 => Vec::new(), + _ => dict_array.normalized_keys(), + }; + Ok(Self { - dict_array, + keys: dict_array.keys(), normalized_keys, values_builder: Box::new(values_builder), }) } fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - // Dictionary indexing is trivial - just a direct lookup using normalized keys! 
- let keys = self.dict_array.keys(); - - if keys.is_null(index) { + if self.keys.is_null(index) { builder.append_null(); } else { let normalized_key = self.normalized_keys[index]; From e08e07ce672c9571987d5e92c43c88866c4f3411 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 17:58:42 -0700 Subject: [PATCH 12/53] checkpoint - lists --- .../src/cast_to_variant.rs | 250 ++++++++++++++++++ 1 file changed, 250 insertions(+) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 4cd3fb1eeaf4..5d9beb254c37 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -70,6 +70,8 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder<'a, Int32Type>), RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder<'a, Int64Type>), Dictionary(DictionaryArrowToVariantBuilder<'a>), + List(ListArrowToVariantBuilder<'a, i32>), + LargeList(ListArrowToVariantBuilder<'a, i64>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -93,6 +95,8 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::RunEndEncodedInt32(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::RunEndEncodedInt64(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Dictionary(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::List(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::LargeList(b) => b.append_row(index, builder), } } } @@ -353,6 +357,55 @@ impl<'a> DictionaryArrowToVariantBuilder<'a> { } } +/// Generic list builder for List and LargeList types +pub(crate) struct ListArrowToVariantBuilder<'a, O: OffsetSizeTrait> { + list_array: &'a arrow::array::GenericListArray, + values_builder: Box>, + first_offset: O, +} + +impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { + fn new(array: &'a dyn Array) -> Result { + let list_array = array.as_list::(); + let values = 
list_array.values(); + let offsets = list_array.offsets(); + + // Required for correctness when list array is sliced + let first_offset = *offsets.first().expect("There should be an offset"); + let length = *offsets.last().expect("There should be an offset") - first_offset; + let sliced_values = values.slice(first_offset.as_usize(), length.as_usize()); + + let values_builder = make_arrow_to_variant_row_builder( + sliced_values.data_type(), + sliced_values.as_ref(), + )?; + + Ok(Self { + list_array, + values_builder: Box::new(values_builder), + first_offset, + }) + } + + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.list_array.is_null(index) { + builder.append_null(); + return Ok(()); + } + + let offsets = self.list_array.offsets(); + let start = (offsets[index] - self.first_offset).as_usize(); + let end = (offsets[index + 1] - self.first_offset).as_usize(); + + let mut list_builder = builder.try_new_list()?; + for value_index in start..end { + self.values_builder.append_row(value_index, &mut list_builder)?; + } + list_builder.finish(); + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -395,6 +448,10 @@ fn make_arrow_to_variant_row_builder<'a>( Ok(ArrowToVariantRowBuilder::Dictionary(DictionaryArrowToVariantBuilder::new(array)?)) } + // List types + DataType::List(_) => Ok(ArrowToVariantRowBuilder::List(ListArrowToVariantBuilder::new(array)?)), + DataType::LargeList(_) => Ok(ArrowToVariantRowBuilder::LargeList(ListArrowToVariantBuilder::new(array)?)), + // TODO: Add other types (Binary, Date, Time, Decimal, etc.) 
_ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } @@ -3086,4 +3143,197 @@ mod row_builder_tests { assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(1))); assert_eq!(third_variant.get_object_field("name"), Some(Variant::from("Alice"))); } + + #[test] + fn test_list_row_builder() { + use arrow::array::{ListArray, Int32Array}; + use arrow::datatypes::{DataType, Field}; + use std::sync::Arc; + + // Create a list array: [[1, 2], [3, 4, 5], null, []] + let data = vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + None, + Some(vec![]), + ]; + let list_array = ListArray::from_iter_primitive::(data); + + let mut row_builder = make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); + + // Test each row + for i in 0..list_array.len() { + row_builder.append_row(i, &mut variant_array_builder).unwrap(); + } + + let variant_array = variant_array_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 4); + + // Row 0: [1, 2] + let row0 = variant_array.value(0); + let list0 = row0.get_list().unwrap(); + assert_eq!(list0.len(), 2); + assert_eq!(list0.get(0), Some(Variant::from(1))); + assert_eq!(list0.get(1), Some(Variant::from(2))); + + // Row 1: [3, 4, 5] + let row1 = variant_array.value(1); + let list1 = row1.get_list().unwrap(); + assert_eq!(list1.len(), 3); + assert_eq!(list1.get(0), Some(Variant::from(3))); + assert_eq!(list1.get(1), Some(Variant::from(4))); + assert_eq!(list1.get(2), Some(Variant::from(5))); + + // Row 2: null + assert!(variant_array.is_null(2)); + + // Row 3: [] + let row3 = variant_array.value(3); + let list3 = row3.get_list().unwrap(); + assert_eq!(list3.len(), 0); + } + + #[test] + fn test_large_list_row_builder() { + use arrow::array::{LargeListArray, Int64Array}; + use arrow::datatypes::{DataType, Field}; + use std::sync::Arc; + + // Create a 
large list array: [[1, 2], null] + let data = vec![ + Some(vec![Some(1i64), Some(2i64)]), + None, + ]; + let list_array = LargeListArray::from_iter_primitive::(data); + + let mut row_builder = make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); + + // Test each row + for i in 0..list_array.len() { + row_builder.append_row(i, &mut variant_array_builder).unwrap(); + } + + let variant_array = variant_array_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 2); + + // Row 0: [1, 2] + let row0 = variant_array.value(0); + let list0 = row0.get_list().unwrap(); + assert_eq!(list0.len(), 2); + assert_eq!(list0.get(0), Some(Variant::from(1i64))); + assert_eq!(list0.get(1), Some(Variant::from(2i64))); + + // Row 1: null + assert!(variant_array.is_null(1)); + } + + #[test] + fn test_sliced_list_row_builder() { + use arrow::array::{ListArray, Int32Array}; + use arrow::datatypes::{DataType, Field}; + use std::sync::Arc; + + // Create a list array: [[1, 2], [3, 4, 5], [6]] + let data = vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + Some(vec![Some(6)]), + ]; + let list_array = ListArray::from_iter_primitive::(data); + + // Slice to get just the middle element: [[3, 4, 5]] + let sliced_array = list_array.slice(1, 1); + + let mut row_builder = make_arrow_to_variant_row_builder(sliced_array.data_type(), sliced_array.as_ref()).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(sliced_array.len()); + + // Test the single row + row_builder.append_row(0, &mut variant_array_builder).unwrap(); + + let variant_array = variant_array_builder.build(); + + // Verify result + assert_eq!(variant_array.len(), 1); + + // Row 0: [3, 4, 5] + let row0 = variant_array.value(0); + let list0 = row0.get_list().unwrap(); + assert_eq!(list0.len(), 3); + assert_eq!(list0.get(0), Some(Variant::from(3))); + assert_eq!(list0.get(1), 
Some(Variant::from(4))); + assert_eq!(list0.get(2), Some(Variant::from(5))); + } + + #[test] + fn test_nested_list_row_builder() { + use arrow::array::{ListArray, Int32Array}; + use arrow::datatypes::{DataType, Field}; + use std::sync::Arc; + + // Create nested list: [[[1, 2], [3]], null] + let inner_data = vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3)]), + ]; + let inner_list = ListArray::from_iter_primitive::(inner_data); + + let outer_data = vec![ + Some(vec![Some(0), Some(1)]), // References to inner list elements + None, + ]; + + // Build the nested structure manually + let inner_field = Arc::new(Field::new("item", DataType::Int32, true)); + let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_field), true)); + + let values_data = vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3)]), + ]; + let values_list = ListArray::from_iter_primitive::(values_data); + + let outer_offsets = arrow::buffer::OffsetBuffer::new(vec![0i32, 2, 2].into()); + let outer_list = ListArray::new( + inner_list_field, + outer_offsets, + Arc::new(values_list), + Some(arrow::buffer::NullBuffer::from(vec![true, false])), + ); + + let mut row_builder = make_arrow_to_variant_row_builder(outer_list.data_type(), &outer_list).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(outer_list.len()); + + // Test each row + for i in 0..outer_list.len() { + row_builder.append_row(i, &mut variant_array_builder).unwrap(); + } + + let variant_array = variant_array_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 2); + + // Row 0: [[1, 2], [3]] + let row0 = variant_array.value(0); + let outer_list0 = row0.get_list().unwrap(); + assert_eq!(outer_list0.len(), 2); + + let inner_list0_0 = outer_list0.get(0).unwrap().get_list().unwrap(); + assert_eq!(inner_list0_0.len(), 2); + assert_eq!(inner_list0_0.get(0), Some(Variant::from(1))); + assert_eq!(inner_list0_0.get(1), Some(Variant::from(2))); + + let inner_list0_1 = 
outer_list0.get(1).unwrap().get_list().unwrap(); + assert_eq!(inner_list0_1.len(), 1); + assert_eq!(inner_list0_1.get(0), Some(Variant::from(3))); + + // Row 1: null + assert!(variant_array.is_null(1)); + } } From 87a36786bf52e01136ce342bb67469d9d91dd972 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 18:23:27 -0700 Subject: [PATCH 13/53] checkpoint - lists w/o slicing --- parquet-variant-compute/src/cast_to_variant.rs | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 5d9beb254c37..b7bc95df76ae 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -361,29 +361,22 @@ impl<'a> DictionaryArrowToVariantBuilder<'a> { pub(crate) struct ListArrowToVariantBuilder<'a, O: OffsetSizeTrait> { list_array: &'a arrow::array::GenericListArray, values_builder: Box>, - first_offset: O, } impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { fn new(array: &'a dyn Array) -> Result { let list_array = array.as_list::(); let values = list_array.values(); - let offsets = list_array.offsets(); - - // Required for correctness when list array is sliced - let first_offset = *offsets.first().expect("There should be an offset"); - let length = *offsets.last().expect("There should be an offset") - first_offset; - let sliced_values = values.slice(first_offset.as_usize(), length.as_usize()); + // Create builder for the values array directly - no slicing needed let values_builder = make_arrow_to_variant_row_builder( - sliced_values.data_type(), - sliced_values.as_ref(), + values.data_type(), + values.as_ref(), )?; Ok(Self { list_array, values_builder: Box::new(values_builder), - first_offset, }) } @@ -394,8 +387,8 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { } let offsets = self.list_array.offsets(); - let start = (offsets[index] - 
self.first_offset).as_usize(); - let end = (offsets[index + 1] - self.first_offset).as_usize(); + let start = offsets[index].as_usize(); // Direct offset - no adjustment needed + let end = offsets[index + 1].as_usize(); // Direct offset - no adjustment needed let mut list_builder = builder.try_new_list()?; for value_index in start..end { From 2ac5f6dd5b29a85be61b7e51757f0a27954a7b5b Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 18:32:56 -0700 Subject: [PATCH 14/53] manual cleanup --- .../src/cast_to_variant.rs | 61 +++++++++---------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index b7bc95df76ae..4131c91a4a55 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -368,7 +368,6 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { let list_array = array.as_list::(); let values = list_array.values(); - // Create builder for the values array directly - no slicing needed let values_builder = make_arrow_to_variant_row_builder( values.data_type(), values.as_ref(), @@ -387,8 +386,8 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { } let offsets = self.list_array.offsets(); - let start = offsets[index].as_usize(); // Direct offset - no adjustment needed - let end = offsets[index + 1].as_usize(); // Direct offset - no adjustment needed + let start = offsets[index].as_usize(); + let end = offsets[index + 1].as_usize(); let mut list_builder = builder.try_new_list()?; for value_index in start..end { @@ -3139,9 +3138,7 @@ mod row_builder_tests { #[test] fn test_list_row_builder() { - use arrow::array::{ListArray, Int32Array}; - use arrow::datatypes::{DataType, Field}; - use std::sync::Arc; + use arrow::array::{ListArray}; // Create a list array: [[1, 2], [3, 4, 5], null, []] let data = vec![ @@ -3155,9 +3152,10 @@ mod row_builder_tests { let mut 
row_builder = make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); - // Test each row for i in 0..list_array.len() { - row_builder.append_row(i, &mut variant_array_builder).unwrap(); + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); } let variant_array = variant_array_builder.build(); @@ -3167,14 +3165,14 @@ mod row_builder_tests { // Row 0: [1, 2] let row0 = variant_array.value(0); - let list0 = row0.get_list().unwrap(); + let list0 = row0.as_list().unwrap(); assert_eq!(list0.len(), 2); assert_eq!(list0.get(0), Some(Variant::from(1))); assert_eq!(list0.get(1), Some(Variant::from(2))); // Row 1: [3, 4, 5] let row1 = variant_array.value(1); - let list1 = row1.get_list().unwrap(); + let list1 = row1.as_list().unwrap(); assert_eq!(list1.len(), 3); assert_eq!(list1.get(0), Some(Variant::from(3))); assert_eq!(list1.get(1), Some(Variant::from(4))); @@ -3185,15 +3183,13 @@ mod row_builder_tests { // Row 3: [] let row3 = variant_array.value(3); - let list3 = row3.get_list().unwrap(); + let list3 = row3.as_list().unwrap(); assert_eq!(list3.len(), 0); } #[test] fn test_large_list_row_builder() { - use arrow::array::{LargeListArray, Int64Array}; - use arrow::datatypes::{DataType, Field}; - use std::sync::Arc; + use arrow::array::{LargeListArray}; // Create a large list array: [[1, 2], null] let data = vec![ @@ -3205,9 +3201,10 @@ mod row_builder_tests { let mut row_builder = make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); - // Test each row for i in 0..list_array.len() { - row_builder.append_row(i, &mut variant_array_builder).unwrap(); + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); } let variant_array 
= variant_array_builder.build(); @@ -3217,7 +3214,7 @@ mod row_builder_tests { // Row 0: [1, 2] let row0 = variant_array.value(0); - let list0 = row0.get_list().unwrap(); + let list0 = row0.as_list().unwrap(); assert_eq!(list0.len(), 2); assert_eq!(list0.get(0), Some(Variant::from(1i64))); assert_eq!(list0.get(1), Some(Variant::from(2i64))); @@ -3228,9 +3225,7 @@ mod row_builder_tests { #[test] fn test_sliced_list_row_builder() { - use arrow::array::{ListArray, Int32Array}; - use arrow::datatypes::{DataType, Field}; - use std::sync::Arc; + use arrow::array::{ListArray}; // Create a list array: [[1, 2], [3, 4, 5], [6]] let data = vec![ @@ -3243,11 +3238,13 @@ mod row_builder_tests { // Slice to get just the middle element: [[3, 4, 5]] let sliced_array = list_array.slice(1, 1); - let mut row_builder = make_arrow_to_variant_row_builder(sliced_array.data_type(), sliced_array.as_ref()).unwrap(); + let mut row_builder = make_arrow_to_variant_row_builder(sliced_array.data_type(), &sliced_array).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(sliced_array.len()); // Test the single row - row_builder.append_row(0, &mut variant_array_builder).unwrap(); + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(0, &mut builder).unwrap(); + builder.finish(); let variant_array = variant_array_builder.build(); @@ -3256,7 +3253,7 @@ mod row_builder_tests { // Row 0: [3, 4, 5] let row0 = variant_array.value(0); - let list0 = row0.get_list().unwrap(); + let list0 = row0.as_list().unwrap(); assert_eq!(list0.len(), 3); assert_eq!(list0.get(0), Some(Variant::from(3))); assert_eq!(list0.get(1), Some(Variant::from(4))); @@ -3265,9 +3262,8 @@ mod row_builder_tests { #[test] fn test_nested_list_row_builder() { - use arrow::array::{ListArray, Int32Array}; - use arrow::datatypes::{DataType, Field}; - use std::sync::Arc; + use arrow::array::{ListArray}; + use arrow::datatypes::Field; // Create nested list: [[[1, 2], [3]], null] let inner_data = 
vec![ @@ -3302,9 +3298,10 @@ mod row_builder_tests { let mut row_builder = make_arrow_to_variant_row_builder(outer_list.data_type(), &outer_list).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(outer_list.len()); - // Test each row for i in 0..outer_list.len() { - row_builder.append_row(i, &mut variant_array_builder).unwrap(); + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); } let variant_array = variant_array_builder.build(); @@ -3314,15 +3311,17 @@ mod row_builder_tests { // Row 0: [[1, 2], [3]] let row0 = variant_array.value(0); - let outer_list0 = row0.get_list().unwrap(); + let outer_list0 = row0.as_list().unwrap(); assert_eq!(outer_list0.len(), 2); - let inner_list0_0 = outer_list0.get(0).unwrap().get_list().unwrap(); + let inner_list0_0 = outer_list0.get(0).unwrap(); + let inner_list0_0 = inner_list0_0.as_list().unwrap(); assert_eq!(inner_list0_0.len(), 2); assert_eq!(inner_list0_0.get(0), Some(Variant::from(1))); assert_eq!(inner_list0_0.get(1), Some(Variant::from(2))); - let inner_list0_1 = outer_list0.get(1).unwrap().get_list().unwrap(); + let inner_list0_1 = outer_list0.get(1).unwrap(); + let inner_list0_1 = inner_list0_1.as_list().unwrap(); assert_eq!(inner_list0_1.len(), 1); assert_eq!(inner_list0_1.get(0), Some(Variant::from(3))); From 4d8da85da36a77449dfba721bb3cde9ffcb835f0 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 18:44:15 -0700 Subject: [PATCH 15/53] checkpoint - maps and null map fix --- .../src/cast_to_variant.rs | 246 +++++++++++++++--- 1 file changed, 210 insertions(+), 36 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 4131c91a4a55..df6cedaf2fee 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -72,6 +72,7 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { 
Dictionary(DictionaryArrowToVariantBuilder<'a>), List(ListArrowToVariantBuilder<'a, i32>), LargeList(ListArrowToVariantBuilder<'a, i64>), + Map(MapArrowToVariantBuilder<'a>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -97,6 +98,7 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::Dictionary(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::List(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::LargeList(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Map(b) => b.append_row(index, builder), } } } @@ -398,6 +400,61 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { } } +/// Map builder for MapArray types +pub(crate) struct MapArrowToVariantBuilder<'a> { + map_array: &'a arrow::array::MapArray, + key_strings: arrow::array::StringArray, + values_builder: Box>, +} + +impl<'a> MapArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Result { + let map_array = array.as_map(); + + // Pre-cast keys to strings once (like existing convert_map code) + let keys = cast(map_array.keys(), &DataType::Utf8)?; + let key_strings = keys.as_string::().clone(); + + // Create recursive builder for values + let values = map_array.values(); + let values_builder = make_arrow_to_variant_row_builder( + values.data_type(), + values.as_ref(), + )?; + + Ok(Self { + map_array, + key_strings, + values_builder: Box::new(values_builder), + }) + } + + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + // Check for NULL map first (via null bitmap) + if self.map_array.is_null(index) { + builder.append_null(); + return Ok(()); + } + + let offsets = self.map_array.offsets(); + let start = offsets[index].as_usize(); + let end = offsets[index + 1].as_usize(); + + // Create object builder for this map (even if empty) + let mut object_builder = builder.try_new_object()?; + + // Add each key-value pair (loop does nothing for empty maps - correct!) 
+ for kv_index in start..end { + let key = self.key_strings.value(kv_index); + let mut field_builder = object_builder.field(key); + self.values_builder.append_row(kv_index, &mut field_builder)?; + } + + object_builder.finish(); // Empty map becomes empty object {} + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -444,6 +501,9 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::List(_) => Ok(ArrowToVariantRowBuilder::List(ListArrowToVariantBuilder::new(array)?)), DataType::LargeList(_) => Ok(ArrowToVariantRowBuilder::LargeList(ListArrowToVariantBuilder::new(array)?)), + // Map types + DataType::Map(_, _) => Ok(ArrowToVariantRowBuilder::Map(MapArrowToVariantBuilder::new(array)?)), + // TODO: Add other types (Binary, Date, Time, Decimal, etc.) _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } @@ -893,29 +953,29 @@ fn convert_map( let values = cast_to_variant(map_array.values())?; let offsets = map_array.offsets(); - let mut start_offset = offsets[0]; - for end_offset in offsets.iter().skip(1) { - if start_offset >= *end_offset { + for i in 0..map_array.len() { + // Check for NULL map first (FIXED: was checking offsets before) + if map_array.is_null(i) { builder.append_null(); continue; } - - let length = (end_offset - start_offset) as usize; + + let start = offsets[i].as_usize(); + let end = offsets[i + 1].as_usize(); let mut variant_builder = VariantBuilder::new(); let mut object_builder = variant_builder.new_object(); - for i in start_offset..*end_offset { - let value = values.value(i as usize); - object_builder.insert(key_strings.value(i as usize), value); + // Add key-value pairs (empty range = empty object, FIXED) + for j in start..end { + let value = values.value(j); + object_builder.insert(key_strings.value(j), value); } + object_builder.finish(); let (metadata, value) = variant_builder.finish(); 
let variant = Variant::try_new(&metadata, &value)?; - builder.append_variant(variant); - - start_offset += length as i32; } } _ => { @@ -2502,35 +2562,63 @@ mod tests { } #[test] - fn test_cast_to_variant_map_with_nulls() { - let keys = vec!["key1", "key2", "key3"]; - let values_data = Int32Array::from(vec![1, 2, 3]); - let entry_offsets = vec![0, 1, 1, 3]; - let map_array = - MapArray::new_from_strings(keys.clone().into_iter(), &values_data, &entry_offsets) - .unwrap(); + fn test_cast_to_variant_map_with_nulls_and_empty() { + use arrow::array::{MapArray, Int32Array, StringArray, StructArray}; + use arrow::buffer::{OffsetBuffer, NullBuffer}; + use arrow::datatypes::{DataType, Field, Fields}; + use std::sync::Arc; - let result = cast_to_variant(&map_array).unwrap(); - // [{"key1":1}] - let variant1 = result.value(0); - assert_eq!( - variant1.as_object().unwrap().get("key1").unwrap(), - Variant::from(1) + // Create entries struct array + let keys = StringArray::from(vec!["key1", "key2", "key3"]); + let values = Int32Array::from(vec![1, 2, 3]); + let entries_fields = Fields::from(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Int32, true), + ]); + let entries = StructArray::new( + entries_fields.clone(), + vec![Arc::new(keys), Arc::new(values)], + None, ); - // None - assert!(result.is_null(1)); + // Create offsets for 4 maps: [0..1], [1..1], [1..1], [1..3] + let offsets = OffsetBuffer::new(vec![0, 1, 1, 1, 3].into()); + + // Create null buffer - map at index 2 is NULL + let null_buffer = Some(NullBuffer::from(vec![true, true, false, true])); + + let map_field = Arc::new(Field::new( + "entries", + DataType::Struct(entries_fields), + false, + )); + + let map_array = MapArray::try_new( + map_field, + offsets, + entries, + null_buffer, + false, + ).unwrap(); - // [{"key2":2},{"key3":3}] - let variant2 = result.value(2); - assert_eq!( - variant2.as_object().unwrap().get("key2").unwrap(), - Variant::from(2) - ); - assert_eq!( - 
variant2.as_object().unwrap().get("key3").unwrap(), - Variant::from(3) - ); + let result = cast_to_variant(&map_array).unwrap(); + + // Map 0: {"key1": 1} + let variant0 = result.value(0); + assert_eq!(variant0.as_object().unwrap().get("key1").unwrap(), Variant::from(1)); + + // Map 1: {} (empty, not null) - FIXED: was incorrectly null before + let variant1 = result.value(1); + let obj1 = variant1.as_object().unwrap(); + assert_eq!(obj1.len(), 0); // Empty object + + // Map 2: null (actual NULL) + assert!(result.is_null(2)); + + // Map 3: {"key2": 2, "key3": 3} + let variant3 = result.value(3); + assert_eq!(variant3.as_object().unwrap().get("key2").unwrap(), Variant::from(2)); + assert_eq!(variant3.as_object().unwrap().get("key3").unwrap(), Variant::from(3)); } #[test] @@ -3328,4 +3416,90 @@ mod row_builder_tests { // Row 1: null assert!(variant_array.is_null(1)); } + + #[test] + fn test_map_row_builder() { + use arrow::array::{MapArray, Int32Array, StringArray, StructArray}; + use arrow::buffer::{OffsetBuffer, NullBuffer}; + use arrow::datatypes::{DataType, Field, Fields}; + use std::sync::Arc; + + // Create the entries struct array (key-value pairs) + let keys = StringArray::from(vec!["key1", "key2", "key3"]); + let values = Int32Array::from(vec![1, 2, 3]); + let entries_fields = Fields::from(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Int32, true), + ]); + let entries = StructArray::new( + entries_fields.clone(), + vec![Arc::new(keys), Arc::new(values)], + None, // No nulls in the entries themselves + ); + + // Create offsets for 4 maps: [0..1], [1..1], [1..1], [1..3] + // Map 0: {"key1": 1} (1 entry) + // Map 1: {} (0 entries - empty) + // Map 2: null (0 entries but NULL via null buffer) + // Map 3: {"key2": 2, "key3": 3} (2 entries) + let offsets = OffsetBuffer::new(vec![0, 1, 1, 1, 3].into()); + + // Create null buffer - map at index 2 is NULL + let null_buffer = Some(NullBuffer::from(vec![true, true, false, true])); + + 
// Create the map field + let map_field = Arc::new(Field::new( + "entries", + DataType::Struct(entries_fields), + false, // Keys are non-nullable + )); + + // Create MapArray using try_new + let map_array = MapArray::try_new( + map_field, + offsets, + entries, + null_buffer, + false, // not ordered + ).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder( + map_array.data_type(), + &map_array + ).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(4); + + // Test each row + for i in 0..4 { + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = variant_array_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 4); + + // Map 0: {"key1": 1} + let map0 = variant_array.value(0); + let obj0 = map0.as_object().unwrap(); + assert_eq!(obj0.len(), 1); + assert_eq!(obj0.get("key1"), Some(Variant::from(1))); + + // Map 1: {} (empty object, not null) + let map1 = variant_array.value(1); + let obj1 = map1.as_object().unwrap(); + assert_eq!(obj1.len(), 0); // Empty object + + // Map 2: null (actual NULL) + assert!(variant_array.is_null(2)); + + // Map 3: {"key2": 2, "key3": 3} + let map3 = variant_array.value(3); + let obj3 = map3.as_object().unwrap(); + assert_eq!(obj3.len(), 2); + assert_eq!(obj3.get("key2"), Some(Variant::from(2))); + assert_eq!(obj3.get("key3"), Some(Variant::from(3))); + } } From 70a0c759da3d09ab849b1038822952b8708a1bb3 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 18:47:37 -0700 Subject: [PATCH 16/53] manual cleanup --- parquet-variant-compute/src/cast_to_variant.rs | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index df6cedaf2fee..3354d08c3d60 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ 
b/parquet-variant-compute/src/cast_to_variant.rs @@ -43,7 +43,7 @@ use arrow::temporal_conversions::{ use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit, UnionFields}; use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; use parquet_variant::{ - Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, + ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; // ============================================================================ @@ -446,7 +446,7 @@ impl<'a> MapArrowToVariantBuilder<'a> { // Add each key-value pair (loop does nothing for empty maps - correct!) for kv_index in start..end { let key = self.key_strings.value(kv_index); - let mut field_builder = object_builder.field(key); + let mut field_builder = ObjectFieldBuilder::new(key, &mut object_builder); self.values_builder.append_row(kv_index, &mut field_builder)?; } @@ -2607,7 +2607,7 @@ mod tests { let variant0 = result.value(0); assert_eq!(variant0.as_object().unwrap().get("key1").unwrap(), Variant::from(1)); - // Map 1: {} (empty, not null) - FIXED: was incorrectly null before + // Map 1: {} (empty, not null) let variant1 = result.value(1); let obj1 = variant1.as_object().unwrap(); assert_eq!(obj1.len(), 0); // Empty object @@ -3352,18 +3352,6 @@ mod row_builder_tests { fn test_nested_list_row_builder() { use arrow::array::{ListArray}; use arrow::datatypes::Field; - - // Create nested list: [[[1, 2], [3]], null] - let inner_data = vec![ - Some(vec![Some(1), Some(2)]), - Some(vec![Some(3)]), - ]; - let inner_list = ListArray::from_iter_primitive::(inner_data); - - let outer_data = vec![ - Some(vec![Some(0), Some(1)]), // References to inner list elements - None, - ]; // Build the nested structure manually let inner_field = Arc::new(Field::new("item", DataType::Int32, true)); From b9239bb592bbfe4fe7197f17e7f2d45b557603ce Mon Sep 17 00:00:00 2001 From: Ryan Johnson 
Date: Sat, 6 Sep 2025 19:02:46 -0700 Subject: [PATCH 17/53] checkpoint - unions --- .../src/cast_to_variant.rs | 247 ++++++++++++++++++ 1 file changed, 247 insertions(+) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 3354d08c3d60..34d3b584505e 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -73,6 +73,7 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { List(ListArrowToVariantBuilder<'a, i32>), LargeList(ListArrowToVariantBuilder<'a, i64>), Map(MapArrowToVariantBuilder<'a>), + Union(UnionArrowToVariantBuilder<'a>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -99,6 +100,7 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::List(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::LargeList(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Map(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Union(b) => b.append_row(index, builder), } } } @@ -455,6 +457,50 @@ impl<'a> MapArrowToVariantBuilder<'a> { } } +/// Union builder for both sparse and dense union arrays +pub(crate) struct UnionArrowToVariantBuilder<'a> { + union_array: &'a arrow::array::UnionArray, + child_builders: HashMap>>, +} + +impl<'a> UnionArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Result { + let union_array = array.as_union(); + let union_fields = union_array.union_fields(); + + // Create child builders for each union field + let mut child_builders = HashMap::new(); + for (type_id, _field) in union_fields.iter() { + let child_array = union_array.child(type_id); + let child_builder = make_arrow_to_variant_row_builder( + child_array.data_type(), + child_array.as_ref(), + )?; + child_builders.insert(type_id, Box::new(child_builder)); + } + + Ok(Self { + union_array, + child_builders, + }) + } + + fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + 
let type_id = self.union_array.type_id(index); + let value_offset = self.union_array.value_offset(index); + + if let Some(child_builder) = self.child_builders.get_mut(&type_id) { + // Delegate to the appropriate child builder + child_builder.append_row(value_offset, builder)?; + } else { + // Invalid type_id - should not happen in valid union, handle gracefully + builder.append_null(); + } + + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -504,6 +550,9 @@ fn make_arrow_to_variant_row_builder<'a>( // Map types DataType::Map(_, _) => Ok(ArrowToVariantRowBuilder::Map(MapArrowToVariantBuilder::new(array)?)), + // Union types + DataType::Union(_, _) => Ok(ArrowToVariantRowBuilder::Union(UnionArrowToVariantBuilder::new(array)?)), + // TODO: Add other types (Binary, Date, Time, Decimal, etc.) _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } @@ -3490,4 +3539,202 @@ mod row_builder_tests { assert_eq!(obj3.get("key2"), Some(Variant::from(2))); assert_eq!(obj3.get("key3"), Some(Variant::from(3))); } + + #[test] + fn test_union_sparse_row_builder() { + use arrow::array::{Int32Array, Float64Array, StringArray, UnionArray}; + use arrow::buffer::ScalarBuffer; + use arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; + use std::sync::Arc; + + // Create a sparse union array with mixed types (int, float, string) + let int_array = Int32Array::from(vec![Some(1), None, None, None, Some(34), None]); + let float_array = Float64Array::from(vec![None, Some(3.2), None, Some(32.5), None, None]); + let string_array = StringArray::from(vec![None, None, Some("hello"), None, None, None]); + let type_ids = [0, 1, 2, 1, 0, 0].into_iter().collect::>(); + + let union_fields = UnionFields::new( + vec![0, 1, 2], + vec![ + Field::new("int_field", DataType::Int32, false), + Field::new("float_field", DataType::Float64, false), + 
Field::new("string_field", DataType::Utf8, false), + ], + ); + + let children: Vec> = vec![ + Arc::new(int_array), + Arc::new(float_array), + Arc::new(string_array), + ]; + + let union_array = UnionArray::try_new( + union_fields, + type_ids, + None, // Sparse union + children, + ) + .unwrap(); + + // Test the row builder + let mut builder = make_arrow_to_variant_row_builder( + union_array.data_type(), + &union_array, + ).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(union_array.len()); + for i in 0..union_array.len() { + builder.append_row(i, &mut variant_builder).unwrap(); + } + let variant_array = variant_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 6); + + // Row 0: int 1 + assert_eq!(variant_array.value(0), Variant::Int32(1)); + + // Row 1: float 3.2 + assert_eq!(variant_array.value(1), Variant::Double(3.2)); + + // Row 2: string "hello" + assert_eq!(variant_array.value(2), Variant::from("hello")); + + // Row 3: float 32.5 + assert_eq!(variant_array.value(3), Variant::Double(32.5)); + + // Row 4: int 34 + assert_eq!(variant_array.value(4), Variant::Int32(34)); + + // Row 5: null (int array has null at this position) + assert!(variant_array.is_null(5)); + } + + #[test] + fn test_union_dense_row_builder() { + use arrow::array::{Int32Array, Float64Array, StringArray, UnionArray}; + use arrow::buffer::ScalarBuffer; + use arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; + use std::sync::Arc; + + // Create a dense union array with mixed types (int, float, string) + let int_array = Int32Array::from(vec![Some(1), Some(34), None]); + let float_array = Float64Array::from(vec![3.2, 32.5]); + let string_array = StringArray::from(vec!["hello"]); + let type_ids = [0, 1, 2, 1, 0, 0].into_iter().collect::>(); + let offsets = [0, 0, 0, 1, 1, 2] + .into_iter() + .collect::>(); + + let union_fields = UnionFields::new( + vec![0, 1, 2], + vec![ + Field::new("int_field", DataType::Int32, false), + Field::new("float_field", 
DataType::Float64, false), + Field::new("string_field", DataType::Utf8, false), + ], + ); + + let children: Vec> = vec![ + Arc::new(int_array), + Arc::new(float_array), + Arc::new(string_array), + ]; + + let union_array = UnionArray::try_new( + union_fields, + type_ids, + Some(offsets), // Dense union + children, + ) + .unwrap(); + + // Test the row builder + let mut builder = make_arrow_to_variant_row_builder( + union_array.data_type(), + &union_array, + ).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(union_array.len()); + for i in 0..union_array.len() { + builder.append_row(i, &mut variant_builder).unwrap(); + } + let variant_array = variant_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 6); + + // Row 0: int 1 (offset 0 in int_array) + assert_eq!(variant_array.value(0), Variant::Int32(1)); + + // Row 1: float 3.2 (offset 0 in float_array) + assert_eq!(variant_array.value(1), Variant::Double(3.2)); + + // Row 2: string "hello" (offset 0 in string_array) + assert_eq!(variant_array.value(2), Variant::from("hello")); + + // Row 3: float 32.5 (offset 1 in float_array) + assert_eq!(variant_array.value(3), Variant::Double(32.5)); + + // Row 4: int 34 (offset 1 in int_array) + assert_eq!(variant_array.value(4), Variant::Int32(34)); + + // Row 5: null (offset 2 in int_array, which has null) + assert!(variant_array.is_null(5)); + } + + #[test] + fn test_union_sparse_type_ids_row_builder() { + use arrow::array::{Int32Array, StringArray, UnionArray}; + use arrow::buffer::ScalarBuffer; + use arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; + use std::sync::Arc; + + // Create a sparse union with non-contiguous type IDs (1, 3) + let int_array = Int32Array::from(vec![Some(42), None]); + let string_array = StringArray::from(vec![None, Some("test")]); + let type_ids = [1, 3].into_iter().collect::>(); + + let union_fields = UnionFields::new( + vec![1, 3], // Non-contiguous type IDs + vec![ + Field::new("int_field", 
DataType::Int32, false), + Field::new("string_field", DataType::Utf8, false), + ], + ); + + let children: Vec> = vec![ + Arc::new(int_array), + Arc::new(string_array), + ]; + + let union_array = UnionArray::try_new( + union_fields, + type_ids, + None, // Sparse union + children, + ) + .unwrap(); + + // Test the row builder + let mut builder = make_arrow_to_variant_row_builder( + union_array.data_type(), + &union_array, + ).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(union_array.len()); + for i in 0..union_array.len() { + builder.append_row(i, &mut variant_builder).unwrap(); + } + let variant_array = variant_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 2); + + // Row 0: int 42 (type_id = 1) + assert_eq!(variant_array.value(0), Variant::Int32(42)); + + // Row 1: string "test" (type_id = 3) + assert_eq!(variant_array.value(1), Variant::from("test")); + } } From 5b5ede7b812c915b7c6dfe79635c5e11dc6bef8e Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 19:07:31 -0700 Subject: [PATCH 18/53] manual cleanup --- .../src/cast_to_variant.rs | 38 ++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 34d3b584505e..c1f5655d7c08 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -466,11 +466,11 @@ pub(crate) struct UnionArrowToVariantBuilder<'a> { impl<'a> UnionArrowToVariantBuilder<'a> { fn new(array: &'a dyn Array) -> Result { let union_array = array.as_union(); - let union_fields = union_array.union_fields(); + let type_ids = union_array.type_ids(); // Create child builders for each union field let mut child_builders = HashMap::new(); - for (type_id, _field) in union_fields.iter() { + for &type_id in type_ids { let child_array = union_array.child(type_id); let child_builder = make_arrow_to_variant_row_builder( 
child_array.data_type(), @@ -489,12 +489,10 @@ impl<'a> UnionArrowToVariantBuilder<'a> { let type_id = self.union_array.type_id(index); let value_offset = self.union_array.value_offset(index); - if let Some(child_builder) = self.child_builders.get_mut(&type_id) { - // Delegate to the appropriate child builder - child_builder.append_row(value_offset, builder)?; - } else { - // Invalid type_id - should not happen in valid union, handle gracefully - builder.append_null(); + // Delegate to the appropriate child builder, or append null to handle an invalid type_id + match self.child_builders.get_mut(&type_id) { + Some(child_builder) => child_builder.append_row(value_offset, builder)?, + None => builder.append_null(), } Ok(()) @@ -3544,7 +3542,7 @@ mod row_builder_tests { fn test_union_sparse_row_builder() { use arrow::array::{Int32Array, Float64Array, StringArray, UnionArray}; use arrow::buffer::ScalarBuffer; - use arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; + use arrow::datatypes::{DataType, Field, UnionFields}; use std::sync::Arc; // Create a sparse union array with mixed types (int, float, string) @@ -3577,14 +3575,16 @@ mod row_builder_tests { .unwrap(); // Test the row builder - let mut builder = make_arrow_to_variant_row_builder( + let mut row_builder = make_arrow_to_variant_row_builder( union_array.data_type(), &union_array, ).unwrap(); let mut variant_builder = VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { - builder.append_row(i, &mut variant_builder).unwrap(); + let mut builder = variant_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); } let variant_array = variant_builder.build(); @@ -3614,7 +3614,7 @@ mod row_builder_tests { fn test_union_dense_row_builder() { use arrow::array::{Int32Array, Float64Array, StringArray, UnionArray}; use arrow::buffer::ScalarBuffer; - use arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; + use arrow::datatypes::{DataType, 
Field, UnionFields}; use std::sync::Arc; // Create a dense union array with mixed types (int, float, string) @@ -3650,14 +3650,16 @@ mod row_builder_tests { .unwrap(); // Test the row builder - let mut builder = make_arrow_to_variant_row_builder( + let mut row_builder = make_arrow_to_variant_row_builder( union_array.data_type(), &union_array, ).unwrap(); let mut variant_builder = VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { - builder.append_row(i, &mut variant_builder).unwrap(); + let mut builder = variant_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); } let variant_array = variant_builder.build(); @@ -3687,7 +3689,7 @@ mod row_builder_tests { fn test_union_sparse_type_ids_row_builder() { use arrow::array::{Int32Array, StringArray, UnionArray}; use arrow::buffer::ScalarBuffer; - use arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; + use arrow::datatypes::{DataType, Field, UnionFields}; use std::sync::Arc; // Create a sparse union with non-contiguous type IDs (1, 3) @@ -3717,14 +3719,16 @@ mod row_builder_tests { .unwrap(); // Test the row builder - let mut builder = make_arrow_to_variant_row_builder( + let mut row_builder = make_arrow_to_variant_row_builder( union_array.data_type(), &union_array, ).unwrap(); let mut variant_builder = VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { - builder.append_row(i, &mut variant_builder).unwrap(); + let mut builder = variant_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); } let variant_array = variant_builder.build(); From 18bfe040a564f07f23b1235747d7aada9e2f9890 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Sat, 6 Sep 2025 19:35:48 -0700 Subject: [PATCH 19/53] checkpoint - float16 --- parquet-variant-compute/src/cast_to_variant.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/parquet-variant-compute/src/cast_to_variant.rs 
b/parquet-variant-compute/src/cast_to_variant.rs index c1f5655d7c08..e436ef02a78a 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -60,6 +60,7 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { PrimitiveUInt16(PrimitiveArrowToVariantBuilder<'a, UInt16Type>), PrimitiveUInt32(PrimitiveArrowToVariantBuilder<'a, UInt32Type>), PrimitiveUInt64(PrimitiveArrowToVariantBuilder<'a, UInt64Type>), + PrimitiveFloat16(PrimitiveArrowToVariantBuilder<'a, Float16Type>), PrimitiveFloat32(PrimitiveArrowToVariantBuilder<'a, Float32Type>), PrimitiveFloat64(PrimitiveArrowToVariantBuilder<'a, Float64Type>), Boolean(BooleanArrowToVariantBuilder<'a>), @@ -87,6 +88,7 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::PrimitiveUInt16(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::PrimitiveUInt32(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::PrimitiveUInt64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveFloat16(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::PrimitiveFloat32(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), @@ -516,6 +518,7 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::::new(array))), // Float types + DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::::new(array))), DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::::new(array))), DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::::new(array))), From b5eaaa450f0163904a5e4a91c8bc1fc35616ea80 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 04:33:18 -0700 Subject: [PATCH 20/53] checkpoint 
- decimal --- .../src/cast_to_variant.rs | 226 ++++++++++++++++++ 1 file changed, 226 insertions(+) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index e436ef02a78a..37c824faf436 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -63,6 +63,10 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { PrimitiveFloat16(PrimitiveArrowToVariantBuilder<'a, Float16Type>), PrimitiveFloat32(PrimitiveArrowToVariantBuilder<'a, Float32Type>), PrimitiveFloat64(PrimitiveArrowToVariantBuilder<'a, Float64Type>), + Decimal32(Decimal32ArrowToVariantBuilder<'a>), + Decimal64(Decimal64ArrowToVariantBuilder<'a>), + Decimal128(Decimal128ArrowToVariantBuilder<'a>), + Decimal256(Decimal256ArrowToVariantBuilder<'a>), Boolean(BooleanArrowToVariantBuilder<'a>), String(StringArrowToVariantBuilder<'a>), Struct(StructArrowToVariantBuilder<'a>), @@ -91,6 +95,10 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::PrimitiveFloat16(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::PrimitiveFloat32(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Decimal32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Decimal64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Decimal128(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Decimal256(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::String(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Struct(b) => b.append_row(index, builder), @@ -501,6 +509,115 @@ impl<'a> UnionArrowToVariantBuilder<'a> { } } +/// Decimal32 builder for Arrow Decimal32Array +pub(crate) struct Decimal32ArrowToVariantBuilder<'a> { + array: &'a arrow::array::Decimal32Array, + scale: i8, +} + +impl<'a> 
Decimal32ArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array, scale: i8) -> Self { + Self { + array: array.as_primitive::(), + scale, + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = self.array.value(index); + let variant = decimal_to_variant_decimal!(value, &self.scale, i32, VariantDecimal4); + builder.append_value(variant); + } + Ok(()) + } +} + +/// Decimal64 builder for Arrow Decimal64Array +pub(crate) struct Decimal64ArrowToVariantBuilder<'a> { + array: &'a arrow::array::Decimal64Array, + scale: i8, +} + +impl<'a> Decimal64ArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array, scale: i8) -> Self { + Self { + array: array.as_primitive::(), + scale, + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = self.array.value(index); + let variant = decimal_to_variant_decimal!(value, &self.scale, i64, VariantDecimal8); + builder.append_value(variant); + } + Ok(()) + } +} + +/// Decimal128 builder for Arrow Decimal128Array +pub(crate) struct Decimal128ArrowToVariantBuilder<'a> { + array: &'a arrow::array::Decimal128Array, + scale: i8, +} + +impl<'a> Decimal128ArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array, scale: i8) -> Self { + Self { + array: array.as_primitive::(), + scale, + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = self.array.value(index); + let variant = decimal_to_variant_decimal!(value, &self.scale, i128, VariantDecimal16); + builder.append_value(variant); + } + Ok(()) + } +} + +/// Decimal256 builder for Arrow Decimal256Array +pub(crate) struct Decimal256ArrowToVariantBuilder<'a> { + array: &'a 
arrow::array::Decimal256Array, + scale: i8, +} + +impl<'a> Decimal256ArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array, scale: i8) -> Self { + Self { + array: array.as_primitive::(), + scale, + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = self.array.value(index); + // Special handling for Decimal256 like in original cast_to_variant + let variant = if let Some(v) = value.to_i128() { + decimal_to_variant_decimal!(v, &self.scale, i128, VariantDecimal16) + } else { + Variant::Null + }; + builder.append_value(variant); + } + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -522,6 +639,12 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::::new(array))), DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::::new(array))), + // Decimal types + DataType::Decimal32(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale))), + DataType::Decimal64(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal64(Decimal64ArrowToVariantBuilder::new(array, *scale))), + DataType::Decimal128(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal128(Decimal128ArrowToVariantBuilder::new(array, *scale))), + DataType::Decimal256(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal256(Decimal256ArrowToVariantBuilder::new(array, *scale))), + // Special types DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array))), DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), @@ -3744,4 +3867,107 @@ mod row_builder_tests { // Row 1: string "test" (type_id = 3) 
assert_eq!(variant_array.value(1), Variant::from("test")); } + + #[test] + fn test_decimal32_row_builder() { + use arrow::array::Decimal32Array; + use parquet_variant::{VariantDecimal4}; + + // Test Decimal32Array with scale 2 (e.g., for currency: 12.34) + let decimal_array = Decimal32Array::from(vec![Some(1234), None, Some(-5678)]) + .with_precision_and_scale(10, 2).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder( + decimal_array.data_type(), + &decimal_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..decimal_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: 12.34 (1234 with scale 2) + assert_eq!(variant_array.value(0), Variant::from(VariantDecimal4::try_new(1234, 2).unwrap())); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: -56.78 (-5678 with scale 2) + assert_eq!(variant_array.value(2), Variant::from(VariantDecimal4::try_new(-5678, 2).unwrap())); + } + + #[test] + fn test_decimal128_row_builder() { + use arrow::array::Decimal128Array; + use parquet_variant::{VariantDecimal16}; + + // Test Decimal128Array with negative scale (multiply by 10^|scale|) + let decimal_array = Decimal128Array::from(vec![Some(123), None, Some(456)]) + .with_precision_and_scale(10, -2).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder( + decimal_array.data_type(), + &decimal_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..decimal_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: 123 * 10^2 = 12300 with scale 0 (negative scale handling) + 
assert_eq!(variant_array.value(0), Variant::from(VariantDecimal16::try_new(12300, 0).unwrap())); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 456 * 10^2 = 45600 with scale 0 + assert_eq!(variant_array.value(2), Variant::from(VariantDecimal16::try_new(45600, 0).unwrap())); + } + + #[test] + fn test_decimal256_overflow_row_builder() { + use arrow::array::Decimal256Array; + use arrow::datatypes::i256; + + // Test Decimal256Array with a value that overflows i128 + let large_value = i256::from_i128(i128::MAX) + i256::from(1); // Overflows i128 + let decimal_array = Decimal256Array::from(vec![Some(large_value), Some(i256::from(123))]) + .with_precision_and_scale(76, 3).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder( + decimal_array.data_type(), + &decimal_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(2); + + for i in 0..decimal_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 2); + + // Row 0: overflow value becomes Variant::Null + assert_eq!(variant_array.value(0), Variant::Null); + + // Row 1: normal value converts successfully + assert_eq!(variant_array.value(1), Variant::from(VariantDecimal16::try_new(123, 3).unwrap())); + } } From 24a1d449cf9c29ae293ec01d46fd7b9d45f9aa9f Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 04:37:03 -0700 Subject: [PATCH 21/53] manual fixup --- parquet-variant-compute/src/cast_to_variant.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 37c824faf436..84a3ba497b4f 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -3875,7 +3875,7 @@ mod row_builder_tests { // Test Decimal32Array with scale 2 
(e.g., for currency: 12.34) let decimal_array = Decimal32Array::from(vec![Some(1234), None, Some(-5678)]) - .with_precision_and_scale(10, 2).unwrap(); + .with_precision_and_scale(9, 2).unwrap(); let mut row_builder = make_arrow_to_variant_row_builder( decimal_array.data_type(), From 9db05f1be68cbcbad9d12d9bef20d20a097653ab Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 04:47:01 -0700 Subject: [PATCH 22/53] checkpoint - binary and string --- .../src/cast_to_variant.rs | 330 ++++++++++++++++++ 1 file changed, 330 insertions(+) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 84a3ba497b4f..20d3f3fd994f 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -69,6 +69,11 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { Decimal256(Decimal256ArrowToVariantBuilder<'a>), Boolean(BooleanArrowToVariantBuilder<'a>), String(StringArrowToVariantBuilder<'a>), + Binary(BinaryArrowToVariantBuilder<'a>), + LargeBinary(LargeBinaryArrowToVariantBuilder<'a>), + BinaryView(BinaryViewArrowToVariantBuilder<'a>), + FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder<'a>), + Utf8View(Utf8ViewArrowToVariantBuilder<'a>), Struct(StructArrowToVariantBuilder<'a>), Null(NullArrowToVariantBuilder), RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder<'a, Int16Type>), @@ -101,6 +106,11 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::Decimal256(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::String(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Binary(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::LargeBinary(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::BinaryView(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::FixedSizeBinary(b) => b.append_row(index, builder), + 
ArrowToVariantRowBuilder::Utf8View(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Struct(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Null(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::RunEndEncodedInt16(b) => b.append_row(index, builder), @@ -618,6 +628,121 @@ impl<'a> Decimal256ArrowToVariantBuilder<'a> { } } +/// Binary builder for Arrow BinaryArray +pub(crate) struct BinaryArrowToVariantBuilder<'a> { + array: &'a arrow::array::BinaryArray, +} + +impl<'a> BinaryArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_binary::(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let bytes = self.array.value(index); + builder.append_value(Variant::from(bytes)); + } + Ok(()) + } +} + +/// LargeBinary builder for Arrow LargeBinaryArray +pub(crate) struct LargeBinaryArrowToVariantBuilder<'a> { + array: &'a arrow::array::LargeBinaryArray, +} + +impl<'a> LargeBinaryArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_binary::(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let bytes = self.array.value(index); + builder.append_value(Variant::from(bytes)); + } + Ok(()) + } +} + +/// BinaryView builder for Arrow BinaryViewArray +pub(crate) struct BinaryViewArrowToVariantBuilder<'a> { + array: &'a arrow::array::BinaryViewArray, +} + +impl<'a> BinaryViewArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_byte_view(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let bytes = self.array.value(index); + 
builder.append_value(Variant::from(bytes)); + } + Ok(()) + } +} + +/// FixedSizeBinary builder for Arrow FixedSizeBinaryArray +pub(crate) struct FixedSizeBinaryArrowToVariantBuilder<'a> { + array: &'a arrow::array::FixedSizeBinaryArray, +} + +impl<'a> FixedSizeBinaryArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_fixed_size_binary(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let bytes = self.array.value(index); + builder.append_value(Variant::from(bytes)); + } + Ok(()) + } +} + +/// Utf8View builder for Arrow StringViewArray +pub(crate) struct Utf8ViewArrowToVariantBuilder<'a> { + array: &'a arrow::array::StringViewArray, +} + +impl<'a> Utf8ViewArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_string_view(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let string = self.array.value(index); + builder.append_value(Variant::from(string)); + } + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -649,6 +774,14 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array))), DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), + DataType::Utf8View => Ok(ArrowToVariantRowBuilder::Utf8View(Utf8ViewArrowToVariantBuilder::new(array))), + + // Binary types + DataType::Binary => Ok(ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array))), + 
DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary(LargeBinaryArrowToVariantBuilder::new(array))), + DataType::BinaryView => Ok(ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array))), + DataType::FixedSizeBinary(_) => Ok(ArrowToVariantRowBuilder::FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder::new(array))), + DataType::Struct(_) => Ok(ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?)), DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder)), @@ -3970,4 +4103,201 @@ mod row_builder_tests { // Row 1: normal value converts successfully assert_eq!(variant_array.value(1), Variant::from(VariantDecimal16::try_new(123, 3).unwrap())); } + + #[test] + fn test_binary_row_builder() { + use arrow::array::BinaryArray; + + // Test BinaryArray with various binary data + let binary_data = vec![ + Some(b"hello".as_slice()), + None, + Some(b"\x00\x01\x02\xFF".as_slice()), + Some(b"".as_slice()), // Empty binary + ]; + let binary_array = BinaryArray::from(binary_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + binary_array.data_type(), + &binary_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..binary_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: "hello" bytes + assert_eq!(variant_array.value(0), Variant::from(b"hello".to_vec())); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: binary with special bytes + assert_eq!(variant_array.value(2), Variant::from(vec![0x00, 0x01, 0x02, 0xFF])); + + // Row 3: empty binary + assert_eq!(variant_array.value(3), Variant::from(Vec::::new())); + } + + #[test] + fn test_large_binary_row_builder() { + use arrow::array::LargeBinaryArray; + + // Test LargeBinaryArray + 
let binary_data = vec![ + Some(b"large binary data".as_slice()), + None, + Some(b"another large chunk".as_slice()), + ]; + let large_binary_array = LargeBinaryArray::from(binary_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + large_binary_array.data_type(), + &large_binary_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..large_binary_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: large binary data + assert_eq!(variant_array.value(0), Variant::from(b"large binary data".to_vec())); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: another large chunk + assert_eq!(variant_array.value(2), Variant::from(b"another large chunk".to_vec())); + } + + #[test] + fn test_binary_view_row_builder() { + use arrow::array::BinaryViewArray; + + // Test BinaryViewArray + let binary_data = vec![ + Some(b"short".as_slice()), + None, + Some(b"this is a longer binary view that exceeds inline storage".as_slice()), + ]; + let binary_view_array = BinaryViewArray::from(binary_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + binary_view_array.data_type(), + &binary_view_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..binary_view_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: short binary + assert_eq!(variant_array.value(0), Variant::from(b"short".to_vec())); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: long binary view + assert_eq!(variant_array.value(2), Variant::from(b"this is a longer binary view that exceeds inline 
storage".to_vec())); + } + + #[test] + fn test_fixed_size_binary_row_builder() { + use arrow::array::FixedSizeBinaryArray; + + // Test FixedSizeBinaryArray with 4-byte values + let binary_data = vec![ + Some([0x01, 0x02, 0x03, 0x04]), + None, + Some([0xFF, 0xFE, 0xFD, 0xFC]), + ]; + let fixed_binary_array = FixedSizeBinaryArray::try_from_sparse_iter_with_size( + binary_data.into_iter().map(|x| x.map(|v| v.as_slice())), + 4, + ).unwrap(); + + let mut row_builder = make_arrow_to_variant_row_builder( + fixed_binary_array.data_type(), + &fixed_binary_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..fixed_binary_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: fixed size binary + assert_eq!(variant_array.value(0), Variant::from(vec![0x01, 0x02, 0x03, 0x04])); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: another fixed size binary + assert_eq!(variant_array.value(2), Variant::from(vec![0xFF, 0xFE, 0xFD, 0xFC])); + } + + #[test] + fn test_utf8_view_row_builder() { + use arrow::array::StringViewArray; + + // Test StringViewArray (Utf8View) + let string_data = vec![ + Some("short"), + None, + Some("this is a much longer string that will be stored out-of-line in the buffer"), + ]; + let string_view_array = StringViewArray::from(string_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + string_view_array.data_type(), + &string_view_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..string_view_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: short string + 
assert_eq!(variant_array.value(0), Variant::from("short")); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: long string view + assert_eq!(variant_array.value(2), Variant::from("this is a much longer string that will be stored out-of-line in the buffer")); + } } From abd937802fa0b4cae00f197d829caea85364184d Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 05:01:52 -0700 Subject: [PATCH 23/53] manual fixes --- .../src/cast_to_variant.rs | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 20d3f3fd994f..3490c3d910a0 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -4134,16 +4134,18 @@ mod row_builder_tests { assert_eq!(variant_array.len(), 4); // Row 0: "hello" bytes - assert_eq!(variant_array.value(0), Variant::from(b"hello".to_vec())); + assert_eq!(variant_array.value(0), Variant::from(b"hello".as_slice())); // Row 1: null assert!(variant_array.is_null(1)); // Row 2: binary with special bytes - assert_eq!(variant_array.value(2), Variant::from(vec![0x00, 0x01, 0x02, 0xFF])); + let bytes = [0x00, 0x01, 0x02, 0xFF]; + assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); // Row 3: empty binary - assert_eq!(variant_array.value(3), Variant::from(Vec::::new())); + let bytes = []; + assert_eq!(variant_array.value(3), Variant::from(bytes.as_slice())); } #[test] @@ -4175,13 +4177,13 @@ mod row_builder_tests { assert_eq!(variant_array.len(), 3); // Row 0: large binary data - assert_eq!(variant_array.value(0), Variant::from(b"large binary data".to_vec())); + assert_eq!(variant_array.value(0), Variant::from(b"large binary data".as_slice())); // Row 1: null assert!(variant_array.is_null(1)); // Row 2: another large chunk - assert_eq!(variant_array.value(2), Variant::from(b"another large chunk".to_vec())); + 
assert_eq!(variant_array.value(2), Variant::from(b"another large chunk".as_slice())); } #[test] @@ -4213,13 +4215,13 @@ mod row_builder_tests { assert_eq!(variant_array.len(), 3); // Row 0: short binary - assert_eq!(variant_array.value(0), Variant::from(b"short".to_vec())); + assert_eq!(variant_array.value(0), Variant::from(b"short".as_slice())); // Row 1: null assert!(variant_array.is_null(1)); // Row 2: long binary view - assert_eq!(variant_array.value(2), Variant::from(b"this is a longer binary view that exceeds inline storage".to_vec())); + assert_eq!(variant_array.value(2), Variant::from(b"this is a longer binary view that exceeds inline storage".as_slice())); } #[test] @@ -4233,7 +4235,7 @@ mod row_builder_tests { Some([0xFF, 0xFE, 0xFD, 0xFC]), ]; let fixed_binary_array = FixedSizeBinaryArray::try_from_sparse_iter_with_size( - binary_data.into_iter().map(|x| x.map(|v| v.as_slice())), + binary_data.into_iter(), 4, ).unwrap(); @@ -4254,13 +4256,15 @@ mod row_builder_tests { assert_eq!(variant_array.len(), 3); // Row 0: fixed size binary - assert_eq!(variant_array.value(0), Variant::from(vec![0x01, 0x02, 0x03, 0x04])); + let bytes = [0x01, 0x02, 0x03, 0x04]; + assert_eq!(variant_array.value(0), Variant::from(bytes.as_slice())); // Row 1: null assert!(variant_array.is_null(1)); // Row 2: another fixed size binary - assert_eq!(variant_array.value(2), Variant::from(vec![0xFF, 0xFE, 0xFD, 0xFC])); + let bytes = [0xFF, 0xFE, 0xFD, 0xFC]; + assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); } #[test] From 582dc7fee4cb83652cd9b5631e03628ea456b2e1 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 05:08:14 -0700 Subject: [PATCH 24/53] checkpoint - use offset size trait for string and binary buildres --- .../src/cast_to_variant.rs | 71 ++++++------------- 1 file changed, 21 insertions(+), 50 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 
3490c3d910a0..dd3bf8c05bf7 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -68,9 +68,10 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { Decimal128(Decimal128ArrowToVariantBuilder<'a>), Decimal256(Decimal256ArrowToVariantBuilder<'a>), Boolean(BooleanArrowToVariantBuilder<'a>), - String(StringArrowToVariantBuilder<'a>), - Binary(BinaryArrowToVariantBuilder<'a>), - LargeBinary(LargeBinaryArrowToVariantBuilder<'a>), + String(StringArrowToVariantBuilder<'a, i32>), + LargeString(StringArrowToVariantBuilder<'a, i64>), + Binary(BinaryArrowToVariantBuilder<'a, i32>), + LargeBinary(BinaryArrowToVariantBuilder<'a, i64>), BinaryView(BinaryViewArrowToVariantBuilder<'a>), FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder<'a>), Utf8View(Utf8ViewArrowToVariantBuilder<'a>), @@ -106,6 +107,7 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::Decimal256(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::String(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::LargeString(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Binary(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::LargeBinary(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::BinaryView(b) => b.append_row(index, builder), @@ -179,31 +181,23 @@ impl<'a> BooleanArrowToVariantBuilder<'a> { } } -/// String builder for StringArray (both Utf8 and LargeUtf8) -pub(crate) struct StringArrowToVariantBuilder<'a> { - array: &'a dyn Array, +/// Generic String builder for StringArray (Utf8 and LargeUtf8) +pub(crate) struct StringArrowToVariantBuilder<'a, O: OffsetSizeTrait> { + array: &'a arrow::array::GenericStringArray, } -impl<'a> StringArrowToVariantBuilder<'a> { +impl<'a, O: OffsetSizeTrait> StringArrowToVariantBuilder<'a, O> { fn new(array: &'a dyn Array) -> Self { - Self { array } + Self { + array: 
array.as_string::(), + } } - fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { if self.array.is_null(index) { builder.append_null(); } else { - let value = match self.array.data_type() { - DataType::Utf8 => { - let string_array = self.array.as_string::(); - string_array.value(index) - } - DataType::LargeUtf8 => { - let string_array = self.array.as_string::(); - string_array.value(index) - } - _ => return Err(ArrowError::CastError("Expected string array".to_string())), - }; + let value = self.array.value(index); builder.append_value(value); } Ok(()) @@ -628,38 +622,15 @@ impl<'a> Decimal256ArrowToVariantBuilder<'a> { } } -/// Binary builder for Arrow BinaryArray -pub(crate) struct BinaryArrowToVariantBuilder<'a> { - array: &'a arrow::array::BinaryArray, -} - -impl<'a> BinaryArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_binary::(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let bytes = self.array.value(index); - builder.append_value(Variant::from(bytes)); - } - Ok(()) - } -} - -/// LargeBinary builder for Arrow LargeBinaryArray -pub(crate) struct LargeBinaryArrowToVariantBuilder<'a> { - array: &'a arrow::array::LargeBinaryArray, +/// Generic Binary builder for Arrow BinaryArray and LargeBinaryArray +pub(crate) struct BinaryArrowToVariantBuilder<'a, O: OffsetSizeTrait> { + array: &'a arrow::array::GenericBinaryArray, } -impl<'a> LargeBinaryArrowToVariantBuilder<'a> { +impl<'a, O: OffsetSizeTrait> BinaryArrowToVariantBuilder<'a, O> { fn new(array: &'a dyn Array) -> Self { Self { - array: array.as_binary::(), + array: array.as_binary::(), } } @@ -773,12 +744,12 @@ fn make_arrow_to_variant_row_builder<'a>( // Special types 
DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array))), DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), - DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), + DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::LargeString(StringArrowToVariantBuilder::new(array))), DataType::Utf8View => Ok(ArrowToVariantRowBuilder::Utf8View(Utf8ViewArrowToVariantBuilder::new(array))), // Binary types DataType::Binary => Ok(ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array))), - DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary(LargeBinaryArrowToVariantBuilder::new(array))), + DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary(BinaryArrowToVariantBuilder::new(array))), DataType::BinaryView => Ok(ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array))), DataType::FixedSizeBinary(_) => Ok(ArrowToVariantRowBuilder::FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder::new(array))), From 1c3fcf3205b289a34ab3f99c930aa4aa7608aca4 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 05:35:40 -0700 Subject: [PATCH 25/53] checkpoint - timstamps --- .../src/cast_to_variant.rs | 242 +++++++++++++++++- 1 file changed, 238 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index dd3bf8c05bf7..7b160673f23b 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -30,14 +30,15 @@ use arrow::array::{ use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::kernels::cast; use arrow::datatypes::{ - i256, ArrowNativeType, ArrowPrimitiveType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, + i256, ArrowNativeType, ArrowPrimitiveType, ArrowTimestampType, BinaryType, BinaryViewType, Date32Type, 
Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType, - Time32SecondType, Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, + Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; use arrow::temporal_conversions::{ - timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, + as_datetime, timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_us_to_datetime, }; use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit, UnionFields}; @@ -85,6 +86,10 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { LargeList(ListArrowToVariantBuilder<'a, i64>), Map(MapArrowToVariantBuilder<'a>), Union(UnionArrowToVariantBuilder<'a>), + TimestampSecond(TimestampArrowToVariantBuilder<'a, TimestampSecondType>), + TimestampMillisecond(TimestampArrowToVariantBuilder<'a, TimestampMillisecondType>), + TimestampMicrosecond(TimestampArrowToVariantBuilder<'a, TimestampMicrosecondType>), + TimestampNanosecond(TimestampArrowToVariantBuilder<'a, TimestampNanosecondType>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -123,6 +128,10 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::LargeList(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Map(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::Union(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::TimestampSecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::TimestampMillisecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::TimestampMicrosecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::TimestampNanosecond(b) => b.append_row(index, builder), } } } @@ 
-714,6 +723,46 @@ impl<'a> Utf8ViewArrowToVariantBuilder<'a> { } } +/// Generic Timestamp builder for Arrow timestamp arrays +pub(crate) struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> { + array: &'a arrow::array::PrimitiveArray, + time_zone: Option>, +} + +impl<'a, T: ArrowTimestampType> TimestampArrowToVariantBuilder<'a, T> { + fn new(array: &'a dyn Array, time_zone: Option>) -> Self { + Self { + array: array.as_primitive::(), + time_zone, + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let timestamp_value = self.array.value(index); + + // Convert using Arrow's temporal conversion functions + if let Some(naive_datetime) = as_datetime::(timestamp_value) { + let variant = if self.time_zone.is_none() { + // No timezone -> NaiveDateTime -> TimestampNtzMicros/TimestampNtzNanos + Variant::from(naive_datetime) // Uses From for Variant + } else { + // Has timezone -> DateTime -> TimestampMicros/TimestampNanos + let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); + Variant::from(utc_dt) // Uses From> for Variant + }; + builder.append_value(variant); + } else { + // Conversion failed -> append null + builder.append_null(); + } + } + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -781,7 +830,25 @@ fn make_arrow_to_variant_row_builder<'a>( // Union types DataType::Union(_, _) => Ok(ArrowToVariantRowBuilder::Union(UnionArrowToVariantBuilder::new(array)?)), - // TODO: Add other types (Binary, Date, Time, Decimal, etc.) 
+ // Timestamp types + DataType::Timestamp(time_unit, time_zone) => { + match time_unit { + TimeUnit::Second => Ok(ArrowToVariantRowBuilder::TimestampSecond( + TimestampArrowToVariantBuilder::new(array, time_zone.clone()) + )), + TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::TimestampMillisecond( + TimestampArrowToVariantBuilder::new(array, time_zone.clone()) + )), + TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::TimestampMicrosecond( + TimestampArrowToVariantBuilder::new(array, time_zone.clone()) + )), + TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::TimestampNanosecond( + TimestampArrowToVariantBuilder::new(array, time_zone.clone()) + )), + } + } + + // TODO: Add other types (Date, Time, etc.) _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } } @@ -4275,4 +4342,171 @@ mod row_builder_tests { // Row 2: long string view assert_eq!(variant_array.value(2), Variant::from("this is a much longer string that will be stored out-of-line in the buffer")); } + + #[test] + fn test_timestamp_second_row_builder() { + use arrow::array::TimestampSecondArray; + use chrono::{DateTime, NaiveDateTime, Utc}; + + // Test TimestampSecondArray without timezone + let timestamp_data = vec![ + Some(1609459200), // 2021-01-01 00:00:00 UTC + None, + Some(1640995200), // 2022-01-01 00:00:00 UTC + ]; + let timestamp_array = TimestampSecondArray::from(timestamp_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + timestamp_array.data_type(), + ×tamp_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..timestamp_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: 2021-01-01 00:00:00 (no timezone -> NaiveDateTime -> TimestampNtzMicros) + let expected_naive = 
NaiveDateTime::from_timestamp_opt(1609459200, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_naive)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 2022-01-01 00:00:00 + let expected_naive2 = NaiveDateTime::from_timestamp_opt(1640995200, 0).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_naive2)); + } + + #[test] + fn test_timestamp_with_timezone_row_builder() { + use arrow::array::TimestampMicrosecondArray; + use arrow_schema::DataType; + use chrono::{DateTime, Utc}; + + // Test TimestampMicrosecondArray with timezone + let timestamp_data = vec![ + Some(1609459200000000), // 2021-01-01 00:00:00 UTC (in microseconds) + None, + Some(1640995200000000), // 2022-01-01 00:00:00 UTC (in microseconds) + ]; + let timezone = Some("UTC".into()); + let timestamp_array = TimestampMicrosecondArray::from(timestamp_data) + .with_timezone(timezone.clone()); + + let mut row_builder = make_arrow_to_variant_row_builder( + timestamp_array.data_type(), + ×tamp_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..timestamp_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: 2021-01-01 00:00:00 UTC (with timezone -> DateTime -> TimestampMicros) + let expected_utc = DateTime::from_timestamp(1609459200, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_utc)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 2022-01-01 00:00:00 UTC + let expected_utc2 = DateTime::from_timestamp(1640995200, 0).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_utc2)); + } + + #[test] + fn test_timestamp_nanosecond_precision_row_builder() { + use arrow::array::TimestampNanosecondArray; + use chrono::NaiveDateTime; + + // Test TimestampNanosecondArray 
with nanosecond precision + let timestamp_data = vec![ + Some(1609459200123456789), // 2021-01-01 00:00:00.123456789 UTC + None, + Some(1609459200000000000), // 2021-01-01 00:00:00.000000000 UTC (no fractional seconds) + ]; + let timestamp_array = TimestampNanosecondArray::from(timestamp_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + timestamp_array.data_type(), + ×tamp_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..timestamp_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: with nanoseconds -> should use TimestampNtzNanos + let expected_with_nanos = NaiveDateTime::from_timestamp_opt(1609459200, 123456789).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_with_nanos)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: no fractional seconds -> should use TimestampNtzMicros + let expected_no_nanos = NaiveDateTime::from_timestamp_opt(1609459200, 0).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_no_nanos)); + } + + #[test] + fn test_timestamp_millisecond_row_builder() { + use arrow::array::TimestampMillisecondArray; + use chrono::NaiveDateTime; + + // Test TimestampMillisecondArray + let timestamp_data = vec![ + Some(1609459200123), // 2021-01-01 00:00:00.123 UTC + None, + Some(1609459200000), // 2021-01-01 00:00:00.000 UTC + ]; + let timestamp_array = TimestampMillisecondArray::from(timestamp_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + timestamp_array.data_type(), + ×tamp_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..timestamp_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let 
variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: with milliseconds -> TimestampNtzMicros (123ms = 123000000ns) + let expected_with_millis = NaiveDateTime::from_timestamp_opt(1609459200, 123000000).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_with_millis)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: no fractional seconds -> TimestampNtzMicros + let expected_no_millis = NaiveDateTime::from_timestamp_opt(1609459200, 0).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_no_millis)); + } } From dc2d2aecffeea2596537a698996ef733dd4d83d9 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 07:40:24 -0700 Subject: [PATCH 26/53] manual fixes --- parquet-variant-compute/src/cast_to_variant.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 7b160673f23b..dbebe33e0897 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -4346,7 +4346,7 @@ mod row_builder_tests { #[test] fn test_timestamp_second_row_builder() { use arrow::array::TimestampSecondArray; - use chrono::{DateTime, NaiveDateTime, Utc}; + use chrono::{NaiveDateTime}; // Test TimestampSecondArray without timezone let timestamp_data = vec![ @@ -4387,8 +4387,7 @@ mod row_builder_tests { #[test] fn test_timestamp_with_timezone_row_builder() { use arrow::array::TimestampMicrosecondArray; - use arrow_schema::DataType; - use chrono::{DateTime, Utc}; + use chrono::{DateTime}; // Test TimestampMicrosecondArray with timezone let timestamp_data = vec![ @@ -4396,7 +4395,7 @@ mod row_builder_tests { None, Some(1640995200000000), // 2022-01-01 00:00:00 UTC (in microseconds) ]; - let timezone = Some("UTC".into()); + let timezone = "UTC".to_string(); let timestamp_array = TimestampMicrosecondArray::from(timestamp_data) 
.with_timezone(timezone.clone()); From 493de5dd26af6591ebacd94f448392f101364e5f Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 07:44:34 -0700 Subject: [PATCH 27/53] checkpoint - avoid deprecated timestamp functions --- parquet-variant-compute/src/cast_to_variant.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index dbebe33e0897..f229a100963e 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -4346,7 +4346,6 @@ mod row_builder_tests { #[test] fn test_timestamp_second_row_builder() { use arrow::array::TimestampSecondArray; - use chrono::{NaiveDateTime}; // Test TimestampSecondArray without timezone let timestamp_data = vec![ @@ -4373,14 +4372,14 @@ mod row_builder_tests { assert_eq!(variant_array.len(), 3); // Row 0: 2021-01-01 00:00:00 (no timezone -> NaiveDateTime -> TimestampNtzMicros) - let expected_naive = NaiveDateTime::from_timestamp_opt(1609459200, 0).unwrap(); + let expected_naive = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(0), Variant::from(expected_naive)); // Row 1: null assert!(variant_array.is_null(1)); // Row 2: 2022-01-01 00:00:00 - let expected_naive2 = NaiveDateTime::from_timestamp_opt(1640995200, 0).unwrap(); + let expected_naive2 = DateTime::from_timestamp(1640995200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(2), Variant::from(expected_naive2)); } @@ -4430,7 +4429,6 @@ mod row_builder_tests { #[test] fn test_timestamp_nanosecond_precision_row_builder() { use arrow::array::TimestampNanosecondArray; - use chrono::NaiveDateTime; // Test TimestampNanosecondArray with nanosecond precision let timestamp_data = vec![ @@ -4457,21 +4455,20 @@ mod row_builder_tests { assert_eq!(variant_array.len(), 3); // Row 0: with nanoseconds -> should use TimestampNtzNanos - let 
expected_with_nanos = NaiveDateTime::from_timestamp_opt(1609459200, 123456789).unwrap(); + let expected_with_nanos = DateTime::from_timestamp(1609459200, 123456789).unwrap().naive_utc(); assert_eq!(variant_array.value(0), Variant::from(expected_with_nanos)); // Row 1: null assert!(variant_array.is_null(1)); // Row 2: no fractional seconds -> should use TimestampNtzMicros - let expected_no_nanos = NaiveDateTime::from_timestamp_opt(1609459200, 0).unwrap(); + let expected_no_nanos = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(2), Variant::from(expected_no_nanos)); } #[test] fn test_timestamp_millisecond_row_builder() { use arrow::array::TimestampMillisecondArray; - use chrono::NaiveDateTime; // Test TimestampMillisecondArray let timestamp_data = vec![ @@ -4498,14 +4495,14 @@ mod row_builder_tests { assert_eq!(variant_array.len(), 3); // Row 0: with milliseconds -> TimestampNtzMicros (123ms = 123000000ns) - let expected_with_millis = NaiveDateTime::from_timestamp_opt(1609459200, 123000000).unwrap(); + let expected_with_millis = DateTime::from_timestamp(1609459200, 123000000).unwrap().naive_utc(); assert_eq!(variant_array.value(0), Variant::from(expected_with_millis)); // Row 1: null assert!(variant_array.is_null(1)); // Row 2: no fractional seconds -> TimestampNtzMicros - let expected_no_millis = NaiveDateTime::from_timestamp_opt(1609459200, 0).unwrap(); + let expected_no_millis = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(2), Variant::from(expected_no_millis)); } } From 9de68ebeb7f06e37727442236b736812d0aba8ec Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 07:54:10 -0700 Subject: [PATCH 28/53] only need to track whether a timestamp has a time zone --- .../src/cast_to_variant.rs | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs 
b/parquet-variant-compute/src/cast_to_variant.rs index f229a100963e..bd9cd9e43d83 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -726,14 +726,14 @@ impl<'a> Utf8ViewArrowToVariantBuilder<'a> { /// Generic Timestamp builder for Arrow timestamp arrays pub(crate) struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> { array: &'a arrow::array::PrimitiveArray<T>, - time_zone: Option<Arc<str>>, + has_time_zone: bool, } impl<'a, T: ArrowTimestampType> TimestampArrowToVariantBuilder<'a, T> { - fn new(array: &'a dyn Array, time_zone: Option<Arc<str>>) -> Self { + fn new(array: &'a dyn Array, has_time_zone: bool) -> Self { Self { array: array.as_primitive::<T>(), - time_zone, + has_time_zone, } } @@ -744,20 +744,18 @@ impl<'a, T: ArrowTimestampType> TimestampArrowToVariantBuilder<'a, T> { let timestamp_value = self.array.value(index); // Convert using Arrow's temporal conversion functions - if let Some(naive_datetime) = as_datetime::<T>(timestamp_value) { - let variant = if self.time_zone.is_none() { - // No timezone -> NaiveDateTime -> TimestampNtzMicros/TimestampNtzNanos - Variant::from(naive_datetime) // Uses From<NaiveDateTime> for Variant - } else { - // Has timezone -> DateTime<Utc> -> TimestampMicros/TimestampNanos - let utc_dt: DateTime<Utc> = Utc.from_utc_datetime(&naive_datetime); - Variant::from(utc_dt) // Uses From<DateTime<Utc>> for Variant - }; - builder.append_value(variant); + let Some(naive_datetime) = as_datetime::<T>(timestamp_value) else { + return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); + }; + let variant = if self.has_time_zone { + // Has timezone -> DateTime<Utc> -> TimestampMicros/TimestampNanos + let utc_dt: DateTime<Utc> = Utc.from_utc_datetime(&naive_datetime); + Variant::from(utc_dt) // Uses From<DateTime<Utc>> for Variant } else { - // Conversion failed -> append null - builder.append_null(); - } + // No timezone -> NaiveDateTime -> TimestampNtzMicros/TimestampNtzNanos +
Variant::from(naive_datetime) // Uses From for Variant + }; + builder.append_value(variant); } Ok(()) } @@ -834,16 +832,16 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Timestamp(time_unit, time_zone) => { match time_unit { TimeUnit::Second => Ok(ArrowToVariantRowBuilder::TimestampSecond( - TimestampArrowToVariantBuilder::new(array, time_zone.clone()) + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::TimestampMillisecond( - TimestampArrowToVariantBuilder::new(array, time_zone.clone()) + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::TimestampMicrosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.clone()) + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::TimestampNanosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.clone()) + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) )), } } From 805abad6cc71e5d52747693a8f5e2a4a756e19bf Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 08:41:32 -0700 Subject: [PATCH 29/53] checkpoint - date and time --- .../src/cast_to_variant.rs | 513 +++++++++++++++++- 1 file changed, 511 insertions(+), 2 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index bd9cd9e43d83..765d9d45ddef 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -90,6 +90,12 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { TimestampMillisecond(TimestampArrowToVariantBuilder<'a, TimestampMillisecondType>), TimestampMicrosecond(TimestampArrowToVariantBuilder<'a, TimestampMicrosecondType>), TimestampNanosecond(TimestampArrowToVariantBuilder<'a, TimestampNanosecondType>), + Date32(Date32ArrowToVariantBuilder<'a>), + 
Date64(Date64ArrowToVariantBuilder<'a>), + Time32Second(Time32SecondArrowToVariantBuilder<'a>), + Time32Millisecond(Time32MillisecondArrowToVariantBuilder<'a>), + Time64Microsecond(Time64MicrosecondArrowToVariantBuilder<'a>), + Time64Nanosecond(Time64NanosecondArrowToVariantBuilder<'a>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -132,6 +138,12 @@ impl<'a> ArrowToVariantRowBuilder<'a> { ArrowToVariantRowBuilder::TimestampMillisecond(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::TimestampMicrosecond(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::TimestampNanosecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Date32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Date64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Time32Second(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Time32Millisecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Time64Microsecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Time64Nanosecond(b) => b.append_row(index, builder), } } } @@ -761,6 +773,196 @@ impl<'a, T: ArrowTimestampType> TimestampArrowToVariantBuilder<'a, T> { } } +/// Date32 builder for Arrow Date32 arrays +pub(crate) struct Date32ArrowToVariantBuilder<'a> { + array: &'a arrow::array::Date32Array, +} + +impl<'a> Date32ArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_primitive::(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let date_value = self.array.value(index); + + // Use Date32Type's specific conversion method + let naive_date = Date32Type::to_naive_date(date_value); + builder.append_value(Variant::from(naive_date)); + } + Ok(()) + } +} + +/// Date64 builder for Arrow Date64 arrays +pub(crate) struct Date64ArrowToVariantBuilder<'a> { + array: &'a 
arrow::array::Date64Array, +} + +impl<'a> Date64ArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_primitive::(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let date_value = self.array.value(index); + + // Use Date64Type's specific conversion method + let Some(naive_date) = Date64Type::to_naive_date_opt(date_value) else { + return Err(ArrowError::CastError(format!( + "Failed to convert Arrow date value {} to chrono::NaiveDate for Date64 type", + date_value + ))); + }; + builder.append_value(Variant::from(naive_date)); + } + Ok(()) + } +} + +/// Time32Second builder for Arrow Time32(Second) arrays +pub(crate) struct Time32SecondArrowToVariantBuilder<'a> { + array: &'a arrow::array::Time32SecondArray, +} + +impl<'a> Time32SecondArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_primitive::(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let time_value = self.array.value(index); + + // Convert using NaiveTime::from_num_seconds_from_midnight_opt (nanoseconds are 0) + let Some(naive_time) = NaiveTime::from_num_seconds_from_midnight_opt(time_value as u32, 0u32) else { + return Err(ArrowError::CastError(format!( + "Failed to convert Arrow time value {} to chrono::NaiveTime for Time32(Second) type", + time_value + ))); + }; + builder.append_value(Variant::from(naive_time)); + } + Ok(()) + } +} + +/// Time32Millisecond builder for Arrow Time32(Millisecond) arrays +pub(crate) struct Time32MillisecondArrowToVariantBuilder<'a> { + array: &'a arrow::array::Time32MillisecondArray, +} + +impl<'a> Time32MillisecondArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: 
array.as_primitive::(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let time_value = self.array.value(index); + + // Convert milliseconds to seconds and nanoseconds + let Some(naive_time) = NaiveTime::from_num_seconds_from_midnight_opt( + time_value as u32 / 1000, + (time_value as u32 % 1000) * 1_000_000 + ) else { + return Err(ArrowError::CastError(format!( + "Failed to convert Arrow time value {} to chrono::NaiveTime for Time32(Millisecond) type", + time_value + ))); + }; + builder.append_value(Variant::from(naive_time)); + } + Ok(()) + } +} + +/// Time64Microsecond builder for Arrow Time64(Microsecond) arrays +pub(crate) struct Time64MicrosecondArrowToVariantBuilder<'a> { + array: &'a arrow::array::Time64MicrosecondArray, +} + +impl<'a> Time64MicrosecondArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: array.as_primitive::(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let time_value = self.array.value(index); + + // Convert microseconds to seconds and nanoseconds + let Some(naive_time) = NaiveTime::from_num_seconds_from_midnight_opt( + (time_value / 1_000_000) as u32, + (time_value % 1_000_000 * 1_000) as u32 + ) else { + return Err(ArrowError::CastError(format!( + "Failed to convert Arrow time value {} to chrono::NaiveTime for Time64(Microsecond) type", + time_value + ))); + }; + builder.append_value(Variant::from(naive_time)); + } + Ok(()) + } +} + +/// Time64Nanosecond builder for Arrow Time64(Nanosecond) arrays +pub(crate) struct Time64NanosecondArrowToVariantBuilder<'a> { + array: &'a arrow::array::Time64NanosecondArray, +} + +impl<'a> Time64NanosecondArrowToVariantBuilder<'a> { + fn new(array: &'a dyn Array) -> Self { + Self { + array: 
array.as_primitive::(), + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let time_value = self.array.value(index); + + // Convert nanoseconds to seconds and nanoseconds + let Some(naive_time) = NaiveTime::from_num_seconds_from_midnight_opt( + (time_value / 1_000_000_000) as u32, + (time_value % 1_000_000_000) as u32 + ) else { + return Err(ArrowError::CastError(format!( + "Failed to convert Arrow time value {} to chrono::NaiveTime for Time64(Nanosecond) type", + time_value + ))); + }; + builder.append_value(Variant::from(naive_time)); + } + Ok(()) + } +} + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, @@ -846,7 +1048,38 @@ fn make_arrow_to_variant_row_builder<'a>( } } - // TODO: Add other types (Date, Time, etc.) + // Date types + DataType::Date32 => Ok(ArrowToVariantRowBuilder::Date32( + Date32ArrowToVariantBuilder::new(array) + )), + DataType::Date64 => Ok(ArrowToVariantRowBuilder::Date64( + Date64ArrowToVariantBuilder::new(array) + )), + + // Time types + DataType::Time32(time_unit) => { + match time_unit { + TimeUnit::Second => Ok(ArrowToVariantRowBuilder::Time32Second( + Time32SecondArrowToVariantBuilder::new(array) + )), + TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::Time32Millisecond( + Time32MillisecondArrowToVariantBuilder::new(array) + )), + _ => Err(ArrowError::CastError(format!("Unsupported Time32 unit: {time_unit:?}"))), + } + } + DataType::Time64(time_unit) => { + match time_unit { + TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::Time64Microsecond( + Time64MicrosecondArrowToVariantBuilder::new(array) + )), + TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::Time64Nanosecond( + Time64NanosecondArrowToVariantBuilder::new(array) + )), + _ => Err(ArrowError::CastError(format!("Unsupported Time64 unit: 
{time_unit:?}"))), + } + } + _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } } @@ -1301,7 +1534,7 @@ fn convert_map( builder.append_null(); continue; } - + let start = offsets[i].as_usize(); let end = offsets[i + 1].as_usize(); @@ -4503,4 +4736,280 @@ mod row_builder_tests { let expected_no_millis = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(2), Variant::from(expected_no_millis)); } + + #[test] + fn test_date32_row_builder() { + use arrow::array::Date32Array; + use chrono::NaiveDate; + + // Test Date32Array with various dates + let date_data = vec![ + Some(0), // 1970-01-01 + None, + Some(19723), // 2024-01-01 (days since epoch) + Some(-719162), // 0001-01-01 (near minimum) + ]; + let date_array = Date32Array::from(date_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + date_array.data_type(), + &date_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..date_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 1970-01-01 (epoch) + let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 2024-01-01 + let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_2024)); + + // Row 3: 0001-01-01 (near minimum date) + let expected_min = NaiveDate::from_ymd_opt(1, 1, 1).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_min)); + } + + #[test] + fn test_date64_row_builder() { + use arrow::array::Date64Array; + use chrono::NaiveDate; + + // Test Date64Array with various dates (milliseconds since epoch) + 
let date_data = vec![ + Some(0), // 1970-01-01 + None, + Some(1704067200000), // 2024-01-01 (milliseconds since epoch) + Some(86400000), // 1970-01-02 + ]; + let date_array = Date64Array::from(date_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + date_array.data_type(), + &date_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..date_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 1970-01-01 (epoch) + let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 2024-01-01 + let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_2024)); + + // Row 3: 1970-01-02 + let expected_next_day = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_next_day)); + } + + #[test] + fn test_time32_second_row_builder() { + use arrow::array::Time32SecondArray; + use chrono::NaiveTime; + + // Test Time32SecondArray with various times (seconds since midnight) + let time_data = vec![ + Some(0), // 00:00:00 + None, + Some(3661), // 01:01:01 + Some(86399), // 23:59:59 + ]; + let time_array = Time32SecondArray::from(time_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + time_array.data_type(), + &time_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..time_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 00:00:00 
(midnight) + let expected_midnight = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 01:01:01 + let expected_time = NaiveTime::from_hms_opt(1, 1, 1).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_time)); + + // Row 3: 23:59:59 (last second of day) + let expected_last = NaiveTime::from_hms_opt(23, 59, 59).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_last)); + } + + #[test] + fn test_time32_millisecond_row_builder() { + use arrow::array::Time32MillisecondArray; + use chrono::NaiveTime; + + // Test Time32MillisecondArray with various times (milliseconds since midnight) + let time_data = vec![ + Some(0), // 00:00:00.000 + None, + Some(3661123), // 01:01:01.123 + Some(86399999), // 23:59:59.999 + ]; + let time_array = Time32MillisecondArray::from(time_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + time_array.data_type(), + &time_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..time_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 00:00:00.000 (midnight) + let expected_midnight = NaiveTime::from_hms_milli_opt(0, 0, 0, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 01:01:01.123 + let expected_time = NaiveTime::from_hms_milli_opt(1, 1, 1, 123).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_time)); + + // Row 3: 23:59:59.999 (last millisecond of day) + let expected_last = NaiveTime::from_hms_milli_opt(23, 59, 59, 999).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_last)); + } + + 
#[test] + fn test_time64_microsecond_row_builder() { + use arrow::array::Time64MicrosecondArray; + use chrono::NaiveTime; + + // Test Time64MicrosecondArray with various times (microseconds since midnight) + let time_data = vec![ + Some(0), // 00:00:00.000000 + None, + Some(3661123456), // 01:01:01.123456 + Some(86399999999), // 23:59:59.999999 + ]; + let time_array = Time64MicrosecondArray::from(time_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + time_array.data_type(), + &time_array, + ).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..time_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 00:00:00.000000 (midnight) + let expected_midnight = NaiveTime::from_hms_micro_opt(0, 0, 0, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 01:01:01.123456 + let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_time)); + + // Row 3: 23:59:59.999999 (last microsecond of day) + let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_last)); + } + + #[test] + fn test_time64_nanosecond_row_builder() { + use arrow::array::Time64NanosecondArray; + use chrono::NaiveTime; + + // Test Time64NanosecondArray with various times (nanoseconds since midnight) + let time_data = vec![ + Some(0), // 00:00:00.000000000 + None, + Some(3661123456789), // 01:01:01.123456789 + Some(86399999999999), // 23:59:59.999999999 + ]; + let time_array = Time64NanosecondArray::from(time_data); + + let mut row_builder = make_arrow_to_variant_row_builder( + time_array.data_type(), + &time_array, + 
).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..time_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 00:00:00.000000000 (midnight) + let expected_midnight = NaiveTime::from_hms_nano_opt(0, 0, 0, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 01:01:01.123456789 -> truncated to 01:01:01.123456000 (microsecond precision) + let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_time)); + + // Row 3: 23:59:59.999999999 -> truncated to 23:59:59.999999000 (microsecond precision) + let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_last)); + } } From 29d445a19ca60b5641bc2093ff22d31cc3093a4f Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 08:52:26 -0700 Subject: [PATCH 30/53] checkpoint - template date and time builders --- .../src/cast_to_variant.rs | 208 ++++-------------- 1 file changed, 46 insertions(+), 162 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 765d9d45ddef..2b6dc93d36b8 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -30,7 +30,7 @@ use arrow::array::{ use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::kernels::cast; use arrow::datatypes::{ - i256, ArrowNativeType, ArrowPrimitiveType, ArrowTimestampType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, + i256, ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, BinaryType, 
BinaryViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, @@ -38,7 +38,7 @@ use arrow::datatypes::{ UInt64Type, UInt8Type, }; use arrow::temporal_conversions::{ - as_datetime, timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, + as_date, as_datetime, as_time, timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, timestamp_us_to_datetime, }; use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit, UnionFields}; @@ -90,12 +90,12 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { TimestampMillisecond(TimestampArrowToVariantBuilder<'a, TimestampMillisecondType>), TimestampMicrosecond(TimestampArrowToVariantBuilder<'a, TimestampMicrosecondType>), TimestampNanosecond(TimestampArrowToVariantBuilder<'a, TimestampNanosecondType>), - Date32(Date32ArrowToVariantBuilder<'a>), - Date64(Date64ArrowToVariantBuilder<'a>), - Time32Second(Time32SecondArrowToVariantBuilder<'a>), - Time32Millisecond(Time32MillisecondArrowToVariantBuilder<'a>), - Time64Microsecond(Time64MicrosecondArrowToVariantBuilder<'a>), - Time64Nanosecond(Time64NanosecondArrowToVariantBuilder<'a>), + Date32(DateArrowToVariantBuilder<'a, Date32Type>), + Date64(DateArrowToVariantBuilder<'a, Date64Type>), + Time32Second(TimeArrowToVariantBuilder<'a, Time32SecondType>), + Time32Millisecond(TimeArrowToVariantBuilder<'a, Time32MillisecondType>), + Time64Microsecond(TimeArrowToVariantBuilder<'a, Time64MicrosecondType>), + Time64Nanosecond(TimeArrowToVariantBuilder<'a, Time64NanosecondType>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -773,41 +773,21 @@ impl<'a, T: ArrowTimestampType> TimestampArrowToVariantBuilder<'a, T> { } } -/// Date32 builder for Arrow Date32 arrays -pub(crate) struct 
Date32ArrowToVariantBuilder<'a> { - array: &'a arrow::array::Date32Array, -} - -impl<'a> Date32ArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_primitive::(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let date_value = self.array.value(index); - - // Use Date32Type's specific conversion method - let naive_date = Date32Type::to_naive_date(date_value); - builder.append_value(Variant::from(naive_date)); - } - Ok(()) - } -} - -/// Date64 builder for Arrow Date64 arrays -pub(crate) struct Date64ArrowToVariantBuilder<'a> { - array: &'a arrow::array::Date64Array, +/// Generic Date builder for Arrow date arrays (Date32, Date64) +pub(crate) struct DateArrowToVariantBuilder<'a, T: ArrowTemporalType> +where + i64: From, +{ + array: &'a arrow::array::PrimitiveArray, } -impl<'a> Date64ArrowToVariantBuilder<'a> { +impl<'a, T: ArrowTemporalType> DateArrowToVariantBuilder<'a, T> +where + i64: From, +{ fn new(array: &'a dyn Array) -> Self { Self { - array: array.as_primitive::(), + array: array.as_primitive::(), } } @@ -815,13 +795,13 @@ impl<'a> Date64ArrowToVariantBuilder<'a> { if self.array.is_null(index) { builder.append_null(); } else { - let date_value = self.array.value(index); + let date_value = i64::from(self.array.value(index)); - // Use Date64Type's specific conversion method - let Some(naive_date) = Date64Type::to_naive_date_opt(date_value) else { + // Use Arrow's generic date conversion function + let Some(naive_date) = as_date::(date_value) else { return Err(ArrowError::CastError(format!( - "Failed to convert Arrow date value {} to chrono::NaiveDate for Date64 type", - date_value + "Failed to convert Arrow date value {} to chrono::NaiveDate for type {:?}", + date_value, T::DATA_TYPE ))); }; builder.append_value(Variant::from(naive_date)); @@ -830,114 +810,21 @@ impl<'a> 
Date64ArrowToVariantBuilder<'a> { } } -/// Time32Second builder for Arrow Time32(Second) arrays -pub(crate) struct Time32SecondArrowToVariantBuilder<'a> { - array: &'a arrow::array::Time32SecondArray, -} - -impl<'a> Time32SecondArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_primitive::(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let time_value = self.array.value(index); - - // Convert using NaiveTime::from_num_seconds_from_midnight_opt (nanoseconds are 0) - let Some(naive_time) = NaiveTime::from_num_seconds_from_midnight_opt(time_value as u32, 0u32) else { - return Err(ArrowError::CastError(format!( - "Failed to convert Arrow time value {} to chrono::NaiveTime for Time32(Second) type", - time_value - ))); - }; - builder.append_value(Variant::from(naive_time)); - } - Ok(()) - } -} - -/// Time32Millisecond builder for Arrow Time32(Millisecond) arrays -pub(crate) struct Time32MillisecondArrowToVariantBuilder<'a> { - array: &'a arrow::array::Time32MillisecondArray, -} - -impl<'a> Time32MillisecondArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_primitive::(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let time_value = self.array.value(index); - - // Convert milliseconds to seconds and nanoseconds - let Some(naive_time) = NaiveTime::from_num_seconds_from_midnight_opt( - time_value as u32 / 1000, - (time_value as u32 % 1000) * 1_000_000 - ) else { - return Err(ArrowError::CastError(format!( - "Failed to convert Arrow time value {} to chrono::NaiveTime for Time32(Millisecond) type", - time_value - ))); - }; - builder.append_value(Variant::from(naive_time)); - } - Ok(()) - } -} - -/// 
Time64Microsecond builder for Arrow Time64(Microsecond) arrays -pub(crate) struct Time64MicrosecondArrowToVariantBuilder<'a> { - array: &'a arrow::array::Time64MicrosecondArray, -} - -impl<'a> Time64MicrosecondArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_primitive::(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let time_value = self.array.value(index); - - // Convert microseconds to seconds and nanoseconds - let Some(naive_time) = NaiveTime::from_num_seconds_from_midnight_opt( - (time_value / 1_000_000) as u32, - (time_value % 1_000_000 * 1_000) as u32 - ) else { - return Err(ArrowError::CastError(format!( - "Failed to convert Arrow time value {} to chrono::NaiveTime for Time64(Microsecond) type", - time_value - ))); - }; - builder.append_value(Variant::from(naive_time)); - } - Ok(()) - } -} - -/// Time64Nanosecond builder for Arrow Time64(Nanosecond) arrays -pub(crate) struct Time64NanosecondArrowToVariantBuilder<'a> { - array: &'a arrow::array::Time64NanosecondArray, +/// Generic Time builder for Arrow time arrays (Time32, Time64) +pub(crate) struct TimeArrowToVariantBuilder<'a, T: ArrowTemporalType> +where + i64: From, +{ + array: &'a arrow::array::PrimitiveArray, } -impl<'a> Time64NanosecondArrowToVariantBuilder<'a> { +impl<'a, T: ArrowTemporalType> TimeArrowToVariantBuilder<'a, T> +where + i64: From, +{ fn new(array: &'a dyn Array) -> Self { Self { - array: array.as_primitive::(), + array: array.as_primitive::(), } } @@ -945,16 +832,13 @@ impl<'a> Time64NanosecondArrowToVariantBuilder<'a> { if self.array.is_null(index) { builder.append_null(); } else { - let time_value = self.array.value(index); + let time_value = i64::from(self.array.value(index)); - // Convert nanoseconds to seconds and nanoseconds - let Some(naive_time) = NaiveTime::from_num_seconds_from_midnight_opt( 
- (time_value / 1_000_000_000) as u32, - (time_value % 1_000_000_000) as u32 - ) else { + // Use Arrow's generic time conversion function + let Some(naive_time) = as_time::(time_value) else { return Err(ArrowError::CastError(format!( - "Failed to convert Arrow time value {} to chrono::NaiveTime for Time64(Nanosecond) type", - time_value + "Failed to convert Arrow time value {} to chrono::NaiveTime for type {:?}", + time_value, T::DATA_TYPE ))); }; builder.append_value(Variant::from(naive_time)); @@ -1050,20 +934,20 @@ fn make_arrow_to_variant_row_builder<'a>( // Date types DataType::Date32 => Ok(ArrowToVariantRowBuilder::Date32( - Date32ArrowToVariantBuilder::new(array) + DateArrowToVariantBuilder::::new(array) )), DataType::Date64 => Ok(ArrowToVariantRowBuilder::Date64( - Date64ArrowToVariantBuilder::new(array) + DateArrowToVariantBuilder::::new(array) )), // Time types DataType::Time32(time_unit) => { match time_unit { TimeUnit::Second => Ok(ArrowToVariantRowBuilder::Time32Second( - Time32SecondArrowToVariantBuilder::new(array) + TimeArrowToVariantBuilder::::new(array) )), TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::Time32Millisecond( - Time32MillisecondArrowToVariantBuilder::new(array) + TimeArrowToVariantBuilder::::new(array) )), _ => Err(ArrowError::CastError(format!("Unsupported Time32 unit: {time_unit:?}"))), } @@ -1071,10 +955,10 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Time64(time_unit) => { match time_unit { TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::Time64Microsecond( - Time64MicrosecondArrowToVariantBuilder::new(array) + TimeArrowToVariantBuilder::::new(array) )), TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::Time64Nanosecond( - Time64NanosecondArrowToVariantBuilder::new(array) + TimeArrowToVariantBuilder::::new(array) )), _ => Err(ArrowError::CastError(format!("Unsupported Time64 unit: {time_unit:?}"))), } From 7698de93c42f2a26919386a8128b7ee197177294 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 
2025 09:11:30 -0700 Subject: [PATCH 31/53] checkpoint - cast_to_variant uses row builders --- .../src/cast_to_variant.rs | 246 +----------------- 1 file changed, 14 insertions(+), 232 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 2b6dc93d36b8..9c2f71f0a071 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -998,238 +998,20 @@ fn make_arrow_to_variant_row_builder<'a>( /// will be truncated to /// `1970-01-01T00:00:01.234567Z` pub fn cast_to_variant(input: &dyn Array) -> Result { - let mut builder = VariantArrayBuilder::new(input.len()); - - let input_type = input.data_type(); - match input_type { - DataType::Null => { - for _ in 0..input.len() { - builder.append_null(); - } - } - DataType::Boolean => { - non_generic_conversion_array!(input.as_boolean(), |v| v, builder); - } - DataType::Int8 => { - primitive_conversion_array!(Int8Type, input, builder); - } - DataType::Int16 => { - primitive_conversion_array!(Int16Type, input, builder); - } - DataType::Int32 => { - primitive_conversion_array!(Int32Type, input, builder); - } - DataType::Int64 => { - primitive_conversion_array!(Int64Type, input, builder); - } - DataType::UInt8 => { - primitive_conversion_array!(UInt8Type, input, builder); - } - DataType::UInt16 => { - primitive_conversion_array!(UInt16Type, input, builder); - } - DataType::UInt32 => { - primitive_conversion_array!(UInt32Type, input, builder); - } - DataType::UInt64 => { - primitive_conversion_array!(UInt64Type, input, builder); - } - DataType::Float16 => { - generic_conversion_array!(Float16Type, as_primitive, f32::from, input, builder); - } - DataType::Float32 => { - primitive_conversion_array!(Float32Type, input, builder); - } - DataType::Float64 => { - primitive_conversion_array!(Float64Type, input, builder); - } - DataType::Decimal32(_, scale) => { - generic_conversion_array!( - Decimal32Type, - 
as_primitive, - |v| decimal_to_variant_decimal!(v, scale, i32, VariantDecimal4), - input, - builder - ); - } - DataType::Decimal64(_, scale) => { - generic_conversion_array!( - Decimal64Type, - as_primitive, - |v| decimal_to_variant_decimal!(v, scale, i64, VariantDecimal8), - input, - builder - ); - } - DataType::Decimal128(_, scale) => { - generic_conversion_array!( - Decimal128Type, - as_primitive, - |v| decimal_to_variant_decimal!(v, scale, i128, VariantDecimal16), - input, - builder - ); - } - DataType::Decimal256(_, scale) => { - generic_conversion_array!( - Decimal256Type, - as_primitive, - |v: i256| { - // Since `i128::MAX` is larger than the max value of `VariantDecimal16`, - // any `i256` value that cannot be cast to `i128` is unable to be cast to `VariantDecimal16` either. - // Therefore, we can safely convert `i256` to `i128` first and process it like `i128`. - if let Some(v) = v.to_i128() { - decimal_to_variant_decimal!(v, scale, i128, VariantDecimal16) - } else { - Variant::Null - } - }, - input, - builder - ); - } - DataType::Timestamp(time_unit, time_zone) => { - convert_timestamp(time_unit, time_zone, input, &mut builder); - } - DataType::Date32 => { - generic_conversion_array!( - Date32Type, - as_primitive, - |v: i32| -> NaiveDate { Date32Type::to_naive_date(v) }, - input, - builder - ); - } - DataType::Date64 => { - generic_conversion_array!( - Date64Type, - as_primitive, - |v: i64| { Date64Type::to_naive_date_opt(v).unwrap() }, - input, - builder - ); - } - DataType::Time32(unit) => { - match *unit { - TimeUnit::Second => { - generic_conversion_array!( - Time32SecondType, - as_primitive, - // nano second are always 0 - |v| NaiveTime::from_num_seconds_from_midnight_opt(v as u32, 0u32).unwrap(), - input, - builder - ); - } - TimeUnit::Millisecond => { - generic_conversion_array!( - Time32MillisecondType, - as_primitive, - |v| NaiveTime::from_num_seconds_from_midnight_opt( - v as u32 / 1000, - (v as u32 % 1000) * 1_000_000 - ) - .unwrap(), - input, 
- builder - ); - } - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported Time32 unit: {:?}", - unit - ))); - } - }; - } - DataType::Time64(unit) => { - match *unit { - TimeUnit::Microsecond => { - generic_conversion_array!( - Time64MicrosecondType, - as_primitive, - |v| NaiveTime::from_num_seconds_from_midnight_opt( - (v / 1_000_000) as u32, - (v % 1_000_000 * 1_000) as u32 - ) - .unwrap(), - input, - builder - ); - } - TimeUnit::Nanosecond => { - generic_conversion_array!( - Time64NanosecondType, - as_primitive, - |v| NaiveTime::from_num_seconds_from_midnight_opt( - (v / 1_000_000_000) as u32, - (v % 1_000_000_000) as u32 - ) - .unwrap(), - input, - builder - ); - } - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported Time64 unit: {:?}", - unit - ))); - } - }; - } - DataType::Duration(_) | DataType::Interval(_) => { - return Err(ArrowError::InvalidArgumentError( - "Casting duration/interval types to Variant is not supported. \ - The Variant format does not define duration/interval types." 
- .to_string(), - )); - } - DataType::Binary => { - generic_conversion_array!(BinaryType, as_bytes, |v| v, input, builder); - } - DataType::LargeBinary => { - generic_conversion_array!(LargeBinaryType, as_bytes, |v| v, input, builder); - } - DataType::BinaryView => { - generic_conversion_array!(BinaryViewType, as_byte_view, |v| v, input, builder); - } - DataType::FixedSizeBinary(_) => { - non_generic_conversion_array!(input.as_fixed_size_binary(), |v| v, builder); - } - DataType::Utf8 => { - generic_conversion_array!(i32, as_string, |v| v, input, builder); - } - DataType::LargeUtf8 => { - generic_conversion_array!(i64, as_string, |v| v, input, builder); - } - DataType::Utf8View => { - non_generic_conversion_array!(input.as_string_view(), |v| v, builder); - } - DataType::List(_) => convert_list::(input, &mut builder)?, - DataType::LargeList(_) => convert_list::(input, &mut builder)?, - DataType::Struct(_) => convert_struct(input, &mut builder)?, - DataType::Map(field, _) => convert_map(field, input, &mut builder)?, - DataType::Union(fields, _) => convert_union(fields, input, &mut builder)?, - DataType::Dictionary(_, _) => convert_dictionary_encoded(input, &mut builder)?, - DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { - DataType::Int16 => convert_run_end_encoded::(input, &mut builder)?, - DataType::Int32 => convert_run_end_encoded::(input, &mut builder)?, - DataType::Int64 => convert_run_end_encoded::(input, &mut builder)?, - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported run ends type: {:?}", - run_ends.data_type() - ))); - } - }, - dt => { - return Err(ArrowError::CastError(format!( - "Unsupported data type for casting to Variant: {dt:?}", - ))); - } - }; - Ok(builder.build()) + // Create row builder for the input array type + let mut row_builder = make_arrow_to_variant_row_builder(input.data_type(), input)?; + + // Create output array builder + let mut array_builder = VariantArrayBuilder::new(input.len()); + + // Process 
each row using the row builder + for i in 0..input.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder)?; + builder.finish(); + } + + Ok(array_builder.build()) } // TODO do we need a cast_with_options to allow specifying conversion behavior, From ca4ba9f813af46cf00a58a1e222c31288a19e92e Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 09:16:48 -0700 Subject: [PATCH 32/53] fix one unit test --- parquet-variant-compute/src/cast_to_variant.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 9c2f71f0a071..af91d262578c 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -964,6 +964,13 @@ fn make_arrow_to_variant_row_builder<'a>( } } + DataType::Duration(_) | DataType::Interval(_) => { + return Err(ArrowError::InvalidArgumentError( + "Casting duration/interval types to Variant is not supported. \ + The Variant format does not define duration/interval types." 
+ .to_string(), + )); + } _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } } From 658c263be223981f123693f33c3538b66935b2aa Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 09:27:53 -0700 Subject: [PATCH 33/53] delete dead code --- .../src/cast_to_variant.rs | 328 +----------------- .../src/type_conversion.rs | 6 + 2 files changed, 14 insertions(+), 320 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index af91d262578c..16ea0daa5851 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -19,15 +19,12 @@ use std::collections::HashMap; use std::sync::Arc; use crate::type_conversion::{ - decimal_to_variant_decimal, generic_conversion_array, non_generic_conversion_array, - primitive_conversion_array, + decimal_to_variant_decimal, }; use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ - Array, AsArray, OffsetSizeTrait, TimestampMicrosecondArray, TimestampMillisecondArray, - TimestampNanosecondArray, TimestampSecondArray, + Array, AsArray, OffsetSizeTrait, }; -use arrow::buffer::{OffsetBuffer, ScalarBuffer}; use arrow::compute::kernels::cast; use arrow::datatypes::{ i256, ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, @@ -38,11 +35,10 @@ use arrow::datatypes::{ UInt64Type, UInt8Type, }; use arrow::temporal_conversions::{ - as_date, as_datetime, as_time, timestamp_ms_to_datetime, timestamp_ns_to_datetime, timestamp_s_to_datetime, - timestamp_us_to_datetime, + as_date, as_datetime, as_time, }; -use arrow_schema::{ArrowError, DataType, FieldRef, TimeUnit, UnionFields}; -use chrono::{DateTime, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; +use arrow_schema::{ArrowError, DataType, TimeUnit}; +use chrono::{DateTime, NaiveDate, NaiveTime, TimeZone, Utc}; use 
parquet_variant::{ ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; @@ -965,11 +961,11 @@ fn make_arrow_to_variant_row_builder<'a>( } DataType::Duration(_) | DataType::Interval(_) => { - return Err(ArrowError::InvalidArgumentError( + Err(ArrowError::InvalidArgumentError( "Casting duration/interval types to Variant is not supported. \ The Variant format does not define duration/interval types." .to_string(), - )); + )) } _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), } @@ -1025,314 +1021,6 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { // e.g. how to handle overflows, whether to convert to Variant::Null or return // an error, etc. ? -/// Convert timestamp arrays to native datetimes -fn convert_timestamp( - time_unit: &TimeUnit, - time_zone: &Option>, - input: &dyn Array, - builder: &mut VariantArrayBuilder, -) { - let native_datetimes: Vec> = match time_unit { - arrow_schema::TimeUnit::Second => { - let ts_array = input - .as_any() - .downcast_ref::() - .expect("Array is not TimestampSecondArray"); - - ts_array - .iter() - .map(|x| x.map(|y| timestamp_s_to_datetime(y).unwrap())) - .collect() - } - arrow_schema::TimeUnit::Millisecond => { - let ts_array = input - .as_any() - .downcast_ref::() - .expect("Array is not TimestampMillisecondArray"); - - ts_array - .iter() - .map(|x| x.map(|y| timestamp_ms_to_datetime(y).unwrap())) - .collect() - } - arrow_schema::TimeUnit::Microsecond => { - let ts_array = input - .as_any() - .downcast_ref::() - .expect("Array is not TimestampMicrosecondArray"); - ts_array - .iter() - .map(|x| x.map(|y| timestamp_us_to_datetime(y).unwrap())) - .collect() - } - arrow_schema::TimeUnit::Nanosecond => { - let ts_array = input - .as_any() - .downcast_ref::() - .expect("Array is not TimestampNanosecondArray"); - ts_array - .iter() - .map(|x| x.map(|y| timestamp_ns_to_datetime(y).unwrap())) - .collect() - } - }; - - for x 
in native_datetimes { - match x { - Some(ndt) => { - if time_zone.is_none() { - builder.append_variant(ndt.into()); - } else { - let utc_dt: DateTime = Utc.from_utc_datetime(&ndt); - builder.append_variant(utc_dt.into()); - } - } - None => { - builder.append_null(); - } - } - } -} - -/// Generic function to convert list arrays (both List and LargeList) to variant arrays -fn convert_list( - input: &dyn Array, - builder: &mut VariantArrayBuilder, -) -> Result<(), ArrowError> { - let list_array = input.as_list::(); - let values = list_array.values(); - let offsets = list_array.offsets(); - - let first_offset = *offsets.first().expect("There should be an offset"); - let length = *offsets.last().expect("There should be an offset") - first_offset; - let sliced_values = values.slice(first_offset.as_usize(), length.as_usize()); - - let values_variant_array = cast_to_variant(sliced_values.as_ref())?; - let new_offsets = OffsetBuffer::new(ScalarBuffer::from_iter( - offsets.iter().map(|o| *o - first_offset), - )); - - for i in 0..list_array.len() { - if list_array.is_null(i) { - builder.append_null(); - continue; - } - - let start = new_offsets[i].as_usize(); - let end = new_offsets[i + 1].as_usize(); - - // Start building the inner VariantList - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - - // Add all values from the slice - for j in start..end { - list_builder.append_value(values_variant_array.value(j)); - } - - list_builder.finish(); - - let (metadata, value) = variant_builder.finish(); - let variant = Variant::new(&metadata, &value); - builder.append_variant(variant) - } - - Ok(()) -} - -fn convert_struct(input: &dyn Array, builder: &mut VariantArrayBuilder) -> Result<(), ArrowError> { - let struct_array = input.as_struct(); - - // Pre-convert all field arrays once for better performance - // This avoids converting the same field array multiple times - // Alternative approach: Use slicing per row: 
field_array.slice(i, 1) - // However, pre-conversion is more efficient for typical use cases - let field_variant_arrays: Result, _> = struct_array - .columns() - .iter() - .map(|field_array| cast_to_variant(field_array.as_ref())) - .collect(); - let field_variant_arrays = field_variant_arrays?; - - // Cache column names to avoid repeated calls - let column_names = struct_array.column_names(); - - for i in 0..struct_array.len() { - if struct_array.is_null(i) { - builder.append_null(); - continue; - } - - // Create a VariantBuilder for this struct instance - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - - // Iterate through all fields in the struct - for (field_idx, field_name) in column_names.iter().enumerate() { - // Use pre-converted field variant arrays for better performance - // Check nulls directly from the pre-converted arrays instead of accessing column again - if !field_variant_arrays[field_idx].is_null(i) { - let field_variant = field_variant_arrays[field_idx].value(i); - object_builder.insert(field_name, field_variant); - } - // Note: we skip null fields rather than inserting Variant::Null - // to match Arrow struct semantics where null fields are omitted - } - - object_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - builder.append_variant(variant); - } - - Ok(()) -} - -fn convert_map( - field: &FieldRef, - input: &dyn Array, - builder: &mut VariantArrayBuilder, -) -> Result<(), ArrowError> { - match field.data_type() { - DataType::Struct(_) => { - let map_array = input.as_map(); - let keys = cast(map_array.keys(), &DataType::Utf8)?; - let key_strings = keys.as_string::(); - let values = cast_to_variant(map_array.values())?; - let offsets = map_array.offsets(); - - for i in 0..map_array.len() { - // Check for NULL map first (FIXED: was checking offsets before) - if map_array.is_null(i) { - builder.append_null(); - 
continue; - } - - let start = offsets[i].as_usize(); - let end = offsets[i + 1].as_usize(); - - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - - // Add key-value pairs (empty range = empty object, FIXED) - for j in start..end { - let value = values.value(j); - object_builder.insert(key_strings.value(j), value); - } - - object_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - builder.append_variant(variant); - } - } - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported map field type for casting to Variant: {field:?}", - ))); - } - } - - Ok(()) -} - -fn convert_union( - fields: &UnionFields, - input: &dyn Array, - builder: &mut VariantArrayBuilder, -) -> Result<(), ArrowError> { - let union_array = input.as_union(); - - // Convert each child array to variant arrays - let mut child_variant_arrays = HashMap::new(); - for (type_id, _) in fields.iter() { - let child_array = union_array.child(type_id); - let child_variant_array = cast_to_variant(child_array.as_ref())?; - child_variant_arrays.insert(type_id, child_variant_array); - } - - // Process each element in the union array - for i in 0..union_array.len() { - let type_id = union_array.type_id(i); - let value_offset = union_array.value_offset(i); - - if let Some(child_variant_array) = child_variant_arrays.get(&type_id) { - if child_variant_array.is_null(value_offset) { - builder.append_null(); - } else { - let value = child_variant_array.value(value_offset); - builder.append_variant(value); - } - } else { - // This should not happen in a valid union, but handle gracefully - builder.append_null(); - } - } - - Ok(()) -} - -fn convert_dictionary_encoded( - input: &dyn Array, - builder: &mut VariantArrayBuilder, -) -> Result<(), ArrowError> { - let dict_array = input.as_any_dictionary(); - let values_variant_array = cast_to_variant(dict_array.values().as_ref())?; - let 
normalized_keys = dict_array.normalized_keys(); - let keys = dict_array.keys(); - - for (i, key_idx) in normalized_keys.iter().enumerate() { - if keys.is_null(i) { - builder.append_null(); - continue; - } - - if values_variant_array.is_null(*key_idx) { - builder.append_null(); - continue; - } - - let value = values_variant_array.value(*key_idx); - builder.append_variant(value); - } - - Ok(()) -} - -fn convert_run_end_encoded( - input: &dyn Array, - builder: &mut VariantArrayBuilder, -) -> Result<(), ArrowError> { - let run_array = input.as_run::(); - let values_variant_array = cast_to_variant(run_array.values().as_ref())?; - - // Process runs in batches for better performance - let run_ends = run_array.run_ends().values(); - let mut logical_start = 0; - - for (physical_idx, &run_end) in run_ends.iter().enumerate() { - let logical_end = run_end.as_usize(); - let run_length = logical_end - logical_start; - - if values_variant_array.is_null(physical_idx) { - // Append nulls for the entire run - for _ in 0..run_length { - builder.append_null(); - } - } else { - // Get the value once and append it for the entire run - let value = values_variant_array.value(physical_idx); - for _ in 0..run_length { - builder.append_variant(value.clone()); - } - } - - logical_start = logical_end; - } - - Ok(()) -} - #[cfg(test)] mod tests { use super::*; @@ -1346,7 +1034,7 @@ mod tests { LargeStringArray, ListArray, MapArray, NullArray, StringArray, StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, UnionArray, + UInt8Array, UnionArray, TimestampSecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampMicrosecondArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{IntervalDayTime, IntervalMonthDayNano}; diff --git a/parquet-variant-compute/src/type_conversion.rs 
b/parquet-variant-compute/src/type_conversion.rs index 647d2c705ff0..da25a638f1a6 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -20,6 +20,7 @@ /// Convert the input array to a `VariantArray` row by row, using `method` /// not requiring a generic type to downcast the generic array to a specific /// array type and `cast_fn` to transform each element to a type compatible with Variant +#[allow(unused)] macro_rules! non_generic_conversion_array { ($array:expr, $cast_fn:expr, $builder:expr) => {{ let array = $array; @@ -33,6 +34,7 @@ macro_rules! non_generic_conversion_array { } }}; } +#[allow(unused)] pub(crate) use non_generic_conversion_array; /// Convert the value at a specific index in the given array into a `Variant`. @@ -52,6 +54,7 @@ pub(crate) use non_generic_conversion_single_value; /// Convert the input array to a `VariantArray` row by row, using `method` /// requiring a generic type to downcast the generic array to a specific /// array type and `cast_fn` to transform each element to a type compatible with Variant +#[allow(unused)] macro_rules! generic_conversion_array { ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ $crate::type_conversion::non_generic_conversion_array!( @@ -61,6 +64,7 @@ macro_rules! generic_conversion_array { ) }}; } +#[allow(unused)] pub(crate) use generic_conversion_array; /// Convert the value at a specific index in the given array into a `Variant`, @@ -79,6 +83,7 @@ pub(crate) use generic_conversion_single_value; /// Convert the input array of a specific primitive type to a `VariantArray` /// row by row +#[allow(unused)] macro_rules! primitive_conversion_array { ($t:ty, $input:expr, $builder:expr) => {{ $crate::type_conversion::generic_conversion_array!( @@ -90,6 +95,7 @@ macro_rules! 
primitive_conversion_array { ) }}; } +#[allow(unused)] pub(crate) use primitive_conversion_array; /// Convert the value at a specific index in the given array into a `Variant`. From fe14de2ad378a3f0fd57e38b2f601b8beb487e7e Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 11:06:25 -0700 Subject: [PATCH 34/53] checkpoint - use macros to define row builders --- .../src/cast_to_variant.rs | 405 +++++++++++++++--- 1 file changed, 355 insertions(+), 50 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 16ea0daa5851..5c0fe3bdc64e 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -43,37 +43,342 @@ use parquet_variant::{ ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; +// ============================================================================ +// Macros for generating row-oriented builders +// ============================================================================ + +/// Base macro for generating row builders with optional state fields +macro_rules! impl_row_builder_base { + ($name:ident, $array_field_type:ty, $array_init:expr, $variant_expr:expr $(, $state_field:ident: $state_type:ty)*) => { + pub(crate) struct $name<'a> { + array: &'a $array_field_type, + $($state_field: $state_type,)* + } + + impl<'a> $name<'a> { + fn new(array: &'a dyn Array $(, $state_field: $state_type)*) -> Self { + Self { + array: $array_init, + $($state_field,)* + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let variant = $variant_expr; + builder.append_value(variant); + } + Ok(()) + } + } + }; +} + +/// Convenience macro for generic builders (with type parameter and method) +macro_rules! 
impl_generic_row_builder { + ($name:ident<$t:ty>, $method:ident, $cast_fn:expr, $array_type:ty $(, $state_field:ident: $state_type:ty)*) => { + impl_row_builder_base!( + $name, + $array_type, + array.$method::<$t>(), + $crate::type_conversion::generic_conversion_single_value!($t, $method, $cast_fn, self.array, index) + $(, $state_field: $state_type)* + ); + }; +} + +/// Convenience macro for primitive builders (uses as_primitive method) +macro_rules! impl_primitive_row_builder { + ($name:ident<$t:ty> $(, $state_field:ident: $state_type:ty)*) => { + impl_generic_row_builder!( + $name<$t>, + as_primitive, + |v| v, + arrow::array::PrimitiveArray<$t> + $(, $state_field: $state_type)* + ); + }; +} + +// ============================================================================ +// Macro-generated row builders +// ============================================================================ + +// Primitive builders (11 variants) - stateless +impl_primitive_row_builder!(PrimitiveInt8ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveInt16ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveInt32ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveInt64ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveUInt8ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveUInt16ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveUInt32ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveUInt64ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveFloat16ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveFloat32ArrowToVariantBuilder); +impl_primitive_row_builder!(PrimitiveFloat64ArrowToVariantBuilder); + +// Generic offset builders (4 variants) - stateless +impl_generic_row_builder!( + StringArrowToVariantBuilder, + as_string, + |v| v, + arrow::array::GenericStringArray +); +impl_generic_row_builder!( + LargeStringArrowToVariantBuilder, + as_string, + |v| v, + arrow::array::GenericStringArray +); 
+impl_generic_row_builder!( + BinaryArrowToVariantBuilder, + as_binary, + |v| v, + arrow::array::GenericBinaryArray +); +impl_generic_row_builder!( + LargeBinaryArrowToVariantBuilder, + as_binary, + |v| v, + arrow::array::GenericBinaryArray +); + +// Non-generic simple builders (4 variants) - stateless +impl_row_builder_base!( + BooleanArrowToVariantBuilder, + arrow::array::BooleanArray, + array.as_boolean(), + $crate::type_conversion::non_generic_conversion_single_value!(self.array, |v| v, index) +); +impl_row_builder_base!( + BinaryViewArrowToVariantBuilder, + arrow::array::BinaryViewArray, + array.as_byte_view(), + $crate::type_conversion::non_generic_conversion_single_value!(self.array, |v| v, index) +); +impl_row_builder_base!( + FixedSizeBinaryArrowToVariantBuilder, + arrow::array::FixedSizeBinaryArray, + array.as_fixed_size_binary(), + $crate::type_conversion::non_generic_conversion_single_value!(self.array, |v| v, index) +); +impl_row_builder_base!( + Utf8ViewArrowToVariantBuilder, + arrow::array::StringViewArray, + array.as_string_view(), + $crate::type_conversion::non_generic_conversion_single_value!(self.array, |v| v, index) +); + +// Null builder - special case (always appends null) +impl_row_builder_base!( + NullArrowToVariantBuilder, + (), + (), + Variant::Null +); + +// Decimal builders (4 variants) - stateful with scale +impl_row_builder_base!( + Decimal32ArrowToVariantBuilder, + arrow::array::PrimitiveArray, + array.as_primitive::(), + decimal_to_variant_decimal!(self.array.value(index), &self.scale, i32, VariantDecimal4), + scale: i8 +); +impl_row_builder_base!( + Decimal64ArrowToVariantBuilder, + arrow::array::PrimitiveArray, + array.as_primitive::(), + decimal_to_variant_decimal!(self.array.value(index), &self.scale, i64, VariantDecimal8), + scale: i8 +); +impl_row_builder_base!( + Decimal128ArrowToVariantBuilder, + arrow::array::PrimitiveArray, + array.as_primitive::(), + decimal_to_variant_decimal!(self.array.value(index), &self.scale, i128, 
VariantDecimal16), + scale: i8 +); +impl_row_builder_base!( + Decimal256ArrowToVariantBuilder, + arrow::array::PrimitiveArray, + array.as_primitive::(), + { + let value = self.array.value(index); + if let Some(v) = value.to_i128() { + decimal_to_variant_decimal!(v, &self.scale, i128, VariantDecimal16) + } else { + Variant::Null + } + }, + scale: i8 +); + +// Timestamp builders (4 variants) - stateful with timezone +impl_row_builder_base!( + TimestampSecondArrowToVariantBuilder, + arrow::array::PrimitiveArray, + array.as_primitive::(), + { + let timestamp_value = self.array.value(index); + let Some(naive_datetime) = as_datetime::(timestamp_value) else { + return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); + }; + if self.has_time_zone { + let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); + Variant::from(utc_dt) + } else { + Variant::from(naive_datetime) + } + }, + has_time_zone: bool +); +impl_row_builder_base!( + TimestampMillisecondArrowToVariantBuilder, + arrow::array::PrimitiveArray, + array.as_primitive::(), + { + let timestamp_value = self.array.value(index); + let Some(naive_datetime) = as_datetime::(timestamp_value) else { + return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); + }; + if self.has_time_zone { + let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); + Variant::from(utc_dt) + } else { + Variant::from(naive_datetime) + } + }, + has_time_zone: bool +); +impl_row_builder_base!( + TimestampMicrosecondArrowToVariantBuilder, + arrow::array::PrimitiveArray, + array.as_primitive::(), + { + let timestamp_value = self.array.value(index); + let Some(naive_datetime) = as_datetime::(timestamp_value) else { + return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); + }; + if self.has_time_zone { + let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); 
+ Variant::from(utc_dt) + } else { + Variant::from(naive_datetime) + } + }, + has_time_zone: bool +); +impl_row_builder_base!( + TimestampNanosecondArrowToVariantBuilder, + arrow::array::PrimitiveArray, + array.as_primitive::(), + { + let timestamp_value = self.array.value(index); + let Some(naive_datetime) = as_datetime::(timestamp_value) else { + return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); + }; + if self.has_time_zone { + let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); + Variant::from(utc_dt) + } else { + Variant::from(naive_datetime) + } + }, + has_time_zone: bool +); + +// Date builders (2 variants) - stateless temporal transform +impl_generic_row_builder!( + Date32ArrowToVariantBuilder, + as_primitive, + |value| { + let date_value = i64::from(value); + as_date::(date_value).map(Variant::from).unwrap_or(Variant::Null) + }, + arrow::array::PrimitiveArray +); +impl_generic_row_builder!( + Date64ArrowToVariantBuilder, + as_primitive, + |value| { + let date_value = i64::from(value); + as_date::(date_value).map(Variant::from).unwrap_or(Variant::Null) + }, + arrow::array::PrimitiveArray +); + +// Time builders (4 variants) - stateless temporal transform +impl_generic_row_builder!( + Time32SecondArrowToVariantBuilder, + as_primitive, + |value| { + let time_value = i64::from(value); + as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) + }, + arrow::array::PrimitiveArray +); +impl_generic_row_builder!( + Time32MillisecondArrowToVariantBuilder, + as_primitive, + |value| { + let time_value = i64::from(value); + as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) + }, + arrow::array::PrimitiveArray +); +impl_generic_row_builder!( + Time64MicrosecondArrowToVariantBuilder, + as_primitive, + |value| { + let time_value = i64::from(value); + as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) + }, + arrow::array::PrimitiveArray +); 
+impl_generic_row_builder!( + Time64NanosecondArrowToVariantBuilder, + as_primitive, + |value| { + let time_value = i64::from(value); + as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) + }, + arrow::array::PrimitiveArray +); + // ============================================================================ // Row-oriented builders for efficient Arrow-to-Variant conversion // ============================================================================ /// Row builder for converting Arrow arrays to VariantArray row by row pub(crate) enum ArrowToVariantRowBuilder<'a> { - PrimitiveInt8(PrimitiveArrowToVariantBuilder<'a, Int8Type>), - PrimitiveInt16(PrimitiveArrowToVariantBuilder<'a, Int16Type>), - PrimitiveInt32(PrimitiveArrowToVariantBuilder<'a, Int32Type>), - PrimitiveInt64(PrimitiveArrowToVariantBuilder<'a, Int64Type>), - PrimitiveUInt8(PrimitiveArrowToVariantBuilder<'a, UInt8Type>), - PrimitiveUInt16(PrimitiveArrowToVariantBuilder<'a, UInt16Type>), - PrimitiveUInt32(PrimitiveArrowToVariantBuilder<'a, UInt32Type>), - PrimitiveUInt64(PrimitiveArrowToVariantBuilder<'a, UInt64Type>), - PrimitiveFloat16(PrimitiveArrowToVariantBuilder<'a, Float16Type>), - PrimitiveFloat32(PrimitiveArrowToVariantBuilder<'a, Float32Type>), - PrimitiveFloat64(PrimitiveArrowToVariantBuilder<'a, Float64Type>), + PrimitiveInt8(PrimitiveInt8ArrowToVariantBuilder<'a>), + PrimitiveInt16(PrimitiveInt16ArrowToVariantBuilder<'a>), + PrimitiveInt32(PrimitiveInt32ArrowToVariantBuilder<'a>), + PrimitiveInt64(PrimitiveInt64ArrowToVariantBuilder<'a>), + PrimitiveUInt8(PrimitiveUInt8ArrowToVariantBuilder<'a>), + PrimitiveUInt16(PrimitiveUInt16ArrowToVariantBuilder<'a>), + PrimitiveUInt32(PrimitiveUInt32ArrowToVariantBuilder<'a>), + PrimitiveUInt64(PrimitiveUInt64ArrowToVariantBuilder<'a>), + PrimitiveFloat16(PrimitiveFloat16ArrowToVariantBuilder<'a>), + PrimitiveFloat32(PrimitiveFloat32ArrowToVariantBuilder<'a>), + PrimitiveFloat64(PrimitiveFloat64ArrowToVariantBuilder<'a>), 
Decimal32(Decimal32ArrowToVariantBuilder<'a>), Decimal64(Decimal64ArrowToVariantBuilder<'a>), Decimal128(Decimal128ArrowToVariantBuilder<'a>), Decimal256(Decimal256ArrowToVariantBuilder<'a>), Boolean(BooleanArrowToVariantBuilder<'a>), - String(StringArrowToVariantBuilder<'a, i32>), - LargeString(StringArrowToVariantBuilder<'a, i64>), - Binary(BinaryArrowToVariantBuilder<'a, i32>), - LargeBinary(BinaryArrowToVariantBuilder<'a, i64>), + String(StringArrowToVariantBuilder<'a>), + LargeString(LargeStringArrowToVariantBuilder<'a>), + Binary(BinaryArrowToVariantBuilder<'a>), + LargeBinary(LargeBinaryArrowToVariantBuilder<'a>), BinaryView(BinaryViewArrowToVariantBuilder<'a>), FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder<'a>), Utf8View(Utf8ViewArrowToVariantBuilder<'a>), Struct(StructArrowToVariantBuilder<'a>), - Null(NullArrowToVariantBuilder), + Null(NullArrowToVariantBuilder<'a>), RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder<'a, Int16Type>), RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder<'a, Int32Type>), RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder<'a, Int64Type>), @@ -82,16 +387,16 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { LargeList(ListArrowToVariantBuilder<'a, i64>), Map(MapArrowToVariantBuilder<'a>), Union(UnionArrowToVariantBuilder<'a>), - TimestampSecond(TimestampArrowToVariantBuilder<'a, TimestampSecondType>), - TimestampMillisecond(TimestampArrowToVariantBuilder<'a, TimestampMillisecondType>), - TimestampMicrosecond(TimestampArrowToVariantBuilder<'a, TimestampMicrosecondType>), - TimestampNanosecond(TimestampArrowToVariantBuilder<'a, TimestampNanosecondType>), - Date32(DateArrowToVariantBuilder<'a, Date32Type>), - Date64(DateArrowToVariantBuilder<'a, Date64Type>), - Time32Second(TimeArrowToVariantBuilder<'a, Time32SecondType>), - Time32Millisecond(TimeArrowToVariantBuilder<'a, Time32MillisecondType>), - Time64Microsecond(TimeArrowToVariantBuilder<'a, Time64MicrosecondType>), - 
Time64Nanosecond(TimeArrowToVariantBuilder<'a, Time64NanosecondType>), + TimestampSecond(TimestampSecondArrowToVariantBuilder<'a>), + TimestampMillisecond(TimestampMillisecondArrowToVariantBuilder<'a>), + TimestampMicrosecond(TimestampMicrosecondArrowToVariantBuilder<'a>), + TimestampNanosecond(TimestampNanosecondArrowToVariantBuilder<'a>), + Date32(Date32ArrowToVariantBuilder<'a>), + Date64(Date64ArrowToVariantBuilder<'a>), + Time32Second(Time32SecondArrowToVariantBuilder<'a>), + Time32Millisecond(Time32MillisecondArrowToVariantBuilder<'a>), + Time64Microsecond(Time64MicrosecondArrowToVariantBuilder<'a>), + Time64Nanosecond(Time64NanosecondArrowToVariantBuilder<'a>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -850,19 +1155,19 @@ fn make_arrow_to_variant_row_builder<'a>( ) -> Result, ArrowError> { match data_type { // All integer types - DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int32 => Ok(ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveInt8ArrowToVariantBuilder::new(array))), + DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveInt16ArrowToVariantBuilder::new(array))), + 
DataType::Int32 => Ok(ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveInt32ArrowToVariantBuilder::new(array))), + DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveInt64ArrowToVariantBuilder::new(array))), + DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveUInt8ArrowToVariantBuilder::new(array))), + DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveUInt16ArrowToVariantBuilder::new(array))), + DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveUInt32ArrowToVariantBuilder::new(array))), + DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveUInt64ArrowToVariantBuilder::new(array))), // Float types - DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveFloat16ArrowToVariantBuilder::new(array))), + DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveFloat32ArrowToVariantBuilder::new(array))), + DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveFloat64ArrowToVariantBuilder::new(array))), // Decimal types DataType::Decimal32(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale))), @@ -873,17 +1178,17 @@ fn make_arrow_to_variant_row_builder<'a>( // Special types DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array))), DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), - DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::LargeString(StringArrowToVariantBuilder::new(array))), + DataType::LargeUtf8 => 
Ok(ArrowToVariantRowBuilder::LargeString(LargeStringArrowToVariantBuilder::new(array))), DataType::Utf8View => Ok(ArrowToVariantRowBuilder::Utf8View(Utf8ViewArrowToVariantBuilder::new(array))), // Binary types DataType::Binary => Ok(ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array))), - DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary(BinaryArrowToVariantBuilder::new(array))), + DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary(LargeBinaryArrowToVariantBuilder::new(array))), DataType::BinaryView => Ok(ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array))), DataType::FixedSizeBinary(_) => Ok(ArrowToVariantRowBuilder::FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder::new(array))), DataType::Struct(_) => Ok(ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?)), - DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder)), + DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder::new(array))), // Run-end encoded types DataType::RunEndEncoded(run_ends, _) => { @@ -914,36 +1219,36 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Timestamp(time_unit, time_zone) => { match time_unit { TimeUnit::Second => Ok(ArrowToVariantRowBuilder::TimestampSecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) + TimestampSecondArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::TimestampMillisecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) + TimestampMillisecondArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::TimestampMicrosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) + TimestampMicrosecondArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::TimestampNanosecond( - 
TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) + TimestampNanosecondArrowToVariantBuilder::new(array, time_zone.is_some()) )), } } // Date types DataType::Date32 => Ok(ArrowToVariantRowBuilder::Date32( - DateArrowToVariantBuilder::::new(array) + Date32ArrowToVariantBuilder::new(array) )), DataType::Date64 => Ok(ArrowToVariantRowBuilder::Date64( - DateArrowToVariantBuilder::::new(array) + Date64ArrowToVariantBuilder::new(array) )), // Time types DataType::Time32(time_unit) => { match time_unit { TimeUnit::Second => Ok(ArrowToVariantRowBuilder::Time32Second( - TimeArrowToVariantBuilder::::new(array) + Time32SecondArrowToVariantBuilder::new(array) )), TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::Time32Millisecond( - TimeArrowToVariantBuilder::::new(array) + Time32MillisecondArrowToVariantBuilder::new(array) )), _ => Err(ArrowError::CastError(format!("Unsupported Time32 unit: {time_unit:?}"))), } @@ -951,10 +1256,10 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Time64(time_unit) => { match time_unit { TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::Time64Microsecond( - TimeArrowToVariantBuilder::::new(array) + Time64MicrosecondArrowToVariantBuilder::new(array) )), TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::Time64Nanosecond( - TimeArrowToVariantBuilder::::new(array) + Time64NanosecondArrowToVariantBuilder::new(array) )), _ => Err(ArrowError::CastError(format!("Unsupported Time64 unit: {time_unit:?}"))), } From fa4ea64352ecd1a6ef62013cfb6d72f7d30eb6fe Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 11:40:06 -0700 Subject: [PATCH 35/53] Revert "checkpoint - use macros to define row builders" This reverts commit 27e2a9c5107607af6fc000dcc85499744c8f6a98. 
--- .../src/cast_to_variant.rs | 405 +++--------------- 1 file changed, 50 insertions(+), 355 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 5c0fe3bdc64e..16ea0daa5851 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -43,342 +43,37 @@ use parquet_variant::{ ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; -// ============================================================================ -// Macros for generating row-oriented builders -// ============================================================================ - -/// Base macro for generating row builders with optional state fields -macro_rules! impl_row_builder_base { - ($name:ident, $array_field_type:ty, $array_init:expr, $variant_expr:expr $(, $state_field:ident: $state_type:ty)*) => { - pub(crate) struct $name<'a> { - array: &'a $array_field_type, - $($state_field: $state_type,)* - } - - impl<'a> $name<'a> { - fn new(array: &'a dyn Array $(, $state_field: $state_type)*) -> Self { - Self { - array: $array_init, - $($state_field,)* - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let variant = $variant_expr; - builder.append_value(variant); - } - Ok(()) - } - } - }; -} - -/// Convenience macro for generic builders (with type parameter and method) -macro_rules! 
impl_generic_row_builder { - ($name:ident<$t:ty>, $method:ident, $cast_fn:expr, $array_type:ty $(, $state_field:ident: $state_type:ty)*) => { - impl_row_builder_base!( - $name, - $array_type, - array.$method::<$t>(), - $crate::type_conversion::generic_conversion_single_value!($t, $method, $cast_fn, self.array, index) - $(, $state_field: $state_type)* - ); - }; -} - -/// Convenience macro for primitive builders (uses as_primitive method) -macro_rules! impl_primitive_row_builder { - ($name:ident<$t:ty> $(, $state_field:ident: $state_type:ty)*) => { - impl_generic_row_builder!( - $name<$t>, - as_primitive, - |v| v, - arrow::array::PrimitiveArray<$t> - $(, $state_field: $state_type)* - ); - }; -} - -// ============================================================================ -// Macro-generated row builders -// ============================================================================ - -// Primitive builders (11 variants) - stateless -impl_primitive_row_builder!(PrimitiveInt8ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveInt16ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveInt32ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveInt64ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveUInt8ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveUInt16ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveUInt32ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveUInt64ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveFloat16ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveFloat32ArrowToVariantBuilder); -impl_primitive_row_builder!(PrimitiveFloat64ArrowToVariantBuilder); - -// Generic offset builders (4 variants) - stateless -impl_generic_row_builder!( - StringArrowToVariantBuilder, - as_string, - |v| v, - arrow::array::GenericStringArray -); -impl_generic_row_builder!( - LargeStringArrowToVariantBuilder, - as_string, - |v| v, - arrow::array::GenericStringArray -); 
-impl_generic_row_builder!( - BinaryArrowToVariantBuilder, - as_binary, - |v| v, - arrow::array::GenericBinaryArray -); -impl_generic_row_builder!( - LargeBinaryArrowToVariantBuilder, - as_binary, - |v| v, - arrow::array::GenericBinaryArray -); - -// Non-generic simple builders (4 variants) - stateless -impl_row_builder_base!( - BooleanArrowToVariantBuilder, - arrow::array::BooleanArray, - array.as_boolean(), - $crate::type_conversion::non_generic_conversion_single_value!(self.array, |v| v, index) -); -impl_row_builder_base!( - BinaryViewArrowToVariantBuilder, - arrow::array::BinaryViewArray, - array.as_byte_view(), - $crate::type_conversion::non_generic_conversion_single_value!(self.array, |v| v, index) -); -impl_row_builder_base!( - FixedSizeBinaryArrowToVariantBuilder, - arrow::array::FixedSizeBinaryArray, - array.as_fixed_size_binary(), - $crate::type_conversion::non_generic_conversion_single_value!(self.array, |v| v, index) -); -impl_row_builder_base!( - Utf8ViewArrowToVariantBuilder, - arrow::array::StringViewArray, - array.as_string_view(), - $crate::type_conversion::non_generic_conversion_single_value!(self.array, |v| v, index) -); - -// Null builder - special case (always appends null) -impl_row_builder_base!( - NullArrowToVariantBuilder, - (), - (), - Variant::Null -); - -// Decimal builders (4 variants) - stateful with scale -impl_row_builder_base!( - Decimal32ArrowToVariantBuilder, - arrow::array::PrimitiveArray, - array.as_primitive::(), - decimal_to_variant_decimal!(self.array.value(index), &self.scale, i32, VariantDecimal4), - scale: i8 -); -impl_row_builder_base!( - Decimal64ArrowToVariantBuilder, - arrow::array::PrimitiveArray, - array.as_primitive::(), - decimal_to_variant_decimal!(self.array.value(index), &self.scale, i64, VariantDecimal8), - scale: i8 -); -impl_row_builder_base!( - Decimal128ArrowToVariantBuilder, - arrow::array::PrimitiveArray, - array.as_primitive::(), - decimal_to_variant_decimal!(self.array.value(index), &self.scale, i128, 
VariantDecimal16), - scale: i8 -); -impl_row_builder_base!( - Decimal256ArrowToVariantBuilder, - arrow::array::PrimitiveArray, - array.as_primitive::(), - { - let value = self.array.value(index); - if let Some(v) = value.to_i128() { - decimal_to_variant_decimal!(v, &self.scale, i128, VariantDecimal16) - } else { - Variant::Null - } - }, - scale: i8 -); - -// Timestamp builders (4 variants) - stateful with timezone -impl_row_builder_base!( - TimestampSecondArrowToVariantBuilder, - arrow::array::PrimitiveArray, - array.as_primitive::(), - { - let timestamp_value = self.array.value(index); - let Some(naive_datetime) = as_datetime::(timestamp_value) else { - return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); - }; - if self.has_time_zone { - let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); - Variant::from(utc_dt) - } else { - Variant::from(naive_datetime) - } - }, - has_time_zone: bool -); -impl_row_builder_base!( - TimestampMillisecondArrowToVariantBuilder, - arrow::array::PrimitiveArray, - array.as_primitive::(), - { - let timestamp_value = self.array.value(index); - let Some(naive_datetime) = as_datetime::(timestamp_value) else { - return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); - }; - if self.has_time_zone { - let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); - Variant::from(utc_dt) - } else { - Variant::from(naive_datetime) - } - }, - has_time_zone: bool -); -impl_row_builder_base!( - TimestampMicrosecondArrowToVariantBuilder, - arrow::array::PrimitiveArray, - array.as_primitive::(), - { - let timestamp_value = self.array.value(index); - let Some(naive_datetime) = as_datetime::(timestamp_value) else { - return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); - }; - if self.has_time_zone { - let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); 
- Variant::from(utc_dt) - } else { - Variant::from(naive_datetime) - } - }, - has_time_zone: bool -); -impl_row_builder_base!( - TimestampNanosecondArrowToVariantBuilder, - arrow::array::PrimitiveArray, - array.as_primitive::(), - { - let timestamp_value = self.array.value(index); - let Some(naive_datetime) = as_datetime::(timestamp_value) else { - return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); - }; - if self.has_time_zone { - let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); - Variant::from(utc_dt) - } else { - Variant::from(naive_datetime) - } - }, - has_time_zone: bool -); - -// Date builders (2 variants) - stateless temporal transform -impl_generic_row_builder!( - Date32ArrowToVariantBuilder, - as_primitive, - |value| { - let date_value = i64::from(value); - as_date::(date_value).map(Variant::from).unwrap_or(Variant::Null) - }, - arrow::array::PrimitiveArray -); -impl_generic_row_builder!( - Date64ArrowToVariantBuilder, - as_primitive, - |value| { - let date_value = i64::from(value); - as_date::(date_value).map(Variant::from).unwrap_or(Variant::Null) - }, - arrow::array::PrimitiveArray -); - -// Time builders (4 variants) - stateless temporal transform -impl_generic_row_builder!( - Time32SecondArrowToVariantBuilder, - as_primitive, - |value| { - let time_value = i64::from(value); - as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) - }, - arrow::array::PrimitiveArray -); -impl_generic_row_builder!( - Time32MillisecondArrowToVariantBuilder, - as_primitive, - |value| { - let time_value = i64::from(value); - as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) - }, - arrow::array::PrimitiveArray -); -impl_generic_row_builder!( - Time64MicrosecondArrowToVariantBuilder, - as_primitive, - |value| { - let time_value = i64::from(value); - as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) - }, - arrow::array::PrimitiveArray -); 
-impl_generic_row_builder!( - Time64NanosecondArrowToVariantBuilder, - as_primitive, - |value| { - let time_value = i64::from(value); - as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) - }, - arrow::array::PrimitiveArray -); - // ============================================================================ // Row-oriented builders for efficient Arrow-to-Variant conversion // ============================================================================ /// Row builder for converting Arrow arrays to VariantArray row by row pub(crate) enum ArrowToVariantRowBuilder<'a> { - PrimitiveInt8(PrimitiveInt8ArrowToVariantBuilder<'a>), - PrimitiveInt16(PrimitiveInt16ArrowToVariantBuilder<'a>), - PrimitiveInt32(PrimitiveInt32ArrowToVariantBuilder<'a>), - PrimitiveInt64(PrimitiveInt64ArrowToVariantBuilder<'a>), - PrimitiveUInt8(PrimitiveUInt8ArrowToVariantBuilder<'a>), - PrimitiveUInt16(PrimitiveUInt16ArrowToVariantBuilder<'a>), - PrimitiveUInt32(PrimitiveUInt32ArrowToVariantBuilder<'a>), - PrimitiveUInt64(PrimitiveUInt64ArrowToVariantBuilder<'a>), - PrimitiveFloat16(PrimitiveFloat16ArrowToVariantBuilder<'a>), - PrimitiveFloat32(PrimitiveFloat32ArrowToVariantBuilder<'a>), - PrimitiveFloat64(PrimitiveFloat64ArrowToVariantBuilder<'a>), + PrimitiveInt8(PrimitiveArrowToVariantBuilder<'a, Int8Type>), + PrimitiveInt16(PrimitiveArrowToVariantBuilder<'a, Int16Type>), + PrimitiveInt32(PrimitiveArrowToVariantBuilder<'a, Int32Type>), + PrimitiveInt64(PrimitiveArrowToVariantBuilder<'a, Int64Type>), + PrimitiveUInt8(PrimitiveArrowToVariantBuilder<'a, UInt8Type>), + PrimitiveUInt16(PrimitiveArrowToVariantBuilder<'a, UInt16Type>), + PrimitiveUInt32(PrimitiveArrowToVariantBuilder<'a, UInt32Type>), + PrimitiveUInt64(PrimitiveArrowToVariantBuilder<'a, UInt64Type>), + PrimitiveFloat16(PrimitiveArrowToVariantBuilder<'a, Float16Type>), + PrimitiveFloat32(PrimitiveArrowToVariantBuilder<'a, Float32Type>), + PrimitiveFloat64(PrimitiveArrowToVariantBuilder<'a, Float64Type>), 
Decimal32(Decimal32ArrowToVariantBuilder<'a>), Decimal64(Decimal64ArrowToVariantBuilder<'a>), Decimal128(Decimal128ArrowToVariantBuilder<'a>), Decimal256(Decimal256ArrowToVariantBuilder<'a>), Boolean(BooleanArrowToVariantBuilder<'a>), - String(StringArrowToVariantBuilder<'a>), - LargeString(LargeStringArrowToVariantBuilder<'a>), - Binary(BinaryArrowToVariantBuilder<'a>), - LargeBinary(LargeBinaryArrowToVariantBuilder<'a>), + String(StringArrowToVariantBuilder<'a, i32>), + LargeString(StringArrowToVariantBuilder<'a, i64>), + Binary(BinaryArrowToVariantBuilder<'a, i32>), + LargeBinary(BinaryArrowToVariantBuilder<'a, i64>), BinaryView(BinaryViewArrowToVariantBuilder<'a>), FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder<'a>), Utf8View(Utf8ViewArrowToVariantBuilder<'a>), Struct(StructArrowToVariantBuilder<'a>), - Null(NullArrowToVariantBuilder<'a>), + Null(NullArrowToVariantBuilder), RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder<'a, Int16Type>), RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder<'a, Int32Type>), RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder<'a, Int64Type>), @@ -387,16 +82,16 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { LargeList(ListArrowToVariantBuilder<'a, i64>), Map(MapArrowToVariantBuilder<'a>), Union(UnionArrowToVariantBuilder<'a>), - TimestampSecond(TimestampSecondArrowToVariantBuilder<'a>), - TimestampMillisecond(TimestampMillisecondArrowToVariantBuilder<'a>), - TimestampMicrosecond(TimestampMicrosecondArrowToVariantBuilder<'a>), - TimestampNanosecond(TimestampNanosecondArrowToVariantBuilder<'a>), - Date32(Date32ArrowToVariantBuilder<'a>), - Date64(Date64ArrowToVariantBuilder<'a>), - Time32Second(Time32SecondArrowToVariantBuilder<'a>), - Time32Millisecond(Time32MillisecondArrowToVariantBuilder<'a>), - Time64Microsecond(Time64MicrosecondArrowToVariantBuilder<'a>), - Time64Nanosecond(Time64NanosecondArrowToVariantBuilder<'a>), + TimestampSecond(TimestampArrowToVariantBuilder<'a, TimestampSecondType>), + 
TimestampMillisecond(TimestampArrowToVariantBuilder<'a, TimestampMillisecondType>), + TimestampMicrosecond(TimestampArrowToVariantBuilder<'a, TimestampMicrosecondType>), + TimestampNanosecond(TimestampArrowToVariantBuilder<'a, TimestampNanosecondType>), + Date32(DateArrowToVariantBuilder<'a, Date32Type>), + Date64(DateArrowToVariantBuilder<'a, Date64Type>), + Time32Second(TimeArrowToVariantBuilder<'a, Time32SecondType>), + Time32Millisecond(TimeArrowToVariantBuilder<'a, Time32MillisecondType>), + Time64Microsecond(TimeArrowToVariantBuilder<'a, Time64MicrosecondType>), + Time64Nanosecond(TimeArrowToVariantBuilder<'a, Time64NanosecondType>), } impl<'a> ArrowToVariantRowBuilder<'a> { @@ -1155,19 +850,19 @@ fn make_arrow_to_variant_row_builder<'a>( ) -> Result, ArrowError> { match data_type { // All integer types - DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveInt8ArrowToVariantBuilder::new(array))), - DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveInt16ArrowToVariantBuilder::new(array))), - DataType::Int32 => Ok(ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveInt32ArrowToVariantBuilder::new(array))), - DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveInt64ArrowToVariantBuilder::new(array))), - DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveUInt8ArrowToVariantBuilder::new(array))), - DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveUInt16ArrowToVariantBuilder::new(array))), - DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveUInt32ArrowToVariantBuilder::new(array))), - DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveUInt64ArrowToVariantBuilder::new(array))), + DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int32 => 
Ok(ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::::new(array))), // Float types - DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveFloat16ArrowToVariantBuilder::new(array))), - DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveFloat32ArrowToVariantBuilder::new(array))), - DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveFloat64ArrowToVariantBuilder::new(array))), + DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::::new(array))), // Decimal types DataType::Decimal32(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale))), @@ -1178,17 +873,17 @@ fn make_arrow_to_variant_row_builder<'a>( // Special types DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array))), DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), - DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::LargeString(LargeStringArrowToVariantBuilder::new(array))), + DataType::LargeUtf8 => 
Ok(ArrowToVariantRowBuilder::LargeString(StringArrowToVariantBuilder::new(array))), DataType::Utf8View => Ok(ArrowToVariantRowBuilder::Utf8View(Utf8ViewArrowToVariantBuilder::new(array))), // Binary types DataType::Binary => Ok(ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array))), - DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary(LargeBinaryArrowToVariantBuilder::new(array))), + DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary(BinaryArrowToVariantBuilder::new(array))), DataType::BinaryView => Ok(ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array))), DataType::FixedSizeBinary(_) => Ok(ArrowToVariantRowBuilder::FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder::new(array))), DataType::Struct(_) => Ok(ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?)), - DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder::new(array))), + DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder)), // Run-end encoded types DataType::RunEndEncoded(run_ends, _) => { @@ -1219,36 +914,36 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Timestamp(time_unit, time_zone) => { match time_unit { TimeUnit::Second => Ok(ArrowToVariantRowBuilder::TimestampSecond( - TimestampSecondArrowToVariantBuilder::new(array, time_zone.is_some()) + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::TimestampMillisecond( - TimestampMillisecondArrowToVariantBuilder::new(array, time_zone.is_some()) + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::TimestampMicrosecond( - TimestampMicrosecondArrowToVariantBuilder::new(array, time_zone.is_some()) + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) )), TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::TimestampNanosecond( - 
TimestampNanosecondArrowToVariantBuilder::new(array, time_zone.is_some()) + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) )), } } // Date types DataType::Date32 => Ok(ArrowToVariantRowBuilder::Date32( - Date32ArrowToVariantBuilder::new(array) + DateArrowToVariantBuilder::::new(array) )), DataType::Date64 => Ok(ArrowToVariantRowBuilder::Date64( - Date64ArrowToVariantBuilder::new(array) + DateArrowToVariantBuilder::::new(array) )), // Time types DataType::Time32(time_unit) => { match time_unit { TimeUnit::Second => Ok(ArrowToVariantRowBuilder::Time32Second( - Time32SecondArrowToVariantBuilder::new(array) + TimeArrowToVariantBuilder::::new(array) )), TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::Time32Millisecond( - Time32MillisecondArrowToVariantBuilder::new(array) + TimeArrowToVariantBuilder::::new(array) )), _ => Err(ArrowError::CastError(format!("Unsupported Time32 unit: {time_unit:?}"))), } @@ -1256,10 +951,10 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Time64(time_unit) => { match time_unit { TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::Time64Microsecond( - Time64MicrosecondArrowToVariantBuilder::new(array) + TimeArrowToVariantBuilder::::new(array) )), TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::Time64Nanosecond( - Time64NanosecondArrowToVariantBuilder::new(array) + TimeArrowToVariantBuilder::::new(array) )), _ => Err(ArrowError::CastError(format!("Unsupported Time64 unit: {time_unit:?}"))), } From e89b642a369d3c43991bc579e177959deff8e32f Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 12:43:31 -0700 Subject: [PATCH 36/53] make primitive and string builder templates more similar --- parquet-variant-compute/src/cast_to_variant.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 16ea0daa5851..1e954b984151 100644 --- 
a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -199,14 +199,20 @@ impl<'a> BooleanArrowToVariantBuilder<'a> { } /// Generic String builder for StringArray (Utf8 and LargeUtf8) -pub(crate) struct StringArrowToVariantBuilder<'a, O: OffsetSizeTrait> { +pub(crate) struct StringArrowToVariantBuilder<'a, O> +where + O : OffsetSizeTrait, +{ array: &'a arrow::array::GenericStringArray, } -impl<'a, O: OffsetSizeTrait> StringArrowToVariantBuilder<'a, O> { +impl<'a, O> StringArrowToVariantBuilder<'a, O> +where + O : OffsetSizeTrait, +{ fn new(array: &'a dyn Array) -> Self { Self { - array: array.as_string::(), + array: array.as_string(), } } From 53db9dcd87533c4f9e3aa0068b412a3e7df0a6bd Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 13:27:34 -0700 Subject: [PATCH 37/53] checkpoint - start at simple builder macro --- .../src/cast_to_variant.rs | 93 +++++++++++++------ 1 file changed, 63 insertions(+), 30 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 1e954b984151..6b9444e1d035 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -24,6 +24,7 @@ use crate::type_conversion::{ use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ Array, AsArray, OffsetSizeTrait, + PrimitiveArray, GenericStringArray, GenericBinaryArray, GenericListArray, }; use arrow::compute::kernels::cast; use arrow::datatypes::{ @@ -43,6 +44,68 @@ use parquet_variant::{ ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; +// ============================================================================ +// Macro for generating generic row builders +// ============================================================================ + +/// Macro to define generic row builders with consistent structure and behavior 
+macro_rules! define_row_builder { + ( + struct $name:ident<$generic:ident: $($bound:path)+> + $(where $where_clause:tt)?, + |$array_param:ident| -> $array_type:ty { $init_expr:expr } + ) => { + pub(crate) struct $name<'a, $generic: $($bound)+> + $(where $where_clause)? + { + array: &'a $array_type, + } + + impl<'a, $generic: $($bound)+> $name<'a, $generic> + $(where $where_clause)? + { + fn new($array_param: &'a dyn Array) -> Self { + Self { + array: $init_expr, + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = self.array.value(index); + builder.append_value(value); + } + Ok(()) + } + } + }; +} + +// ============================================================================ +// Generic row builders generated by macro +// ============================================================================ + +// Primitive builder - handles all primitive types +define_row_builder!( + struct PrimitiveArrowToVariantBuilder + where T::Native: Into>, + |array| -> PrimitiveArray { array.as_primitive() } +); + +// String builder - handles String and LargeString +define_row_builder!( + struct StringArrowToVariantBuilder, + |array| -> GenericStringArray { array.as_string() } +); + +// Binary builder - handles Binary and LargeBinary +define_row_builder!( + struct BinaryArrowToVariantBuilder, + |array| -> GenericBinaryArray { array.as_binary() } +); + // ============================================================================ // Row-oriented builders for efficient Arrow-to-Variant conversion // ============================================================================ @@ -144,36 +207,6 @@ impl<'a> ArrowToVariantRowBuilder<'a> { } } -/// Generic primitive builder for all Arrow primitive types -pub(crate) struct PrimitiveArrowToVariantBuilder<'a, T> -where - T : ArrowPrimitiveType, - T::Native: Into>, -{ - array: &'a 
arrow::array::PrimitiveArray, -} - -impl<'a, T> PrimitiveArrowToVariantBuilder<'a, T> -where - T : ArrowPrimitiveType, - T::Native: Into>, -{ - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_primitive(), - } - } - - fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let value = self.array.value(index); - builder.append_value(value); - } - Ok(()) - } -} /// Boolean builder for BooleanArray pub(crate) struct BooleanArrowToVariantBuilder<'a> { From 64d68773ecb209933954c75ad0bed1ce37b3d429 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 14:50:21 -0700 Subject: [PATCH 38/53] manual fixes --- .../src/cast_to_variant.rs | 188 +++++++----------- 1 file changed, 69 insertions(+), 119 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 6b9444e1d035..16f64e17bc1f 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -16,7 +16,6 @@ // under the License. 
use std::collections::HashMap; -use std::sync::Arc; use crate::type_conversion::{ decimal_to_variant_decimal, @@ -24,13 +23,13 @@ use crate::type_conversion::{ use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ Array, AsArray, OffsetSizeTrait, - PrimitiveArray, GenericStringArray, GenericBinaryArray, GenericListArray, + PrimitiveArray, GenericStringArray, GenericBinaryArray, }; use arrow::compute::kernels::cast; use arrow::datatypes::{ - i256, ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, BinaryType, BinaryViewType, Date32Type, Date64Type, Decimal128Type, + ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, - Int32Type, Int64Type, Int8Type, LargeBinaryType, RunEndIndexType, Time32MillisecondType, + Int32Type, Int64Type, Int8Type, RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, @@ -39,73 +38,11 @@ use arrow::temporal_conversions::{ as_date, as_datetime, as_time, }; use arrow_schema::{ArrowError, DataType, TimeUnit}; -use chrono::{DateTime, NaiveDate, NaiveTime, TimeZone, Utc}; +use chrono::{DateTime, TimeZone, Utc}; use parquet_variant::{ - ObjectFieldBuilder, Variant, VariantBuilder, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, + ObjectFieldBuilder, Variant, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; -// ============================================================================ -// Macro for generating generic row builders -// ============================================================================ - -/// Macro to define generic row builders with consistent structure and behavior -macro_rules! 
define_row_builder { - ( - struct $name:ident<$generic:ident: $($bound:path)+> - $(where $where_clause:tt)?, - |$array_param:ident| -> $array_type:ty { $init_expr:expr } - ) => { - pub(crate) struct $name<'a, $generic: $($bound)+> - $(where $where_clause)? - { - array: &'a $array_type, - } - - impl<'a, $generic: $($bound)+> $name<'a, $generic> - $(where $where_clause)? - { - fn new($array_param: &'a dyn Array) -> Self { - Self { - array: $init_expr, - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let value = self.array.value(index); - builder.append_value(value); - } - Ok(()) - } - } - }; -} - -// ============================================================================ -// Generic row builders generated by macro -// ============================================================================ - -// Primitive builder - handles all primitive types -define_row_builder!( - struct PrimitiveArrowToVariantBuilder - where T::Native: Into>, - |array| -> PrimitiveArray { array.as_primitive() } -); - -// String builder - handles String and LargeString -define_row_builder!( - struct StringArrowToVariantBuilder, - |array| -> GenericStringArray { array.as_string() } -); - -// Binary builder - handles Binary and LargeBinary -define_row_builder!( - struct BinaryArrowToVariantBuilder, - |array| -> GenericBinaryArray { array.as_binary() } -); - // ============================================================================ // Row-oriented builders for efficient Arrow-to-Variant conversion // ============================================================================ @@ -208,6 +145,56 @@ impl<'a> ArrowToVariantRowBuilder<'a> { } +// ============================================================================ +// Macro for generating generic row builders +// ============================================================================ + +/// Macro 
to define generic row builders with consistent structure and behavior +macro_rules! define_row_builder { + ( + struct $name:ident<$lifetime:lifetime, $generic:ident: $($bound:path)+> + $(where $where_path:path: $where_bound:path)?, + |$array_param:ident| -> $array_type:ty { $init_expr:expr } + ) => { + pub(crate) struct $name<$lifetime, $generic: $($bound)+> + $(where $where_path: $where_bound)? + { + array: &$lifetime $array_type, + } + + impl<$lifetime, $generic: $($bound)+> $name<$lifetime, $generic> + $(where $where_path: $where_bound)? + { + fn new($array_param: &$lifetime dyn Array) -> Self { + Self { + array: $init_expr, + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let value = self.array.value(index); + builder.append_value(value); + } + Ok(()) + } + } + }; +} + +// ============================================================================ +// Generic row builders generated by macro +// ============================================================================ + +// Primitive builder - handles all primitive types +define_row_builder!( + struct PrimitiveArrowToVariantBuilder<'a, T: ArrowPrimitiveType> + where T::Native: Into>, + |array| -> PrimitiveArray { array.as_primitive() } +); + /// Boolean builder for BooleanArray pub(crate) struct BooleanArrowToVariantBuilder<'a> { array: &'a arrow::array::BooleanArray, @@ -231,34 +218,12 @@ impl<'a> BooleanArrowToVariantBuilder<'a> { } } -/// Generic String builder for StringArray (Utf8 and LargeUtf8) -pub(crate) struct StringArrowToVariantBuilder<'a, O> -where - O : OffsetSizeTrait, -{ - array: &'a arrow::array::GenericStringArray, -} +// Generic String builder for StringArray (Utf8 and LargeUtf8) +define_row_builder!( + struct StringArrowToVariantBuilder<'a, O: OffsetSizeTrait>, + |array| -> GenericStringArray { array.as_string() } +); -impl<'a, O> 
StringArrowToVariantBuilder<'a, O> -where - O : OffsetSizeTrait, -{ - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_string(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let value = self.array.value(index); - builder.append_value(value); - } - Ok(()) - } -} /// Struct builder for StructArray pub(crate) struct StructArrowToVariantBuilder<'a> { @@ -678,28 +643,11 @@ impl<'a> Decimal256ArrowToVariantBuilder<'a> { } } -/// Generic Binary builder for Arrow BinaryArray and LargeBinaryArray -pub(crate) struct BinaryArrowToVariantBuilder<'a, O: OffsetSizeTrait> { - array: &'a arrow::array::GenericBinaryArray, -} - -impl<'a, O: OffsetSizeTrait> BinaryArrowToVariantBuilder<'a, O> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_binary::(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let bytes = self.array.value(index); - builder.append_value(Variant::from(bytes)); - } - Ok(()) - } -} +// Generic Binary builder for Arrow BinaryArray and LargeBinaryArray +define_row_builder!( + struct BinaryArrowToVariantBuilder<'a, O: OffsetSizeTrait>, + |array| -> GenericBinaryArray { array.as_binary() } +); /// BinaryView builder for Arrow BinaryViewArray pub(crate) struct BinaryViewArrowToVariantBuilder<'a> { @@ -1076,13 +1024,14 @@ mod tests { UInt8Array, UnionArray, TimestampSecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampMicrosecondArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; - use arrow::datatypes::{IntervalDayTime, IntervalMonthDayNano}; + use arrow::datatypes::{i256, BinaryType, LargeBinaryType, BinaryViewType, IntervalDayTime, IntervalMonthDayNano}; use arrow_schema::{DataType, Field, Fields, UnionFields}; use 
arrow_schema::{ DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, }; + use chrono::{NaiveDate, NaiveTime}; use half::f16; - use parquet_variant::{Variant, VariantDecimal16}; + use parquet_variant::{Variant, VariantBuilder, VariantDecimal16}; use std::{sync::Arc, vec}; macro_rules! max_unscaled_value { @@ -2837,6 +2786,7 @@ mod tests { mod row_builder_tests { use super::*; use arrow::array::{ArrayRef, Int32Array, StringArray, BooleanArray}; + use std::sync::Arc; #[test] fn test_primitive_row_builder() { From 1ad4cc3285c52ea23d05c7cd138cf966446c18fb Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 15:09:41 -0700 Subject: [PATCH 39/53] expand macro to allow value transform --- parquet-variant-compute/src/cast_to_variant.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 16f64e17bc1f..45ee337decd8 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -154,7 +154,8 @@ macro_rules! define_row_builder { ( struct $name:ident<$lifetime:lifetime, $generic:ident: $($bound:path)+> $(where $where_path:path: $where_bound:path)?, - |$array_param:ident| -> $array_type:ty { $init_expr:expr } + |$array_param:ident| -> $array_type:ty { $init_expr:expr }, + |$value:ident| $value_transform:expr ) => { pub(crate) struct $name<$lifetime, $generic: $($bound)+> $(where $where_path: $where_bound)? @@ -175,8 +176,8 @@ macro_rules! define_row_builder { if self.array.is_null(index) { builder.append_null(); } else { - let value = self.array.value(index); - builder.append_value(value); + let $value = self.array.value(index); + builder.append_value($value_transform); } Ok(()) } @@ -192,7 +193,8 @@ macro_rules! 
define_row_builder { define_row_builder!( struct PrimitiveArrowToVariantBuilder<'a, T: ArrowPrimitiveType> where T::Native: Into>, - |array| -> PrimitiveArray { array.as_primitive() } + |array| -> PrimitiveArray { array.as_primitive() }, + |value| value ); /// Boolean builder for BooleanArray @@ -221,7 +223,8 @@ impl<'a> BooleanArrowToVariantBuilder<'a> { // Generic String builder for StringArray (Utf8 and LargeUtf8) define_row_builder!( struct StringArrowToVariantBuilder<'a, O: OffsetSizeTrait>, - |array| -> GenericStringArray { array.as_string() } + |array| -> GenericStringArray { array.as_string() }, + |value| value ); @@ -646,7 +649,8 @@ impl<'a> Decimal256ArrowToVariantBuilder<'a> { // Generic Binary builder for Arrow BinaryArray and LargeBinaryArray define_row_builder!( struct BinaryArrowToVariantBuilder<'a, O: OffsetSizeTrait>, - |array| -> GenericBinaryArray { array.as_binary() } + |array| -> GenericBinaryArray { array.as_binary() }, + |value| value ); /// BinaryView builder for Arrow BinaryViewArray From eee43cec154ec8f31d678d81c59bf256533b4cd7 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 15:14:38 -0700 Subject: [PATCH 40/53] checkpoint - use macro for time and date --- .../src/cast_to_variant.rs | 89 ++++--------------- 1 file changed, 19 insertions(+), 70 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 45ee337decd8..9f47580e1371 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -760,79 +760,28 @@ impl<'a, T: ArrowTimestampType> TimestampArrowToVariantBuilder<'a, T> { } } -/// Generic Date builder for Arrow date arrays (Date32, Date64) -pub(crate) struct DateArrowToVariantBuilder<'a, T: ArrowTemporalType> -where - i64: From, -{ - array: &'a arrow::array::PrimitiveArray, -} - -impl<'a, T: ArrowTemporalType> DateArrowToVariantBuilder<'a, T> -where - i64: From, -{ - fn new(array: &'a dyn Array) 
-> Self { - Self { - array: array.as_primitive::(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let date_value = i64::from(self.array.value(index)); - - // Use Arrow's generic date conversion function - let Some(naive_date) = as_date::(date_value) else { - return Err(ArrowError::CastError(format!( - "Failed to convert Arrow date value {} to chrono::NaiveDate for type {:?}", - date_value, T::DATA_TYPE - ))); - }; - builder.append_value(Variant::from(naive_date)); - } - Ok(()) +// Generic Date builder for Arrow date arrays (Date32, Date64) +define_row_builder!( + struct DateArrowToVariantBuilder<'a, T: ArrowTemporalType> + where i64: From, + |array| -> PrimitiveArray { array.as_primitive() }, + |value| { + let date_value = i64::from(value); + as_date::(date_value).map(Variant::from).unwrap_or(Variant::Null) } -} - -/// Generic Time builder for Arrow time arrays (Time32, Time64) -pub(crate) struct TimeArrowToVariantBuilder<'a, T: ArrowTemporalType> -where - i64: From, -{ - array: &'a arrow::array::PrimitiveArray, -} +); -impl<'a, T: ArrowTemporalType> TimeArrowToVariantBuilder<'a, T> -where - i64: From, -{ - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_primitive::(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let time_value = i64::from(self.array.value(index)); - - // Use Arrow's generic time conversion function - let Some(naive_time) = as_time::(time_value) else { - return Err(ArrowError::CastError(format!( - "Failed to convert Arrow time value {} to chrono::NaiveTime for type {:?}", - time_value, T::DATA_TYPE - ))); - }; - builder.append_value(Variant::from(naive_time)); - } - Ok(()) +// Generic Time builder for Arrow time arrays (Time32, Time64) +define_row_builder!( + 
struct TimeArrowToVariantBuilder<'a, T: ArrowTemporalType> + where i64: From, + |array| -> PrimitiveArray { array.as_primitive() }, + |value| { + let time_value = i64::from(value); + as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) } -} +); + /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( From 6ffd231b71a95ea53001e1f8cabff2179d3cf576 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 15:27:21 -0700 Subject: [PATCH 41/53] checkpoint - macro supports non-generic row builders --- .../src/cast_to_variant.rs | 121 ++++-------------- 1 file changed, 28 insertions(+), 93 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 9f47580e1371..af7c1aa67dca 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -149,21 +149,21 @@ impl<'a> ArrowToVariantRowBuilder<'a> { // Macro for generating generic row builders // ============================================================================ -/// Macro to define generic row builders with consistent structure and behavior +/// Macro to define (possibly generic) row builders with consistent structure and behavior macro_rules! define_row_builder { ( - struct $name:ident<$lifetime:lifetime, $generic:ident: $($bound:path)+> + struct $name:ident<$lifetime:lifetime $(, $generic:ident: $($bound:path)+)?> $(where $where_path:path: $where_bound:path)?, |$array_param:ident| -> $array_type:ty { $init_expr:expr }, |$value:ident| $value_transform:expr ) => { - pub(crate) struct $name<$lifetime, $generic: $($bound)+> + pub(crate) struct $name<$lifetime $(, $generic: $($bound)+)?> $(where $where_path: $where_bound)? 
{ array: &$lifetime $array_type, } - impl<$lifetime, $generic: $($bound)+> $name<$lifetime, $generic> + impl<$lifetime $(, $generic: $($bound)+)?> $name<$lifetime $(, $generic)?> $(where $where_path: $where_bound)? { fn new($array_param: &$lifetime dyn Array) -> Self { @@ -197,28 +197,12 @@ define_row_builder!( |value| value ); -/// Boolean builder for BooleanArray -pub(crate) struct BooleanArrowToVariantBuilder<'a> { - array: &'a arrow::array::BooleanArray, -} - -impl<'a> BooleanArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_boolean(), - } - } - - fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let value = self.array.value(index); - builder.append_value(value); - } - Ok(()) - } -} +// Boolean builder - handles BooleanArray +define_row_builder!( + struct BooleanArrowToVariantBuilder<'a>, + |array| -> arrow::array::BooleanArray { array.as_boolean() }, + |value| value +); // Generic String builder for StringArray (Utf8 and LargeUtf8) define_row_builder!( @@ -653,74 +637,26 @@ define_row_builder!( |value| value ); -/// BinaryView builder for Arrow BinaryViewArray -pub(crate) struct BinaryViewArrowToVariantBuilder<'a> { - array: &'a arrow::array::BinaryViewArray, -} - -impl<'a> BinaryViewArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_byte_view(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let bytes = self.array.value(index); - builder.append_value(Variant::from(bytes)); - } - Ok(()) - } -} - -/// FixedSizeBinary builder for Arrow FixedSizeBinaryArray -pub(crate) struct FixedSizeBinaryArrowToVariantBuilder<'a> { - array: &'a arrow::array::FixedSizeBinaryArray, -} - -impl<'a> 
FixedSizeBinaryArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_fixed_size_binary(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let bytes = self.array.value(index); - builder.append_value(Variant::from(bytes)); - } - Ok(()) - } -} +// BinaryView builder - handles BinaryViewArray +define_row_builder!( + struct BinaryViewArrowToVariantBuilder<'a>, + |array| -> arrow::array::BinaryViewArray { array.as_byte_view() }, + |value| value +); -/// Utf8View builder for Arrow StringViewArray -pub(crate) struct Utf8ViewArrowToVariantBuilder<'a> { - array: &'a arrow::array::StringViewArray, -} +// FixedSizeBinary builder - handles FixedSizeBinaryArray +define_row_builder!( + struct FixedSizeBinaryArrowToVariantBuilder<'a>, + |array| -> arrow::array::FixedSizeBinaryArray { array.as_fixed_size_binary() }, + |value| value +); -impl<'a> Utf8ViewArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Self { - Self { - array: array.as_string_view(), - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let string = self.array.value(index); - builder.append_value(Variant::from(string)); - } - Ok(()) - } -} +// Utf8View builder - handles StringViewArray +define_row_builder!( + struct Utf8ViewArrowToVariantBuilder<'a>, + |array| -> arrow::array::StringViewArray { array.as_string_view() }, + |value| value +); /// Generic Timestamp builder for Arrow timestamp arrays pub(crate) struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> { @@ -782,7 +718,6 @@ define_row_builder!( } ); - /// Factory function to create the appropriate row builder for a given DataType fn make_arrow_to_variant_row_builder<'a>( data_type: &'a DataType, From 
ad9ccf81026c968787e1708c3efc66ac21c9e026 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 15:36:31 -0700 Subject: [PATCH 42/53] checkpoint - macro supports extra fields --- .../src/cast_to_variant.rs | 193 +++++------------- 1 file changed, 55 insertions(+), 138 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index af7c1aa67dca..cf93202453be 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -150,10 +150,12 @@ impl<'a> ArrowToVariantRowBuilder<'a> { // ============================================================================ /// Macro to define (possibly generic) row builders with consistent structure and behavior +/// Supports optional extra fields that are passed to the constructor macro_rules! define_row_builder { ( struct $name:ident<$lifetime:lifetime $(, $generic:ident: $($bound:path)+)?> - $(where $where_path:path: $where_bound:path)?, + $(where $where_path:path: $where_bound:path)? + $({ $($field:ident: $field_type:ty),* $(,)? })?, |$array_param:ident| -> $array_type:ty { $init_expr:expr }, |$value:ident| $value_transform:expr ) => { @@ -161,14 +163,16 @@ macro_rules! define_row_builder { $(where $where_path: $where_bound)? { array: &$lifetime $array_type, + $($($field: $field_type,)*)? } impl<$lifetime $(, $generic: $($bound)+)?> $name<$lifetime $(, $generic)?> $(where $where_path: $where_bound)? { - fn new($array_param: &$lifetime dyn Array) -> Self { + fn new($array_param: &$lifetime dyn Array $(, $($field: $field_type),*)?) -> Self { Self { array: $init_expr, + $($($field,)*)? 
} } @@ -521,114 +525,44 @@ impl<'a> UnionArrowToVariantBuilder<'a> { } } -/// Decimal32 builder for Arrow Decimal32Array -pub(crate) struct Decimal32ArrowToVariantBuilder<'a> { - array: &'a arrow::array::Decimal32Array, - scale: i8, -} -impl<'a> Decimal32ArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array, scale: i8) -> Self { - Self { - array: array.as_primitive::(), - scale, - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let value = self.array.value(index); - let variant = decimal_to_variant_decimal!(value, &self.scale, i32, VariantDecimal4); - builder.append_value(variant); - } - Ok(()) - } -} - -/// Decimal64 builder for Arrow Decimal64Array -pub(crate) struct Decimal64ArrowToVariantBuilder<'a> { - array: &'a arrow::array::Decimal64Array, - scale: i8, -} - -impl<'a> Decimal64ArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array, scale: i8) -> Self { - Self { - array: array.as_primitive::(), - scale, - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let value = self.array.value(index); - let variant = decimal_to_variant_decimal!(value, &self.scale, i64, VariantDecimal8); - builder.append_value(variant); - } - Ok(()) - } -} - -/// Decimal128 builder for Arrow Decimal128Array -pub(crate) struct Decimal128ArrowToVariantBuilder<'a> { - array: &'a arrow::array::Decimal128Array, - scale: i8, -} +// Decimal32 builder for Arrow Decimal32Array +define_row_builder!( + struct Decimal32ArrowToVariantBuilder<'a> + { scale: i8 }, + |array| -> arrow::array::Decimal32Array { array.as_primitive::() }, + |value| decimal_to_variant_decimal!(value, &self.scale, i32, VariantDecimal4) +); -impl<'a> Decimal128ArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array, scale: i8) -> Self { - Self { - array: 
array.as_primitive::(), - scale, - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let value = self.array.value(index); - let variant = decimal_to_variant_decimal!(value, &self.scale, i128, VariantDecimal16); - builder.append_value(variant); - } - Ok(()) - } -} +// Decimal64 builder for Arrow Decimal64Array +define_row_builder!( + struct Decimal64ArrowToVariantBuilder<'a> + { scale: i8 }, + |array| -> arrow::array::Decimal64Array { array.as_primitive::() }, + |value| decimal_to_variant_decimal!(value, &self.scale, i64, VariantDecimal8) +); -/// Decimal256 builder for Arrow Decimal256Array -pub(crate) struct Decimal256ArrowToVariantBuilder<'a> { - array: &'a arrow::array::Decimal256Array, - scale: i8, -} +// Decimal128 builder for Arrow Decimal128Array +define_row_builder!( + struct Decimal128ArrowToVariantBuilder<'a> + { scale: i8 }, + |array| -> arrow::array::Decimal128Array { array.as_primitive::() }, + |value| decimal_to_variant_decimal!(value, &self.scale, i128, VariantDecimal16) +); -impl<'a> Decimal256ArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array, scale: i8) -> Self { - Self { - array: array.as_primitive::(), - scale, - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let value = self.array.value(index); - // Special handling for Decimal256 like in original cast_to_variant - let variant = if let Some(v) = value.to_i128() { - decimal_to_variant_decimal!(v, &self.scale, i128, VariantDecimal16) - } else { - Variant::Null - }; - builder.append_value(variant); +// Decimal256 builder for Arrow Decimal256Array +define_row_builder!( + struct Decimal256ArrowToVariantBuilder<'a> + { scale: i8 }, + |array| -> arrow::array::Decimal256Array { array.as_primitive::() }, + |value| { + // Decimal256 
needs special handling - convert to i128 if possible + match value.to_i128() { + Some(i128_val) => decimal_to_variant_decimal!(i128_val, &self.scale, i128, VariantDecimal16), + None => Variant::Null, // Value too large for i128 } - Ok(()) } -} +); // Generic Binary builder for Arrow BinaryArray and LargeBinaryArray define_row_builder!( @@ -658,43 +592,26 @@ define_row_builder!( |value| value ); -/// Generic Timestamp builder for Arrow timestamp arrays -pub(crate) struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> { - array: &'a arrow::array::PrimitiveArray, - has_time_zone: bool, -} - -impl<'a, T: ArrowTimestampType> TimestampArrowToVariantBuilder<'a, T> { - fn new(array: &'a dyn Array, has_time_zone: bool) -> Self { - Self { - array: array.as_primitive::(), - has_time_zone, - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); +// Generic Timestamp builder for Arrow timestamp arrays +define_row_builder!( + struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> + { has_time_zone: bool }, + |array| -> arrow::array::PrimitiveArray { array.as_primitive() }, + |value| { + // Convert using Arrow's temporal conversion functions + let Some(naive_datetime) = as_datetime::(value) else { + return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); + }; + if self.has_time_zone { + // Has timezone -> DateTime -> TimestampMicros/TimestampNanos + let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); + Variant::from(utc_dt) // Uses From> for Variant } else { - let timestamp_value = self.array.value(index); - - // Convert using Arrow's temporal conversion functions - let Some(naive_datetime) = as_datetime::(timestamp_value) else { - return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); - }; - let variant = if 
self.has_time_zone { - // Has timezone -> DateTime -> TimestampMicros/TimestampNanos - let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); - Variant::from(utc_dt) // Uses From> for Variant - } else { - // No timezone -> NaiveDateTime -> TimestampNtzMicros/TimestampNtzNanos - Variant::from(naive_datetime) // Uses From for Variant - }; - builder.append_value(variant); + // No timezone -> NaiveDateTime -> TimestampNtzMicros/TimestampNtzNanos + Variant::from(naive_datetime) // Uses From for Variant } - Ok(()) } -} +); // Generic Date builder for Arrow date arrays (Date32, Date64) define_row_builder!( From f5675ded1ab5a4c6db24609eefed81e9a1672152 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 15:53:58 -0700 Subject: [PATCH 43/53] manual fix - macro hygiene --- .../src/cast_to_variant.rs | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index cf93202453be..9d855939bf1e 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -155,7 +155,7 @@ macro_rules! define_row_builder { ( struct $name:ident<$lifetime:lifetime $(, $generic:ident: $($bound:path)+)?> $(where $where_path:path: $where_bound:path)? - $({ $($field:ident: $field_type:ty),* $(,)? })?, + $({ $($this:ident.$field:ident: $field_type:ty, )+ })?, |$array_param:ident| -> $array_type:ty { $init_expr:expr }, |$value:ident| $value_transform:expr ) => { @@ -181,6 +181,7 @@ macro_rules! define_row_builder { builder.append_null(); } else { let $value = self.array.value(index); + $( $(let $this = self;)+ )? 
builder.append_value($value_transform); } Ok(()) @@ -529,36 +530,36 @@ impl<'a> UnionArrowToVariantBuilder<'a> { // Decimal32 builder for Arrow Decimal32Array define_row_builder!( struct Decimal32ArrowToVariantBuilder<'a> - { scale: i8 }, + { this.scale: i8, }, |array| -> arrow::array::Decimal32Array { array.as_primitive::() }, - |value| decimal_to_variant_decimal!(value, &self.scale, i32, VariantDecimal4) + |value| decimal_to_variant_decimal!(value, &this.scale, i32, VariantDecimal4) ); // Decimal64 builder for Arrow Decimal64Array define_row_builder!( struct Decimal64ArrowToVariantBuilder<'a> - { scale: i8 }, + { this.scale: i8, }, |array| -> arrow::array::Decimal64Array { array.as_primitive::() }, - |value| decimal_to_variant_decimal!(value, &self.scale, i64, VariantDecimal8) + |value| decimal_to_variant_decimal!(value, &this.scale, i64, VariantDecimal8) ); // Decimal128 builder for Arrow Decimal128Array define_row_builder!( struct Decimal128ArrowToVariantBuilder<'a> - { scale: i8 }, + { this.scale: i8, }, |array| -> arrow::array::Decimal128Array { array.as_primitive::() }, - |value| decimal_to_variant_decimal!(value, &self.scale, i128, VariantDecimal16) + |value| decimal_to_variant_decimal!(value, &this.scale, i128, VariantDecimal16) ); // Decimal256 builder for Arrow Decimal256Array define_row_builder!( struct Decimal256ArrowToVariantBuilder<'a> - { scale: i8 }, + { this.scale: i8, }, |array| -> arrow::array::Decimal256Array { array.as_primitive::() }, |value| { // Decimal256 needs special handling - convert to i128 if possible match value.to_i128() { - Some(i128_val) => decimal_to_variant_decimal!(i128_val, &self.scale, i128, VariantDecimal16), + Some(i128_val) => decimal_to_variant_decimal!(i128_val, &this.scale, i128, VariantDecimal16), None => Variant::Null, // Value too large for i128 } } @@ -595,14 +596,14 @@ define_row_builder!( // Generic Timestamp builder for Arrow timestamp arrays define_row_builder!( struct TimestampArrowToVariantBuilder<'a, T: 
ArrowTimestampType> - { has_time_zone: bool }, + { this.has_time_zone: bool, }, |array| -> arrow::array::PrimitiveArray { array.as_primitive() }, |value| { // Convert using Arrow's temporal conversion functions let Some(naive_datetime) = as_datetime::(value) else { return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); }; - if self.has_time_zone { + if this.has_time_zone { // Has timezone -> DateTime -> TimestampMicros/TimestampNanos let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); Variant::from(utc_dt) // Uses From> for Variant From 4e9c697c40bc422e7fcf2e5efcb6475ceebe9e46 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 16:00:35 -0700 Subject: [PATCH 44/53] checkpoint - better hygiene solution --- .../src/cast_to_variant.rs | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 9d855939bf1e..af862af9373f 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -155,7 +155,7 @@ macro_rules! define_row_builder { ( struct $name:ident<$lifetime:lifetime $(, $generic:ident: $($bound:path)+)?> $(where $where_path:path: $where_bound:path)? - $({ $($this:ident.$field:ident: $field_type:ty, )+ })?, + $({ $($field:ident: $field_type:ty),* $(,)? })?, |$array_param:ident| -> $array_type:ty { $init_expr:expr }, |$value:ident| $value_transform:expr ) => { @@ -181,7 +181,8 @@ macro_rules! define_row_builder { builder.append_null(); } else { let $value = self.array.value(index); - $( $(let $this = self;)+ )? + // Capture fields as variables the transform can access (hygiene) + $($(let $field = &self.$field;)*)? 
builder.append_value($value_transform); } Ok(()) @@ -530,36 +531,36 @@ impl<'a> UnionArrowToVariantBuilder<'a> { // Decimal32 builder for Arrow Decimal32Array define_row_builder!( struct Decimal32ArrowToVariantBuilder<'a> - { this.scale: i8, }, + { scale: i8 }, |array| -> arrow::array::Decimal32Array { array.as_primitive::() }, - |value| decimal_to_variant_decimal!(value, &this.scale, i32, VariantDecimal4) + |value| decimal_to_variant_decimal!(value, scale, i32, VariantDecimal4) ); // Decimal64 builder for Arrow Decimal64Array define_row_builder!( struct Decimal64ArrowToVariantBuilder<'a> - { this.scale: i8, }, + { scale: i8 }, |array| -> arrow::array::Decimal64Array { array.as_primitive::() }, - |value| decimal_to_variant_decimal!(value, &this.scale, i64, VariantDecimal8) + |value| decimal_to_variant_decimal!(value, scale, i64, VariantDecimal8) ); // Decimal128 builder for Arrow Decimal128Array define_row_builder!( struct Decimal128ArrowToVariantBuilder<'a> - { this.scale: i8, }, + { scale: i8 }, |array| -> arrow::array::Decimal128Array { array.as_primitive::() }, - |value| decimal_to_variant_decimal!(value, &this.scale, i128, VariantDecimal16) + |value| decimal_to_variant_decimal!(value, scale, i128, VariantDecimal16) ); // Decimal256 builder for Arrow Decimal256Array define_row_builder!( struct Decimal256ArrowToVariantBuilder<'a> - { this.scale: i8, }, + { scale: i8 }, |array| -> arrow::array::Decimal256Array { array.as_primitive::() }, |value| { // Decimal256 needs special handling - convert to i128 if possible match value.to_i128() { - Some(i128_val) => decimal_to_variant_decimal!(i128_val, &this.scale, i128, VariantDecimal16), + Some(i128_val) => decimal_to_variant_decimal!(i128_val, scale, i128, VariantDecimal16), None => Variant::Null, // Value too large for i128 } } @@ -596,14 +597,14 @@ define_row_builder!( // Generic Timestamp builder for Arrow timestamp arrays define_row_builder!( struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> - { 
this.has_time_zone: bool, }, + { has_time_zone: bool }, |array| -> arrow::array::PrimitiveArray { array.as_primitive() }, |value| { // Convert using Arrow's temporal conversion functions let Some(naive_datetime) = as_datetime::(value) else { return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); }; - if this.has_time_zone { + if *has_time_zone { // Has timezone -> DateTime -> TimestampMicros/TimestampNanos let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); Variant::from(utc_dt) // Uses From> for Variant From 637efdef81decfd47cbb0b6e281f30afe2cc8255 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 16:08:28 -0700 Subject: [PATCH 45/53] manual fix - remove turbofish --- .../src/cast_to_variant.rs | 47 +++++++++---------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index af862af9373f..f83a52a7514e 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -27,8 +27,7 @@ use arrow::array::{ }; use arrow::compute::kernels::cast; use arrow::datatypes::{ - ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, Date32Type, Date64Type, Decimal128Type, - Decimal256Type, Decimal32Type, Decimal64Type, Float16Type, Float32Type, Float64Type, Int16Type, + ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, Date32Type, Date64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, @@ -397,7 +396,7 @@ pub(crate) struct ListArrowToVariantBuilder<'a, O: OffsetSizeTrait> { impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> 
{ fn new(array: &'a dyn Array) -> Result { - let list_array = array.as_list::(); + let list_array = array.as_list(); let values = list_array.values(); let values_builder = make_arrow_to_variant_row_builder( @@ -532,7 +531,7 @@ impl<'a> UnionArrowToVariantBuilder<'a> { define_row_builder!( struct Decimal32ArrowToVariantBuilder<'a> { scale: i8 }, - |array| -> arrow::array::Decimal32Array { array.as_primitive::() }, + |array| -> arrow::array::Decimal32Array { array.as_primitive() }, |value| decimal_to_variant_decimal!(value, scale, i32, VariantDecimal4) ); @@ -540,7 +539,7 @@ define_row_builder!( define_row_builder!( struct Decimal64ArrowToVariantBuilder<'a> { scale: i8 }, - |array| -> arrow::array::Decimal64Array { array.as_primitive::() }, + |array| -> arrow::array::Decimal64Array { array.as_primitive() }, |value| decimal_to_variant_decimal!(value, scale, i64, VariantDecimal8) ); @@ -548,7 +547,7 @@ define_row_builder!( define_row_builder!( struct Decimal128ArrowToVariantBuilder<'a> { scale: i8 }, - |array| -> arrow::array::Decimal128Array { array.as_primitive::() }, + |array| -> arrow::array::Decimal128Array { array.as_primitive() }, |value| decimal_to_variant_decimal!(value, scale, i128, VariantDecimal16) ); @@ -556,7 +555,7 @@ define_row_builder!( define_row_builder!( struct Decimal256ArrowToVariantBuilder<'a> { scale: i8 }, - |array| -> arrow::array::Decimal256Array { array.as_primitive::() }, + |array| -> arrow::array::Decimal256Array { array.as_primitive() }, |value| { // Decimal256 needs special handling - convert to i128 if possible match value.to_i128() { @@ -644,19 +643,19 @@ fn make_arrow_to_variant_row_builder<'a>( ) -> Result, ArrowError> { match data_type { // All integer types - DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int32 => 
Ok(ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::new(array))), + DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::new(array))), + DataType::Int32 => Ok(ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::new(array))), + DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::new(array))), + DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::new(array))), + DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::new(array))), + DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::new(array))), + DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::new(array))), // Float types - DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::::new(array))), - DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::::new(array))), + DataType::Float16 => 
Ok(ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::new(array))), + DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::new(array))), + DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::new(array))), // Decimal types DataType::Decimal32(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale))), @@ -724,20 +723,20 @@ fn make_arrow_to_variant_row_builder<'a>( // Date types DataType::Date32 => Ok(ArrowToVariantRowBuilder::Date32( - DateArrowToVariantBuilder::::new(array) + DateArrowToVariantBuilder::new(array) )), DataType::Date64 => Ok(ArrowToVariantRowBuilder::Date64( - DateArrowToVariantBuilder::::new(array) + DateArrowToVariantBuilder::new(array) )), // Time types DataType::Time32(time_unit) => { match time_unit { TimeUnit::Second => Ok(ArrowToVariantRowBuilder::Time32Second( - TimeArrowToVariantBuilder::::new(array) + TimeArrowToVariantBuilder::new(array) )), TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::Time32Millisecond( - TimeArrowToVariantBuilder::::new(array) + TimeArrowToVariantBuilder::new(array) )), _ => Err(ArrowError::CastError(format!("Unsupported Time32 unit: {time_unit:?}"))), } @@ -745,10 +744,10 @@ fn make_arrow_to_variant_row_builder<'a>( DataType::Time64(time_unit) => { match time_unit { TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::Time64Microsecond( - TimeArrowToVariantBuilder::::new(array) + TimeArrowToVariantBuilder::new(array) )), TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::Time64Nanosecond( - TimeArrowToVariantBuilder::::new(array) + TimeArrowToVariantBuilder::new(array) )), _ => Err(ArrowError::CastError(format!("Unsupported Time64 unit: {time_unit:?}"))), } From 20300c8ff272bcef72dd78def36693df69fe08bf Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 16:17:19 -0700 Subject: [PATCH 46/53] fmt --- .../src/cast_to_variant.rs | 1442 
+++++++++-------- parquet-variant/src/builder.rs | 2 +- 2 files changed, 785 insertions(+), 659 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index f83a52a7514e..bc9a07cc7a40 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -17,29 +17,25 @@ use std::collections::HashMap; -use crate::type_conversion::{ - decimal_to_variant_decimal, -}; +use crate::type_conversion::decimal_to_variant_decimal; use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ - Array, AsArray, OffsetSizeTrait, - PrimitiveArray, GenericStringArray, GenericBinaryArray, + Array, AsArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; use arrow::compute::kernels::cast; use arrow::datatypes::{ - ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, Date32Type, Date64Type, Float16Type, Float32Type, Float64Type, Int16Type, - Int32Type, Int64Type, Int8Type, RunEndIndexType, Time32MillisecondType, - Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimestampMicrosecondType, - TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, - UInt64Type, UInt8Type, -}; -use arrow::temporal_conversions::{ - as_date, as_datetime, as_time, + ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, Date32Type, + Date64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, + Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; +use arrow::temporal_conversions::{as_date, as_datetime, as_time}; use arrow_schema::{ArrowError, DataType, TimeUnit}; use chrono::{DateTime, TimeZone, Utc}; use parquet_variant::{ - 
ObjectFieldBuilder, Variant, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, + ObjectFieldBuilder, Variant, VariantBuilderExt, VariantDecimal16, VariantDecimal4, + VariantDecimal8, }; // ============================================================================ @@ -94,7 +90,11 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { } impl<'a> ArrowToVariantRowBuilder<'a> { - pub fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + pub fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { match self { ArrowToVariantRowBuilder::PrimitiveInt8(b) => b.append_row(index, builder), ArrowToVariantRowBuilder::PrimitiveInt16(b) => b.append_row(index, builder), @@ -143,13 +143,8 @@ impl<'a> ArrowToVariantRowBuilder<'a> { } } - -// ============================================================================ -// Macro for generating generic row builders -// ============================================================================ - -/// Macro to define (possibly generic) row builders with consistent structure and behavior -/// Supports optional extra fields that are passed to the constructor +/// Macro to define (possibly generic) row builders with consistent structure and behavior. +/// Supports optional extra fields that are passed to the constructor. macro_rules! define_row_builder { ( struct $name:ident<$lifetime:lifetime $(, $generic:ident: $($bound:path)+)?> @@ -158,13 +153,13 @@ macro_rules! define_row_builder { |$array_param:ident| -> $array_type:ty { $init_expr:expr }, |$value:ident| $value_transform:expr ) => { - pub(crate) struct $name<$lifetime $(, $generic: $($bound)+)?> + pub(crate) struct $name<$lifetime $(, $generic: $($bound)+)?> $(where $where_path: $where_bound)? { array: &$lifetime $array_type, $($($field: $field_type,)*)? 
} - + impl<$lifetime $(, $generic: $($bound)+)?> $name<$lifetime $(, $generic)?> $(where $where_path: $where_bound)? { @@ -174,7 +169,7 @@ macro_rules! define_row_builder { $($($field,)*)? } } - + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { if self.array.is_null(index) { builder.append_null(); @@ -216,7 +211,6 @@ define_row_builder!( |value| value ); - /// Struct builder for StructArray pub(crate) struct StructArrowToVariantBuilder<'a> { struct_array: &'a arrow::array::StructArray, @@ -226,37 +220,42 @@ pub(crate) struct StructArrowToVariantBuilder<'a> { impl<'a> StructArrowToVariantBuilder<'a> { fn new(struct_array: &'a arrow::array::StructArray) -> Result { let mut field_builders = Vec::new(); - + // Create a row builder for each field - for (field_name, field_array) in struct_array.column_names().iter() - .zip(struct_array.columns().iter()) + for (field_name, field_array) in struct_array + .column_names() + .iter() + .zip(struct_array.columns().iter()) { - let field_builder = make_arrow_to_variant_row_builder( - field_array.data_type(), - field_array.as_ref(), - )?; + let field_builder = + make_arrow_to_variant_row_builder(field_array.data_type(), field_array.as_ref())?; field_builders.push((*field_name, field_builder)); } - + Ok(Self { struct_array, field_builders, }) } - - fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { if self.struct_array.is_null(index) { builder.append_null(); } else { // Create object builder for this struct row let mut obj_builder = builder.try_new_object()?; - + // Process each field for (field_name, row_builder) in &mut self.field_builders { - let mut field_builder = parquet_variant::ObjectFieldBuilder::new(field_name, &mut obj_builder); + let mut field_builder = + 
parquet_variant::ObjectFieldBuilder::new(field_name, &mut obj_builder); row_builder.append_row(index, &mut field_builder)?; } - + obj_builder.finish(); } Ok(()) @@ -267,7 +266,11 @@ impl<'a> StructArrowToVariantBuilder<'a> { pub(crate) struct NullArrowToVariantBuilder; impl NullArrowToVariantBuilder { - fn append_row(&mut self, _index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + fn append_row( + &mut self, + _index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { builder.append_null(); Ok(()) } @@ -277,10 +280,10 @@ impl NullArrowToVariantBuilder { pub(crate) struct RunEndEncodedArrowToVariantBuilder<'a, R: RunEndIndexType> { run_array: &'a arrow::array::RunArray, values_builder: Box>, - + run_ends: &'a [R::Native], - run_number: usize, // Physical index into run_ends and values - run_start: usize, // Logical start index of current run + run_number: usize, // Physical index into run_ends and values + run_start: usize, // Logical start index of current run } impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { @@ -288,13 +291,11 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { let Some(run_array) = array.as_run_opt() else { return Err(ArrowError::CastError("Expected RunArray".to_string())); }; - - let values_array = run_array.values(); - let values_builder = make_arrow_to_variant_row_builder( - values_array.data_type(), - values_array.as_ref(), - )?; - + + let values = run_array.values(); + let values_builder = + make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; + Ok(Self { run_array, values_builder: Box::new(values_builder), @@ -303,26 +304,32 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { run_start: 0, }) } - - fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) 
-> Result<(), ArrowError> { self.set_run_for_index(index)?; - + // Handle null values if self.run_array.values().is_null(self.run_number) { builder.append_null(); return Ok(()); } - + // Re-encode the value self.values_builder.append_row(self.run_number, builder)?; - + Ok(()) } - + fn set_run_for_index(&mut self, index: usize) -> Result<(), ArrowError> { if index >= self.run_start { let Some(run_end) = self.run_ends.get(self.run_number) else { - return Err(ArrowError::CastError(format!("Index {} beyond run array", index))); + return Err(ArrowError::CastError(format!( + "Index {index} beyond run array" + ))); }; if index < run_end.as_usize() { return Ok(()); @@ -335,9 +342,13 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { } // Use partition_point for all non-sequential cases - let run_number = self.run_ends.partition_point(|&run_end| run_end.as_usize() <= index); + let run_number = self + .run_ends + .partition_point(|&run_end| run_end.as_usize() <= index); if run_number >= self.run_ends.len() { - return Err(ArrowError::CastError(format!("Index {} beyond run array", index))); + return Err(ArrowError::CastError(format!( + "Index {index} beyond run array" + ))); } self.run_number = run_number; self.run_start = match run_number { @@ -359,25 +370,27 @@ impl<'a> DictionaryArrowToVariantBuilder<'a> { fn new(array: &'a dyn Array) -> Result { let dict_array = array.as_any_dictionary(); let values = dict_array.values(); - let values_builder = make_arrow_to_variant_row_builder( - values.data_type(), - values.as_ref(), - )?; - + let values_builder = + make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; + // WARNING: normalized_keys panics if values is empty let normalized_keys = match values.len() { 0 => Vec::new(), _ => dict_array.normalized_keys(), }; - + Ok(Self { keys: dict_array.keys(), normalized_keys, values_builder: Box::new(values_builder), }) } - - fn append_row(&mut self, index: usize, builder: &mut impl 
VariantBuilderExt) -> Result<(), ArrowError> { + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { if self.keys.is_null(index) { builder.append_null(); } else { @@ -398,31 +411,33 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { fn new(array: &'a dyn Array) -> Result { let list_array = array.as_list(); let values = list_array.values(); - - let values_builder = make_arrow_to_variant_row_builder( - values.data_type(), - values.as_ref(), - )?; - + let values_builder = + make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; + Ok(Self { list_array, values_builder: Box::new(values_builder), }) } - - fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { if self.list_array.is_null(index) { builder.append_null(); return Ok(()); } - + let offsets = self.list_array.offsets(); let start = offsets[index].as_usize(); let end = offsets[index + 1].as_usize(); - + let mut list_builder = builder.try_new_list()?; for value_index in start..end { - self.values_builder.append_row(value_index, &mut list_builder)?; + self.values_builder + .append_row(value_index, &mut list_builder)?; } list_builder.finish(); Ok(()) @@ -439,46 +454,49 @@ pub(crate) struct MapArrowToVariantBuilder<'a> { impl<'a> MapArrowToVariantBuilder<'a> { fn new(array: &'a dyn Array) -> Result { let map_array = array.as_map(); - + // Pre-cast keys to strings once (like existing convert_map code) let keys = cast(map_array.keys(), &DataType::Utf8)?; let key_strings = keys.as_string::().clone(); - + // Create recursive builder for values let values = map_array.values(); - let values_builder = make_arrow_to_variant_row_builder( - values.data_type(), - values.as_ref(), - )?; - + let values_builder = + make_arrow_to_variant_row_builder(values.data_type(), 
values.as_ref())?; + Ok(Self { map_array, key_strings, values_builder: Box::new(values_builder), }) } - - fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { // Check for NULL map first (via null bitmap) if self.map_array.is_null(index) { builder.append_null(); return Ok(()); } - + let offsets = self.map_array.offsets(); let start = offsets[index].as_usize(); let end = offsets[index + 1].as_usize(); - + // Create object builder for this map (even if empty) let mut object_builder = builder.try_new_object()?; - + // Add each key-value pair (loop does nothing for empty maps - correct!) for kv_index in start..end { let key = self.key_strings.value(kv_index); let mut field_builder = ObjectFieldBuilder::new(key, &mut object_builder); - self.values_builder.append_row(kv_index, &mut field_builder)?; + self.values_builder + .append_row(kv_index, &mut field_builder)?; } - + object_builder.finish(); // Empty map becomes empty object {} Ok(()) } @@ -494,67 +512,72 @@ impl<'a> UnionArrowToVariantBuilder<'a> { fn new(array: &'a dyn Array) -> Result { let union_array = array.as_union(); let type_ids = union_array.type_ids(); - + // Create child builders for each union field let mut child_builders = HashMap::new(); for &type_id in type_ids { let child_array = union_array.child(type_id); - let child_builder = make_arrow_to_variant_row_builder( - child_array.data_type(), - child_array.as_ref(), - )?; + let child_builder = + make_arrow_to_variant_row_builder(child_array.data_type(), child_array.as_ref())?; child_builders.insert(type_id, Box::new(child_builder)); } - + Ok(Self { union_array, child_builders, }) } - - fn append_row(&mut self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> 
Result<(), ArrowError> { let type_id = self.union_array.type_id(index); let value_offset = self.union_array.value_offset(index); - + // Delegate to the appropriate child builder, or append null to handle an invalid type_id match self.child_builders.get_mut(&type_id) { Some(child_builder) => child_builder.append_row(value_offset, builder)?, None => builder.append_null(), } - + Ok(()) } } - // Decimal32 builder for Arrow Decimal32Array define_row_builder!( - struct Decimal32ArrowToVariantBuilder<'a> - { scale: i8 }, + struct Decimal32ArrowToVariantBuilder<'a> { + scale: i8, + }, |array| -> arrow::array::Decimal32Array { array.as_primitive() }, |value| decimal_to_variant_decimal!(value, scale, i32, VariantDecimal4) ); // Decimal64 builder for Arrow Decimal64Array define_row_builder!( - struct Decimal64ArrowToVariantBuilder<'a> - { scale: i8 }, + struct Decimal64ArrowToVariantBuilder<'a> { + scale: i8, + }, |array| -> arrow::array::Decimal64Array { array.as_primitive() }, |value| decimal_to_variant_decimal!(value, scale, i64, VariantDecimal8) ); // Decimal128 builder for Arrow Decimal128Array define_row_builder!( - struct Decimal128ArrowToVariantBuilder<'a> - { scale: i8 }, + struct Decimal128ArrowToVariantBuilder<'a> { + scale: i8, + }, |array| -> arrow::array::Decimal128Array { array.as_primitive() }, |value| decimal_to_variant_decimal!(value, scale, i128, VariantDecimal16) ); // Decimal256 builder for Arrow Decimal256Array define_row_builder!( - struct Decimal256ArrowToVariantBuilder<'a> - { scale: i8 }, + struct Decimal256ArrowToVariantBuilder<'a> { + scale: i8, + }, |array| -> arrow::array::Decimal256Array { array.as_primitive() }, |value| { // Decimal256 needs special handling - convert to i128 if possible @@ -595,16 +618,19 @@ define_row_builder!( // Generic Timestamp builder for Arrow timestamp arrays define_row_builder!( - struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> - { has_time_zone: bool }, + struct TimestampArrowToVariantBuilder<'a, T: 
ArrowTimestampType> { + has_time_zone: bool, + }, |array| -> arrow::array::PrimitiveArray { array.as_primitive() }, |value| { // Convert using Arrow's temporal conversion functions let Some(naive_datetime) = as_datetime::(value) else { - return Err(ArrowError::CastError("Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string())); + return Err(ArrowError::CastError( + "Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string(), + )); }; if *has_time_zone { - // Has timezone -> DateTime -> TimestampMicros/TimestampNanos + // Has timezone -> DateTime -> TimestampMicros/TimestampNanos let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); Variant::from(utc_dt) // Uses From> for Variant } else { @@ -643,124 +669,184 @@ fn make_arrow_to_variant_row_builder<'a>( ) -> Result, ArrowError> { match data_type { // All integer types - DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::new(array))), - DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::new(array))), - DataType::Int32 => Ok(ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::new(array))), - DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::new(array))), - DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::new(array))), - DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::new(array))), - DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::new(array))), - DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::new(array))), - + DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8( + PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16( + 
PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::Int32 => Ok(ArrowToVariantRowBuilder::PrimitiveInt32( + PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64( + PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8( + PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16( + PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32( + PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64( + PrimitiveArrowToVariantBuilder::new(array), + )), + // Float types - DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::new(array))), - DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::new(array))), - DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::new(array))), - + DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16( + PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32( + PrimitiveArrowToVariantBuilder::new(array), + )), + DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64( + PrimitiveArrowToVariantBuilder::new(array), + )), + // Decimal types - DataType::Decimal32(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale))), - DataType::Decimal64(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal64(Decimal64ArrowToVariantBuilder::new(array, *scale))), - DataType::Decimal128(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal128(Decimal128ArrowToVariantBuilder::new(array, *scale))), - DataType::Decimal256(_, scale) => 
Ok(ArrowToVariantRowBuilder::Decimal256(Decimal256ArrowToVariantBuilder::new(array, *scale))), - + DataType::Decimal32(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal32( + Decimal32ArrowToVariantBuilder::new(array, *scale), + )), + DataType::Decimal64(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal64( + Decimal64ArrowToVariantBuilder::new(array, *scale), + )), + DataType::Decimal128(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal128( + Decimal128ArrowToVariantBuilder::new(array, *scale), + )), + DataType::Decimal256(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal256( + Decimal256ArrowToVariantBuilder::new(array, *scale), + )), + // Special types - DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array))), - DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array))), - DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::LargeString(StringArrowToVariantBuilder::new(array))), - DataType::Utf8View => Ok(ArrowToVariantRowBuilder::Utf8View(Utf8ViewArrowToVariantBuilder::new(array))), - + DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean( + BooleanArrowToVariantBuilder::new(array), + )), + DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String( + StringArrowToVariantBuilder::new(array), + )), + DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::LargeString( + StringArrowToVariantBuilder::new(array), + )), + DataType::Utf8View => Ok(ArrowToVariantRowBuilder::Utf8View( + Utf8ViewArrowToVariantBuilder::new(array), + )), + // Binary types - DataType::Binary => Ok(ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array))), - DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary(BinaryArrowToVariantBuilder::new(array))), - DataType::BinaryView => Ok(ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array))), - DataType::FixedSizeBinary(_) => 
Ok(ArrowToVariantRowBuilder::FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder::new(array))), - - DataType::Struct(_) => Ok(ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?)), + DataType::Binary => Ok(ArrowToVariantRowBuilder::Binary( + BinaryArrowToVariantBuilder::new(array), + )), + DataType::LargeBinary => Ok(ArrowToVariantRowBuilder::LargeBinary( + BinaryArrowToVariantBuilder::new(array), + )), + DataType::BinaryView => Ok(ArrowToVariantRowBuilder::BinaryView( + BinaryViewArrowToVariantBuilder::new(array), + )), + DataType::FixedSizeBinary(_) => Ok(ArrowToVariantRowBuilder::FixedSizeBinary( + FixedSizeBinaryArrowToVariantBuilder::new(array), + )), + + DataType::Struct(_) => Ok(ArrowToVariantRowBuilder::Struct( + StructArrowToVariantBuilder::new(array.as_struct())?, + )), DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder)), - + // Run-end encoded types - DataType::RunEndEncoded(run_ends, _) => { - match run_ends.data_type() { - DataType::Int16 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder::new(array)?)), - DataType::Int32 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder::new(array)?)), - DataType::Int64 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder::new(array)?)), - _ => Err(ArrowError::CastError(format!("Unsupported run-end type: {run_ends:?}"))), - } - } - + DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { + DataType::Int16 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt16( + RunEndEncodedArrowToVariantBuilder::new(array)?, + )), + DataType::Int32 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt32( + RunEndEncodedArrowToVariantBuilder::new(array)?, + )), + DataType::Int64 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt64( + RunEndEncodedArrowToVariantBuilder::new(array)?, + )), + _ => Err(ArrowError::CastError(format!( + "Unsupported run-end type: 
{run_ends:?}" + ))), + }, + // Dictionary types - DataType::Dictionary(_, _) => { - Ok(ArrowToVariantRowBuilder::Dictionary(DictionaryArrowToVariantBuilder::new(array)?)) - } - + DataType::Dictionary(_, _) => Ok(ArrowToVariantRowBuilder::Dictionary( + DictionaryArrowToVariantBuilder::new(array)?, + )), + // List types - DataType::List(_) => Ok(ArrowToVariantRowBuilder::List(ListArrowToVariantBuilder::new(array)?)), - DataType::LargeList(_) => Ok(ArrowToVariantRowBuilder::LargeList(ListArrowToVariantBuilder::new(array)?)), - + DataType::List(_) => Ok(ArrowToVariantRowBuilder::List( + ListArrowToVariantBuilder::new(array)?, + )), + DataType::LargeList(_) => Ok(ArrowToVariantRowBuilder::LargeList( + ListArrowToVariantBuilder::new(array)?, + )), + // Map types - DataType::Map(_, _) => Ok(ArrowToVariantRowBuilder::Map(MapArrowToVariantBuilder::new(array)?)), - + DataType::Map(_, _) => Ok(ArrowToVariantRowBuilder::Map( + MapArrowToVariantBuilder::new(array)?, + )), + // Union types - DataType::Union(_, _) => Ok(ArrowToVariantRowBuilder::Union(UnionArrowToVariantBuilder::new(array)?)), - + DataType::Union(_, _) => Ok(ArrowToVariantRowBuilder::Union( + UnionArrowToVariantBuilder::new(array)?, + )), + // Timestamp types - DataType::Timestamp(time_unit, time_zone) => { - match time_unit { - TimeUnit::Second => Ok(ArrowToVariantRowBuilder::TimestampSecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) - )), - TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::TimestampMillisecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) - )), - TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::TimestampMicrosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) - )), - TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::TimestampNanosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()) - )), - } - } - + DataType::Timestamp(time_unit, time_zone) => match time_unit { + TimeUnit::Second => 
Ok(ArrowToVariantRowBuilder::TimestampSecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + )), + TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::TimestampMillisecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + )), + TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::TimestampMicrosecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + )), + TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::TimestampNanosecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + )), + }, + // Date types DataType::Date32 => Ok(ArrowToVariantRowBuilder::Date32( - DateArrowToVariantBuilder::new(array) + DateArrowToVariantBuilder::new(array), )), DataType::Date64 => Ok(ArrowToVariantRowBuilder::Date64( - DateArrowToVariantBuilder::new(array) + DateArrowToVariantBuilder::new(array), )), - + // Time types - DataType::Time32(time_unit) => { - match time_unit { - TimeUnit::Second => Ok(ArrowToVariantRowBuilder::Time32Second( - TimeArrowToVariantBuilder::new(array) - )), - TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::Time32Millisecond( - TimeArrowToVariantBuilder::new(array) - )), - _ => Err(ArrowError::CastError(format!("Unsupported Time32 unit: {time_unit:?}"))), - } - } - DataType::Time64(time_unit) => { - match time_unit { - TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::Time64Microsecond( - TimeArrowToVariantBuilder::new(array) - )), - TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::Time64Nanosecond( - TimeArrowToVariantBuilder::new(array) - )), - _ => Err(ArrowError::CastError(format!("Unsupported Time64 unit: {time_unit:?}"))), - } - } - - DataType::Duration(_) | DataType::Interval(_) => { - Err(ArrowError::InvalidArgumentError( - "Casting duration/interval types to Variant is not supported. 
\ + DataType::Time32(time_unit) => match time_unit { + TimeUnit::Second => Ok(ArrowToVariantRowBuilder::Time32Second( + TimeArrowToVariantBuilder::new(array), + )), + TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::Time32Millisecond( + TimeArrowToVariantBuilder::new(array), + )), + _ => Err(ArrowError::CastError(format!( + "Unsupported Time32 unit: {time_unit:?}" + ))), + }, + DataType::Time64(time_unit) => match time_unit { + TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::Time64Microsecond( + TimeArrowToVariantBuilder::new(array), + )), + TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::Time64Nanosecond( + TimeArrowToVariantBuilder::new(array), + )), + _ => Err(ArrowError::CastError(format!( + "Unsupported Time64 unit: {time_unit:?}" + ))), + }, + + DataType::Duration(_) | DataType::Interval(_) => Err(ArrowError::InvalidArgumentError( + "Casting duration/interval types to Variant is not supported. \ The Variant format does not define duration/interval types." - .to_string(), - )) - } - _ => Err(ArrowError::CastError(format!("Unsupported type for row builder: {data_type:?}"))), + .to_string(), + )), + _ => Err(ArrowError::CastError(format!( + "Unsupported type for row builder: {data_type:?}" + ))), } } @@ -796,17 +882,17 @@ fn make_arrow_to_variant_row_builder<'a>( pub fn cast_to_variant(input: &dyn Array) -> Result { // Create row builder for the input array type let mut row_builder = make_arrow_to_variant_row_builder(input.data_type(), input)?; - + // Create output array builder let mut array_builder = VariantArrayBuilder::new(input.len()); - + // Process each row using the row builder for i in 0..input.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder)?; builder.finish(); } - + Ok(array_builder.build()) } @@ -826,11 +912,14 @@ mod tests { IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeListArray, LargeStringArray, ListArray, MapArray, NullArray, StringArray, 
StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, UInt16Array, UInt32Array, UInt64Array, - UInt8Array, UnionArray, TimestampSecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampMicrosecondArray, + Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, + TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, UnionArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; - use arrow::datatypes::{i256, BinaryType, LargeBinaryType, BinaryViewType, IntervalDayTime, IntervalMonthDayNano}; + use arrow::datatypes::{ + i256, BinaryType, BinaryViewType, IntervalDayTime, IntervalMonthDayNano, LargeBinaryType, + }; use arrow_schema::{DataType, Field, Fields, UnionFields}; use arrow_schema::{ DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, @@ -2293,8 +2382,8 @@ mod tests { #[test] fn test_cast_to_variant_map_with_nulls_and_empty() { - use arrow::array::{MapArray, Int32Array, StringArray, StructArray}; - use arrow::buffer::{OffsetBuffer, NullBuffer}; + use arrow::array::{Int32Array, MapArray, StringArray, StructArray}; + use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::datatypes::{DataType, Field, Fields}; use std::sync::Arc; @@ -2313,42 +2402,45 @@ mod tests { // Create offsets for 4 maps: [0..1], [1..1], [1..1], [1..3] let offsets = OffsetBuffer::new(vec![0, 1, 1, 1, 3].into()); - + // Create null buffer - map at index 2 is NULL let null_buffer = Some(NullBuffer::from(vec![true, true, false, true])); - + let map_field = Arc::new(Field::new( "entries", DataType::Struct(entries_fields), false, )); - - let map_array = MapArray::try_new( - map_field, - offsets, - entries, - null_buffer, - false, - ).unwrap(); + + let map_array = MapArray::try_new(map_field, offsets, entries, null_buffer, false).unwrap(); let result = 
cast_to_variant(&map_array).unwrap(); - + // Map 0: {"key1": 1} let variant0 = result.value(0); - assert_eq!(variant0.as_object().unwrap().get("key1").unwrap(), Variant::from(1)); - + assert_eq!( + variant0.as_object().unwrap().get("key1").unwrap(), + Variant::from(1) + ); + // Map 1: {} (empty, not null) let variant1 = result.value(1); let obj1 = variant1.as_object().unwrap(); assert_eq!(obj1.len(), 0); // Empty object - + // Map 2: null (actual NULL) assert!(result.is_null(2)); - + // Map 3: {"key2": 2, "key3": 3} let variant3 = result.value(3); - assert_eq!(variant3.as_object().unwrap().get("key2").unwrap(), Variant::from(2)); - assert_eq!(variant3.as_object().unwrap().get("key3").unwrap(), Variant::from(3)); + assert_eq!( + variant3.as_object().unwrap().get("key2").unwrap(), + Variant::from(2) + ); + assert_eq!( + variant3.as_object().unwrap().get("key3").unwrap(), + Variant::from(3) + ); } #[test] @@ -2591,32 +2683,33 @@ mod tests { #[cfg(test)] mod row_builder_tests { use super::*; - use arrow::array::{ArrayRef, Int32Array, StringArray, BooleanArray}; + use arrow::array::{ArrayRef, BooleanArray, Int32Array, StringArray}; use std::sync::Arc; #[test] fn test_primitive_row_builder() { // Test Int32Array let int_array = Int32Array::from(vec![Some(42), None, Some(100)]); - let mut row_builder = make_arrow_to_variant_row_builder(int_array.data_type(), &int_array).unwrap(); - + let mut row_builder = + make_arrow_to_variant_row_builder(int_array.data_type(), &int_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(3); - + // Test first value let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(0, &mut variant_builder).unwrap(); variant_builder.finish(); - + // Test null value let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(1, &mut variant_builder).unwrap(); variant_builder.finish(); - + // Test second value let mut variant_builder = array_builder.variant_builder(); 
row_builder.append_row(2, &mut variant_builder).unwrap(); variant_builder.finish(); - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); assert_eq!(variant_array.value(0), Variant::Int32(42)); @@ -2627,10 +2720,11 @@ mod row_builder_tests { #[test] fn test_string_row_builder() { let string_array = StringArray::from(vec![Some("hello"), None, Some("world")]); - let mut row_builder = make_arrow_to_variant_row_builder(string_array.data_type(), &string_array).unwrap(); - + let mut row_builder = + make_arrow_to_variant_row_builder(string_array.data_type(), &string_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(3); - + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(0, &mut variant_builder).unwrap(); variant_builder.finish(); @@ -2651,10 +2745,11 @@ mod row_builder_tests { #[test] fn test_boolean_row_builder() { let bool_array = BooleanArray::from(vec![Some(true), None, Some(false)]); - let mut row_builder = make_arrow_to_variant_row_builder(bool_array.data_type(), &bool_array).unwrap(); - + let mut row_builder = + make_arrow_to_variant_row_builder(bool_array.data_type(), &bool_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(3); - + let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(0, &mut variant_builder).unwrap(); variant_builder.finish(); @@ -2674,38 +2769,42 @@ mod row_builder_tests { #[test] fn test_struct_row_builder() { - use arrow::array::{StructArray, Int32Array, StringArray, ArrayRef}; + use arrow::array::{ArrayRef, Int32Array, StringArray, StructArray}; use arrow_schema::{DataType, Field}; use std::sync::Arc; - + // Create a struct array with int and string fields let int_field = Field::new("id", DataType::Int32, true); let string_field = Field::new("name", DataType::Utf8, true); - + let int_array = Int32Array::from(vec![Some(1), None, Some(3)]); let string_array = StringArray::from(vec![Some("Alice"), Some("Bob"), None]); - + 
let struct_array = StructArray::try_new( vec![int_field, string_field].into(), - vec![Arc::new(int_array) as ArrayRef, Arc::new(string_array) as ArrayRef], + vec![ + Arc::new(int_array) as ArrayRef, + Arc::new(string_array) as ArrayRef, + ], None, ) .unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder(struct_array.data_type(), &struct_array).unwrap(); - + + let mut row_builder = + make_arrow_to_variant_row_builder(struct_array.data_type(), &struct_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(3); - + // Test first row let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(0, &mut variant_builder).unwrap(); variant_builder.finish(); - + // Test second row (with null int field) let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(1, &mut variant_builder).unwrap(); variant_builder.finish(); - + // Test third row (with null string field) let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(2, &mut variant_builder).unwrap(); @@ -2713,17 +2812,23 @@ mod row_builder_tests { let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Check first row - should have both fields let first_variant = variant_array.value(0); assert_eq!(first_variant.get_object_field("id"), Some(Variant::from(1))); - assert_eq!(first_variant.get_object_field("name"), Some(Variant::from("Alice"))); - + assert_eq!( + first_variant.get_object_field("name"), + Some(Variant::from("Alice")) + ); + // Check second row - should have name field but not id (null field omitted) let second_variant = variant_array.value(1); assert_eq!(second_variant.get_object_field("id"), None); // null field omitted - assert_eq!(second_variant.get_object_field("name"), Some(Variant::from("Bob"))); - + assert_eq!( + second_variant.get_object_field("name"), + Some(Variant::from("Bob")) + ); + // Check third row - should have id field but not name (null field omitted) let 
third_variant = variant_array.value(2); assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(3))); @@ -2732,29 +2837,30 @@ mod row_builder_tests { #[test] fn test_run_end_encoded_row_builder() { - use arrow::array::{RunArray, Int32Array}; + use arrow::array::{Int32Array, RunArray}; use arrow::datatypes::Int32Type; - + // Create a run-end encoded array: [A, A, B, B, B, C] // run_ends: [2, 5, 6] // values: ["A", "B", "C"] let values = StringArray::from(vec!["A", "B", "C"]); let run_ends = Int32Array::from(vec![2, 5, 6]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(6); - + // Test sequential access (most common case) for i in 0..6 { let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(i, &mut variant_builder).unwrap(); variant_builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 6); - + // Verify the values assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 assert_eq!(variant_array.value(1), Variant::from("A")); // Run 0 @@ -2766,26 +2872,27 @@ mod row_builder_tests { #[test] fn test_run_end_encoded_random_access() { - use arrow::array::{RunArray, Int32Array}; + use arrow::array::{Int32Array, RunArray}; use arrow::datatypes::Int32Type; - + // Create a run-end encoded array: [A, A, B, B, B, C] let values = StringArray::from(vec!["A", "B", "C"]); let run_ends = Int32Array::from(vec![2, 5, 6]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); - + + let mut row_builder = + make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + // Test random 
access pattern (backward jumps, forward jumps) let access_pattern = [0, 5, 2, 4, 1, 3]; // Mix of all cases let expected_values = ["A", "C", "B", "B", "A", "B"]; - + for (i, &index) in access_pattern.iter().enumerate() { let mut array_builder = VariantArrayBuilder::new(1); let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(index, &mut variant_builder).unwrap(); variant_builder.finish(); - + let variant_array = array_builder.build(); assert_eq!(variant_array.value(0), Variant::from(expected_values[i])); } @@ -2793,27 +2900,28 @@ mod row_builder_tests { #[test] fn test_run_end_encoded_with_nulls() { - use arrow::array::{RunArray, Int32Array}; + use arrow::array::{Int32Array, RunArray}; use arrow::datatypes::Int32Type; - + // Create a run-end encoded array with null values: [A, A, null, null, B] let values = StringArray::from(vec![Some("A"), None, Some("B")]); let run_ends = Int32Array::from(vec![2, 4, 5]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(5); - + // Test sequential access for i in 0..5 { let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(i, &mut variant_builder).unwrap(); variant_builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 5); - + // Verify the values assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 assert_eq!(variant_array.value(1), Variant::from("A")); // Run 0 @@ -2826,86 +2934,89 @@ mod row_builder_tests { fn test_dictionary_row_builder() { use arrow::array::{DictionaryArray, Int32Array}; use arrow::datatypes::Int32Type; - + // Create a dictionary array: keys=[0, 1, 0, 2, 1], values=["apple", "banana", "cherry"] let values = 
StringArray::from(vec!["apple", "banana", "cherry"]); let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(5); - + // Test sequential access for i in 0..5 { let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(i, &mut variant_builder).unwrap(); variant_builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 5); - + // Verify the values match the dictionary lookup - assert_eq!(variant_array.value(0), Variant::from("apple")); // keys[0] = 0 -> values[0] = "apple" - assert_eq!(variant_array.value(1), Variant::from("banana")); // keys[1] = 1 -> values[1] = "banana" - assert_eq!(variant_array.value(2), Variant::from("apple")); // keys[2] = 0 -> values[0] = "apple" - assert_eq!(variant_array.value(3), Variant::from("cherry")); // keys[3] = 2 -> values[2] = "cherry" - assert_eq!(variant_array.value(4), Variant::from("banana")); // keys[4] = 1 -> values[1] = "banana" + assert_eq!(variant_array.value(0), Variant::from("apple")); // keys[0] = 0 -> values[0] = "apple" + assert_eq!(variant_array.value(1), Variant::from("banana")); // keys[1] = 1 -> values[1] = "banana" + assert_eq!(variant_array.value(2), Variant::from("apple")); // keys[2] = 0 -> values[0] = "apple" + assert_eq!(variant_array.value(3), Variant::from("cherry")); // keys[3] = 2 -> values[2] = "cherry" + assert_eq!(variant_array.value(4), Variant::from("banana")); // keys[4] = 1 -> values[1] = "banana" } #[test] fn test_dictionary_with_nulls() { use arrow::array::{DictionaryArray, Int32Array}; use arrow::datatypes::Int32Type; - + // Create a dictionary array with null keys: keys=[0, null, 1, null, 2], 
values=["x", "y", "z"] let values = StringArray::from(vec!["x", "y", "z"]); let keys = Int32Array::from(vec![Some(0), None, Some(1), None, Some(2)]); let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(5); - + // Test sequential access for i in 0..5 { let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(i, &mut variant_builder).unwrap(); variant_builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 5); - + // Verify the values and nulls - assert_eq!(variant_array.value(0), Variant::from("x")); // keys[0] = 0 -> values[0] = "x" - assert!(variant_array.is_null(1)); // keys[1] = null - assert_eq!(variant_array.value(2), Variant::from("y")); // keys[2] = 1 -> values[1] = "y" - assert!(variant_array.is_null(3)); // keys[3] = null - assert_eq!(variant_array.value(4), Variant::from("z")); // keys[4] = 2 -> values[2] = "z" + assert_eq!(variant_array.value(0), Variant::from("x")); // keys[0] = 0 -> values[0] = "x" + assert!(variant_array.is_null(1)); // keys[1] = null + assert_eq!(variant_array.value(2), Variant::from("y")); // keys[2] = 1 -> values[1] = "y" + assert!(variant_array.is_null(3)); // keys[3] = null + assert_eq!(variant_array.value(4), Variant::from("z")); // keys[4] = 2 -> values[2] = "z" } #[test] fn test_dictionary_random_access() { use arrow::array::{DictionaryArray, Int32Array}; use arrow::datatypes::Int32Type; - + // Create a dictionary array: keys=[0, 1, 2, 0, 1, 2], values=["red", "green", "blue"] let values = StringArray::from(vec!["red", "green", "blue"]); let keys = Int32Array::from(vec![0, 1, 2, 0, 1, 2]); let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - - let 
mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); - + + let mut row_builder = + make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + // Test random access pattern let access_pattern = [5, 0, 3, 1, 4, 2]; // Random order let expected_values = ["blue", "red", "red", "green", "green", "blue"]; - + for (i, &index) in access_pattern.iter().enumerate() { let mut array_builder = VariantArrayBuilder::new(1); let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(index, &mut variant_builder).unwrap(); variant_builder.finish(); - + let variant_array = array_builder.build(); assert_eq!(variant_array.value(0), Variant::from(expected_values[i])); } @@ -2914,50 +3025,70 @@ mod row_builder_tests { #[test] fn test_nested_dictionary() { use arrow::array::{DictionaryArray, Int32Array, StructArray}; - use arrow::datatypes::{Int32Type, Field}; - + use arrow::datatypes::{Field, Int32Type}; + // Create a dictionary with struct values let id_array = Int32Array::from(vec![1, 2, 3]); let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie"]); let struct_array = StructArray::from(vec![ - (Arc::new(Field::new("id", DataType::Int32, false)), Arc::new(id_array) as ArrayRef), - (Arc::new(Field::new("name", DataType::Utf8, false)), Arc::new(name_array) as ArrayRef), + ( + Arc::new(Field::new("id", DataType::Int32, false)), + Arc::new(id_array) as ArrayRef, + ), + ( + Arc::new(Field::new("name", DataType::Utf8, false)), + Arc::new(name_array) as ArrayRef, + ), ]); - + let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); - let dict_array = DictionaryArray::::try_new(keys, Arc::new(struct_array)).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + let dict_array = + DictionaryArray::::try_new(keys, Arc::new(struct_array)).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(dict_array.data_type(), 
&dict_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(5); - + // Test sequential access for i in 0..5 { let mut variant_builder = array_builder.variant_builder(); row_builder.append_row(i, &mut variant_builder).unwrap(); variant_builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 5); - + // Verify the nested struct values let first_variant = variant_array.value(0); assert_eq!(first_variant.get_object_field("id"), Some(Variant::from(1))); - assert_eq!(first_variant.get_object_field("name"), Some(Variant::from("Alice"))); - + assert_eq!( + first_variant.get_object_field("name"), + Some(Variant::from("Alice")) + ); + let second_variant = variant_array.value(1); - assert_eq!(second_variant.get_object_field("id"), Some(Variant::from(2))); - assert_eq!(second_variant.get_object_field("name"), Some(Variant::from("Bob"))); - + assert_eq!( + second_variant.get_object_field("id"), + Some(Variant::from(2)) + ); + assert_eq!( + second_variant.get_object_field("name"), + Some(Variant::from("Bob")) + ); + // Test that repeated keys give same values let third_variant = variant_array.value(2); assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(1))); - assert_eq!(third_variant.get_object_field("name"), Some(Variant::from("Alice"))); + assert_eq!( + third_variant.get_object_field("name"), + Some(Variant::from("Alice")) + ); } #[test] fn test_list_row_builder() { - use arrow::array::{ListArray}; + use arrow::array::ListArray; // Create a list array: [[1, 2], [3, 4, 5], null, []] let data = vec![ @@ -2967,28 +3098,29 @@ mod row_builder_tests { Some(vec![]), ]; let list_array = ListArray::from_iter_primitive::(data); - - let mut row_builder = make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); - 
+ for i in 0..list_array.len() { let mut builder = variant_array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = variant_array_builder.build(); - + // Verify results assert_eq!(variant_array.len(), 4); - + // Row 0: [1, 2] let row0 = variant_array.value(0); let list0 = row0.as_list().unwrap(); assert_eq!(list0.len(), 2); assert_eq!(list0.get(0), Some(Variant::from(1))); assert_eq!(list0.get(1), Some(Variant::from(2))); - + // Row 1: [3, 4, 5] let row1 = variant_array.value(1); let list1 = row1.as_list().unwrap(); @@ -2996,10 +3128,10 @@ mod row_builder_tests { assert_eq!(list1.get(0), Some(Variant::from(3))); assert_eq!(list1.get(1), Some(Variant::from(4))); assert_eq!(list1.get(2), Some(Variant::from(5))); - + // Row 2: null assert!(variant_array.is_null(2)); - + // Row 3: [] let row3 = variant_array.value(3); let list3 = row3.as_list().unwrap(); @@ -3008,43 +3140,41 @@ mod row_builder_tests { #[test] fn test_large_list_row_builder() { - use arrow::array::{LargeListArray}; + use arrow::array::LargeListArray; // Create a large list array: [[1, 2], null] - let data = vec![ - Some(vec![Some(1i64), Some(2i64)]), - None, - ]; + let data = vec![Some(vec![Some(1i64), Some(2i64)]), None]; let list_array = LargeListArray::from_iter_primitive::(data); - - let mut row_builder = make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); - + for i in 0..list_array.len() { let mut builder = variant_array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = variant_array_builder.build(); - + // Verify results assert_eq!(variant_array.len(), 2); - + // Row 0: [1, 2] let row0 = variant_array.value(0); let list0 = row0.as_list().unwrap(); 
assert_eq!(list0.len(), 2); assert_eq!(list0.get(0), Some(Variant::from(1i64))); assert_eq!(list0.get(1), Some(Variant::from(2i64))); - + // Row 1: null assert!(variant_array.is_null(1)); } #[test] fn test_sliced_list_row_builder() { - use arrow::array::{ListArray}; + use arrow::array::ListArray; // Create a list array: [[1, 2], [3, 4, 5], [6]] let data = vec![ @@ -3053,23 +3183,24 @@ mod row_builder_tests { Some(vec![Some(6)]), ]; let list_array = ListArray::from_iter_primitive::(data); - + // Slice to get just the middle element: [[3, 4, 5]] let sliced_array = list_array.slice(1, 1); - - let mut row_builder = make_arrow_to_variant_row_builder(sliced_array.data_type(), &sliced_array).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(sliced_array.data_type(), &sliced_array).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(sliced_array.len()); - + // Test the single row let mut builder = variant_array_builder.variant_builder(); row_builder.append_row(0, &mut builder).unwrap(); builder.finish(); - + let variant_array = variant_array_builder.build(); - + // Verify result assert_eq!(variant_array.len(), 1); - + // Row 0: [3, 4, 5] let row0 = variant_array.value(0); let list0 = row0.as_list().unwrap(); @@ -3081,19 +3212,16 @@ mod row_builder_tests { #[test] fn test_nested_list_row_builder() { - use arrow::array::{ListArray}; + use arrow::array::ListArray; use arrow::datatypes::Field; - + // Build the nested structure manually let inner_field = Arc::new(Field::new("item", DataType::Int32, true)); let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_field), true)); - - let values_data = vec![ - Some(vec![Some(1), Some(2)]), - Some(vec![Some(3)]), - ]; + + let values_data = vec![Some(vec![Some(1), Some(2)]), Some(vec![Some(3)])]; let values_list = ListArray::from_iter_primitive::(values_data); - + let outer_offsets = arrow::buffer::OffsetBuffer::new(vec![0i32, 2, 2].into()); let outer_list = ListArray::new( 
inner_list_field, @@ -3101,45 +3229,46 @@ mod row_builder_tests { Arc::new(values_list), Some(arrow::buffer::NullBuffer::from(vec![true, false])), ); - - let mut row_builder = make_arrow_to_variant_row_builder(outer_list.data_type(), &outer_list).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(outer_list.data_type(), &outer_list).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(outer_list.len()); - + for i in 0..outer_list.len() { let mut builder = variant_array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = variant_array_builder.build(); - + // Verify results assert_eq!(variant_array.len(), 2); - + // Row 0: [[1, 2], [3]] let row0 = variant_array.value(0); let outer_list0 = row0.as_list().unwrap(); assert_eq!(outer_list0.len(), 2); - + let inner_list0_0 = outer_list0.get(0).unwrap(); let inner_list0_0 = inner_list0_0.as_list().unwrap(); assert_eq!(inner_list0_0.len(), 2); assert_eq!(inner_list0_0.get(0), Some(Variant::from(1))); assert_eq!(inner_list0_0.get(1), Some(Variant::from(2))); - + let inner_list0_1 = outer_list0.get(1).unwrap(); let inner_list0_1 = inner_list0_1.as_list().unwrap(); assert_eq!(inner_list0_1.len(), 1); assert_eq!(inner_list0_1.get(0), Some(Variant::from(3))); - + // Row 1: null assert!(variant_array.is_null(1)); } #[test] fn test_map_row_builder() { - use arrow::array::{MapArray, Int32Array, StringArray, StructArray}; - use arrow::buffer::{OffsetBuffer, NullBuffer}; + use arrow::array::{Int32Array, MapArray, StringArray, StructArray}; + use arrow::buffer::{NullBuffer, OffsetBuffer}; use arrow::datatypes::{DataType, Field, Fields}; use std::sync::Arc; @@ -3158,21 +3287,21 @@ mod row_builder_tests { // Create offsets for 4 maps: [0..1], [1..1], [1..1], [1..3] // Map 0: {"key1": 1} (1 entry) - // Map 1: {} (0 entries - empty) + // Map 1: {} (0 entries - empty) // Map 2: null (0 entries but NULL via null buffer) // Map 3: {"key2": 
2, "key3": 3} (2 entries) let offsets = OffsetBuffer::new(vec![0, 1, 1, 1, 3].into()); - + // Create null buffer - map at index 2 is NULL let null_buffer = Some(NullBuffer::from(vec![true, true, false, true])); - + // Create the map field let map_field = Arc::new(Field::new( "entries", DataType::Struct(entries_fields), false, // Keys are non-nullable )); - + // Create MapArray using try_new let map_array = MapArray::try_new( map_field, @@ -3180,40 +3309,39 @@ mod row_builder_tests { entries, null_buffer, false, // not ordered - ).unwrap(); + ) + .unwrap(); - let mut row_builder = make_arrow_to_variant_row_builder( - map_array.data_type(), - &map_array - ).unwrap(); + let mut row_builder = + make_arrow_to_variant_row_builder(map_array.data_type(), &map_array).unwrap(); let mut variant_array_builder = VariantArrayBuilder::new(4); - + // Test each row for i in 0..4 { let mut builder = variant_array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = variant_array_builder.build(); - + // Verify results assert_eq!(variant_array.len(), 4); - + // Map 0: {"key1": 1} let map0 = variant_array.value(0); let obj0 = map0.as_object().unwrap(); assert_eq!(obj0.len(), 1); assert_eq!(obj0.get("key1"), Some(Variant::from(1))); - + // Map 1: {} (empty object, not null) let map1 = variant_array.value(1); let obj1 = map1.as_object().unwrap(); assert_eq!(obj1.len(), 0); // Empty object - + // Map 2: null (actual NULL) assert!(variant_array.is_null(2)); - + // Map 3: {"key2": 2, "key3": 3} let map3 = variant_array.value(3); let obj3 = map3.as_object().unwrap(); @@ -3224,7 +3352,7 @@ mod row_builder_tests { #[test] fn test_union_sparse_row_builder() { - use arrow::array::{Int32Array, Float64Array, StringArray, UnionArray}; + use arrow::array::{Float64Array, Int32Array, StringArray, UnionArray}; use arrow::buffer::ScalarBuffer; use arrow::datatypes::{DataType, Field, UnionFields}; use std::sync::Arc; @@ -3259,10 +3387,8 
@@ mod row_builder_tests { .unwrap(); // Test the row builder - let mut row_builder = make_arrow_to_variant_row_builder( - union_array.data_type(), - &union_array, - ).unwrap(); + let mut row_builder = + make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); let mut variant_builder = VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { @@ -3274,29 +3400,29 @@ mod row_builder_tests { // Verify results assert_eq!(variant_array.len(), 6); - + // Row 0: int 1 assert_eq!(variant_array.value(0), Variant::Int32(1)); - + // Row 1: float 3.2 assert_eq!(variant_array.value(1), Variant::Double(3.2)); - + // Row 2: string "hello" assert_eq!(variant_array.value(2), Variant::from("hello")); - + // Row 3: float 32.5 assert_eq!(variant_array.value(3), Variant::Double(32.5)); - + // Row 4: int 34 assert_eq!(variant_array.value(4), Variant::Int32(34)); - + // Row 5: null (int array has null at this position) assert!(variant_array.is_null(5)); } #[test] fn test_union_dense_row_builder() { - use arrow::array::{Int32Array, Float64Array, StringArray, UnionArray}; + use arrow::array::{Float64Array, Int32Array, StringArray, UnionArray}; use arrow::buffer::ScalarBuffer; use arrow::datatypes::{DataType, Field, UnionFields}; use std::sync::Arc; @@ -3334,10 +3460,8 @@ mod row_builder_tests { .unwrap(); // Test the row builder - let mut row_builder = make_arrow_to_variant_row_builder( - union_array.data_type(), - &union_array, - ).unwrap(); + let mut row_builder = + make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); let mut variant_builder = VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { @@ -3349,22 +3473,22 @@ mod row_builder_tests { // Verify results assert_eq!(variant_array.len(), 6); - + // Row 0: int 1 (offset 0 in int_array) assert_eq!(variant_array.value(0), Variant::Int32(1)); - + // Row 1: float 3.2 (offset 0 in float_array) assert_eq!(variant_array.value(1), 
Variant::Double(3.2)); - + // Row 2: string "hello" (offset 0 in string_array) assert_eq!(variant_array.value(2), Variant::from("hello")); - + // Row 3: float 32.5 (offset 1 in float_array) assert_eq!(variant_array.value(3), Variant::Double(32.5)); - + // Row 4: int 34 (offset 1 in int_array) assert_eq!(variant_array.value(4), Variant::Int32(34)); - + // Row 5: null (offset 2 in int_array, which has null) assert!(variant_array.is_null(5)); } @@ -3389,10 +3513,7 @@ mod row_builder_tests { ], ); - let children: Vec> = vec![ - Arc::new(int_array), - Arc::new(string_array), - ]; + let children: Vec> = vec![Arc::new(int_array), Arc::new(string_array)]; let union_array = UnionArray::try_new( union_fields, @@ -3403,10 +3524,8 @@ mod row_builder_tests { .unwrap(); // Test the row builder - let mut row_builder = make_arrow_to_variant_row_builder( - union_array.data_type(), - &union_array, - ).unwrap(); + let mut row_builder = + make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); let mut variant_builder = VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { @@ -3418,10 +3537,10 @@ mod row_builder_tests { // Verify results assert_eq!(variant_array.len(), 2); - + // Row 0: int 42 (type_id = 1) assert_eq!(variant_array.value(0), Variant::Int32(42)); - + // Row 1: string "test" (type_id = 3) assert_eq!(variant_array.value(1), Variant::from("test")); } @@ -3429,71 +3548,81 @@ mod row_builder_tests { #[test] fn test_decimal32_row_builder() { use arrow::array::Decimal32Array; - use parquet_variant::{VariantDecimal4}; + use parquet_variant::VariantDecimal4; // Test Decimal32Array with scale 2 (e.g., for currency: 12.34) let decimal_array = Decimal32Array::from(vec![Some(1234), None, Some(-5678)]) - .with_precision_and_scale(9, 2).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder( - decimal_array.data_type(), - &decimal_array, - ).unwrap(); + .with_precision_and_scale(9, 2) + .unwrap(); + + let mut row_builder 
= + make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..decimal_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: 12.34 (1234 with scale 2) - assert_eq!(variant_array.value(0), Variant::from(VariantDecimal4::try_new(1234, 2).unwrap())); - + assert_eq!( + variant_array.value(0), + Variant::from(VariantDecimal4::try_new(1234, 2).unwrap()) + ); + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: -56.78 (-5678 with scale 2) - assert_eq!(variant_array.value(2), Variant::from(VariantDecimal4::try_new(-5678, 2).unwrap())); + assert_eq!( + variant_array.value(2), + Variant::from(VariantDecimal4::try_new(-5678, 2).unwrap()) + ); } #[test] fn test_decimal128_row_builder() { use arrow::array::Decimal128Array; - use parquet_variant::{VariantDecimal16}; + use parquet_variant::VariantDecimal16; // Test Decimal128Array with negative scale (multiply by 10^|scale|) let decimal_array = Decimal128Array::from(vec![Some(123), None, Some(456)]) - .with_precision_and_scale(10, -2).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder( - decimal_array.data_type(), - &decimal_array, - ).unwrap(); + .with_precision_and_scale(10, -2) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..decimal_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: 123 * 10^2 = 12300 with scale 0 (negative scale handling) - assert_eq!(variant_array.value(0), 
Variant::from(VariantDecimal16::try_new(12300, 0).unwrap())); - + assert_eq!( + variant_array.value(0), + Variant::from(VariantDecimal16::try_new(12300, 0).unwrap()) + ); + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 456 * 10^2 = 45600 with scale 0 - assert_eq!(variant_array.value(2), Variant::from(VariantDecimal16::try_new(45600, 0).unwrap())); + assert_eq!( + variant_array.value(2), + Variant::from(VariantDecimal16::try_new(45600, 0).unwrap()) + ); } #[test] @@ -3504,29 +3633,31 @@ mod row_builder_tests { // Test Decimal256Array with a value that overflows i128 let large_value = i256::from_i128(i128::MAX) + i256::from(1); // Overflows i128 let decimal_array = Decimal256Array::from(vec![Some(large_value), Some(i256::from(123))]) - .with_precision_and_scale(76, 3).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder( - decimal_array.data_type(), - &decimal_array, - ).unwrap(); + .with_precision_and_scale(76, 3) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(2); - + for i in 0..decimal_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 2); - + // Row 0: overflow value becomes Variant::Null assert_eq!(variant_array.value(0), Variant::Null); - + // Row 1: normal value converts successfully - assert_eq!(variant_array.value(1), Variant::from(VariantDecimal16::try_new(123, 3).unwrap())); + assert_eq!( + variant_array.value(1), + Variant::from(VariantDecimal16::try_new(123, 3).unwrap()) + ); } #[test] @@ -3541,33 +3672,31 @@ mod row_builder_tests { Some(b"".as_slice()), // Empty binary ]; let binary_array = BinaryArray::from(binary_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - binary_array.data_type(), - &binary_array, - 
).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(binary_array.data_type(), &binary_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(4); - + for i in 0..binary_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 4); - + // Row 0: "hello" bytes assert_eq!(variant_array.value(0), Variant::from(b"hello".as_slice())); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: binary with special bytes let bytes = [0x00, 0x01, 0x02, 0xFF]; assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); - + // Row 3: empty binary let bytes = []; assert_eq!(variant_array.value(3), Variant::from(bytes.as_slice())); @@ -3584,31 +3713,36 @@ mod row_builder_tests { Some(b"another large chunk".as_slice()), ]; let large_binary_array = LargeBinaryArray::from(binary_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - large_binary_array.data_type(), - &large_binary_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(large_binary_array.data_type(), &large_binary_array) + .unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..large_binary_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: large binary data - assert_eq!(variant_array.value(0), Variant::from(b"large binary data".as_slice())); - + assert_eq!( + variant_array.value(0), + Variant::from(b"large binary data".as_slice()) + ); + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: another large chunk - assert_eq!(variant_array.value(2), Variant::from(b"another large chunk".as_slice())); + assert_eq!( + variant_array.value(2), + Variant::from(b"another 
large chunk".as_slice()) + ); } #[test] @@ -3622,31 +3756,33 @@ mod row_builder_tests { Some(b"this is a longer binary view that exceeds inline storage".as_slice()), ]; let binary_view_array = BinaryViewArray::from(binary_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - binary_view_array.data_type(), - &binary_view_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(binary_view_array.data_type(), &binary_view_array) + .unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..binary_view_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: short binary assert_eq!(variant_array.value(0), Variant::from(b"short".as_slice())); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: long binary view - assert_eq!(variant_array.value(2), Variant::from(b"this is a longer binary view that exceeds inline storage".as_slice())); + assert_eq!( + variant_array.value(2), + Variant::from(b"this is a longer binary view that exceeds inline storage".as_slice()) + ); } #[test] @@ -3659,34 +3795,32 @@ mod row_builder_tests { None, Some([0xFF, 0xFE, 0xFD, 0xFC]), ]; - let fixed_binary_array = FixedSizeBinaryArray::try_from_sparse_iter_with_size( - binary_data.into_iter(), - 4, - ).unwrap(); - - let mut row_builder = make_arrow_to_variant_row_builder( - fixed_binary_array.data_type(), - &fixed_binary_array, - ).unwrap(); + let fixed_binary_array = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(binary_data.into_iter(), 4) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(fixed_binary_array.data_type(), &fixed_binary_array) + .unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..fixed_binary_array.len() { let mut builder = array_builder.variant_builder(); 
row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: fixed size binary let bytes = [0x01, 0x02, 0x03, 0x04]; assert_eq!(variant_array.value(0), Variant::from(bytes.as_slice())); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: another fixed size binary let bytes = [0xFF, 0xFE, 0xFD, 0xFC]; assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); @@ -3703,31 +3837,35 @@ mod row_builder_tests { Some("this is a much longer string that will be stored out-of-line in the buffer"), ]; let string_view_array = StringViewArray::from(string_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - string_view_array.data_type(), - &string_view_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(string_view_array.data_type(), &string_view_array) + .unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..string_view_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: short string assert_eq!(variant_array.value(0), Variant::from("short")); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: long string view - assert_eq!(variant_array.value(2), Variant::from("this is a much longer string that will be stored out-of-line in the buffer")); + assert_eq!( + variant_array.value(2), + Variant::from( + "this is a much longer string that will be stored out-of-line in the buffer" + ) + ); } #[test] @@ -3741,30 +3879,29 @@ mod row_builder_tests { Some(1640995200), // 2022-01-01 00:00:00 UTC ]; let timestamp_array = TimestampSecondArray::from(timestamp_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - timestamp_array.data_type(), - ×tamp_array, - ).unwrap(); + + let mut 
row_builder = + make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) + .unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..timestamp_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: 2021-01-01 00:00:00 (no timezone -> NaiveDateTime -> TimestampNtzMicros) let expected_naive = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(0), Variant::from(expected_naive)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 2022-01-01 00:00:00 let expected_naive2 = DateTime::from_timestamp(1640995200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(2), Variant::from(expected_naive2)); @@ -3773,7 +3910,7 @@ mod row_builder_tests { #[test] fn test_timestamp_with_timezone_row_builder() { use arrow::array::TimestampMicrosecondArray; - use chrono::{DateTime}; + use chrono::DateTime; // Test TimestampMicrosecondArray with timezone let timestamp_data = vec![ @@ -3782,32 +3919,31 @@ mod row_builder_tests { Some(1640995200000000), // 2022-01-01 00:00:00 UTC (in microseconds) ]; let timezone = "UTC".to_string(); - let timestamp_array = TimestampMicrosecondArray::from(timestamp_data) - .with_timezone(timezone.clone()); - - let mut row_builder = make_arrow_to_variant_row_builder( - timestamp_array.data_type(), - ×tamp_array, - ).unwrap(); + let timestamp_array = + TimestampMicrosecondArray::from(timestamp_data).with_timezone(timezone.clone()); + + let mut row_builder = + make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) + .unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..timestamp_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = 
array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: 2021-01-01 00:00:00 UTC (with timezone -> DateTime -> TimestampMicros) let expected_utc = DateTime::from_timestamp(1609459200, 0).unwrap(); assert_eq!(variant_array.value(0), Variant::from(expected_utc)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 2022-01-01 00:00:00 UTC let expected_utc2 = DateTime::from_timestamp(1640995200, 0).unwrap(); assert_eq!(variant_array.value(2), Variant::from(expected_utc2)); @@ -3824,31 +3960,32 @@ mod row_builder_tests { Some(1609459200000000000), // 2021-01-01 00:00:00.000000000 UTC (no fractional seconds) ]; let timestamp_array = TimestampNanosecondArray::from(timestamp_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - timestamp_array.data_type(), - ×tamp_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) + .unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..timestamp_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: with nanoseconds -> should use TimestampNtzNanos - let expected_with_nanos = DateTime::from_timestamp(1609459200, 123456789).unwrap().naive_utc(); + let expected_with_nanos = DateTime::from_timestamp(1609459200, 123456789) + .unwrap() + .naive_utc(); assert_eq!(variant_array.value(0), Variant::from(expected_with_nanos)); - + // Row 1: null assert!(variant_array.is_null(1)); - - // Row 2: no fractional seconds -> should use TimestampNtzMicros + + // Row 2: no fractional seconds -> should use TimestampNtzMicros let expected_no_nanos = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(2), Variant::from(expected_no_nanos)); } @@ -3864,30 +4001,31 @@ mod row_builder_tests { 
Some(1609459200000), // 2021-01-01 00:00:00.000 UTC ]; let timestamp_array = TimestampMillisecondArray::from(timestamp_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - timestamp_array.data_type(), - ×tamp_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) + .unwrap(); let mut array_builder = VariantArrayBuilder::new(3); - + for i in 0..timestamp_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 3); - + // Row 0: with milliseconds -> TimestampNtzMicros (123ms = 123000000ns) - let expected_with_millis = DateTime::from_timestamp(1609459200, 123000000).unwrap().naive_utc(); + let expected_with_millis = DateTime::from_timestamp(1609459200, 123000000) + .unwrap() + .naive_utc(); assert_eq!(variant_array.value(0), Variant::from(expected_with_millis)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: no fractional seconds -> TimestampNtzMicros let expected_no_millis = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); assert_eq!(variant_array.value(2), Variant::from(expected_no_millis)); @@ -3900,40 +4038,38 @@ mod row_builder_tests { // Test Date32Array with various dates let date_data = vec![ - Some(0), // 1970-01-01 + Some(0), // 1970-01-01 None, - Some(19723), // 2024-01-01 (days since epoch) - Some(-719162), // 0001-01-01 (near minimum) + Some(19723), // 2024-01-01 (days since epoch) + Some(-719162), // 0001-01-01 (near minimum) ]; let date_array = Date32Array::from(date_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - date_array.data_type(), - &date_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(date_array.data_type(), &date_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(4); - + for i in 0..date_array.len() { 
let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 4); - + // Row 0: 1970-01-01 (epoch) let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 2024-01-01 let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); assert_eq!(variant_array.value(2), Variant::from(expected_2024)); - + // Row 3: 0001-01-01 (near minimum date) let expected_min = NaiveDate::from_ymd_opt(1, 1, 1).unwrap(); assert_eq!(variant_array.value(3), Variant::from(expected_min)); @@ -3946,40 +4082,38 @@ mod row_builder_tests { // Test Date64Array with various dates (milliseconds since epoch) let date_data = vec![ - Some(0), // 1970-01-01 + Some(0), // 1970-01-01 None, Some(1704067200000), // 2024-01-01 (milliseconds since epoch) Some(86400000), // 1970-01-02 ]; let date_array = Date64Array::from(date_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - date_array.data_type(), - &date_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(date_array.data_type(), &date_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(4); - + for i in 0..date_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 4); - + // Row 0: 1970-01-01 (epoch) let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 2024-01-01 let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); assert_eq!(variant_array.value(2), Variant::from(expected_2024)); - + // Row 3: 
1970-01-02 let expected_next_day = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap(); assert_eq!(variant_array.value(3), Variant::from(expected_next_day)); @@ -3992,40 +4126,38 @@ mod row_builder_tests { // Test Time32SecondArray with various times (seconds since midnight) let time_data = vec![ - Some(0), // 00:00:00 + Some(0), // 00:00:00 None, - Some(3661), // 01:01:01 - Some(86399), // 23:59:59 + Some(3661), // 01:01:01 + Some(86399), // 23:59:59 ]; let time_array = Time32SecondArray::from(time_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - time_array.data_type(), - &time_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(4); - + for i in 0..time_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 4); - + // Row 0: 00:00:00 (midnight) let expected_midnight = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 01:01:01 let expected_time = NaiveTime::from_hms_opt(1, 1, 1).unwrap(); assert_eq!(variant_array.value(2), Variant::from(expected_time)); - + // Row 3: 23:59:59 (last second of day) let expected_last = NaiveTime::from_hms_opt(23, 59, 59).unwrap(); assert_eq!(variant_array.value(3), Variant::from(expected_last)); @@ -4038,40 +4170,38 @@ mod row_builder_tests { // Test Time32MillisecondArray with various times (milliseconds since midnight) let time_data = vec![ - Some(0), // 00:00:00.000 + Some(0), // 00:00:00.000 None, - Some(3661123), // 01:01:01.123 - Some(86399999), // 23:59:59.999 + Some(3661123), // 01:01:01.123 + Some(86399999), // 23:59:59.999 ]; let time_array = Time32MillisecondArray::from(time_data); - - let mut 
row_builder = make_arrow_to_variant_row_builder( - time_array.data_type(), - &time_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(4); - + for i in 0..time_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 4); - + // Row 0: 00:00:00.000 (midnight) let expected_midnight = NaiveTime::from_hms_milli_opt(0, 0, 0, 0).unwrap(); assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 01:01:01.123 let expected_time = NaiveTime::from_hms_milli_opt(1, 1, 1, 123).unwrap(); assert_eq!(variant_array.value(2), Variant::from(expected_time)); - + // Row 3: 23:59:59.999 (last millisecond of day) let expected_last = NaiveTime::from_hms_milli_opt(23, 59, 59, 999).unwrap(); assert_eq!(variant_array.value(3), Variant::from(expected_last)); @@ -4084,40 +4214,38 @@ mod row_builder_tests { // Test Time64MicrosecondArray with various times (microseconds since midnight) let time_data = vec![ - Some(0), // 00:00:00.000000 + Some(0), // 00:00:00.000000 None, - Some(3661123456), // 01:01:01.123456 - Some(86399999999), // 23:59:59.999999 + Some(3661123456), // 01:01:01.123456 + Some(86399999999), // 23:59:59.999999 ]; let time_array = Time64MicrosecondArray::from(time_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - time_array.data_type(), - &time_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(4); - + for i in 0..time_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = 
array_builder.build(); assert_eq!(variant_array.len(), 4); - + // Row 0: 00:00:00.000000 (midnight) let expected_midnight = NaiveTime::from_hms_micro_opt(0, 0, 0, 0).unwrap(); assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 01:01:01.123456 let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); assert_eq!(variant_array.value(2), Variant::from(expected_time)); - + // Row 3: 23:59:59.999999 (last microsecond of day) let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); assert_eq!(variant_array.value(3), Variant::from(expected_last)); @@ -4130,40 +4258,38 @@ mod row_builder_tests { // Test Time64NanosecondArray with various times (nanoseconds since midnight) let time_data = vec![ - Some(0), // 00:00:00.000000000 + Some(0), // 00:00:00.000000000 None, - Some(3661123456789), // 01:01:01.123456789 - Some(86399999999999), // 23:59:59.999999999 + Some(3661123456789), // 01:01:01.123456789 + Some(86399999999999), // 23:59:59.999999999 ]; let time_array = Time64NanosecondArray::from(time_data); - - let mut row_builder = make_arrow_to_variant_row_builder( - time_array.data_type(), - &time_array, - ).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); let mut array_builder = VariantArrayBuilder::new(4); - + for i in 0..time_array.len() { let mut builder = array_builder.variant_builder(); row_builder.append_row(i, &mut builder).unwrap(); builder.finish(); } - + let variant_array = array_builder.build(); assert_eq!(variant_array.len(), 4); - + // Row 0: 00:00:00.000000000 (midnight) let expected_midnight = NaiveTime::from_hms_nano_opt(0, 0, 0, 0).unwrap(); assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - + // Row 1: null assert!(variant_array.is_null(1)); - + // Row 2: 01:01:01.123456789 -> truncated to 01:01:01.123456000 (microsecond precision) let 
expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); assert_eq!(variant_array.value(2), Variant::from(expected_time)); - + // Row 3: 23:59:59.999999999 -> truncated to 23:59:59.999999000 (microsecond precision) let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); assert_eq!(variant_array.value(3), Variant::from(expected_last)); diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index 12490b35dbd5..a7eb2467988a 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -1785,7 +1785,7 @@ impl<'o, 'v, 's> ObjectFieldBuilder<'o, 'v, 's> { impl VariantBuilderExt for ObjectFieldBuilder<'_, '_, '_> { /// A NULL object field is interpreted as missing, so nothing gets inserted at all. - fn append_null(&mut self) { } + fn append_null(&mut self) {} fn append_value<'m, 'v>(&mut self, value: impl Into>) { self.builder.insert(self.key, value); } From a8c667d1ec1890b92ef491d6d6e5a32e9e7dd254 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 16:43:38 -0700 Subject: [PATCH 47/53] diff minimization --- .../src/cast_to_variant.rs | 451 +++++++++--------- 1 file changed, 213 insertions(+), 238 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index bc9a07cc7a40..af15fbb69544 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -38,6 +38,219 @@ use parquet_variant::{ VariantDecimal8, }; +/// Casts a typed arrow [`Array`] to a [`VariantArray`]. This is useful when you +/// need to convert a specific data type +/// +/// # Arguments +/// * `input` - A reference to the input [`Array`] to cast +/// +/// # Notes +/// If the input array element is null, the corresponding element in the +/// output `VariantArray` will also be null (not `Variant::Null`). 
+/// +/// # Example +/// ``` +/// # use arrow::array::{Array, ArrayRef, Int64Array}; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::cast_to_variant::cast_to_variant; +/// // input is an Int64Array, which will be cast to a VariantArray +/// let input = Int64Array::from(vec![Some(1), None, Some(3)]); +/// let result = cast_to_variant(&input).unwrap(); +/// assert_eq!(result.len(), 3); +/// assert_eq!(result.value(0), Variant::Int64(1)); +/// assert!(result.is_null(1)); // note null, not Variant::Null +/// assert_eq!(result.value(2), Variant::Int64(3)); +/// ``` +/// +/// For `DataType::Timestamp`s: if the timestamp has any level of precision +/// greater than a microsecond, it will be truncated. For example +/// `1970-01-01T00:00:01.234567890Z` +/// will be truncated to +/// `1970-01-01T00:00:01.234567Z` +pub fn cast_to_variant(input: &dyn Array) -> Result { + // Create row builder for the input array type + let mut row_builder = make_arrow_to_variant_row_builder(input.data_type(), input)?; + + // Create output array builder + let mut array_builder = VariantArrayBuilder::new(input.len()); + + // Process each row using the row builder + for i in 0..input.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder)?; + builder.finish(); + } + + Ok(array_builder.build()) +} + +/// Factory function to create the appropriate row builder for a given DataType +fn make_arrow_to_variant_row_builder<'a>( + data_type: &'a DataType, + array: &'a dyn Array, +) -> Result, ArrowError> { + let builder = match data_type { + DataType::Null => ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder), + DataType::Boolean => { + ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array)) + } + DataType::Int8 => { + ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Int16 => { + 
ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Int32 => { + ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Int64 => { + ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::UInt8 => { + ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::UInt16 => { + ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::UInt32 => { + ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::UInt64 => { + ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Float16 => { + ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Float32 => { + ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Float64 => { + ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Decimal32(_, scale) => { + ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale)) + } + DataType::Decimal64(_, scale) => { + ArrowToVariantRowBuilder::Decimal64(Decimal64ArrowToVariantBuilder::new(array, *scale)) + } + DataType::Decimal128(_, scale) => ArrowToVariantRowBuilder::Decimal128( + Decimal128ArrowToVariantBuilder::new(array, *scale), + ), + DataType::Decimal256(_, scale) => ArrowToVariantRowBuilder::Decimal256( + Decimal256ArrowToVariantBuilder::new(array, *scale), + ), + DataType::Timestamp(time_unit, time_zone) => match time_unit { + TimeUnit::Second => ArrowToVariantRowBuilder::TimestampSecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + ), + TimeUnit::Millisecond => ArrowToVariantRowBuilder::TimestampMillisecond( + TimestampArrowToVariantBuilder::new(array, 
time_zone.is_some()), + ), + TimeUnit::Microsecond => ArrowToVariantRowBuilder::TimestampMicrosecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + ), + TimeUnit::Nanosecond => ArrowToVariantRowBuilder::TimestampNanosecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + ), + }, + DataType::Date32 => ArrowToVariantRowBuilder::Date32(DateArrowToVariantBuilder::new(array)), + DataType::Date64 => ArrowToVariantRowBuilder::Date64(DateArrowToVariantBuilder::new(array)), + DataType::Time32(time_unit) => match time_unit { + TimeUnit::Second => { + ArrowToVariantRowBuilder::Time32Second(TimeArrowToVariantBuilder::new(array)) + } + TimeUnit::Millisecond => { + ArrowToVariantRowBuilder::Time32Millisecond(TimeArrowToVariantBuilder::new(array)) + } + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported Time32 unit: {time_unit:?}" + ))) + } + }, + DataType::Time64(time_unit) => match time_unit { + TimeUnit::Microsecond => { + ArrowToVariantRowBuilder::Time64Microsecond(TimeArrowToVariantBuilder::new(array)) + } + TimeUnit::Nanosecond => { + ArrowToVariantRowBuilder::Time64Nanosecond(TimeArrowToVariantBuilder::new(array)) + } + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported Time64 unit: {time_unit:?}" + ))) + } + }, + DataType::Duration(_) | DataType::Interval(_) => { + return Err(ArrowError::InvalidArgumentError( + "Casting duration/interval types to Variant is not supported. \ + The Variant format does not define duration/interval types." 
+ .to_string(), + )) + } + DataType::Binary => { + ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array)) + } + DataType::LargeBinary => { + ArrowToVariantRowBuilder::LargeBinary(BinaryArrowToVariantBuilder::new(array)) + } + DataType::BinaryView => { + ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array)) + } + DataType::FixedSizeBinary(_) => ArrowToVariantRowBuilder::FixedSizeBinary( + FixedSizeBinaryArrowToVariantBuilder::new(array), + ), + DataType::Utf8 => ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array)), + DataType::LargeUtf8 => { + ArrowToVariantRowBuilder::LargeString(StringArrowToVariantBuilder::new(array)) + } + DataType::Utf8View => { + ArrowToVariantRowBuilder::Utf8View(Utf8ViewArrowToVariantBuilder::new(array)) + } + DataType::List(_) => ArrowToVariantRowBuilder::List(ListArrowToVariantBuilder::new(array)?), + DataType::LargeList(_) => { + ArrowToVariantRowBuilder::LargeList(ListArrowToVariantBuilder::new(array)?) + } + DataType::Struct(_) => { + ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?) + } + DataType::Map(_, _) => ArrowToVariantRowBuilder::Map(MapArrowToVariantBuilder::new(array)?), + DataType::Union(_, _) => { + ArrowToVariantRowBuilder::Union(UnionArrowToVariantBuilder::new(array)?) + } + DataType::Dictionary(_, _) => { + ArrowToVariantRowBuilder::Dictionary(DictionaryArrowToVariantBuilder::new(array)?) 
+ } + DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { + DataType::Int16 => ArrowToVariantRowBuilder::RunEndEncodedInt16( + RunEndEncodedArrowToVariantBuilder::new(array)?, + ), + DataType::Int32 => ArrowToVariantRowBuilder::RunEndEncodedInt32( + RunEndEncodedArrowToVariantBuilder::new(array)?, + ), + DataType::Int64 => ArrowToVariantRowBuilder::RunEndEncodedInt64( + RunEndEncodedArrowToVariantBuilder::new(array)?, + ), + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported run ends type: {:?}", + run_ends.data_type() + ))); + } + }, + dt => { + return Err(ArrowError::CastError(format!( + "Unsupported data type for casting to Variant: {dt:?}", + ))); + } + }; + Ok(builder) +} + +// TODO do we need a cast_with_options to allow specifying conversion behavior, +// e.g. how to handle overflows, whether to convert to Variant::Null or return +// an error, etc. ? + // ============================================================================ // Row-oriented builders for efficient Arrow-to-Variant conversion // ============================================================================ @@ -662,244 +875,6 @@ define_row_builder!( } ); -/// Factory function to create the appropriate row builder for a given DataType -fn make_arrow_to_variant_row_builder<'a>( - data_type: &'a DataType, - array: &'a dyn Array, -) -> Result, ArrowError> { - match data_type { - // All integer types - DataType::Int8 => Ok(ArrowToVariantRowBuilder::PrimitiveInt8( - PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::Int16 => Ok(ArrowToVariantRowBuilder::PrimitiveInt16( - PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::Int32 => Ok(ArrowToVariantRowBuilder::PrimitiveInt32( - PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::Int64 => Ok(ArrowToVariantRowBuilder::PrimitiveInt64( - PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::UInt8 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt8( - 
PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::UInt16 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt16( - PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::UInt32 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt32( - PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::UInt64 => Ok(ArrowToVariantRowBuilder::PrimitiveUInt64( - PrimitiveArrowToVariantBuilder::new(array), - )), - - // Float types - DataType::Float16 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat16( - PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::Float32 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat32( - PrimitiveArrowToVariantBuilder::new(array), - )), - DataType::Float64 => Ok(ArrowToVariantRowBuilder::PrimitiveFloat64( - PrimitiveArrowToVariantBuilder::new(array), - )), - - // Decimal types - DataType::Decimal32(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal32( - Decimal32ArrowToVariantBuilder::new(array, *scale), - )), - DataType::Decimal64(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal64( - Decimal64ArrowToVariantBuilder::new(array, *scale), - )), - DataType::Decimal128(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal128( - Decimal128ArrowToVariantBuilder::new(array, *scale), - )), - DataType::Decimal256(_, scale) => Ok(ArrowToVariantRowBuilder::Decimal256( - Decimal256ArrowToVariantBuilder::new(array, *scale), - )), - - // Special types - DataType::Boolean => Ok(ArrowToVariantRowBuilder::Boolean( - BooleanArrowToVariantBuilder::new(array), - )), - DataType::Utf8 => Ok(ArrowToVariantRowBuilder::String( - StringArrowToVariantBuilder::new(array), - )), - DataType::LargeUtf8 => Ok(ArrowToVariantRowBuilder::LargeString( - StringArrowToVariantBuilder::new(array), - )), - DataType::Utf8View => Ok(ArrowToVariantRowBuilder::Utf8View( - Utf8ViewArrowToVariantBuilder::new(array), - )), - - // Binary types - DataType::Binary => Ok(ArrowToVariantRowBuilder::Binary( - BinaryArrowToVariantBuilder::new(array), - )), - DataType::LargeBinary => 
Ok(ArrowToVariantRowBuilder::LargeBinary( - BinaryArrowToVariantBuilder::new(array), - )), - DataType::BinaryView => Ok(ArrowToVariantRowBuilder::BinaryView( - BinaryViewArrowToVariantBuilder::new(array), - )), - DataType::FixedSizeBinary(_) => Ok(ArrowToVariantRowBuilder::FixedSizeBinary( - FixedSizeBinaryArrowToVariantBuilder::new(array), - )), - - DataType::Struct(_) => Ok(ArrowToVariantRowBuilder::Struct( - StructArrowToVariantBuilder::new(array.as_struct())?, - )), - DataType::Null => Ok(ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder)), - - // Run-end encoded types - DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { - DataType::Int16 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt16( - RunEndEncodedArrowToVariantBuilder::new(array)?, - )), - DataType::Int32 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt32( - RunEndEncodedArrowToVariantBuilder::new(array)?, - )), - DataType::Int64 => Ok(ArrowToVariantRowBuilder::RunEndEncodedInt64( - RunEndEncodedArrowToVariantBuilder::new(array)?, - )), - _ => Err(ArrowError::CastError(format!( - "Unsupported run-end type: {run_ends:?}" - ))), - }, - - // Dictionary types - DataType::Dictionary(_, _) => Ok(ArrowToVariantRowBuilder::Dictionary( - DictionaryArrowToVariantBuilder::new(array)?, - )), - - // List types - DataType::List(_) => Ok(ArrowToVariantRowBuilder::List( - ListArrowToVariantBuilder::new(array)?, - )), - DataType::LargeList(_) => Ok(ArrowToVariantRowBuilder::LargeList( - ListArrowToVariantBuilder::new(array)?, - )), - - // Map types - DataType::Map(_, _) => Ok(ArrowToVariantRowBuilder::Map( - MapArrowToVariantBuilder::new(array)?, - )), - - // Union types - DataType::Union(_, _) => Ok(ArrowToVariantRowBuilder::Union( - UnionArrowToVariantBuilder::new(array)?, - )), - - // Timestamp types - DataType::Timestamp(time_unit, time_zone) => match time_unit { - TimeUnit::Second => Ok(ArrowToVariantRowBuilder::TimestampSecond( - TimestampArrowToVariantBuilder::new(array, 
time_zone.is_some()), - )), - TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::TimestampMillisecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), - )), - TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::TimestampMicrosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), - )), - TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::TimestampNanosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), - )), - }, - - // Date types - DataType::Date32 => Ok(ArrowToVariantRowBuilder::Date32( - DateArrowToVariantBuilder::new(array), - )), - DataType::Date64 => Ok(ArrowToVariantRowBuilder::Date64( - DateArrowToVariantBuilder::new(array), - )), - - // Time types - DataType::Time32(time_unit) => match time_unit { - TimeUnit::Second => Ok(ArrowToVariantRowBuilder::Time32Second( - TimeArrowToVariantBuilder::new(array), - )), - TimeUnit::Millisecond => Ok(ArrowToVariantRowBuilder::Time32Millisecond( - TimeArrowToVariantBuilder::new(array), - )), - _ => Err(ArrowError::CastError(format!( - "Unsupported Time32 unit: {time_unit:?}" - ))), - }, - DataType::Time64(time_unit) => match time_unit { - TimeUnit::Microsecond => Ok(ArrowToVariantRowBuilder::Time64Microsecond( - TimeArrowToVariantBuilder::new(array), - )), - TimeUnit::Nanosecond => Ok(ArrowToVariantRowBuilder::Time64Nanosecond( - TimeArrowToVariantBuilder::new(array), - )), - _ => Err(ArrowError::CastError(format!( - "Unsupported Time64 unit: {time_unit:?}" - ))), - }, - - DataType::Duration(_) | DataType::Interval(_) => Err(ArrowError::InvalidArgumentError( - "Casting duration/interval types to Variant is not supported. \ - The Variant format does not define duration/interval types." - .to_string(), - )), - _ => Err(ArrowError::CastError(format!( - "Unsupported type for row builder: {data_type:?}" - ))), - } -} - -/// Casts a typed arrow [`Array`] to a [`VariantArray`]. 
This is useful when you -/// need to convert a specific data type -/// -/// # Arguments -/// * `input` - A reference to the input [`Array`] to cast -/// -/// # Notes -/// If the input array element is null, the corresponding element in the -/// output `VariantArray` will also be null (not `Variant::Null`). -/// -/// # Example -/// ``` -/// # use arrow::array::{Array, ArrayRef, Int64Array}; -/// # use parquet_variant::Variant; -/// # use parquet_variant_compute::cast_to_variant::cast_to_variant; -/// // input is an Int64Array, which will be cast to a VariantArray -/// let input = Int64Array::from(vec![Some(1), None, Some(3)]); -/// let result = cast_to_variant(&input).unwrap(); -/// assert_eq!(result.len(), 3); -/// assert_eq!(result.value(0), Variant::Int64(1)); -/// assert!(result.is_null(1)); // note null, not Variant::Null -/// assert_eq!(result.value(2), Variant::Int64(3)); -/// ``` -/// -/// For `DataType::Timestamp`s: if the timestamp has any level of precision -/// greater than a microsecond, it will be truncated. For example -/// `1970-01-01T00:00:01.234567890Z` -/// will be truncated to -/// `1970-01-01T00:00:01.234567Z` -pub fn cast_to_variant(input: &dyn Array) -> Result { - // Create row builder for the input array type - let mut row_builder = make_arrow_to_variant_row_builder(input.data_type(), input)?; - - // Create output array builder - let mut array_builder = VariantArrayBuilder::new(input.len()); - - // Process each row using the row builder - for i in 0..input.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder)?; - builder.finish(); - } - - Ok(array_builder.build()) -} - -// TODO do we need a cast_with_options to allow specifying conversion behavior, -// e.g. how to handle overflows, whether to convert to Variant::Null or return -// an error, etc. ? 
- #[cfg(test)] mod tests { use super::*; From 583f7b832a213c04d388e48d4b89de4cd8ed3774 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 18:10:34 -0700 Subject: [PATCH 48/53] split out a new module for the builders --- .../src/arrow_to_variant.rs | 2424 ++++++++++++++++ .../src/cast_to_variant.rs | 2434 +---------------- parquet-variant-compute/src/lib.rs | 1 + 3 files changed, 2432 insertions(+), 2427 deletions(-) create mode 100644 parquet-variant-compute/src/arrow_to_variant.rs diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs new file mode 100644 index 000000000000..a3cc801b8a73 --- /dev/null +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -0,0 +1,2424 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::collections::HashMap; + +use crate::type_conversion::decimal_to_variant_decimal; +use arrow::array::{ + Array, AsArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, +}; +use arrow::compute::kernels::cast; +use arrow::datatypes::{ + ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, Date32Type, + Date64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, + Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, +}; +use arrow::temporal_conversions::{as_date, as_datetime, as_time}; +use arrow_schema::{ArrowError, DataType, TimeUnit}; +use chrono::{DateTime, TimeZone, Utc}; +use parquet_variant::{ + ObjectFieldBuilder, Variant, VariantBuilderExt, VariantDecimal16, VariantDecimal4, + VariantDecimal8, +}; + +// ============================================================================ +// Row-oriented builders for efficient Arrow-to-Variant conversion +// ============================================================================ + +/// Row builder for converting Arrow arrays to VariantArray row by row +pub(crate) enum ArrowToVariantRowBuilder<'a> { + Null(NullArrowToVariantBuilder), + Boolean(BooleanArrowToVariantBuilder<'a>), + PrimitiveInt8(PrimitiveArrowToVariantBuilder<'a, Int8Type>), + PrimitiveInt16(PrimitiveArrowToVariantBuilder<'a, Int16Type>), + PrimitiveInt32(PrimitiveArrowToVariantBuilder<'a, Int32Type>), + PrimitiveInt64(PrimitiveArrowToVariantBuilder<'a, Int64Type>), + PrimitiveUInt8(PrimitiveArrowToVariantBuilder<'a, UInt8Type>), + PrimitiveUInt16(PrimitiveArrowToVariantBuilder<'a, UInt16Type>), + PrimitiveUInt32(PrimitiveArrowToVariantBuilder<'a, UInt32Type>), + PrimitiveUInt64(PrimitiveArrowToVariantBuilder<'a, UInt64Type>), + 
PrimitiveFloat16(PrimitiveArrowToVariantBuilder<'a, Float16Type>), + PrimitiveFloat32(PrimitiveArrowToVariantBuilder<'a, Float32Type>), + PrimitiveFloat64(PrimitiveArrowToVariantBuilder<'a, Float64Type>), + Decimal32(Decimal32ArrowToVariantBuilder<'a>), + Decimal64(Decimal64ArrowToVariantBuilder<'a>), + Decimal128(Decimal128ArrowToVariantBuilder<'a>), + Decimal256(Decimal256ArrowToVariantBuilder<'a>), + TimestampSecond(TimestampArrowToVariantBuilder<'a, TimestampSecondType>), + TimestampMillisecond(TimestampArrowToVariantBuilder<'a, TimestampMillisecondType>), + TimestampMicrosecond(TimestampArrowToVariantBuilder<'a, TimestampMicrosecondType>), + TimestampNanosecond(TimestampArrowToVariantBuilder<'a, TimestampNanosecondType>), + Date32(DateArrowToVariantBuilder<'a, Date32Type>), + Date64(DateArrowToVariantBuilder<'a, Date64Type>), + Time32Second(TimeArrowToVariantBuilder<'a, Time32SecondType>), + Time32Millisecond(TimeArrowToVariantBuilder<'a, Time32MillisecondType>), + Time64Microsecond(TimeArrowToVariantBuilder<'a, Time64MicrosecondType>), + Time64Nanosecond(TimeArrowToVariantBuilder<'a, Time64NanosecondType>), + Binary(BinaryArrowToVariantBuilder<'a, i32>), + LargeBinary(BinaryArrowToVariantBuilder<'a, i64>), + BinaryView(BinaryViewArrowToVariantBuilder<'a>), + FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder<'a>), + Utf8(StringArrowToVariantBuilder<'a, i32>), + LargeUtf8(StringArrowToVariantBuilder<'a, i64>), + Utf8View(StringViewArrowToVariantBuilder<'a>), + List(ListArrowToVariantBuilder<'a, i32>), + LargeList(ListArrowToVariantBuilder<'a, i64>), + Struct(StructArrowToVariantBuilder<'a>), + Map(MapArrowToVariantBuilder<'a>), + Union(UnionArrowToVariantBuilder<'a>), + Dictionary(DictionaryArrowToVariantBuilder<'a>), + RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder<'a, Int16Type>), + RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder<'a, Int32Type>), + RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder<'a, Int64Type>), +} + +impl<'a> 
ArrowToVariantRowBuilder<'a> { + pub fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { + match self { + ArrowToVariantRowBuilder::Null(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveInt8(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveInt16(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveInt32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveInt64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveUInt8(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveUInt16(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveUInt32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveUInt64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveFloat16(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveFloat32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Decimal32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Decimal64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Decimal128(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Decimal256(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::TimestampSecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::TimestampMillisecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::TimestampMicrosecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::TimestampNanosecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Date32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Date64(b) => b.append_row(index, builder), + 
ArrowToVariantRowBuilder::Time32Second(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Time32Millisecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Time64Microsecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Time64Nanosecond(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Binary(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::LargeBinary(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::BinaryView(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::FixedSizeBinary(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Utf8(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::LargeUtf8(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Utf8View(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::List(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::LargeList(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Struct(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Map(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Union(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Dictionary(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::RunEndEncodedInt16(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::RunEndEncodedInt32(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::RunEndEncodedInt64(b) => b.append_row(index, builder), + } + } +} + +/// Factory function to create the appropriate row builder for a given DataType +pub(crate) fn make_arrow_to_variant_row_builder<'a>( + data_type: &'a DataType, + array: &'a dyn Array, +) -> Result, ArrowError> { + let builder = match data_type { + DataType::Null => ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder), + DataType::Boolean => { + ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array)) + } + DataType::Int8 => { + 
ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Int16 => { + ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Int32 => { + ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Int64 => { + ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::UInt8 => { + ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::UInt16 => { + ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::UInt32 => { + ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::UInt64 => { + ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Float16 => { + ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Float32 => { + ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Float64 => { + ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::new(array)) + } + DataType::Decimal32(_, scale) => { + ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale)) + } + DataType::Decimal64(_, scale) => { + ArrowToVariantRowBuilder::Decimal64(Decimal64ArrowToVariantBuilder::new(array, *scale)) + } + DataType::Decimal128(_, scale) => ArrowToVariantRowBuilder::Decimal128( + Decimal128ArrowToVariantBuilder::new(array, *scale), + ), + DataType::Decimal256(_, scale) => ArrowToVariantRowBuilder::Decimal256( + Decimal256ArrowToVariantBuilder::new(array, *scale), + ), + DataType::Timestamp(time_unit, time_zone) => match time_unit { + TimeUnit::Second => ArrowToVariantRowBuilder::TimestampSecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + ), + 
TimeUnit::Millisecond => ArrowToVariantRowBuilder::TimestampMillisecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + ), + TimeUnit::Microsecond => ArrowToVariantRowBuilder::TimestampMicrosecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + ), + TimeUnit::Nanosecond => ArrowToVariantRowBuilder::TimestampNanosecond( + TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), + ), + }, + DataType::Date32 => ArrowToVariantRowBuilder::Date32(DateArrowToVariantBuilder::new(array)), + DataType::Date64 => ArrowToVariantRowBuilder::Date64(DateArrowToVariantBuilder::new(array)), + DataType::Time32(time_unit) => match time_unit { + TimeUnit::Second => { + ArrowToVariantRowBuilder::Time32Second(TimeArrowToVariantBuilder::new(array)) + } + TimeUnit::Millisecond => { + ArrowToVariantRowBuilder::Time32Millisecond(TimeArrowToVariantBuilder::new(array)) + } + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported Time32 unit: {time_unit:?}" + ))) + } + }, + DataType::Time64(time_unit) => match time_unit { + TimeUnit::Microsecond => { + ArrowToVariantRowBuilder::Time64Microsecond(TimeArrowToVariantBuilder::new(array)) + } + TimeUnit::Nanosecond => { + ArrowToVariantRowBuilder::Time64Nanosecond(TimeArrowToVariantBuilder::new(array)) + } + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported Time64 unit: {time_unit:?}" + ))) + } + }, + DataType::Duration(_) | DataType::Interval(_) => { + return Err(ArrowError::InvalidArgumentError( + "Casting duration/interval types to Variant is not supported. \ + The Variant format does not define duration/interval types." 
+ .to_string(), + )) + } + DataType::Binary => { + ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array)) + } + DataType::LargeBinary => { + ArrowToVariantRowBuilder::LargeBinary(BinaryArrowToVariantBuilder::new(array)) + } + DataType::BinaryView => { + ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array)) + } + DataType::FixedSizeBinary(_) => ArrowToVariantRowBuilder::FixedSizeBinary( + FixedSizeBinaryArrowToVariantBuilder::new(array), + ), + DataType::Utf8 => ArrowToVariantRowBuilder::Utf8(StringArrowToVariantBuilder::new(array)), + DataType::LargeUtf8 => { + ArrowToVariantRowBuilder::LargeUtf8(StringArrowToVariantBuilder::new(array)) + } + DataType::Utf8View => { + ArrowToVariantRowBuilder::Utf8View(StringViewArrowToVariantBuilder::new(array)) + } + DataType::List(_) => ArrowToVariantRowBuilder::List(ListArrowToVariantBuilder::new(array)?), + DataType::LargeList(_) => { + ArrowToVariantRowBuilder::LargeList(ListArrowToVariantBuilder::new(array)?) + } + DataType::Struct(_) => { + ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?) + } + DataType::Map(_, _) => ArrowToVariantRowBuilder::Map(MapArrowToVariantBuilder::new(array)?), + DataType::Union(_, _) => { + ArrowToVariantRowBuilder::Union(UnionArrowToVariantBuilder::new(array)?) + } + DataType::Dictionary(_, _) => { + ArrowToVariantRowBuilder::Dictionary(DictionaryArrowToVariantBuilder::new(array)?) 
+ } + DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { + DataType::Int16 => ArrowToVariantRowBuilder::RunEndEncodedInt16( + RunEndEncodedArrowToVariantBuilder::new(array)?, + ), + DataType::Int32 => ArrowToVariantRowBuilder::RunEndEncodedInt32( + RunEndEncodedArrowToVariantBuilder::new(array)?, + ), + DataType::Int64 => ArrowToVariantRowBuilder::RunEndEncodedInt64( + RunEndEncodedArrowToVariantBuilder::new(array)?, + ), + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported run ends type: {:?}", + run_ends.data_type() + ))); + } + }, + dt => { + return Err(ArrowError::CastError(format!( + "Unsupported data type for casting to Variant: {dt:?}", + ))); + } + }; + Ok(builder) +} + +/// Macro to define (possibly generic) row builders with consistent structure and behavior. +/// Supports optional extra fields that are passed to the constructor. +macro_rules! define_row_builder { + ( + struct $name:ident<$lifetime:lifetime $(, $generic:ident: $($bound:path)+)?> + $(where $where_path:path: $where_bound:path)? + $({ $($field:ident: $field_type:ty),* $(,)? })?, + |$array_param:ident| -> $array_type:ty { $init_expr:expr }, + |$value:ident| $value_transform:expr + ) => { + pub(crate) struct $name<$lifetime $(, $generic: $($bound)+)?> + $(where $where_path: $where_bound)? + { + array: &$lifetime $array_type, + $($($field: $field_type,)*)? + } + + impl<$lifetime $(, $generic: $($bound)+)?> $name<$lifetime $(, $generic)?> + $(where $where_path: $where_bound)? + { + pub(crate) fn new($array_param: &$lifetime dyn Array $(, $($field: $field_type),*)?) -> Self { + Self { + array: $init_expr, + $($($field,)*)? + } + } + + fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + if self.array.is_null(index) { + builder.append_null(); + } else { + let $value = self.array.value(index); + // Capture fields as variables the transform can access (hygiene) + $($(let $field = &self.$field;)*)? 
+ builder.append_value($value_transform); + } + Ok(()) + } + } + }; +} + +define_row_builder!( + struct BooleanArrowToVariantBuilder<'a>, + |array| -> arrow::array::BooleanArray { array.as_boolean() }, + |value| value +); + +define_row_builder!( + struct PrimitiveArrowToVariantBuilder<'a, T: ArrowPrimitiveType> + where T::Native: Into>, + |array| -> PrimitiveArray { array.as_primitive() }, + |value| value +); + +define_row_builder!( + struct Decimal32ArrowToVariantBuilder<'a> { + scale: i8, + }, + |array| -> arrow::array::Decimal32Array { array.as_primitive() }, + |value| decimal_to_variant_decimal!(value, scale, i32, VariantDecimal4) +); + +define_row_builder!( + struct Decimal64ArrowToVariantBuilder<'a> { + scale: i8, + }, + |array| -> arrow::array::Decimal64Array { array.as_primitive() }, + |value| decimal_to_variant_decimal!(value, scale, i64, VariantDecimal8) +); + +define_row_builder!( + struct Decimal128ArrowToVariantBuilder<'a> { + scale: i8, + }, + |array| -> arrow::array::Decimal128Array { array.as_primitive() }, + |value| decimal_to_variant_decimal!(value, scale, i128, VariantDecimal16) +); + +define_row_builder!( + struct Decimal256ArrowToVariantBuilder<'a> { + scale: i8, + }, + |array| -> arrow::array::Decimal256Array { array.as_primitive() }, + |value| { + // Decimal256 needs special handling - convert to i128 if possible + match value.to_i128() { + Some(i128_val) => decimal_to_variant_decimal!(i128_val, scale, i128, VariantDecimal16), + None => Variant::Null, // Value too large for i128 + } + } +); + +define_row_builder!( + struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> { + has_time_zone: bool, + }, + |array| -> arrow::array::PrimitiveArray { array.as_primitive() }, + |value| { + // Convert using Arrow's temporal conversion functions + let Some(naive_datetime) = as_datetime::(value) else { + return Err(ArrowError::CastError( + "Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string(), + )); + }; + if 
*has_time_zone { + // Has timezone -> DateTime -> TimestampMicros/TimestampNanos + let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); + Variant::from(utc_dt) // Uses From> for Variant + } else { + // No timezone -> NaiveDateTime -> TimestampNtzMicros/TimestampNtzNanos + Variant::from(naive_datetime) // Uses From for Variant + } + } +); + +define_row_builder!( + struct DateArrowToVariantBuilder<'a, T: ArrowTemporalType> + where i64: From, + |array| -> PrimitiveArray { array.as_primitive() }, + |value| { + let date_value = i64::from(value); + as_date::(date_value).map(Variant::from).unwrap_or(Variant::Null) + } +); + +define_row_builder!( + struct TimeArrowToVariantBuilder<'a, T: ArrowTemporalType> + where i64: From, + |array| -> PrimitiveArray { array.as_primitive() }, + |value| { + let time_value = i64::from(value); + as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) + } +); + +define_row_builder!( + struct BinaryArrowToVariantBuilder<'a, O: OffsetSizeTrait>, + |array| -> GenericBinaryArray { array.as_binary() }, + |value| value +); + +define_row_builder!( + struct BinaryViewArrowToVariantBuilder<'a>, + |array| -> arrow::array::BinaryViewArray { array.as_byte_view() }, + |value| value +); + +define_row_builder!( + struct FixedSizeBinaryArrowToVariantBuilder<'a>, + |array| -> arrow::array::FixedSizeBinaryArray { array.as_fixed_size_binary() }, + |value| value +); + +define_row_builder!( + struct StringArrowToVariantBuilder<'a, O: OffsetSizeTrait>, + |array| -> GenericStringArray { array.as_string() }, + |value| value +); + +define_row_builder!( + struct StringViewArrowToVariantBuilder<'a>, + |array| -> arrow::array::StringViewArray { array.as_string_view() }, + |value| value +); + +/// Null builder that always appends null +pub(crate) struct NullArrowToVariantBuilder; + +impl NullArrowToVariantBuilder { + fn append_row( + &mut self, + _index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { + 
builder.append_null(); + Ok(()) + } +} + +/// Generic list builder for List and LargeList types +pub(crate) struct ListArrowToVariantBuilder<'a, O: OffsetSizeTrait> { + list_array: &'a arrow::array::GenericListArray, + values_builder: Box>, +} + +impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { + pub(crate) fn new(array: &'a dyn Array) -> Result { + let list_array = array.as_list(); + let values = list_array.values(); + let values_builder = + make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; + + Ok(Self { + list_array, + values_builder: Box::new(values_builder), + }) + } + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { + if self.list_array.is_null(index) { + builder.append_null(); + return Ok(()); + } + + let offsets = self.list_array.offsets(); + let start = offsets[index].as_usize(); + let end = offsets[index + 1].as_usize(); + + let mut list_builder = builder.try_new_list()?; + for value_index in start..end { + self.values_builder + .append_row(value_index, &mut list_builder)?; + } + list_builder.finish(); + Ok(()) + } +} + +/// Struct builder for StructArray +pub(crate) struct StructArrowToVariantBuilder<'a> { + struct_array: &'a arrow::array::StructArray, + field_builders: Vec<(&'a str, ArrowToVariantRowBuilder<'a>)>, +} + +impl<'a> StructArrowToVariantBuilder<'a> { + pub(crate) fn new(struct_array: &'a arrow::array::StructArray) -> Result { + let mut field_builders = Vec::new(); + + // Create a row builder for each field + for (field_name, field_array) in struct_array + .column_names() + .iter() + .zip(struct_array.columns().iter()) + { + let field_builder = + make_arrow_to_variant_row_builder(field_array.data_type(), field_array.as_ref())?; + field_builders.push((*field_name, field_builder)); + } + + Ok(Self { + struct_array, + field_builders, + }) + } + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> 
Result<(), ArrowError> { + if self.struct_array.is_null(index) { + builder.append_null(); + } else { + // Create object builder for this struct row + let mut obj_builder = builder.try_new_object()?; + + // Process each field + for (field_name, row_builder) in &mut self.field_builders { + let mut field_builder = + parquet_variant::ObjectFieldBuilder::new(field_name, &mut obj_builder); + row_builder.append_row(index, &mut field_builder)?; + } + + obj_builder.finish(); + } + Ok(()) + } +} + +/// Map builder for MapArray types +pub(crate) struct MapArrowToVariantBuilder<'a> { + map_array: &'a arrow::array::MapArray, + key_strings: arrow::array::StringArray, + values_builder: Box>, +} + +impl<'a> MapArrowToVariantBuilder<'a> { + pub(crate) fn new(array: &'a dyn Array) -> Result { + let map_array = array.as_map(); + + // Pre-cast keys to strings once (like existing convert_map code) + let keys = cast(map_array.keys(), &DataType::Utf8)?; + let key_strings = keys.as_string::().clone(); + + // Create recursive builder for values + let values = map_array.values(); + let values_builder = + make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; + + Ok(Self { + map_array, + key_strings, + values_builder: Box::new(values_builder), + }) + } + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { + // Check for NULL map first (via null bitmap) + if self.map_array.is_null(index) { + builder.append_null(); + return Ok(()); + } + + let offsets = self.map_array.offsets(); + let start = offsets[index].as_usize(); + let end = offsets[index + 1].as_usize(); + + // Create object builder for this map (even if empty) + let mut object_builder = builder.try_new_object()?; + + // Add each key-value pair (loop does nothing for empty maps - correct!) 
+        for kv_index in start..end {
+            let key = self.key_strings.value(kv_index);
+            let mut field_builder = ObjectFieldBuilder::new(key, &mut object_builder);
+            self.values_builder
+                .append_row(kv_index, &mut field_builder)?;
+        }
+
+        object_builder.finish(); // Empty map becomes empty object {}
+        Ok(())
+    }
+}
+
+/// Union builder for both sparse and dense union arrays
+pub(crate) struct UnionArrowToVariantBuilder<'a> {
+    /// The union array whose rows are being converted.
+    union_array: &'a arrow::array::UnionArray,
+    /// One row builder per union type id that occurs in `union_array`, keyed by that type id.
+    child_builders: HashMap<i8, Box<ArrowToVariantRowBuilder<'a>>>,
+}
+
+impl<'a> UnionArrowToVariantBuilder<'a> {
+    /// Creates a row builder for `array`, which is accessed through `as_union()`.
+    ///
+    /// A child row builder is created for every type id that appears in the array's
+    /// type-id buffer; children that no row refers to get no builder.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from `make_arrow_to_variant_row_builder` if a referenced child
+    /// array cannot be converted.
+    pub(crate) fn new(array: &'a dyn Array) -> Result<Self, ArrowError> {
+        let union_array = array.as_union();
+        let type_ids = union_array.type_ids();
+
+        // Create child builders for each union field.
+        // `type_ids` holds one entry per row, so the same id shows up many times; only the
+        // first occurrence builds the (potentially expensive) child row builder.
+        let mut child_builders = HashMap::new();
+        for &type_id in type_ids {
+            if let std::collections::hash_map::Entry::Vacant(slot) = child_builders.entry(type_id)
+            {
+                let child_array = union_array.child(type_id);
+                let child_builder = make_arrow_to_variant_row_builder(
+                    child_array.data_type(),
+                    child_array.as_ref(),
+                )?;
+                slot.insert(Box::new(child_builder));
+            }
+        }
+
+        Ok(Self {
+            union_array,
+            child_builders,
+        })
+    }
+
+    /// Appends the value of union row `index` to `builder`.
+    ///
+    /// The row's type id selects the child builder and the row's value offset selects the
+    /// row within that child. If no child builder exists for the type id, a null is appended.
+    fn append_row(
+        &mut self,
+        index: usize,
+        builder: &mut impl VariantBuilderExt,
+    ) -> Result<(), ArrowError> {
+        let type_id = self.union_array.type_id(index);
+        let value_offset = self.union_array.value_offset(index);
+
+        // Delegate to the appropriate child builder, or append null to handle an invalid type_id
+        match self.child_builders.get_mut(&type_id) {
+            Some(child_builder) => child_builder.append_row(value_offset, builder)?,
+            None => builder.append_null(),
+        }
+
+        Ok(())
+    }
+}
+
+/// Dictionary array builder with simple O(1) indexing
+pub(crate) struct DictionaryArrowToVariantBuilder<'a> {
+    keys: &'a dyn Array, // only needed for null checks
+    normalized_keys: Vec<usize>,
+    values_builder: Box<ArrowToVariantRowBuilder<'a>>,
+}
+
+impl<'a> DictionaryArrowToVariantBuilder<'a> {
+    pub(crate) fn new(array: &'a dyn Array) -> Result<Self, ArrowError> {
+        let dict_array = array.as_any_dictionary();
+        let values = dict_array.values();
+        let values_builder =
+ make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; + + // WARNING: normalized_keys panics if values is empty + let normalized_keys = match values.len() { + 0 => Vec::new(), + _ => dict_array.normalized_keys(), + }; + + Ok(Self { + keys: dict_array.keys(), + normalized_keys, + values_builder: Box::new(values_builder), + }) + } + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { + if self.keys.is_null(index) { + builder.append_null(); + } else { + let normalized_key = self.normalized_keys[index]; + self.values_builder.append_row(normalized_key, builder)?; + } + Ok(()) + } +} + +/// Run-end encoded array builder with efficient sequential access +pub(crate) struct RunEndEncodedArrowToVariantBuilder<'a, R: RunEndIndexType> { + run_array: &'a arrow::array::RunArray, + values_builder: Box>, + + run_ends: &'a [R::Native], + run_number: usize, // Physical index into run_ends and values + run_start: usize, // Logical start index of current run +} + +impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { + pub(crate) fn new(array: &'a dyn Array) -> Result { + let Some(run_array) = array.as_run_opt() else { + return Err(ArrowError::CastError("Expected RunArray".to_string())); + }; + + let values = run_array.values(); + let values_builder = + make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; + + Ok(Self { + run_array, + values_builder: Box::new(values_builder), + run_ends: run_array.run_ends().values(), + run_number: 0, + run_start: 0, + }) + } + + fn append_row( + &mut self, + index: usize, + builder: &mut impl VariantBuilderExt, + ) -> Result<(), ArrowError> { + self.set_run_for_index(index)?; + + // Handle null values + if self.run_array.values().is_null(self.run_number) { + builder.append_null(); + return Ok(()); + } + + // Re-encode the value + self.values_builder.append_row(self.run_number, builder)?; + + Ok(()) + } + + fn 
set_run_for_index(&mut self, index: usize) -> Result<(), ArrowError> { + if index >= self.run_start { + let Some(run_end) = self.run_ends.get(self.run_number) else { + return Err(ArrowError::CastError(format!( + "Index {index} beyond run array" + ))); + }; + if index < run_end.as_usize() { + return Ok(()); + } + if index == run_end.as_usize() { + self.run_number += 1; + self.run_start = run_end.as_usize(); + return Ok(()); + } + } + + // Use partition_point for all non-sequential cases + let run_number = self + .run_ends + .partition_point(|&run_end| run_end.as_usize() <= index); + if run_number >= self.run_ends.len() { + return Err(ArrowError::CastError(format!( + "Index {index} beyond run array" + ))); + } + self.run_number = run_number; + self.run_start = match run_number { + 0 => 0, + _ => self.run_ends[run_number - 1].as_usize(), + }; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::VariantArrayBuilder; + use arrow::array::{ArrayRef, BooleanArray, Int32Array, StringArray}; + use std::sync::Arc; + + #[test] + fn test_primitive_row_builder() { + // Test Int32Array + let int_array = Int32Array::from(vec![Some(42), None, Some(100)]); + let mut row_builder = + make_arrow_to_variant_row_builder(int_array.data_type(), &int_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + // Test first value + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(0, &mut variant_builder).unwrap(); + variant_builder.finish(); + + // Test null value + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(1, &mut variant_builder).unwrap(); + variant_builder.finish(); + + // Test second value + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(2, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + assert_eq!(variant_array.value(0), 
Variant::Int32(42)); + assert!(variant_array.is_null(1)); + assert_eq!(variant_array.value(2), Variant::Int32(100)); + } + + #[test] + fn test_string_row_builder() { + let string_array = StringArray::from(vec![Some("hello"), None, Some("world")]); + let mut row_builder = + make_arrow_to_variant_row_builder(string_array.data_type(), &string_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(0, &mut variant_builder).unwrap(); + variant_builder.finish(); + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(1, &mut variant_builder).unwrap(); + variant_builder.finish(); + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(2, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + assert_eq!(variant_array.value(0), Variant::from("hello")); + assert!(variant_array.is_null(1)); + assert_eq!(variant_array.value(2), Variant::from("world")); + } + + #[test] + fn test_boolean_row_builder() { + let bool_array = BooleanArray::from(vec![Some(true), None, Some(false)]); + let mut row_builder = + make_arrow_to_variant_row_builder(bool_array.data_type(), &bool_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(0, &mut variant_builder).unwrap(); + variant_builder.finish(); + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(1, &mut variant_builder).unwrap(); + variant_builder.finish(); + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(2, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + assert_eq!(variant_array.value(0), Variant::from(true)); + 
assert!(variant_array.is_null(1)); + assert_eq!(variant_array.value(2), Variant::from(false)); + } + + #[test] + fn test_struct_row_builder() { + use arrow::array::{ArrayRef, Int32Array, StringArray, StructArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + // Create a struct array with int and string fields + let int_field = Field::new("id", DataType::Int32, true); + let string_field = Field::new("name", DataType::Utf8, true); + + let int_array = Int32Array::from(vec![Some(1), None, Some(3)]); + let string_array = StringArray::from(vec![Some("Alice"), Some("Bob"), None]); + + let struct_array = StructArray::try_new( + vec![int_field, string_field].into(), + vec![ + Arc::new(int_array) as ArrayRef, + Arc::new(string_array) as ArrayRef, + ], + None, + ) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(struct_array.data_type(), &struct_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + // Test first row + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(0, &mut variant_builder).unwrap(); + variant_builder.finish(); + + // Test second row (with null int field) + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(1, &mut variant_builder).unwrap(); + variant_builder.finish(); + + // Test third row (with null string field) + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(2, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Check first row - should have both fields + let first_variant = variant_array.value(0); + assert_eq!(first_variant.get_object_field("id"), Some(Variant::from(1))); + assert_eq!( + first_variant.get_object_field("name"), + Some(Variant::from("Alice")) + ); + + // Check second row - should have name field but not id (null field omitted) + let second_variant = 
variant_array.value(1); + assert_eq!(second_variant.get_object_field("id"), None); // null field omitted + assert_eq!( + second_variant.get_object_field("name"), + Some(Variant::from("Bob")) + ); + + // Check third row - should have id field but not name (null field omitted) + let third_variant = variant_array.value(2); + assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(3))); + assert_eq!(third_variant.get_object_field("name"), None); // null field omitted + } + + #[test] + fn test_run_end_encoded_row_builder() { + use arrow::array::{Int32Array, RunArray}; + use arrow::datatypes::Int32Type; + + // Create a run-end encoded array: [A, A, B, B, B, C] + // run_ends: [2, 5, 6] + // values: ["A", "B", "C"] + let values = StringArray::from(vec!["A", "B", "C"]); + let run_ends = Int32Array::from(vec![2, 5, 6]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(6); + + // Test sequential access (most common case) + for i in 0..6 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 6); + + // Verify the values + assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 + assert_eq!(variant_array.value(1), Variant::from("A")); // Run 0 + assert_eq!(variant_array.value(2), Variant::from("B")); // Run 1 + assert_eq!(variant_array.value(3), Variant::from("B")); // Run 1 + assert_eq!(variant_array.value(4), Variant::from("B")); // Run 1 + assert_eq!(variant_array.value(5), Variant::from("C")); // Run 2 + } + + #[test] + fn test_run_end_encoded_random_access() { + use arrow::array::{Int32Array, RunArray}; + use arrow::datatypes::Int32Type; + + // Create a run-end encoded array: [A, A, B, B, B, C] + let 
values = StringArray::from(vec!["A", "B", "C"]); + let run_ends = Int32Array::from(vec![2, 5, 6]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + + // Test random access pattern (backward jumps, forward jumps) + let access_pattern = [0, 5, 2, 4, 1, 3]; // Mix of all cases + let expected_values = ["A", "C", "B", "B", "A", "B"]; + + for (i, &index) in access_pattern.iter().enumerate() { + let mut array_builder = VariantArrayBuilder::new(1); + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(index, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.value(0), Variant::from(expected_values[i])); + } + } + + #[test] + fn test_run_end_encoded_with_nulls() { + use arrow::array::{Int32Array, RunArray}; + use arrow::datatypes::Int32Type; + + // Create a run-end encoded array with null values: [A, A, null, null, B] + let values = StringArray::from(vec![Some("A"), None, Some("B")]); + let run_ends = Int32Array::from(vec![2, 4, 5]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(5); + + // Test sequential access + for i in 0..5 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 5); + + // Verify the values + assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 + assert_eq!(variant_array.value(1), Variant::from("A")); // Run 0 + assert!(variant_array.is_null(2)); // Run 1 (null) + assert!(variant_array.is_null(3)); // Run 1 (null) + assert_eq!(variant_array.value(4), 
Variant::from("B")); // Run 2 + } + + #[test] + fn test_dictionary_row_builder() { + use arrow::array::{DictionaryArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a dictionary array: keys=[0, 1, 0, 2, 1], values=["apple", "banana", "cherry"] + let values = StringArray::from(vec!["apple", "banana", "cherry"]); + let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); + let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(5); + + // Test sequential access + for i in 0..5 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 5); + + // Verify the values match the dictionary lookup + assert_eq!(variant_array.value(0), Variant::from("apple")); // keys[0] = 0 -> values[0] = "apple" + assert_eq!(variant_array.value(1), Variant::from("banana")); // keys[1] = 1 -> values[1] = "banana" + assert_eq!(variant_array.value(2), Variant::from("apple")); // keys[2] = 0 -> values[0] = "apple" + assert_eq!(variant_array.value(3), Variant::from("cherry")); // keys[3] = 2 -> values[2] = "cherry" + assert_eq!(variant_array.value(4), Variant::from("banana")); // keys[4] = 1 -> values[1] = "banana" + } + + #[test] + fn test_dictionary_with_nulls() { + use arrow::array::{DictionaryArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a dictionary array with null keys: keys=[0, null, 1, null, 2], values=["x", "y", "z"] + let values = StringArray::from(vec!["x", "y", "z"]); + let keys = Int32Array::from(vec![Some(0), None, Some(1), None, Some(2)]); + let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); + + let mut row_builder = + 
make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(5); + + // Test sequential access + for i in 0..5 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 5); + + // Verify the values and nulls + assert_eq!(variant_array.value(0), Variant::from("x")); // keys[0] = 0 -> values[0] = "x" + assert!(variant_array.is_null(1)); // keys[1] = null + assert_eq!(variant_array.value(2), Variant::from("y")); // keys[2] = 1 -> values[1] = "y" + assert!(variant_array.is_null(3)); // keys[3] = null + assert_eq!(variant_array.value(4), Variant::from("z")); // keys[4] = 2 -> values[2] = "z" + } + + #[test] + fn test_dictionary_random_access() { + use arrow::array::{DictionaryArray, Int32Array}; + use arrow::datatypes::Int32Type; + + // Create a dictionary array: keys=[0, 1, 2, 0, 1, 2], values=["red", "green", "blue"] + let values = StringArray::from(vec!["red", "green", "blue"]); + let keys = Int32Array::from(vec![0, 1, 2, 0, 1, 2]); + let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + + // Test random access pattern + let access_pattern = [5, 0, 3, 1, 4, 2]; // Random order + let expected_values = ["blue", "red", "red", "green", "green", "blue"]; + + for (i, &index) in access_pattern.iter().enumerate() { + let mut array_builder = VariantArrayBuilder::new(1); + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(index, &mut variant_builder).unwrap(); + variant_builder.finish(); + + let variant_array = array_builder.build(); + assert_eq!(variant_array.value(0), Variant::from(expected_values[i])); + } + } + + #[test] + fn test_nested_dictionary() { + use 
arrow::array::{DictionaryArray, Int32Array, StructArray}; + use arrow::datatypes::{Field, Int32Type}; + + // Create a dictionary with struct values + let id_array = Int32Array::from(vec![1, 2, 3]); + let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie"]); + let struct_array = StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int32, false)), + Arc::new(id_array) as ArrayRef, + ), + ( + Arc::new(Field::new("name", DataType::Utf8, false)), + Arc::new(name_array) as ArrayRef, + ), + ]); + + let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); + let dict_array = + DictionaryArray::::try_new(keys, Arc::new(struct_array)).unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); + let mut array_builder = VariantArrayBuilder::new(5); + + // Test sequential access + for i in 0..5 { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut variant_builder).unwrap(); + variant_builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 5); + + // Verify the nested struct values + let first_variant = variant_array.value(0); + assert_eq!(first_variant.get_object_field("id"), Some(Variant::from(1))); + assert_eq!( + first_variant.get_object_field("name"), + Some(Variant::from("Alice")) + ); + + let second_variant = variant_array.value(1); + assert_eq!( + second_variant.get_object_field("id"), + Some(Variant::from(2)) + ); + assert_eq!( + second_variant.get_object_field("name"), + Some(Variant::from("Bob")) + ); + + // Test that repeated keys give same values + let third_variant = variant_array.value(2); + assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(1))); + assert_eq!( + third_variant.get_object_field("name"), + Some(Variant::from("Alice")) + ); + } + + #[test] + fn test_list_row_builder() { + use arrow::array::ListArray; + + // Create a list array: [[1, 2], [3, 4, 5], null, []] + let data = 
vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + None, + Some(vec![]), + ]; + let list_array = ListArray::from_iter_primitive::(data); + + let mut row_builder = + make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); + + for i in 0..list_array.len() { + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = variant_array_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 4); + + // Row 0: [1, 2] + let row0 = variant_array.value(0); + let list0 = row0.as_list().unwrap(); + assert_eq!(list0.len(), 2); + assert_eq!(list0.get(0), Some(Variant::from(1))); + assert_eq!(list0.get(1), Some(Variant::from(2))); + + // Row 1: [3, 4, 5] + let row1 = variant_array.value(1); + let list1 = row1.as_list().unwrap(); + assert_eq!(list1.len(), 3); + assert_eq!(list1.get(0), Some(Variant::from(3))); + assert_eq!(list1.get(1), Some(Variant::from(4))); + assert_eq!(list1.get(2), Some(Variant::from(5))); + + // Row 2: null + assert!(variant_array.is_null(2)); + + // Row 3: [] + let row3 = variant_array.value(3); + let list3 = row3.as_list().unwrap(); + assert_eq!(list3.len(), 0); + } + + #[test] + fn test_large_list_row_builder() { + use arrow::array::LargeListArray; + + // Create a large list array: [[1, 2], null] + let data = vec![Some(vec![Some(1i64), Some(2i64)]), None]; + let list_array = LargeListArray::from_iter_primitive::(data); + + let mut row_builder = + make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); + + for i in 0..list_array.len() { + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = 
variant_array_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 2); + + // Row 0: [1, 2] + let row0 = variant_array.value(0); + let list0 = row0.as_list().unwrap(); + assert_eq!(list0.len(), 2); + assert_eq!(list0.get(0), Some(Variant::from(1i64))); + assert_eq!(list0.get(1), Some(Variant::from(2i64))); + + // Row 1: null + assert!(variant_array.is_null(1)); + } + + #[test] + fn test_sliced_list_row_builder() { + use arrow::array::ListArray; + + // Create a list array: [[1, 2], [3, 4, 5], [6]] + let data = vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3), Some(4), Some(5)]), + Some(vec![Some(6)]), + ]; + let list_array = ListArray::from_iter_primitive::(data); + + // Slice to get just the middle element: [[3, 4, 5]] + let sliced_array = list_array.slice(1, 1); + + let mut row_builder = + make_arrow_to_variant_row_builder(sliced_array.data_type(), &sliced_array).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(sliced_array.len()); + + // Test the single row + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(0, &mut builder).unwrap(); + builder.finish(); + + let variant_array = variant_array_builder.build(); + + // Verify result + assert_eq!(variant_array.len(), 1); + + // Row 0: [3, 4, 5] + let row0 = variant_array.value(0); + let list0 = row0.as_list().unwrap(); + assert_eq!(list0.len(), 3); + assert_eq!(list0.get(0), Some(Variant::from(3))); + assert_eq!(list0.get(1), Some(Variant::from(4))); + assert_eq!(list0.get(2), Some(Variant::from(5))); + } + + #[test] + fn test_nested_list_row_builder() { + use arrow::array::ListArray; + use arrow::datatypes::Field; + + // Build the nested structure manually + let inner_field = Arc::new(Field::new("item", DataType::Int32, true)); + let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_field), true)); + + let values_data = vec![Some(vec![Some(1), Some(2)]), Some(vec![Some(3)])]; + let values_list = 
ListArray::from_iter_primitive::(values_data); + + let outer_offsets = arrow::buffer::OffsetBuffer::new(vec![0i32, 2, 2].into()); + let outer_list = ListArray::new( + inner_list_field, + outer_offsets, + Arc::new(values_list), + Some(arrow::buffer::NullBuffer::from(vec![true, false])), + ); + + let mut row_builder = + make_arrow_to_variant_row_builder(outer_list.data_type(), &outer_list).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(outer_list.len()); + + for i in 0..outer_list.len() { + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = variant_array_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 2); + + // Row 0: [[1, 2], [3]] + let row0 = variant_array.value(0); + let outer_list0 = row0.as_list().unwrap(); + assert_eq!(outer_list0.len(), 2); + + let inner_list0_0 = outer_list0.get(0).unwrap(); + let inner_list0_0 = inner_list0_0.as_list().unwrap(); + assert_eq!(inner_list0_0.len(), 2); + assert_eq!(inner_list0_0.get(0), Some(Variant::from(1))); + assert_eq!(inner_list0_0.get(1), Some(Variant::from(2))); + + let inner_list0_1 = outer_list0.get(1).unwrap(); + let inner_list0_1 = inner_list0_1.as_list().unwrap(); + assert_eq!(inner_list0_1.len(), 1); + assert_eq!(inner_list0_1.get(0), Some(Variant::from(3))); + + // Row 1: null + assert!(variant_array.is_null(1)); + } + + #[test] + fn test_map_row_builder() { + use arrow::array::{Int32Array, MapArray, StringArray, StructArray}; + use arrow::buffer::{NullBuffer, OffsetBuffer}; + use arrow::datatypes::{DataType, Field, Fields}; + use std::sync::Arc; + + // Create the entries struct array (key-value pairs) + let keys = StringArray::from(vec!["key1", "key2", "key3"]); + let values = Int32Array::from(vec![1, 2, 3]); + let entries_fields = Fields::from(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Int32, true), + ]); + let entries 
= StructArray::new( + entries_fields.clone(), + vec![Arc::new(keys), Arc::new(values)], + None, // No nulls in the entries themselves + ); + + // Create offsets for 4 maps: [0..1], [1..1], [1..1], [1..3] + // Map 0: {"key1": 1} (1 entry) + // Map 1: {} (0 entries - empty) + // Map 2: null (0 entries but NULL via null buffer) + // Map 3: {"key2": 2, "key3": 3} (2 entries) + let offsets = OffsetBuffer::new(vec![0, 1, 1, 1, 3].into()); + + // Create null buffer - map at index 2 is NULL + let null_buffer = Some(NullBuffer::from(vec![true, true, false, true])); + + // Create the map field + let map_field = Arc::new(Field::new( + "entries", + DataType::Struct(entries_fields), + false, // Keys are non-nullable + )); + + // Create MapArray using try_new + let map_array = MapArray::try_new( + map_field, + offsets, + entries, + null_buffer, + false, // not ordered + ) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(map_array.data_type(), &map_array).unwrap(); + let mut variant_array_builder = VariantArrayBuilder::new(4); + + // Test each row + for i in 0..4 { + let mut builder = variant_array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = variant_array_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 4); + + // Map 0: {"key1": 1} + let map0 = variant_array.value(0); + let obj0 = map0.as_object().unwrap(); + assert_eq!(obj0.len(), 1); + assert_eq!(obj0.get("key1"), Some(Variant::from(1))); + + // Map 1: {} (empty object, not null) + let map1 = variant_array.value(1); + let obj1 = map1.as_object().unwrap(); + assert_eq!(obj1.len(), 0); // Empty object + + // Map 2: null (actual NULL) + assert!(variant_array.is_null(2)); + + // Map 3: {"key2": 2, "key3": 3} + let map3 = variant_array.value(3); + let obj3 = map3.as_object().unwrap(); + assert_eq!(obj3.len(), 2); + assert_eq!(obj3.get("key2"), Some(Variant::from(2))); + assert_eq!(obj3.get("key3"), 
Some(Variant::from(3))); + } + + #[test] + fn test_union_sparse_row_builder() { + use arrow::array::{Float64Array, Int32Array, StringArray, UnionArray}; + use arrow::buffer::ScalarBuffer; + use arrow::datatypes::{DataType, Field, UnionFields}; + use std::sync::Arc; + + // Create a sparse union array with mixed types (int, float, string) + let int_array = Int32Array::from(vec![Some(1), None, None, None, Some(34), None]); + let float_array = Float64Array::from(vec![None, Some(3.2), None, Some(32.5), None, None]); + let string_array = StringArray::from(vec![None, None, Some("hello"), None, None, None]); + let type_ids = [0, 1, 2, 1, 0, 0].into_iter().collect::>(); + + let union_fields = UnionFields::new( + vec![0, 1, 2], + vec![ + Field::new("int_field", DataType::Int32, false), + Field::new("float_field", DataType::Float64, false), + Field::new("string_field", DataType::Utf8, false), + ], + ); + + let children: Vec> = vec![ + Arc::new(int_array), + Arc::new(float_array), + Arc::new(string_array), + ]; + + let union_array = UnionArray::try_new( + union_fields, + type_ids, + None, // Sparse union + children, + ) + .unwrap(); + + // Test the row builder + let mut row_builder = + make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(union_array.len()); + for i in 0..union_array.len() { + let mut builder = variant_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + let variant_array = variant_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 6); + + // Row 0: int 1 + assert_eq!(variant_array.value(0), Variant::Int32(1)); + + // Row 1: float 3.2 + assert_eq!(variant_array.value(1), Variant::Double(3.2)); + + // Row 2: string "hello" + assert_eq!(variant_array.value(2), Variant::from("hello")); + + // Row 3: float 32.5 + assert_eq!(variant_array.value(3), Variant::Double(32.5)); + + // Row 4: int 34 + 
assert_eq!(variant_array.value(4), Variant::Int32(34)); + + // Row 5: null (int array has null at this position) + assert!(variant_array.is_null(5)); + } + + #[test] + fn test_union_dense_row_builder() { + use arrow::array::{Float64Array, Int32Array, StringArray, UnionArray}; + use arrow::buffer::ScalarBuffer; + use arrow::datatypes::{DataType, Field, UnionFields}; + use std::sync::Arc; + + // Create a dense union array with mixed types (int, float, string) + let int_array = Int32Array::from(vec![Some(1), Some(34), None]); + let float_array = Float64Array::from(vec![3.2, 32.5]); + let string_array = StringArray::from(vec!["hello"]); + let type_ids = [0, 1, 2, 1, 0, 0].into_iter().collect::>(); + let offsets = [0, 0, 0, 1, 1, 2] + .into_iter() + .collect::>(); + + let union_fields = UnionFields::new( + vec![0, 1, 2], + vec![ + Field::new("int_field", DataType::Int32, false), + Field::new("float_field", DataType::Float64, false), + Field::new("string_field", DataType::Utf8, false), + ], + ); + + let children: Vec> = vec![ + Arc::new(int_array), + Arc::new(float_array), + Arc::new(string_array), + ]; + + let union_array = UnionArray::try_new( + union_fields, + type_ids, + Some(offsets), // Dense union + children, + ) + .unwrap(); + + // Test the row builder + let mut row_builder = + make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(union_array.len()); + for i in 0..union_array.len() { + let mut builder = variant_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + let variant_array = variant_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 6); + + // Row 0: int 1 (offset 0 in int_array) + assert_eq!(variant_array.value(0), Variant::Int32(1)); + + // Row 1: float 3.2 (offset 0 in float_array) + assert_eq!(variant_array.value(1), Variant::Double(3.2)); + + // Row 2: string "hello" (offset 0 in string_array) + 
assert_eq!(variant_array.value(2), Variant::from("hello")); + + // Row 3: float 32.5 (offset 1 in float_array) + assert_eq!(variant_array.value(3), Variant::Double(32.5)); + + // Row 4: int 34 (offset 1 in int_array) + assert_eq!(variant_array.value(4), Variant::Int32(34)); + + // Row 5: null (offset 2 in int_array, which has null) + assert!(variant_array.is_null(5)); + } + + #[test] + fn test_union_sparse_type_ids_row_builder() { + use arrow::array::{Int32Array, StringArray, UnionArray}; + use arrow::buffer::ScalarBuffer; + use arrow::datatypes::{DataType, Field, UnionFields}; + use std::sync::Arc; + + // Create a sparse union with non-contiguous type IDs (1, 3) + let int_array = Int32Array::from(vec![Some(42), None]); + let string_array = StringArray::from(vec![None, Some("test")]); + let type_ids = [1, 3].into_iter().collect::>(); + + let union_fields = UnionFields::new( + vec![1, 3], // Non-contiguous type IDs + vec![ + Field::new("int_field", DataType::Int32, false), + Field::new("string_field", DataType::Utf8, false), + ], + ); + + let children: Vec> = vec![Arc::new(int_array), Arc::new(string_array)]; + + let union_array = UnionArray::try_new( + union_fields, + type_ids, + None, // Sparse union + children, + ) + .unwrap(); + + // Test the row builder + let mut row_builder = + make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); + + let mut variant_builder = VariantArrayBuilder::new(union_array.len()); + for i in 0..union_array.len() { + let mut builder = variant_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + let variant_array = variant_builder.build(); + + // Verify results + assert_eq!(variant_array.len(), 2); + + // Row 0: int 42 (type_id = 1) + assert_eq!(variant_array.value(0), Variant::Int32(42)); + + // Row 1: string "test" (type_id = 3) + assert_eq!(variant_array.value(1), Variant::from("test")); + } + + #[test] + fn test_decimal32_row_builder() { + use 
arrow::array::Decimal32Array; + use parquet_variant::VariantDecimal4; + + // Test Decimal32Array with scale 2 (e.g., for currency: 12.34) + let decimal_array = Decimal32Array::from(vec![Some(1234), None, Some(-5678)]) + .with_precision_and_scale(9, 2) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..decimal_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: 12.34 (1234 with scale 2) + assert_eq!( + variant_array.value(0), + Variant::from(VariantDecimal4::try_new(1234, 2).unwrap()) + ); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: -56.78 (-5678 with scale 2) + assert_eq!( + variant_array.value(2), + Variant::from(VariantDecimal4::try_new(-5678, 2).unwrap()) + ); + } + + #[test] + fn test_decimal128_row_builder() { + use arrow::array::Decimal128Array; + use parquet_variant::VariantDecimal16; + + // Test Decimal128Array with negative scale (multiply by 10^|scale|) + let decimal_array = Decimal128Array::from(vec![Some(123), None, Some(456)]) + .with_precision_and_scale(10, -2) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..decimal_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: 123 * 10^2 = 12300 with scale 0 (negative scale handling) + assert_eq!( + variant_array.value(0), + Variant::from(VariantDecimal16::try_new(12300, 0).unwrap()) + ); + + // Row 1: null + 
assert!(variant_array.is_null(1)); + + // Row 2: 456 * 10^2 = 45600 with scale 0 + assert_eq!( + variant_array.value(2), + Variant::from(VariantDecimal16::try_new(45600, 0).unwrap()) + ); + } + + #[test] + fn test_decimal256_overflow_row_builder() { + use arrow::array::Decimal256Array; + use arrow::datatypes::i256; + + // Test Decimal256Array with a value that overflows i128 + let large_value = i256::from_i128(i128::MAX) + i256::from(1); // Overflows i128 + let decimal_array = Decimal256Array::from(vec![Some(large_value), Some(i256::from(123))]) + .with_precision_and_scale(76, 3) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(2); + + for i in 0..decimal_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 2); + + // Row 0: overflow value becomes Variant::Null + assert_eq!(variant_array.value(0), Variant::Null); + + // Row 1: normal value converts successfully + assert_eq!( + variant_array.value(1), + Variant::from(VariantDecimal16::try_new(123, 3).unwrap()) + ); + } + + #[test] + fn test_binary_row_builder() { + use arrow::array::BinaryArray; + + // Test BinaryArray with various binary data + let binary_data = vec![ + Some(b"hello".as_slice()), + None, + Some(b"\x00\x01\x02\xFF".as_slice()), + Some(b"".as_slice()), // Empty binary + ]; + let binary_array = BinaryArray::from(binary_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(binary_array.data_type(), &binary_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..binary_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + 
assert_eq!(variant_array.len(), 4); + + // Row 0: "hello" bytes + assert_eq!(variant_array.value(0), Variant::from(b"hello".as_slice())); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: binary with special bytes + let bytes = [0x00, 0x01, 0x02, 0xFF]; + assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); + + // Row 3: empty binary + let bytes = []; + assert_eq!(variant_array.value(3), Variant::from(bytes.as_slice())); + } + + #[test] + fn test_large_binary_row_builder() { + use arrow::array::LargeBinaryArray; + + // Test LargeBinaryArray + let binary_data = vec![ + Some(b"large binary data".as_slice()), + None, + Some(b"another large chunk".as_slice()), + ]; + let large_binary_array = LargeBinaryArray::from(binary_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(large_binary_array.data_type(), &large_binary_array) + .unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..large_binary_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: large binary data + assert_eq!( + variant_array.value(0), + Variant::from(b"large binary data".as_slice()) + ); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: another large chunk + assert_eq!( + variant_array.value(2), + Variant::from(b"another large chunk".as_slice()) + ); + } + + #[test] + fn test_binary_view_row_builder() { + use arrow::array::BinaryViewArray; + + // Test BinaryViewArray + let binary_data = vec![ + Some(b"short".as_slice()), + None, + Some(b"this is a longer binary view that exceeds inline storage".as_slice()), + ]; + let binary_view_array = BinaryViewArray::from(binary_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(binary_view_array.data_type(), &binary_view_array) + .unwrap(); + + let mut 
array_builder = VariantArrayBuilder::new(3); + + for i in 0..binary_view_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: short binary + assert_eq!(variant_array.value(0), Variant::from(b"short".as_slice())); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: long binary view + assert_eq!( + variant_array.value(2), + Variant::from(b"this is a longer binary view that exceeds inline storage".as_slice()) + ); + } + + #[test] + fn test_fixed_size_binary_row_builder() { + use arrow::array::FixedSizeBinaryArray; + + // Test FixedSizeBinaryArray with 4-byte values + let binary_data = vec![ + Some([0x01, 0x02, 0x03, 0x04]), + None, + Some([0xFF, 0xFE, 0xFD, 0xFC]), + ]; + let fixed_binary_array = + FixedSizeBinaryArray::try_from_sparse_iter_with_size(binary_data.into_iter(), 4) + .unwrap(); + + let mut row_builder = + make_arrow_to_variant_row_builder(fixed_binary_array.data_type(), &fixed_binary_array) + .unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..fixed_binary_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: fixed size binary + let bytes = [0x01, 0x02, 0x03, 0x04]; + assert_eq!(variant_array.value(0), Variant::from(bytes.as_slice())); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: another fixed size binary + let bytes = [0xFF, 0xFE, 0xFD, 0xFC]; + assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); + } + + #[test] + fn test_utf8_view_row_builder() { + use arrow::array::StringViewArray; + + // Test StringViewArray (Utf8View) + let string_data = vec![ + Some("short"), + None, + Some("this is a much 
longer string that will be stored out-of-line in the buffer"), + ]; + let string_view_array = StringViewArray::from(string_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(string_view_array.data_type(), &string_view_array) + .unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..string_view_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: short string + assert_eq!(variant_array.value(0), Variant::from("short")); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: long string view + assert_eq!( + variant_array.value(2), + Variant::from( + "this is a much longer string that will be stored out-of-line in the buffer" + ) + ); + } + + #[test] + fn test_timestamp_second_row_builder() { + use arrow::array::TimestampSecondArray; + + // Test TimestampSecondArray without timezone + let timestamp_data = vec![ + Some(1609459200), // 2021-01-01 00:00:00 UTC + None, + Some(1640995200), // 2022-01-01 00:00:00 UTC + ]; + let timestamp_array = TimestampSecondArray::from(timestamp_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) + .unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..timestamp_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: 2021-01-01 00:00:00 (no timezone -> NaiveDateTime -> TimestampNtzMicros) + let expected_naive = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); + assert_eq!(variant_array.value(0), Variant::from(expected_naive)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 2022-01-01 
00:00:00 + let expected_naive2 = DateTime::from_timestamp(1640995200, 0).unwrap().naive_utc(); + assert_eq!(variant_array.value(2), Variant::from(expected_naive2)); + } + + #[test] + fn test_timestamp_with_timezone_row_builder() { + use arrow::array::TimestampMicrosecondArray; + use chrono::DateTime; + + // Test TimestampMicrosecondArray with timezone + let timestamp_data = vec![ + Some(1609459200000000), // 2021-01-01 00:00:00 UTC (in microseconds) + None, + Some(1640995200000000), // 2022-01-01 00:00:00 UTC (in microseconds) + ]; + let timezone = "UTC".to_string(); + let timestamp_array = + TimestampMicrosecondArray::from(timestamp_data).with_timezone(timezone.clone()); + + let mut row_builder = + make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) + .unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..timestamp_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: 2021-01-01 00:00:00 UTC (with timezone -> DateTime -> TimestampMicros) + let expected_utc = DateTime::from_timestamp(1609459200, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_utc)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 2022-01-01 00:00:00 UTC + let expected_utc2 = DateTime::from_timestamp(1640995200, 0).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_utc2)); + } + + #[test] + fn test_timestamp_nanosecond_precision_row_builder() { + use arrow::array::TimestampNanosecondArray; + + // Test TimestampNanosecondArray with nanosecond precision + let timestamp_data = vec![ + Some(1609459200123456789), // 2021-01-01 00:00:00.123456789 UTC + None, + Some(1609459200000000000), // 2021-01-01 00:00:00.000000000 UTC (no fractional seconds) + ]; + let timestamp_array = 
TimestampNanosecondArray::from(timestamp_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) + .unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..timestamp_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: with nanoseconds -> should use TimestampNtzNanos + let expected_with_nanos = DateTime::from_timestamp(1609459200, 123456789) + .unwrap() + .naive_utc(); + assert_eq!(variant_array.value(0), Variant::from(expected_with_nanos)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: no fractional seconds -> should use TimestampNtzMicros + let expected_no_nanos = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); + assert_eq!(variant_array.value(2), Variant::from(expected_no_nanos)); + } + + #[test] + fn test_timestamp_millisecond_row_builder() { + use arrow::array::TimestampMillisecondArray; + + // Test TimestampMillisecondArray + let timestamp_data = vec![ + Some(1609459200123), // 2021-01-01 00:00:00.123 UTC + None, + Some(1609459200000), // 2021-01-01 00:00:00.000 UTC + ]; + let timestamp_array = TimestampMillisecondArray::from(timestamp_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) + .unwrap(); + + let mut array_builder = VariantArrayBuilder::new(3); + + for i in 0..timestamp_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 3); + + // Row 0: with milliseconds -> TimestampNtzMicros (123ms = 123000000ns) + let expected_with_millis = DateTime::from_timestamp(1609459200, 123000000) + .unwrap() + .naive_utc(); + 
assert_eq!(variant_array.value(0), Variant::from(expected_with_millis)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: no fractional seconds -> TimestampNtzMicros + let expected_no_millis = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); + assert_eq!(variant_array.value(2), Variant::from(expected_no_millis)); + } + + #[test] + fn test_date32_row_builder() { + use arrow::array::Date32Array; + use chrono::NaiveDate; + + // Test Date32Array with various dates + let date_data = vec![ + Some(0), // 1970-01-01 + None, + Some(19723), // 2024-01-01 (days since epoch) + Some(-719162), // 0001-01-01 (near minimum) + ]; + let date_array = Date32Array::from(date_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(date_array.data_type(), &date_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..date_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 1970-01-01 (epoch) + let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 2024-01-01 + let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_2024)); + + // Row 3: 0001-01-01 (near minimum date) + let expected_min = NaiveDate::from_ymd_opt(1, 1, 1).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_min)); + } + + #[test] + fn test_date64_row_builder() { + use arrow::array::Date64Array; + use chrono::NaiveDate; + + // Test Date64Array with various dates (milliseconds since epoch) + let date_data = vec![ + Some(0), // 1970-01-01 + None, + Some(1704067200000), // 2024-01-01 (milliseconds since epoch) + Some(86400000), 
// 1970-01-02 + ]; + let date_array = Date64Array::from(date_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(date_array.data_type(), &date_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..date_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 1970-01-01 (epoch) + let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 2024-01-01 + let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_2024)); + + // Row 3: 1970-01-02 + let expected_next_day = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_next_day)); + } + + #[test] + fn test_time32_second_row_builder() { + use arrow::array::Time32SecondArray; + use chrono::NaiveTime; + + // Test Time32SecondArray with various times (seconds since midnight) + let time_data = vec![ + Some(0), // 00:00:00 + None, + Some(3661), // 01:01:01 + Some(86399), // 23:59:59 + ]; + let time_array = Time32SecondArray::from(time_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..time_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 00:00:00 (midnight) + let expected_midnight = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); + + // 
Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 01:01:01 + let expected_time = NaiveTime::from_hms_opt(1, 1, 1).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_time)); + + // Row 3: 23:59:59 (last second of day) + let expected_last = NaiveTime::from_hms_opt(23, 59, 59).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_last)); + } + + #[test] + fn test_time32_millisecond_row_builder() { + use arrow::array::Time32MillisecondArray; + use chrono::NaiveTime; + + // Test Time32MillisecondArray with various times (milliseconds since midnight) + let time_data = vec![ + Some(0), // 00:00:00.000 + None, + Some(3661123), // 01:01:01.123 + Some(86399999), // 23:59:59.999 + ]; + let time_array = Time32MillisecondArray::from(time_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..time_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 00:00:00.000 (midnight) + let expected_midnight = NaiveTime::from_hms_milli_opt(0, 0, 0, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 01:01:01.123 + let expected_time = NaiveTime::from_hms_milli_opt(1, 1, 1, 123).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_time)); + + // Row 3: 23:59:59.999 (last millisecond of day) + let expected_last = NaiveTime::from_hms_milli_opt(23, 59, 59, 999).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_last)); + } + + #[test] + fn test_time64_microsecond_row_builder() { + use arrow::array::Time64MicrosecondArray; + use chrono::NaiveTime; + + // Test Time64MicrosecondArray with various 
times (microseconds since midnight) + let time_data = vec![ + Some(0), // 00:00:00.000000 + None, + Some(3661123456), // 01:01:01.123456 + Some(86399999999), // 23:59:59.999999 + ]; + let time_array = Time64MicrosecondArray::from(time_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..time_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, &mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 00:00:00.000000 (midnight) + let expected_midnight = NaiveTime::from_hms_micro_opt(0, 0, 0, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 01:01:01.123456 + let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_time)); + + // Row 3: 23:59:59.999999 (last microsecond of day) + let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_last)); + } + + #[test] + fn test_time64_nanosecond_row_builder() { + use arrow::array::Time64NanosecondArray; + use chrono::NaiveTime; + + // Test Time64NanosecondArray with various times (nanoseconds since midnight) + let time_data = vec![ + Some(0), // 00:00:00.000000000 + None, + Some(3661123456789), // 01:01:01.123456789 + Some(86399999999999), // 23:59:59.999999999 + ]; + let time_array = Time64NanosecondArray::from(time_data); + + let mut row_builder = + make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); + + let mut array_builder = VariantArrayBuilder::new(4); + + for i in 0..time_array.len() { + let mut builder = array_builder.variant_builder(); + row_builder.append_row(i, 
&mut builder).unwrap(); + builder.finish(); + } + + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 4); + + // Row 0: 00:00:00.000000000 (midnight) + let expected_midnight = NaiveTime::from_hms_nano_opt(0, 0, 0, 0).unwrap(); + assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); + + // Row 1: null + assert!(variant_array.is_null(1)); + + // Row 2: 01:01:01.123456789 -> truncated to 01:01:01.123456000 (microsecond precision) + let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); + assert_eq!(variant_array.value(2), Variant::from(expected_time)); + + // Row 3: 23:59:59.999999999 -> truncated to 23:59:59.999999000 (microsecond precision) + let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); + assert_eq!(variant_array.value(3), Variant::from(expected_last)); + } +} diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index af15fbb69544..849f73b9ae22 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -15,28 +15,10 @@ // specific language governing permissions and limitations // under the License. 
-use std::collections::HashMap; - -use crate::type_conversion::decimal_to_variant_decimal; +use crate::arrow_to_variant::make_arrow_to_variant_row_builder; use crate::{VariantArray, VariantArrayBuilder}; -use arrow::array::{ - Array, AsArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, -}; -use arrow::compute::kernels::cast; -use arrow::datatypes::{ - ArrowNativeType, ArrowPrimitiveType, ArrowTemporalType, ArrowTimestampType, Date32Type, - Date64Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, - RunEndIndexType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType, - Time64NanosecondType, TimestampMicrosecondType, TimestampMillisecondType, - TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, -}; -use arrow::temporal_conversions::{as_date, as_datetime, as_time}; -use arrow_schema::{ArrowError, DataType, TimeUnit}; -use chrono::{DateTime, TimeZone, Utc}; -use parquet_variant::{ - ObjectFieldBuilder, Variant, VariantBuilderExt, VariantDecimal16, VariantDecimal4, - VariantDecimal8, -}; +use arrow::array::Array; +use arrow_schema::ArrowError; /// Casts a typed arrow [`Array`] to a [`VariantArray`]. 
This is useful when you /// need to convert a specific data type @@ -84,797 +66,10 @@ pub fn cast_to_variant(input: &dyn Array) -> Result { Ok(array_builder.build()) } -/// Factory function to create the appropriate row builder for a given DataType -fn make_arrow_to_variant_row_builder<'a>( - data_type: &'a DataType, - array: &'a dyn Array, -) -> Result, ArrowError> { - let builder = match data_type { - DataType::Null => ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder), - DataType::Boolean => { - ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array)) - } - DataType::Int8 => { - ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Int16 => { - ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Int32 => { - ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Int64 => { - ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::UInt8 => { - ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::UInt16 => { - ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::UInt32 => { - ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::UInt64 => { - ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Float16 => { - ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Float32 => { - ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Float64 => { - ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Decimal32(_, scale) => { - ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale)) - } - 
DataType::Decimal64(_, scale) => { - ArrowToVariantRowBuilder::Decimal64(Decimal64ArrowToVariantBuilder::new(array, *scale)) - } - DataType::Decimal128(_, scale) => ArrowToVariantRowBuilder::Decimal128( - Decimal128ArrowToVariantBuilder::new(array, *scale), - ), - DataType::Decimal256(_, scale) => ArrowToVariantRowBuilder::Decimal256( - Decimal256ArrowToVariantBuilder::new(array, *scale), - ), - DataType::Timestamp(time_unit, time_zone) => match time_unit { - TimeUnit::Second => ArrowToVariantRowBuilder::TimestampSecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), - ), - TimeUnit::Millisecond => ArrowToVariantRowBuilder::TimestampMillisecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), - ), - TimeUnit::Microsecond => ArrowToVariantRowBuilder::TimestampMicrosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), - ), - TimeUnit::Nanosecond => ArrowToVariantRowBuilder::TimestampNanosecond( - TimestampArrowToVariantBuilder::new(array, time_zone.is_some()), - ), - }, - DataType::Date32 => ArrowToVariantRowBuilder::Date32(DateArrowToVariantBuilder::new(array)), - DataType::Date64 => ArrowToVariantRowBuilder::Date64(DateArrowToVariantBuilder::new(array)), - DataType::Time32(time_unit) => match time_unit { - TimeUnit::Second => { - ArrowToVariantRowBuilder::Time32Second(TimeArrowToVariantBuilder::new(array)) - } - TimeUnit::Millisecond => { - ArrowToVariantRowBuilder::Time32Millisecond(TimeArrowToVariantBuilder::new(array)) - } - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported Time32 unit: {time_unit:?}" - ))) - } - }, - DataType::Time64(time_unit) => match time_unit { - TimeUnit::Microsecond => { - ArrowToVariantRowBuilder::Time64Microsecond(TimeArrowToVariantBuilder::new(array)) - } - TimeUnit::Nanosecond => { - ArrowToVariantRowBuilder::Time64Nanosecond(TimeArrowToVariantBuilder::new(array)) - } - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported Time64 unit: 
{time_unit:?}" - ))) - } - }, - DataType::Duration(_) | DataType::Interval(_) => { - return Err(ArrowError::InvalidArgumentError( - "Casting duration/interval types to Variant is not supported. \ - The Variant format does not define duration/interval types." - .to_string(), - )) - } - DataType::Binary => { - ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array)) - } - DataType::LargeBinary => { - ArrowToVariantRowBuilder::LargeBinary(BinaryArrowToVariantBuilder::new(array)) - } - DataType::BinaryView => { - ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array)) - } - DataType::FixedSizeBinary(_) => ArrowToVariantRowBuilder::FixedSizeBinary( - FixedSizeBinaryArrowToVariantBuilder::new(array), - ), - DataType::Utf8 => ArrowToVariantRowBuilder::String(StringArrowToVariantBuilder::new(array)), - DataType::LargeUtf8 => { - ArrowToVariantRowBuilder::LargeString(StringArrowToVariantBuilder::new(array)) - } - DataType::Utf8View => { - ArrowToVariantRowBuilder::Utf8View(Utf8ViewArrowToVariantBuilder::new(array)) - } - DataType::List(_) => ArrowToVariantRowBuilder::List(ListArrowToVariantBuilder::new(array)?), - DataType::LargeList(_) => { - ArrowToVariantRowBuilder::LargeList(ListArrowToVariantBuilder::new(array)?) - } - DataType::Struct(_) => { - ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new(array.as_struct())?) - } - DataType::Map(_, _) => ArrowToVariantRowBuilder::Map(MapArrowToVariantBuilder::new(array)?), - DataType::Union(_, _) => { - ArrowToVariantRowBuilder::Union(UnionArrowToVariantBuilder::new(array)?) - } - DataType::Dictionary(_, _) => { - ArrowToVariantRowBuilder::Dictionary(DictionaryArrowToVariantBuilder::new(array)?) 
- } - DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { - DataType::Int16 => ArrowToVariantRowBuilder::RunEndEncodedInt16( - RunEndEncodedArrowToVariantBuilder::new(array)?, - ), - DataType::Int32 => ArrowToVariantRowBuilder::RunEndEncodedInt32( - RunEndEncodedArrowToVariantBuilder::new(array)?, - ), - DataType::Int64 => ArrowToVariantRowBuilder::RunEndEncodedInt64( - RunEndEncodedArrowToVariantBuilder::new(array)?, - ), - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported run ends type: {:?}", - run_ends.data_type() - ))); - } - }, - dt => { - return Err(ArrowError::CastError(format!( - "Unsupported data type for casting to Variant: {dt:?}", - ))); - } - }; - Ok(builder) -} - // TODO do we need a cast_with_options to allow specifying conversion behavior, // e.g. how to handle overflows, whether to convert to Variant::Null or return // an error, etc. ? -// ============================================================================ -// Row-oriented builders for efficient Arrow-to-Variant conversion -// ============================================================================ - -/// Row builder for converting Arrow arrays to VariantArray row by row -pub(crate) enum ArrowToVariantRowBuilder<'a> { - PrimitiveInt8(PrimitiveArrowToVariantBuilder<'a, Int8Type>), - PrimitiveInt16(PrimitiveArrowToVariantBuilder<'a, Int16Type>), - PrimitiveInt32(PrimitiveArrowToVariantBuilder<'a, Int32Type>), - PrimitiveInt64(PrimitiveArrowToVariantBuilder<'a, Int64Type>), - PrimitiveUInt8(PrimitiveArrowToVariantBuilder<'a, UInt8Type>), - PrimitiveUInt16(PrimitiveArrowToVariantBuilder<'a, UInt16Type>), - PrimitiveUInt32(PrimitiveArrowToVariantBuilder<'a, UInt32Type>), - PrimitiveUInt64(PrimitiveArrowToVariantBuilder<'a, UInt64Type>), - PrimitiveFloat16(PrimitiveArrowToVariantBuilder<'a, Float16Type>), - PrimitiveFloat32(PrimitiveArrowToVariantBuilder<'a, Float32Type>), - PrimitiveFloat64(PrimitiveArrowToVariantBuilder<'a, Float64Type>), - 
Decimal32(Decimal32ArrowToVariantBuilder<'a>), - Decimal64(Decimal64ArrowToVariantBuilder<'a>), - Decimal128(Decimal128ArrowToVariantBuilder<'a>), - Decimal256(Decimal256ArrowToVariantBuilder<'a>), - Boolean(BooleanArrowToVariantBuilder<'a>), - String(StringArrowToVariantBuilder<'a, i32>), - LargeString(StringArrowToVariantBuilder<'a, i64>), - Binary(BinaryArrowToVariantBuilder<'a, i32>), - LargeBinary(BinaryArrowToVariantBuilder<'a, i64>), - BinaryView(BinaryViewArrowToVariantBuilder<'a>), - FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder<'a>), - Utf8View(Utf8ViewArrowToVariantBuilder<'a>), - Struct(StructArrowToVariantBuilder<'a>), - Null(NullArrowToVariantBuilder), - RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder<'a, Int16Type>), - RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder<'a, Int32Type>), - RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder<'a, Int64Type>), - Dictionary(DictionaryArrowToVariantBuilder<'a>), - List(ListArrowToVariantBuilder<'a, i32>), - LargeList(ListArrowToVariantBuilder<'a, i64>), - Map(MapArrowToVariantBuilder<'a>), - Union(UnionArrowToVariantBuilder<'a>), - TimestampSecond(TimestampArrowToVariantBuilder<'a, TimestampSecondType>), - TimestampMillisecond(TimestampArrowToVariantBuilder<'a, TimestampMillisecondType>), - TimestampMicrosecond(TimestampArrowToVariantBuilder<'a, TimestampMicrosecondType>), - TimestampNanosecond(TimestampArrowToVariantBuilder<'a, TimestampNanosecondType>), - Date32(DateArrowToVariantBuilder<'a, Date32Type>), - Date64(DateArrowToVariantBuilder<'a, Date64Type>), - Time32Second(TimeArrowToVariantBuilder<'a, Time32SecondType>), - Time32Millisecond(TimeArrowToVariantBuilder<'a, Time32MillisecondType>), - Time64Microsecond(TimeArrowToVariantBuilder<'a, Time64MicrosecondType>), - Time64Nanosecond(TimeArrowToVariantBuilder<'a, Time64NanosecondType>), -} - -impl<'a> ArrowToVariantRowBuilder<'a> { - pub fn append_row( - &mut self, - index: usize, - builder: &mut impl VariantBuilderExt, - ) -> 
Result<(), ArrowError> { - match self { - ArrowToVariantRowBuilder::PrimitiveInt8(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveInt16(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveInt32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveInt64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveUInt8(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveUInt16(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveUInt32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveUInt64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveFloat16(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveFloat32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Decimal32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Decimal64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Decimal128(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Decimal256(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::String(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::LargeString(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Binary(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::LargeBinary(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::BinaryView(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::FixedSizeBinary(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Utf8View(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Struct(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Null(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::RunEndEncodedInt16(b) 
=> b.append_row(index, builder), - ArrowToVariantRowBuilder::RunEndEncodedInt32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::RunEndEncodedInt64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Dictionary(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::List(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::LargeList(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Map(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Union(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::TimestampSecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::TimestampMillisecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::TimestampMicrosecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::TimestampNanosecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Date32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Date64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Time32Second(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Time32Millisecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Time64Microsecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Time64Nanosecond(b) => b.append_row(index, builder), - } - } -} - -/// Macro to define (possibly generic) row builders with consistent structure and behavior. -/// Supports optional extra fields that are passed to the constructor. -macro_rules! define_row_builder { - ( - struct $name:ident<$lifetime:lifetime $(, $generic:ident: $($bound:path)+)?> - $(where $where_path:path: $where_bound:path)? - $({ $($field:ident: $field_type:ty),* $(,)? })?, - |$array_param:ident| -> $array_type:ty { $init_expr:expr }, - |$value:ident| $value_transform:expr - ) => { - pub(crate) struct $name<$lifetime $(, $generic: $($bound)+)?> - $(where $where_path: $where_bound)? 
- { - array: &$lifetime $array_type, - $($($field: $field_type,)*)? - } - - impl<$lifetime $(, $generic: $($bound)+)?> $name<$lifetime $(, $generic)?> - $(where $where_path: $where_bound)? - { - fn new($array_param: &$lifetime dyn Array $(, $($field: $field_type),*)?) -> Self { - Self { - array: $init_expr, - $($($field,)*)? - } - } - - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { - if self.array.is_null(index) { - builder.append_null(); - } else { - let $value = self.array.value(index); - // Capture fields as variables the transform can access (hygiene) - $($(let $field = &self.$field;)*)? - builder.append_value($value_transform); - } - Ok(()) - } - } - }; -} - -// ============================================================================ -// Generic row builders generated by macro -// ============================================================================ - -// Primitive builder - handles all primitive types -define_row_builder!( - struct PrimitiveArrowToVariantBuilder<'a, T: ArrowPrimitiveType> - where T::Native: Into>, - |array| -> PrimitiveArray { array.as_primitive() }, - |value| value -); - -// Boolean builder - handles BooleanArray -define_row_builder!( - struct BooleanArrowToVariantBuilder<'a>, - |array| -> arrow::array::BooleanArray { array.as_boolean() }, - |value| value -); - -// Generic String builder for StringArray (Utf8 and LargeUtf8) -define_row_builder!( - struct StringArrowToVariantBuilder<'a, O: OffsetSizeTrait>, - |array| -> GenericStringArray { array.as_string() }, - |value| value -); - -/// Struct builder for StructArray -pub(crate) struct StructArrowToVariantBuilder<'a> { - struct_array: &'a arrow::array::StructArray, - field_builders: Vec<(&'a str, ArrowToVariantRowBuilder<'a>)>, -} - -impl<'a> StructArrowToVariantBuilder<'a> { - fn new(struct_array: &'a arrow::array::StructArray) -> Result { - let mut field_builders = Vec::new(); - - // Create a row builder for each field - for 
(field_name, field_array) in struct_array - .column_names() - .iter() - .zip(struct_array.columns().iter()) - { - let field_builder = - make_arrow_to_variant_row_builder(field_array.data_type(), field_array.as_ref())?; - field_builders.push((*field_name, field_builder)); - } - - Ok(Self { - struct_array, - field_builders, - }) - } - - fn append_row( - &mut self, - index: usize, - builder: &mut impl VariantBuilderExt, - ) -> Result<(), ArrowError> { - if self.struct_array.is_null(index) { - builder.append_null(); - } else { - // Create object builder for this struct row - let mut obj_builder = builder.try_new_object()?; - - // Process each field - for (field_name, row_builder) in &mut self.field_builders { - let mut field_builder = - parquet_variant::ObjectFieldBuilder::new(field_name, &mut obj_builder); - row_builder.append_row(index, &mut field_builder)?; - } - - obj_builder.finish(); - } - Ok(()) - } -} - -/// Null builder that always appends null -pub(crate) struct NullArrowToVariantBuilder; - -impl NullArrowToVariantBuilder { - fn append_row( - &mut self, - _index: usize, - builder: &mut impl VariantBuilderExt, - ) -> Result<(), ArrowError> { - builder.append_null(); - Ok(()) - } -} - -/// Run-end encoded array builder with efficient sequential access -pub(crate) struct RunEndEncodedArrowToVariantBuilder<'a, R: RunEndIndexType> { - run_array: &'a arrow::array::RunArray, - values_builder: Box>, - - run_ends: &'a [R::Native], - run_number: usize, // Physical index into run_ends and values - run_start: usize, // Logical start index of current run -} - -impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { - fn new(array: &'a dyn Array) -> Result { - let Some(run_array) = array.as_run_opt() else { - return Err(ArrowError::CastError("Expected RunArray".to_string())); - }; - - let values = run_array.values(); - let values_builder = - make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; - - Ok(Self { - run_array, - 
values_builder: Box::new(values_builder), - run_ends: run_array.run_ends().values(), - run_number: 0, - run_start: 0, - }) - } - - fn append_row( - &mut self, - index: usize, - builder: &mut impl VariantBuilderExt, - ) -> Result<(), ArrowError> { - self.set_run_for_index(index)?; - - // Handle null values - if self.run_array.values().is_null(self.run_number) { - builder.append_null(); - return Ok(()); - } - - // Re-encode the value - self.values_builder.append_row(self.run_number, builder)?; - - Ok(()) - } - - fn set_run_for_index(&mut self, index: usize) -> Result<(), ArrowError> { - if index >= self.run_start { - let Some(run_end) = self.run_ends.get(self.run_number) else { - return Err(ArrowError::CastError(format!( - "Index {index} beyond run array" - ))); - }; - if index < run_end.as_usize() { - return Ok(()); - } - if index == run_end.as_usize() { - self.run_number += 1; - self.run_start = run_end.as_usize(); - return Ok(()); - } - } - - // Use partition_point for all non-sequential cases - let run_number = self - .run_ends - .partition_point(|&run_end| run_end.as_usize() <= index); - if run_number >= self.run_ends.len() { - return Err(ArrowError::CastError(format!( - "Index {index} beyond run array" - ))); - } - self.run_number = run_number; - self.run_start = match run_number { - 0 => 0, - _ => self.run_ends[run_number - 1].as_usize(), - }; - Ok(()) - } -} - -/// Dictionary array builder with simple O(1) indexing -pub(crate) struct DictionaryArrowToVariantBuilder<'a> { - keys: &'a dyn Array, // only needed for null checks - normalized_keys: Vec, - values_builder: Box>, -} - -impl<'a> DictionaryArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Result { - let dict_array = array.as_any_dictionary(); - let values = dict_array.values(); - let values_builder = - make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; - - // WARNING: normalized_keys panics if values is empty - let normalized_keys = match values.len() { - 0 => 
Vec::new(), - _ => dict_array.normalized_keys(), - }; - - Ok(Self { - keys: dict_array.keys(), - normalized_keys, - values_builder: Box::new(values_builder), - }) - } - - fn append_row( - &mut self, - index: usize, - builder: &mut impl VariantBuilderExt, - ) -> Result<(), ArrowError> { - if self.keys.is_null(index) { - builder.append_null(); - } else { - let normalized_key = self.normalized_keys[index]; - self.values_builder.append_row(normalized_key, builder)?; - } - Ok(()) - } -} - -/// Generic list builder for List and LargeList types -pub(crate) struct ListArrowToVariantBuilder<'a, O: OffsetSizeTrait> { - list_array: &'a arrow::array::GenericListArray, - values_builder: Box>, -} - -impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { - fn new(array: &'a dyn Array) -> Result { - let list_array = array.as_list(); - let values = list_array.values(); - let values_builder = - make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; - - Ok(Self { - list_array, - values_builder: Box::new(values_builder), - }) - } - - fn append_row( - &mut self, - index: usize, - builder: &mut impl VariantBuilderExt, - ) -> Result<(), ArrowError> { - if self.list_array.is_null(index) { - builder.append_null(); - return Ok(()); - } - - let offsets = self.list_array.offsets(); - let start = offsets[index].as_usize(); - let end = offsets[index + 1].as_usize(); - - let mut list_builder = builder.try_new_list()?; - for value_index in start..end { - self.values_builder - .append_row(value_index, &mut list_builder)?; - } - list_builder.finish(); - Ok(()) - } -} - -/// Map builder for MapArray types -pub(crate) struct MapArrowToVariantBuilder<'a> { - map_array: &'a arrow::array::MapArray, - key_strings: arrow::array::StringArray, - values_builder: Box>, -} - -impl<'a> MapArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Result { - let map_array = array.as_map(); - - // Pre-cast keys to strings once (like existing convert_map code) - let keys = 
cast(map_array.keys(), &DataType::Utf8)?; - let key_strings = keys.as_string::().clone(); - - // Create recursive builder for values - let values = map_array.values(); - let values_builder = - make_arrow_to_variant_row_builder(values.data_type(), values.as_ref())?; - - Ok(Self { - map_array, - key_strings, - values_builder: Box::new(values_builder), - }) - } - - fn append_row( - &mut self, - index: usize, - builder: &mut impl VariantBuilderExt, - ) -> Result<(), ArrowError> { - // Check for NULL map first (via null bitmap) - if self.map_array.is_null(index) { - builder.append_null(); - return Ok(()); - } - - let offsets = self.map_array.offsets(); - let start = offsets[index].as_usize(); - let end = offsets[index + 1].as_usize(); - - // Create object builder for this map (even if empty) - let mut object_builder = builder.try_new_object()?; - - // Add each key-value pair (loop does nothing for empty maps - correct!) - for kv_index in start..end { - let key = self.key_strings.value(kv_index); - let mut field_builder = ObjectFieldBuilder::new(key, &mut object_builder); - self.values_builder - .append_row(kv_index, &mut field_builder)?; - } - - object_builder.finish(); // Empty map becomes empty object {} - Ok(()) - } -} - -/// Union builder for both sparse and dense union arrays -pub(crate) struct UnionArrowToVariantBuilder<'a> { - union_array: &'a arrow::array::UnionArray, - child_builders: HashMap>>, -} - -impl<'a> UnionArrowToVariantBuilder<'a> { - fn new(array: &'a dyn Array) -> Result { - let union_array = array.as_union(); - let type_ids = union_array.type_ids(); - - // Create child builders for each union field - let mut child_builders = HashMap::new(); - for &type_id in type_ids { - let child_array = union_array.child(type_id); - let child_builder = - make_arrow_to_variant_row_builder(child_array.data_type(), child_array.as_ref())?; - child_builders.insert(type_id, Box::new(child_builder)); - } - - Ok(Self { - union_array, - child_builders, - }) - } - - fn 
append_row( - &mut self, - index: usize, - builder: &mut impl VariantBuilderExt, - ) -> Result<(), ArrowError> { - let type_id = self.union_array.type_id(index); - let value_offset = self.union_array.value_offset(index); - - // Delegate to the appropriate child builder, or append null to handle an invalid type_id - match self.child_builders.get_mut(&type_id) { - Some(child_builder) => child_builder.append_row(value_offset, builder)?, - None => builder.append_null(), - } - - Ok(()) - } -} - -// Decimal32 builder for Arrow Decimal32Array -define_row_builder!( - struct Decimal32ArrowToVariantBuilder<'a> { - scale: i8, - }, - |array| -> arrow::array::Decimal32Array { array.as_primitive() }, - |value| decimal_to_variant_decimal!(value, scale, i32, VariantDecimal4) -); - -// Decimal64 builder for Arrow Decimal64Array -define_row_builder!( - struct Decimal64ArrowToVariantBuilder<'a> { - scale: i8, - }, - |array| -> arrow::array::Decimal64Array { array.as_primitive() }, - |value| decimal_to_variant_decimal!(value, scale, i64, VariantDecimal8) -); - -// Decimal128 builder for Arrow Decimal128Array -define_row_builder!( - struct Decimal128ArrowToVariantBuilder<'a> { - scale: i8, - }, - |array| -> arrow::array::Decimal128Array { array.as_primitive() }, - |value| decimal_to_variant_decimal!(value, scale, i128, VariantDecimal16) -); - -// Decimal256 builder for Arrow Decimal256Array -define_row_builder!( - struct Decimal256ArrowToVariantBuilder<'a> { - scale: i8, - }, - |array| -> arrow::array::Decimal256Array { array.as_primitive() }, - |value| { - // Decimal256 needs special handling - convert to i128 if possible - match value.to_i128() { - Some(i128_val) => decimal_to_variant_decimal!(i128_val, scale, i128, VariantDecimal16), - None => Variant::Null, // Value too large for i128 - } - } -); - -// Generic Binary builder for Arrow BinaryArray and LargeBinaryArray -define_row_builder!( - struct BinaryArrowToVariantBuilder<'a, O: OffsetSizeTrait>, - |array| -> GenericBinaryArray 
{ array.as_binary() }, - |value| value -); - -// BinaryView builder - handles BinaryViewArray -define_row_builder!( - struct BinaryViewArrowToVariantBuilder<'a>, - |array| -> arrow::array::BinaryViewArray { array.as_byte_view() }, - |value| value -); - -// FixedSizeBinary builder - handles FixedSizeBinaryArray -define_row_builder!( - struct FixedSizeBinaryArrowToVariantBuilder<'a>, - |array| -> arrow::array::FixedSizeBinaryArray { array.as_fixed_size_binary() }, - |value| value -); - -// Utf8View builder - handles StringViewArray -define_row_builder!( - struct Utf8ViewArrowToVariantBuilder<'a>, - |array| -> arrow::array::StringViewArray { array.as_string_view() }, - |value| value -); - -// Generic Timestamp builder for Arrow timestamp arrays -define_row_builder!( - struct TimestampArrowToVariantBuilder<'a, T: ArrowTimestampType> { - has_time_zone: bool, - }, - |array| -> arrow::array::PrimitiveArray { array.as_primitive() }, - |value| { - // Convert using Arrow's temporal conversion functions - let Some(naive_datetime) = as_datetime::(value) else { - return Err(ArrowError::CastError( - "Failed to convert Arrow timestamp value to chrono::NaiveDateTime".to_string(), - )); - }; - if *has_time_zone { - // Has timezone -> DateTime -> TimestampMicros/TimestampNanos - let utc_dt: DateTime = Utc.from_utc_datetime(&naive_datetime); - Variant::from(utc_dt) // Uses From> for Variant - } else { - // No timezone -> NaiveDateTime -> TimestampNtzMicros/TimestampNtzNanos - Variant::from(naive_datetime) // Uses From for Variant - } - } -); - -// Generic Date builder for Arrow date arrays (Date32, Date64) -define_row_builder!( - struct DateArrowToVariantBuilder<'a, T: ArrowTemporalType> - where i64: From, - |array| -> PrimitiveArray { array.as_primitive() }, - |value| { - let date_value = i64::from(value); - as_date::(date_value).map(Variant::from).unwrap_or(Variant::Null) - } -); - -// Generic Time builder for Arrow time arrays (Time32, Time64) -define_row_builder!( - struct 
TimeArrowToVariantBuilder<'a, T: ArrowTemporalType> - where i64: From, - |array| -> PrimitiveArray { array.as_primitive() }, - |value| { - let time_value = i64::from(value); - as_time::(time_value).map(Variant::from).unwrap_or(Variant::Null) - } -); - #[cfg(test)] mod tests { use super::*; @@ -893,15 +88,16 @@ mod tests { }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{ - i256, BinaryType, BinaryViewType, IntervalDayTime, IntervalMonthDayNano, LargeBinaryType, + i256, BinaryType, BinaryViewType, Date32Type, Date64Type, Int32Type, Int64Type, + Int8Type, IntervalDayTime, IntervalMonthDayNano, LargeBinaryType, }; use arrow_schema::{DataType, Field, Fields, UnionFields}; use arrow_schema::{ DECIMAL128_MAX_PRECISION, DECIMAL32_MAX_PRECISION, DECIMAL64_MAX_PRECISION, }; - use chrono::{NaiveDate, NaiveTime}; + use chrono::{DateTime, NaiveDate, NaiveTime}; use half::f16; - use parquet_variant::{Variant, VariantBuilder, VariantDecimal16}; + use parquet_variant::{Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8}; use std::{sync::Arc, vec}; macro_rules! 
max_unscaled_value { @@ -2654,1619 +1850,3 @@ mod tests { } } } - -#[cfg(test)] -mod row_builder_tests { - use super::*; - use arrow::array::{ArrayRef, BooleanArray, Int32Array, StringArray}; - use std::sync::Arc; - - #[test] - fn test_primitive_row_builder() { - // Test Int32Array - let int_array = Int32Array::from(vec![Some(42), None, Some(100)]); - let mut row_builder = - make_arrow_to_variant_row_builder(int_array.data_type(), &int_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - // Test first value - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(0, &mut variant_builder).unwrap(); - variant_builder.finish(); - - // Test null value - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(1, &mut variant_builder).unwrap(); - variant_builder.finish(); - - // Test second value - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(2, &mut variant_builder).unwrap(); - variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - assert_eq!(variant_array.value(0), Variant::Int32(42)); - assert!(variant_array.is_null(1)); - assert_eq!(variant_array.value(2), Variant::Int32(100)); - } - - #[test] - fn test_string_row_builder() { - let string_array = StringArray::from(vec![Some("hello"), None, Some("world")]); - let mut row_builder = - make_arrow_to_variant_row_builder(string_array.data_type(), &string_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(0, &mut variant_builder).unwrap(); - variant_builder.finish(); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(1, &mut variant_builder).unwrap(); - variant_builder.finish(); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(2, &mut variant_builder).unwrap(); - 
variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - assert_eq!(variant_array.value(0), Variant::from("hello")); - assert!(variant_array.is_null(1)); - assert_eq!(variant_array.value(2), Variant::from("world")); - } - - #[test] - fn test_boolean_row_builder() { - let bool_array = BooleanArray::from(vec![Some(true), None, Some(false)]); - let mut row_builder = - make_arrow_to_variant_row_builder(bool_array.data_type(), &bool_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(0, &mut variant_builder).unwrap(); - variant_builder.finish(); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(1, &mut variant_builder).unwrap(); - variant_builder.finish(); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(2, &mut variant_builder).unwrap(); - variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - assert_eq!(variant_array.value(0), Variant::from(true)); - assert!(variant_array.is_null(1)); - assert_eq!(variant_array.value(2), Variant::from(false)); - } - - #[test] - fn test_struct_row_builder() { - use arrow::array::{ArrayRef, Int32Array, StringArray, StructArray}; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; - - // Create a struct array with int and string fields - let int_field = Field::new("id", DataType::Int32, true); - let string_field = Field::new("name", DataType::Utf8, true); - - let int_array = Int32Array::from(vec![Some(1), None, Some(3)]); - let string_array = StringArray::from(vec![Some("Alice"), Some("Bob"), None]); - - let struct_array = StructArray::try_new( - vec![int_field, string_field].into(), - vec![ - Arc::new(int_array) as ArrayRef, - Arc::new(string_array) as ArrayRef, - ], - None, - ) - .unwrap(); - - let mut row_builder = - 
make_arrow_to_variant_row_builder(struct_array.data_type(), &struct_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - // Test first row - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(0, &mut variant_builder).unwrap(); - variant_builder.finish(); - - // Test second row (with null int field) - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(1, &mut variant_builder).unwrap(); - variant_builder.finish(); - - // Test third row (with null string field) - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(2, &mut variant_builder).unwrap(); - variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Check first row - should have both fields - let first_variant = variant_array.value(0); - assert_eq!(first_variant.get_object_field("id"), Some(Variant::from(1))); - assert_eq!( - first_variant.get_object_field("name"), - Some(Variant::from("Alice")) - ); - - // Check second row - should have name field but not id (null field omitted) - let second_variant = variant_array.value(1); - assert_eq!(second_variant.get_object_field("id"), None); // null field omitted - assert_eq!( - second_variant.get_object_field("name"), - Some(Variant::from("Bob")) - ); - - // Check third row - should have id field but not name (null field omitted) - let third_variant = variant_array.value(2); - assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(3))); - assert_eq!(third_variant.get_object_field("name"), None); // null field omitted - } - - #[test] - fn test_run_end_encoded_row_builder() { - use arrow::array::{Int32Array, RunArray}; - use arrow::datatypes::Int32Type; - - // Create a run-end encoded array: [A, A, B, B, B, C] - // run_ends: [2, 5, 6] - // values: ["A", "B", "C"] - let values = StringArray::from(vec!["A", "B", "C"]); - let run_ends = Int32Array::from(vec![2, 5, 6]); - let 
run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); - let mut array_builder = VariantArrayBuilder::new(6); - - // Test sequential access (most common case) - for i in 0..6 { - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); - variant_builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 6); - - // Verify the values - assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 - assert_eq!(variant_array.value(1), Variant::from("A")); // Run 0 - assert_eq!(variant_array.value(2), Variant::from("B")); // Run 1 - assert_eq!(variant_array.value(3), Variant::from("B")); // Run 1 - assert_eq!(variant_array.value(4), Variant::from("B")); // Run 1 - assert_eq!(variant_array.value(5), Variant::from("C")); // Run 2 - } - - #[test] - fn test_run_end_encoded_random_access() { - use arrow::array::{Int32Array, RunArray}; - use arrow::datatypes::Int32Type; - - // Create a run-end encoded array: [A, A, B, B, B, C] - let values = StringArray::from(vec!["A", "B", "C"]); - let run_ends = Int32Array::from(vec![2, 5, 6]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); - - // Test random access pattern (backward jumps, forward jumps) - let access_pattern = [0, 5, 2, 4, 1, 3]; // Mix of all cases - let expected_values = ["A", "C", "B", "B", "A", "B"]; - - for (i, &index) in access_pattern.iter().enumerate() { - let mut array_builder = VariantArrayBuilder::new(1); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(index, &mut variant_builder).unwrap(); - variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.value(0), 
Variant::from(expected_values[i])); - } - } - - #[test] - fn test_run_end_encoded_with_nulls() { - use arrow::array::{Int32Array, RunArray}; - use arrow::datatypes::Int32Type; - - // Create a run-end encoded array with null values: [A, A, null, null, B] - let values = StringArray::from(vec![Some("A"), None, Some("B")]); - let run_ends = Int32Array::from(vec![2, 4, 5]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(run_array.data_type(), &run_array).unwrap(); - let mut array_builder = VariantArrayBuilder::new(5); - - // Test sequential access - for i in 0..5 { - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); - variant_builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 5); - - // Verify the values - assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 - assert_eq!(variant_array.value(1), Variant::from("A")); // Run 0 - assert!(variant_array.is_null(2)); // Run 1 (null) - assert!(variant_array.is_null(3)); // Run 1 (null) - assert_eq!(variant_array.value(4), Variant::from("B")); // Run 2 - } - - #[test] - fn test_dictionary_row_builder() { - use arrow::array::{DictionaryArray, Int32Array}; - use arrow::datatypes::Int32Type; - - // Create a dictionary array: keys=[0, 1, 0, 2, 1], values=["apple", "banana", "cherry"] - let values = StringArray::from(vec!["apple", "banana", "cherry"]); - let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); - let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); - let mut array_builder = VariantArrayBuilder::new(5); - - // Test sequential access - for i in 0..5 { - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); - 
variant_builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 5); - - // Verify the values match the dictionary lookup - assert_eq!(variant_array.value(0), Variant::from("apple")); // keys[0] = 0 -> values[0] = "apple" - assert_eq!(variant_array.value(1), Variant::from("banana")); // keys[1] = 1 -> values[1] = "banana" - assert_eq!(variant_array.value(2), Variant::from("apple")); // keys[2] = 0 -> values[0] = "apple" - assert_eq!(variant_array.value(3), Variant::from("cherry")); // keys[3] = 2 -> values[2] = "cherry" - assert_eq!(variant_array.value(4), Variant::from("banana")); // keys[4] = 1 -> values[1] = "banana" - } - - #[test] - fn test_dictionary_with_nulls() { - use arrow::array::{DictionaryArray, Int32Array}; - use arrow::datatypes::Int32Type; - - // Create a dictionary array with null keys: keys=[0, null, 1, null, 2], values=["x", "y", "z"] - let values = StringArray::from(vec!["x", "y", "z"]); - let keys = Int32Array::from(vec![Some(0), None, Some(1), None, Some(2)]); - let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); - let mut array_builder = VariantArrayBuilder::new(5); - - // Test sequential access - for i in 0..5 { - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); - variant_builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 5); - - // Verify the values and nulls - assert_eq!(variant_array.value(0), Variant::from("x")); // keys[0] = 0 -> values[0] = "x" - assert!(variant_array.is_null(1)); // keys[1] = null - assert_eq!(variant_array.value(2), Variant::from("y")); // keys[2] = 1 -> values[1] = "y" - assert!(variant_array.is_null(3)); // keys[3] = null - assert_eq!(variant_array.value(4), Variant::from("z")); // keys[4] = 2 -> values[2] = "z" - } - 
- #[test] - fn test_dictionary_random_access() { - use arrow::array::{DictionaryArray, Int32Array}; - use arrow::datatypes::Int32Type; - - // Create a dictionary array: keys=[0, 1, 2, 0, 1, 2], values=["red", "green", "blue"] - let values = StringArray::from(vec!["red", "green", "blue"]); - let keys = Int32Array::from(vec![0, 1, 2, 0, 1, 2]); - let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); - - // Test random access pattern - let access_pattern = [5, 0, 3, 1, 4, 2]; // Random order - let expected_values = ["blue", "red", "red", "green", "green", "blue"]; - - for (i, &index) in access_pattern.iter().enumerate() { - let mut array_builder = VariantArrayBuilder::new(1); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(index, &mut variant_builder).unwrap(); - variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.value(0), Variant::from(expected_values[i])); - } - } - - #[test] - fn test_nested_dictionary() { - use arrow::array::{DictionaryArray, Int32Array, StructArray}; - use arrow::datatypes::{Field, Int32Type}; - - // Create a dictionary with struct values - let id_array = Int32Array::from(vec![1, 2, 3]); - let name_array = StringArray::from(vec!["Alice", "Bob", "Charlie"]); - let struct_array = StructArray::from(vec![ - ( - Arc::new(Field::new("id", DataType::Int32, false)), - Arc::new(id_array) as ArrayRef, - ), - ( - Arc::new(Field::new("name", DataType::Utf8, false)), - Arc::new(name_array) as ArrayRef, - ), - ]); - - let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); - let dict_array = - DictionaryArray::::try_new(keys, Arc::new(struct_array)).unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array).unwrap(); - let mut array_builder = VariantArrayBuilder::new(5); - - // Test sequential access 
- for i in 0..5 { - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); - variant_builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 5); - - // Verify the nested struct values - let first_variant = variant_array.value(0); - assert_eq!(first_variant.get_object_field("id"), Some(Variant::from(1))); - assert_eq!( - first_variant.get_object_field("name"), - Some(Variant::from("Alice")) - ); - - let second_variant = variant_array.value(1); - assert_eq!( - second_variant.get_object_field("id"), - Some(Variant::from(2)) - ); - assert_eq!( - second_variant.get_object_field("name"), - Some(Variant::from("Bob")) - ); - - // Test that repeated keys give same values - let third_variant = variant_array.value(2); - assert_eq!(third_variant.get_object_field("id"), Some(Variant::from(1))); - assert_eq!( - third_variant.get_object_field("name"), - Some(Variant::from("Alice")) - ); - } - - #[test] - fn test_list_row_builder() { - use arrow::array::ListArray; - - // Create a list array: [[1, 2], [3, 4, 5], null, []] - let data = vec![ - Some(vec![Some(1), Some(2)]), - Some(vec![Some(3), Some(4), Some(5)]), - None, - Some(vec![]), - ]; - let list_array = ListArray::from_iter_primitive::(data); - - let mut row_builder = - make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); - let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); - - for i in 0..list_array.len() { - let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = variant_array_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 4); - - // Row 0: [1, 2] - let row0 = variant_array.value(0); - let list0 = row0.as_list().unwrap(); - assert_eq!(list0.len(), 2); - assert_eq!(list0.get(0), Some(Variant::from(1))); - assert_eq!(list0.get(1), 
Some(Variant::from(2))); - - // Row 1: [3, 4, 5] - let row1 = variant_array.value(1); - let list1 = row1.as_list().unwrap(); - assert_eq!(list1.len(), 3); - assert_eq!(list1.get(0), Some(Variant::from(3))); - assert_eq!(list1.get(1), Some(Variant::from(4))); - assert_eq!(list1.get(2), Some(Variant::from(5))); - - // Row 2: null - assert!(variant_array.is_null(2)); - - // Row 3: [] - let row3 = variant_array.value(3); - let list3 = row3.as_list().unwrap(); - assert_eq!(list3.len(), 0); - } - - #[test] - fn test_large_list_row_builder() { - use arrow::array::LargeListArray; - - // Create a large list array: [[1, 2], null] - let data = vec![Some(vec![Some(1i64), Some(2i64)]), None]; - let list_array = LargeListArray::from_iter_primitive::(data); - - let mut row_builder = - make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); - let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); - - for i in 0..list_array.len() { - let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = variant_array_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 2); - - // Row 0: [1, 2] - let row0 = variant_array.value(0); - let list0 = row0.as_list().unwrap(); - assert_eq!(list0.len(), 2); - assert_eq!(list0.get(0), Some(Variant::from(1i64))); - assert_eq!(list0.get(1), Some(Variant::from(2i64))); - - // Row 1: null - assert!(variant_array.is_null(1)); - } - - #[test] - fn test_sliced_list_row_builder() { - use arrow::array::ListArray; - - // Create a list array: [[1, 2], [3, 4, 5], [6]] - let data = vec![ - Some(vec![Some(1), Some(2)]), - Some(vec![Some(3), Some(4), Some(5)]), - Some(vec![Some(6)]), - ]; - let list_array = ListArray::from_iter_primitive::(data); - - // Slice to get just the middle element: [[3, 4, 5]] - let sliced_array = list_array.slice(1, 1); - - let mut row_builder = - 
make_arrow_to_variant_row_builder(sliced_array.data_type(), &sliced_array).unwrap(); - let mut variant_array_builder = VariantArrayBuilder::new(sliced_array.len()); - - // Test the single row - let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(0, &mut builder).unwrap(); - builder.finish(); - - let variant_array = variant_array_builder.build(); - - // Verify result - assert_eq!(variant_array.len(), 1); - - // Row 0: [3, 4, 5] - let row0 = variant_array.value(0); - let list0 = row0.as_list().unwrap(); - assert_eq!(list0.len(), 3); - assert_eq!(list0.get(0), Some(Variant::from(3))); - assert_eq!(list0.get(1), Some(Variant::from(4))); - assert_eq!(list0.get(2), Some(Variant::from(5))); - } - - #[test] - fn test_nested_list_row_builder() { - use arrow::array::ListArray; - use arrow::datatypes::Field; - - // Build the nested structure manually - let inner_field = Arc::new(Field::new("item", DataType::Int32, true)); - let inner_list_field = Arc::new(Field::new("item", DataType::List(inner_field), true)); - - let values_data = vec![Some(vec![Some(1), Some(2)]), Some(vec![Some(3)])]; - let values_list = ListArray::from_iter_primitive::(values_data); - - let outer_offsets = arrow::buffer::OffsetBuffer::new(vec![0i32, 2, 2].into()); - let outer_list = ListArray::new( - inner_list_field, - outer_offsets, - Arc::new(values_list), - Some(arrow::buffer::NullBuffer::from(vec![true, false])), - ); - - let mut row_builder = - make_arrow_to_variant_row_builder(outer_list.data_type(), &outer_list).unwrap(); - let mut variant_array_builder = VariantArrayBuilder::new(outer_list.len()); - - for i in 0..outer_list.len() { - let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = variant_array_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 2); - - // Row 0: [[1, 2], [3]] - let row0 = variant_array.value(0); - let outer_list0 = 
row0.as_list().unwrap(); - assert_eq!(outer_list0.len(), 2); - - let inner_list0_0 = outer_list0.get(0).unwrap(); - let inner_list0_0 = inner_list0_0.as_list().unwrap(); - assert_eq!(inner_list0_0.len(), 2); - assert_eq!(inner_list0_0.get(0), Some(Variant::from(1))); - assert_eq!(inner_list0_0.get(1), Some(Variant::from(2))); - - let inner_list0_1 = outer_list0.get(1).unwrap(); - let inner_list0_1 = inner_list0_1.as_list().unwrap(); - assert_eq!(inner_list0_1.len(), 1); - assert_eq!(inner_list0_1.get(0), Some(Variant::from(3))); - - // Row 1: null - assert!(variant_array.is_null(1)); - } - - #[test] - fn test_map_row_builder() { - use arrow::array::{Int32Array, MapArray, StringArray, StructArray}; - use arrow::buffer::{NullBuffer, OffsetBuffer}; - use arrow::datatypes::{DataType, Field, Fields}; - use std::sync::Arc; - - // Create the entries struct array (key-value pairs) - let keys = StringArray::from(vec!["key1", "key2", "key3"]); - let values = Int32Array::from(vec![1, 2, 3]); - let entries_fields = Fields::from(vec![ - Field::new("key", DataType::Utf8, false), - Field::new("value", DataType::Int32, true), - ]); - let entries = StructArray::new( - entries_fields.clone(), - vec![Arc::new(keys), Arc::new(values)], - None, // No nulls in the entries themselves - ); - - // Create offsets for 4 maps: [0..1], [1..1], [1..1], [1..3] - // Map 0: {"key1": 1} (1 entry) - // Map 1: {} (0 entries - empty) - // Map 2: null (0 entries but NULL via null buffer) - // Map 3: {"key2": 2, "key3": 3} (2 entries) - let offsets = OffsetBuffer::new(vec![0, 1, 1, 1, 3].into()); - - // Create null buffer - map at index 2 is NULL - let null_buffer = Some(NullBuffer::from(vec![true, true, false, true])); - - // Create the map field - let map_field = Arc::new(Field::new( - "entries", - DataType::Struct(entries_fields), - false, // Keys are non-nullable - )); - - // Create MapArray using try_new - let map_array = MapArray::try_new( - map_field, - offsets, - entries, - null_buffer, - false, 
// not ordered - ) - .unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(map_array.data_type(), &map_array).unwrap(); - let mut variant_array_builder = VariantArrayBuilder::new(4); - - // Test each row - for i in 0..4 { - let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = variant_array_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 4); - - // Map 0: {"key1": 1} - let map0 = variant_array.value(0); - let obj0 = map0.as_object().unwrap(); - assert_eq!(obj0.len(), 1); - assert_eq!(obj0.get("key1"), Some(Variant::from(1))); - - // Map 1: {} (empty object, not null) - let map1 = variant_array.value(1); - let obj1 = map1.as_object().unwrap(); - assert_eq!(obj1.len(), 0); // Empty object - - // Map 2: null (actual NULL) - assert!(variant_array.is_null(2)); - - // Map 3: {"key2": 2, "key3": 3} - let map3 = variant_array.value(3); - let obj3 = map3.as_object().unwrap(); - assert_eq!(obj3.len(), 2); - assert_eq!(obj3.get("key2"), Some(Variant::from(2))); - assert_eq!(obj3.get("key3"), Some(Variant::from(3))); - } - - #[test] - fn test_union_sparse_row_builder() { - use arrow::array::{Float64Array, Int32Array, StringArray, UnionArray}; - use arrow::buffer::ScalarBuffer; - use arrow::datatypes::{DataType, Field, UnionFields}; - use std::sync::Arc; - - // Create a sparse union array with mixed types (int, float, string) - let int_array = Int32Array::from(vec![Some(1), None, None, None, Some(34), None]); - let float_array = Float64Array::from(vec![None, Some(3.2), None, Some(32.5), None, None]); - let string_array = StringArray::from(vec![None, None, Some("hello"), None, None, None]); - let type_ids = [0, 1, 2, 1, 0, 0].into_iter().collect::>(); - - let union_fields = UnionFields::new( - vec![0, 1, 2], - vec![ - Field::new("int_field", DataType::Int32, false), - Field::new("float_field", DataType::Float64, false), - 
Field::new("string_field", DataType::Utf8, false), - ], - ); - - let children: Vec> = vec![ - Arc::new(int_array), - Arc::new(float_array), - Arc::new(string_array), - ]; - - let union_array = UnionArray::try_new( - union_fields, - type_ids, - None, // Sparse union - children, - ) - .unwrap(); - - // Test the row builder - let mut row_builder = - make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); - - let mut variant_builder = VariantArrayBuilder::new(union_array.len()); - for i in 0..union_array.len() { - let mut builder = variant_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - let variant_array = variant_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 6); - - // Row 0: int 1 - assert_eq!(variant_array.value(0), Variant::Int32(1)); - - // Row 1: float 3.2 - assert_eq!(variant_array.value(1), Variant::Double(3.2)); - - // Row 2: string "hello" - assert_eq!(variant_array.value(2), Variant::from("hello")); - - // Row 3: float 32.5 - assert_eq!(variant_array.value(3), Variant::Double(32.5)); - - // Row 4: int 34 - assert_eq!(variant_array.value(4), Variant::Int32(34)); - - // Row 5: null (int array has null at this position) - assert!(variant_array.is_null(5)); - } - - #[test] - fn test_union_dense_row_builder() { - use arrow::array::{Float64Array, Int32Array, StringArray, UnionArray}; - use arrow::buffer::ScalarBuffer; - use arrow::datatypes::{DataType, Field, UnionFields}; - use std::sync::Arc; - - // Create a dense union array with mixed types (int, float, string) - let int_array = Int32Array::from(vec![Some(1), Some(34), None]); - let float_array = Float64Array::from(vec![3.2, 32.5]); - let string_array = StringArray::from(vec!["hello"]); - let type_ids = [0, 1, 2, 1, 0, 0].into_iter().collect::>(); - let offsets = [0, 0, 0, 1, 1, 2] - .into_iter() - .collect::>(); - - let union_fields = UnionFields::new( - vec![0, 1, 2], - vec![ - 
Field::new("int_field", DataType::Int32, false), - Field::new("float_field", DataType::Float64, false), - Field::new("string_field", DataType::Utf8, false), - ], - ); - - let children: Vec> = vec![ - Arc::new(int_array), - Arc::new(float_array), - Arc::new(string_array), - ]; - - let union_array = UnionArray::try_new( - union_fields, - type_ids, - Some(offsets), // Dense union - children, - ) - .unwrap(); - - // Test the row builder - let mut row_builder = - make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); - - let mut variant_builder = VariantArrayBuilder::new(union_array.len()); - for i in 0..union_array.len() { - let mut builder = variant_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - let variant_array = variant_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 6); - - // Row 0: int 1 (offset 0 in int_array) - assert_eq!(variant_array.value(0), Variant::Int32(1)); - - // Row 1: float 3.2 (offset 0 in float_array) - assert_eq!(variant_array.value(1), Variant::Double(3.2)); - - // Row 2: string "hello" (offset 0 in string_array) - assert_eq!(variant_array.value(2), Variant::from("hello")); - - // Row 3: float 32.5 (offset 1 in float_array) - assert_eq!(variant_array.value(3), Variant::Double(32.5)); - - // Row 4: int 34 (offset 1 in int_array) - assert_eq!(variant_array.value(4), Variant::Int32(34)); - - // Row 5: null (offset 2 in int_array, which has null) - assert!(variant_array.is_null(5)); - } - - #[test] - fn test_union_sparse_type_ids_row_builder() { - use arrow::array::{Int32Array, StringArray, UnionArray}; - use arrow::buffer::ScalarBuffer; - use arrow::datatypes::{DataType, Field, UnionFields}; - use std::sync::Arc; - - // Create a sparse union with non-contiguous type IDs (1, 3) - let int_array = Int32Array::from(vec![Some(42), None]); - let string_array = StringArray::from(vec![None, Some("test")]); - let type_ids = [1, 
3].into_iter().collect::>(); - - let union_fields = UnionFields::new( - vec![1, 3], // Non-contiguous type IDs - vec![ - Field::new("int_field", DataType::Int32, false), - Field::new("string_field", DataType::Utf8, false), - ], - ); - - let children: Vec> = vec![Arc::new(int_array), Arc::new(string_array)]; - - let union_array = UnionArray::try_new( - union_fields, - type_ids, - None, // Sparse union - children, - ) - .unwrap(); - - // Test the row builder - let mut row_builder = - make_arrow_to_variant_row_builder(union_array.data_type(), &union_array).unwrap(); - - let mut variant_builder = VariantArrayBuilder::new(union_array.len()); - for i in 0..union_array.len() { - let mut builder = variant_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - let variant_array = variant_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 2); - - // Row 0: int 42 (type_id = 1) - assert_eq!(variant_array.value(0), Variant::Int32(42)); - - // Row 1: string "test" (type_id = 3) - assert_eq!(variant_array.value(1), Variant::from("test")); - } - - #[test] - fn test_decimal32_row_builder() { - use arrow::array::Decimal32Array; - use parquet_variant::VariantDecimal4; - - // Test Decimal32Array with scale 2 (e.g., for currency: 12.34) - let decimal_array = Decimal32Array::from(vec![Some(1234), None, Some(-5678)]) - .with_precision_and_scale(9, 2) - .unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..decimal_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: 12.34 (1234 with scale 2) - assert_eq!( - variant_array.value(0), - Variant::from(VariantDecimal4::try_new(1234, 2).unwrap()) 
- ); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: -56.78 (-5678 with scale 2) - assert_eq!( - variant_array.value(2), - Variant::from(VariantDecimal4::try_new(-5678, 2).unwrap()) - ); - } - - #[test] - fn test_decimal128_row_builder() { - use arrow::array::Decimal128Array; - use parquet_variant::VariantDecimal16; - - // Test Decimal128Array with negative scale (multiply by 10^|scale|) - let decimal_array = Decimal128Array::from(vec![Some(123), None, Some(456)]) - .with_precision_and_scale(10, -2) - .unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..decimal_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: 123 * 10^2 = 12300 with scale 0 (negative scale handling) - assert_eq!( - variant_array.value(0), - Variant::from(VariantDecimal16::try_new(12300, 0).unwrap()) - ); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 456 * 10^2 = 45600 with scale 0 - assert_eq!( - variant_array.value(2), - Variant::from(VariantDecimal16::try_new(45600, 0).unwrap()) - ); - } - - #[test] - fn test_decimal256_overflow_row_builder() { - use arrow::array::Decimal256Array; - use arrow::datatypes::i256; - - // Test Decimal256Array with a value that overflows i128 - let large_value = i256::from_i128(i128::MAX) + i256::from(1); // Overflows i128 - let decimal_array = Decimal256Array::from(vec![Some(large_value), Some(i256::from(123))]) - .with_precision_and_scale(76, 3) - .unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(2); - - for i in 0..decimal_array.len() { - let mut builder = 
array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 2); - - // Row 0: overflow value becomes Variant::Null - assert_eq!(variant_array.value(0), Variant::Null); - - // Row 1: normal value converts successfully - assert_eq!( - variant_array.value(1), - Variant::from(VariantDecimal16::try_new(123, 3).unwrap()) - ); - } - - #[test] - fn test_binary_row_builder() { - use arrow::array::BinaryArray; - - // Test BinaryArray with various binary data - let binary_data = vec![ - Some(b"hello".as_slice()), - None, - Some(b"\x00\x01\x02\xFF".as_slice()), - Some(b"".as_slice()), // Empty binary - ]; - let binary_array = BinaryArray::from(binary_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(binary_array.data_type(), &binary_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..binary_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: "hello" bytes - assert_eq!(variant_array.value(0), Variant::from(b"hello".as_slice())); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: binary with special bytes - let bytes = [0x00, 0x01, 0x02, 0xFF]; - assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); - - // Row 3: empty binary - let bytes = []; - assert_eq!(variant_array.value(3), Variant::from(bytes.as_slice())); - } - - #[test] - fn test_large_binary_row_builder() { - use arrow::array::LargeBinaryArray; - - // Test LargeBinaryArray - let binary_data = vec![ - Some(b"large binary data".as_slice()), - None, - Some(b"another large chunk".as_slice()), - ]; - let large_binary_array = LargeBinaryArray::from(binary_data); - - let mut row_builder = - 
make_arrow_to_variant_row_builder(large_binary_array.data_type(), &large_binary_array) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..large_binary_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: large binary data - assert_eq!( - variant_array.value(0), - Variant::from(b"large binary data".as_slice()) - ); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: another large chunk - assert_eq!( - variant_array.value(2), - Variant::from(b"another large chunk".as_slice()) - ); - } - - #[test] - fn test_binary_view_row_builder() { - use arrow::array::BinaryViewArray; - - // Test BinaryViewArray - let binary_data = vec![ - Some(b"short".as_slice()), - None, - Some(b"this is a longer binary view that exceeds inline storage".as_slice()), - ]; - let binary_view_array = BinaryViewArray::from(binary_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(binary_view_array.data_type(), &binary_view_array) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..binary_view_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: short binary - assert_eq!(variant_array.value(0), Variant::from(b"short".as_slice())); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: long binary view - assert_eq!( - variant_array.value(2), - Variant::from(b"this is a longer binary view that exceeds inline storage".as_slice()) - ); - } - - #[test] - fn test_fixed_size_binary_row_builder() { - use arrow::array::FixedSizeBinaryArray; - - // Test FixedSizeBinaryArray with 4-byte values - let binary_data = vec![ - 
Some([0x01, 0x02, 0x03, 0x04]), - None, - Some([0xFF, 0xFE, 0xFD, 0xFC]), - ]; - let fixed_binary_array = - FixedSizeBinaryArray::try_from_sparse_iter_with_size(binary_data.into_iter(), 4) - .unwrap(); - - let mut row_builder = - make_arrow_to_variant_row_builder(fixed_binary_array.data_type(), &fixed_binary_array) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..fixed_binary_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: fixed size binary - let bytes = [0x01, 0x02, 0x03, 0x04]; - assert_eq!(variant_array.value(0), Variant::from(bytes.as_slice())); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: another fixed size binary - let bytes = [0xFF, 0xFE, 0xFD, 0xFC]; - assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); - } - - #[test] - fn test_utf8_view_row_builder() { - use arrow::array::StringViewArray; - - // Test StringViewArray (Utf8View) - let string_data = vec![ - Some("short"), - None, - Some("this is a much longer string that will be stored out-of-line in the buffer"), - ]; - let string_view_array = StringViewArray::from(string_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(string_view_array.data_type(), &string_view_array) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..string_view_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: short string - assert_eq!(variant_array.value(0), Variant::from("short")); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: long string view - assert_eq!( - variant_array.value(2), - Variant::from( - 
"this is a much longer string that will be stored out-of-line in the buffer" - ) - ); - } - - #[test] - fn test_timestamp_second_row_builder() { - use arrow::array::TimestampSecondArray; - - // Test TimestampSecondArray without timezone - let timestamp_data = vec![ - Some(1609459200), // 2021-01-01 00:00:00 UTC - None, - Some(1640995200), // 2022-01-01 00:00:00 UTC - ]; - let timestamp_array = TimestampSecondArray::from(timestamp_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..timestamp_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: 2021-01-01 00:00:00 (no timezone -> NaiveDateTime -> TimestampNtzMicros) - let expected_naive = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); - assert_eq!(variant_array.value(0), Variant::from(expected_naive)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 2022-01-01 00:00:00 - let expected_naive2 = DateTime::from_timestamp(1640995200, 0).unwrap().naive_utc(); - assert_eq!(variant_array.value(2), Variant::from(expected_naive2)); - } - - #[test] - fn test_timestamp_with_timezone_row_builder() { - use arrow::array::TimestampMicrosecondArray; - use chrono::DateTime; - - // Test TimestampMicrosecondArray with timezone - let timestamp_data = vec![ - Some(1609459200000000), // 2021-01-01 00:00:00 UTC (in microseconds) - None, - Some(1640995200000000), // 2022-01-01 00:00:00 UTC (in microseconds) - ]; - let timezone = "UTC".to_string(); - let timestamp_array = - TimestampMicrosecondArray::from(timestamp_data).with_timezone(timezone.clone()); - - let mut row_builder = - make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) - .unwrap(); - - let 
mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..timestamp_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: 2021-01-01 00:00:00 UTC (with timezone -> DateTime -> TimestampMicros) - let expected_utc = DateTime::from_timestamp(1609459200, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_utc)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 2022-01-01 00:00:00 UTC - let expected_utc2 = DateTime::from_timestamp(1640995200, 0).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_utc2)); - } - - #[test] - fn test_timestamp_nanosecond_precision_row_builder() { - use arrow::array::TimestampNanosecondArray; - - // Test TimestampNanosecondArray with nanosecond precision - let timestamp_data = vec![ - Some(1609459200123456789), // 2021-01-01 00:00:00.123456789 UTC - None, - Some(1609459200000000000), // 2021-01-01 00:00:00.000000000 UTC (no fractional seconds) - ]; - let timestamp_array = TimestampNanosecondArray::from(timestamp_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..timestamp_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: with nanoseconds -> should use TimestampNtzNanos - let expected_with_nanos = DateTime::from_timestamp(1609459200, 123456789) - .unwrap() - .naive_utc(); - assert_eq!(variant_array.value(0), Variant::from(expected_with_nanos)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: no fractional seconds -> should use 
TimestampNtzMicros - let expected_no_nanos = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); - assert_eq!(variant_array.value(2), Variant::from(expected_no_nanos)); - } - - #[test] - fn test_timestamp_millisecond_row_builder() { - use arrow::array::TimestampMillisecondArray; - - // Test TimestampMillisecondArray - let timestamp_data = vec![ - Some(1609459200123), // 2021-01-01 00:00:00.123 UTC - None, - Some(1609459200000), // 2021-01-01 00:00:00.000 UTC - ]; - let timestamp_array = TimestampMillisecondArray::from(timestamp_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(timestamp_array.data_type(), ×tamp_array) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..timestamp_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: with milliseconds -> TimestampNtzMicros (123ms = 123000000ns) - let expected_with_millis = DateTime::from_timestamp(1609459200, 123000000) - .unwrap() - .naive_utc(); - assert_eq!(variant_array.value(0), Variant::from(expected_with_millis)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: no fractional seconds -> TimestampNtzMicros - let expected_no_millis = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); - assert_eq!(variant_array.value(2), Variant::from(expected_no_millis)); - } - - #[test] - fn test_date32_row_builder() { - use arrow::array::Date32Array; - use chrono::NaiveDate; - - // Test Date32Array with various dates - let date_data = vec![ - Some(0), // 1970-01-01 - None, - Some(19723), // 2024-01-01 (days since epoch) - Some(-719162), // 0001-01-01 (near minimum) - ]; - let date_array = Date32Array::from(date_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(date_array.data_type(), &date_array).unwrap(); - - let mut 
array_builder = VariantArrayBuilder::new(4); - - for i in 0..date_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 1970-01-01 (epoch) - let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 2024-01-01 - let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_2024)); - - // Row 3: 0001-01-01 (near minimum date) - let expected_min = NaiveDate::from_ymd_opt(1, 1, 1).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_min)); - } - - #[test] - fn test_date64_row_builder() { - use arrow::array::Date64Array; - use chrono::NaiveDate; - - // Test Date64Array with various dates (milliseconds since epoch) - let date_data = vec![ - Some(0), // 1970-01-01 - None, - Some(1704067200000), // 2024-01-01 (milliseconds since epoch) - Some(86400000), // 1970-01-02 - ]; - let date_array = Date64Array::from(date_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(date_array.data_type(), &date_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..date_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 1970-01-01 (epoch) - let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 2024-01-01 - let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); - 
assert_eq!(variant_array.value(2), Variant::from(expected_2024)); - - // Row 3: 1970-01-02 - let expected_next_day = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_next_day)); - } - - #[test] - fn test_time32_second_row_builder() { - use arrow::array::Time32SecondArray; - use chrono::NaiveTime; - - // Test Time32SecondArray with various times (seconds since midnight) - let time_data = vec![ - Some(0), // 00:00:00 - None, - Some(3661), // 01:01:01 - Some(86399), // 23:59:59 - ]; - let time_array = Time32SecondArray::from(time_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..time_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 00:00:00 (midnight) - let expected_midnight = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 01:01:01 - let expected_time = NaiveTime::from_hms_opt(1, 1, 1).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_time)); - - // Row 3: 23:59:59 (last second of day) - let expected_last = NaiveTime::from_hms_opt(23, 59, 59).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_last)); - } - - #[test] - fn test_time32_millisecond_row_builder() { - use arrow::array::Time32MillisecondArray; - use chrono::NaiveTime; - - // Test Time32MillisecondArray with various times (milliseconds since midnight) - let time_data = vec![ - Some(0), // 00:00:00.000 - None, - Some(3661123), // 01:01:01.123 - Some(86399999), // 23:59:59.999 - ]; - let time_array = Time32MillisecondArray::from(time_data); - - let mut 
row_builder = - make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..time_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 00:00:00.000 (midnight) - let expected_midnight = NaiveTime::from_hms_milli_opt(0, 0, 0, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 01:01:01.123 - let expected_time = NaiveTime::from_hms_milli_opt(1, 1, 1, 123).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_time)); - - // Row 3: 23:59:59.999 (last millisecond of day) - let expected_last = NaiveTime::from_hms_milli_opt(23, 59, 59, 999).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_last)); - } - - #[test] - fn test_time64_microsecond_row_builder() { - use arrow::array::Time64MicrosecondArray; - use chrono::NaiveTime; - - // Test Time64MicrosecondArray with various times (microseconds since midnight) - let time_data = vec![ - Some(0), // 00:00:00.000000 - None, - Some(3661123456), // 01:01:01.123456 - Some(86399999999), // 23:59:59.999999 - ]; - let time_array = Time64MicrosecondArray::from(time_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..time_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 00:00:00.000000 (midnight) - let expected_midnight = NaiveTime::from_hms_micro_opt(0, 0, 0, 0).unwrap(); - 
assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 01:01:01.123456 - let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_time)); - - // Row 3: 23:59:59.999999 (last microsecond of day) - let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_last)); - } - - #[test] - fn test_time64_nanosecond_row_builder() { - use arrow::array::Time64NanosecondArray; - use chrono::NaiveTime; - - // Test Time64NanosecondArray with various times (nanoseconds since midnight) - let time_data = vec![ - Some(0), // 00:00:00.000000000 - None, - Some(3661123456789), // 01:01:01.123456789 - Some(86399999999999), // 23:59:59.999999999 - ]; - let time_array = Time64NanosecondArray::from(time_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(time_array.data_type(), &time_array).unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..time_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 00:00:00.000000000 (midnight) - let expected_midnight = NaiveTime::from_hms_nano_opt(0, 0, 0, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 01:01:01.123456789 -> truncated to 01:01:01.123456000 (microsecond precision) - let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_time)); - - // Row 3: 23:59:59.999999999 -> truncated to 23:59:59.999999000 (microsecond precision) - let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 
999999).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_last)); - } -} diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index ef674d9614b5..e0414d61b504 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -35,6 +35,7 @@ //! [`VariantPath`]: parquet_variant::VariantPath //! [Variant issue]: https://github.com/apache/arrow-rs/issues/6736 +mod arrow_to_variant; pub mod cast_to_variant; mod from_json; mod to_json; From c3603d6bdd318309d6fa4de91edd5d6a1cb5239c Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 18:45:34 -0700 Subject: [PATCH 49/53] fmt --- parquet-variant-compute/src/cast_to_variant.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 849f73b9ae22..4bc5fd37edc9 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -88,8 +88,8 @@ mod tests { }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{ - i256, BinaryType, BinaryViewType, Date32Type, Date64Type, Int32Type, Int64Type, - Int8Type, IntervalDayTime, IntervalMonthDayNano, LargeBinaryType, + i256, BinaryType, BinaryViewType, Date32Type, Date64Type, Int32Type, Int64Type, Int8Type, + IntervalDayTime, IntervalMonthDayNano, LargeBinaryType, }; use arrow_schema::{DataType, Field, Fields, UnionFields}; use arrow_schema::{ @@ -97,7 +97,9 @@ mod tests { }; use chrono::{DateTime, NaiveDate, NaiveTime}; use half::f16; - use parquet_variant::{Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8}; + use parquet_variant::{ + Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, + }; use std::{sync::Arc, vec}; macro_rules! 
max_unscaled_value { From 22a77454f468bab2022de7cf18648cedb4a6ccc1 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 8 Sep 2025 19:29:38 -0700 Subject: [PATCH 50/53] self review --- .../src/arrow_to_variant.rs | 287 +++++++----------- .../src/cast_to_variant.rs | 2 +- .../src/type_conversion.rs | 53 ---- 3 files changed, 107 insertions(+), 235 deletions(-) diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index a3cc801b8a73..a32ccbe9a01c 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -91,53 +91,53 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { impl<'a> ArrowToVariantRowBuilder<'a> { pub fn append_row( &mut self, - index: usize, builder: &mut impl VariantBuilderExt, + index: usize, ) -> Result<(), ArrowError> { match self { - ArrowToVariantRowBuilder::Null(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Boolean(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveInt8(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveInt16(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveInt32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveInt64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveUInt8(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveUInt16(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveUInt32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveUInt64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveFloat16(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveFloat32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Decimal32(b) => b.append_row(index, builder), - 
ArrowToVariantRowBuilder::Decimal64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Decimal128(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Decimal256(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::TimestampSecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::TimestampMillisecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::TimestampMicrosecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::TimestampNanosecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Date32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Date64(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Time32Second(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Time32Millisecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Time64Microsecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Time64Nanosecond(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Binary(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::LargeBinary(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::BinaryView(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::FixedSizeBinary(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Utf8(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::LargeUtf8(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Utf8View(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::List(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::LargeList(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Struct(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Map(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Union(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::Dictionary(b) => b.append_row(index, builder), - 
ArrowToVariantRowBuilder::RunEndEncodedInt16(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::RunEndEncodedInt32(b) => b.append_row(index, builder), - ArrowToVariantRowBuilder::RunEndEncodedInt64(b) => b.append_row(index, builder), + ArrowToVariantRowBuilder::Null(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Boolean(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveInt8(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveInt16(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveInt32(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveInt64(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveUInt8(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveUInt16(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveUInt32(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveUInt64(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveFloat16(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveFloat32(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Decimal32(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Decimal64(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Decimal128(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Decimal256(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::TimestampSecond(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::TimestampMillisecond(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::TimestampMicrosecond(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::TimestampNanosecond(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Date32(b) => b.append_row(builder, index), + 
ArrowToVariantRowBuilder::Date64(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Time32Second(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Time32Millisecond(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Time64Microsecond(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Time64Nanosecond(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Binary(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::LargeBinary(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::BinaryView(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::FixedSizeBinary(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Utf8(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::LargeUtf8(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Utf8View(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::List(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::LargeList(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Struct(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Map(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Union(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::Dictionary(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::RunEndEncodedInt16(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::RunEndEncodedInt32(b) => b.append_row(builder, index), + ArrowToVariantRowBuilder::RunEndEncodedInt64(b) => b.append_row(builder, index), } } } @@ -332,7 +332,7 @@ macro_rules! 
define_row_builder { } } - fn append_row(&self, index: usize, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> { + fn append_row(&self, builder: &mut impl VariantBuilderExt, index: usize) -> Result<(), ArrowError> { if self.array.is_null(index) { builder.append_null(); } else { @@ -477,8 +477,8 @@ pub(crate) struct NullArrowToVariantBuilder; impl NullArrowToVariantBuilder { fn append_row( &mut self, - _index: usize, builder: &mut impl VariantBuilderExt, + _index: usize, ) -> Result<(), ArrowError> { builder.append_null(); Ok(()) @@ -506,8 +506,8 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { fn append_row( &mut self, - index: usize, builder: &mut impl VariantBuilderExt, + index: usize, ) -> Result<(), ArrowError> { if self.list_array.is_null(index) { builder.append_null(); @@ -521,7 +521,7 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { let mut list_builder = builder.try_new_list()?; for value_index in start..end { self.values_builder - .append_row(value_index, &mut list_builder)?; + .append_row(&mut list_builder, value_index)?; } list_builder.finish(); Ok(()) @@ -557,8 +557,8 @@ impl<'a> StructArrowToVariantBuilder<'a> { fn append_row( &mut self, - index: usize, builder: &mut impl VariantBuilderExt, + index: usize, ) -> Result<(), ArrowError> { if self.struct_array.is_null(index) { builder.append_null(); @@ -570,7 +570,7 @@ impl<'a> StructArrowToVariantBuilder<'a> { for (field_name, row_builder) in &mut self.field_builders { let mut field_builder = parquet_variant::ObjectFieldBuilder::new(field_name, &mut obj_builder); - row_builder.append_row(index, &mut field_builder)?; + row_builder.append_row(&mut field_builder, index)?; } obj_builder.finish(); @@ -590,7 +590,7 @@ impl<'a> MapArrowToVariantBuilder<'a> { pub(crate) fn new(array: &'a dyn Array) -> Result { let map_array = array.as_map(); - // Pre-cast keys to strings once (like existing convert_map code) + // Pre-cast keys to strings once let keys = 
cast(map_array.keys(), &DataType::Utf8)?; let key_strings = keys.as_string::().clone(); @@ -608,8 +608,8 @@ impl<'a> MapArrowToVariantBuilder<'a> { fn append_row( &mut self, - index: usize, builder: &mut impl VariantBuilderExt, + index: usize, ) -> Result<(), ArrowError> { // Check for NULL map first (via null bitmap) if self.map_array.is_null(index) { @@ -621,7 +621,7 @@ impl<'a> MapArrowToVariantBuilder<'a> { let start = offsets[index].as_usize(); let end = offsets[index + 1].as_usize(); - // Create object builder for this map (even if empty) + // Create object builder for this map let mut object_builder = builder.try_new_object()?; // Add each key-value pair (loop does nothing for empty maps - correct!) @@ -629,15 +629,17 @@ impl<'a> MapArrowToVariantBuilder<'a> { let key = self.key_strings.value(kv_index); let mut field_builder = ObjectFieldBuilder::new(key, &mut object_builder); self.values_builder - .append_row(kv_index, &mut field_builder)?; + .append_row(&mut field_builder, kv_index)?; } - object_builder.finish(); // Empty map becomes empty object {} + object_builder.finish(); Ok(()) } } /// Union builder for both sparse and dense union arrays +/// +/// NOTE: Union type ids are _not_ required to be dense, hence the hash map for child builders. 
pub(crate) struct UnionArrowToVariantBuilder<'a> { union_array: &'a arrow::array::UnionArray, child_builders: HashMap>>, @@ -665,15 +667,15 @@ impl<'a> UnionArrowToVariantBuilder<'a> { fn append_row( &mut self, - index: usize, builder: &mut impl VariantBuilderExt, + index: usize, ) -> Result<(), ArrowError> { let type_id = self.union_array.type_id(index); let value_offset = self.union_array.value_offset(index); // Delegate to the appropriate child builder, or append null to handle an invalid type_id match self.child_builders.get_mut(&type_id) { - Some(child_builder) => child_builder.append_row(value_offset, builder)?, + Some(child_builder) => child_builder.append_row(builder, value_offset)?, None => builder.append_null(), } @@ -710,14 +712,14 @@ impl<'a> DictionaryArrowToVariantBuilder<'a> { fn append_row( &mut self, - index: usize, builder: &mut impl VariantBuilderExt, + index: usize, ) -> Result<(), ArrowError> { if self.keys.is_null(index) { builder.append_null(); } else { let normalized_key = self.normalized_keys[index]; - self.values_builder.append_row(normalized_key, builder)?; + self.values_builder.append_row(builder, normalized_key)?; } Ok(()) } @@ -754,8 +756,8 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { fn append_row( &mut self, - index: usize, builder: &mut impl VariantBuilderExt, + index: usize, ) -> Result<(), ArrowError> { self.set_run_for_index(index)?; @@ -766,7 +768,7 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { } // Re-encode the value - self.values_builder.append_row(self.run_number, builder)?; + self.values_builder.append_row(builder, self.run_number)?; Ok(()) } @@ -824,17 +826,17 @@ mod tests { // Test first value let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(0, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 0).unwrap(); variant_builder.finish(); // Test null value let mut variant_builder = 
array_builder.variant_builder(); - row_builder.append_row(1, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 1).unwrap(); variant_builder.finish(); // Test second value let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(2, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 2).unwrap(); variant_builder.finish(); let variant_array = array_builder.build(); @@ -853,13 +855,13 @@ mod tests { let mut array_builder = VariantArrayBuilder::new(3); let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(0, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 0).unwrap(); variant_builder.finish(); let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(1, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 1).unwrap(); variant_builder.finish(); let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(2, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 2).unwrap(); variant_builder.finish(); let variant_array = array_builder.build(); @@ -878,13 +880,13 @@ mod tests { let mut array_builder = VariantArrayBuilder::new(3); let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(0, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 0).unwrap(); variant_builder.finish(); let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(1, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 1).unwrap(); variant_builder.finish(); let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(2, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 2).unwrap(); variant_builder.finish(); let variant_array = array_builder.build(); @@ -924,17 +926,17 @@ mod tests { // Test first row let mut 
variant_builder = array_builder.variant_builder(); - row_builder.append_row(0, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 0).unwrap(); variant_builder.finish(); // Test second row (with null int field) let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(1, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 1).unwrap(); variant_builder.finish(); // Test third row (with null string field) let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(2, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, 2).unwrap(); variant_builder.finish(); let variant_array = array_builder.build(); @@ -981,7 +983,7 @@ mod tests { // Test sequential access (most common case) for i in 0..6 { let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, i).unwrap(); variant_builder.finish(); } @@ -1017,7 +1019,7 @@ mod tests { for (i, &index) in access_pattern.iter().enumerate() { let mut array_builder = VariantArrayBuilder::new(1); let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(index, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, index).unwrap(); variant_builder.finish(); let variant_array = array_builder.build(); @@ -1042,7 +1044,7 @@ mod tests { // Test sequential access for i in 0..5 { let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, i).unwrap(); variant_builder.finish(); } @@ -1074,7 +1076,7 @@ mod tests { // Test sequential access for i in 0..5 { let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, i).unwrap(); variant_builder.finish(); } 
@@ -1106,7 +1108,7 @@ mod tests { // Test sequential access for i in 0..5 { let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, i).unwrap(); variant_builder.finish(); } @@ -1141,7 +1143,7 @@ mod tests { for (i, &index) in access_pattern.iter().enumerate() { let mut array_builder = VariantArrayBuilder::new(1); let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(index, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, index).unwrap(); variant_builder.finish(); let variant_array = array_builder.build(); @@ -1179,7 +1181,7 @@ mod tests { // Test sequential access for i in 0..5 { let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut variant_builder).unwrap(); + row_builder.append_row(&mut variant_builder, i).unwrap(); variant_builder.finish(); } @@ -1232,7 +1234,7 @@ mod tests { for i in 0..list_array.len() { let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1265,40 +1267,6 @@ mod tests { assert_eq!(list3.len(), 0); } - #[test] - fn test_large_list_row_builder() { - use arrow::array::LargeListArray; - - // Create a large list array: [[1, 2], null] - let data = vec![Some(vec![Some(1i64), Some(2i64)]), None]; - let list_array = LargeListArray::from_iter_primitive::(data); - - let mut row_builder = - make_arrow_to_variant_row_builder(list_array.data_type(), &list_array).unwrap(); - let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); - - for i in 0..list_array.len() { - let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); - builder.finish(); - } - - let variant_array = variant_array_builder.build(); - - // Verify results - 
assert_eq!(variant_array.len(), 2); - - // Row 0: [1, 2] - let row0 = variant_array.value(0); - let list0 = row0.as_list().unwrap(); - assert_eq!(list0.len(), 2); - assert_eq!(list0.get(0), Some(Variant::from(1i64))); - assert_eq!(list0.get(1), Some(Variant::from(2i64))); - - // Row 1: null - assert!(variant_array.is_null(1)); - } - #[test] fn test_sliced_list_row_builder() { use arrow::array::ListArray; @@ -1320,7 +1288,7 @@ mod tests { // Test the single row let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(0, &mut builder).unwrap(); + row_builder.append_row(&mut builder, 0).unwrap(); builder.finish(); let variant_array = variant_array_builder.build(); @@ -1363,7 +1331,7 @@ mod tests { for i in 0..outer_list.len() { let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1446,7 +1414,7 @@ mod tests { // Test each row for i in 0..4 { let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1520,7 +1488,7 @@ mod tests { let mut variant_builder = VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { let mut builder = variant_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } let variant_array = variant_builder.build(); @@ -1593,7 +1561,7 @@ mod tests { let mut variant_builder = VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { let mut builder = variant_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } let variant_array = variant_builder.build(); @@ -1657,7 +1625,7 @@ mod tests { let mut variant_builder = 
VariantArrayBuilder::new(union_array.len()); for i in 0..union_array.len() { let mut builder = variant_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } let variant_array = variant_builder.build(); @@ -1689,7 +1657,7 @@ mod tests { for i in 0..decimal_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1729,7 +1697,7 @@ mod tests { for i in 0..decimal_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1770,7 +1738,7 @@ mod tests { for i in 0..decimal_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1807,7 +1775,7 @@ mod tests { for i in 0..binary_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1829,49 +1797,6 @@ mod tests { assert_eq!(variant_array.value(3), Variant::from(bytes.as_slice())); } - #[test] - fn test_large_binary_row_builder() { - use arrow::array::LargeBinaryArray; - - // Test LargeBinaryArray - let binary_data = vec![ - Some(b"large binary data".as_slice()), - None, - Some(b"another large chunk".as_slice()), - ]; - let large_binary_array = LargeBinaryArray::from(binary_data); - - let mut row_builder = - make_arrow_to_variant_row_builder(large_binary_array.data_type(), &large_binary_array) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..large_binary_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut 
builder).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: large binary data - assert_eq!( - variant_array.value(0), - Variant::from(b"large binary data".as_slice()) - ); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: another large chunk - assert_eq!( - variant_array.value(2), - Variant::from(b"another large chunk".as_slice()) - ); - } - #[test] fn test_binary_view_row_builder() { use arrow::array::BinaryViewArray; @@ -1892,7 +1817,7 @@ mod tests { for i in 0..binary_view_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1934,7 +1859,7 @@ mod tests { for i in 0..fixed_binary_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -1973,7 +1898,7 @@ mod tests { for i in 0..string_view_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2015,7 +1940,7 @@ mod tests { for i in 0..timestamp_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2057,7 +1982,7 @@ mod tests { for i in 0..timestamp_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2096,7 +2021,7 @@ mod tests { for i in 0..timestamp_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); 
} @@ -2137,7 +2062,7 @@ mod tests { for i in 0..timestamp_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2179,7 +2104,7 @@ mod tests { for i in 0..date_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2223,7 +2148,7 @@ mod tests { for i in 0..date_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2267,7 +2192,7 @@ mod tests { for i in 0..time_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2311,7 +2236,7 @@ mod tests { for i in 0..time_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2355,7 +2280,7 @@ mod tests { for i in 0..time_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } @@ -2399,7 +2324,7 @@ mod tests { for i in 0..time_array.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder).unwrap(); + row_builder.append_row(&mut builder, i).unwrap(); builder.finish(); } diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 4bc5fd37edc9..1ec0c54888f6 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -59,7 +59,7 @@ pub fn cast_to_variant(input: &dyn 
Array) -> Result { // Process each row using the row builder for i in 0..input.len() { let mut builder = array_builder.variant_builder(); - row_builder.append_row(i, &mut builder)?; + row_builder.append_row(&mut builder, i)?; builder.finish(); } diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index da25a638f1a6..b60dc0188b3f 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -17,26 +17,6 @@ //! Module for transforming a typed arrow `Array` to `VariantArray`. -/// Convert the input array to a `VariantArray` row by row, using `method` -/// not requiring a generic type to downcast the generic array to a specific -/// array type and `cast_fn` to transform each element to a type compatible with Variant -#[allow(unused)] -macro_rules! non_generic_conversion_array { - ($array:expr, $cast_fn:expr, $builder:expr) => {{ - let array = $array; - for i in 0..array.len() { - if array.is_null(i) { - $builder.append_null(); - continue; - } - let cast_value = $cast_fn(array.value(i)); - $builder.append_variant(Variant::from(cast_value)); - } - }}; -} -#[allow(unused)] -pub(crate) use non_generic_conversion_array; - /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! non_generic_conversion_single_value { ($array:expr, $cast_fn:expr, $index:expr) => {{ @@ -51,22 +31,6 @@ macro_rules! non_generic_conversion_single_value { } pub(crate) use non_generic_conversion_single_value; -/// Convert the input array to a `VariantArray` row by row, using `method` -/// requiring a generic type to downcast the generic array to a specific -/// array type and `cast_fn` to transform each element to a type compatible with Variant -#[allow(unused)] -macro_rules! 
generic_conversion_array { - ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $builder:expr) => {{ - $crate::type_conversion::non_generic_conversion_array!( - $input.$method::<$t>(), - $cast_fn, - $builder - ) - }}; -} -#[allow(unused)] -pub(crate) use generic_conversion_array; - /// Convert the value at a specific index in the given array into a `Variant`, /// using `method` requiring a generic type to downcast the generic array /// to a specific array type and `cast_fn` to transform the element. @@ -81,23 +45,6 @@ macro_rules! generic_conversion_single_value { } pub(crate) use generic_conversion_single_value; -/// Convert the input array of a specific primitive type to a `VariantArray` -/// row by row -#[allow(unused)] -macro_rules! primitive_conversion_array { - ($t:ty, $input:expr, $builder:expr) => {{ - $crate::type_conversion::generic_conversion_array!( - $t, - as_primitive, - |v| v, - $input, - $builder - ) - }}; -} -#[allow(unused)] -pub(crate) use primitive_conversion_array; - /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! primitive_conversion_single_value { ($t:ty, $input:expr, $index:expr) => {{ From c9fdb8ce83f5ba63887645bcb0b1b0c50957bc45 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 9 Sep 2025 06:43:28 -0700 Subject: [PATCH 51/53] simpler macro usage --- .../src/arrow_to_variant.rs | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index a32ccbe9a01c..912a7fb4bf0a 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -306,14 +306,16 @@ pub(crate) fn make_arrow_to_variant_row_builder<'a>( } /// Macro to define (possibly generic) row builders with consistent structure and behavior. -/// Supports optional extra fields that are passed to the constructor. 
+/// Supports an optional transform for values read from the underlying array. Also supports optional +/// extra fields that are passed to the constructor and which are available by reference in the +/// value transform. macro_rules! define_row_builder { ( struct $name:ident<$lifetime:lifetime $(, $generic:ident: $($bound:path)+)?> $(where $where_path:path: $where_bound:path)? $({ $($field:ident: $field_type:ty),* $(,)? })?, - |$array_param:ident| -> $array_type:ty { $init_expr:expr }, - |$value:ident| $value_transform:expr + |$array_param:ident| -> $array_type:ty { $init_expr:expr } + $(, |$value:ident| $value_transform:expr)? ) => { pub(crate) struct $name<$lifetime $(, $generic: $($bound)+)?> $(where $where_path: $where_bound)? @@ -336,10 +338,16 @@ macro_rules! define_row_builder { if self.array.is_null(index) { builder.append_null(); } else { - let $value = self.array.value(index); // Capture fields as variables the transform can access (hygiene) $($(let $field = &self.$field;)*)? - builder.append_value($value_transform); + + // Apply the value transform, if any (with name swapping for hygiene) + let value = self.array.value(index); + $( + let $value = value; + let value = $value_transform; + )? + builder.append_value(value); } Ok(()) } @@ -349,15 +357,13 @@ macro_rules! 
define_row_builder { define_row_builder!( struct BooleanArrowToVariantBuilder<'a>, - |array| -> arrow::array::BooleanArray { array.as_boolean() }, - |value| value + |array| -> arrow::array::BooleanArray { array.as_boolean() } ); define_row_builder!( struct PrimitiveArrowToVariantBuilder<'a, T: ArrowPrimitiveType> where T::Native: Into>, - |array| -> PrimitiveArray { array.as_primitive() }, - |value| value + |array| -> PrimitiveArray { array.as_primitive() } ); define_row_builder!( @@ -443,32 +449,27 @@ define_row_builder!( define_row_builder!( struct BinaryArrowToVariantBuilder<'a, O: OffsetSizeTrait>, - |array| -> GenericBinaryArray { array.as_binary() }, - |value| value + |array| -> GenericBinaryArray { array.as_binary() } ); define_row_builder!( struct BinaryViewArrowToVariantBuilder<'a>, - |array| -> arrow::array::BinaryViewArray { array.as_byte_view() }, - |value| value + |array| -> arrow::array::BinaryViewArray { array.as_byte_view() } ); define_row_builder!( struct FixedSizeBinaryArrowToVariantBuilder<'a>, - |array| -> arrow::array::FixedSizeBinaryArray { array.as_fixed_size_binary() }, - |value| value + |array| -> arrow::array::FixedSizeBinaryArray { array.as_fixed_size_binary() } ); define_row_builder!( struct StringArrowToVariantBuilder<'a, O: OffsetSizeTrait>, - |array| -> GenericStringArray { array.as_string() }, - |value| value + |array| -> GenericStringArray { array.as_string() } ); define_row_builder!( struct StringViewArrowToVariantBuilder<'a>, - |array| -> arrow::array::StringViewArray { array.as_string_view() }, - |value| value + |array| -> arrow::array::StringViewArray { array.as_string_view() } ); /// Null builder that always appends null From 96256a30088a0f197bf9954ec1f517b62298796a Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Tue, 9 Sep 2025 15:04:06 -0700 Subject: [PATCH 52/53] cosmetic cleanups --- .../src/arrow_to_variant.rs | 400 ++++++++---------- 1 file changed, 180 insertions(+), 220 deletions(-) diff --git 
a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index f652d9937d82..c4953b024d69 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -89,55 +89,57 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { } impl<'a> ArrowToVariantRowBuilder<'a> { + /// Appends a single row at the given index to the supplied builder. pub fn append_row( &mut self, builder: &mut impl VariantBuilderExt, index: usize, ) -> Result<(), ArrowError> { + use ArrowToVariantRowBuilder::*; match self { - ArrowToVariantRowBuilder::Null(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Boolean(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveInt8(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveInt16(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveInt32(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveInt64(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveUInt8(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveUInt16(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveUInt32(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveUInt64(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveFloat16(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveFloat32(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::PrimitiveFloat64(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Decimal32(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Decimal64(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Decimal128(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Decimal256(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::TimestampSecond(b) => b.append_row(builder, index), - 
ArrowToVariantRowBuilder::TimestampMillisecond(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::TimestampMicrosecond(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::TimestampNanosecond(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Date32(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Date64(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Time32Second(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Time32Millisecond(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Time64Microsecond(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Time64Nanosecond(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Binary(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::LargeBinary(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::BinaryView(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::FixedSizeBinary(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Utf8(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::LargeUtf8(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Utf8View(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::List(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::LargeList(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Struct(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Map(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Union(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::Dictionary(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::RunEndEncodedInt16(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::RunEndEncodedInt32(b) => b.append_row(builder, index), - ArrowToVariantRowBuilder::RunEndEncodedInt64(b) => b.append_row(builder, index), + Null(b) => b.append_row(builder, index), + Boolean(b) => b.append_row(builder, index), 
+ PrimitiveInt8(b) => b.append_row(builder, index), + PrimitiveInt16(b) => b.append_row(builder, index), + PrimitiveInt32(b) => b.append_row(builder, index), + PrimitiveInt64(b) => b.append_row(builder, index), + PrimitiveUInt8(b) => b.append_row(builder, index), + PrimitiveUInt16(b) => b.append_row(builder, index), + PrimitiveUInt32(b) => b.append_row(builder, index), + PrimitiveUInt64(b) => b.append_row(builder, index), + PrimitiveFloat16(b) => b.append_row(builder, index), + PrimitiveFloat32(b) => b.append_row(builder, index), + PrimitiveFloat64(b) => b.append_row(builder, index), + Decimal32(b) => b.append_row(builder, index), + Decimal64(b) => b.append_row(builder, index), + Decimal128(b) => b.append_row(builder, index), + Decimal256(b) => b.append_row(builder, index), + TimestampSecond(b) => b.append_row(builder, index), + TimestampMillisecond(b) => b.append_row(builder, index), + TimestampMicrosecond(b) => b.append_row(builder, index), + TimestampNanosecond(b) => b.append_row(builder, index), + Date32(b) => b.append_row(builder, index), + Date64(b) => b.append_row(builder, index), + Time32Second(b) => b.append_row(builder, index), + Time32Millisecond(b) => b.append_row(builder, index), + Time64Microsecond(b) => b.append_row(builder, index), + Time64Nanosecond(b) => b.append_row(builder, index), + Binary(b) => b.append_row(builder, index), + LargeBinary(b) => b.append_row(builder, index), + BinaryView(b) => b.append_row(builder, index), + FixedSizeBinary(b) => b.append_row(builder, index), + Utf8(b) => b.append_row(builder, index), + LargeUtf8(b) => b.append_row(builder, index), + Utf8View(b) => b.append_row(builder, index), + List(b) => b.append_row(builder, index), + LargeList(b) => b.append_row(builder, index), + Struct(b) => b.append_row(builder, index), + Map(b) => b.append_row(builder, index), + Union(b) => b.append_row(builder, index), + Dictionary(b) => b.append_row(builder, index), + RunEndEncodedInt16(b) => b.append_row(builder, index), + 
RunEndEncodedInt32(b) => b.append_row(builder, index), + RunEndEncodedInt64(b) => b.append_row(builder, index), } } } @@ -148,170 +150,128 @@ pub(crate) fn make_arrow_to_variant_row_builder<'a>( array: &'a dyn Array, options: &'a CastOptions, ) -> Result, ArrowError> { - let builder = match data_type { - DataType::Null => ArrowToVariantRowBuilder::Null(NullArrowToVariantBuilder), - DataType::Boolean => { - ArrowToVariantRowBuilder::Boolean(BooleanArrowToVariantBuilder::new(array)) - } - DataType::Int8 => { - ArrowToVariantRowBuilder::PrimitiveInt8(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Int16 => { - ArrowToVariantRowBuilder::PrimitiveInt16(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Int32 => { - ArrowToVariantRowBuilder::PrimitiveInt32(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Int64 => { - ArrowToVariantRowBuilder::PrimitiveInt64(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::UInt8 => { - ArrowToVariantRowBuilder::PrimitiveUInt8(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::UInt16 => { - ArrowToVariantRowBuilder::PrimitiveUInt16(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::UInt32 => { - ArrowToVariantRowBuilder::PrimitiveUInt32(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::UInt64 => { - ArrowToVariantRowBuilder::PrimitiveUInt64(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Float16 => { - ArrowToVariantRowBuilder::PrimitiveFloat16(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Float32 => { - ArrowToVariantRowBuilder::PrimitiveFloat32(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Float64 => { - ArrowToVariantRowBuilder::PrimitiveFloat64(PrimitiveArrowToVariantBuilder::new(array)) - } - DataType::Decimal32(_, scale) => { - ArrowToVariantRowBuilder::Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale)) - } - DataType::Decimal64(_, scale) => { - 
ArrowToVariantRowBuilder::Decimal64(Decimal64ArrowToVariantBuilder::new(array, *scale)) - } - DataType::Decimal128(_, scale) => ArrowToVariantRowBuilder::Decimal128( - Decimal128ArrowToVariantBuilder::new(array, *scale), - ), - DataType::Decimal256(_, scale) => ArrowToVariantRowBuilder::Decimal256( - Decimal256ArrowToVariantBuilder::new(array, *scale), - ), - DataType::Timestamp(time_unit, time_zone) => match time_unit { - TimeUnit::Second => ArrowToVariantRowBuilder::TimestampSecond( - TimestampArrowToVariantBuilder::new(array, options, time_zone.is_some()), - ), - TimeUnit::Millisecond => ArrowToVariantRowBuilder::TimestampMillisecond( - TimestampArrowToVariantBuilder::new(array, options, time_zone.is_some()), - ), - TimeUnit::Microsecond => ArrowToVariantRowBuilder::TimestampMicrosecond( - TimestampArrowToVariantBuilder::new(array, options, time_zone.is_some()), - ), - TimeUnit::Nanosecond => ArrowToVariantRowBuilder::TimestampNanosecond( - TimestampArrowToVariantBuilder::new(array, options, time_zone.is_some()), - ), - }, - DataType::Date32 => { - ArrowToVariantRowBuilder::Date32(DateArrowToVariantBuilder::new(array, options)) - } - DataType::Date64 => { - ArrowToVariantRowBuilder::Date64(DateArrowToVariantBuilder::new(array, options)) - } - DataType::Time32(time_unit) => match time_unit { - TimeUnit::Second => ArrowToVariantRowBuilder::Time32Second( - TimeArrowToVariantBuilder::new(array, options), - ), - TimeUnit::Millisecond => ArrowToVariantRowBuilder::Time32Millisecond( - TimeArrowToVariantBuilder::new(array, options), - ), - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported Time32 unit: {time_unit:?}" - ))) + use ArrowToVariantRowBuilder::*; + let builder = + match data_type { + DataType::Null => Null(NullArrowToVariantBuilder), + DataType::Boolean => Boolean(BooleanArrowToVariantBuilder::new(array)), + DataType::Int8 => PrimitiveInt8(PrimitiveArrowToVariantBuilder::new(array)), + DataType::Int16 => 
PrimitiveInt16(PrimitiveArrowToVariantBuilder::new(array)), + DataType::Int32 => PrimitiveInt32(PrimitiveArrowToVariantBuilder::new(array)), + DataType::Int64 => PrimitiveInt64(PrimitiveArrowToVariantBuilder::new(array)), + DataType::UInt8 => PrimitiveUInt8(PrimitiveArrowToVariantBuilder::new(array)), + DataType::UInt16 => PrimitiveUInt16(PrimitiveArrowToVariantBuilder::new(array)), + DataType::UInt32 => PrimitiveUInt32(PrimitiveArrowToVariantBuilder::new(array)), + DataType::UInt64 => PrimitiveUInt64(PrimitiveArrowToVariantBuilder::new(array)), + DataType::Float16 => PrimitiveFloat16(PrimitiveArrowToVariantBuilder::new(array)), + DataType::Float32 => PrimitiveFloat32(PrimitiveArrowToVariantBuilder::new(array)), + DataType::Float64 => PrimitiveFloat64(PrimitiveArrowToVariantBuilder::new(array)), + DataType::Decimal32(_, scale) => { + Decimal32(Decimal32ArrowToVariantBuilder::new(array, *scale)) } - }, - DataType::Time64(time_unit) => match time_unit { - TimeUnit::Microsecond => ArrowToVariantRowBuilder::Time64Microsecond( - TimeArrowToVariantBuilder::new(array, options), - ), - TimeUnit::Nanosecond => ArrowToVariantRowBuilder::Time64Nanosecond( - TimeArrowToVariantBuilder::new(array, options), - ), - _ => { - return Err(ArrowError::CastError(format!( - "Unsupported Time64 unit: {time_unit:?}" - ))) + DataType::Decimal64(_, scale) => { + Decimal64(Decimal64ArrowToVariantBuilder::new(array, *scale)) + } + DataType::Decimal128(_, scale) => { + Decimal128(Decimal128ArrowToVariantBuilder::new(array, *scale)) } - }, - DataType::Duration(_) | DataType::Interval(_) => { - return Err(ArrowError::InvalidArgumentError( - "Casting duration/interval types to Variant is not supported. 
\ + DataType::Decimal256(_, scale) => { + Decimal256(Decimal256ArrowToVariantBuilder::new(array, *scale)) + } + DataType::Timestamp(time_unit, time_zone) => { + match time_unit { + TimeUnit::Second => TimestampSecond(TimestampArrowToVariantBuilder::new( + array, + options, + time_zone.is_some(), + )), + TimeUnit::Millisecond => TimestampMillisecond( + TimestampArrowToVariantBuilder::new(array, options, time_zone.is_some()), + ), + TimeUnit::Microsecond => TimestampMicrosecond( + TimestampArrowToVariantBuilder::new(array, options, time_zone.is_some()), + ), + TimeUnit::Nanosecond => TimestampNanosecond( + TimestampArrowToVariantBuilder::new(array, options, time_zone.is_some()), + ), + } + } + DataType::Date32 => Date32(DateArrowToVariantBuilder::new(array, options)), + DataType::Date64 => Date64(DateArrowToVariantBuilder::new(array, options)), + DataType::Time32(time_unit) => match time_unit { + TimeUnit::Second => Time32Second(TimeArrowToVariantBuilder::new(array, options)), + TimeUnit::Millisecond => { + Time32Millisecond(TimeArrowToVariantBuilder::new(array, options)) + } + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported Time32 unit: {time_unit:?}" + ))) + } + }, + DataType::Time64(time_unit) => match time_unit { + TimeUnit::Microsecond => { + Time64Microsecond(TimeArrowToVariantBuilder::new(array, options)) + } + TimeUnit::Nanosecond => { + Time64Nanosecond(TimeArrowToVariantBuilder::new(array, options)) + } + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported Time64 unit: {time_unit:?}" + ))) + } + }, + DataType::Duration(_) | DataType::Interval(_) => { + return Err(ArrowError::InvalidArgumentError( + "Casting duration/interval types to Variant is not supported. \ The Variant format does not define duration/interval types." 
- .to_string(), - )) - } - DataType::Binary => { - ArrowToVariantRowBuilder::Binary(BinaryArrowToVariantBuilder::new(array)) - } - DataType::LargeBinary => { - ArrowToVariantRowBuilder::LargeBinary(BinaryArrowToVariantBuilder::new(array)) - } - DataType::BinaryView => { - ArrowToVariantRowBuilder::BinaryView(BinaryViewArrowToVariantBuilder::new(array)) - } - DataType::FixedSizeBinary(_) => ArrowToVariantRowBuilder::FixedSizeBinary( - FixedSizeBinaryArrowToVariantBuilder::new(array), - ), - DataType::Utf8 => ArrowToVariantRowBuilder::Utf8(StringArrowToVariantBuilder::new(array)), - DataType::LargeUtf8 => { - ArrowToVariantRowBuilder::LargeUtf8(StringArrowToVariantBuilder::new(array)) - } - DataType::Utf8View => { - ArrowToVariantRowBuilder::Utf8View(StringViewArrowToVariantBuilder::new(array)) - } - DataType::List(_) => { - ArrowToVariantRowBuilder::List(ListArrowToVariantBuilder::new(array, options)?) - } - DataType::LargeList(_) => { - ArrowToVariantRowBuilder::LargeList(ListArrowToVariantBuilder::new(array, options)?) - } - DataType::Struct(_) => ArrowToVariantRowBuilder::Struct(StructArrowToVariantBuilder::new( - array.as_struct(), - options, - )?), - DataType::Map(_, _) => { - ArrowToVariantRowBuilder::Map(MapArrowToVariantBuilder::new(array, options)?) - } - DataType::Union(_, _) => { - ArrowToVariantRowBuilder::Union(UnionArrowToVariantBuilder::new(array, options)?) 
- } - DataType::Dictionary(_, _) => ArrowToVariantRowBuilder::Dictionary( - DictionaryArrowToVariantBuilder::new(array, options)?, - ), - DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { - DataType::Int16 => ArrowToVariantRowBuilder::RunEndEncodedInt16( - RunEndEncodedArrowToVariantBuilder::new(array, options)?, - ), - DataType::Int32 => ArrowToVariantRowBuilder::RunEndEncodedInt32( - RunEndEncodedArrowToVariantBuilder::new(array, options)?, - ), - DataType::Int64 => ArrowToVariantRowBuilder::RunEndEncodedInt64( - RunEndEncodedArrowToVariantBuilder::new(array, options)?, - ), - _ => { + .to_string(), + )) + } + DataType::Binary => Binary(BinaryArrowToVariantBuilder::new(array)), + DataType::LargeBinary => LargeBinary(BinaryArrowToVariantBuilder::new(array)), + DataType::BinaryView => BinaryView(BinaryViewArrowToVariantBuilder::new(array)), + DataType::FixedSizeBinary(_) => { + FixedSizeBinary(FixedSizeBinaryArrowToVariantBuilder::new(array)) + } + DataType::Utf8 => Utf8(StringArrowToVariantBuilder::new(array)), + DataType::LargeUtf8 => LargeUtf8(StringArrowToVariantBuilder::new(array)), + DataType::Utf8View => Utf8View(StringViewArrowToVariantBuilder::new(array)), + DataType::List(_) => List(ListArrowToVariantBuilder::new(array, options)?), + DataType::LargeList(_) => LargeList(ListArrowToVariantBuilder::new(array, options)?), + DataType::Struct(_) => Struct(StructArrowToVariantBuilder::new( + array.as_struct(), + options, + )?), + DataType::Map(_, _) => Map(MapArrowToVariantBuilder::new(array, options)?), + DataType::Union(_, _) => Union(UnionArrowToVariantBuilder::new(array, options)?), + DataType::Dictionary(_, _) => { + Dictionary(DictionaryArrowToVariantBuilder::new(array, options)?) + } + DataType::RunEndEncoded(run_ends, _) => match run_ends.data_type() { + DataType::Int16 => { + RunEndEncodedInt16(RunEndEncodedArrowToVariantBuilder::new(array, options)?) 
+ } + DataType::Int32 => { + RunEndEncodedInt32(RunEndEncodedArrowToVariantBuilder::new(array, options)?) + } + DataType::Int64 => { + RunEndEncodedInt64(RunEndEncodedArrowToVariantBuilder::new(array, options)?) + } + _ => { + return Err(ArrowError::CastError(format!( + "Unsupported run ends type: {:?}", + run_ends.data_type() + ))); + } + }, + dt => { return Err(ArrowError::CastError(format!( - "Unsupported run ends type: {:?}", - run_ends.data_type() + "Unsupported data type for casting to Variant: {dt:?}", ))); } - }, - dt => { - return Err(ArrowError::CastError(format!( - "Unsupported data type for casting to Variant: {dt:?}", - ))); - } - }; + }; Ok(builder) } @@ -826,25 +786,6 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { }) } - fn append_row( - &mut self, - builder: &mut impl VariantBuilderExt, - index: usize, - ) -> Result<(), ArrowError> { - self.set_run_for_index(index)?; - - // Handle null values - if self.run_array.values().is_null(self.run_number) { - builder.append_null(); - return Ok(()); - } - - // Re-encode the value - self.values_builder.append_row(builder, self.run_number)?; - - Ok(()) - } - fn set_run_for_index(&mut self, index: usize) -> Result<(), ArrowError> { if index >= self.run_start { let Some(run_end) = self.run_ends.get(self.run_number) else { @@ -878,6 +819,25 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { }; Ok(()) } + + fn append_row( + &mut self, + builder: &mut impl VariantBuilderExt, + index: usize, + ) -> Result<(), ArrowError> { + self.set_run_for_index(index)?; + + // Handle null values + if self.run_array.values().is_null(self.run_number) { + builder.append_null(); + return Ok(()); + } + + // Re-encode the value + self.values_builder.append_row(builder, self.run_number)?; + + Ok(()) + } } #[cfg(test)] From 6622e86994efe3e92d411535140687e0d7a639df Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Wed, 10 Sep 2025 05:37:36 -0700 Subject: [PATCH 53/53] helper functions to 
reduce boiler plate in unit tests --- .../src/arrow_to_variant.rs | 879 +++++------------- 1 file changed, 207 insertions(+), 672 deletions(-) diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index c4953b024d69..c08990de6911 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -843,94 +843,75 @@ impl<'a, R: RunEndIndexType> RunEndEncodedArrowToVariantBuilder<'a, R> { #[cfg(test)] mod tests { use super::*; - use crate::VariantArrayBuilder; + use crate::{VariantArray, VariantArrayBuilder}; use arrow::array::{ArrayRef, BooleanArray, Int32Array, StringArray}; use std::sync::Arc; - #[test] - fn test_primitive_row_builder() { - // Test Int32Array - let int_array = Int32Array::from(vec![Some(42), None, Some(100)]); + /// Builds a VariantArray from an Arrow array using the row builder. + fn execute_row_builder_test(array: &dyn Array) -> VariantArray { let options = CastOptions::default(); let mut row_builder = - make_arrow_to_variant_row_builder(int_array.data_type(), &int_array, &options).unwrap(); + make_arrow_to_variant_row_builder(array.data_type(), array, &options).unwrap(); - let mut array_builder = VariantArrayBuilder::new(3); + let mut array_builder = VariantArrayBuilder::new(array.len()); - // Test first value - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 0).unwrap(); - variant_builder.finish(); + // The repetitive loop that appears in every test + for i in 0..array.len() { + let mut variant_builder = array_builder.variant_builder(); + row_builder.append_row(&mut variant_builder, i).unwrap(); + variant_builder.finish(); + } - // Test null value - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 1).unwrap(); - variant_builder.finish(); + let variant_array = array_builder.build(); + assert_eq!(variant_array.len(), 
array.len()); + variant_array + } - // Test second value - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 2).unwrap(); - variant_builder.finish(); + /// Generic helper function to test row builders with basic assertion patterns. + /// Uses execute_row_builder_test and adds simple value comparison assertions. + fn test_row_builder_basic(array: &dyn Array, expected_values: Vec>) { + let variant_array = execute_row_builder_test(array); - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - assert_eq!(variant_array.value(0), Variant::Int32(42)); - assert!(variant_array.is_null(1)); - assert_eq!(variant_array.value(2), Variant::Int32(100)); + // The repetitive assertion pattern + for (i, expected) in expected_values.iter().enumerate() { + match expected { + Some(variant) => { + assert_eq!(variant_array.value(i), *variant, "Mismatch at index {}", i) + } + None => assert!(variant_array.is_null(i), "Expected null at index {}", i), + } + } + } + + #[test] + fn test_primitive_row_builder() { + let int_array = Int32Array::from(vec![Some(42), None, Some(100)]); + test_row_builder_basic( + &int_array, + vec![Some(Variant::Int32(42)), None, Some(Variant::Int32(100))], + ); } #[test] fn test_string_row_builder() { let string_array = StringArray::from(vec![Some("hello"), None, Some("world")]); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(string_array.data_type(), &string_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 0).unwrap(); - variant_builder.finish(); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 1).unwrap(); - variant_builder.finish(); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut 
variant_builder, 2).unwrap(); - variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - assert_eq!(variant_array.value(0), Variant::from("hello")); - assert!(variant_array.is_null(1)); - assert_eq!(variant_array.value(2), Variant::from("world")); + test_row_builder_basic( + &string_array, + vec![ + Some(Variant::from("hello")), + None, + Some(Variant::from("world")), + ], + ); } #[test] fn test_boolean_row_builder() { let bool_array = BooleanArray::from(vec![Some(true), None, Some(false)]); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(bool_array.data_type(), &bool_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 0).unwrap(); - variant_builder.finish(); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 1).unwrap(); - variant_builder.finish(); - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 2).unwrap(); - variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - assert_eq!(variant_array.value(0), Variant::from(true)); - assert!(variant_array.is_null(1)); - assert_eq!(variant_array.value(2), Variant::from(false)); + test_row_builder_basic( + &bool_array, + vec![Some(Variant::from(true)), None, Some(Variant::from(false))], + ); } #[test] @@ -956,30 +937,7 @@ mod tests { ) .unwrap(); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(struct_array.data_type(), &struct_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - // Test first row - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 0).unwrap(); - 
variant_builder.finish(); - - // Test second row (with null int field) - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 1).unwrap(); - variant_builder.finish(); - - // Test third row (with null string field) - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 2).unwrap(); - variant_builder.finish(); - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); + let variant_array = execute_row_builder_test(&struct_array); // Check first row - should have both fields let first_variant = variant_array.value(0); @@ -1015,20 +973,7 @@ mod tests { let run_ends = Int32Array::from(vec![2, 5, 6]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(run_array.data_type(), &run_array, &options).unwrap(); - let mut array_builder = VariantArrayBuilder::new(6); - - // Test sequential access (most common case) - for i in 0..6 { - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, i).unwrap(); - variant_builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 6); + let variant_array = execute_row_builder_test(&run_array); // Verify the values assert_eq!(variant_array.value(0), Variant::from("A")); // Run 0 @@ -1111,21 +1056,7 @@ mod tests { let keys = Int32Array::from(vec![0, 1, 0, 2, 1]); let dict_array = DictionaryArray::::try_new(keys, Arc::new(values)).unwrap(); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(dict_array.data_type(), &dict_array, &options) - .unwrap(); - let mut array_builder = VariantArrayBuilder::new(5); - - // Test sequential access - for i in 0..5 { - let mut variant_builder = array_builder.variant_builder(); - row_builder.append_row(&mut variant_builder, 
i).unwrap(); - variant_builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 5); + let variant_array = execute_row_builder_test(&dict_array); // Verify the values match the dictionary lookup assert_eq!(variant_array.value(0), Variant::from("apple")); // keys[0] = 0 -> values[0] = "apple" @@ -1278,22 +1209,7 @@ mod tests { ]; let list_array = ListArray::from_iter_primitive::(data); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(list_array.data_type(), &list_array, &options) - .unwrap(); - let mut variant_array_builder = VariantArrayBuilder::new(list_array.len()); - - for i in 0..list_array.len() { - let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = variant_array_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 4); + let variant_array = execute_row_builder_test(&list_array); // Row 0: [1, 2] let row0 = variant_array.value(0); @@ -1463,22 +1379,7 @@ mod tests { ) .unwrap(); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(map_array.data_type(), &map_array, &options).unwrap(); - let mut variant_array_builder = VariantArrayBuilder::new(4); - - // Test each row - for i in 0..4 { - let mut builder = variant_array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = variant_array_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 4); + let variant_array = execute_row_builder_test(&map_array); // Map 0: {"key1": 1} let map0 = variant_array.value(0); @@ -1538,39 +1439,12 @@ mod tests { ) .unwrap(); - // Test the row builder - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(union_array.data_type(), &union_array, &options) - .unwrap(); - - let mut 
variant_builder = VariantArrayBuilder::new(union_array.len()); - for i in 0..union_array.len() { - let mut builder = variant_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - let variant_array = variant_builder.build(); - - // Verify results - assert_eq!(variant_array.len(), 6); - - // Row 0: int 1 + let variant_array = execute_row_builder_test(&union_array); assert_eq!(variant_array.value(0), Variant::Int32(1)); - - // Row 1: float 3.2 assert_eq!(variant_array.value(1), Variant::Double(3.2)); - - // Row 2: string "hello" assert_eq!(variant_array.value(2), Variant::from("hello")); - - // Row 3: float 32.5 assert_eq!(variant_array.value(3), Variant::Double(32.5)); - - // Row 4: int 34 assert_eq!(variant_array.value(4), Variant::Int32(34)); - - // Row 5: null (int array has null at this position) assert!(variant_array.is_null(5)); } @@ -1627,25 +1501,12 @@ mod tests { } let variant_array = variant_builder.build(); - // Verify results assert_eq!(variant_array.len(), 6); - - // Row 0: int 1 (offset 0 in int_array) assert_eq!(variant_array.value(0), Variant::Int32(1)); - - // Row 1: float 3.2 (offset 0 in float_array) assert_eq!(variant_array.value(1), Variant::Double(3.2)); - - // Row 2: string "hello" (offset 0 in string_array) assert_eq!(variant_array.value(2), Variant::from("hello")); - - // Row 3: float 32.5 (offset 1 in float_array) assert_eq!(variant_array.value(3), Variant::Double(32.5)); - - // Row 4: int 34 (offset 1 in int_array) assert_eq!(variant_array.value(4), Variant::Int32(34)); - - // Row 5: null (offset 2 in int_array, which has null) assert!(variant_array.is_null(5)); } @@ -1713,35 +1574,13 @@ mod tests { .with_precision_and_scale(9, 2) .unwrap(); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 
0..decimal_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: 12.34 (1234 with scale 2) - assert_eq!( - variant_array.value(0), - Variant::from(VariantDecimal4::try_new(1234, 2).unwrap()) - ); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: -56.78 (-5678 with scale 2) - assert_eq!( - variant_array.value(2), - Variant::from(VariantDecimal4::try_new(-5678, 2).unwrap()) + test_row_builder_basic( + &decimal_array, + vec![ + Some(Variant::from(VariantDecimal4::try_new(1234, 2).unwrap())), + None, + Some(Variant::from(VariantDecimal4::try_new(-5678, 2).unwrap())), + ], ); } @@ -1755,35 +1594,13 @@ mod tests { .with_precision_and_scale(10, -2) .unwrap(); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..decimal_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: 123 * 10^2 = 12300 with scale 0 (negative scale handling) - assert_eq!( - variant_array.value(0), - Variant::from(VariantDecimal16::try_new(12300, 0).unwrap()) - ); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 456 * 10^2 = 45600 with scale 0 - assert_eq!( - variant_array.value(2), - Variant::from(VariantDecimal16::try_new(45600, 0).unwrap()) + test_row_builder_basic( + &decimal_array, + vec![ + Some(Variant::from(VariantDecimal16::try_new(12300, 0).unwrap())), + None, + Some(Variant::from(VariantDecimal16::try_new(45600, 0).unwrap())), + ], ); } @@ -1798,29 +1615,12 @@ mod tests { 
.with_precision_and_scale(76, 3) .unwrap(); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(decimal_array.data_type(), &decimal_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(2); - - for i in 0..decimal_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 2); - - // Row 0: overflow value becomes Variant::Null - assert_eq!(variant_array.value(0), Variant::Null); - - // Row 1: normal value converts successfully - assert_eq!( - variant_array.value(1), - Variant::from(VariantDecimal16::try_new(123, 3).unwrap()) + test_row_builder_basic( + &decimal_array, + vec![ + Some(Variant::Null), // Overflow value becomes Null + Some(Variant::from(VariantDecimal16::try_new(123, 3).unwrap())), + ], ); } @@ -1828,7 +1628,6 @@ mod tests { fn test_binary_row_builder() { use arrow::array::BinaryArray; - // Test BinaryArray with various binary data let binary_data = vec![ Some(b"hello".as_slice()), None, @@ -1837,42 +1636,21 @@ mod tests { ]; let binary_array = BinaryArray::from(binary_data); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(binary_array.data_type(), &binary_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..binary_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: "hello" bytes - assert_eq!(variant_array.value(0), Variant::from(b"hello".as_slice())); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: binary with special bytes - let bytes = [0x00, 0x01, 0x02, 0xFF]; - assert_eq!(variant_array.value(2), 
Variant::from(bytes.as_slice())); - - // Row 3: empty binary - let bytes = []; - assert_eq!(variant_array.value(3), Variant::from(bytes.as_slice())); + test_row_builder_basic( + &binary_array, + vec![ + Some(Variant::from(b"hello".as_slice())), + None, + Some(Variant::from([0x00, 0x01, 0x02, 0xFF].as_slice())), + Some(Variant::from([].as_slice())), + ], + ); } #[test] fn test_binary_view_row_builder() { use arrow::array::BinaryViewArray; - // Test BinaryViewArray let binary_data = vec![ Some(b"short".as_slice()), None, @@ -1880,35 +1658,15 @@ mod tests { ]; let binary_view_array = BinaryViewArray::from(binary_data); - let options = CastOptions::default(); - let mut row_builder = make_arrow_to_variant_row_builder( - binary_view_array.data_type(), + test_row_builder_basic( &binary_view_array, - &options, - ) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..binary_view_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: short binary - assert_eq!(variant_array.value(0), Variant::from(b"short".as_slice())); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: long binary view - assert_eq!( - variant_array.value(2), - Variant::from(b"this is a longer binary view that exceeds inline storage".as_slice()) + vec![ + Some(Variant::from(b"short".as_slice())), + None, + Some(Variant::from( + b"this is a longer binary view that exceeds inline storage".as_slice(), + )), + ], ); } @@ -1916,7 +1674,6 @@ mod tests { fn test_fixed_size_binary_row_builder() { use arrow::array::FixedSizeBinaryArray; - // Test FixedSizeBinaryArray with 4-byte values let binary_data = vec![ Some([0x01, 0x02, 0x03, 0x04]), None, @@ -1926,42 +1683,20 @@ mod tests { FixedSizeBinaryArray::try_from_sparse_iter_with_size(binary_data.into_iter(), 4) .unwrap(); - let 
options = CastOptions::default(); - let mut row_builder = make_arrow_to_variant_row_builder( - fixed_binary_array.data_type(), + test_row_builder_basic( &fixed_binary_array, - &options, - ) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..fixed_binary_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: fixed size binary - let bytes = [0x01, 0x02, 0x03, 0x04]; - assert_eq!(variant_array.value(0), Variant::from(bytes.as_slice())); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: another fixed size binary - let bytes = [0xFF, 0xFE, 0xFD, 0xFC]; - assert_eq!(variant_array.value(2), Variant::from(bytes.as_slice())); + vec![ + Some(Variant::from([0x01, 0x02, 0x03, 0x04].as_slice())), + None, + Some(Variant::from([0xFF, 0xFE, 0xFD, 0xFC].as_slice())), + ], + ); } #[test] fn test_utf8_view_row_builder() { use arrow::array::StringViewArray; - // Test StringViewArray (Utf8View) let string_data = vec![ Some("short"), None, @@ -1969,37 +1704,15 @@ mod tests { ]; let string_view_array = StringViewArray::from(string_data); - let options = CastOptions::default(); - let mut row_builder = make_arrow_to_variant_row_builder( - string_view_array.data_type(), + test_row_builder_basic( &string_view_array, - &options, - ) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..string_view_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: short string - assert_eq!(variant_array.value(0), Variant::from("short")); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: long string view - assert_eq!( - 
variant_array.value(2), - Variant::from( - "this is a much longer string that will be stored out-of-line in the buffer" - ) + vec![ + Some(Variant::from("short")), + None, + Some(Variant::from( + "this is a much longer string that will be stored out-of-line in the buffer", + )), + ], ); } @@ -2007,7 +1720,6 @@ mod tests { fn test_timestamp_second_row_builder() { use arrow::array::TimestampSecondArray; - // Test TimestampSecondArray without timezone let timestamp_data = vec![ Some(1609459200), // 2021-01-01 00:00:00 UTC None, @@ -2015,35 +1727,17 @@ mod tests { ]; let timestamp_array = TimestampSecondArray::from(timestamp_data); - let options = CastOptions::default(); - let mut row_builder = make_arrow_to_variant_row_builder( - timestamp_array.data_type(), - ×tamp_array, - &options, - ) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..timestamp_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: 2021-01-01 00:00:00 (no timezone -> NaiveDateTime -> TimestampNtzMicros) - let expected_naive = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); - assert_eq!(variant_array.value(0), Variant::from(expected_naive)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 2022-01-01 00:00:00 + let expected_naive1 = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); let expected_naive2 = DateTime::from_timestamp(1640995200, 0).unwrap().naive_utc(); - assert_eq!(variant_array.value(2), Variant::from(expected_naive2)); + + test_row_builder_basic( + ×tamp_array, + vec![ + Some(Variant::from(expected_naive1)), + None, + Some(Variant::from(expected_naive2)), + ], + ); } #[test] @@ -2051,7 +1745,6 @@ mod tests { use arrow::array::TimestampMicrosecondArray; use chrono::DateTime; - // Test TimestampMicrosecondArray with 
timezone let timestamp_data = vec![ Some(1609459200000000), // 2021-01-01 00:00:00 UTC (in microseconds) None, @@ -2059,44 +1752,25 @@ mod tests { ]; let timezone = "UTC".to_string(); let timestamp_array = - TimestampMicrosecondArray::from(timestamp_data).with_timezone(timezone.clone()); - - let options = CastOptions::default(); - let mut row_builder = make_arrow_to_variant_row_builder( - timestamp_array.data_type(), - ×tamp_array, - &options, - ) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); + TimestampMicrosecondArray::from(timestamp_data).with_timezone(timezone); - for i in 0..timestamp_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: 2021-01-01 00:00:00 UTC (with timezone -> DateTime -> TimestampMicros) - let expected_utc = DateTime::from_timestamp(1609459200, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_utc)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 2022-01-01 00:00:00 UTC + let expected_utc1 = DateTime::from_timestamp(1609459200, 0).unwrap(); let expected_utc2 = DateTime::from_timestamp(1640995200, 0).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_utc2)); + + test_row_builder_basic( + ×tamp_array, + vec![ + Some(Variant::from(expected_utc1)), + None, + Some(Variant::from(expected_utc2)), + ], + ); } #[test] fn test_timestamp_nanosecond_precision_row_builder() { use arrow::array::TimestampNanosecondArray; - // Test TimestampNanosecondArray with nanosecond precision let timestamp_data = vec![ Some(1609459200123456789), // 2021-01-01 00:00:00.123456789 UTC None, @@ -2104,44 +1778,25 @@ mod tests { ]; let timestamp_array = TimestampNanosecondArray::from(timestamp_data); - let options = CastOptions::default(); - let mut row_builder = make_arrow_to_variant_row_builder( - 
timestamp_array.data_type(), - ×tamp_array, - &options, - ) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..timestamp_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: with nanoseconds -> should use TimestampNtzNanos let expected_with_nanos = DateTime::from_timestamp(1609459200, 123456789) .unwrap() .naive_utc(); - assert_eq!(variant_array.value(0), Variant::from(expected_with_nanos)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: no fractional seconds -> should use TimestampNtzMicros let expected_no_nanos = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); - assert_eq!(variant_array.value(2), Variant::from(expected_no_nanos)); + + test_row_builder_basic( + ×tamp_array, + vec![ + Some(Variant::from(expected_with_nanos)), + None, + Some(Variant::from(expected_no_nanos)), + ], + ); } #[test] fn test_timestamp_millisecond_row_builder() { use arrow::array::TimestampMillisecondArray; - // Test TimestampMillisecondArray let timestamp_data = vec![ Some(1609459200123), // 2021-01-01 00:00:00.123 UTC None, @@ -2149,37 +1804,19 @@ mod tests { ]; let timestamp_array = TimestampMillisecondArray::from(timestamp_data); - let options = CastOptions::default(); - let mut row_builder = make_arrow_to_variant_row_builder( - timestamp_array.data_type(), - ×tamp_array, - &options, - ) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(3); - - for i in 0..timestamp_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 3); - - // Row 0: with milliseconds -> TimestampNtzMicros (123ms = 123000000ns) let expected_with_millis = 
DateTime::from_timestamp(1609459200, 123000000) .unwrap() .naive_utc(); - assert_eq!(variant_array.value(0), Variant::from(expected_with_millis)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: no fractional seconds -> TimestampNtzMicros let expected_no_millis = DateTime::from_timestamp(1609459200, 0).unwrap().naive_utc(); - assert_eq!(variant_array.value(2), Variant::from(expected_no_millis)); + + test_row_builder_basic( + ×tamp_array, + vec![ + Some(Variant::from(expected_with_millis)), + None, + Some(Variant::from(expected_no_millis)), + ], + ); } #[test] @@ -2187,7 +1824,6 @@ mod tests { use arrow::array::Date32Array; use chrono::NaiveDate; - // Test Date32Array with various dates let date_data = vec![ Some(0), // 1970-01-01 None, @@ -2196,36 +1832,19 @@ mod tests { ]; let date_array = Date32Array::from(date_data); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(date_array.data_type(), &date_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..date_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 1970-01-01 (epoch) let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 2024-01-01 let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_2024)); - - // Row 3: 0001-01-01 (near minimum date) let expected_min = NaiveDate::from_ymd_opt(1, 1, 1).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_min)); + + test_row_builder_basic( + &date_array, + vec![ + Some(Variant::from(expected_epoch)), + None, + 
Some(Variant::from(expected_2024)), + Some(Variant::from(expected_min)), + ], + ); } #[test] @@ -2242,36 +1861,19 @@ mod tests { ]; let date_array = Date64Array::from(date_data); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(date_array.data_type(), &date_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..date_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 1970-01-01 (epoch) let expected_epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_epoch)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 2024-01-01 let expected_2024 = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_2024)); - - // Row 3: 1970-01-02 let expected_next_day = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_next_day)); + + test_row_builder_basic( + &date_array, + vec![ + Some(Variant::from(expected_epoch)), + None, + Some(Variant::from(expected_2024)), + Some(Variant::from(expected_next_day)), + ], + ); } #[test] @@ -2288,36 +1890,19 @@ mod tests { ]; let time_array = Time32SecondArray::from(time_data); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(time_array.data_type(), &time_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..time_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 00:00:00 (midnight) let 
expected_midnight = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 01:01:01 let expected_time = NaiveTime::from_hms_opt(1, 1, 1).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_time)); - - // Row 3: 23:59:59 (last second of day) let expected_last = NaiveTime::from_hms_opt(23, 59, 59).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_last)); + + test_row_builder_basic( + &time_array, + vec![ + Some(Variant::from(expected_midnight)), + None, + Some(Variant::from(expected_time)), + Some(Variant::from(expected_last)), + ], + ); } #[test] @@ -2334,36 +1919,19 @@ mod tests { ]; let time_array = Time32MillisecondArray::from(time_data); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(time_array.data_type(), &time_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..time_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 00:00:00.000 (midnight) let expected_midnight = NaiveTime::from_hms_milli_opt(0, 0, 0, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 01:01:01.123 let expected_time = NaiveTime::from_hms_milli_opt(1, 1, 1, 123).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_time)); - - // Row 3: 23:59:59.999 (last millisecond of day) let expected_last = NaiveTime::from_hms_milli_opt(23, 59, 59, 999).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_last)); + + test_row_builder_basic( + &time_array, + vec![ + Some(Variant::from(expected_midnight)), + None, + 
Some(Variant::from(expected_time)), + Some(Variant::from(expected_last)), + ], + ); } #[test] @@ -2380,36 +1948,19 @@ mod tests { ]; let time_array = Time64MicrosecondArray::from(time_data); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(time_array.data_type(), &time_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..time_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 00:00:00.000000 (midnight) let expected_midnight = NaiveTime::from_hms_micro_opt(0, 0, 0, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 01:01:01.123456 let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_time)); - - // Row 3: 23:59:59.999999 (last microsecond of day) let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_last)); + + test_row_builder_basic( + &time_array, + vec![ + Some(Variant::from(expected_midnight)), + None, + Some(Variant::from(expected_time)), + Some(Variant::from(expected_last)), + ], + ); } #[test] @@ -2426,35 +1977,19 @@ mod tests { ]; let time_array = Time64NanosecondArray::from(time_data); - let options = CastOptions::default(); - let mut row_builder = - make_arrow_to_variant_row_builder(time_array.data_type(), &time_array, &options) - .unwrap(); - - let mut array_builder = VariantArrayBuilder::new(4); - - for i in 0..time_array.len() { - let mut builder = array_builder.variant_builder(); - row_builder.append_row(&mut builder, i).unwrap(); - builder.finish(); - } - - let variant_array = 
array_builder.build(); - assert_eq!(variant_array.len(), 4); - - // Row 0: 00:00:00.000000000 (midnight) let expected_midnight = NaiveTime::from_hms_nano_opt(0, 0, 0, 0).unwrap(); - assert_eq!(variant_array.value(0), Variant::from(expected_midnight)); - - // Row 1: null - assert!(variant_array.is_null(1)); - - // Row 2: 01:01:01.123456789 -> truncated to 01:01:01.123456000 (microsecond precision) + // Nanoseconds are truncated to microsecond precision in Variant let expected_time = NaiveTime::from_hms_micro_opt(1, 1, 1, 123456).unwrap(); - assert_eq!(variant_array.value(2), Variant::from(expected_time)); - - // Row 3: 23:59:59.999999999 -> truncated to 23:59:59.999999000 (microsecond precision) let expected_last = NaiveTime::from_hms_micro_opt(23, 59, 59, 999999).unwrap(); - assert_eq!(variant_array.value(3), Variant::from(expected_last)); + + test_row_builder_basic( + &time_array, + vec![ + Some(Variant::from(expected_midnight)), + None, + Some(Variant::from(expected_time)), + Some(Variant::from(expected_last)), + ], + ); } }