Merged
2 changes: 2 additions & 0 deletions Cargo.toml
@@ -40,6 +40,7 @@ members = [
"arrow-string",
"parquet",
"parquet-variant",
"parquet-variant-compute",
"parquet-variant-json",
"parquet_derive",
"parquet_derive_test",
@@ -103,6 +104,7 @@ parquet = { version = "55.2.0", path = "./parquet", default-features = false }
# These crates have not yet been released and thus do not use the workspace version
parquet-variant = { version = "0.1.0", path = "./parquet-variant"}
parquet-variant-json = { version = "0.1.0", path = "./parquet-variant-json" }
parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-compute" }

chrono = { version = "0.4.40", default-features = false, features = ["clock"] }

44 changes: 44 additions & 0 deletions parquet-variant-compute/Cargo.toml
@@ -0,0 +1,44 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "parquet-variant-compute"
# This package is still in development and thus the version does
# not follow the versions of the rest of the crates in this repo.
version = "0.1.0"
license = { workspace = true }
description = "Apache Parquet Variant Batch Processing"
homepage = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
keywords = ["arrow", "parquet", "variant"]
edition = { workspace = true }
# parquet-variant needs a newer Rust version than the workspace
rust-version = "1.83"


[dependencies]
arrow = { workspace = true }
arrow-schema = { workspace = true }
parquet-variant = { workspace = true }
parquet-variant-json = { workspace = true }

[lib]
name = "parquet_variant_compute"
bench = false

[dev-dependencies]
181 changes: 181 additions & 0 deletions parquet-variant-compute/src/from_json.rs
@@ -0,0 +1,181 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Module for transforming a batch of JSON strings into a batch of Variants represented as
//! STRUCT<metadata: BINARY, value: BINARY>

use std::sync::Arc;

use arrow::array::{Array, ArrayRef, BinaryArray, BooleanBufferBuilder, StringArray, StructArray};
use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow::datatypes::{DataType, Field};
use arrow_schema::ArrowError;
use parquet_variant::VariantBuilder;
use parquet_variant_json::json_to_variant;

fn variant_arrow_repr() -> DataType {
    // The subfields are expected to be non-nullable according to the parquet variant spec.
Contributor:
The shredding spec makes value optional, with the possibility -- but not the requirement -- to have a typed_value instead when value is missing. Should we recognize that situation and throw a suitable "shredded variants not supported" error, instead of blowing up with an obscure schema mismatch error?

Contributor:
... but now that I think about it, that would be an issue when reading from parquet, which has nothing to do with this PR.

    let metadata_field = Field::new("metadata", DataType::Binary, false);
    let value_field = Field::new("value", DataType::Binary, false);
    let fields = vec![metadata_field, value_field];
    DataType::Struct(fields.into())
}
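A hedged sketch of the guard floated in the comment above; the helper name and error variant choice are hypothetical, and per the follow-up comment such a check really belongs on the Parquet read path rather than in this kernel:

use arrow::array::StructArray;
use arrow_schema::ArrowError;

// Hypothetical guard: surface a clear "shredded variants not supported" error
// when a struct carries a typed_value child or is missing value, instead of a
// generic schema-mismatch failure.
fn reject_shredded_variant(input: &StructArray) -> Result<(), ArrowError> {
    let has_typed_value = input.column_by_name("typed_value").is_some();
    let has_value = input.column_by_name("value").is_some();
    if has_typed_value || !has_value {
        return Err(ArrowError::NotYetImplemented(
            "shredded variants are not supported".to_string(),
        ));
    }
    Ok(())
}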

/// Parse a batch of JSON strings into a batch of Variants represented as
/// STRUCT<metadata: BINARY, value: BINARY> where nulls are preserved. The JSON strings in the input
/// must be valid.
pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<StructArray, ArrowError> {
Contributor:
We should definitely add some docs / example to this kernel

I also might suggest calling it cast_to_variant but that is more of a personal preference

Contributor:
parse_json is a name we could consider for this. It's the name of the function that does this (JSON string to variant) in Spark/Databricks.

Contributor:
I imagine we'll eventually want a top-level cast_to_variant that converts strong types to variant? And passing a string array to such a function should produce a variant column full of string (or short-string) variant values?

This method here is json-parsing strings and casting the result to variant, not casting strings directly to variant?

Contributor Author:
Yeah, I agree with @scovich. As for parse_json, it makes sense to name it that way if it is a well-defined SQL function. However, from a library point of view, I think parse_json is too vague.

Contributor:
Well technically, it would be parquet_variant_compute::parse_json, which is a little clearer? Maybe we should add some nested module structure like arrow-compute does, so we get e.g. parquet_variant_compute::variant::parse_json. People would use that as variant::parse_json or parse_json if they prefer parsimony?

Contributor:
Maybe we can call it cast_json_to_variant and cast_variant_to_json 🤔

    let input_string_array = match input.as_any().downcast_ref::<StringArray>() {
        Some(string_array) => Ok(string_array),
        None => Err(ArrowError::CastError(
            "Expected reference to StringArray as input".into(),
        )),
    }?;
Comment on lines +42 to +47
Contributor:
Why not use ok_or_else?

Suggested change
    let input_string_array = match input.as_any().downcast_ref::<StringArray>() {
        Some(string_array) => Ok(string_array),
        None => Err(ArrowError::CastError(
            "Expected reference to StringArray as input".into(),
        )),
    }?;
    let input_string_array = input
        .as_any()
        .downcast_ref::<StringArray>()
        .ok_or_else(|| ArrowError::CastError("Expected a StringArray as input".into()))?;


    // Zero-copy builders
    let mut metadata_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
Contributor:
Where does the 128 scale factor come from? It seems a stretch to assume that a batch of N JSON strings will produce 128*N bytes worth of unique field names? Especially as N grows and that scale factor becomes more and more dangerous?

If we really feel it's important to pre-allocate capacity, I'd recommend capping it at e.g. 1MB. But honestly, I'd just allocate a normal vec and let it grow normally, unless/until we have some proof that the guaranteed O(n) cost to append n bytes to a vec isn't good enough, and that pre-allocation actually helps in a wide variety of situations.

Contributor:
I agree we should allocate the normal vec and let it grow -- if we want to pre-allocate I think we should allow the user to pass in the allocation size as an argument

Maybe something like

let variant_array = JsonToVariant::new()
  .with_capacity(...)
  .parse(input_array)
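
For illustration, a hedged sketch of that builder-style API; every name here (JsonToVariant, with_capacity, parse) is hypothetical and nothing like it exists in the crate yet:

// Hypothetical wrapper that lets the caller supply the buffer size estimate
// instead of the hard-coded `input.len() * 128`.
pub struct JsonToVariant {
    capacity: Option<usize>,
}

impl JsonToVariant {
    pub fn new() -> Self {
        Self { capacity: None }
    }

    /// Caller-provided estimate of the total metadata/value buffer sizes.
    pub fn with_capacity(mut self, capacity: usize) -> Self {
        self.capacity = Some(capacity);
        self
    }

    pub fn parse(&self, input: &ArrayRef) -> Result<StructArray, ArrowError> {
        // A real implementation would thread `self.capacity` into the
        // `Vec::with_capacity` calls; this sketch simply delegates.
        let _ = self.capacity;
        batch_json_string_to_variant(input)
    }
}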

    let mut metadata_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
    let mut metadata_validity = BooleanBufferBuilder::new(input.len());
    let mut metadata_current_offset: i32 = 0;
    metadata_offsets.push(metadata_current_offset);

    let mut value_buffer: Vec<u8> = Vec::with_capacity(input.len() * 128);
Contributor:
As above... I'm not sure how helpful it really is to pre-allocate capacity based only on input length. Some variants will be very small, and an unbounded over-allocation could hog a lot of memory. Others will be much (much) larger than 128B each, and the vec will anyway end up making multiple capacity increases along the way.

    let mut value_offsets: Vec<i32> = Vec::with_capacity(input.len() + 1);
    let mut value_validity = BooleanBufferBuilder::new(input.len());
    let mut value_current_offset: i32 = 0;
    value_offsets.push(value_current_offset);

    let mut validity = BooleanBufferBuilder::new(input.len());
    for i in 0..input.len() {
        if input.is_null(i) {
            // The subfields are expected to be non-nullable according to the parquet variant spec.
Contributor:
We already know they're non-nullable... maybe the comment can explain that we're pushing valid-but-empty subfields, to maintain proper positioning?

Also: Could we create nullable sub-field arrays even tho the schema says they're non-nullable, and rely on nested null masks? Does that save space in case the variant column has a lot of null entries?

Contributor:
But I guess we still have to push something to the offset arrays, to maintain proper positioning... so valid-but-empty is probably the best we can do?

Contributor:
But I guess we still have to push something to the offset arrays, to maintain proper positioning... so valid-but-empty is probably the best we can do?

This offset layout is required by the Arrow spec for StringArrays
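
A small illustration of that point, using a plain BinaryArray (the same offsets rule applies to the metadata/value children built here): a null row still gets an offset entry, it just repeats the previous one, so every row keeps its position and the null row maps to a zero-length slice.

use arrow::array::{Array, BinaryArray};

fn offsets_for_null_rows() {
    // Entries [b"ab", null, b"c"] produce offsets [0, 2, 2, 3]: the null slot
    // repeats the previous offset rather than being skipped.
    let arr = BinaryArray::from_opt_vec(vec![Some(b"ab".as_ref()), None, Some(b"c".as_ref())]);
    assert_eq!(arr.value_offsets().to_vec(), vec![0, 2, 2, 3]);
    assert!(arr.is_null(1));
}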

            metadata_validity.append(true);
            value_validity.append(true);
            metadata_offsets.push(metadata_current_offset);
            value_offsets.push(value_current_offset);
            validity.append(false);
        } else {
            let mut vb = VariantBuilder::new();
Contributor:
This pattern will cause the variant values to be copied twice -- once into the builder's buffers and then once into the output binary builder, which is probably ok for the first version;

With some care I think we will be able to avoid copying the values, though it will take using the lower level APIs (and building offsets directly)

Contributor Author:
Oh, so just to be clear the two copies you are referring to are metadata_builder.append_value(&metadata); and metadata_builder.finish() right? If so, I'll take care of it in this PR itself.

Contributor:
Not sure I followed that one -- maybe it's a second issue?

I was just referring to all the try_new_with_metadata calls. Today VariantMetadata is Copy, so it's easy to forget that each such call is quietly copying an ~80 byte object. That cost could perhaps add up when iterating over hundreds/thousands of child objects? Or maybe the compiler is really good at making it ~free, since really only one meaningfully exists at a time?

Contributor Author (@harshmotw-db, Jul 8, 2025):
Well, would this resolve the issue you're talking about:

pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result<Self, ArrowError> {
    // let metadata = VariantMetadata::try_new(metadata)?;
    Self::try_new_with_metadata(VariantMetadata::try_new(metadata)?, value)
}

Edit: Oh no never mind. There are more copies downstream. I suppose that is more of a library issue that can potentially be fixed separately

Contributor Author:
I have reduced copying in this function by manually constructing binary buffers.

Contributor:
The above would definitely not help, because each VariantMetadata::try_new would re-validate the same byte buffer!

Contributor:
Oh, so just to be clear the two copies you are referring to are metadata_builder.append_value(&metadata); and metadata_builder.finish() right? If so, I'll take care of it in this PR itself.

What I was really imagining was updating VariantBuilder so it could take a pre-existing buffer (Vec) and append to it, rather than writing into a new buffer and then copying that into the output bytes.

            json_to_variant(input_string_array.value(i), &mut vb)?;

Reviewer:
Can we please add a non-happy-path test, for example with invalid JSON, so this function fails and returns some well-known message to the user?

            let (metadata, value) = vb.finish();
            validity.append(true);

            metadata_current_offset += metadata.len() as i32;
            metadata_buffer.extend(metadata);
            metadata_offsets.push(metadata_current_offset);
            metadata_validity.append(true);

            value_current_offset += value.len() as i32;
            value_buffer.extend(value);
            value_offsets.push(value_current_offset);
            value_validity.append(true);
        }
    }
    let metadata_offsets_buffer = OffsetBuffer::new(ScalarBuffer::from(metadata_offsets));
    let metadata_data_buffer = Buffer::from_vec(metadata_buffer);
    let metadata_null_buffer = NullBuffer::new(metadata_validity.finish());

    let value_offsets_buffer = OffsetBuffer::new(ScalarBuffer::from(value_offsets));
    let value_data_buffer = Buffer::from_vec(value_buffer);
    let value_null_buffer = NullBuffer::new(value_validity.finish());

    let metadata_array = BinaryArray::new(
        metadata_offsets_buffer,
        metadata_data_buffer,
        Some(metadata_null_buffer),
    );
    let value_array = BinaryArray::new(
        value_offsets_buffer,
        value_data_buffer,
        Some(value_null_buffer),
    );

    let struct_fields: Vec<ArrayRef> = vec![Arc::new(metadata_array), Arc::new(value_array)];
    let variant_fields = match variant_arrow_repr() {
        DataType::Struct(fields) => fields,
        _ => unreachable!("variant_arrow_repr is hard-coded and must match the expected schema"),
    };
Comment on lines +108 to +111
Contributor:
Should variant_arrow_repr just return Fields instead of DataType? Then we don't have to unpack it.

It's also a 4-line method with a single call site (3 lines if we don't convert it to a DataType), so we might also consider removing the method entirely? Could also make the relationship with the StructArray more explicit by something like:

Suggested change
    let variant_fields = match variant_arrow_repr() {
        DataType::Struct(fields) => fields,
        _ => unreachable!("variant_arrow_repr is hard-coded and must match the expected schema"),
    };
    let metadata_field = Field::new("metadata", metadata_array.data_type(), false);
    let value_field = Field::new("value", value_array.data_type(), false);
    let variant_fields = vec![metadata_field, value_field].into();

Contributor:
Should variant_arrow_repr just return Fields instead of DataType? Then we don't have to unpack it.

Yes, please

    let null_buffer = NullBuffer::new(validity.finish());
    Ok(StructArray::new(
        variant_fields,
        struct_fields,
        Some(null_buffer),
    ))
}
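
Picking up the review request for docs, a minimal usage sketch of this kernel (module paths assume the re-export from lib.rs shown further down):

use std::sync::Arc;

use arrow::array::{Array, ArrayRef, StringArray};
use arrow_schema::ArrowError;
use parquet_variant_compute::batch_json_string_to_variant;

fn example() -> Result<(), ArrowError> {
    // Each element is an independent JSON document; null slots stay null.
    let input: ArrayRef = Arc::new(StringArray::from(vec![
        Some(r#"{"name": "apache", "stars": 42}"#),
        None,
        Some("[1, 2, 3]"),
    ]));
    let variants = batch_json_string_to_variant(&input)?;
    assert_eq!(variants.len(), 3);
    assert!(variants.is_null(1));
    Ok(())
}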

#[cfg(test)]
mod test {
    use crate::batch_json_string_to_variant;
    use arrow::array::{Array, ArrayRef, BinaryArray, StringArray};
    use arrow_schema::ArrowError;
    use parquet_variant::{Variant, VariantBuilder};
    use std::sync::Arc;

    #[test]
    fn test_batch_json_string_to_variant() -> Result<(), ArrowError> {
        let input = StringArray::from(vec![
            Some("1"),
            None,
            Some("{\"a\": 32}"),
            Some("null"),
            None,
        ]);
Comment on lines +130 to +136

Reviewer:
Can we please cover more cases here?
I mean all the types that JSON supports. I see you already added int, nulls, and a simple dict here with string and int.

If it's a test for the happy path, can we please add:

  • default values (e.g., 0 for int, because some engines can represent NULL as a default value)
  • booleans (both true/false)
  • more nested JSON, not only 1-level nested JSON, e.g. "{{{{true: false}, "-1": "+1"}, 0: 1}}"
  • some long strings to ensure we don't have any string-size-based logic

Contributor:
default values

Not quite sure what you mean by "default values" or how an engine's NULL handling relates to string (json) -> variant parsing?

"{{{{true: false}, "-1": "+1"}, 0: 1}}"

I'm pretty sure JSON objects require string field names?

Contributor Author:
@gotocoding-DB The batch functions in this PR just run some underlying scalar functions on a whole batch of data. The underlying scalar functions have been validated on all sorts of inputs (this PR). I don't think the logical breadth of JSON test cases needs to be tested again.

Contributor:
I agree full coverage here is redundant -- maybe we can add a comment that says "full json parsing coverage is handled by the tests for json_to_variant"

Reviewer:
@harshmotw-db You're right. If the internally used parsing function is tested, we don't need to duplicate the tests here. I'm personally using tests as another example of function usage (if it's not fully covered in function docs).

        let array_ref: ArrayRef = Arc::new(input);
        let output = batch_json_string_to_variant(&array_ref).unwrap();

        let struct_array = &output;
        let metadata_array = struct_array
            .column(0)
            .as_any()
            .downcast_ref::<BinaryArray>()
            .unwrap();
        let value_array = struct_array
            .column(1)
            .as_any()
            .downcast_ref::<BinaryArray>()
            .unwrap();

        assert!(!struct_array.is_null(0));
        assert!(struct_array.is_null(1));
        assert!(!struct_array.is_null(2));
        assert!(!struct_array.is_null(3));
        assert!(struct_array.is_null(4));

        assert_eq!(metadata_array.value(0), &[1, 0, 0]);
        assert_eq!(value_array.value(0), &[12, 1]);

        {
            let mut vb = VariantBuilder::new();
            let mut ob = vb.new_object();
            ob.insert("a", Variant::Int8(32));
            ob.finish()?;
            let (object_metadata, object_value) = vb.finish();
            assert_eq!(metadata_array.value(2), &object_metadata);
            assert_eq!(value_array.value(2), &object_value);
        }

        assert_eq!(metadata_array.value(3), &[1, 0, 0]);
        assert_eq!(value_array.value(3), &[0]);

        // Ensure that the subfields are not actually nullable
Contributor:
I'm not sure this is a useful thing to enforce? I'm pretty sure the Arrow spec forbids making any assumptions about the values of child arrays at positions an ancestor has marked invalid?

        assert!(!metadata_array.is_null(1));
        assert!(!value_array.is_null(1));
        assert!(!metadata_array.is_null(4));
        assert!(!value_array.is_null(4));
        Ok(())
    }
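
Responding to the earlier request for a non-happy-path case, a hedged sketch of an error test; it only asserts that an error is returned, since the exact message comes from json_to_variant:

    #[test]
    fn test_batch_json_string_to_variant_invalid_json() {
        // Invalid JSON should surface as an error rather than a panic.
        let input = StringArray::from(vec![Some("{not valid json")]);
        let array_ref: ArrayRef = Arc::new(input);
        assert!(batch_json_string_to_variant(&array_ref).is_err());
    }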
}
22 changes: 22 additions & 0 deletions parquet-variant-compute/src/lib.rs
@@ -0,0 +1,22 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

mod from_json;
mod to_json;

pub use from_json::batch_json_string_to_variant;
pub use to_json::batch_variant_to_json_string;