diff --git a/java/lance-jni/src/schema.rs b/java/lance-jni/src/schema.rs
index b9c3d70ef8..c571ab8057 100644
--- a/java/lance-jni/src/schema.rs
+++ b/java/lance-jni/src/schema.rs
@@ -39,9 +39,11 @@ pub fn convert_to_java_field<'local>(
     let name = env.new_string(&lance_field.name)?;
     let children = convert_children_fields(env, lance_field)?;
     let metadata = to_java_map(env, &lance_field.metadata)?;
+    let logical_type = env.new_string(lance_field.logical_type.to_string())?;
     let arrow_type = convert_arrow_type(env, &lance_field.data_type())?;
     let ctor_sig = "(IILjava/lang/String;".to_owned()
-        + "ZLorg/apache/arrow/vector/types/pojo/ArrowType;"
+        + "ZLjava/lang/String;"
+        + "Lorg/apache/arrow/vector/types/pojo/ArrowType;"
         + "Lorg/apache/arrow/vector/types/pojo/DictionaryEncoding;"
         + "Ljava/util/Map;"
         + "Ljava/util/List;Z)V";
@@ -53,6 +55,7 @@ pub fn convert_to_java_field<'local>(
             JValue::Int(lance_field.parent_id as jint),
             JValue::Object(&JObject::from(name)),
             JValue::Bool(lance_field.nullable as jboolean),
+            JValue::Object(&JObject::from(logical_type)),
             JValue::Object(&arrow_type),
             JValue::Object(&JObject::null()),
             JValue::Object(&metadata),
diff --git a/java/src/main/java/org/lance/schema/LanceField.java b/java/src/main/java/org/lance/schema/LanceField.java
index 4ede9ccb86..d169229bc0 100644
--- a/java/src/main/java/org/lance/schema/LanceField.java
+++ b/java/src/main/java/org/lance/schema/LanceField.java
@@ -14,11 +14,17 @@
 package org.lance.schema;
 
 import com.google.common.base.MoreObjects;
+import com.google.common.collect.ImmutableMap;
+import org.apache.arrow.vector.types.DateUnit;
+import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.TimeUnit;
 import org.apache.arrow.vector.types.pojo.ArrowType;
 import org.apache.arrow.vector.types.pojo.DictionaryEncoding;
 import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;
 
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
@@ -29,6 +35,7 @@ public class LanceField {
   private final int parentId;
   private final String name;
   private final boolean nullable;
+  private final String logicalType;
   private final ArrowType type;
   private final DictionaryEncoding dictionaryEncoding;
   private final Map<String, String> metadata;
@@ -40,6 +47,7 @@ public class LanceField {
       int parentId,
       String name,
       boolean nullable,
+      String logicalType,
       ArrowType type,
       DictionaryEncoding dictionaryEncoding,
       Map<String, String> metadata,
@@ -49,6 +57,7 @@ public class LanceField {
     this.parentId = parentId;
     this.name = name;
     this.nullable = nullable;
+    this.logicalType = logicalType;
     this.type = type;
     this.dictionaryEncoding = dictionaryEncoding;
     this.metadata = metadata;
@@ -72,6 +81,10 @@ public boolean isNullable() {
     return nullable;
   }
 
+  public String getLogicalType() {
+    return logicalType;
+  }
+
   public ArrowType getType() {
     return type;
   }
@@ -95,10 +108,130 @@ public boolean isUnenforcedPrimaryKey() {
   public Field asArrowField() {
     List<Field> arrowChildren =
         children.stream().map(LanceField::asArrowField).collect(Collectors.toList());
+
+    if (type instanceof ArrowType.FixedSizeList) {
+      // Collectors.toList() gives no mutability guarantee, so append to a fresh copy.
+      arrowChildren = new ArrayList<>(arrowChildren);
+      arrowChildren.addAll(childrenForFixedSizeList());
+    }
+
     return new Field(
         name, new FieldType(nullable, type, dictionaryEncoding, metadata), arrowChildren);
   }
 
+  /**
+   * Rebuilds the Arrow {@code item} child of a fixed-size list from this field's logical type.
+   *
+   * <p>The logical type is expected to look like {@code fixed_size_list:<element type>:<size>}.
+   * The element type may itself contain colons (for example {@code date32:day}), so it is taken
+   * as everything between the prefix and the last colon.
+   *
+   * @return a single-element list holding the {@code item} field, or an empty list when the
+   *     logical type is absent or does not start with {@code fixed_size_list:}
+   * @throws IllegalArgumentException if the element type or the size part is missing, or the
+   *     element type is not supported
+   */
+  private List<Field> childrenForFixedSizeList() {
+    final String prefix = "fixed_size_list:";
+    if (logicalType == null || !logicalType.startsWith(prefix)) {
+      return Collections.emptyList();
+    }
+
+    int sizeSeparator = logicalType.lastIndexOf(':');
+    if (sizeSeparator <= prefix.length() || sizeSeparator == logicalType.length() - 1) {
+      throw new IllegalArgumentException("Unsupported logical type: " + logicalType);
+    }
+    String innerLogicalType = logicalType.substring(prefix.length(), sizeSeparator);
+
+    FieldType itemType;
+    if ("lance.bfloat16".equals(innerLogicalType)) {
+      // bfloat16 is exposed as a 2-byte fixed-size binary tagged with the extension name.
+      itemType =
+          new FieldType(
+              true,
+              new ArrowType.FixedSizeBinary(2),
+              null,
+              ImmutableMap.of(
+                  "ARROW:extension:name", "lance.bfloat16",
+                  "ARROW:extension:metadata", ""));
+    } else {
+      itemType =
+          new FieldType(
+              true, arrowTypeFromLogicalType(innerLogicalType), null, Collections.emptyMap());
+    }
+    return Collections.singletonList(new Field("item", itemType, Collections.emptyList()));
+  }
+
+  /**
+   * Maps the logical type name of a fixed-size list element to its Arrow type.
+   *
+   * @param logicalType element logical type name, for example {@code int32} or {@code time64:us}
+   * @return the matching Arrow type
+   * @throws IllegalArgumentException if the name is not one of the supported types
+   */
+  private static ArrowType arrowTypeFromLogicalType(String logicalType) {
+    switch (logicalType) {
+      case "null":
+        return ArrowType.Null.INSTANCE;
+      case "bool":
+        return ArrowType.Bool.INSTANCE;
+      case "int8":
+        return new ArrowType.Int(8, true);
+      case "uint8":
+        return new ArrowType.Int(8, false);
+      case "int16":
+        return new ArrowType.Int(16, true);
+      case "uint16":
+        return new ArrowType.Int(16, false);
+      case "int32":
+        return new ArrowType.Int(32, true);
+      case "uint32":
+        return new ArrowType.Int(32, false);
+      case "int64":
+        return new ArrowType.Int(64, true);
+      case "uint64":
+        return new ArrowType.Int(64, false);
+      case "halffloat":
+        return new ArrowType.FloatingPoint(FloatingPointPrecision.HALF);
+      case "float":
+        return new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE);
+      case "double":
+        return new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE);
+      case "string":
+        return ArrowType.Utf8.INSTANCE;
+      case "binary":
+        return ArrowType.Binary.INSTANCE;
+      case "large_string":
+        return ArrowType.LargeUtf8.INSTANCE;
+      case "large_binary":
+      case "blob":
+      case "json":
+        return ArrowType.LargeBinary.INSTANCE;
+      case "date32:day":
+        return new ArrowType.Date(DateUnit.DAY);
+      case "date64:ms":
+        return new ArrowType.Date(DateUnit.MILLISECOND);
+      case "time32:s":
+        return new ArrowType.Time(TimeUnit.SECOND, 32);
+      case "time32:ms":
+        return new ArrowType.Time(TimeUnit.MILLISECOND, 32);
+      case "time64:us":
+        return new ArrowType.Time(TimeUnit.MICROSECOND, 64);
+      case "time64:ns":
+        return new ArrowType.Time(TimeUnit.NANOSECOND, 64);
+      case "duration:s":
+        return new ArrowType.Duration(TimeUnit.SECOND);
+      case "duration:ms":
+        return new ArrowType.Duration(TimeUnit.MILLISECOND);
+      case "duration:us":
+        return new ArrowType.Duration(TimeUnit.MICROSECOND);
+      case "duration:ns":
+        return new ArrowType.Duration(TimeUnit.NANOSECOND);
+      default:
+        throw new IllegalArgumentException("Unsupported logical type: " + logicalType);
+    }
+  }
+
   @Override
   public String toString() {
     return MoreObjects.toStringHelper(this)
@@ -106,6 +239,7 @@ public String toString() {
         .add("parentId", parentId)
         .add("name", name)
         .add("nullable", nullable)
+        .add("logicalType", logicalType)
         .add("type", type)
         .add("dictionaryEncoding", dictionaryEncoding)
         .add("children", children)
diff --git a/java/src/test/java/org/lance/TestUtils.java b/java/src/test/java/org/lance/TestUtils.java
index b17848ef6f..115965feb4 100644
--- a/java/src/test/java/org/lance/TestUtils.java
+++ b/java/src/test/java/org/lance/TestUtils.java
@@ -17,6 +17,7 @@
 import org.lance.fragment.FragmentUpdateResult;
 
 import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import org.apache.arrow.c.ArrowArrayStream;
@@ -357,7 +358,31 @@ public static class ComplexTestDataset extends TestDataset {
                     FieldType.nullable(new ArrowType.Struct()),
                     Arrays.asList(
                         Field.nullable("field1", ArrowType.Utf8.INSTANCE),
-                        Field.nullable("field2", new ArrowType.Int(16, true))))));
+                        Field.nullable("field2", new ArrowType.Int(16, true)))),
+
+                // fixed size list type
+                new Field(
+                    "fixed_size_list_col",
+                    FieldType.nullable(new ArrowType.FixedSizeList(3)),
+                    Collections.singletonList(Field.nullable("item", new ArrowType.Int(32, true)))),
+
+                // fixed bfloat16 list type
+                new Field(
+                    "bfloat16_fixed_size_list_col",
+                    FieldType.nullable(new ArrowType.FixedSizeList(3)),
+                    Collections.singletonList(
+                        new Field(
+                            "item",
+                            new FieldType(
+                                true,
+                                new ArrowType.FixedSizeBinary(2),
+                                null,
+                                ImmutableMap.of(
+                                    "ARROW:extension:name",
+                                    "lance.bfloat16",
+                                    "ARROW:extension:metadata",
+                                    "")),
+                            Collections.emptyList()))))));
 
     public ComplexTestDataset(BufferAllocator allocator, String datasetPath) {
       super(allocator, datasetPath);