101 changes: 69 additions & 32 deletions core/src/main/java/org/apache/iceberg/SingleValueParser.java
@@ -119,41 +119,14 @@ public static Object fromJson(Type type, JsonNode defaultValue) {
         }
         return uuid;
       case DATE:
-        Preconditions.checkArgument(
-            defaultValue.isTextual(), "Cannot parse default as a %s value: %s", type, defaultValue);
-        return DateTimeUtil.isoDateToDays(defaultValue.textValue());
+        return parseDateValue(type, defaultValue);
       case TIME:
-        Preconditions.checkArgument(
-            defaultValue.isTextual(), "Cannot parse default as a %s value: %s", type, defaultValue);
-        return DateTimeUtil.isoTimeToMicros(defaultValue.textValue());
+        return parseTimeValue(type, defaultValue);
       case TIMESTAMP:
-        Preconditions.checkArgument(
-            defaultValue.isTextual(), "Cannot parse default as a %s value: %s", type, defaultValue);
-        if (((Types.TimestampType) type).shouldAdjustToUTC()) {
-          String timestampTz = defaultValue.textValue();
-          Preconditions.checkArgument(
-              DateTimeUtil.isUTCTimestamptz(timestampTz),
-              "Cannot parse default as a %s value: %s, offset must be +00:00",
-              type,
-              defaultValue);
-          return DateTimeUtil.isoTimestamptzToMicros(timestampTz);
-        } else {
-          return DateTimeUtil.isoTimestampToMicros(defaultValue.textValue());
-        }
+        return parseTimestampValue(type, defaultValue);
       case TIMESTAMP_NANO:
-        Preconditions.checkArgument(
-            defaultValue.isTextual(), "Cannot parse default as a %s value: %s", type, defaultValue);
-        if (((Types.TimestampNanoType) type).shouldAdjustToUTC()) {
-          String timestampTzNano = defaultValue.textValue();
-          Preconditions.checkArgument(
-              DateTimeUtil.isUTCTimestamptz(timestampTzNano),
-              "Cannot parse default as a %s value: %s, offset must be +00:00",
-              type,
-              defaultValue);
-          return DateTimeUtil.isoTimestamptzToNanos(timestampTzNano);
-        } else {
-          return DateTimeUtil.isoTimestampToNanos(defaultValue.textValue());
-        }
+        return parseTimestampNanoValue(type, defaultValue);
+
       case FIXED:
         Preconditions.checkArgument(
             defaultValue.isTextual(), "Cannot parse default as a %s value: %s", type, defaultValue);
@@ -414,4 +387,68 @@ public static void toJson(Type type, Object defaultValue, JsonGenerator generator) {
         throw new UnsupportedOperationException(String.format("Type: %s is not supported", type));
     }
   }
+
+  private static Object parseDateValue(Type type, JsonNode value) {
+    if (value.isTextual()) {
+      return DateTimeUtil.isoDateToDays(value.textValue());
+    } else if (value.isIntegralNumber() && value.canConvertToInt()) {

> Review comment (Contributor): I think these changes are probably not required to fix the underlying issue, so we might want to separate them out and test them individually

+      return value.intValue();
+    } else {
+      throw new IllegalArgumentException(
+          String.format("Cannot parse default as a %s value: %s", type, value));
+    }
+  }
+
+  private static Object parseTimeValue(Type type, JsonNode value) {
+    if (value.isTextual()) {
+      return DateTimeUtil.isoTimeToMicros(value.textValue());
+    } else if (value.isIntegralNumber() && value.canConvertToLong()) {
+      return value.longValue();
+    } else {
+      throw new IllegalArgumentException(
+          String.format("Cannot parse default as a %s value: %s", type, value));
+    }
+  }
+
+  private static Object parseTimestampValue(Type type, JsonNode value) {
+    if (value.isTextual()) {
+      if (((Types.TimestampType) type).shouldAdjustToUTC()) {
+        String timestampTz = value.textValue();
+        Preconditions.checkArgument(
+            DateTimeUtil.isUTCTimestamptz(timestampTz),
+            "Cannot parse default as a %s value: %s, offset must be +00:00",
+            type,
+            value);
+        return DateTimeUtil.isoTimestamptzToMicros(timestampTz);
+      } else {
+        return DateTimeUtil.isoTimestampToMicros(value.textValue());
+      }
+    } else if (value.isIntegralNumber() && value.canConvertToLong()) {
+      return value.longValue();
+    } else {
+      throw new IllegalArgumentException(
+          String.format("Cannot parse default as a %s value: %s", type, value));
+    }
+  }
+
+  private static Object parseTimestampNanoValue(Type type, JsonNode value) {
+    if (value.isTextual()) {
+      if (((Types.TimestampNanoType) type).shouldAdjustToUTC()) {
+        String timestampTzNano = value.textValue();
+        Preconditions.checkArgument(
+            DateTimeUtil.isUTCTimestamptz(timestampTzNano),
+            "Cannot parse default as a %s value: %s, offset must be +00:00",
+            type,
+            value);
+        return DateTimeUtil.isoTimestamptzToNanos(timestampTzNano);
+      } else {
+        return DateTimeUtil.isoTimestampToNanos(value.textValue());
+      }
+    } else if (value.isIntegralNumber() && value.canConvertToLong()) {
+      return value.longValue();
+    } else {
+      throw new IllegalArgumentException(
+          String.format("Cannot parse default as a %s value: %s", type, value));
+    }
+  }
 }
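A minimal usage sketch, not part of this diff, of what the new helpers accept, assuming the public `SingleValueParser.fromJson(Type, JsonNode)` entry point shown in the hunk above: a DATE default can now arrive either as an ISO-8601 string or as an integer day ordinal.

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.iceberg.SingleValueParser;
import org.apache.iceberg.types.Types;

public class DefaultValueParsingSketch {
  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();

    // Textual form: parsed via DateTimeUtil.isoDateToDays(...)
    JsonNode textual = mapper.readTree("\"2020-01-01\"");
    // Integral form: returned directly as days since 1970-01-01
    JsonNode integral = mapper.readTree("18262");

    // Both forms should yield the same Integer (2020-01-01 is day 18262)
    System.out.println(SingleValueParser.fromJson(Types.DateType.get(), textual));
    System.out.println(SingleValueParser.fromJson(Types.DateType.get(), integral));
  }
}
```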
core/src/main/java/org/apache/iceberg/expressions/ExpressionParser.java
@@ -271,7 +271,7 @@ public static Expression fromJson(String json, Schema schema) {
     return JsonUtil.parse(json, node -> fromJson(node, schema));
   }

-  static Expression fromJson(JsonNode json, Schema schema) {
+  public static Expression fromJson(JsonNode json, Schema schema) {
     Preconditions.checkArgument(null != json, "Cannot parse expression from null object");
     // check for constant expressions
     if (json.isBoolean()) {
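Making `fromJson(JsonNode, Schema)` public matters because some literals cannot be reconstructed from JSON alone. A hedged round-trip sketch (the schema and predicate are invented for illustration): a decimal literal only deserializes with its intended precision and scale when the parser can see the column type.

```java
import java.math.BigDecimal;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ExpressionParser;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.types.Types;

public class SchemaAwareExpressionSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(Types.NestedField.required(1, "price", Types.DecimalType.of(9, 2)));

    // Serialize a predicate with a decimal literal ...
    String json =
        ExpressionParser.toJson(Expressions.greaterThan("price", new BigDecimal("12.30")));

    // ... then deserialize it with the schema so the literal is bound to
    // decimal(9, 2) rather than whatever a schema-less parse would guess.
    Expression roundTripped = ExpressionParser.fromJson(json, schema);
    System.out.println(roundTripped);
  }
}
```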
13 changes: 8 additions & 5 deletions core/src/main/java/org/apache/iceberg/rest/CatalogHandlers.java
@@ -64,6 +64,7 @@
 import org.apache.iceberg.exceptions.NoSuchNamespaceException;
 import org.apache.iceberg.exceptions.NoSuchTableException;
 import org.apache.iceberg.exceptions.NoSuchViewException;
+import org.apache.iceberg.expressions.Expression;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
@@ -657,7 +658,7 @@ public static PlanTableScanResponse planTableScan(
               .fromSnapshotInclusive(request.startSnapshotId())
               .toSnapshot(request.endSnapshotId());

-      configuredScan = configureScan(incrementalScan, request);
+      configuredScan = configureScan(incrementalScan, request, incrementalScan.schema());
     } else {
       // Regular table scan at a specific snapshot
       TableScan tableScan = table.newScan();
@@ -667,7 +668,7 @@
       }

       // Apply filters and projections using common method
-      configuredScan = configureScan(tableScan, request);
+      configuredScan = configureScan(tableScan, request, tableScan.schema());
     }

     if (shouldPlanAsync.test(configuredScan)) {
@@ -773,18 +774,20 @@ static void clearPlanningState() {
    *
    * @param scan the scan to configure
    * @param request the plan table scan request containing filters and projections
+   * @param schema the table schema to use for type-aware filter deserialization
    * @param <T> the specific scan type (TableScan, IncrementalAppendScan, etc.)
    * @return the configured scan with filters and projections applied
    */
   private static <T extends Scan<T, FileScanTask, ?>> T configureScan(
-      T scan, PlanTableScanRequest request) {
+      T scan, PlanTableScanRequest request, Schema schema) {
     T configuredScan = scan;

     if (request.select() != null) {
       configuredScan = configuredScan.select(request.select());
     }
-    if (request.filter() != null) {
-      configuredScan = configuredScan.filter(request.filter());
+    Expression filter = request.filter(schema);

> Review comment (Contributor): nit: this change is probably not needed

+    if (filter != null) {
+      configuredScan = configuredScan.filter(filter);
     }
     if (request.statsFields() != null) {
       configuredScan = configuredScan.includeColumnStats(request.statsFields());
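The shape of this change, condensed into a standalone sketch (the helper name is mine; the generics and calls mirror the diff, but this is illustrative, not the private method itself): the schema now travels alongside the request so the filter JSON is bound to real column types before the scan sees it.

```java
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Scan;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.rest.requests.PlanTableScanRequest;

class ConfigureScanSketch {
  static <T extends Scan<T, FileScanTask, ?>> T applyFilter(
      T scan, PlanTableScanRequest request, Schema schema) {
    // Type-aware deserialization: the stored filter JSON is parsed against
    // the scan's schema, so BINARY/FIXED/DECIMAL literals keep their types.
    Expression filter = request.filter(schema);
    return filter != null ? scan.filter(filter) : scan;
  }
}
```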
core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
@@ -82,6 +82,7 @@ public static FileScanTask fromJson(
     DataFile dataFile =
         (DataFile) ContentFileParser.fromJson(JsonUtil.get(DATA_FILE, jsonNode), specsById);
     int specId = dataFile.specId();
+    PartitionSpec spec = specsById.get(specId);

     DeleteFile[] deleteFiles = null;
     if (jsonNode.has(DELETE_FILE_REFERENCES)) {
@@ -96,13 +97,12 @@

     Expression filter = null;
     if (jsonNode.has(RESIDUAL_FILTER)) {
-      filter = ExpressionParser.fromJson(jsonNode.get(RESIDUAL_FILTER));
+      filter = ExpressionParser.fromJson(jsonNode.get(RESIDUAL_FILTER), spec.schema());

> Review comment (Contributor, author): this was caught during the execution phase of spark, need to pass schema for residual

     }

-    String schemaString = SchemaParser.toJson(specsById.get(specId).schema());
-    String specString = PartitionSpecParser.toJson(specsById.get(specId));
-    ResidualEvaluator boundResidual =
-        ResidualEvaluator.of(specsById.get(specId), filter, isCaseSensitive);
+    String schemaString = SchemaParser.toJson(spec.schema());
+    String specString = PartitionSpecParser.toJson(spec);
+    ResidualEvaluator boundResidual = ResidualEvaluator.of(spec, filter, isCaseSensitive);

     return new BaseFileScanTask(dataFile, deleteFiles, schemaString, specString, boundResidual);
   }
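A condensed sketch of the residual-filter path after this change (the helper name is mine; the calls mirror the diff): the residual is parsed against the spec's schema, then bound into a `ResidualEvaluator` for the same spec.

```java
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ExpressionParser;
import org.apache.iceberg.expressions.ResidualEvaluator;

class ResidualParsingSketch {
  static ResidualEvaluator parseResidual(
      JsonNode residualFilter, PartitionSpec spec, boolean caseSensitive) {
    // Schema-aware parse: without spec.schema(), typed literals in the
    // residual break later (the Spark execution-phase failure noted above).
    Expression filter = ExpressionParser.fromJson(residualFilter, spec.schema());
    return ResidualEvaluator.of(spec, filter, caseSensitive);
  }
}
```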
core/src/main/java/org/apache/iceberg/rest/requests/PlanTableScanRequest.java
@@ -18,16 +18,22 @@
  */
 package org.apache.iceberg.rest.requests;

+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import java.io.UncheckedIOException;
 import java.util.List;
+import org.apache.iceberg.Schema;
 import org.apache.iceberg.expressions.Expression;
+import org.apache.iceberg.expressions.ExpressionParser;
 import org.apache.iceberg.relocated.com.google.common.base.MoreObjects;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.rest.RESTRequest;
+import org.apache.iceberg.util.JsonUtil;

 public class PlanTableScanRequest implements RESTRequest {
   private final Long snapshotId;
   private final List<String> select;
-  private final Expression filter;
+  private final JsonNode filterJson;
   private final boolean caseSensitive;
   private final boolean useSnapshotSchema;
   private final Long startSnapshotId;
@@ -43,8 +49,47 @@ public List<String> select() {
     return select;
   }

+  /**
+   * Returns the filter expression, deserializing it without schema context.
+   *
+   * <p>Note: This method does not perform type-aware deserialization and may not work correctly for
+   * BINARY, FIXED, and DECIMAL types. Use {@link #filter(Schema)} instead for proper type handling.
+   *
+   * @return the filter expression, or null if no filter was specified
+   * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link #filter(Schema)} instead for
+   *     proper type-aware deserialization
+   */
+  @Deprecated
   public Expression filter() {
-    return filter;
+    if (filterJson == null) {
+      return null;
+    }
+    return ExpressionParser.fromJson(filterJson);
   }

+  /**
+   * Returns the filter expression, deserializing it with the provided schema for type inference.
+   *
+   * <p>This method should be preferred over {@link #filter()} as it properly handles BINARY, FIXED,
+   * and DECIMAL types by using schema information for type-aware deserialization.
+   *
+   * @param schema the table schema to use for type-aware deserialization of filter values
+   * @return the filter expression, or null if no filter was specified
+   */
+  public Expression filter(Schema schema) {

> Review comment (nastra, Contributor, Dec 19, 2025): let me think about this a bit more. I also think we have a few more cases across the codebase where we also ser/de Expression without a Schema and theoretically we would have the same issue in those places as well. Whatever approach we pick, we'd want to follow up in those other places too

> Review comment (Contributor): [screenshot]

> Review comment (Contributor): the other thing we might need to consider is how we would be lazily binding this in other client implementations. @Fokko does pyiceberg have examples of how it does a late-binding similar to this one? The issue that we have here is that we deserialize an Expression where we can only correctly do so when we bind it to a Schema

+    if (filterJson == null) {
+      return null;
+    }
+    return ExpressionParser.fromJson(filterJson, schema);
+  }
+
+  /**
+   * Returns the raw filter JSON node, if available. Package-private for use by the parser.
+   *
+   * @return the raw filter JSON, or null if no filter JSON was stored
+   */
+  JsonNode filterJson() {
+    return filterJson;
+  }

   public boolean caseSensitive() {
@@ -74,7 +119,7 @@ public Long minRowsRequested() {
   private PlanTableScanRequest(
       Long snapshotId,
       List<String> select,
-      Expression filter,
+      JsonNode filterJson,
       boolean caseSensitive,
       boolean useSnapshotSchema,
       Long startSnapshotId,
@@ -83,7 +128,7 @@
       Long minRowsRequested) {
     this.snapshotId = snapshotId;
     this.select = select;
-    this.filter = filter;
+    this.filterJson = filterJson;
     this.caseSensitive = caseSensitive;
     this.useSnapshotSchema = useSnapshotSchema;
     this.startSnapshotId = startSnapshotId;
@@ -111,14 +156,21 @@ public void validate() {
       Preconditions.checkArgument(
           minRowsRequested >= 0L, "Invalid scan: minRowsRequested is negative");
     }
+
+    if (null != filterJson) {
+      Preconditions.checkArgument(
+          filterJson.isBoolean() || filterJson.isObject(),
+          "Cannot parse expression from non-object: %s",
+          filterJson);
+    }
   }

   @Override
   public String toString() {
     return MoreObjects.toStringHelper(this)
         .add("snapshotId", snapshotId)
         .add("select", select)
-        .add("filter", filter)
+        .add("filter", filterJson)
         .add("caseSensitive", caseSensitive)
         .add("useSnapshotSchema", useSnapshotSchema)
         .add("startSnapshotId", startSnapshotId)
@@ -135,7 +187,7 @@ public static Builder builder() {
   public static class Builder {
     private Long snapshotId;
     private List<String> select;
-    private Expression filter;
+    private JsonNode filterJson;
     private boolean caseSensitive = true;
     private boolean useSnapshotSchema = false;
     private Long startSnapshotId;
@@ -160,8 +212,38 @@ public Builder withSelect(List<String> projection) {
       return this;
     }

+    /**
+     * Sets the filter expression for the scan.
+     *
+     * @param expression the filter expression
+     * @return this builder
+     * @deprecated since 1.11.0, will be removed in 1.12.0; this method serializes the expression to
+     *     JSON immediately, which may lose type information for BINARY, FIXED, and DECIMAL types
+     */
+    @Deprecated
     public Builder withFilter(Expression expression) {
-      this.filter = expression;
+      if (expression != null) {
+        try {
+          // Serialize expression to JSON immediately for deferred type-aware deserialization
+          String jsonString = ExpressionParser.toJson(expression);
+          this.filterJson = JsonUtil.mapper().readTree(jsonString);
+        } catch (JsonProcessingException e) {
+          throw new UncheckedIOException("Failed to serialize filter expression to JSON", e);
+        }
+      } else {
+        this.filterJson = null;
+      }
       return this;
     }

+    /**
+     * Sets the filter JSON node directly. Package-private for use by the parser.
+     *
+     * @param filterJsonNode the filter as a JSON node
+     * @return this builder
+     */
+    Builder withFilterJson(JsonNode filterJsonNode) {
+      this.filterJson = filterJsonNode;
+      return this;
+    }
+
@@ -199,7 +281,7 @@ public PlanTableScanRequest build() {
       return new PlanTableScanRequest(
           snapshotId,
           select,
-          filter,
+          filterJson,
           caseSensitive,
           useSnapshotSchema,
           startSnapshotId,
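Putting the pieces together, a hedged end-to-end sketch of the deferred-deserialization flow (the column name and predicate are invented): the builder serializes the expression to JSON immediately, and the server later rebinds it with the table schema.

```java
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.rest.requests.PlanTableScanRequest;
import org.apache.iceberg.types.Types;

public class PlanTableScanFilterSketch {
  public static void main(String[] args) {
    Schema schema =
        new Schema(Types.NestedField.required(1, "payload", Types.BinaryType.get()));

    // Client side: the (now deprecated) withFilter(...) overload stores the
    // expression as a JsonNode rather than holding the Expression itself.
    PlanTableScanRequest request =
        PlanTableScanRequest.builder().withFilter(Expressions.notNull("payload")).build();

    // Server side: bind the stored JSON against the table schema so typed
    // literals (BINARY, FIXED, DECIMAL) are reconstructed correctly.
    Expression filter = request.filter(schema);
    System.out.println(filter);
  }
}
```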