Skip to content

Commit af8ec4e

Browse files
committed
add udf1,2.py
1 parent d8f5dc2 commit af8ec4e

File tree

2 files changed

+68
-0
lines changed

2 files changed

+68
-0
lines changed

examples/udf1.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# udf_diff2_failure_example.py
2+
3+
import pyarrow as pa
4+
from datafusion import SessionContext, udf, col
5+
6+
# Define UUID extension type and source array
7+
uuid_type = pa.uuid()
8+
storage = pa.array(
9+
[b"\x00" * 16, b"\x01" * 16],
10+
type=uuid_type.storage_type
11+
)
12+
uuid_array = uuid_type.wrap_array(storage)
13+
14+
# Correct decorator pattern for Diff 2
15+
@udf(
16+
input_types=[pa.field("v", uuid_type)],
17+
return_type=pa.field("v", uuid_type),
18+
volatility="immutable",
19+
name="ident",
20+
)
21+
def ident(x):
22+
if not isinstance(x, pa.ExtensionArray):
23+
# This is where Diff 2 breaks: x is a FixedSizeBinaryArray
24+
raise TypeError(f"Expected ExtensionArray, got {type(x)}")
25+
return x
26+
27+
ctx = SessionContext()
28+
df = ctx.from_pydict({"uuid_col": uuid_array})
29+
30+
# This should trigger the failure due to missing extension type handling in Diff 2
31+
try:
32+
result = df.select(ident(col("uuid_col"))).collect()
33+
print(result)
34+
except Exception as e:
35+
print(f"❌ UDF failed: {e}")

examples/udf2.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
2+
import pyarrow as pa
3+
from datafusion import SessionContext, udf, column
4+
5+
uuid_type = pa.uuid()
6+
chunks = [
7+
uuid_type.wrap_array(pa.array([], type=uuid_type.storage_type)),
8+
uuid_type.wrap_array(
9+
pa.array([b'\x00' * 16], type=uuid_type.storage_type)
10+
),
11+
]
12+
chunked_ext = pa.chunked_array(chunks, type=uuid_type)
13+
14+
def combine_if_uuid(x: pa.ChunkedArray) -> pa.Array:
15+
# Diff 2 will deliver a broken type here → no extension metadata
16+
if x.type != uuid_type:
17+
raise TypeError(f"Expected uuid type, got {x.type}")
18+
19+
return x.combine_chunks()
20+
21+
combine_udf = udf(
22+
combine_if_uuid,
23+
input_types=[pa.field("v", uuid_type)],
24+
return_type=pa.field("v", uuid_type),
25+
volatility="immutable",
26+
)
27+
28+
ctx = SessionContext()
29+
df = ctx.from_pydict({"uuid_col": chunked_ext})
30+
31+
# ❌ Breaks under Diff 2 (but works under Diff 1)
32+
res = df.select(combine_udf(column("uuid_col")))
33+
print(res.collect())

0 commit comments

Comments
 (0)