File tree Expand file tree Collapse file tree 2 files changed +68
-0
lines changed
Expand file tree Collapse file tree 2 files changed +68
-0
lines changed Original file line number Diff line number Diff line change 1+ # udf_diff2_failure_example.py
2+
3+ import pyarrow as pa
4+ from datafusion import SessionContext , udf , col
5+
6+ # Define UUID extension type and source array
7+ uuid_type = pa .uuid ()
8+ storage = pa .array (
9+ [b"\x00 " * 16 , b"\x01 " * 16 ],
10+ type = uuid_type .storage_type
11+ )
12+ uuid_array = uuid_type .wrap_array (storage )
13+
14+ # Correct decorator pattern for Diff 2
15+ @udf (
16+ input_types = [pa .field ("v" , uuid_type )],
17+ return_type = pa .field ("v" , uuid_type ),
18+ volatility = "immutable" ,
19+ name = "ident" ,
20+ )
21+ def ident (x ):
22+ if not isinstance (x , pa .ExtensionArray ):
23+ # This is where Diff 2 breaks: x is a FixedSizeBinaryArray
24+ raise TypeError (f"Expected ExtensionArray, got { type (x )} " )
25+ return x
26+
27+ ctx = SessionContext ()
28+ df = ctx .from_pydict ({"uuid_col" : uuid_array })
29+
30+ # This should trigger the failure due to missing extension type handling in Diff 2
31+ try :
32+ result = df .select (ident (col ("uuid_col" ))).collect ()
33+ print (result )
34+ except Exception as e :
35+ print (f"❌ UDF failed: { e } " )
Original file line number Diff line number Diff line change 1+
2+ import pyarrow as pa
3+ from datafusion import SessionContext , udf , column
4+
5+ uuid_type = pa .uuid ()
6+ chunks = [
7+ uuid_type .wrap_array (pa .array ([], type = uuid_type .storage_type )),
8+ uuid_type .wrap_array (
9+ pa .array ([b'\x00 ' * 16 ], type = uuid_type .storage_type )
10+ ),
11+ ]
12+ chunked_ext = pa .chunked_array (chunks , type = uuid_type )
13+
14+ def combine_if_uuid (x : pa .ChunkedArray ) -> pa .Array :
15+ # Diff 2 will deliver a broken type here → no extension metadata
16+ if x .type != uuid_type :
17+ raise TypeError (f"Expected uuid type, got { x .type } " )
18+
19+ return x .combine_chunks ()
20+
21+ combine_udf = udf (
22+ combine_if_uuid ,
23+ input_types = [pa .field ("v" , uuid_type )],
24+ return_type = pa .field ("v" , uuid_type ),
25+ volatility = "immutable" ,
26+ )
27+
28+ ctx = SessionContext ()
29+ df = ctx .from_pydict ({"uuid_col" : chunked_ext })
30+
31+ # ❌ Breaks under Diff 2 (but works under Diff 1)
32+ res = df .select (combine_udf (column ("uuid_col" )))
33+ print (res .collect ())
You can’t perform that action at this time.
0 commit comments