From 152ef8166870f941d5df1fa37ed143998c596af3 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 4 Jun 2026 12:56:47 -0400 Subject: [PATCH 1/8] feat: expose arrow_field, arrow_try_cast, cast_to_type, with_metadata Adds Python bindings for five scalar functions from datafusion::functions::expr_fn that were not previously surfaced: - arrow_field: returns a struct describing an expression's Arrow field (name, data_type, nullable, metadata). - arrow_try_cast: like arrow_cast but yields NULL on cast failure. - cast_to_type / try_cast_to_type: casts a value to the type of a reference expression. These are exposed as a single Python entry point cast_to_type(value, type_ref, *, try_cast=False); the kwarg switches between the strict and try variants. - with_metadata: attach Arrow field metadata; the inverse of arrow_metadata. Accepts a dict[str, str] for ergonomics. Updates skills/datafusion_python/SKILL.md to list the new functions and documents the cast_to_type kwarg behavior. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/core/src/functions.rs | 10 +++ python/datafusion/functions.py | 107 ++++++++++++++++++++++++++++++ skills/datafusion_python/SKILL.md | 7 +- 3 files changed, 123 insertions(+), 1 deletion(-) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index 395d5ebfd..014d22480 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -607,7 +607,12 @@ expr_fn_vec!(named_struct); expr_fn!(from_unixtime, unixtime); expr_fn!(arrow_typeof, arg_1); expr_fn!(arrow_cast, arg_1 datatype); +expr_fn!(arrow_try_cast, arg_1 datatype); +expr_fn!(arrow_field, arg_1); +expr_fn!(cast_to_type, arg_1 reference); +expr_fn!(try_cast_to_type, arg_1 reference); expr_fn_vec!(arrow_metadata); +expr_fn_vec!(with_metadata); expr_fn!(union_tag, arg1); expr_fn!(random); @@ -962,7 +967,12 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(array_agg))?; 
m.add_wrapped(wrap_pyfunction!(arrow_typeof))?; m.add_wrapped(wrap_pyfunction!(arrow_cast))?; + m.add_wrapped(wrap_pyfunction!(arrow_try_cast))?; + m.add_wrapped(wrap_pyfunction!(arrow_field))?; + m.add_wrapped(wrap_pyfunction!(cast_to_type))?; + m.add_wrapped(wrap_pyfunction!(try_cast_to_type))?; m.add_wrapped(wrap_pyfunction!(arrow_metadata))?; + m.add_wrapped(wrap_pyfunction!(with_metadata))?; m.add_wrapped(wrap_pyfunction!(ascii))?; m.add_wrapped(wrap_pyfunction!(asin))?; m.add_wrapped(wrap_pyfunction!(asinh))?; diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index c8f07497d..a2694a1b4 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -120,7 +120,9 @@ "arrays_overlap", "arrays_zip", "arrow_cast", + "arrow_field", "arrow_metadata", + "arrow_try_cast", "arrow_typeof", "ascii", "asin", @@ -138,6 +140,7 @@ "btrim", "cardinality", "case", + "cast_to_type", "cbrt", "ceil", "char_length", @@ -368,6 +371,7 @@ "var_sample", "version", "when", + "with_metadata", ] @@ -2930,6 +2934,82 @@ def arrow_cast(expr: Expr, data_type: Expr | str | pa.DataType) -> Expr: return Expr(f.arrow_cast(expr.expr, data_type.expr)) +def arrow_try_cast(expr: Expr, data_type: Expr | str) -> Expr: + """Casts an expression to a specified data type, returning NULL on failure. + + Like :py:func:`arrow_cast` but produces NULL instead of erroring when the + cast cannot be performed. The ``data_type`` may be a string in DataFusion + type syntax (for example ``"Float64"``) or an ``Expr`` of string type. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["oops"]}) + >>> result = df.select( + ... dfn.functions.arrow_try_cast(dfn.col("a"), "Float64").alias("c") + ... 
) + >>> result.collect_column("c")[0].as_py() is None + True + """ + if isinstance(data_type, str): + data_type = Expr.string_literal(data_type) + return Expr(f.arrow_try_cast(expr.expr, data_type.expr)) + + +def arrow_field(expr: Expr) -> Expr: + """Returns the Arrow field information of an expression as a struct. + + The returned struct contains the field's name, data type, nullability, + and metadata. + + Examples: + >>> field = pa.field("val", pa.int64(), metadata={"k": "v"}) + >>> schema = pa.schema([field]) + >>> batch = pa.RecordBatch.from_arrays([pa.array([1])], schema=schema) + >>> ctx = dfn.SessionContext() + >>> df = ctx.create_dataframe([[batch]]) + >>> result = df.select( + ... dfn.functions.arrow_field(dfn.col("val")).alias("f") + ... ) + >>> result.collect_column("f")[0].as_py()["name"] + 'val' + """ + return Expr(f.arrow_field(expr.expr)) + + +def cast_to_type(value: Expr, type_ref: Expr, *, try_cast: bool = False) -> Expr: + """Casts ``value`` to the data type of ``type_ref``. + + Only the *type* of ``type_ref`` is used; its value is ignored. This is + useful when the target type comes from another column or expression + rather than being known up-front. When ``try_cast=True``, casts that + fail produce NULL instead of erroring (this dispatches to upstream + ``try_cast_to_type``). + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1], "b": [1.0]}) + >>> result = df.select( + ... dfn.functions.cast_to_type( + ... dfn.col("a"), dfn.col("b") + ... ).alias("c") + ... ) + >>> result.collect_column("c")[0].as_py() + 1.0 + + >>> df = ctx.from_pydict({"a": ["oops"], "b": [1.0]}) + >>> result = df.select( + ... dfn.functions.cast_to_type( + ... dfn.col("a"), dfn.col("b"), try_cast=True + ... ).alias("c") + ... 
) + >>> result.collect_column("c")[0].as_py() is None + True + """ + if try_cast: + return Expr(f.try_cast_to_type(value.expr, type_ref.expr)) + return Expr(f.cast_to_type(value.expr, type_ref.expr)) + + def arrow_metadata(expr: Expr, key: Expr | str | None = None) -> Expr: """Returns the metadata of the input expression. @@ -2963,6 +3043,33 @@ def arrow_metadata(expr: Expr, key: Expr | str | None = None) -> Expr: return Expr(f.arrow_metadata(expr.expr, key.expr)) +def with_metadata(expr: Expr, metadata: dict[str, str]) -> Expr: + """Attaches Arrow field metadata (key/value pairs) to the input expression. + + This is the inverse of :py:func:`arrow_metadata`. Existing metadata on the + input field is preserved; new keys overwrite on collision. Keys must be + non-empty strings; empty values are allowed. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1]}) + >>> result = df.select( + ... dfn.functions.with_metadata( + ... dfn.col("a"), {"unit": "ms"} + ... ).alias("a") + ... ) + >>> result.select( + ... dfn.functions.arrow_metadata(dfn.col("a"), "unit").alias("u") + ... ).collect_column("u")[0].as_py() + 'ms' + """ + args = [expr] + for k, v in metadata.items(): + args.append(Expr.string_literal(k)) + args.append(Expr.string_literal(v)) + return Expr(f.with_metadata(*(a.expr for a in args))) + + def get_field(expr: Expr, *names: Expr | str) -> Expr: """Extracts a (possibly nested) field from a struct or map by name. 
diff --git a/skills/datafusion_python/SKILL.md b/skills/datafusion_python/SKILL.md index 1aeb78777..4ddf54e99 100644 --- a/skills/datafusion_python/SKILL.md +++ b/skills/datafusion_python/SKILL.md @@ -758,7 +758,12 @@ F.left(col("c_phone"), lit(2)) # prefix shortcut **Hash**: `md5`, `sha224`, `sha256`, `sha384`, `sha512`, `digest` -**Type**: `arrow_typeof`, `arrow_cast`, `arrow_metadata` +**Type**: `arrow_typeof`, `arrow_cast`, `arrow_try_cast`, `arrow_field`, +`arrow_metadata`, `cast_to_type`, `with_metadata` + +Note: ``cast_to_type(value, type_ref, *, try_cast=False)`` is the single +Python entry point for both upstream ``cast_to_type`` and ``try_cast_to_type``; +pass ``try_cast=True`` for the variant that returns NULL on failure. **Other**: `in_list`, `order_by`, `alias`, `col`, `encode`, `decode`, `to_hex`, `to_char`, `uuid`, `version`, `bit_length`, `octet_length` From 04979eaa707fa4ab50060b4b3b92d250a023a4f0 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 4 Jun 2026 13:50:45 -0400 Subject: [PATCH 2/8] refactor: collapse try_cast_to_type into cast_to_type kwarg The previous commit exposed cast_to_type and try_cast_to_type as two separate pyo3 bindings and unified them in the Python wrapper via a try_cast kwarg. That left try_cast_to_type in datafusion._internal without a matching public Python name, breaking test_datafusion_missing_exports. Move the dispatch into the rust binding: cast_to_type now takes a try_cast kwarg and selects between functions::expr_fn::cast_to_type and try_cast_to_type internally. Only one pyo3 binding is registered, so the wrapper-coverage check passes and the Python entrypoint is unchanged. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/core/src/functions.rs | 12 +++++++++--- python/datafusion/functions.py | 7 ++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/crates/core/src/functions.rs b/crates/core/src/functions.rs index 014d22480..2876b3ee5 100644 --- a/crates/core/src/functions.rs +++ b/crates/core/src/functions.rs @@ -609,8 +609,15 @@ expr_fn!(arrow_typeof, arg_1); expr_fn!(arrow_cast, arg_1 datatype); expr_fn!(arrow_try_cast, arg_1 datatype); expr_fn!(arrow_field, arg_1); -expr_fn!(cast_to_type, arg_1 reference); -expr_fn!(try_cast_to_type, arg_1 reference); +#[pyfunction] +#[pyo3(signature = (arg_1, reference, *, try_cast = false))] +fn cast_to_type(arg_1: PyExpr, reference: PyExpr, try_cast: bool) -> PyExpr { + if try_cast { + functions::expr_fn::try_cast_to_type(arg_1.into(), reference.into()).into() + } else { + functions::expr_fn::cast_to_type(arg_1.into(), reference.into()).into() + } +} expr_fn_vec!(arrow_metadata); expr_fn_vec!(with_metadata); expr_fn!(union_tag, arg1); @@ -970,7 +977,6 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(arrow_try_cast))?; m.add_wrapped(wrap_pyfunction!(arrow_field))?; m.add_wrapped(wrap_pyfunction!(cast_to_type))?; - m.add_wrapped(wrap_pyfunction!(try_cast_to_type))?; m.add_wrapped(wrap_pyfunction!(arrow_metadata))?; m.add_wrapped(wrap_pyfunction!(with_metadata))?; m.add_wrapped(wrap_pyfunction!(ascii))?; diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index a2694a1b4..c99c5f206 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -2982,8 +2982,7 @@ def cast_to_type(value: Expr, type_ref: Expr, *, try_cast: bool = False) -> Expr Only the *type* of ``type_ref`` is used; its value is ignored. This is useful when the target type comes from another column or expression rather than being known up-front. 
When ``try_cast=True``, casts that - fail produce NULL instead of erroring (this dispatches to upstream - ``try_cast_to_type``). + fail produce NULL instead of erroring. Examples: >>> ctx = dfn.SessionContext() @@ -3005,9 +3004,7 @@ def cast_to_type(value: Expr, type_ref: Expr, *, try_cast: bool = False) -> Expr >>> result.collect_column("c")[0].as_py() is None True """ - if try_cast: - return Expr(f.try_cast_to_type(value.expr, type_ref.expr)) - return Expr(f.cast_to_type(value.expr, type_ref.expr)) + return Expr(f.cast_to_type(value.expr, type_ref.expr, try_cast=try_cast)) def arrow_metadata(expr: Expr, key: Expr | str | None = None) -> Expr: From 398b388222685e153fde3e78f220c5f02cea2a58 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Jun 2026 10:37:03 -0400 Subject: [PATCH 3/8] feat: accept pyarrow DataType in arrow_try_cast Mirrors arrow_cast: arrow_try_cast now accepts `pa.DataType` in addition to `str` and `Expr`. Adds `Expr.try_cast(pa.DataType)` PyO3 binding for the pyarrow-type routing path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/core/src/expr.rs | 5 +++++ python/datafusion/expr.py | 22 ++++++++++++++++++++++ python/datafusion/functions.py | 15 +++++++++++++-- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/crates/core/src/expr.rs b/crates/core/src/expr.rs index eac571a11..432c4cd23 100644 --- a/crates/core/src/expr.rs +++ b/crates/core/src/expr.rs @@ -358,6 +358,11 @@ impl PyExpr { expr.into() } + pub fn try_cast(&self, to: PyArrowType<DataType>) -> PyExpr { + let expr = Expr::TryCast(TryCast::new(Box::new(self.expr.clone()), to.0)); + expr.into() + } + #[pyo3(signature = (low, high, negated=false))] pub fn between(&self, low: PyExpr, high: PyExpr, negated: bool) -> PyExpr { let expr = Expr::Between(Between::new( diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 4fdbdc5d4..be3a14123 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -894,6 +894,28 @@ def cast(self, to: pa.DataType[Any] | type) -> Expr: return Expr(self.expr.cast(to)) + def try_cast(self, to: pa.DataType[Any] | type) -> Expr: + """Cast to a new data type, returning NULL on failure. + + Like :py:meth:`cast` but produces NULL instead of erroring when the + cast cannot be performed for a given row. + + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["oops"]}) + >>> result = df.select(col("a").try_cast(pa.float64()).alias("c")) + >>> result.collect_column("c")[0].as_py() is None + True + """ + if not isinstance(to, pa.DataType): + try: + to = self._to_pyarrow_types[to] + except KeyError as err: + error_msg = "Expected instance of pyarrow.DataType or builtins.type" + raise TypeError(error_msg) from err + + return Expr(self.expr.try_cast(to)) + def between(self, low: Any, high: Any, negated: bool = False) -> Expr: """Returns ``True`` if this expression is between a given range. 
diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index c99c5f206..33f7099cf 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -2934,12 +2934,13 @@ def arrow_cast(expr: Expr, data_type: Expr | str | pa.DataType) -> Expr: return Expr(f.arrow_cast(expr.expr, data_type.expr)) -def arrow_try_cast(expr: Expr, data_type: Expr | str) -> Expr: +def arrow_try_cast(expr: Expr, data_type: Expr | str | pa.DataType) -> Expr: """Casts an expression to a specified data type, returning NULL on failure. Like :py:func:`arrow_cast` but produces NULL instead of erroring when the cast cannot be performed. The ``data_type`` may be a string in DataFusion - type syntax (for example ``"Float64"``) or an ``Expr`` of string type. + type syntax (for example ``"Float64"``), a ``pyarrow.DataType``, or an + ``Expr`` of string type. Examples: >>> ctx = dfn.SessionContext() @@ -2949,7 +2950,17 @@ def arrow_try_cast(expr: Expr, data_type: Expr | str) -> Expr: ... ) >>> result.collect_column("c")[0].as_py() is None True + + >>> result = df.select( + ... dfn.functions.arrow_try_cast( + ... dfn.col("a"), data_type=pa.float64() + ... ).alias("c") + ... ) + >>> result.collect_column("c")[0].as_py() is None + True """ + if isinstance(data_type, pa.DataType): + return expr.try_cast(data_type) if isinstance(data_type, str): data_type = Expr.string_literal(data_type) return Expr(f.arrow_try_cast(expr.expr, data_type.expr)) From 708cd4de74d8a39e8a0e91cc7ab4d822ec4d9425 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Jun 2026 10:38:02 -0400 Subject: [PATCH 4/8] fix: guard with_metadata against empty dict and empty keys Empty `metadata` dict now returns the input expression unchanged (previously bubbled an opaque DataFusion error about minimum arg count). Empty keys raise `ValueError` to match the docstring contract. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- python/datafusion/functions.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 33f7099cf..91359dc8a 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -3058,6 +3058,9 @@ def with_metadata(expr: Expr, metadata: dict[str, str]) -> Expr: input field is preserved; new keys overwrite on collision. Keys must be non-empty strings; empty values are allowed. + An empty ``metadata`` dict is a no-op and returns the input expression + unchanged. Empty keys raise :py:class:`ValueError`. + Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1]}) @@ -3071,11 +3074,16 @@ def with_metadata(expr: Expr, metadata: dict[str, str]) -> Expr: ... ).collect_column("u")[0].as_py() 'ms' """ - args = [expr] + if not metadata: + return expr + args = [expr.expr] for k, v in metadata.items(): - args.append(Expr.string_literal(k)) - args.append(Expr.string_literal(v)) - return Expr(f.with_metadata(*(a.expr for a in args))) + if not k: + msg = "with_metadata keys must be non-empty strings" + raise ValueError(msg) + args.append(Expr.string_literal(k).expr) + args.append(Expr.string_literal(v).expr) + return Expr(f.with_metadata(*args)) def get_field(expr: Expr, *names: Expr | str) -> Expr: From 83dca2e00552bd608c0905bfe359be1746f79544 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Jun 2026 10:40:08 -0400 Subject: [PATCH 5/8] docs: assert full struct shape in arrow_field doctest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous doctest set metadata on the input field but only checked the name — the metadata setup was dead. Now the example asserts the full returned struct (name, data_type, nullable, metadata) so the demo shows what the function actually produces. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- python/datafusion/functions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 91359dc8a..029671669 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -2981,8 +2981,9 @@ def arrow_field(expr: Expr) -> Expr: >>> result = df.select( ... dfn.functions.arrow_field(dfn.col("val")).alias("f") ... ) - >>> result.collect_column("f")[0].as_py()["name"] - 'val' + >>> out = result.collect_column("f")[0].as_py() + >>> out["name"], out["data_type"], out["nullable"], out["metadata"] + ('val', 'Int64', True, [('k', 'v')]) """ return Expr(f.arrow_field(expr.expr)) From 7d8a43574f1cf58f54d73ab1a374e4fa1592f62b Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Jun 2026 10:43:59 -0400 Subject: [PATCH 6/8] test: add unit tests for arrow_try_cast, arrow_field, cast_to_type, with_metadata Mirrors the existing test_arrow_cast pattern. 
Covers: - arrow_try_cast: string-syntax, pa.DataType, and null-on-failure paths - arrow_field: full returned struct shape (name, data_type, nullable, metadata) - cast_to_type: type-from-expr happy path and try_cast=True null behavior - with_metadata: round-trip through arrow_metadata, empty-dict no-op, and empty-key ValueError Co-Authored-By: Claude Opus 4.7 (1M context) --- python/tests/test_functions.py | 105 +++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index 55d9c8ee8..b32d6b99f 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1325,6 +1325,111 @@ def test_arrow_cast_with_pyarrow_type(df): assert result.column(2) == pa.array(["4", "5", "6"], type=pa.string()) +def test_arrow_try_cast(df): + df = df.select( + f.arrow_try_cast(column("b"), "Float64").alias("b_as_float"), + f.arrow_try_cast(column("b"), "Int32").alias("b_as_int"), + ) + result = df.collect()[0] + + assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) + assert result.column(1) == pa.array([4, 5, 6], type=pa.int32()) + + +def test_arrow_try_cast_with_pyarrow_type(df): + df = df.select( + f.arrow_try_cast(column("b"), pa.float64()).alias("b_as_float"), + f.arrow_try_cast(column("b"), pa.int32()).alias("b_as_int"), + ) + result = df.collect()[0] + + assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) + assert result.column(1) == pa.array([4, 5, 6], type=pa.int32()) + + +def test_arrow_try_cast_null_on_failure(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays([pa.array(["1.5", "oops", "3"])], names=["s"]) + df = ctx.create_dataframe([[batch]]) + + result = df.select( + f.arrow_try_cast(column("s"), "Float64").alias("c"), + f.arrow_try_cast(column("s"), pa.float64()).alias("c_pa"), + ).collect()[0] + + assert result.column(0).to_pylist() == [1.5, None, 3.0] + assert result.column(1).to_pylist() == [1.5, None, 3.0] + + 
+def test_arrow_field(): + ctx = SessionContext() + field = pa.field("val", pa.int64(), metadata={"k": "v"}) + schema = pa.schema([field]) + batch = pa.RecordBatch.from_arrays([pa.array([1])], schema=schema) + df = ctx.create_dataframe([[batch]]) + + out = ( + df.select(f.arrow_field(column("val")).alias("f")) + .collect_column("f")[0] + .as_py() + ) + assert out == { + "name": "val", + "data_type": "Int64", + "nullable": True, + "metadata": [("k", "v")], + } + + +def test_cast_to_type(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [pa.array([4, 5, 6]), pa.array([1.0, 2.0, 3.0])], + names=["b", "fl"], + ) + df = ctx.create_dataframe([[batch]]) + + result = df.select(f.cast_to_type(column("b"), column("fl")).alias("c")).collect()[ + 0 + ] + + assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) + + +def test_cast_to_type_try_cast_null_on_failure(): + ctx = SessionContext() + batch = pa.RecordBatch.from_arrays( + [pa.array(["oops", "2", "3"]), pa.array([1.0, 2.0, 3.0])], + names=["a", "fl"], + ) + df = ctx.create_dataframe([[batch]]) + + result = df.select( + f.cast_to_type(column("a"), column("fl"), try_cast=True).alias("c") + ).collect()[0] + + assert result.column(0).to_pylist() == [None, 2.0, 3.0] + assert result.column(0).type == pa.float64() + + +def test_with_metadata_round_trip(df): + df = df.select(f.with_metadata(column("b"), {"unit": "ms"}).alias("b")) + result = df.select(f.arrow_metadata(column("b"), "unit").alias("u")).collect_column( + "u" + ) + assert result[0].as_py() == "ms" + + +def test_with_metadata_empty_dict_noop(df): + out = df.select(f.with_metadata(column("b"), {}).alias("b")).collect()[0] + assert out.column(0) == pa.array([4, 5, 6]) + + +def test_with_metadata_empty_key_raises(df): + with pytest.raises(ValueError, match="non-empty"): + f.with_metadata(column("b"), {"": "v"}) + + def test_case(df): df = df.select( f.case(column("b")).when(literal(4), literal(10)).otherwise(literal(8)), From 
ce83065b0551e43448a0d124e1ec9dff64a2a181 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Jun 2026 10:46:10 -0400 Subject: [PATCH 7/8] test: parameterize arrow cast / try_cast tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Folds the previous four cast tests (arrow_cast + arrow_try_cast × str + pyarrow target type) into a single parameterized test that runs both functions across all five target-type variants. Collapses the two cast_to_type tests (happy path + try_cast=True) into one parameterized test, and parameterizes arrow_try_cast null-on-failure over both target-type syntaxes. 7 test functions, 19 cases — net less code, same coverage. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/tests/test_functions.py | 104 ++++++++++----------------------- 1 file changed, 31 insertions(+), 73 deletions(-) diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py index b32d6b99f..ebc26ab27 100644 --- a/python/tests/test_functions.py +++ b/python/tests/test_functions.py @@ -1299,66 +1299,32 @@ def test_make_time(df): assert result.column(0)[0].as_py() == time(12, 30) -def test_arrow_cast(df): - df = df.select( - f.arrow_cast(column("b"), "Float64").alias("b_as_float"), - f.arrow_cast(column("b"), "Int32").alias("b_as_int"), - ) - result = df.collect() - assert len(result) == 1 - result = result[0] - - assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) - assert result.column(1) == pa.array([4, 5, 6], type=pa.int32()) - - -def test_arrow_cast_with_pyarrow_type(df): - df = df.select( - f.arrow_cast(column("b"), pa.float64()).alias("b_as_float"), - f.arrow_cast(column("b"), pa.int32()).alias("b_as_int"), - f.arrow_cast(column("b"), pa.string()).alias("b_as_str"), - ) - result = df.collect()[0] - - assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) - assert result.column(1) == pa.array([4, 5, 6], type=pa.int32()) - assert result.column(2) == pa.array(["4", 
"5", "6"], type=pa.string()) - - -def test_arrow_try_cast(df): - df = df.select( - f.arrow_try_cast(column("b"), "Float64").alias("b_as_float"), - f.arrow_try_cast(column("b"), "Int32").alias("b_as_int"), - ) - result = df.collect()[0] - - assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) - assert result.column(1) == pa.array([4, 5, 6], type=pa.int32()) - - -def test_arrow_try_cast_with_pyarrow_type(df): - df = df.select( - f.arrow_try_cast(column("b"), pa.float64()).alias("b_as_float"), - f.arrow_try_cast(column("b"), pa.int32()).alias("b_as_int"), - ) - result = df.collect()[0] - - assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) - assert result.column(1) == pa.array([4, 5, 6], type=pa.int32()) +@pytest.mark.parametrize("cast_fn", [f.arrow_cast, f.arrow_try_cast]) +@pytest.mark.parametrize( + ("data_type", "expected"), + [ + ("Float64", pa.array([4.0, 5.0, 6.0], type=pa.float64())), + ("Int32", pa.array([4, 5, 6], type=pa.int32())), + (pa.float64(), pa.array([4.0, 5.0, 6.0], type=pa.float64())), + (pa.int32(), pa.array([4, 5, 6], type=pa.int32())), + (pa.string(), pa.array(["4", "5", "6"], type=pa.string())), + ], +) +def test_arrow_cast_variants(df, cast_fn, data_type, expected): + """arrow_cast / arrow_try_cast accept str and pyarrow target types.""" + result = df.select(cast_fn(column("b"), data_type).alias("c")).collect()[0] + assert result.column(0) == expected -def test_arrow_try_cast_null_on_failure(): +@pytest.mark.parametrize("data_type", ["Float64", pa.float64()]) +def test_arrow_try_cast_null_on_failure(data_type): ctx = SessionContext() batch = pa.RecordBatch.from_arrays([pa.array(["1.5", "oops", "3"])], names=["s"]) df = ctx.create_dataframe([[batch]]) - result = df.select( - f.arrow_try_cast(column("s"), "Float64").alias("c"), - f.arrow_try_cast(column("s"), pa.float64()).alias("c_pa"), - ).collect()[0] + result = df.select(f.arrow_try_cast(column("s"), data_type).alias("c")).collect()[0] assert 
result.column(0).to_pylist() == [1.5, None, 3.0] - assert result.column(1).to_pylist() == [1.5, None, 3.0] def test_arrow_field(): @@ -1381,34 +1347,26 @@ def test_arrow_field(): } -def test_cast_to_type(): - ctx = SessionContext() - batch = pa.RecordBatch.from_arrays( - [pa.array([4, 5, 6]), pa.array([1.0, 2.0, 3.0])], - names=["b", "fl"], - ) - df = ctx.create_dataframe([[batch]]) - - result = df.select(f.cast_to_type(column("b"), column("fl")).alias("c")).collect()[ - 0 - ] - - assert result.column(0) == pa.array([4.0, 5.0, 6.0], type=pa.float64()) - - -def test_cast_to_type_try_cast_null_on_failure(): +@pytest.mark.parametrize( + ("values", "try_cast", "expected"), + [ + (pa.array([4, 5, 6]), False, [4.0, 5.0, 6.0]), + (pa.array(["oops", "2", "3"]), True, [None, 2.0, 3.0]), + ], +) +def test_cast_to_type(values, try_cast, expected): + """cast_to_type takes target type from ``type_ref``; try_cast nullifies failures.""" ctx = SessionContext() batch = pa.RecordBatch.from_arrays( - [pa.array(["oops", "2", "3"]), pa.array([1.0, 2.0, 3.0])], - names=["a", "fl"], + [values, pa.array([1.0, 2.0, 3.0])], names=["v", "fl"] ) df = ctx.create_dataframe([[batch]]) result = df.select( - f.cast_to_type(column("a"), column("fl"), try_cast=True).alias("c") + f.cast_to_type(column("v"), column("fl"), try_cast=try_cast).alias("c") ).collect()[0] - assert result.column(0).to_pylist() == [None, 2.0, 3.0] + assert result.column(0).to_pylist() == expected assert result.column(0).type == pa.float64() @@ -1425,7 +1383,7 @@ def test_with_metadata_empty_dict_noop(df): assert out.column(0) == pa.array([4, 5, 6]) -def test_with_metadata_empty_key_raises(df): +def test_with_metadata_empty_key_raises(): with pytest.raises(ValueError, match="non-empty"): f.with_metadata(column("b"), {"": "v"}) From 25cb8e27a32ce4a4c3ee2be31175ad6086aa5656 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 5 Jun 2026 10:50:22 -0400 Subject: [PATCH 8/8] docs: point cast_to_type at arrow_cast for static target 
types Adds a one-line cross-reference so users with a known target type reach for arrow_cast / arrow_try_cast instead of building a sentinel expression to feed cast_to_type. Co-Authored-By: Claude Opus 4.7 (1M context) --- python/datafusion/functions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 029671669..ff8c84a35 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -2996,6 +2996,10 @@ def cast_to_type(value: Expr, type_ref: Expr, *, try_cast: bool = False) -> Expr rather than being known up-front. When ``try_cast=True``, casts that fail produce NULL instead of erroring. + If the target type is known statically, prefer :py:func:`arrow_cast` + (or :py:func:`arrow_try_cast` for the NULL-on-failure variant) and + pass a type string or ``pyarrow.DataType`` directly. + Examples: >>> ctx = dfn.SessionContext() >>> df = ctx.from_pydict({"a": [1], "b": [1.0]})