Skip to content
127 changes: 87 additions & 40 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3323,10 +3323,6 @@ def array_agg(
filter: If provided, only compute against rows for which the filter is True
order_by: Order the resultant array values. Accepts column names or expressions.

For example::

df.aggregate([], array_agg(col("a"), order_by="b"))

Examples:
---------
>>> ctx = dfn.SessionContext()
Expand Down Expand Up @@ -4047,9 +4043,14 @@ def first_value(
column names or expressions.
null_treatment: Assign whether to respect or ignore null values.

For example::

df.aggregate([], first_value(col("a"), order_by="ts"))
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.aggregate(
... [], [dfn.functions.first_value(dfn.col("a")).alias("v")]
... )
>>> result.collect_column("v")[0].as_py()
10
"""
order_by_raw = sort_list_to_raw_sort_list(order_by)
filter_raw = filter.expr if filter is not None else None
Expand Down Expand Up @@ -4084,9 +4085,14 @@ def last_value(
column names or expressions.
null_treatment: Assign whether to respect or ignore null values.

For example::

df.aggregate([], last_value(col("a"), order_by="ts"))
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.aggregate(
... [], [dfn.functions.last_value(dfn.col("a")).alias("v")]
... )
>>> result.collect_column("v")[0].as_py()
30
"""
order_by_raw = sort_list_to_raw_sort_list(order_by)
filter_raw = filter.expr if filter is not None else None
Expand Down Expand Up @@ -4123,9 +4129,14 @@ def nth_value(
column names or expressions.
null_treatment: Assign whether to respect or ignore null values.

For example::

df.aggregate([], nth_value(col("a"), 2, order_by="ts"))
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.aggregate(
... [], [dfn.functions.nth_value(dfn.col("a"), 2).alias("v")]
... )
>>> result.collect_column("v")[0].as_py()
20
"""
order_by_raw = sort_list_to_raw_sort_list(order_by)
filter_raw = filter.expr if filter is not None else None
Expand Down Expand Up @@ -4303,9 +4314,14 @@ def lead(
order_by: Set ordering within the window frame. Accepts
column names or expressions.

For example::

lead(col("b"), order_by="ts")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.select(
... dfn.col("a"), dfn.functions.lead(dfn.col("a"), shift_offset=1,
... default_value=0, order_by="a").alias("lead"))
>>> result.sort(dfn.col("a")).collect_column("lead").to_pylist()
[2, 3, 0]
"""
if not isinstance(default_value, pa.Scalar) and default_value is not None:
default_value = pa.scalar(default_value)
Expand Down Expand Up @@ -4358,9 +4374,14 @@ def lag(
order_by: Set ordering within the window frame. Accepts
column names or expressions.

For example::

lag(col("b"), order_by="ts")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1, 2, 3]})
>>> result = df.select(
... dfn.col("a"), dfn.functions.lag(dfn.col("a"), shift_offset=1,
... default_value=0, order_by="a").alias("lag"))
>>> result.sort(dfn.col("a")).collect_column("lag").to_pylist()
[0, 1, 2]
"""
if not isinstance(default_value, pa.Scalar):
default_value = pa.scalar(default_value)
Expand Down Expand Up @@ -4403,9 +4424,13 @@ def row_number(
order_by: Set ordering within the window frame. Accepts
column names or expressions.

For example::

row_number(order_by="points")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.select(
... dfn.col("a"), dfn.functions.row_number(order_by="a").alias("rn"))
>>> result.sort(dfn.col("a")).collect_column("rn").to_pylist()
[1, 2, 3]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
Expand Down Expand Up @@ -4447,9 +4472,14 @@ def rank(
order_by: Set ordering within the window frame. Accepts
column names or expressions.

For example::

rank(order_by="points")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 10, 20]})
>>> result = df.select(
... dfn.col("a"), dfn.functions.rank(order_by="a").alias("rnk")
... )
>>> result.sort(dfn.col("a")).collect_column("rnk").to_pylist()
[1, 1, 3]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
Expand Down Expand Up @@ -4486,9 +4516,13 @@ def dense_rank(
order_by: Set ordering within the window frame. Accepts
column names or expressions.

For example::

dense_rank(order_by="points")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 10, 20]})
>>> result = df.select(
... dfn.col("a"), dfn.functions.dense_rank(order_by="a").alias("dr"))
>>> result.sort(dfn.col("a")).collect_column("dr").to_pylist()
[1, 1, 2]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
Expand Down Expand Up @@ -4526,9 +4560,14 @@ def percent_rank(
order_by: Set ordering within the window frame. Accepts
column names or expressions.

For example::

percent_rank(order_by="points")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30]})
>>> result = df.select(
... dfn.col("a"), dfn.functions.percent_rank(order_by="a").alias("pr"))
>>> result.sort(dfn.col("a")).collect_column("pr").to_pylist()
[0.0, 0.5, 1.0]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
Expand Down Expand Up @@ -4566,9 +4605,17 @@ def cume_dist(
order_by: Set ordering within the window frame. Accepts
column names or expressions.

For example::

cume_dist(order_by="points")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [1., 2., 2., 3.]})
>>> result = df.select(
... dfn.col("a"),
... dfn.functions.cume_dist(
... order_by="a"
... ).alias("cd")
... )
>>> result.collect_column("cd").to_pylist()
[0.25..., 0.75..., 0.75..., 1.0...]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
Expand Down Expand Up @@ -4610,9 +4657,13 @@ def ntile(
order_by: Set ordering within the window frame. Accepts
column names or expressions.

For example::

ntile(3, order_by="points")
Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [10, 20, 30, 40]})
>>> result = df.select(
... dfn.col("a"), dfn.functions.ntile(2, order_by="a").alias("nt"))
>>> result.sort(dfn.col("a")).collect_column("nt").to_pylist()
[1, 1, 2, 2]
"""
partition_by_raw = expr_list_to_raw_expr_list(partition_by)
order_by_raw = sort_list_to_raw_sort_list(order_by)
Expand Down Expand Up @@ -4648,10 +4699,6 @@ def string_agg(
order_by: Set the ordering of the expression to evaluate. Accepts
column names or expressions.

For example::

df.aggregate([], string_agg(col("a"), ",", order_by="b"))

Examples:
---------
>>> ctx = dfn.SessionContext()
Expand Down
Loading