diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index a4933a747..e85d710e7 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -3323,10 +3323,6 @@ def array_agg( filter: If provided, only compute against rows for which the filter is True order_by: Order the resultant array values. Accepts column names or expressions. - For example:: - - df.aggregate([], array_agg(col("a"), order_by="b")) - Examples: --------- >>> ctx = dfn.SessionContext() @@ -4047,9 +4043,14 @@ def first_value( column names or expressions. null_treatment: Assign whether to respect or ignore null values. - For example:: - - df.aggregate([], first_value(col("a"), order_by="ts")) + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.aggregate( + ... [], [dfn.functions.first_value(dfn.col("a")).alias("v")] + ... ) + >>> result.collect_column("v")[0].as_py() + 10 """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -4084,9 +4085,14 @@ def last_value( column names or expressions. null_treatment: Assign whether to respect or ignore null values. - For example:: - - df.aggregate([], last_value(col("a"), order_by="ts")) + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.aggregate( + ... [], [dfn.functions.last_value(dfn.col("a")).alias("v")] + ... ) + >>> result.collect_column("v")[0].as_py() + 30 """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -4123,9 +4129,14 @@ def nth_value( column names or expressions. null_treatment: Assign whether to respect or ignore null values. - For example:: - - df.aggregate([], nth_value(col("a"), 2, order_by="ts")) + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.aggregate( + ... [], [dfn.functions.nth_value(dfn.col("a"), 2).alias("v")] + ... ) + >>> result.collect_column("v")[0].as_py() + 20 """ order_by_raw = sort_list_to_raw_sort_list(order_by) filter_raw = filter.expr if filter is not None else None @@ -4303,9 +4314,14 @@ def lead( order_by: Set ordering within the window frame. Accepts column names or expressions. - For example:: - - lead(col("b"), order_by="ts") + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.lead(dfn.col("a"), shift_offset=1, + ... default_value=0, order_by="a").alias("lead")) + >>> result.sort(dfn.col("a")).collect_column("lead").to_pylist() + [2, 3, 0] """ if not isinstance(default_value, pa.Scalar) and default_value is not None: default_value = pa.scalar(default_value) @@ -4358,9 +4374,14 @@ def lag( order_by: Set ordering within the window frame. Accepts column names or expressions. - For example:: - - lag(col("b"), order_by="ts") + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.lag(dfn.col("a"), shift_offset=1, + ... default_value=0, order_by="a").alias("lag")) + >>> result.sort(dfn.col("a")).collect_column("lag").to_pylist() + [0, 1, 2] """ if not isinstance(default_value, pa.Scalar): default_value = pa.scalar(default_value) @@ -4403,9 +4424,13 @@ def row_number( order_by: Set ordering within the window frame. Accepts column names or expressions. - For example:: - - row_number(order_by="points") + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.row_number(order_by="a").alias("rn")) + >>> result.sort(dfn.col("a")).collect_column("rn").to_pylist() + [1, 2, 3] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -4447,9 +4472,14 @@ def rank( order_by: Set ordering within the window frame. Accepts column names or expressions. - For example:: - - rank(order_by="points") + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 10, 20]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.rank(order_by="a").alias("rnk") + ... ) + >>> result.sort(dfn.col("a")).collect_column("rnk").to_pylist() + [1, 1, 3] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -4486,9 +4516,13 @@ def dense_rank( order_by: Set ordering within the window frame. Accepts column names or expressions. - For example:: - - dense_rank(order_by="points") + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 10, 20]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.dense_rank(order_by="a").alias("dr")) + >>> result.sort(dfn.col("a")).collect_column("dr").to_pylist() + [1, 1, 2] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -4526,9 +4560,14 @@ def percent_rank( order_by: Set ordering within the window frame. Accepts column names or expressions. - For example:: - percent_rank(order_by="points") + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.percent_rank(order_by="a").alias("pr")) + >>> result.sort(dfn.col("a")).collect_column("pr").to_pylist() + [0.0, 0.5, 1.0] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -4566,9 +4605,17 @@ def cume_dist( order_by: Set ordering within the window frame. Accepts column names or expressions. - For example:: - - cume_dist(order_by="points") + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [1., 2., 2., 3.]}) + >>> result = df.select( + ... dfn.col("a"), + ... dfn.functions.cume_dist( + ... order_by="a" + ... ).alias("cd") + ... ) + >>> result.collect_column("cd").to_pylist() + [0.25..., 0.75..., 0.75..., 1.0...] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -4610,9 +4657,13 @@ def ntile( order_by: Set ordering within the window frame. Accepts column names or expressions. - For example:: - - ntile(3, order_by="points") + Examples: + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [10, 20, 30, 40]}) + >>> result = df.select( + ... dfn.col("a"), dfn.functions.ntile(2, order_by="a").alias("nt")) + >>> result.sort(dfn.col("a")).collect_column("nt").to_pylist() + [1, 1, 2, 2] """ partition_by_raw = expr_list_to_raw_expr_list(partition_by) order_by_raw = sort_list_to_raw_sort_list(order_by) @@ -4648,10 +4699,6 @@ def string_agg( order_by: Set the ordering of the expression to evaluate. Accepts column names or expressions. - For example:: - - df.aggregate([], string_agg(col("a"), ",", order_by="b")) - Examples: --------- >>> ctx = dfn.SessionContext()