diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py
index 611bcabe4..db59e3c52 100644
--- a/datafusion/tests/test_dataframe.py
+++ b/datafusion/tests/test_dataframe.py
@@ -607,3 +607,27 @@ def test_to_pydict(df):
     pydict = df.to_pydict()
     assert type(pydict) == dict
     assert pydict == {"a": [1, 2, 3], "b": [4, 5, 6], "c": [8, 5, 8]}
+
+
+def test_describe(df):
+    """Check the summary statistics describe() reports for columns a, b and c."""
+    # Calculate statistics
+    df = df.describe()
+
+    # Collect the result
+    result = df.to_pydict()
+
+    assert result == {
+        "describe": [
+            "count",
+            "null_count",
+            "mean",
+            "std",
+            "min",
+            "max",
+            "median",
+        ],
+        "a": [3.0, 3.0, 2.0, 1.0, 1.0, 3.0, 2.0],
+        "b": [3.0, 3.0, 5.0, 1.0, 4.0, 6.0, 5.0],
+        "c": [3.0, 3.0, 7.0, 1.7320508075688772, 5.0, 8.0, 8.0],
+    }
diff --git a/src/dataframe.rs b/src/dataframe.rs
index b21f56104..605e11611 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -76,6 +76,13 @@ impl PyDataFrame {
         }
     }
 
+    /// Calculate summary statistics for a DataFrame
+    fn describe(&self, py: Python) -> PyResult<Self> {
+        let df = self.df.as_ref().clone();
+        let stat_df = wait_for_future(py, df.describe())?;
+        Ok(Self::new(stat_df))
+    }
+
     /// Returns the schema from the logical plan
     fn schema(&self) -> PyArrowType {
         PyArrowType(self.df.schema().into())