Skip to content

Commit 8522633

Browse files
committed
Merge conflict
2 parents 2afeca1 + b8805d4 commit 8522633

16 files changed

Lines changed: 235 additions & 122 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ members = [
2020
"datafusion",
2121
"datafusion-cli",
2222
"datafusion-examples",
23-
"benchmarks",
23+
"benchmarks",
2424
"ballista/rust/client",
2525
"ballista/rust/core",
2626
"ballista/rust/executor",

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ the convenience of an SQL interface or a DataFrame API.
4848

4949
Here are some of the projects known to use DataFusion:
5050

51-
* [Ballista](https://github.com/ballista-compute/ballista) Distributed Compute Platform
51+
* [Ballista](ballista) Distributed Compute Platform
5252
* [Cloudfuse Buzz](https://github.com/cloudfuse-io/buzz-rust)
5353
* [Cube Store](https://github.com/cube-js/cube.js/tree/master/rust)
5454
* [datafusion-python](https://pypi.org/project/datafusion)

ballista/rust/core/proto/ballista.proto

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ message LogicalExprNode {
3333
oneof ExprType {
3434
// column references
3535
string column_name = 1;
36-
36+
3737
// alias
3838
AliasNode alias = 2;
3939

@@ -42,15 +42,15 @@ message LogicalExprNode {
4242

4343
// binary expressions
4444
BinaryExprNode binary_expr = 4;
45-
45+
4646
// aggregate expressions
4747
AggregateExprNode aggregate_expr = 5;
48-
48+
4949
// null checks
5050
IsNull is_null_expr = 6;
5151
IsNotNull is_not_null_expr = 7;
5252
Not not_expr = 8;
53-
53+
5454
BetweenNode between = 9;
5555
CaseNode case_ = 10;
5656
CastNode cast = 11;
@@ -130,6 +130,7 @@ enum ScalarFunction {
130130
SHA256 = 30;
131131
SHA384 = 31;
132132
SHA512 = 32;
133+
LN = 33;
133134
}
134135

135136
message ScalarFunctionNode {
@@ -361,7 +362,7 @@ message CsvScanExecNode {
361362
bool has_header = 5;
362363
uint32 batch_size = 6;
363364
string delimiter = 7;
364-
365+
365366
// partition filenames
366367
repeated string filename = 8;
367368
}
@@ -466,7 +467,7 @@ message Action {
466467
// Fetch a partition from an executor
467468
PartitionId fetch_partition = 3;
468469
}
469-
470+
470471
// configuration settings
471472
repeated KeyValuePair settings = 100;
472473
}
@@ -742,10 +743,10 @@ message ScalarValue{
742743
}
743744
}
744745

745-
// Contains all valid datafusion scalar type except for
746+
// Contains all valid datafusion scalar type except for
746747
// List
747748
enum PrimitiveScalarType{
748-
749+
749750
BOOL = 0; // arrow::Type::BOOL
750751
UINT8 = 1; // arrow::Type::UINT8
751752
INT8 = 2; // arrow::Type::INT8
@@ -777,7 +778,7 @@ message ScalarListType{
777778
PrimitiveScalarType deepest_type = 2;
778779
}
779780

780-
// Broke out into multiple message types so that type
781+
// Broke out into multiple message types so that type
781782
// metadata did not need to be in separate message
782783
//All types that are of the empty message types contain no additional metadata
783784
// about the type
@@ -794,7 +795,7 @@ message ArrowType{
794795
EmptyMessage UINT64 =9;
795796
EmptyMessage INT64 =10 ;
796797
EmptyMessage FLOAT16 =11 ;
797-
EmptyMessage FLOAT32 =12 ;
798+
EmptyMessage FLOAT32 =12 ;
798799
EmptyMessage FLOAT64 =13 ;
799800
EmptyMessage UTF8 =14 ;
800801
EmptyMessage LARGE_UTF8 = 32;
@@ -824,7 +825,7 @@ message ArrowType{
824825

825826
//Useful for representing an empty enum variant in rust
826827
// E.G. enum example{One, Two(i32)}
827-
// maps to
828+
// maps to
828829
// message example{
829830
// oneof{
830831
// EmptyMessage One = 1;

ballista/rust/core/src/serde/logical_plan/from_proto.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ use crate::{convert_box_required, convert_required};
2828

2929
use arrow::datatypes::{DataType, Field, Schema};
3030
use datafusion::logical_plan::{
31-
abs, acos, asin, atan, ceil, cos, exp, floor, log10, log2, round, signum, sin, sqrt,
32-
tan, trunc, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator,
31+
abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, signum, sin,
32+
sqrt, tan, trunc, Expr, JoinType, LogicalPlan, LogicalPlanBuilder, Operator,
3333
};
3434
use datafusion::physical_plan::aggregates::AggregateFunction;
3535
use datafusion::physical_plan::csv::CsvReadOptions;
@@ -1013,6 +1013,7 @@ impl TryInto<Expr> for &protobuf::LogicalExprNode {
10131013
protobuf::ScalarFunction::Log2 => {
10141014
Ok(log2((&expr.expr[0]).try_into()?))
10151015
}
1016+
protobuf::ScalarFunction::Ln => Ok(ln((&expr.expr[0]).try_into()?)),
10161017
protobuf::ScalarFunction::Log10 => {
10171018
Ok(log10((&expr.expr[0]).try_into()?))
10181019
}

ballista/rust/core/src/serde/logical_plan/to_proto.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,6 +1200,7 @@ impl TryInto<protobuf::ScalarFunction> for &BuiltinScalarFunction {
12001200
BuiltinScalarFunction::Atan => Ok(protobuf::ScalarFunction::Atan),
12011201
BuiltinScalarFunction::Exp => Ok(protobuf::ScalarFunction::Exp),
12021202
BuiltinScalarFunction::Log => Ok(protobuf::ScalarFunction::Log),
1203+
BuiltinScalarFunction::Ln => Ok(protobuf::ScalarFunction::Ln),
12031204
BuiltinScalarFunction::Log10 => Ok(protobuf::ScalarFunction::Log10),
12041205
BuiltinScalarFunction::Floor => Ok(protobuf::ScalarFunction::Floor),
12051206
BuiltinScalarFunction::Ceil => Ok(protobuf::ScalarFunction::Ceil),

datafusion-cli/Dockerfile

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,20 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
FROM rust:latest
18+
FROM rust:latest as builder
1919

20+
COPY ./datafusion /usr/src/datafusion
2021

21-
COPY ./datafusion ./usr/src/datafusion
22-
COPY ./datafusion-cli ./usr/src/datafusion-cli
22+
COPY ./datafusion-cli /usr/src/datafusion-cli
2323

2424
WORKDIR /usr/src/datafusion-cli
25-
RUN cargo install --path .
2625

26+
RUN cargo build --release
2727

28-
CMD ["datafusion-cli", "--data-path", "/data"]
28+
FROM debian:buster-slim
29+
30+
COPY --from=builder /usr/src/datafusion-cli/target/release/datafusion-cli /usr/local/bin
31+
32+
ENTRYPOINT ["datafusion-cli"]
33+
34+
CMD ["--data-path", "/data"]

datafusion/src/execution/context.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1766,6 +1766,25 @@ mod tests {
17661766
"+-----+-------------+",
17671767
];
17681768
assert_batches_sorted_eq!(expected, &results);
1769+
1770+
// Now, use dict as an aggregate
1771+
let results = plan_and_collect(
1772+
&mut ctx,
1773+
"SELECT val, count(distinct dict) FROM t GROUP BY val",
1774+
)
1775+
.await
1776+
.expect("ran plan correctly");
1777+
1778+
let expected = vec![
1779+
"+-----+----------------------+",
1780+
"| val | count(DISTINCT dict) |",
1781+
"+-----+----------------------+",
1782+
"| 1 | 2 |",
1783+
"| 2 | 2 |",
1784+
"| 4 | 1 |",
1785+
"+-----+----------------------+",
1786+
];
1787+
assert_batches_sorted_eq!(expected, &results);
17691788
}
17701789

17711790
run_test_case::<Int8Type>().await;

datafusion/src/execution/dataframe_impl.rs

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -177,9 +177,11 @@ impl DataFrame for DataFrameImpl {
177177

178178
#[cfg(test)]
179179
mod tests {
180+
use std::vec;
181+
180182
use super::*;
181-
use crate::execution::context::ExecutionContext;
182183
use crate::logical_plan::*;
184+
use crate::{assert_batches_sorted_eq, execution::context::ExecutionContext};
183185
use crate::{datasource::csv::CsvReadOptions, physical_plan::ColumnarValue};
184186
use crate::{physical_plan::functions::ScalarFunctionImplementation, test};
185187
use arrow::datatypes::DataType;
@@ -216,8 +218,8 @@ mod tests {
216218
Ok(())
217219
}
218220

219-
#[test]
220-
fn aggregate() -> Result<()> {
221+
#[tokio::test]
222+
async fn aggregate() -> Result<()> {
221223
// build plan using DataFrame API
222224
let df = test_table()?;
223225
let group_expr = vec![col("c1")];
@@ -230,18 +232,22 @@ mod tests {
230232
count_distinct(col("c12")),
231233
];
232234

233-
let df = df.aggregate(group_expr, aggr_expr)?;
234-
235-
let plan = df.to_logical_plan();
236-
237-
// build same plan using SQL API
238-
let sql = "SELECT c1, MIN(c12), MAX(c12), AVG(c12), SUM(c12), COUNT(c12), COUNT(DISTINCT c12) \
239-
FROM aggregate_test_100 \
240-
GROUP BY c1";
241-
let sql_plan = create_plan(sql)?;
242-
243-
// the two plans should be identical
244-
assert_same_plan(&plan, &sql_plan);
235+
let df: Vec<RecordBatch> = df.aggregate(group_expr, aggr_expr)?.collect().await?;
236+
237+
assert_batches_sorted_eq!(
238+
vec![
239+
"+----+----------------------+--------------------+---------------------+--------------------+------------+---------------------+",
240+
"| c1 | MIN(c12) | MAX(c12) | AVG(c12) | SUM(c12) | COUNT(c12) | COUNT(DISTINCT c12) |",
241+
"+----+----------------------+--------------------+---------------------+--------------------+------------+---------------------+",
242+
"| a | 0.02182578039211991 | 0.9800193410444061 | 0.48754517466109415 | 10.238448667882977 | 21 | 21 |",
243+
"| b | 0.04893135681998029 | 0.9185813970744787 | 0.41040709263815384 | 7.797734760124923 | 19 | 19 |",
244+
"| c | 0.0494924465469434 | 0.991517828651004 | 0.6600456536439784 | 13.860958726523545 | 21 | 21 |",
245+
"| d | 0.061029375346466685 | 0.9748360509016578 | 0.48855379387549824 | 8.793968289758968 | 18 | 18 |",
246+
"| e | 0.01479305307777301 | 0.9965400387585364 | 0.48600669271341534 | 10.206140546981722 | 21 | 21 |",
247+
"+----+----------------------+--------------------+---------------------+--------------------+------------+---------------------+",
248+
],
249+
&df
250+
);
245251

246252
Ok(())
247253
}

datafusion/src/logical_plan/expr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1086,9 +1086,9 @@ unary_scalar_expr!(Trunc, trunc);
10861086
unary_scalar_expr!(Abs, abs);
10871087
unary_scalar_expr!(Signum, signum);
10881088
unary_scalar_expr!(Exp, exp);
1089-
unary_scalar_expr!(Log, ln);
10901089
unary_scalar_expr!(Log2, log2);
10911090
unary_scalar_expr!(Log10, log10);
1091+
unary_scalar_expr!(Ln, ln);
10921092

10931093
// string functions
10941094
unary_scalar_expr!(Ascii, ascii);

datafusion/src/physical_plan/distinct_expressions.rs

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ pub struct DistinctCount {
4747
name: String,
4848
/// The DataType for the final count
4949
data_type: DataType,
50-
/// The DataType for each input argument
51-
input_data_types: Vec<DataType>,
50+
/// The DataType used to hold the state for each input
51+
state_data_types: Vec<DataType>,
5252
/// The input arguments
5353
exprs: Vec<Arc<dyn PhysicalExpr>>,
5454
}
@@ -61,15 +61,26 @@ impl DistinctCount {
6161
name: String,
6262
data_type: DataType,
6363
) -> Self {
64+
let state_data_types = input_data_types.into_iter().map(state_type).collect();
65+
6466
Self {
65-
input_data_types,
66-
exprs,
6767
name,
6868
data_type,
69+
state_data_types,
70+
exprs,
6971
}
7072
}
7173
}
7274

75+
/// return the type to use to accumulate state for the specified input type
76+
fn state_type(data_type: DataType) -> DataType {
77+
match data_type {
78+
// when aggregating dictionary values, use the underlying value type
79+
DataType::Dictionary(_key_type, value_type) => *value_type,
80+
t => t,
81+
}
82+
}
83+
7384
impl AggregateExpr for DistinctCount {
7485
/// Return a reference to Any that can be used for downcasting
7586
fn as_any(&self) -> &dyn Any {
@@ -82,12 +93,16 @@ impl AggregateExpr for DistinctCount {
8293

8394
fn state_fields(&self) -> Result<Vec<Field>> {
8495
Ok(self
85-
.input_data_types
96+
.state_data_types
8697
.iter()
87-
.map(|data_type| {
98+
.map(|state_data_type| {
8899
Field::new(
89100
&format_state_name(&self.name, "count distinct"),
90-
DataType::List(Box::new(Field::new("item", data_type.clone(), true))),
101+
DataType::List(Box::new(Field::new(
102+
"item",
103+
state_data_type.clone(),
104+
true,
105+
))),
91106
false,
92107
)
93108
})
@@ -101,7 +116,7 @@ impl AggregateExpr for DistinctCount {
101116
fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
102117
Ok(Box::new(DistinctCountAccumulator {
103118
values: HashSet::default(),
104-
data_types: self.input_data_types.clone(),
119+
state_data_types: self.state_data_types.clone(),
105120
count_data_type: self.data_type.clone(),
106121
}))
107122
}
@@ -110,7 +125,7 @@ impl AggregateExpr for DistinctCount {
110125
#[derive(Debug)]
111126
struct DistinctCountAccumulator {
112127
values: HashSet<DistinctScalarValues, RandomState>,
113-
data_types: Vec<DataType>,
128+
state_data_types: Vec<DataType>,
114129
count_data_type: DataType,
115130
}
116131

@@ -156,9 +171,11 @@ impl Accumulator for DistinctCountAccumulator {
156171

157172
fn state(&self) -> Result<Vec<ScalarValue>> {
158173
let mut cols_out = self
159-
.data_types
174+
.state_data_types
160175
.iter()
161-
.map(|data_type| ScalarValue::List(Some(Vec::new()), data_type.clone()))
176+
.map(|state_data_type| {
177+
ScalarValue::List(Some(Vec::new()), state_data_type.clone())
178+
})
162179
.collect::<Vec<_>>();
163180

164181
let mut cols_vec = cols_out

0 commit comments

Comments
 (0)