Skip to content

Commit 128d7c6

Browse files
[MINOR]: Simplify enforce_distribution, minor changes (#7924)
* Initial commit * Simplifications * Cleanup imports * Review --------- Co-authored-by: Mehmet Ozan Kabak <ozankabak@gmail.com>
1 parent 12a6316 commit 128d7c6

10 files changed

Lines changed: 184 additions & 179 deletions

File tree

datafusion/core/src/physical_optimizer/enforce_distribution.rs

Lines changed: 78 additions & 118 deletions
Large diffs are not rendered by default.

datafusion/core/src/physical_optimizer/enforce_sorting.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717

1818
//! EnforceSorting optimizer rule inspects the physical plan with respect
1919
//! to local sorting requirements and does the following:
20-
//! - Adds a [SortExec] when a requirement is not met,
21-
//! - Removes an already-existing [SortExec] if it is possible to prove
20+
//! - Adds a [`SortExec`] when a requirement is not met,
21+
//! - Removes an already-existing [`SortExec`] if it is possible to prove
2222
//! that this sort is unnecessary
2323
//! The rule can work on valid *and* invalid physical plans with respect to
2424
//! sorting requirements, but always produces a valid physical plan in this sense.
@@ -496,9 +496,10 @@ fn ensure_sorting(
496496
{
497497
// This SortPreservingMergeExec is unnecessary, input already has a
498498
// single partition.
499+
sort_onwards.truncate(1);
499500
return Ok(Transformed::Yes(PlanWithCorrespondingSort {
500-
plan: children[0].clone(),
501-
sort_onwards: vec![sort_onwards[0].clone()],
501+
plan: children.swap_remove(0),
502+
sort_onwards,
502503
}));
503504
}
504505
Ok(Transformed::Yes(PlanWithCorrespondingSort {
@@ -649,7 +650,7 @@ fn remove_corresponding_coalesce_in_sub_plan(
649650
&& is_repartition(&new_plan)
650651
&& is_repartition(parent)
651652
{
652-
new_plan = new_plan.children()[0].clone()
653+
new_plan = new_plan.children().swap_remove(0)
653654
}
654655
new_plan
655656
} else {
@@ -689,7 +690,7 @@ fn remove_corresponding_sort_from_sub_plan(
689690
) -> Result<Arc<dyn ExecutionPlan>> {
690691
// A `SortExec` is always at the bottom of the tree.
691692
let mut updated_plan = if is_sort(&sort_onwards.plan) {
692-
sort_onwards.plan.children()[0].clone()
693+
sort_onwards.plan.children().swap_remove(0)
693694
} else {
694695
let plan = &sort_onwards.plan;
695696
let mut children = plan.children();
@@ -703,12 +704,12 @@ fn remove_corresponding_sort_from_sub_plan(
703704
}
704705
// Replace with variants that do not preserve order.
705706
if is_sort_preserving_merge(plan) {
706-
children[0].clone()
707+
children.swap_remove(0)
707708
} else if let Some(repartition) = plan.as_any().downcast_ref::<RepartitionExec>()
708709
{
709710
Arc::new(
710711
RepartitionExec::try_new(
711-
children[0].clone(),
712+
children.swap_remove(0),
712713
repartition.partitioning().clone(),
713714
)?
714715
.with_preserve_order(false),
@@ -730,7 +731,7 @@ fn remove_corresponding_sort_from_sub_plan(
730731
updated_plan,
731732
));
732733
} else {
733-
updated_plan = Arc::new(CoalescePartitionsExec::new(updated_plan.clone()));
734+
updated_plan = Arc::new(CoalescePartitionsExec::new(updated_plan));
734735
}
735736
}
736737
Ok(updated_plan)
@@ -777,8 +778,7 @@ mod tests {
777778
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
778779
use datafusion_common::Result;
779780
use datafusion_expr::JoinType;
780-
use datafusion_physical_expr::expressions::Column;
781-
use datafusion_physical_expr::expressions::{col, NotExpr};
781+
use datafusion_physical_expr::expressions::{col, Column, NotExpr};
782782

783783
fn create_test_schema() -> Result<SchemaRef> {
784784
let nullable_column = Field::new("nullable_col", DataType::Int32, true);

datafusion/core/src/physical_optimizer/test_utils.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ use datafusion_execution::object_store::ObjectStoreUrl;
4444
use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunction};
4545
use datafusion_physical_expr::expressions::col;
4646
use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
47+
use datafusion_physical_plan::windows::PartitionSearchMode;
4748

4849
use async_trait::async_trait;
4950

@@ -239,7 +240,7 @@ pub fn bounded_window_exec(
239240
.unwrap()],
240241
input.clone(),
241242
vec![],
242-
crate::physical_plan::windows::PartitionSearchMode::Sorted,
243+
PartitionSearchMode::Sorted,
243244
)
244245
.unwrap(),
245246
)

datafusion/core/tests/fuzz_cases/window_fuzz.rs

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,32 +22,33 @@ use arrow::compute::{concat_batches, SortOptions};
2222
use arrow::datatypes::SchemaRef;
2323
use arrow::record_batch::RecordBatch;
2424
use arrow::util::pretty::pretty_format_batches;
25-
use hashbrown::HashMap;
26-
use rand::rngs::StdRng;
27-
use rand::{Rng, SeedableRng};
2825

2926
use datafusion::physical_plan::memory::MemoryExec;
3027
use datafusion::physical_plan::sorts::sort::SortExec;
3128
use datafusion::physical_plan::windows::{
3229
create_window_expr, BoundedWindowAggExec, PartitionSearchMode, WindowAggExec,
3330
};
3431
use datafusion::physical_plan::{collect, ExecutionPlan};
32+
use datafusion::prelude::{SessionConfig, SessionContext};
33+
use datafusion_common::{Result, ScalarValue};
34+
use datafusion_expr::type_coercion::aggregates::coerce_types;
3535
use datafusion_expr::{
3636
AggregateFunction, BuiltInWindowFunction, WindowFrame, WindowFrameBound,
3737
WindowFrameUnits, WindowFunction,
3838
};
39-
40-
use datafusion::prelude::{SessionConfig, SessionContext};
41-
use datafusion_common::{Result, ScalarValue};
42-
use datafusion_expr::type_coercion::aggregates::coerce_types;
4339
use datafusion_physical_expr::expressions::{cast, col, lit};
4440
use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};
4541
use test_utils::add_empty_batches;
4642

43+
use hashbrown::HashMap;
44+
use rand::rngs::StdRng;
45+
use rand::{Rng, SeedableRng};
46+
4747
#[cfg(test)]
4848
mod tests {
4949
use super::*;
50-
use datafusion::physical_plan::windows::PartitionSearchMode::{
50+
51+
use datafusion_physical_plan::windows::PartitionSearchMode::{
5152
Linear, PartiallySorted, Sorted,
5253
};
5354

datafusion/physical-expr/src/aggregate/first_last.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,9 @@ use crate::{
2626
reverse_order_bys, AggregateExpr, LexOrdering, PhysicalExpr, PhysicalSortExpr,
2727
};
2828

29-
use arrow::array::ArrayRef;
30-
use arrow::compute;
31-
use arrow::compute::{lexsort_to_indices, SortColumn};
29+
use arrow::array::{Array, ArrayRef, AsArray, BooleanArray};
30+
use arrow::compute::{self, lexsort_to_indices, SortColumn};
3231
use arrow::datatypes::{DataType, Field};
33-
use arrow_array::cast::AsArray;
34-
use arrow_array::{Array, BooleanArray};
3532
use arrow_schema::SortOptions;
3633
use datafusion_common::utils::{compare_rows, get_arrayref_at_indices, get_row_at_idx};
3734
use datafusion_common::{DataFusionError, Result, ScalarValue};

datafusion/physical-expr/src/physical_expr.rs

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::any::Any;
19+
use std::fmt::{Debug, Display};
20+
use std::hash::{Hash, Hasher};
21+
use std::sync::Arc;
22+
1823
use crate::intervals::Interval;
1924
use crate::sort_properties::SortProperties;
2025
use crate::utils::scatter;
@@ -27,11 +32,6 @@ use datafusion_common::utils::DataPtr;
2732
use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result};
2833
use datafusion_expr::ColumnarValue;
2934

30-
use std::any::Any;
31-
use std::fmt::{Debug, Display};
32-
use std::hash::{Hash, Hasher};
33-
use std::sync::Arc;
34-
3535
/// Expression that can be evaluated against a RecordBatch
3636
/// A Physical expression knows its type, nullability and how to evaluate itself.
3737
pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq<dyn Any> {
@@ -54,13 +54,12 @@ pub trait PhysicalExpr: Send + Sync + Display + Debug + PartialEq<dyn Any> {
5454
let tmp_batch = filter_record_batch(batch, selection)?;
5555

5656
let tmp_result = self.evaluate(&tmp_batch)?;
57-
// All values from the `selection` filter are true.
57+
5858
if batch.num_rows() == tmp_batch.num_rows() {
59-
return Ok(tmp_result);
60-
}
61-
if let ColumnarValue::Array(a) = tmp_result {
62-
let result = scatter(selection, a.as_ref())?;
63-
Ok(ColumnarValue::Array(result))
59+
// All values from the `selection` filter are true.
60+
Ok(tmp_result)
61+
} else if let ColumnarValue::Array(a) = tmp_result {
62+
scatter(selection, a.as_ref()).map(ColumnarValue::Array)
6463
} else {
6564
Ok(tmp_result)
6665
}
@@ -216,8 +215,8 @@ pub fn down_cast_any_ref(any: &dyn Any) -> &dyn Any {
216215
}
217216
}
218217

219-
/// It is similar to contains method of vector.
220-
/// Finds whether `expr` is among `physical_exprs`.
218+
/// This function is similar to the `contains` method of `Vec`. It finds
219+
/// whether `expr` is among `physical_exprs`.
221220
pub fn physical_exprs_contains(
222221
physical_exprs: &[Arc<dyn PhysicalExpr>],
223222
expr: &Arc<dyn PhysicalExpr>,
@@ -226,3 +225,49 @@ pub fn physical_exprs_contains(
226225
.iter()
227226
.any(|physical_expr| physical_expr.eq(expr))
228227
}
228+
229+
#[cfg(test)]
230+
mod tests {
231+
use std::sync::Arc;
232+
233+
use crate::expressions::{Column, Literal};
234+
use crate::physical_expr::{physical_exprs_contains, PhysicalExpr};
235+
236+
use datafusion_common::{Result, ScalarValue};
237+
238+
#[test]
239+
fn test_physical_exprs_contains() -> Result<()> {
240+
let lit_true = Arc::new(Literal::new(ScalarValue::Boolean(Some(true))))
241+
as Arc<dyn PhysicalExpr>;
242+
let lit_false = Arc::new(Literal::new(ScalarValue::Boolean(Some(false))))
243+
as Arc<dyn PhysicalExpr>;
244+
let lit4 =
245+
Arc::new(Literal::new(ScalarValue::Int32(Some(4)))) as Arc<dyn PhysicalExpr>;
246+
let lit2 =
247+
Arc::new(Literal::new(ScalarValue::Int32(Some(2)))) as Arc<dyn PhysicalExpr>;
248+
let lit1 =
249+
Arc::new(Literal::new(ScalarValue::Int32(Some(1)))) as Arc<dyn PhysicalExpr>;
250+
let col_a_expr = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
251+
let col_b_expr = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
252+
let col_c_expr = Arc::new(Column::new("c", 2)) as Arc<dyn PhysicalExpr>;
253+
254+
// lit(true), lit(false), lit(4), lit(2), Col(a), Col(b)
255+
let physical_exprs: Vec<Arc<dyn PhysicalExpr>> = vec![
256+
lit_true.clone(),
257+
lit_false.clone(),
258+
lit4.clone(),
259+
lit2.clone(),
260+
col_a_expr.clone(),
261+
col_b_expr.clone(),
262+
];
263+
// The expressions below are contained in `physical_exprs`.
264+
assert!(physical_exprs_contains(&physical_exprs, &lit_true));
265+
assert!(physical_exprs_contains(&physical_exprs, &lit2));
266+
assert!(physical_exprs_contains(&physical_exprs, &col_b_expr));
267+
268+
// below expressions are not inside physical_exprs
269+
assert!(!physical_exprs_contains(&physical_exprs, &col_c_expr));
270+
assert!(!physical_exprs_contains(&physical_exprs, &lit1));
271+
Ok(())
272+
}
273+
}

datafusion/physical-expr/src/scalar_function.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,24 +29,25 @@
2929
//! This module also has a set of coercion rules to improve user experience: if an argument i32 is passed
3030
//! to a function that supports f64, it is coerced to f64.
3131
32+
use std::any::Any;
33+
use std::fmt::Debug;
34+
use std::fmt::{self, Formatter};
35+
use std::hash::{Hash, Hasher};
36+
use std::sync::Arc;
37+
3238
use crate::functions::out_ordering;
3339
use crate::physical_expr::down_cast_any_ref;
3440
use crate::sort_properties::SortProperties;
3541
use crate::utils::expr_list_eq_strict_order;
3642
use crate::PhysicalExpr;
43+
3744
use arrow::datatypes::{DataType, Schema};
3845
use arrow::record_batch::RecordBatch;
3946
use datafusion_common::Result;
40-
use datafusion_expr::expr_vec_fmt;
41-
use datafusion_expr::BuiltinScalarFunction;
42-
use datafusion_expr::ColumnarValue;
43-
use datafusion_expr::FuncMonotonicity;
44-
use datafusion_expr::ScalarFunctionImplementation;
45-
use std::any::Any;
46-
use std::fmt::Debug;
47-
use std::fmt::{self, Formatter};
48-
use std::hash::{Hash, Hasher};
49-
use std::sync::Arc;
47+
use datafusion_expr::{
48+
expr_vec_fmt, BuiltinScalarFunction, ColumnarValue, FuncMonotonicity,
49+
ScalarFunctionImplementation,
50+
};
5051

5152
/// Physical expression of a scalar function
5253
pub struct ScalarFunctionExpr {

datafusion/physical-plan/src/aggregates/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ impl PhysicalGroupBy {
228228
}
229229

230230
/// Return grouping expressions as they occur in the output schema.
231-
fn output_exprs(&self) -> Vec<Arc<dyn PhysicalExpr>> {
231+
pub fn output_exprs(&self) -> Vec<Arc<dyn PhysicalExpr>> {
232232
self.expr
233233
.iter()
234234
.enumerate()

datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ use crate::expressions::PhysicalSortExpr;
3131
use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
3232
use crate::windows::{
3333
calc_requirements, get_ordered_partition_by_indices, window_ordering_equivalence,
34+
PartitionSearchMode,
3435
};
3536
use crate::{
3637
ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan,
@@ -68,17 +69,6 @@ use hashbrown::raw::RawTable;
6869
use indexmap::IndexMap;
6970
use log::debug;
7071

71-
#[derive(Debug, Clone, PartialEq)]
72-
/// Specifies partition column properties in terms of input ordering
73-
pub enum PartitionSearchMode {
74-
/// None of the columns among the partition columns is ordered.
75-
Linear,
76-
/// Some columns of the partition columns are ordered but not all
77-
PartiallySorted(Vec<usize>),
78-
/// All Partition columns are ordered (Also empty case)
79-
Sorted,
80-
}
81-
8272
/// Window execution plan
8373
#[derive(Debug)]
8474
pub struct BoundedWindowAggExec {

datafusion/physical-plan/src/windows/mod.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,23 @@ mod bounded_window_agg_exec;
5454
mod window_agg_exec;
5555

5656
pub use bounded_window_agg_exec::BoundedWindowAggExec;
57-
pub use bounded_window_agg_exec::PartitionSearchMode;
5857
pub use window_agg_exec::WindowAggExec;
5958

6059
pub use datafusion_physical_expr::window::{
6160
BuiltInWindowExpr, PlainAggregateWindowExpr, WindowExpr,
6261
};
6362

63+
#[derive(Debug, Clone, PartialEq)]
64+
/// Specifies partition column properties in terms of input ordering
65+
pub enum PartitionSearchMode {
66+
/// None of the partition columns is ordered.
67+
Linear,
68+
/// Some, but not all, of the partition columns are ordered.
69+
PartiallySorted(Vec<usize>),
70+
/// All partition columns are ordered (this also covers the case of no partition columns).
71+
Sorted,
72+
}
73+
6474
/// Create a physical expression for window function
6575
pub fn create_window_expr(
6676
fun: &WindowFunction,

0 commit comments

Comments
 (0)