From 85ac332c4c3f1b9b40287a62a0397e23ad902190 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 23 Jan 2024 15:28:37 -0500 Subject: [PATCH 1/5] Add new test and issue link --- .../optimizer/src/simplify_expressions/expr_simplifier.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 35450b1f32ff2..e28ae1f774e58 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3223,6 +3223,12 @@ mod tests { ); assert_eq!(simplify(expr.clone()), lit(true)); + // 3.5 c1 NOT IN (1, 2, 3, 4) OR c1 NOT IN (4, 5, 6, 7) -> c1 != 4 (4 overlaps) + let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).or( + in_list(col("c1"), vec![lit(4), lit(5), lit(6), lit(7)], true), + ); + assert_eq!(simplify(expr.clone()), col("c1").not_eq(lit(4))); + // 4. c1 NOT IN (1,2,3,4) AND c1 NOT IN (4,5,6,7) -> c1 NOT IN (1,2,3,4,5,6,7) let expr = in_list(col("c1"), vec![lit(1), lit(2), lit(3), lit(4)], true).and( in_list(col("c1"), vec![lit(4), lit(5), lit(6), lit(7)], true), @@ -3316,6 +3322,7 @@ mod tests { true, ))); // TODO: Further simplify this expression + // https://github.com/apache/arrow-datafusion/issues/8970 // assert_eq!(simplify(expr.clone()), lit(true)); assert_eq!(simplify(expr.clone()), expr); } From 9a1c62db175c2a938bcaf4f5d98769a682f221d8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 23 Jan 2024 16:12:43 -0500 Subject: [PATCH 2/5] Avoid cloning in InListSimplifier --- .../simplify_expressions/inlist_simplifier.rs | 81 ++++++++++--------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index fa95f1688e6f4..0452370a68900 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -52,50 +52,55 @@ impl TreeNodeRewriter for InListSimplifier { type N = Expr; fn mutate(&mut self, expr: Expr) -> Result { - if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = &expr { - if let (Expr::InList(l1), Operator::And, Expr::InList(l2)) = - (left.as_ref(), op, right.as_ref()) - { - if l1.expr == l2.expr && !l1.negated && !l2.negated { - return inlist_intersection(l1, l2, false); - } else if l1.expr == l2.expr && l1.negated && l2.negated { - return inlist_union(l1, l2, true); - } else if l1.expr == l2.expr && !l1.negated && l2.negated { - return inlist_except(l1, l2); - } else if l1.expr == l2.expr && l1.negated && !l2.negated { - return inlist_except(l2, l1); - } - } else if let (Expr::InList(l1), Operator::Or, Expr::InList(l2)) = - (left.as_ref(), op, right.as_ref()) - { - if l1.expr == l2.expr && l1.negated && l2.negated { - return inlist_intersection(l1, l2, true); - } + if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr { + match (*left, op, *right) { + (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && !l1.negated && !l2.negated => + inlist_intersection(l1, l2, false), + (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && l2.negated => + inlist_union(l1, l2, true), + (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && !l1.negated && l2.negated => + inlist_except(l1, l2), + (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && !l2.negated => + inlist_except(l2, l1), + (Expr::InList(l1), Operator::Or, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && l2.negated => + inlist_intersection(l1, l2, true), + (left, op, right) => { + // put the expression back together + Ok(Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op, + right: Box::new(right), + })) + } } + } else { + Ok(expr) } - - Ok(expr) } } -fn inlist_union(l1: &InList, l2: &InList, negated: bool) -> Result { - let mut seen: HashSet = HashSet::new(); - let list = l1 - .list - .iter() - .chain(l2.list.iter()) - .filter(|&e| seen.insert(e.to_owned())) - .cloned() - .collect::>(); - let merged_inlist = InList { - expr: l1.expr.clone(), - list, - negated, - }; - Ok(Expr::InList(merged_inlist)) +/// Return the union of two inlist expressions +/// maintaining the order of the elements in the two lists +fn inlist_union(mut l1: InList, l2: InList, negated: bool) -> Result { + // extend the list in l1 with the elements in l2 that are not already in l1 + let l1_items: HashSet<_> = l1.list.iter().collect(); + + // keep all l2 items that do not also appear in l1 + let keep_l2: Vec<_> = l2.list + .into_iter() + .filter_map(|e| { + if l1_items.contains(&e) { + None + } else { + Some(e) + } + }).collect(); + + l1.list.extend(keep_l2.into_iter()); + Ok(Expr::InList(l1)) } -fn inlist_intersection(l1: &InList, l2: &InList, negated: bool) -> Result { +fn inlist_intersection(l1: InList, l2: InList, negated: bool) -> Result { let l1_set: HashSet = l1.list.iter().cloned().collect(); let intersect_list: Vec = l2 .list @@ -116,7 +121,7 @@ fn inlist_intersection(l1: &InList, l2: &InList, negated: bool) -> Result Ok(Expr::InList(merged_inlist)) } -fn inlist_except(l1: &InList, l2: &InList) -> Result { +fn inlist_except(l1: InList, l2: InList) -> Result { let l2_set: HashSet = l2.list.iter().cloned().collect(); let except_list: Vec = l1 .list From 24bb8ab575f071bffa5ed1b1b38821f2bf10353a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 23 Jan 2024 16:15:51 -0500 Subject: [PATCH 3/5] Remove another clone --- .../simplify_expressions/inlist_simplifier.rs | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index 0452370a68900..a65ab7d8f1d40 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -97,28 +97,24 @@ fn inlist_union(mut l1: InList, l2: InList, negated: bool) -> Result { }).collect(); l1.list.extend(keep_l2.into_iter()); + l1.negated = negated; Ok(Expr::InList(l1)) } -fn inlist_intersection(l1: InList, l2: InList, negated: bool) -> Result { - let l1_set: HashSet = l1.list.iter().cloned().collect(); - let intersect_list: Vec = l2 - .list - .iter() - .filter(|x| l1_set.contains(x)) - .cloned() - .collect(); +/// Return the union of two inlist expressions +/// maintaining the order of the elements in the two lists +fn inlist_intersection(mut l1: InList, l2: InList, negated: bool) -> Result { + let l2_items = l2.list.iter().collect::>(); + + // remove all items from l1 that are not in l2 + l1.list = l1.list.into_iter().filter(|e| l2_items.contains(e)).collect(); + // e in () is always false // e not in () is always true - if intersect_list.is_empty() { + if l1.list.is_empty() { return Ok(lit(negated)); } - let merged_inlist = InList { - expr: l1.expr.clone(), - list: intersect_list, - negated, - }; - Ok(Expr::InList(merged_inlist)) + Ok(Expr::InList(l1)) } fn inlist_except(l1: InList, l2: InList) -> Result { From 87e284b25bf157295d78fe5d4c6717511812ee88 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 23 Jan 2024 20:03:05 -0500 Subject: [PATCH 4/5] fmt --- .../simplify_expressions/inlist_simplifier.rs | 69 +++++++++++-------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index a65ab7d8f1d40..d672858bdef6c 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -54,24 +54,39 @@ impl TreeNodeRewriter for InListSimplifier { fn mutate(&mut self, expr: Expr) -> Result { if let Expr::BinaryExpr(BinaryExpr { left, op, right }) = expr { match (*left, op, *right) { - (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && !l1.negated && !l2.negated => - inlist_intersection(l1, l2, false), - (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && l2.negated => - inlist_union(l1, l2, true), - (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && !l1.negated && l2.negated => - inlist_except(l1, l2), - (Expr::InList(l1), Operator::And, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && !l2.negated => - inlist_except(l2, l1), - (Expr::InList(l1), Operator::Or, Expr::InList(l2)) if l1.expr == l2.expr && l1.negated && l2.negated => - inlist_intersection(l1, l2, true), - (left, op, right) => { - // put the expression back together - Ok(Expr::BinaryExpr(BinaryExpr { - left: Box::new(left), - op, - right: Box::new(right), - })) - } + (Expr::InList(l1), Operator::And, Expr::InList(l2)) + if l1.expr == l2.expr && !l1.negated && !l2.negated => + { + inlist_intersection(l1, l2, false) + } + (Expr::InList(l1), Operator::And, Expr::InList(l2)) + if l1.expr == l2.expr && l1.negated && l2.negated => + { + inlist_union(l1, l2, true) + } + (Expr::InList(l1), Operator::And, Expr::InList(l2)) + if l1.expr == l2.expr && !l1.negated && l2.negated => + { + inlist_except(l1, l2) + } + (Expr::InList(l1), Operator::And, Expr::InList(l2)) + if l1.expr == l2.expr && l1.negated && !l2.negated => + { + inlist_except(l2, l1) + } + (Expr::InList(l1), Operator::Or, Expr::InList(l2)) + if l1.expr == l2.expr && l1.negated && l2.negated => + { + inlist_intersection(l1, l2, true) + } + (left, op, right) => { + // put the expression back together + Ok(Expr::BinaryExpr(BinaryExpr { + left: Box::new(left), + op, + right: Box::new(right), + })) + } } } else { Ok(expr) @@ -86,15 +101,11 @@ fn inlist_union(mut l1: InList, l2: InList, negated: bool) -> Result { let l1_items: HashSet<_> = l1.list.iter().collect(); // keep all l2 items that do not also appear in l1 - let keep_l2: Vec<_> = l2.list + let keep_l2: Vec<_> = l2 + .list .into_iter() - .filter_map(|e| { - if l1_items.contains(&e) { - None - } else { - Some(e) - } - }).collect(); + .filter_map(|e| if l1_items.contains(&e) { None } else { Some(e) }) + .collect(); l1.list.extend(keep_l2.into_iter()); l1.negated = negated; @@ -107,7 +118,11 @@ fn inlist_intersection(mut l1: InList, l2: InList, negated: bool) -> Result>(); // remove all items from l1 that are not in l2 - l1.list = l1.list.into_iter().filter(|e| l2_items.contains(e)).collect(); + l1.list = l1 + .list + .into_iter() + .filter(|e| l2_items.contains(e)) + .collect(); // e in () is always false // e not in () is always true From f4adf4adb7f5015c880d125053c43d5a928a3f75 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 24 Jan 2024 06:36:48 -0500 Subject: [PATCH 5/5] fix clippy, simplify more --- .../simplify_expressions/inlist_simplifier.rs | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs index d672858bdef6c..e9ce2734636c4 100644 --- a/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/inlist_simplifier.rs @@ -107,22 +107,18 @@ fn inlist_union(mut l1: InList, l2: InList, negated: bool) -> Result { .filter_map(|e| if l1_items.contains(&e) { None } else { Some(e) }) .collect(); - l1.list.extend(keep_l2.into_iter()); + l1.list.extend(keep_l2); l1.negated = negated; Ok(Expr::InList(l1)) } -/// Return the union of two inlist expressions +/// Return the intersection of two inlist expressions /// maintaining the order of the elements in the two lists fn inlist_intersection(mut l1: InList, l2: InList, negated: bool) -> Result { let l2_items = l2.list.iter().collect::>(); // remove all items from l1 that are not in l2 - l1.list = l1 - .list - .into_iter() - .filter(|e| l2_items.contains(e)) - .collect(); + l1.list.retain(|e| l2_items.contains(e)); // e in () is always false // e not in () is always true @@ -132,21 +128,16 @@ fn inlist_intersection(mut l1: InList, l2: InList, negated: bool) -> Result Result { - let l2_set: HashSet = l2.list.iter().cloned().collect(); - let except_list: Vec = l1 - .list - .iter() - .filter(|x| !l2_set.contains(x)) - .cloned() - .collect(); - if except_list.is_empty() { +/// Return the all items in l1 that are not in l2 +/// maintaining the order of the elements in the two lists +fn inlist_except(mut l1: InList, l2: InList) -> Result { + let l2_items = l2.list.iter().collect::>(); + + // keep only items from l1 that are not in l2 + l1.list.retain(|e| !l2_items.contains(e)); + + if l1.list.is_empty() { return Ok(lit(false)); } - let merged_inlist = InList { - expr: l1.expr.clone(), - list: except_list, - negated: false, - }; - Ok(Expr::InList(merged_inlist)) + Ok(Expr::InList(l1)) }