diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index a7482072ba37e..12cb7311b7dc7 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -58,3 +58,11 @@ regex = { version = "^1.4.3", optional = true } sha2 = { version = "^0.10.1", optional = true } unicode-segmentation = { version = "^1.7.1", optional = true } uuid = { version = "^1.2", features = ["v4"] } + +[dev-dependencies] +criterion = "0.4" +rand = "0.8" + +[[bench]] +harness = false +name = "in_list" diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs new file mode 100644 index 0000000000000..415d124dc7faa --- /dev/null +++ b/datafusion/physical-expr/benches/in_list.rs @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{Array, ArrayRef, Float32Array, Int32Array, StringArray}; +use arrow::datatypes::{Field, Schema}; +use arrow::record_batch::RecordBatch; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_common::ScalarValue; +use datafusion_physical_expr::expressions::{col, in_list, lit}; +use rand::distributions::Alphanumeric; +use rand::prelude::*; +use std::sync::Arc; + +fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValue]) { + let schema = Schema::new(vec![Field::new("a", values.data_type().clone(), true)]); + let exprs = exprs.iter().map(|s| lit(s.clone())).collect(); + let expr = in_list(col("a", &schema).unwrap(), exprs, &false, &schema).unwrap(); + let batch = RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap(); + + c.bench_function(name, |b| { + b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) + }); +} + +fn random_string(rng: &mut StdRng, len: usize) -> String { + let value = rng.sample_iter(&Alphanumeric).take(len).collect(); + String::from_utf8(value).unwrap() +} + +fn do_benches( + c: &mut Criterion, + array_length: usize, + in_list_length: usize, + null_percent: f64, +) { + let mut rng = StdRng::seed_from_u64(120320); + for string_length in [5, 10, 20] { + let values: StringArray = (0..array_length) + .map(|_| { + rng.gen_bool(null_percent) + .then(|| random_string(&mut rng, string_length)) + }) + .collect(); + + let in_list: Vec<_> = (0..in_list_length) + .map(|_| ScalarValue::Utf8(Some(random_string(&mut rng, string_length)))) + .collect(); + + do_bench( + c, + &format!( + "in_list_utf8({}) ({}, {}) IN ({}, 0)", + string_length, array_length, null_percent, in_list_length + ), + Arc::new(values), + &in_list, + ) + } + + let values: Float32Array = (0..array_length) + .map(|_| rng.gen_bool(null_percent).then(|| rng.gen())) + .collect(); + + let in_list: Vec<_> = (0..in_list_length) + .map(|_| ScalarValue::Float32(Some(rng.gen()))) + .collect(); + + do_bench( + c, + &format!( + "in_list_f32 ({}, {}) IN ({}, 0)", + array_length, null_percent, in_list_length + ), + Arc::new(values), + &in_list, + ); + + let values: Int32Array = (0..array_length) + .map(|_| rng.gen_bool(null_percent).then(|| rng.gen())) + .collect(); + + let in_list: Vec<_> = (0..in_list_length) + .map(|_| ScalarValue::Int32(Some(rng.gen()))) + .collect(); + + do_bench( + c, + &format!( + "in_list_i32 ({}, {}) IN ({}, 0)", + array_length, null_percent, in_list_length + ), + Arc::new(values), + &in_list, + ) +} + +fn criterion_benchmark(c: &mut Criterion) { + for in_list_length in [1, 3, 10, 100] { + for null_percent in [0., 0.2] { + do_benches(c, 1024, in_list_length, null_percent) + } + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches);