Skip to content

Commit 5ea1d31

Browse files
authored
Add sort integration benchmark (#13306)
* Add sort integration benchmark * clippy * review
1 parent 1e96a0a commit 5ea1d31

File tree

5 files changed

+366
-1
lines changed

5 files changed

+366
-1
lines changed

benchmarks/README.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,30 @@ steps.
330330
The tests sort the entire dataset using several different sort
331331
orders.
332332

333+
## Sort TPCH
334+
335+
Tests the performance of end-to-end sort SQL queries. While the `Sort` benchmark focuses on a single sort executor, this benchmark tests how sorting is executed across multiple CPU cores by sorting an entire relational table.
336+
337+
The sort integration benchmark runs whole-table sort queries on the TPCH `lineitem` table with varying characteristics, for example different numbers of sort keys, different sort key cardinalities, and different numbers of payload columns.
338+
339+
See [`sort_tpch.rs`](src/sort_tpch.rs) for more details.
340+
341+
### Sort TPCH Benchmark Example Runs
342+
1. Run all queries with the default settings:
343+
```bash
344+
cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json'
345+
```
346+
347+
2. Run a specific query:
348+
```bash
349+
cargo run --release --bin dfbench -- sort-tpch -p '....../datafusion/benchmarks/data/tpch_sf1' -o '/tmp/sort_tpch.json' --query 2
350+
```
351+
352+
3. Run all queries with the `bench.sh` script:
353+
```bash
354+
./bench.sh run sort_tpch
355+
```
356+
333357
## IMDB
334358

335359
Run Join Order Benchmark (JOB) on IMDB dataset.

benchmarks/bench.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ tpch10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB),
7575
tpch_mem10: TPCH inspired benchmark on Scale Factor (SF) 10 (~10GB), query from memory
7676
parquet: Benchmark of parquet reader's filtering speed
7777
sort: Benchmark of sorting speed
78+
sort_tpch: Benchmark of sorting speed for end-to-end sort queries on TPCH dataset
7879
clickbench_1: ClickBench queries against a single parquet file
7980
clickbench_partitioned: ClickBench queries against a partitioned (100 files) parquet
8081
clickbench_extended: ClickBench \"inspired\" queries against a single parquet (DataFusion specific)
@@ -175,6 +176,10 @@ main() {
175176
# same data as for tpch
176177
data_tpch "1"
177178
;;
179+
sort_tpch)
180+
# same data as for tpch
181+
data_tpch "1"
182+
;;
178183
*)
179184
echo "Error: unknown benchmark '$BENCHMARK' for data generation"
180185
usage
@@ -252,6 +257,9 @@ main() {
252257
external_aggr)
253258
run_external_aggr
254259
;;
260+
sort_tpch)
261+
run_sort_tpch
262+
;;
255263
*)
256264
echo "Error: unknown benchmark '$BENCHMARK' for run"
257265
usage
@@ -549,6 +557,16 @@ run_external_aggr() {
549557
$CARGO_COMMAND --bin external_aggr -- benchmark --partitions 4 --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
550558
}
551559

560+
# Runs the sort integration benchmark
561+
run_sort_tpch() {
562+
TPCH_DIR="${DATA_DIR}/tpch_sf1"
563+
RESULTS_FILE="${RESULTS_DIR}/sort_tpch.json"
564+
echo "RESULTS_FILE: ${RESULTS_FILE}"
565+
echo "Running sort tpch benchmark..."
566+
567+
$CARGO_COMMAND --bin dfbench -- sort-tpch --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}"
568+
}
569+
552570

553571
compare_benchmarks() {
554572
BASE_RESULTS_DIR="${SCRIPT_DIR}/results"

benchmarks/src/bin/dfbench.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc;
3333
#[global_allocator]
3434
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
3535

36-
use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, tpch};
36+
use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, sort_tpch, tpch};
3737

3838
#[derive(Debug, StructOpt)]
3939
#[structopt(about = "benchmark command")]
@@ -43,6 +43,7 @@ enum Options {
4343
Clickbench(clickbench::RunOpt),
4444
ParquetFilter(parquet_filter::RunOpt),
4545
Sort(sort::RunOpt),
46+
SortTpch(sort_tpch::RunOpt),
4647
Imdb(imdb::RunOpt),
4748
}
4849

@@ -57,6 +58,7 @@ pub async fn main() -> Result<()> {
5758
Options::Clickbench(opt) => opt.run().await,
5859
Options::ParquetFilter(opt) => opt.run().await,
5960
Options::Sort(opt) => opt.run().await,
61+
Options::SortTpch(opt) => opt.run().await,
6062
Options::Imdb(opt) => opt.run().await,
6163
}
6264
}

benchmarks/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@ pub mod clickbench;
2020
pub mod imdb;
2121
pub mod parquet_filter;
2222
pub mod sort;
23+
pub mod sort_tpch;
2324
pub mod tpch;
2425
pub mod util;

0 commit comments

Comments
 (0)