Skip to content

Commit d2ea5c8

Browse files
marvinlanhenke authored and findepi committed
Extract Parquet statistics from Interval column (apache#10801)
* feat: add make_batch + basic test
1 parent ad72658 commit d2ea5c8

2 files changed

Lines changed: 159 additions & 2 deletions

File tree

datafusion/core/tests/parquet/arrow_statistics.rs

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ use arrow::datatypes::{
3030
use arrow_array::{
3131
make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
3232
Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array,
33-
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
33+
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray,
34+
IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeBinaryArray,
3435
LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray,
3536
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
3637
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
@@ -1072,6 +1073,84 @@ async fn test_dates_64_diff_rg_sizes() {
10721073
.run();
10731074
}
10741075

1076+
#[tokio::test]
1077+
#[should_panic]
1078+
// Currently this test `should_panic` since statistics for `Intervals`
1079+
// are not supported and `IntervalMonthDayNano` cannot be written
1080+
// to parquet yet.
1081+
// Refer to issue: https://github.com/apache/arrow-rs/issues/5847
1082+
// and https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_writer/mod.rs#L747
1083+
async fn test_interval_diff_rg_sizes() {
1084+
// This creates a parquet file of 3 columns:
1085+
// "year_month" --> IntervalYearMonthArray
1086+
// "day_time" --> IntervalDayTimeArray
1087+
// "month_day_nano" --> IntervalMonthDayNanoArray
1088+
//
1089+
// The file is created by 4 record batches (each has a null row)
1090+
// each has 5 rows but then will be split into 2 row groups with size 13, 7
1091+
let reader = TestReader {
1092+
scenario: Scenario::Interval,
1093+
row_per_group: 13,
1094+
}
1095+
.build()
1096+
.await;
1097+
1098+
// TODO: expected values need to be changed once issue is resolved
1099+
// expected_min: Arc::new(IntervalYearMonthArray::from(vec![
1100+
// IntervalYearMonthType::make_value(1, 10),
1101+
// IntervalYearMonthType::make_value(4, 13),
1102+
// ])),
1103+
// expected_max: Arc::new(IntervalYearMonthArray::from(vec![
1104+
// IntervalYearMonthType::make_value(6, 51),
1105+
// IntervalYearMonthType::make_value(8, 53),
1106+
// ])),
1107+
Test {
1108+
reader: &reader,
1109+
expected_min: Arc::new(IntervalYearMonthArray::from(vec![None, None])),
1110+
expected_max: Arc::new(IntervalYearMonthArray::from(vec![None, None])),
1111+
expected_null_counts: UInt64Array::from(vec![2, 2]),
1112+
expected_row_counts: UInt64Array::from(vec![13, 7]),
1113+
column_name: "year_month",
1114+
}
1115+
.run();
1116+
1117+
// expected_min: Arc::new(IntervalDayTimeArray::from(vec![
1118+
// IntervalDayTimeType::make_value(1, 10),
1119+
// IntervalDayTimeType::make_value(4, 13),
1120+
// ])),
1121+
// expected_max: Arc::new(IntervalDayTimeArray::from(vec![
1122+
// IntervalDayTimeType::make_value(6, 51),
1123+
// IntervalDayTimeType::make_value(8, 53),
1124+
// ])),
1125+
Test {
1126+
reader: &reader,
1127+
expected_min: Arc::new(IntervalDayTimeArray::from(vec![None, None])),
1128+
expected_max: Arc::new(IntervalDayTimeArray::from(vec![None, None])),
1129+
expected_null_counts: UInt64Array::from(vec![2, 2]),
1130+
expected_row_counts: UInt64Array::from(vec![13, 7]),
1131+
column_name: "day_time",
1132+
}
1133+
.run();
1134+
1135+
// expected_min: Arc::new(IntervalMonthDayNanoArray::from(vec![
1136+
// IntervalMonthDayNanoType::make_value(1, 10, 100),
1137+
// IntervalMonthDayNanoType::make_value(4, 13, 103),
1138+
// ])),
1139+
// expected_max: Arc::new(IntervalMonthDayNanoArray::from(vec![
1140+
// IntervalMonthDayNanoType::make_value(6, 51, 501),
1141+
// IntervalMonthDayNanoType::make_value(8, 53, 503),
1142+
// ])),
1143+
Test {
1144+
reader: &reader,
1145+
expected_min: Arc::new(IntervalMonthDayNanoArray::from(vec![None, None])),
1146+
expected_max: Arc::new(IntervalMonthDayNanoArray::from(vec![None, None])),
1147+
expected_null_counts: UInt64Array::from(vec![2, 2]),
1148+
expected_row_counts: UInt64Array::from(vec![13, 7]),
1149+
column_name: "month_day_nano",
1150+
}
1151+
.run();
1152+
}
1153+
10751154
#[tokio::test]
10761155
async fn test_uint() {
10771156
// This creates a parquet file of 4 columns named "u8", "u16", "u32", "u64"

datafusion/core/tests/parquet/mod.rs

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717

1818
//! Parquet integration tests
1919
use arrow::array::Decimal128Array;
20-
use arrow::datatypes::i256;
20+
use arrow::datatypes::{
21+
i256, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType,
22+
};
2123
use arrow::{
2224
array::{
2325
make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
@@ -33,6 +35,10 @@ use arrow::{
3335
record_batch::RecordBatch,
3436
util::pretty::pretty_format_batches,
3537
};
38+
use arrow_array::{
39+
IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray,
40+
};
41+
use arrow_schema::IntervalUnit;
3642
use chrono::{Datelike, Duration, TimeDelta};
3743
use datafusion::{
3844
datasource::{physical_plan::ParquetExec, provider_as_source, TableProvider},
@@ -80,6 +86,7 @@ enum Scenario {
8086
Time32Millisecond,
8187
Time64Nanosecond,
8288
Time64Microsecond,
89+
Interval,
8390
/// 7 Rows, for each i8, i16, i32, i64, u8, u16, u32, u64, f32, f64
8491
/// -MIN, -100, -1, 0, 1, 100, MAX
8592
NumericLimits,
@@ -925,6 +932,71 @@ fn make_dict_batch() -> RecordBatch {
925932
.unwrap()
926933
}
927934

935+
fn make_interval_batch(offset: i32) -> RecordBatch {
936+
let schema = Schema::new(vec![
937+
Field::new(
938+
"year_month",
939+
DataType::Interval(IntervalUnit::YearMonth),
940+
true,
941+
),
942+
Field::new("day_time", DataType::Interval(IntervalUnit::DayTime), true),
943+
Field::new(
944+
"month_day_nano",
945+
DataType::Interval(IntervalUnit::MonthDayNano),
946+
true,
947+
),
948+
]);
949+
let schema = Arc::new(schema);
950+
951+
let ym_arr = IntervalYearMonthArray::from(vec![
952+
Some(IntervalYearMonthType::make_value(1 + offset, 10 + offset)),
953+
Some(IntervalYearMonthType::make_value(2 + offset, 20 + offset)),
954+
Some(IntervalYearMonthType::make_value(3 + offset, 30 + offset)),
955+
None,
956+
Some(IntervalYearMonthType::make_value(5 + offset, 50 + offset)),
957+
]);
958+
959+
let dt_arr = IntervalDayTimeArray::from(vec![
960+
Some(IntervalDayTimeType::make_value(1 + offset, 10 + offset)),
961+
Some(IntervalDayTimeType::make_value(2 + offset, 20 + offset)),
962+
Some(IntervalDayTimeType::make_value(3 + offset, 30 + offset)),
963+
None,
964+
Some(IntervalDayTimeType::make_value(5 + offset, 50 + offset)),
965+
]);
966+
967+
// Not yet implemented, refer to:
968+
// https://github.com/apache/arrow-rs/blob/master/parquet/src/arrow/arrow_writer/mod.rs#L747
969+
let mdn_arr = IntervalMonthDayNanoArray::from(vec![
970+
Some(IntervalMonthDayNanoType::make_value(
971+
1 + offset,
972+
10 + offset,
973+
100 + (offset as i64),
974+
)),
975+
Some(IntervalMonthDayNanoType::make_value(
976+
2 + offset,
977+
20 + offset,
978+
200 + (offset as i64),
979+
)),
980+
Some(IntervalMonthDayNanoType::make_value(
981+
3 + offset,
982+
30 + offset,
983+
300 + (offset as i64),
984+
)),
985+
None,
986+
Some(IntervalMonthDayNanoType::make_value(
987+
5 + offset,
988+
50 + offset,
989+
500 + (offset as i64),
990+
)),
991+
]);
992+
993+
RecordBatch::try_new(
994+
schema,
995+
vec![Arc::new(ym_arr), Arc::new(dt_arr), Arc::new(mdn_arr)],
996+
)
997+
.unwrap()
998+
}
999+
9281000
fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
9291001
match scenario {
9301002
Scenario::Boolean => {
@@ -1346,6 +1418,12 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
13461418
]),
13471419
]
13481420
}
1421+
Scenario::Interval => vec![
1422+
make_interval_batch(0),
1423+
make_interval_batch(1),
1424+
make_interval_batch(2),
1425+
make_interval_batch(3),
1426+
],
13491427
}
13501428
}
13511429

0 commit comments

Comments
 (0)