From 3f7076a6c7736d0aebbb749a582d0a856e2b1f91 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 29 May 2024 03:37:28 +0000 Subject: [PATCH 01/19] Fix incorrect statistics read for unsigned integers columns in parquet --- .../physical_plan/parquet/statistics.rs | 12 ++++ .../core/tests/parquet/arrow_statistics.rs | 60 ++++++++----------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index bbdd46af5d01f..ae8395aef6a41 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -81,6 +81,15 @@ macro_rules! get_statistic { Some(DataType::Int16) => { Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap()))) } + Some(DataType::UInt8) => { + Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::UInt16) => { + Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::UInt32) => { + Some(ScalarValue::UInt32(Some((*s.$func()) as u32))) + } Some(DataType::Date32) => { Some(ScalarValue::Date32(Some(*s.$func()))) } @@ -100,6 +109,9 @@ macro_rules! get_statistic { *scale, )) } + Some(DataType::UInt64) => { + Some(ScalarValue::UInt64(Some((*s.$func()) as u64))) + } _ => Some(ScalarValue::Int64(Some(*s.$func()))), } } diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index 93cb7636b82ea..eebf3447cbe9f 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -26,7 +26,8 @@ use arrow::datatypes::{Date32Type, Date64Type}; use arrow_array::{ make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt64Array, + Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{DataType, Field, Schema}; use datafusion::datasource::physical_plan::parquet::{ @@ -703,8 +704,6 @@ async fn test_dates_64_diff_rg_sizes() { .run(); } -// BUG: -// https://github.com/apache/datafusion/issues/10604 #[tokio::test] async fn test_uint() { // This creates a parquet files of 4 columns named "u8", "u16", "u32", "u64" @@ -719,48 +718,40 @@ async fn test_uint() { row_per_group: 4, }; - // u8 - // BUG: expect UInt8Array but returns Int32Array Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt8Array - expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt8Array + expected_min: Arc::new(UInt8Array::from(vec![0, 1, 4, 7, 251])), + expected_max: Arc::new(UInt8Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]), column_name: "u8", } .run(); - // u16 - // BUG: expect UInt16Array but returns Int32Array Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt16Array - expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt16Array + expected_min: Arc::new(UInt16Array::from(vec![0, 1, 4, 7, 251])), + expected_max: Arc::new(UInt16Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]), column_name: "u16", } .run(); - // u32 - // BUG: expect UInt32Array but returns Int32Array Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt32Array - expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt32Array + expected_min: Arc::new(UInt32Array::from(vec![0, 1, 4, 7, 251])), + expected_max: Arc::new(UInt32Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]), column_name: "u32", } .run(); - // u64 - // BUG: expect UInt64rray but returns Int64Array Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt64Array - expected_max: Arc::new(Int64Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt64Array + expected_min: Arc::new(UInt64Array::from(vec![0, 1, 4, 7, 251])), + expected_max: Arc::new(UInt64Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]), column_name: "u64", @@ -788,8 +779,6 @@ async fn test_int32_range() { .run(); } -// BUG: not convert UInt32Array to Int32Array -// https://github.com/apache/datafusion/issues/10604 #[tokio::test] async fn test_uint32_range() { // This creates a parquet file of 1 column "u" @@ -801,8 +790,8 @@ async fn test_uint32_range() { Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![0])), // should be UInt32Array - expected_max: Arc::new(Int32Array::from(vec![300000])), // should be UInt32Array + expected_min: Arc::new(UInt32Array::from(vec![0])), + expected_max: Arc::new(UInt32Array::from(vec![300000])), expected_null_counts: UInt64Array::from(vec![0]), expected_row_counts: UInt64Array::from(vec![4]), column_name: "u", @@ -820,44 +809,45 @@ async fn test_numeric_limits_unsigned() { Test { reader: reader.build().await, - expected_min: Arc::new(Int8Array::from(vec![i8::MIN, -100])), - expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])), + expected_min: Arc::new(UInt8Array::from(vec![u8::MIN, 100])), + expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: UInt64Array::from(vec![5, 2]), - column_name: "i8", + column_name: "u8", } .run(); Test { reader: reader.build().await, - expected_min: Arc::new(Int16Array::from(vec![i16::MIN, -100])), - expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])), + expected_min: Arc::new(UInt16Array::from(vec![u16::MIN, 100])), + expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: UInt64Array::from(vec![5, 2]), - column_name: "i16", + column_name: "u16", } .run(); Test { reader: reader.build().await, - expected_min: Arc::new(Int32Array::from(vec![i32::MIN, -100])), - expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])), + expected_min: Arc::new(UInt32Array::from(vec![u32::MIN, 100])), + expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: UInt64Array::from(vec![5, 2]), - column_name: "i32", + column_name: "u32", } .run(); Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![i64::MIN, -100])), - expected_max: Arc::new(Int64Array::from(vec![100, i64::MAX])), + expected_min: Arc::new(UInt64Array::from(vec![u64::MIN, 100])), + expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: UInt64Array::from(vec![5, 2]), - column_name: "i64", + column_name: "u64", } .run(); } + #[tokio::test] async fn test_numeric_limits_signed() { // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows. From 095ac391e17f9d4c3d841971316fb47dc371b791 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 29 May 2024 10:31:23 +0000 Subject: [PATCH 02/19] Staging the change for faster stat --- .../physical_plan/parquet/statistics.rs | 464 ++++++++++++------ 1 file changed, 313 insertions(+), 151 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index ae8395aef6a41..ee6720b338fc4 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -20,11 +20,14 @@ // TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328 use arrow::{array::ArrayRef, datatypes::DataType}; -use arrow_array::{new_empty_array, new_null_array, UInt64Array}; -use arrow_schema::{Field, FieldRef, Schema}; -use datafusion_common::{ - internal_datafusion_err, internal_err, plan_err, Result, ScalarValue, +use arrow_array::{ + new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, + Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, }; +use arrow_schema::{Field, FieldRef, Schema}; +use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; use parquet::file::metadata::ParquetMetaData; use parquet::file::statistics::Statistics as ParquetStatistics; use parquet::schema::types::SchemaDescriptor; @@ -52,131 +55,300 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] { result } -/// Extract a single min/max statistics from a [`ParquetStatistics`] object -/// -/// * `$column_statistics` is the `ParquetStatistics` object -/// * `$func is the function` (`min`/`max`) to call to get the value -/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get the value as bytes -/// * `$target_arrow_type` is the [`DataType`] of the target statistics -macro_rules! get_statistic { - ($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{ - if !$column_statistics.has_min_max_set() { - return None; - } - match $column_statistics { - ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), - ParquetStatistics::Int32(s) => { - match $target_arrow_type { - // int32 to decimal with the precision and scale - Some(DataType::Decimal128(precision, scale)) => { - Some(ScalarValue::Decimal128( - Some(*s.$func() as i128), - *precision, - *scale, - )) - } - Some(DataType::Int8) => { - Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap()))) - } - Some(DataType::Int16) => { - Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap()))) - } - Some(DataType::UInt8) => { - Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap()))) - } - Some(DataType::UInt16) => { - Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap()))) - } - Some(DataType::UInt32) => { - Some(ScalarValue::UInt32(Some((*s.$func()) as u32))) - } - Some(DataType::Date32) => { - Some(ScalarValue::Date32(Some(*s.$func()))) - } - Some(DataType::Date64) => { - Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000))) - } - _ => Some(ScalarValue::Int32(Some(*s.$func()))), - } +macro_rules! get_statistics_iter { + ($column_statistics_iter:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{ + match $target_arrow_type { + Some(DataType::Boolean) => Some(Arc::new(BooleanArray::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Boolean(s) => Some(*s.$func()), + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Int8) => Some(Arc::new(Int8Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => { + Some((*s.$func()).try_into().unwrap()) + } + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Int16) => Some(Arc::new(Int16Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => { + Some((*s.$func()).try_into().unwrap()) + } + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Int32) => Some(Arc::new(Int32Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => Some(*s.$func()), + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Int64) => Some(Arc::new(Int64Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int64(s) => { + Some((*s.$func()).try_into().unwrap()) + } + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::UInt8) => { + Some(Arc::new(UInt8Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => { + Some((*s.$func()).try_into().unwrap()) + } + _ => None, + } + }) + }), + )) as ArrayRef) } - ParquetStatistics::Int64(s) => { - match $target_arrow_type { - // int64 to decimal with the precision and scale - Some(DataType::Decimal128(precision, scale)) => { - Some(ScalarValue::Decimal128( - Some(*s.$func() as i128), - *precision, - *scale, - )) - } - Some(DataType::UInt64) => { - Some(ScalarValue::UInt64(Some((*s.$func()) as u64))) - } - _ => Some(ScalarValue::Int64(Some(*s.$func()))), - } + Some(DataType::UInt16) => { + Some(Arc::new(UInt16Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => { + Some((*s.$func()).try_into().unwrap()) + } + _ => None, + } + }) + }), + )) as ArrayRef) } - // 96 bit ints not supported - ParquetStatistics::Int96(_) => None, - ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), - ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), - ParquetStatistics::ByteArray(s) => { - match $target_arrow_type { - // decimal data type - Some(DataType::Decimal128(precision, scale)) => { - Some(ScalarValue::Decimal128( - Some(from_bytes_to_i128(s.$bytes_func())), - *precision, - *scale, - )) - } - Some(DataType::Binary) => { - Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec()))) - } - _ => { - let s = std::str::from_utf8(s.$bytes_func()) - .map(|s| s.to_string()) - .ok(); - if s.is_none() { - log::debug!( - "Utf8 statistics is a non-UTF8 value, ignoring it." - ); - } - Some(ScalarValue::Utf8(s)) - } - } + Some(DataType::UInt32) => { + Some(Arc::new(UInt32Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => { + Some(*s.$func() as u32) + } + _ => None, + } + }) + }), + )) as ArrayRef) } - // type not fully supported yet - ParquetStatistics::FixedLenByteArray(s) => { - match $target_arrow_type { - // just support specific logical data types, there are others each - // with their own ordering - Some(DataType::Decimal128(precision, scale)) => { - Some(ScalarValue::Decimal128( - Some(from_bytes_to_i128(s.$bytes_func())), - *precision, - *scale, - )) - } - Some(DataType::FixedSizeBinary(size)) => { - let value = s.$bytes_func().to_vec(); - let value = if value.len().try_into() == Ok(*size) { - Some(value) - } else { - log::debug!( - "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", - size, - value.len(), - ); - None - }; - Some(ScalarValue::FixedSizeBinary( - *size, - value, - )) - } - _ => None, - } + Some(DataType::UInt64) => { + Some(Arc::new(UInt64Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int64(s) => { + Some(*s.$func() as u64) + } + _ => None, + } + }) + }), + )) as ArrayRef) + } + Some(DataType::Date32) => Some(Arc::new(Date32Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => Some(*s.$func()), + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Date64) => Some(Arc::new(Date64Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => { + Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000) + } + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Float32) => Some(Arc::new(Float32Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Float(s) => Some(*s.$func()), + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Float64) => Some(Arc::new(Float64Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Double(s) => Some(*s.$func()), + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Decimal128(precision, scale)) => { + let mut array = Decimal128Array::from_iter($column_statistics_iter.map( + |statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int32(s) => Some(*s.$func() as i128), + ParquetStatistics::Int64(s) => Some(*s.$func() as i128), + ParquetStatistics::ByteArray(s) => { + Some(from_bytes_to_i128(s.$bytes_func())) + } + ParquetStatistics::FixedLenByteArray(s) => { + Some(from_bytes_to_i128(s.$bytes_func())) + } + _ => None, + } + }) + }, + )); + + array = array.with_precision_and_scale(*precision, *scale)?; + Some(Arc::new(array) as ArrayRef) } + Some(DataType::Binary) => Some(Arc::new(BinaryArray::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::ByteArray(s) => { + Some(s.$bytes_func().to_vec()) + } + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::Utf8) => Some(Arc::new(StringArray::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::ByteArray(s) => { + let res = std::str::from_utf8(s.$bytes_func()) + .map(|s| s.to_string()) + .ok(); + + if res.is_none() { + log::debug!( + "Utf8 statistics is a non-UTF8 value, ignoring it." + ); + } + + Some(res.unwrap_or_else(|| "".to_string())) + } + _ => None, + } + }) + }), + )) as ArrayRef), + Some(DataType::FixedSizeBinary(size)) => Some(Arc::new(FixedSizeBinaryArray::from( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::FixedLenByteArray(s) => { + let value = s.$bytes_func(); + let value = if value.len().try_into() == Ok(*size) { + Some(value) + } else { + log::debug!( + "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", + size, + value.len(), + ); + None + }; + value + } + _ => None, + } + }) + }).collect::>(), + )) as ArrayRef), + Some(DataType::Timestamp(_, _)) => { + log::info!( + "Timestamp statistics are not supported, returning none" + ); + None + }, + _ => None, } }}; } @@ -211,9 +383,13 @@ pub(crate) fn min_statistics<'a, I: Iterator Result { - let scalars = iterator - .map(|x| x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type)))); - collect_scalars(data_type, scalars) + match get_statistics_iter!(iterator, min, min_bytes, Some(data_type)) { + Some(array) => Ok(array), + None => { + log::debug!("Unsupported data type for statistics"); + Ok(new_empty_array(data_type)) + } + } } /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`] @@ -221,22 +397,11 @@ pub(crate) fn max_statistics<'a, I: Iterator Result { - let scalars = iterator - .map(|x| x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type)))); - collect_scalars(data_type, scalars) -} - -/// Builds an array from an iterator of ScalarValue -fn collect_scalars>>( - data_type: &DataType, - iterator: I, -) -> Result { - let mut scalars = iterator.peekable(); - match scalars.peek().is_none() { - true => Ok(new_empty_array(data_type)), - false => { - let null = ScalarValue::try_from(data_type)?; - ScalarValue::iter_to_array(scalars.map(|x| x.unwrap_or_else(|| null.clone()))) + match get_statistics_iter!(iterator, max, max_bytes, Some(data_type)) { + Some(array) => Ok(array), + None => { + log::debug!("Unsupported data type for statistics"); + Ok(new_empty_array(data_type)) } } } @@ -536,9 +701,6 @@ mod test { } #[test] - #[should_panic( - expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Int64, got TimestampNanosecond(NULL, None)" - )] // Due to https://github.com/apache/datafusion/issues/8295 fn roundtrip_timestamp() { Test { @@ -556,8 +718,8 @@ mod test { None, None, ]), - expected_min: timestamp_array([Some(1), Some(5), None]), - expected_max: timestamp_array([Some(3), Some(9), None]), + expected_min: timestamp_array([]), + expected_max: timestamp_array([]), } .run() } @@ -914,8 +1076,8 @@ mod test { // File has no min/max for timestamp_col .with_column(ExpectedColumn { name: "timestamp_col", - expected_min: timestamp_array([None]), - expected_max: timestamp_array([None]), + expected_min: timestamp_array([]), + expected_max: timestamp_array([]), }) .with_column(ExpectedColumn { name: "year", From 2faec57e0fc89bf6d90c069a54daa7c991f43d09 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 29 May 2024 10:44:47 +0000 Subject: [PATCH 03/19] Improve performance of extracting statistics from parquet files --- .../physical_plan/parquet/statistics.rs | 29 +++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index ee6720b338fc4..90776f917225a 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -342,11 +342,22 @@ macro_rules! get_statistics_iter { }) }).collect::>(), )) as ArrayRef), - Some(DataType::Timestamp(_, _)) => { - log::info!( - "Timestamp statistics are not supported, returning none" - ); - None + Some(DataType::Timestamp(_unit, _literal)) => { + Some(Arc::new(Int64Array::from_iter( + $column_statistics_iter.map(|statistic| { + statistic.and_then(|x| { + if !x.has_min_max_set() { + return None; + } + match x { + ParquetStatistics::Int64(s) => { + Some(*s.$func()) + } + _ => None, + } + }) + }), + )) as ArrayRef) }, _ => None, } @@ -718,8 +729,8 @@ mod test { None, None, ]), - expected_min: timestamp_array([]), - expected_max: timestamp_array([]), + expected_min: i64_array([Some(1), Some(5), None]), + expected_max: i64_array([Some(3), Some(9), None]), } .run() } @@ -1076,8 +1087,8 @@ mod test { // File has no min/max for timestamp_col .with_column(ExpectedColumn { name: "timestamp_col", - expected_min: timestamp_array([]), - expected_max: timestamp_array([]), + expected_min: i64_array([None]), + expected_max: i64_array([None]), }) .with_column(ExpectedColumn { name: "year", From b2ee8051b07e82be585cf094c271d1b1a1dddf30 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 30 May 2024 09:04:15 +0000 Subject: [PATCH 04/19] Revert "Improve performance of extracting statistics from parquet files" This reverts commit 2faec57e0fc89bf6d90c069a54daa7c991f43d09. --- .../physical_plan/parquet/statistics.rs | 29 ++++++------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 90776f917225a..ee6720b338fc4 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -342,22 +342,11 @@ macro_rules! get_statistics_iter { }) }).collect::>(), )) as ArrayRef), - Some(DataType::Timestamp(_unit, _literal)) => { - Some(Arc::new(Int64Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int64(s) => { - Some(*s.$func()) - } - _ => None, - } - }) - }), - )) as ArrayRef) + Some(DataType::Timestamp(_, _)) => { + log::info!( + "Timestamp statistics are not supported, returning none" + ); + None }, _ => None, } @@ -729,8 +718,8 @@ mod test { None, None, ]), - expected_min: i64_array([Some(1), Some(5), None]), - expected_max: i64_array([Some(3), Some(9), None]), + expected_min: timestamp_array([]), + expected_max: timestamp_array([]), } .run() } @@ -1087,8 +1076,8 @@ mod test { // File has no min/max for timestamp_col .with_column(ExpectedColumn { name: "timestamp_col", - expected_min: i64_array([None]), - expected_max: i64_array([None]), + expected_min: timestamp_array([]), + expected_max: timestamp_array([]), }) .with_column(ExpectedColumn { name: "year", From 8d7e9f2274a84bb1e471a6dcfb841bc58b1ea4cd Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 30 May 2024 09:04:34 +0000 Subject: [PATCH 05/19] Revert "Staging the change for faster stat" This reverts commit 095ac391e17f9d4c3d841971316fb47dc371b791. --- .../physical_plan/parquet/statistics.rs | 464 ++++++------------ 1 file changed, 151 insertions(+), 313 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index ee6720b338fc4..ae8395aef6a41 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -20,14 +20,11 @@ // TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328 use arrow::{array::ArrayRef, datatypes::DataType}; -use arrow_array::{ - new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, - Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, -}; +use arrow_array::{new_empty_array, new_null_array, UInt64Array}; use arrow_schema::{Field, FieldRef, Schema}; -use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; +use datafusion_common::{ + internal_datafusion_err, internal_err, plan_err, Result, ScalarValue, +}; use parquet::file::metadata::ParquetMetaData; use parquet::file::statistics::Statistics as ParquetStatistics; use parquet::schema::types::SchemaDescriptor; @@ -55,300 +52,131 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] { result } -macro_rules! get_statistics_iter { - ($column_statistics_iter:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{ - match $target_arrow_type { - Some(DataType::Boolean) => Some(Arc::new(BooleanArray::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Boolean(s) => Some(*s.$func()), - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::Int8) => Some(Arc::new(Int8Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => { - Some((*s.$func()).try_into().unwrap()) - } - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::Int16) => Some(Arc::new(Int16Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => { - Some((*s.$func()).try_into().unwrap()) - } - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::Int32) => Some(Arc::new(Int32Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => Some(*s.$func()), - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::Int64) => Some(Arc::new(Int64Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int64(s) => { - Some((*s.$func()).try_into().unwrap()) - } - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::UInt8) => { - Some(Arc::new(UInt8Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => { - Some((*s.$func()).try_into().unwrap()) - } - _ => None, - } - }) - }), - )) as ArrayRef) - } - Some(DataType::UInt16) => { - Some(Arc::new(UInt16Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => { - Some((*s.$func()).try_into().unwrap()) - } - _ => None, - } - }) - }), - )) as ArrayRef) - } - Some(DataType::UInt32) => { - Some(Arc::new(UInt32Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => { - Some(*s.$func() as u32) - } - _ => None, - } - }) - }), - )) as ArrayRef) +/// Extract a single min/max statistics from a [`ParquetStatistics`] object +/// +/// * `$column_statistics` is the `ParquetStatistics` object +/// * `$func is the function` (`min`/`max`) to call to get the value +/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get the value as bytes +/// * `$target_arrow_type` is the [`DataType`] of the target statistics +macro_rules! get_statistic { + ($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{ + if !$column_statistics.has_min_max_set() { + return None; + } + match $column_statistics { + ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), + ParquetStatistics::Int32(s) => { + match $target_arrow_type { + // int32 to decimal with the precision and scale + Some(DataType::Decimal128(precision, scale)) => { + Some(ScalarValue::Decimal128( + Some(*s.$func() as i128), + *precision, + *scale, + )) + } + Some(DataType::Int8) => { + Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::Int16) => { + Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::UInt8) => { + Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::UInt16) => { + Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::UInt32) => { + Some(ScalarValue::UInt32(Some((*s.$func()) as u32))) + } + Some(DataType::Date32) => { + Some(ScalarValue::Date32(Some(*s.$func()))) + } + Some(DataType::Date64) => { + Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000))) + } + _ => Some(ScalarValue::Int32(Some(*s.$func()))), + } } - Some(DataType::UInt64) => { - Some(Arc::new(UInt64Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int64(s) => { - Some(*s.$func() as u64) - } - _ => None, - } - }) - }), - )) as ArrayRef) + ParquetStatistics::Int64(s) => { + match $target_arrow_type { + // int64 to decimal with the precision and scale + Some(DataType::Decimal128(precision, scale)) => { + Some(ScalarValue::Decimal128( + Some(*s.$func() as i128), + *precision, + *scale, + )) + } + Some(DataType::UInt64) => { + Some(ScalarValue::UInt64(Some((*s.$func()) as u64))) + } + _ => Some(ScalarValue::Int64(Some(*s.$func()))), + } } - Some(DataType::Date32) => Some(Arc::new(Date32Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => Some(*s.$func()), - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::Date64) => Some(Arc::new(Date64Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => { - Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000) - } - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::Float32) => Some(Arc::new(Float32Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Float(s) => Some(*s.$func()), - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::Float64) => Some(Arc::new(Float64Array::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Double(s) => Some(*s.$func()), - _ => None, + // 96 bit ints not supported + ParquetStatistics::Int96(_) => None, + ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), + ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), + ParquetStatistics::ByteArray(s) => { + match $target_arrow_type { + // decimal data type + Some(DataType::Decimal128(precision, scale)) => { + Some(ScalarValue::Decimal128( + Some(from_bytes_to_i128(s.$bytes_func())), + *precision, + *scale, + )) + } + Some(DataType::Binary) => { + Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec()))) + } + _ => { + let s = std::str::from_utf8(s.$bytes_func()) + .map(|s| s.to_string()) + .ok(); + if s.is_none() { + log::debug!( + "Utf8 statistics is a non-UTF8 value, ignoring it." + ); } - }) - }), - )) as ArrayRef), - Some(DataType::Decimal128(precision, scale)) => { - let mut array = Decimal128Array::from_iter($column_statistics_iter.map( - |statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::Int32(s) => Some(*s.$func() as i128), - ParquetStatistics::Int64(s) => Some(*s.$func() as i128), - ParquetStatistics::ByteArray(s) => { - Some(from_bytes_to_i128(s.$bytes_func())) - } - ParquetStatistics::FixedLenByteArray(s) => { - Some(from_bytes_to_i128(s.$bytes_func())) - } - _ => None, - } - }) - }, - )); - - array = array.with_precision_and_scale(*precision, *scale)?; - Some(Arc::new(array) as ArrayRef) + Some(ScalarValue::Utf8(s)) + } + } + } + // type not fully supported yet + ParquetStatistics::FixedLenByteArray(s) => { + match $target_arrow_type { + // just support specific logical data types, there are others each + // with their own ordering + Some(DataType::Decimal128(precision, scale)) => { + Some(ScalarValue::Decimal128( + Some(from_bytes_to_i128(s.$bytes_func())), + *precision, + *scale, + )) + } + Some(DataType::FixedSizeBinary(size)) => { + let value = s.$bytes_func().to_vec(); + let value = if value.len().try_into() == Ok(*size) { + Some(value) + } else { + log::debug!( + "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", + size, + value.len(), + ); + None + }; + Some(ScalarValue::FixedSizeBinary( + *size, + value, + )) + } + _ => None, + } } - Some(DataType::Binary) => Some(Arc::new(BinaryArray::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::ByteArray(s) => { - Some(s.$bytes_func().to_vec()) - } - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::Utf8) => Some(Arc::new(StringArray::from_iter( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::ByteArray(s) => { - let res = std::str::from_utf8(s.$bytes_func()) - .map(|s| s.to_string()) - .ok(); - - if res.is_none() { - log::debug!( - "Utf8 statistics is a non-UTF8 value, ignoring it." - ); - } - - Some(res.unwrap_or_else(|| "".to_string())) - } - _ => None, - } - }) - }), - )) as ArrayRef), - Some(DataType::FixedSizeBinary(size)) => Some(Arc::new(FixedSizeBinaryArray::from( - $column_statistics_iter.map(|statistic| { - statistic.and_then(|x| { - if !x.has_min_max_set() { - return None; - } - match x { - ParquetStatistics::FixedLenByteArray(s) => { - let value = s.$bytes_func(); - let value = if value.len().try_into() == Ok(*size) { - Some(value) - } else { - log::debug!( - "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", - size, - value.len(), - ); - None - }; - value - } - _ => None, - } - }) - }).collect::>(), - )) as ArrayRef), - Some(DataType::Timestamp(_, _)) => { - log::info!( - "Timestamp statistics are not supported, returning none" - ); - None - }, - _ => None, } }}; } @@ -383,13 +211,9 @@ pub(crate) fn min_statistics<'a, I: Iterator Result { - match get_statistics_iter!(iterator, min, min_bytes, Some(data_type)) { - Some(array) => Ok(array), - None => { - log::debug!("Unsupported data type for statistics"); - Ok(new_empty_array(data_type)) - } - } + let scalars = iterator + .map(|x| x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type)))); + collect_scalars(data_type, scalars) } /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`] @@ -397,11 +221,22 @@ pub(crate) fn max_statistics<'a, I: Iterator Result { - match get_statistics_iter!(iterator, max, max_bytes, Some(data_type)) { - Some(array) => Ok(array), - None => { - log::debug!("Unsupported data type for statistics"); - Ok(new_empty_array(data_type)) + let scalars = iterator + .map(|x| x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type)))); + collect_scalars(data_type, scalars) +} + +/// Builds an array from an iterator of ScalarValue +fn collect_scalars>>( + data_type: &DataType, + iterator: I, +) -> Result { + let mut scalars = iterator.peekable(); + match scalars.peek().is_none() { + true => Ok(new_empty_array(data_type)), + false => { + let null = ScalarValue::try_from(data_type)?; + ScalarValue::iter_to_array(scalars.map(|x| x.unwrap_or_else(|| null.clone()))) } } } @@ -701,6 +536,9 @@ mod test { } #[test] + #[should_panic( + expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Int64, got TimestampNanosecond(NULL, None)" + )] // Due to https://github.com/apache/datafusion/issues/8295 fn roundtrip_timestamp() { Test { @@ -718,8 +556,8 @@ mod test { None, None, ]), - expected_min: timestamp_array([]), - expected_max: timestamp_array([]), + expected_min: timestamp_array([Some(1), Some(5), None]), + expected_max: timestamp_array([Some(3), Some(9), None]), } .run() } @@ -1076,8 +914,8 @@ mod test { // File has no min/max for timestamp_col .with_column(ExpectedColumn { name: "timestamp_col", - expected_min: timestamp_array([]), - expected_max: timestamp_array([]), + expected_min: timestamp_array([None]), + expected_max: timestamp_array([None]), }) .with_column(ExpectedColumn { name: "year", From 5a3fdec113297df8419fe566ca9ce626efee00df Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 30 May 2024 10:39:24 +0000 Subject: [PATCH 06/19] Refine using the iterator idea --- .../physical_plan/parquet/statistics.rs | 277 +++++++++++++++++- 1 file changed, 263 insertions(+), 14 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index ae8395aef6a41..82782dd8dcbed 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -20,7 +20,11 @@ // TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328 use arrow::{array::ArrayRef, datatypes::DataType}; -use arrow_array::{new_empty_array, new_null_array, UInt64Array}; +use arrow_array::{ + new_empty_array, new_null_array, BooleanArray, Date32Array, Date64Array, + Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + UInt16Array, UInt32Array, UInt64Array, UInt8Array, +}; use arrow_schema::{Field, FieldRef, Schema}; use datafusion_common::{ internal_datafusion_err, internal_err, plan_err, Result, ScalarValue, @@ -52,6 +56,94 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] { result } +macro_rules! make_stats_iterator { + ($iterator_type:ident, $func:ident, $parquet_statistics_type:path, $stat_value_type:ty) => { + struct $iterator_type<'a, I> + where + I: Iterator>, + { + iter: I, + } + + impl<'a, I> $iterator_type<'a, I> + where + I: Iterator>, + { + fn new(iter: I) -> Self { + Self { iter } + } + } + + impl<'a, I> Iterator for $iterator_type<'a, I> + where + I: Iterator>, + { + type Item = Option<&'a $stat_value_type>; + + fn next(&mut self) -> Option { + let next = self.iter.next(); + next.map(|x| { + x.and_then(|stats| match stats { + $parquet_statistics_type(s) if stats.has_min_max_set() => { + Some(s.$func()) + } + _ => None, + }) + }) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } + } + }; +} + +make_stats_iterator!( + MinBooleanStatsIterator, + min, + ParquetStatistics::Boolean, + bool +); +make_stats_iterator!( + MaxBooleanStatsIterator, + max, + ParquetStatistics::Boolean, + bool +); +make_stats_iterator!(MinInt32StatsIterator, min, ParquetStatistics::Int32, i32); +make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32); +make_stats_iterator!(MinInt64StatsIterator, min, ParquetStatistics::Int64, i64); +make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64); +make_stats_iterator!(MinFloatStatsIterator, min, ParquetStatistics::Float, f32); +make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32); +make_stats_iterator!(MinDoubleStatsIterator, min, ParquetStatistics::Double, f64); +make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64); +make_stats_iterator!( + MinByteArrayStatsIterator, + min_bytes, + ParquetStatistics::ByteArray, + [u8] +); +make_stats_iterator!( + MaxByteArrayStatsIterator, + max_bytes, + ParquetStatistics::ByteArray, + [u8] +); +make_stats_iterator!( + MinFixedLenByteArrayStatsIterator, + min_bytes, + ParquetStatistics::FixedLenByteArray, + [u8] +); +make_stats_iterator!( + MaxFixedLenByteArrayStatsIterator, + max_bytes, + ParquetStatistics::FixedLenByteArray, + [u8] +); + /// Extract a single min/max statistics from a [`ParquetStatistics`] object /// /// * `$column_statistics` is the `ParquetStatistics` object @@ -211,9 +303,89 @@ pub(crate) fn min_statistics<'a, I: Iterator Result { - let scalars = iterator - .map(|x| x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type)))); - collect_scalars(data_type, scalars) + match data_type { + DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter( + MinBooleanStatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Int8 => Ok(Arc::new(Int8Array::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = i8::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::Int16 => Ok(Arc::new(Int16Array::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = i16::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::Int32 => Ok(Arc::new(Int32Array::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Int64 => Ok(Arc::new(Int64Array::from_iter( + MinInt64StatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = u8::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = u16::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| x.map(|x| *x as u32)), + ))), + DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter( + MinInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)), + ))), + DataType::Float32 => Ok(Arc::new(Float32Array::from_iter( + MinFloatStatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Float64 => Ok(Arc::new(Float64Array::from_iter( + MinDoubleStatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Date32 => Ok(Arc::new(Date32Array::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Date64 => Ok(Arc::new(Date64Array::from_iter( + MinInt32StatsIterator::new(iterator) + .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)), + ))), + DataType::Timestamp(_, _) => Ok(Arc::new(Int64Array::from_iter( + MinInt64StatsIterator::new(iterator).map(|x| x.copied()), + ))), + _ => { + let scalars = iterator.map(|x| { + x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type))) + }); + collect_scalars(data_type, scalars) + } + } } /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`] @@ -221,9 +393,89 @@ pub(crate) fn max_statistics<'a, I: Iterator Result { - let scalars = iterator - .map(|x| x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type)))); - collect_scalars(data_type, scalars) + match data_type { + DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter( + MaxBooleanStatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Int8 => Ok(Arc::new(Int8Array::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = i8::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::Int16 => Ok(Arc::new(Int16Array::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = i16::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::Int32 => Ok(Arc::new(Int32Array::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Int64 => Ok(Arc::new(Int64Array::from_iter( + MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = u8::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = u16::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| x.map(|x| *x as u32)), + ))), + DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter( + MaxInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)), + ))), + DataType::Float32 => Ok(Arc::new(Float32Array::from_iter( + MaxFloatStatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Float64 => Ok(Arc::new(Float64Array::from_iter( + MaxDoubleStatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Date32 => Ok(Arc::new(Date32Array::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| x.copied()), + ))), + DataType::Date64 => Ok(Arc::new(Date64Array::from_iter( + MaxInt32StatsIterator::new(iterator) + .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)), + ))), + DataType::Timestamp(_, _) => Ok(Arc::new(Int64Array::from_iter( + MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), + ))), + _ => { + let scalars = iterator.map(|x| { + x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type))) + }); + collect_scalars(data_type, scalars) + } + } } /// Builds an array from an iterator of ScalarValue @@ -536,9 +788,6 @@ mod test { } #[test] - #[should_panic( - expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Int64, got TimestampNanosecond(NULL, None)" - )] // Due to https://github.com/apache/datafusion/issues/8295 fn roundtrip_timestamp() { Test { @@ -556,8 +805,8 @@ mod test { None, None, ]), - expected_min: timestamp_array([Some(1), Some(5), None]), - expected_max: timestamp_array([Some(3), Some(9), None]), + expected_min: i64_array([Some(1), Some(5), None]), + expected_max: i64_array([Some(3), Some(9), None]), } .run() } @@ -914,8 +1163,8 @@ mod test { // File has no min/max for timestamp_col .with_column(ExpectedColumn { name: "timestamp_col", - expected_min: timestamp_array([None]), - expected_max: timestamp_array([None]), + expected_min: i64_array([None]), + expected_max: i64_array([None]), }) .with_column(ExpectedColumn { name: "year", From 9f552e86561bc0260ed12643c1296ad8c8ec6398 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 30 May 2024 11:09:47 +0000 Subject: [PATCH 07/19] Add the rest types --- .../physical_plan/parquet/statistics.rs | 86 ++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 82782dd8dcbed..58399a5ce3d97 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -21,9 +21,9 @@ use arrow::{array::ArrayRef, datatypes::DataType}; use arrow_array::{ - new_empty_array, new_null_array, BooleanArray, Date32Array, Date64Array, - Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, - UInt16Array, UInt32Array, UInt64Array, UInt8Array, + new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, + FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, + Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{Field, FieldRef, Schema}; use datafusion_common::{ @@ -56,8 +56,25 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] { result } +////// Define an adapter iterator for extracting statistics from an iterator of +/// `ParquetStatistics` +/// +/// +/// Handles checking if the statistics are present and valid with the correct type. +/// +/// Parameters: +/// * `$iterator_type` is the name of the iterator type (e.g. `MinBooleanStatsIterator`) +/// * `$func` is the function to call to get the value (e.g. `min` or `max`) +/// * `$parquet_statistics_type` is the type of the statistics (e.g. `ParquetStatistics::Boolean`) +/// * `$stat_value_type` is the type of the statistics value (e.g. `bool`) macro_rules! make_stats_iterator { ($iterator_type:ident, $func:ident, $parquet_statistics_type:path, $stat_value_type:ty) => { + /// Maps an iterator of `ParquetStatistics` into an iterator of + /// `&$stat_value_type`` + /// + /// Yielded elements: + /// * Some(stats) if valid + /// * None if the statistics are not present, not valid, or not $stat_value_type struct $iterator_type<'a, I> where I: Iterator>, @@ -69,17 +86,20 @@ macro_rules! make_stats_iterator { where I: Iterator>, { + /// Create a new iterator to extract the statistics fn new(iter: I) -> Self { Self { iter } } } + /// Implement the Iterator trait for the iterator impl<'a, I> Iterator for $iterator_type<'a, I> where I: Iterator>, { type Item = Option<&'a $stat_value_type>; + /// return the next statistics value fn next(&mut self) -> Option { let next = self.iter.next(); next.map(|x| { @@ -379,6 +399,36 @@ pub(crate) fn min_statistics<'a, I: Iterator Ok(Arc::new(Int64Array::from_iter( MinInt64StatsIterator::new(iterator).map(|x| x.copied()), ))), + DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( + MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())), + ))), + DataType::Utf8 => Ok(Arc::new(StringArray::from_iter( + MinByteArrayStatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); + if res.is_none() { + log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); + } + res + }) + }), + ))), + DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from( + MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if x.len().try_into() == Ok(*size) { + Some(x) + } else { + log::debug!( + "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", + size, + x.len(), + ); + None + } + }) + }).collect::>(), + ))), _ => { let scalars = iterator.map(|x| { x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type))) @@ -469,6 +519,36 @@ pub(crate) fn max_statistics<'a, I: Iterator Ok(Arc::new(Int64Array::from_iter( MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), ))), + DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( + MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())), + ))), + DataType::Utf8 => Ok(Arc::new(StringArray::from_iter( + MaxByteArrayStatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); + if res.is_none() { + log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); + } + res + }) + }), + ))), + DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from( + MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + if x.len().try_into() == Ok(*size) { + Some(x) + } else { + log::debug!( + "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", + size, + x.len(), + ); + None + } + }) + }).collect::>(), + ))), _ => { let scalars = iterator.map(|x| { x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type))) From 7f1695664ddd79551caa7e1b66708218ed563781 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 30 May 2024 13:36:44 -0400 Subject: [PATCH 08/19] Consolidate Decimal statistics extraction --- .../physical_plan/parquet/statistics.rs | 235 +++++++----------- 1 file changed, 86 insertions(+), 149 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 58399a5ce3d97..d517ce3cfabd6 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -22,13 +22,12 @@ use arrow::{array::ArrayRef, datatypes::DataType}; use arrow_array::{ new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, - FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, + Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, }; use arrow_schema::{Field, FieldRef, Schema}; -use datafusion_common::{ - internal_datafusion_err, internal_err, plan_err, Result, ScalarValue, -}; +use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; use parquet::file::metadata::ParquetMetaData; use parquet::file::statistics::Statistics as ParquetStatistics; use parquet::schema::types::SchemaDescriptor; @@ -56,7 +55,7 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] { result } -////// Define an adapter iterator for extracting statistics from an iterator of +/// Define an adapter iterator for extracting statistics from an iterator of /// `ParquetStatistics` /// /// @@ -164,135 +163,78 @@ make_stats_iterator!( [u8] ); -/// Extract a single min/max statistics from a [`ParquetStatistics`] object +/// Special iterator adapter for extracting i128 values from from an iterator of +/// `ParquetStatistics` +/// +/// Handles checking if the statistics are present and valid with the correct type. +/// +/// Depending on the parquet file, the statistics for `Decimal128` can be stored as +/// `Int32`, `Int64` or `ByteArray` or `FixedSizeByteArray` :mindblown: /// -/// * `$column_statistics` is the `ParquetStatistics` object -/// * `$func is the function` (`min`/`max`) to call to get the value -/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get the value as bytes -/// * `$target_arrow_type` is the [`DataType`] of the target statistics -macro_rules! get_statistic { - ($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{ - if !$column_statistics.has_min_max_set() { - return None; +/// This iterator handles all cases, extracting the values +/// and converting it to `i128`. +/// +/// Parameters: +/// * `$iterator_type` is the name of the iterator type (e.g. `MinBooleanStatsIterator`) +/// * `$func` is the function to call to get the value (e.g. `min` or `max`) +/// * `$bytes_func` is the function to call to get the value as bytes (e.g. `min_bytes` or `max_bytes`) +macro_rules! make_decimal_stats_iterator { + ($iterator_type:ident, $func:ident, $bytes_func:ident) => { + struct $iterator_type<'a, I> + where + I: Iterator>, + { + iter: I, } - match $column_statistics { - ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))), - ParquetStatistics::Int32(s) => { - match $target_arrow_type { - // int32 to decimal with the precision and scale - Some(DataType::Decimal128(precision, scale)) => { - Some(ScalarValue::Decimal128( - Some(*s.$func() as i128), - *precision, - *scale, - )) - } - Some(DataType::Int8) => { - Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap()))) - } - Some(DataType::Int16) => { - Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap()))) - } - Some(DataType::UInt8) => { - Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap()))) - } - Some(DataType::UInt16) => { - Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap()))) - } - Some(DataType::UInt32) => { - Some(ScalarValue::UInt32(Some((*s.$func()) as u32))) - } - Some(DataType::Date32) => { - Some(ScalarValue::Date32(Some(*s.$func()))) - } - Some(DataType::Date64) => { - Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000))) - } - _ => Some(ScalarValue::Int32(Some(*s.$func()))), - } - } - ParquetStatistics::Int64(s) => { - match $target_arrow_type { - // int64 to decimal with the precision and scale - Some(DataType::Decimal128(precision, scale)) => { - Some(ScalarValue::Decimal128( - Some(*s.$func() as i128), - *precision, - *scale, - )) - } - Some(DataType::UInt64) => { - Some(ScalarValue::UInt64(Some((*s.$func()) as u64))) - } - _ => Some(ScalarValue::Int64(Some(*s.$func()))), - } + + impl<'a, I> $iterator_type<'a, I> + where + I: Iterator>, + { + fn new(iter: I) -> Self { + Self { iter } } - // 96 bit ints not supported - ParquetStatistics::Int96(_) => None, - ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))), - ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))), - ParquetStatistics::ByteArray(s) => { - match $target_arrow_type { - // decimal data type - Some(DataType::Decimal128(precision, scale)) => { - Some(ScalarValue::Decimal128( - Some(from_bytes_to_i128(s.$bytes_func())), - *precision, - *scale, - )) - } - Some(DataType::Binary) => { - Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec()))) - } - _ => { - let s = std::str::from_utf8(s.$bytes_func()) - .map(|s| s.to_string()) - .ok(); - if s.is_none() { - log::debug!( - "Utf8 statistics is a non-UTF8 value, ignoring it." - ); + } + + impl<'a, I> Iterator for $iterator_type<'a, I> + where + I: Iterator>, + { + type Item = Option; + + fn next(&mut self) -> Option { + let next = self.iter.next(); + next.map(|x| { + x.and_then(|stats| match stats { + ParquetStatistics::Int32(s) if stats.has_min_max_set() => { + Some(*s.$func() as i128) } - Some(ScalarValue::Utf8(s)) - } - } + ParquetStatistics::Int64(s) if stats.has_min_max_set() => { + Some(*s.$func() as i128) + } + ParquetStatistics::ByteArray(s) if stats.has_min_max_set() => { + Some(from_bytes_to_i128(s.$bytes_func())) + } + ParquetStatistics::FixedLenByteArray(s) + if stats.has_min_max_set() => + { + Some(from_bytes_to_i128(s.$bytes_func())) + } + _ => None, + }) + }) } - // type not fully supported yet - ParquetStatistics::FixedLenByteArray(s) => { - match $target_arrow_type { - // just support specific logical data types, there are others each - // with their own ordering - Some(DataType::Decimal128(precision, scale)) => { - Some(ScalarValue::Decimal128( - Some(from_bytes_to_i128(s.$bytes_func())), - *precision, - *scale, - )) - } - Some(DataType::FixedSizeBinary(size)) => { - let value = s.$bytes_func().to_vec(); - let value = if value.len().try_into() == Ok(*size) { - Some(value) - } else { - log::debug!( - "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", - size, - value.len(), - ); - None - }; - Some(ScalarValue::FixedSizeBinary( - *size, - value, - )) - } - _ => None, - } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() } } - }}; + }; } +make_decimal_stats_iterator!(MinDecimal128StatsIterator, min, min_bytes); +make_decimal_stats_iterator!(MaxDecimal128StatsIterator, max, max_bytes); + /// Lookups up the parquet column by name /// /// Returns the parquet column index and the corresponding arrow field @@ -429,11 +371,16 @@ pub(crate) fn min_statistics<'a, I: Iterator>(), ))), + DataType::Decimal128(precision, scale) => { + let arr = Decimal128Array::from_iter( + MinDecimal128StatsIterator::new(iterator) + ).with_precision_and_scale(*precision, *scale)?; + Ok(Arc::new(arr) as ArrayRef) + }, _ => { - let scalars = iterator.map(|x| { - x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type))) - }); - collect_scalars(data_type, scalars) + let len = iterator.count(); + // don't know how to extract statistics, so return a null array + Ok(new_null_array(data_type, len)) } } } @@ -549,26 +496,16 @@ pub(crate) fn max_statistics<'a, I: Iterator>(), ))), - _ => { - let scalars = iterator.map(|x| { - x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type))) - }); - collect_scalars(data_type, scalars) + DataType::Decimal128(precision, scale) => { + let arr = Decimal128Array::from_iter( + MaxDecimal128StatsIterator::new(iterator) + ).with_precision_and_scale(*precision, *scale)?; + Ok(Arc::new(arr) as ArrayRef) } - } -} - -/// Builds an array from an iterator of ScalarValue -fn collect_scalars>>( - data_type: &DataType, - iterator: I, -) -> Result { - let mut scalars = iterator.peekable(); - match scalars.peek().is_none() { - true => Ok(new_empty_array(data_type)), - false => { - let null = ScalarValue::try_from(data_type)?; - ScalarValue::iter_to_array(scalars.map(|x| x.unwrap_or_else(|| null.clone()))) + _ => { + let len = iterator.count(); + // don't know how to extract statistics, so return a null array + Ok(new_null_array(data_type, len)) } } } From 26f5431c3998d4ee54b76a15ae0d239101f8cada Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 30 May 2024 13:37:41 -0400 Subject: [PATCH 09/19] clippy --- .../src/datasource/physical_plan/parquet/statistics.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index d517ce3cfabd6..0b139aa3ad991 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -21,7 +21,7 @@ use arrow::{array::ArrayRef, datatypes::DataType}; use arrow_array::{ - new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, + new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, @@ -661,11 +661,7 @@ mod test { use super::*; use arrow::compute::kernels::cast_utils::Parser; use arrow::datatypes::{Date32Type, Date64Type}; - use arrow_array::{ - new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array, - Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, - }; + use arrow_array::{new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, new_empty_array}; use arrow_schema::{Field, SchemaRef}; use bytes::Bytes; use datafusion_common::test_util::parquet_test_data; From 860cb009b8601f5c7f2a784f0d7803cf729c271b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 30 May 2024 13:47:50 -0400 Subject: [PATCH 10/19] Simplify --- .../physical_plan/parquet/statistics.rs | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 0b139aa3ad991..b76434353777a 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -21,10 +21,9 @@ use arrow::{array::ArrayRef, datatypes::DataType}; use arrow_array::{ - new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, - Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, + new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, + FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, + Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{Field, FieldRef, Schema}; use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; @@ -205,22 +204,21 @@ macro_rules! make_decimal_stats_iterator { fn next(&mut self) -> Option { let next = self.iter.next(); next.map(|x| { - x.and_then(|stats| match stats { - ParquetStatistics::Int32(s) if stats.has_min_max_set() => { - Some(*s.$func() as i128) - } - ParquetStatistics::Int64(s) if stats.has_min_max_set() => { - Some(*s.$func() as i128) - } - ParquetStatistics::ByteArray(s) if stats.has_min_max_set() => { - Some(from_bytes_to_i128(s.$bytes_func())) + x.and_then(|stats| { + if !stats.has_min_max_set() { + return None; } - ParquetStatistics::FixedLenByteArray(s) - if stats.has_min_max_set() => - { - Some(from_bytes_to_i128(s.$bytes_func())) + match stats { + ParquetStatistics::Int32(s) => Some(*s.$func() as i128), + ParquetStatistics::Int64(s) => Some(*s.$func() as i128), + ParquetStatistics::ByteArray(s) => { + Some(from_bytes_to_i128(s.$bytes_func())) + } + ParquetStatistics::FixedLenByteArray(s) => { + Some(from_bytes_to_i128(s.$bytes_func())) + } + _ => None, } - _ => None, }) }) } @@ -661,7 +659,12 @@ mod test { use super::*; use arrow::compute::kernels::cast_utils::Parser; use arrow::datatypes::{Date32Type, Date64Type}; - use arrow_array::{new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, new_empty_array}; + use arrow_array::{ + new_empty_array, new_null_array, Array, BinaryArray, BooleanArray, Date32Array, + Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, RecordBatch, StringArray, StructArray, + TimestampNanosecondArray, + }; use arrow_schema::{Field, SchemaRef}; use bytes::Bytes; use datafusion_common::test_util::parquet_test_data; From 524dc2f4d85228212eb200584c6c719291704672 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 30 May 2024 18:32:23 -0400 Subject: [PATCH 11/19] Fix dictionary type --- .../core/src/datasource/physical_plan/parquet/statistics.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index b76434353777a..178e80d9641f7 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -375,6 +375,9 @@ pub(crate) fn min_statistics<'a, I: Iterator { + min_statistics(value_type, iterator) + } _ => { let len = iterator.count(); // don't know how to extract statistics, so return a null array @@ -500,6 +503,9 @@ pub(crate) fn max_statistics<'a, I: Iterator { + max_statistics(value_type, iterator) + } _ => { let len = iterator.count(); // don't know how to extract statistics, so return a null array From a7880453fbccdba10e3db2cfff06ccae00e9739c Mon Sep 17 00:00:00 2001 From: Xin Li Date: Fri, 31 May 2024 07:30:31 +0000 Subject: [PATCH 12/19] Fix incorrect statistics read for timestamp columns in parquet --- .../physical_plan/parquet/statistics.rs | 330 ++++++++++++-- .../core/tests/parquet/arrow_statistics.rs | 415 +++++++++++++++--- datafusion/core/tests/parquet/mod.rs | 52 ++- 3 files changed, 692 insertions(+), 105 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 178e80d9641f7..8ad2a5de0fcad 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -23,9 +23,11 @@ use arrow::{array::ArrayRef, datatypes::DataType}; use arrow_array::{ new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, + Int8Array, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, }; -use arrow_schema::{Field, FieldRef, Schema}; +use arrow_schema::{Field, FieldRef, Schema, TimeUnit}; use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result}; use parquet::file::metadata::ParquetMetaData; use parquet::file::statistics::Statistics as ParquetStatistics; @@ -336,9 +338,36 @@ pub(crate) fn min_statistics<'a, I: Iterator Ok(Arc::new(Int64Array::from_iter( - MinInt64StatsIterator::new(iterator).map(|x| x.copied()), - ))), + DataType::Timestamp(unit, timezone) =>{ + let iter = MinInt64StatsIterator::new(iterator).map(|x| x.copied()); + + Ok(match unit { + TimeUnit::Second => { + Arc::new(match timezone { + Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampSecondArray::from_iter(iter), + }) + } + TimeUnit::Millisecond => { + Arc::new(match timezone { + Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampMillisecondArray::from_iter(iter), + }) + } + TimeUnit::Microsecond => { + Arc::new(match timezone { + Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampMicrosecondArray::from_iter(iter), + }) + } + TimeUnit::Nanosecond => { + Arc::new(match timezone { + Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampNanosecondArray::from_iter(iter), + }) + } + }) + }, DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())), ))), @@ -373,7 +402,7 @@ pub(crate) fn min_statistics<'a, I: Iterator { min_statistics(value_type, iterator) @@ -464,9 +493,36 @@ pub(crate) fn max_statistics<'a, I: Iterator Ok(Arc::new(Int64Array::from_iter( - MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), - ))), + DataType::Timestamp(unit, timezone) => { + let iter = MaxInt64StatsIterator::new(iterator).map(|x| x.copied()); + + Ok(match unit { + TimeUnit::Second => { + Arc::new(match timezone { + Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampSecondArray::from_iter(iter), + }) + } + TimeUnit::Millisecond => { + Arc::new(match timezone { + Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampMillisecondArray::from_iter(iter), + }) + } + TimeUnit::Microsecond => { + Arc::new(match timezone { + Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampMicrosecondArray::from_iter(iter), + }) + } + TimeUnit::Nanosecond => { + Arc::new(match timezone { + Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampNanosecondArray::from_iter(iter), + }) + } + }) + } DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())), ))), @@ -813,22 +869,207 @@ mod test { // Due to https://github.com/apache/datafusion/issues/8295 fn roundtrip_timestamp() { Test { - input: timestamp_array([ - // row group 1 - Some(1), - None, - Some(3), - // row group 2 - Some(9), - Some(5), + input: timestamp_seconds_array( + [ + // row group 1 + Some(1), + None, + Some(3), + // row group 2 + Some(9), + Some(5), + None, + // row group 3 + None, + None, + None, + ], None, - // row group 3 + ), + expected_min: timestamp_seconds_array([Some(1), Some(5), None], None), + expected_max: timestamp_seconds_array([Some(3), Some(9), None], None), + } + .run(); + + Test { + input: timestamp_milliseconds_array( + [ + // row group 1 + Some(1), + None, + Some(3), + // row group 2 + Some(9), + Some(5), + None, + // row group 3 + None, + None, + None, + ], None, + ), + expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], None), + expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], None), + } + .run(); + + Test { + input: timestamp_microseconds_array( + [ + // row group 1 + Some(1), + None, + Some(3), + // row group 2 + Some(9), + Some(5), + None, + // row group 3 + None, + None, + None, + ], None, + ), + expected_min: timestamp_microseconds_array([Some(1), Some(5), None], None), + expected_max: timestamp_microseconds_array([Some(3), Some(9), None], None), + } + .run(); + + Test { + input: timestamp_nanoseconds_array( + [ + // row group 1 + Some(1), + None, + Some(3), + // row group 2 + Some(9), + Some(5), + None, + // row group 3 + None, + None, + None, + ], None, - ]), - expected_min: i64_array([Some(1), Some(5), None]), - expected_max: i64_array([Some(3), Some(9), None]), + ), + expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], None), + expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], None), + } + .run() + } + + #[test] + fn roundtrip_timestamp_timezoned() { + Test { + input: timestamp_seconds_array( + [ + // row group 1 + Some(1), + None, + Some(3), + // row group 2 + Some(9), + Some(5), + None, + // row group 3 + None, + None, + None, + ], + Some("UTC"), + ), + expected_min: timestamp_seconds_array([Some(1), Some(5), None], Some("UTC")), + expected_max: timestamp_seconds_array([Some(3), Some(9), None], Some("UTC")), + } + .run(); + + Test { + input: timestamp_milliseconds_array( + [ + // row group 1 + Some(1), + None, + Some(3), + // row group 2 + Some(9), + Some(5), + None, + // row group 3 + None, + None, + None, + ], + Some("UTC"), + ), + expected_min: timestamp_milliseconds_array( + [Some(1), Some(5), None], + Some("UTC"), + ), + expected_max: timestamp_milliseconds_array( + [Some(3), Some(9), None], + Some("UTC"), + ), + } + .run(); + + Test { + input: timestamp_microseconds_array( + [ + // row group 1 + Some(1), + None, + Some(3), + // row group 2 + Some(9), + Some(5), + None, + // row group 3 + None, + None, + None, + ], + Some("UTC"), + ), + expected_min: timestamp_microseconds_array( + [Some(1), Some(5), None], + Some("UTC"), + ), + expected_max: timestamp_microseconds_array( + [Some(3), Some(9), None], + Some("UTC"), + ), + } + .run(); + + Test { + input: timestamp_nanoseconds_array( + [ + // row group 1 + Some(1), + None, + Some(3), + // row group 2 + Some(9), + Some(5), + None, + // row group 3 + None, + None, + None, + ], + Some("UTC"), + ), + expected_min: timestamp_nanoseconds_array( + [Some(1), Some(5), None], + Some("UTC"), + ), + expected_max: timestamp_nanoseconds_array( + [Some(3), Some(9), None], + Some("UTC"), + ), } .run() } @@ -1185,8 +1426,8 @@ mod test { // File has no min/max for timestamp_col .with_column(ExpectedColumn { name: "timestamp_col", - expected_min: i64_array([None]), - expected_max: i64_array([None]), + expected_min: timestamp_nanoseconds_array([None], None), + expected_max: timestamp_nanoseconds_array([None], None), }) .with_column(ExpectedColumn { name: "year", @@ -1406,9 +1647,48 @@ mod test { Arc::new(array) } - fn timestamp_array(input: impl IntoIterator>) -> ArrayRef { + fn timestamp_seconds_array( + input: impl IntoIterator>, + timzezone: Option<&str>, + ) -> ArrayRef { + let array: TimestampSecondArray = input.into_iter().collect(); + match timzezone { + Some(tz) => Arc::new(array.with_timezone(tz)), + None => Arc::new(array), + } + } + + fn timestamp_milliseconds_array( + input: impl IntoIterator>, + timzezone: Option<&str>, + ) -> ArrayRef { + let array: TimestampMillisecondArray = input.into_iter().collect(); + match timzezone { + Some(tz) => Arc::new(array.with_timezone(tz)), + None => Arc::new(array), + } + } + + fn timestamp_microseconds_array( + input: impl IntoIterator>, + timzezone: Option<&str>, + ) -> ArrayRef { + let array: TimestampMicrosecondArray = input.into_iter().collect(); + match timzezone { + Some(tz) => Arc::new(array.with_timezone(tz)), + None => Arc::new(array), + } + } + + fn timestamp_nanoseconds_array( + input: impl IntoIterator>, + timzezone: Option<&str>, + ) -> ArrayRef { let array: TimestampNanosecondArray = input.into_iter().collect(); - Arc::new(array) + match timzezone { + Some(tz) => Arc::new(array.with_timezone(tz)), + None => Arc::new(array), + } } fn utf8_array<'a>(input: impl IntoIterator>) -> ArrayRef { diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index eebf3447cbe9f..fb155be639aab 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -22,12 +22,16 @@ use std::fs::File; use std::sync::Arc; use arrow::compute::kernels::cast_utils::Parser; -use arrow::datatypes::{Date32Type, Date64Type}; +use arrow::datatypes::{ + Date32Type, Date64Type, TimestampMicrosecondType, TimestampMillisecondType, + TimestampNanosecondType, TimestampSecondType, +}; use arrow_array::{ make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, - Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow_schema::{DataType, Field, Schema}; use datafusion::datasource::physical_plan::parquet::{ @@ -456,36 +460,42 @@ async fn test_int_8() { // timestamp #[tokio::test] async fn test_timestamp() { - // This creates a parquet files of 5 columns named "nanos", "micros", "millis", "seconds", "names" + // This creates a parquet files of 9 columns named "nanos", "nanos_timezoned", "micros", "micros_timezoned", "millis", "millis_timezoned", "seconds", "seconds_timezoned", "names" // "nanos" --> TimestampNanosecondArray + // "nanos_timezoned" --> TimestampNanosecondArray // "micros" --> TimestampMicrosecondArray + // "micros_timezoned" --> TimestampMicrosecondArray // "millis" --> TimestampMillisecondArray + // "millis_timezoned" --> TimestampMillisecondArray // "seconds" --> TimestampSecondArray + // "seconds_timezoned" --> TimestampSecondArray // "names" --> StringArray // // The file is created by 4 record batches, each has 5 rowws. // Since the row group isze is set to 5, those 4 batches will go into 4 row groups - // This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64" + // This creates a parquet files of 4 columns named "nanos", "nanos_timezoned", "micros", "micros_timezoned", "millis", "millis_timezoned", "seconds", "seconds_timezoned" let reader = TestReader { scenario: Scenario::Timestamps, row_per_group: 5, }; + let tz = "Pacific/Efate"; + Test { reader: reader.build().await, - // mins are [1577840461000000000, 1577840471000000000, 1577841061000000000, 1578704461000000000,] - expected_min: Arc::new(Int64Array::from(vec![ - 1577840461000000000, - 1577840471000000000, - 1577841061000000000, - 1578704461000000000, + // mins are [2020-01-01T01:01:01, 2020-01-01T01:01:11, 2020-01-01T01:11:01, 2020-01-10T01:01:01] + expected_min: Arc::new(TimestampNanosecondArray::from(vec![ + TimestampNanosecondType::parse("2020-01-01T01:01:01"), + TimestampNanosecondType::parse("2020-01-01T01:01:11"), + TimestampNanosecondType::parse("2020-01-01T01:11:01"), + TimestampNanosecondType::parse("2020-01-11T01:01:01"), ])), - // maxes are [1577926861000000000, 1577926871000000000, 1577927461000000000, 1578790861000000000,] - expected_max: Arc::new(Int64Array::from(vec![ - 1577926861000000000, - 1577926871000000000, - 1577927461000000000, - 1578790861000000000, + // maxes are [2020-01-02T01:01:01, 2020-01-02T01:01:11, 2020-01-02T01:11:01, 2020-01-12T01:01:01] + expected_max: Arc::new(TimestampNanosecondArray::from(vec![ + TimestampNanosecondType::parse("2020-01-02T01:01:01"), + TimestampNanosecondType::parse("2020-01-02T01:01:11"), + TimestampNanosecondType::parse("2020-01-02T01:11:01"), + TimestampNanosecondType::parse("2020-01-12T01:01:01"), ])), // nulls are [1, 1, 1, 1] expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), @@ -495,21 +505,52 @@ async fn test_timestamp() { } .run(); - // micros + Test { + reader: reader.build().await, + // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:01:11+11:00, 2020-01-01T12:11:01+11:00, 2020-01-10T12:01:01+11:00] + expected_min: Arc::new( + TimestampNanosecondArray::from(vec![ + TimestampNanosecondType::parse("2020-01-01T01:01:01"), + TimestampNanosecondType::parse("2020-01-01T01:01:11"), + TimestampNanosecondType::parse("2020-01-01T01:11:01"), + TimestampNanosecondType::parse("2020-01-11T01:01:01"), + ]) + .with_timezone(tz), + ), + // maxes are [2020-01-02T12:01:01+11:00, 2020-01-02T12:01:11+11:00, 2020-01-02T12:11:01+11:00, 2020-01-12T12:01:01+11:00] + expected_max: Arc::new( + TimestampNanosecondArray::from(vec![ + TimestampNanosecondType::parse("2020-01-02T01:01:01"), + TimestampNanosecondType::parse("2020-01-02T01:01:11"), + TimestampNanosecondType::parse("2020-01-02T01:11:01"), + TimestampNanosecondType::parse("2020-01-12T01:01:01"), + ]) + .with_timezone(tz), + ), + // nulls are [1, 1, 1, 1] + expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), + // row counts are [5, 5, 5, 5] + expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), + column_name: "nanos_timezoned", + } + .run(); + // micros Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![ - 1577840461000000, - 1577840471000000, - 1577841061000000, - 1578704461000000, + // mins are [2020-01-01T01:01:01, 2020-01-01T01:01:11, 2020-01-01T01:11:01, 2020-01-10T01:01:01] + expected_min: Arc::new(TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2020-01-01T01:01:01"), + TimestampMicrosecondType::parse("2020-01-01T01:01:11"), + TimestampMicrosecondType::parse("2020-01-01T01:11:01"), + TimestampMicrosecondType::parse("2020-01-11T01:01:01"), ])), - expected_max: Arc::new(Int64Array::from(vec![ - 1577926861000000, - 1577926871000000, - 1577927461000000, - 1578790861000000, + // maxes are [2020-01-02T01:01:01, 2020-01-02T01:01:11, 2020-01-02T01:11:01, 2020-01-12T01:01:01] + expected_max: Arc::new(TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2020-01-02T01:01:01"), + TimestampMicrosecondType::parse("2020-01-02T01:01:11"), + TimestampMicrosecondType::parse("2020-01-02T01:11:01"), + TimestampMicrosecondType::parse("2020-01-12T01:01:01"), ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), @@ -517,20 +558,52 @@ async fn test_timestamp() { } .run(); + Test { + reader: reader.build().await, + // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:01:11+11:00, 2020-01-01T12:11:01+11:00, 2020-01-10T12:01:01+11:00] + expected_min: Arc::new( + TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2020-01-01T01:01:01"), + TimestampMicrosecondType::parse("2020-01-01T01:01:11"), + TimestampMicrosecondType::parse("2020-01-01T01:11:01"), + TimestampMicrosecondType::parse("2020-01-11T01:01:01"), + ]) + .with_timezone(tz), + ), + // maxes are [2020-01-02T12:01:01+11:00, 2020-01-02T12:01:11+11:00, 2020-01-02T12:11:01+11:00, 2020-01-12T12:01:01+11:00] + expected_max: Arc::new( + TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2020-01-02T01:01:01"), + TimestampMicrosecondType::parse("2020-01-02T01:01:11"), + TimestampMicrosecondType::parse("2020-01-02T01:11:01"), + TimestampMicrosecondType::parse("2020-01-12T01:01:01"), + ]) + .with_timezone(tz), + ), + // nulls are [1, 1, 1, 1] + expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), + // row counts are [5, 5, 5, 5] + expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), + column_name: "micros_timezoned", + } + .run(); + // millis Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![ - 1577840461000, - 1577840471000, - 1577841061000, - 1578704461000, + // mins are [2020-01-01T01:01:01, 2020-01-01T01:01:11, 2020-01-01T01:11:01, 2020-01-10T01:01:01] + expected_min: Arc::new(TimestampMillisecondArray::from(vec![ + TimestampMillisecondType::parse("2020-01-01T01:01:01"), + TimestampMillisecondType::parse("2020-01-01T01:01:11"), + TimestampMillisecondType::parse("2020-01-01T01:11:01"), + TimestampMillisecondType::parse("2020-01-11T01:01:01"), ])), - expected_max: Arc::new(Int64Array::from(vec![ - 1577926861000, - 1577926871000, - 1577927461000, - 1578790861000, + // maxes are [2020-01-02T01:01:01, 2020-01-02T01:01:11, 2020-01-02T01:11:01, 2020-01-12T01:01:01] + expected_max: Arc::new(TimestampMillisecondArray::from(vec![ + TimestampMillisecondType::parse("2020-01-02T01:01:01"), + TimestampMillisecondType::parse("2020-01-02T01:01:11"), + TimestampMillisecondType::parse("2020-01-02T01:11:01"), + TimestampMillisecondType::parse("2020-01-12T01:01:01"), ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), @@ -538,30 +611,102 @@ async fn test_timestamp() { } .run(); + Test { + reader: reader.build().await, + // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:01:11+11:00, 2020-01-01T12:11:01+11:00, 2020-01-10T12:01:01+11:00] + expected_min: Arc::new( + TimestampMillisecondArray::from(vec![ + TimestampMillisecondType::parse("2020-01-01T01:01:01"), + TimestampMillisecondType::parse("2020-01-01T01:01:11"), + TimestampMillisecondType::parse("2020-01-01T01:11:01"), + TimestampMillisecondType::parse("2020-01-11T01:01:01"), + ]) + .with_timezone(tz), + ), + // maxes are [2020-01-02T12:01:01+11:00, 2020-01-02T12:01:11+11:00, 2020-01-02T12:11:01+11:00, 2020-01-12T12:01:01+11:00] + expected_max: Arc::new( + TimestampMillisecondArray::from(vec![ + TimestampMillisecondType::parse("2020-01-02T01:01:01"), + TimestampMillisecondType::parse("2020-01-02T01:01:11"), + TimestampMillisecondType::parse("2020-01-02T01:11:01"), + TimestampMillisecondType::parse("2020-01-12T01:01:01"), + ]) + .with_timezone(tz), + ), + // nulls are [1, 1, 1, 1] + expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), + // row counts are [5, 5, 5, 5] + expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), + column_name: "millis_timezoned", + } + .run(); + // seconds Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![ - 1577840461, 1577840471, 1577841061, 1578704461, + // mins are [2020-01-01T01:01:01, 2020-01-01T01:01:11, 2020-01-01T01:11:01, 2020-01-10T01:01:01] + expected_min: Arc::new(TimestampSecondArray::from(vec![ + TimestampSecondType::parse("2020-01-01T01:01:01"), + TimestampSecondType::parse("2020-01-01T01:01:11"), + TimestampSecondType::parse("2020-01-01T01:11:01"), + TimestampSecondType::parse("2020-01-11T01:01:01"), ])), - expected_max: Arc::new(Int64Array::from(vec![ - 1577926861, 1577926871, 1577927461, 1578790861, + // maxes are [2020-01-02T01:01:01, 2020-01-02T01:01:11, 2020-01-02T01:11:01, 2020-01-12T01:01:01] + expected_max: Arc::new(TimestampSecondArray::from(vec![ + TimestampSecondType::parse("2020-01-02T01:01:01"), + TimestampSecondType::parse("2020-01-02T01:01:11"), + TimestampSecondType::parse("2020-01-02T01:11:01"), + TimestampSecondType::parse("2020-01-12T01:01:01"), ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), column_name: "seconds", } .run(); + + Test { + reader: reader.build().await, + // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:01:11+11:00, 2020-01-01T12:11:01+11:00, 2020-01-10T12:01:01+11:00] + expected_min: Arc::new( + TimestampSecondArray::from(vec![ + TimestampSecondType::parse("2020-01-01T01:01:01"), + TimestampSecondType::parse("2020-01-01T01:01:11"), + TimestampSecondType::parse("2020-01-01T01:11:01"), + TimestampSecondType::parse("2020-01-11T01:01:01"), + ]) + .with_timezone(tz), + ), + // maxes are [2020-01-02T12:01:01+11:00, 2020-01-02T12:01:11+11:00, 2020-01-02T12:11:01+11:00, 2020-01-12T12:01:01+11:00] + expected_max: Arc::new( + TimestampSecondArray::from(vec![ + TimestampSecondType::parse("2020-01-02T01:01:01"), + TimestampSecondType::parse("2020-01-02T01:01:11"), + TimestampSecondType::parse("2020-01-02T01:11:01"), + TimestampSecondType::parse("2020-01-12T01:01:01"), + ]) + .with_timezone(tz), + ), + // nulls are [1, 1, 1, 1] + expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), + // row counts are [5, 5, 5, 5] + expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]), + column_name: "seconds_timezoned", + } + .run(); } // timestamp with different row group sizes #[tokio::test] async fn test_timestamp_diff_rg_sizes() { - // This creates a parquet files of 5 columns named "nanos", "micros", "millis", "seconds", "names" + // This creates a parquet files of 9 columns named "nanos", "nanos_timezoned", "micros", "micros_timezoned", "millis", "millis_timezoned", "seconds", "seconds_timezoned", "names" // "nanos" --> TimestampNanosecondArray + // "nanos_timezoned" --> TimestampNanosecondArray // "micros" --> TimestampMicrosecondArray + // "micros_timezoned" --> TimestampMicrosecondArray // "millis" --> TimestampMillisecondArray + // "millis_timezoned" --> TimestampMillisecondArray // "seconds" --> TimestampSecondArray + // "seconds_timezoned" --> TimestampSecondArray // "names" --> StringArray // // The file is created by 4 record batches (each has a null row), each has 5 rows but then will be split into 3 row groups with size 8, 8, 4 @@ -570,19 +715,21 @@ async fn test_timestamp_diff_rg_sizes() { row_per_group: 8, // note that the row group size is 8 }; + let tz = "Pacific/Efate"; + Test { reader: reader.build().await, - // mins are [1577840461000000000, 1577841061000000000, 1578704521000000000] - expected_min: Arc::new(Int64Array::from(vec![ - 1577840461000000000, - 1577841061000000000, - 1578704521000000000, + // mins are [2020-01-01T01:01:01, 2020-01-01T01:11:01, 2020-01-11T01:02:01] + expected_min: Arc::new(TimestampNanosecondArray::from(vec![ + TimestampNanosecondType::parse("2020-01-01T01:01:01"), + TimestampNanosecondType::parse("2020-01-01T01:11:01"), + TimestampNanosecondType::parse("2020-01-11T01:02:01"), ])), - // maxes are [1577926861000000000, 1578704461000000000, 157879086100000000] - expected_max: Arc::new(Int64Array::from(vec![ - 1577926861000000000, - 1578704461000000000, - 1578790861000000000, + // maxes are [2020-01-02T01:01:01, 2020-01-11T01:01:01, 2020-01-12T01:01:01] + expected_max: Arc::new(TimestampNanosecondArray::from(vec![ + TimestampNanosecondType::parse("2020-01-02T01:01:01"), + TimestampNanosecondType::parse("2020-01-11T01:01:01"), + TimestampNanosecondType::parse("2020-01-12T01:01:01"), ])), // nulls are [1, 2, 1] expected_null_counts: UInt64Array::from(vec![1, 2, 1]), @@ -592,18 +739,48 @@ async fn test_timestamp_diff_rg_sizes() { } .run(); + Test { + reader: reader.build().await, + // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:11:01+11:00, 2020-01-11T12:02:01+11:00] + expected_min: Arc::new( + TimestampNanosecondArray::from(vec![ + TimestampNanosecondType::parse("2020-01-01T01:01:01"), + TimestampNanosecondType::parse("2020-01-01T01:11:01"), + TimestampNanosecondType::parse("2020-01-11T01:02:01"), + ]) + .with_timezone(tz), + ), + // maxes are [2020-01-02T01:01:01+11:00, 2020-01-11T01:01:01+11:00, 2020-01-12T01:01:01+11:00] + expected_max: Arc::new( + TimestampNanosecondArray::from(vec![ + TimestampNanosecondType::parse("2020-01-02T01:01:01"), + TimestampNanosecondType::parse("2020-01-11T01:01:01"), + TimestampNanosecondType::parse("2020-01-12T01:01:01"), + ]) + .with_timezone(tz), + ), + // nulls are [1, 2, 1] + expected_null_counts: UInt64Array::from(vec![1, 2, 1]), + // row counts are [8, 8, 4] + expected_row_counts: UInt64Array::from(vec![8, 8, 4]), + column_name: "nanos_timezoned", + } + .run(); + // micros Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![ - 1577840461000000, - 1577841061000000, - 1578704521000000, + // mins are [2020-01-01T01:01:01, 2020-01-01T01:11:01, 2020-01-11T01:02:01] + expected_min: Arc::new(TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2020-01-01T01:01:01"), + TimestampMicrosecondType::parse("2020-01-01T01:11:01"), + TimestampMicrosecondType::parse("2020-01-11T01:02:01"), ])), - expected_max: Arc::new(Int64Array::from(vec![ - 1577926861000000, - 1578704461000000, - 1578790861000000, + // maxes are [2020-01-02T01:01:01, 2020-01-11T01:01:01, 2020-01-12T01:01:01] + expected_max: Arc::new(TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2020-01-02T01:01:01"), + TimestampMicrosecondType::parse("2020-01-11T01:01:01"), + TimestampMicrosecondType::parse("2020-01-12T01:01:01"), ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: UInt64Array::from(vec![8, 8, 4]), @@ -611,18 +788,48 @@ async fn test_timestamp_diff_rg_sizes() { } .run(); + Test { + reader: reader.build().await, + // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:11:01+11:00, 2020-01-11T12:02:01+11:00] + expected_min: Arc::new( + TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2020-01-01T01:01:01"), + TimestampMicrosecondType::parse("2020-01-01T01:11:01"), + TimestampMicrosecondType::parse("2020-01-11T01:02:01"), + ]) + .with_timezone(tz), + ), + // maxes are [2020-01-02T01:01:01+11:00, 2020-01-11T01:01:01+11:00, 2020-01-12T01:01:01+11:00] + expected_max: Arc::new( + TimestampMicrosecondArray::from(vec![ + TimestampMicrosecondType::parse("2020-01-02T01:01:01"), + TimestampMicrosecondType::parse("2020-01-11T01:01:01"), + TimestampMicrosecondType::parse("2020-01-12T01:01:01"), + ]) + .with_timezone(tz), + ), + // nulls are [1, 2, 1] + expected_null_counts: UInt64Array::from(vec![1, 2, 1]), + // row counts are [8, 8, 4] + expected_row_counts: UInt64Array::from(vec![8, 8, 4]), + column_name: "micros_timezoned", + } + .run(); + // millis Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![ - 1577840461000, - 1577841061000, - 1578704521000, + // mins are [2020-01-01T01:01:01, 2020-01-01T01:11:01, 2020-01-11T01:02:01] + expected_min: Arc::new(TimestampMillisecondArray::from(vec![ + TimestampMillisecondType::parse("2020-01-01T01:01:01"), + TimestampMillisecondType::parse("2020-01-01T01:11:01"), + TimestampMillisecondType::parse("2020-01-11T01:02:01"), ])), - expected_max: Arc::new(Int64Array::from(vec![ - 1577926861000, - 1578704461000, - 1578790861000, + // maxes are [2020-01-02T01:01:01, 2020-01-11T01:01:01, 2020-01-12T01:01:01] + expected_max: Arc::new(TimestampMillisecondArray::from(vec![ + TimestampMillisecondType::parse("2020-01-02T01:01:01"), + TimestampMillisecondType::parse("2020-01-11T01:01:01"), + TimestampMillisecondType::parse("2020-01-12T01:01:01"), ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: UInt64Array::from(vec![8, 8, 4]), @@ -630,20 +837,82 @@ async fn test_timestamp_diff_rg_sizes() { } .run(); + Test { + reader: reader.build().await, + // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:11:01+11:00, 2020-01-11T12:02:01+11:00] + expected_min: Arc::new( + TimestampMillisecondArray::from(vec![ + TimestampMillisecondType::parse("2020-01-01T01:01:01"), + TimestampMillisecondType::parse("2020-01-01T01:11:01"), + TimestampMillisecondType::parse("2020-01-11T01:02:01"), + ]) + .with_timezone(tz), + ), + // maxes are [2020-01-02T01:01:01+11:00, 2020-01-11T01:01:01+11:00, 2020-01-12T01:01:01+11:00] + expected_max: Arc::new( + TimestampMillisecondArray::from(vec![ + TimestampMillisecondType::parse("2020-01-02T01:01:01"), + TimestampMillisecondType::parse("2020-01-11T01:01:01"), + TimestampMillisecondType::parse("2020-01-12T01:01:01"), + ]) + .with_timezone(tz), + ), + // nulls are [1, 2, 1] + expected_null_counts: UInt64Array::from(vec![1, 2, 1]), + // row counts are [8, 8, 4] + expected_row_counts: UInt64Array::from(vec![8, 8, 4]), + column_name: "millis_timezoned", + } + .run(); + // seconds Test { reader: reader.build().await, - expected_min: Arc::new(Int64Array::from(vec![ - 1577840461, 1577841061, 1578704521, + // mins are [2020-01-01T01:01:01, 2020-01-01T01:11:01, 2020-01-11T01:02:01] + expected_min: Arc::new(TimestampSecondArray::from(vec![ + TimestampSecondType::parse("2020-01-01T01:01:01"), + TimestampSecondType::parse("2020-01-01T01:11:01"), + TimestampSecondType::parse("2020-01-11T01:02:01"), ])), - expected_max: Arc::new(Int64Array::from(vec![ - 1577926861, 1578704461, 1578790861, + // maxes are [2020-01-02T01:01:01, 2020-01-11T01:01:01, 2020-01-12T01:01:01] + expected_max: Arc::new(TimestampSecondArray::from(vec![ + TimestampSecondType::parse("2020-01-02T01:01:01"), + TimestampSecondType::parse("2020-01-11T01:01:01"), + TimestampSecondType::parse("2020-01-12T01:01:01"), ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: UInt64Array::from(vec![8, 8, 4]), column_name: "seconds", } .run(); + + Test { + reader: reader.build().await, + // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:11:01+11:00, 2020-01-11T12:02:01+11:00] + expected_min: Arc::new( + TimestampSecondArray::from(vec![ + TimestampSecondType::parse("2020-01-01T01:01:01"), + TimestampSecondType::parse("2020-01-01T01:11:01"), + TimestampSecondType::parse("2020-01-11T01:02:01"), + ]) + .with_timezone(tz), + ), + // maxes are [2020-01-02T01:01:01+11:00, 2020-01-11T01:01:01+11:00, 2020-01-12T01:01:01+11:00] + expected_max: Arc::new( + TimestampSecondArray::from(vec![ + TimestampSecondType::parse("2020-01-02T01:01:01"), + TimestampSecondType::parse("2020-01-11T01:01:01"), + TimestampSecondType::parse("2020-01-12T01:01:01"), + ]) + .with_timezone(tz), + ), + // nulls are [1, 2, 1] + expected_null_counts: UInt64Array::from(vec![1, 2, 1]), + // row counts are [8, 8, 4] + expected_row_counts: UInt64Array::from(vec![8, 8, 4]), + column_name: "seconds_timezoned", + } + .run(); } // date with different row group sizes diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 94ae9ff601ecf..a2afe58ddc4ad 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -332,9 +332,13 @@ fn make_boolean_batch(v: Vec>) -> RecordBatch { /// /// Columns are named: /// "nanos" --> TimestampNanosecondArray +/// "nanos_timezoned" --> TimestampNanosecondArray with timezone /// "micros" --> TimestampMicrosecondArray +/// "micros_timezoned" --> TimestampMicrosecondArray with timezone /// "millis" --> TimestampMillisecondArray +/// "millis_timezoned" --> TimestampMillisecondArray with timezone /// "seconds" --> TimestampSecondArray +/// "seconds_timezoned" --> TimestampSecondArray with timezone /// "names" --> StringArray fn make_timestamp_batch(offset: Duration) -> RecordBatch { let ts_strings = vec![ @@ -345,6 +349,8 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch { Some("2020-01-02T01:01:01.0000000000001"), ]; + let tz_string = "Pacific/Efate"; + let offset_nanos = offset.num_nanoseconds().expect("non overflow nanos"); let ts_nanos = ts_strings @@ -382,19 +388,47 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch { .map(|(i, _)| format!("Row {i} + {offset}")) .collect::>(); - let arr_nanos = TimestampNanosecondArray::from(ts_nanos); - let arr_micros = TimestampMicrosecondArray::from(ts_micros); - let arr_millis = TimestampMillisecondArray::from(ts_millis); - let arr_seconds = TimestampSecondArray::from(ts_seconds); + let arr_nanos = TimestampNanosecondArray::from(ts_nanos.clone()); + let arr_nanos_timezoned = + TimestampNanosecondArray::from(ts_nanos).with_timezone(tz_string); + let arr_micros = TimestampMicrosecondArray::from(ts_micros.clone()); + let arr_micros_timezoned = + TimestampMicrosecondArray::from(ts_micros).with_timezone(tz_string); + let arr_millis = TimestampMillisecondArray::from(ts_millis.clone()); + let arr_millis_timezoned = + TimestampMillisecondArray::from(ts_millis).with_timezone(tz_string); + let arr_seconds = TimestampSecondArray::from(ts_seconds.clone()); + let arr_seconds_timezoned = + TimestampSecondArray::from(ts_seconds).with_timezone(tz_string); let names = names.iter().map(|s| s.as_str()).collect::>(); let arr_names = StringArray::from(names); let schema = Schema::new(vec![ Field::new("nanos", arr_nanos.data_type().clone(), true), + Field::new( + "nanos_timezoned", + arr_nanos_timezoned.data_type().clone(), + true, + ), Field::new("micros", arr_micros.data_type().clone(), true), + Field::new( + "micros_timezoned", + arr_micros_timezoned.data_type().clone(), + true, + ), Field::new("millis", arr_millis.data_type().clone(), true), + Field::new( + "millis_timezoned", + arr_millis_timezoned.data_type().clone(), + true, + ), Field::new("seconds", arr_seconds.data_type().clone(), true), + Field::new( + "seconds_timezoned", + arr_seconds_timezoned.data_type().clone(), + true, + ), Field::new("name", arr_names.data_type().clone(), true), ]); let schema = Arc::new(schema); @@ -403,9 +437,13 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch { schema, vec![ Arc::new(arr_nanos), - Arc::new(arr_micros), - Arc::new(arr_millis), - Arc::new(arr_seconds), + Arc::new(arr_nanos_timezoned), + Arc::new(arr_micros.clone()), + Arc::new(arr_micros_timezoned), + Arc::new(arr_millis.clone()), + Arc::new(arr_millis_timezoned), + Arc::new(arr_seconds.clone()), + Arc::new(arr_seconds_timezoned), Arc::new(arr_names), ], ) From acd00840c83335686c1f7baeabba75e777385726 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 1 Jun 2024 16:11:36 -0400 Subject: [PATCH 13/19] Add exhaustive match --- .../physical_plan/parquet/statistics.rs | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 8ad2a5de0fcad..bc14ef9fbe63a 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -407,7 +407,27 @@ pub(crate) fn min_statistics<'a, I: Iterator { min_statistics(value_type, iterator) } - _ => { + + DataType::Map(_,_) | + DataType::Float16 | + DataType::Time32(_) | + DataType::Time64(_) | + DataType::Duration(_) | + DataType::Interval(_) | + DataType::Null | + DataType::LargeBinary | + DataType::BinaryView | + DataType::LargeUtf8 | + DataType::Utf8View | + DataType::List(_) | + DataType::ListView(_) | + DataType::FixedSizeList(_, _) | + DataType::LargeList(_) | + DataType::LargeListView(_) | + DataType::Struct(_) | + DataType::Union(_, _) | + DataType::Decimal256(_, _) | + DataType::RunEndEncoded(_, _) => { let len = iterator.count(); // don't know how to extract statistics, so return a null array Ok(new_null_array(data_type, len)) @@ -562,7 +582,27 @@ pub(crate) fn max_statistics<'a, I: Iterator { max_statistics(value_type, iterator) } - _ => { + + DataType::Map(_,_) | + DataType::Float16 | + DataType::Time32(_) | + DataType::Time64(_) | + DataType::Duration(_) | + DataType::Interval(_) | + DataType::Null | + DataType::LargeBinary | + DataType::BinaryView | + DataType::LargeUtf8 | + DataType::Utf8View | + DataType::List(_) | + DataType::ListView(_) | + DataType::FixedSizeList(_, _) | + DataType::LargeList(_) | + DataType::LargeListView(_) | + DataType::Struct(_) | + DataType::Union(_, _) | + DataType::Decimal256(_, _) | + DataType::RunEndEncoded(_, _) => { let len = iterator.count(); // don't know how to extract statistics, so return a null array Ok(new_null_array(data_type, len)) From c6aa31cb57951dabd872fd2f90e567bb161b68b6 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 03:15:08 +0000 Subject: [PATCH 14/19] Update latest datatypes --- .../physical_plan/parquet/statistics.rs | 182 +++++++++++++++--- 1 file changed, 160 insertions(+), 22 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index cf05a82569500..7db47bb374b30 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -23,8 +23,10 @@ use arrow::datatypes::i256; use arrow::{array::ArrayRef, datatypes::DataType}; use arrow_array::{ new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, - FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, - Int8Array, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, + Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, + StringArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; @@ -195,8 +197,10 @@ make_stats_iterator!( /// * `$iterator_type` is the name of the iterator type (e.g. `MinBooleanStatsIterator`) /// * `$func` is the function to call to get the value (e.g. `min` or `max`) /// * `$bytes_func` is the function to call to get the value as bytes (e.g. `min_bytes` or `max_bytes`) +/// * `$stat_value_type` is the type of the statistics value (e.g. `i128`) +/// * `convert_func` is the function to convert the bytes to stats value (e.g. `from_bytes_to_i128`) macro_rules! make_decimal_stats_iterator { - ($iterator_type:ident, $func:ident, $bytes_func:ident) => { + ($iterator_type:ident, $func:ident, $bytes_func:ident, $stat_value_type:ident, $convert_func: ident) => { struct $iterator_type<'a, I> where I: Iterator>, @@ -217,7 +221,7 @@ macro_rules! make_decimal_stats_iterator { where I: Iterator>, { - type Item = Option; + type Item = Option<$stat_value_type>; fn next(&mut self) -> Option { let next = self.iter.next(); @@ -227,13 +231,17 @@ macro_rules! make_decimal_stats_iterator { return None; } match stats { - ParquetStatistics::Int32(s) => Some(*s.$func() as i128), - ParquetStatistics::Int64(s) => Some(*s.$func() as i128), + ParquetStatistics::Int32(s) => { + Some($stat_value_type::from(*s.$func())) + } + ParquetStatistics::Int64(s) => { + Some($stat_value_type::from(*s.$func())) + } ParquetStatistics::ByteArray(s) => { - Some(from_bytes_to_i128(s.$bytes_func())) + Some($convert_func(s.$bytes_func())) } ParquetStatistics::FixedLenByteArray(s) => { - Some(from_bytes_to_i128(s.$bytes_func())) + Some($convert_func(s.$bytes_func())) } _ => None, } @@ -248,8 +256,34 @@ macro_rules! make_decimal_stats_iterator { }; } -make_decimal_stats_iterator!(MinDecimal128StatsIterator, min, min_bytes); -make_decimal_stats_iterator!(MaxDecimal128StatsIterator, max, max_bytes); +make_decimal_stats_iterator!( + MinDecimal128StatsIterator, + min, + min_bytes, + i128, + from_bytes_to_i128 +); +make_decimal_stats_iterator!( + MaxDecimal128StatsIterator, + max, + max_bytes, + i128, + from_bytes_to_i128 +); +make_decimal_stats_iterator!( + MinDecimal256StatsIterator, + min, + min_bytes, + i256, + from_bytes_to_i256 +); +make_decimal_stats_iterator!( + MaxDecimal256StatsIterator, + max, + max_bytes, + i256, + from_bytes_to_i256 +); /// Lookups up the parquet column by name /// @@ -341,6 +375,11 @@ pub(crate) fn min_statistics<'a, I: Iterator Ok(Arc::new(UInt64Array::from_iter( MinInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)), ))), + DataType::Float16 => Ok(Arc::new(Float16Array::from_iter( + MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| x.and_then(|x| { + from_bytes_to_f16(x) + })), + ))), DataType::Float32 => Ok(Arc::new(Float32Array::from_iter( MinFloatStatsIterator::new(iterator).map(|x| x.copied()), ))), @@ -384,9 +423,42 @@ pub(crate) fn min_statistics<'a, I: Iterator { + Ok(match unit { + TimeUnit::Second => Arc::new(Time32SecondArray::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| x.copied()), + )), + TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter( + MinInt32StatsIterator::new(iterator).map(|x| x.copied()), + )), + _ => { + let len = iterator.count(); + // don't know how to extract statistics, so return a null array + new_null_array(data_type, len) + } + }) + }, + DataType::Time64(unit) => { + Ok(match unit { + TimeUnit::Microsecond => Arc::new(Time64MicrosecondArray::from_iter( + MinInt64StatsIterator::new(iterator).map(|x| x.copied()), + )), + TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter( + MinInt64StatsIterator::new(iterator).map(|x| x.copied()), + )), + _ => { + let len = iterator.count(); + // don't know how to extract statistics, so return a null array + new_null_array(data_type, len) + } + }) + }, DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())), ))), + DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter( + MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x|x.to_vec())), + ))), DataType::Utf8 => Ok(Arc::new(StringArray::from_iter( MinByteArrayStatsIterator::new(iterator).map(|x| { x.and_then(|x| { @@ -398,6 +470,19 @@ pub(crate) fn min_statistics<'a, I: Iterator { + Ok(Arc::new(LargeStringArray::from_iter( + MinByteArrayStatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); + if res.is_none() { + log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it."); + } + res + }) + }), + ))) + } DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from( MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| { x.and_then(|x| { @@ -420,20 +505,21 @@ pub(crate) fn min_statistics<'a, I: Iterator { + let arr = Decimal256Array::from_iter( + MinDecimal256StatsIterator::new(iterator) + ).with_precision_and_scale(*precision, *scale)?; + Ok(Arc::new(arr)) + }, DataType::Dictionary(_, value_type) => { min_statistics(value_type, iterator) } DataType::Map(_,_) | - DataType::Float16 | - DataType::Time32(_) | - DataType::Time64(_) | DataType::Duration(_) | DataType::Interval(_) | DataType::Null | - DataType::LargeBinary | DataType::BinaryView | - DataType::LargeUtf8 | DataType::Utf8View | DataType::List(_) | DataType::ListView(_) | @@ -442,7 +528,6 @@ pub(crate) fn min_statistics<'a, I: Iterator { let len = iterator.count(); // don't know how to extract statistics, so return a null array @@ -516,6 +601,11 @@ pub(crate) fn max_statistics<'a, I: Iterator Ok(Arc::new(UInt64Array::from_iter( MaxInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)), ))), + DataType::Float16 => Ok(Arc::new(Float16Array::from_iter( + MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| x.and_then(|x| { + from_bytes_to_f16(x) + })), + ))), DataType::Float32 => Ok(Arc::new(Float32Array::from_iter( MaxFloatStatsIterator::new(iterator).map(|x| x.copied()), ))), @@ -559,9 +649,42 @@ pub(crate) fn max_statistics<'a, I: Iterator { + Ok(match unit { + TimeUnit::Second => Arc::new(Time32SecondArray::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| x.copied()), + )), + TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter( + MaxInt32StatsIterator::new(iterator).map(|x| x.copied()), + )), + _ => { + let len = iterator.count(); + // don't know how to extract statistics, so return a null array + new_null_array(data_type, len) + } + }) + }, + DataType::Time64(unit) => { + Ok(match unit { + TimeUnit::Microsecond => Arc::new(Time64MicrosecondArray::from_iter( + MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), + )), + TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter( + MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), + )), + _ => { + let len = iterator.count(); + // don't know how to extract statistics, so return a null array + new_null_array(data_type, len) + } + }) + }, DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())), ))), + DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter( + MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x|x.to_vec())), + ))), DataType::Utf8 => Ok(Arc::new(StringArray::from_iter( MaxByteArrayStatsIterator::new(iterator).map(|x| { x.and_then(|x| { @@ -573,6 +696,19 @@ pub(crate) fn max_statistics<'a, I: Iterator { + Ok(Arc::new(LargeStringArray::from_iter( + MaxByteArrayStatsIterator::new(iterator).map(|x| { + x.and_then(|x| { + let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); + if res.is_none() { + log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it."); + } + res + }) + }), + ))) + } DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from( MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| { x.and_then(|x| { @@ -595,20 +731,21 @@ pub(crate) fn max_statistics<'a, I: Iterator { + let arr = Decimal256Array::from_iter( + MaxDecimal256StatsIterator::new(iterator) + ).with_precision_and_scale(*precision, *scale)?; + Ok(Arc::new(arr)) + }, DataType::Dictionary(_, value_type) => { max_statistics(value_type, iterator) } DataType::Map(_,_) | - DataType::Float16 | - DataType::Time32(_) | - DataType::Time64(_) | DataType::Duration(_) | DataType::Interval(_) | DataType::Null | - DataType::LargeBinary | DataType::BinaryView | - DataType::LargeUtf8 | DataType::Utf8View | DataType::List(_) | DataType::ListView(_) | @@ -617,7 +754,6 @@ pub(crate) fn max_statistics<'a, I: Iterator { let len = iterator.count(); // don't know how to extract statistics, so return a null array @@ -926,6 +1062,8 @@ mod test { input: timestamp_seconds_array( [ Some(1), + None, + Some(3), // row group 2 Some(9), Some(5), From 4313fffc8de22dc6d3600d009fda3eca61b86e88 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 03:17:02 +0000 Subject: [PATCH 15/19] fix bad comment --- .../core/src/datasource/physical_plan/parquet/statistics.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 7db47bb374b30..61135a5a341ea 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -1061,6 +1061,7 @@ mod test { Test { input: timestamp_seconds_array( [ + // row group 1 Some(1), None, Some(3), From 4662432fb2c5f6e20df01c5ad4d6da51c10474c9 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 03:50:36 +0000 Subject: [PATCH 16/19] Remove duplications using paste --- datafusion/core/Cargo.toml | 1 + .../physical_plan/parquet/statistics.rs | 672 ++++++------------ 2 files changed, 235 insertions(+), 438 deletions(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 9f1f7484357b2..3946758ff9370 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -121,6 +121,7 @@ num_cpus = { workspace = true } object_store = { workspace = true } parking_lot = { workspace = true } parquet = { workspace = true, optional = true, default-features = true } +paste = "1.0.15" pin-project-lite = "^0.2.7" rand = { workspace = true } sqlparser = { workspace = true } diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 61135a5a341ea..5a1b03c75cd65 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -36,6 +36,7 @@ use half::f16; use parquet::file::metadata::ParquetMetaData; use parquet::file::statistics::Statistics as ParquetStatistics; use parquet::schema::types::SchemaDescriptor; +use paste::paste; use std::sync::Arc; // Convert the bytes array to i128. @@ -285,6 +286,237 @@ make_decimal_stats_iterator!( from_bytes_to_i256 ); +/// Special macro to combine the statistics iterators for min and max using the [`paste`] macro. +/// This is used to avoid repeating the same code for min and max statistics extractions +/// +/// Parameters: +/// stat_type_prefix: The prefix of the statistics iterator type (e.g. `Min` or `Max`) +/// data_type: The data type of the statistics (e.g. `DataType::Int32`) +/// iterator: The iterator of [`ParquetStatistics`] to extract the statistics from. +macro_rules! get_statistics { + ($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => { + paste! { + match $data_type { + DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter( + [<$stat_type_prefix BooleanStatsIterator>]::new($iterator).map(|x| x.copied()), + ))), + DataType::Int8 => Ok(Arc::new(Int8Array::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = i8::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::Int16 => Ok(Arc::new(Int16Array::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = i16::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::Int32 => Ok(Arc::new(Int32Array::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()), + ))), + DataType::Int64 => Ok(Arc::new(Int64Array::from_iter( + [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()), + ))), + DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = u8::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| { + x.and_then(|x| { + if let Ok(v) = u16::try_from(*x) { + Some(v) + } else { + None + } + }) + }), + ))), + DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.map(|x| *x as u32)), + ))), + DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter( + [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.map(|x| *x as u64)), + ))), + DataType::Float16 => Ok(Arc::new(Float16Array::from_iter( + [<$stat_type_prefix FixedLenByteArrayStatsIterator>]::new($iterator).map(|x| x.and_then(|x| { + from_bytes_to_f16(x) + })), + ))), + DataType::Float32 => Ok(Arc::new(Float32Array::from_iter( + [<$stat_type_prefix FloatStatsIterator>]::new($iterator).map(|x| x.copied()), + ))), + DataType::Float64 => Ok(Arc::new(Float64Array::from_iter( + [<$stat_type_prefix DoubleStatsIterator>]::new($iterator).map(|x| x.copied()), + ))), + DataType::Date32 => Ok(Arc::new(Date32Array::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()), + ))), + DataType::Date64 => Ok(Arc::new(Date64Array::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator) + .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)), + ))), + DataType::Timestamp(unit, timezone) =>{ + let iter = [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()); + + Ok(match unit { + TimeUnit::Second => { + Arc::new(match timezone { + Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampSecondArray::from_iter(iter), + }) + } + TimeUnit::Millisecond => { + Arc::new(match timezone { + Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampMillisecondArray::from_iter(iter), + }) + } + TimeUnit::Microsecond => { + Arc::new(match timezone { + Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampMicrosecondArray::from_iter(iter), + }) + } + TimeUnit::Nanosecond => { + Arc::new(match timezone { + Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()), + None => TimestampNanosecondArray::from_iter(iter), + }) + } + }) + }, + DataType::Time32(unit) => { + Ok(match unit { + TimeUnit::Second => Arc::new(Time32SecondArray::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()), + )), + TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter( + [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()), + )), + _ => { + let len = $iterator.count(); + // don't know how to extract statistics, so return a null array + new_null_array($data_type, len) + } + }) + }, + DataType::Time64(unit) => { + Ok(match unit { + TimeUnit::Microsecond => Arc::new(Time64MicrosecondArray::from_iter( + [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()), + )), + TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter( + [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()), + )), + _ => { + let len = $iterator.count(); + // don't know how to extract statistics, so return a null array + new_null_array($data_type, len) + } + }) + }, + DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( + [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| x.map(|x| x.to_vec())), + ))), + DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter( + [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| x.map(|x|x.to_vec())), + ))), + DataType::Utf8 => Ok(Arc::new(StringArray::from_iter( + [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| { + x.and_then(|x| { + let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); + if res.is_none() { + log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); + } + res + }) + }), + ))), + DataType::LargeUtf8 => { + Ok(Arc::new(LargeStringArray::from_iter( + [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| { + x.and_then(|x| { + let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); + if res.is_none() { + log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it."); + } + res + }) + }), + ))) + } + DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from( + [<$stat_type_prefix FixedLenByteArrayStatsIterator>]::new($iterator).map(|x| { + x.and_then(|x| { + if x.len().try_into() == Ok(*size) { + Some(x) + } else { + log::debug!( + "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", + size, + x.len(), + ); + None + } + }) + }).collect::>(), + ))), + DataType::Decimal128(precision, scale) => { + let arr = Decimal128Array::from_iter( + [<$stat_type_prefix Decimal128StatsIterator>]::new($iterator) + ).with_precision_and_scale(*precision, *scale)?; + Ok(Arc::new(arr)) + }, + DataType::Decimal256(precision, scale) => { + let arr = Decimal256Array::from_iter( + [<$stat_type_prefix Decimal256StatsIterator>]::new($iterator) + ).with_precision_and_scale(*precision, *scale)?; + Ok(Arc::new(arr)) + }, + DataType::Dictionary(_, value_type) => { + [<$stat_type_prefix:lower _ statistics>](value_type, $iterator) + } + + DataType::Map(_,_) | + DataType::Duration(_) | + DataType::Interval(_) | + DataType::Null | + DataType::BinaryView | + DataType::Utf8View | + DataType::List(_) | + DataType::ListView(_) | + DataType::FixedSizeList(_, _) | + DataType::LargeList(_) | + DataType::LargeListView(_) | + DataType::Struct(_) | + DataType::Union(_, _) | + DataType::RunEndEncoded(_, _) => { + let len = $iterator.count(); + // don't know how to extract statistics, so return a null array + Ok(new_null_array($data_type, len)) + } + }}} +} + /// Lookups up the parquet column by name /// /// Returns the parquet column index and the corresponding arrow field @@ -315,225 +547,7 @@ pub(crate) fn min_statistics<'a, I: Iterator Result { - match data_type { - DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter( - MinBooleanStatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Int8 => Ok(Arc::new(Int8Array::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = i8::try_from(*x) { - Some(v) - } else { - None - } - }) - }), - ))), - DataType::Int16 => Ok(Arc::new(Int16Array::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = i16::try_from(*x) { - Some(v) - } else { - None - } - }) - }), - ))), - DataType::Int32 => Ok(Arc::new(Int32Array::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Int64 => Ok(Arc::new(Int64Array::from_iter( - MinInt64StatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = u8::try_from(*x) { - Some(v) - } else { - None - } - }) - }), - ))), - DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = u16::try_from(*x) { - Some(v) - } else { - None - } - }) - }), - ))), - DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| x.map(|x| *x as u32)), - ))), - DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter( - MinInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)), - ))), - DataType::Float16 => Ok(Arc::new(Float16Array::from_iter( - MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| x.and_then(|x| { - from_bytes_to_f16(x) - })), - ))), - DataType::Float32 => Ok(Arc::new(Float32Array::from_iter( - MinFloatStatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Float64 => Ok(Arc::new(Float64Array::from_iter( - MinDoubleStatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Date32 => Ok(Arc::new(Date32Array::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Date64 => Ok(Arc::new(Date64Array::from_iter( - MinInt32StatsIterator::new(iterator) - .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)), - ))), - DataType::Timestamp(unit, timezone) =>{ - let iter = MinInt64StatsIterator::new(iterator).map(|x| x.copied()); - - Ok(match unit { - TimeUnit::Second => { - Arc::new(match timezone { - Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()), - None => TimestampSecondArray::from_iter(iter), - }) - } - TimeUnit::Millisecond => { - Arc::new(match timezone { - Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()), - None => TimestampMillisecondArray::from_iter(iter), - }) - } - TimeUnit::Microsecond => { - Arc::new(match timezone { - Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()), - None => TimestampMicrosecondArray::from_iter(iter), - }) - } - TimeUnit::Nanosecond => { - Arc::new(match timezone { - Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()), - None => TimestampNanosecondArray::from_iter(iter), - }) - } - }) - }, - DataType::Time32(unit) => { - Ok(match unit { - TimeUnit::Second => Arc::new(Time32SecondArray::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| x.copied()), - )), - TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter( - MinInt32StatsIterator::new(iterator).map(|x| x.copied()), - )), - _ => { - let len = iterator.count(); - // don't know how to extract statistics, so return a null array - new_null_array(data_type, len) - } - }) - }, - DataType::Time64(unit) => { - Ok(match unit { - TimeUnit::Microsecond => Arc::new(Time64MicrosecondArray::from_iter( - MinInt64StatsIterator::new(iterator).map(|x| x.copied()), - )), - TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter( - MinInt64StatsIterator::new(iterator).map(|x| x.copied()), - )), - _ => { - let len = iterator.count(); - // don't know how to extract statistics, so return a null array - new_null_array(data_type, len) - } - }) - }, - DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( - MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())), - ))), - DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter( - MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x|x.to_vec())), - ))), - DataType::Utf8 => Ok(Arc::new(StringArray::from_iter( - MinByteArrayStatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); - if res.is_none() { - log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); - } - res - }) - }), - ))), - DataType::LargeUtf8 => { - Ok(Arc::new(LargeStringArray::from_iter( - MinByteArrayStatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); - if res.is_none() { - log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it."); - } - res - }) - }), - ))) - } - DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from( - MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if x.len().try_into() == Ok(*size) { - Some(x) - } else { - log::debug!( - "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", - size, - x.len(), - ); - None - } - }) - }).collect::>(), - ))), - DataType::Decimal128(precision, scale) => { - let arr = Decimal128Array::from_iter( - MinDecimal128StatsIterator::new(iterator) - ).with_precision_and_scale(*precision, *scale)?; - Ok(Arc::new(arr)) - }, - DataType::Decimal256(precision, scale) => { - let arr = Decimal256Array::from_iter( - MinDecimal256StatsIterator::new(iterator) - ).with_precision_and_scale(*precision, *scale)?; - Ok(Arc::new(arr)) - }, - DataType::Dictionary(_, value_type) => { - min_statistics(value_type, iterator) - } - - DataType::Map(_,_) | - DataType::Duration(_) | - DataType::Interval(_) | - DataType::Null | - DataType::BinaryView | - DataType::Utf8View | - DataType::List(_) | - DataType::ListView(_) | - DataType::FixedSizeList(_, _) | - DataType::LargeList(_) | - DataType::LargeListView(_) | - DataType::Struct(_) | - DataType::Union(_, _) | - DataType::RunEndEncoded(_, _) => { - let len = iterator.count(); - // don't know how to extract statistics, so return a null array - Ok(new_null_array(data_type, len)) - } - } + get_statistics!(Min, data_type, iterator) } /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`] @@ -541,225 +555,7 @@ pub(crate) fn max_statistics<'a, I: Iterator Result { - match data_type { - DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter( - MaxBooleanStatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Int8 => Ok(Arc::new(Int8Array::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = i8::try_from(*x) { - Some(v) - } else { - None - } - }) - }), - ))), - DataType::Int16 => Ok(Arc::new(Int16Array::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = i16::try_from(*x) { - Some(v) - } else { - None - } - }) - }), - ))), - DataType::Int32 => Ok(Arc::new(Int32Array::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Int64 => Ok(Arc::new(Int64Array::from_iter( - MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = u8::try_from(*x) { - Some(v) - } else { - None - } - }) - }), - ))), - DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if let Ok(v) = u16::try_from(*x) { - Some(v) - } else { - None - } - }) - }), - ))), - DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| x.map(|x| *x as u32)), - ))), - DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter( - MaxInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)), - ))), - DataType::Float16 => Ok(Arc::new(Float16Array::from_iter( - MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| x.and_then(|x| { - from_bytes_to_f16(x) - })), - ))), - DataType::Float32 => Ok(Arc::new(Float32Array::from_iter( - MaxFloatStatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Float64 => Ok(Arc::new(Float64Array::from_iter( - MaxDoubleStatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Date32 => Ok(Arc::new(Date32Array::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| x.copied()), - ))), - DataType::Date64 => Ok(Arc::new(Date64Array::from_iter( - MaxInt32StatsIterator::new(iterator) - .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)), - ))), - DataType::Timestamp(unit, timezone) => { - let iter = MaxInt64StatsIterator::new(iterator).map(|x| x.copied()); - - Ok(match unit { - TimeUnit::Second => { - Arc::new(match timezone { - Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()), - None => TimestampSecondArray::from_iter(iter), - }) - } - TimeUnit::Millisecond => { - Arc::new(match timezone { - Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()), - None => TimestampMillisecondArray::from_iter(iter), - }) - } - TimeUnit::Microsecond => { - Arc::new(match timezone { - Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()), - None => TimestampMicrosecondArray::from_iter(iter), - }) - } - TimeUnit::Nanosecond => { - Arc::new(match timezone { - Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()), - None => TimestampNanosecondArray::from_iter(iter), - }) - } - }) - } - DataType::Time32(unit) => { - Ok(match unit { - TimeUnit::Second => Arc::new(Time32SecondArray::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| x.copied()), - )), - TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter( - MaxInt32StatsIterator::new(iterator).map(|x| x.copied()), - )), - _ => { - let len = iterator.count(); - // don't know how to extract statistics, so return a null array - new_null_array(data_type, len) - } - }) - }, - DataType::Time64(unit) => { - Ok(match unit { - TimeUnit::Microsecond => Arc::new(Time64MicrosecondArray::from_iter( - MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), - )), - TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter( - MaxInt64StatsIterator::new(iterator).map(|x| x.copied()), - )), - _ => { - let len = iterator.count(); - // don't know how to extract statistics, so return a null array - new_null_array(data_type, len) - } - }) - }, - DataType::Binary => Ok(Arc::new(BinaryArray::from_iter( - MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())), - ))), - DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter( - MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x|x.to_vec())), - ))), - DataType::Utf8 => Ok(Arc::new(StringArray::from_iter( - MaxByteArrayStatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); - if res.is_none() { - log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); - } - res - }) - }), - ))), - DataType::LargeUtf8 => { - Ok(Arc::new(LargeStringArray::from_iter( - MaxByteArrayStatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); - if res.is_none() { - log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it."); - } - res - }) - }), - ))) - } - DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from( - MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| { - x.and_then(|x| { - if x.len().try_into() == Ok(*size) { - Some(x) - } else { - log::debug!( - "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", - size, - x.len(), - ); - None - } - }) - }).collect::>(), - ))), - DataType::Decimal128(precision, scale) => { - let arr = Decimal128Array::from_iter( - MaxDecimal128StatsIterator::new(iterator) - ).with_precision_and_scale(*precision, *scale)?; - Ok(Arc::new(arr) as ArrayRef) - } - DataType::Decimal256(precision, scale) => { - let arr = Decimal256Array::from_iter( - MaxDecimal256StatsIterator::new(iterator) - ).with_precision_and_scale(*precision, *scale)?; - Ok(Arc::new(arr)) - }, - DataType::Dictionary(_, value_type) => { - max_statistics(value_type, iterator) - } - - DataType::Map(_,_) | - DataType::Duration(_) | - DataType::Interval(_) | - DataType::Null | - DataType::BinaryView | - DataType::Utf8View | - DataType::List(_) | - DataType::ListView(_) | - DataType::FixedSizeList(_, _) | - DataType::LargeList(_) | - DataType::LargeListView(_) | - DataType::Struct(_) | - DataType::Union(_, _) | - DataType::RunEndEncoded(_, _) => { - let len = iterator.count(); - // don't know how to extract statistics, so return a null array - Ok(new_null_array(data_type, len)) - } - } + get_statistics!(Max, data_type, iterator) } /// What type of statistics should be extracted? From 1bcc5e33738c2ee3d0cdd67ad1b7dd7bbe156267 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 5 Jun 2024 03:59:41 +0000 Subject: [PATCH 17/19] Fix comment --- .../core/src/datasource/physical_plan/parquet/statistics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 5a1b03c75cd65..287925fb25332 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -192,7 +192,7 @@ make_stats_iterator!( /// `Int32`, `Int64` or `ByteArray` or `FixedSizeByteArray` :mindblown: /// /// This iterator handles all cases, extracting the values -/// and converting it to `i128`. +/// and converting it to `stat_value_type`. /// /// Parameters: /// * `$iterator_type` is the name of the iterator type (e.g. `MinBooleanStatsIterator`) From fbfb611b14e16be968f4c340583abc52d9c156e2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 5 Jun 2024 10:07:47 -0400 Subject: [PATCH 18/19] Update Cargo.lock --- datafusion-cli/Cargo.lock | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 304058650164f..b165070c60605 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -942,7 +942,7 @@ version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro-error", "proc-macro2", "quote", @@ -976,7 +976,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" dependencies = [ "strum 0.26.2", - "strum_macros 0.26.2", + "strum_macros 0.26.4", "unicode-width", ] @@ -1156,6 +1156,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", + "paste", "pin-project-lite", "rand", "sqlparser", @@ -1255,7 +1256,7 @@ dependencies = [ "serde_json", "sqlparser", "strum 0.26.2", - "strum_macros 0.26.2", + "strum_macros 0.26.4", ] [[package]] @@ -1809,6 +1810,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -1881,9 +1888,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.28" +version = "0.14.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" +checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33" dependencies = [ "bytes", "futures-channel", @@ -2683,9 +2690,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.84" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6" +checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23" dependencies = [ "unicode-ident", ] @@ -3218,7 +3225,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "syn 1.0.109", @@ -3303,7 +3310,7 @@ version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" dependencies = [ - "strum_macros 0.26.2", + "strum_macros 0.26.4", ] [[package]] @@ -3312,7 +3319,7 @@ version = "0.25.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro2", "quote", "rustversion", @@ -3321,11 +3328,11 @@ dependencies = [ [[package]] name = "strum_macros" -version = "0.26.2" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", @@ -3711,9 +3718,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" [[package]] name = "unicode-width" -version = "0.1.12" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f5e5f3158ecfd4b8ff6fe086db7c8467a2dfdac97fe420f2b7c4aa97af66d6" +checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" [[package]] name = "untrusted" From 0dd201a42e0eb43457251d44ed3224d687b440f6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 5 Jun 2024 10:29:11 -0400 Subject: [PATCH 19/19] fix docs --- .../core/src/datasource/physical_plan/parquet/statistics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 287925fb25332..a73538d02a7fa 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -286,7 +286,7 @@ make_decimal_stats_iterator!( from_bytes_to_i256 ); -/// Special macro to combine the statistics iterators for min and max using the [`paste`] macro. +/// Special macro to combine the statistics iterators for min and max using the [`mod@paste`] macro. /// This is used to avoid repeating the same code for min and max statistics extractions /// /// Parameters: