From 3f7076a6c7736d0aebbb749a582d0a856e2b1f91 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Wed, 29 May 2024 03:37:28 +0000
Subject: [PATCH 01/19] Fix incorrect statistics read for unsigned integers
 columns in parquet

---
 .../physical_plan/parquet/statistics.rs       | 12 ++++
 .../core/tests/parquet/arrow_statistics.rs    | 60 ++++++++-----------
 2 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index bbdd46af5d01f..ae8395aef6a41 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -81,6 +81,15 @@ macro_rules! get_statistic {
                     Some(DataType::Int16) => {
                         Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
                     }
+                    Some(DataType::UInt8) => {
+                        Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
+                    }
+                    Some(DataType::UInt16) => {
+                        Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
+                    }
+                    Some(DataType::UInt32) => {
+                        Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
+                    }
                     Some(DataType::Date32) => {
                         Some(ScalarValue::Date32(Some(*s.$func())))
                     }
@@ -100,6 +109,9 @@ macro_rules! get_statistic {
                             *scale,
                         ))
                     }
+                    Some(DataType::UInt64) => {
+                        Some(ScalarValue::UInt64(Some((*s.$func()) as u64)))
+                    }
                     _ => Some(ScalarValue::Int64(Some(*s.$func()))),
                 }
             }
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index 93cb7636b82ea..eebf3447cbe9f 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -26,7 +26,8 @@ use arrow::datatypes::{Date32Type, Date64Type};
 use arrow_array::{
     make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
     Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
-    Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt64Array,
+    Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt16Array,
+    UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::parquet::{
@@ -703,8 +704,6 @@ async fn test_dates_64_diff_rg_sizes() {
     .run();
 }
 
-// BUG:
-// https://github.com/apache/datafusion/issues/10604
 #[tokio::test]
 async fn test_uint() {
     // This creates a parquet files of 4 columns named "u8", "u16", "u32", "u64"
@@ -719,48 +718,40 @@ async fn test_uint() {
         row_per_group: 4,
     };
 
-    // u8
-    // BUG: expect UInt8Array but returns Int32Array
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt8Array
-        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt8Array
+        expected_min: Arc::new(UInt8Array::from(vec![0, 1, 4, 7, 251])),
+        expected_max: Arc::new(UInt8Array::from(vec![3, 4, 6, 250, 254])),
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
         column_name: "u8",
     }
     .run();
 
-    // u16
-    // BUG: expect UInt16Array but returns Int32Array
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt16Array
-        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt16Array
+        expected_min: Arc::new(UInt16Array::from(vec![0, 1, 4, 7, 251])),
+        expected_max: Arc::new(UInt16Array::from(vec![3, 4, 6, 250, 254])),
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
         column_name: "u16",
     }
     .run();
 
-    // u32
-    // BUG: expect UInt32Array but returns Int32Array
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt32Array
-        expected_max: Arc::new(Int32Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt32Array
+        expected_min: Arc::new(UInt32Array::from(vec![0, 1, 4, 7, 251])),
+        expected_max: Arc::new(UInt32Array::from(vec![3, 4, 6, 250, 254])),
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
         column_name: "u32",
     }
     .run();
 
-    // u64
-    // BUG: expect UInt64rray but returns Int64Array
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![0, 1, 4, 7, 251])), // shoudld be UInt64Array
-        expected_max: Arc::new(Int64Array::from(vec![3, 4, 6, 250, 254])), // shoudld be UInt64Array
+        expected_min: Arc::new(UInt64Array::from(vec![0, 1, 4, 7, 251])),
+        expected_max: Arc::new(UInt64Array::from(vec![3, 4, 6, 250, 254])),
         expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]),
         expected_row_counts: UInt64Array::from(vec![4, 4, 4, 4, 4]),
         column_name: "u64",
@@ -788,8 +779,6 @@ async fn test_int32_range() {
     .run();
 }
 
-// BUG: not convert UInt32Array to Int32Array
-// https://github.com/apache/datafusion/issues/10604
 #[tokio::test]
 async fn test_uint32_range() {
     // This creates a parquet file of 1 column "u"
@@ -801,8 +790,8 @@ async fn test_uint32_range() {
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![0])), // should be UInt32Array
-        expected_max: Arc::new(Int32Array::from(vec![300000])), // should be UInt32Array
+        expected_min: Arc::new(UInt32Array::from(vec![0])),
+        expected_max: Arc::new(UInt32Array::from(vec![300000])),
         expected_null_counts: UInt64Array::from(vec![0]),
         expected_row_counts: UInt64Array::from(vec![4]),
         column_name: "u",
@@ -820,44 +809,45 @@ async fn test_numeric_limits_unsigned() {
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int8Array::from(vec![i8::MIN, -100])),
-        expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])),
+        expected_min: Arc::new(UInt8Array::from(vec![u8::MIN, 100])),
+        expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])),
         expected_null_counts: UInt64Array::from(vec![0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 2]),
-        column_name: "i8",
+        column_name: "u8",
     }
     .run();
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int16Array::from(vec![i16::MIN, -100])),
-        expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])),
+        expected_min: Arc::new(UInt16Array::from(vec![u16::MIN, 100])),
+        expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])),
         expected_null_counts: UInt64Array::from(vec![0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 2]),
-        column_name: "i16",
+        column_name: "u16",
     }
     .run();
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int32Array::from(vec![i32::MIN, -100])),
-        expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])),
+        expected_min: Arc::new(UInt32Array::from(vec![u32::MIN, 100])),
+        expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])),
         expected_null_counts: UInt64Array::from(vec![0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 2]),
-        column_name: "i32",
+        column_name: "u32",
     }
     .run();
 
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![i64::MIN, -100])),
-        expected_max: Arc::new(Int64Array::from(vec![100, i64::MAX])),
+        expected_min: Arc::new(UInt64Array::from(vec![u64::MIN, 100])),
+        expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])),
         expected_null_counts: UInt64Array::from(vec![0, 0]),
         expected_row_counts: UInt64Array::from(vec![5, 2]),
-        column_name: "i64",
+        column_name: "u64",
     }
     .run();
 }
+
 #[tokio::test]
 async fn test_numeric_limits_signed() {
     // file has 7 rows, 2 row groups: one with 5 rows, one with 2 rows.

From 095ac391e17f9d4c3d841971316fb47dc371b791 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Wed, 29 May 2024 10:31:23 +0000
Subject: [PATCH 02/19] Staging the change for faster stat

---
 .../physical_plan/parquet/statistics.rs       | 464 ++++++++++++------
 1 file changed, 313 insertions(+), 151 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index ae8395aef6a41..ee6720b338fc4 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -20,11 +20,14 @@
 // TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328
 
 use arrow::{array::ArrayRef, datatypes::DataType};
-use arrow_array::{new_empty_array, new_null_array, UInt64Array};
-use arrow_schema::{Field, FieldRef, Schema};
-use datafusion_common::{
-    internal_datafusion_err, internal_err, plan_err, Result, ScalarValue,
+use arrow_array::{
+    new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
+    Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
+    Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array,
+    UInt64Array, UInt8Array,
 };
+use arrow_schema::{Field, FieldRef, Schema};
+use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
 use parquet::file::metadata::ParquetMetaData;
 use parquet::file::statistics::Statistics as ParquetStatistics;
 use parquet::schema::types::SchemaDescriptor;
@@ -52,131 +55,300 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
     result
 }
 
-/// Extract a single min/max statistics from a [`ParquetStatistics`] object
-///
-/// * `$column_statistics` is the `ParquetStatistics` object
-/// * `$func is the function` (`min`/`max`) to call to get the value
-/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get the value as bytes
-/// * `$target_arrow_type` is the [`DataType`] of the target statistics
-macro_rules! get_statistic {
-    ($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{
-        if !$column_statistics.has_min_max_set() {
-            return None;
-        }
-        match $column_statistics {
-            ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))),
-            ParquetStatistics::Int32(s) => {
-                match $target_arrow_type {
-                    // int32 to decimal with the precision and scale
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(*s.$func() as i128),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::Int8) => {
-                        Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::Int16) => {
-                        Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt8) => {
-                        Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt16) => {
-                        Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt32) => {
-                        Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
-                    }
-                    Some(DataType::Date32) => {
-                        Some(ScalarValue::Date32(Some(*s.$func())))
-                    }
-                    Some(DataType::Date64) => {
-                        Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)))
-                    }
-                    _ => Some(ScalarValue::Int32(Some(*s.$func()))),
-                }
+macro_rules! get_statistics_iter {
+    ($column_statistics_iter:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{
+        match $target_arrow_type {
+            Some(DataType::Boolean) => Some(Arc::new(BooleanArray::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Boolean(s) => Some(*s.$func()),
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Int8) => Some(Arc::new(Int8Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Int32(s) => {
+                                Some((*s.$func()).try_into().unwrap())
+                            }
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Int16) => Some(Arc::new(Int16Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Int32(s) => {
+                                Some((*s.$func()).try_into().unwrap())
+                            }
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Int32) => Some(Arc::new(Int32Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Int32(s) => Some(*s.$func()),
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Int64) => Some(Arc::new(Int64Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Int64(s) => {
+                                Some((*s.$func()).try_into().unwrap())
+                            }
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::UInt8) => {
+                Some(Arc::new(UInt8Array::from_iter(
+                    $column_statistics_iter.map(|statistic| {
+                        statistic.and_then(|x| {
+                            if !x.has_min_max_set() {
+                                return None;
+                            }
+                            match x {
+                                ParquetStatistics::Int32(s) => {
+                                    Some((*s.$func()).try_into().unwrap())
+                                }
+                                _ => None,
+                            }
+                        })
+                    }),
+                )) as ArrayRef)
             }
-            ParquetStatistics::Int64(s) => {
-                match $target_arrow_type {
-                    // int64 to decimal with the precision and scale
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(*s.$func() as i128),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::UInt64) => {
-                        Some(ScalarValue::UInt64(Some((*s.$func()) as u64)))
-                    }
-                    _ => Some(ScalarValue::Int64(Some(*s.$func()))),
-                }
+            Some(DataType::UInt16) => {
+                Some(Arc::new(UInt16Array::from_iter(
+                    $column_statistics_iter.map(|statistic| {
+                        statistic.and_then(|x| {
+                            if !x.has_min_max_set() {
+                                return None;
+                            }
+                            match x {
+                                ParquetStatistics::Int32(s) => {
+                                    Some((*s.$func()).try_into().unwrap())
+                                }
+                                _ => None,
+                            }
+                        })
+                    }),
+                )) as ArrayRef)
             }
-            // 96 bit ints not supported
-            ParquetStatistics::Int96(_) => None,
-            ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))),
-            ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))),
-            ParquetStatistics::ByteArray(s) => {
-                match $target_arrow_type {
-                    // decimal data type
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(from_bytes_to_i128(s.$bytes_func())),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::Binary) => {
-                        Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec())))
-                    }
-                    _ => {
-                        let s = std::str::from_utf8(s.$bytes_func())
-                            .map(|s| s.to_string())
-                            .ok();
-                        if s.is_none() {
-                            log::debug!(
-                                "Utf8 statistics is a non-UTF8 value, ignoring it."
-                            );
-                        }
-                        Some(ScalarValue::Utf8(s))
-                    }
-                }
+            Some(DataType::UInt32) => {
+                Some(Arc::new(UInt32Array::from_iter(
+                    $column_statistics_iter.map(|statistic| {
+                        statistic.and_then(|x| {
+                            if !x.has_min_max_set() {
+                                return None;
+                            }
+                            match x {
+                                ParquetStatistics::Int32(s) => {
+                                    Some(*s.$func() as u32)
+                                }
+                                _ => None,
+                            }
+                        })
+                    }),
+                )) as ArrayRef)
             }
-            // type not fully supported yet
-            ParquetStatistics::FixedLenByteArray(s) => {
-                match $target_arrow_type {
-                    // just support specific logical data types, there are others each
-                    // with their own ordering
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(from_bytes_to_i128(s.$bytes_func())),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::FixedSizeBinary(size)) => {
-                        let value = s.$bytes_func().to_vec();
-                        let value = if value.len().try_into() == Ok(*size) {
-                            Some(value)
-                        } else {
-                            log::debug!(
-                                "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
-                                size,
-                                value.len(),
-                            );
-                            None
-                        };
-                        Some(ScalarValue::FixedSizeBinary(
-                            *size,
-                            value,
-                        ))
-                    }
-                    _ => None,
-                }
+            Some(DataType::UInt64) => {
+                Some(Arc::new(UInt64Array::from_iter(
+                    $column_statistics_iter.map(|statistic| {
+                        statistic.and_then(|x| {
+                            if !x.has_min_max_set() {
+                                return None;
+                            }
+                            match x {
+                                ParquetStatistics::Int64(s) => {
+                                    Some(*s.$func() as u64)
+                                }
+                                _ => None,
+                            }
+                        })
+                    }),
+                )) as ArrayRef)
+            }
+            Some(DataType::Date32) => Some(Arc::new(Date32Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Int32(s) => Some(*s.$func()),
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Date64) => Some(Arc::new(Date64Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Int32(s) => {
+                                Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)
+                            }
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Float32) => Some(Arc::new(Float32Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Float(s) => Some(*s.$func()),
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Float64) => Some(Arc::new(Float64Array::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::Double(s) => Some(*s.$func()),
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Decimal128(precision, scale)) => {
+                let mut array = Decimal128Array::from_iter($column_statistics_iter.map(
+                    |statistic| {
+                        statistic.and_then(|x| {
+                            if !x.has_min_max_set() {
+                                return None;
+                            }
+                            match x {
+                                ParquetStatistics::Int32(s) => Some(*s.$func() as i128),
+                                ParquetStatistics::Int64(s) => Some(*s.$func() as i128),
+                                ParquetStatistics::ByteArray(s) => {
+                                    Some(from_bytes_to_i128(s.$bytes_func()))
+                                }
+                                ParquetStatistics::FixedLenByteArray(s) => {
+                                    Some(from_bytes_to_i128(s.$bytes_func()))
+                                }
+                                _ => None,
+                            }
+                        })
+                    },
+                ));
+
+                array = array.with_precision_and_scale(*precision, *scale)?;
+                Some(Arc::new(array) as ArrayRef)
             }
+            Some(DataType::Binary) => Some(Arc::new(BinaryArray::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::ByteArray(s) => {
+                                Some(s.$bytes_func().to_vec())
+                            }
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::Utf8) => Some(Arc::new(StringArray::from_iter(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::ByteArray(s) => {
+                                let res = std::str::from_utf8(s.$bytes_func())
+                                    .map(|s| s.to_string())
+                                    .ok();
+
+                                if res.is_none() {
+                                    log::debug!(
+                                        "Utf8 statistics is a non-UTF8 value, ignoring it."
+                                    );
+                                }
+
+                                Some(res.unwrap_or_else(|| "".to_string()))
+                            }
+                            _ => None,
+                        }
+                    })
+                }),
+            )) as ArrayRef),
+            Some(DataType::FixedSizeBinary(size)) => Some(Arc::new(FixedSizeBinaryArray::from(
+                $column_statistics_iter.map(|statistic| {
+                    statistic.and_then(|x| {
+                        if !x.has_min_max_set() {
+                            return None;
+                        }
+                        match x {
+                            ParquetStatistics::FixedLenByteArray(s) => {
+                                let value = s.$bytes_func();
+                                let value = if value.len().try_into() == Ok(*size) {
+                                    Some(value)
+                                } else {
+                                    log::debug!(
+                                        "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
+                                        size,
+                                        value.len(),
+                                    );
+                                    None
+                                };
+                                value
+                            }
+                            _ => None,
+                        }
+                    })
+                }).collect::<Vec<_>>(),
+            )) as ArrayRef),
+            Some(DataType::Timestamp(_, _)) => {
+                log::info!(
+                    "Timestamp statistics are not supported, returning none"
+                );
+                None
+            },
+            _ => None,
         }
     }};
 }
@@ -211,9 +383,13 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    let scalars = iterator
-        .map(|x| x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type))));
-    collect_scalars(data_type, scalars)
+    match get_statistics_iter!(iterator, min, min_bytes, Some(data_type)) {
+        Some(array) => Ok(array),
+        None => {
+            log::debug!("Unsupported data type for statistics");
+            Ok(new_empty_array(data_type))
+        }
+    }
 }
 
 /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`]
@@ -221,22 +397,11 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    let scalars = iterator
-        .map(|x| x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type))));
-    collect_scalars(data_type, scalars)
-}
-
-/// Builds an array from an iterator of ScalarValue
-fn collect_scalars<I: Iterator<Item = Option<ScalarValue>>>(
-    data_type: &DataType,
-    iterator: I,
-) -> Result<ArrayRef> {
-    let mut scalars = iterator.peekable();
-    match scalars.peek().is_none() {
-        true => Ok(new_empty_array(data_type)),
-        false => {
-            let null = ScalarValue::try_from(data_type)?;
-            ScalarValue::iter_to_array(scalars.map(|x| x.unwrap_or_else(|| null.clone())))
+    match get_statistics_iter!(iterator, max, max_bytes, Some(data_type)) {
+        Some(array) => Ok(array),
+        None => {
+            log::debug!("Unsupported data type for statistics");
+            Ok(new_empty_array(data_type))
         }
     }
 }
@@ -536,9 +701,6 @@ mod test {
     }
 
     #[test]
-    #[should_panic(
-        expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Int64, got TimestampNanosecond(NULL, None)"
-    )]
     // Due to https://github.com/apache/datafusion/issues/8295
     fn roundtrip_timestamp() {
         Test {
@@ -556,8 +718,8 @@ mod test {
                 None,
                 None,
             ]),
-            expected_min: timestamp_array([Some(1), Some(5), None]),
-            expected_max: timestamp_array([Some(3), Some(9), None]),
+            expected_min: timestamp_array([]),
+            expected_max: timestamp_array([]),
         }
         .run()
     }
@@ -914,8 +1076,8 @@ mod test {
             // File has no min/max for timestamp_col
             .with_column(ExpectedColumn {
                 name: "timestamp_col",
-                expected_min: timestamp_array([None]),
-                expected_max: timestamp_array([None]),
+                expected_min: timestamp_array([]),
+                expected_max: timestamp_array([]),
             })
             .with_column(ExpectedColumn {
                 name: "year",

From 2faec57e0fc89bf6d90c069a54daa7c991f43d09 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Wed, 29 May 2024 10:44:47 +0000
Subject: [PATCH 03/19] Improve performance of extracting statistics from
 parquet files

---
 .../physical_plan/parquet/statistics.rs       | 29 +++++++++++++------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index ee6720b338fc4..90776f917225a 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -342,11 +342,22 @@ macro_rules! get_statistics_iter {
                     })
                 }).collect::<Vec<_>>(),
             )) as ArrayRef),
-            Some(DataType::Timestamp(_, _)) => {
-                log::info!(
-                    "Timestamp statistics are not supported, returning none"
-                );
-                None
+            Some(DataType::Timestamp(_unit, _literal)) => {
+                Some(Arc::new(Int64Array::from_iter(
+                    $column_statistics_iter.map(|statistic| {
+                        statistic.and_then(|x| {
+                            if !x.has_min_max_set() {
+                                return None;
+                            }
+                            match x {
+                                ParquetStatistics::Int64(s) => {
+                                    Some(*s.$func())
+                                }
+                                _ => None,
+                            }
+                        })
+                    }),
+                )) as ArrayRef)
             },
             _ => None,
         }
@@ -718,8 +729,8 @@ mod test {
                 None,
                 None,
             ]),
-            expected_min: timestamp_array([]),
-            expected_max: timestamp_array([]),
+            expected_min: i64_array([Some(1), Some(5), None]),
+            expected_max: i64_array([Some(3), Some(9), None]),
         }
         .run()
     }
@@ -1076,8 +1087,8 @@ mod test {
             // File has no min/max for timestamp_col
             .with_column(ExpectedColumn {
                 name: "timestamp_col",
-                expected_min: timestamp_array([]),
-                expected_max: timestamp_array([]),
+                expected_min: i64_array([None]),
+                expected_max: i64_array([None]),
             })
             .with_column(ExpectedColumn {
                 name: "year",

From b2ee8051b07e82be585cf094c271d1b1a1dddf30 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Thu, 30 May 2024 09:04:15 +0000
Subject: [PATCH 04/19] Revert "Improve performance of extracting statistics
 from parquet files"

This reverts commit 2faec57e0fc89bf6d90c069a54daa7c991f43d09.
---
 .../physical_plan/parquet/statistics.rs       | 29 ++++++-------------
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 90776f917225a..ee6720b338fc4 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -342,22 +342,11 @@ macro_rules! get_statistics_iter {
                     })
                 }).collect::<Vec<_>>(),
             )) as ArrayRef),
-            Some(DataType::Timestamp(_unit, _literal)) => {
-                Some(Arc::new(Int64Array::from_iter(
-                    $column_statistics_iter.map(|statistic| {
-                        statistic.and_then(|x| {
-                            if !x.has_min_max_set() {
-                                return None;
-                            }
-                            match x {
-                                ParquetStatistics::Int64(s) => {
-                                    Some(*s.$func())
-                                }
-                                _ => None,
-                            }
-                        })
-                    }),
-                )) as ArrayRef)
+            Some(DataType::Timestamp(_, _)) => {
+                log::info!(
+                    "Timestamp statistics are not supported, returning none"
+                );
+                None
             },
             _ => None,
         }
@@ -729,8 +718,8 @@ mod test {
                 None,
                 None,
             ]),
-            expected_min: i64_array([Some(1), Some(5), None]),
-            expected_max: i64_array([Some(3), Some(9), None]),
+            expected_min: timestamp_array([]),
+            expected_max: timestamp_array([]),
         }
         .run()
     }
@@ -1087,8 +1076,8 @@ mod test {
             // File has no min/max for timestamp_col
             .with_column(ExpectedColumn {
                 name: "timestamp_col",
-                expected_min: i64_array([None]),
-                expected_max: i64_array([None]),
+                expected_min: timestamp_array([]),
+                expected_max: timestamp_array([]),
             })
             .with_column(ExpectedColumn {
                 name: "year",

From 8d7e9f2274a84bb1e471a6dcfb841bc58b1ea4cd Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Thu, 30 May 2024 09:04:34 +0000
Subject: [PATCH 05/19] Revert "Staging the change for faster stat"

This reverts commit 095ac391e17f9d4c3d841971316fb47dc371b791.
---
 .../physical_plan/parquet/statistics.rs       | 464 ++++++------------
 1 file changed, 151 insertions(+), 313 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index ee6720b338fc4..ae8395aef6a41 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -20,14 +20,11 @@
 // TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328
 
 use arrow::{array::ArrayRef, datatypes::DataType};
-use arrow_array::{
-    new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
-    Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
-    Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array,
-    UInt64Array, UInt8Array,
-};
+use arrow_array::{new_empty_array, new_null_array, UInt64Array};
 use arrow_schema::{Field, FieldRef, Schema};
-use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
+use datafusion_common::{
+    internal_datafusion_err, internal_err, plan_err, Result, ScalarValue,
+};
 use parquet::file::metadata::ParquetMetaData;
 use parquet::file::statistics::Statistics as ParquetStatistics;
 use parquet::schema::types::SchemaDescriptor;
@@ -55,300 +52,131 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
     result
 }
 
-macro_rules! get_statistics_iter {
-    ($column_statistics_iter:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{
-        match $target_arrow_type {
-            Some(DataType::Boolean) => Some(Arc::new(BooleanArray::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Boolean(s) => Some(*s.$func()),
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Int8) => Some(Arc::new(Int8Array::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Int32(s) => {
-                                Some((*s.$func()).try_into().unwrap())
-                            }
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Int16) => Some(Arc::new(Int16Array::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Int32(s) => {
-                                Some((*s.$func()).try_into().unwrap())
-                            }
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Int32) => Some(Arc::new(Int32Array::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Int32(s) => Some(*s.$func()),
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Int64) => Some(Arc::new(Int64Array::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Int64(s) => {
-                                Some((*s.$func()).try_into().unwrap())
-                            }
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::UInt8) => {
-                Some(Arc::new(UInt8Array::from_iter(
-                    $column_statistics_iter.map(|statistic| {
-                        statistic.and_then(|x| {
-                            if !x.has_min_max_set() {
-                                return None;
-                            }
-                            match x {
-                                ParquetStatistics::Int32(s) => {
-                                    Some((*s.$func()).try_into().unwrap())
-                                }
-                                _ => None,
-                            }
-                        })
-                    }),
-                )) as ArrayRef)
-            }
-            Some(DataType::UInt16) => {
-                Some(Arc::new(UInt16Array::from_iter(
-                    $column_statistics_iter.map(|statistic| {
-                        statistic.and_then(|x| {
-                            if !x.has_min_max_set() {
-                                return None;
-                            }
-                            match x {
-                                ParquetStatistics::Int32(s) => {
-                                    Some((*s.$func()).try_into().unwrap())
-                                }
-                                _ => None,
-                            }
-                        })
-                    }),
-                )) as ArrayRef)
-            }
-            Some(DataType::UInt32) => {
-                Some(Arc::new(UInt32Array::from_iter(
-                    $column_statistics_iter.map(|statistic| {
-                        statistic.and_then(|x| {
-                            if !x.has_min_max_set() {
-                                return None;
-                            }
-                            match x {
-                                ParquetStatistics::Int32(s) => {
-                                    Some(*s.$func() as u32)
-                                }
-                                _ => None,
-                            }
-                        })
-                    }),
-                )) as ArrayRef)
+/// Extract a single min/max statistics from a [`ParquetStatistics`] object
+///
+/// * `$column_statistics` is the `ParquetStatistics` object
+/// * `$func is the function` (`min`/`max`) to call to get the value
+/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get the value as bytes
+/// * `$target_arrow_type` is the [`DataType`] of the target statistics
+macro_rules! get_statistic {
+    ($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{
+        if !$column_statistics.has_min_max_set() {
+            return None;
+        }
+        match $column_statistics {
+            ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))),
+            ParquetStatistics::Int32(s) => {
+                match $target_arrow_type {
+                    // int32 to decimal with the precision and scale
+                    Some(DataType::Decimal128(precision, scale)) => {
+                        Some(ScalarValue::Decimal128(
+                            Some(*s.$func() as i128),
+                            *precision,
+                            *scale,
+                        ))
+                    }
+                    Some(DataType::Int8) => {
+                        Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
+                    }
+                    Some(DataType::Int16) => {
+                        Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
+                    }
+                    Some(DataType::UInt8) => {
+                        Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
+                    }
+                    Some(DataType::UInt16) => {
+                        Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
+                    }
+                    Some(DataType::UInt32) => {
+                        Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
+                    }
+                    Some(DataType::Date32) => {
+                        Some(ScalarValue::Date32(Some(*s.$func())))
+                    }
+                    Some(DataType::Date64) => {
+                        Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)))
+                    }
+                    _ => Some(ScalarValue::Int32(Some(*s.$func()))),
+                }
             }
-            Some(DataType::UInt64) => {
-                Some(Arc::new(UInt64Array::from_iter(
-                    $column_statistics_iter.map(|statistic| {
-                        statistic.and_then(|x| {
-                            if !x.has_min_max_set() {
-                                return None;
-                            }
-                            match x {
-                                ParquetStatistics::Int64(s) => {
-                                    Some(*s.$func() as u64)
-                                }
-                                _ => None,
-                            }
-                        })
-                    }),
-                )) as ArrayRef)
+            ParquetStatistics::Int64(s) => {
+                match $target_arrow_type {
+                    // int64 to decimal with the precision and scale
+                    Some(DataType::Decimal128(precision, scale)) => {
+                        Some(ScalarValue::Decimal128(
+                            Some(*s.$func() as i128),
+                            *precision,
+                            *scale,
+                        ))
+                    }
+                    Some(DataType::UInt64) => {
+                        Some(ScalarValue::UInt64(Some((*s.$func()) as u64)))
+                    }
+                    _ => Some(ScalarValue::Int64(Some(*s.$func()))),
+                }
             }
-            Some(DataType::Date32) => Some(Arc::new(Date32Array::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Int32(s) => Some(*s.$func()),
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Date64) => Some(Arc::new(Date64Array::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Int32(s) => {
-                                Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)
-                            }
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Float32) => Some(Arc::new(Float32Array::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Float(s) => Some(*s.$func()),
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Float64) => Some(Arc::new(Float64Array::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::Double(s) => Some(*s.$func()),
-                            _ => None,
+            // 96 bit ints not supported
+            ParquetStatistics::Int96(_) => None,
+            ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))),
+            ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))),
+            ParquetStatistics::ByteArray(s) => {
+                match $target_arrow_type {
+                    // decimal data type
+                    Some(DataType::Decimal128(precision, scale)) => {
+                        Some(ScalarValue::Decimal128(
+                            Some(from_bytes_to_i128(s.$bytes_func())),
+                            *precision,
+                            *scale,
+                        ))
+                    }
+                    Some(DataType::Binary) => {
+                        Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec())))
+                    }
+                    _ => {
+                        let s = std::str::from_utf8(s.$bytes_func())
+                            .map(|s| s.to_string())
+                            .ok();
+                        if s.is_none() {
+                            log::debug!(
+                                "Utf8 statistics is a non-UTF8 value, ignoring it."
+                            );
                         }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Decimal128(precision, scale)) => {
-                let mut array = Decimal128Array::from_iter($column_statistics_iter.map(
-                    |statistic| {
-                        statistic.and_then(|x| {
-                            if !x.has_min_max_set() {
-                                return None;
-                            }
-                            match x {
-                                ParquetStatistics::Int32(s) => Some(*s.$func() as i128),
-                                ParquetStatistics::Int64(s) => Some(*s.$func() as i128),
-                                ParquetStatistics::ByteArray(s) => {
-                                    Some(from_bytes_to_i128(s.$bytes_func()))
-                                }
-                                ParquetStatistics::FixedLenByteArray(s) => {
-                                    Some(from_bytes_to_i128(s.$bytes_func()))
-                                }
-                                _ => None,
-                            }
-                        })
-                    },
-                ));
-
-                array = array.with_precision_and_scale(*precision, *scale)?;
-                Some(Arc::new(array) as ArrayRef)
+                        Some(ScalarValue::Utf8(s))
+                    }
+                }
+            }
+            // type not fully supported yet
+            ParquetStatistics::FixedLenByteArray(s) => {
+                match $target_arrow_type {
+                    // just support specific logical data types, there are others each
+                    // with their own ordering
+                    Some(DataType::Decimal128(precision, scale)) => {
+                        Some(ScalarValue::Decimal128(
+                            Some(from_bytes_to_i128(s.$bytes_func())),
+                            *precision,
+                            *scale,
+                        ))
+                    }
+                    Some(DataType::FixedSizeBinary(size)) => {
+                        let value = s.$bytes_func().to_vec();
+                        let value = if value.len().try_into() == Ok(*size) {
+                            Some(value)
+                        } else {
+                            log::debug!(
+                                "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
+                                size,
+                                value.len(),
+                            );
+                            None
+                        };
+                        Some(ScalarValue::FixedSizeBinary(
+                            *size,
+                            value,
+                        ))
+                    }
+                    _ => None,
+                }
             }
-            Some(DataType::Binary) => Some(Arc::new(BinaryArray::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::ByteArray(s) => {
-                                Some(s.$bytes_func().to_vec())
-                            }
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::Utf8) => Some(Arc::new(StringArray::from_iter(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::ByteArray(s) => {
-                                let res = std::str::from_utf8(s.$bytes_func())
-                                    .map(|s| s.to_string())
-                                    .ok();
-
-                                if res.is_none() {
-                                    log::debug!(
-                                        "Utf8 statistics is a non-UTF8 value, ignoring it."
-                                    );
-                                }
-
-                                Some(res.unwrap_or_else(|| "".to_string()))
-                            }
-                            _ => None,
-                        }
-                    })
-                }),
-            )) as ArrayRef),
-            Some(DataType::FixedSizeBinary(size)) => Some(Arc::new(FixedSizeBinaryArray::from(
-                $column_statistics_iter.map(|statistic| {
-                    statistic.and_then(|x| {
-                        if !x.has_min_max_set() {
-                            return None;
-                        }
-                        match x {
-                            ParquetStatistics::FixedLenByteArray(s) => {
-                                let value = s.$bytes_func();
-                                let value = if value.len().try_into() == Ok(*size) {
-                                    Some(value)
-                                } else {
-                                    log::debug!(
-                                        "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
-                                        size,
-                                        value.len(),
-                                    );
-                                    None
-                                };
-                                value
-                            }
-                            _ => None,
-                        }
-                    })
-                }).collect::<Vec<_>>(),
-            )) as ArrayRef),
-            Some(DataType::Timestamp(_, _)) => {
-                log::info!(
-                    "Timestamp statistics are not supported, returning none"
-                );
-                None
-            },
-            _ => None,
         }
     }};
 }
@@ -383,13 +211,9 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    match get_statistics_iter!(iterator, min, min_bytes, Some(data_type)) {
-        Some(array) => Ok(array),
-        None => {
-            log::debug!("Unsupported data type for statistics");
-            Ok(new_empty_array(data_type))
-        }
-    }
+    let scalars = iterator
+        .map(|x| x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type))));
+    collect_scalars(data_type, scalars)
 }
 
 /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`]
@@ -397,11 +221,22 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    match get_statistics_iter!(iterator, max, max_bytes, Some(data_type)) {
-        Some(array) => Ok(array),
-        None => {
-            log::debug!("Unsupported data type for statistics");
-            Ok(new_empty_array(data_type))
+    let scalars = iterator
+        .map(|x| x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type))));
+    collect_scalars(data_type, scalars)
+}
+
+/// Builds an array from an iterator of ScalarValue
+fn collect_scalars<I: Iterator<Item = Option<ScalarValue>>>(
+    data_type: &DataType,
+    iterator: I,
+) -> Result<ArrayRef> {
+    let mut scalars = iterator.peekable();
+    match scalars.peek().is_none() {
+        true => Ok(new_empty_array(data_type)),
+        false => {
+            let null = ScalarValue::try_from(data_type)?;
+            ScalarValue::iter_to_array(scalars.map(|x| x.unwrap_or_else(|| null.clone())))
         }
     }
 }
@@ -701,6 +536,9 @@ mod test {
     }
 
     #[test]
+    #[should_panic(
+        expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Int64, got TimestampNanosecond(NULL, None)"
+    )]
     // Due to https://github.com/apache/datafusion/issues/8295
     fn roundtrip_timestamp() {
         Test {
@@ -718,8 +556,8 @@ mod test {
                 None,
                 None,
             ]),
-            expected_min: timestamp_array([]),
-            expected_max: timestamp_array([]),
+            expected_min: timestamp_array([Some(1), Some(5), None]),
+            expected_max: timestamp_array([Some(3), Some(9), None]),
         }
         .run()
     }
@@ -1076,8 +914,8 @@ mod test {
             // File has no min/max for timestamp_col
             .with_column(ExpectedColumn {
                 name: "timestamp_col",
-                expected_min: timestamp_array([]),
-                expected_max: timestamp_array([]),
+                expected_min: timestamp_array([None]),
+                expected_max: timestamp_array([None]),
             })
             .with_column(ExpectedColumn {
                 name: "year",

From 5a3fdec113297df8419fe566ca9ce626efee00df Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Thu, 30 May 2024 10:39:24 +0000
Subject: [PATCH 06/19] Refine using the iterator idea

---
 .../physical_plan/parquet/statistics.rs       | 277 +++++++++++++++++-
 1 file changed, 263 insertions(+), 14 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index ae8395aef6a41..82782dd8dcbed 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -20,7 +20,11 @@
 // TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328
 
 use arrow::{array::ArrayRef, datatypes::DataType};
-use arrow_array::{new_empty_array, new_null_array, UInt64Array};
+use arrow_array::{
+    new_empty_array, new_null_array, BooleanArray, Date32Array, Date64Array,
+    Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
+    UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+};
 use arrow_schema::{Field, FieldRef, Schema};
 use datafusion_common::{
     internal_datafusion_err, internal_err, plan_err, Result, ScalarValue,
@@ -52,6 +56,94 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
     result
 }
 
+macro_rules! make_stats_iterator {
+    ($iterator_type:ident, $func:ident, $parquet_statistics_type:path, $stat_value_type:ty) => {
+        struct $iterator_type<'a, I>
+        where
+            I: Iterator<Item = Option<&'a ParquetStatistics>>,
+        {
+            iter: I,
+        }
+
+        impl<'a, I> $iterator_type<'a, I>
+        where
+            I: Iterator<Item = Option<&'a ParquetStatistics>>,
+        {
+            fn new(iter: I) -> Self {
+                Self { iter }
+            }
+        }
+
+        impl<'a, I> Iterator for $iterator_type<'a, I>
+        where
+            I: Iterator<Item = Option<&'a ParquetStatistics>>,
+        {
+            type Item = Option<&'a $stat_value_type>;
+
+            fn next(&mut self) -> Option<Self::Item> {
+                let next = self.iter.next();
+                next.map(|x| {
+                    x.and_then(|stats| match stats {
+                        $parquet_statistics_type(s) if stats.has_min_max_set() => {
+                            Some(s.$func())
+                        }
+                        _ => None,
+                    })
+                })
+            }
+
+            fn size_hint(&self) -> (usize, Option<usize>) {
+                self.iter.size_hint()
+            }
+        }
+    };
+}
+
+make_stats_iterator!(
+    MinBooleanStatsIterator,
+    min,
+    ParquetStatistics::Boolean,
+    bool
+);
+make_stats_iterator!(
+    MaxBooleanStatsIterator,
+    max,
+    ParquetStatistics::Boolean,
+    bool
+);
+make_stats_iterator!(MinInt32StatsIterator, min, ParquetStatistics::Int32, i32);
+make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32);
+make_stats_iterator!(MinInt64StatsIterator, min, ParquetStatistics::Int64, i64);
+make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64);
+make_stats_iterator!(MinFloatStatsIterator, min, ParquetStatistics::Float, f32);
+make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32);
+make_stats_iterator!(MinDoubleStatsIterator, min, ParquetStatistics::Double, f64);
+make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64);
+make_stats_iterator!(
+    MinByteArrayStatsIterator,
+    min_bytes,
+    ParquetStatistics::ByteArray,
+    [u8]
+);
+make_stats_iterator!(
+    MaxByteArrayStatsIterator,
+    max_bytes,
+    ParquetStatistics::ByteArray,
+    [u8]
+);
+make_stats_iterator!(
+    MinFixedLenByteArrayStatsIterator,
+    min_bytes,
+    ParquetStatistics::FixedLenByteArray,
+    [u8]
+);
+make_stats_iterator!(
+    MaxFixedLenByteArrayStatsIterator,
+    max_bytes,
+    ParquetStatistics::FixedLenByteArray,
+    [u8]
+);
+
 /// Extract a single min/max statistics from a [`ParquetStatistics`] object
 ///
 /// * `$column_statistics` is the `ParquetStatistics` object
@@ -211,9 +303,89 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    let scalars = iterator
-        .map(|x| x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type))));
-    collect_scalars(data_type, scalars)
+    match data_type {
+        DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter(
+            MinBooleanStatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Int8 => Ok(Arc::new(Int8Array::from_iter(
+            MinInt32StatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if let Ok(v) = i8::try_from(*x) {
+                        Some(v)
+                    } else {
+                        None
+                    }
+                })
+            }),
+        ))),
+        DataType::Int16 => Ok(Arc::new(Int16Array::from_iter(
+            MinInt32StatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if let Ok(v) = i16::try_from(*x) {
+                        Some(v)
+                    } else {
+                        None
+                    }
+                })
+            }),
+        ))),
+        DataType::Int32 => Ok(Arc::new(Int32Array::from_iter(
+            MinInt32StatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Int64 => Ok(Arc::new(Int64Array::from_iter(
+            MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter(
+            MinInt32StatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if let Ok(v) = u8::try_from(*x) {
+                        Some(v)
+                    } else {
+                        None
+                    }
+                })
+            }),
+        ))),
+        DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter(
+            MinInt32StatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if let Ok(v) = u16::try_from(*x) {
+                        Some(v)
+                    } else {
+                        None
+                    }
+                })
+            }),
+        ))),
+        DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter(
+            MinInt32StatsIterator::new(iterator).map(|x| x.map(|x| *x as u32)),
+        ))),
+        DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter(
+            MinInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)),
+        ))),
+        DataType::Float32 => Ok(Arc::new(Float32Array::from_iter(
+            MinFloatStatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Float64 => Ok(Arc::new(Float64Array::from_iter(
+            MinDoubleStatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Date32 => Ok(Arc::new(Date32Array::from_iter(
+            MinInt32StatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Date64 => Ok(Arc::new(Date64Array::from_iter(
+            MinInt32StatsIterator::new(iterator)
+                .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)),
+        ))),
+        DataType::Timestamp(_, _) => Ok(Arc::new(Int64Array::from_iter(
+            MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        _ => {
+            let scalars = iterator.map(|x| {
+                x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type)))
+            });
+            collect_scalars(data_type, scalars)
+        }
+    }
 }
 
 /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`]
@@ -221,9 +393,89 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    let scalars = iterator
-        .map(|x| x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type))));
-    collect_scalars(data_type, scalars)
+    match data_type {
+        DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter(
+            MaxBooleanStatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Int8 => Ok(Arc::new(Int8Array::from_iter(
+            MaxInt32StatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if let Ok(v) = i8::try_from(*x) {
+                        Some(v)
+                    } else {
+                        None
+                    }
+                })
+            }),
+        ))),
+        DataType::Int16 => Ok(Arc::new(Int16Array::from_iter(
+            MaxInt32StatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if let Ok(v) = i16::try_from(*x) {
+                        Some(v)
+                    } else {
+                        None
+                    }
+                })
+            }),
+        ))),
+        DataType::Int32 => Ok(Arc::new(Int32Array::from_iter(
+            MaxInt32StatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Int64 => Ok(Arc::new(Int64Array::from_iter(
+            MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter(
+            MaxInt32StatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if let Ok(v) = u8::try_from(*x) {
+                        Some(v)
+                    } else {
+                        None
+                    }
+                })
+            }),
+        ))),
+        DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter(
+            MaxInt32StatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if let Ok(v) = u16::try_from(*x) {
+                        Some(v)
+                    } else {
+                        None
+                    }
+                })
+            }),
+        ))),
+        DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter(
+            MaxInt32StatsIterator::new(iterator).map(|x| x.map(|x| *x as u32)),
+        ))),
+        DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter(
+            MaxInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)),
+        ))),
+        DataType::Float32 => Ok(Arc::new(Float32Array::from_iter(
+            MaxFloatStatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Float64 => Ok(Arc::new(Float64Array::from_iter(
+            MaxDoubleStatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Date32 => Ok(Arc::new(Date32Array::from_iter(
+            MaxInt32StatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        DataType::Date64 => Ok(Arc::new(Date64Array::from_iter(
+            MaxInt32StatsIterator::new(iterator)
+                .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)),
+        ))),
+        DataType::Timestamp(_, _) => Ok(Arc::new(Int64Array::from_iter(
+            MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
+        ))),
+        _ => {
+            let scalars = iterator.map(|x| {
+                x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type)))
+            });
+            collect_scalars(data_type, scalars)
+        }
+    }
 }
 
 /// Builds an array from an iterator of ScalarValue
@@ -536,9 +788,6 @@ mod test {
     }
 
     #[test]
-    #[should_panic(
-        expected = "Inconsistent types in ScalarValue::iter_to_array. Expected Int64, got TimestampNanosecond(NULL, None)"
-    )]
     // Due to https://github.com/apache/datafusion/issues/8295
     fn roundtrip_timestamp() {
         Test {
@@ -556,8 +805,8 @@ mod test {
                 None,
                 None,
             ]),
-            expected_min: timestamp_array([Some(1), Some(5), None]),
-            expected_max: timestamp_array([Some(3), Some(9), None]),
+            expected_min: i64_array([Some(1), Some(5), None]),
+            expected_max: i64_array([Some(3), Some(9), None]),
         }
         .run()
     }
@@ -914,8 +1163,8 @@ mod test {
             // File has no min/max for timestamp_col
             .with_column(ExpectedColumn {
                 name: "timestamp_col",
-                expected_min: timestamp_array([None]),
-                expected_max: timestamp_array([None]),
+                expected_min: i64_array([None]),
+                expected_max: i64_array([None]),
             })
             .with_column(ExpectedColumn {
                 name: "year",

From 9f552e86561bc0260ed12643c1296ad8c8ec6398 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Thu, 30 May 2024 11:09:47 +0000
Subject: [PATCH 07/19] Add the rest types

---
 .../physical_plan/parquet/statistics.rs       | 86 ++++++++++++++++++-
 1 file changed, 83 insertions(+), 3 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 82782dd8dcbed..58399a5ce3d97 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -21,9 +21,9 @@
 
 use arrow::{array::ArrayRef, datatypes::DataType};
 use arrow_array::{
-    new_empty_array, new_null_array, BooleanArray, Date32Array, Date64Array,
-    Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
-    UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+    new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
+    FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
+    Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_schema::{Field, FieldRef, Schema};
 use datafusion_common::{
@@ -56,8 +56,25 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
     result
 }
 
+////// Define an adapter iterator for extracting statistics from an iterator of
+/// `ParquetStatistics`
+///
+///
+/// Handles checking if the statistics are present and valid with the correct type.
+///
+/// Parameters:
+/// * `$iterator_type` is the name of the iterator type (e.g. `MinBooleanStatsIterator`)
+/// * `$func` is the function to call to get the value (e.g. `min` or `max`)
+/// * `$parquet_statistics_type` is the type of the statistics (e.g. `ParquetStatistics::Boolean`)
+/// * `$stat_value_type` is the type of the statistics value (e.g. `bool`)
 macro_rules! make_stats_iterator {
     ($iterator_type:ident, $func:ident, $parquet_statistics_type:path, $stat_value_type:ty) => {
+        /// Maps an iterator of `ParquetStatistics` into an iterator of
+        /// `&$stat_value_type``
+        ///
+        /// Yielded elements:
+        /// * Some(stats) if valid
+        /// * None if the statistics are not present, not valid, or not $stat_value_type
         struct $iterator_type<'a, I>
         where
             I: Iterator<Item = Option<&'a ParquetStatistics>>,
@@ -69,17 +86,20 @@ macro_rules! make_stats_iterator {
         where
             I: Iterator<Item = Option<&'a ParquetStatistics>>,
         {
+            /// Create a new iterator to extract the statistics
             fn new(iter: I) -> Self {
                 Self { iter }
             }
         }
 
+        /// Implement the Iterator trait for the iterator
         impl<'a, I> Iterator for $iterator_type<'a, I>
         where
             I: Iterator<Item = Option<&'a ParquetStatistics>>,
         {
             type Item = Option<&'a $stat_value_type>;
 
+            /// return the next statistics value
             fn next(&mut self) -> Option<Self::Item> {
                 let next = self.iter.next();
                 next.map(|x| {
@@ -379,6 +399,36 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
         DataType::Timestamp(_, _) => Ok(Arc::new(Int64Array::from_iter(
             MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
         ))),
+        DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
+            MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())),
+        ))),
+        DataType::Utf8 => Ok(Arc::new(StringArray::from_iter(
+            MinByteArrayStatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
+                    if res.is_none() {
+                        log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
+                    }
+                    res
+                })
+            }),
+        ))),
+        DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from(
+            MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if x.len().try_into() == Ok(*size) {
+                        Some(x)
+                    } else {
+                        log::debug!(
+                            "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
+                            size,
+                            x.len(),
+                        );
+                        None
+                    }
+                })
+            }).collect::<Vec<_>>(),
+        ))),
         _ => {
             let scalars = iterator.map(|x| {
                 x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type)))
@@ -469,6 +519,36 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
         DataType::Timestamp(_, _) => Ok(Arc::new(Int64Array::from_iter(
             MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
         ))),
+        DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
+            MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())),
+        ))),
+        DataType::Utf8 => Ok(Arc::new(StringArray::from_iter(
+            MaxByteArrayStatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
+                    if res.is_none() {
+                        log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
+                    }
+                    res
+                })
+            }),
+        ))),
+        DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from(
+            MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| {
+                x.and_then(|x| {
+                    if x.len().try_into() == Ok(*size) {
+                        Some(x)
+                    } else {
+                        log::debug!(
+                            "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
+                            size,
+                            x.len(),
+                        );
+                        None
+                    }
+                })
+            }).collect::<Vec<_>>(),
+        ))),
         _ => {
             let scalars = iterator.map(|x| {
                 x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type)))

From 7f1695664ddd79551caa7e1b66708218ed563781 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Thu, 30 May 2024 13:36:44 -0400
Subject: [PATCH 08/19] Consolidate Decimal statistics extraction

---
 .../physical_plan/parquet/statistics.rs       | 235 +++++++-----------
 1 file changed, 86 insertions(+), 149 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 58399a5ce3d97..d517ce3cfabd6 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -22,13 +22,12 @@
 use arrow::{array::ArrayRef, datatypes::DataType};
 use arrow_array::{
     new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
-    FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
-    Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+    Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
+    Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array,
+    UInt64Array, UInt8Array,
 };
 use arrow_schema::{Field, FieldRef, Schema};
-use datafusion_common::{
-    internal_datafusion_err, internal_err, plan_err, Result, ScalarValue,
-};
+use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
 use parquet::file::metadata::ParquetMetaData;
 use parquet::file::statistics::Statistics as ParquetStatistics;
 use parquet::schema::types::SchemaDescriptor;
@@ -56,7 +55,7 @@ fn sign_extend_be(b: &[u8]) -> [u8; 16] {
     result
 }
 
-////// Define an adapter iterator for extracting statistics from an iterator of
+/// Define an adapter iterator for extracting statistics from an iterator of
 /// `ParquetStatistics`
 ///
 ///
@@ -164,135 +163,78 @@ make_stats_iterator!(
     [u8]
 );
 
-/// Extract a single min/max statistics from a [`ParquetStatistics`] object
+/// Special iterator adapter for extracting i128 values from from an iterator of
+/// `ParquetStatistics`
+///
+/// Handles checking if the statistics are present and valid with the correct type.
+///
+/// Depending on the parquet file, the statistics for `Decimal128` can be stored as
+/// `Int32`, `Int64` or `ByteArray` or `FixedSizeByteArray` :mindblown:
 ///
-/// * `$column_statistics` is the `ParquetStatistics` object
-/// * `$func is the function` (`min`/`max`) to call to get the value
-/// * `$bytes_func` is the function (`min_bytes`/`max_bytes`) to call to get the value as bytes
-/// * `$target_arrow_type` is the [`DataType`] of the target statistics
-macro_rules! get_statistic {
-    ($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{
-        if !$column_statistics.has_min_max_set() {
-            return None;
+/// This iterator handles all cases, extracting the values
+/// and converting it to `i128`.
+///
+/// Parameters:
+/// * `$iterator_type` is the name of the iterator type (e.g. `MinBooleanStatsIterator`)
+/// * `$func` is the function to call to get the value (e.g. `min` or `max`)
+/// * `$bytes_func` is the function to call to get the value as bytes (e.g. `min_bytes` or `max_bytes`)
+macro_rules! make_decimal_stats_iterator {
+    ($iterator_type:ident, $func:ident, $bytes_func:ident) => {
+        struct $iterator_type<'a, I>
+        where
+            I: Iterator<Item = Option<&'a ParquetStatistics>>,
+        {
+            iter: I,
         }
-        match $column_statistics {
-            ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))),
-            ParquetStatistics::Int32(s) => {
-                match $target_arrow_type {
-                    // int32 to decimal with the precision and scale
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(*s.$func() as i128),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::Int8) => {
-                        Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::Int16) => {
-                        Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt8) => {
-                        Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt16) => {
-                        Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
-                    }
-                    Some(DataType::UInt32) => {
-                        Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
-                    }
-                    Some(DataType::Date32) => {
-                        Some(ScalarValue::Date32(Some(*s.$func())))
-                    }
-                    Some(DataType::Date64) => {
-                        Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)))
-                    }
-                    _ => Some(ScalarValue::Int32(Some(*s.$func()))),
-                }
-            }
-            ParquetStatistics::Int64(s) => {
-                match $target_arrow_type {
-                    // int64 to decimal with the precision and scale
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(*s.$func() as i128),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::UInt64) => {
-                        Some(ScalarValue::UInt64(Some((*s.$func()) as u64)))
-                    }
-                    _ => Some(ScalarValue::Int64(Some(*s.$func()))),
-                }
+
+        impl<'a, I> $iterator_type<'a, I>
+        where
+            I: Iterator<Item = Option<&'a ParquetStatistics>>,
+        {
+            fn new(iter: I) -> Self {
+                Self { iter }
             }
-            // 96 bit ints not supported
-            ParquetStatistics::Int96(_) => None,
-            ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))),
-            ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))),
-            ParquetStatistics::ByteArray(s) => {
-                match $target_arrow_type {
-                    // decimal data type
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(from_bytes_to_i128(s.$bytes_func())),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::Binary) => {
-                        Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec())))
-                    }
-                    _ => {
-                        let s = std::str::from_utf8(s.$bytes_func())
-                            .map(|s| s.to_string())
-                            .ok();
-                        if s.is_none() {
-                            log::debug!(
-                                "Utf8 statistics is a non-UTF8 value, ignoring it."
-                            );
+        }
+
+        impl<'a, I> Iterator for $iterator_type<'a, I>
+        where
+            I: Iterator<Item = Option<&'a ParquetStatistics>>,
+        {
+            type Item = Option<i128>;
+
+            fn next(&mut self) -> Option<Self::Item> {
+                let next = self.iter.next();
+                next.map(|x| {
+                    x.and_then(|stats| match stats {
+                        ParquetStatistics::Int32(s) if stats.has_min_max_set() => {
+                            Some(*s.$func() as i128)
                         }
-                        Some(ScalarValue::Utf8(s))
-                    }
-                }
+                        ParquetStatistics::Int64(s) if stats.has_min_max_set() => {
+                            Some(*s.$func() as i128)
+                        }
+                        ParquetStatistics::ByteArray(s) if stats.has_min_max_set() => {
+                            Some(from_bytes_to_i128(s.$bytes_func()))
+                        }
+                        ParquetStatistics::FixedLenByteArray(s)
+                            if stats.has_min_max_set() =>
+                        {
+                            Some(from_bytes_to_i128(s.$bytes_func()))
+                        }
+                        _ => None,
+                    })
+                })
             }
-            // type not fully supported yet
-            ParquetStatistics::FixedLenByteArray(s) => {
-                match $target_arrow_type {
-                    // just support specific logical data types, there are others each
-                    // with their own ordering
-                    Some(DataType::Decimal128(precision, scale)) => {
-                        Some(ScalarValue::Decimal128(
-                            Some(from_bytes_to_i128(s.$bytes_func())),
-                            *precision,
-                            *scale,
-                        ))
-                    }
-                    Some(DataType::FixedSizeBinary(size)) => {
-                        let value = s.$bytes_func().to_vec();
-                        let value = if value.len().try_into() == Ok(*size) {
-                            Some(value)
-                        } else {
-                            log::debug!(
-                                "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
-                                size,
-                                value.len(),
-                            );
-                            None
-                        };
-                        Some(ScalarValue::FixedSizeBinary(
-                            *size,
-                            value,
-                        ))
-                    }
-                    _ => None,
-                }
+
+            fn size_hint(&self) -> (usize, Option<usize>) {
+                self.iter.size_hint()
             }
         }
-    }};
+    };
 }
 
+make_decimal_stats_iterator!(MinDecimal128StatsIterator, min, min_bytes);
+make_decimal_stats_iterator!(MaxDecimal128StatsIterator, max, max_bytes);
+
 /// Lookups up the parquet column by name
 ///
 /// Returns the parquet column index and the corresponding arrow field
@@ -429,11 +371,16 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
                 })
             }).collect::<Vec<_>>(),
         ))),
+        DataType::Decimal128(precision, scale) => {
+            let arr = Decimal128Array::from_iter(
+                MinDecimal128StatsIterator::new(iterator)
+            ).with_precision_and_scale(*precision, *scale)?;
+            Ok(Arc::new(arr) as ArrayRef)
+        },
         _ => {
-            let scalars = iterator.map(|x| {
-                x.and_then(|s| get_statistic!(s, min, min_bytes, Some(data_type)))
-            });
-            collect_scalars(data_type, scalars)
+            let len = iterator.count();
+            // don't know how to extract statistics, so return a null array
+            Ok(new_null_array(data_type, len))
         }
     }
 }
@@ -549,26 +496,16 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
                 })
             }).collect::<Vec<_>>(),
         ))),
-        _ => {
-            let scalars = iterator.map(|x| {
-                x.and_then(|s| get_statistic!(s, max, max_bytes, Some(data_type)))
-            });
-            collect_scalars(data_type, scalars)
+        DataType::Decimal128(precision, scale) => {
+                let arr = Decimal128Array::from_iter(
+                    MaxDecimal128StatsIterator::new(iterator)
+                ).with_precision_and_scale(*precision, *scale)?;
+                Ok(Arc::new(arr) as ArrayRef)
         }
-    }
-}
-
-/// Builds an array from an iterator of ScalarValue
-fn collect_scalars<I: Iterator<Item = Option<ScalarValue>>>(
-    data_type: &DataType,
-    iterator: I,
-) -> Result<ArrayRef> {
-    let mut scalars = iterator.peekable();
-    match scalars.peek().is_none() {
-        true => Ok(new_empty_array(data_type)),
-        false => {
-            let null = ScalarValue::try_from(data_type)?;
-            ScalarValue::iter_to_array(scalars.map(|x| x.unwrap_or_else(|| null.clone())))
+        _ => {
+            let len = iterator.count();
+            // don't know how to extract statistics, so return a null array
+            Ok(new_null_array(data_type, len))
         }
     }
 }

From 26f5431c3998d4ee54b76a15ae0d239101f8cada Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Thu, 30 May 2024 13:37:41 -0400
Subject: [PATCH 09/19] clippy

---
 .../src/datasource/physical_plan/parquet/statistics.rs    | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index d517ce3cfabd6..0b139aa3ad991 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -21,7 +21,7 @@
 
 use arrow::{array::ArrayRef, datatypes::DataType};
 use arrow_array::{
-    new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
+    new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
     Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
     Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array,
     UInt64Array, UInt8Array,
@@ -661,11 +661,7 @@ mod test {
     use super::*;
     use arrow::compute::kernels::cast_utils::Parser;
     use arrow::datatypes::{Date32Type, Date64Type};
-    use arrow_array::{
-        new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array,
-        Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
-        Int8Array, RecordBatch, StringArray, StructArray, TimestampNanosecondArray,
-    };
+    use arrow_array::{new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, new_empty_array};
     use arrow_schema::{Field, SchemaRef};
     use bytes::Bytes;
     use datafusion_common::test_util::parquet_test_data;

From 860cb009b8601f5c7f2a784f0d7803cf729c271b Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Thu, 30 May 2024 13:47:50 -0400
Subject: [PATCH 10/19] Simplify

---
 .../physical_plan/parquet/statistics.rs       | 41 ++++++++++---------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 0b139aa3ad991..b76434353777a 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -21,10 +21,9 @@
 
 use arrow::{array::ArrayRef, datatypes::DataType};
 use arrow_array::{
-    new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
-    Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
-    Int32Array, Int64Array, Int8Array, StringArray, UInt16Array, UInt32Array,
-    UInt64Array, UInt8Array,
+    new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array,
+    FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
+    Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_schema::{Field, FieldRef, Schema};
 use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
@@ -205,22 +204,21 @@ macro_rules! make_decimal_stats_iterator {
             fn next(&mut self) -> Option<Self::Item> {
                 let next = self.iter.next();
                 next.map(|x| {
-                    x.and_then(|stats| match stats {
-                        ParquetStatistics::Int32(s) if stats.has_min_max_set() => {
-                            Some(*s.$func() as i128)
-                        }
-                        ParquetStatistics::Int64(s) if stats.has_min_max_set() => {
-                            Some(*s.$func() as i128)
-                        }
-                        ParquetStatistics::ByteArray(s) if stats.has_min_max_set() => {
-                            Some(from_bytes_to_i128(s.$bytes_func()))
+                    x.and_then(|stats| {
+                        if !stats.has_min_max_set() {
+                            return None;
                         }
-                        ParquetStatistics::FixedLenByteArray(s)
-                            if stats.has_min_max_set() =>
-                        {
-                            Some(from_bytes_to_i128(s.$bytes_func()))
+                        match stats {
+                            ParquetStatistics::Int32(s) => Some(*s.$func() as i128),
+                            ParquetStatistics::Int64(s) => Some(*s.$func() as i128),
+                            ParquetStatistics::ByteArray(s) => {
+                                Some(from_bytes_to_i128(s.$bytes_func()))
+                            }
+                            ParquetStatistics::FixedLenByteArray(s) => {
+                                Some(from_bytes_to_i128(s.$bytes_func()))
+                            }
+                            _ => None,
                         }
-                        _ => None,
                     })
                 })
             }
@@ -661,7 +659,12 @@ mod test {
     use super::*;
     use arrow::compute::kernels::cast_utils::Parser;
     use arrow::datatypes::{Date32Type, Date64Type};
-    use arrow_array::{new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, new_empty_array};
+    use arrow_array::{
+        new_empty_array, new_null_array, Array, BinaryArray, BooleanArray, Date32Array,
+        Date64Array, Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array,
+        Int64Array, Int8Array, RecordBatch, StringArray, StructArray,
+        TimestampNanosecondArray,
+    };
     use arrow_schema::{Field, SchemaRef};
     use bytes::Bytes;
     use datafusion_common::test_util::parquet_test_data;

From 524dc2f4d85228212eb200584c6c719291704672 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Thu, 30 May 2024 18:32:23 -0400
Subject: [PATCH 11/19] Fix dictionary type

---
 .../core/src/datasource/physical_plan/parquet/statistics.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index b76434353777a..178e80d9641f7 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -375,6 +375,9 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
             ).with_precision_and_scale(*precision, *scale)?;
             Ok(Arc::new(arr) as ArrayRef)
         },
+        DataType::Dictionary(_, value_type) => {
+            min_statistics(value_type, iterator)
+        }
         _ => {
             let len = iterator.count();
             // don't know how to extract statistics, so return a null array
@@ -500,6 +503,9 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
                 ).with_precision_and_scale(*precision, *scale)?;
                 Ok(Arc::new(arr) as ArrayRef)
         }
+        DataType::Dictionary(_, value_type) => {
+            max_statistics(value_type, iterator)
+        }
         _ => {
             let len = iterator.count();
             // don't know how to extract statistics, so return a null array

From a7880453fbccdba10e3db2cfff06ccae00e9739c Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Fri, 31 May 2024 07:30:31 +0000
Subject: [PATCH 12/19] Fix incorrect statistics read for timestamp columns in
 parquet

---
 .../physical_plan/parquet/statistics.rs       | 330 ++++++++++++--
 .../core/tests/parquet/arrow_statistics.rs    | 415 +++++++++++++++---
 datafusion/core/tests/parquet/mod.rs          |  52 ++-
 3 files changed, 692 insertions(+), 105 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 178e80d9641f7..8ad2a5de0fcad 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -23,9 +23,11 @@ use arrow::{array::ArrayRef, datatypes::DataType};
 use arrow_array::{
     new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array,
     FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
-    Int8Array, StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+    Int8Array, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray,
+    TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array,
+    UInt64Array, UInt8Array,
 };
-use arrow_schema::{Field, FieldRef, Schema};
+use arrow_schema::{Field, FieldRef, Schema, TimeUnit};
 use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
 use parquet::file::metadata::ParquetMetaData;
 use parquet::file::statistics::Statistics as ParquetStatistics;
@@ -336,9 +338,36 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
             MinInt32StatsIterator::new(iterator)
                 .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)),
         ))),
-        DataType::Timestamp(_, _) => Ok(Arc::new(Int64Array::from_iter(
-            MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
+        DataType::Timestamp(unit, timezone) =>{
+            let iter = MinInt64StatsIterator::new(iterator).map(|x| x.copied());
+
+            Ok(match unit {
+                TimeUnit::Second => {
+                    Arc::new(match timezone {
+                        Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()),
+                        None => TimestampSecondArray::from_iter(iter),
+                    })
+                }
+                TimeUnit::Millisecond => {
+                    Arc::new(match timezone {
+                        Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()),
+                        None => TimestampMillisecondArray::from_iter(iter),
+                    })
+                }
+                TimeUnit::Microsecond => {
+                    Arc::new(match timezone {
+                        Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()),
+                        None => TimestampMicrosecondArray::from_iter(iter),
+                    })
+                }
+                TimeUnit::Nanosecond => {
+                    Arc::new(match timezone {
+                        Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()),
+                        None => TimestampNanosecondArray::from_iter(iter),
+                    })
+                }
+            })
+        },
         DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
             MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())),
         ))),
@@ -373,7 +402,7 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
             let arr = Decimal128Array::from_iter(
                 MinDecimal128StatsIterator::new(iterator)
             ).with_precision_and_scale(*precision, *scale)?;
-            Ok(Arc::new(arr) as ArrayRef)
+            Ok(Arc::new(arr))
         },
         DataType::Dictionary(_, value_type) => {
             min_statistics(value_type, iterator)
@@ -464,9 +493,36 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
             MaxInt32StatsIterator::new(iterator)
                 .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)),
         ))),
-        DataType::Timestamp(_, _) => Ok(Arc::new(Int64Array::from_iter(
-            MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
+        DataType::Timestamp(unit, timezone) => {
+            let iter = MaxInt64StatsIterator::new(iterator).map(|x| x.copied());
+
+            Ok(match unit {
+                TimeUnit::Second => {
+                    Arc::new(match timezone {
+                        Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()),
+                        None => TimestampSecondArray::from_iter(iter),
+                    })
+                }
+                TimeUnit::Millisecond => {
+                    Arc::new(match timezone {
+                        Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()),
+                        None => TimestampMillisecondArray::from_iter(iter),
+                    })
+                }
+                TimeUnit::Microsecond => {
+                    Arc::new(match timezone {
+                        Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()),
+                        None => TimestampMicrosecondArray::from_iter(iter),
+                    })
+                }
+                TimeUnit::Nanosecond => {
+                    Arc::new(match timezone {
+                        Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()),
+                        None => TimestampNanosecondArray::from_iter(iter),
+                    })
+                }
+            })
+        }
         DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
             MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())),
         ))),
@@ -813,22 +869,207 @@ mod test {
     // Due to https://github.com/apache/datafusion/issues/8295
     fn roundtrip_timestamp() {
         Test {
-            input: timestamp_array([
-                // row group 1
-                Some(1),
-                None,
-                Some(3),
-                // row group 2
-                Some(9),
-                Some(5),
+            input: timestamp_seconds_array(
+                [
+                    // row group 1
+                    Some(1),
+                    None,
+                    Some(3),
+                    // row group 2
+                    Some(9),
+                    Some(5),
+                    None,
+                    // row group 3
+                    None,
+                    None,
+                    None,
+                ],
                 None,
-                // row group 3
+            ),
+            expected_min: timestamp_seconds_array([Some(1), Some(5), None], None),
+            expected_max: timestamp_seconds_array([Some(3), Some(9), None], None),
+        }
+        .run();
+
+        Test {
+            input: timestamp_milliseconds_array(
+                [
+                    // row group 1
+                    Some(1),
+                    None,
+                    Some(3),
+                    // row group 2
+                    Some(9),
+                    Some(5),
+                    None,
+                    // row group 3
+                    None,
+                    None,
+                    None,
+                ],
                 None,
+            ),
+            expected_min: timestamp_milliseconds_array([Some(1), Some(5), None], None),
+            expected_max: timestamp_milliseconds_array([Some(3), Some(9), None], None),
+        }
+        .run();
+
+        Test {
+            input: timestamp_microseconds_array(
+                [
+                    // row group 1
+                    Some(1),
+                    None,
+                    Some(3),
+                    // row group 2
+                    Some(9),
+                    Some(5),
+                    None,
+                    // row group 3
+                    None,
+                    None,
+                    None,
+                ],
                 None,
+            ),
+            expected_min: timestamp_microseconds_array([Some(1), Some(5), None], None),
+            expected_max: timestamp_microseconds_array([Some(3), Some(9), None], None),
+        }
+        .run();
+
+        Test {
+            input: timestamp_nanoseconds_array(
+                [
+                    // row group 1
+                    Some(1),
+                    None,
+                    Some(3),
+                    // row group 2
+                    Some(9),
+                    Some(5),
+                    None,
+                    // row group 3
+                    None,
+                    None,
+                    None,
+                ],
                 None,
-            ]),
-            expected_min: i64_array([Some(1), Some(5), None]),
-            expected_max: i64_array([Some(3), Some(9), None]),
+            ),
+            expected_min: timestamp_nanoseconds_array([Some(1), Some(5), None], None),
+            expected_max: timestamp_nanoseconds_array([Some(3), Some(9), None], None),
+        }
+        .run()
+    }
+
+    #[test]
+    fn roundtrip_timestamp_timezoned() {
+        Test {
+            input: timestamp_seconds_array(
+                [
+                    // row group 1
+                    Some(1),
+                    None,
+                    Some(3),
+                    // row group 2
+                    Some(9),
+                    Some(5),
+                    None,
+                    // row group 3
+                    None,
+                    None,
+                    None,
+                ],
+                Some("UTC"),
+            ),
+            expected_min: timestamp_seconds_array([Some(1), Some(5), None], Some("UTC")),
+            expected_max: timestamp_seconds_array([Some(3), Some(9), None], Some("UTC")),
+        }
+        .run();
+
+        Test {
+            input: timestamp_milliseconds_array(
+                [
+                    // row group 1
+                    Some(1),
+                    None,
+                    Some(3),
+                    // row group 2
+                    Some(9),
+                    Some(5),
+                    None,
+                    // row group 3
+                    None,
+                    None,
+                    None,
+                ],
+                Some("UTC"),
+            ),
+            expected_min: timestamp_milliseconds_array(
+                [Some(1), Some(5), None],
+                Some("UTC"),
+            ),
+            expected_max: timestamp_milliseconds_array(
+                [Some(3), Some(9), None],
+                Some("UTC"),
+            ),
+        }
+        .run();
+
+        Test {
+            input: timestamp_microseconds_array(
+                [
+                    // row group 1
+                    Some(1),
+                    None,
+                    Some(3),
+                    // row group 2
+                    Some(9),
+                    Some(5),
+                    None,
+                    // row group 3
+                    None,
+                    None,
+                    None,
+                ],
+                Some("UTC"),
+            ),
+            expected_min: timestamp_microseconds_array(
+                [Some(1), Some(5), None],
+                Some("UTC"),
+            ),
+            expected_max: timestamp_microseconds_array(
+                [Some(3), Some(9), None],
+                Some("UTC"),
+            ),
+        }
+        .run();
+
+        Test {
+            input: timestamp_nanoseconds_array(
+                [
+                    // row group 1
+                    Some(1),
+                    None,
+                    Some(3),
+                    // row group 2
+                    Some(9),
+                    Some(5),
+                    None,
+                    // row group 3
+                    None,
+                    None,
+                    None,
+                ],
+                Some("UTC"),
+            ),
+            expected_min: timestamp_nanoseconds_array(
+                [Some(1), Some(5), None],
+                Some("UTC"),
+            ),
+            expected_max: timestamp_nanoseconds_array(
+                [Some(3), Some(9), None],
+                Some("UTC"),
+            ),
         }
         .run()
     }
@@ -1185,8 +1426,8 @@ mod test {
             // File has no min/max for timestamp_col
             .with_column(ExpectedColumn {
                 name: "timestamp_col",
-                expected_min: i64_array([None]),
-                expected_max: i64_array([None]),
+                expected_min: timestamp_nanoseconds_array([None], None),
+                expected_max: timestamp_nanoseconds_array([None], None),
             })
             .with_column(ExpectedColumn {
                 name: "year",
@@ -1406,9 +1647,48 @@ mod test {
         Arc::new(array)
     }
 
-    fn timestamp_array(input: impl IntoIterator<Item = Option<i64>>) -> ArrayRef {
+    fn timestamp_seconds_array(
+        input: impl IntoIterator<Item = Option<i64>>,
+        timzezone: Option<&str>,
+    ) -> ArrayRef {
+        let array: TimestampSecondArray = input.into_iter().collect();
+        match timzezone {
+            Some(tz) => Arc::new(array.with_timezone(tz)),
+            None => Arc::new(array),
+        }
+    }
+
+    fn timestamp_milliseconds_array(
+        input: impl IntoIterator<Item = Option<i64>>,
+        timzezone: Option<&str>,
+    ) -> ArrayRef {
+        let array: TimestampMillisecondArray = input.into_iter().collect();
+        match timzezone {
+            Some(tz) => Arc::new(array.with_timezone(tz)),
+            None => Arc::new(array),
+        }
+    }
+
+    fn timestamp_microseconds_array(
+        input: impl IntoIterator<Item = Option<i64>>,
+        timzezone: Option<&str>,
+    ) -> ArrayRef {
+        let array: TimestampMicrosecondArray = input.into_iter().collect();
+        match timzezone {
+            Some(tz) => Arc::new(array.with_timezone(tz)),
+            None => Arc::new(array),
+        }
+    }
+
+    fn timestamp_nanoseconds_array(
+        input: impl IntoIterator<Item = Option<i64>>,
+        timzezone: Option<&str>,
+    ) -> ArrayRef {
         let array: TimestampNanosecondArray = input.into_iter().collect();
-        Arc::new(array)
+        match timzezone {
+            Some(tz) => Arc::new(array.with_timezone(tz)),
+            None => Arc::new(array),
+        }
     }
 
     fn utf8_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs
index eebf3447cbe9f..fb155be639aab 100644
--- a/datafusion/core/tests/parquet/arrow_statistics.rs
+++ b/datafusion/core/tests/parquet/arrow_statistics.rs
@@ -22,12 +22,16 @@ use std::fs::File;
 use std::sync::Arc;
 
 use arrow::compute::kernels::cast_utils::Parser;
-use arrow::datatypes::{Date32Type, Date64Type};
+use arrow::datatypes::{
+    Date32Type, Date64Type, TimestampMicrosecondType, TimestampMillisecondType,
+    TimestampNanosecondType, TimestampSecondType,
+};
 use arrow_array::{
     make_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
     Decimal128Array, FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array,
-    Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, UInt16Array,
-    UInt32Array, UInt64Array, UInt8Array,
+    Int32Array, Int64Array, Int8Array, RecordBatch, StringArray,
+    TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+    TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_schema::{DataType, Field, Schema};
 use datafusion::datasource::physical_plan::parquet::{
@@ -456,36 +460,42 @@ async fn test_int_8() {
 // timestamp
 #[tokio::test]
 async fn test_timestamp() {
-    // This creates a parquet files of 5 columns named "nanos", "micros", "millis", "seconds", "names"
+    // This creates a parquet files of 9 columns named "nanos", "nanos_timezoned", "micros", "micros_timezoned", "millis", "millis_timezoned", "seconds", "seconds_timezoned", "names"
     // "nanos" --> TimestampNanosecondArray
+    // "nanos_timezoned" --> TimestampNanosecondArray
     // "micros" --> TimestampMicrosecondArray
+    // "micros_timezoned" --> TimestampMicrosecondArray
     // "millis" --> TimestampMillisecondArray
+    // "millis_timezoned" --> TimestampMillisecondArray
     // "seconds" --> TimestampSecondArray
+    // "seconds_timezoned" --> TimestampSecondArray
     // "names" --> StringArray
     //
     // The file is created by 4 record batches, each has 5 rowws.
     // Since the row group isze is set to 5, those 4 batches will go into 4 row groups
-    // This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64"
+    // This creates a parquet files of 4 columns named "nanos", "nanos_timezoned", "micros", "micros_timezoned", "millis", "millis_timezoned", "seconds", "seconds_timezoned"
     let reader = TestReader {
         scenario: Scenario::Timestamps,
         row_per_group: 5,
     };
 
+    let tz = "Pacific/Efate";
+
     Test {
         reader: reader.build().await,
-        // mins are [1577840461000000000, 1577840471000000000, 1577841061000000000, 1578704461000000000,]
-        expected_min: Arc::new(Int64Array::from(vec![
-            1577840461000000000,
-            1577840471000000000,
-            1577841061000000000,
-            1578704461000000000,
+        // mins are [2020-01-01T01:01:01, 2020-01-01T01:01:11, 2020-01-01T01:11:01, 2020-01-10T01:01:01]
+        expected_min: Arc::new(TimestampNanosecondArray::from(vec![
+            TimestampNanosecondType::parse("2020-01-01T01:01:01"),
+            TimestampNanosecondType::parse("2020-01-01T01:01:11"),
+            TimestampNanosecondType::parse("2020-01-01T01:11:01"),
+            TimestampNanosecondType::parse("2020-01-11T01:01:01"),
         ])),
-        // maxes are [1577926861000000000, 1577926871000000000, 1577927461000000000, 1578790861000000000,]
-        expected_max: Arc::new(Int64Array::from(vec![
-            1577926861000000000,
-            1577926871000000000,
-            1577927461000000000,
-            1578790861000000000,
+        // maxes are [2020-01-02T01:01:01, 2020-01-02T01:01:11, 2020-01-02T01:11:01, 2020-01-12T01:01:01]
+        expected_max: Arc::new(TimestampNanosecondArray::from(vec![
+            TimestampNanosecondType::parse("2020-01-02T01:01:01"),
+            TimestampNanosecondType::parse("2020-01-02T01:01:11"),
+            TimestampNanosecondType::parse("2020-01-02T01:11:01"),
+            TimestampNanosecondType::parse("2020-01-12T01:01:01"),
         ])),
         // nulls are [1, 1, 1, 1]
         expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
@@ -495,21 +505,52 @@ async fn test_timestamp() {
     }
     .run();
 
-    // micros
+    Test {
+        reader: reader.build().await,
+        // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:01:11+11:00, 2020-01-01T12:11:01+11:00, 2020-01-10T12:01:01+11:00]
+        expected_min: Arc::new(
+            TimestampNanosecondArray::from(vec![
+                TimestampNanosecondType::parse("2020-01-01T01:01:01"),
+                TimestampNanosecondType::parse("2020-01-01T01:01:11"),
+                TimestampNanosecondType::parse("2020-01-01T01:11:01"),
+                TimestampNanosecondType::parse("2020-01-11T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // maxes are [2020-01-02T12:01:01+11:00, 2020-01-02T12:01:11+11:00, 2020-01-02T12:11:01+11:00, 2020-01-12T12:01:01+11:00]
+        expected_max: Arc::new(
+            TimestampNanosecondArray::from(vec![
+                TimestampNanosecondType::parse("2020-01-02T01:01:01"),
+                TimestampNanosecondType::parse("2020-01-02T01:01:11"),
+                TimestampNanosecondType::parse("2020-01-02T01:11:01"),
+                TimestampNanosecondType::parse("2020-01-12T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // nulls are [1, 1, 1, 1]
+        expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
+        // row counts are [5, 5, 5, 5]
+        expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
+        column_name: "nanos_timezoned",
+    }
+    .run();
 
+    // micros
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![
-            1577840461000000,
-            1577840471000000,
-            1577841061000000,
-            1578704461000000,
+        // mins are [2020-01-01T01:01:01, 2020-01-01T01:01:11, 2020-01-01T01:11:01, 2020-01-10T01:01:01]
+        expected_min: Arc::new(TimestampMicrosecondArray::from(vec![
+            TimestampMicrosecondType::parse("2020-01-01T01:01:01"),
+            TimestampMicrosecondType::parse("2020-01-01T01:01:11"),
+            TimestampMicrosecondType::parse("2020-01-01T01:11:01"),
+            TimestampMicrosecondType::parse("2020-01-11T01:01:01"),
         ])),
-        expected_max: Arc::new(Int64Array::from(vec![
-            1577926861000000,
-            1577926871000000,
-            1577927461000000,
-            1578790861000000,
+        // maxes are [2020-01-02T01:01:01, 2020-01-02T01:01:11, 2020-01-02T01:11:01, 2020-01-12T01:01:01]
+        expected_max: Arc::new(TimestampMicrosecondArray::from(vec![
+            TimestampMicrosecondType::parse("2020-01-02T01:01:01"),
+            TimestampMicrosecondType::parse("2020-01-02T01:01:11"),
+            TimestampMicrosecondType::parse("2020-01-02T01:11:01"),
+            TimestampMicrosecondType::parse("2020-01-12T01:01:01"),
         ])),
         expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
         expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
@@ -517,20 +558,52 @@ async fn test_timestamp() {
     }
     .run();
 
+    Test {
+        reader: reader.build().await,
+        // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:01:11+11:00, 2020-01-01T12:11:01+11:00, 2020-01-10T12:01:01+11:00]
+        expected_min: Arc::new(
+            TimestampMicrosecondArray::from(vec![
+                TimestampMicrosecondType::parse("2020-01-01T01:01:01"),
+                TimestampMicrosecondType::parse("2020-01-01T01:01:11"),
+                TimestampMicrosecondType::parse("2020-01-01T01:11:01"),
+                TimestampMicrosecondType::parse("2020-01-11T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // maxes are [2020-01-02T12:01:01+11:00, 2020-01-02T12:01:11+11:00, 2020-01-02T12:11:01+11:00, 2020-01-12T12:01:01+11:00]
+        expected_max: Arc::new(
+            TimestampMicrosecondArray::from(vec![
+                TimestampMicrosecondType::parse("2020-01-02T01:01:01"),
+                TimestampMicrosecondType::parse("2020-01-02T01:01:11"),
+                TimestampMicrosecondType::parse("2020-01-02T01:11:01"),
+                TimestampMicrosecondType::parse("2020-01-12T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // nulls are [1, 1, 1, 1]
+        expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
+        // row counts are [5, 5, 5, 5]
+        expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
+        column_name: "micros_timezoned",
+    }
+    .run();
+
     // millis
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![
-            1577840461000,
-            1577840471000,
-            1577841061000,
-            1578704461000,
+        // mins are [2020-01-01T01:01:01, 2020-01-01T01:01:11, 2020-01-01T01:11:01, 2020-01-10T01:01:01]
+        expected_min: Arc::new(TimestampMillisecondArray::from(vec![
+            TimestampMillisecondType::parse("2020-01-01T01:01:01"),
+            TimestampMillisecondType::parse("2020-01-01T01:01:11"),
+            TimestampMillisecondType::parse("2020-01-01T01:11:01"),
+            TimestampMillisecondType::parse("2020-01-11T01:01:01"),
         ])),
-        expected_max: Arc::new(Int64Array::from(vec![
-            1577926861000,
-            1577926871000,
-            1577927461000,
-            1578790861000,
+        // maxes are [2020-01-02T01:01:01, 2020-01-02T01:01:11, 2020-01-02T01:11:01, 2020-01-12T01:01:01]
+        expected_max: Arc::new(TimestampMillisecondArray::from(vec![
+            TimestampMillisecondType::parse("2020-01-02T01:01:01"),
+            TimestampMillisecondType::parse("2020-01-02T01:01:11"),
+            TimestampMillisecondType::parse("2020-01-02T01:11:01"),
+            TimestampMillisecondType::parse("2020-01-12T01:01:01"),
         ])),
         expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
         expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
@@ -538,30 +611,102 @@ async fn test_timestamp() {
     }
     .run();
 
+    Test {
+        reader: reader.build().await,
+        // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:01:11+11:00, 2020-01-01T12:11:01+11:00, 2020-01-10T12:01:01+11:00]
+        expected_min: Arc::new(
+            TimestampMillisecondArray::from(vec![
+                TimestampMillisecondType::parse("2020-01-01T01:01:01"),
+                TimestampMillisecondType::parse("2020-01-01T01:01:11"),
+                TimestampMillisecondType::parse("2020-01-01T01:11:01"),
+                TimestampMillisecondType::parse("2020-01-11T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // maxes are [2020-01-02T12:01:01+11:00, 2020-01-02T12:01:11+11:00, 2020-01-02T12:11:01+11:00, 2020-01-12T12:01:01+11:00]
+        expected_max: Arc::new(
+            TimestampMillisecondArray::from(vec![
+                TimestampMillisecondType::parse("2020-01-02T01:01:01"),
+                TimestampMillisecondType::parse("2020-01-02T01:01:11"),
+                TimestampMillisecondType::parse("2020-01-02T01:11:01"),
+                TimestampMillisecondType::parse("2020-01-12T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // nulls are [1, 1, 1, 1]
+        expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
+        // row counts are [5, 5, 5, 5]
+        expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
+        column_name: "millis_timezoned",
+    }
+    .run();
+
     // seconds
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![
-            1577840461, 1577840471, 1577841061, 1578704461,
+        // mins are [2020-01-01T01:01:01, 2020-01-01T01:01:11, 2020-01-01T01:11:01, 2020-01-10T01:01:01]
+        expected_min: Arc::new(TimestampSecondArray::from(vec![
+            TimestampSecondType::parse("2020-01-01T01:01:01"),
+            TimestampSecondType::parse("2020-01-01T01:01:11"),
+            TimestampSecondType::parse("2020-01-01T01:11:01"),
+            TimestampSecondType::parse("2020-01-11T01:01:01"),
         ])),
-        expected_max: Arc::new(Int64Array::from(vec![
-            1577926861, 1577926871, 1577927461, 1578790861,
+        // maxes are [2020-01-02T01:01:01, 2020-01-02T01:01:11, 2020-01-02T01:11:01, 2020-01-12T01:01:01]
+        expected_max: Arc::new(TimestampSecondArray::from(vec![
+            TimestampSecondType::parse("2020-01-02T01:01:01"),
+            TimestampSecondType::parse("2020-01-02T01:01:11"),
+            TimestampSecondType::parse("2020-01-02T01:11:01"),
+            TimestampSecondType::parse("2020-01-12T01:01:01"),
         ])),
         expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
         expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
         column_name: "seconds",
     }
     .run();
+
+    Test {
+        reader: reader.build().await,
+        // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:01:11+11:00, 2020-01-01T12:11:01+11:00, 2020-01-10T12:01:01+11:00]
+        expected_min: Arc::new(
+            TimestampSecondArray::from(vec![
+                TimestampSecondType::parse("2020-01-01T01:01:01"),
+                TimestampSecondType::parse("2020-01-01T01:01:11"),
+                TimestampSecondType::parse("2020-01-01T01:11:01"),
+                TimestampSecondType::parse("2020-01-11T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // maxes are [2020-01-02T12:01:01+11:00, 2020-01-02T12:01:11+11:00, 2020-01-02T12:11:01+11:00, 2020-01-12T12:01:01+11:00]
+        expected_max: Arc::new(
+            TimestampSecondArray::from(vec![
+                TimestampSecondType::parse("2020-01-02T01:01:01"),
+                TimestampSecondType::parse("2020-01-02T01:01:11"),
+                TimestampSecondType::parse("2020-01-02T01:11:01"),
+                TimestampSecondType::parse("2020-01-12T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // nulls are [1, 1, 1, 1]
+        expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]),
+        // row counts are [5, 5, 5, 5]
+        expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
+        column_name: "seconds_timezoned",
+    }
+    .run();
 }
 
 // timestamp with different row group sizes
 #[tokio::test]
 async fn test_timestamp_diff_rg_sizes() {
-    // This creates a parquet files of 5 columns named "nanos", "micros", "millis", "seconds", "names"
+    // This creates a parquet files of 9 columns named "nanos", "nanos_timezoned", "micros", "micros_timezoned", "millis", "millis_timezoned", "seconds", "seconds_timezoned", "names"
     // "nanos" --> TimestampNanosecondArray
+    // "nanos_timezoned" --> TimestampNanosecondArray
     // "micros" --> TimestampMicrosecondArray
+    // "micros_timezoned" --> TimestampMicrosecondArray
     // "millis" --> TimestampMillisecondArray
+    // "millis_timezoned" --> TimestampMillisecondArray
     // "seconds" --> TimestampSecondArray
+    // "seconds_timezoned" --> TimestampSecondArray
     // "names" --> StringArray
     //
     // The file is created by 4 record batches (each has a null row), each has 5 rows but then will be split into 3 row groups with size 8, 8, 4
@@ -570,19 +715,21 @@ async fn test_timestamp_diff_rg_sizes() {
         row_per_group: 8, // note that the row group size is 8
     };
 
+    let tz = "Pacific/Efate";
+
     Test {
         reader: reader.build().await,
-        // mins are [1577840461000000000, 1577841061000000000, 1578704521000000000]
-        expected_min: Arc::new(Int64Array::from(vec![
-            1577840461000000000,
-            1577841061000000000,
-            1578704521000000000,
+        // mins are [2020-01-01T01:01:01, 2020-01-01T01:11:01, 2020-01-11T01:02:01]
+        expected_min: Arc::new(TimestampNanosecondArray::from(vec![
+            TimestampNanosecondType::parse("2020-01-01T01:01:01"),
+            TimestampNanosecondType::parse("2020-01-01T01:11:01"),
+            TimestampNanosecondType::parse("2020-01-11T01:02:01"),
         ])),
-        // maxes are [1577926861000000000, 1578704461000000000, 157879086100000000]
-        expected_max: Arc::new(Int64Array::from(vec![
-            1577926861000000000,
-            1578704461000000000,
-            1578790861000000000,
+        // maxes are [2020-01-02T01:01:01, 2020-01-11T01:01:01, 2020-01-12T01:01:01]
+        expected_max: Arc::new(TimestampNanosecondArray::from(vec![
+            TimestampNanosecondType::parse("2020-01-02T01:01:01"),
+            TimestampNanosecondType::parse("2020-01-11T01:01:01"),
+            TimestampNanosecondType::parse("2020-01-12T01:01:01"),
         ])),
         // nulls are [1, 2, 1]
         expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
@@ -592,18 +739,48 @@ async fn test_timestamp_diff_rg_sizes() {
     }
     .run();
 
+    Test {
+        reader: reader.build().await,
+        // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:11:01+11:00, 2020-01-11T12:02:01+11:00]
+        expected_min: Arc::new(
+            TimestampNanosecondArray::from(vec![
+                TimestampNanosecondType::parse("2020-01-01T01:01:01"),
+                TimestampNanosecondType::parse("2020-01-01T01:11:01"),
+                TimestampNanosecondType::parse("2020-01-11T01:02:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // maxes are [2020-01-02T01:01:01+11:00, 2020-01-11T01:01:01+11:00, 2020-01-12T01:01:01+11:00]
+        expected_max: Arc::new(
+            TimestampNanosecondArray::from(vec![
+                TimestampNanosecondType::parse("2020-01-02T01:01:01"),
+                TimestampNanosecondType::parse("2020-01-11T01:01:01"),
+                TimestampNanosecondType::parse("2020-01-12T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // nulls are [1, 2, 1]
+        expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
+        // row counts are [8, 8, 4]
+        expected_row_counts: UInt64Array::from(vec![8, 8, 4]),
+        column_name: "nanos_timezoned",
+    }
+    .run();
+
     // micros
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![
-            1577840461000000,
-            1577841061000000,
-            1578704521000000,
+        // mins are [2020-01-01T01:01:01, 2020-01-01T01:11:01, 2020-01-11T01:02:01]
+        expected_min: Arc::new(TimestampMicrosecondArray::from(vec![
+            TimestampMicrosecondType::parse("2020-01-01T01:01:01"),
+            TimestampMicrosecondType::parse("2020-01-01T01:11:01"),
+            TimestampMicrosecondType::parse("2020-01-11T01:02:01"),
         ])),
-        expected_max: Arc::new(Int64Array::from(vec![
-            1577926861000000,
-            1578704461000000,
-            1578790861000000,
+        // maxes are [2020-01-02T01:01:01, 2020-01-11T01:01:01, 2020-01-12T01:01:01]
+        expected_max: Arc::new(TimestampMicrosecondArray::from(vec![
+            TimestampMicrosecondType::parse("2020-01-02T01:01:01"),
+            TimestampMicrosecondType::parse("2020-01-11T01:01:01"),
+            TimestampMicrosecondType::parse("2020-01-12T01:01:01"),
         ])),
         expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
         expected_row_counts: UInt64Array::from(vec![8, 8, 4]),
@@ -611,18 +788,48 @@ async fn test_timestamp_diff_rg_sizes() {
     }
     .run();
 
+    Test {
+        reader: reader.build().await,
+        // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:11:01+11:00, 2020-01-11T12:02:01+11:00]
+        expected_min: Arc::new(
+            TimestampMicrosecondArray::from(vec![
+                TimestampMicrosecondType::parse("2020-01-01T01:01:01"),
+                TimestampMicrosecondType::parse("2020-01-01T01:11:01"),
+                TimestampMicrosecondType::parse("2020-01-11T01:02:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // maxes are [2020-01-02T01:01:01+11:00, 2020-01-11T01:01:01+11:00, 2020-01-12T01:01:01+11:00]
+        expected_max: Arc::new(
+            TimestampMicrosecondArray::from(vec![
+                TimestampMicrosecondType::parse("2020-01-02T01:01:01"),
+                TimestampMicrosecondType::parse("2020-01-11T01:01:01"),
+                TimestampMicrosecondType::parse("2020-01-12T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // nulls are [1, 2, 1]
+        expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
+        // row counts are [8, 8, 4]
+        expected_row_counts: UInt64Array::from(vec![8, 8, 4]),
+        column_name: "micros_timezoned",
+    }
+    .run();
+
     // millis
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![
-            1577840461000,
-            1577841061000,
-            1578704521000,
+        // mins are [2020-01-01T01:01:01, 2020-01-01T01:11:01, 2020-01-11T01:02:01]
+        expected_min: Arc::new(TimestampMillisecondArray::from(vec![
+            TimestampMillisecondType::parse("2020-01-01T01:01:01"),
+            TimestampMillisecondType::parse("2020-01-01T01:11:01"),
+            TimestampMillisecondType::parse("2020-01-11T01:02:01"),
         ])),
-        expected_max: Arc::new(Int64Array::from(vec![
-            1577926861000,
-            1578704461000,
-            1578790861000,
+        // maxes are [2020-01-02T01:01:01, 2020-01-11T01:01:01, 2020-01-12T01:01:01]
+        expected_max: Arc::new(TimestampMillisecondArray::from(vec![
+            TimestampMillisecondType::parse("2020-01-02T01:01:01"),
+            TimestampMillisecondType::parse("2020-01-11T01:01:01"),
+            TimestampMillisecondType::parse("2020-01-12T01:01:01"),
         ])),
         expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
         expected_row_counts: UInt64Array::from(vec![8, 8, 4]),
@@ -630,20 +837,82 @@ async fn test_timestamp_diff_rg_sizes() {
     }
     .run();
 
+    Test {
+        reader: reader.build().await,
+        // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:11:01+11:00, 2020-01-11T12:02:01+11:00]
+        expected_min: Arc::new(
+            TimestampMillisecondArray::from(vec![
+                TimestampMillisecondType::parse("2020-01-01T01:01:01"),
+                TimestampMillisecondType::parse("2020-01-01T01:11:01"),
+                TimestampMillisecondType::parse("2020-01-11T01:02:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // maxes are [2020-01-02T01:01:01+11:00, 2020-01-11T01:01:01+11:00, 2020-01-12T01:01:01+11:00]
+        expected_max: Arc::new(
+            TimestampMillisecondArray::from(vec![
+                TimestampMillisecondType::parse("2020-01-02T01:01:01"),
+                TimestampMillisecondType::parse("2020-01-11T01:01:01"),
+                TimestampMillisecondType::parse("2020-01-12T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // nulls are [1, 2, 1]
+        expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
+        // row counts are [8, 8, 4]
+        expected_row_counts: UInt64Array::from(vec![8, 8, 4]),
+        column_name: "millis_timezoned",
+    }
+    .run();
+
     // seconds
     Test {
         reader: reader.build().await,
-        expected_min: Arc::new(Int64Array::from(vec![
-            1577840461, 1577841061, 1578704521,
+        // mins are [2020-01-01T01:01:01, 2020-01-01T01:11:01, 2020-01-11T01:02:01]
+        expected_min: Arc::new(TimestampSecondArray::from(vec![
+            TimestampSecondType::parse("2020-01-01T01:01:01"),
+            TimestampSecondType::parse("2020-01-01T01:11:01"),
+            TimestampSecondType::parse("2020-01-11T01:02:01"),
         ])),
-        expected_max: Arc::new(Int64Array::from(vec![
-            1577926861, 1578704461, 1578790861,
+        // maxes are [2020-01-02T01:01:01, 2020-01-11T01:01:01, 2020-01-12T01:01:01]
+        expected_max: Arc::new(TimestampSecondArray::from(vec![
+            TimestampSecondType::parse("2020-01-02T01:01:01"),
+            TimestampSecondType::parse("2020-01-11T01:01:01"),
+            TimestampSecondType::parse("2020-01-12T01:01:01"),
         ])),
         expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
         expected_row_counts: UInt64Array::from(vec![8, 8, 4]),
         column_name: "seconds",
     }
     .run();
+
+    Test {
+        reader: reader.build().await,
+        // mins are [2020-01-01T12:01:01+11:00, 2020-01-01T12:11:01+11:00, 2020-01-11T12:02:01+11:00]
+        expected_min: Arc::new(
+            TimestampSecondArray::from(vec![
+                TimestampSecondType::parse("2020-01-01T01:01:01"),
+                TimestampSecondType::parse("2020-01-01T01:11:01"),
+                TimestampSecondType::parse("2020-01-11T01:02:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // maxes are [2020-01-02T01:01:01+11:00, 2020-01-11T01:01:01+11:00, 2020-01-12T01:01:01+11:00]
+        expected_max: Arc::new(
+            TimestampSecondArray::from(vec![
+                TimestampSecondType::parse("2020-01-02T01:01:01"),
+                TimestampSecondType::parse("2020-01-11T01:01:01"),
+                TimestampSecondType::parse("2020-01-12T01:01:01"),
+            ])
+            .with_timezone(tz),
+        ),
+        // nulls are [1, 2, 1]
+        expected_null_counts: UInt64Array::from(vec![1, 2, 1]),
+        // row counts are [8, 8, 4]
+        expected_row_counts: UInt64Array::from(vec![8, 8, 4]),
+        column_name: "seconds_timezoned",
+    }
+    .run();
 }
 
 // date with different row group sizes
diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs
index 94ae9ff601ecf..a2afe58ddc4ad 100644
--- a/datafusion/core/tests/parquet/mod.rs
+++ b/datafusion/core/tests/parquet/mod.rs
@@ -332,9 +332,13 @@ fn make_boolean_batch(v: Vec<Option<bool>>) -> RecordBatch {
 ///
 /// Columns are named:
 /// "nanos" --> TimestampNanosecondArray
+/// "nanos_timezoned" --> TimestampNanosecondArray with timezone
 /// "micros" --> TimestampMicrosecondArray
+/// "micros_timezoned" --> TimestampMicrosecondArray with timezone
 /// "millis" --> TimestampMillisecondArray
+/// "millis_timezoned" --> TimestampMillisecondArray with timezone
 /// "seconds" --> TimestampSecondArray
+/// "seconds_timezoned" --> TimestampSecondArray with timezone
 /// "names" --> StringArray
 fn make_timestamp_batch(offset: Duration) -> RecordBatch {
     let ts_strings = vec![
@@ -345,6 +349,8 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch {
         Some("2020-01-02T01:01:01.0000000000001"),
     ];
 
+    let tz_string = "Pacific/Efate";
+
     let offset_nanos = offset.num_nanoseconds().expect("non overflow nanos");
 
     let ts_nanos = ts_strings
@@ -382,19 +388,47 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch {
         .map(|(i, _)| format!("Row {i} + {offset}"))
         .collect::<Vec<_>>();
 
-    let arr_nanos = TimestampNanosecondArray::from(ts_nanos);
-    let arr_micros = TimestampMicrosecondArray::from(ts_micros);
-    let arr_millis = TimestampMillisecondArray::from(ts_millis);
-    let arr_seconds = TimestampSecondArray::from(ts_seconds);
+    let arr_nanos = TimestampNanosecondArray::from(ts_nanos.clone());
+    let arr_nanos_timezoned =
+        TimestampNanosecondArray::from(ts_nanos).with_timezone(tz_string);
+    let arr_micros = TimestampMicrosecondArray::from(ts_micros.clone());
+    let arr_micros_timezoned =
+        TimestampMicrosecondArray::from(ts_micros).with_timezone(tz_string);
+    let arr_millis = TimestampMillisecondArray::from(ts_millis.clone());
+    let arr_millis_timezoned =
+        TimestampMillisecondArray::from(ts_millis).with_timezone(tz_string);
+    let arr_seconds = TimestampSecondArray::from(ts_seconds.clone());
+    let arr_seconds_timezoned =
+        TimestampSecondArray::from(ts_seconds).with_timezone(tz_string);
 
     let names = names.iter().map(|s| s.as_str()).collect::<Vec<_>>();
     let arr_names = StringArray::from(names);
 
     let schema = Schema::new(vec![
         Field::new("nanos", arr_nanos.data_type().clone(), true),
+        Field::new(
+            "nanos_timezoned",
+            arr_nanos_timezoned.data_type().clone(),
+            true,
+        ),
         Field::new("micros", arr_micros.data_type().clone(), true),
+        Field::new(
+            "micros_timezoned",
+            arr_micros_timezoned.data_type().clone(),
+            true,
+        ),
         Field::new("millis", arr_millis.data_type().clone(), true),
+        Field::new(
+            "millis_timezoned",
+            arr_millis_timezoned.data_type().clone(),
+            true,
+        ),
         Field::new("seconds", arr_seconds.data_type().clone(), true),
+        Field::new(
+            "seconds_timezoned",
+            arr_seconds_timezoned.data_type().clone(),
+            true,
+        ),
         Field::new("name", arr_names.data_type().clone(), true),
     ]);
     let schema = Arc::new(schema);
@@ -403,9 +437,13 @@ fn make_timestamp_batch(offset: Duration) -> RecordBatch {
         schema,
         vec![
             Arc::new(arr_nanos),
-            Arc::new(arr_micros),
-            Arc::new(arr_millis),
-            Arc::new(arr_seconds),
+            Arc::new(arr_nanos_timezoned),
+            Arc::new(arr_micros.clone()),
+            Arc::new(arr_micros_timezoned),
+            Arc::new(arr_millis.clone()),
+            Arc::new(arr_millis_timezoned),
+            Arc::new(arr_seconds.clone()),
+            Arc::new(arr_seconds_timezoned),
             Arc::new(arr_names),
         ],
     )

From acd00840c83335686c1f7baeabba75e777385726 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Sat, 1 Jun 2024 16:11:36 -0400
Subject: [PATCH 13/19] Add exhaustive match

---
 .../physical_plan/parquet/statistics.rs       | 44 ++++++++++++++++++-
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 8ad2a5de0fcad..bc14ef9fbe63a 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -407,7 +407,27 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
         DataType::Dictionary(_, value_type) => {
             min_statistics(value_type, iterator)
         }
-        _ => {
+
+        DataType::Map(_,_) |
+        DataType::Float16 |
+        DataType::Time32(_) |
+        DataType::Time64(_) |
+        DataType::Duration(_) |
+        DataType::Interval(_) |
+        DataType::Null |
+        DataType::LargeBinary |
+        DataType::BinaryView |
+        DataType::LargeUtf8 |
+        DataType::Utf8View |
+        DataType::List(_) |
+        DataType::ListView(_) |
+        DataType::FixedSizeList(_, _) |
+        DataType::LargeList(_) |
+        DataType::LargeListView(_) |
+        DataType::Struct(_) |
+        DataType::Union(_, _) |
+        DataType::Decimal256(_, _) |
+        DataType::RunEndEncoded(_, _) => {
             let len = iterator.count();
             // don't know how to extract statistics, so return a null array
             Ok(new_null_array(data_type, len))
@@ -562,7 +582,27 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
         DataType::Dictionary(_, value_type) => {
             max_statistics(value_type, iterator)
         }
-        _ => {
+
+        DataType::Map(_,_) |
+        DataType::Float16 |
+        DataType::Time32(_) |
+        DataType::Time64(_) |
+        DataType::Duration(_) |
+        DataType::Interval(_) |
+        DataType::Null |
+        DataType::LargeBinary |
+        DataType::BinaryView |
+        DataType::LargeUtf8 |
+        DataType::Utf8View |
+        DataType::List(_) |
+        DataType::ListView(_) |
+        DataType::FixedSizeList(_, _) |
+        DataType::LargeList(_) |
+        DataType::LargeListView(_) |
+        DataType::Struct(_) |
+        DataType::Union(_, _) |
+        DataType::Decimal256(_, _) |
+        DataType::RunEndEncoded(_, _) => {
             let len = iterator.count();
             // don't know how to extract statistics, so return a null array
             Ok(new_null_array(data_type, len))

From c6aa31cb57951dabd872fd2f90e567bb161b68b6 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Wed, 5 Jun 2024 03:15:08 +0000
Subject: [PATCH 14/19] Update latest datatypes

---
 .../physical_plan/parquet/statistics.rs       | 182 +++++++++++++++---
 1 file changed, 160 insertions(+), 22 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index cf05a82569500..7db47bb374b30 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -23,8 +23,10 @@ use arrow::datatypes::i256;
 use arrow::{array::ArrayRef, datatypes::DataType};
 use arrow_array::{
     new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array,
-    FixedSizeBinaryArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array,
-    Int8Array, StringArray, TimestampMicrosecondArray, TimestampMillisecondArray,
+    Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array, Float64Array,
+    Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, LargeStringArray,
+    StringArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray,
+    Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray,
     TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array,
     UInt64Array, UInt8Array,
 };
@@ -195,8 +197,10 @@ make_stats_iterator!(
 /// * `$iterator_type` is the name of the iterator type (e.g. `MinBooleanStatsIterator`)
 /// * `$func` is the function to call to get the value (e.g. `min` or `max`)
 /// * `$bytes_func` is the function to call to get the value as bytes (e.g. `min_bytes` or `max_bytes`)
+/// * `$stat_value_type` is the type of the statistics value (e.g. `i128`)
+/// * `convert_func` is the function to convert the bytes to stats value (e.g. `from_bytes_to_i128`)
 macro_rules! make_decimal_stats_iterator {
-    ($iterator_type:ident, $func:ident, $bytes_func:ident) => {
+    ($iterator_type:ident, $func:ident, $bytes_func:ident, $stat_value_type:ident, $convert_func: ident) => {
         struct $iterator_type<'a, I>
         where
             I: Iterator<Item = Option<&'a ParquetStatistics>>,
@@ -217,7 +221,7 @@ macro_rules! make_decimal_stats_iterator {
         where
             I: Iterator<Item = Option<&'a ParquetStatistics>>,
         {
-            type Item = Option<i128>;
+            type Item = Option<$stat_value_type>;
 
             fn next(&mut self) -> Option<Self::Item> {
                 let next = self.iter.next();
@@ -227,13 +231,17 @@ macro_rules! make_decimal_stats_iterator {
                             return None;
                         }
                         match stats {
-                            ParquetStatistics::Int32(s) => Some(*s.$func() as i128),
-                            ParquetStatistics::Int64(s) => Some(*s.$func() as i128),
+                            ParquetStatistics::Int32(s) => {
+                                Some($stat_value_type::from(*s.$func()))
+                            }
+                            ParquetStatistics::Int64(s) => {
+                                Some($stat_value_type::from(*s.$func()))
+                            }
                             ParquetStatistics::ByteArray(s) => {
-                                Some(from_bytes_to_i128(s.$bytes_func()))
+                                Some($convert_func(s.$bytes_func()))
                             }
                             ParquetStatistics::FixedLenByteArray(s) => {
-                                Some(from_bytes_to_i128(s.$bytes_func()))
+                                Some($convert_func(s.$bytes_func()))
                             }
                             _ => None,
                         }
@@ -248,8 +256,34 @@ macro_rules! make_decimal_stats_iterator {
     };
 }
 
-make_decimal_stats_iterator!(MinDecimal128StatsIterator, min, min_bytes);
-make_decimal_stats_iterator!(MaxDecimal128StatsIterator, max, max_bytes);
+make_decimal_stats_iterator!(
+    MinDecimal128StatsIterator,
+    min,
+    min_bytes,
+    i128,
+    from_bytes_to_i128
+);
+make_decimal_stats_iterator!(
+    MaxDecimal128StatsIterator,
+    max,
+    max_bytes,
+    i128,
+    from_bytes_to_i128
+);
+make_decimal_stats_iterator!(
+    MinDecimal256StatsIterator,
+    min,
+    min_bytes,
+    i256,
+    from_bytes_to_i256
+);
+make_decimal_stats_iterator!(
+    MaxDecimal256StatsIterator,
+    max,
+    max_bytes,
+    i256,
+    from_bytes_to_i256
+);
 
 /// Lookups up the parquet column by name
 ///
@@ -341,6 +375,11 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
         DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter(
             MinInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)),
         ))),
+        DataType::Float16 => Ok(Arc::new(Float16Array::from_iter(
+            MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| x.and_then(|x| {
+                from_bytes_to_f16(x)
+            })),
+        ))),
         DataType::Float32 => Ok(Arc::new(Float32Array::from_iter(
             MinFloatStatsIterator::new(iterator).map(|x| x.copied()),
         ))),
@@ -384,9 +423,42 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
                 }
             })
         },
+        DataType::Time32(unit) => {
+            Ok(match unit {
+                TimeUnit::Second =>  Arc::new(Time32SecondArray::from_iter(
+                    MinInt32StatsIterator::new(iterator).map(|x| x.copied()),
+                )),
+                TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter(
+                    MinInt32StatsIterator::new(iterator).map(|x| x.copied()),
+                )),
+                _ => {
+                    let len = iterator.count();
+                    // don't know how to extract statistics, so return a null array
+                    new_null_array(data_type, len)
+                }
+            })
+        },
+        DataType::Time64(unit) => {
+            Ok(match unit {
+                TimeUnit::Microsecond =>  Arc::new(Time64MicrosecondArray::from_iter(
+                    MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
+                )),
+                TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter(
+                    MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
+                )),
+                _ => {
+                    let len = iterator.count();
+                    // don't know how to extract statistics, so return a null array
+                    new_null_array(data_type, len)
+                }
+            })
+        },
         DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
             MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())),
         ))),
+        DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter(
+            MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x|x.to_vec())),
+        ))),
         DataType::Utf8 => Ok(Arc::new(StringArray::from_iter(
             MinByteArrayStatsIterator::new(iterator).map(|x| {
                 x.and_then(|x| {
@@ -398,6 +470,19 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
                 })
             }),
         ))),
+        DataType::LargeUtf8 => {
+            Ok(Arc::new(LargeStringArray::from_iter(
+                MinByteArrayStatsIterator::new(iterator).map(|x| {
+                    x.and_then(|x| {
+                        let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
+                        if res.is_none() {
+                            log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it.");
+                        }
+                        res
+                    })
+                }),
+            )))
+        }
         DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from(
             MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| {
                 x.and_then(|x| {
@@ -420,20 +505,21 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
             ).with_precision_and_scale(*precision, *scale)?;
             Ok(Arc::new(arr))
         },
+        DataType::Decimal256(precision, scale) => {
+            let arr = Decimal256Array::from_iter(
+                MinDecimal256StatsIterator::new(iterator)
+            ).with_precision_and_scale(*precision, *scale)?;
+            Ok(Arc::new(arr))
+        },
         DataType::Dictionary(_, value_type) => {
             min_statistics(value_type, iterator)
         }
 
         DataType::Map(_,_) |
-        DataType::Float16 |
-        DataType::Time32(_) |
-        DataType::Time64(_) |
         DataType::Duration(_) |
         DataType::Interval(_) |
         DataType::Null |
-        DataType::LargeBinary |
         DataType::BinaryView |
-        DataType::LargeUtf8 |
         DataType::Utf8View |
         DataType::List(_) |
         DataType::ListView(_) |
@@ -442,7 +528,6 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
         DataType::LargeListView(_) |
         DataType::Struct(_) |
         DataType::Union(_, _) |
-        DataType::Decimal256(_, _) |
         DataType::RunEndEncoded(_, _) => {
             let len = iterator.count();
             // don't know how to extract statistics, so return a null array
@@ -516,6 +601,11 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
         DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter(
             MaxInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)),
         ))),
+        DataType::Float16 => Ok(Arc::new(Float16Array::from_iter(
+            MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| x.and_then(|x| {
+                from_bytes_to_f16(x)
+            })),
+        ))),
         DataType::Float32 => Ok(Arc::new(Float32Array::from_iter(
             MaxFloatStatsIterator::new(iterator).map(|x| x.copied()),
         ))),
@@ -559,9 +649,42 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
                 }
             })
         }
+        DataType::Time32(unit) => {
+            Ok(match unit {
+                TimeUnit::Second =>  Arc::new(Time32SecondArray::from_iter(
+                    MaxInt32StatsIterator::new(iterator).map(|x| x.copied()),
+                )),
+                TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter(
+                    MaxInt32StatsIterator::new(iterator).map(|x| x.copied()),
+                )),
+                _ => {
+                    let len = iterator.count();
+                    // don't know how to extract statistics, so return a null array
+                    new_null_array(data_type, len)
+                }
+            })
+        },
+        DataType::Time64(unit) => {
+            Ok(match unit {
+                TimeUnit::Microsecond =>  Arc::new(Time64MicrosecondArray::from_iter(
+                    MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
+                )),
+                TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter(
+                    MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
+                )),
+                _ => {
+                    let len = iterator.count();
+                    // don't know how to extract statistics, so return a null array
+                    new_null_array(data_type, len)
+                }
+            })
+        },
         DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
             MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())),
         ))),
+        DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter(
+            MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x|x.to_vec())),
+        ))),
         DataType::Utf8 => Ok(Arc::new(StringArray::from_iter(
             MaxByteArrayStatsIterator::new(iterator).map(|x| {
                 x.and_then(|x| {
@@ -573,6 +696,19 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
                 })
             }),
         ))),
+        DataType::LargeUtf8 => {
+            Ok(Arc::new(LargeStringArray::from_iter(
+                MaxByteArrayStatsIterator::new(iterator).map(|x| {
+                    x.and_then(|x| {
+                        let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
+                        if res.is_none() {
+                            log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it.");
+                        }
+                        res
+                    })
+                }),
+            )))
+        }
         DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from(
             MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| {
                 x.and_then(|x| {
@@ -595,20 +731,21 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
                 ).with_precision_and_scale(*precision, *scale)?;
                 Ok(Arc::new(arr) as ArrayRef)
         }
+        DataType::Decimal256(precision, scale) => {
+            let arr = Decimal256Array::from_iter(
+                MaxDecimal256StatsIterator::new(iterator)
+            ).with_precision_and_scale(*precision, *scale)?;
+            Ok(Arc::new(arr))
+        },
         DataType::Dictionary(_, value_type) => {
             max_statistics(value_type, iterator)
         }
 
         DataType::Map(_,_) |
-        DataType::Float16 |
-        DataType::Time32(_) |
-        DataType::Time64(_) |
         DataType::Duration(_) |
         DataType::Interval(_) |
         DataType::Null |
-        DataType::LargeBinary |
         DataType::BinaryView |
-        DataType::LargeUtf8 |
         DataType::Utf8View |
         DataType::List(_) |
         DataType::ListView(_) |
@@ -617,7 +754,6 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
         DataType::LargeListView(_) |
         DataType::Struct(_) |
         DataType::Union(_, _) |
-        DataType::Decimal256(_, _) |
         DataType::RunEndEncoded(_, _) => {
             let len = iterator.count();
             // don't know how to extract statistics, so return a null array
@@ -926,6 +1062,8 @@ mod test {
             input: timestamp_seconds_array(
                 [
                     Some(1),
+                    None,
+                    Some(3),
                     // row group 2
                     Some(9),
                     Some(5),

From 4313fffc8de22dc6d3600d009fda3eca61b86e88 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Wed, 5 Jun 2024 03:17:02 +0000
Subject: [PATCH 15/19] fix bad comment

---
 .../core/src/datasource/physical_plan/parquet/statistics.rs      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 7db47bb374b30..61135a5a341ea 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -1061,6 +1061,7 @@ mod test {
         Test {
             input: timestamp_seconds_array(
                 [
+                    // row group 1
                     Some(1),
                     None,
                     Some(3),

From 4662432fb2c5f6e20df01c5ad4d6da51c10474c9 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Wed, 5 Jun 2024 03:50:36 +0000
Subject: [PATCH 16/19] Remove duplications using paste

---
 datafusion/core/Cargo.toml                    |   1 +
 .../physical_plan/parquet/statistics.rs       | 672 ++++++------------
 2 files changed, 235 insertions(+), 438 deletions(-)

diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index 9f1f7484357b2..3946758ff9370 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -121,6 +121,7 @@ num_cpus = { workspace = true }
 object_store = { workspace = true }
 parking_lot = { workspace = true }
 parquet = { workspace = true, optional = true, default-features = true }
+paste = "1.0.15"
 pin-project-lite = "^0.2.7"
 rand = { workspace = true }
 sqlparser = { workspace = true }
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 61135a5a341ea..5a1b03c75cd65 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -36,6 +36,7 @@ use half::f16;
 use parquet::file::metadata::ParquetMetaData;
 use parquet::file::statistics::Statistics as ParquetStatistics;
 use parquet::schema::types::SchemaDescriptor;
+use paste::paste;
 use std::sync::Arc;
 
 // Convert the bytes array to i128.
@@ -285,6 +286,237 @@ make_decimal_stats_iterator!(
     from_bytes_to_i256
 );
 
+/// Special macro to combine the statistics iterators for min and max using the [`paste`] macro.
+/// This is used to avoid repeating the same code for min and max statistics extractions
+///
+/// Parameters:
+/// stat_type_prefix: The prefix of the statistics iterator type (e.g. `Min` or `Max`)
+/// data_type: The data type of the statistics (e.g. `DataType::Int32`)
+/// iterator: The iterator of [`ParquetStatistics`] to extract the statistics from.
+macro_rules! get_statistics {
+    ($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
+        paste! {
+        match $data_type {
+            DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter(
+                [<$stat_type_prefix BooleanStatsIterator>]::new($iterator).map(|x| x.copied()),
+            ))),
+            DataType::Int8 => Ok(Arc::new(Int8Array::from_iter(
+                [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| {
+                    x.and_then(|x| {
+                        if let Ok(v) = i8::try_from(*x) {
+                            Some(v)
+                        } else {
+                            None
+                        }
+                    })
+                }),
+            ))),
+            DataType::Int16 => Ok(Arc::new(Int16Array::from_iter(
+                [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| {
+                    x.and_then(|x| {
+                        if let Ok(v) = i16::try_from(*x) {
+                            Some(v)
+                        } else {
+                            None
+                        }
+                    })
+                }),
+            ))),
+            DataType::Int32 => Ok(Arc::new(Int32Array::from_iter(
+                [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()),
+            ))),
+            DataType::Int64 => Ok(Arc::new(Int64Array::from_iter(
+                [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()),
+            ))),
+            DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter(
+                [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| {
+                    x.and_then(|x| {
+                        if let Ok(v) = u8::try_from(*x) {
+                            Some(v)
+                        } else {
+                            None
+                        }
+                    })
+                }),
+            ))),
+            DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter(
+                [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| {
+                    x.and_then(|x| {
+                        if let Ok(v) = u16::try_from(*x) {
+                            Some(v)
+                        } else {
+                            None
+                        }
+                    })
+                }),
+            ))),
+            DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter(
+                [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.map(|x| *x as u32)),
+            ))),
+            DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter(
+                [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.map(|x| *x as u64)),
+            ))),
+            DataType::Float16 => Ok(Arc::new(Float16Array::from_iter(
+                [<$stat_type_prefix FixedLenByteArrayStatsIterator>]::new($iterator).map(|x| x.and_then(|x| {
+                    from_bytes_to_f16(x)
+                })),
+            ))),
+            DataType::Float32 => Ok(Arc::new(Float32Array::from_iter(
+                [<$stat_type_prefix FloatStatsIterator>]::new($iterator).map(|x| x.copied()),
+            ))),
+            DataType::Float64 => Ok(Arc::new(Float64Array::from_iter(
+                [<$stat_type_prefix DoubleStatsIterator>]::new($iterator).map(|x| x.copied()),
+            ))),
+            DataType::Date32 => Ok(Arc::new(Date32Array::from_iter(
+                [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()),
+            ))),
+            DataType::Date64 => Ok(Arc::new(Date64Array::from_iter(
+                [<$stat_type_prefix Int32StatsIterator>]::new($iterator)
+                    .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)),
+            ))),
+            DataType::Timestamp(unit, timezone) =>{
+                let iter = [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied());
+
+                Ok(match unit {
+                    TimeUnit::Second => {
+                        Arc::new(match timezone {
+                            Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()),
+                            None => TimestampSecondArray::from_iter(iter),
+                        })
+                    }
+                    TimeUnit::Millisecond => {
+                        Arc::new(match timezone {
+                            Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()),
+                            None => TimestampMillisecondArray::from_iter(iter),
+                        })
+                    }
+                    TimeUnit::Microsecond => {
+                        Arc::new(match timezone {
+                            Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()),
+                            None => TimestampMicrosecondArray::from_iter(iter),
+                        })
+                    }
+                    TimeUnit::Nanosecond => {
+                        Arc::new(match timezone {
+                            Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()),
+                            None => TimestampNanosecondArray::from_iter(iter),
+                        })
+                    }
+                })
+            },
+            DataType::Time32(unit) => {
+                Ok(match unit {
+                    TimeUnit::Second =>  Arc::new(Time32SecondArray::from_iter(
+                        [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()),
+                    )),
+                    TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter(
+                        [<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()),
+                    )),
+                    _ => {
+                        let len = $iterator.count();
+                        // don't know how to extract statistics, so return a null array
+                        new_null_array($data_type, len)
+                    }
+                })
+            },
+            DataType::Time64(unit) => {
+                Ok(match unit {
+                    TimeUnit::Microsecond =>  Arc::new(Time64MicrosecondArray::from_iter(
+                        [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()),
+                    )),
+                    TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter(
+                        [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()),
+                    )),
+                    _ => {
+                        let len = $iterator.count();
+                        // don't know how to extract statistics, so return a null array
+                        new_null_array($data_type, len)
+                    }
+                })
+            },
+            DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
+                [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| x.map(|x| x.to_vec())),
+            ))),
+            DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter(
+                [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| x.map(|x|x.to_vec())),
+            ))),
+            DataType::Utf8 => Ok(Arc::new(StringArray::from_iter(
+                [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| {
+                    x.and_then(|x| {
+                        let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
+                        if res.is_none() {
+                            log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
+                        }
+                        res
+                    })
+                }),
+            ))),
+            DataType::LargeUtf8 => {
+                Ok(Arc::new(LargeStringArray::from_iter(
+                    [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| {
+                        x.and_then(|x| {
+                            let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
+                            if res.is_none() {
+                                log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it.");
+                            }
+                            res
+                        })
+                    }),
+                )))
+            }
+            DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from(
+                [<$stat_type_prefix FixedLenByteArrayStatsIterator>]::new($iterator).map(|x| {
+                    x.and_then(|x| {
+                        if x.len().try_into() == Ok(*size) {
+                            Some(x)
+                        } else {
+                            log::debug!(
+                                "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
+                                size,
+                                x.len(),
+                            );
+                            None
+                        }
+                    })
+                }).collect::<Vec<_>>(),
+            ))),
+            DataType::Decimal128(precision, scale) => {
+                let arr = Decimal128Array::from_iter(
+                    [<$stat_type_prefix Decimal128StatsIterator>]::new($iterator)
+                ).with_precision_and_scale(*precision, *scale)?;
+                Ok(Arc::new(arr))
+            },
+            DataType::Decimal256(precision, scale) => {
+                let arr = Decimal256Array::from_iter(
+                    [<$stat_type_prefix Decimal256StatsIterator>]::new($iterator)
+                ).with_precision_and_scale(*precision, *scale)?;
+                Ok(Arc::new(arr))
+            },
+            DataType::Dictionary(_, value_type) => {
+                [<$stat_type_prefix:lower _ statistics>](value_type, $iterator)
+            }
+
+            DataType::Map(_,_) |
+            DataType::Duration(_) |
+            DataType::Interval(_) |
+            DataType::Null |
+            DataType::BinaryView |
+            DataType::Utf8View |
+            DataType::List(_) |
+            DataType::ListView(_) |
+            DataType::FixedSizeList(_, _) |
+            DataType::LargeList(_) |
+            DataType::LargeListView(_) |
+            DataType::Struct(_) |
+            DataType::Union(_, _) |
+            DataType::RunEndEncoded(_, _) => {
+                let len = $iterator.count();
+                // don't know how to extract statistics, so return a null array
+                Ok(new_null_array($data_type, len))
+            }
+        }}}
+}
+
 /// Lookups up the parquet column by name
 ///
 /// Returns the parquet column index and the corresponding arrow field
@@ -315,225 +547,7 @@ pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    match data_type {
-        DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter(
-            MinBooleanStatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Int8 => Ok(Arc::new(Int8Array::from_iter(
-            MinInt32StatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if let Ok(v) = i8::try_from(*x) {
-                        Some(v)
-                    } else {
-                        None
-                    }
-                })
-            }),
-        ))),
-        DataType::Int16 => Ok(Arc::new(Int16Array::from_iter(
-            MinInt32StatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if let Ok(v) = i16::try_from(*x) {
-                        Some(v)
-                    } else {
-                        None
-                    }
-                })
-            }),
-        ))),
-        DataType::Int32 => Ok(Arc::new(Int32Array::from_iter(
-            MinInt32StatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Int64 => Ok(Arc::new(Int64Array::from_iter(
-            MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter(
-            MinInt32StatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if let Ok(v) = u8::try_from(*x) {
-                        Some(v)
-                    } else {
-                        None
-                    }
-                })
-            }),
-        ))),
-        DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter(
-            MinInt32StatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if let Ok(v) = u16::try_from(*x) {
-                        Some(v)
-                    } else {
-                        None
-                    }
-                })
-            }),
-        ))),
-        DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter(
-            MinInt32StatsIterator::new(iterator).map(|x| x.map(|x| *x as u32)),
-        ))),
-        DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter(
-            MinInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)),
-        ))),
-        DataType::Float16 => Ok(Arc::new(Float16Array::from_iter(
-            MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| x.and_then(|x| {
-                from_bytes_to_f16(x)
-            })),
-        ))),
-        DataType::Float32 => Ok(Arc::new(Float32Array::from_iter(
-            MinFloatStatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Float64 => Ok(Arc::new(Float64Array::from_iter(
-            MinDoubleStatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Date32 => Ok(Arc::new(Date32Array::from_iter(
-            MinInt32StatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Date64 => Ok(Arc::new(Date64Array::from_iter(
-            MinInt32StatsIterator::new(iterator)
-                .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)),
-        ))),
-        DataType::Timestamp(unit, timezone) =>{
-            let iter = MinInt64StatsIterator::new(iterator).map(|x| x.copied());
-
-            Ok(match unit {
-                TimeUnit::Second => {
-                    Arc::new(match timezone {
-                        Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()),
-                        None => TimestampSecondArray::from_iter(iter),
-                    })
-                }
-                TimeUnit::Millisecond => {
-                    Arc::new(match timezone {
-                        Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()),
-                        None => TimestampMillisecondArray::from_iter(iter),
-                    })
-                }
-                TimeUnit::Microsecond => {
-                    Arc::new(match timezone {
-                        Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()),
-                        None => TimestampMicrosecondArray::from_iter(iter),
-                    })
-                }
-                TimeUnit::Nanosecond => {
-                    Arc::new(match timezone {
-                        Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()),
-                        None => TimestampNanosecondArray::from_iter(iter),
-                    })
-                }
-            })
-        },
-        DataType::Time32(unit) => {
-            Ok(match unit {
-                TimeUnit::Second =>  Arc::new(Time32SecondArray::from_iter(
-                    MinInt32StatsIterator::new(iterator).map(|x| x.copied()),
-                )),
-                TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter(
-                    MinInt32StatsIterator::new(iterator).map(|x| x.copied()),
-                )),
-                _ => {
-                    let len = iterator.count();
-                    // don't know how to extract statistics, so return a null array
-                    new_null_array(data_type, len)
-                }
-            })
-        },
-        DataType::Time64(unit) => {
-            Ok(match unit {
-                TimeUnit::Microsecond =>  Arc::new(Time64MicrosecondArray::from_iter(
-                    MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
-                )),
-                TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter(
-                    MinInt64StatsIterator::new(iterator).map(|x| x.copied()),
-                )),
-                _ => {
-                    let len = iterator.count();
-                    // don't know how to extract statistics, so return a null array
-                    new_null_array(data_type, len)
-                }
-            })
-        },
-        DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
-            MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())),
-        ))),
-        DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter(
-            MinByteArrayStatsIterator::new(iterator).map(|x| x.map(|x|x.to_vec())),
-        ))),
-        DataType::Utf8 => Ok(Arc::new(StringArray::from_iter(
-            MinByteArrayStatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
-                    if res.is_none() {
-                        log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
-                    }
-                    res
-                })
-            }),
-        ))),
-        DataType::LargeUtf8 => {
-            Ok(Arc::new(LargeStringArray::from_iter(
-                MinByteArrayStatsIterator::new(iterator).map(|x| {
-                    x.and_then(|x| {
-                        let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
-                        if res.is_none() {
-                            log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it.");
-                        }
-                        res
-                    })
-                }),
-            )))
-        }
-        DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from(
-            MinFixedLenByteArrayStatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if x.len().try_into() == Ok(*size) {
-                        Some(x)
-                    } else {
-                        log::debug!(
-                            "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
-                            size,
-                            x.len(),
-                        );
-                        None
-                    }
-                })
-            }).collect::<Vec<_>>(),
-        ))),
-        DataType::Decimal128(precision, scale) => {
-            let arr = Decimal128Array::from_iter(
-                MinDecimal128StatsIterator::new(iterator)
-            ).with_precision_and_scale(*precision, *scale)?;
-            Ok(Arc::new(arr))
-        },
-        DataType::Decimal256(precision, scale) => {
-            let arr = Decimal256Array::from_iter(
-                MinDecimal256StatsIterator::new(iterator)
-            ).with_precision_and_scale(*precision, *scale)?;
-            Ok(Arc::new(arr))
-        },
-        DataType::Dictionary(_, value_type) => {
-            min_statistics(value_type, iterator)
-        }
-
-        DataType::Map(_,_) |
-        DataType::Duration(_) |
-        DataType::Interval(_) |
-        DataType::Null |
-        DataType::BinaryView |
-        DataType::Utf8View |
-        DataType::List(_) |
-        DataType::ListView(_) |
-        DataType::FixedSizeList(_, _) |
-        DataType::LargeList(_) |
-        DataType::LargeListView(_) |
-        DataType::Struct(_) |
-        DataType::Union(_, _) |
-        DataType::RunEndEncoded(_, _) => {
-            let len = iterator.count();
-            // don't know how to extract statistics, so return a null array
-            Ok(new_null_array(data_type, len))
-        }
-    }
+    get_statistics!(Min, data_type, iterator)
 }
 
 /// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`]
@@ -541,225 +555,7 @@ pub(crate) fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics
     data_type: &DataType,
     iterator: I,
 ) -> Result<ArrayRef> {
-    match data_type {
-        DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter(
-            MaxBooleanStatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Int8 => Ok(Arc::new(Int8Array::from_iter(
-            MaxInt32StatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if let Ok(v) = i8::try_from(*x) {
-                        Some(v)
-                    } else {
-                        None
-                    }
-                })
-            }),
-        ))),
-        DataType::Int16 => Ok(Arc::new(Int16Array::from_iter(
-            MaxInt32StatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if let Ok(v) = i16::try_from(*x) {
-                        Some(v)
-                    } else {
-                        None
-                    }
-                })
-            }),
-        ))),
-        DataType::Int32 => Ok(Arc::new(Int32Array::from_iter(
-            MaxInt32StatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Int64 => Ok(Arc::new(Int64Array::from_iter(
-            MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter(
-            MaxInt32StatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if let Ok(v) = u8::try_from(*x) {
-                        Some(v)
-                    } else {
-                        None
-                    }
-                })
-            }),
-        ))),
-        DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter(
-            MaxInt32StatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if let Ok(v) = u16::try_from(*x) {
-                        Some(v)
-                    } else {
-                        None
-                    }
-                })
-            }),
-        ))),
-        DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter(
-            MaxInt32StatsIterator::new(iterator).map(|x| x.map(|x| *x as u32)),
-        ))),
-        DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter(
-            MaxInt64StatsIterator::new(iterator).map(|x| x.map(|x| *x as u64)),
-        ))),
-        DataType::Float16 => Ok(Arc::new(Float16Array::from_iter(
-            MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| x.and_then(|x| {
-                from_bytes_to_f16(x)
-            })),
-        ))),
-        DataType::Float32 => Ok(Arc::new(Float32Array::from_iter(
-            MaxFloatStatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Float64 => Ok(Arc::new(Float64Array::from_iter(
-            MaxDoubleStatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Date32 => Ok(Arc::new(Date32Array::from_iter(
-            MaxInt32StatsIterator::new(iterator).map(|x| x.copied()),
-        ))),
-        DataType::Date64 => Ok(Arc::new(Date64Array::from_iter(
-            MaxInt32StatsIterator::new(iterator)
-                .map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000)),
-        ))),
-        DataType::Timestamp(unit, timezone) => {
-            let iter = MaxInt64StatsIterator::new(iterator).map(|x| x.copied());
-
-            Ok(match unit {
-                TimeUnit::Second => {
-                    Arc::new(match timezone {
-                        Some(tz) => TimestampSecondArray::from_iter(iter).with_timezone(tz.clone()),
-                        None => TimestampSecondArray::from_iter(iter),
-                    })
-                }
-                TimeUnit::Millisecond => {
-                    Arc::new(match timezone {
-                        Some(tz) => TimestampMillisecondArray::from_iter(iter).with_timezone(tz.clone()),
-                        None => TimestampMillisecondArray::from_iter(iter),
-                    })
-                }
-                TimeUnit::Microsecond => {
-                    Arc::new(match timezone {
-                        Some(tz) => TimestampMicrosecondArray::from_iter(iter).with_timezone(tz.clone()),
-                        None => TimestampMicrosecondArray::from_iter(iter),
-                    })
-                }
-                TimeUnit::Nanosecond => {
-                    Arc::new(match timezone {
-                        Some(tz) => TimestampNanosecondArray::from_iter(iter).with_timezone(tz.clone()),
-                        None => TimestampNanosecondArray::from_iter(iter),
-                    })
-                }
-            })
-        }
-        DataType::Time32(unit) => {
-            Ok(match unit {
-                TimeUnit::Second =>  Arc::new(Time32SecondArray::from_iter(
-                    MaxInt32StatsIterator::new(iterator).map(|x| x.copied()),
-                )),
-                TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter(
-                    MaxInt32StatsIterator::new(iterator).map(|x| x.copied()),
-                )),
-                _ => {
-                    let len = iterator.count();
-                    // don't know how to extract statistics, so return a null array
-                    new_null_array(data_type, len)
-                }
-            })
-        },
-        DataType::Time64(unit) => {
-            Ok(match unit {
-                TimeUnit::Microsecond =>  Arc::new(Time64MicrosecondArray::from_iter(
-                    MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
-                )),
-                TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter(
-                    MaxInt64StatsIterator::new(iterator).map(|x| x.copied()),
-                )),
-                _ => {
-                    let len = iterator.count();
-                    // don't know how to extract statistics, so return a null array
-                    new_null_array(data_type, len)
-                }
-            })
-        },
-        DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
-            MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x| x.to_vec())),
-        ))),
-        DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter(
-            MaxByteArrayStatsIterator::new(iterator).map(|x| x.map(|x|x.to_vec())),
-        ))),
-        DataType::Utf8 => Ok(Arc::new(StringArray::from_iter(
-            MaxByteArrayStatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
-                    if res.is_none() {
-                        log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it.");
-                    }
-                    res
-                })
-            }),
-        ))),
-        DataType::LargeUtf8 => {
-            Ok(Arc::new(LargeStringArray::from_iter(
-                MaxByteArrayStatsIterator::new(iterator).map(|x| {
-                    x.and_then(|x| {
-                        let res = std::str::from_utf8(x).map(|s| s.to_string()).ok();
-                        if res.is_none() {
-                            log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it.");
-                        }
-                        res
-                    })
-                }),
-            )))
-        }
-        DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from(
-            MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x| {
-                x.and_then(|x| {
-                    if x.len().try_into() == Ok(*size) {
-                        Some(x)
-                    } else {
-                        log::debug!(
-                            "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
-                            size,
-                            x.len(),
-                        );
-                        None
-                    }
-                })
-            }).collect::<Vec<_>>(),
-        ))),
-        DataType::Decimal128(precision, scale) => {
-                let arr = Decimal128Array::from_iter(
-                    MaxDecimal128StatsIterator::new(iterator)
-                ).with_precision_and_scale(*precision, *scale)?;
-                Ok(Arc::new(arr) as ArrayRef)
-        }
-        DataType::Decimal256(precision, scale) => {
-            let arr = Decimal256Array::from_iter(
-                MaxDecimal256StatsIterator::new(iterator)
-            ).with_precision_and_scale(*precision, *scale)?;
-            Ok(Arc::new(arr))
-        },
-        DataType::Dictionary(_, value_type) => {
-            max_statistics(value_type, iterator)
-        }
-
-        DataType::Map(_,_) |
-        DataType::Duration(_) |
-        DataType::Interval(_) |
-        DataType::Null |
-        DataType::BinaryView |
-        DataType::Utf8View |
-        DataType::List(_) |
-        DataType::ListView(_) |
-        DataType::FixedSizeList(_, _) |
-        DataType::LargeList(_) |
-        DataType::LargeListView(_) |
-        DataType::Struct(_) |
-        DataType::Union(_, _) |
-        DataType::RunEndEncoded(_, _) => {
-            let len = iterator.count();
-            // don't know how to extract statistics, so return a null array
-            Ok(new_null_array(data_type, len))
-        }
-    }
+    get_statistics!(Max, data_type, iterator)
 }
 
 /// What type of statistics should be extracted?

From 1bcc5e33738c2ee3d0cdd67ad1b7dd7bbe156267 Mon Sep 17 00:00:00 2001
From: Xin Li <bluer906@gmail.com>
Date: Wed, 5 Jun 2024 03:59:41 +0000
Subject: [PATCH 17/19] Fix comment

---
 .../core/src/datasource/physical_plan/parquet/statistics.rs     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 5a1b03c75cd65..287925fb25332 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -192,7 +192,7 @@ make_stats_iterator!(
 /// `Int32`, `Int64` or `ByteArray` or `FixedSizeByteArray` :mindblown:
 ///
 /// This iterator handles all cases, extracting the values
-/// and converting it to `i128`.
+/// and converting it to `stat_value_type`.
 ///
 /// Parameters:
 /// * `$iterator_type` is the name of the iterator type (e.g. `MinBooleanStatsIterator`)

From fbfb611b14e16be968f4c340583abc52d9c156e2 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Wed, 5 Jun 2024 10:07:47 -0400
Subject: [PATCH 18/19] Update Cargo.lock

---
 datafusion-cli/Cargo.lock | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 304058650164f..b165070c60605 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -942,7 +942,7 @@ version = "3.2.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008"
 dependencies = [
- "heck",
+ "heck 0.4.1",
  "proc-macro-error",
  "proc-macro2",
  "quote",
@@ -976,7 +976,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7"
 dependencies = [
  "strum 0.26.2",
- "strum_macros 0.26.2",
+ "strum_macros 0.26.4",
  "unicode-width",
 ]
 
@@ -1156,6 +1156,7 @@ dependencies = [
  "object_store",
  "parking_lot",
  "parquet",
+ "paste",
  "pin-project-lite",
  "rand",
  "sqlparser",
@@ -1255,7 +1256,7 @@ dependencies = [
  "serde_json",
  "sqlparser",
  "strum 0.26.2",
- "strum_macros 0.26.2",
+ "strum_macros 0.26.4",
 ]
 
 [[package]]
@@ -1809,6 +1810,12 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
 [[package]]
 name = "hermit-abi"
 version = "0.1.19"
@@ -1881,9 +1888,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
 
 [[package]]
 name = "hyper"
-version = "0.14.28"
+version = "0.14.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80"
+checksum = "f361cde2f109281a220d4307746cdfd5ee3f410da58a70377762396775634b33"
 dependencies = [
  "bytes",
  "futures-channel",
@@ -2683,9 +2690,9 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.84"
+version = "1.0.85"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6"
+checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23"
 dependencies = [
  "unicode-ident",
 ]
@@ -3218,7 +3225,7 @@ version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf"
 dependencies = [
- "heck",
+ "heck 0.4.1",
  "proc-macro2",
  "quote",
  "syn 1.0.109",
@@ -3303,7 +3310,7 @@ version = "0.26.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29"
 dependencies = [
- "strum_macros 0.26.2",
+ "strum_macros 0.26.4",
 ]
 
 [[package]]
@@ -3312,7 +3319,7 @@ version = "0.25.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0"
 dependencies = [
- "heck",
+ "heck 0.4.1",
  "proc-macro2",
  "quote",
  "rustversion",
@@ -3321,11 +3328,11 @@ dependencies = [
 
 [[package]]
 name = "strum_macros"
-version = "0.26.2"
+version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946"
+checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "rustversion",
@@ -3711,9 +3718,9 @@ checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
 
 [[package]]
 name = "unicode-width"
-version = "0.1.12"
+version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68f5e5f3158ecfd4b8ff6fe086db7c8467a2dfdac97fe420f2b7c4aa97af66d6"
+checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d"
 
 [[package]]
 name = "untrusted"

From 0dd201a42e0eb43457251d44ed3224d687b440f6 Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Wed, 5 Jun 2024 10:29:11 -0400
Subject: [PATCH 19/19] fix docs

---
 .../core/src/datasource/physical_plan/parquet/statistics.rs     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
index 287925fb25332..a73538d02a7fa 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
@@ -286,7 +286,7 @@ make_decimal_stats_iterator!(
     from_bytes_to_i256
 );
 
-/// Special macro to combine the statistics iterators for min and max using the [`paste`] macro.
+/// Special macro to combine the statistics iterators for min and max using the [`mod@paste`] macro.
 /// This is used to avoid repeating the same code for min and max statistics extractions
 ///
 /// Parameters: