1919
2020// TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328
2121
22- use arrow:: { array:: ArrayRef , datatypes:: DataType , datatypes:: TimeUnit } ;
22+ use arrow:: { array:: ArrayRef , datatypes:: i256 , datatypes :: DataType , datatypes:: TimeUnit } ;
2323use arrow_array:: { new_empty_array, new_null_array, UInt64Array } ;
2424use arrow_schema:: { Field , FieldRef , Schema } ;
2525use datafusion_common:: {
@@ -36,7 +36,13 @@ pub(crate) fn from_bytes_to_i128(b: &[u8]) -> i128 {
3636 // The byte array comes from a Parquet file and must be big-endian.
3737 // The endianness is defined by the Parquet format; see the reference document:
3838 // https://github.com/apache/parquet-format/blob/54e53e5d7794d383529dd30746378f19a12afd58/src/main/thrift/parquet.thrift#L66
39- i128:: from_be_bytes ( sign_extend_be ( b) )
39+ i128:: from_be_bytes ( sign_extend_be :: < 16 > ( b) )
40+ }
41+
42+ // Convert the bytes array to i256.
43+ // The endian of the input bytes array must be big-endian.
44+ pub ( crate ) fn from_bytes_to_i256 ( b : & [ u8 ] ) -> i256 {
45+ i256:: from_be_bytes ( sign_extend_be :: < 32 > ( b) )
4046}
4147
4248// Convert the bytes array to f16
@@ -48,13 +54,13 @@ pub(crate) fn from_bytes_to_f16(b: &[u8]) -> Option<f16> {
4854}
4955
// Copy from arrow-rs
// https://github.com/apache/arrow-rs/blob/198af7a3f4aa20f9bd003209d9f04b0f37bb120e/parquet/src/arrow/buffer/bit_util.rs#L54
/// Sign-extends a big-endian byte slice to a fixed-length array of `N` bytes.
///
/// The bytes of `b` are placed in the low-order (trailing) positions of the
/// result. The leading padding bytes are `0xFF` when the most significant bit
/// of `b[0]` is set (negative value) and `0x00` otherwise.
///
/// An empty slice carries no sign bit and is treated as zero, so the result
/// is all `0x00` bytes instead of an index-out-of-bounds panic.
///
/// # Panics
///
/// Panics if `b.len() > N`.
pub fn sign_extend_be<const N: usize>(b: &[u8]) -> [u8; N] {
    assert!(
        b.len() <= N,
        "Array too large, expected at most {N} bytes but got {}",
        b.len()
    );
    // `first()` instead of `b[0]` so that an empty slice does not panic.
    let is_negative = b.first().map_or(false, |msb| msb & 0x80 != 0);
    let mut result = if is_negative { [255u8; N] } else { [0u8; N] };
    // The assert above guarantees `N - b.len()` cannot underflow and that the
    // destination sub-slice has exactly `b.len()` bytes.
    result[N - b.len()..].copy_from_slice(b);
    result
}
@@ -83,6 +89,13 @@ macro_rules! get_statistic {
8389 * scale,
8490 ) )
8591 }
92+ Some ( DataType :: Decimal256 ( precision, scale) ) => {
93+ Some ( ScalarValue :: Decimal256 (
94+ Some ( i256:: from( * s. $func( ) ) ) ,
95+ * precision,
96+ * scale,
97+ ) )
98+ }
8699 Some ( DataType :: Int8 ) => {
87100 Some ( ScalarValue :: Int8 ( Some ( ( * s. $func( ) ) . try_into( ) . unwrap( ) ) ) )
88101 }
@@ -123,6 +136,13 @@ macro_rules! get_statistic {
123136 * scale,
124137 ) )
125138 }
139+ Some ( DataType :: Decimal256 ( precision, scale) ) => {
140+ Some ( ScalarValue :: Decimal256 (
141+ Some ( i256:: from( * s. $func( ) ) ) ,
142+ * precision,
143+ * scale,
144+ ) )
145+ }
126146 Some ( DataType :: UInt64 ) => {
127147 Some ( ScalarValue :: UInt64 ( Some ( ( * s. $func( ) ) as u64 ) ) )
128148 }
@@ -169,6 +189,13 @@ macro_rules! get_statistic {
169189 * scale,
170190 ) )
171191 }
192+ Some ( DataType :: Decimal256 ( precision, scale) ) => {
193+ Some ( ScalarValue :: Decimal256 (
194+ Some ( from_bytes_to_i256( s. $bytes_func( ) ) ) ,
195+ * precision,
196+ * scale,
197+ ) )
198+ }
172199 Some ( DataType :: Binary ) => {
173200 Some ( ScalarValue :: Binary ( Some ( s. $bytes_func( ) . to_vec( ) ) ) )
174201 }
@@ -202,6 +229,13 @@ macro_rules! get_statistic {
202229 * scale,
203230 ) )
204231 }
232+ Some ( DataType :: Decimal256 ( precision, scale) ) => {
233+ Some ( ScalarValue :: Decimal256 (
234+ Some ( from_bytes_to_i256( s. $bytes_func( ) ) ) ,
235+ * precision,
236+ * scale,
237+ ) )
238+ }
205239 Some ( DataType :: FixedSizeBinary ( size) ) => {
206240 let value = s. $bytes_func( ) . to_vec( ) ;
207241 let value = if value. len( ) . try_into( ) == Ok ( * size) {
@@ -438,13 +472,13 @@ impl<'a> StatisticsConverter<'a> {
438472mod test {
439473 use super :: * ;
440474 use arrow:: compute:: kernels:: cast_utils:: Parser ;
441- use arrow:: datatypes:: { Date32Type , Date64Type } ;
475+ use arrow:: datatypes:: { i256 , Date32Type , Date64Type } ;
442476 use arrow_array:: {
443477 new_null_array, Array , BinaryArray , BooleanArray , Date32Array , Date64Array ,
444- Decimal128Array , Float32Array , Float64Array , Int16Array , Int32Array , Int64Array ,
445- Int8Array , LargeBinaryArray , RecordBatch , StringArray , StructArray ,
446- TimestampMicrosecondArray , TimestampMillisecondArray , TimestampNanosecondArray ,
447- TimestampSecondArray ,
478+ Decimal128Array , Decimal256Array , Float32Array , Float64Array , Int16Array ,
479+ Int32Array , Int64Array , Int8Array , LargeBinaryArray , RecordBatch , StringArray ,
480+ StructArray , TimestampMicrosecondArray , TimestampMillisecondArray ,
481+ TimestampNanosecondArray , TimestampSecondArray ,
448482 } ;
449483 use arrow_schema:: { Field , SchemaRef } ;
450484 use bytes:: Bytes ;
@@ -824,6 +858,42 @@ mod test {
824858 . unwrap ( ) ,
825859 ) ,
826860 }
861+ . run ( ) ;
862+
863+ Test {
864+ input : Arc :: new (
865+ Decimal256Array :: from ( vec ! [
866+ // row group 1
867+ Some ( i256:: from( 100 ) ) ,
868+ None ,
869+ Some ( i256:: from( 22000 ) ) ,
870+ // row group 2
871+ Some ( i256:: MAX ) ,
872+ Some ( i256:: MIN ) ,
873+ None ,
874+ // row group 3
875+ None ,
876+ None ,
877+ None ,
878+ ] )
879+ . with_precision_and_scale ( 76 , 76 )
880+ . unwrap ( ) ,
881+ ) ,
882+ expected_min : Arc :: new (
883+ Decimal256Array :: from ( vec ! [ Some ( i256:: from( 100 ) ) , Some ( i256:: MIN ) , None ] )
884+ . with_precision_and_scale ( 76 , 76 )
885+ . unwrap ( ) ,
886+ ) ,
887+ expected_max : Arc :: new (
888+ Decimal256Array :: from ( vec ! [
889+ Some ( i256:: from( 22000 ) ) ,
890+ Some ( i256:: MAX ) ,
891+ None ,
892+ ] )
893+ . with_precision_and_scale ( 76 , 76 )
894+ . unwrap ( ) ,
895+ ) ,
896+ }
827897 . run ( )
828898 }
829899
0 commit comments