@@ -410,23 +410,24 @@ pub async fn get_statistics_with_limit(
410410}
411411
412412/// Generic function to compute statistics across multiple items that have statistics
413- fn compute_summary_statistics < T , I > (
413+ /// Returns `None` if `items` is empty or none of the items have statistics.
414+ pub fn compute_summary_statistics < T , I > (
414415 items : I ,
415- file_schema : & SchemaRef ,
416416 stats_extractor : impl Fn ( & T ) -> Option < & Statistics > ,
417- ) -> Statistics
417+ ) -> Option < Statistics >
418418where
419419 I : IntoIterator < Item = T > ,
420420{
421- let size = file_schema. fields ( ) . len ( ) ;
422- let mut col_stats_set = vec ! [ ColumnStatistics :: default ( ) ; size] ;
421+ let mut col_stats_set = Vec :: new ( ) ;
423422 let mut num_rows = Precision :: < usize > :: Absent ;
424423 let mut total_byte_size = Precision :: < usize > :: Absent ;
425424
426- for ( idx , item) in items. into_iter ( ) . enumerate ( ) {
425+ for item in items. into_iter ( ) {
427426 if let Some ( item_stats) = stats_extractor ( & item) {
428- if idx == 0 {
427+ if col_stats_set . is_empty ( ) {
429428 // First item, set values directly
429+ col_stats_set =
430+ vec ! [ ColumnStatistics :: default ( ) ; item_stats. column_statistics. len( ) ] ;
430431 num_rows = item_stats. num_rows ;
431432 total_byte_size = item_stats. total_byte_size ;
432433 for ( index, column_stats) in
@@ -458,11 +459,15 @@ where
458459 }
459460 }
460461
461- Statistics {
462+ if col_stats_set. is_empty ( ) {
463+ // No statistics available
464+ return None ;
465+ }
466+ Some ( Statistics {
462467 num_rows,
463468 total_byte_size,
464469 column_statistics : col_stats_set,
465- }
470+ } )
466471}
467472
468473/// Computes the summary statistics for a group of files(`FileGroup` level's statistics).
@@ -479,22 +484,24 @@ where
479484/// * `collect_stats` - Whether to collect statistics (if false, returns original file group)
480485///
481486/// # Returns
482- /// A new file group with summary statistics attached
487+ /// A new file group with summary statistics attached, if any statistics are available
483488pub fn compute_file_group_statistics (
484- file_group : FileGroup ,
485- file_schema : SchemaRef ,
489+ mut file_group : FileGroup ,
486490 collect_stats : bool ,
487- ) -> Result < FileGroup > {
491+ ) -> FileGroup {
// Statistics collection is disabled: hand the group back untouched.
488492 if !collect_stats {
489- return Ok ( file_group) ;
493+ return file_group;
490494 }
491495
492- let statistics =
493- compute_summary_statistics ( file_group. iter ( ) , & file_schema, |file| {
494- file. statistics . as_ref ( ) . map ( |stats| stats. as_ref ( ) )
495- } ) ;
// Summarize over the files of the group; the extractor yields `None` for a file
// whose `statistics` field is unset.
496+ let statistics = compute_summary_statistics ( file_group. iter ( ) , |file| {
497+ file. statistics . as_ref ( ) . map ( |stats| stats. as_ref ( ) )
498+ } ) ;
499+
// Attach the summary only when one was produced; otherwise the group is returned as-is.
500+ if let Some ( stats) = statistics {
501+ file_group = file_group. with_statistics ( stats) ;
502+ }
496503
497- Ok ( file_group. with_statistics ( Arc :: new ( statistics ) ) )
504+ file_group
498505}
499506
500507/// Computes statistics for all files across multiple file groups.
@@ -519,29 +526,30 @@ pub fn compute_all_files_statistics(
519526 file_schema : SchemaRef ,
520527 collect_stats : bool ,
521528 inexact_stats : bool ,
522- ) -> Result < ( Vec < FileGroup > , Statistics ) > {
529+ ) -> ( Vec < FileGroup > , Statistics ) {
530+ if !collect_stats {
531+ return ( file_groups, Statistics :: new_unknown ( & file_schema) ) ;
532+ }
523533 let mut file_groups_with_stats = Vec :: with_capacity ( file_groups. len ( ) ) ;
524534
525535 // First compute statistics for each file group
526536 for file_group in file_groups {
527- file_groups_with_stats. push ( compute_file_group_statistics (
528- file_group,
529- Arc :: clone ( & file_schema) ,
530- collect_stats,
531- ) ?) ;
537+ file_groups_with_stats
538+ . push ( compute_file_group_statistics ( file_group, collect_stats) ) ;
532539 }
533540
534541 // Then summary statistics across all file groups
535542 let mut statistics =
536- compute_summary_statistics ( & file_groups_with_stats, & file_schema , |file_group| {
543+ compute_summary_statistics ( & file_groups_with_stats, |file_group| {
537544 file_group. statistics ( )
538- } ) ;
545+ } )
546+ . unwrap_or ( Statistics :: new_unknown ( & file_schema) ) ;
539547
540548 if inexact_stats {
541549 statistics = statistics. to_inexact ( )
542550 }
543551
544- Ok ( ( file_groups_with_stats, statistics) )
552+ ( file_groups_with_stats, statistics)
545553}
546554
547555pub fn add_row_stats (
@@ -620,18 +628,11 @@ fn set_min_if_lesser(
620628#[ cfg( test) ]
621629mod tests {
622630 use super :: * ;
623- use arrow:: datatypes:: { DataType , Field , Schema } ;
624631 use datafusion_common:: ScalarValue ;
625632 use std:: sync:: Arc ;
626633
627634 #[ test]
628635 fn test_compute_summary_statistics_basic ( ) {
629- // Create a schema with two columns
630- let schema = Arc :: new ( Schema :: new ( vec ! [
631- Field :: new( "col1" , DataType :: Int32 , false ) ,
632- Field :: new( "col2" , DataType :: Int32 , false ) ,
633- ] ) ) ;
634-
635636 // Create items with statistics
636637 let stats1 = Statistics {
637638 num_rows : Precision :: Exact ( 10 ) ,
@@ -679,7 +680,7 @@ mod tests {
679680
680681 // Call compute_summary_statistics
681682 let summary_stats =
682- compute_summary_statistics ( items, & schema , |item| Some ( item. as_ref ( ) ) ) ;
683+ compute_summary_statistics ( items, |item| Some ( item. as_ref ( ) ) ) . unwrap ( ) ;
683684
684685 // Verify the results
685686 assert_eq ! ( summary_stats. num_rows, Precision :: Exact ( 25 ) ) ; // 10 + 15
@@ -719,13 +720,6 @@ mod tests {
719720
720721 #[ test]
721722 fn test_compute_summary_statistics_mixed_precision ( ) {
722- // Create a schema with one column
723- let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new(
724- "col1" ,
725- DataType :: Int32 ,
726- false ,
727- ) ] ) ) ;
728-
729723 // Create items with different precision levels
730724 let stats1 = Statistics {
731725 num_rows : Precision :: Exact ( 10 ) ,
@@ -754,7 +748,7 @@ mod tests {
754748 let items = vec ! [ Arc :: new( stats1) , Arc :: new( stats2) ] ;
755749
756750 let summary_stats =
757- compute_summary_statistics ( items, & schema , |item| Some ( item. as_ref ( ) ) ) ;
751+ compute_summary_statistics ( items, |item| Some ( item. as_ref ( ) ) ) . unwrap ( ) ;
758752
759753 assert_eq ! ( summary_stats. num_rows, Precision :: Inexact ( 25 ) ) ;
760754 assert_eq ! ( summary_stats. total_byte_size, Precision :: Inexact ( 250 ) ) ;
@@ -774,25 +768,11 @@ mod tests {
774768
775769 #[test]
776770 fn test_compute_summary_statistics_empty ( ) {
777- let schema = Arc :: new ( Schema :: new ( vec ! [ Field :: new(
778- "col1" ,
779- DataType :: Int32 ,
780- false ,
781- ) ] ) ) ;
782-
783771 // Empty collection
784772 let items: Vec < Arc < Statistics > > = vec ! [ ] ;
785773
786- let summary_stats =
787- compute_summary_statistics ( items, & schema, |item| Some ( item. as_ref ( ) ) ) ;
// The extractor would accept every item, but there are no items to visit.
774+ let summary_stats = compute_summary_statistics ( items, |item| Some ( item. as_ref ( ) ) ) ;
788775
789- // Verify default values for empty collection
790- assert_eq ! ( summary_stats. num_rows, Precision :: Absent ) ;
791- assert_eq ! ( summary_stats. total_byte_size, Precision :: Absent ) ;
792- assert_eq ! ( summary_stats. column_statistics. len( ) , 1 ) ;
793- assert_eq ! (
794- summary_stats. column_statistics[ 0 ] . null_count,
795- Precision :: Absent
796- ) ;
// No item supplied statistics, so the summary is expected to be `None`.
776+ assert ! ( summary_stats. is_none( ) ) ;
797777 }
798778}
0 commit comments