@@ -80,6 +80,7 @@ using arrow::DataType;
 using arrow::Datum;
 using arrow::DecimalType;
 using arrow::default_memory_pool;
+using arrow::DictionaryArray;
 using arrow::ListArray;
 using arrow::PrimitiveArray;
 using arrow::ResizableBuffer;
@@ -4138,6 +4139,82 @@ TEST_P(TestArrowWriteDictionary, Statistics) {
 INSTANTIATE_TEST_SUITE_P(WriteDictionary, TestArrowWriteDictionary,
                          ::testing::Values(ParquetDataPageVersion::V1,
                                            ParquetDataPageVersion::V2));
+
+TEST_P(TestArrowWriteDictionary, StatisticsUnifiedDictionary) {
+  // Two chunks, with a shared dictionary
+  std::shared_ptr<::arrow::Table> table;
+  std::shared_ptr<::arrow::DataType> dict_type =
+      ::arrow::dictionary(::arrow::int32(), ::arrow::utf8());
+  std::shared_ptr<::arrow::Schema> schema =
+      ::arrow::schema({::arrow::field("values", dict_type)});
+  {
+    // It's important there are no duplicate values in the dictionary, otherwise
+    // we trigger the WriteDense() code path which side-steps dictionary encoding.
+    std::shared_ptr<::arrow::Array> test_dictionary =
+        ArrayFromJSON(::arrow::utf8(), R"(["b", "c", "d", "a"])");
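+    // "d" is never referenced by the indices below, so it should not leak into
+    // the statistics: min/max are expected to reflect referenced values only.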
+    std::vector<std::shared_ptr<::arrow::Array>> test_indices = {
+        ArrayFromJSON(::arrow::int32(),
+                      R"([3, null, 3, 3, null, 3])"),  // ["a", null, "a", "a", null, "a"]
+        ArrayFromJSON(
+            ::arrow::int32(),
+            R"([0, 3, null, 0, null, 1])")};  // ["b", "a", null, "b", null, "c"]
+
+    ::arrow::ArrayVector chunks = {
+        std::make_shared<DictionaryArray>(dict_type, test_indices[0], test_dictionary),
+        std::make_shared<DictionaryArray>(dict_type, test_indices[1], test_dictionary),
+    };
+    std::shared_ptr<ChunkedArray> arr = std::make_shared<ChunkedArray>(chunks, dict_type);
+    table = ::arrow::Table::Make(schema, {arr});
+  }
+
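+  // Serialize the table into an in-memory buffer so it can be read back below.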
+  std::shared_ptr<::arrow::ResizableBuffer> serialized_data = AllocateBuffer();
+  auto out_stream = std::make_shared<::arrow::io::BufferOutputStream>(serialized_data);
+  {
+    // Will write data as two row groups, one with 9 rows and one with 3.
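+    // Tiny batch and page sizes force frequent flushes, so each row group
+    // spans several write batches and data pages.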
+    std::shared_ptr<WriterProperties> writer_properties =
+        WriterProperties::Builder()
+            .max_row_group_length(9)
+            ->data_page_version(this->GetParquetDataPageVersion())
+            ->write_batch_size(3)
+            ->data_pagesize(3)
+            ->build();
+    std::unique_ptr<FileWriter> writer;
+    ASSERT_OK_AND_ASSIGN(
+        writer, FileWriter::Open(*schema, ::arrow::default_memory_pool(), out_stream,
+                                 writer_properties, default_arrow_writer_properties()));
+    ASSERT_OK(writer->WriteTable(*table, std::numeric_limits<int64_t>::max()));
+    ASSERT_OK(writer->Close());
+    ASSERT_OK(out_stream->Close());
+  }
+
+  auto buffer_reader = std::make_shared<::arrow::io::BufferReader>(serialized_data);
+  std::unique_ptr<ParquetFileReader> parquet_reader =
+      ParquetFileReader::Open(std::move(buffer_reader));
+  // Check row group statistics
+  std::shared_ptr<FileMetaData> metadata = parquet_reader->metadata();
+  ASSERT_EQ(metadata->num_row_groups(), 2);
+  ASSERT_EQ(metadata->RowGroup(0)->num_rows(), 9);
+  ASSERT_EQ(metadata->RowGroup(1)->num_rows(), 3);
+  auto stats0 = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
+  auto stats1 = metadata->RowGroup(1)->ColumnChunk(0)->statistics();
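+  // Row group 0 covers chunk 0 plus the first three rows of chunk 1:
+  // non-null values {"a", "a", "a", "a", "b", "a"} with three nulls.
+  // Row group 1 covers the last three rows of chunk 1: {"b", "c"} with one null.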
+  ASSERT_EQ(stats0->num_values(), 6);
+  ASSERT_EQ(stats1->num_values(), 2);
+  ASSERT_EQ(stats0->null_count(), 3);
+  ASSERT_EQ(stats1->null_count(), 1);
+  ASSERT_EQ(stats0->EncodeMin(), "a");
+  ASSERT_EQ(stats1->EncodeMin(), "b");
+  ASSERT_EQ(stats0->EncodeMax(), "b");
+  ASSERT_EQ(stats1->EncodeMax(), "c");
+}
+
 // ----------------------------------------------------------------------
 // Tests for directly reading DictionaryArray
 