Skip to content

Commit b136d8b

Browse files
authored
PARQUET-2261 Size Statistics (#14000)
Adds Parquet size statistics introduced in apache/parquet-format#197.

Authors:
- Ed Seidl (https://github.com/etseidl)
- Nghia Truong (https://github.com/ttnghia)

Approvers:
- Vukasin Milovanovic (https://github.com/vuule)
- Nghia Truong (https://github.com/ttnghia)
- Yunsong Wang (https://github.com/PointKernel)

URL: #14000
1 parent fe612b3 commit b136d8b

10 files changed

Lines changed: 717 additions & 150 deletions

cpp/src/io/parquet/compact_protocol_reader.cpp

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ class parquet_field_union_struct : public parquet_field {
289289
inline bool operator()(CompactProtocolReader* cpr, int field_type)
290290
{
291291
T v;
292-
bool const res = parquet_field_struct<T>(field(), v).operator()(cpr, field_type);
292+
bool const res = parquet_field_struct<T>{field(), v}(cpr, field_type);
293293
if (!res) {
294294
val = v;
295295
enum_val = static_cast<E>(field());
@@ -424,7 +424,7 @@ class parquet_field_optional : public parquet_field {
424424
inline bool operator()(CompactProtocolReader* cpr, int field_type)
425425
{
426426
T v;
427-
bool const res = FieldFunctor(field(), v).operator()(cpr, field_type);
427+
bool const res = FieldFunctor{field(), v}(cpr, field_type);
428428
if (!res) { val = v; }
429429
return res;
430430
}
@@ -631,6 +631,8 @@ bool CompactProtocolReader::read(ColumnChunk* c)
631631

632632
bool CompactProtocolReader::read(ColumnChunkMetaData* c)
633633
{
634+
using optional_size_statistics =
635+
parquet_field_optional<SizeStatistics, parquet_field_struct<SizeStatistics>>;
634636
auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
635637
parquet_field_enum_list(2, c->encodings),
636638
parquet_field_string_list(3, c->path_in_schema),
@@ -641,7 +643,8 @@ bool CompactProtocolReader::read(ColumnChunkMetaData* c)
641643
parquet_field_int64(9, c->data_page_offset),
642644
parquet_field_int64(10, c->index_page_offset),
643645
parquet_field_int64(11, c->dictionary_page_offset),
644-
parquet_field_struct(12, c->statistics));
646+
parquet_field_struct(12, c->statistics),
647+
optional_size_statistics(16, c->size_statistics));
645648
return function_builder(this, op);
646649
}
647650

@@ -700,17 +703,35 @@ bool CompactProtocolReader::read(PageLocation* p)
700703

701704
bool CompactProtocolReader::read(OffsetIndex* o)
702705
{
703-
auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations));
706+
using optional_list_i64 = parquet_field_optional<std::vector<int64_t>, parquet_field_int64_list>;
707+
708+
auto op = std::make_tuple(parquet_field_struct_list(1, o->page_locations),
709+
optional_list_i64(2, o->unencoded_byte_array_data_bytes));
710+
return function_builder(this, op);
711+
}
712+
713+
/**
 * @brief Reads the fields of a SizeStatistics struct.
 *
 * All three fields are wrapped in `parquet_field_optional`, whose call operator assigns the
 * destination only when the wrapped field functor returns false; otherwise the destination
 * is left as it was.
 *
 * @param s Destination struct whose members receive the decoded values
 * @return Whatever `function_builder` returns for this set of field functors
 */
bool CompactProtocolReader::read(SizeStatistics* s)
{
  using opt_int64_field      = parquet_field_optional<int64_t, parquet_field_int64>;
  using opt_int64_list_field =
    parquet_field_optional<std::vector<int64_t>, parquet_field_int64_list>;

  // One functor per field id: 1 = byte-array data bytes, 2 / 3 = level histograms.
  auto field_readers =
    std::make_tuple(opt_int64_field(1, s->unencoded_byte_array_data_bytes),
                    opt_int64_list_field(2, s->repetition_level_histogram),
                    opt_int64_list_field(3, s->definition_level_histogram));

  return function_builder(this, field_readers);
}
706723

707724
bool CompactProtocolReader::read(ColumnIndex* c)
708725
{
726+
using optional_list_i64 = parquet_field_optional<std::vector<int64_t>, parquet_field_int64_list>;
727+
709728
auto op = std::make_tuple(parquet_field_bool_list(1, c->null_pages),
710729
parquet_field_binary_list(2, c->min_values),
711730
parquet_field_binary_list(3, c->max_values),
712731
parquet_field_enum<BoundaryOrder>(4, c->boundary_order),
713-
parquet_field_int64_list(5, c->null_counts));
732+
parquet_field_int64_list(5, c->null_counts),
733+
optional_list_i64(6, c->repetition_level_histogram),
734+
optional_list_i64(7, c->definition_level_histogram));
714735
return function_builder(this, op);
715736
}
716737

cpp/src/io/parquet/compact_protocol_reader.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ class CompactProtocolReader {
116116
bool read(KeyValue* k);
117117
bool read(PageLocation* p);
118118
bool read(OffsetIndex* o);
119+
bool read(SizeStatistics* s);
119120
bool read(ColumnIndex* c);
120121
bool read(Statistics* s);
121122
bool read(ColumnOrder* c);

cpp/src/io/parquet/compact_protocol_writer.cpp

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s)
182182
if (s.index_page_offset != 0) { c.field_int(10, s.index_page_offset); }
183183
if (s.dictionary_page_offset != 0) { c.field_int(11, s.dictionary_page_offset); }
184184
c.field_struct(12, s.statistics);
185+
if (s.size_statistics.has_value()) { c.field_struct(16, s.size_statistics.value()); }
185186
return c.value();
186187
}
187188

@@ -210,6 +211,24 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s)
210211
{
211212
CompactProtocolFieldWriter c(*this);
212213
c.field_struct_list(1, s.page_locations);
214+
if (s.unencoded_byte_array_data_bytes.has_value()) {
215+
c.field_int_list(2, s.unencoded_byte_array_data_bytes.value());
216+
}
217+
return c.value();
218+
}
219+
220+
/**
 * @brief Writes a SizeStatistics struct, emitting only the members that hold a value.
 *
 * @param s Statistics to serialize
 * @return The value reported by the field writer once all present fields are written
 */
size_t CompactProtocolWriter::write(SizeStatistics const& s)
{
  CompactProtocolFieldWriter writer(*this);

  // Field 1: scalar byte count.
  if (auto const& data_bytes = s.unencoded_byte_array_data_bytes; data_bytes.has_value()) {
    writer.field_int(1, *data_bytes);
  }
  // Fields 2 and 3: level histograms, each written as an integer list.
  if (auto const& rep_hist = s.repetition_level_histogram; rep_hist.has_value()) {
    writer.field_int_list(2, *rep_hist);
  }
  if (auto const& def_hist = s.definition_level_histogram; def_hist.has_value()) {
    writer.field_int_list(3, *def_hist);
  }

  return writer.value();
}
215234

@@ -286,13 +305,26 @@ inline void CompactProtocolFieldWriter::field_int(int field, int64_t val)
286305
current_field_value = field;
287306
}
288307

308+
template <>
309+
inline void CompactProtocolFieldWriter::field_int_list<int64_t>(int field,
310+
std::vector<int64_t> const& val)
311+
{
312+
put_field_header(field, current_field_value, ST_FLD_LIST);
313+
put_byte(static_cast<uint8_t>((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I64));
314+
if (val.size() >= 0xfUL) { put_uint(val.size()); }
315+
for (auto const v : val) {
316+
put_int(v);
317+
}
318+
current_field_value = field;
319+
}
320+
289321
template <typename Enum>
290322
inline void CompactProtocolFieldWriter::field_int_list(int field, std::vector<Enum> const& val)
291323
{
292324
put_field_header(field, current_field_value, ST_FLD_LIST);
293-
put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_I32));
294-
if (val.size() >= 0xf) put_uint(val.size());
295-
for (auto& v : val) {
325+
put_byte(static_cast<uint8_t>((std::min(val.size(), 0xfUL) << 4) | ST_FLD_I32));
326+
if (val.size() >= 0xfUL) { put_uint(val.size()); }
327+
for (auto const& v : val) {
296328
put_int(static_cast<int32_t>(v));
297329
}
298330
current_field_value = field;

cpp/src/io/parquet/compact_protocol_writer.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class CompactProtocolWriter {
5151
size_t write(Statistics const&);
5252
size_t write(PageLocation const&);
5353
size_t write(OffsetIndex const&);
54+
size_t write(SizeStatistics const&);
5455
size_t write(ColumnOrder const&);
5556

5657
protected:
@@ -113,4 +114,8 @@ class CompactProtocolFieldWriter {
113114
inline void set_current_field(int const& field);
114115
};
115116

117+
template <>
118+
inline void CompactProtocolFieldWriter::field_int_list<int64_t>(int field,
119+
std::vector<int64_t> const& val);
120+
116121
} // namespace cudf::io::parquet::detail

0 commit comments

Comments
 (0)