Skip to content

Commit 8d2db22

Browse files
committed
ARROW-91: Basic Parquet read support
1 parent d9940d8 commit 8d2db22

5 files changed

Lines changed: 266 additions & 8 deletions

File tree

cpp/src/arrow/parquet/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
# arrow_parquet : Arrow <-> Parquet adapter
2020

2121
set(PARQUET_SRCS
22+
reader.cc
2223
schema.cc
2324
)
2425

@@ -36,6 +37,9 @@ SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX)
3637
ADD_ARROW_TEST(parquet-schema-test)
3738
ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet)
3839

40+
ADD_ARROW_TEST(parquet-reader-test)
41+
ARROW_TEST_LINK_LIBRARIES(parquet-reader-test arrow_parquet)
42+
3943
# Headers: top level
4044
install(FILES
4145
DESTINATION include/arrow/parquet)
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include "gtest/gtest.h"
19+
20+
#include "arrow/test-util.h"
21+
#include "arrow/parquet/reader.h"
22+
#include "arrow/types/primitive.h"
23+
#include "arrow/util/memory-pool.h"
24+
#include "arrow/util/status.h"
25+
26+
#include "parquet/api/schema.h"
27+
#include "parquet/column/writer.h"
28+
#include "parquet/file/reader.h"
29+
#include "parquet/file/writer.h"
30+
#include "parquet/util/input.h"
31+
#include "parquet/util/output.h"
32+
33+
using ParquetBuffer = parquet::Buffer;
34+
using parquet::BufferReader;
35+
using parquet::InMemoryOutputStream;
36+
using parquet::Int64Writer;
37+
using parquet::ParquetFileReader;
38+
using parquet::ParquetFileWriter;
39+
using parquet::RandomAccessSource;
40+
using parquet::Repetition;
41+
using parquet::SchemaDescriptor;
42+
using ParquetType = parquet::Type;
43+
using parquet::schema::GroupNode;
44+
using parquet::schema::NodePtr;
45+
using parquet::schema::PrimitiveNode;
46+
47+
namespace arrow {
48+
49+
namespace parquet {
50+
51+
class TestReadParquet : public ::testing::Test {
52+
public:
53+
virtual void SetUp() {}
54+
55+
std::shared_ptr<GroupNode> Int64Schema() {
56+
auto pnode = PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64);
57+
NodePtr node_ =
58+
GroupNode::Make("schema", Repetition::REQUIRED, std::vector<NodePtr>({pnode}));
59+
return std::static_pointer_cast<GroupNode>(node_);
60+
}
61+
62+
std::unique_ptr<ParquetFileReader> Int64File(
63+
std::vector<int64_t>& values, int num_chunks) {
64+
std::shared_ptr<GroupNode> schema = Int64Schema();
65+
std::shared_ptr<InMemoryOutputStream> sink(new InMemoryOutputStream());
66+
auto file_writer = ParquetFileWriter::Open(sink, schema);
67+
size_t chunk_size = values.size() / num_chunks;
68+
for (int i = 0; i < num_chunks; i++) {
69+
auto row_group_writer = file_writer->AppendRowGroup(chunk_size);
70+
auto column_writer = static_cast<Int64Writer*>(row_group_writer->NextColumn());
71+
int64_t* data = values.data() + i * chunk_size;
72+
column_writer->WriteBatch(chunk_size, nullptr, nullptr, data);
73+
column_writer->Close();
74+
row_group_writer->Close();
75+
}
76+
file_writer->Close();
77+
78+
std::shared_ptr<ParquetBuffer> buffer = sink->GetBuffer();
79+
std::unique_ptr<RandomAccessSource> source(new BufferReader(buffer));
80+
return ParquetFileReader::Open(std::move(source));
81+
}
82+
83+
private:
84+
};
85+
86+
// Writes 100 identical INT64 values as one row group and checks that a single
// NextBatch(100) call hands them back unchanged.
TEST_F(TestReadParquet, SingleColumnInt64) {
  std::vector<int64_t> values(100, 128);
  arrow::parquet::FileReader reader(default_memory_pool(), Int64File(values, 1));

  std::unique_ptr<arrow::parquet::FlatColumnReader> flat_reader;
  ASSERT_OK(reader.GetFlatColumn(0, &flat_reader));
  ASSERT_NE(nullptr, flat_reader.get());

  std::shared_ptr<Array> batch;
  ASSERT_OK(flat_reader->NextBatch(100, &batch));
  ASSERT_NE(nullptr, batch.get());

  Int64Array* typed_batch = static_cast<Int64Array*>(batch.get());
  size_t idx = 0;
  for (const int64_t expected : values) {
    EXPECT_EQ(expected, typed_batch->raw_data()[idx]);
    ++idx;
  }
}
101+
102+
// Writes 100 identical INT64 values split over four row groups and checks that
// a single NextBatch(100) call returns all of them across the chunk boundaries.
TEST_F(TestReadParquet, SingleColumnInt64Chunked) {
  std::vector<int64_t> values(100, 128);
  arrow::parquet::FileReader reader(default_memory_pool(), Int64File(values, 4));

  std::unique_ptr<arrow::parquet::FlatColumnReader> flat_reader;
  ASSERT_OK(reader.GetFlatColumn(0, &flat_reader));
  ASSERT_NE(nullptr, flat_reader.get());

  std::shared_ptr<Array> batch;
  ASSERT_OK(flat_reader->NextBatch(100, &batch));
  ASSERT_NE(nullptr, batch.get());

  Int64Array* typed_batch = static_cast<Int64Array*>(batch.get());
  size_t idx = 0;
  for (const int64_t expected : values) {
    EXPECT_EQ(expected, typed_batch->raw_data()[idx]);
    ++idx;
  }
}
117+
118+
} // namespace parquet
119+
120+
} // namespace arrow

cpp/src/arrow/parquet/reader.cc

Lines changed: 130 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,141 @@
1717

1818
#include "arrow/parquet/reader.h"
1919

20+
#include <queue>
21+
22+
#include "arrow/parquet/schema.h"
23+
#include "arrow/schema.h"
24+
#include "arrow/types/primitive.h"
25+
#include "arrow/util/status.h"
26+
27+
using parquet::ColumnReader;
28+
using parquet::TypedColumnReader;
29+
2030
namespace arrow {
2131
namespace parquet {
2232

2333
// Private implementation behind FileReader. Keeps the memory pool pointer and
// takes ownership of the underlying Parquet file reader.
class FileReader::Impl {
 public:
  Impl(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader);
  virtual ~Impl() = default;

  // Builds a FlatColumnReader over column `i`, fed by that column's reader
  // from each row group of the file, and stores it in `*out`.
  Status GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out);

 private:
  // Pool handed on to the column readers created by GetFlatColumn.
  MemoryPool* pool_;
  // Owned Parquet file reader that all column readers are obtained from.
  std::unique_ptr<::parquet::ParquetFileReader> reader_;
};
2744

28-
} // namespace parquet
29-
} // namespace arrow
45+
// Private implementation behind FlatColumnReader. Reads one column by draining
// a queue of per-row-group Parquet column readers from front to back.
class FlatColumnReader::Impl {
 public:
  Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr,
      std::queue<std::shared_ptr<ColumnReader>>&& column_readers);
  virtual ~Impl() = default;

  // Reads up to `batch_size` values into a new Array stored in `*out`,
  // dispatching on the Arrow type of the column's field.
  Status NextBatch(int batch_size, std::shared_ptr<Array>* out);

  // Type-specific worker for NextBatch: ArrowType selects the builder,
  // ParquetType the typed column reader, CType the scratch value buffer.
  template <typename ArrowType, typename ParquetType, typename CType>
  Status TypedReadBatch(int batch_size, std::shared_ptr<Array>* out);

 private:
  // Pool passed to the array builder.
  MemoryPool* pool_;
  // Descriptor of the column being read (not owned).
  const ::parquet::ColumnDescriptor* descr_;
  // Remaining column readers; the front one is the one currently being read.
  std::queue<std::shared_ptr<ColumnReader>> column_readers_;
  // Arrow field produced from the column's schema node by NodeToField.
  std::shared_ptr<Field> field_;
};
61+
62+
FileReader::Impl::Impl(
63+
MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader)
64+
: pool_(pool), reader_(std::move(reader)) {}
65+
66+
// Creates a FlatColumnReader for column `i`.
//
// The column reader of every row group is queued in row-group order so the
// resulting FlatColumnReader can read across row-group boundaries. Always
// returns Status::OK(); `i` is not range-checked here.
Status FileReader::Impl::GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out) {
  std::queue<std::shared_ptr<ColumnReader>> chunk_readers;
  int row_group = 0;
  while (row_group < reader_->num_row_groups()) {
    chunk_readers.push(reader_->RowGroup(row_group)->Column(i));
    ++row_group;
  }

  const ::parquet::ColumnDescriptor* column_descr = reader_->descr()->Column(i);
  std::unique_ptr<FlatColumnReader::Impl> column_impl(
      new FlatColumnReader::Impl(pool_, column_descr, std::move(chunk_readers)));
  out->reset(new FlatColumnReader(std::move(column_impl)));
  return Status::OK();
}
76+
77+
FileReader::FileReader(
78+
MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader)
79+
: impl_(new FileReader::Impl(pool, std::move(reader))) {}
80+
81+
FileReader::~FileReader() {}
82+
83+
// Forwards to the implementation object; see FileReader::Impl::GetFlatColumn.
Status FileReader::GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out) {
  Status result = impl_->GetFlatColumn(i, out);
  return result;
}
86+
87+
// Reads column `i` as a whole into an Array.
//
// Not implemented yet. This used to return Status::OK() without ever writing
// `*out`, so callers saw "success" with no result. It now clears `*out` and
// reports NotImplemented so the missing functionality is visible to callers.
Status FileReader::ReadFlatColumn(int i, std::shared_ptr<Array>* out) {
  if (out != nullptr) { *out = std::shared_ptr<Array>(nullptr); }
  return Status::NotImplemented("ReadFlatColumn is not implemented yet");
}
90+
91+
// Takes over the queue of per-row-group column readers and derives the Arrow
// field for the column from its Parquet schema node.
//
// `column_readers` is a named rvalue reference and therefore an lvalue inside
// this constructor; it must be passed through std::move, otherwise the queue
// (and every shared_ptr in it) is copied instead of moved.
FlatColumnReader::Impl::Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr,
    std::queue<std::shared_ptr<ColumnReader>>&& column_readers)
    : pool_(pool), descr_(descr), column_readers_(std::move(column_readers)) {
  // NOTE(review): whatever NodeToField reports is discarded here because a
  // constructor cannot return it; presumably field_ stays null on failure —
  // confirm against NodeToField.
  NodeToField(descr_->schema_node(), &field_);
}
96+
97+
template <typename ArrowType, typename ParquetType, typename CType>
98+
Status FlatColumnReader::Impl::TypedReadBatch(
99+
int batch_size, std::shared_ptr<Array>* out) {
100+
int values_to_read = batch_size;
101+
NumericBuilder<ArrowType> builder(pool_, field_->type);
102+
while ((values_to_read > 0) && (column_readers_.size() > 0)) {
103+
// TODO: This is a lot malloc-thresing and not using the memory pool.
104+
std::vector<CType> values(values_to_read);
105+
std::vector<int16_t> def_levels(values_to_read);
106+
auto reader =
107+
dynamic_cast<TypedColumnReader<ParquetType>*>(column_readers_.front().get());
108+
int64_t values_read;
109+
values_to_read -= reader->ReadBatch(
110+
values_to_read, def_levels.data(), nullptr, values.data(), &values_read);
111+
if (descr_->max_definition_level() == 0) {
112+
builder.Append(values.data(), values_read);
113+
} else {
114+
return Status::NotImplemented("no support for definition levels yet");
115+
}
116+
if (!column_readers_.front()->HasNext()) { column_readers_.pop(); }
117+
}
118+
*out = builder.Finish();
119+
return Status::OK();
120+
}
121+
122+
#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType, CType) \
123+
case Type::ENUM: \
124+
return TypedReadBatch<ArrowType, ParquetType, CType>(batch_size, out); \
125+
break;
126+
127+
Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr<Array>* out) {
128+
if (column_readers_.size() == 0) {
129+
// Exhausted all readers.
130+
*out = std::shared_ptr<Array>(nullptr);
131+
}
132+
133+
if (descr_->max_repetition_level() > 0) {
134+
return Status::NotImplemented("no support for repetition yet");
135+
}
136+
137+
*out = std::shared_ptr<Array>(nullptr);
138+
switch (field_->type->type) {
139+
TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type, int32_t)
140+
TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type, int64_t)
141+
TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType, float)
142+
TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType, double)
143+
default:
144+
return Status::NotImplemented(field_->type->ToString());
145+
}
146+
}
147+
148+
FlatColumnReader::FlatColumnReader(std::unique_ptr<Impl> impl) : impl_(std::move(impl)) {}
149+
150+
FlatColumnReader::~FlatColumnReader() {}
151+
152+
// Forwards to the implementation object; see FlatColumnReader::Impl::NextBatch.
Status FlatColumnReader::NextBatch(int batch_size, std::shared_ptr<Array>* out) {
  Status result = impl_->NextBatch(batch_size, out);
  return result;
}
155+
156+
} // namespace parquet
157+
} // namespace arrow

cpp/src/arrow/parquet/reader.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,7 @@ class FlatColumnReader;
7878
// arrays
7979
class FileReader {
8080
public:
81-
ArrowReader(MemoryPool* pool,
82-
std::unique_ptr<::parquet::ParquetFileReader> reader);
81+
FileReader(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader);
8382

8483
// Since the distribution of columns amongst a Parquet file's row groups may
8584
// be uneven (the number of values in each column chunk can be different), we
@@ -89,6 +88,10 @@ class FileReader {
8988
//
9089
// Returns error status if the column of interest is not flat.
9190
Status GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out);
91+
// Read column as a whole into an Array.
92+
Status ReadFlatColumn(int i, std::shared_ptr<Array>* out);
93+
94+
virtual ~FileReader();
9295

9396
private:
9497
class Impl;
@@ -103,6 +106,8 @@ class FileReader {
103106
// might change in the future.
104107
class FlatColumnReader {
105108
public:
109+
virtual ~FlatColumnReader();
110+
106111
// Scan the next array of the indicated size. The actual size of the
107112
// returned array may be less than the passed size depending how much data is
108113
// available in the file.
@@ -117,12 +122,13 @@ class FlatColumnReader {
117122
private:
118123
class Impl;
119124
std::unique_ptr<Impl> impl_;
125+
FlatColumnReader(std::unique_ptr<Impl> impl);
120126

121127
friend class FileReader;
122128
};
123129

124-
} // namespace parquet
130+
} // namespace parquet
125131

126-
} // namespace arrow
132+
} // namespace arrow
127133

128-
#endif ARROW_PARQUET_READER_H
134+
#endif // ARROW_PARQUET_READER_H

cpp/src/arrow/parquet/schema.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,4 @@ Status ToParquetSchema(
4545

4646
} // namespace arrow
4747

48-
#endif // ARROW_PARQUET_SCHEMA_H
48+
#endif // ARROW_PARQUET_SCHEMA_H

0 commit comments

Comments
 (0)