Skip to content

Commit 4dd2e15

Browse files
committed
rebase master
1 parent fd99ce2 commit 4dd2e15

7 files changed

Lines changed: 147 additions & 845 deletions

File tree

parquet/src/arrow/array_reader/builder.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use std::sync::Arc;
1919

2020
use arrow_schema::{DataType, Fields, SchemaBuilder};
2121

22-
use crate::arrow::array_reader::byte_view_array::make_byte_view_array_reader;
22+
use crate::arrow::array_reader::byte_array::make_byte_view_array_reader;
2323
use crate::arrow::array_reader::empty_array::make_empty_array_reader;
2424
use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader;
2525
use crate::arrow::array_reader::{

parquet/src/arrow/array_reader/byte_array.rs

Lines changed: 143 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,36 @@ pub fn make_byte_array_reader(
7474
}
7575
}
7676

77+
/// Returns an [`ArrayReader`] that decodes the provided byte array column to view types.
78+
pub fn make_byte_view_array_reader(
79+
pages: Box<dyn PageIterator>,
80+
column_desc: ColumnDescPtr,
81+
arrow_type: Option<ArrowType>,
82+
) -> Result<Box<dyn ArrayReader>> {
83+
// Check if Arrow type is specified, else create it from Parquet type
84+
let data_type = match arrow_type {
85+
Some(t) => t,
86+
None => match parquet_to_arrow_field(column_desc.as_ref())?.data_type() {
87+
ArrowType::Utf8 | ArrowType::Utf8View => ArrowType::Utf8View,
88+
_ => ArrowType::BinaryView,
89+
},
90+
};
91+
92+
match data_type {
93+
ArrowType::BinaryView | ArrowType::Utf8View => {
94+
let reader = GenericRecordReader::new(column_desc);
95+
Ok(Box::new(ByteArrayReader::<i32>::new(
96+
pages, data_type, reader,
97+
)))
98+
}
99+
100+
_ => Err(general_err!(
101+
"invalid data type for byte array reader read to view type - {}",
102+
data_type
103+
)),
104+
}
105+
}
106+
77107
/// An [`ArrayReader`] for variable length byte arrays
78108
struct ByteArrayReader<I: OffsetSizeTrait> {
79109
data_type: ArrowType,
@@ -588,7 +618,7 @@ mod tests {
588618
use super::*;
589619
use crate::arrow::array_reader::test_util::{byte_array_all_encodings, utf8_column};
590620
use crate::arrow::record_reader::buffer::ValuesBuffer;
591-
use arrow_array::{Array, StringArray};
621+
use arrow_array::{Array, StringArray, StringViewArray};
592622
use arrow_buffer::Buffer;
593623

594624
#[test]
@@ -646,6 +676,64 @@ mod tests {
646676
}
647677
}
648678

679+
#[test]
680+
fn test_byte_array_string_view_decoder() {
681+
let (pages, encoded_dictionary) =
682+
byte_array_all_encodings(vec!["hello", "world", "large payload over 12 bytes", "b"]);
683+
684+
let column_desc = utf8_column();
685+
let mut decoder = ByteArrayColumnValueDecoder::new(&column_desc);
686+
687+
decoder
688+
.set_dict(encoded_dictionary, 4, Encoding::RLE_DICTIONARY, false)
689+
.unwrap();
690+
691+
for (encoding, page) in pages {
692+
let mut output = OffsetBuffer::<i32>::default();
693+
decoder.set_data(encoding, page, 4, Some(4)).unwrap();
694+
695+
assert_eq!(decoder.read(&mut output, 1).unwrap(), 1);
696+
697+
assert_eq!(output.values.as_slice(), "hello".as_bytes());
698+
assert_eq!(output.offsets.as_slice(), &[0, 5]);
699+
700+
assert_eq!(decoder.read(&mut output, 1).unwrap(), 1);
701+
assert_eq!(output.values.as_slice(), "helloworld".as_bytes());
702+
assert_eq!(output.offsets.as_slice(), &[0, 5, 10]);
703+
704+
assert_eq!(decoder.read(&mut output, 2).unwrap(), 2);
705+
assert_eq!(
706+
output.values.as_slice(),
707+
"helloworldlarge payload over 12 bytesb".as_bytes()
708+
);
709+
assert_eq!(output.offsets.as_slice(), &[0, 5, 10, 37, 38]);
710+
711+
assert_eq!(decoder.read(&mut output, 4).unwrap(), 0);
712+
713+
let valid = [false, false, true, true, false, true, true, false, false];
714+
let valid_buffer = Buffer::from_iter(valid.iter().cloned());
715+
716+
output.pad_nulls(0, 4, valid.len(), valid_buffer.as_slice());
717+
let array = output.into_array(Some(valid_buffer), ArrowType::Utf8View);
718+
let strings = array.as_any().downcast_ref::<StringViewArray>().unwrap();
719+
720+
assert_eq!(
721+
strings.iter().collect::<Vec<_>>(),
722+
vec![
723+
None,
724+
None,
725+
Some("hello"),
726+
Some("world"),
727+
None,
728+
Some("large payload over 12 bytes"),
729+
Some("b"),
730+
None,
731+
None,
732+
]
733+
);
734+
}
735+
}
736+
649737
#[test]
650738
fn test_byte_array_decoder_skip() {
651739
let (pages, encoded_dictionary) =
@@ -690,6 +778,60 @@ mod tests {
690778
}
691779
}
692780

781+
#[test]
782+
fn test_byte_array_string_view_decoder_skip() {
783+
let (pages, encoded_dictionary) =
784+
byte_array_all_encodings(vec!["hello", "world", "a", "large payload over 12 bytes"]);
785+
786+
let column_desc = utf8_column();
787+
let mut decoder = ByteArrayColumnValueDecoder::new(&column_desc);
788+
789+
decoder
790+
.set_dict(encoded_dictionary, 4, Encoding::RLE_DICTIONARY, false)
791+
.unwrap();
792+
793+
for (encoding, page) in pages {
794+
let mut output = OffsetBuffer::<i32>::default();
795+
decoder.set_data(encoding, page, 4, Some(4)).unwrap();
796+
797+
assert_eq!(decoder.read(&mut output, 1).unwrap(), 1);
798+
799+
assert_eq!(output.values.as_slice(), "hello".as_bytes());
800+
assert_eq!(output.offsets.as_slice(), &[0, 5]);
801+
802+
assert_eq!(decoder.skip_values(1).unwrap(), 1);
803+
assert_eq!(decoder.skip_values(1).unwrap(), 1);
804+
805+
assert_eq!(decoder.read(&mut output, 1).unwrap(), 1);
806+
assert_eq!(
807+
output.values.as_slice(),
808+
"hellolarge payload over 12 bytes".as_bytes()
809+
);
810+
assert_eq!(output.offsets.as_slice(), &[0, 5, 32]);
811+
812+
assert_eq!(decoder.read(&mut output, 4).unwrap(), 0);
813+
814+
let valid = [false, false, true, true, false, false];
815+
let valid_buffer = Buffer::from_iter(valid.iter().cloned());
816+
817+
output.pad_nulls(0, 2, valid.len(), valid_buffer.as_slice());
818+
let array = output.into_array(Some(valid_buffer), ArrowType::Utf8View);
819+
let strings = array.as_any().downcast_ref::<StringViewArray>().unwrap();
820+
821+
assert_eq!(
822+
strings.iter().collect::<Vec<_>>(),
823+
vec![
824+
None,
825+
None,
826+
Some("hello"),
827+
Some("large payload over 12 bytes"),
828+
None,
829+
None,
830+
]
831+
);
832+
}
833+
}
834+
693835
#[test]
694836
fn test_byte_array_decoder_nulls() {
695837
let (pages, encoded_dictionary) = byte_array_all_encodings(Vec::<&str>::new());

0 commit comments

Comments
 (0)