Skip to content

Commit 2816f37

Browse files
ARROW-10812: [Rust] Make BooleanArray not a PrimitiveArray
This PR creates a new struct `BooleanArray`, that replaces `PrimitiveArray<BooleanType>`, so that we do not have to consider the differences between being bit-packed and non-bit-packed. This difference is causing a significant performance degradation described on ARROW-10453 and #8837. This usage of different logic is already observed in most of our kernels, as the code for byte-width and bit-packed is almost always different, due to how offsets are computed. With this PR, that offset computation no longer depends on bit-packed vs non-bit-packed. IMPORTANT: this removes support for Boolean arrays in `UnionArray`, as `UnionArray` currently only supports `PrimitiveType`. Micro benchmarks (worst to best, statistically insignificant ignored): | benchmark | variation | |-------------- | -------------- | | min nulls 512 | 33.7 | | record_batches_to_csv | 23.1 | | array_string_from_vec 256 | 5.6 | | array_string_from_vec 512 | 5.2 | | take bool nulls 512 | 4.9 | | cast int32 to int64 512 | 2.5 | | equal_512 | 2.3 | | filter u8 very low selectivity | 2.2 | | array_slice 512 | 2.1 | | take bool nulls 1024 | 2.0 | | cast int64 to int32 512 | 1.6 | | min 512 | 1.6 | | take i32 512 | 1.1 | | add 512 | 1.1 | | array_slice 2048 | 1.0 | | length | 1.0 | | filter u8 low selectivity | 0.9 | | filter u8 high selectivity | 0.9 | | array_string_from_vec 128 | 0.9 | | cast int32 to float64 512 | 0.9 | | cast timestamp_ms to i64 512 | 0.8 | | take str null indices 512 | 0.6 | | sum 512 | 0.4 | | filter context u8 very low selectivity | -0.7 | | take i32 1024 | -0.9 | | filter context f32 very low selectivity | -0.9 | | cast float64 to float32 512 | -1.0 | | equal_nulls_512 | -1.0 | | cast time32s to time32ms 512 | -1.1 | | sort 2^12 | -1.2 | | struct_array_from_vec 128 | -1.4 | | array_from_vec 256 | -1.4 | | array_from_vec 128 | -1.5 | | filter context u8 high selectivity | -1.6 | | limit 512, 512 | -1.7 | | equal_string_nulls_512 | -1.8 | | take i32 nulls 1024 | -1.8 | | 
struct_array_from_vec 512 | -1.9 | | filter context f32 high selectivity | -2.0 | | cast timestamp_ms to timestamp_ns 512 | -2.2 | | take i32 nulls 512 | -2.3 | | buffer_bit_ops or | -2.4 | | array_from_vec 512 | -2.6 | | cast float64 to uint64 512 | -2.7 | | take str 512 | -2.8 | | min nulls string 512 | -3.1 | | cast int32 to int32 512 | -3.3 | | array_slice 128 | -3.3 | | filter context u8 w NULLs very low selectivity | -3.3 | | buffer_bit_ops and | -3.4 | | struct_array_from_vec 256 | -4.2 | | cast int32 to uint32 512 | -4.5 | | multiply 512 | -5.2 | | equal_string_512 | -5.5 | | take str null values null indices 1024 | -6.8 | | sum nulls 512 | -13.3 | | add_nulls_512 | -17.6 | | like_utf8 scalar contains | -17.8 | | nlike_utf8 scalar contains | -17.9 | | nlike_utf8 scalar complex | -24.6 | | like_utf8 scalar complex | -25.2 | | cast time64ns to time32s 512 | -42.7 | | cast date64 to date32 512 | -49.1 | | cast date32 to date64 512 | -50.7 | | nlike_utf8 scalar starts with | -51.1 | | nlike_utf8 scalar ends with | -55.1 | | like_utf8 scalar ends with | -55.5 | | like_utf8 scalar starts with | -56.3 | | nlike_utf8 scalar equals | -67.8 | | like_utf8 scalar equals | -74.2 | | eq Float32 | -75.7 | | gt_eq Float32 | -76.1 | | lt_eq Float32 | -76.5 | | not | -77.1 | | and | -78.6 | | or | -78.7 | | lt_eq scalar Float32 | -79.4 | | eq scalar Float32 | -82.1 | | neq Float32 | -82.1 | | lt scalar Float32 | -82.1 | | lt Float32 | -82.3 | | gt Float32 | -82.4 | | gt_eq scalar Float32 | -82.4 | | neq scalar Float32 | -82.6 | | gt scalar Float32 | -84.7 | Closes #8842 from jorgecarleitao/boolean Lead-authored-by: Jorge C. Leitao <jorgecarleitao@gmail.com> Co-authored-by: Jorge Leitao <jorgecarleitao@gmail.com> Signed-off-by: Jorge C. Leitao <jorgecarleitao@gmail.com>
1 parent a774ae7 commit 2816f37

24 files changed

Lines changed: 988 additions & 434 deletions

rust/arrow/benches/csv_writer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ fn record_batches_to_csv() {
4646
Some(-556132.25),
4747
]);
4848
let c3 = PrimitiveArray::<UInt32Type>::from(vec![3, 2, 1]);
49-
let c4 = PrimitiveArray::<BooleanType>::from(vec![Some(true), Some(false), None]);
49+
let c4 = BooleanArray::from(vec![Some(true), Some(false), None]);
5050

5151
let b = RecordBatch::try_new(
5252
Arc::new(schema),

rust/arrow/benches/take_kernels.rs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,20 @@ where
4747
Arc::new(array) as ArrayRef
4848
}
4949

50+
// builds a `BooleanArray` of `size` values sampled from `seedable_rng()`; every value is wrapped in `Some`, so the array contains no nulls
51+
fn create_boolean(size: usize) -> ArrayRef
52+
where
53+
Standard: Distribution<bool>,
54+
{
55+
let array: BooleanArray = seedable_rng()
56+
.sample_iter(&Standard)
57+
.take(size)
58+
.map(Some)
59+
.collect();
60+
61+
Arc::new(array) as ArrayRef
62+
}
63+
5064
fn create_strings(size: usize, null_density: f32) -> ArrayRef {
5165
let rng = &mut seedable_rng();
5266

@@ -101,23 +115,23 @@ fn add_benchmark(c: &mut Criterion) {
101115
b.iter(|| bench_take(&values, &indices))
102116
});
103117

104-
let values = create_primitive::<BooleanType>(512);
118+
let values = create_boolean(512);
105119
let indices = create_random_index(512, 0.0);
106120
c.bench_function("take bool 512", |b| {
107121
b.iter(|| bench_take(&values, &indices))
108122
});
109-
let values = create_primitive::<BooleanType>(1024);
123+
let values = create_boolean(1024);
110124
let indices = create_random_index(1024, 0.0);
111125
c.bench_function("take bool 1024", |b| {
112126
b.iter(|| bench_take(&values, &indices))
113127
});
114128

115-
let values = create_primitive::<BooleanType>(512);
129+
let values = create_boolean(512);
116130
let indices = create_random_index(512, 0.5);
117131
c.bench_function("take bool nulls 512", |b| {
118132
b.iter(|| bench_take(&values, &indices))
119133
});
120-
let values = create_primitive::<BooleanType>(1024);
134+
let values = create_boolean(1024);
121135
let indices = create_random_index(1024, 0.5);
122136
c.bench_function("take bool nulls 1024", |b| {
123137
b.iter(|| bench_take(&values, &indices))
Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::borrow::Borrow;
19+
use std::iter::{FromIterator, IntoIterator};
20+
use std::mem;
21+
use std::{any::Any, fmt};
22+
use std::{convert::From, sync::Arc};
23+
24+
use super::*;
25+
use super::{array::print_long_array, raw_pointer::RawPtrBox};
26+
use crate::buffer::{Buffer, MutableBuffer};
27+
use crate::memory;
28+
use crate::util::bit_util;
29+
30+
/// Array of bools, stored bit-packed: `value` reads one bit per element from the values buffer.
31+
pub struct BooleanArray {
32+
data: ArrayDataRef,
33+
/// Pointer to the start of the values buffer. It must not outlive the value buffer
34+
/// stored in `data`; since `data` is held alongside it, it is safe to store.
35+
raw_values: RawPtrBox<u8>,
36+
}
37+
38+
// Debug output: a `BooleanArray` header line, an opening `[`, the elements as emitted by
// `print_long_array` (each one formatted through `bool`'s `Debug`), then a closing `]`.
impl fmt::Debug for BooleanArray {
39+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
40+
write!(f, "BooleanArray\n[\n")?;
41+
print_long_array(self, f, |array, index, f| {
42+
fmt::Debug::fmt(&array.value(index), f)
43+
})?;
44+
write!(f, "]")
45+
}
46+
}
47+
48+
impl BooleanArray {
49+
/// Returns the length of this array.
50+
pub fn len(&self) -> usize {
51+
self.data.len()
52+
}
53+
54+
/// Returns whether this array is empty.
55+
pub fn is_empty(&self) -> bool {
56+
self.data.is_empty()
57+
}
58+
59+
/// Returns a raw pointer into the values buffer, advanced by `self.data.offset()` bytes. NOTE(review): `value` applies the array offset as a bit index, not a byte count — confirm which unit is intended here.
60+
pub fn raw_values(&self) -> *const u8 {
61+
unsafe { self.raw_values.get().add(self.data.offset()) }
62+
}
63+
64+
/// Returns a slice of `len` bytes starting `offset` bytes past `raw_values()`.
65+
///
66+
/// Note: no bounds checking is performed, for performance reasons.
67+
pub fn value_slice(&self, offset: usize, len: usize) -> &[u8] {
68+
let raw =
69+
unsafe { std::slice::from_raw_parts(self.raw_values().add(offset), len) };
70+
&raw[..]
71+
}
72+
73+
/// Returns a new `BooleanBuilder` created with the given `capacity`.
74+
pub fn builder(capacity: usize) -> BooleanBuilder {
75+
BooleanBuilder::new(capacity)
76+
}
77+
78+
/// Returns a `Buffer` holding all the values of this array.
79+
///
80+
/// Note this doesn't take the offset of this array into account.
81+
pub fn values(&self) -> Buffer {
82+
self.data.buffers()[0].clone()
83+
}
84+
85+
/// Returns the boolean value at index `i`, read as bit `i + self.offset()` of the values buffer.
86+
///
87+
/// Note: no bounds checking is performed, for performance reasons.
88+
pub fn value(&self, i: usize) -> bool {
89+
let offset = i + self.offset();
90+
unsafe { bit_util::get_bit_raw(self.raw_values.get() as *const u8, offset) }
91+
}
92+
}
93+
94+
impl Array for BooleanArray {
95+
// Exposes the array as `Any` (presumably so callers can downcast to `BooleanArray`).
fn as_any(&self) -> &Any {
96+
self
97+
}
98+
99+
// Returns a clone of the `ArrayDataRef` handle held by this array.
fn data(&self) -> ArrayDataRef {
100+
self.data.clone()
101+
}
102+
103+
// Borrows the `ArrayDataRef` held by this array without cloning it.
fn data_ref(&self) -> &ArrayDataRef {
104+
&self.data
105+
}
106+
107+
/// Returns the total number of bytes of memory occupied by the buffers owned by this [BooleanArray].
108+
fn get_buffer_memory_size(&self) -> usize {
109+
self.data.get_buffer_memory_size()
110+
}
111+
112+
/// Returns the total number of bytes of memory occupied physically by this [BooleanArray].
113+
fn get_array_memory_size(&self) -> usize {
114+
self.data.get_array_memory_size() + mem::size_of_val(self)
115+
}
116+
}
117+
118+
// Builds a `BooleanArray` from plain bools. Only a values buffer is attached to the
// `ArrayData` builder (no null bitmap), so the result has no nulls.
impl From<Vec<bool>> for BooleanArray {
119+
fn from(data: Vec<bool>) -> Self {
120+
// presumably a zero-initialised bitmap with room for `data.len()` bits — confirm `new_null`
let mut mut_buf = MutableBuffer::new_null(data.len());
121+
{
122+
let mut_slice = mut_buf.data_mut();
123+
// set bit `i` for every `true`; `false` entries leave their bit untouched
for (i, b) in data.iter().enumerate() {
124+
if *b {
125+
bit_util::set_bit(mut_slice, i);
126+
}
127+
}
128+
}
129+
let array_data = ArrayData::builder(DataType::Boolean)
130+
.len(data.len())
131+
.add_buffer(mut_buf.freeze())
132+
.build();
133+
BooleanArray::from(array_data)
134+
}
135+
}
136+
137+
// Builds a `BooleanArray` from optional bools by delegating to the `FromIterator`
// implementation; `None` entries become nulls.
impl From<Vec<Option<bool>>> for BooleanArray {
138+
fn from(data: Vec<Option<bool>>) -> Self {
139+
BooleanArray::from_iter(data.iter())
140+
}
141+
}
142+
143+
// Wraps existing array data in a `BooleanArray`, caching a raw pointer to its values buffer.
//
// Panics if `data` does not contain exactly one buffer, or if the alignment assertion fails.
impl From<ArrayDataRef> for BooleanArray {
144+
fn from(data: ArrayDataRef) -> Self {
145+
assert_eq!(
146+
data.buffers().len(),
147+
1,
148+
"BooleanArray data should contain a single buffer only (values buffer)"
149+
);
150+
let raw_values = data.buffers()[0].raw_data();
151+
// NOTE(review): `align_of::<bool>()` is 1, so this check presumably always passes — confirm against `memory::is_aligned`
assert!(
152+
memory::is_aligned::<u8>(raw_values, mem::align_of::<bool>()),
153+
"memory is not aligned"
154+
);
155+
Self {
156+
data,
157+
raw_values: RawPtrBox::new(raw_values as *const u8),
158+
}
159+
}
160+
}
161+
162+
// Lets `&BooleanArray` be used directly in `for` loops: iteration goes through
// `BooleanIter` and yields `Option<bool>` items.
impl<'a> IntoIterator for &'a BooleanArray {
163+
type Item = Option<bool>;
164+
type IntoIter = BooleanIter<'a>;
165+
166+
fn into_iter(self) -> Self::IntoIter {
167+
BooleanIter::<'a>::new(self)
168+
}
169+
}
170+
171+
impl<'a> BooleanArray {
172+
/// Constructs a new `BooleanIter` borrowing this array.
173+
pub fn iter(&'a self) -> BooleanIter<'a> {
174+
BooleanIter::<'a>::new(&self)
175+
}
176+
}
177+
178+
// Collects optional bools into a `BooleanArray`, filling a null bitmap and a values
// bitmap in a single pass.
//
// Panics if the iterator's `size_hint` has no upper bound. The array length is taken
// from that upper bound, not from the number of items actually yielded.
impl<Ptr: Borrow<Option<bool>>> FromIterator<Ptr> for BooleanArray {
179+
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
180+
let iter = iter.into_iter();
181+
let (_, data_len) = iter.size_hint();
182+
let data_len = data_len.expect("Iterator must be sized"); // panic if no upper bound.
183+
184+
// one bit per element, rounded up to whole bytes; both bitmaps are created with all bits cleared
let num_bytes = bit_util::ceil(data_len, 8);
185+
let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
186+
let mut val_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false);
187+
188+
// mutable byte view over the values buffer, spanning its full capacity
let data = unsafe {
189+
std::slice::from_raw_parts_mut(val_buf.raw_data_mut(), val_buf.capacity())
190+
};
191+
192+
let null_slice = null_buf.data_mut();
193+
// `Some(_)` sets the bit in the null bitmap; `Some(true)` additionally sets the value bit.
// `None` leaves both bits cleared.
iter.enumerate().for_each(|(i, item)| {
194+
if let Some(a) = item.borrow() {
195+
bit_util::set_bit(null_slice, i);
196+
if *a {
197+
bit_util::set_bit(data, i);
198+
}
199+
}
200+
});
201+
202+
let data = ArrayData::new(
203+
DataType::Boolean,
204+
data_len,
205+
None,
206+
Some(null_buf.freeze()),
207+
0,
208+
vec![val_buf.freeze()],
209+
vec![],
210+
);
211+
BooleanArray::from(Arc::new(data))
212+
}
213+
}
214+
215+
#[cfg(test)]
216+
mod tests {
217+
use super::*;
218+
219+
use crate::buffer::Buffer;
220+
use crate::datatypes::DataType;
221+
222+
#[test]
223+
fn test_boolean_fmt_debug() {
224+
let arr = BooleanArray::from(vec![true, false, false]);
225+
assert_eq!(
226+
"BooleanArray\n[\n true,\n false,\n false,\n]",
227+
format!("{:?}", arr)
228+
);
229+
}
230+
231+
#[test]
232+
fn test_boolean_with_null_fmt_debug() {
233+
let mut builder = BooleanArray::builder(3);
234+
builder.append_value(true).unwrap();
235+
builder.append_null().unwrap();
236+
builder.append_value(false).unwrap();
237+
let arr = builder.finish();
238+
assert_eq!(
239+
"BooleanArray\n[\n true,\n null,\n false,\n]",
240+
format!("{:?}", arr)
241+
);
242+
}
243+
244+
#[test]
245+
fn test_boolean_array_from_vec() {
246+
// 10 = 0b00001010: bits 1 and 3 are set, matching the `true` entries below
let buf = Buffer::from([10_u8]);
247+
let arr = BooleanArray::from(vec![false, true, false, true]);
248+
assert_eq!(buf, arr.values());
249+
assert_eq!(4, arr.len());
250+
assert_eq!(0, arr.offset());
251+
assert_eq!(0, arr.null_count());
252+
for i in 0..4 {
253+
assert!(!arr.is_null(i));
254+
assert!(arr.is_valid(i));
255+
assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i)
256+
}
257+
}
258+
259+
#[test]
260+
fn test_boolean_array_from_vec_option() {
261+
// the null slot (index 2) leaves its value bit cleared, so the values buffer is still 0b00001010
let buf = Buffer::from([10_u8]);
262+
let arr = BooleanArray::from(vec![Some(false), Some(true), None, Some(true)]);
263+
assert_eq!(buf, arr.values());
264+
assert_eq!(4, arr.len());
265+
assert_eq!(0, arr.offset());
266+
assert_eq!(1, arr.null_count());
267+
for i in 0..4 {
268+
if i == 2 {
269+
assert!(arr.is_null(i));
270+
assert!(!arr.is_valid(i));
271+
} else {
272+
assert!(!arr.is_null(i));
273+
assert!(arr.is_valid(i));
274+
assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i)
275+
}
276+
}
277+
}
278+
279+
#[test]
280+
fn test_boolean_array_builder() {
281+
// Test building a boolean array with ArrayData builder and offset
282+
// 27 = 0b00011011; with offset 2 the first three values are bits 2, 3 and 4: false, true, true
283+
let buf = Buffer::from([27_u8]);
284+
let buf2 = buf.clone();
285+
let data = ArrayData::builder(DataType::Boolean)
286+
.len(5)
287+
.offset(2)
288+
.add_buffer(buf)
289+
.build();
290+
let arr = BooleanArray::from(data);
291+
assert_eq!(buf2, arr.values());
292+
assert_eq!(5, arr.len());
293+
assert_eq!(2, arr.offset());
294+
assert_eq!(0, arr.null_count());
295+
for i in 0..3 {
296+
assert_eq!(i != 0, arr.value(i), "failed at {}", i);
297+
}
298+
}
299+
300+
#[test]
301+
#[should_panic(expected = "BooleanArray data should contain a single buffer only \
302+
(values buffer)")]
303+
fn test_boolean_array_invalid_buffer_len() {
304+
let data = ArrayData::builder(DataType::Boolean).len(5).build();
305+
BooleanArray::from(data);
306+
}
307+
}

0 commit comments

Comments
 (0)