Skip to content

Commit 88e9eb8

Browse files
nevi-me authored and alamb committed
ARROW-11599: [Rust] Add function to create array with all nulls
This allows creating an array with n null values. Closes #9469 from nevi-me/make-empty-arrays. Lead-authored-by: Neville Dipale <nevilledips@gmail.com>. Co-authored-by: Wakahisa <nevilledips@gmail.com>. Signed-off-by: Andrew Lamb <andrew@nerdnetworks.org>.
1 parent 7660a22 commit 88e9eb8

5 files changed

Lines changed: 267 additions & 83 deletions

File tree

rust/arrow/src/array/array.rs

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use std::{any::Any, convert::TryFrom};
2222
use super::ArrayDataRef;
2323
use super::*;
2424
use crate::array::equal_json::JsonEqual;
25+
use crate::buffer::{Buffer, MutableBuffer};
2526
use crate::error::Result;
2627
use crate::ffi;
2728

@@ -326,6 +327,170 @@ pub fn new_empty_array(data_type: &DataType) -> ArrayRef {
326327
let data = ArrayData::new_empty(data_type);
327328
make_array(Arc::new(data))
328329
}
330+
/// Creates a new array of `data_type` of length `length` filled entirely of `NULL` values
331+
pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
332+
// context: https://github.com/apache/arrow/pull/9469#discussion_r574761687
333+
match data_type {
334+
DataType::Null => Arc::new(NullArray::new(length)),
335+
DataType::Boolean => {
336+
let null_buf: Buffer = MutableBuffer::new_null(length).into();
337+
make_array(Arc::new(ArrayData::new(
338+
data_type.clone(),
339+
length,
340+
Some(length),
341+
Some(null_buf.clone()),
342+
0,
343+
vec![null_buf],
344+
vec![],
345+
)))
346+
}
347+
DataType::Int8 => new_null_sized_array::<Int8Type>(data_type, length),
348+
DataType::UInt8 => new_null_sized_array::<UInt8Type>(data_type, length),
349+
DataType::Int16 => new_null_sized_array::<Int16Type>(data_type, length),
350+
DataType::UInt16 => new_null_sized_array::<UInt16Type>(data_type, length),
351+
DataType::Float16 => unreachable!(),
352+
DataType::Int32 => new_null_sized_array::<Int32Type>(data_type, length),
353+
DataType::UInt32 => new_null_sized_array::<UInt32Type>(data_type, length),
354+
DataType::Float32 => new_null_sized_array::<Float32Type>(data_type, length),
355+
DataType::Date32 => new_null_sized_array::<Date32Type>(data_type, length),
356+
// expanding this into Date23{unit}Type results in needless branching
357+
DataType::Time32(_) => new_null_sized_array::<Int32Type>(data_type, length),
358+
DataType::Int64 => new_null_sized_array::<Int64Type>(data_type, length),
359+
DataType::UInt64 => new_null_sized_array::<UInt64Type>(data_type, length),
360+
DataType::Float64 => new_null_sized_array::<Float64Type>(data_type, length),
361+
DataType::Date64 => new_null_sized_array::<Date64Type>(data_type, length),
362+
// expanding this into Timestamp{unit}Type results in needless branching
363+
DataType::Timestamp(_, _) => new_null_sized_array::<Int64Type>(data_type, length),
364+
DataType::Time64(_) => new_null_sized_array::<Int64Type>(data_type, length),
365+
DataType::Duration(_) => new_null_sized_array::<Int64Type>(data_type, length),
366+
DataType::Interval(unit) => match unit {
367+
IntervalUnit::YearMonth => {
368+
new_null_sized_array::<IntervalYearMonthType>(data_type, length)
369+
}
370+
IntervalUnit::DayTime => {
371+
new_null_sized_array::<IntervalDayTimeType>(data_type, length)
372+
}
373+
},
374+
DataType::FixedSizeBinary(value_len) => make_array(Arc::new(ArrayData::new(
375+
data_type.clone(),
376+
length,
377+
Some(length),
378+
Some(MutableBuffer::new_null(length).into()),
379+
0,
380+
vec![Buffer::from(vec![0u8; *value_len as usize * length])],
381+
vec![],
382+
))),
383+
DataType::Binary | DataType::Utf8 => {
384+
new_null_binary_array::<i32>(data_type, length)
385+
}
386+
DataType::LargeBinary | DataType::LargeUtf8 => {
387+
new_null_binary_array::<i64>(data_type, length)
388+
}
389+
DataType::List(field) => {
390+
new_null_list_array::<i32>(data_type, field.data_type(), length)
391+
}
392+
DataType::LargeList(field) => {
393+
new_null_list_array::<i64>(data_type, field.data_type(), length)
394+
}
395+
DataType::FixedSizeList(field, value_len) => {
396+
make_array(Arc::new(ArrayData::new(
397+
data_type.clone(),
398+
length,
399+
Some(length),
400+
Some(MutableBuffer::new_null(length).into()),
401+
0,
402+
vec![],
403+
vec![
404+
new_null_array(field.data_type(), *value_len as usize * length)
405+
.data(),
406+
],
407+
)))
408+
}
409+
DataType::Struct(fields) => make_array(Arc::new(ArrayData::new(
410+
data_type.clone(),
411+
length,
412+
Some(length),
413+
Some(MutableBuffer::new_null(length).into()),
414+
0,
415+
vec![],
416+
fields
417+
.iter()
418+
.map(|field| Arc::new(ArrayData::new_empty(field.data_type())))
419+
.collect(),
420+
))),
421+
DataType::Union(_) => {
422+
unimplemented!("Creating null Union array not yet supported")
423+
}
424+
DataType::Dictionary(_, value) => {
425+
make_array(Arc::new(ArrayData::new(
426+
data_type.clone(),
427+
length,
428+
Some(length),
429+
Some(MutableBuffer::new_null(length).into()),
430+
0,
431+
vec![MutableBuffer::new(0).into()], // values are empty
432+
vec![new_empty_array(value.as_ref()).data()],
433+
)))
434+
}
435+
DataType::Decimal(_, _) => {
436+
unimplemented!("Creating null Decimal array not yet supported")
437+
}
438+
}
439+
}
440+
441+
#[inline]
442+
fn new_null_list_array<OffsetSize: OffsetSizeTrait>(
443+
data_type: &DataType,
444+
child_data_type: &DataType,
445+
length: usize,
446+
) -> ArrayRef {
447+
make_array(Arc::new(ArrayData::new(
448+
data_type.clone(),
449+
length,
450+
Some(length),
451+
Some(MutableBuffer::new_null(length).into()),
452+
0,
453+
vec![Buffer::from(
454+
vec![OffsetSize::zero(); length + 1].to_byte_slice(),
455+
)],
456+
vec![Arc::new(ArrayData::new_empty(child_data_type))],
457+
)))
458+
}
459+
460+
#[inline]
461+
fn new_null_binary_array<OffsetSize: OffsetSizeTrait>(
462+
data_type: &DataType,
463+
length: usize,
464+
) -> ArrayRef {
465+
make_array(Arc::new(ArrayData::new(
466+
data_type.clone(),
467+
length,
468+
Some(length),
469+
Some(MutableBuffer::new_null(length).into()),
470+
0,
471+
vec![
472+
Buffer::from(vec![OffsetSize::zero(); length + 1].to_byte_slice()),
473+
MutableBuffer::new(0).into(),
474+
],
475+
vec![],
476+
)))
477+
}
478+
479+
#[inline]
480+
fn new_null_sized_array<T: ArrowPrimitiveType>(
481+
data_type: &DataType,
482+
length: usize,
483+
) -> ArrayRef {
484+
make_array(Arc::new(ArrayData::new(
485+
data_type.clone(),
486+
length,
487+
Some(length),
488+
Some(MutableBuffer::new_null(length).into()),
489+
0,
490+
vec![Buffer::from(vec![0u8; length * T::get_byte_width()])],
491+
vec![],
492+
)))
493+
}
329494

330495
/// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface
331496
/// # Safety
@@ -409,4 +574,60 @@ mod tests {
409574
assert_eq!(a.len(), 0);
410575
assert_eq!(a.value_offsets()[0], 0i32);
411576
}
577+
578+
#[test]
fn test_null_boolean() {
    // A null Boolean array must downcast to `BooleanArray`, keep the
    // requested length and report every slot as null.
    let nulls = new_null_array(&DataType::Boolean, 9);
    let booleans = nulls.as_any().downcast_ref::<BooleanArray>().unwrap();
    assert_eq!(booleans.len(), 9);
    assert!((0..9).all(|idx| booleans.is_null(idx)));
}
587+
588+
#[test]
fn test_null_primitive() {
    // A null Int32 array must downcast to `Int32Array`, keep the requested
    // length and report every slot as null.
    let nulls = new_null_array(&DataType::Int32, 9);
    let ints = nulls.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(ints.len(), 9);
    assert!((0..9).all(|idx| ints.is_null(idx)));
}
597+
598+
#[test]
fn test_null_variable_sized() {
    // A null Utf8 array must downcast to `StringArray`; its final offset
    // stays at zero and every slot is null.
    let nulls = new_null_array(&DataType::Utf8, 9);
    let strings = nulls.as_any().downcast_ref::<StringArray>().unwrap();
    assert_eq!(strings.len(), 9);
    assert_eq!(strings.value_offsets()[9], 0i32);
    assert!((0..9).all(|idx| strings.is_null(idx)));
}
608+
609+
#[test]
fn test_null_list_primitive() {
    // A null List<Int32> array must downcast to `ListArray`; its final
    // offset stays at zero and every slot is null.
    let item_field = Field::new("item", DataType::Int32, true);
    let list_type = DataType::List(Box::new(item_field));
    let nulls = new_null_array(&list_type, 9);
    let lists = nulls.as_any().downcast_ref::<ListArray>().unwrap();
    assert_eq!(lists.len(), 9);
    assert_eq!(lists.value_offsets()[9], 0i32);
    assert!((0..9).all(|idx| lists.is_null(idx)));
}
621+
622+
#[test]
fn test_null_dictionary() {
    // Reference array: a dictionary collected from nine `None` strings.
    let values: Vec<Option<&str>> = vec![None; 9];
    let expected: DictionaryArray<Int8Type> = values.into_iter().collect();
    let expected: ArrayRef = Arc::new(expected);

    // The generated null array of the same type must compare equal to it.
    let null_array = new_null_array(expected.data_type(), 9);
    assert_eq!(&expected, &null_array);
}
412633
}

rust/arrow/src/array/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ pub use self::null::NullArray;
131131

132132
pub use self::array::make_array;
133133
pub use self::array::new_empty_array;
134+
pub use self::array::new_null_array;
134135

135136
pub type Int8Array = PrimitiveArray<Int8Type>;
136137
pub type Int16Array = PrimitiveArray<Int16Type>;

rust/arrow/src/array/null.rs

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,6 @@ impl NullArray {
5252
let array_data = ArrayData::builder(DataType::Null).len(length).build();
5353
NullArray::from(array_data)
5454
}
55-
56-
/// Create a new null array of the specified length and type
57-
pub fn new_with_type(length: usize, data_type: DataType) -> Self {
58-
let array_data = ArrayData::builder(data_type).len(length).build();
59-
NullArray::from(array_data)
60-
}
6155
}
6256

6357
impl Array for NullArray {
@@ -104,6 +98,11 @@ impl Array for NullArray {
10498

10599
impl From<ArrayDataRef> for NullArray {
106100
fn from(data: ArrayDataRef) -> Self {
101+
assert_eq!(
102+
data.data_type(),
103+
&DataType::Null,
104+
"NullArray data type should be Null"
105+
);
107106
assert_eq!(
108107
data.buffers().len(),
109108
0,
@@ -153,15 +152,6 @@ mod tests {
153152
assert_eq!(array2.offset(), 8);
154153
}
155154

156-
#[test]
157-
fn test_null_array_new_with_type() {
158-
let length = 10;
159-
let data_type = DataType::Int8;
160-
let array = NullArray::new_with_type(length, data_type.clone());
161-
assert_eq!(array.len(), length);
162-
assert_eq!(array.data_type(), &data_type);
163-
}
164-
165155
#[test]
166156
fn test_debug_null_array() {
167157
let array = NullArray::new(1024 * 1024);

rust/datafusion/src/physical_plan/parquet.rs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,11 @@ use crate::{
3939
optimizer::utils,
4040
prelude::ExecutionConfig,
4141
};
42-
use arrow::error::{ArrowError, Result as ArrowResult};
4342
use arrow::record_batch::RecordBatch;
43+
use arrow::{
44+
array::new_null_array,
45+
error::{ArrowError, Result as ArrowResult},
46+
};
4447
use arrow::{
4548
array::{make_array, ArrayData, ArrayRef, BooleanArray, BooleanBufferBuilder},
4649
buffer::MutableBuffer,
@@ -646,13 +649,6 @@ enum StatisticsType {
646649
Max,
647650
}
648651

649-
fn build_null_array(data_type: &DataType, length: usize) -> ArrayRef {
650-
Arc::new(arrow::array::NullArray::new_with_type(
651-
length,
652-
data_type.clone(),
653-
))
654-
}
655-
656652
fn build_statistics_array(
657653
statistics: &[Option<&ParquetStatistics>],
658654
statistics_type: StatisticsType,
@@ -665,7 +661,7 @@ fn build_statistics_array(
665661
statistics
666662
} else {
667663
// no row group has statistics defined
668-
return build_null_array(data_type, statistics_count);
664+
return new_null_array(data_type, statistics_count);
669665
};
670666

671667
let (data_size, arrow_type) = match first_group_stats {
@@ -678,7 +674,7 @@ fn build_statistics_array(
678674
}
679675
_ => {
680676
// type of statistics not supported
681-
return build_null_array(data_type, statistics_count);
677+
return new_null_array(data_type, statistics_count);
682678
}
683679
};
684680

@@ -735,7 +731,7 @@ fn build_statistics_array(
735731
}
736732
// cast statistics array to required data type
737733
arrow::compute::cast(&statistics_array, data_type)
738-
.unwrap_or_else(|_| build_null_array(data_type, statistics_count))
734+
.unwrap_or_else(|_| new_null_array(data_type, statistics_count))
739735
}
740736

741737
#[async_trait]

0 commit comments

Comments
 (0)