Skip to content

Commit e355d72

Browse files
benibusdgreiss
authored and committed
apacheGH-37582: [Go][Parquet] Implement Float16 logical type (apache#37599)
### Rationale for this change

There is an active proposal for a Float16 logical type in Parquet (apache/parquet-format#184) with C++/Python implementations in progress (apache#36073), so we should add one for Go as well.

### What changes are included in this PR?

- [x] Adds `LogicalType` definitions and methods for `Float16`
- [x] Adds support for `Float16` column statistics and comparators
- [x] Adds support for interchange between Parquet and Arrow's half-precision float

### Are these changes tested?

Yes

### Are there any user-facing changes?

Yes

* Closes: apache#37582

Authored-by: benibus <bpharks@gmx.com>
Signed-off-by: Matt Topol <zotthewizard@gmail.com>
1 parent e46408a commit e355d72

25 files changed

Lines changed: 1183 additions & 75 deletions

go/arrow/float16/float16.go

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package float16
1818

1919
import (
20+
"encoding/binary"
2021
"math"
2122
"strconv"
2223
)
@@ -29,6 +30,11 @@ type Num struct {
2930
bits uint16
3031
}
3132

33+
// Boundary values of the half-precision type.
var (
34+
// MaxNum has the bit pattern 0x7bff: sign bit clear, exponent field
// 0b11110 and every fraction bit set, which is the largest finite
// half-precision value (65504).
MaxNum = Num{bits: 0b0111101111111111}
35+
// MinNum is MaxNum passed through Negate.
// NOTE(review): Negate is defined outside this hunk; presumably it flips
// the sign bit, making this the most negative finite value — confirm.
MinNum = MaxNum.Negate()
36+
)
37+
3238
// New creates a new half-precision floating point value from the provided
3339
// float32 value.
3440
func New(f float32) Num {
@@ -86,6 +92,11 @@ func (n Num) Div(rhs Num) Num {
8692
return New(n.Float32() / rhs.Float32())
8793
}
8894

95+
// Equal returns true if the value represented by n is == other.
//
// Both operands are converted with Float32 and compared with Go's ==
// operator, so this is a numeric comparison rather than a bitwise one.
// NOTE(review): assuming Float32 maps half-precision NaN to a float32 NaN
// and signed zeros to signed zeros, NaN never compares equal (not even to
// itself) and +0 equals -0 — confirm against Float32.
96+
func (n Num) Equal(other Num) bool {
97+
return n.Float32() == other.Float32()
98+
}
99+
89100
// Greater returns true if the value represented by n is > other
90101
func (n Num) Greater(other Num) bool {
91102
return n.Float32() > other.Float32()
@@ -152,14 +163,39 @@ func (n Num) Abs() Num {
152163
}
153164

154165
// Sign classifies n as zero, negative or positive.
//
// As of this change it returns 0 when IsZero reports true (which covers both
// +0 and -0), -1 when Signbit reports true, and 1 otherwise. NaN inputs are
// not special-cased: they are non-zero by IsZero's definition, so they are
// classified purely by their sign bit.
//
// The lines flagged as removed in the diff gutter are the previous
// implementation, which compared the Float32 conversion against 0.
func (n Num) Sign() int {
155-
f := n.Float32()
156-
if f > 0 {
157-
return 1
158-
} else if f == 0 {
166+
if n.IsZero() {
159167
return 0
168+
} else if n.Signbit() {
169+
return -1
160170
}
161-
return -1
171+
return 1
162172
}
163173

174+
// Signbit reports whether the most significant bit (0x8000) of n's 16-bit
// pattern is set. It looks only at that bit, so it is true for negative
// values and also for negative zero.
func (n Num) Signbit() bool {
	const signMask uint16 = 0x8000
	return n.bits&signMask == signMask
}
175+
176+
// IsNaN reports whether n is a NaN. With the sign bit cleared, the remaining
// 15 bits must be strictly greater than 0x7c00 (all exponent bits set, zero
// fraction), i.e. the exponent is all ones and the fraction is non-zero.
func (n Num) IsNaN() bool {
	const (
		signMask uint16 = 0x8000
		infBits  uint16 = 0x7c00
	)
	magnitude := n.bits &^ signMask
	return magnitude > infBits
}
177+
178+
// IsZero reports whether n is a zero of either sign: every bit other than
// the sign bit must be clear, which leaves exactly the patterns 0x0000 (+0)
// and 0x8000 (-0).
func (n Num) IsZero() bool {
	switch n.bits {
	case 0x0000, 0x8000:
		return true
	default:
		return false
	}
}
179+
164180
// Uint16 returns the raw 16-bit pattern stored in n, unchanged.
func (n Num) Uint16() uint16 {
	raw := n.bits
	return raw
}
165181
// String renders n as decimal text. The value is converted with Float32,
// widened to float64, and formatted by strconv.FormatFloat using the 'g'
// format, precision -1 and bit size 32, i.e. the shortest text that
// round-trips as a float32.
func (n Num) String() string {
	const (
		format    = 'g'
		precision = -1
		bitSize   = 32
	)
	widened := float64(n.Float32())
	return strconv.FormatFloat(widened, format, precision, bitSize)
}
182+
183+
// Inf returns the Num with bit pattern 0x7c00: sign bit clear, all five
// exponent bits set and a zero fraction, which is positive infinity in the
// half-precision format.
func Inf() Num {
	const posInfBits uint16 = 0x7c00
	return Num{bits: posInfBits}
}
184+
185+
// NaN returns the Num with bit pattern 0x7fff: sign bit clear and every
// exponent and fraction bit set, which is one of the half-precision NaN
// encodings.
func NaN() Num {
	const nanBits uint16 = 0x7fff
	return Num{bits: nanBits}
}
186+
187+
// FromBits wraps src as a Num without any conversion: src is stored verbatim
// as the value's 16-bit pattern.
func FromBits(src uint16) Num {
	var n Num
	n.bits = src
	return n
}
188+
189+
// FromLEBytes builds a Num whose bit pattern is the little-endian uint16
// decoded from the first two bytes of src. No length check is done here:
// src must hold at least two bytes, otherwise the underlying
// binary.LittleEndian.Uint16 call panics.
func FromLEBytes(src []byte) Num {
190+
return Num{bits: binary.LittleEndian.Uint16(src)}
191+
}
192+
193+
// PutLEBytes writes f's 16-bit pattern into the first two bytes of dst in
// little-endian order. No length check is done here: dst must hold at least
// two bytes, otherwise the underlying binary.LittleEndian.PutUint16 call
// panics.
func (f Num) PutLEBytes(dst []byte) {
194+
binary.LittleEndian.PutUint16(dst, f.bits)
195+
}
196+
197+
// ToLEBytes returns a newly allocated 2-byte slice holding f's bit pattern
// in little-endian order, as written by PutLEBytes.
func (f Num) ToLEBytes() []byte {
198+
dst := make([]byte, 2)
199+
f.PutLEBytes(dst)
200+
return dst
201+
}

go/arrow/float16/float16_test.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ func TestSign(t *testing.T) {
238238
}{
239239
{Num{bits: 0x4580}, 1}, // 5.5
240240
{Num{bits: 0x0000}, 0}, // 0
241+
{Num{bits: 0x8000}, 0}, // -0
241242
{Num{bits: 0xC580}, -1}, // -5.5
242243
} {
243244
t.Run("sign", func(t *testing.T) {
@@ -248,3 +249,45 @@ func TestSign(t *testing.T) {
248249
})
249250
}
250251
}
252+
253+
// TestSignbit checks Signbit against a table of raw bit patterns: positive
// and negative 5.5 plus both zeros. The -0 case (0x8000) pins that Signbit
// follows the sign bit rather than the numeric value.
func TestSignbit(t *testing.T) {
254+
for _, tc := range []struct {
255+
n Num
256+
want bool
257+
}{
258+
{Num{bits: 0x4580}, false}, // 5.5
259+
{Num{bits: 0x0000}, false}, // 0
260+
{Num{bits: 0x8000}, true}, // -0
261+
{Num{bits: 0xC580}, true}, // -5.5
262+
} {
263+
t.Run("signbit", func(t *testing.T) {
264+
n := tc.n.Signbit()
265+
if got, want := n, tc.want; got != want {
266+
t.Fatalf("invalid value. got=%v, want=%v", got, want)
267+
}
268+
})
269+
}
270+
}
271+
272+
// TestIsNaN checks IsNaN against the NaN() and Inf() constructors, their
// negations, and several hand-written NaN bit patterns of both signs
// (0x7c01/0xfc01 and 0x7e00/0xfe00). Both infinities must report false.
func TestIsNaN(t *testing.T) {
273+
for _, tc := range []struct {
274+
n Num
275+
want bool
276+
}{
277+
{NaN(), true},
278+
{NaN().Negate(), true},
279+
{Inf(), false},
280+
{Inf().Negate(), false},
281+
{Num{bits: 0x7c01}, true}, // nan
282+
{Num{bits: 0xfc01}, true}, // -nan
283+
{Num{bits: 0x7e00}, true}, // nan
284+
{Num{bits: 0xfe00}, true}, // -nan
285+
} {
286+
t.Run("isnan", func(t *testing.T) {
287+
n := tc.n.IsNaN()
288+
if got, want := n, tc.want; got != want {
289+
t.Fatalf("invalid value. got=%v, want=%v", got, want)
290+
}
291+
})
292+
}
293+
}

go/parquet/file/column_writer_types.gen.go

Lines changed: 11 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/parquet/file/column_writer_types.gen.go.tmpl

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ package file
1818

1919
import (
2020
"fmt"
21-
21+
2222
"github.com/apache/arrow/go/v15/parquet"
2323
"github.com/apache/arrow/go/v15/parquet/metadata"
2424
"github.com/apache/arrow/go/v15/parquet/internal/encoding"
@@ -83,7 +83,7 @@ func (w *{{.Name}}ColumnChunkWriter) WriteBatch(values []{{.name}}, defLevels, r
8383
// writes a large number of values, the DataPage size can be much above the limit.
8484
// The purpose of this chunking is to bound this. Even if a user writes large number
8585
// of values, the chunking will ensure the AddDataPage() is called at a reasonable
86-
// pagesize limit
86+
// pagesize limit
8787
var n int64
8888
switch {
8989
case defLevels != nil:
@@ -107,7 +107,7 @@ func (w *{{.Name}}ColumnChunkWriter) WriteBatch(values []{{.name}}, defLevels, r
107107
valueOffset += toWrite
108108
w.checkDictionarySizeLimit()
109109
})
110-
return
110+
return
111111
}
112112

113113
// WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the
@@ -132,7 +132,7 @@ func (w *{{.Name}}ColumnChunkWriter) WriteBatchSpaced(values []{{.name}}, defLev
132132
length = len(values)
133133
}
134134
doBatches(int64(length), w.props.WriteBatchSize(), func(offset, batch int64) {
135-
var vals []{{.name}}
135+
var vals []{{.name}}
136136
info := w.maybeCalculateValidityBits(levelSliceOrNil(defLevels, offset, batch), batch)
137137

138138
w.writeLevelsSpaced(batch, levelSliceOrNil(defLevels, offset, batch), levelSliceOrNil(repLevels, offset, batch))
@@ -165,7 +165,7 @@ func (w *{{.Name}}ColumnChunkWriter) WriteDictIndices(indices arrow.Array, defLe
165165
}
166166
}
167167
}()
168-
168+
169169
valueOffset := int64(0)
170170
length := len(defLevels)
171171
if defLevels == nil {
@@ -193,14 +193,22 @@ func (w *{{.Name}}ColumnChunkWriter) WriteDictIndices(indices arrow.Array, defLe
193193

194194
valueOffset += info.numSpaced()
195195
})
196-
196+
197197
return
198198
}
199199

200200
// writeValues passes values to the current encoder's Put and then, if page
// statistics are being collected (pageStatistics is non-nil), updates them
// with the same values and numNulls.
//
// The statistics branch is chosen at template-expansion time. Every type
// except FixedLenByteArray updates its own statistics type directly. The
// FixedLenByteArray expansion adds a runtime check: when the column's logical
// type equals Float16LogicalType the update goes to Float16Statistics,
// otherwise to the FixedLenByteArray statistics type.
func (w *{{.Name}}ColumnChunkWriter) writeValues(values []{{.name}}, numNulls int64) {
201201
w.currentEncoder.(encoding.{{.Name}}Encoder).Put(values)
202202
if w.pageStatistics != nil {
203+
{{- if ne .Name "FixedLenByteArray"}}
203204
w.pageStatistics.(*metadata.{{.Name}}Statistics).Update(values, numNulls)
205+
{{- else}}
206+
if w.Descr().LogicalType().Equals(schema.Float16LogicalType{}) {
207+
w.pageStatistics.(*metadata.Float16Statistics).Update(values, numNulls)
208+
} else {
209+
w.pageStatistics.(*metadata.{{.Name}}Statistics).Update(values, numNulls)
210+
}
211+
{{- end}}
204212
}
205213
}
206214

@@ -212,7 +220,15 @@ func (w *{{.Name}}ColumnChunkWriter) writeValuesSpaced(spacedValues []{{.name}},
212220
}
213221
if w.pageStatistics != nil {
214222
nulls := numValues - numRead
223+
{{- if ne .Name "FixedLenByteArray"}}
215224
w.pageStatistics.(*metadata.{{.Name}}Statistics).UpdateSpaced(spacedValues, validBits, validBitsOffset, nulls)
225+
{{- else}}
226+
if w.Descr().LogicalType().Equals(schema.Float16LogicalType{}) {
227+
w.pageStatistics.(*metadata.Float16Statistics).UpdateSpaced(spacedValues, validBits, validBitsOffset, nulls)
228+
} else {
229+
w.pageStatistics.(*metadata.{{.Name}}Statistics).UpdateSpaced(spacedValues, validBits, validBitsOffset, nulls)
230+
}
231+
{{- end}}
216232
}
217233
}
218234

go/parquet/internal/gen-go/parquet/GoUnusedProtection__.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/parquet/internal/gen-go/parquet/parquet-consts.go

Lines changed: 8 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)