Skip to content

Commit 0c4c153

Browse files
julienledemkou
authored andcommitted
ARROW-372: json vector serialization format
This format serializes the vectors in JSON. It is not a generic JSON to arrow converter but rather a human readable version of the vectors to help with tests. Author: Julien Le Dem <julien@dremio.com> Closes apache#201 from julienledem/json_file and squashes the following commits: 2e63bec [Julien Le Dem] add missing license 5588729 [Julien Le Dem] refactor tests, improve format 5ef5356 [Julien Le Dem] improve format to allow empty column name 746430c [Julien Le Dem] ARROW-372: Create JSON arrow file format for integration tests
1 parent 137e864 commit 0c4c153

5 files changed

Lines changed: 741 additions & 189 deletions

File tree

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
/*******************************************************************************
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
******************************************************************************/
18+
package org.apache.arrow.vector.file.json;
19+
20+
import static com.fasterxml.jackson.core.JsonToken.END_ARRAY;
21+
import static com.fasterxml.jackson.core.JsonToken.END_OBJECT;
22+
import static com.fasterxml.jackson.core.JsonToken.START_ARRAY;
23+
import static com.fasterxml.jackson.core.JsonToken.START_OBJECT;
24+
import static java.nio.charset.StandardCharsets.UTF_8;
25+
26+
import java.io.File;
27+
import java.io.IOException;
28+
import java.util.List;
29+
30+
import org.apache.arrow.memory.BufferAllocator;
31+
import org.apache.arrow.vector.BigIntVector;
32+
import org.apache.arrow.vector.BitVector;
33+
import org.apache.arrow.vector.BufferBacked;
34+
import org.apache.arrow.vector.FieldVector;
35+
import org.apache.arrow.vector.Float4Vector;
36+
import org.apache.arrow.vector.Float8Vector;
37+
import org.apache.arrow.vector.IntVector;
38+
import org.apache.arrow.vector.SmallIntVector;
39+
import org.apache.arrow.vector.TimeStampVector;
40+
import org.apache.arrow.vector.TinyIntVector;
41+
import org.apache.arrow.vector.UInt1Vector;
42+
import org.apache.arrow.vector.UInt2Vector;
43+
import org.apache.arrow.vector.UInt4Vector;
44+
import org.apache.arrow.vector.UInt8Vector;
45+
import org.apache.arrow.vector.ValueVector;
46+
import org.apache.arrow.vector.ValueVector.Mutator;
47+
import org.apache.arrow.vector.VarCharVector;
48+
import org.apache.arrow.vector.VectorSchemaRoot;
49+
import org.apache.arrow.vector.schema.ArrowVectorType;
50+
import org.apache.arrow.vector.types.pojo.Field;
51+
import org.apache.arrow.vector.types.pojo.Schema;
52+
53+
import com.fasterxml.jackson.core.JsonParseException;
54+
import com.fasterxml.jackson.core.JsonParser;
55+
import com.fasterxml.jackson.core.JsonToken;
56+
import com.fasterxml.jackson.databind.MappingJsonFactory;
57+
import com.google.common.base.Objects;
58+
59+
public class JsonFileReader {
60+
private final File inputFile;
61+
private final JsonParser parser;
62+
private final BufferAllocator allocator;
63+
private Schema schema;
64+
65+
public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException {
66+
super();
67+
this.inputFile = inputFile;
68+
this.allocator = allocator;
69+
MappingJsonFactory jsonFactory = new MappingJsonFactory();
70+
this.parser = jsonFactory.createParser(inputFile);
71+
}
72+
73+
public Schema start() throws JsonParseException, IOException {
74+
readToken(START_OBJECT);
75+
{
76+
this.schema = readNextField("schema", Schema.class);
77+
nextFieldIs("batches");
78+
readToken(START_ARRAY);
79+
return schema;
80+
}
81+
}
82+
83+
public VectorSchemaRoot read() throws IOException {
84+
VectorSchemaRoot recordBatch = new VectorSchemaRoot(schema, allocator);
85+
readToken(START_OBJECT);
86+
{
87+
int count = readNextField("count", Integer.class);
88+
recordBatch.setRowCount(count);
89+
nextFieldIs("columns");
90+
readToken(START_ARRAY);
91+
{
92+
for (Field field : schema.getFields()) {
93+
FieldVector vector = recordBatch.getVector(field.getName());
94+
readVector(field, vector);
95+
}
96+
}
97+
readToken(END_ARRAY);
98+
}
99+
readToken(END_OBJECT);
100+
return recordBatch;
101+
}
102+
103+
private void readVector(Field field, FieldVector vector) throws JsonParseException, IOException {
104+
List<ArrowVectorType> vectorTypes = field.getTypeLayout().getVectorTypes();
105+
List<BufferBacked> fieldInnerVectors = vector.getFieldInnerVectors();
106+
if (vectorTypes.size() != fieldInnerVectors.size()) {
107+
throw new IllegalArgumentException("vector types and inner vectors are not the same size: " + vectorTypes.size() + " != " + fieldInnerVectors.size());
108+
}
109+
readToken(START_OBJECT);
110+
{
111+
String name = readNextField("name", String.class);
112+
if (!Objects.equal(field.getName(), name)) {
113+
throw new IllegalArgumentException("Expected field " + field.getName() + " but got " + name);
114+
}
115+
int count = readNextField("count", Integer.class);
116+
for (int v = 0; v < vectorTypes.size(); v++) {
117+
ArrowVectorType vectorType = vectorTypes.get(v);
118+
BufferBacked innerVector = fieldInnerVectors.get(v);
119+
nextFieldIs(vectorType.getName());
120+
readToken(START_ARRAY);
121+
ValueVector valueVector = (ValueVector)innerVector;
122+
valueVector.allocateNew();
123+
Mutator mutator = valueVector.getMutator();
124+
mutator.setValueCount(count);
125+
for (int i = 0; i < count; i++) {
126+
parser.nextToken();
127+
setValueFromParser(valueVector, i);
128+
}
129+
readToken(END_ARRAY);
130+
}
131+
// if children
132+
List<Field> fields = field.getChildren();
133+
if (!fields.isEmpty()) {
134+
List<FieldVector> vectorChildren = vector.getChildrenFromFields();
135+
if (fields.size() != vectorChildren.size()) {
136+
throw new IllegalArgumentException("fields and children are not the same size: " + fields.size() + " != " + vectorChildren.size());
137+
}
138+
nextFieldIs("children");
139+
readToken(START_ARRAY);
140+
for (int i = 0; i < fields.size(); i++) {
141+
Field childField = fields.get(i);
142+
FieldVector childVector = vectorChildren.get(i);
143+
readVector(childField, childVector);
144+
}
145+
readToken(END_ARRAY);
146+
}
147+
}
148+
readToken(END_OBJECT);
149+
}
150+
151+
private void setValueFromParser(ValueVector valueVector, int i) throws IOException {
152+
switch (valueVector.getMinorType()) {
153+
case BIT:
154+
((BitVector)valueVector).getMutator().set(i, parser.readValueAs(Boolean.class) ? 1 : 0);
155+
break;
156+
case TINYINT:
157+
((TinyIntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
158+
break;
159+
case SMALLINT:
160+
((SmallIntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
161+
break;
162+
case INT:
163+
((IntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
164+
break;
165+
case BIGINT:
166+
((BigIntVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class));
167+
break;
168+
case UINT1:
169+
((UInt1Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
170+
break;
171+
case UINT2:
172+
((UInt2Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
173+
break;
174+
case UINT4:
175+
((UInt4Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class));
176+
break;
177+
case UINT8:
178+
((UInt8Vector)valueVector).getMutator().set(i, parser.readValueAs(Long.class));
179+
break;
180+
case FLOAT4:
181+
((Float4Vector)valueVector).getMutator().set(i, parser.readValueAs(Float.class));
182+
break;
183+
case FLOAT8:
184+
((Float8Vector)valueVector).getMutator().set(i, parser.readValueAs(Double.class));
185+
break;
186+
case VARCHAR:
187+
((VarCharVector)valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8));
188+
break;
189+
case TIMESTAMP:
190+
((TimeStampVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class));
191+
break;
192+
default:
193+
throw new UnsupportedOperationException("minor type: " + valueVector.getMinorType());
194+
}
195+
}
196+
197+
public void close() throws IOException {
198+
readToken(END_ARRAY);
199+
readToken(END_OBJECT);
200+
parser.close();
201+
}
202+
203+
private <T> T readNextField(String expectedFieldName, Class<T> c) throws IOException, JsonParseException {
204+
nextFieldIs(expectedFieldName);
205+
parser.nextToken();
206+
return parser.readValueAs(c);
207+
}
208+
209+
private void nextFieldIs(String expectedFieldName) throws IOException, JsonParseException {
210+
String name = parser.nextFieldName();
211+
if (name == null || !name.equals(expectedFieldName)) {
212+
throw new IllegalStateException("Expected " + expectedFieldName + " but got " + name);
213+
}
214+
}
215+
216+
private void readToken(JsonToken expected) throws JsonParseException, IOException {
217+
JsonToken t = parser.nextToken();
218+
if (t != expected) {
219+
throw new IllegalStateException("Expected " + expected + " but got " + t);
220+
}
221+
}
222+
223+
}
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/*******************************************************************************
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
******************************************************************************/
18+
package org.apache.arrow.vector.file.json;
19+
20+
import java.io.File;
21+
import java.io.IOException;
22+
import java.util.List;
23+
24+
import org.apache.arrow.vector.BitVector;
25+
import org.apache.arrow.vector.BufferBacked;
26+
import org.apache.arrow.vector.FieldVector;
27+
import org.apache.arrow.vector.TimeStampVector;
28+
import org.apache.arrow.vector.ValueVector;
29+
import org.apache.arrow.vector.ValueVector.Accessor;
30+
import org.apache.arrow.vector.VectorSchemaRoot;
31+
import org.apache.arrow.vector.schema.ArrowVectorType;
32+
import org.apache.arrow.vector.types.pojo.Field;
33+
import org.apache.arrow.vector.types.pojo.Schema;
34+
35+
import com.fasterxml.jackson.core.JsonEncoding;
36+
import com.fasterxml.jackson.core.JsonGenerator;
37+
import com.fasterxml.jackson.core.util.DefaultPrettyPrinter;
38+
import com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter;
39+
import com.fasterxml.jackson.databind.MappingJsonFactory;
40+
41+
public class JsonFileWriter {
42+
43+
public static final class JSONWriteConfig {
44+
private final boolean pretty;
45+
private JSONWriteConfig(boolean pretty) {
46+
this.pretty = pretty;
47+
}
48+
private JSONWriteConfig() {
49+
this.pretty = false;
50+
}
51+
public JSONWriteConfig pretty(boolean pretty) {
52+
return new JSONWriteConfig(pretty);
53+
}
54+
}
55+
56+
public static JSONWriteConfig config() {
57+
return new JSONWriteConfig();
58+
}
59+
60+
private final JsonGenerator generator;
61+
private Schema schema;
62+
63+
public JsonFileWriter(File outputFile) throws IOException {
64+
this(outputFile, config());
65+
}
66+
67+
public JsonFileWriter(File outputFile, JSONWriteConfig config) throws IOException {
68+
MappingJsonFactory jsonFactory = new MappingJsonFactory();
69+
this.generator = jsonFactory.createGenerator(outputFile, JsonEncoding.UTF8);
70+
if (config.pretty) {
71+
DefaultPrettyPrinter prettyPrinter = new DefaultPrettyPrinter();
72+
prettyPrinter.indentArraysWith(NopIndenter.instance);
73+
this.generator.setPrettyPrinter(prettyPrinter);
74+
}
75+
}
76+
77+
public void start(Schema schema) throws IOException {
78+
this.schema = schema;
79+
generator.writeStartObject();
80+
generator.writeObjectField("schema", schema);
81+
generator.writeArrayFieldStart("batches");
82+
}
83+
84+
public void write(VectorSchemaRoot recordBatch) throws IOException {
85+
if (!recordBatch.getSchema().equals(schema)) {
86+
throw new IllegalArgumentException("record batches must have the same schema: " + schema);
87+
}
88+
generator.writeStartObject();
89+
{
90+
generator.writeObjectField("count", recordBatch.getRowCount());
91+
generator.writeArrayFieldStart("columns");
92+
for (Field field : schema.getFields()) {
93+
FieldVector vector = recordBatch.getVector(field.getName());
94+
writeVector(field, vector);
95+
}
96+
generator.writeEndArray();
97+
}
98+
generator.writeEndObject();
99+
}
100+
101+
private void writeVector(Field field, FieldVector vector) throws IOException {
102+
List<ArrowVectorType> vectorTypes = field.getTypeLayout().getVectorTypes();
103+
List<BufferBacked> fieldInnerVectors = vector.getFieldInnerVectors();
104+
if (vectorTypes.size() != fieldInnerVectors.size()) {
105+
throw new IllegalArgumentException("vector types and inner vectors are not the same size: " + vectorTypes.size() + " != " + fieldInnerVectors.size());
106+
}
107+
generator.writeStartObject();
108+
{
109+
generator.writeObjectField("name", field.getName());
110+
int valueCount = vector.getAccessor().getValueCount();
111+
generator.writeObjectField("count", valueCount);
112+
for (int v = 0; v < vectorTypes.size(); v++) {
113+
ArrowVectorType vectorType = vectorTypes.get(v);
114+
BufferBacked innerVector = fieldInnerVectors.get(v);
115+
generator.writeArrayFieldStart(vectorType.getName());
116+
ValueVector valueVector = (ValueVector)innerVector;
117+
for (int i = 0; i < valueCount; i++) {
118+
writeValueToGenerator(valueVector, i);
119+
}
120+
generator.writeEndArray();
121+
}
122+
List<Field> fields = field.getChildren();
123+
List<FieldVector> children = vector.getChildrenFromFields();
124+
if (fields.size() != children.size()) {
125+
throw new IllegalArgumentException("fields and children are not the same size: " + fields.size() + " != " + children.size());
126+
}
127+
if (fields.size() > 0) {
128+
generator.writeArrayFieldStart("children");
129+
for (int i = 0; i < fields.size(); i++) {
130+
Field childField = fields.get(i);
131+
FieldVector childVector = children.get(i);
132+
writeVector(childField, childVector);
133+
}
134+
generator.writeEndArray();
135+
}
136+
}
137+
generator.writeEndObject();
138+
}
139+
140+
private void writeValueToGenerator(ValueVector valueVector, int i) throws IOException {
141+
switch (valueVector.getMinorType()) {
142+
case TIMESTAMP:
143+
generator.writeNumber(((TimeStampVector)valueVector).getAccessor().get(i));
144+
break;
145+
case BIT:
146+
generator.writeNumber(((BitVector)valueVector).getAccessor().get(i));
147+
break;
148+
default:
149+
// TODO: each type
150+
Accessor accessor = valueVector.getAccessor();
151+
Object value = accessor.getObject(i);
152+
if (value instanceof Number || value instanceof Boolean) {
153+
generator.writeObject(value);
154+
} else {
155+
generator.writeObject(value.toString());
156+
}
157+
break;
158+
}
159+
}
160+
161+
public void close() throws IOException {
162+
generator.writeEndArray();
163+
generator.writeEndObject();
164+
generator.close();
165+
}
166+
167+
}

0 commit comments

Comments
 (0)