diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java index 4e166190d3..86080fa25e 100644 --- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java +++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantBuilder.java @@ -497,7 +497,6 @@ public void endObject() { int numFields = fields.size(); Collections.sort(fields); int maxId = numFields == 0 ? 0 : fields.get(0).id; - int dataSize = numFields == 0 ? 0 : fields.get(0).valueSize; int distinctPos = 0; // Maintain a list of distinct keys in-place. @@ -512,7 +511,6 @@ public void endObject() { // Found a distinct key. Add the field to the list. distinctPos++; fields.set(distinctPos, fields.get(i)); - dataSize += fields.get(i).valueSize; } } @@ -522,6 +520,13 @@ public void endObject() { fields.subList(numFields, fields.size()).clear(); } + // Compute the data size from the retained fields. This must happen after deduplication, since a + // duplicate key keeps the last-written value, whose size may differ from the first occurrence. + int dataSize = 0; + for (int i = 0; i < numFields; ++i) { + dataSize += fields.get(i).valueSize; + } + boolean largeSize = numFields > VariantUtil.U8_MAX; int sizeBytes = largeSize ? VariantUtil.U32_SIZE : 1; int idSize = getMinIntegerSize(maxId); diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObjectBuilder.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObjectBuilder.java index d657f354b2..d3246e4e29 100644 --- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObjectBuilder.java +++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObjectBuilder.java @@ -277,6 +277,72 @@ public void testDuplicateKeys() { Assert.assertEquals(1, v.getFieldByKey("duplicate").getLong()); } + @Test + public void testDuplicateKeysKeptValueLarger() { + // The retained (last-written) value is larger than the first occurrence. The data size must be + // computed from the retained value, otherwise the encoded object is truncated/corrupt. + VariantBuilder b = new VariantBuilder(); + VariantObjectBuilder objBuilder = b.startObject(); + objBuilder.appendKey("duplicate"); + objBuilder.appendInt(1); // 5 bytes + objBuilder.appendKey("duplicate"); + objBuilder.appendString("hello"); // 6 bytes + b.endObject(); + VariantTestUtil.testVariant(b.build(), v -> { + VariantTestUtil.checkType(v, VariantUtil.OBJECT, Variant.Type.OBJECT); + Assert.assertEquals(1, v.numObjectElements()); + Variant variant = v.getFieldByKey("duplicate"); + VariantTestUtil.checkType(variant, VariantUtil.SHORT_STR, Variant.Type.STRING); + Assert.assertEquals("hello", variant.getString()); + }); + } + + @Test + public void testDuplicateKeysKeptValueSmaller() { + // The retained (last-written) value is smaller than the first occurrence. The data size must be + // computed from the retained value, otherwise the encoded object reserves stale trailing bytes. + VariantBuilder b = new VariantBuilder(); + VariantObjectBuilder objBuilder = b.startObject(); + objBuilder.appendKey("duplicate"); + objBuilder.appendString("hello"); // 6 bytes + objBuilder.appendKey("duplicate"); + objBuilder.appendInt(1); // 5 bytes + b.endObject(); + VariantTestUtil.testVariant(b.build(), v -> { + VariantTestUtil.checkType(v, VariantUtil.OBJECT, Variant.Type.OBJECT); + Assert.assertEquals(1, v.numObjectElements()); + Variant variant = v.getFieldByKey("duplicate"); + VariantTestUtil.checkType(variant, VariantUtil.PRIMITIVE, Variant.Type.INT); + Assert.assertEquals(1, variant.getInt()); + }); + } + + @Test + public void testDuplicateKeysDifferentSizesAcrossMultipleKeys() { + // Exercises deduplication across several keys where the retained values differ in size from the + // first occurrence, ensuring offsets remain consistent for every field. + VariantBuilder b = new VariantBuilder(); + VariantObjectBuilder objBuilder = b.startObject(); + objBuilder.appendKey("a"); + objBuilder.appendInt(1); // 5 bytes + objBuilder.appendKey("b"); + objBuilder.appendBoolean(true); // 1 byte + objBuilder.appendKey("a"); + objBuilder.appendString("a-final"); // larger, retained for "a" + objBuilder.appendKey("b"); + objBuilder.appendLong(123456789L); // 9 bytes, retained for "b" + objBuilder.appendKey("c"); + objBuilder.appendString("c-only"); + b.endObject(); + VariantTestUtil.testVariant(b.build(), v -> { + VariantTestUtil.checkType(v, VariantUtil.OBJECT, Variant.Type.OBJECT); + Assert.assertEquals(3, v.numObjectElements()); + Assert.assertEquals("a-final", v.getFieldByKey("a").getString()); + Assert.assertEquals(123456789L, v.getFieldByKey("b").getLong()); + Assert.assertEquals("c-only", v.getFieldByKey("c").getString()); + }); + } + @Test public void testSortingKeys() { VariantBuilder b = new VariantBuilder();