diff --git a/CMakeLists.txt b/CMakeLists.txt index f70f1821b..d86a82128 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -347,48 +347,48 @@ set(DUCKDB_SRC_FILES src/duckdb/third_party/zstd/dict/divsufsort.cpp src/duckdb/third_party/zstd/dict/fastcover.cpp src/duckdb/third_party/zstd/dict/zdict.cpp - src/duckdb/extension/core_functions/core_functions_extension.cpp src/duckdb/extension/core_functions/lambda_functions.cpp src/duckdb/extension/core_functions/function_list.cpp - src/duckdb/ub_extension_core_functions_scalar_operators.cpp - src/duckdb/ub_extension_core_functions_scalar_date.cpp - src/duckdb/ub_extension_core_functions_scalar_math.cpp - src/duckdb/ub_extension_core_functions_scalar_generic.cpp - src/duckdb/ub_extension_core_functions_scalar_debug.cpp + src/duckdb/extension/core_functions/core_functions_extension.cpp + src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp + src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp + src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp + src/duckdb/ub_extension_core_functions_aggregate_regression.cpp + src/duckdb/ub_extension_core_functions_aggregate_nested.cpp src/duckdb/ub_extension_core_functions_scalar_blob.cpp - src/duckdb/ub_extension_core_functions_scalar_random.cpp src/duckdb/ub_extension_core_functions_scalar_bit.cpp - src/duckdb/ub_extension_core_functions_scalar_string.cpp + src/duckdb/ub_extension_core_functions_scalar_enum.cpp + src/duckdb/ub_extension_core_functions_scalar_struct.cpp + src/duckdb/ub_extension_core_functions_scalar_debug.cpp src/duckdb/ub_extension_core_functions_scalar_array.cpp src/duckdb/ub_extension_core_functions_scalar_list.cpp src/duckdb/ub_extension_core_functions_scalar_union.cpp - src/duckdb/ub_extension_core_functions_scalar_struct.cpp + src/duckdb/ub_extension_core_functions_scalar_generic.cpp + src/duckdb/ub_extension_core_functions_scalar_string.cpp + src/duckdb/ub_extension_core_functions_scalar_date.cpp src/duckdb/ub_extension_core_functions_scalar_map.cpp - src/duckdb/ub_extension_core_functions_scalar_enum.cpp - src/duckdb/ub_extension_core_functions_aggregate_holistic.cpp - src/duckdb/ub_extension_core_functions_aggregate_distributive.cpp - src/duckdb/ub_extension_core_functions_aggregate_algebraic.cpp - src/duckdb/ub_extension_core_functions_aggregate_regression.cpp - src/duckdb/ub_extension_core_functions_aggregate_nested.cpp - src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp + src/duckdb/ub_extension_core_functions_scalar_math.cpp + src/duckdb/ub_extension_core_functions_scalar_random.cpp + src/duckdb/ub_extension_core_functions_scalar_operators.cpp src/duckdb/extension/parquet/geo_parquet.cpp - src/duckdb/extension/parquet/serialize_parquet.cpp - src/duckdb/extension/parquet/parquet_metadata.cpp - src/duckdb/extension/parquet/parquet_extension.cpp src/duckdb/extension/parquet/parquet_reader.cpp - src/duckdb/extension/parquet/zstd_file_system.cpp - src/duckdb/extension/parquet/parquet_writer.cpp - src/duckdb/extension/parquet/column_reader.cpp + src/duckdb/extension/parquet/parquet_extension.cpp + src/duckdb/extension/parquet/serialize_parquet.cpp src/duckdb/extension/parquet/column_writer.cpp - src/duckdb/extension/parquet/parquet_crypto.cpp src/duckdb/extension/parquet/parquet_float16.cpp src/duckdb/extension/parquet/parquet_timestamp.cpp - src/duckdb/extension/parquet/parquet_statistics.cpp src/duckdb/extension/parquet/parquet_multi_file_info.cpp + src/duckdb/extension/parquet/parquet_metadata.cpp + src/duckdb/extension/parquet/parquet_crypto.cpp + src/duckdb/extension/parquet/zstd_file_system.cpp + src/duckdb/extension/parquet/parquet_statistics.cpp + src/duckdb/extension/parquet/column_reader.cpp + src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp + src/duckdb/extension/parquet/parquet_writer.cpp + src/duckdb/ub_extension_parquet_decoder.cpp + src/duckdb/ub_extension_parquet_writer.cpp src/duckdb/ub_extension_parquet_reader.cpp src/duckdb/ub_extension_parquet_reader_variant.cpp - src/duckdb/ub_extension_parquet_writer.cpp - src/duckdb/ub_extension_parquet_decoder.cpp src/duckdb/third_party/parquet/parquet_types.cpp src/duckdb/third_party/thrift/thrift/protocol/TProtocol.cpp src/duckdb/third_party/thrift/thrift/transport/TTransportException.cpp @@ -427,32 +427,32 @@ set(DUCKDB_SRC_FILES src/duckdb/third_party/brotli/enc/metablock.cpp src/duckdb/third_party/brotli/enc/static_dict.cpp src/duckdb/third_party/brotli/enc/utf8_util.cpp - src/duckdb/extension/icu/./icu-table-range.cpp - src/duckdb/extension/icu/./icu-datepart.cpp - src/duckdb/extension/icu/./icu-dateadd.cpp - src/duckdb/extension/icu/./icu-strptime.cpp src/duckdb/extension/icu/./icu-datesub.cpp - src/duckdb/extension/icu/./icu-makedate.cpp - src/duckdb/extension/icu/./icu-datetrunc.cpp src/duckdb/extension/icu/./icu-datefunc.cpp + src/duckdb/extension/icu/./icu-table-range.cpp + src/duckdb/extension/icu/./icu-datepart.cpp src/duckdb/extension/icu/./icu-list-range.cpp src/duckdb/extension/icu/./icu-timebucket.cpp src/duckdb/extension/icu/./icu-timezone.cpp + src/duckdb/extension/icu/./icu-makedate.cpp src/duckdb/extension/icu/./icu-current.cpp src/duckdb/extension/icu/./icu_extension.cpp + src/duckdb/extension/icu/./icu-dateadd.cpp + src/duckdb/extension/icu/./icu-datetrunc.cpp + src/duckdb/extension/icu/./icu-strptime.cpp src/duckdb/ub_extension_icu_third_party_icu_common.cpp src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp - src/duckdb/extension/json/json_extension.cpp - src/duckdb/extension/json/json_deserializer.cpp src/duckdb/extension/json/serialize_json.cpp + src/duckdb/extension/json/json_functions.cpp src/duckdb/extension/json/json_serializer.cpp - src/duckdb/extension/json/json_multi_file_info.cpp + src/duckdb/extension/json/json_extension.cpp + src/duckdb/extension/json/json_deserializer.cpp src/duckdb/extension/json/json_enums.cpp - src/duckdb/extension/json/json_reader.cpp + src/duckdb/extension/json/json_multi_file_info.cpp src/duckdb/extension/json/json_common.cpp + src/duckdb/extension/json/json_reader.cpp src/duckdb/extension/json/json_scan.cpp - src/duckdb/extension/json/json_functions.cpp src/duckdb/ub_extension_json_json_functions.cpp) set(JEMALLOC_SRC_FILES diff --git a/src/duckdb/extension/parquet/include/decode_utils.hpp b/src/duckdb/extension/parquet/include/decode_utils.hpp index 20ba91dd3..5dad16705 100644 --- a/src/duckdb/extension/parquet/include/decode_utils.hpp +++ b/src/duckdb/extension/parquet/include/decode_utils.hpp @@ -36,6 +36,11 @@ class ParquetDecodeUtils { static void BitUnpack(ByteBuffer &src, bitpacking_width_t &bitpack_pos, T *dst, idx_t count, const bitpacking_width_t width) { CheckWidth(width); + if (width > sizeof(T) * BITPACK_DLEN) { + throw IOException("The width (%d) of the bitpacked data exceeds the maximum width (%d) for " + "the target type, the file might be corrupted.", + width, sizeof(T) * BITPACK_DLEN); + } const auto mask = BITPACK_MASKS[width]; src.available(count * width / BITPACK_DLEN); // check if buffer has enough space available once if (bitpack_pos == 0 && count >= BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) { @@ -88,6 +93,12 @@ class ParquetDecodeUtils { template static void BitUnpackAlignedInternal(ByteBuffer &src, T *dst, const idx_t count, const bitpacking_width_t width) { D_ASSERT(count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0); + if (width > sizeof(T) * BITPACK_DLEN) { + throw IOException("The width (%d) of the bitpacked data exceeds the maximum width (%d) for " + "the target type, the file might be corrupted.", + width, sizeof(T) * BITPACK_DLEN); + } + if (cast_pointer_to_uint64(src.ptr) % sizeof(T) == 0) { // Fast path: aligned BitpackingPrimitives::UnPackBuffer(data_ptr_cast(dst), src.ptr, count, width); diff --git a/src/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp b/src/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp index a7c717709..f1d7e64b1 100644 --- a/src/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp +++ b/src/duckdb/extension/parquet/include/reader/variant/variant_binary_decoder.hpp @@ -32,12 +32,13 @@ struct VariantMetadata { public: VariantMetadataHeader header; - const_data_ptr_t offsets; - const_data_ptr_t bytes; //! The json object keys have to be null-terminated //! But we don't receive them null-terminated vector strings; + + //! Total byte length of the metadata region. + idx_t total_size; }; //! ------------ Value ------------ @@ -134,17 +135,18 @@ class VariantBinaryDecoder { VariantBinaryDecoder() = delete; public: - static VariantValue Decode(const VariantMetadata &metadata, const_data_ptr_t data); + static VariantValue Decode(const VariantMetadata &metadata, const_data_ptr_t data, idx_t data_offset, + idx_t data_size); public: static VariantValue PrimitiveTypeDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata, - const_data_ptr_t data); - static VariantValue ShortStringDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata, - const_data_ptr_t data); + const_data_ptr_t data, idx_t data_offset, idx_t data_size); + static VariantValue ShortStringDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size); static VariantValue ObjectDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata, - const_data_ptr_t data); + const_data_ptr_t data, idx_t data_offset, idx_t data_size); static VariantValue ArrayDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata, - const_data_ptr_t data); + const_data_ptr_t data, idx_t data_offset, idx_t data_size); }; } // namespace duckdb diff --git a/src/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp b/src/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp index eacff5501..4f1e4c427 100644 --- a/src/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp +++ b/src/duckdb/extension/parquet/reader/variant/variant_binary_decoder.cpp @@ -40,13 +40,18 @@ namespace duckdb { namespace { -static idx_t ReadVariableLengthLittleEndian(idx_t length_in_bytes, const_data_ptr_t &ptr) { +static idx_t ReadVariableLengthLittleEndian(idx_t length_in_bytes, const_data_ptr_t ptr, idx_t &offset, + const idx_t capacity) { if (length_in_bytes > sizeof(idx_t)) { throw NotImplementedException("Can't read little-endian value of %d bytes", length_in_bytes); } + if (offset + length_in_bytes > capacity) { + throw IOException("Data corruption detected, read of length_in_bytes (%d) would exceed buffer capacity", + length_in_bytes); + } idx_t result = 0; - memcpy(reinterpret_cast(&result), ptr, length_in_bytes); - ptr += length_in_bytes; + memcpy(reinterpret_cast(&result), ptr + offset, length_in_bytes); + offset += length_in_bytes; return result; } @@ -67,21 +72,34 @@ VariantMetadataHeader VariantMetadataHeader::FromHeaderByte(uint8_t byte) { } VariantMetadata::VariantMetadata(const string_t &metadata) : metadata(metadata) { - auto metadata_data = metadata.GetData(); + auto metadata_data = reinterpret_cast(metadata.GetData()); + const auto metadata_buffer_capacity = metadata.GetSize(); + if (!metadata_data || metadata.GetSize() < 1) { + throw IOException("Corrupted VARIANT 'metadata' buffer, empty or nullptr"); + } + + idx_t metadata_offset = 0; + header = VariantMetadataHeader::FromHeaderByte(metadata_data[metadata_offset]); + metadata_offset += sizeof(uint8_t); - header = VariantMetadataHeader::FromHeaderByte(metadata_data[0]); + idx_t dictionary_size = + ReadVariableLengthLittleEndian(header.offset_size, metadata_data, metadata_offset, metadata_buffer_capacity); - const_data_ptr_t ptr = reinterpret_cast(metadata_data + sizeof(uint8_t)); - idx_t dictionary_size = ReadVariableLengthLittleEndian(header.offset_size, ptr); + auto data_start = metadata_offset + ((dictionary_size + 1) * header.offset_size); + idx_t last_offset = + ReadVariableLengthLittleEndian(header.offset_size, metadata_data, metadata_offset, metadata_buffer_capacity); - offsets = ptr; - bytes = offsets + ((dictionary_size + 1) * header.offset_size); - idx_t last_offset = ReadVariableLengthLittleEndian(header.offset_size, ptr); for (idx_t i = 0; i < dictionary_size; i++) { - auto next_offset = ReadVariableLengthLittleEndian(header.offset_size, ptr); - strings.emplace_back(reinterpret_cast(bytes + last_offset), next_offset - last_offset); + auto next_offset = ReadVariableLengthLittleEndian(header.offset_size, metadata_data, metadata_offset, + metadata_buffer_capacity); + const idx_t string_size = next_offset - last_offset; + if (data_start + last_offset + string_size > metadata_buffer_capacity) { + throw IOException("Corrupted VARIANT 'metadata' buffer"); + } + strings.emplace_back(reinterpret_cast(metadata_data + data_start + last_offset), string_size); last_offset = next_offset; } + total_size = metadata_offset + last_offset; } VariantValueMetadata VariantValueMetadata::FromHeaderByte(uint8_t byte) { @@ -109,17 +127,20 @@ VariantValueMetadata VariantValueMetadata::FromHeaderByte(uint8_t byte) { break; } default: - throw InternalException("VariantBasicType (%d) not handled", static_cast(result.basic_type)); + throw IOException("VariantBasicType (%d) not handled", static_cast(result.basic_type)); } return result; } template -static T DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) { - scale = Load(data); - data++; +static T DecodeDecimal(const_data_ptr_t data, idx_t data_offset, idx_t data_size, uint8_t &scale, uint8_t &width) { + if (data_offset + sizeof(uint8_t) + sizeof(T) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + scale = Load(data + data_offset); + data_offset += sizeof(uint8_t); - auto result = Load(data); + auto result = Load(data + data_offset); //! FIXME: The spec says: //! The implied precision of a decimal value is `floor(log_10(val)) + 1` width = DecimalWidth::max; @@ -127,13 +148,17 @@ static T DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) { } template <> -hugeint_t DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) { - scale = Load(data); - data++; +hugeint_t DecodeDecimal(const_data_ptr_t data, idx_t data_offset, idx_t data_size, uint8_t &scale, uint8_t &width) { + if (data_offset + sizeof(uint8_t) + sizeof(uint64_t) + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + scale = Load(data + data_offset); + data_offset += sizeof(uint8_t); hugeint_t result; - result.lower = Load(data); - result.upper = Load(data + sizeof(uint64_t)); + result.lower = Load(data + data_offset); + data_offset += sizeof(uint64_t); + result.upper = Load(data + data_offset); //! FIXME: The spec says: //! The implied precision of a decimal value is `floor(log_10(val)) + 1` width = DecimalWidth::max; @@ -142,7 +167,7 @@ hugeint_t DecodeDecimal(const_data_ptr_t data, uint8_t &scale, uint8_t &width) { VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantMetadata &metadata, const VariantValueMetadata &value_metadata, - const_data_ptr_t data) { + const_data_ptr_t data, idx_t data_offset, idx_t data_size) { switch (value_metadata.primitive_type) { case VariantPrimitiveType::NULL_TYPE: { return VariantValue(Value()); @@ -154,34 +179,52 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantMetadata &me return VariantValue(Value::BOOLEAN(false)); } case VariantPrimitiveType::INT8: { - auto value = Load(data); + if (data_offset + sizeof(int8_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value = Load(data + data_offset); return VariantValue(Value::TINYINT(value)); } case VariantPrimitiveType::INT16: { - auto value = Load(data); + if (data_offset + sizeof(int16_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value = Load(data + data_offset); return VariantValue(Value::SMALLINT(value)); } case VariantPrimitiveType::INT32: { - auto value = Load(data); + if (data_offset + sizeof(int32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value = Load(data + data_offset); return VariantValue(Value::INTEGER(value)); } case VariantPrimitiveType::INT64: { - auto value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value = Load(data + data_offset); return VariantValue(Value::BIGINT(value)); } case VariantPrimitiveType::DOUBLE: { - double value = Load(data); + if (data_offset + sizeof(double) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + double value = Load(data + data_offset); return VariantValue(Value::DOUBLE(value)); } case VariantPrimitiveType::FLOAT: { - float value = Load(data); + if (data_offset + sizeof(float) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + float value = Load(data + data_offset); return VariantValue(Value::FLOAT(value)); } case VariantPrimitiveType::DECIMAL4: { uint8_t scale; uint8_t width; - auto value = DecodeDecimal(data, scale, width); + auto value = DecodeDecimal(data, data_offset, data_size, scale, width); auto value_str = Decimal::ToString(value, width, scale); return VariantValue(Value(value_str)); } @@ -189,7 +232,7 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantMetadata &me uint8_t scale; uint8_t width; - auto value = DecodeDecimal(data, scale, width); + auto value = DecodeDecimal(data, data_offset, data_size, scale, width); auto value_str = Decimal::ToString(value, width, scale); return VariantValue(Value(value_str)); } @@ -197,23 +240,32 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantMetadata &me uint8_t scale; uint8_t width; - auto value = DecodeDecimal(data, scale, width); + auto value = DecodeDecimal(data, data_offset, data_size, scale, width); auto value_str = Decimal::ToString(value, width, scale); return VariantValue(Value(value_str)); } case VariantPrimitiveType::DATE: { date_t value; - value.days = Load(data); + if (data_offset + sizeof(int32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + value.days = Load(data + data_offset); return VariantValue(Value::DATE(value)); } case VariantPrimitiveType::TIMESTAMP_MICROS: { timestamp_tz_t micros_ts_tz; - micros_ts_tz.value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + micros_ts_tz.value = Load(data + data_offset); return VariantValue(Value::TIMESTAMPTZ(micros_ts_tz)); } case VariantPrimitiveType::TIMESTAMP_NTZ_MICROS: { timestamp_t micros_ts; - micros_ts.value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + micros_ts.value = Load(data + data_offset); auto value = Value::TIMESTAMP(micros_ts); auto value_str = value.ToString(); @@ -222,27 +274,49 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantMetadata &me case VariantPrimitiveType::BINARY: { //! Follow the JSON serialization guide by converting BINARY to Base64: //! For example: `"dmFyaWFudAo="` - auto size = Load(data); - auto string_data = reinterpret_cast(data + sizeof(uint32_t)); + if (data_offset + sizeof(uint32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto size = Load(data + data_offset); + data_offset += sizeof(uint32_t); + + auto string_data = reinterpret_cast(data + data_offset); + if (data_offset + size > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } auto base64_string = Blob::ToBase64(string_t(string_data, size)); return VariantValue(Value(base64_string)); } case VariantPrimitiveType::STRING: { - auto size = Load(data); - auto string_data = reinterpret_cast(data + sizeof(uint32_t)); + if (data_offset + sizeof(uint32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto size = Load(data + data_offset); + data_offset += sizeof(uint32_t); + + auto string_data = reinterpret_cast(data + data_offset); + if (data_offset + size > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } if (!Utf8Proc::IsValid(string_data, size)) { - throw InternalException("Can't decode Variant short-string, string isn't valid UTF8"); + throw IOException("Can't decode Variant short-string, string isn't valid UTF8"); } return VariantValue(Value(string(string_data, size))); } case VariantPrimitiveType::TIME_NTZ_MICROS: { dtime_t micros_time; - micros_time.micros = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + micros_time.micros = Load(data + data_offset); return VariantValue(Value::TIME(micros_time)); } case VariantPrimitiveType::TIMESTAMP_NANOS: { timestamp_ns_t nanos_ts; - nanos_ts.value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + nanos_ts.value = Load(data + data_offset); //! Convert the nanos timestamp to a micros timestamp (not lossless) auto micros_ts = Timestamp::FromEpochNanoSeconds(nanos_ts.value); @@ -250,14 +324,20 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantMetadata &me } case VariantPrimitiveType::TIMESTAMP_NTZ_NANOS: { timestamp_ns_t nanos_ts; - nanos_ts.value = Load(data); + if (data_offset + sizeof(int64_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + nanos_ts.value = Load(data + data_offset); auto value = Value::TIMESTAMPNS(nanos_ts); auto value_str = value.ToString(); return VariantValue(Value(value_str)); } case VariantPrimitiveType::UUID: { - auto uuid_value = UUIDValueConversion::ReadParquetUUID(data); + if (data_offset + sizeof(hugeint_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto uuid_value = UUIDValueConversion::ReadParquetUUID(data + data_offset); auto value_str = UUID::ToString(uuid_value); return VariantValue(Value(value_str)); } @@ -267,19 +347,24 @@ VariantValue VariantBinaryDecoder::PrimitiveTypeDecode(const VariantMetadata &me } } -VariantValue VariantBinaryDecoder::ShortStringDecode(const VariantMetadata &metadata, - const VariantValueMetadata &value_metadata, - const_data_ptr_t data) { - D_ASSERT(value_metadata.string_size < 64); - auto string_data = reinterpret_cast(data); +VariantValue VariantBinaryDecoder::ShortStringDecode(const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size) { + if (value_metadata.string_size >= 64) { + throw IOException("Corrupted VARIANT 'metadata' buffer"); + } + auto string_data = reinterpret_cast(data + data_offset); + if (data_offset + value_metadata.string_size > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } if (!Utf8Proc::IsValid(string_data, value_metadata.string_size)) { - throw InternalException("Can't decode Variant short-string, string isn't valid UTF8"); + throw IOException("Can't decode Variant short-string, string isn't valid UTF8"); } return VariantValue(Value(string(string_data, value_metadata.string_size))); } VariantValue VariantBinaryDecoder::ObjectDecode(const VariantMetadata &metadata, - const VariantValueMetadata &value_metadata, const_data_ptr_t data) { + const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size) { VariantValue ret(VariantValueType::OBJECT); auto field_offset_size = value_metadata.field_offset_size; @@ -288,23 +373,32 @@ VariantValue VariantBinaryDecoder::ObjectDecode(const VariantMetadata &metadata, idx_t num_elements; if (is_large) { - num_elements = Load(data); - data += sizeof(uint32_t); + if (data_offset + sizeof(uint32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + num_elements = Load(data + data_offset); + data_offset += sizeof(uint32_t); } else { - num_elements = Load(data); - data += sizeof(uint8_t); + if (data_offset + sizeof(uint8_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + num_elements = Load(data + data_offset); + data_offset += sizeof(uint8_t); } - auto field_ids = data; - auto field_offsets = data + (num_elements * field_id_size); - auto values = field_offsets + ((num_elements + 1) * field_offset_size); + auto field_ids_offset = data_offset; + auto field_offsets_offset = data_offset + (num_elements * field_id_size); + auto values_offset = field_offsets_offset + ((num_elements + 1) * field_offset_size); - idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets); + idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, data, field_offsets_offset, data_size); for (idx_t i = 0; i < num_elements; i++) { - auto field_id = ReadVariableLengthLittleEndian(field_id_size, field_ids); - auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets); + auto field_id = ReadVariableLengthLittleEndian(field_id_size, data, field_ids_offset, data_size); + auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, data, field_offsets_offset, data_size); - auto value = Decode(metadata, values + last_offset); + auto value = Decode(metadata, data, values_offset + last_offset, data_size); + if (field_id >= metadata.strings.size()) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } auto &key = metadata.strings[field_id]; ret.AddChild(key, std::move(value)); @@ -314,7 +408,8 @@ VariantValue VariantBinaryDecoder::ObjectDecode(const VariantMetadata &metadata, } VariantValue VariantBinaryDecoder::ArrayDecode(const VariantMetadata &metadata, - const VariantValueMetadata &value_metadata, const_data_ptr_t data) { + const VariantValueMetadata &value_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size) { VariantValue ret(VariantValueType::ARRAY); auto field_offset_size = value_metadata.field_offset_size; @@ -322,42 +417,52 @@ VariantValue VariantBinaryDecoder::ArrayDecode(const VariantMetadata &metadata, uint32_t num_elements; if (is_large) { - num_elements = Load(data); - data += sizeof(uint32_t); + if (data_offset + sizeof(uint32_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + num_elements = Load(data + data_offset); + data_offset += sizeof(uint32_t); } else { - num_elements = Load(data); - data += sizeof(uint8_t); + if (data_offset + sizeof(uint8_t) > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + num_elements = Load(data + data_offset); + data_offset += sizeof(uint8_t); } - auto field_offsets = data; - auto values = field_offsets + ((num_elements + 1) * field_offset_size); + auto field_offsets_offset = data_offset; + auto values_offset = field_offsets_offset + ((num_elements + 1) * field_offset_size); - idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets); + idx_t last_offset = ReadVariableLengthLittleEndian(field_offset_size, data, field_offsets_offset, data_size); for (idx_t i = 0; i < num_elements; i++) { - auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, field_offsets); + auto next_offset = ReadVariableLengthLittleEndian(field_offset_size, data, field_offsets_offset, data_size); - ret.AddItem(Decode(metadata, values + last_offset)); + ret.AddItem(Decode(metadata, data, values_offset + last_offset, data_size)); last_offset = next_offset; } return ret; } -VariantValue VariantBinaryDecoder::Decode(const VariantMetadata &variant_metadata, const_data_ptr_t data) { - auto value_metadata = VariantValueMetadata::FromHeaderByte(data[0]); +VariantValue VariantBinaryDecoder::Decode(const VariantMetadata &variant_metadata, const_data_ptr_t data, + idx_t data_offset, idx_t data_size) { + if (data_offset + 1 > data_size) { + throw IOException("Corrupted VARIANT 'value' buffer"); + } + auto value_metadata = VariantValueMetadata::FromHeaderByte(data[data_offset]); + data_offset += sizeof(uint8_t); - data++; switch (value_metadata.basic_type) { case VariantBasicType::PRIMITIVE: { - return PrimitiveTypeDecode(variant_metadata, value_metadata, data); + return PrimitiveTypeDecode(variant_metadata, value_metadata, data, data_offset, data_size); } case VariantBasicType::SHORT_STRING: { - return ShortStringDecode(variant_metadata, value_metadata, data); + return ShortStringDecode(value_metadata, data, data_offset, data_size); } case VariantBasicType::OBJECT: { - return ObjectDecode(variant_metadata, value_metadata, data); + return ObjectDecode(variant_metadata, value_metadata, data, data_offset, data_size); } case VariantBasicType::ARRAY: { - return ArrayDecode(variant_metadata, value_metadata, data); + return ArrayDecode(variant_metadata, value_metadata, data, data_offset, data_size); } default: throw InternalException("Unexpected value for VariantBasicType"); diff --git a/src/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp b/src/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp index 8278eb740..f2b17508f 100644 --- a/src/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp +++ b/src/duckdb/extension/parquet/reader/variant/variant_shredded_conversion.cpp @@ -177,8 +177,10 @@ vector ConvertTypedValues(Vector &vec, Vector &metadata, Vector &b } else if (value_validity.RowIsValid(value_index)) { auto metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, - const_data_ptr_cast(value_data[value_index].GetData())); + + auto &value_buffer = value_data[value_index]; + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(value_buffer.GetData()), 0, + value_buffer.GetSize()); } } } @@ -336,8 +338,12 @@ static vector ConvertBinaryEncoding(Vector &metadata, Vector &valu D_ASSERT(validity.RowIsValid(index)); auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - auto binary_value = value_data[index].GetData(); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value)); + + auto &value_buffer = value_data[index]; + auto binary_value = value_buffer.GetData(); + + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value), 0, + value_buffer.GetSize()); } } else { //! Even though 'typed_value' is not present, 'value' is allowed to contain NULLs because we're scanning an @@ -349,8 +355,12 @@ static vector ConvertBinaryEncoding(Vector &metadata, Vector &valu if (validity.RowIsValid(index)) { auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - auto binary_value = value_data[index].GetData(); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value)); + + auto &value_buffer = value_data[index]; + auto binary_value = value_buffer.GetData(); + + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value), 0, + value_buffer.GetSize()); } } } @@ -381,8 +391,11 @@ static VariantValue ConvertPartiallyShreddedObject(vector //! Object is partially shredded, decode the object and merge the values auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - auto binary_value = value_data[index].GetData(); - auto unshredded = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value)); + + auto &value_buffer = value_data[index]; + auto binary_value = value_buffer.GetData(); + auto unshredded = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value), 0, + value_buffer.GetSize()); if (unshredded.value_type != VariantValueType::OBJECT) { throw InvalidInputException("Partially shredded objects have to encode Object Variants in the 'value'"); } @@ -448,8 +461,11 @@ vector VariantShreddedConversion::ConvertShreddedObject(Vector &me D_ASSERT(validity.RowIsValid(value_index)); auto &metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - auto binary_value = value_data[value_index].GetData(); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value)); + + auto &value_buffer = value_data[value_index]; + auto binary_value = value_buffer.GetData(); + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(binary_value), 0, + value_buffer.GetSize()); if (ret[i].value_type == VariantValueType::OBJECT) { throw InvalidInputException( "When 'typed_value' for a shredded Object is NULL, 'value' can not contain an Object value"); @@ -507,8 +523,10 @@ vector VariantShreddedConversion::ConvertShreddedArray(Vector &met } else if (value_validity.RowIsValid(value_index)) { auto metadata_value = metadata_data[metadata_format.sel->get_index(i)]; VariantMetadata variant_metadata(metadata_value); - ret[i] = VariantBinaryDecoder::Decode(variant_metadata, - const_data_ptr_cast(value_data[value_index].GetData())); + + const auto &value_buffer = value_data[value_index]; + ret[i] = VariantBinaryDecoder::Decode(variant_metadata, const_data_ptr_cast(value_buffer.GetData()), 0, + value_buffer.GetSize()); } } } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 4688c28a8..b40316c12 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "5-dev101" +#define DUCKDB_PATCH_VERSION "5-dev107" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 4 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.4.5-dev101" +#define DUCKDB_VERSION "v1.4.5-dev107" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "ca5f01efef" +#define DUCKDB_SOURCE_ID "f31be57c18" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/storage/compression/alp/alp_scan.hpp b/src/duckdb/src/include/duckdb/storage/compression/alp/alp_scan.hpp index 28b52b848..6197a08b9 100644 --- a/src/duckdb/src/include/duckdb/storage/compression/alp/alp_scan.hpp +++ b/src/duckdb/src/include/duckdb/storage/compression/alp/alp_scan.hpp @@ -130,7 +130,14 @@ struct AlpScanState : public SegmentScanState { // Load the offset (metadata) indicating where the vector data starts metadata_ptr -= AlpConstants::METADATA_POINTER_SIZE; auto data_byte_offset = Load(metadata_ptr); - D_ASSERT(data_byte_offset < segment.GetBlockManager().GetBlockSize()); + + const auto block_size = segment.GetBlockManager().GetBlockSize(); + + if (data_byte_offset >= block_size) { + throw IOException( + "Corrupted ALP segment: stored data_byte_offset (%d) exceeds the segments block size (%d)", + data_byte_offset, block_size); + } idx_t vector_size = MinValue((idx_t)AlpConstants::ALP_VECTOR_SIZE, (count - total_value_count)); @@ -152,22 +159,56 @@ struct AlpScanState : public SegmentScanState { vector_state.bit_width = Load(vector_ptr); vector_ptr += AlpConstants::BIT_WIDTH_SIZE; - D_ASSERT(vector_state.exceptions_count <= vector_size); D_ASSERT(vector_state.v_exponent <= AlpTypedConstants::MAX_EXPONENT); - D_ASSERT(vector_state.v_factor <= vector_state.v_exponent); - D_ASSERT(vector_state.bit_width <= sizeof(uint64_t) * 8); + if (vector_state.exceptions_count > vector_size) { + throw IOException("Corrupted ALP segment: exceptions_count (%d) exceeds vector_size (%d)", + vector_state.exceptions_count, vector_size); + } + if (vector_state.v_factor > vector_state.v_exponent) { + throw IOException("Corrupted ALP segment: v_factor (%d) exceeds v_exponent (%d)", vector_state.v_factor, + vector_state.v_exponent); + } + if (vector_state.bit_width > sizeof(uint64_t) * 8) { + throw IOException("Corrupted ALP segment: Invalid bit_width encountered: %d", vector_state.bit_width); + } + + idx_t read_bytes = 0; if (vector_state.bit_width > 0) { auto bp_size = BitpackingPrimitives::GetRequiredSize(vector_size, vector_state.bit_width); + + const idx_t max_encoded = sizeof(vector_state.for_encoded); + if (bp_size > max_encoded || data_byte_offset + read_bytes + bp_size > block_size) { + throw IOException("Corrupted ALP segment: encoded payload too large"); + } memcpy(vector_state.for_encoded, (void *)vector_ptr, bp_size); vector_ptr += bp_size; + read_bytes += bp_size; } if (vector_state.exceptions_count > 0) { - memcpy(vector_state.exceptions, (void *)vector_ptr, sizeof(EXACT_TYPE) * vector_state.exceptions_count); - vector_ptr += sizeof(EXACT_TYPE) * vector_state.exceptions_count; - memcpy(vector_state.exceptions_positions, (void *)vector_ptr, - AlpConstants::EXCEPTION_POSITION_SIZE * vector_state.exceptions_count); + //! Load the exceptions + const idx_t max_exceptions_size = sizeof(vector_state.exceptions); + const idx_t exceptions_copy_size = sizeof(EXACT_TYPE) * vector_state.exceptions_count; + if (exceptions_copy_size > max_exceptions_size || + data_byte_offset + read_bytes + exceptions_copy_size > block_size) { + throw IOException("Corrupted ALP segment: exceptions payload too large"); + } + memcpy(vector_state.exceptions, (void *)vector_ptr, exceptions_copy_size); + vector_ptr += exceptions_copy_size; + read_bytes += exceptions_copy_size; + + //! Load the exceptions_positions + const idx_t max_exceptions_positions_size = sizeof(vector_state.exceptions_positions); + const idx_t exceptions_positions_copy_size = + AlpConstants::EXCEPTION_POSITION_SIZE * vector_state.exceptions_count; + if (exceptions_positions_copy_size > max_exceptions_positions_size || + data_byte_offset + read_bytes + exceptions_positions_copy_size > block_size) { + throw IOException("Corrupted ALP segment: exceptions_positions payload too large"); + } + memcpy(vector_state.exceptions_positions, (void *)vector_ptr, exceptions_positions_copy_size); + vector_ptr += exceptions_positions_copy_size; + read_bytes += exceptions_positions_copy_size; } // Decode all the vector values to the specified 'value_buffer' diff --git a/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_scan.hpp b/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_scan.hpp index 4b3f6b991..5ceb13701 100644 --- a/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_scan.hpp +++ b/src/duckdb/src/include/duckdb/storage/compression/alprd/alprd_scan.hpp @@ -79,26 +79,49 @@ struct AlpRDScanState : public SegmentScanState { // ScanStates never exceed the boundaries of a Segment, // but are not guaranteed to start at the beginning of the Block segment_data = handle.Ptr() + segment.GetBlockOffset(); + const auto block_size = segment.GetBlockManager().GetBlockSize(); + + idx_t total_segment_offset = segment.GetBlockOffset(); auto metadata_offset = Load(segment_data); + auto segment_ptr = segment_data + AlpRDConstants::METADATA_POINTER_SIZE; + total_segment_offset += AlpRDConstants::METADATA_POINTER_SIZE; + metadata_ptr = segment_data + metadata_offset; + const idx_t metadata_ptr_offset = segment.GetBlockOffset() + metadata_offset; + if (metadata_ptr_offset > block_size) { + throw IOException("Corrupted ALPRD segment: metadata_offset value is corrupted"); + } + + if (total_segment_offset + AlpRDConstants::HEADER_SIZE > block_size) { + throw IOException("Corrupted ALPRD segment: reading header bytes would exceed block space"); + } // Load the Right Bit Width which is in the segment header after the pointer to the first metadata - vector_state.right_bit_width = Load(segment_data + AlpRDConstants::METADATA_POINTER_SIZE); - vector_state.left_bit_width = - Load(segment_data + AlpRDConstants::METADATA_POINTER_SIZE + AlpRDConstants::RIGHT_BIT_WIDTH_SIZE); + vector_state.right_bit_width = Load(segment_ptr); + segment_ptr += AlpRDConstants::RIGHT_BIT_WIDTH_SIZE; + + vector_state.left_bit_width = Load(segment_ptr); + segment_ptr += AlpRDConstants::LEFT_BIT_WIDTH_SIZE; + + uint8_t actual_dictionary_size = Load(segment_ptr); + segment_ptr += AlpRDConstants::N_DICTIONARY_ELEMENTS_SIZE; + + total_segment_offset += AlpRDConstants::HEADER_SIZE; - uint8_t actual_dictionary_size = - Load(segment_data + AlpRDConstants::METADATA_POINTER_SIZE + AlpRDConstants::RIGHT_BIT_WIDTH_SIZE + - AlpRDConstants::LEFT_BIT_WIDTH_SIZE); if (actual_dictionary_size > AlpRDConstants::MAX_DICTIONARY_SIZE) { throw IOException("Corrupt database file: ALPRD dictionary size exceeds maximum"); } idx_t actual_dictionary_size_bytes = static_cast(actual_dictionary_size) * AlpRDConstants::DICTIONARY_ELEMENT_SIZE; + const idx_t left_parts_dict_max_size = sizeof(vector_state.left_parts_dict); + if (total_segment_offset + actual_dictionary_size_bytes > metadata_ptr_offset || + actual_dictionary_size_bytes > left_parts_dict_max_size) { + throw IOException("Corrupted ALPRD segment: actual_dictionary_size is corrupted"); + } + // Load the left parts dictionary which is after the segment header and is of a fixed size - memcpy(vector_state.left_parts_dict, (void *)(segment_data + AlpRDConstants::HEADER_SIZE), - actual_dictionary_size_bytes); + memcpy(vector_state.left_parts_dict, segment_ptr, actual_dictionary_size_bytes); } BufferHandle handle; @@ -153,7 +176,12 @@ struct AlpRDScanState : public SegmentScanState { // Load the offset (metadata) indicating where the vector data starts metadata_ptr -= AlpRDConstants::METADATA_POINTER_SIZE; auto data_byte_offset = Load(metadata_ptr); - D_ASSERT(data_byte_offset < segment.GetBlockManager().GetBlockSize()); + const auto block_size = segment.GetBlockManager().GetBlockSize(); + if (data_byte_offset >= block_size) { + throw IOException( + "Corrupted ALPRD segment: stored data_byte_offset (%d) exceeds the segments block size (%d)", + data_byte_offset, block_size); + } idx_t vector_size = MinValue((idx_t)AlpRDConstants::ALP_VECTOR_SIZE, (count - total_value_count)); @@ -167,18 +195,46 @@ struct AlpRDScanState : public SegmentScanState { auto left_bp_size = BitpackingPrimitives::GetRequiredSize(vector_size, vector_state.left_bit_width); auto right_bp_size = BitpackingPrimitives::GetRequiredSize(vector_size, vector_state.right_bit_width); + idx_t read_bytes = 0; + const idx_t max_left_encoded_size = sizeof(vector_state.left_encoded); + if (left_bp_size > max_left_encoded_size || data_byte_offset + read_bytes + left_bp_size > block_size) { + throw IOException("Corrupted ALPRD segment: left_encoded payload too large"); + } memcpy(vector_state.left_encoded, (void *)vector_ptr, left_bp_size); vector_ptr += left_bp_size; + read_bytes += left_bp_size; + const idx_t max_right_encoded_size = sizeof(vector_state.right_encoded); + if (right_bp_size > max_right_encoded_size || data_byte_offset + read_bytes + right_bp_size > block_size) { + throw IOException("Corrupted ALPRD segment: left_encoded payload too large"); + } memcpy(vector_state.right_encoded, (void *)vector_ptr, right_bp_size); vector_ptr += right_bp_size; + read_bytes += right_bp_size; if (vector_state.exceptions_count > 0) { - memcpy(vector_state.exceptions, (void *)vector_ptr, - AlpRDConstants::EXCEPTION_SIZE * vector_state.exceptions_count); - vector_ptr += AlpRDConstants::EXCEPTION_SIZE * vector_state.exceptions_count; - memcpy(vector_state.exceptions_positions, (void *)vector_ptr, - AlpRDConstants::EXCEPTION_POSITION_SIZE * vector_state.exceptions_count); + //! Load the exceptions + const idx_t max_exceptions_size = sizeof(vector_state.exceptions); + const idx_t exceptions_copy_size = AlpRDConstants::EXCEPTION_SIZE * vector_state.exceptions_count; + if (exceptions_copy_size > max_exceptions_size || + data_byte_offset + read_bytes + exceptions_copy_size > block_size) { + throw IOException("Corrupted ALPRD segment: exceptions payload too large"); + } + memcpy(vector_state.exceptions, (void *)vector_ptr, exceptions_copy_size); + vector_ptr += exceptions_copy_size; + read_bytes += exceptions_copy_size; + + //! Load the exceptions_positions + const idx_t max_exceptions_positions_size = sizeof(vector_state.exceptions_positions); + const idx_t exceptions_positions_copy_size = + AlpRDConstants::EXCEPTION_POSITION_SIZE * vector_state.exceptions_count; + if (exceptions_positions_copy_size > max_exceptions_positions_size || + data_byte_offset + read_bytes + exceptions_positions_copy_size > block_size) { + throw IOException("Corrupted ALPRD segment: exceptions_positions payload too large"); + } + memcpy(vector_state.exceptions_positions, (void *)vector_ptr, exceptions_positions_copy_size); + vector_ptr += exceptions_positions_copy_size; + read_bytes += exceptions_positions_copy_size; } // Decode all the vector values to the specified 'value_buffer' diff --git a/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp b/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp index b523600e3..a4f214735 100644 --- a/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp +++ b/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp @@ -74,6 +74,14 @@ struct PatasGroupState { } value_buffer[0] = (EXACT_TYPE)0; for (idx_t i = 0; i < count; i++) { + if (unpacked_data[i].index_diff > i) { + throw IOException("Corrupted Patas segment: invalid backward reference"); + } + if (unpacked_data[i].significant_bytes > sizeof(EXACT_TYPE) || + unpacked_data[i].trailing_zeros >= sizeof(EXACT_TYPE) * 8) { + throw IOException("Corrupted Patas segment: invalid packed value metadata"); + } + value_buffer[i] = patas::PatasDecompression::DecompressValue( byte_reader, unpacked_data[i].significant_bytes, unpacked_data[i].trailing_zeros, value_buffer[i - unpacked_data[i].index_diff]); @@ -102,6 +110,9 @@ struct PatasScanState : public SegmentScanState { // but are not guaranteed to start at the beginning of the Block segment_data = handle.Ptr() + segment.GetBlockOffset(); auto metadata_offset = Load(segment_data); + if (segment.GetBlockOffset() + metadata_offset > segment.GetBlockManager().GetBlockSize()) { + throw IOException("Corrupted Patas segment: metadata_offset reaches outside of the blocks memory"); + } metadata_ptr = segment_data + metadata_offset; } @@ -161,7 +172,10 @@ struct PatasScanState : public SegmentScanState { // Load the offset indicating where a groups data starts metadata_ptr -= sizeof(uint32_t); auto data_byte_offset = Load(metadata_ptr); - D_ASSERT(data_byte_offset < segment.GetBlockManager().GetBlockSize()); + + if (segment.GetBlockOffset() + data_byte_offset >= segment.GetBlockManager().GetBlockSize()) { + throw IOException("Corrupted Patas segment: data_byte_offset would reach outside of the blocks memory"); + } // Initialize the byte_reader with the data values for the group group_state.Init(segment_data + data_byte_offset); diff --git a/src/duckdb/src/storage/compression/rle.cpp b/src/duckdb/src/storage/compression/rle.cpp index 57ebaf1fa..c005d8428 100644 --- a/src/duckdb/src/storage/compression/rle.cpp +++ b/src/duckdb/src/storage/compression/rle.cpp @@ -254,52 +254,72 @@ void RLEFinalizeCompress(CompressionState &state_p) { //===--------------------------------------------------------------------===// template struct RLEScanState : public SegmentScanState { - explicit RLEScanState(ColumnSegment &segment) { - auto &buffer_manager = BufferManager::GetBufferManager(segment.db); - handle = buffer_manager.Pin(segment.block); - entry_pos = 0; - position_in_entry = 0; - rle_count_offset = UnsafeNumericCast(Load(handle.Ptr() + segment.GetBlockOffset())); - D_ASSERT(rle_count_offset <= segment.GetBlockManager().GetBlockSize()); + explicit RLEScanState(ColumnSegment &segment) + : handle(BufferManager::GetBufferManager(segment.db).Pin(segment.block)), entry_pos(0), position_in_entry(0), + rle_count_offset(UnsafeNumericCast(Load(handle.Ptr() + segment.GetBlockOffset()))), + data_pointer(reinterpret_cast(handle.Ptr() + segment.GetBlockOffset() + RLEConstants::RLE_HEADER_SIZE)), + index_pointer(reinterpret_cast(handle.Ptr() + segment.GetBlockOffset() + rle_count_offset)), + max_entry_pos(static_cast(reinterpret_cast( + handle.Ptr() + segment.GetBlockManager().GetBlockSize()) - + reinterpret_cast(index_pointer)) / + static_cast(sizeof(rle_count_t))) { + if (rle_count_offset < RLEConstants::RLE_HEADER_SIZE) { + //! This would make the index_pointer point into a region reserved for the header data + throw IOException("Corrupted RLE segment: rle_count_offset is corrupted"); + } + if (segment.GetBlockOffset() + rle_count_offset > segment.GetBlockManager().GetBlockSize()) { + //! This would make the index_pointer start outside of the segment + throw IOException("Corrupted RLE segment: rle_count_offset is corrupted"); + } + if ((rle_count_offset - RLEConstants::RLE_HEADER_SIZE) / sizeof(T) > max_entry_pos) { + //! This would make the indexing of the index_pointer[entry_pos] reach outside of the segment + throw IOException("Corrupted RLE segment: rle_count_offset is corrupted"); + } } - inline void SkipInternal(rle_count_t *index_pointer, idx_t skip_count) { + inline void SkipInternal(idx_t skip_count) { while (skip_count > 0) { rle_count_t run_end = index_pointer[entry_pos]; idx_t skip_amount = MinValue(skip_count, run_end - position_in_entry); skip_count -= skip_amount; position_in_entry += skip_amount; - if (ExhaustedRun(index_pointer)) { + if (ExhaustedRun()) { ForwardToNextRun(); } } } void Skip(ColumnSegment &segment, idx_t skip_count) { - auto data = handle.Ptr() + segment.GetBlockOffset(); - auto index_pointer = reinterpret_cast(data + rle_count_offset); - SkipInternal(index_pointer, skip_count); + SkipInternal(skip_count); } inline void ForwardToNextRun() { // handled all entries in this RLE value // move to the next entry entry_pos++; + if (entry_pos > max_entry_pos) { + throw IOException( + "Corrupted RLE segment: index_pointer[entry_pos] would reach outside of the blocks memory"); + } position_in_entry = 0; } - inline bool ExhaustedRun(rle_count_t *index_pointer) { + inline bool ExhaustedRun() { return position_in_entry >= index_pointer[entry_pos]; } BufferHandle handle; idx_t entry_pos; idx_t position_in_entry; - uint32_t rle_count_offset; + const uint32_t rle_count_offset; //! If we are running a filter over the column - the runs that match the filter unsafe_unique_array matching_runs; idx_t matching_run_count = 0; + + const T *data_pointer; + const rle_count_t *index_pointer; + const idx_t max_entry_pos; }; template @@ -334,13 +354,12 @@ static bool CanEmitConstantVector(idx_t position, idx_t run_length, idx_t scan_c } template -static void RLEScanConstant(RLEScanState &scan_state, rle_count_t *index_pointer, T *data_pointer, idx_t scan_count, - Vector &result) { +static void RLEScanConstant(RLEScanState &scan_state, idx_t scan_count, Vector &result) { result.SetVectorType(VectorType::CONSTANT_VECTOR); auto result_data = ConstantVector::GetData(result); - result_data[0] = data_pointer[scan_state.entry_pos]; + result_data[0] = scan_state.data_pointer[scan_state.entry_pos]; scan_state.position_in_entry += scan_count; - if (scan_state.ExhaustedRun(index_pointer)) { + if (scan_state.ExhaustedRun()) { scan_state.ForwardToNextRun(); } return; @@ -351,14 +370,10 @@ void RLEScanPartialInternal(ColumnSegment &segment, ColumnScanState &state, idx_ idx_t result_offset) { auto &scan_state = state.scan_state->Cast>(); - auto data = scan_state.handle.Ptr() + segment.GetBlockOffset(); - auto data_pointer = reinterpret_cast(data + RLEConstants::RLE_HEADER_SIZE); - auto index_pointer = reinterpret_cast(data + scan_state.rle_count_offset); - // If we are scanning an entire Vector and it contains only a single run - if (CanEmitConstantVector(scan_state.position_in_entry, index_pointer[scan_state.entry_pos], - scan_count)) { - RLEScanConstant(scan_state, index_pointer, data_pointer, scan_count, result); + if (CanEmitConstantVector(scan_state.position_in_entry, + scan_state.index_pointer[scan_state.entry_pos], scan_count)) { + RLEScanConstant(scan_state, scan_count, result); return; } @@ -367,10 +382,10 @@ void RLEScanPartialInternal(ColumnSegment &segment, ColumnScanState &state, idx_ idx_t result_end = result_offset + scan_count; while (result_offset < result_end) { - rle_count_t run_end = index_pointer[scan_state.entry_pos]; + rle_count_t run_end = scan_state.index_pointer[scan_state.entry_pos]; idx_t run_count = run_end - scan_state.position_in_entry; idx_t remaining_scan_count = result_end - result_offset; - T element = data_pointer[scan_state.entry_pos]; + T element = scan_state.data_pointer[scan_state.entry_pos]; if (DUCKDB_UNLIKELY(run_count > remaining_scan_count)) { for (idx_t i = 0; i < remaining_scan_count; i++) { result_data[result_offset + i] = element; @@ -407,13 +422,10 @@ void RLESelect(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun const SelectionVector &sel, idx_t sel_count) { auto &scan_state = state.scan_state->Cast>(); - auto data = scan_state.handle.Ptr() + segment.GetBlockOffset(); - auto data_pointer = reinterpret_cast(data + RLEConstants::RLE_HEADER_SIZE); - auto index_pointer = reinterpret_cast(data + scan_state.rle_count_offset); - // If we are scanning an entire Vector and it contains only a single run we don't need to select at all - if (CanEmitConstantVector(scan_state.position_in_entry, index_pointer[scan_state.entry_pos], vector_count)) { - RLEScanConstant(scan_state, index_pointer, data_pointer, vector_count, result); + if (CanEmitConstantVector(scan_state.position_in_entry, scan_state.index_pointer[scan_state.entry_pos], + vector_count)) { + RLEScanConstant(scan_state, vector_count, result); return; } @@ -427,14 +439,14 @@ void RLESelect(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun throw InternalException("Error in RLESelect - selection vector indices are not ordered"); } // skip forward to the next index - scan_state.SkipInternal(index_pointer, next_idx - prev_idx); + scan_state.SkipInternal(next_idx - prev_idx); // read the element - result_data[i] = data_pointer[scan_state.entry_pos]; + result_data[i] = scan_state.data_pointer[scan_state.entry_pos]; // move the next to the prev prev_idx = next_idx; } // skip the tail - scan_state.SkipInternal(index_pointer, vector_count - prev_idx); + scan_state.SkipInternal(vector_count - prev_idx); } //===--------------------------------------------------------------------===// @@ -445,9 +457,8 @@ void RLEFilter(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun idx_t &sel_count, const TableFilter &filter, TableFilterState &filter_state) { auto &scan_state = state.scan_state->Cast>(); - auto data = scan_state.handle.Ptr() + segment.GetBlockOffset(); - auto data_pointer = reinterpret_cast(data + RLEConstants::RLE_HEADER_SIZE); - auto index_pointer = reinterpret_cast(data + scan_state.rle_count_offset); + auto data_pointer = const_cast(scan_state.data_pointer); + auto index_pointer = const_cast(scan_state.index_pointer); auto total_run_count = (scan_state.rle_count_offset - RLEConstants::RLE_HEADER_SIZE) / sizeof(T); if (!scan_state.matching_runs) { @@ -528,7 +539,7 @@ void RLEFilter(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun throw InternalException("Error in RLEFilter - selection vector indices are not ordered"); } // skip forward to the next index - scan_state.SkipInternal(index_pointer, read_idx - prev_idx); + scan_state.SkipInternal(read_idx - prev_idx); prev_idx = read_idx; if (!scan_state.matching_runs[scan_state.entry_pos]) { // this run is filtered out - we don't need to scan it @@ -539,7 +550,7 @@ void RLEFilter(ColumnSegment &segment, ColumnScanState &state, idx_t vector_coun matching_sel.set_index(matching_count++, read_idx); } // skip the tail - scan_state.SkipInternal(index_pointer, vector_count - prev_idx); + scan_state.SkipInternal(vector_count - prev_idx); } // set up the filter result diff --git a/src/duckdb/third_party/fmt/include/fmt/format.h b/src/duckdb/third_party/fmt/include/fmt/format.h index 4c5163010..b5ab10c1e 100644 --- a/src/duckdb/third_party/fmt/include/fmt/format.h +++ b/src/duckdb/third_party/fmt/include/fmt/format.h @@ -321,24 +321,13 @@ inline typename Container::value_type* get_data(Container& c) { return c.data(); } -#ifdef _SECURE_SCL -// Make a checked iterator to avoid MSVC warnings. -template using checked_ptr = stdext::checked_array_iterator; -template checked_ptr make_checked(T* p, std::size_t size) { - return {p, size}; -} -#else -template using checked_ptr = T*; -template inline T* make_checked(T* p, std::size_t) { return p; } -#endif - template ::value)> -inline checked_ptr reserve( +inline typename Container::value_type* reserve( std::back_insert_iterator& it, std::size_t n) { Container& c = get_container(it); std::size_t size = c.size(); c.resize(size + n); - return make_checked(get_data(c) + size, n); + return get_data(c) + size; } template @@ -540,7 +529,7 @@ template void buffer::append(const U* begin, const U* end) { std::size_t new_size = size_ + to_unsigned(end - begin); reserve(new_size); - std::uninitialized_copy(begin, end, make_checked(ptr_, capacity_) + size_); + std::uninitialized_copy(begin, end, ptr_ + size_); size_ = new_size; } } // namespace internal @@ -642,7 +631,7 @@ class basic_memory_buffer : private Allocator, public internal::buffer { if (data == other.store_) { this->set(store_, capacity); std::uninitialized_copy(other.store_, other.store_ + size, - internal::make_checked(store_, capacity)); + store_); } else { this->set(data, capacity); // Set pointer to the inline array so that delete is not called @@ -689,7 +678,7 @@ void basic_memory_buffer::grow(std::size_t size) { T* new_data = std::allocator_traits::allocate(*this, new_capacity); // The following code doesn't throw, so the raw pointer above doesn't leak. std::uninitialized_copy(old_data, old_data + this->size(), - internal::make_checked(new_data, new_capacity)); + new_data); this->set(new_data, new_capacity); // deallocate must not throw according to the standard, but even if it does, // the buffer already uses the new storage and will deallocate it in @@ -1565,7 +1554,7 @@ template class basic_writer { } buffer -= s.size(); std::uninitialized_copy(s.data(), s.data() + s.size(), - make_checked(buffer, s.size())); + buffer); }); } }; diff --git a/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp b/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp index ea3403fca..006e56973 100644 --- a/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp +++ b/src/duckdb/ub_extension_icu_third_party_icu_i18n.cpp @@ -348,17 +348,17 @@ #include "extension/icu/third_party/icu/i18n/wintzimpl.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-cached-powers.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-bignum.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-fast-dtoa.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-string-to-double.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-double-to-string.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-cached-powers.cpp" #include "extension/icu/third_party/icu/i18n/double-conversion-bignum-dtoa.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-bignum.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-fast-dtoa.cpp" #include "extension/icu/third_party/icu/i18n/double-conversion-strtod.cpp" -#include "extension/icu/third_party/icu/i18n/double-conversion-string-to-double.cpp" +#include "extension/icu/third_party/icu/i18n/double-conversion-double-to-string.cpp"