Skip to content

Commit 83a5a61

Browse files
authored
GH-48146: [C++][Parquet] Fix undefined behavior with invalid column/offset index (#48147)
### Rationale for this change This should fix the following issues found by OSS-Fuzz: * https://issues.oss-fuzz.com/issues/461058054 * https://issues.oss-fuzz.com/issues/461058060 Also fixes a signed integer overflow issue in `CappedMemoryPool`: * https://issues.oss-fuzz.com/issues/461314335 ### Are these changes tested? Yes, by existing tests and additional fuzz regression files. ### Are there any user-facing changes? No. **This PR contains a "Critical Fix".** (If the changes fix either (a) a security vulnerability, (b) a bug that caused incorrect or invalid data to be produced, or (c) a bug that causes a crash (even when the API contract is upheld), please provide explanation. If not, you can remove this.) * GitHub Issue: #48146 Authored-by: Antoine Pitrou <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent e4b5b11 commit 83a5a61

File tree

5 files changed

+79
-62
lines changed

5 files changed

+79
-62
lines changed

cpp/src/arrow/memory_pool.cc

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,45 @@ std::vector<std::string> SupportedMemoryBackendNames() {
857857
return supported;
858858
}
859859

860+
///////////////////////////////////////////////////////////////////////
861+
// CappedMemoryPool implementation
862+
863+
Status CappedMemoryPool::Allocate(int64_t size, int64_t alignment, uint8_t** out) {
864+
// XXX Another thread may allocate memory between the limit check and
865+
// the `Allocate` call. It is possible for the two allocations to be successful
866+
// while going above the limit.
867+
// Solving this issue would require refactoring the `MemoryPool` implementation
868+
// to delegate the limit check to `MemoryPoolStats`.
869+
const int64_t allocated = wrapped_->bytes_allocated();
870+
if (ARROW_PREDICT_FALSE(bytes_allocated_limit_ - allocated < size)) {
871+
return OutOfMemory(allocated, size);
872+
}
873+
return wrapped_->Allocate(size, alignment, out);
874+
}
875+
876+
Status CappedMemoryPool::Reallocate(int64_t old_size, int64_t new_size, int64_t alignment,
877+
uint8_t** ptr) {
878+
if (new_size > old_size) {
879+
const int64_t allocated = wrapped_->bytes_allocated();
880+
if (ARROW_PREDICT_FALSE(bytes_allocated_limit_ - allocated < new_size - old_size)) {
881+
return OutOfMemory(allocated, new_size - old_size);
882+
}
883+
}
884+
return wrapped_->Reallocate(old_size, new_size, alignment, ptr);
885+
}
886+
887+
void CappedMemoryPool::Free(uint8_t* buffer, int64_t size, int64_t alignment) {
888+
return wrapped_->Free(buffer, size, alignment);
889+
}
890+
891+
Status CappedMemoryPool::OutOfMemory(int64_t current_allocated, int64_t requested) const {
892+
return Status::OutOfMemory(
893+
"MemoryPool bytes_allocated cap exceeded: "
894+
"limit=",
895+
bytes_allocated_limit_, ", current allocation=", current_allocated,
896+
", requested=", requested);
897+
}
898+
860899
// -----------------------------------------------------------------------
861900
// Pool buffer and allocation
862901

cpp/src/arrow/memory_pool.h

Lines changed: 4 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -259,31 +259,10 @@ class ARROW_EXPORT CappedMemoryPool : public MemoryPool {
259259
using MemoryPool::Allocate;
260260
using MemoryPool::Reallocate;
261261

262-
Status Allocate(int64_t size, int64_t alignment, uint8_t** out) override {
263-
// XXX Another thread may allocate memory between the limit check and
264-
// the `Allocate` call. It is possible for the two allocations to be successful
265-
// while going above the limit.
266-
// Solving this issue would require refactoring the `MemoryPool` implementation
267-
// to delegate the limit check to `MemoryPoolStats`.
268-
const auto attempted = size + wrapped_->bytes_allocated();
269-
if (ARROW_PREDICT_FALSE(attempted > bytes_allocated_limit_)) {
270-
return OutOfMemory(attempted);
271-
}
272-
return wrapped_->Allocate(size, alignment, out);
273-
}
274-
262+
Status Allocate(int64_t size, int64_t alignment, uint8_t** out) override;
275263
Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment,
276-
uint8_t** ptr) override {
277-
const auto attempted = new_size - old_size + wrapped_->bytes_allocated();
278-
if (ARROW_PREDICT_FALSE(attempted > bytes_allocated_limit_)) {
279-
return OutOfMemory(attempted);
280-
}
281-
return wrapped_->Reallocate(old_size, new_size, alignment, ptr);
282-
}
283-
284-
void Free(uint8_t* buffer, int64_t size, int64_t alignment) override {
285-
return wrapped_->Free(buffer, size, alignment);
286-
}
264+
uint8_t** ptr) override;
265+
void Free(uint8_t* buffer, int64_t size, int64_t alignment) override;
287266

288267
void ReleaseUnused() override { wrapped_->ReleaseUnused(); }
289268

@@ -302,12 +281,7 @@ class ARROW_EXPORT CappedMemoryPool : public MemoryPool {
302281
std::string backend_name() const override { return wrapped_->backend_name(); }
303282

304283
private:
305-
Status OutOfMemory(int64_t value) {
306-
return Status::OutOfMemory(
307-
"MemoryPool bytes_allocated cap exceeded: "
308-
"limit=",
309-
bytes_allocated_limit_, ", attempted=", value);
310-
}
284+
Status OutOfMemory(int64_t current_allocated, int64_t requested) const;
311285

312286
MemoryPool* wrapped_;
313287
const int64_t bytes_allocated_limit_;

cpp/src/parquet/page_index.cc

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -244,9 +244,9 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader {
244244
row_group_ordinal_);
245245

246246
if (column_index_buffer_ == nullptr) {
247-
PARQUET_ASSIGN_OR_THROW(column_index_buffer_,
248-
input_->ReadAt(index_read_range_.column_index->offset,
249-
index_read_range_.column_index->length));
247+
column_index_buffer_ =
248+
ReadIndexBuffer(index_read_range_.column_index->offset,
249+
index_read_range_.column_index->length, "ColumnIndex");
250250
}
251251

252252
int64_t buffer_offset =
@@ -285,9 +285,9 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader {
285285
row_group_ordinal_);
286286

287287
if (offset_index_buffer_ == nullptr) {
288-
PARQUET_ASSIGN_OR_THROW(offset_index_buffer_,
289-
input_->ReadAt(index_read_range_.offset_index->offset,
290-
index_read_range_.offset_index->length));
288+
offset_index_buffer_ =
289+
ReadIndexBuffer(index_read_range_.offset_index->offset,
290+
index_read_range_.offset_index->length, "OffsetIndex");
291291
}
292292

293293
int64_t buffer_offset =
@@ -346,7 +346,16 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader {
346346
}
347347
}
348348

349-
private:
349+
std::shared_ptr<Buffer> ReadIndexBuffer(int64_t offset, int64_t length,
350+
const char* offset_kind) {
351+
PARQUET_ASSIGN_OR_THROW(auto buffer, input_->ReadAt(offset, length));
352+
if (buffer->size() < length) {
353+
throw ParquetException("Invalid or truncated ", offset_kind, ": attempted to read ",
354+
length, " bytes but got only ", buffer->size(), " bytes");
355+
}
356+
return buffer;
357+
}
358+
350359
/// The input stream that can perform random access read.
351360
::arrow::io::RandomAccessFile* input_;
352361

@@ -962,6 +971,11 @@ std::unique_ptr<ColumnIndex> ColumnIndex::Make(const ColumnDescriptor& descr,
962971
ThriftDeserializer deserializer(properties);
963972
deserializer.DeserializeMessage(reinterpret_cast<const uint8_t*>(serialized_index),
964973
&index_len, &column_index, decryptor);
974+
if (ARROW_PREDICT_FALSE(LoadEnumSafe(&column_index.boundary_order) ==
975+
BoundaryOrder::UNDEFINED)) {
976+
// Guard against UB when moving column_index
977+
throw ParquetException("Invalid ColumnIndex boundary_order");
978+
}
965979
switch (descr.physical_type()) {
966980
case Type::BOOLEAN:
967981
return std::make_unique<TypedColumnIndexImpl<BooleanType>>(descr,

cpp/src/parquet/thrift_internal.h

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -186,32 +186,22 @@ struct SafeLoader {
186186
return static_cast<ApiTypeRawEnum>(LoadEnumRaw(in));
187187
}
188188

189-
template <typename ThriftType, bool IsUnsigned = true>
190-
inline static ApiTypeEnum LoadChecked(
191-
const typename std::enable_if<IsUnsigned, ThriftType>::type* in) {
192-
auto raw_value = LoadRaw(in);
193-
if (ARROW_PREDICT_FALSE(raw_value >=
194-
static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
195-
return ApiType::UNDEFINED;
196-
}
197-
return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
198-
}
199-
200-
template <typename ThriftType, bool IsUnsigned = false>
201-
inline static ApiTypeEnum LoadChecked(
202-
const typename std::enable_if<!IsUnsigned, ThriftType>::type* in) {
203-
auto raw_value = LoadRaw(in);
204-
if (ARROW_PREDICT_FALSE(raw_value >=
205-
static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
206-
raw_value < 0)) {
207-
return ApiType::UNDEFINED;
208-
}
209-
return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
210-
}
211-
212189
template <typename ThriftType>
213190
inline static ApiTypeEnum Load(const ThriftType* in) {
214-
return LoadChecked<ThriftType, std::is_unsigned<ApiTypeRawEnum>::value>(in);
191+
const auto raw_value = LoadRaw(in);
192+
if constexpr (std::is_unsigned_v<ApiTypeRawEnum>) {
193+
if (ARROW_PREDICT_FALSE(raw_value >=
194+
static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
195+
return ApiType::UNDEFINED;
196+
}
197+
} else {
198+
if (ARROW_PREDICT_FALSE(raw_value >=
199+
static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
200+
raw_value < 0)) {
201+
return ApiType::UNDEFINED;
202+
}
203+
}
204+
return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
215205
}
216206
};
217207

0 commit comments

Comments
 (0)