Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions port/lang.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ constexpr bool kMustFreeHeapAllocations = false;
#define TSAN_SUPPRESSION
#endif // TSAN_SUPPRESSION

// Fail in debug build with a useful message, for automatically grouping
// related failures. Expands to an always-false assert, so it compiles to
// nothing when NDEBUG is defined.
//
// `msg` is parenthesized so that an argument containing an operator with
// lower precedence than `&&` (e.g. `cond ? "a" : "b"`) cannot re-associate
// with `false &&` and accidentally turn the assertion into one that passes.
#define DEBUG_FAIL(msg) assert(false && (msg))

// Compile-time CPU feature testing compatibility
//
// A way to be extra sure these defines have been included.
Expand Down
117 changes: 55 additions & 62 deletions table/block_based/block.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1292,39 +1292,12 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
}
}

uint32_t Block::NumRestarts() const {
assert(size() >= 2 * sizeof(uint32_t));
uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
uint32_t num_restarts = block_footer;
if (size() > kMaxBlockSizeSupportedByHashIndex) {
// In BlockBuilder, we have ensured a block with HashIndex is less than
// kMaxBlockSizeSupportedByHashIndex (64KiB).
//
// Therefore, if we encounter a block with a size > 64KiB, the block
// cannot have HashIndex. So the footer will directly interpreted as
// num_restarts.
//
// Such check is for backward compatibility. We can ensure legacy block
// with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
// correctly as no HashIndex even if the MSB of num_restarts is set.
return num_restarts;
}
BlockBasedTableOptions::DataBlockIndexType index_type;
UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
return num_restarts;
}

// Returns the index type (binary search only, or binary search plus hash
// index) encoded in this block's footer.
//
// NOTE(review): as shown, the body holds two back-to-back implementations:
// one built on UnPackIndexTypeAndNumRestarts() and one built on
// DataBlockFooter::DecodeFrom(). Everything after the first
// `return index_type;` is unreachable. This looks like the removed and added
// sides of a diff rendered without +/- markers -- confirm which variant is
// current before relying on either.
BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
assert(size() >= 2 * sizeof(uint32_t));
if (size() > kMaxBlockSizeSupportedByHashIndex) {
// The check is for the same reason as that in NumRestarts(): a block this
// large is treated as having no hash index, so the footer is not unpacked.
return BlockBasedTableOptions::kDataBlockBinarySearch;
}
// The footer is the last fixed32 of the block.
uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
uint32_t num_restarts = block_footer;
BlockBasedTableOptions::DataBlockIndexType index_type;
// Only index_type is used here; num_restarts is unpacked and discarded.
UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
return index_type;
// --- second variant (unreachable as written, see NOTE above) ---
assert(size() >= DataBlockFooter::kMinEncodedLength);
Slice input(data(), size());
DataBlockFooter footer;
// The decode status is deliberately ignored; on failure `footer` keeps
// whatever index_type it held when DecodeFrom() returned.
footer.DecodeFrom(&input).PermitUncheckedError();
return footer.index_type;
}

Block::~Block() {
Expand All @@ -1334,51 +1307,71 @@ Block::~Block() {
delete[] kv_checksum_;
}

// Builds the Corruption status reported by iterators created on a block that
// is unusable.
//
// The constructor marks a block whose footer failed to decode by setting
// size() to 0 and stashing the original data size in restart_offset_, so the
// footer can be decoded again here to recover the specific error. When
// size() is 0 and restart_offset_ is also 0, no further detail is available.
//
// The iterator factories call this whenever size() < 2 * sizeof(uint32_t).
// That also covers a non-empty block whose footer decoded successfully but
// which is simply too short. In that state restart_offset_ is a genuine
// restart-array offset rather than a saved size, so it must not be used to
// re-decode the footer; the generic message is returned instead of asserting
// on input that is merely corrupt.
Status Block::GetCorruptionStatus() const {
  if (size() != 0 || restart_offset_ == 0) {
    return Status::Corruption("bad block contents");
  }
  // Error-marker state: restart_offset_ holds the original data size.
  Slice input(contents_.data.data(), restart_offset_);
  DataBlockFooter footer;
  Status s = footer.DecodeFrom(&input);
  if (!s.ok()) {
    return s;  // Return the detailed error from DecodeFrom
  }
  // The footer decoded cleanly even though the constructor recorded a footer
  // decode failure; this is not expected to happen.
  DEBUG_FAIL("ok status on presumed bad block contents");
  return Status::Corruption("presumed bad block contents");
}

Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
Statistics* statistics)
: contents_(std::move(contents)), restart_offset_(0), num_restarts_(0) {
TEST_SYNC_POINT("Block::Block:0");
auto& size = contents_.data.size_;
if (size < sizeof(uint32_t)) {
// `contents` is assumed to be uncompressed in the proper format
Slice input(contents_.data.data(), size);
DataBlockFooter footer;
Status s = footer.DecodeFrom(&input);
if (!s.ok()) {
// Save original size for GetCorruptionStatus() to re-decode footer
restart_offset_ = static_cast<uint32_t>(size);
size = 0; // Error marker
} else {
// Should only decode restart points for uncompressed blocks
num_restarts_ = NumRestarts();
switch (IndexType()) {
// After DecodeFrom, input has the footer removed. Each case below
// may strip additional suffix (e.g., hash index) so that input ends
// with just the restart array.
num_restarts_ = footer.num_restarts;
switch (footer.index_type) {
case BlockBasedTableOptions::kDataBlockBinarySearch:
restart_offset_ = static_cast<uint32_t>(size) -
(1 + num_restarts_) * sizeof(uint32_t);
if (restart_offset_ > size - sizeof(uint32_t)) {
// The size is too small for NumRestarts() and therefore
// restart_offset_ wrapped around.
size = 0;
}
break;
case BlockBasedTableOptions::kDataBlockBinaryAndHash:
if (size < sizeof(uint32_t) /* block footer */ +
sizeof(uint16_t) /* NUM_BUCK */) {
if (input.size() < sizeof(uint16_t) /* NUM_BUCK */) {
size = 0;
break;
}

uint16_t map_offset;
data_block_hash_index_.Initialize(
contents_.data.data(),
/* chop off NUM_RESTARTS */
static_cast<uint16_t>(size - sizeof(uint32_t)), &map_offset);

restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);

if (restart_offset_ > map_offset) {
// map_offset is too small for NumRestarts() and
// therefore restart_offset_ wrapped around.
size = 0;
break;
}
data_block_hash_index_.Initialize(contents_.data.data(),
static_cast<uint16_t>(input.size()),
&map_offset);
// Strip the hash index, leaving just data + restarts
input.remove_suffix(input.size() - map_offset);
break;
default:
size = 0; // Error marker
}
// After the switch, input should end with restarts[num_restarts_]
if (size != 0) {
if (input.size() < num_restarts_ * sizeof(uint32_t)) {
size = 0; // Block too small for the declared number of restarts
} else {
restart_offset_ = static_cast<uint32_t>(input.size()) -
num_restarts_ * sizeof(uint32_t);
}
}
}
if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
read_amp_bitmap_.reset(new BlockReadAmpBitmap(
Expand Down Expand Up @@ -1515,7 +1508,7 @@ void Block::InitializeMetaIndexBlockProtectionInfo(
MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
MetaBlockIter* iter = new MetaBlockIter();
if (size() < 2 * sizeof(uint32_t)) {
iter->Invalidate(Status::Corruption("bad block contents"));
iter->Invalidate(GetCorruptionStatus());
return iter;
} else if (num_restarts_ == 0) {
// Empty block.
Expand All @@ -1540,7 +1533,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
ret_iter = new DataBlockIter;
}
if (size() < 2 * sizeof(uint32_t)) {
ret_iter->Invalidate(Status::Corruption("bad block contents"));
ret_iter->Invalidate(GetCorruptionStatus());
return ret_iter;
}
if (num_restarts_ == 0) {
Expand Down Expand Up @@ -1579,7 +1572,7 @@ IndexBlockIter* Block::NewIndexIterator(
ret_iter = new IndexBlockIter;
}
if (size() < 2 * sizeof(uint32_t)) {
ret_iter->Invalidate(Status::Corruption("bad block contents"));
ret_iter->Invalidate(GetCorruptionStatus());
return ret_iter;
}
if (num_restarts_ == 0) {
Expand Down
11 changes: 9 additions & 2 deletions table/block_based/block.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ class Block {
const char* data() const { return contents_.data.data(); }
// The additional memory space taken by the block data.
size_t usable_size() const { return contents_.usable_size(); }
uint32_t NumRestarts() const;
uint32_t NumRestarts() const { return num_restarts_; }
bool own_bytes() const { return contents_.own_bytes(); }

BlockBasedTableOptions::DataBlockIndexType IndexType() const;
Expand Down Expand Up @@ -282,8 +282,15 @@ class Block {
const char* TEST_GetKVChecksum() const { return kv_checksum_; }

private:
// Returns a detailed error status by re-processing the footer.
// Should only be called when size() == 0 (error marker).
Status GetCorruptionStatus() const;

BlockContents contents_;
uint32_t restart_offset_; // Offset in data_ of restart array
// Normal state: offset in data_ of restart array.
// Error state (size()==0): original data size if footer decode failed,
// otherwise 0. Used by GetCorruptionStatus() to re-decode footer.
uint32_t restart_offset_;
uint32_t num_restarts_;
std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
char* kv_checksum_{nullptr};
Expand Down
13 changes: 5 additions & 8 deletions table/block_based/block_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -133,19 +133,16 @@ Slice BlockBuilder::Finish() {
PutFixed32(&buffer_, restarts_[i]);
}

uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
BlockBasedTableOptions::DataBlockIndexType index_type =
BlockBasedTableOptions::kDataBlockBinarySearch;
DataBlockFooter footer;
footer.num_restarts = static_cast<uint32_t>(restarts_.size());
footer.index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
if (data_block_hash_index_builder_.Valid() &&
CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
data_block_hash_index_builder_.Finish(buffer_);
index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
footer.index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
}

// footer is a packed format of data_block_index_type and num_restarts
uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);

PutFixed32(&buffer_, block_footer);
footer.EncodeTo(&buffer_);
finished_ = true;
return Slice(buffer_);
}
Expand Down
68 changes: 36 additions & 32 deletions table/block_based/data_block_footer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,51 +9,55 @@

#include "table/block_based/data_block_footer.h"

#include "rocksdb/table.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

const int kDataBlockIndexTypeBitShift = 31;
// Hash index bit (bit 31)
constexpr uint32_t kHashIndexBit = 1u << 31;

// 0x7FFFFFFF
const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u;
void DataBlockFooter::EncodeTo(std::string* dst) const {
assert(num_restarts <= kMaxNumRestarts);

// 0x7FFFFFFF
const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u;

uint32_t PackIndexTypeAndNumRestarts(
BlockBasedTableOptions::DataBlockIndexType index_type,
uint32_t num_restarts) {
if (num_restarts > kMaxNumRestarts) {
assert(0); // mute travis "unused" warning
}

uint32_t block_footer = num_restarts;
uint32_t packed = num_restarts;
if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) {
block_footer |= 1u << kDataBlockIndexTypeBitShift;
} else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) {
assert(0);
packed |= kHashIndexBit;
} else {
assert(index_type == BlockBasedTableOptions::kDataBlockBinarySearch);
}

return block_footer;
PutFixed32(dst, packed);
}

void UnPackIndexTypeAndNumRestarts(
uint32_t block_footer,
BlockBasedTableOptions::DataBlockIndexType* index_type,
uint32_t* num_restarts) {
if (index_type) {
if (block_footer & 1u << kDataBlockIndexTypeBitShift) {
*index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
} else {
*index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
}
Status DataBlockFooter::DecodeFrom(Slice* input) {
if (input->size() < kMinEncodedLength) {
return Status::Corruption("Block too small for footer");
}

if (num_restarts) {
*num_restarts = block_footer & kNumRestartsMask;
assert(*num_restarts <= kMaxNumRestarts);
// Decode from the end of the input
const char* footer_ptr = input->data() + input->size() - kMinEncodedLength;
uint32_t packed = DecodeFixed32(footer_ptr);

if (packed & kHashIndexBit) {
index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
packed &= ~kHashIndexBit;
} else {
index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
}

// Check for reserved/unrecognized feature bits (anything beyond
// kMaxNumRestarts)
if (packed > kMaxNumRestarts) {
return Status::Corruption(
"Unrecognized feature in block footer (reserved bits set)");
}

num_restarts = packed;

// Remove the footer from the input slice
input->remove_suffix(kMinEncodedLength);

return Status::OK();
}

} // namespace ROCKSDB_NAMESPACE
60 changes: 53 additions & 7 deletions table/block_based/data_block_footer.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,63 @@

#pragma once

#include <cstdint>
#include <string>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"

namespace ROCKSDB_NAMESPACE {

uint32_t PackIndexTypeAndNumRestarts(
BlockBasedTableOptions::DataBlockIndexType index_type,
uint32_t num_restarts);
// DataBlockFooter represents the footer of a data block, containing metadata
// about the block's structure and features.
//
// Current encoding (may expand in future format versions):
// - A single uint32_t where:
// - The low 28 bits store the number of restart points (num_restarts)
// - The high 4 bits are reserved for metadata/features:
// - Bit 31: Hash index present (kDataBlockBinaryAndHash)
// - Bits 28-30: Reserved for future features
//
// When any unrecognized reserved bit is set, DecodeFrom() returns an error,
// allowing older versions to fail gracefully on newer formats.
//
// The encoding size is not fixed - future format versions may expand it.
// Use kMaxEncodedLength for buffer sizing.
// In-memory form of a data block footer: the index type and the number of
// restart points, with helpers to encode to / decode from the on-disk form.
struct DataBlockFooter {
// Maximum number of restarts that can be stored (2^28 - 1 = 268,435,455).
// This reserves the top 4 bits for metadata (bit 31 for hash index, bits
// 28-30 for future features). For historical compatibility purposes, the
// limit is adequate because a 4GiB block (maximum due to 32-bit block size)
// with restart_interval=1 and minimum entries (12 bytes: 3 varint bytes +
// 9-byte internal key + empty value) plus 4-byte restart offsets = 16 bytes
// per restart, fits at most (2^32 - 4) / 16 ≈ 268 million restarts.
static constexpr uint32_t kMaxNumRestarts = (1u << 28) - 1;

// Maximum encoded length of a DataBlockFooter (for buffer sizing)
// Currently 4 bytes, but may grow in future format versions.
static constexpr uint32_t kMaxEncodedLength = sizeof(uint32_t);

// Minimum encoded length (for current format version). DecodeFrom() rejects
// any input shorter than this.
static constexpr uint32_t kMinEncodedLength = sizeof(uint32_t);

// Index structure available in the block; defaults to binary search only.
BlockBasedTableOptions::DataBlockIndexType index_type =
BlockBasedTableOptions::kDataBlockBinarySearch;
// Number of restart points in the block's restart array.
uint32_t num_restarts = 0;

DataBlockFooter() = default;
DataBlockFooter(BlockBasedTableOptions::DataBlockIndexType _index_type,
uint32_t _num_restarts)
: index_type(_index_type), num_restarts(_num_restarts) {}

// Appends the encoded footer to dst as a single fixed32: num_restarts, with
// bit 31 set when index_type is kDataBlockBinaryAndHash.
// Asserts that num_restarts <= kMaxNumRestarts.
void EncodeTo(std::string* dst) const;

// NOTE(review): the only visible definition of this function is a free
// function, not a DataBlockFooter member; this declaration looks like a
// leftover of the pre-refactor API -- confirm whether it belongs here.
void UnPackIndexTypeAndNumRestarts(
uint32_t block_footer,
BlockBasedTableOptions::DataBlockIndexType* index_type,
uint32_t* num_restarts);
// Decodes a footer from the end of input.
// On success, sets index_type and num_restarts and removes the consumed
// footer bytes from the end of input.
// Returns Corruption if input is shorter than kMinEncodedLength or if
// reserved/unrecognized feature bits are set; input is left untouched in
// both error cases.
Status DecodeFrom(Slice* input);
};

} // namespace ROCKSDB_NAMESPACE
Loading
Loading