Skip to content

Commit de72196

Browse files
committed
Merge remote-tracking branch 'upstream/main' into 2026_04_07_ws_bdw
# Conflicts: # db/blob/db_blob_direct_write_test.cc
2 parents 483e1a0 + b7395b3 commit de72196

92 files changed

Lines changed: 12128 additions & 990 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,5 @@ third-party/folly/
105105

106106
# Claude Code local settings
107107
.claude/settings.local.json
108+
109+
tools/__pycache__/

BUCK

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
111111
"db/version_util.cc",
112112
"db/wal_edit.cc",
113113
"db/wal_manager.cc",
114+
"db/wide/read_path_blob_resolver.cc",
114115
"db/wide/wide_column_serialization.cc",
115116
"db/wide/wide_columns.cc",
116117
"db/wide/wide_columns_helper.cc",
@@ -4951,6 +4952,12 @@ cpp_unittest_wrapper(name="db_merge_operator_test",
49514952
extra_compiler_flags=[])
49524953

49534954

4955+
cpp_unittest_wrapper(name="db_open_with_config_test",
4956+
srcs=["db/db_open_with_config_test.cc"],
4957+
deps=[":rocksdb_test_lib"],
4958+
extra_compiler_flags=[])
4959+
4960+
49544961
cpp_unittest_wrapper(name="db_options_test",
49554962
srcs=["db/db_options_test.cc"],
49564963
deps=[":rocksdb_test_lib"],
@@ -5041,6 +5048,12 @@ cpp_unittest_wrapper(name="db_wide_basic_test",
50415048
extra_compiler_flags=[])
50425049

50435050

5051+
cpp_unittest_wrapper(name="db_wide_blob_direct_write_test",
5052+
srcs=["db/wide/db_wide_blob_direct_write_test.cc"],
5053+
deps=[":rocksdb_test_lib"],
5054+
extra_compiler_flags=[])
5055+
5056+
50445057
cpp_unittest_wrapper(name="db_with_timestamp_basic_test",
50455058
srcs=["db/db_with_timestamp_basic_test.cc"],
50465059
deps=[":rocksdb_test_lib"],

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,7 @@ set(SOURCES
788788
db/version_util.cc
789789
db/wal_edit.cc
790790
db/wal_manager.cc
791+
db/wide/read_path_blob_resolver.cc
791792
db/wide/wide_column_serialization.cc
792793
db/wide/wide_columns.cc
793794
db/wide/wide_columns_helper.cc
@@ -1427,6 +1428,7 @@ if(WITH_TESTS)
14271428
db/db_memtable_test.cc
14281429
db/db_merge_operator_test.cc
14291430
db/db_merge_operand_test.cc
1431+
db/db_open_with_config_test.cc
14301432
db/db_options_test.cc
14311433
db/db_properties_test.cc
14321434
db/db_range_del_test.cc
@@ -1479,6 +1481,7 @@ if(WITH_TESTS)
14791481
db/wal_manager_test.cc
14801482
db/wal_edit_test.cc
14811483
db/wide/db_wide_basic_test.cc
1484+
db/wide/db_wide_blob_direct_write_test.cc
14821485
db/wide/wide_column_serialization_test.cc
14831486
db/wide/wide_columns_helper_test.cc
14841487
db/write_batch_test.cc

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,6 +1496,9 @@ db_readonly_with_timestamp_test: $(OBJ_DIR)/db/db_readonly_with_timestamp_test.o
14961496
db_wide_basic_test: $(OBJ_DIR)/db/wide/db_wide_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
14971497
$(AM_LINK)
14981498

1499+
db_wide_blob_direct_write_test: $(OBJ_DIR)/db/wide/db_wide_blob_direct_write_test.o $(TEST_LIBRARY) $(LIBRARY)
1500+
$(AM_LINK)
1501+
14991502
db_with_timestamp_basic_test: $(OBJ_DIR)/db/db_with_timestamp_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
15001503
$(AM_LINK)
15011504

@@ -1505,6 +1508,9 @@ db_with_timestamp_compaction_test: db/db_with_timestamp_compaction_test.o $(TEST
15051508
db_encryption_test: $(OBJ_DIR)/db/db_encryption_test.o $(TEST_LIBRARY) $(LIBRARY)
15061509
$(AM_LINK)
15071510

1511+
db_open_with_config_test: $(OBJ_DIR)/db/db_open_with_config_test.o $(TEST_LIBRARY) $(LIBRARY)
1512+
$(AM_LINK)
1513+
15081514
db_test: $(OBJ_DIR)/db/db_test.o $(TEST_LIBRARY) $(LIBRARY)
15091515
$(AM_LINK)
15101516

db/blob/blob_fetcher.cc

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
#include "db/blob/blob_fetcher.h"
77

8+
#include "db/blob/blob_file_partition_manager.h"
9+
#include "db/blob/blob_index.h"
810
#include "db/version_set.h"
911

1012
namespace ROCKSDB_NAMESPACE {
@@ -14,21 +16,30 @@ Status BlobFetcher::FetchBlob(const Slice& user_key,
1416
FilePrefetchBuffer* prefetch_buffer,
1517
PinnableSlice* blob_value,
1618
uint64_t* bytes_read) const {
17-
assert(version_);
18-
19-
return version_->GetBlob(read_options_, user_key, blob_index_slice,
20-
prefetch_buffer, blob_value, bytes_read);
19+
BlobIndex blob_index;
20+
Status status = blob_index.DecodeFrom(blob_index_slice);
21+
if (status.ok()) {
22+
status = FetchBlob(user_key, blob_index, prefetch_buffer, blob_value,
23+
bytes_read);
24+
}
25+
return status;
2126
}
2227

2328
Status BlobFetcher::FetchBlob(const Slice& user_key,
2429
const BlobIndex& blob_index,
2530
FilePrefetchBuffer* prefetch_buffer,
2631
PinnableSlice* blob_value,
2732
uint64_t* bytes_read) const {
28-
assert(version_);
33+
if (!allow_write_path_fallback_) {
34+
assert(version_);
35+
36+
return version_->GetBlob(read_options_, user_key, blob_index,
37+
prefetch_buffer, blob_value, bytes_read);
38+
}
2939

30-
return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer,
31-
blob_value, bytes_read);
40+
return BlobFilePartitionManager::ResolveBlobDirectWriteIndex(
41+
read_options_, user_key, blob_index, version_, blob_file_cache_,
42+
prefetch_buffer, blob_value, bytes_read);
3243
}
3344

3445
} // namespace ROCKSDB_NAMESPACE

db/blob/blob_fetcher.h

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,25 @@
1010

1111
namespace ROCKSDB_NAMESPACE {
1212

13+
class BlobFileCache;
1314
class Version;
1415
class Slice;
1516
class FilePrefetchBuffer;
1617
class PinnableSlice;
1718
class BlobIndex;
1819

19-
// A thin wrapper around the blob retrieval functionality of Version.
20+
// A thin wrapper around blob retrieval. By default it reads through Version,
21+
// and it can optionally fall back to direct-write blob files that are not yet
22+
// manifest-visible.
2023
class BlobFetcher {
2124
public:
22-
BlobFetcher(const Version* version, const ReadOptions& read_options)
23-
: version_(version), read_options_(read_options) {}
25+
BlobFetcher(const Version* version, const ReadOptions& read_options,
26+
BlobFileCache* blob_file_cache = nullptr,
27+
bool allow_write_path_fallback = false)
28+
: version_(version),
29+
read_options_(read_options),
30+
blob_file_cache_(blob_file_cache),
31+
allow_write_path_fallback_(allow_write_path_fallback) {}
2432

2533
Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice,
2634
FilePrefetchBuffer* prefetch_buffer,
@@ -33,5 +41,7 @@ class BlobFetcher {
3341
private:
3442
const Version* version_;
3543
ReadOptions read_options_;
44+
BlobFileCache* blob_file_cache_;
45+
bool allow_write_path_fallback_;
3646
};
3747
} // namespace ROCKSDB_NAMESPACE

db/blob/blob_file_partition_manager.cc

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ namespace {
3535

3636
class RoundRobinBlobFilePartitionStrategy : public BlobFilePartitionStrategy {
3737
public:
38+
using BlobFilePartitionStrategy::SelectPartition;
39+
3840
const char* Name() const override {
3941
return "RoundRobinBlobFilePartitionStrategy";
4042
}
@@ -408,11 +410,19 @@ Status BlobFilePartitionManager::MaybePrepopulateBlobCache(
408410
CacheTier::kVolatileTier);
409411
}
410412

413+
uint32_t BlobFilePartitionManager::SelectWideColumnPartition(
414+
uint32_t column_family_id, const Slice& key,
415+
const WideColumns& columns) const {
416+
return strategy_->SelectPartition(num_partitions_, column_family_id, key,
417+
columns) %
418+
num_partitions_;
419+
}
420+
411421
Status BlobFilePartitionManager::WriteBlob(
412422
const WriteOptions& write_options, uint32_t column_family_id,
413423
CompressionType compression, const Slice& key, const Slice& value,
414424
uint64_t* blob_file_number, uint64_t* blob_offset, uint64_t* blob_size,
415-
const BlobDirectWriteSettings* settings) {
425+
const BlobDirectWriteSettings* settings, const uint32_t* partition_idx) {
416426
assert(blob_file_number != nullptr);
417427
assert(blob_offset != nullptr);
418428
assert(blob_size != nullptr);
@@ -443,14 +453,16 @@ Status BlobFilePartitionManager::WriteBlob(
443453
// uncompressed value rather than `write_value`. The modulo here is
444454
// intentional so custom strategies can return arbitrary hashed or sentinel
445455
// values without violating the partition bounds.
446-
const uint32_t partition_idx =
447-
strategy_->SelectPartition(num_partitions_, column_family_id, key,
448-
value) %
449-
num_partitions_;
456+
const uint32_t selected_partition_idx =
457+
partition_idx != nullptr
458+
? (*partition_idx % num_partitions_)
459+
: (strategy_->SelectPartition(num_partitions_, column_family_id, key,
460+
value) %
461+
num_partitions_);
450462

451463
{
452464
MutexLock lock(&mutex_);
453-
Partition* partition = partitions_[partition_idx].get();
465+
Partition* partition = partitions_[selected_partition_idx].get();
454466

455467
auto seal_current_file = [&]() -> Status {
456468
if (!partition->writer) {
@@ -481,7 +493,7 @@ Status BlobFilePartitionManager::WriteBlob(
481493

482494
if (!partition->writer) {
483495
Status s = OpenNewBlobFile(partition, column_family_id, compression,
484-
partition_idx);
496+
selected_partition_idx);
485497
if (!s.ok()) {
486498
return s;
487499
}
@@ -681,6 +693,14 @@ void BlobFilePartitionManager::GetProtectedBlobFileNumbers(
681693
}
682694
}
683695

696+
bool BlobFilePartitionManager::IsTrackedBlobFileNumber(
697+
uint64_t file_number) const {
698+
ReadLock lock(&file_partition_mutex_);
699+
return file_to_partition_.find(file_number) != file_to_partition_.end() ||
700+
protected_blob_file_refs_.find(file_number) !=
701+
protected_blob_file_refs_.end();
702+
}
703+
684704
void BlobFilePartitionManager::ProtectSealedBlobFileNumbers(
685705
const std::vector<uint64_t>& file_numbers) {
686706
if (file_numbers.empty()) {
@@ -747,12 +767,10 @@ void BlobFilePartitionManager::RemoveFilePartitionMappings(
747767
Status BlobFilePartitionManager::ResolveBlobDirectWriteIndex(
748768
const ReadOptions& read_options, const Slice& user_key,
749769
const BlobIndex& blob_idx, const Version* version,
750-
BlobFileCache* blob_file_cache, PinnableSlice* blob_value) {
770+
BlobFileCache* blob_file_cache, FilePrefetchBuffer* prefetch_buffer,
771+
PinnableSlice* blob_value, uint64_t* bytes_read) {
751772
assert(blob_value != nullptr);
752773

753-
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
754-
constexpr uint64_t* bytes_read = nullptr;
755-
756774
if (version != nullptr) {
757775
// Only fall back when the blob file is still owned exclusively by the
758776
// write path and therefore absent from Version metadata. Once Version

db/blob/blob_file_partition_manager.h

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ class BlobFileCache;
3131
class BlobFileCompletionCallback;
3232
class BlobIndex;
3333
class BlobLogWriter;
34+
class FilePrefetchBuffer;
3435
class IOTracer;
3536
class Logger;
3637
class PinnableSlice;
@@ -76,7 +77,14 @@ class BlobFilePartitionManager {
7677
CompressionType compression, const Slice& key,
7778
const Slice& value, uint64_t* blob_file_number,
7879
uint64_t* blob_offset, uint64_t* blob_size,
79-
const BlobDirectWriteSettings* settings = nullptr);
80+
const BlobDirectWriteSettings* settings = nullptr,
81+
const uint32_t* partition_idx = nullptr);
82+
83+
// Selects the partition to use for all blob-backed columns of one PutEntity
84+
// operation. The return value is already normalized to [0, num_partitions_).
85+
uint32_t SelectWideColumnPartition(uint32_t column_family_id,
86+
const Slice& key,
87+
const WideColumns& columns) const;
8088

8189
// Move the current active partition files into the next immutable
8290
// memtable-generation batch. Called from SwitchMemtable() while DB mutex is
@@ -117,6 +125,10 @@ class BlobFilePartitionManager {
117125
// or old SuperVersions and therefore must not be purged yet.
118126
void GetProtectedBlobFileNumbers(UnorderedSet<uint64_t>* file_numbers) const;
119127

128+
// Returns true when the blob file is still owned by the write path or
129+
// protected by a live memtable / old SuperVersion.
130+
bool IsTrackedBlobFileNumber(uint64_t file_number) const;
131+
120132
// Increments / decrements memtable-held protection on sealed blob files.
121133
void ProtectSealedBlobFileNumbers(const std::vector<uint64_t>& file_numbers);
122134
void UnprotectSealedBlobFileNumbers(
@@ -136,12 +148,11 @@ class BlobFilePartitionManager {
136148
// blob file is still write-path-owned and therefore not yet tracked by
137149
// Version. Existing manifest-visible read results, including I/O failures,
138150
// are returned directly rather than masked by fallback logic.
139-
static Status ResolveBlobDirectWriteIndex(const ReadOptions& read_options,
140-
const Slice& user_key,
141-
const BlobIndex& blob_idx,
142-
const Version* version,
143-
BlobFileCache* blob_file_cache,
144-
PinnableSlice* blob_value);
151+
static Status ResolveBlobDirectWriteIndex(
152+
const ReadOptions& read_options, const Slice& user_key,
153+
const BlobIndex& blob_idx, const Version* version,
154+
BlobFileCache* blob_file_cache, FilePrefetchBuffer* prefetch_buffer,
155+
PinnableSlice* blob_value, uint64_t* bytes_read);
145156

146157
private:
147158
struct Partition {

0 commit comments

Comments
 (0)