Commit 7a63c355 authored by Yi Wu

Copy Titan from the latest pingcap/rocksdb tikv-3.0 branch

Signed-off-by: Yi Wu <yiwu@pingcap.com>
parent 77146969
#pragma once
#include "rocksdb/utilities/stackable_db.h"
#include "utilities/titandb/options.h"
namespace rocksdb {
namespace titandb {
// Describes one column family to open or create in TitanDB:
// the family's name plus its Titan-specific options.
struct TitanCFDescriptor {
  std::string name;
  TitanCFOptions options;
  // Default-constructs a descriptor for the default column family
  // with default Titan options (delegates to the two-arg ctor).
  TitanCFDescriptor()
      : TitanCFDescriptor(kDefaultColumnFamilyName, TitanCFOptions()) {}
  TitanCFDescriptor(const std::string& _name, const TitanCFOptions& _options)
      : name(_name), options(_options) {}
};
// TitanDB layers blob-file value separation on top of RocksDB. It extends
// StackableDB with Titan-specific open and column-family APIs; operations
// Titan cannot support (Merge, SingleDelete) are overridden to fail.
class TitanDB : public StackableDB {
public:
// Opens a TitanDB with only the default column family.
static Status Open(const TitanOptions& options, const std::string& dbname,
TitanDB** db);
// Opens a TitanDB with the given column family descriptors. On success
// "*handles" receives one handle per descriptor, in the same order.
static Status Open(const TitanDBOptions& db_options,
const std::string& dbname,
const std::vector<TitanCFDescriptor>& descs,
std::vector<ColumnFamilyHandle*>* handles, TitanDB** db);
// The wrapped DB is supplied later by the implementation, hence nullptr.
TitanDB() : StackableDB(nullptr) {}
using StackableDB::CreateColumnFamily;
// Base-options overload: wraps the options in a TitanCFDescriptor and
// forwards to the Titan overload below.
Status CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& name,
ColumnFamilyHandle** handle) override {
TitanCFDescriptor desc(name, TitanCFOptions(options));
return CreateColumnFamily(desc, handle);
}
// Creates a single column family; implemented via the batch version.
Status CreateColumnFamily(const TitanCFDescriptor& desc,
ColumnFamilyHandle** handle) {
std::vector<ColumnFamilyHandle*> handles;
Status s = CreateColumnFamilies({desc}, &handles);
if (s.ok()) {
*handle = handles[0];
}
return s;
}
using StackableDB::CreateColumnFamilies;
// Batch creation: the same base options are applied to every name.
Status CreateColumnFamilies(
const ColumnFamilyOptions& options,
const std::vector<std::string>& names,
std::vector<ColumnFamilyHandle*>* handles) override {
std::vector<TitanCFDescriptor> descs;
for (auto& name : names) {
descs.emplace_back(name, TitanCFOptions(options));
}
return CreateColumnFamilies(descs, handles);
}
// Batch creation from base descriptors: each descriptor's options are
// converted to TitanCFOptions before delegating to the pure virtual.
Status CreateColumnFamilies(
const std::vector<ColumnFamilyDescriptor>& base_descs,
std::vector<ColumnFamilyHandle*>* handles) override {
std::vector<TitanCFDescriptor> descs;
for (auto& desc : base_descs) {
descs.emplace_back(desc.name, TitanCFOptions(desc.options));
}
return CreateColumnFamilies(descs, handles);
}
// The single creation entry point subclasses must implement; every
// CreateColumnFamily* variant above funnels into this.
virtual Status CreateColumnFamilies(
const std::vector<TitanCFDescriptor>& descs,
std::vector<ColumnFamilyHandle*>* handles) = 0;
// Single drop is implemented via the batch version below.
Status DropColumnFamily(ColumnFamilyHandle* handle) override {
return DropColumnFamilies({handle});
}
// The single drop entry point subclasses must implement.
virtual Status DropColumnFamilies(
const std::vector<ColumnFamilyHandle*>& handles) override = 0;
using StackableDB::Merge;
// Merge is not supported by Titan; always returns NotSupported.
Status Merge(const WriteOptions&, ColumnFamilyHandle*, const Slice& /*key*/,
const Slice& /*value*/) override {
return Status::NotSupported("TitanDB doesn't support this operation");
}
using rocksdb::StackableDB::SingleDelete;
// SingleDelete is not supported by Titan; always returns NotSupported.
virtual Status SingleDelete(const WriteOptions& /*wopts*/,
ColumnFamilyHandle* /*column_family*/,
const Slice& /*key*/) override {
return Status::NotSupported("Not supported operation in titan db.");
}
using rocksdb::StackableDB::CompactFiles;
// Subclasses must provide CompactFiles; the base implementation cannot
// track blob-file references across a manual compaction.
virtual Status CompactFiles(
const CompactionOptions& compact_options,
ColumnFamilyHandle* column_family,
const std::vector<std::string>& input_file_names, const int output_level,
const int output_path_id = -1,
std::vector<std::string>* const output_file_names = nullptr,
CompactionJobInfo* compaction_job_info = nullptr) override = 0;
};
} // namespace titandb
} // namespace rocksdb
/*
C bindings for TitanDB. May be useful as a stable ABI that can be
used by programs that keep rocksdb in a shared library, or for
a JNI api.
Does not support:
. getters for the option types
. custom comparators that implement key shortening
. capturing post-write-snapshot
. custom iter, db, env, cache implementations using just the C bindings
Some conventions:
(1) We expose just opaque struct pointers and functions to clients.
This allows us to change internal representations without having to
recompile clients.
(2) For simplicity, there is no equivalent to the Slice type. Instead,
the caller has to pass the pointer and length as separate
arguments.
(3) Errors are represented by a null-terminated C string. NULL
means no error. All operations that can raise an error are passed
a "char** errptr" as the last argument. One of the following must
be true on entry:
*errptr == NULL
*errptr points to a malloc()ed null-terminated error message
On success, a titandb routine leaves *errptr unchanged.
On failure, titandb frees the old value of *errptr and
sets *errptr to a malloc()ed error message.
(4) Bools have the type unsigned char (0 == false; rest == true)
(5) All of the pointer arguments must be non-NULL.
*/
#ifndef ROCKSDB_TITAN_C_H
#define ROCKSDB_TITAN_C_H
#pragma once
#ifdef _WIN32
#ifdef ROCKSDB_DLL
#ifdef ROCKSDB_LIBRARY_EXPORTS
#define ROCKSDB_LIBRARY_API __declspec(dllexport)
#else
#define ROCKSDB_LIBRARY_API __declspec(dllimport)
#endif
#else
#define ROCKSDB_LIBRARY_API
#endif
#else
#define ROCKSDB_LIBRARY_API
#endif
#ifdef __cplusplus
extern "C" {
#endif
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include "rocksdb/c.h"
/* Exported types */
// TitanDB
// Opaque handle for TitanOptions exposed through the C API.
typedef struct titandb_options_t titandb_options_t;
// Opens a TitanDB at "name"; returns it as a plain rocksdb_t handle so the
// existing rocksdb C API functions can be used on it.
extern ROCKSDB_LIBRARY_API rocksdb_t* titandb_open(
const titandb_options_t* options, const char* name, char** errptr);
// Allocates a default-initialized options object; free with
// titandb_options_destroy().
extern ROCKSDB_LIBRARY_API titandb_options_t* titandb_options_create();
extern ROCKSDB_LIBRARY_API void titandb_options_destroy(titandb_options_t*);
// Sets the underlying RocksDB options for the Titan instance.
extern ROCKSDB_LIBRARY_API void titandb_options_set_rocksdb(
titandb_options_t* options, rocksdb_options_t* rocksdb);
// Directory where Titan stores its blob files.
extern ROCKSDB_LIBRARY_API void titandb_options_set_dirname(
titandb_options_t* options, const char* name);
// Values at least this many bytes are stored in blob files.
extern ROCKSDB_LIBRARY_API void titandb_options_set_min_blob_size(
titandb_options_t* options, uint64_t size);
// Compression type for blob files (a rocksdb compression enum value).
extern ROCKSDB_LIBRARY_API void titandb_options_set_blob_file_compression(
titandb_options_t* options, int compression);
// Cache used for blob records read from blob files.
extern ROCKSDB_LIBRARY_API void titandb_options_set_blob_cache(
titandb_options_t* options, rocksdb_cache_t* blob_cache);
// Disables background garbage collection of blob files when nonzero.
extern ROCKSDB_LIBRARY_API void titandb_options_set_disable_background_gc(
titandb_options_t* options, unsigned char disable);
// Upper bound of bytes processed by one GC round.
extern ROCKSDB_LIBRARY_API void titandb_options_set_max_gc_batch_size(
titandb_options_t* options, uint64_t size);
// Lower bound of bytes needed before a GC round starts.
extern ROCKSDB_LIBRARY_API void titandb_options_set_min_gc_batch_size(
titandb_options_t* options, uint64_t size);
// Discardable-data ratio that makes a blob file a GC candidate.
extern ROCKSDB_LIBRARY_API void titandb_options_set_blob_file_discardable_ratio(
titandb_options_t* options, float ratio);
// Fraction of a candidate file sampled when estimating garbage.
extern ROCKSDB_LIBRARY_API void titandb_options_set_sample_file_size_ratio(
titandb_options_t* options, float ratio);
// Files smaller than this are merged during GC rather than sampled.
extern ROCKSDB_LIBRARY_API void titandb_options_set_merge_small_file_threshold(
titandb_options_t* options, uint64_t size);
#ifdef __cplusplus
} /* end extern "C" */
#endif
#endif // ROCKSDB_TITAN_C_H
#include "base_db_listener.h"
namespace rocksdb {
namespace titandb {
// Holds a non-owning pointer to the TitanDBImpl that events are forwarded to.
BaseDbListener::BaseDbListener(TitanDBImpl* db) : db_impl_(db) {}
BaseDbListener::~BaseDbListener() {}
// Forwards flush notifications to TitanDBImpl so it can track new SST files.
void BaseDbListener::OnFlushCompleted(DB* /*db*/,
const FlushJobInfo& flush_job_info) {
db_impl_->OnFlushCompleted(flush_job_info);
}
// Forwards compaction notifications to TitanDBImpl.
void BaseDbListener::OnCompactionCompleted(
DB* /* db */, const CompactionJobInfo& compaction_job_info) {
db_impl_->OnCompactionCompleted(compaction_job_info);
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "rocksdb/listener.h"
#include "utilities/titandb/db_impl.h"
namespace rocksdb {
namespace titandb {
// EventListener installed on the base RocksDB instance; relays flush and
// compaction completion events to the owning TitanDBImpl.
class BaseDbListener final : public EventListener {
public:
BaseDbListener(TitanDBImpl* db);
~BaseDbListener();
void OnFlushCompleted(DB* db, const FlushJobInfo& flush_job_info) override;
void OnCompactionCompleted(
DB* db, const CompactionJobInfo& compaction_job_info) override;
private:
// Not owned; must outlive this listener.
rocksdb::titandb::TitanDBImpl* db_impl_;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_file_builder.h"
namespace rocksdb {
namespace titandb {
// Immediately writes the fixed blob-file header to "file"; any write error
// is remembered in status_ and surfaced by later calls.
BlobFileBuilder::BlobFileBuilder(const TitanCFOptions& options,
WritableFileWriter* file)
: options_(options), file_(file), encoder_(options_.blob_file_compression) {
BlobFileHeader header;
std::string buffer;
header.EncodeTo(&buffer);
status_ = file_->Append(buffer);
}
// Appends one record to the blob file. On return, *handle holds the
// offset and encoded size of the record inside the file. No-op if a
// previous operation already failed.
void BlobFileBuilder::Add(const BlobRecord& record, BlobHandle* handle) {
  if (!ok()) return;
  encoder_.EncodeRecord(record);
  // The record lands at the current end of the file.
  handle->offset = file_->GetFileSize();
  handle->size = encoder_.GetEncodedSize();
  // Per-record header first, then the (possibly compressed) payload.
  status_ = file_->Append(encoder_.GetHeader());
  if (!ok()) return;
  status_ = file_->Append(encoder_.GetRecord());
}
// Writes the blob-file footer and flushes the writer. Returns the first
// error encountered, either from earlier calls or from this one.
Status BlobFileBuilder::Finish() {
  if (!ok()) return status();
  BlobFileFooter footer;
  std::string encoded_footer;
  footer.EncodeTo(&encoded_footer);
  status_ = file_->Append(encoded_footer);
  if (!ok()) return status();
  status_ = file_->Flush();
  return status();
}
// Abandoning a partially built blob file requires no cleanup here; the
// caller owns the file and is responsible for deleting it.
void BlobFileBuilder::Abandon() {}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "util/file_reader_writer.h"
#include "utilities/titandb/blob_format.h"
#include "utilities/titandb/options.h"
namespace rocksdb {
namespace titandb {
// Blob file format:
//
// <begin>
// [blob record 1]
// [blob record 2]
// ...
// [blob record N]
// [meta block 1]
// [meta block 2]
// ...
// [meta block K]
// [meta index block]
// [footer]
// <end>
//
// 1. The sequence of blob records in the file are stored in sorted
// order. These records come one after another at the beginning of the
// file, and are compressed according to the compression options.
//
// 2. After the blob records we store a bunch of meta blocks, and a
// meta index block with block handles pointed to the meta blocks. The
// meta block and the meta index block are formatted the same as the
// BlockBasedTable.
// Streams blob records into a WritableFileWriter in blob-file format
// (header, records, footer). Errors are sticky: once status() is non-ok,
// subsequent Add()/Finish() calls do nothing.
class BlobFileBuilder {
public:
// Constructs a builder that will store the contents of the file it
// is building in "*file". Does not close the file. It is up to the
// caller to sync and close the file after calling Finish().
BlobFileBuilder(const TitanCFOptions& options, WritableFileWriter* file);
// Adds the record to the file and points the handle to it.
void Add(const BlobRecord& record, BlobHandle* handle);
// Returns non-ok iff some error has been detected.
Status status() const { return status_; }
// Finishes building the table.
// REQUIRES: Finish(), Abandon() have not been called.
Status Finish();
// Abandons building the table. If the caller is not going to call
// Finish(), it must call Abandon() before destroying this builder.
// REQUIRES: Finish(), Abandon() have not been called.
void Abandon();
private:
bool ok() const { return status().ok(); }
TitanCFOptions options_;
// Not owned; caller keeps the writer alive for the builder's lifetime.
WritableFileWriter* file_;
// First error encountered, if any (sticky).
Status status_;
BlobEncoder encoder_;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_file_cache.h"
#include "util/filename.h"
#include "utilities/titandb/util.h"
namespace rocksdb {
namespace titandb {
namespace {
// Builds a cache key that aliases the raw bytes of *number. The Slice is
// only valid while *number is alive; the cache copies the key on insert.
Slice EncodeFileNumber(const uint64_t* number) {
return Slice(reinterpret_cast<const char*>(number), sizeof(*number));
}
} // namespace
// Copies the options and shares ownership of the file cache. env_options_
// is derived from db_options via the EnvOptions(DBOptions) conversion.
BlobFileCache::BlobFileCache(const TitanDBOptions& db_options,
const TitanCFOptions& cf_options,
std::shared_ptr<Cache> cache)
: env_(db_options.env),
env_options_(db_options),
db_options_(db_options),
cf_options_(cf_options),
cache_(cache) {}
// Reads the record addressed by "handle" from blob file "file_number"
// (which must be exactly "file_size" bytes), opening and caching the
// file reader on demand. The cache handle is released before returning.
Status BlobFileCache::Get(const ReadOptions& options, uint64_t file_number,
                          uint64_t file_size, const BlobHandle& handle,
                          BlobRecord* record, PinnableSlice* buffer) {
  Cache::Handle* file_handle = nullptr;
  Status s = FindFile(file_number, file_size, &file_handle);
  if (!s.ok()) return s;
  auto* reader =
      reinterpret_cast<BlobFileReader*>(cache_->Value(file_handle));
  s = reader->Get(options, handle, record, buffer);
  cache_->Release(file_handle);
  return s;
}
// Creates a prefetcher over the cached reader for "file_number". The cache
// handle stays pinned for the prefetcher's lifetime via RegisterCleanup.
Status BlobFileCache::NewPrefetcher(
    uint64_t file_number, uint64_t file_size,
    std::unique_ptr<BlobFilePrefetcher>* result) {
  Cache::Handle* file_handle = nullptr;
  Status s = FindFile(file_number, file_size, &file_handle);
  if (!s.ok()) return s;
  auto* reader =
      reinterpret_cast<BlobFileReader*>(cache_->Value(file_handle));
  std::unique_ptr<BlobFilePrefetcher> prefetcher(
      new BlobFilePrefetcher(reader));
  // Release the pinned reader when the prefetcher is destroyed.
  prefetcher->RegisterCleanup(&UnrefCacheHandle, cache_.get(), file_handle);
  *result = std::move(prefetcher);
  return s;
}
// Drops the cached reader for "file_number" (e.g. after the file is
// obsoleted); in-flight pinned handles keep the reader alive until released.
void BlobFileCache::Evict(uint64_t file_number) {
cache_->Erase(EncodeFileNumber(&file_number));
}
// Looks up the reader for "file_number" in the cache; on miss, opens the
// blob file, wraps it in a BlobFileReader, and inserts it. On success
// "*handle" refers to the cached reader and must be released by the caller.
Status BlobFileCache::FindFile(uint64_t file_number, uint64_t file_size,
                               Cache::Handle** handle) {
  Status s;
  Slice cache_key = EncodeFileNumber(&file_number);
  *handle = cache_->Lookup(cache_key);
  if (*handle) return s;
  std::unique_ptr<RandomAccessFileReader> file;
  {
    std::unique_ptr<RandomAccessFile> f;
    auto file_name = BlobFileName(db_options_.dirname, file_number);
    s = env_->NewRandomAccessFile(file_name, &f, env_options_);
    if (!s.ok()) return s;
    if (db_options_.advise_random_on_open) {
      f->Hint(RandomAccessFile::RANDOM);
    }
    file.reset(new RandomAccessFileReader(std::move(f), file_name));
  }
  std::unique_ptr<BlobFileReader> reader;
  s = BlobFileReader::Open(cf_options_, std::move(file), file_size, &reader);
  if (!s.ok()) return s;
  // BUG FIX: the Insert() status was previously dropped, so an insertion
  // failure (e.g. strict capacity limit) would return OK with "*handle"
  // unset. If Insert() fails it frees the reader through the deleter.
  s = cache_->Insert(cache_key, reader.release(), 1,
                     &DeleteCacheValue<BlobFileReader>, handle);
  return s;
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "rocksdb/options.h"
#include "utilities/titandb/blob_file_reader.h"
#include "utilities/titandb/blob_format.h"
#include "utilities/titandb/options.h"
namespace rocksdb {
namespace titandb {
// Caches opened BlobFileReader instances keyed by file number, so repeated
// reads of the same blob file reuse one open file handle.
class BlobFileCache {
public:
// Constructs a blob file cache to cache opened files.
BlobFileCache(const TitanDBOptions& db_options,
const TitanCFOptions& cf_options, std::shared_ptr<Cache> cache);
// Gets the blob record pointed by the handle in the specified file
// number. The corresponding file size must be exactly "file_size"
// bytes. The provided buffer is used to store the record data, so
// the buffer must be valid when the record is used.
Status Get(const ReadOptions& options, uint64_t file_number,
uint64_t file_size, const BlobHandle& handle, BlobRecord* record,
PinnableSlice* buffer);
// Creates a prefetcher for the specified file number.
Status NewPrefetcher(uint64_t file_number, uint64_t file_size,
std::unique_ptr<BlobFilePrefetcher>* result);
// Evicts the file cache for the specified file number.
void Evict(uint64_t file_number);
private:
// Finds the file for the specified file number. Opens the file if
// the file is not found in the cache and caches it.
// If successful, sets "*handle" to the cached file.
Status FindFile(uint64_t file_number, uint64_t file_size,
Cache::Handle** handle);
Env* env_;
EnvOptions env_options_;
TitanDBOptions db_options_;
TitanCFOptions cf_options_;
// Shared with the rest of Titan; stores BlobFileReader* values.
std::shared_ptr<Cache> cache_;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_file_iterator.h"
#include "util/crc32c.h"
#include "utilities/titandb/util.h"
namespace rocksdb {
namespace titandb {
// Constructs an iterator over the records of one blob file. Takes
// ownership of "file". The second argument is the blob file *number*
// (the original parameter name `file_name` was misleading).
BlobFileIterator::BlobFileIterator(
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_number,
    uint64_t file_size, const TitanCFOptions& titan_cf_options)
    : file_(std::move(file)),
      file_number_(file_number),
      file_size_(file_size),
      titan_cf_options_(titan_cf_options) {}

BlobFileIterator::~BlobFileIterator() = default;
// Lazily validates the file header and footer and computes the byte range
// that contains blob records. Returns false (with status_ set) on any
// read or decode failure; on success sets init_ and returns true.
bool BlobFileIterator::Init() {
  Slice slice;
  char header_buf[BlobFileHeader::kEncodedLength];
  status_ = file_->Read(0, BlobFileHeader::kEncodedLength, &slice, header_buf);
  if (!status_.ok()) {
    return false;
  }
  BlobFileHeader blob_file_header;
  status_ = blob_file_header.DecodeFrom(&slice);
  if (!status_.ok()) {
    return false;
  }
  char footer_buf[BlobFileFooter::kEncodedLength];
  status_ = file_->Read(file_size_ - BlobFileFooter::kEncodedLength,
                        BlobFileFooter::kEncodedLength, &slice, footer_buf);
  if (!status_.ok()) return false;
  BlobFileFooter blob_file_footer;
  status_ = blob_file_footer.DecodeFrom(&slice);
  // BUG FIX: a footer decode failure was previously ignored, leaving the
  // iterator marked initialized with a garbage end_of_blob_record_.
  if (!status_.ok()) return false;
  // Records span from just after the header to just before the meta
  // index block and footer.
  end_of_blob_record_ = file_size_ - BlobFileFooter::kEncodedLength -
                        blob_file_footer.meta_index_handle.size();
  assert(end_of_blob_record_ > BlobFileHeader::kEncodedLength);
  init_ = true;
  return true;
}
// Positions the iterator on the first record (right after the file
// header), initializing lazily on first use.
void BlobFileIterator::SeekToFirst() {
if (!init_ && !Init()) return;
status_ = Status::OK();
iterate_offset_ = BlobFileHeader::kEncodedLength;
PrefetchAndGet();
}
// Valid only when a record was decoded and no error has occurred.
bool BlobFileIterator::Valid() const { return valid_ && status().ok(); }
// Advances to the next record. REQUIRES: Init() has succeeded.
void BlobFileIterator::Next() {
assert(init_);
PrefetchAndGet();
}
// Key/value of the current record; valid only while Valid() is true.
Slice BlobFileIterator::key() const { return cur_blob_record_.key; }
Slice BlobFileIterator::value() const { return cur_blob_record_.value; }
// Positions the iterator so that the next call to Next() yields the record
// containing byte "offset" (or the record starting at it). Because records
// are variable-length, this scans forward from the first record, decoding
// only headers to find record boundaries.
void BlobFileIterator::IterateForPrev(uint64_t offset) {
if (!init_ && !Init()) return;
status_ = Status::OK();
// Target beyond the record area: remember it and report the error.
if (offset >= end_of_blob_record_) {
iterate_offset_ = offset;
status_ = Status::InvalidArgument("Out of bound");
return;
}
uint64_t total_length = 0;
FixedSlice<kBlobHeaderSize> header_buffer;
// Walk record-by-record until we reach or pass "offset".
iterate_offset_ = BlobFileHeader::kEncodedLength;
for (; iterate_offset_ < offset; iterate_offset_ += total_length) {
status_ = file_->Read(iterate_offset_, kBlobHeaderSize, &header_buffer,
header_buffer.get());
if (!status_.ok()) return;
status_ = decoder_.DecodeHeader(&header_buffer);
if (!status_.ok()) return;
total_length = kBlobHeaderSize + decoder_.GetRecordSize();
}
// Overshot: step back to the record that contains "offset".
if (iterate_offset_ > offset) iterate_offset_ -= total_length;
// Not valid until the caller advances with Next().
valid_ = false;
}
// Reads and decodes the record at iterate_offset_, publishes its location
// in cur_record_offset_/cur_record_size_, and advances past it. On any
// failure status_ is set and valid_ is left unchanged.
void BlobFileIterator::GetBlobRecord() {
  FixedSlice<kBlobHeaderSize> header_buffer;
  // Decode the fixed-size record header to learn the body size.
  status_ = file_->Read(iterate_offset_, kBlobHeaderSize, &header_buffer,
                        header_buffer.get());
  if (!status_.ok()) return;
  status_ = decoder_.DecodeHeader(&header_buffer);
  if (!status_.ok()) return;
  Slice record_slice;
  auto record_size = decoder_.GetRecordSize();
  // BUG FIX: was buffer_.reserve(record_size). reserve() does not change
  // the vector's size, so writing record_size bytes through data() was
  // undefined behavior; resize() makes the storage actually usable.
  buffer_.resize(record_size);
  status_ = file_->Read(iterate_offset_ + kBlobHeaderSize, record_size,
                        &record_slice, buffer_.data());
  if (status_.ok()) {
    status_ =
        decoder_.DecodeRecord(&record_slice, &cur_blob_record_, &uncompressed_);
  }
  if (!status_.ok()) return;
  cur_record_offset_ = iterate_offset_;
  cur_record_size_ = kBlobHeaderSize + record_size;
  iterate_offset_ += cur_record_size_;
  valid_ = true;
}
// Issues readahead ahead of the cursor, then decodes the record at the
// cursor. Readahead doubles (up to kMaxReadaheadSize) while sequential
// reads keep outrunning the prefetched window.
void BlobFileIterator::PrefetchAndGet() {
if (iterate_offset_ >= end_of_blob_record_) {
valid_ = false;
return;
}
// Cursor left the prefetched window (e.g. after IterateForPrev): reset
// the window, aligned down to a page boundary.
if (readahead_begin_offset_ > iterate_offset_ ||
readahead_end_offset_ < iterate_offset_) {
// alignment
readahead_begin_offset_ =
iterate_offset_ - (iterate_offset_ & (kDefaultPageSize - 1));
readahead_end_offset_ = readahead_begin_offset_;
readahead_size_ = kMinReadaheadSize;
}
// Ensure the window covers at least the next record's minimum extent.
auto min_blob_size =
iterate_offset_ + kBlobHeaderSize + titan_cf_options_.min_blob_size;
if (readahead_end_offset_ <= min_blob_size) {
while (readahead_end_offset_ + readahead_size_ <= min_blob_size &&
readahead_size_ < kMaxReadaheadSize)
readahead_size_ <<= 1;
file_->Prefetch(readahead_end_offset_, readahead_size_);
readahead_end_offset_ += readahead_size_;
// Grow the next readahead, capped at the maximum.
readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ << 1);
}
GetBlobRecord();
// A large record may out-run the window; catch the window up.
if (readahead_end_offset_ < iterate_offset_) {
readahead_end_offset_ = iterate_offset_;
}
}
// Takes ownership of the per-file iterators to be merged in key order.
BlobFileMergeIterator::BlobFileMergeIterator(
std::vector<std::unique_ptr<BlobFileIterator>>&& blob_file_iterators)
: blob_file_iterators_(std::move(blob_file_iterators)) {}
// The merge iterator is valid only when it has a current child iterator,
// no merge-level error is pending, and the child itself is valid and
// error-free. (&& preserves the original early-return ordering.)
bool BlobFileMergeIterator::Valid() const {
  return current_ != nullptr && status().ok() && current_->Valid() &&
         current_->status().ok();
}
// Seeks every child to its first record, pushes the valid ones into the
// min-heap, and adopts the smallest-keyed child as current_. If no child
// is valid the iterator reports Aborted.
void BlobFileMergeIterator::SeekToFirst() {
for (auto& iter : blob_file_iterators_) {
iter->SeekToFirst();
if (iter->status().ok() && iter->Valid()) min_heap_.push(iter.get());
}
if (!min_heap_.empty()) {
current_ = min_heap_.top();
min_heap_.pop();
} else {
status_ = Status::Aborted("No iterator is valid");
}
}
// Advances the current child and re-selects the smallest-keyed child as
// current_. When everything is exhausted, current_ becomes nullptr so
// Valid() turns false.
void BlobFileMergeIterator::Next() {
  assert(current_ != nullptr);
  current_->Next();
  if (current_->status().ok() && current_->Valid()) min_heap_.push(current_);
  // BUG FIX: previously min_heap_.top() was called unconditionally, which
  // is undefined behavior on an empty heap once the last child iterator
  // is exhausted (e.g. one Next() past the final record).
  if (min_heap_.empty()) {
    current_ = nullptr;
    return;
  }
  current_ = min_heap_.top();
  min_heap_.pop();
}
// Key/value of the current child's record. REQUIRES: Valid().
Slice BlobFileMergeIterator::key() const {
assert(current_ != nullptr);
return current_->key();
}
Slice BlobFileMergeIterator::value() const {
assert(current_ != nullptr);
return current_->value();
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include <cstdint>
#include <queue>
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "table/internal_iterator.h"
#include "util/file_reader_writer.h"
#include "utilities/titandb/blob_format.h"
#include "utilities/titandb/options.h"
#include "utilities/titandb/util.h"
namespace rocksdb {
namespace titandb {
// Sequentially iterates the records of a single blob file, with adaptive
// readahead. Not an InternalIterator; used by GC to scan blob files.
class BlobFileIterator {
public:
// Readahead starts at 4 KiB and doubles up to 256 KiB.
const uint64_t kMinReadaheadSize = 4 << 10;
const uint64_t kMaxReadaheadSize = 256 << 10;
// NOTE(review): the second parameter is the file *number* despite being
// named "file_name" — confirm against callers.
BlobFileIterator(std::unique_ptr<RandomAccessFileReader>&& file,
uint64_t file_name, uint64_t file_size,
const TitanCFOptions& titan_cf_options);
~BlobFileIterator();
// Validates header/footer and computes the record range; returns false
// and sets status() on failure. Called lazily by the seek methods.
bool Init();
bool Valid() const;
void SeekToFirst();
void Next();
Slice key() const;
Slice value() const;
Status status() const { return status_; }
// Positions the cursor so Next() yields the record at/containing the
// given file offset.
void IterateForPrev(uint64_t);
// Index entry (file number + handle) for the current record. The handle
// size includes the per-record header.
BlobIndex GetBlobIndex() {
BlobIndex blob_index;
blob_index.file_number = file_number_;
blob_index.blob_handle.offset = cur_record_offset_;
blob_index.blob_handle.size = cur_record_size_;
return blob_index;
}
private:
// Blob file info
const std::unique_ptr<RandomAccessFileReader> file_;
const uint64_t file_number_;
const uint64_t file_size_;
TitanCFOptions titan_cf_options_;
// True once Init() succeeded.
bool init_{false};
// First byte past the last blob record (before meta blocks/footer).
uint64_t end_of_blob_record_{0};
// Iterator status
Status status_;
bool valid_{false};
BlobDecoder decoder_;
// File offset of the next record to decode.
uint64_t iterate_offset_{0};
// Scratch buffer the current record body is read into.
std::vector<char> buffer_;
OwnedSlice uncompressed_;
BlobRecord cur_blob_record_;
uint64_t cur_record_offset_;
uint64_t cur_record_size_;
// Current prefetched window and next readahead length.
uint64_t readahead_begin_offset_{0};
uint64_t readahead_end_offset_{0};
uint64_t readahead_size_{kMinReadaheadSize};
void PrefetchAndGet();
void GetBlobRecord();
};
// Merges several BlobFileIterators into one key-ordered stream using a
// min-heap keyed by the iterators' current keys.
class BlobFileMergeIterator {
public:
explicit BlobFileMergeIterator(
std::vector<std::unique_ptr<BlobFileIterator>>&&);
~BlobFileMergeIterator() = default;
bool Valid() const;
void SeekToFirst();
void Next();
Slice key() const;
Slice value() const;
// Reports the current child's error first, then any merge-level error.
Status status() const {
if (current_ != nullptr && !current_->status().ok())
return current_->status();
return status_;
}
// Index entry for the current record. REQUIRES: Valid().
BlobIndex GetBlobIndex() { return current_->GetBlobIndex(); }
private:
// NOTE(review): name is a typo for "InternalComparator"; kept because it
// is referenced by min_heap_'s type below.
class IternalComparator {
public:
// Smaller value get Higher priority
bool operator()(const BlobFileIterator* iter1,
const BlobFileIterator* iter2) {
return BytewiseComparator()->Compare(iter1->key(), iter2->key()) > 0;
}
};
Status status_;
std::vector<std::unique_ptr<BlobFileIterator>> blob_file_iterators_;
// Min-heap of children that still have records; smallest key on top.
std::priority_queue<BlobFileIterator*, std::vector<BlobFileIterator*>,
IternalComparator>
min_heap_;
// Child whose record key()/value() expose; nullptr when exhausted.
BlobFileIterator* current_ = nullptr;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_file_iterator.h"
#include <cinttypes>
#include "util/filename.h"
#include "util/testharness.h"
#include "utilities/titandb/blob_file_builder.h"
#include "utilities/titandb/blob_file_cache.h"
#include "utilities/titandb/blob_file_reader.h"
namespace rocksdb {
namespace titandb {
// Test fixture: builds blob files in a temp dir and iterates them back.
// NOTE(review): "NewBuiler"/"FinishBuiler" are typos for "...Builder" but
// are referenced by the TEST_F bodies, so the names are kept.
class BlobFileIteratorTest : public testing::Test {
public:
Env* env_{Env::Default()};
TitanOptions titan_options_;
EnvOptions env_options_;
std::string dirname_;
std::string file_name_;
uint64_t file_number_;
std::unique_ptr<BlobFileBuilder> builder_;
std::unique_ptr<WritableFileWriter> writable_file_;
std::unique_ptr<BlobFileIterator> blob_file_iterator_;
std::unique_ptr<RandomAccessFileReader> readable_file_;
// Picks a random file number and derives the blob file path from it.
BlobFileIteratorTest() : dirname_(test::TmpDir(env_)) {
titan_options_.dirname = dirname_;
file_number_ = Random::GetTLSInstance()->Next();
file_name_ = BlobFileName(dirname_, file_number_);
}
// Best-effort cleanup of the file and directory created by the test.
~BlobFileIteratorTest() {
env_->DeleteFile(file_name_);
env_->DeleteDir(dirname_);
}
// Fixed-width key so lexicographic order matches numeric order.
std::string GenKey(uint64_t i) {
char buf[64];
snprintf(buf, sizeof(buf), "k-%08" PRIu64, i);
return buf;
}
// Alternates values just below/above the blob-size threshold.
std::string GenValue(uint64_t k) {
if (k % 2 == 0) {
return std::string(titan_options_.min_blob_size - 1, 'v');
} else {
return std::string(titan_options_.min_blob_size + 1, 'v');
}
}
// Opens a fresh writable blob file and a builder over it.
// NOTE(review): the local BlobFileCache below is constructed but never
// used — looks like dead code; confirm before removing.
void NewBuiler() {
TitanDBOptions db_options(titan_options_);
TitanCFOptions cf_options(titan_options_);
BlobFileCache cache(db_options, cf_options, {NewLRUCache(128)});
{
std::unique_ptr<WritableFile> f;
ASSERT_OK(env_->NewWritableFile(file_name_, &f, env_options_));
writable_file_.reset(new WritableFileWriter(std::move(f), file_name_, env_options_));
}
builder_.reset(new BlobFileBuilder(cf_options, writable_file_.get()));
}
// Appends one record and captures its handle.
void AddKeyValue(const std::string& key, const std::string& value,
BlobHandle* blob_handle) {
BlobRecord record;
record.key = key;
record.value = value;
builder_->Add(record, blob_handle);
ASSERT_OK(builder_->status());
}
// Finalizes the current blob file.
void FinishBuiler() {
ASSERT_OK(builder_->Finish());
ASSERT_OK(builder_->status());
}
// Opens an iterator over the just-built file.
void NewBlobFileIterator() {
uint64_t file_size = 0;
ASSERT_OK(env_->GetFileSize(file_name_, &file_size));
NewBlobFileReader(file_number_, 0, titan_options_, env_options_, env_,
&readable_file_);
blob_file_iterator_.reset(new BlobFileIterator{
std::move(readable_file_), file_number_, file_size, TitanCFOptions()});
}
// Round-trips n records and checks keys, values, and blob handles.
void TestBlobFileIterator() {
NewBuiler();
const int n = 1000;
std::vector<BlobHandle> handles(n);
for (int i = 0; i < n; i++) {
auto id = std::to_string(i);
AddKeyValue(id, id, &handles[i]);
}
FinishBuiler();
NewBlobFileIterator();
blob_file_iterator_->SeekToFirst();
for (int i = 0; i < n; blob_file_iterator_->Next(), i++) {
ASSERT_OK(blob_file_iterator_->status());
ASSERT_EQ(blob_file_iterator_->Valid(), true);
auto id = std::to_string(i);
ASSERT_EQ(id, blob_file_iterator_->key());
ASSERT_EQ(id, blob_file_iterator_->value());
BlobIndex blob_index = blob_file_iterator_->GetBlobIndex();
ASSERT_EQ(handles[i], blob_index.blob_handle);
}
}
};
// Sequential-iteration smoke test over a freshly built blob file.
// (Removed an unused local `TitanOptions options;` that had no effect.)
TEST_F(BlobFileIteratorTest, Basic) {
  TestBlobFileIterator();
}
// Exercises IterateForPrev() with exact offsets, an offset just before a
// record (header-size - 1 short), and an offset one byte past a record.
TEST_F(BlobFileIteratorTest, IterateForPrev) {
NewBuiler();
const int n = 1000;
std::vector<BlobHandle> handles(n);
for (int i = 0; i < n; i++) {
auto id = std::to_string(i);
AddKeyValue(id, id, &handles[i]);
}
FinishBuiler();
NewBlobFileIterator();
// Seek to the middle record's exact offset, then iterate to the end.
int i = n / 2;
blob_file_iterator_->IterateForPrev(handles[i].offset);
ASSERT_OK(blob_file_iterator_->status());
for (blob_file_iterator_->Next(); i < n; i++, blob_file_iterator_->Next()) {
ASSERT_OK(blob_file_iterator_->status());
ASSERT_EQ(blob_file_iterator_->Valid(), true);
BlobIndex blob_index;
blob_index = blob_file_iterator_->GetBlobIndex();
ASSERT_EQ(handles[i], blob_index.blob_handle);
auto id = std::to_string(i);
ASSERT_EQ(id, blob_file_iterator_->key());
ASSERT_EQ(id, blob_file_iterator_->value());
}
// Exact offset of a random record: Next() must yield that record.
auto idx = Random::GetTLSInstance()->Uniform(n);
blob_file_iterator_->IterateForPrev(handles[idx].offset);
ASSERT_OK(blob_file_iterator_->status());
blob_file_iterator_->Next();
ASSERT_OK(blob_file_iterator_->status());
ASSERT_TRUE(blob_file_iterator_->Valid());
BlobIndex blob_index;
blob_index = blob_file_iterator_->GetBlobIndex();
ASSERT_EQ(handles[idx], blob_index.blob_handle);
// Offset inside the previous record's tail: Next() yields record idx-1.
while ((idx = Random::GetTLSInstance()->Uniform(n)) == 0)
;
blob_file_iterator_->IterateForPrev(handles[idx].offset - kBlobHeaderSize -
1);
ASSERT_OK(blob_file_iterator_->status());
blob_file_iterator_->Next();
ASSERT_OK(blob_file_iterator_->status());
ASSERT_TRUE(blob_file_iterator_->Valid());
blob_index = blob_file_iterator_->GetBlobIndex();
ASSERT_EQ(handles[idx - 1], blob_index.blob_handle);
// One byte into record idx: Next() still yields record idx.
idx = Random::GetTLSInstance()->Uniform(n);
blob_file_iterator_->IterateForPrev(handles[idx].offset + 1);
ASSERT_OK(blob_file_iterator_->status());
blob_file_iterator_->Next();
ASSERT_OK(blob_file_iterator_->status());
ASSERT_TRUE(blob_file_iterator_->Valid());
blob_index = blob_file_iterator_->GetBlobIndex();
ASSERT_EQ(handles[idx], blob_index.blob_handle);
}
// Splits ~1000 records across multiple blob files (one per 100 keys) and
// verifies BlobFileMergeIterator yields them back in global key order.
TEST_F(BlobFileIteratorTest, MergeIterator) {
const int kMaxKeyNum = 1000;
std::vector<BlobHandle> handles(kMaxKeyNum);
std::vector<std::unique_ptr<BlobFileIterator>> iters;
NewBuiler();
for (int i = 1; i < kMaxKeyNum; i++) {
AddKeyValue(GenKey(i), GenValue(i), &handles[i]);
// Every 100 keys: close the current file, open an iterator over it,
// and start a new file under a fresh file number.
if (i % 100 == 0) {
FinishBuiler();
uint64_t file_size = 0;
ASSERT_OK(env_->GetFileSize(file_name_, &file_size));
NewBlobFileReader(file_number_, 0, titan_options_, env_options_, env_,
&readable_file_);
iters.emplace_back(std::unique_ptr<BlobFileIterator>(
new BlobFileIterator{std::move(readable_file_), file_number_,
file_size, TitanCFOptions()}));
file_number_ = Random::GetTLSInstance()->Next();
file_name_ = BlobFileName(dirname_, file_number_);
NewBuiler();
}
}
// Final (partial) file.
FinishBuiler();
uint64_t file_size = 0;
ASSERT_OK(env_->GetFileSize(file_name_, &file_size));
NewBlobFileReader(file_number_, 0, titan_options_, env_options_, env_,
&readable_file_);
iters.emplace_back(std::unique_ptr<BlobFileIterator>(new BlobFileIterator{
std::move(readable_file_), file_number_, file_size, TitanCFOptions()}));
BlobFileMergeIterator iter(std::move(iters));
iter.SeekToFirst();
for (int i = 1; i < kMaxKeyNum; i++, iter.Next()) {
ASSERT_OK(iter.status());
ASSERT_TRUE(iter.Valid());
ASSERT_EQ(iter.key(), GenKey(i));
ASSERT_EQ(iter.value(), GenValue(i));
ASSERT_EQ(iter.GetBlobIndex().blob_handle, handles[i]);
}
}
} // namespace titandb
} // namespace rocksdb
// Standard gtest entry point for this test binary.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#pragma once
#include "util/file_reader_writer.h"
#include "utilities/titandb/blob_format.h"
namespace rocksdb {
namespace titandb {
// Contains information to complete a blob file creation.
// Contains information to complete a blob file creation.
// Abstract handle for a blob file that is being written: exposes the
// file's number, path, and the writer used to append to it.
class BlobFileHandle {
public:
virtual ~BlobFileHandle() {}
virtual uint64_t GetNumber() const = 0;
virtual const std::string& GetName() const = 0;
virtual WritableFileWriter* GetFile() const = 0;
};
// Manages the process of blob files creation.
// Manages the process of blob file creation: hands out new files, and
// finishes or deletes them. The single-file operations are implemented in
// terms of the batch versions, so subclasses only override the batches.
// (Fixes: stray ';' after FinishFile's body removed; comment typos fixed.)
class BlobFileManager {
 public:
  virtual ~BlobFileManager() {}
  // Creates a new file. The new file should not be accessed until
  // FinishFile() has been called.
  // If successful, sets "*handle" to the new file handle.
  virtual Status NewFile(std::unique_ptr<BlobFileHandle>* handle) = 0;
  // Finishes the file with the provided metadata. Stops writing to
  // the file afterwards.
  // REQUIRES: FinishFile(), DeleteFile() have not been called.
  virtual Status FinishFile(uint32_t cf_id, std::shared_ptr<BlobFileMeta> file,
                            std::unique_ptr<BlobFileHandle>&& handle) {
    std::vector<std::pair<std::shared_ptr<BlobFileMeta>,
                          std::unique_ptr<BlobFileHandle>>>
        tmp;
    tmp.emplace_back(std::make_pair(file, std::move(handle)));
    return BatchFinishFiles(cf_id, tmp);
  }
  // Batch version of FinishFile. The default implementation is a no-op.
  virtual Status BatchFinishFiles(
      uint32_t cf_id,
      const std::vector<std::pair<std::shared_ptr<BlobFileMeta>,
                                  std::unique_ptr<BlobFileHandle>>>& files) {
    (void)cf_id;
    (void)files;
    return Status::OK();
  }
  // Deletes the file. If the caller is not going to call
  // FinishFile(), it must call DeleteFile() to release the handle.
  // REQUIRES: FinishFile(), DeleteFile() have not been called.
  virtual Status DeleteFile(std::unique_ptr<BlobFileHandle>&& handle) {
    std::vector<std::unique_ptr<BlobFileHandle>> tmp;
    tmp.emplace_back(std::move(handle));
    return BatchDeleteFiles(tmp);
  }
  // Batch version of DeleteFile. The default implementation is a no-op.
  virtual Status BatchDeleteFiles(
      const std::vector<std::unique_ptr<BlobFileHandle>>& handles) {
    (void)handles;
    return Status::OK();
  }
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_file_reader.h"
#include "util/crc32c.h"
#include "util/filename.h"
#include "util/sync_point.h"
namespace rocksdb {
namespace titandb {
// Opens the blob file identified by "file_number" for random access,
// optionally wrapping it with OS-level readahead, and returns the reader
// in "*result".
Status NewBlobFileReader(uint64_t file_number, uint64_t readahead_size,
const TitanDBOptions& db_options,
const EnvOptions& env_options, Env* env,
std::unique_ptr<RandomAccessFileReader>* result) {
std::unique_ptr<RandomAccessFile> file;
auto file_name = BlobFileName(db_options.dirname, file_number);
Status s = env->NewRandomAccessFile(file_name, &file, env_options);
if (!s.ok()) return s;
// Wrap with a readahead layer only when requested.
if (readahead_size > 0) {
file = NewReadaheadRandomAccessFile(std::move(file), readahead_size);
}
result->reset(new RandomAccessFileReader(std::move(file), file_name));
return s;
}
// 256 KiB readahead cap. NOTE(review): duplicates the member constant of
// the same name in BlobFileIterator — confirm whether they should be unified.
const uint64_t kMaxReadaheadSize = 256 << 10;
namespace {
// Derives a cache-key prefix for this file: the file's unique id if the
// filesystem provides one, otherwise a fresh cache id.
void GenerateCachePrefix(std::string* dst, Cache* cc, RandomAccessFile* file) {
char buffer[kMaxVarint64Length * 3 + 1];
auto size = file->GetUniqueId(buffer, sizeof(buffer));
if (size == 0) {
// No filesystem-provided id; fall back to a cache-wide unique id.
auto end = EncodeVarint64(buffer, cc->NewId());
size = end - buffer;
}
dst->assign(buffer, size);
}
// Cache key for a blob record = file prefix + varint-encoded offset.
void EncodeBlobCache(std::string* dst, const Slice& prefix, uint64_t offset) {
dst->assign(prefix.data(), prefix.size());
PutVarint64(dst, offset);
}
} // namespace
// Opens a blob file reader over "file" of length "file_size": validates the
// size, reads and decodes the fixed-size footer at the end of the file, and
// on success stores the new reader in "*result".
Status BlobFileReader::Open(const TitanCFOptions& options,
                            std::unique_ptr<RandomAccessFileReader> file,
                            uint64_t file_size,
                            std::unique_ptr<BlobFileReader>* result) {
  if (file_size < BlobFileFooter::kEncodedLength) {
    return Status::Corruption("file is too short to be a blob file");
  }
  // Read the footer from the tail of the file (TRY returns on error).
  FixedSlice<BlobFileFooter::kEncodedLength> buffer;
  TRY(file->Read(file_size - BlobFileFooter::kEncodedLength,
                 BlobFileFooter::kEncodedLength, &buffer, buffer.get()));
  BlobFileFooter footer;
  TRY(DecodeInto(buffer, &footer));
  auto reader = new BlobFileReader(options, std::move(file));
  reader->footer_ = footer;
  result->reset(reader);
  return Status::OK();
}
// Takes ownership of "file"; if a blob cache is configured, precomputes the
// per-file cache key prefix.
BlobFileReader::BlobFileReader(const TitanCFOptions& options,
                               std::unique_ptr<RandomAccessFileReader> file)
    : options_(options), file_(std::move(file)), cache_(options.blob_cache) {
  if (cache_) {
    GenerateCachePrefix(&cache_prefix_, cache_.get(), file_->file());
  }
}
// Reads the blob record at "handle". Serves from the blob cache when
// possible; otherwise reads from the file and (if a cache is configured)
// inserts the blob. The decoded record points into "buffer", which pins
// either the cache entry or the freshly read blob.
Status BlobFileReader::Get(const ReadOptions& /*options*/,
                           const BlobHandle& handle, BlobRecord* record,
                           PinnableSlice* buffer) {
  TEST_SYNC_POINT("BlobFileReader::Get");
  std::string cache_key;
  Cache::Handle* cache_handle = nullptr;
  if (cache_) {
    EncodeBlobCache(&cache_key, cache_prefix_, handle.offset);
    cache_handle = cache_->Lookup(cache_key);
    if (cache_handle) {
      // Cache hit: pin the cached blob and decode straight out of it.
      auto blob = reinterpret_cast<OwnedSlice*>(cache_->Value(cache_handle));
      buffer->PinSlice(*blob, UnrefCacheHandle, cache_.get(), cache_handle);
      return DecodeInto(*blob, record);
    }
  }
  OwnedSlice blob;
  TRY(ReadRecord(handle, record, &blob));
  if (cache_) {
    // Hand the blob over to the cache, then pin the resulting handle.
    // NOTE(review): the Insert() status is ignored; if Insert can fail
    // (e.g. strict capacity limit), cache_handle could be left null here —
    // confirm against the Cache implementation in use.
    auto cache_value = new OwnedSlice(std::move(blob));
    auto cache_size = cache_value->size() + sizeof(*cache_value);
    cache_->Insert(cache_key, cache_value, cache_size,
                   &DeleteCacheValue<OwnedSlice>, &cache_handle);
    buffer->PinSlice(*cache_value, UnrefCacheHandle, cache_.get(),
                     cache_handle);
  } else {
    // No cache: transfer blob ownership to the PinnableSlice cleanup.
    buffer->PinSlice(blob, OwnedSlice::CleanupFunc, blob.release(), nullptr);
  }
  return Status::OK();
}
// Reads handle.size bytes at handle.offset, verifies the read length,
// decodes the blob header and record. On return, "buffer" owns the raw
// bytes and "record" points into it.
Status BlobFileReader::ReadRecord(const BlobHandle& handle, BlobRecord* record,
                                  OwnedSlice* buffer) {
  Slice blob;
  CacheAllocationPtr ubuf(new char[handle.size]);
  TRY(file_->Read(handle.offset, handle.size, &blob, ubuf.get()));
  // A mismatched read length means the file or handle is corrupt; there is
  // no way to continue safely.
  if (handle.size != blob.size()) {
    // %zu (not %lu) is the portable format for size_t; %lu is wrong on
    // platforms where the two types differ (32-bit, LLP64 Windows).
    fprintf(stderr, "ReadRecord actual size:%zu != blob size:%zu\n",
            blob.size(), static_cast<std::size_t>(handle.size));
    abort();
  }
  BlobDecoder decoder;
  TRY(decoder.DecodeHeader(&blob));
  // Transfer buffer ownership; the decoded record aliases this memory.
  buffer->reset(std::move(ubuf), blob);
  TRY(decoder.DecodeRecord(&blob, record, buffer));
  return Status::OK();
}
// Reads a record via the underlying reader, issuing readahead when the
// access pattern is strictly sequential. The readahead window doubles on
// each prefetch, capped at kMaxReadaheadSize, and resets on a random read.
Status BlobFilePrefetcher::Get(const ReadOptions& options,
                               const BlobHandle& handle, BlobRecord* record,
                               PinnableSlice* buffer) {
  const bool sequential = (handle.offset == last_offset_);
  last_offset_ = handle.offset + handle.size;
  if (!sequential) {
    // Random access: drop the readahead state.
    readahead_size_ = 0;
    readahead_limit_ = 0;
  } else if (last_offset_ > readahead_limit_) {
    // Sequential read past the prefetched range: prefetch a growing window.
    readahead_size_ = std::max(handle.size, readahead_size_);
    reader_->file_->Prefetch(handle.offset, readahead_size_);
    readahead_limit_ = handle.offset + readahead_size_;
    readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ * 2);
  }
  return reader_->Get(options, handle, record, buffer);
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "util/file_reader_writer.h"
#include "utilities/titandb/blob_format.h"
#include "utilities/titandb/options.h"
namespace rocksdb {
namespace titandb {
Status NewBlobFileReader(uint64_t file_number, uint64_t readahead_size,
const TitanDBOptions& db_options,
const EnvOptions& env_options, Env* env,
std::unique_ptr<RandomAccessFileReader>* result);
// Random-access reader for a single blob file, with optional blob caching.
class BlobFileReader {
 public:
  // Opens a blob file and read the necessary metadata from it.
  // If successful, sets "*result" to the newly opened file reader.
  static Status Open(const TitanCFOptions& options,
                     std::unique_ptr<RandomAccessFileReader> file,
                     uint64_t file_size,
                     std::unique_ptr<BlobFileReader>* result);
  // Gets the blob record pointed by the handle in this file. The data
  // of the record is stored in the provided buffer, so the buffer
  // must be valid when the record is used.
  Status Get(const ReadOptions& options, const BlobHandle& handle,
             BlobRecord* record, PinnableSlice* buffer);
 private:
  friend class BlobFilePrefetcher;
  BlobFileReader(const TitanCFOptions& options,
                 std::unique_ptr<RandomAccessFileReader> file);
  // Reads and decodes one record; "buffer" owns the backing bytes.
  Status ReadRecord(const BlobHandle& handle, BlobRecord* record,
                    OwnedSlice* buffer);
  TitanCFOptions options_;
  std::unique_ptr<RandomAccessFileReader> file_;
  // Optional blob cache shared via options.blob_cache; may be null.
  std::shared_ptr<Cache> cache_;
  // Per-file prefix for blob cache keys.
  std::string cache_prefix_;
  // Information read from the file.
  BlobFileFooter footer_;
};
// Performs readahead on continuous reads.
// Performs readahead on continuous reads.
class BlobFilePrefetcher : public Cleanable {
 public:
  // Constructs a prefetcher with the blob file reader.
  // "*reader" must be valid when the prefetcher is used.
  BlobFilePrefetcher(BlobFileReader* reader) : reader_(reader) {}
  Status Get(const ReadOptions& options, const BlobHandle& handle,
             BlobRecord* record, PinnableSlice* buffer);
 private:
  BlobFileReader* reader_;  // not owned
  // Readahead state: end of the last read, current window size, and the
  // end offset of the range already prefetched.
  uint64_t last_offset_{0};
  uint64_t readahead_size_{0};
  uint64_t readahead_limit_{0};
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_file_size_collector.h"
#include "base_db_listener.h"
namespace rocksdb {
namespace titandb {
// Creates a fresh collector per table build; caller takes ownership.
TablePropertiesCollector*
BlobFileSizeCollectorFactory::CreateTablePropertiesCollector(
    rocksdb::TablePropertiesCollectorFactory::Context /* context */) {
  return new BlobFileSizeCollector();
}
// Key under which the encoded per-blob-file sizes are stored in the
// table's user-collected properties.
const std::string BlobFileSizeCollector::kPropertiesName =
    "TitanDB.blob_discardable_size";
// Serializes the map as: varint32 count, then one (varint64 file number,
// varint64 size) pair per entry. Always succeeds.
bool BlobFileSizeCollector::Encode(
    const std::map<uint64_t, uint64_t>& blob_files_size, std::string* result) {
  PutVarint32(result, static_cast<uint32_t>(blob_files_size.size()));
  for (const auto& entry : blob_files_size) {
    PutVarint64(result, entry.first);
    PutVarint64(result, entry.second);
  }
  return true;
}
// Parses the format produced by Encode(); returns false on truncated input.
bool BlobFileSizeCollector::Decode(
    Slice* slice, std::map<uint64_t, uint64_t>* blob_files_size) {
  uint32_t num = 0;
  if (!GetVarint32(slice, &num)) {
    return false;
  }
  for (uint32_t i = 0; i < num; ++i) {
    uint64_t file_number = 0;
    uint64_t size = 0;
    if (!GetVarint64(slice, &file_number) || !GetVarint64(slice, &size)) {
      return false;
    }
    (*blob_files_size)[file_number] = size;
  }
  return true;
}
// Accumulates, per blob file, the total size of blob records referenced by
// the table being built. Non-blob-index entries are ignored.
Status BlobFileSizeCollector::AddUserKey(const Slice& /* key */,
                                         const Slice& value, EntryType type,
                                         SequenceNumber /* seq */,
                                         uint64_t /* file_size */) {
  if (type != kEntryBlobIndex) {
    return Status::OK();
  }
  BlobIndex index;
  // Decode from a local copy: DecodeFrom() advances the slice it is given,
  // and the previous const_cast of "value" mutated the caller's slice.
  Slice input = value;
  auto s = index.DecodeFrom(&input);
  if (!s.ok()) {
    return s;
  }
  // operator[] value-initializes a missing entry to 0, so a single +=
  // covers both the first and subsequent records of a file.
  blob_files_size_[index.file_number] += index.blob_handle.size;
  return Status::OK();
}
// Emits the accumulated sizes under kPropertiesName; writes nothing when
// the table contains no blob index entries.
Status BlobFileSizeCollector::Finish(UserCollectedProperties* properties) {
  if (blob_files_size_.empty()) {
    return Status::OK();
  }
  std::string encoded;
  if (!Encode(blob_files_size_, &encoded) || encoded.empty()) {
    fprintf(stderr, "blob file size collector encode failed\n");
    abort();
  }
  properties->emplace(kPropertiesName, encoded);
  return Status::OK();
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "rocksdb/listener.h"
#include "rocksdb/table_properties.h"
#include "util/coding.h"
#include "utilities/titandb/db_impl.h"
#include "utilities/titandb/version_set.h"
namespace rocksdb {
namespace titandb {
// Factory installed on a column family so every built table gets a
// BlobFileSizeCollector.
class BlobFileSizeCollectorFactory final
    : public TablePropertiesCollectorFactory {
 public:
  TablePropertiesCollector* CreateTablePropertiesCollector(
      TablePropertiesCollectorFactory::Context context) override;
  const char* Name() const override { return "BlobFileSizeCollector"; }
};
// Collects, per blob file, the total size of blob records referenced by a
// table, and stores the encoded map under kPropertiesName.
class BlobFileSizeCollector final : public TablePropertiesCollector {
 public:
  const static std::string kPropertiesName;
  // Codec for the property value: varint32 count + (file, size) varint pairs.
  static bool Encode(const std::map<uint64_t, uint64_t>& blob_files_size,
                     std::string* result);
  static bool Decode(Slice* slice,
                     std::map<uint64_t, uint64_t>* blob_files_size);
  Status AddUserKey(const Slice& key, const Slice& value, EntryType type,
                    SequenceNumber seq, uint64_t file_size) override;
  Status Finish(UserCollectedProperties* properties) override;
  UserCollectedProperties GetReadableProperties() const override {
    return UserCollectedProperties();
  }
  const char* Name() const override { return "BlobFileSizeCollector"; }
 private:
  // file number -> accumulated referenced blob bytes.
  std::map<uint64_t, uint64_t> blob_files_size_;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_file_size_collector.h"
#include "util/testharness.h"
namespace rocksdb {
namespace titandb {
// Test fixture: builds a real block-based table in a temp dir with the
// BlobFileSizeCollector installed, then reads its properties back.
class BlobFileSizeCollectorTest : public testing::Test {
 public:
  Env* env_{Env::Default()};
  EnvOptions env_options_;
  Options options_;
  TitanDBOptions db_options_;
  TitanCFOptions cf_options_;
  MutableCFOptions cf_moptions_;
  ImmutableCFOptions cf_ioptions_;
  std::unique_ptr<TableFactory> table_factory_;
  std::vector<std::unique_ptr<IntTblPropCollectorFactory>> collectors_;
  std::string tmpdir_;
  std::string file_name_;
  BlobFileSizeCollectorTest()
      : cf_moptions_(cf_options_),
        cf_ioptions_(options_),
        table_factory_(NewBlockBasedTableFactory()),
        tmpdir_(test::TmpDir(env_)),
        file_name_(tmpdir_ + "/TEST") {
    db_options_.dirname = tmpdir_;
    // Install the collector under test, wrapped for internal-key tables.
    auto blob_file_size_collector_factory =
        std::make_shared<BlobFileSizeCollectorFactory>();
    collectors_.emplace_back(new UserKeyTablePropertiesCollectorFactory(
        blob_file_size_collector_factory));
  }
  ~BlobFileSizeCollectorTest() {
    // Best-effort cleanup of the temp table file and directory.
    env_->DeleteFile(file_name_);
    env_->DeleteDir(tmpdir_);
  }
  void NewFileWriter(std::unique_ptr<WritableFileWriter>* result) {
    std::unique_ptr<WritableFile> writable_file;
    ASSERT_OK(env_->NewWritableFile(file_name_, &writable_file, env_options_));
    result->reset(
        new WritableFileWriter(std::move(writable_file), file_name_, env_options_));
    ASSERT_TRUE(*result);
  }
  void NewTableBuilder(WritableFileWriter* file,
                       std::unique_ptr<TableBuilder>* result) {
    TableBuilderOptions options(cf_ioptions_, cf_moptions_,
                                cf_ioptions_.internal_comparator, &collectors_,
                                kNoCompression, CompressionOptions(), nullptr,
                                false, kDefaultColumnFamilyName, 0);
    result->reset(table_factory_->NewTableBuilder(options, 0, file));
    ASSERT_TRUE(*result);
  }
  void NewFileReader(std::unique_ptr<RandomAccessFileReader>* result) {
    std::unique_ptr<RandomAccessFile> file;
    ASSERT_OK(env_->NewRandomAccessFile(file_name_, &file, env_options_));
    result->reset(
        new RandomAccessFileReader(std::move(file), file_name_, env_));
    ASSERT_TRUE(*result);
  }
  void NewTableReader(std::unique_ptr<RandomAccessFileReader>&& file,
                      std::unique_ptr<TableReader>* result) {
    TableReaderOptions options(cf_ioptions_, nullptr, env_options_,
                               cf_ioptions_.internal_comparator);
    uint64_t file_size = 0;
    ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size));
    ASSERT_TRUE(file_size > 0);
    ASSERT_OK(table_factory_->NewTableReader(options, std::move(file),
                                             file_size, result));
    ASSERT_TRUE(*result);
  }
};
// Writes 100 blob-index entries split evenly across blob files 0 and 1
// (10 bytes each), then checks the collected property decodes to
// {0: 500, 1: 500}.
TEST_F(BlobFileSizeCollectorTest, Basic) {
  std::unique_ptr<WritableFileWriter> wfile;
  NewFileWriter(&wfile);
  std::unique_ptr<TableBuilder> table_builder;
  NewTableBuilder(wfile.get(), &table_builder);
  const int kNumEntries = 100;
  char buf[16];
  for (int i = 0; i < kNumEntries; i++) {
    ParsedInternalKey ikey;
    snprintf(buf, sizeof(buf), "%15d", i);
    ikey.user_key = buf;
    ikey.type = kTypeBlobIndex;
    std::string key;
    AppendInternalKey(&key, ikey);
    BlobIndex index;
    // Alternate entries between blob file 0 and blob file 1.
    if (i % 2 == 0) {
      index.file_number = 0ULL;
    } else {
      index.file_number = 1ULL;
    }
    index.blob_handle.size = 10;
    std::string value;
    index.EncodeTo(&value);
    table_builder->Add(key, value);
  }
  ASSERT_OK(table_builder->status());
  ASSERT_EQ(kNumEntries, table_builder->NumEntries());
  ASSERT_OK(table_builder->Finish());
  ASSERT_OK(wfile->Flush());
  ASSERT_OK(wfile->Sync(true));
  // Reopen the table and decode the collected property.
  std::unique_ptr<RandomAccessFileReader> rfile;
  NewFileReader(&rfile);
  std::unique_ptr<TableReader> table_reader;
  NewTableReader(std::move(rfile), &table_reader);
  auto table_properties = table_reader->GetTableProperties();
  ASSERT_TRUE(table_properties);
  auto iter = table_properties->user_collected_properties.find(
      BlobFileSizeCollector::kPropertiesName);
  ASSERT_TRUE(iter != table_properties->user_collected_properties.end());
  Slice raw_blob_file_size_prop(iter->second);
  std::map<uint64_t, uint64_t> result;
  BlobFileSizeCollector::Decode(&raw_blob_file_size_prop, &result);
  ASSERT_EQ(2, result.size());
  ASSERT_EQ(kNumEntries / 2 * 10, result[0]);
  ASSERT_EQ(kNumEntries / 2 * 10, result[1]);
}
} // namespace titandb
} // namespace rocksdb
// gtest entry point.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#include "util/filename.h"
#include "util/testharness.h"
#include "utilities/titandb/blob_file_builder.h"
#include "utilities/titandb/blob_file_cache.h"
#include "utilities/titandb/blob_file_reader.h"
namespace rocksdb {
namespace titandb {
// Fixture that builds a real blob file in a temp dir and exercises
// BlobFileReader / BlobFilePrefetcher / BlobFileCache against it.
class BlobFileTest : public testing::Test {
 public:
  BlobFileTest() : dirname_(test::TmpDir(env_)) {
    file_name_ = BlobFileName(dirname_, file_number_);
  }
  ~BlobFileTest() {
    // Best-effort cleanup of the blob file and temp directory.
    env_->DeleteFile(file_name_);
    env_->DeleteDir(dirname_);
  }
  // Builds a 100-record blob file, then reads every record twice through
  // the cache and twice through a prefetcher, checking round-trip equality.
  void TestBlobFilePrefetcher(TitanOptions options) {
    options.dirname = dirname_;
    TitanDBOptions db_options(options);
    TitanCFOptions cf_options(options);
    BlobFileCache cache(db_options, cf_options, {NewLRUCache(128)});
    const int n = 100;
    std::vector<BlobHandle> handles(n);
    std::unique_ptr<WritableFileWriter> file;
    {
      std::unique_ptr<WritableFile> f;
      ASSERT_OK(env_->NewWritableFile(file_name_, &f, env_options_));
      file.reset(new WritableFileWriter(std::move(f), file_name_, env_options_));
    }
    std::unique_ptr<BlobFileBuilder> builder(
        new BlobFileBuilder(cf_options, file.get()));
    for (int i = 0; i < n; i++) {
      auto key = std::to_string(i);
      // 1KB value filled with the record index as a byte.
      auto value = std::string(1024, i);
      BlobRecord record;
      record.key = key;
      record.value = value;
      builder->Add(record, &handles[i]);
      ASSERT_OK(builder->status());
    }
    ASSERT_OK(builder->Finish());
    ASSERT_OK(builder->status());
    uint64_t file_size = 0;
    ASSERT_OK(env_->GetFileSize(file_name_, &file_size));
    ReadOptions ro;
    std::unique_ptr<BlobFilePrefetcher> prefetcher;
    ASSERT_OK(cache.NewPrefetcher(file_number_, file_size, &prefetcher));
    for (int i = 0; i < n; i++) {
      auto key = std::to_string(i);
      auto value = std::string(1024, i);
      BlobRecord expect;
      expect.key = key;
      expect.value = value;
      BlobRecord record;
      PinnableSlice buffer;
      // Read each record twice per path to also exercise the cached case.
      ASSERT_OK(
          cache.Get(ro, file_number_, file_size, handles[i], &record, &buffer));
      ASSERT_EQ(record, expect);
      buffer.Reset();
      ASSERT_OK(
          cache.Get(ro, file_number_, file_size, handles[i], &record, &buffer));
      ASSERT_EQ(record, expect);
      buffer.Reset();
      ASSERT_OK(prefetcher->Get(ro, handles[i], &record, &buffer));
      ASSERT_EQ(record, expect);
      buffer.Reset();
      ASSERT_OK(prefetcher->Get(ro, handles[i], &record, &buffer));
      ASSERT_EQ(record, expect);
    }
  }
  // Same as above, but reads through a directly opened BlobFileReader
  // instead of a prefetcher.
  void TestBlobFileReader(TitanOptions options) {
    options.dirname = dirname_;
    TitanDBOptions db_options(options);
    TitanCFOptions cf_options(options);
    BlobFileCache cache(db_options, cf_options, {NewLRUCache(128)});
    const int n = 100;
    std::vector<BlobHandle> handles(n);
    std::unique_ptr<WritableFileWriter> file;
    {
      std::unique_ptr<WritableFile> f;
      ASSERT_OK(env_->NewWritableFile(file_name_, &f, env_options_));
      file.reset(new WritableFileWriter(std::move(f), file_name_, env_options_));
    }
    std::unique_ptr<BlobFileBuilder> builder(
        new BlobFileBuilder(cf_options, file.get()));
    for (int i = 0; i < n; i++) {
      auto key = std::to_string(i);
      auto value = std::string(1024, i);
      BlobRecord record;
      record.key = key;
      record.value = value;
      builder->Add(record, &handles[i]);
      ASSERT_OK(builder->status());
    }
    ASSERT_OK(builder->Finish());
    ASSERT_OK(builder->status());
    uint64_t file_size = 0;
    ASSERT_OK(env_->GetFileSize(file_name_, &file_size));
    ReadOptions ro;
    std::unique_ptr<RandomAccessFileReader> random_access_file_reader;
    ASSERT_OK(NewBlobFileReader(file_number_, 0, db_options, env_options_, env_,
                                &random_access_file_reader));
    std::unique_ptr<BlobFileReader> blob_file_reader;
    ASSERT_OK(BlobFileReader::Open(cf_options,
                                   std::move(random_access_file_reader),
                                   file_size, &blob_file_reader));
    for (int i = 0; i < n; i++) {
      auto key = std::to_string(i);
      auto value = std::string(1024, i);
      BlobRecord expect;
      expect.key = key;
      expect.value = value;
      BlobRecord record;
      PinnableSlice buffer;
      ASSERT_OK(
          cache.Get(ro, file_number_, file_size, handles[i], &record, &buffer));
      ASSERT_EQ(record, expect);
      buffer.Reset();
      ASSERT_OK(
          cache.Get(ro, file_number_, file_size, handles[i], &record, &buffer));
      ASSERT_EQ(record, expect);
      buffer.Reset();
      ASSERT_OK(blob_file_reader->Get(ro, handles[i], &record, &buffer));
      ASSERT_EQ(record, expect);
      buffer.Reset();
      ASSERT_OK(blob_file_reader->Get(ro, handles[i], &record, &buffer));
      ASSERT_EQ(record, expect);
    }
  }
  Env* env_{Env::Default()};
  EnvOptions env_options_;
  std::string dirname_;
  std::string file_name_;
  uint64_t file_number_{1};
};
// Exercises BlobFileReader with and without blob compression.
TEST_F(BlobFileTest, BlobFileReader) {
  TitanOptions options;
  TestBlobFileReader(options);
  options.blob_file_compression = kLZ4Compression;
  TestBlobFileReader(options);
}
// Exercises BlobFilePrefetcher without a cache, with a blob cache, and
// with compression enabled.
TEST_F(BlobFileTest, BlobFilePrefetcher) {
  TitanOptions options;
  TestBlobFilePrefetcher(options);
  options.blob_cache = NewLRUCache(1 << 20);
  TestBlobFilePrefetcher(options);
  options.blob_file_compression = kLZ4Compression;
  TestBlobFilePrefetcher(options);
}
} // namespace titandb
} // namespace rocksdb
// gtest entry point.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#include "utilities/titandb/blob_format.h"
#include "util/crc32c.h"
#include "util/sync_point.h"
namespace rocksdb {
namespace titandb {
namespace {
// Consumes one byte from the front of "src" into "value".
// Returns false (leaving "src" untouched) when "src" is empty.
bool GetChar(Slice* src, unsigned char* value) {
  if (src->empty()) return false;
  *value = static_cast<unsigned char>(*src->data());
  src->remove_prefix(1);
  return true;
}
}  // namespace
// Record layout: length-prefixed key, then length-prefixed value.
void BlobRecord::EncodeTo(std::string* dst) const {
  PutLengthPrefixedSlice(dst, key);
  PutLengthPrefixedSlice(dst, value);
}
// Parses the layout written by EncodeTo(): length-prefixed key, then value.
Status BlobRecord::DecodeFrom(Slice* src) {
  const bool ok =
      GetLengthPrefixedSlice(src, &key) && GetLengthPrefixedSlice(src, &value);
  return ok ? Status::OK() : Status::Corruption("BlobRecord");
}
// Records compare equal when both key and value match byte-for-byte.
bool operator==(const BlobRecord& lhs, const BlobRecord& rhs) {
  return lhs.key == rhs.key && lhs.value == rhs.value;
}
// Serializes "record" (compressing if the context allows) and fills the
// 9-byte header: crc (fixed32) | record size (fixed32) | compression (char).
void BlobEncoder::EncodeRecord(const BlobRecord& record) {
  record_buffer_.clear();
  compressed_buffer_.clear();
  // "compression" is filled in by Compress() — presumably the type actually
  // used, which may differ from the one requested.
  CompressionType compression;
  record.EncodeTo(&record_buffer_);
  record_ = Compress(compression_ctx_, record_buffer_, &compressed_buffer_,
                     &compression);
  EXPECT(record_.size() < std::numeric_limits<uint32_t>::max());
  EncodeFixed32(header_ + 4, static_cast<uint32_t>(record_.size()));
  header_[8] = compression;
  // CRC covers the header bytes after the crc field plus the payload.
  uint32_t crc = crc32c::Value(header_ + 4, sizeof(header_) - 4);
  crc = crc32c::Extend(crc, record_.data(), record_.size());
  EncodeFixed32(header_, crc);
}
// Parses the 9-byte header written by BlobEncoder::EncodeRecord().
Status BlobDecoder::DecodeHeader(Slice* src) {
  if (!GetFixed32(src, &crc_)) {
    return Status::Corruption("BlobHeader");
  }
  // Checksum the remaining header bytes *before* consuming them; the
  // record payload is folded in later by DecodeRecord().
  header_crc_ = crc32c::Value(src->data(), kBlobHeaderSize - 4);
  unsigned char compression;
  if (!GetFixed32(src, &record_size_) || !GetChar(src, &compression)) {
    return Status::Corruption("BlobHeader");
  }
  compression_ = static_cast<CompressionType>(compression);
  return Status::OK();
}
// Consumes the payload announced by DecodeHeader(), verifies its checksum,
// and decodes the record (decompressing into "buffer" if needed).
Status BlobDecoder::DecodeRecord(Slice* src, BlobRecord* record,
                                 OwnedSlice* buffer) {
  TEST_SYNC_POINT_CALLBACK("BlobDecoder::DecodeRecord", &crc_);
  Slice input(src->data(), record_size_);
  src->remove_prefix(record_size_);
  // Extend the header-tail crc with the payload and compare to the stored crc.
  uint32_t crc = crc32c::Extend(header_crc_, input.data(), input.size());
  if (crc != crc_) {
    return Status::Corruption("BlobRecord", "checksum mismatch");
  }
  if (compression_ == kNoCompression) {
    return DecodeInto(input, record);
  }
  UncompressionContext ctx(compression_);
  TRY(Uncompress(ctx, input, buffer));
  return DecodeInto(*buffer, record);
}
// Handle layout: varint64 offset, then varint64 size.
void BlobHandle::EncodeTo(std::string* dst) const {
  PutVarint64(dst, offset);
  PutVarint64(dst, size);
}
// Parses the layout written by EncodeTo(): varint64 offset, varint64 size.
Status BlobHandle::DecodeFrom(Slice* src) {
  const bool ok = GetVarint64(src, &offset) && GetVarint64(src, &size);
  return ok ? Status::OK() : Status::Corruption("BlobHandle");
}
// Handles compare equal on (offset, size).
bool operator==(const BlobHandle& lhs, const BlobHandle& rhs) {
  return lhs.offset == rhs.offset && lhs.size == rhs.size;
}
// Index layout: kBlobRecord type tag (1 byte), varint64 file number,
// then the blob handle.
void BlobIndex::EncodeTo(std::string* dst) const {
  dst->push_back(kBlobRecord);
  PutVarint64(dst, file_number);
  blob_handle.EncodeTo(dst);
}
// Parses the layout written by EncodeTo(): type tag | file number | handle.
Status BlobIndex::DecodeFrom(Slice* src) {
  unsigned char type = 0;
  if (!GetChar(src, &type) || type != kBlobRecord ||
      !GetVarint64(src, &file_number)) {
    return Status::Corruption("BlobIndex");
  }
  Status s = blob_handle.DecodeFrom(src);
  // Wrap a handle failure so the error names the enclosing structure.
  return s.ok() ? s : Status::Corruption("BlobIndex", s.ToString());
}
// Indexes compare equal on (file_number, blob_handle).
bool operator==(const BlobIndex& lhs, const BlobIndex& rhs) {
  return (lhs.file_number == rhs.file_number &&
          lhs.blob_handle == rhs.blob_handle);
}
// Persists only the durable fields: varint64 file number, varint64 size.
void BlobFileMeta::EncodeTo(std::string* dst) const {
  PutVarint64(dst, file_number_);
  PutVarint64(dst, file_size_);
}
// Parses the layout written by EncodeTo().
Status BlobFileMeta::DecodeFrom(Slice* src) {
  if (!GetVarint64(src, &file_number_) || !GetVarint64(src, &file_size_)) {
    return Status::Corruption("BlobFileMeta Decode failed");
  }
  return Status::OK();
}
// Metas compare equal on the persistent fields only (number and size);
// runtime state is intentionally excluded.
bool operator==(const BlobFileMeta& lhs, const BlobFileMeta& rhs) {
  return (lhs.file_number_ == rhs.file_number_ &&
          lhs.file_size_ == rhs.file_size_);
}
// Drives the blob file state machine. Asserts document the states each
// event may legally arrive in; an unknown event aborts the process.
void BlobFileMeta::FileStateTransit(const FileEvent& event) {
  switch (event) {
    case FileEvent::kFlushCompleted:
      // blob file maybe generated by flush or gc, because gc will rewrite valid
      // keys to memtable. If it's generated by gc, we will leave gc to change
      // its file state. If it's generated by flush, we need to change it to
      // normal state after flush completed.
      assert(state_ == FileState::kPendingLSM ||
             state_ == FileState::kPendingGC || state_ == FileState::kNormal ||
             state_ == FileState::kBeingGC);
      if (state_ == FileState::kPendingLSM) state_ = FileState::kNormal;
      break;
    case FileEvent::kGCCompleted:
      // file is marked obsoleted during gc
      if (state_ == FileState::kObsolete) {
        break;
      }
      assert(state_ == FileState::kPendingGC || state_ == FileState::kBeingGC);
      state_ = FileState::kNormal;
      break;
    case FileEvent::kCompactionCompleted:
      assert(state_ == FileState::kPendingLSM);
      state_ = FileState::kNormal;
      break;
    case FileEvent::kGCBegin:
      assert(state_ == FileState::kNormal);
      state_ = FileState::kBeingGC;
      break;
    case FileEvent::kGCOutput:
      assert(state_ == FileState::kInit);
      state_ = FileState::kPendingGC;
      break;
    case FileEvent::kFlushOrCompactionOutput:
      assert(state_ == FileState::kInit);
      state_ = FileState::kPendingLSM;
      break;
    case FileEvent::kDbRestart:
      assert(state_ == FileState::kInit);
      state_ = FileState::kNormal;
      break;
    case FileEvent::kDelete:
      assert(state_ != FileState::kObsolete);
      state_ = FileState::kObsolete;
      break;
    default:
      // %zu (not %lu) is the portable format for the size_t-cast file number;
      // also terminate the diagnostic line before aborting.
      fprintf(stderr,
              "Unknown file event[%d], file number[%zu], file state[%d]\n",
              static_cast<int>(event), static_cast<std::size_t>(file_number_),
              static_cast<int>(state_));
      abort();
  }
}
// Accumulates bytes of blob records that are no longer referenced.
// NOTE(review): asserts use strict '<' rather than '<=' — presumably the
// header/footer overhead guarantees discardable bytes never reach the full
// file size; confirm before relying on it.
void BlobFileMeta::AddDiscardableSize(uint64_t _discardable_size) {
  assert(_discardable_size < file_size_);
  discardable_size_ += _discardable_size;
  assert(discardable_size_ < file_size_);
}
// Returns the fraction of this file's bytes that are discardable, in [0, 1).
double BlobFileMeta::GetDiscardableRatio() const {
  // Guard against division by zero for a (degenerate) zero-sized file;
  // the previous code returned NaN/inf in that case.
  if (file_size_ == 0) {
    return 0;
  }
  return static_cast<double>(discardable_size_) /
         static_cast<double>(file_size_);
}
// Header layout: fixed32 magic number, then fixed32 format version.
void BlobFileHeader::EncodeTo(std::string* dst) const {
  PutFixed32(dst, kHeaderMagicNumber);
  PutFixed32(dst, version);
}
// Parses and validates the layout written by EncodeTo(): magic then version.
Status BlobFileHeader::DecodeFrom(Slice* src) {
  uint32_t magic_number = 0;
  const bool magic_ok =
      GetFixed32(src, &magic_number) && magic_number == kHeaderMagicNumber;
  if (!magic_ok) {
    return Status::Corruption(
        "Blob file header magic number missing or mismatched.");
  }
  const bool version_ok = GetFixed32(src, &version) && version == kVersion1;
  if (!version_ok) {
    return Status::Corruption("Blob file header version missing or invalid.");
  }
  return Status::OK();
}
// Appends a fixed-size footer: meta index handle, zero padding, fixed64
// magic number, then a fixed32 crc over everything appended before it.
void BlobFileFooter::EncodeTo(std::string* dst) const {
  auto size = dst->size();
  meta_index_handle.EncodeTo(dst);
  // Add padding to make a fixed size footer.
  // (12 = fixed64 magic + fixed32 checksum appended below.)
  dst->resize(size + kEncodedLength - 12);
  PutFixed64(dst, kFooterMagicNumber);
  Slice encoded(dst->data() + size, dst->size() - size);
  PutFixed32(dst, crc32c::Value(encoded.data(), encoded.size()));
}
// Parses the fixed-size footer written by EncodeTo(), verifying the magic
// number and the checksum over handle + padding + magic.
Status BlobFileFooter::DecodeFrom(Slice* src) {
  auto data = src->data();
  Status s = meta_index_handle.DecodeFrom(src);
  if (!s.ok()) {
    return Status::Corruption("BlobFileFooter", s.ToString());
  }
  // Remove padding: skip ahead so exactly magic + checksum (12 bytes) remain.
  src->remove_prefix(data + kEncodedLength - 12 - src->data());
  uint64_t magic_number = 0;
  if (!GetFixed64(src, &magic_number) || magic_number != kFooterMagicNumber) {
    return Status::Corruption("BlobFileFooter", "magic number");
  }
  // The crc covers everything consumed so far (handle + padding + magic).
  Slice decoded(data, src->data() - data);
  uint32_t checksum = 0;
  if (!GetFixed32(src, &checksum) ||
      crc32c::Value(decoded.data(), decoded.size()) != checksum) {
    return Status::Corruption("BlobFileFooter", "checksum");
  }
  return Status::OK();
}
// Footers compare equal on the meta index handle (offset and size).
bool operator==(const BlobFileFooter& lhs, const BlobFileFooter& rhs) {
  return (lhs.meta_index_handle.offset() == rhs.meta_index_handle.offset() &&
          lhs.meta_index_handle.size() == rhs.meta_index_handle.size());
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "table/format.h"
#include "utilities/titandb/util.h"
namespace rocksdb {
namespace titandb {
// Blob header format:
//
// crc : fixed32
// size : fixed32
// compression : char
const uint64_t kBlobHeaderSize = 9;
// Blob record format:
//
// key : varint64 length + length bytes
// value : varint64 length + length bytes
// One key/value pair stored in a blob file. The slices are views — they do
// not own their bytes; the backing buffer must outlive the record.
struct BlobRecord {
  Slice key;
  Slice value;
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* src);
  friend bool operator==(const BlobRecord& lhs, const BlobRecord& rhs);
};
// Serializes BlobRecords into header + (possibly compressed) payload.
// Reusable: each EncodeRecord() call overwrites the previous state.
class BlobEncoder {
 public:
  BlobEncoder(CompressionType compression) : compression_ctx_(compression) {}
  void EncodeRecord(const BlobRecord& record);
  Slice GetHeader() const { return Slice(header_, sizeof(header_)); }
  Slice GetRecord() const { return record_; }
  size_t GetEncodedSize() const { return sizeof(header_) + record_.size(); }
 private:
  // crc (fixed32) | size (fixed32) | compression (char) — see kBlobHeaderSize.
  char header_[kBlobHeaderSize];
  // View of the encoded payload (into one of the buffers below).
  Slice record_;
  std::string record_buffer_;
  std::string compressed_buffer_;
  CompressionContext compression_ctx_;
};
// Counterpart to BlobEncoder: call DecodeHeader() first, then DecodeRecord()
// on the same stream.
class BlobDecoder {
 public:
  Status DecodeHeader(Slice* src);
  Status DecodeRecord(Slice* src, BlobRecord* record, OwnedSlice* buffer);
  size_t GetRecordSize() const { return record_size_; }
 private:
  uint32_t crc_{0};         // crc stored in the header
  uint32_t header_crc_{0};  // crc of the header tail, payload added later
  uint32_t record_size_{0};
  CompressionType compression_{kNoCompression};
};
// Blob handle format:
//
// offset : varint64
// size : varint64
// Locates one record inside a blob file: byte offset + encoded length.
struct BlobHandle {
  uint64_t offset{0};
  uint64_t size{0};
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* src);
  friend bool operator==(const BlobHandle& lhs, const BlobHandle& rhs);
};
// Blob index format:
//
// type : char
// file_number_ : varint64
// blob_handle : varint64 offset + varint64 size
// Value stored in the LSM for a blob-index entry: which blob file holds the
// record and where. A leading type tag allows future formats.
struct BlobIndex {
  enum Type : unsigned char {
    kBlobRecord = 1,
  };
  uint64_t file_number{0};
  BlobHandle blob_handle;
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* src);
  friend bool operator==(const BlobIndex& lhs, const BlobIndex& rhs);
};
// Blob file meta format:
//
// file_number_ : varint64
// file_size_ : varint64
// Metadata for one blob file. Only file number and size are persisted;
// the state machine below exists purely in memory.
class BlobFileMeta {
 public:
  // Events that drive the state machine (see FileStateTransit).
  enum class FileEvent {
    kInit,
    kFlushCompleted,
    kCompactionCompleted,
    kGCCompleted,
    kGCBegin,
    kGCOutput,
    kFlushOrCompactionOutput,
    kDbRestart,
    kDelete,
  };
  enum class FileState {
    kInit,  // file never at this state
    kNormal,
    kPendingLSM,  // waiting keys adding to LSM
    kBeingGC,     // being gced
    kPendingGC,   // output of gc, waiting gc finish and keys adding to LSM
    kObsolete,    // already gced, but wait to be physical deleted
  };
  BlobFileMeta() = default;
  BlobFileMeta(uint64_t _file_number, uint64_t _file_size)
      : file_number_(_file_number), file_size_(_file_size) {}
  friend bool operator==(const BlobFileMeta& lhs, const BlobFileMeta& rhs);
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* src);
  uint64_t file_number() const { return file_number_; }
  uint64_t file_size() const { return file_size_; }
  FileState file_state() const { return state_; }
  bool is_obsolete() const { return state_ == FileState::kObsolete; }
  uint64_t discardable_size() const { return discardable_size_; }
  void FileStateTransit(const FileEvent& event);
  void AddDiscardableSize(uint64_t _discardable_size);
  double GetDiscardableRatio() const;
 private:
  // Persistent field
  uint64_t file_number_{0};
  uint64_t file_size_{0};
  // Not persistent field
  FileState state_{FileState::kInit};
  // Bytes of records in this file no longer referenced by the LSM.
  uint64_t discardable_size_{0};
  // bool marked_for_gc_{false};
};
// Blob file header format.
// The header is mean to be compatible with header of BlobDB blob files, except
// we use a different magic number.
//
// magic_number : fixed32
// version : fixed32
// Fixed 8-byte file header: magic number + format version.
struct BlobFileHeader {
  // The first 32bits from $(echo titandb/blob | sha1sum).
  static const uint32_t kHeaderMagicNumber = 0x2be0a614ul;
  static const uint32_t kVersion1 = 1;
  static const uint64_t kEncodedLength = 4 + 4;
  uint32_t version = kVersion1;
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* src);
};
// Blob file footer format:
//
// meta_index_handle : varint64 offset + varint64 size
// <padding> : [... kEncodedLength - 12] bytes
// magic_number : fixed64
// checksum : fixed32
// Fixed-size file footer: meta index handle + padding + magic + checksum.
struct BlobFileFooter {
  // The first 64bits from $(echo titandb/blob | sha1sum).
  static const uint64_t kFooterMagicNumber{0x2be0a6148e39edc6ull};
  // Handle (padded to max varint width) + fixed64 magic + fixed32 crc.
  static const uint64_t kEncodedLength{BlockHandle::kMaxEncodedLength + 8 + 4};
  BlockHandle meta_index_handle{BlockHandle::NullBlockHandle()};
  void EncodeTo(std::string* dst) const;
  Status DecodeFrom(Slice* src);
  friend bool operator==(const BlobFileFooter& lhs, const BlobFileFooter& rhs);
};
// A convenient template to decode a const slice.
// A convenient template to decode a const slice. Fails with Corruption if
// decoding errs or leaves trailing bytes unconsumed.
template <typename T>
Status DecodeInto(const Slice& src, T* target) {
  Slice copy = src;  // work on a copy so "src" stays untouched
  Status s = target->DecodeFrom(&copy);
  if (s.ok() && !copy.empty()) {
    s = Status::Corruption(Slice());
  }
  return s;
}
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_format.h"
#include "util/testharness.h"
#include "utilities/titandb/testutil.h"
#include "utilities/titandb/util.h"
namespace rocksdb {
namespace titandb {
class BlobFormatTest : public testing::Test {};
// Round-trips an empty and a populated BlobRecord through its codec.
TEST(BlobFormatTest, BlobRecord) {
  BlobRecord input;
  CheckCodec(input);
  input.key = "hello";
  input.value = "world";
  CheckCodec(input);
}
// Round-trips a default and a populated BlobHandle through its codec.
TEST(BlobFormatTest, BlobHandle) {
  BlobHandle input;
  CheckCodec(input);
  input.offset = 2;
  input.size = 3;
  CheckCodec(input);
}
// Round-trips a default and a populated BlobIndex through its codec.
TEST(BlobFormatTest, BlobIndex) {
  BlobIndex input;
  CheckCodec(input);
  input.file_number = 1;
  input.blob_handle.offset = 2;
  input.blob_handle.size = 3;
  CheckCodec(input);
}
// Round-trips the persistent fields of BlobFileMeta through its codec.
TEST(BlobFormatTest, BlobFileMeta) {
  BlobFileMeta input(2, 3);
  CheckCodec(input);
}
// Round-trips a default and a populated footer through its codec.
TEST(BlobFormatTest, BlobFileFooter) {
  BlobFileFooter input;
  CheckCodec(input);
  input.meta_index_handle.set_offset(123);
  input.meta_index_handle.set_size(321);
  CheckCodec(input);
}
// Walks the legal state transitions: restart path (kInit -> kNormal ->
// kBeingGC -> completed) and the flush/compaction output path
// (kInit -> kPendingLSM -> kNormal).
TEST(BlobFormatTest, BlobFileStateTransit) {
  BlobFileMeta blob_file;
  ASSERT_EQ(blob_file.file_state(), BlobFileMeta::FileState::kInit);
  blob_file.FileStateTransit(BlobFileMeta::FileEvent::kDbRestart);
  ASSERT_EQ(blob_file.file_state(), BlobFileMeta::FileState::kNormal);
  blob_file.FileStateTransit(BlobFileMeta::FileEvent::kGCBegin);
  ASSERT_EQ(blob_file.file_state(), BlobFileMeta::FileState::kBeingGC);
  blob_file.FileStateTransit(BlobFileMeta::FileEvent::kGCCompleted);
  BlobFileMeta compaction_output;
  ASSERT_EQ(compaction_output.file_state(), BlobFileMeta::FileState::kInit);
  compaction_output.FileStateTransit(
      BlobFileMeta::FileEvent::kFlushOrCompactionOutput);
  ASSERT_EQ(compaction_output.file_state(),
            BlobFileMeta::FileState::kPendingLSM);
  compaction_output.FileStateTransit(
      BlobFileMeta::FileEvent::kCompactionCompleted);
  ASSERT_EQ(compaction_output.file_state(), BlobFileMeta::FileState::kNormal);
}
} // namespace titandb
} // namespace rocksdb
// Standard gtest entry point for the blob format tests.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#include "utilities/titandb/blob_gc.h"
namespace rocksdb {
namespace titandb {
// Takes ownership of the candidate input list and the CF options.
// Immediately flags the inputs as being GC-ed so a concurrent picker
// will not select them again.
BlobGC::BlobGC(std::vector<BlobFileMeta*>&& blob_files,
               TitanCFOptions&& _titan_cf_options)
    : inputs_(std::move(blob_files)),
      titan_cf_options_(std::move(_titan_cf_options)) {
  MarkFilesBeingGC();
}
// Nothing to release explicitly; file metas are owned elsewhere.
BlobGC::~BlobGC() = default;
// Records the column family this GC run operates on.
void BlobGC::SetColumnFamily(ColumnFamilyHandle* cfh) { cfh_ = cfh; }
// Unwraps the stored handle to reach the internal ColumnFamilyData.
// NOTE(review): the reinterpret_cast assumes cfh_ is really a
// ColumnFamilyHandleImpl (matches usage elsewhere in this codebase);
// a static_cast would express the same downcast — confirm and unify.
ColumnFamilyData* BlobGC::GetColumnFamilyData() {
  auto* cfhi = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh_);
  return cfhi->cfd();
}
void BlobGC::AddOutputFile(BlobFileMeta* blob_file) {
blob_file->FileStateTransit(BlobFileMeta::FileEvent::kGCOutput);
outputs_.push_back(blob_file);
}
// Flags every input file as currently undergoing GC.
void BlobGC::MarkFilesBeingGC() {
  for (auto* file : inputs_) {
    file->FileStateTransit(BlobFileMeta::FileEvent::kGCBegin);
  }
}
void BlobGC::ReleaseGcFiles() {
for (auto& f : inputs_) {
f->FileStateTransit(BlobFileMeta::FileEvent::kGCCompleted);
}
for (auto& f : outputs_) {
f->FileStateTransit(BlobFileMeta::FileEvent::kGCCompleted);
}
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include <memory>
#include "db/column_family.h"
#include "utilities/titandb/blob_format.h"
#include "utilities/titandb/options.h"
namespace rocksdb {
namespace titandb {
// A BlobGC encapsulates information about a blob gc: the candidate input
// files, the subset actually selected after sampling, the output files
// produced, and the column family being collected. Non-copyable; owns
// neither the file metas nor the CF handle.
class BlobGC {
 public:
  // Takes ownership of the candidate list; marks inputs as being GC-ed.
  BlobGC(std::vector<BlobFileMeta*>&& blob_files,
         TitanCFOptions&& _titan_cf_options);

  // No copying allowed
  BlobGC(const BlobGC&) = delete;
  void operator=(const BlobGC&) = delete;

  ~BlobGC();

  const std::vector<BlobFileMeta*>& inputs() { return inputs_; }

  // Stores the subset of inputs chosen by sampling.
  void set_sampled_inputs(std::vector<BlobFileMeta*>&& files) {
    sampled_inputs_ = std::move(files);
  }

  const std::vector<BlobFileMeta*>& sampled_inputs() { return sampled_inputs_; }

  const TitanCFOptions& titan_cf_options() { return titan_cf_options_; }

  void SetColumnFamily(ColumnFamilyHandle* cfh);

  ColumnFamilyHandle* column_family_handle() { return cfh_; }

  ColumnFamilyData* GetColumnFamilyData();

  // Transitions all inputs into the being-GC state.
  void MarkFilesBeingGC();

  // Registers a GC output file.
  void AddOutputFile(BlobFileMeta*);

  // Transitions inputs and outputs out of the GC state when done.
  void ReleaseGcFiles();

 private:
  std::vector<BlobFileMeta*> inputs_;
  std::vector<BlobFileMeta*> sampled_inputs_;
  std::vector<BlobFileMeta*> outputs_;
  TitanCFOptions titan_cf_options_;
  ColumnFamilyHandle* cfh_{nullptr};
};
// Per-file GC priority used to rank blob files for collection.
struct GCScore {
  uint64_t file_number;
  double score;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_gc_job.h"
namespace rocksdb {
namespace titandb {
// Write callback for garbage collection to check if key has been updated
// since last read. Similar to how OptimisticTransaction works.
class BlobGCJob::GarbageCollectionWriteCallback : public WriteCallback {
 public:
  // Takes ownership of _key (must be non-empty) and blob_index.
  // Fix: blob_index is an rvalue-ref parameter but was previously
  // copied into blob_index_; move it instead.
  GarbageCollectionWriteCallback(ColumnFamilyHandle* cfh, std::string&& _key,
                                 BlobIndex&& blob_index)
      : cfh_(cfh), key_(std::move(_key)), blob_index_(std::move(blob_index)) {
    assert(!key_.empty());
  }

  // New index entry to write if the callback passes.
  std::string value;

  // Returns OK if key_ still points at blob_index_ in the LSM;
  // Status::Busy if the key was deleted, inlined, or relocated in the
  // meanwhile (the rewrite should be dropped); aborts on unexpected
  // read/decode errors.
  virtual Status Callback(DB* db) override {
    auto* db_impl = reinterpret_cast<DBImpl*>(db);
    PinnableSlice index_entry;
    bool is_blob_index;
    auto s = db_impl->GetImpl(ReadOptions(), cfh_, key_, &index_entry,
                              nullptr /*value_found*/,
                              nullptr /*read_callback*/, &is_blob_index);
    if (!s.ok() && !s.IsNotFound()) {
      fprintf(stderr, "GetImpl err, status:%s\n", s.ToString().c_str());
      abort();
    }
    if (s.IsNotFound()) {
      // Either the key is deleted or updated with a newer version which is
      // inlined in LSM.
      s = Status::Busy("key deleted");
    } else if (!is_blob_index) {
      s = Status::Busy("key overwritten with other value");
    }

    if (s.ok()) {
      BlobIndex other_blob_index;
      s = other_blob_index.DecodeFrom(&index_entry);
      if (!s.ok()) {
        fprintf(stderr, "Decode blob index [%s] failed, err:%s\n",
                index_entry.ToString(true).c_str(), s.ToString().c_str());
        abort();
      }
      if (!(blob_index_ == other_blob_index)) {
        s = Status::Busy("key overwritten with other blob");
      }
    }
    return s;
  }

  // Disable batching so each rewrite is validated individually.
  virtual bool AllowWriteBatching() override { return false; }

  std::string key() { return key_; }

 private:
  ColumnFamilyHandle* cfh_;
  // Key to check
  std::string key_;
  BlobIndex blob_index_;
};
// Wires the job to its environment. All pointers are borrowed: the job
// does not own blob_gc, db, mutex, manager, version set, log buffer, or
// the shutdown flag; callers must keep them alive for the job's lifetime.
BlobGCJob::BlobGCJob(BlobGC* blob_gc, DB* db, port::Mutex* mutex,
                     const TitanDBOptions& titan_db_options, Env* env,
                     const EnvOptions& env_options,
                     BlobFileManager* blob_file_manager,
                     VersionSet* version_set, LogBuffer* log_buffer,
                     std::atomic_bool* shuting_down)
    : blob_gc_(blob_gc),
      base_db_(db),
      base_db_impl_(reinterpret_cast<DBImpl*>(base_db_)),
      mutex_(mutex),
      db_options_(titan_db_options),
      env_(env),
      env_options_(env_options),
      blob_file_manager_(blob_file_manager),
      version_set_(version_set),
      log_buffer_(log_buffer),
      shuting_down_(shuting_down) {}
BlobGCJob::~BlobGCJob() {
  // delete on a null pointer is a no-op, so the previous null check
  // was redundant.
  delete cmp_;
}
// Nothing to prepare yet; reserved for future setup under the mutex.
Status BlobGCJob::Prepare() {
  return Status::OK();
}
// Samples candidate files, logs the candidate/selected sets, then runs the
// actual collection. Logging happens even when sampling fails, so failures
// leave a trace.
Status BlobGCJob::Run() {
  Status s;
  s = SampleCandidateFiles();

  // The two identical join loops were deduplicated into this helper:
  // space-separated list of file numbers for logging.
  auto join_file_numbers = [](const std::vector<BlobFileMeta*>& files) {
    std::string joined;
    for (const auto& f : files) {
      if (!joined.empty()) {
        joined.append(" ");
      }
      joined.append(std::to_string(f->file_number()));
    }
    return joined;
  };
  std::string candidates = join_file_numbers(blob_gc_->inputs());
  std::string selected = join_file_numbers(blob_gc_->sampled_inputs());

  ROCKS_LOG_BUFFER(log_buffer_, "[%s] Titan GC candidates[%s] selected[%s]",
                   blob_gc_->column_family_handle()->GetName().c_str(),
                   candidates.c_str(), selected.c_str());
  log_buffer_->FlushBufferToLog();
  LogFlush(db_options_.info_log.get());

  if (!s.ok()) return s;

  return DoRunGC();
}
// Keeps only candidate files whose sampled garbage ratio justifies GC.
// Aborts the job when nothing qualifies.
Status BlobGCJob::SampleCandidateFiles() {
  std::vector<BlobFileMeta*> selected;
  for (auto* file : blob_gc_->inputs()) {
    if (DoSample(file)) {
      selected.push_back(file);
    }
  }
  if (selected.empty()) {
    return Status::Aborted("No blob file need to be gc");
  }
  blob_gc_->set_sampled_inputs(std::move(selected));
  return Status::OK();
}
bool BlobGCJob::DoSample(const BlobFileMeta* file) {
if (file->GetDiscardableRatio() >=
blob_gc_->titan_cf_options().blob_file_discardable_ratio) {
return true;
}
Status s;
uint64_t sample_size_window = static_cast<uint64_t>(
file->file_size() * blob_gc_->titan_cf_options().sample_file_size_ratio);
Random64 random64(file->file_size());
uint64_t sample_begin_offset =
random64.Uniform(file->file_size() - sample_size_window);
std::unique_ptr<RandomAccessFileReader> file_reader;
const int readahead = 256 << 10;
s = NewBlobFileReader(file->file_number(), readahead, db_options_,
env_options_, env_, &file_reader);
if (!s.ok()) {
fprintf(stderr, "NewBlobFileReader failed, status:%s\n",
s.ToString().c_str());
abort();
}
BlobFileIterator iter(std::move(file_reader), file->file_number(),
file->file_size(), blob_gc_->titan_cf_options());
iter.IterateForPrev(sample_begin_offset);
// TODO(@DorianZheng) sample_begin_offset maybe out of data block size, need
// more elegant solution
if (iter.status().IsInvalidArgument()) {
iter.IterateForPrev(0);
}
if (!iter.status().ok()) {
fprintf(stderr,
"IterateForPrev faile, file number[%lu] size[%lu] status[%s]\n",
static_cast<size_t>(file->file_number()),
static_cast<size_t>(file->file_size()),
iter.status().ToString().c_str());
abort();
}
uint64_t iterated_size{0};
uint64_t discardable_size{0};
for (iter.Next();
iterated_size < sample_size_window && iter.status().ok() && iter.Valid();
iter.Next()) {
BlobIndex blob_index = iter.GetBlobIndex();
uint64_t total_length = blob_index.blob_handle.size;
iterated_size += total_length;
if (DiscardEntry(iter.key(), blob_index)) {
discardable_size += total_length;
}
}
assert(iter.status().ok());
return discardable_size >=
sample_size_window *
blob_gc_->titan_cf_options().blob_file_discardable_ratio;
}
// Core GC loop: merge-iterate over all sampled input files, drop dead
// entries, and rewrite live ones into fresh blob files. For each rewritten
// entry a WriteBatch + callback pair is queued; the batches are applied to
// the LSM later in RewriteValidKeyToLSM(). Statement order here is
// delicate (last_key/last_key_valid dedup, builder rollover) — documented,
// not restructured.
Status BlobGCJob::DoRunGC() {
  Status s;

  std::unique_ptr<BlobFileMergeIterator> gc_iter;
  s = BuildIterator(&gc_iter);
  if (!s.ok()) return s;
  if (!gc_iter) return Status::Aborted("Build iterator for gc failed");

  // Similar to OptimisticTransaction, we obtain latest_seq from
  // base DB, which is guaranteed to be no smaller than the sequence of
  // current key. We use a WriteCallback on write to check the key sequence
  // on write. If the key sequence is larger than latest_seq, we know
  // a new versions is inserted and the old blob can be discard.
  //
  // We cannot use OptimisticTransaction because we need to pass
  // is_blob_index flag to GetImpl.
  std::unique_ptr<BlobFileHandle> blob_file_handle;
  std::unique_ptr<BlobFileBuilder> blob_file_builder;

  auto* cfh = blob_gc_->column_family_handle();

  //  uint64_t drop_entry_num = 0;
  //  uint64_t drop_entry_size = 0;
  //  uint64_t total_entry_num = 0;
  //  uint64_t total_entry_size = 0;

  uint64_t file_size = 0;

  // The merge iterator can yield the same user key from several input
  // files; only the first occurrence is authoritative.
  std::string last_key;
  bool last_key_valid = false;
  gc_iter->SeekToFirst();
  assert(gc_iter->Valid());
  for (; gc_iter->Valid(); gc_iter->Next()) {
    if (IsShutingDown()) {
      s = Status::ShutdownInProgress();
      break;
    }
    BlobIndex blob_index = gc_iter->GetBlobIndex();
    // If a newer copy of this key was already kept, skip duplicates.
    if (!last_key.empty() && !gc_iter->key().compare(last_key)) {
      if (last_key_valid) {
        continue;
      }
    } else {
      last_key = gc_iter->key().ToString();
      last_key_valid = false;
    }

    if (DiscardEntry(gc_iter->key(), blob_index)) {
      continue;
    }

    last_key_valid = true;

    // Rewrite entry to new blob file: roll to a fresh output file on first
    // use or when the current one reaches the target size.
    if ((!blob_file_handle && !blob_file_builder) ||
        file_size >= blob_gc_->titan_cf_options().blob_file_target_size) {
      if (file_size >= blob_gc_->titan_cf_options().blob_file_target_size) {
        assert(blob_file_builder);
        assert(blob_file_handle);
        assert(blob_file_builder->status().ok());
        blob_file_builders_.emplace_back(std::make_pair(
            std::move(blob_file_handle), std::move(blob_file_builder)));
      }
      s = blob_file_manager_->NewFile(&blob_file_handle);
      if (!s.ok()) {
        break;
      }
      blob_file_builder = unique_ptr<BlobFileBuilder>(new BlobFileBuilder(
          blob_gc_->titan_cf_options(), blob_file_handle->GetFile()));
      file_size = 0;
    }
    assert(blob_file_handle);
    assert(blob_file_builder);

    BlobRecord blob_record;
    blob_record.key = gc_iter->key();
    blob_record.value = gc_iter->value();

    //    file_size_ += blob_record.key.size() + blob_record.value.size();

    BlobIndex new_blob_index;
    new_blob_index.file_number = blob_file_handle->GetNumber();
    blob_file_builder->Add(blob_record, &new_blob_index.blob_handle);
    std::string index_entry;
    new_blob_index.EncodeTo(&index_entry);

    // Store WriteBatch for rewriting new Key-Index pairs to LSM
    GarbageCollectionWriteCallback callback(cfh, blob_record.key.ToString(),
                                            std::move(blob_index));
    callback.value = index_entry;
    rewrite_batches_.emplace_back(
        std::make_pair(WriteBatch(), std::move(callback)));
    auto& wb = rewrite_batches_.back().first;
    s = WriteBatchInternal::PutBlobIndex(&wb, cfh->GetID(), blob_record.key,
                                         index_entry);
    if (!s.ok()) {
      break;
    }
  }

  // Flush the trailing (partially filled) output file, or verify there is
  // none when the loop ended without opening one.
  if (gc_iter->status().ok() && s.ok()) {
    if (blob_file_builder && blob_file_handle) {
      assert(blob_file_builder->status().ok());
      blob_file_builders_.emplace_back(std::make_pair(
          std::move(blob_file_handle), std::move(blob_file_builder)));
    } else {
      assert(!blob_file_builder);
      assert(!blob_file_handle);
    }
  } else if (!gc_iter->status().ok()) {
    return gc_iter->status();
  }

  return s;
}
// Opens a reader for every sampled input file and wraps them in a single
// merge iterator. On any reader failure, *result is left untouched.
Status BlobGCJob::BuildIterator(unique_ptr<BlobFileMergeIterator>* result) {
  const auto& inputs = blob_gc_->sampled_inputs();
  assert(!inputs.empty());
  Status s;
  std::vector<std::unique_ptr<BlobFileIterator>> iters;
  for (const auto* input : inputs) {
    std::unique_ptr<RandomAccessFileReader> reader;
    // TODO(@DorianZheng) set read ahead size
    s = NewBlobFileReader(input->file_number(), 0, db_options_, env_options_,
                          env_, &reader);
    if (!s.ok()) {
      break;
    }
    iters.push_back(std::unique_ptr<BlobFileIterator>(
        new BlobFileIterator(std::move(reader), input->file_number(),
                             input->file_size(),
                             blob_gc_->titan_cf_options())));
  }

  if (s.ok()) result->reset(new BlobFileMergeIterator(std::move(iters)));

  return s;
}
// Returns true if the blob addressed by blob_index is no longer the live
// value for `key` (deleted, updated with an inlined value, or relocated to
// another blob), i.e. GC may drop it. Aborts on unexpected read/decode
// errors.
bool BlobGCJob::DiscardEntry(const Slice& key, const BlobIndex& blob_index) {
  PinnableSlice index_entry;
  bool is_blob_index;
  auto s = base_db_impl_->GetImpl(
      ReadOptions(), blob_gc_->column_family_handle(), key, &index_entry,
      nullptr /*value_found*/, nullptr /*read_callback*/, &is_blob_index);
  // Only OK and NotFound are expected outcomes of the lookup.
  if (!s.ok() && !s.IsNotFound()) {
    fprintf(stderr, "GetImpl err, status:%s\n", s.ToString().c_str());
    abort();
  }
  if (s.IsNotFound() || !is_blob_index) {
    // Either the key is deleted or updated with a newer version which is
    // inlined in LSM.
    return true;
  }

  BlobIndex other_blob_index;
  s = other_blob_index.DecodeFrom(&index_entry);
  if (!s.ok()) {
    abort();
  }

  // Live only if the LSM still points exactly at this blob.
  return !(blob_index == other_blob_index);
}
// We have to make sure crash consistency, but LSM db MANIFEST and BLOB db
// MANIFEST are separate, so we need to make sure all new blob file have
// added to db before we rewrite any key to LSM.
// REQUIRES: mutex_ held on entry; it is released during the I/O-heavy
// install/rewrite phase and re-acquired before returning.
Status BlobGCJob::Finish() {
  Status s;
  {
    mutex_->Unlock();
    s = InstallOutputBlobFiles();
    if (s.ok()) s = RewriteValidKeyToLSM();
    mutex_->Lock();
  }

  // TODO(@DorianZheng) cal discardable size for new blob file
  // Skip deleting inputs if the column family was dropped meanwhile.
  if (s.ok() && !blob_gc_->GetColumnFamilyData()->IsDropped()) {
    s = DeleteInputBlobFiles();
  }

  return s;
}
// Finishes all output blob file builders and registers the files with the
// blob file manager. On any builder failure, all outputs are deleted
// (best-effort) and the builder error is returned.
Status BlobGCJob::InstallOutputBlobFiles() {
  Status s;
  for (auto& builder : blob_file_builders_) {
    s = builder.second->Finish();
    if (!s.ok()) {
      break;
    }
  }
  if (s.ok()) {
    std::vector<std::pair<std::shared_ptr<BlobFileMeta>,
                          std::unique_ptr<BlobFileHandle>>>
        files;
    for (auto& builder : this->blob_file_builders_) {
      auto file = std::make_shared<BlobFileMeta>(
          builder.first->GetNumber(), builder.first->GetFile()->GetFileSize());
      blob_gc_->AddOutputFile(file.get());
      files.emplace_back(std::make_pair(file, std::move(builder.first)));
    }
    // Fix: propagate the install status. It was previously discarded,
    // silently ignoring Sync/Close/LogAndApply failures.
    s = this->blob_file_manager_->BatchFinishFiles(
        blob_gc_->column_family_handle()->GetID(), files);
  } else {
    // Best-effort cleanup; keep the original builder error in `s`.
    std::vector<unique_ptr<BlobFileHandle>> handles;
    for (auto& builder : this->blob_file_builders_)
      handles.emplace_back(std::move(builder.first));
    this->blob_file_manager_->BatchDeleteFiles(handles);
  }
  return s;
}
// Applies the queued key -> new-blob-index batches to the LSM. Each batch
// carries a GarbageCollectionWriteCallback that aborts the individual
// write with Status::Busy when the key changed since GC read it; Busy is
// expected and not treated as failure.
Status BlobGCJob::RewriteValidKeyToLSM() {
  Status s;
  auto* db_impl = reinterpret_cast<DBImpl*>(this->base_db_);

  WriteOptions wo;
  wo.low_pri = true;
  wo.ignore_missing_column_families = true;
  for (auto& write_batch : this->rewrite_batches_) {
    if (blob_gc_->GetColumnFamilyData()->IsDropped()) {
      s = Status::Aborted("Column family drop");
      break;
    }
    if (IsShutingDown()) {
      s = Status::ShutdownInProgress();
      break;
    }
    s = db_impl->WriteWithCallback(wo, &write_batch.first, &write_batch.second);
    if (s.ok()) {
      // Key is successfully written to LSM
    } else if (s.IsBusy()) {
      // The key is overwritten in the meanwhile. Drop the blob record.
    } else {
      // We hit an error.
      break;
    }
  }
  // A trailing Busy from the last batch is benign; normalize to OK.
  if (s.IsBusy()) {
    s = Status::OK();
  }

  if (s.ok()) {
    // NOTE(review): FlushWAL's return status is ignored here — consider
    // propagating it.
    db_impl->FlushWAL(true);
  }

  return s;
}
// Records the sampled input files as obsolete in the blob MANIFEST. The
// current latest sequence is attached so files stay available to any
// snapshot taken before this point.
Status BlobGCJob::DeleteInputBlobFiles() const {
  SequenceNumber obsolete_sequence = base_db_impl_->GetLatestSequenceNumber();

  Status s;
  VersionEdit edit;
  edit.SetColumnFamilyID(blob_gc_->column_family_handle()->GetID());
  for (const auto& file : blob_gc_->sampled_inputs()) {
    // Fix: %llu expects unsigned long long, but file_number() returns
    // uint64_t (unsigned long on LP64) — mismatched varargs is UB, so cast
    // explicitly.
    ROCKS_LOG_INFO(db_options_.info_log, "Titan add obsolete file [%llu]",
                   static_cast<unsigned long long>(file->file_number()));
    edit.DeleteBlobFile(file->file_number(), obsolete_sequence);
  }
  s = version_set_->LogAndApply(&edit);
  // TODO(@DorianZheng) Purge pending outputs
  // base_db_->pending_outputs_.erase(handle->GetNumber());
  return s;
}
// True once the owner has signalled shutdown; always false when no flag
// was supplied.
bool BlobGCJob::IsShutingDown() {
  if (shuting_down_ == nullptr) {
    return false;
  }
  return shuting_down_->load(std::memory_order_acquire);
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "db/db_impl.h"
#include "rocksdb/status.h"
#include "utilities/titandb/blob_file_builder.h"
#include "utilities/titandb/blob_file_iterator.h"
#include "utilities/titandb/blob_file_manager.h"
#include "utilities/titandb/blob_gc.h"
#include "utilities/titandb/options.h"
#include "utilities/titandb/version_set.h"
namespace rocksdb {
namespace titandb {
// Runs one blob garbage collection pass: sample candidate files, rewrite
// live entries into new blob files, re-point keys in the LSM, and retire
// the inputs. Lifecycle is Prepare() -> Run() -> Finish(), with the mutex
// requirements noted per method. Non-copyable; borrows everything passed
// to the constructor.
class BlobGCJob {
 public:
  BlobGCJob(BlobGC* blob_gc, DB* db, port::Mutex* mutex,
            const TitanDBOptions& titan_db_options, Env* env,
            const EnvOptions& env_options, BlobFileManager* blob_file_manager,
            VersionSet* version_set, LogBuffer* log_buffer,
            std::atomic_bool* shuting_down);

  // No copying allowed
  BlobGCJob(const BlobGCJob&) = delete;
  void operator=(const BlobGCJob&) = delete;

  ~BlobGCJob();

  // REQUIRE: mutex held
  Status Prepare();
  // REQUIRE: mutex not held
  Status Run();
  // REQUIRE: mutex held
  Status Finish();

 private:
  class GarbageCollectionWriteCallback;
  friend class BlobGCJobTest;

  // Borrowed collaborators (not owned).
  BlobGC* blob_gc_;
  DB* base_db_;
  DBImpl* base_db_impl_;
  port::Mutex* mutex_;
  TitanDBOptions db_options_;
  Env* env_;
  EnvOptions env_options_;
  BlobFileManager* blob_file_manager_;
  titandb::VersionSet* version_set_;
  LogBuffer* log_buffer_{nullptr};

  // Output files produced by DoRunGC(), installed in Finish().
  std::vector<std::pair<std::unique_ptr<BlobFileHandle>,
                        std::unique_ptr<BlobFileBuilder>>>
      blob_file_builders_;
  // Queued key rewrites, applied (with conflict checks) in Finish().
  std::vector<std::pair<WriteBatch, GarbageCollectionWriteCallback>>
      rewrite_batches_;

  InternalKeyComparator* cmp_{nullptr};

  std::atomic_bool* shuting_down_{nullptr};

  Status SampleCandidateFiles();
  bool DoSample(const BlobFileMeta* file);
  Status DoRunGC();
  Status BuildIterator(std::unique_ptr<BlobFileMergeIterator>* result);
  bool DiscardEntry(const Slice& key, const BlobIndex& blob_index);
  Status InstallOutputBlobFiles();
  Status RewriteValidKeyToLSM();
  Status DeleteInputBlobFiles() const;
  bool IsShutingDown();
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_gc_job.h"
#include "util/testharness.h"
#include "utilities/titandb/blob_gc_picker.h"
#include "utilities/titandb/db_impl.h"
namespace rocksdb {
namespace titandb {
// Number of keys each GC test writes.
const static int MAX_KEY_NUM = 1000;
// Builds the fixed-width test key "k-%08d" for index i.
std::string GenKey(int i) {
  char key_buf[32];
  snprintf(key_buf, sizeof(key_buf), "k-%08d", i);
  return std::string(key_buf);
}
// Builds the fixed-width test value "v-%08d" for index i.
std::string GenValue(int i) {
  char value_buf[32];
  snprintf(value_buf, sizeof(value_buf), "v-%08d", i);
  return std::string(value_buf);
}
// Fixture driving BlobGCJob end-to-end against a real TitanDB instance in
// a temp directory. Reaches into TitanDBImpl internals (vset_, mutex_,
// blob_manager_) via the friend declaration to run GC synchronously.
class BlobGCJobTest : public testing::Test {
 public:
  std::string dbname_;
  TitanDB* db_;
  DBImpl* base_db_;
  TitanDBImpl* tdb_;
  VersionSet* version_set_;
  TitanOptions options_;
  port::Mutex* mutex_;

  // Background GC is disabled so tests control GC timing; min_blob_size=0
  // forces every value into blob files.
  BlobGCJobTest() : dbname_(test::TmpDir()) {
    options_.dirname = dbname_ + "/titandb";
    options_.create_if_missing = true;
    options_.disable_background_gc = true;
    options_.min_blob_size = 0;
    options_.env->CreateDirIfMissing(dbname_);
    options_.env->CreateDirIfMissing(options_.dirname);
  }
  ~BlobGCJobTest() {}

  // Asserts the default CF currently tracks `expected` blob files.
  void CheckBlobNumber(int expected) {
    auto b = version_set_->GetBlobStorage(base_db_->DefaultColumnFamily()->GetID()).lock();
    ASSERT_EQ(expected, b->files_.size());
  }

  // Removes all files under the titandb dir and the db dir so each NewDB()
  // starts from scratch.
  void ClearDir() {
    std::vector<std::string> filenames;
    options_.env->GetChildren(options_.dirname, &filenames);
    for (auto& fname : filenames) {
      if (fname != "." && fname != "..") {
        ASSERT_OK(options_.env->DeleteFile(options_.dirname + "/" + fname));
      }
    }
    options_.env->DeleteDir(options_.dirname);
    filenames.clear();
    options_.env->GetChildren(dbname_, &filenames);
    for (auto& fname : filenames) {
      if (fname != "." && fname != "..") {
        options_.env->DeleteFile(dbname_ + "/" + fname);
      }
    }
  }

  // Opens a fresh DB and caches pointers to its internals.
  void NewDB() {
    ClearDir();
    ASSERT_OK(TitanDB::Open(options_, dbname_, &db_));
    tdb_ = reinterpret_cast<TitanDBImpl*>(db_);
    version_set_ = tdb_->vset_.get();
    mutex_ = &tdb_->mutex_;
    base_db_ = reinterpret_cast<DBImpl*>(tdb_->GetRootDB());
  }

  // Synchronous memtable flush so data lands in blob files.
  void Flush() {
    FlushOptions fopts;
    fopts.wait = true;
    ASSERT_OK(db_->Flush(fopts));
  }

  void DestroyDB() {
    Status s __attribute__((__unused__)) = db_->Close();
    assert(s.ok());
    delete db_;
    db_ = nullptr;
  }

  // Picks and runs one GC pass synchronously, mirroring the lock protocol
  // of the production GC path (Run outside the mutex, Prepare/Finish
  // inside), then purges obsolete files.
  void RunGC() {
    MutexLock l(mutex_);
    Status s;
    auto* cfh = base_db_->DefaultColumnFamily();

    // Build BlobGC
    TitanDBOptions db_options;
    TitanCFOptions cf_options;
    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options.info_log.get());
    cf_options.min_gc_batch_size = 0;
    cf_options.blob_file_discardable_ratio = 0.4;

    std::unique_ptr<BlobGC> blob_gc;
    {
      std::shared_ptr<BlobGCPicker> blob_gc_picker =
          std::make_shared<BasicBlobGCPicker>(db_options, cf_options);
      blob_gc = blob_gc_picker->PickBlobGC(
          version_set_->GetBlobStorage(cfh->GetID()).lock().get());
    }

    if (blob_gc) {
      blob_gc->SetColumnFamily(cfh);

      BlobGCJob blob_gc_job(blob_gc.get(), base_db_, mutex_, tdb_->db_options_,
                            tdb_->env_, EnvOptions(), tdb_->blob_manager_.get(),
                            version_set_, &log_buffer, nullptr);

      s = blob_gc_job.Prepare();
      ASSERT_OK(s);

      {
        mutex_->Unlock();
        s = blob_gc_job.Run();
        mutex_->Lock();
      }

      if (s.ok()) {
        s = blob_gc_job.Finish();
        ASSERT_OK(s);
      }
    }

    mutex_->Unlock();
    tdb_->PurgeObsoleteFiles();
    mutex_->Lock();
  }

  // Opens a raw iterator over one blob file for direct content checks.
  Status NewIterator(uint64_t file_number, uint64_t file_size,
                     std::unique_ptr<BlobFileIterator>* iter) {
    std::unique_ptr<RandomAccessFileReader> file;
    Status s = NewBlobFileReader(file_number, 0, tdb_->db_options_,
                                 tdb_->env_options_, tdb_->env_, &file);
    if (!s.ok()) {
      return s;
    }
    iter->reset(new BlobFileIterator(std::move(file), file_number, file_size,
                                     TitanCFOptions()));
    return Status::OK();
  }

  // A key whose live LSM index matches the probed blob index must NOT be
  // discarded by DiscardEntry.
  void TestDiscardEntry() {
    NewDB();
    auto* cfh = base_db_->DefaultColumnFamily();
    BlobIndex blob_index;
    blob_index.file_number = 0x81;
    blob_index.blob_handle.offset = 0x98;
    blob_index.blob_handle.size = 0x17;
    std::string res;
    blob_index.EncodeTo(&res);
    std::string key = "test_discard_entry";
    WriteBatch wb;
    ASSERT_OK(WriteBatchInternal::PutBlobIndex(&wb, cfh->GetID(), key, res));
    auto rewrite_status = base_db_->Write(WriteOptions(), &wb);

    std::vector<BlobFileMeta*> tmp;
    BlobGC blob_gc(std::move(tmp), TitanCFOptions());
    blob_gc.SetColumnFamily(cfh);
    BlobGCJob blob_gc_job(&blob_gc, base_db_, mutex_, TitanDBOptions(),
                          Env::Default(), EnvOptions(), nullptr, version_set_,
                          nullptr, nullptr);
    ASSERT_FALSE(blob_gc_job.DiscardEntry(key, blob_index));
    DestroyDB();
  }

  // Writes MAX_KEY_NUM keys, deletes the even half, runs GC, and verifies
  // the surviving blob file holds exactly the odd keys and agrees with the
  // LSM view.
  void TestRunGC() {
    NewDB();
    for (int i = 0; i < MAX_KEY_NUM; i++) {
      db_->Put(WriteOptions(), GenKey(i), GenValue(i));
    }
    Flush();
    std::string result;
    for (int i = 0; i < MAX_KEY_NUM; i++) {
      if (i % 2 != 0) continue;
      db_->Delete(WriteOptions(), GenKey(i));
    }
    Flush();
    auto b = version_set_->GetBlobStorage(base_db_->DefaultColumnFamily()->GetID()).lock();
    ASSERT_EQ(b->files_.size(), 1);
    auto old = b->files_.begin()->first;
    //    for (auto& f : b->files_) {
    //      f.second->marked_for_sample = false;
    //    }
    std::unique_ptr<BlobFileIterator> iter;
    ASSERT_OK(NewIterator(b->files_.begin()->second->file_number(),
                          b->files_.begin()->second->file_size(), &iter));
    iter->SeekToFirst();
    for (int i = 0; i < MAX_KEY_NUM; i++, iter->Next()) {
      ASSERT_OK(iter->status());
      ASSERT_TRUE(iter->Valid());
      ASSERT_TRUE(iter->key().compare(Slice(GenKey(i))) == 0);
    }
    RunGC();
    b = version_set_->GetBlobStorage(base_db_->DefaultColumnFamily()->GetID()).lock();
    ASSERT_EQ(b->files_.size(), 1);
    auto new1 = b->files_.begin()->first;
    ASSERT_TRUE(old != new1);
    ASSERT_OK(NewIterator(b->files_.begin()->second->file_number(),
                          b->files_.begin()->second->file_size(), &iter));
    iter->SeekToFirst();
    auto* db_iter = db_->NewIterator(ReadOptions(), db_->DefaultColumnFamily());
    db_iter->SeekToFirst();
    for (int i = 0; i < MAX_KEY_NUM; i++) {
      if (i % 2 == 0) continue;
      ASSERT_OK(iter->status());
      ASSERT_TRUE(iter->Valid());
      ASSERT_TRUE(iter->key().compare(Slice(GenKey(i))) == 0);
      ASSERT_TRUE(iter->value().compare(Slice(GenValue(i))) == 0);
      ASSERT_OK(db_->Get(ReadOptions(), iter->key(), &result));
      ASSERT_TRUE(iter->value().size() == result.size());
      ASSERT_TRUE(iter->value().compare(result) == 0);
      ASSERT_OK(db_iter->status());
      ASSERT_TRUE(db_iter->Valid());
      ASSERT_TRUE(db_iter->key().compare(Slice(GenKey(i))) == 0);
      ASSERT_TRUE(db_iter->value().compare(Slice(GenValue(i))) == 0);
      iter->Next();
      db_iter->Next();
    }
    delete db_iter;
    ASSERT_FALSE(iter->Valid() || !iter->status().ok());
    DestroyDB();
  }
};
// Thin wrappers delegating to the fixture's test bodies above.
TEST_F(BlobGCJobTest, DiscardEntry) { TestDiscardEntry(); }
TEST_F(BlobGCJobTest, RunGC) { TestRunGC(); }
// Tests blob file will be kept after GC, if it is still visible by active snapshots.
// Releasing snapshots one by one, the obsolete file may only be purged once
// no snapshot older than its obsolete sequence remains.
TEST_F(BlobGCJobTest, PurgeBlobs) {
  NewDB();

  auto snap1 = db_->GetSnapshot();

  for (int i = 0; i < 10; i++) {
    db_->Put(WriteOptions(), GenKey(i), GenValue(i));
  }
  Flush();
  CheckBlobNumber(1);
  auto snap2 = db_->GetSnapshot();
  auto snap3 = db_->GetSnapshot();

  for (int i = 0; i < 10; i++) {
    db_->Delete(WriteOptions(), GenKey(i));
  }
  Flush();
  CheckBlobNumber(1);
  auto snap4 = db_->GetSnapshot();

  RunGC();
  CheckBlobNumber(1);

  for (int i = 10; i < 20; i++) {
    db_->Put(WriteOptions(), GenKey(i), GenValue(i));
  }
  Flush();
  auto snap5 = db_->GetSnapshot();
  CheckBlobNumber(2);

  // Older snapshots still pin the first blob file.
  db_->ReleaseSnapshot(snap2);
  RunGC();
  CheckBlobNumber(2);

  db_->ReleaseSnapshot(snap3);
  RunGC();
  CheckBlobNumber(2);

  db_->ReleaseSnapshot(snap1);
  RunGC();
  CheckBlobNumber(2);

  // With snap4 gone, the first (fully deleted) file can finally be purged.
  db_->ReleaseSnapshot(snap4);
  RunGC();
  CheckBlobNumber(1);

  db_->ReleaseSnapshot(snap5);
  RunGC();
  CheckBlobNumber(1);

  DestroyDB();
}
} // namespace titandb
} // namespace rocksdb
// Standard gtest entry point for the blob GC job tests.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#include "utilities/titandb/blob_gc_picker.h"
namespace rocksdb {
namespace titandb {
// Options are taken by value (sink parameters); move them into the members
// instead of copying a second time.
BasicBlobGCPicker::BasicBlobGCPicker(TitanDBOptions db_options,
                                     TitanCFOptions cf_options)
    : db_options_(std::move(db_options)), cf_options_(std::move(cf_options)) {}

BasicBlobGCPicker::~BasicBlobGCPicker() {}
// Walks files in descending GC-score order, accumulating normal-state files
// until max_gc_batch_size is reached. Returns nullptr when nothing is
// selected or the batch is below min_gc_batch_size.
std::unique_ptr<BlobGC> BasicBlobGCPicker::PickBlobGC(
    BlobStorage* blob_storage) {
  Status s;
  std::vector<BlobFileMeta*> blob_files;
  uint64_t batch_size = 0;
  for (auto& gc_score : blob_storage->gc_score()) {
    auto blob_file = blob_storage->FindFile(gc_score.file_number).lock();
    assert(blob_file);

    if (!CheckBlobFile(blob_file.get())) {
      ROCKS_LOG_INFO(db_options_.info_log, "file number:%lu no need gc",
                     blob_file->file_number());
      continue;
    }
    blob_files.push_back(blob_file.get());

    batch_size += blob_file->file_size();
    if (batch_size >= cf_options_.max_gc_batch_size) break;
  }

  if (blob_files.empty() || batch_size < cf_options_.min_gc_batch_size)
    return nullptr;

  // Fix: pass a copy of cf_options_. The previous code did
  // std::move(cf_options_), leaving the member moved-from and corrupting
  // any subsequent PickBlobGC call on the same picker.
  return std::unique_ptr<BlobGC>(
      new BlobGC(std::move(blob_files), TitanCFOptions(cf_options_)));
}
// A file qualifies for GC only while in the normal state (not pending,
// not already being GC-ed).
bool BasicBlobGCPicker::CheckBlobFile(BlobFileMeta* blob_file) const {
  assert(blob_file->file_state() != BlobFileMeta::FileState::kInit);
  return blob_file->file_state() == BlobFileMeta::FileState::kNormal;
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include <memory>
#include "db/column_family.h"
#include "db/write_callback.h"
#include "rocksdb/status.h"
#include "util/filename.h"
#include "utilities/titandb/blob_file_manager.h"
#include "utilities/titandb/blob_format.h"
#include "utilities/titandb/blob_gc.h"
#include "utilities/titandb/version.h"
namespace rocksdb {
namespace titandb {
// Strategy interface for selecting blob files to garbage collect.
class BlobGCPicker {
 public:
  // Idiom: `= default` instead of empty bodies (and stray semicolons).
  BlobGCPicker() = default;
  virtual ~BlobGCPicker() = default;

  // Pick candidate blob files for a new gc.
  // Returns nullptr if there is no gc to be done.
  // Otherwise returns a heap-allocated BlobGC describing the gc;
  // ownership transfers to the caller via the unique_ptr (the old
  // "caller should delete" note predated the smart-pointer return).
  virtual std::unique_ptr<BlobGC> PickBlobGC(BlobStorage* blob_storage) = 0;
};
// Default picker: greedily selects normal-state files by GC score until
// max_gc_batch_size, rejecting batches below min_gc_batch_size.
class BasicBlobGCPicker final : public BlobGCPicker {
 public:
  BasicBlobGCPicker(TitanDBOptions, TitanCFOptions);
  ~BasicBlobGCPicker();

  std::unique_ptr<BlobGC> PickBlobGC(BlobStorage* blob_storage) override;

 private:
  TitanDBOptions db_options_;
  TitanCFOptions cf_options_;

  // Check if blob_file needs to gc, return true means we need pick this
  // file for gc
  bool CheckBlobFile(BlobFileMeta* blob_file) const;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/blob_gc_picker.h"
#include "util/filename.h"
#include "util/testharness.h"
#include "utilities/titandb/blob_file_builder.h"
#include "utilities/titandb/blob_file_cache.h"
#include "utilities/titandb/blob_file_iterator.h"
#include "utilities/titandb/blob_file_reader.h"
namespace rocksdb {
namespace titandb {
// Fixture building an in-memory BlobStorage plus a BasicBlobGCPicker so
// picking logic can be tested without a real DB.
class BlobGCPickerTest : public testing::Test {
 public:
  std::unique_ptr<BlobStorage> blob_storage_;
  std::unique_ptr<BlobGCPicker> basic_blob_gc_picker_;

  BlobGCPickerTest() {}
  ~BlobGCPickerTest() {}

  // Resets storage and picker with the given options.
  void NewBlobStorageAndPicker(const TitanDBOptions& titan_db_options,
                               const TitanCFOptions& titan_cf_options) {
    auto blob_file_cache = std::make_shared<BlobFileCache>(
        titan_db_options, titan_cf_options, NewLRUCache(128));
    blob_storage_.reset(new BlobStorage(titan_cf_options, blob_file_cache));
    basic_blob_gc_picker_.reset(new BasicBlobGCPicker(titan_db_options, titan_cf_options));
  }

  // Inserts a synthetic file meta; kDbRestart moves it to the normal
  // state, optionally followed by kGCBegin to simulate an in-flight GC.
  void AddBlobFile(uint64_t file_number, uint64_t file_size,
                   uint64_t discardable_size, bool being_gc = false) {
    auto f = std::make_shared<BlobFileMeta>(file_number, file_size);
    f->AddDiscardableSize(discardable_size);
    f->FileStateTransit(BlobFileMeta::FileEvent::kDbRestart);
    if (being_gc) {
      f->FileStateTransit(BlobFileMeta::FileEvent::kGCBegin);
    }
    blob_storage_->files_[file_number] = f;
  }

  // Recomputes GC scores after mutating files_.
  void UpdateBlobStorage() { blob_storage_->ComputeGCScore(); }
};
// A single normal-state file with min_gc_batch_size=0 must be picked.
TEST_F(BlobGCPickerTest, Basic) {
  TitanDBOptions titan_db_options;
  TitanCFOptions titan_cf_options;
  titan_cf_options.min_gc_batch_size = 0;
  NewBlobStorageAndPicker(titan_db_options, titan_cf_options);
  AddBlobFile(1U, 1U, 0U);
  UpdateBlobStorage();
  auto blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  ASSERT_TRUE(blob_gc != nullptr);
  ASSERT_EQ(blob_gc->inputs().size(), 1);
  ASSERT_EQ(blob_gc->inputs()[0]->file_number(), 1U);
}
// Files already being GC-ed must be skipped: alone they yield no GC; next
// to a normal file, only the normal file is selected.
TEST_F(BlobGCPickerTest, BeingGC) {
  TitanDBOptions titan_db_options;
  TitanCFOptions titan_cf_options;
  titan_cf_options.min_gc_batch_size = 0;
  NewBlobStorageAndPicker(titan_db_options, titan_cf_options);
  AddBlobFile(1U, 1U, 0U, true);
  UpdateBlobStorage();
  auto blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  ASSERT_EQ(nullptr, blob_gc);
  NewBlobStorageAndPicker(titan_db_options, titan_cf_options);
  AddBlobFile(1U, 1U, 0U, true);
  AddBlobFile(2U, 1U, 0U);
  UpdateBlobStorage();
  blob_gc = basic_blob_gc_picker_->PickBlobGC(blob_storage_.get());
  ASSERT_EQ(blob_gc->inputs().size(), 1);
  ASSERT_EQ(blob_gc->inputs()[0]->file_number(), 2U);
}
} // namespace titandb
} // namespace rocksdb
// Standard gtest entry point for the blob GC picker tests.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#include "rocksdb/utilities/titandb/db.h"
#include "utilities/titandb/db_impl.h"
namespace rocksdb {
namespace titandb {
// Convenience open with only the default column family. Delegates to the
// multi-CF overload and discards the returned default handle, since the
// underlying DBImpl keeps its own reference to it.
Status TitanDB::Open(const TitanOptions& options, const std::string& dbname,
                     TitanDB** db) {
  TitanDBOptions db_options(options);
  TitanCFOptions cf_options(options);
  std::vector<TitanCFDescriptor> descs;
  descs.emplace_back(kDefaultColumnFamilyName, cf_options);
  std::vector<ColumnFamilyHandle*> handles;
  Status s = TitanDB::Open(db_options, dbname, descs, &handles, db);
  if (s.ok()) {
    assert(handles.size() == 1);
    // DBImpl is always holding the default handle.
    delete handles[0];
  }
  return s;
}
// Opens a TitanDBImpl over the requested column families. On success the
// caller receives ownership of *db and the CF handles; on failure *db is
// null and the partially constructed impl is destroyed.
Status TitanDB::Open(const TitanDBOptions& db_options,
                     const std::string& dbname,
                     const std::vector<TitanCFDescriptor>& descs,
                     std::vector<ColumnFamilyHandle*>* handles, TitanDB** db) {
  auto* tdb = new TitanDBImpl(db_options, dbname);
  Status s = tdb->Open(descs, handles);
  if (!s.ok()) {
    *db = nullptr;
    delete tdb;
    return s;
  }
  *db = tdb;
  tdb->StartBackgroundTasks();
  return s;
}
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/db_impl.h"
#include "utilities/titandb/base_db_listener.h"
#include "utilities/titandb/blob_file_builder.h"
#include "utilities/titandb/blob_file_iterator.h"
#include "utilities/titandb/blob_file_size_collector.h"
#include "utilities/titandb/blob_gc.h"
#include "utilities/titandb/db_iter.h"
#include "utilities/titandb/table_factory.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
namespace rocksdb {
namespace titandb {
// BlobFileManager backed by the owning TitanDBImpl: allocates blob file
// numbers/handles, tracks them in pending_outputs_ (under db_->mutex_) so
// they are not purged mid-write, and commits or deletes them in batches.
class TitanDBImpl::FileManager : public BlobFileManager {
 public:
  FileManager(TitanDBImpl* db) : db_(db) {}

  // Creates a new writable blob file and registers its number as a
  // pending output.
  Status NewFile(std::unique_ptr<BlobFileHandle>* handle) override {
    auto number = db_->vset_->NewFileNumber();
    auto name = BlobFileName(db_->dirname_, number);

    Status s;
    std::unique_ptr<WritableFileWriter> file;
    {
      std::unique_ptr<WritableFile> f;
      s = db_->env_->NewWritableFile(name, &f, db_->env_options_);
      if (!s.ok()) return s;
      file.reset(new WritableFileWriter(std::move(f), name, db_->env_options_));
    }

    handle->reset(new FileHandle(number, name, std::move(file)));
    {
      MutexLock l(&db_->mutex_);
      db_->pending_outputs_.insert(number);
    }
    return s;
  }

  // Syncs and closes each finished file, then records them all in the blob
  // MANIFEST in a single version edit and clears their pending status.
  Status BatchFinishFiles(
      uint32_t cf_id,
      const std::vector<std::pair<std::shared_ptr<BlobFileMeta>,
                                  std::unique_ptr<BlobFileHandle>>>& files)
      override {
    Status s;
    VersionEdit edit;
    edit.SetColumnFamilyID(cf_id);
    for (auto& file : files) {
      s = file.second->GetFile()->Sync(false);
      if (s.ok()) {
        s = file.second->GetFile()->Close();
      }
      if (!s.ok()) return s;

      ROCKS_LOG_INFO(db_->db_options_.info_log, "Titan adding blob file [%llu]",
                     file.first->file_number());
      edit.AddBlobFile(file.first);
    }

    {
      MutexLock l(&db_->mutex_);
      s = db_->vset_->LogAndApply(&edit);
      for (const auto& file : files)
        db_->pending_outputs_.erase(file.second->GetNumber());
    }
    return s;
  }

  // Deletes abandoned output files from disk and clears their pending
  // status. Best-effort: `s` holds only the last DeleteFile status.
  Status BatchDeleteFiles(
      const std::vector<std::unique_ptr<BlobFileHandle>>& handles) override {
    Status s;
    for (auto& handle : handles) s = db_->env_->DeleteFile(handle->GetName());
    {
      MutexLock l(&db_->mutex_);
      for (const auto& handle : handles)
        db_->pending_outputs_.erase(handle->GetNumber());
    }
    return s;
  }

 private:
  // Plain (number, name, writer) triple implementing BlobFileHandle.
  class FileHandle : public BlobFileHandle {
   public:
    FileHandle(uint64_t number, const std::string& name,
               std::unique_ptr<WritableFileWriter> file)
        : number_(number), name_(name), file_(std::move(file)) {}

    uint64_t GetNumber() const override { return number_; }

    const std::string& GetName() const override { return name_; }

    WritableFileWriter* GetFile() const override { return file_.get(); }

   private:
    uint64_t number_;
    std::string name_;
    std::unique_ptr<WritableFileWriter> file_;
  };

  TitanDBImpl* db_;
};
// Constructs the Titan wrapper. No I/O happens here; the base DB and the
// Titan MANIFEST are opened later in Open().
TitanDBImpl::TitanDBImpl(const TitanDBOptions& options,
                         const std::string& dbname)
    : TitanDB(),
      mutex_(),
      bg_cv_(&mutex_),
      dbname_(dbname),
      env_(options.env),
      env_options_(options),
      db_options_(options) {
  // Blob files default to a "titandb" subdirectory next to the base DB.
  if (db_options_.dirname.empty()) {
    db_options_.dirname = dbname_ + "/titandb";
  }
  dirname_ = db_options_.dirname;
  vset_.reset(new VersionSet(db_options_));
  blob_manager_.reset(new FileManager(this));
}
TitanDBImpl::~TitanDBImpl() { Close(); }
// how often to schedule delete obs files periods
static constexpr uint32_t kDeleteObsoleteFilesPeriodSecs = 10;  // 10s

// Starts the periodic obsolete-file purge thread. Idempotent: a second call
// while the thread exists is a no-op. RepeatableThread takes its period in
// microseconds, hence the * 1000 * 1000.
void TitanDBImpl::StartBackgroundTasks() {
  if (!thread_purge_obsolete_) {
    thread_purge_obsolete_.reset(new rocksdb::RepeatableThread(
        [this]() { TitanDBImpl::PurgeObsoleteFiles(); }, "titanbg", env_,
        kDeleteObsoleteFilesPeriodSecs * 1000 * 1000));
  }
}
// Opens Titan on top of the base DB. The sequence is order-sensitive:
//   1. Create directories and the info log, and lock the Titan directory.
//   2. Open the base DB once only to discover each CF's numeric ID, then
//      close it again (flush is avoided because the Titan table factory is
//      not installed yet).
//   3. Replay the Titan MANIFEST via vset_->Open() using those CF IDs.
//   4. Re-open the base DB with TitanTableFactory and the blob-file-size
//      property collector installed for every CF.
Status TitanDBImpl::Open(const std::vector<TitanCFDescriptor>& descs,
                         std::vector<ColumnFamilyHandle*>* handles) {
  // Sets up directories for base DB and Titan.
  Status s = env_->CreateDirIfMissing(dbname_);
  if (!s.ok()) return s;
  if (!db_options_.info_log) {
    s = CreateLoggerFromOptions(dbname_, db_options_, &db_options_.info_log);
    if (!s.ok()) return s;
  }
  s = env_->CreateDirIfMissing(dirname_);
  if (!s.ok()) return s;
  s = env_->LockFile(LockFileName(dirname_), &lock_);
  if (!s.ok()) return s;
  std::vector<ColumnFamilyDescriptor> base_descs;
  for (auto& desc : descs) {
    base_descs.emplace_back(desc.name, desc.options);
  }
  std::map<uint32_t, TitanCFOptions> column_families;
  // Opens the base DB first to collect the column families information.
  // Avoid flush here because we haven't replaced the table factory yet.
  db_options_.avoid_flush_during_recovery = true;
  s = DB::Open(db_options_, dbname_, base_descs, handles, &db_);
  if (s.ok()) {
    for (size_t i = 0; i < descs.size(); i++) {
      auto handle = (*handles)[i];
      uint32_t cf_id = handle->GetID();
      column_families.emplace(cf_id, descs[i].options);
      db_->DestroyColumnFamilyHandle(handle);
      // Replaces the provided table factory with TitanTableFactory.
      // While we need to preserve original table_factory for GetOptions.
      auto& original_table_factory = base_descs[i].options.table_factory;
      assert(original_table_factory != nullptr);
      original_table_factory_[cf_id] = original_table_factory;
      base_descs[i].options.table_factory = std::make_shared<TitanTableFactory>(
          db_options_, descs[i].options, blob_manager_);
      // Add TableProperties for collecting statistics GC
      base_descs[i].options.table_properties_collector_factories.emplace_back(
          std::make_shared<BlobFileSizeCollectorFactory>());
    }
    handles->clear();
    // NOTE(review): the Close() status is captured but db_ is deleted before
    // it is checked below — a close failure still tears down the first open.
    s = db_->Close();
    delete db_;
  }
  if (!s.ok()) return s;
  // Replays the Titan MANIFEST for the discovered column families.
  s = vset_->Open(column_families);
  if (!s.ok()) return s;
  // Add EventListener to collect statistics for GC
  db_options_.listeners.emplace_back(std::make_shared<BaseDbListener>(this));
  // Grow the LOW-priority thread pool once per process so GC jobs do not
  // starve base-DB compactions (static: shared across all TitanDB opens,
  // not synchronized).
  static bool has_init_background_threads = false;
  if (!has_init_background_threads) {
    auto low_pri_threads_num = env_->GetBackgroundThreads(Env::Priority::LOW);
    assert(low_pri_threads_num > 0);
    if (!db_options_.disable_background_gc &&
        db_options_.max_background_gc > 0) {
      env_->IncBackgroundThreadsIfNeeded(
          db_options_.max_background_gc + low_pri_threads_num,
          Env::Priority::LOW);
      assert(env_->GetBackgroundThreads(Env::Priority::LOW) ==
             low_pri_threads_num + db_options_.max_background_gc);
    }
    has_init_background_threads = true;
  }
  // Second (final) open, now with the Titan table factory installed.
  s = DB::Open(db_options_, dbname_, base_descs, handles, &db_);
  if (s.ok()) {
    db_impl_ = reinterpret_cast<DBImpl*>(db_->GetRootDB());
  }
  return s;
}
// Stops background work, closes and frees the base DB, and releases the
// Titan directory lock. Returns the status of closing the base DB.
Status TitanDBImpl::Close() {
  CloseImpl();
  Status s;
  if (db_ != nullptr) {
    s = db_->Close();
    delete db_;
    db_impl_ = nullptr;
    db_ = nullptr;
  }
  if (lock_ != nullptr) {
    env_->UnlockFile(lock_);
    lock_ = nullptr;
  }
  return s;
}
// Stops all Titan background activity: raises the shutdown flag, unschedules
// pending GC jobs, waits for in-flight GC to drain, then cancels the purge
// thread. Always returns OK.
Status TitanDBImpl::CloseImpl() {
  {
    MutexLock l(&mutex_);
    // Although `shuting_down_` is atomic bool object, we should set it under
    // the protection of mutex_, otherwise, there maybe something wrong with it,
    // like:
    // 1, A thread: shuting_down_.load = false
    // 2, B thread: shuting_down_.store(true)
    // 3, B thread: unschedule all bg work
    // 4, A thread: schedule bg work
    shuting_down_.store(true, std::memory_order_release);
  }
  // Remove not-yet-started GC jobs from the thread pool; they were counted in
  // bg_gc_scheduled_ when scheduled, so subtract them back out.
  int gc_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
  {
    MutexLock l(&mutex_);
    bg_gc_scheduled_ -= gc_unscheduled;
    // Wait for GC jobs that are already running; BackgroundCallGC signals
    // bg_cv_ when the count reaches zero.
    while (bg_gc_scheduled_ > 0) {
      bg_cv_.Wait();
    }
  }
  if (thread_purge_obsolete_ != nullptr) {
    thread_purge_obsolete_->cancel();
    mutex_.Lock();
    thread_purge_obsolete_.reset();
    mutex_.Unlock();
  }
  return Status::OK();
}
// Creates the given column families in the base DB with TitanTableFactory
// installed, and registers their Titan options with the version set.
Status TitanDBImpl::CreateColumnFamilies(
    const std::vector<TitanCFDescriptor>& descs,
    std::vector<ColumnFamilyHandle*>* handles) {
  std::vector<ColumnFamilyDescriptor> base_descs;
  for (auto& desc : descs) {
    ColumnFamilyOptions options = desc.options;
    // Replaces the provided table factory with TitanTableFactory.
    options.table_factory.reset(
        new TitanTableFactory(db_options_, desc.options, blob_manager_));
    base_descs.emplace_back(desc.name, options);
  }
  MutexLock l(&mutex_);
  Status s = db_impl_->CreateColumnFamilies(base_descs, handles);
  if (s.ok()) {
    // One handle per descriptor is only guaranteed on success; asserting
    // before the status check could fire on a legitimate failure path.
    assert(handles->size() == descs.size());
    std::map<uint32_t, TitanCFOptions> column_families;
    for (size_t i = 0; i < descs.size(); i++) {
      column_families.emplace((*handles)[i]->GetID(), descs[i].options);
    }
    vset_->AddColumnFamilies(column_families);
  }
  return s;
}
// Marks the given column families as dropped in the base DB and records the
// drop (with the current sequence number) in the Titan version set so their
// blob files can be purged once no snapshot can still read them.
Status TitanDBImpl::DropColumnFamilies(
    const std::vector<ColumnFamilyHandle*>& handles) {
  // Removed the unused `cfds` vector the original collected: the
  // ColumnFamilyData pointers were never read.
  std::vector<uint32_t> column_families;
  column_families.reserve(handles.size());
  for (auto& handle : handles) {
    column_families.push_back(handle->GetID());
  }
  MutexLock l(&mutex_);
  // TODO:
  // As rocksdb described, `DropColumnFamilies()` only records the drop of the column family specified by ColumnFamilyHandle.
  // The actual data is not deleted until the client calls `delete column_family`, namely `DestroyColumnFamilyHandle()`.
  // We can still continue using the column family if we have outstanding ColumnFamilyHandle pointer.
  // So we should delete blob files in `DestroyColumnFamilyHandle()` but not here.
  Status s = db_impl_->DropColumnFamilies(handles);
  if (s.ok()) {
    SequenceNumber obsolete_sequence = db_impl_->GetLatestSequenceNumber();
    vset_->DropColumnFamilies(column_families, obsolete_sequence);
  }
  return s;
}
// Runs a manual compaction over the given files, then feeds the resulting
// job info into OnCompactionCompleted so blob-file statistics stay current.
Status TitanDBImpl::CompactFiles(
    const CompactionOptions& compact_options, ColumnFamilyHandle* column_family,
    const std::vector<std::string>& input_file_names, const int output_level,
    const int output_path_id, std::vector<std::string>* const output_file_names,
    CompactionJobInfo* compaction_job_info) {
  // Always collect job info, even when the caller did not ask for it.
  std::unique_ptr<CompactionJobInfo> owned_info;
  if (compaction_job_info == nullptr) {
    owned_info.reset(new CompactionJobInfo());
    compaction_job_info = owned_info.get();
  }
  Status s = db_impl_->CompactFiles(
      compact_options, column_family, input_file_names, output_level,
      output_path_id, output_file_names, compaction_job_info);
  if (s.ok()) {
    OnCompactionCompleted(*compaction_job_info);
  }
  return s;
}
// Point lookup. When the caller supplied no snapshot, pin one for the
// duration of the read so the blob file referenced by the index cannot be
// purged mid-lookup.
Status TitanDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle* handle,
                        const Slice& key, PinnableSlice* value) {
  if (options.snapshot != nullptr) {
    return GetImpl(options, handle, key, value);
  }
  ManagedSnapshot snap(this);
  ReadOptions opts_with_snap(options);
  opts_with_snap.snapshot = snap.snapshot();
  return GetImpl(opts_with_snap, handle, key, value);
}
// Reads a key from the base DB; when the stored value is a blob index,
// resolves it to the actual record in the blob storage of the CF.
Status TitanDBImpl::GetImpl(const ReadOptions& options,
                            ColumnFamilyHandle* handle, const Slice& key,
                            PinnableSlice* value) {
  Status s;
  bool is_blob_index = false;
  // Plain values are returned directly; `is_blob_index` tells us whether
  // `value` currently holds an encoded BlobIndex instead.
  s = db_impl_->GetImpl(options, handle, key, value, nullptr /*value_found*/,
                        nullptr /*read_callback*/, &is_blob_index);
  if (!s.ok() || !is_blob_index) return s;
  BlobIndex index;
  s = index.DecodeFrom(value);
  assert(s.ok());
  if (!s.ok()) return s;
  BlobRecord record;
  PinnableSlice buffer;
  // Only the weak_ptr promotion needs the Titan mutex; the blob read itself
  // happens outside the lock.
  mutex_.Lock();
  auto storage = vset_->GetBlobStorage(handle->GetID()).lock();
  mutex_.Unlock();
  // NOTE(review): `storage` is dereferenced without a null check — assumes
  // the CF's blob storage always outlives an in-flight read; confirm.
  s = storage->Get(options, index, &record, &buffer);
  if (s.IsCorruption()) {
    ROCKS_LOG_DEBUG(db_options_.info_log, "Key:%s Snapshot:%" PRIu64 " GetBlobFile err:%s\n",
                    key.ToString(true).c_str(),
                    options.snapshot->GetSequenceNumber(),
                    s.ToString().c_str());
  }
  if (s.ok()) {
    // Copy the blob payload into the caller's slice.
    value->Reset();
    value->PinSelf(record.value);
  }
  return s;
}
// Batched point lookups. Forces total-order seek and, when no snapshot was
// supplied, pins one for the whole batch so all keys read a consistent view.
std::vector<Status> TitanDBImpl::MultiGet(
    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>& handles,
    const std::vector<Slice>& keys, std::vector<std::string>* values) {
  ReadOptions opts(options);
  opts.total_order_seek = true;
  if (opts.snapshot != nullptr) {
    return MultiGetImpl(opts, handles, keys, values);
  }
  ManagedSnapshot snap(this);
  opts.snapshot = snap.snapshot();
  return MultiGetImpl(opts, handles, keys, values);
}
// Resolves each key via GetImpl and materializes the results into `values`,
// returning one Status per key.
std::vector<Status> TitanDBImpl::MultiGetImpl(
    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>& handles,
    const std::vector<Slice>& keys, std::vector<std::string>* values) {
  const size_t num_keys = keys.size();
  std::vector<Status> statuses(num_keys);
  values->resize(num_keys);
  for (size_t i = 0; i < num_keys; i++) {
    std::string* out = &(*values)[i];
    PinnableSlice pinnable(out);
    statuses[i] = GetImpl(options, handles[i], keys[i], &pinnable);
    // When the slice is pinned elsewhere the value was not written into
    // *out, so copy it out explicitly.
    if (statuses[i].ok() && pinnable.IsPinned()) {
      out->assign(pinnable.data(), pinnable.size());
    }
  }
  return statuses;
}
// Creates an iterator over the given column family. When the caller supplied
// no snapshot, one is pinned and handed to the iterator so it stays alive
// for the iterator's whole lifetime.
Iterator* TitanDBImpl::NewIterator(const ReadOptions& options,
                                   ColumnFamilyHandle* handle) {
  ReadOptions opts(options);
  opts.total_order_seek = true;
  std::shared_ptr<ManagedSnapshot> snap;
  if (opts.snapshot == nullptr) {
    snap = std::make_shared<ManagedSnapshot>(this);
    opts.snapshot = snap->snapshot();
  }
  return NewIteratorImpl(opts, handle, snap);
}
// Builds a TitanDBIterator that wraps the base DB's arena iterator and
// resolves blob indexes against the CF's blob storage. `snapshot` (possibly
// empty) is stored in the iterator to keep a pinned snapshot alive.
Iterator* TitanDBImpl::NewIteratorImpl(
    const ReadOptions& options, ColumnFamilyHandle* handle,
    std::shared_ptr<ManagedSnapshot> snapshot) {
  auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(handle)->cfd();
  // Only the blob-storage lookup needs the Titan mutex.
  mutex_.Lock();
  auto storage = vset_->GetBlobStorage(handle->GetID());
  mutex_.Unlock();
  // allow_blob=true so the base iterator surfaces blob indexes instead of
  // treating them as corruption.
  std::unique_ptr<ArenaWrappedDBIter> iter(db_impl_->NewIteratorImpl(
      options, cfd, options.snapshot->GetSequenceNumber(), nullptr /*read_callback*/,
      true /*allow_blob*/, true /*allow_refresh*/));
  // NOTE(review): storage.lock() may be empty if the CF was dropped; the raw
  // pointer is handed to the iterator without a null check — confirm.
  return new TitanDBIterator(options, storage.lock().get(), snapshot,
                             std::move(iter));
}
// Creates one iterator per column family, all sharing the same pinned
// snapshot when the caller did not supply one.
Status TitanDBImpl::NewIterators(
    const ReadOptions& options, const std::vector<ColumnFamilyHandle*>& handles,
    std::vector<Iterator*>* iterators) {
  ReadOptions opts(options);
  opts.total_order_seek = true;
  std::shared_ptr<ManagedSnapshot> snap;
  if (opts.snapshot == nullptr) {
    snap = std::make_shared<ManagedSnapshot>(this);
    opts.snapshot = snap->snapshot();
  }
  iterators->clear();
  iterators->reserve(handles.size());
  for (ColumnFamilyHandle* handle : handles) {
    iterators->push_back(NewIteratorImpl(opts, handle, snap));
  }
  return Status::OK();
}
// Snapshots are managed entirely by the base DB.
const Snapshot* TitanDBImpl::GetSnapshot() {
  return db_->GetSnapshot();
}
// Releases a snapshot back to the base DB.
void TitanDBImpl::ReleaseSnapshot(const Snapshot* snapshot) {
  // TODO:
  // We can record here whether the oldest snapshot is released.
  // If not, we can just skip the next round of purging obsolete files.
  db_->ReleaseSnapshot(snapshot);
}
// Returns the column family's options with the internal TitanTableFactory
// swapped back for the factory the user originally provided, so callers
// observe what they configured.
Options TitanDBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
  assert(column_family != nullptr);
  Options options = db_->GetOptions(column_family);
  uint32_t cf_id = column_family->GetID();
  // Single find() instead of count() + at(), which performed two lookups.
  auto it = original_table_factory_.find(cf_id);
  if (it != original_table_factory_.end()) {
    options.table_factory = it->second;
  } else {
    ROCKS_LOG_ERROR(
        db_options_.info_log,
        "Failed to get original table factory for column family %s.",
        column_family->GetName().c_str());
    options.table_factory.reset();
  }
  return options;
}
// Flush listener: decodes the blob-file-size property of the flushed SST and
// transitions each referenced blob file out of its pending state so it
// becomes visible to GC scoring.
void TitanDBImpl::OnFlushCompleted(const FlushJobInfo& flush_job_info) {
  const auto& tps = flush_job_info.table_properties;
  auto ucp_iter = tps.user_collected_properties.find(
      BlobFileSizeCollector::kPropertiesName);
  // sst file doesn't contain any blob index
  if (ucp_iter == tps.user_collected_properties.end()) {
    return;
  }
  std::map<uint64_t, uint64_t> blob_files_size;
  Slice src{ucp_iter->second};
  if (!BlobFileSizeCollector::Decode(&src, &blob_files_size)) {
    // %zu is the portable conversion for size_t (the previous %lu is wrong
    // on LLP64 platforms such as 64-bit Windows).
    fprintf(stderr, "BlobFileSizeCollector::Decode failed size:%zu\n",
            ucp_iter->second.size());
    abort();
  }
  assert(!blob_files_size.empty());
  std::set<uint64_t> outputs;
  // Iterate by const reference to avoid copying every map entry.
  for (const auto& f : blob_files_size) {
    outputs.insert(f.first);
  }
  {
    MutexLock l(&mutex_);
    auto blob_storage = vset_->GetBlobStorage(flush_job_info.cf_id).lock();
    if (!blob_storage) {
      fprintf(stderr, "Column family id:%u Not Found\n", flush_job_info.cf_id);
      abort();
    }
    for (const auto& file_number : outputs) {
      auto file = blob_storage->FindFile(file_number).lock();
      // This file maybe output of a gc job, and it's been gced out.
      if (!file) {
        continue;
      }
      file->FileStateTransit(BlobFileMeta::FileEvent::kFlushCompleted);
    }
  }
}
// Compaction listener: diffs the blob-file-size properties of the input and
// output SSTs to compute, per blob file, how many bytes of live blob data
// the compaction discarded. Files that only appear in outputs are new
// (flush/GC output) and get a state transition; files whose net size change
// is negative gain discardable bytes, which feeds the GC score.
void TitanDBImpl::OnCompactionCompleted(
    const CompactionJobInfo& compaction_job_info) {
  // file number -> net change in referenced blob bytes (inputs subtract,
  // outputs add; a negative total means data became discardable).
  std::map<uint64_t, int64_t> blob_files_size;
  std::set<uint64_t> outputs;
  std::set<uint64_t> inputs;
  auto calc_bfs = [&compaction_job_info, &blob_files_size, &outputs, &inputs](
                      const std::vector<std::string>& files, int coefficient,
                      bool output) {
    for (const auto& file : files) {
      auto tp_iter = compaction_job_info.table_properties.find(file);
      if (tp_iter == compaction_job_info.table_properties.end()) {
        // Missing properties are tolerable for inputs (e.g. already deleted)
        // but must exist for freshly written outputs.
        if (output) {
          fprintf(stderr, "can't find property for output\n");
          abort();
        }
        continue;
      }
      auto ucp_iter = tp_iter->second->user_collected_properties.find(
          BlobFileSizeCollector::kPropertiesName);
      // this sst file doesn't contain any blob index
      if (ucp_iter == tp_iter->second->user_collected_properties.end()) {
        continue;
      }
      std::map<uint64_t, uint64_t> input_blob_files_size;
      std::string s = ucp_iter->second;
      Slice slice{s};
      if (!BlobFileSizeCollector::Decode(&slice, &input_blob_files_size)) {
        fprintf(stderr, "BlobFileSizeCollector::Decode failed\n");
        abort();
      }
      for (const auto& input_bfs : input_blob_files_size) {
        if (output) {
          // Only blob files never seen among the inputs are truly new.
          if (inputs.find(input_bfs.first) == inputs.end()) {
            outputs.insert(input_bfs.first);
          }
        } else {
          inputs.insert(input_bfs.first);
        }
        auto bfs_iter = blob_files_size.find(input_bfs.first);
        if (bfs_iter == blob_files_size.end()) {
          blob_files_size[input_bfs.first] = coefficient * input_bfs.second;
        } else {
          bfs_iter->second += coefficient * input_bfs.second;
        }
      }
    }
  };
  // Inputs must run first so the output pass can exclude pre-existing files.
  calc_bfs(compaction_job_info.input_files, -1, false);
  calc_bfs(compaction_job_info.output_files, 1, true);
  {
    MutexLock l(&mutex_);
    auto bs = vset_->GetBlobStorage(compaction_job_info.cf_id).lock();
    if (!bs) {
      fprintf(stderr, "Column family id:%u Not Found\n",
              compaction_job_info.cf_id);
      return;
    }
    for (const auto& o : outputs) {
      auto file = bs->FindFile(o).lock();
      if (!file) {
        fprintf(stderr, "OnCompactionCompleted get file failed\n");
        abort();
      }
      file->FileStateTransit(BlobFileMeta::FileEvent::kCompactionCompleted);
    }
    for (const auto& bfs : blob_files_size) {
      // blob file size < 0 means discardable size > 0
      if (bfs.second >= 0) {
        continue;
      }
      auto file = bs->FindFile(bfs.first).lock();
      if (!file) {
        // file has been gc out
        continue;
      }
      file->AddDiscardableSize(static_cast<uint64_t>(-bfs.second));
    }
    // Refresh GC scores and kick a GC round for this column family.
    bs->ComputeGCScore();
    AddToGCQueue(compaction_job_info.cf_id);
    MaybeScheduleGC();
  }
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "db/db_impl.h"
#include "util/repeatable_thread.h"
#include "rocksdb/utilities/titandb/db.h"
#include "utilities/titandb/blob_file_manager.h"
#include "utilities/titandb/version_set.h"
namespace rocksdb {
namespace titandb {
// Concrete TitanDB implementation. Wraps a base rocksdb::DB, routing large
// values into blob files managed by a Titan VersionSet, and runs background
// GC and obsolete-file purging.
class TitanDBImpl : public TitanDB {
 public:
  TitanDBImpl(const TitanDBOptions& options, const std::string& dbname);
  ~TitanDBImpl();
  // Opens the base DB (twice: once to learn CF IDs, once with the Titan
  // table factory installed) and replays the Titan MANIFEST.
  Status Open(const std::vector<TitanCFDescriptor>& descs,
              std::vector<ColumnFamilyHandle*>* handles);
  Status Close() override;
  using TitanDB::CreateColumnFamilies;
  Status CreateColumnFamilies(
      const std::vector<TitanCFDescriptor>& descs,
      std::vector<ColumnFamilyHandle*>* handles) override;
  Status DropColumnFamilies(
      const std::vector<ColumnFamilyHandle*>& handles) override;
  using TitanDB::CompactFiles;
  Status CompactFiles(
      const CompactionOptions& compact_options,
      ColumnFamilyHandle* column_family,
      const std::vector<std::string>& input_file_names, const int output_level,
      const int output_path_id = -1,
      std::vector<std::string>* const output_file_names = nullptr,
      CompactionJobInfo* compaction_job_info = nullptr) override;
  // Stops background GC and the purge thread; called by Close().
  Status CloseImpl();
  using TitanDB::Get;
  Status Get(const ReadOptions& options, ColumnFamilyHandle* handle,
             const Slice& key, PinnableSlice* value) override;
  using TitanDB::MultiGet;
  std::vector<Status> MultiGet(const ReadOptions& options,
                               const std::vector<ColumnFamilyHandle*>& handles,
                               const std::vector<Slice>& keys,
                               std::vector<std::string>* values) override;
  using TitanDB::NewIterator;
  Iterator* NewIterator(const ReadOptions& options,
                        ColumnFamilyHandle* handle) override;
  Status NewIterators(const ReadOptions& options,
                      const std::vector<ColumnFamilyHandle*>& handles,
                      std::vector<Iterator*>* iterators) override;
  const Snapshot* GetSnapshot() override;
  void ReleaseSnapshot(const Snapshot* snapshot) override;
  using TitanDB::GetOptions;
  Options GetOptions(ColumnFamilyHandle* column_family) const override;
  // Event hooks invoked by BaseDbListener to keep blob-file statistics and
  // GC scores up to date.
  void OnFlushCompleted(const FlushJobInfo& flush_job_info);
  void OnCompactionCompleted(const CompactionJobInfo& compaction_job_info);
  // Starts the periodic obsolete-file purge thread.
  void StartBackgroundTasks();

 private:
  class FileManager;
  friend class FileManager;
  friend class BlobGCJobTest;
  friend class BaseDbListener;
  friend class TitanDBTest;
  Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* handle,
                 const Slice& key, PinnableSlice* value);
  std::vector<Status> MultiGetImpl(
      const ReadOptions& options,
      const std::vector<ColumnFamilyHandle*>& handles,
      const std::vector<Slice>& keys, std::vector<std::string>* values);
  Iterator* NewIteratorImpl(const ReadOptions& options,
                            ColumnFamilyHandle* handle,
                            std::shared_ptr<ManagedSnapshot> snapshot);
  // REQUIRE: mutex_ held
  void AddToGCQueue(uint32_t column_family_id) {
    gc_queue_.push_back(column_family_id);
  }
  // REQUIRE: gc_queue_ not empty
  // REQUIRE: mutex_ held
  uint32_t PopFirstFromGCQueue() {
    assert(!gc_queue_.empty());
    auto column_family_id = *gc_queue_.begin();
    gc_queue_.pop_front();
    return column_family_id;
  }
  // REQUIRE: mutex_ held
  void MaybeScheduleGC();
  static void BGWorkGC(void* db);
  void BackgroundCallGC();
  Status BackgroundGC(LogBuffer* log_buffer);
  void PurgeObsoleteFiles();
  // Returns the sequence number of the oldest live snapshot in the base DB,
  // or kMaxSequenceNumber if there is none.
  SequenceNumber GetOldestSnapshotSequence() {
    SequenceNumber oldest_snapshot = kMaxSequenceNumber;
    {
      // Need to lock DBImpl mutex before access snapshot list.
      InstrumentedMutexLock l(db_impl_->mutex());
      auto& snapshots = db_impl_->snapshots();
      if (!snapshots.empty()) {
        oldest_snapshot = snapshots.oldest()->GetSequenceNumber();
      }
    }
    return oldest_snapshot;
  }
  // Exclusive lock on the Titan directory; held for the DB's lifetime.
  FileLock* lock_{nullptr};
  // The lock sequence must be Titan.mutex_.Lock() -> Base DB mutex_.Lock()
  // while the unlock sequence must be Base DB mutex.Unlock() ->
  // Titan.mutex_.Unlock() Only if we all obey these sequence, we can prevent
  // potential dead lock.
  port::Mutex mutex_;
  // This condition variable is signaled on these conditions:
  // * whenever bg_gc_scheduled_ goes down to 0
  port::CondVar bg_cv_;
  std::string dbname_;
  std::string dirname_;
  Env* env_;
  EnvOptions env_options_;
  // Non-owning pointer to the base DB's root DBImpl (owned via db_).
  DBImpl* db_impl_;
  TitanDBOptions db_options_;
  // cf_id -> the table factory the user originally configured; restored by
  // GetOptions() so callers do not see the internal TitanTableFactory.
  std::unordered_map<uint32_t, std::shared_ptr<TableFactory>>
      original_table_factory_;
  // handle for purging obsolete blob files at fixed intervals
  std::unique_ptr<RepeatableThread> thread_purge_obsolete_;
  std::unique_ptr<VersionSet> vset_;
  // Blob file numbers currently being written; excluded from purging.
  std::set<uint64_t> pending_outputs_;
  std::shared_ptr<BlobFileManager> blob_manager_;
  // gc_queue_ hold column families that we need to gc.
  // pending_gc_ hold column families that already on gc_queue_.
  std::deque<uint32_t> gc_queue_;
  std::atomic_int bg_gc_scheduled_{0};
  std::atomic_bool shuting_down_{false};
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/db_impl.h"
namespace rocksdb {
namespace titandb {
// Periodic task: collects blob files and manifests no longer referenced by
// any version or live snapshot and removes them from disk. Aborts the
// process if a deletion fails.
void TitanDBImpl::PurgeObsoleteFiles() {
  // Snapshot sequence is fetched before taking the Titan mutex (it locks the
  // base DB mutex internally).
  auto oldest_sequence = GetOldestSnapshotSequence();
  ObsoleteFiles obsolete_files;
  {
    MutexLock l(&mutex_);
    vset_->GetObsoleteFiles(&obsolete_files, oldest_sequence);
  }
  std::vector<std::string> candidates;
  for (auto& blob_file : obsolete_files.blob_files) {
    candidates.emplace_back(
        BlobFileName(db_options_.dirname, std::get<0>(blob_file)));
  }
  for (auto& manifest : obsolete_files.manifests) {
    candidates.emplace_back(std::move(manifest));
  }
  // Dedup so the same path is never deleted twice.
  std::sort(candidates.begin(), candidates.end());
  candidates.erase(std::unique(candidates.begin(), candidates.end()),
                   candidates.end());
  for (const auto& path : candidates) {
    ROCKS_LOG_INFO(db_options_.info_log, "Titan deleting obsolete file [%s]",
                   path.c_str());
    Status s = env_->DeleteFile(path);
    if (!s.ok()) {
      fprintf(stderr, "Titan deleting file [%s] failed, status:%s",
              path.c_str(), s.ToString().c_str());
      abort();
    }
  }
}
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/db_impl.h"
#include "utilities/titandb/blob_file_iterator.h"
#include "utilities/titandb/blob_gc_job.h"
#include "utilities/titandb/blob_gc_picker.h"
namespace rocksdb {
namespace titandb {
// Schedules one background GC job on the LOW-priority pool, unless GC is
// disabled, shutdown is in progress, or the concurrency cap is reached.
// REQUIRE: mutex_ held (bg_gc_scheduled_ is also read/written under it).
void TitanDBImpl::MaybeScheduleGC() {
  mutex_.AssertHeld();
  if (db_options_.disable_background_gc) return;
  if (shuting_down_.load(std::memory_order_acquire)) return;
  if (bg_gc_scheduled_.load(std::memory_order_acquire) >=
      db_options_.max_background_gc)
    return;
  bg_gc_scheduled_.fetch_add(1, std::memory_order_release);
  // `this` is passed both as the job argument and the unschedule tag, so
  // CloseImpl() can cancel pending jobs with UnSchedule(this, ...).
  env_->Schedule(&TitanDBImpl::BGWorkGC, this, Env::Priority::LOW, this);
}
// Trampoline for Env::Schedule; `db` is the TitanDBImpl that scheduled it.
void TitanDBImpl::BGWorkGC(void* db) {
  // static_cast is the correct named cast from void* back to the original
  // pointer type; reinterpret_cast is unnecessary here.
  static_cast<TitanDBImpl*>(db)->BackgroundCallGC();
}
// Thread-pool entry for one GC round: runs BackgroundGC under mutex_,
// flushes logs with the mutex briefly dropped, then decrements the scheduled
// count and wakes any waiter in CloseImpl().
void TitanDBImpl::BackgroundCallGC() {
  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
  {
    MutexLock l(&mutex_);
    assert(bg_gc_scheduled_ > 0);
    BackgroundGC(&log_buffer);
    {
      // Log flushing can block; do it without holding the Titan mutex.
      mutex_.Unlock();
      log_buffer.FlushBufferToLog();
      LogFlush(db_options_.info_log.get());
      mutex_.Lock();
    }
    bg_gc_scheduled_--;
    if (bg_gc_scheduled_ == 0) {
      // signal if
      // * bg_gc_scheduled_ == 0 -- need to wakeup ~TitanDBImpl
      // If none of this is true, there is no need to signal since nobody is
      // waiting for it
      bg_cv_.SignalAll();
    }
    // IMPORTANT: there should be no code after calling SignalAll. This call may
    // signal the DB destructor that it's OK to proceed with destruction. In
    // that case, all DB variables will be dealloacated and referencing them
    // will cause trouble.
  }
}
// Picks and runs one blob GC job for the next column family on the GC queue.
// REQUIRE: mutex_ held; it is dropped only while the job's Run() executes.
Status TitanDBImpl::BackgroundGC(LogBuffer* log_buffer) {
  mutex_.AssertHeld();
  std::unique_ptr<BlobGC> blob_gc;
  std::unique_ptr<ColumnFamilyHandle> cfh;
  Status s;
  if (!gc_queue_.empty()) {
    uint32_t column_family_id = PopFirstFromGCQueue();
    // Keep the shared_ptr alive for this whole scope. The previous
    // `.lock().get()` destroyed the temporary shared_ptr at the end of the
    // statement, so the raw pointer could dangle if it held the last
    // reference (e.g. the CF was dropped concurrently).
    auto bs = vset_->GetBlobStorage(column_family_id).lock();
    if (bs) {
      const auto& titan_cf_options = bs->titan_cf_options();
      std::shared_ptr<BlobGCPicker> blob_gc_picker =
          std::make_shared<BasicBlobGCPicker>(db_options_, titan_cf_options);
      blob_gc = blob_gc_picker->PickBlobGC(bs.get());
      if (blob_gc) {
        cfh = db_impl_->GetColumnFamilyHandleUnlocked(column_family_id);
        assert(column_family_id == cfh->GetID());
        blob_gc->SetColumnFamily(cfh.get());
      }
    }
  }
  // TODO(@DorianZheng) Make sure enough room for GC
  if (UNLIKELY(!blob_gc)) {
    // Nothing to do
    ROCKS_LOG_BUFFER(log_buffer, "Titan GC nothing to do");
  } else {
    BlobGCJob blob_gc_job(blob_gc.get(), db_, &mutex_, db_options_, env_,
                          env_options_, blob_manager_.get(), vset_.get(),
                          log_buffer, &shuting_down_);
    s = blob_gc_job.Prepare();
    if (s.ok()) {
      // Run() does the heavy lifting (reads/writes blob files) without the
      // Titan mutex.
      mutex_.Unlock();
      s = blob_gc_job.Run();
      mutex_.Lock();
    }
    if (s.ok()) {
      s = blob_gc_job.Finish();
    }
    blob_gc->ReleaseGcFiles();
  }
  if (s.ok()) {
    // Done
  } else {
    ROCKS_LOG_WARN(db_options_.info_log, "Titan GC error: %s",
                   s.ToString().c_str());
  }
  return s;
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "db/db_iter.h"
namespace rocksdb {
namespace titandb {
// Iterator over a Titan column family. Wraps the base DB's iterator and,
// whenever the current entry is a blob index, eagerly fetches the blob
// record so value() returns the real payload. Holds an optional pinned
// snapshot so referenced blob files stay alive for the iterator's lifetime.
class TitanDBIterator : public Iterator {
 public:
  TitanDBIterator(const ReadOptions& options, BlobStorage* storage,
                  std::shared_ptr<ManagedSnapshot> snap,
                  std::unique_ptr<ArenaWrappedDBIter> iter)
      : options_(options),
        storage_(storage),
        snap_(snap),
        iter_(std::move(iter)) {}

  // Valid only while the inner iterator is valid AND the last blob fetch
  // succeeded.
  bool Valid() const override { return iter_->Valid() && status_.ok(); }

  Status status() const override {
    // assume volatile inner iter
    if(status_.ok()) {
      return iter_->status();
    } else {
      return status_;
    }
  }

  // Every positioning call is followed by GetBlobValue() so that value() can
  // immediately serve the blob payload.
  void SeekToFirst() override {
    iter_->SeekToFirst();
    GetBlobValue();
  }
  void SeekToLast() override {
    iter_->SeekToLast();
    GetBlobValue();
  }
  void Seek(const Slice& target) override {
    iter_->Seek(target);
    GetBlobValue();
  }
  void SeekForPrev(const Slice& target) override {
    iter_->SeekForPrev(target);
    GetBlobValue();
  }
  void Next() override {
    assert(Valid());
    iter_->Next();
    GetBlobValue();
  }
  void Prev() override {
    assert(Valid());
    iter_->Prev();
    GetBlobValue();
  }

  Slice key() const override {
    assert(Valid());
    return iter_->key();
  }
  // Inline (non-blob) values come straight from the base iterator; blob
  // values were resolved into record_ by the last GetBlobValue().
  Slice value() const override {
    assert(Valid());
    if (!iter_->IsBlob()) return iter_->value();
    return record_.value;
  }

 private:
  // Resolves the current entry when it is a blob index: decodes the index,
  // opens (and caches) a prefetcher for the blob file, and reads the record
  // into record_/buffer_. Sets status_ on failure.
  void GetBlobValue() {
    if (!iter_->Valid() || !iter_->IsBlob()) {
      status_ = iter_->status();
      return;
    }
    assert(iter_->status().ok());
    BlobIndex index;
    status_ = DecodeInto(iter_->value(), &index);
    if (!status_.ok()) {
      fprintf(stderr, "GetBlobValue decode blob index err:%s\n",
              status_.ToString().c_str());
      abort();
    }
    // One prefetcher per blob file, cached across entries of this iterator.
    auto it = files_.find(index.file_number);
    if (it == files_.end()) {
      std::unique_ptr<BlobFilePrefetcher> prefetcher;
      status_ = storage_->NewPrefetcher(index.file_number, &prefetcher);
      if (status_.IsCorruption()) {
        fprintf(stderr,
                "key:%s GetBlobValue err:%s with sequence number:%" PRIu64 "\n",
                iter_->key().ToString(true).c_str(), status_.ToString().c_str(),
                options_.snapshot->GetSequenceNumber());
      }
      if (!status_.ok()) return;
      it = files_.emplace(index.file_number, std::move(prefetcher)).first;
    }
    buffer_.Reset();
    status_ = it->second->Get(options_, index.blob_handle, &record_, &buffer_);
  }

  Status status_;
  BlobRecord record_;
  PinnableSlice buffer_;
  ReadOptions options_;
  BlobStorage* storage_;
  // Keeps the snapshot (and thus referenced blob files) alive; may be null
  // when the caller supplied its own snapshot.
  std::shared_ptr<ManagedSnapshot> snap_;
  std::unique_ptr<ArenaWrappedDBIter> iter_;
  // file number -> cached prefetcher for that blob file.
  std::map<uint64_t, std::unique_ptr<BlobFilePrefetcher>> files_;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/options.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "rocksdb/convenience.h"
namespace rocksdb {
namespace titandb {
// Renders the Titan-specific options as "key = value" lines under a
// [titandb] section header.
std::string TitanCFOptions::ToString() const {
  char line[256];
  std::string result = "[titandb]\n";
  snprintf(line, sizeof(line), "min_blob_size = %" PRIu64 "\n", min_blob_size);
  result.append(line);
  std::string compression;
  GetStringFromCompressionType(&compression, blob_file_compression);
  snprintf(line, sizeof(line), "blob_file_compression = %s\n",
           compression.c_str());
  result.append(line);
  snprintf(line, sizeof(line), "blob_file_target_size = %" PRIu64 "\n",
           blob_file_target_size);
  result.append(line);
  return result;
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "rocksdb/options.h"
namespace rocksdb {
namespace titandb {
// DB-wide options for TitanDB, extending rocksdb::DBOptions.
struct TitanDBOptions : public DBOptions {
  // The directory to store data specific to TitanDB alongside with
  // the base DB.
  //
  // Default: {dbname}/titandb
  std::string dirname;

  // Disable background GC
  //
  // Default: false
  bool disable_background_gc{false};

  // Max background GC thread
  //
  // Default: 1
  int32_t max_background_gc{1};

  TitanDBOptions() = default;
  explicit TitanDBOptions(const DBOptions& options) : DBOptions(options) {}

  // Assigns only the DBOptions subobject, leaving Titan fields untouched.
  TitanDBOptions& operator=(const DBOptions& options) {
    // static_cast is the idiomatic zero-cost derived-to-base cast; the
    // previous dynamic_cast added nothing here.
    *static_cast<DBOptions*>(this) = options;
    return *this;
  }
};
// Per-column-family options for TitanDB, extending
// rocksdb::ColumnFamilyOptions with blob-file tuning knobs.
struct TitanCFOptions : public ColumnFamilyOptions {
  // The smallest value to store in blob files. Value smaller than
  // this threshold will be inlined in base DB.
  //
  // Default: 4096
  uint64_t min_blob_size{4096};

  // The compression algorithm used to compress data in blob files.
  //
  // Default: kNoCompression
  CompressionType blob_file_compression{kNoCompression};

  // The desirable blob file size. This is not a hard limit but a wish.
  //
  // Default: 256MB
  uint64_t blob_file_target_size{256 << 20};

  // If non-NULL use the specified cache for blob records.
  //
  // Default: nullptr
  std::shared_ptr<Cache> blob_cache;

  // Max batch size for gc
  //
  // Default: 1GB
  uint64_t max_gc_batch_size{1 << 30};

  // Min batch size for gc
  //
  // Default: 512MB
  uint64_t min_gc_batch_size{512 << 20};

  // The ratio of how much discardable size of a blob file can be GC
  //
  // Default: 0.5
  float blob_file_discardable_ratio{0.5};

  // The ratio of how much size of a blob file need to be sample before GC
  //
  // Default: 0.1
  float sample_file_size_ratio{0.1};

  // The blob file size less than this option will be mark gc
  //
  // Default: 8MB
  uint64_t merge_small_file_threshold{8 << 20};

  TitanCFOptions() = default;
  explicit TitanCFOptions(const ColumnFamilyOptions& options)
      : ColumnFamilyOptions(options) {}

  // Assigns only the ColumnFamilyOptions subobject, leaving Titan fields
  // untouched.
  TitanCFOptions& operator=(const ColumnFamilyOptions& options) {
    // static_cast is the idiomatic zero-cost derived-to-base cast; the
    // previous dynamic_cast added nothing here.
    *static_cast<ColumnFamilyOptions*>(this) = options;
    return *this;
  }

  // Renders the Titan-specific options in "key = value" form.
  std::string ToString() const;
};
// Combined DB + default-CF options, mirroring rocksdb::Options.
struct TitanOptions : public TitanDBOptions, public TitanCFOptions {
  TitanOptions() = default;
  explicit TitanOptions(const Options& options)
      : TitanDBOptions(options), TitanCFOptions(options) {}

  // Assigns both base subobjects from a rocksdb::Options.
  TitanOptions& operator=(const Options& options) {
    // static_cast is the idiomatic zero-cost cast between related class
    // types here; the previous dynamic_cast added nothing.
    *static_cast<TitanDBOptions*>(this) = options;
    *static_cast<TitanCFOptions*>(this) = options;
    return *this;
  }

  // Converts back to a rocksdb::Options, dropping Titan-specific fields.
  operator Options() {
    Options options;
    *static_cast<DBOptions*>(&options) = *static_cast<DBOptions*>(this);
    *static_cast<ColumnFamilyOptions*>(&options) =
        *static_cast<ColumnFamilyOptions*>(this);
    return options;
  }
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/table_builder.h"
namespace rocksdb {
namespace titandb {
// Adds a key/value to the table. Non-value entries and values below the
// blob-size threshold go directly into the base SST; large plain values are
// written to a blob file and replaced in the SST by a blob index entry.
void TitanTableBuilder::Add(const Slice& key, const Slice& value) {
  if (!ok()) return;
  ParsedInternalKey ikey;
  if (!ParseInternalKey(key, &ikey)) {
    // Use a descriptive message instead of Status::Corruption(Slice()),
    // whose empty text made the failure undiagnosable from logs.
    status_ = Status::Corruption("TitanTableBuilder: invalid internal key");
    return;
  }
  // Only plain values at or above min_blob_size are separated; merges,
  // deletes and small values stay inline in the base table.
  if (ikey.type != kTypeValue || value.size() < cf_options_.min_blob_size) {
    base_builder_->Add(key, value);
    return;
  }
  std::string index_value;
  AddBlob(ikey.user_key, value, &index_value);
  if (!ok()) return;
  // Rewrite the entry as a blob index pointing into the blob file.
  ikey.type = kTypeBlobIndex;
  std::string index_key;
  AppendInternalKey(&index_key, ikey);
  base_builder_->Add(index_key, index_value);
}
// Appends (key, value) as a record to the current blob file, lazily
// creating the file on first use, and encodes the resulting blob index
// into *index_value.
void TitanTableBuilder::AddBlob(const Slice& key, const Slice& value,
                                std::string* index_value) {
  if (!ok()) return;
  // Open a blob file the first time a blob record is added.
  if (!blob_builder_) {
    status_ = blob_manager_->NewFile(&blob_handle_);
    if (!ok()) return;
    blob_builder_.reset(
        new BlobFileBuilder(cf_options_, blob_handle_->GetFile()));
  }
  BlobRecord record;
  record.key = key;
  record.value = value;
  BlobIndex index;
  index.file_number = blob_handle_->GetNumber();
  blob_builder_->Add(record, &index.blob_handle);
  if (!ok()) return;
  index.EncodeTo(index_value);
}
// Returns the first error among this builder's own status, the base table
// builder's status, and (if a blob file was opened) the blob builder's.
Status TitanTableBuilder::status() const {
  if (!status_.ok()) return status_;
  Status s = base_builder_->status();
  if (s.ok() && blob_builder_) {
    s = blob_builder_->status();
  }
  return s;
}
// Finalizes both the base SST and, if one was created, the blob file.
// On success the finished blob file is registered with the blob file
// manager; on failure the partial file is deleted instead.
Status TitanTableBuilder::Finish() {
  base_builder_->Finish();
  if (blob_builder_) {
    blob_builder_->Finish();
    if (ok()) {
      std::shared_ptr<BlobFileMeta> file = std::make_shared<BlobFileMeta>(
          blob_handle_->GetNumber(), blob_handle_->GetFile()->GetFileSize());
      // Mark the file as a flush/compaction output before handing it over.
      file->FileStateTransit(BlobFileMeta::FileEvent::kFlushOrCompactionOutput);
      status_ =
          blob_manager_->FinishFile(cf_id_, file, std::move(blob_handle_));
      // ROCKS_LOG_INFO(db_options_.info_log, "[%u] AddFile %lu", cf_id_,
      // file->file_number_);
    } else {
      // An earlier error occurred: drop the partially written blob file.
      status_ = blob_manager_->DeleteFile(std::move(blob_handle_));
    }
  }
  return status();
}
// Abandons the build: discards the base SST contents and removes any
// partially written blob file.
void TitanTableBuilder::Abandon() {
  base_builder_->Abandon();
  if (!blob_builder_) return;
  blob_builder_->Abandon();
  status_ = blob_manager_->DeleteFile(std::move(blob_handle_));
}
// The accessors below forward to the base table builder; blob records are
// not counted or sized separately.
uint64_t TitanTableBuilder::NumEntries() const {
  return base_builder_->NumEntries();
}
// Size of the base SST only; the blob file size is not included.
uint64_t TitanTableBuilder::FileSize() const {
  return base_builder_->FileSize();
}
bool TitanTableBuilder::NeedCompact() const {
  return base_builder_->NeedCompact();
}
TableProperties TitanTableBuilder::GetTableProperties() const {
  return base_builder_->GetTableProperties();
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "table/table_builder.h"
#include "utilities/titandb/blob_file_builder.h"
#include "utilities/titandb/blob_file_manager.h"
#include "utilities/titandb/options.h"
namespace rocksdb {
namespace titandb {
// A TableBuilder that separates large values into blob files while
// delegating SST construction to a wrapped base TableBuilder.
class TitanTableBuilder : public TableBuilder {
 public:
  TitanTableBuilder(uint32_t cf_id, const TitanDBOptions& db_options,
                    const TitanCFOptions& cf_options,
                    std::unique_ptr<TableBuilder> base_builder,
                    std::shared_ptr<BlobFileManager> blob_manager)
      : cf_id_(cf_id),
        db_options_(db_options),
        cf_options_(cf_options),
        base_builder_(std::move(base_builder)),
        // Move the shared_ptr instead of copying to avoid an unnecessary
        // atomic refcount increment.
        blob_manager_(std::move(blob_manager)) {}

  void Add(const Slice& key, const Slice& value) override;
  Status status() const override;
  Status Finish() override;
  void Abandon() override;
  uint64_t NumEntries() const override;
  uint64_t FileSize() const override;
  bool NeedCompact() const override;
  TableProperties GetTableProperties() const override;

 private:
  bool ok() const { return status().ok(); }

  // Writes (key, value) to the blob file and encodes the blob index into
  // *index_value.
  void AddBlob(const Slice& key, const Slice& value, std::string* index_value);

  Status status_;
  uint32_t cf_id_;
  TitanDBOptions db_options_;
  TitanCFOptions cf_options_;
  std::unique_ptr<TableBuilder> base_builder_;
  std::unique_ptr<BlobFileHandle> blob_handle_;
  std::shared_ptr<BlobFileManager> blob_manager_;
  std::unique_ptr<BlobFileBuilder> blob_builder_;
};
} // namespace titandb
} // namespace rocksdb
#include "table/table_builder.h"
#include "table/table_reader.h"
#include "util/filename.h"
#include "util/testharness.h"
#include "utilities/titandb/blob_file_manager.h"
#include "utilities/titandb/blob_file_reader.h"
#include "utilities/titandb/table_factory.h"
namespace rocksdb {
namespace titandb {
// Values at or above this size are separated into the blob file.
const uint64_t kMinBlobSize = 128;
// Fixed file number assigned by the test FileManager below.
const uint64_t kTestFileNumber = 123;
// A minimal BlobFileManager for tests: always creates the same blob file
// (kTestFileNumber) under db_options.dirname, syncs/closes it on finish,
// and deletes it on abandon.
class FileManager : public BlobFileManager {
 public:
  // explicit: a single-argument constructor should not act as an implicit
  // conversion from TitanDBOptions.
  explicit FileManager(const TitanDBOptions& db_options)
      : db_options_(db_options) {}

  Status NewFile(std::unique_ptr<BlobFileHandle>* handle) override {
    auto number = kTestFileNumber;
    auto name = BlobFileName(db_options_.dirname, number);
    std::unique_ptr<WritableFileWriter> file;
    {
      std::unique_ptr<WritableFile> f;
      Status s = env_->NewWritableFile(name, &f, env_options_);
      if (!s.ok()) return s;
      file.reset(new WritableFileWriter(std::move(f), name, env_options_));
    }
    handle->reset(new FileHandle(number, name, std::move(file)));
    return Status::OK();
  }

  Status FinishFile(uint32_t /*cf_id*/, std::shared_ptr<BlobFileMeta> /*file*/,
                    std::unique_ptr<BlobFileHandle>&& handle) override {
    Status s = handle->GetFile()->Sync(true);
    if (s.ok()) {
      s = handle->GetFile()->Close();
    }
    return s;
  }

  Status DeleteFile(std::unique_ptr<BlobFileHandle>&& handle) override {
    return env_->DeleteFile(handle->GetName());
  }

 private:
  // Trivial handle that owns the writable file for the blob.
  class FileHandle : public BlobFileHandle {
   public:
    FileHandle(uint64_t number, const std::string& name,
               std::unique_ptr<WritableFileWriter> file)
        : number_(number), name_(name), file_(std::move(file)) {}
    uint64_t GetNumber() const override { return number_; }
    const std::string& GetName() const override { return name_; }
    WritableFileWriter* GetFile() const override { return file_.get(); }

   private:
    friend class FileManager;
    uint64_t number_;
    std::string name_;
    std::unique_ptr<WritableFileWriter> file_;
  };

  Env* env_{Env::Default()};
  EnvOptions env_options_;
  TitanDBOptions db_options_;
};
// Fixture that wires a TitanTableFactory to the test FileManager above and
// provides helpers to create writers/readers for the base SST and the blob
// file inside a temporary directory.
class TableBuilderTest : public testing::Test {
 public:
  TableBuilderTest()
      : cf_moptions_(cf_options_),
        cf_ioptions_(options_),
        tmpdir_(test::TmpDir(env_)),
        base_name_(tmpdir_ + "/base"),
        blob_name_(BlobFileName(tmpdir_, kTestFileNumber)) {
    db_options_.dirname = tmpdir_;
    cf_options_.min_blob_size = kMinBlobSize;
    blob_manager_.reset(new FileManager(db_options_));
    table_factory_.reset(new TitanTableFactory(db_options_, cf_options_, blob_manager_));
  }
  // Best-effort cleanup; statuses intentionally ignored.
  ~TableBuilderTest() {
    env_->DeleteFile(base_name_);
    env_->DeleteFile(blob_name_);
    env_->DeleteDir(tmpdir_);
  }
  // Asserts the blob file does (or does not) exist on disk.
  void BlobFileExists(bool exists) {
    Status s = env_->FileExists(blob_name_);
    if (exists) {
      ASSERT_TRUE(s.ok());
    } else {
      ASSERT_TRUE(s.IsNotFound());
    }
  }
  void NewFileWriter(const std::string& fname,
                     std::unique_ptr<WritableFileWriter>* result) {
    std::unique_ptr<WritableFile> file;
    ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_));
    result->reset(new WritableFileWriter(std::move(file), fname, env_options_));
  }
  void NewFileReader(const std::string& fname,
                     std::unique_ptr<RandomAccessFileReader>* result) {
    std::unique_ptr<RandomAccessFile> file;
    ASSERT_OK(env_->NewRandomAccessFile(fname, &file, env_options_));
    result->reset(new RandomAccessFileReader(std::move(file), fname, env_));
  }
  void NewBaseFileWriter(std::unique_ptr<WritableFileWriter>* result) {
    NewFileWriter(base_name_, result);
  }
  void NewBaseFileReader(std::unique_ptr<RandomAccessFileReader>* result) {
    NewFileReader(base_name_, result);
  }
  // Opens the blob file written by the builder for verification.
  void NewBlobFileReader(std::unique_ptr<BlobFileReader>* result) {
    std::unique_ptr<RandomAccessFileReader> file;
    NewFileReader(blob_name_, &file);
    uint64_t file_size = 0;
    ASSERT_OK(env_->GetFileSize(blob_name_, &file_size));
    ASSERT_OK(
        BlobFileReader::Open(cf_options_, std::move(file), file_size, result));
  }
  // Opens the base SST through the Titan table factory.
  void NewTableReader(std::unique_ptr<TableReader>* result) {
    std::unique_ptr<RandomAccessFileReader> file;
    NewBaseFileReader(&file);
    uint64_t file_size = 0;
    ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size));
    TableReaderOptions options(cf_ioptions_, nullptr, env_options_,
                               cf_ioptions_.internal_comparator);
    ASSERT_OK(table_factory_->NewTableReader(options, std::move(file),
                                             file_size, result));
  }
  void NewTableBuilder(WritableFileWriter* file,
                       std::unique_ptr<TableBuilder>* result) {
    TableBuilderOptions options(cf_ioptions_, cf_moptions_,
                                cf_ioptions_.internal_comparator, &collectors_,
                                kNoCompression, CompressionOptions(), nullptr,
                                false, kDefaultColumnFamilyName, 0);
    result->reset(table_factory_->NewTableBuilder(options, 0, file));
  }
  Env* env_{Env::Default()};
  EnvOptions env_options_;
  Options options_;
  TitanDBOptions db_options_;
  TitanCFOptions cf_options_;
  MutableCFOptions cf_moptions_;
  ImmutableCFOptions cf_ioptions_;
  std::vector<std::unique_ptr<IntTblPropCollectorFactory>> collectors_;
  // Temp directory holding both the base SST and the blob file.
  std::string tmpdir_;
  std::string base_name_;
  std::string blob_name_;
  std::unique_ptr<TableFactory> table_factory_;
  std::shared_ptr<BlobFileManager> blob_manager_;
};
// Alternates small (inlined) and large (blobbed) values, then verifies the
// base SST contains plain values for the former and blob indexes resolving
// to the correct records for the latter.
TEST_F(TableBuilderTest, Basic) {
  std::unique_ptr<WritableFileWriter> base_file;
  NewBaseFileWriter(&base_file);
  std::unique_ptr<TableBuilder> table_builder;
  NewTableBuilder(base_file.get(), &table_builder);
  // Build a base table and a blob file.
  const int n = 100;
  for (char i = 0; i < n; i++) {
    std::string key(1, i);
    InternalKey ikey(key, 1, kTypeValue);
    std::string value;
    if (i % 2 == 0) {
      value = std::string(1, i);
    } else {
      value = std::string(kMinBlobSize, i);
    }
    table_builder->Add(ikey.Encode(), value);
  }
  ASSERT_OK(table_builder->Finish());
  ASSERT_OK(base_file->Sync(true));
  ASSERT_OK(base_file->Close());
  std::unique_ptr<TableReader> base_reader;
  NewTableReader(&base_reader);
  std::unique_ptr<BlobFileReader> blob_reader;
  NewBlobFileReader(&blob_reader);
  ReadOptions ro;
  std::unique_ptr<InternalIterator> iter;
  iter.reset(base_reader->NewIterator(ro, nullptr));
  iter->SeekToFirst();
  for (char i = 0; i < n; i++) {
    ASSERT_TRUE(iter->Valid());
    std::string key(1, i);
    ParsedInternalKey ikey;
    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
    ASSERT_EQ(ikey.user_key, key);
    if (i % 2 == 0) {
      // Small value: stored inline in the base SST.
      ASSERT_EQ(ikey.type, kTypeValue);
      ASSERT_EQ(iter->value(), std::string(1, i));
    } else {
      // Large value: SST holds a blob index; resolve it via the blob file.
      ASSERT_EQ(ikey.type, kTypeBlobIndex);
      BlobIndex index;
      ASSERT_OK(DecodeInto(iter->value(), &index));
      ASSERT_EQ(index.file_number, kTestFileNumber);
      BlobRecord record;
      PinnableSlice buffer;
      ASSERT_OK(blob_reader->Get(ro, index.blob_handle, &record, &buffer));
      ASSERT_EQ(record.key, key);
      ASSERT_EQ(record.value, std::string(kMinBlobSize, i));
    }
    iter->Next();
  }
}
// All values below min_blob_size: no blob file should ever be created.
TEST_F(TableBuilderTest, NoBlob) {
  std::unique_ptr<WritableFileWriter> base_file;
  NewBaseFileWriter(&base_file);
  std::unique_ptr<TableBuilder> table_builder;
  NewTableBuilder(base_file.get(), &table_builder);
  const int n = 100;
  for (char i = 0; i < n; i++) {
    std::string key(1, i);
    InternalKey ikey(key, 1, kTypeValue);
    std::string value(1, i);
    table_builder->Add(ikey.Encode(), value);
  }
  ASSERT_OK(table_builder->Finish());
  ASSERT_OK(base_file->Sync(true));
  ASSERT_OK(base_file->Close());
  BlobFileExists(false);
  std::unique_ptr<TableReader> base_reader;
  NewTableReader(&base_reader);
  ReadOptions ro;
  std::unique_ptr<InternalIterator> iter;
  iter.reset(base_reader->NewIterator(ro, nullptr));
  iter->SeekToFirst();
  for (char i = 0; i < n; i++) {
    ASSERT_TRUE(iter->Valid());
    std::string key(1, i);
    ParsedInternalKey ikey;
    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
    ASSERT_EQ(ikey.user_key, key);
    ASSERT_EQ(ikey.type, kTypeValue);
    ASSERT_EQ(iter->value(), std::string(1, i));
    iter->Next();
  }
}
// Abandoning the builder must remove the partially written blob file.
TEST_F(TableBuilderTest, Abandon) {
  std::unique_ptr<WritableFileWriter> base_file;
  NewBaseFileWriter(&base_file);
  std::unique_ptr<TableBuilder> table_builder;
  NewTableBuilder(base_file.get(), &table_builder);
  const int n = 100;
  for (char i = 0; i < n; i++) {
    std::string key(1, i);
    InternalKey ikey(key, 1, kTypeValue);
    std::string value;
    if (i % 2 == 0) {
      value = std::string(1, i);
    } else {
      value = std::string(kMinBlobSize, i);
    }
    table_builder->Add(ikey.Encode(), value);
  }
  BlobFileExists(true);
  table_builder->Abandon();
  BlobFileExists(false);
}
// NumEntries counts every added entry, including those moved to the blob
// file.
TEST_F(TableBuilderTest, NumEntries) {
  std::unique_ptr<WritableFileWriter> base_file;
  NewBaseFileWriter(&base_file);
  std::unique_ptr<TableBuilder> table_builder;
  NewTableBuilder(base_file.get(), &table_builder);
  // Build a base table and a blob file.
  const int n = 100;
  for (char i = 0; i < n; i++) {
    std::string key(1, i);
    InternalKey ikey(key, 1, kTypeValue);
    std::string value;
    if (i % 2 == 0) {
      value = std::string(1, i);
    } else {
      value = std::string(kMinBlobSize, i);
    }
    table_builder->Add(ikey.Encode(), value);
  }
  ASSERT_EQ(n, table_builder->NumEntries());
  ASSERT_OK(table_builder->Finish());
}
} // namespace titandb
} // namespace rocksdb
// gtest entry point.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#include "utilities/titandb/table_factory.h"
#include "utilities/titandb/table_builder.h"
namespace rocksdb {
namespace titandb {
// Reading needs no Titan-specific handling at the table level (blob
// resolution happens above the table reader), so delegate directly to the
// wrapped base factory.
Status TitanTableFactory::NewTableReader(
    const TableReaderOptions& options,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    std::unique_ptr<TableReader>* result,
    bool prefetch_index_and_filter_in_cache) const {
  return base_factory_->NewTableReader(options, std::move(file), file_size,
                                       result,
                                       prefetch_index_and_filter_in_cache);
}
// Wraps the base factory's builder in a TitanTableBuilder so that large
// values are separated into blob files during table construction.
TableBuilder* TitanTableFactory::NewTableBuilder(
    const TableBuilderOptions& options, uint32_t column_family_id,
    WritableFileWriter* file) const {
  auto* base = base_factory_->NewTableBuilder(options, column_family_id, file);
  return new TitanTableBuilder(column_family_id, db_options_, cf_options_,
                               std::unique_ptr<TableBuilder>(base),
                               blob_manager_);
}
// Appends the Titan CF options dump to the base factory's printable
// options so both layers show up in LOG.
std::string TitanTableFactory::GetPrintableTableOptions() const {
  return base_factory_->GetPrintableTableOptions() + cf_options_.ToString();
}
} // namespace titandb
} // namespace rocksdb
#pragma once

#include <utility>

#include "rocksdb/table.h"
#include "utilities/titandb/blob_file_manager.h"
#include "utilities/titandb/options.h"
namespace rocksdb {
namespace titandb {
// TableFactory decorator: builds Titan tables (value/blob separation on
// write) while delegating reads and option plumbing to the wrapped base
// factory taken from cf_options.table_factory.
class TitanTableFactory : public TableFactory {
 public:
  TitanTableFactory(const TitanDBOptions& db_options,
                    const TitanCFOptions& cf_options,
                    std::shared_ptr<BlobFileManager> blob_manager)
      : db_options_(db_options),
        cf_options_(cf_options),
        base_factory_(cf_options.table_factory),
        // Move the by-value shared_ptr into the member to avoid an
        // unnecessary atomic refcount increment.
        blob_manager_(std::move(blob_manager)) {}

  const char* Name() const override { return "TitanTable"; }

  Status NewTableReader(
      const TableReaderOptions& options,
      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
      std::unique_ptr<TableReader>* result,
      bool prefetch_index_and_filter_in_cache = true) const override;

  TableBuilder* NewTableBuilder(const TableBuilderOptions& options,
                                uint32_t column_family_id,
                                WritableFileWriter* file) const override;

  std::string GetPrintableTableOptions() const override;

  Status SanitizeOptions(const DBOptions& db_options,
                         const ColumnFamilyOptions& cf_options) const override {
    // Override this when we need to validate our options.
    return base_factory_->SanitizeOptions(db_options, cf_options);
  }

  Status GetOptionString(std::string* opt_string,
                         const std::string& delimiter) const override {
    // Override this when we need to persist our options.
    return base_factory_->GetOptionString(opt_string, delimiter);
  }

  void* GetOptions() override { return base_factory_->GetOptions(); }

  bool IsDeleteRangeSupported() const override {
    return base_factory_->IsDeleteRangeSupported();
  }

 private:
  TitanDBOptions db_options_;
  TitanCFOptions cf_options_;
  std::shared_ptr<TableFactory> base_factory_;
  std::shared_ptr<BlobFileManager> blob_manager_;
};
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "rocksdb/cache.h"
#include "util/compression.h"
#include "util/testharness.h"
namespace rocksdb {
namespace titandb {
// Round-trips |input| through its EncodeTo/DecodeInto implementation and
// asserts the decoded copy compares equal to the original.
template <typename T>
void CheckCodec(const T& input) {
  std::string buffer;
  input.EncodeTo(&buffer);
  T output;
  ASSERT_OK(DecodeInto(buffer, &output));
  ASSERT_EQ(output, input);
}
} // namespace titandb
} // namespace rocksdb
#include "rocksdb/utilities/titan_c.h"
#include "rocksdb/db.h"
#include "rocksdb/utilities/titandb/db.h"
using rocksdb::Cache;
using rocksdb::CompressionType;
using rocksdb::DB;
using rocksdb::Options;
using rocksdb::Status;
using rocksdb::titandb::TitanDB;
using rocksdb::titandb::TitanOptions;
extern "C" {
// Opaque C wrappers around the underlying C++ objects exposed through the
// Titan C API.
struct rocksdb_t {
  DB* rep;
};
struct rocksdb_options_t {
  Options rep;
};
struct rocksdb_cache_t {
  std::shared_ptr<Cache> rep;
};
struct titandb_options_t {
  TitanOptions rep;
};
// Copies a non-OK status message into *errptr (malloc'd via strdup; caller
// frees). Returns true iff |s| is an error. Any previous *errptr is
// released first.
static bool SaveError(char** errptr, const Status& s) {
  assert(errptr != nullptr);
  if (s.ok()) {
    return false;
  }
  // free(nullptr) is a no-op, so the "first error" and "replace previous
  // error" branches collapse into a single path.
  // TODO(sanjay): Merge with existing error?
  // This is a bug if *errptr is not created by malloc()
  free(*errptr);
  *errptr = strdup(s.ToString().c_str());
  return true;
}
// Opens a TitanDB at |name|; on failure fills *errptr and returns nullptr.
// The returned handle wraps the TitanDB as a plain rocksdb_t so existing C
// API functions can operate on it.
rocksdb_t* titandb_open(const titandb_options_t* options, const char* name,
                        char** errptr) {
  TitanDB* db;
  if (SaveError(errptr, TitanDB::Open(options->rep, name, &db))) {
    return nullptr;
  }
  rocksdb_t* result = new rocksdb_t;
  result->rep = db;
  return result;
}
// C API accessors for TitanOptions; each setter forwards one field.
titandb_options_t* titandb_options_create() { return new titandb_options_t; }
void titandb_options_destroy(titandb_options_t* options) { delete options; }
// Replaces the whole TitanOptions with one derived from plain RocksDB
// options; Titan-specific fields are reset to defaults.
void titandb_options_set_rocksdb(titandb_options_t* options,
                                 rocksdb_options_t* rocksdb) {
  options->rep = TitanOptions(rocksdb->rep);
}
void titandb_options_set_dirname(titandb_options_t* options, const char* name) {
  options->rep.dirname = name;
}
void titandb_options_set_min_blob_size(titandb_options_t* options,
                                       uint64_t size) {
  options->rep.min_blob_size = size;
}
void titandb_options_set_blob_file_compression(titandb_options_t* options,
                                               int compression) {
  options->rep.blob_file_compression =
      static_cast<CompressionType>(compression);
}
// Passing a null cache clears the current blob cache.
void titandb_options_set_blob_cache(titandb_options_t* options,
                                    rocksdb_cache_t* blob_cache) {
  if (blob_cache) {
    options->rep.blob_cache = blob_cache->rep;
  } else {
    options->rep.blob_cache.reset();
  }
}
void titandb_options_set_disable_background_gc(titandb_options_t* options,
                                               unsigned char disable) {
  options->rep.disable_background_gc = disable;
}
void titandb_options_set_max_gc_batch_size(titandb_options_t* options,
                                           uint64_t size) {
  options->rep.max_gc_batch_size = size;
}
void titandb_options_set_min_gc_batch_size(titandb_options_t* options,
                                           uint64_t size) {
  options->rep.min_gc_batch_size = size;
}
void titandb_options_set_blob_file_discardable_ratio(titandb_options_t* options,
                                                     float ratio) {
  options->rep.blob_file_discardable_ratio = ratio;
}
void titandb_options_set_sample_file_size_ratio(titandb_options_t* options,
                                                float ratio) {
  options->rep.sample_file_size_ratio = ratio;
}
void titandb_options_set_merge_small_file_threshold(titandb_options_t* options,
                                                    uint64_t size) {
  options->rep.merge_small_file_threshold = size;
}
} // end extern "C"
#include <inttypes.h>
#include <options/cf_options.h>
#include "rocksdb/utilities/titandb/db.h"
#include "utilities/titandb/db_impl.h"
#include "utilities/titandb/titan_fault_injection_test_env.h"
#include "util/filename.h"
#include "util/random.h"
#include "util/testharness.h"
#include "util/sync_point.h"
#include "blob_file_reader.h"
#include "blob_file_iterator.h"
#include "db_iter.h"
namespace rocksdb {
namespace titandb {
// Best-effort removal of a directory and its immediate children; statuses
// from the Env are intentionally ignored (test-only helper).
void DeleteDir(Env* env, const std::string& dirname) {
  std::vector<std::string> children;
  env->GetChildren(dirname, &children);
  for (const auto& child : children) {
    env->DeleteFile(dirname + "/" + child);
  }
  env->DeleteDir(dirname);
}
// Full-DB test fixture: opens a TitanDB in a temp dir with a small
// min_blob_size so GenValue() alternates between inlined and blobbed
// values.
class TitanDBTest : public testing::Test {
 public:
  TitanDBTest() : dbname_(test::TmpDir()) {
    options_.dirname = dbname_ + "/titandb";
    options_.create_if_missing = true;
    options_.min_blob_size = 32;
    options_.min_gc_batch_size = 1;
    options_.blob_file_compression = CompressionType::kLZ4Compression;
    // Start from a clean slate in case a previous run left files behind.
    DeleteDir(env_, options_.dirname);
    DeleteDir(env_, dbname_);
  }
  ~TitanDBTest() { Close(); }
  // Opens the DB: with the default CF only on first open, otherwise with
  // every column family currently listed on disk.
  void Open() {
    if (cf_names_.empty()) {
      ASSERT_OK(TitanDB::Open(options_, dbname_, &db_));
      db_impl_ = reinterpret_cast<TitanDBImpl*>(db_);
    } else {
      TitanDBOptions db_options(options_);
      TitanCFOptions cf_options(options_);
      cf_names_.clear();
      ASSERT_OK(DB::ListColumnFamilies(db_options, dbname_, &cf_names_));
      std::vector<TitanCFDescriptor> descs;
      for (auto& name : cf_names_) {
        descs.emplace_back(name, cf_options);
      }
      cf_handles_.clear();
      ASSERT_OK(TitanDB::Open(db_options, dbname_, descs, &cf_handles_, &db_));
    }
  }
  void Close() {
    if (!db_) return;
    for (auto& handle : cf_handles_) {
      db_->DestroyColumnFamilyHandle(handle);
    }
    ASSERT_OK(db_->Close());
    delete db_;
    db_ = nullptr;
  }
  void Reopen() {
    Close();
    Open();
  }
  void AddCF(const std::string& name) {
    TitanCFDescriptor desc(name, options_);
    ColumnFamilyHandle* handle = nullptr;
    ASSERT_OK(db_->CreateColumnFamily(desc, &handle));
    cf_names_.emplace_back(name);
    cf_handles_.emplace_back(handle);
  }
  void DropCF(const std::string& name) {
    for (size_t i = 0; i < cf_names_.size(); i++) {
      if (cf_names_[i] != name) continue;
      auto handle = cf_handles_[i];
      ASSERT_OK(db_->DropColumnFamily(handle));
      db_->DestroyColumnFamilyHandle(handle);
      cf_names_.erase(cf_names_.begin() + i);
      cf_handles_.erase(cf_handles_.begin() + i);
      break;
    }
  }
  // Writes key k to the default CF and every extra CF; records the pair in
  // *data when provided.
  void Put(uint64_t k, std::map<std::string, std::string>* data = nullptr) {
    WriteOptions wopts;
    std::string key = GenKey(k);
    std::string value = GenValue(k);
    ASSERT_OK(db_->Put(wopts, key, value));
    for (auto& handle : cf_handles_) {
      ASSERT_OK(db_->Put(wopts, handle, key, value));
    }
    if (data != nullptr) {
      data->emplace(key, value);
    }
  }
  void Flush() {
    FlushOptions fopts;
    ASSERT_OK(db_->Flush(fopts));
    for (auto& handle : cf_handles_) {
      ASSERT_OK(db_->Flush(fopts, handle));
    }
  }
  std::weak_ptr<BlobStorage> GetBlobStorage(ColumnFamilyHandle* cf_handle = nullptr) {
    if(cf_handle == nullptr) {
      cf_handle = db_->DefaultColumnFamily();
    }
    return db_impl_->vset_->GetBlobStorage(cf_handle->GetID());
  }
  // Checks every (key, value) through Get, MultiGet and all iterator
  // flavors, across every column family.
  void VerifyDB(const std::map<std::string, std::string>& data, ReadOptions ropts = ReadOptions()) {
    for (auto& kv : data) {
      std::string value;
      ASSERT_OK(db_->Get(ropts, kv.first, &value));
      ASSERT_EQ(value, kv.second);
      for (auto& handle : cf_handles_) {
        ASSERT_OK(db_->Get(ropts, handle, kv.first, &value));
        ASSERT_EQ(value, kv.second);
      }
      std::vector<Slice> keys(cf_handles_.size(), kv.first);
      std::vector<std::string> values;
      auto res = db_->MultiGet(ropts, cf_handles_, keys, &values);
      for (auto& s : res) ASSERT_OK(s);
      for (auto& v : values) ASSERT_EQ(v, kv.second);
    }
    std::vector<Iterator*> iterators;
    db_->NewIterators(ropts, cf_handles_, &iterators);
    iterators.emplace_back(db_->NewIterator(ropts));
    for (auto& handle : cf_handles_) {
      iterators.emplace_back(db_->NewIterator(ropts, handle));
    }
    for (auto& iter : iterators) {
      iter->SeekToFirst();
      for (auto& kv : data) {
        ASSERT_EQ(iter->Valid(), true);
        ASSERT_EQ(iter->key(), kv.first);
        ASSERT_EQ(iter->value(), kv.second);
        iter->Next();
      }
      delete iter;
    }
  }
  // Iterates blob file |file_number| and checks that each non-inlined
  // entry of |data| appears with the expected key/value, in order.
  void VerifyBlob(
      uint64_t file_number,
      const std::map<std::string, std::string>& data) {
    // Open blob file and iterate in-file records
    EnvOptions env_opt;
    uint64_t file_size = 0;
    std::map<std::string, std::string> file_data;
    std::unique_ptr<RandomAccessFileReader> readable_file;
    std::string file_name = BlobFileName(options_.dirname, file_number);
    ASSERT_OK(env_->GetFileSize(file_name, &file_size));
    NewBlobFileReader(file_number, 0, options_, env_opt, env_,
                      &readable_file);
    BlobFileIterator iter(std::move(readable_file),
                          file_number,
                          file_size,
                          options_
    );
    iter.SeekToFirst();
    for(auto& kv : data) {
      // Values below min_blob_size were inlined and are not in this file.
      if(kv.second.size() < options_.min_blob_size) {
        continue;
      }
      ASSERT_EQ(iter.Valid(), true);
      ASSERT_EQ(iter.key(), kv.first);
      ASSERT_EQ(iter.value(), kv.second);
      iter.Next();
    }
  }
  std::string GenKey(uint64_t i) {
    char buf[64];
    snprintf(buf, sizeof(buf), "k-%08" PRIu64, i);
    return buf;
  }
  // Even keys get values just under min_blob_size (stored inline); odd
  // keys just over (stored in blob files).
  std::string GenValue(uint64_t k) {
    if (k % 2 == 0) {
      return std::string(options_.min_blob_size - 1, 'v');
    } else {
      return std::string(options_.min_blob_size + 1, 'v');
    }
  }
  // Verifies the user-supplied table factory is still visible through
  // GetOptions() after TitanDB wraps it.
  void TestTableFactory() {
    DeleteDir(env_, options_.dirname);
    DeleteDir(env_, dbname_);
    Options options;
    options.create_if_missing = true;
    options.table_factory.reset(
        NewBlockBasedTableFactory(BlockBasedTableOptions()));
    auto* original_table_factory = options.table_factory.get();
    TitanDB* db;
    ASSERT_OK(TitanDB::Open(TitanOptions(options), dbname_, &db));
    auto cf_options = db->GetOptions(db->DefaultColumnFamily());
    auto db_options = db->GetDBOptions();
    ImmutableCFOptions immu_cf_options(ImmutableDBOptions(db_options),
                                       cf_options);
    ASSERT_EQ(original_table_factory, immu_cf_options.table_factory);
    // NOTE(review): db is closed but never deleted here — looks like a
    // leak; confirm whether intentional.
    ASSERT_OK(db->Close());
    DeleteDir(env_, options_.dirname);
    DeleteDir(env_, dbname_);
  }
  Env* env_{Env::Default()};
  std::string dbname_;
  TitanOptions options_;
  TitanDB* db_{nullptr};
  // Raw view of db_ for reaching internals (e.g. the version set).
  TitanDBImpl* db_impl_{nullptr};
  std::vector<std::string> cf_names_;
  std::vector<ColumnFamilyHandle*> cf_handles_;
};
// Repeatedly reopens the DB, adds/drops column families and verifies that
// data survives reopen, flush, and CF lifecycle changes.
TEST_F(TitanDBTest, Basic) {
  const uint64_t kNumKeys = 100;
  std::map<std::string, std::string> data;
  for (auto i = 0; i < 6; i++) {
    if (i == 0) {
      Open();
    } else {
      Reopen();
      VerifyDB(data);
      AddCF(std::to_string(i));
      if (i % 3 == 0) {
        DropCF(std::to_string(i - 1));
        DropCF(std::to_string(i - 2));
      }
    }
    for (uint64_t k = 1; k <= kNumKeys; k++) {
      Put(k, &data);
    }
    Flush();
    VerifyDB(data);
  }
}
TEST_F(TitanDBTest, TableFactory) { TestTableFactory(); }
// Forward iteration returns all entries in key order.
TEST_F(TitanDBTest, DbIter) {
  Open();
  std::map<std::string, std::string> data;
  const int kNumEntries = 100;
  for (uint64_t i = 1; i <= kNumEntries; i++) {
    Put(i, &data);
  }
  ASSERT_EQ(kNumEntries, data.size());
  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
  iter->SeekToFirst();
  for (const auto& it : data) {
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(it.first, iter->key());
    ASSERT_EQ(it.second, iter->value());
    iter->Next();
  }
  ASSERT_FALSE(iter->Valid());
}
// Exercises Seek/SeekForPrev/SeekToFirst/SeekToLast against every key.
TEST_F(TitanDBTest, DBIterSeek) {
  Open();
  std::map<std::string, std::string> data;
  const int kNumEntries = 100;
  for (uint64_t i = 1; i <= kNumEntries; i++) {
    Put(i, &data);
  }
  ASSERT_EQ(kNumEntries, data.size());
  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
  iter->SeekToFirst();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(data.begin()->first, iter->key());
  ASSERT_EQ(data.begin()->second, iter->value());
  iter->SeekToLast();
  ASSERT_EQ(data.rbegin()->first, iter->key());
  ASSERT_EQ(data.rbegin()->second, iter->value());
  for (auto it = data.rbegin(); it != data.rend(); it++) {
    iter->SeekToLast();
    ASSERT_TRUE(iter->Valid());
    iter->SeekForPrev(it->first);
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(it->first, iter->key());
    ASSERT_EQ(it->second, iter->value());
  }
  for (const auto& it : data) {
    iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
    iter->Seek(it.first);
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ(it.first, iter->key());
    ASSERT_EQ(it.second, iter->value());
  }
}
// A snapshot taken before a flush must still read the same data after it.
TEST_F(TitanDBTest, Snapshot) {
  Open();
  std::map<std::string, std::string> data;
  Put(1, &data);
  ASSERT_EQ(1, data.size());
  const Snapshot* snapshot(db_->GetSnapshot());
  ReadOptions ropts;
  ropts.snapshot = snapshot;
  VerifyDB(data, ropts);
  Flush();
  VerifyDB(data, ropts);
  db_->ReleaseSnapshot(snapshot);
}
// Ingests an SST overlapping existing keys, then checks that compaction
// produces a second blob file holding the ingested large values.
TEST_F(TitanDBTest, IngestExternalFiles) {
  Open();
  SstFileWriter sst_file_writer(EnvOptions(), options_);
  ASSERT_EQ(sst_file_writer.FileSize(), 0);
  const uint64_t kNumEntries = 100;
  std::map<std::string, std::string> total_data;
  std::map<std::string, std::string> original_data;
  std::map<std::string, std::string> ingested_data;
  for (uint64_t i = 1; i <= kNumEntries; i++) {
    Put(i, &original_data);
  }
  ASSERT_EQ(kNumEntries, original_data.size());
  total_data.insert(original_data.begin(), original_data.end());
  VerifyDB(total_data);
  Flush();
  VerifyDB(total_data);
  const uint64_t kNumIngestedEntries = 100;
  // Make sure that keys in SST overlaps with existing keys
  const uint64_t kIngestedStart = kNumEntries - kNumEntries / 2;
  std::string sst_file = options_.dirname + "/for_ingest.sst";
  ASSERT_OK(sst_file_writer.Open(sst_file));
  for (uint64_t i = 1; i <= kNumIngestedEntries; i++) {
    std::string key = GenKey(kIngestedStart + i);
    std::string value = GenValue(kIngestedStart + i);
    ASSERT_OK(sst_file_writer.Put(key, value));
    total_data[key] = value;
    ingested_data.emplace(key, value);
  }
  ASSERT_OK(sst_file_writer.Finish());
  IngestExternalFileOptions ifo;
  ASSERT_OK(db_->IngestExternalFile({sst_file}, ifo));
  VerifyDB(total_data);
  Flush();
  VerifyDB(total_data);
  // Ingestion itself does not create blob files: still one per CF.
  for(auto& handle : cf_handles_) {
    auto blob = GetBlobStorage(handle);
    ASSERT_EQ(1, blob.lock()->NumBlobFiles());
  }
  CompactRangeOptions copt;
  ASSERT_OK(db_->CompactRange(copt, nullptr, nullptr));
  VerifyDB(total_data);
  // Compaction separates the ingested large values into a new blob file.
  for(auto& handle : cf_handles_) {
    auto blob = GetBlobStorage(handle);
    ASSERT_EQ(2, blob.lock()->NumBlobFiles());
    std::map<uint64_t, std::weak_ptr<BlobFileMeta>> blob_files;
    blob.lock()->ExportBlobFiles(blob_files);
    ASSERT_EQ(2, blob_files.size());
    auto bf = blob_files.begin();
    VerifyBlob(bf->first, original_data);
    bf ++;
    VerifyBlob(bf->first, ingested_data);
  }
}
// NOTE(review): disabled — reads after DropColumnFamily are expected to
// keep working via outstanding handles; confirm before re-enabling.
TEST_F(TitanDBTest, DISABLED_ReadAfterDropCF) {
  Open();
  const uint64_t kNumCF = 3;
  for(uint64_t i = 1; i <= kNumCF; i++) {
    AddCF(std::to_string(i));
  }
  const uint64_t kNumEntries = 100;
  std::map<std::string, std::string> data;
  for(uint64_t i = 1; i <= kNumEntries; i++) {
    Put(i, &data);
  }
  VerifyDB(data);
  Flush();
  VerifyDB(data);
  for(auto& handle : cf_handles_) {
    ASSERT_OK(db_->DropColumnFamily(handle));
    VerifyDB(data);
  }
}
#ifndef NDEBUG
TEST_F(TitanDBTest, BlobFileIOError) {
std::unique_ptr<TitanFaultInjectionTestEnv> mock_env(
new TitanFaultInjectionTestEnv(env_));
options_.env = mock_env.get();
options_.disable_background_gc = true; // avoid abort by BackgroundGC
Open();
std::map<std::string, std::string> data;
const int kNumEntries = 100;
for (uint64_t i = 1; i <= kNumEntries; i++) {
Put(i, &data);
}
ASSERT_EQ(kNumEntries, data.size());
CompactRangeOptions copts;
ASSERT_OK(db_->CompactRange(copts, nullptr, nullptr));
VerifyDB(data);
SyncPoint::GetInstance()->SetCallBack(
"BlobFileReader::Get", [&](void *) {
mock_env->SetFilesystemActive(
false,
Status::IOError("Injected error")
);
});
SyncPoint::GetInstance()->EnableProcessing();
for(auto& it : data) {
std::string value;
if(it.second.size() > options_.min_blob_size) {
ASSERT_TRUE(db_->Get(ReadOptions(), it.first, &value).IsIOError());
mock_env->SetFilesystemActive(true);
}
}
SyncPoint::GetInstance()->DisableProcessing();
mock_env->SetFilesystemActive(true);
std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
SyncPoint::GetInstance()->EnableProcessing();
iter->SeekToFirst();
ASSERT_TRUE(iter->status().IsIOError());
SyncPoint::GetInstance()->DisableProcessing();
mock_env->SetFilesystemActive(true);
iter.reset(db_->NewIterator(ReadOptions()));
iter->SeekToFirst();
ASSERT_TRUE(iter->Valid());
SyncPoint::GetInstance()->EnableProcessing();
iter->Next(); // second value (k=2) is inlined
ASSERT_TRUE(iter->Valid());
iter->Next();
ASSERT_TRUE(iter->status().IsIOError());
SyncPoint::GetInstance()->DisableProcessing();
mock_env->SetFilesystemActive(true);
options_.env = env_;
SyncPoint::GetInstance()->ClearAllCallBacks();
// env must be destructed AFTER db is closed to avoid
// `pure abstract method called` complaint.
iter.reset(nullptr); // early release to avoid outstanding reference
Close();
db_ = nullptr;
}
// Verifies that an I/O error injected while a flush job runs is surfaced
// by Flush() and leaves the DB in an error state: subsequent writes must
// also fail with IOError.
TEST_F(TitanDBTest, FlushWriteIOErrorHandling) {
  // Wrap the real Env in a fault-injection Env so the "filesystem" can be
  // switched off at a precise sync point.
  std::unique_ptr<TitanFaultInjectionTestEnv> mock_env(
      new TitanFaultInjectionTestEnv(env_));
  options_.env = mock_env.get();
  options_.disable_background_gc = true;  // avoid abort by BackgroundGC
  Open();
  std::map<std::string, std::string> data;
  const int kNumEntries = 100;
  for (uint64_t i = 1; i <= kNumEntries; i++) {
    Put(i, &data);
  }
  ASSERT_EQ(kNumEntries, data.size());
  CompactRangeOptions copts;
  // no compaction to enable Flush
  VerifyDB(data);
  // Deactivate the filesystem exactly when the flush job starts so the
  // flush itself hits the injected IOError.
  SyncPoint::GetInstance()->SetCallBack(
      "FlushJob::Start", [&](void *) {
        mock_env->SetFilesystemActive(
            false,
            Status::IOError("FlushJob injected error")
        );
      });
  SyncPoint::GetInstance()->EnableProcessing();
  FlushOptions fopts;
  ASSERT_TRUE(db_->Flush(fopts).IsIOError());
  SyncPoint::GetInstance()->DisableProcessing();
  mock_env->SetFilesystemActive(true);
  // subsequent writes return error too
  WriteOptions wopts;
  std::string key = "key_after_flush";
  std::string value = "value_after_flush";
  ASSERT_TRUE(db_->Put(wopts, key, value).IsIOError());
  options_.env = env_;
  SyncPoint::GetInstance()->ClearAllCallBacks();
  // env must be destructed AFTER db is closed to avoid
  // `pure abstract method called` complaint.
  Close();
  db_ = nullptr;
}
// Verifies that an I/O error injected while a manual compaction runs is
// surfaced by CompactRange() and puts the DB into an error state where
// subsequent writes fail with IOError too.
TEST_F(TitanDBTest, CompactionWriteIOErrorHandling) {
  std::unique_ptr<TitanFaultInjectionTestEnv> mock_env(
      new TitanFaultInjectionTestEnv(env_));
  options_.env = mock_env.get();
  options_.disable_background_gc = true;  // avoid abort by BackgroundGC
  Open();
  std::map<std::string, std::string> data;
  const int kNumEntries = 100;
  for (uint64_t i = 1; i <= kNumEntries; i++) {
    Put(i, &data);
  }
  ASSERT_EQ(kNumEntries, data.size());
  CompactRangeOptions copts;
  // do not compact to enable following Compaction
  VerifyDB(data);
  // Turn the filesystem off right as the background compaction starts.
  SyncPoint::GetInstance()->SetCallBack(
      "BackgroundCallCompaction:0", [&](void *) {
        mock_env->SetFilesystemActive(
            false,
            Status::IOError("Compaction injected error")
        );
      });
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_TRUE(db_->CompactRange(copts, nullptr, nullptr).IsIOError());
  SyncPoint::GetInstance()->DisableProcessing();
  mock_env->SetFilesystemActive(true);
  // subsequent writes return error too
  WriteOptions wopts;
  std::string key = "key_after_compaction";
  std::string value = "value_after_compaction";
  ASSERT_TRUE(db_->Put(wopts, key, value).IsIOError());
  options_.env = env_;
  SyncPoint::GetInstance()->ClearAllCallBacks();
  // env must be destructed AFTER db is closed to avoid
  // `pure abstract method called` complaint.
  Close();
  db_ = nullptr;
}
// Verifies that a blob record with a mismatching checksum is reported as
// Corruption, both through point lookups and through iterators.
TEST_F(TitanDBTest, BlobFileCorruptionErrorHandling) {
  options_.disable_background_gc = true;  // avoid abort by BackgroundGC
  Open();
  std::map<std::string, std::string> data;
  const int kNumEntries = 100;
  for (uint64_t i = 1; i <= kNumEntries; i++) {
    Put(i, &data);
  }
  ASSERT_EQ(kNumEntries, data.size());
  CompactRangeOptions copt;
  ASSERT_OK(db_->CompactRange(copt, nullptr, nullptr));
  VerifyDB(data);
  // Modify the checksum data to reproduce a mismatch
  SyncPoint::GetInstance()->SetCallBack(
      "BlobDecoder::DecodeRecord", [&](void* arg) {
        // arg is the decoder's CRC value; bump it so verification fails.
        auto* crc = reinterpret_cast<uint32_t*>(arg);
        *crc = *crc + 1;
      });
  SyncPoint::GetInstance()->EnableProcessing();
  for (auto& it : data) {
    std::string value;
    // Values below min_blob_size are stored inline in the LSM tree and
    // never go through the blob decoder, so they cannot hit the
    // corrupted checksum.
    if(it.second.size() < options_.min_blob_size) {
      continue;
    }
    ASSERT_TRUE(db_->Get(ReadOptions(), it.first, &value).IsCorruption());
  }
  SyncPoint::GetInstance()->DisableProcessing();
  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
  SyncPoint::GetInstance()->EnableProcessing();
  iter->SeekToFirst();
  ASSERT_TRUE(iter->status().IsCorruption());
  SyncPoint::GetInstance()->DisableProcessing();
  // A fresh iterator positioned while the callback is disabled reads the
  // first blob value cleanly...
  iter.reset(db_->NewIterator(ReadOptions()));
  iter->SeekToFirst();
  ASSERT_TRUE(iter->Valid());
  SyncPoint::GetInstance()->EnableProcessing();
  iter->Next();  // second value (k=2) is inlined
  ASSERT_TRUE(iter->Valid());
  // ...but the next blob-resident value hits the corrupted checksum.
  iter->Next();
  ASSERT_TRUE(iter->status().IsCorruption());
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif // !NDEBUG
} // namespace titandb
} // namespace rocksdb
// Test entry point: runs every registered gtest case in this binary.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#pragma once
#include "rocksdb/env.h"
#include "util/fault_injection_test_env.h"
#include <memory>
namespace rocksdb {
namespace titandb {
class TitanFaultInjectionTestEnv;
// RandomAccessFile wrapper used by TitanFaultInjectionTestEnv: each
// read-path call (Read/Prefetch/InvalidateCache) first checks whether the
// injected filesystem is still active and returns the injected error when
// it is not; otherwise the call is forwarded to the wrapped file.
class TitanTestRandomAccessFile : public RandomAccessFile {
 public:
  // Takes ownership of `f`; `env` (not owned) supplies the fault state.
  explicit TitanTestRandomAccessFile(std::unique_ptr<RandomAccessFile>&& f,
                                     TitanFaultInjectionTestEnv* env)
      : target_(std::move(f)),
        env_(env) {
    assert(target_ != nullptr);
  }
  virtual ~TitanTestRandomAccessFile() { }
  // Fault-checked forwards; defined below the class.
  Status Read(uint64_t offset, size_t n, Slice* result,
              char* scratch) const override;
  Status Prefetch(uint64_t offset, size_t n) override;
  // Plain pass-throughs: these do not consult the fault state.
  size_t GetUniqueId(char* id, size_t max_size) const override {
    return target_->GetUniqueId(id, max_size);
  }
  void Hint(AccessPattern pattern) override {
    return target_->Hint(pattern);
  }
  bool use_direct_io() const override {
    return target_->use_direct_io();
  }
  size_t GetRequiredBufferAlignment() const override {
    return target_->GetRequiredBufferAlignment();
  }
  Status InvalidateCache(size_t offset, size_t length) override;

 private:
  std::unique_ptr<RandomAccessFile> target_;  // wrapped file (owned)
  TitanFaultInjectionTestEnv* env_;           // fault-state source (not owned)
};
// FaultInjectionTestEnv that also injects faults on the read path: the
// random access files it hands out check IsFilesystemActive() on every
// Read/Prefetch/InvalidateCache call (see TitanTestRandomAccessFile),
// whereas the base class only covers writes.
class TitanFaultInjectionTestEnv : public FaultInjectionTestEnv {
 public:
  TitanFaultInjectionTestEnv(Env* t)
      : FaultInjectionTestEnv(t) { }
  virtual ~TitanFaultInjectionTestEnv() { }
  // Wraps the target Env's file in a TitanTestRandomAccessFile so reads
  // observe injected errors. Marked `override` so the compiler verifies
  // this actually overrides Env::NewRandomAccessFile (it silently would
  // not if the base signature ever changed).
  Status NewRandomAccessFile(const std::string& fname,
                             std::unique_ptr<RandomAccessFile>* result,
                             const EnvOptions& soptions) override {
    if (!IsFilesystemActive()) {
      return GetError();
    }
    Status s = target()->NewRandomAccessFile(fname, result, soptions);
    if (s.ok()) {
      result->reset(new TitanTestRandomAccessFile(std::move(*result), this));
    }
    return s;
  }
};
// Forwards the read to the wrapped file unless the injected filesystem is
// inactive, in which case the injected error is returned instead.
// `inline` because this definition lives in a `#pragma once` header:
// without it, including the header from more than one translation unit
// would violate the One Definition Rule.
inline Status TitanTestRandomAccessFile::Read(uint64_t offset, size_t n,
                                              Slice* result,
                                              char* scratch) const {
  if (!env_->IsFilesystemActive()) {
    return env_->GetError();
  }
  return target_->Read(offset, n, result, scratch);
}
// Fault-checked forward of Prefetch. `inline` for the same ODR reason as
// Read above: this is a function definition in a header.
inline Status TitanTestRandomAccessFile::Prefetch(uint64_t offset, size_t n) {
  if (!env_->IsFilesystemActive()) {
    return env_->GetError();
  }
  return target_->Prefetch(offset, n);
}
// Fault-checked forward of InvalidateCache. `inline` for the same ODR
// reason as Read above: this is a function definition in a header.
inline Status TitanTestRandomAccessFile::InvalidateCache(size_t offset,
                                                         size_t length) {
  if (!env_->IsFilesystemActive()) {
    return env_->GetError();
  }
  return target_->InvalidateCache(offset, length);
}
} // namespace titandb
} // namespace rocksdb
\ No newline at end of file
#include "utilities/titandb/util.h"
namespace rocksdb {
namespace titandb {
// Compression format version passed to the rocksdb compression helpers.
// See util/compression.h.
const uint32_t kCompressionFormat = 2;
// Returns true iff compression saved at least 12.5% (1/8) of the raw
// size, i.e. the compressed payload is worth keeping.
bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
  const size_t threshold = raw_size - raw_size / 8u;  // 87.5% of raw
  return compressed_size < threshold;
}
// Compresses `input` with the codec selected by `ctx`. When the codec is
// available on this platform and the result is at least 12.5% smaller
// (GoodCompressionRatio), returns a slice over *output holding the
// compressed bytes. Otherwise returns `input` unchanged and downgrades
// *type to kNoCompression so the caller stores the raw form.
Slice Compress(const CompressionContext& ctx, const Slice& input,
               std::string* output, CompressionType* type) {
  *type = ctx.type();
  if (ctx.type() == kNoCompression) {
    return input;
  }
  // Returns compressed block contents if:
  // (1) the compression method is supported in this platform and
  // (2) the compression rate is "good enough".
  switch (ctx.type()) {
    case kSnappyCompression:
      if (Snappy_Compress(ctx, input.data(), input.size(), output) &&
          GoodCompressionRatio(output->size(), input.size())) {
        return *output;
      }
      break;
    case kZlibCompression:
      if (Zlib_Compress(ctx, kCompressionFormat, input.data(), input.size(),
                        output) &&
          GoodCompressionRatio(output->size(), input.size())) {
        return *output;
      }
      break;
    case kBZip2Compression:
      if (BZip2_Compress(ctx, kCompressionFormat, input.data(), input.size(),
                         output) &&
          GoodCompressionRatio(output->size(), input.size())) {
        return *output;
      }
      break;
    case kLZ4Compression:
      if (LZ4_Compress(ctx, kCompressionFormat, input.data(), input.size(),
                       output) &&
          GoodCompressionRatio(output->size(), input.size())) {
        return *output;
      }
      break;
    case kLZ4HCCompression:
      if (LZ4HC_Compress(ctx, kCompressionFormat, input.data(), input.size(),
                         output) &&
          GoodCompressionRatio(output->size(), input.size())) {
        return *output;
      }
      break;
    case kXpressCompression:
      if (XPRESS_Compress(input.data(), input.size(), output) &&
          GoodCompressionRatio(output->size(), input.size())) {
        return *output;
      }
      break;
    case kZSTD:
    case kZSTDNotFinalCompression:
      if (ZSTD_Compress(ctx, input.data(), input.size(), output) &&
          GoodCompressionRatio(output->size(), input.size())) {
        return *output;
      }
      break;
    default: {}  // Do not recognize this compression type
  }
  // Compression method is not supported, or not good compression
  // ratio, so just fall back to uncompressed form.
  *type = kNoCompression;
  return input;
}
// Decompresses `input` (which must have been produced with the codec
// recorded in `ctx`) into a freshly allocated buffer owned by *output.
// Returns Corruption when the payload cannot be decoded or the codec is
// unknown.
Status Uncompress(const UncompressionContext& ctx, const Slice& input,
                  OwnedSlice* output) {
  int size = 0;
  CacheAllocationPtr ubuf;
  // Callers must not pass uncompressed data here.
  assert(ctx.type() != kNoCompression);
  switch (ctx.type()) {
    case kSnappyCompression: {
      // Snappy reports the uncompressed length up front, so the buffer
      // is sized exactly before decoding.
      size_t usize = 0;
      if (!Snappy_GetUncompressedLength(input.data(), input.size(), &usize)) {
        return Status::Corruption("Corrupted compressed blob", "Snappy");
      }
      ubuf.reset(new char[usize]);
      if (!Snappy_Uncompress(input.data(), input.size(), ubuf.get())) {
        return Status::Corruption("Corrupted compressed blob", "Snappy");
      }
      output->reset(std::move(ubuf), usize);
      break;
    }
    case kZlibCompression:
      ubuf = Zlib_Uncompress(ctx, input.data(), input.size(), &size,
                             kCompressionFormat);
      if (!ubuf.get()) {
        return Status::Corruption("Corrupted compressed blob", "Zlib");
      }
      output->reset(std::move(ubuf), size);
      break;
    case kBZip2Compression:
      ubuf = BZip2_Uncompress(input.data(), input.size(), &size,
                              kCompressionFormat);
      if (!ubuf.get()) {
        return Status::Corruption("Corrupted compressed blob", "Bzip2");
      }
      output->reset(std::move(ubuf), size);
      break;
    case kLZ4Compression:
      ubuf = LZ4_Uncompress(ctx, input.data(), input.size(), &size,
                            kCompressionFormat);
      if (!ubuf.get()) {
        return Status::Corruption("Corrupted compressed blob", "LZ4");
      }
      output->reset(std::move(ubuf), size);
      break;
    case kLZ4HCCompression:
      // LZ4HC output is decodable by the plain LZ4 decoder.
      ubuf = LZ4_Uncompress(ctx, input.data(), input.size(), &size,
                            kCompressionFormat);
      if (!ubuf.get()) {
        return Status::Corruption("Corrupted compressed blob", "LZ4HC");
      }
      output->reset(std::move(ubuf), size);
      break;
    case kXpressCompression:
      ubuf.reset(XPRESS_Uncompress(input.data(), input.size(), &size));
      if (!ubuf.get()) {
        return Status::Corruption("Corrupted compressed blob", "Xpress");
      }
      output->reset(std::move(ubuf), size);
      break;
    case kZSTD:
    case kZSTDNotFinalCompression:
      ubuf = ZSTD_Uncompress(ctx, input.data(), input.size(), &size);
      if (!ubuf.get()) {
        return Status::Corruption("Corrupted compressed blob", "ZSTD");
      }
      output->reset(std::move(ubuf), size);
      break;
    default:
      return Status::Corruption("bad compression type");
  }
  return Status::OK();
}
// Cache cleanup callback: releases the cache handle (arg2) back to its
// owning cache (arg1). The void* signature matches what the Cache API
// expects for release callbacks.
void UnrefCacheHandle(void* arg1, void* arg2) {
  // static_cast is the correct (and sufficient) cast for void* -> T*;
  // reinterpret_cast was unnecessarily strong here.
  Cache* cache = static_cast<Cache*>(arg1);
  Cache::Handle* h = static_cast<Cache::Handle*>(arg2);
  cache->Release(h);
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "rocksdb/cache.h"
#include "util/compression.h"
namespace rocksdb {
namespace titandb {
// Evaluates `expr` (expected to yield a Status-like object) and
// early-returns it from the enclosing function when it is not OK.
#define TRY(expr)                        \
  do {                                   \
    auto s = (expr);                     \
    if (!s.ok()) return s;               \
  } while (0)
// Aborts the process when `expr` is false. Unlike assert(), this stays
// active in release builds.
#define EXPECT(expr)                     \
  do {                                   \
    if (!(expr)) abort();                \
  } while (0)
// A slice pointed to an owned buffer.
class OwnedSlice : public Slice {
 public:
  // Takes ownership of `_data` and points the slice at its first
  // `_size` bytes.
  void reset(CacheAllocationPtr _data, size_t _size) {
    data_ = _data.get();
    size_ = _size;
    buffer_ = std::move(_data);
  }
  // Takes ownership of `buffer` but points the slice at `s`, which is
  // expected to reference bytes inside `buffer` (not verified here).
  void reset(CacheAllocationPtr buffer, const Slice& s) {
    data_ = s.data();
    size_ = s.size();
    buffer_ = std::move(buffer);
  }
  // Releases ownership of the buffer to the caller and empties the
  // slice. NOTE(review): the returned pointer must be freed the same
  // way CacheAllocationPtr would have freed it — confirm the allocator
  // matches when pairing this with CleanupFunc below.
  char* release() {
    data_ = nullptr;
    size_ = 0;
    return buffer_.release();
  }
  // Cache cleanup callback; assumes `buffer` was allocated with
  // `new char[]`.
  static void CleanupFunc(void* buffer, void*) {
    delete[] reinterpret_cast<char*>(buffer);
  }

 private:
  CacheAllocationPtr buffer_;
};
// A slice backed by an inline, fixed-size buffer of T bytes.
template <size_t T>
class FixedSlice : public Slice {
 public:
  // The slice always spans the whole buffer; its contents start out
  // uninitialized and are filled through get().
  FixedSlice() : Slice(storage_, T) {}

  // Mutable access to the underlying buffer.
  char* get() { return storage_; }

 private:
  char storage_[T];
};
// Compresses the input data according to the compression context.
// Returns a slice with the output data and sets "*type" to the output
// compression type.
//
// If compression is actually performed, fills "*output" with the
// compressed data. However, if the compression ratio is not good, it
// returns the input slice directly and sets "*type" to
// kNoCompression.
Slice Compress(const CompressionContext& ctx, const Slice& input,
               std::string* output, CompressionType* type);
// Uncompresses the input data according to the uncompression type.
// On success "*output" owns the resulting buffer and points to the
// uncompressed data; returns Corruption on undecodable input.
Status Uncompress(const UncompressionContext& ctx, const Slice& input,
                  OwnedSlice* output);
// Cache release callback: returns `handle` to `cache`.
void UnrefCacheHandle(void* cache, void* handle);
// Cache deleter callback for entries whose value is a heap-allocated T;
// the key is unused.
template <class T>
void DeleteCacheValue(const Slice&, void* value) {
  // static_cast is the correct cast for void* -> T*.
  delete static_cast<T*>(value);
}
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/util.h"
#include "util/testharness.h"
namespace rocksdb {
namespace titandb {
class UtilTest : public testing::Test {};
// Round-trips a highly compressible buffer through each supported codec
// and checks that Uncompress() restores the original bytes.
TEST(UtilTest, Compression) {
  std::string input(1024, 'a');
  for (auto compression :
       {kSnappyCompression, kZlibCompression, kLZ4Compression, kZSTD}) {
    CompressionContext compression_ctx(compression);
    std::string buffer;
    auto compressed = Compress(compression_ctx, input, &buffer, &compression);
    if (compression == kNoCompression) {
      // Codec unavailable in this build: Compress() fell back to the raw
      // form, so there is nothing to round-trip.
      continue;
    }
    ASSERT_TRUE(compressed.size() <= input.size());
    UncompressionContext uncompression_ctx(compression);
    OwnedSlice output;
    ASSERT_OK(Uncompress(uncompression_ctx, compressed, &output));
    ASSERT_EQ(output, input);
  }
}
} // namespace titandb
} // namespace rocksdb
// Test entry point: runs every registered gtest case in this binary.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#include "utilities/titandb/version.h"
#include "utilities/titandb/version_set.h"
namespace rocksdb {
namespace titandb {
// Reads the blob record addressed by `index`. The record's bytes are
// pinned in `buffer`, which must outlive any use of `record`. Returns
// Corruption when the referenced blob file is unknown to this storage.
Status BlobStorage::Get(const ReadOptions& options, const BlobIndex& index,
                        BlobRecord* record, PinnableSlice* buffer) {
  auto file = FindFile(index.file_number).lock();
  if (!file) {
    return Status::Corruption("Missing blob file: " +
                              std::to_string(index.file_number));
  }
  return file_cache_->Get(options, file->file_number(), file->file_size(),
                          index.blob_handle, record, buffer);
}
// Creates a prefetcher over the blob file identified by `file_number`.
// Returns Corruption when the file is not part of this storage.
Status BlobStorage::NewPrefetcher(uint64_t file_number,
                                  std::unique_ptr<BlobFilePrefetcher>* result) {
  auto sfile = FindFile(file_number).lock();
  if (!sfile)
    // Fixed typo in the error message ("blob wfile" -> "blob file"),
    // matching the wording used by BlobStorage::Get.
    return Status::Corruption("Missing blob file: " +
                              std::to_string(file_number));
  return file_cache_->NewPrefetcher(sfile->file_number(), sfile->file_size(),
                                    result);
}
// Looks up the metadata for `file_number` under the read lock. Returns
// an expired weak_ptr when the file is unknown to this storage.
std::weak_ptr<BlobFileMeta> BlobStorage::FindFile(uint64_t file_number) const {
  ReadLock rl(&mutex_);
  auto iter = files_.find(file_number);
  if (iter == files_.end()) {
    return std::weak_ptr<BlobFileMeta>();
  }
  assert(file_number == iter->second->file_number());
  return iter->second;
}
// Copies a weak reference to every known blob file into `ret`, keyed by
// file number. The read lock keeps `files_` stable during the copy.
void BlobStorage::ExportBlobFiles(
    std::map<uint64_t, std::weak_ptr<BlobFileMeta>>& ret) const {
  ReadLock rl(&mutex_);
  for (const auto& entry : files_) {
    ret.emplace(entry.first, std::weak_ptr<BlobFileMeta>(entry.second));
  }
}
// Registers `file` under its own file number. Takes the write lock since
// readers may be iterating `files_` concurrently.
void BlobStorage::AddBlobFile(std::shared_ptr<BlobFileMeta>& file) {
  WriteLock wl(&mutex_);
  // Pass key and value directly: wrapping them in std::make_pair forced
  // a temporary pair and defeated the point of emplace.
  files_.emplace(file->file_number(), file);
}
// Unregisters blob file `file` and evicts its cached reader.
void BlobStorage::DeleteBlobFile(uint64_t file) {
  {
    // Scope the write lock to the map erase only; the cache eviction
    // below takes the cache's own locks.
    WriteLock wl(&mutex_);
    files_.erase(file);
  }
  file_cache_->Evict(file);
}
// Rebuilds the GC candidate list from scratch, ordered by descending
// score.
void BlobStorage::ComputeGCScore() {
  // TODO: no need to recompute all everytime
  gc_score_.clear();
  {
    ReadLock rl(&mutex_);
    for (auto& file : files_) {
      // Obsolete files are already queued for deletion; skip them.
      if (file.second->is_obsolete()) {
        continue;
      }
      gc_score_.push_back({});
      auto& gcs = gc_score_.back();
      gcs.file_number = file.first;
      if (file.second->file_size() <
          titan_cf_options_.merge_small_file_threshold) {
        // Small files always get the maximum score so they are merged
        // eagerly regardless of how much garbage they hold.
        gcs.score = 1;
      } else {
        gcs.score = file.second->GetDiscardableRatio();
      }
    }
  }
  // Sort outside the lock: gc_score_ itself is not guarded by mutex_.
  std::sort(gc_score_.begin(), gc_score_.end(),
            [](const GCScore& first, const GCScore& second) {
              return first.score > second.score;
            });
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include "rocksdb/options.h"
#include "utilities/titandb/blob_file_cache.h"
#include "utilities/titandb/blob_format.h"
#include "utilities/titandb/blob_gc.h"
namespace rocksdb {
namespace titandb {
// Provides methods to access the blob storage for a specific
// column family. The version must be valid when this storage is used.
class BlobStorage {
 public:
  // Copy constructor: clones the file map, cache handle and options but
  // starts with a fresh mutex. NOTE(review): gc_score_ is not copied —
  // the copy has an empty GC candidate list until ComputeGCScore() runs;
  // confirm this is intended.
  BlobStorage(const BlobStorage& bs) : mutex_() {
    this->files_ = bs.files_;
    this->file_cache_ = bs.file_cache_;
    this->titan_cf_options_ = bs.titan_cf_options_;
  }
  BlobStorage(const TitanCFOptions& _options,
              std::shared_ptr<BlobFileCache> _file_cache)
      : titan_cf_options_(_options), mutex_(), file_cache_(_file_cache) {}
  // Gets the blob record pointed by the blob index. The provided
  // buffer is used to store the record data, so the buffer must be
  // valid when the record is used.
  Status Get(const ReadOptions& options, const BlobIndex& index,
             BlobRecord* record, PinnableSlice* buffer);
  // Creates a prefetcher for the specified file number.
  Status NewPrefetcher(uint64_t file_number,
                       std::unique_ptr<BlobFilePrefetcher>* result);
  // Finds the blob file meta for the specified file number. It is a
  // corruption if the file doesn't exist in the specific version.
  std::weak_ptr<BlobFileMeta> FindFile(uint64_t file_number) const;
  // Number of registered blob files (obsolete ones included).
  std::size_t NumBlobFiles() const {
    ReadLock rl(&mutex_);
    return files_.size();
  }
  // Exports weak references to all blob files, keyed by file number.
  void ExportBlobFiles(
      std::map<uint64_t, std::weak_ptr<BlobFileMeta>>& ret) const;
  // Flags every file as a GC candidate (used after DB restart).
  void MarkAllFilesForGC() {
    WriteLock wl(&mutex_);
    for (auto& file : files_) {
      file.second->FileStateTransit(BlobFileMeta::FileEvent::kDbRestart);
      // file.second->marked_for_gc_ = true;
    }
  }
  // Returns a copy of the current GC candidate list (see
  // ComputeGCScore()).
  const std::vector<GCScore> gc_score() { return gc_score_; }
  void ComputeGCScore();
  const TitanCFOptions& titan_cf_options() { return titan_cf_options_; }
  // Registers / unregisters a blob file (thread-safe).
  void AddBlobFile(std::shared_ptr<BlobFileMeta>& file);
  void DeleteBlobFile(uint64_t file);

 private:
  friend class VersionSet;
  friend class VersionTest;
  friend class BlobGCPickerTest;
  friend class BlobGCJobTest;
  friend class BlobFileSizeCollectorTest;
  TitanCFOptions titan_cf_options_;
  // Read Write Mutex, which protects the `files_` structures
  mutable port::RWMutex mutex_;
  // Only BlobStorage OWNS BlobFileMeta
  std::unordered_map<uint64_t, std::shared_ptr<BlobFileMeta>> files_;
  std::shared_ptr<BlobFileCache> file_cache_;
  // GC candidates, sorted by descending score; not guarded by mutex_.
  std::vector<GCScore> gc_score_;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/version_edit.h"
#include "util/coding.h"
namespace rocksdb {
namespace titandb {
// Record tags used in the serialized VersionEdit format. These values
// are persisted in the manifest; never renumber existing entries.
enum Tag {
  kNextFileNumber = 1,
  kColumnFamilyID = 10,
  kAddedBlobFile = 11,
  kDeletedBlobFile = 12,
};
// Serializes this edit into `dst` as a sequence of (tag, payload)
// records, the format consumed by DecodeFrom.
void VersionEdit::EncodeTo(std::string* dst) const {
  if (has_next_file_number_) {
    PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
  }
  // The column family id is always written so DecodeFrom can route the
  // edit to the right storage.
  PutVarint32Varint32(dst, kColumnFamilyID, column_family_id_);
  for (auto& file : added_files_) {
    PutVarint32(dst, kAddedBlobFile);
    file->EncodeTo(dst);
  }
  for (auto& file : deleted_files_) {
    // Only the file number is persisted; the obsolete sequence paired
    // with it is runtime-only state.
    PutVarint32Varint64(dst, kDeletedBlobFile, file.first);
  }
}
// Parses the (tag, payload) records produced by EncodeTo. An unknown tag
// or a truncated payload stops the parse and yields a Corruption status
// naming the offending record.
Status VersionEdit::DecodeFrom(Slice* src) {
  uint32_t tag;
  uint64_t file_number;
  std::shared_ptr<BlobFileMeta> blob_file;
  const char* error = nullptr;
  while (!error && !src->empty()) {
    if (!GetVarint32(src, &tag)) {
      error = "invalid tag";
      break;
    }
    switch (tag) {
      case kNextFileNumber:
        if (GetVarint64(src, &next_file_number_)) {
          has_next_file_number_ = true;
        } else {
          error = "next file number";
        }
        break;
      case kColumnFamilyID:
        // Value is stored directly into column_family_id_; nothing else
        // to do on success.
        if (GetVarint32(src, &column_family_id_)) {
        } else {
          error = "column family id";
        }
        break;
      case kAddedBlobFile:
        blob_file = std::make_shared<BlobFileMeta>();
        if (blob_file->DecodeFrom(src).ok()) {
          AddBlobFile(blob_file);
        } else {
          error = "added blob file";
        }
        break;
      case kDeletedBlobFile:
        // Decoded deletions carry no obsolete sequence (defaults to 0).
        if (GetVarint64(src, &file_number)) {
          DeleteBlobFile(file_number);
        } else {
          error = "deleted blob file";
        }
        break;
      default:
        error = "unknown tag";
        break;
    }
  }
  if (error) {
    return Status::Corruption("VersionEdit", error);
  }
  return Status::OK();
}
// Two edits compare equal when they agree on the next-file-number state,
// the column family, the deleted-file list (order-sensitive), and the
// set of added files (order-insensitive, matched by file number).
bool operator==(const VersionEdit& lhs, const VersionEdit& rhs) {
  if (lhs.added_files_.size() != rhs.added_files_.size()) {
    return false;
  }
  // Index lhs's added files by number, then require every rhs entry to
  // match one of them.
  std::map<uint64_t, std::shared_ptr<BlobFileMeta>> lhs_files;
  for (const auto& meta : lhs.added_files_) {
    lhs_files.insert({meta->file_number(), meta});
  }
  for (const auto& meta : rhs.added_files_) {
    auto match = lhs_files.find(meta->file_number());
    if (match == lhs_files.end() || !(*match->second == *meta)) {
      return false;
    }
  }
  return lhs.has_next_file_number_ == rhs.has_next_file_number_ &&
         lhs.next_file_number_ == rhs.next_file_number_ &&
         lhs.column_family_id_ == rhs.column_family_id_ &&
         lhs.deleted_files_ == rhs.deleted_files_;
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include <set>
#include "rocksdb/slice.h"
#include "utilities/titandb/blob_format.h"
namespace rocksdb {
namespace titandb {
// An incremental change to one column family's blob storage state:
// files added, files deleted, and optionally a new value for the global
// next-file-number counter. Serialized into the manifest.
class VersionEdit {
 public:
  void SetNextFileNumber(uint64_t v) {
    has_next_file_number_ = true;
    next_file_number_ = v;
  }
  void SetColumnFamilyID(uint32_t v) { column_family_id_ = v; }
  void AddBlobFile(std::shared_ptr<BlobFileMeta> file) {
    added_files_.push_back(file);
  }
  // `obsolete_sequence` records the sequence number at which the file
  // became invisible; it is runtime-only and not persisted by EncodeTo.
  void DeleteBlobFile(uint64_t file_number, SequenceNumber obsolete_sequence = 0) {
    deleted_files_.emplace_back(std::make_pair(file_number, obsolete_sequence));
  }
  // Serializes this edit as (tag, payload) records.
  void EncodeTo(std::string* dst) const;
  // Parses records produced by EncodeTo; returns Corruption on
  // malformed input.
  Status DecodeFrom(Slice* src);
  friend bool operator==(const VersionEdit& lhs, const VersionEdit& rhs);

 private:
  friend class VersionSet;
  bool has_next_file_number_{false};
  uint64_t next_file_number_{0};
  uint32_t column_family_id_{0};
  std::vector<std::shared_ptr<BlobFileMeta>> added_files_;
  // (file_number, obsolete_sequence) pairs.
  std::vector<std::pair<uint64_t, SequenceNumber>> deleted_files_;
};
} // namespace titandb
} // namespace rocksdb
#include "utilities/titandb/version_set.h"
#include <inttypes.h>
#include "util/autovector.h"
#include "util/filename.h"
namespace rocksdb {
namespace titandb {
// Blob file cache capacity used when max_open_files is unlimited (< 0).
const size_t kMaxFileCacheSize = 1024 * 1024;
// Constructs a VersionSet rooted at options.dirname and sizes the
// shared blob file cache from db_options_.max_open_files.
VersionSet::VersionSet(const TitanDBOptions& options)
    : dirname_(options.dirname),
      env_(options.env),
      env_options_(options),
      db_options_(options) {
  auto file_cache_size = db_options_.max_open_files;
  // max_open_files < 0 means "unlimited"; fall back to a fixed cap.
  if (file_cache_size < 0) {
    file_cache_size = kMaxFileCacheSize;
  }
  file_cache_ = NewLRUCache(file_cache_size);
}
// Opens the blob storage: recovers from the existing manifest when a
// CURRENT file is present, otherwise starts a brand new manifest.
Status VersionSet::Open(
    const std::map<uint32_t, TitanCFOptions>& column_families) {
  // Sets up initial column families.
  AddColumnFamilies(column_families);
  Status s = env_->FileExists(CurrentFileName(dirname_));
  if (s.IsNotFound()) {
    // First open: no CURRENT file yet, so create a fresh manifest.
    return OpenManifest(NewFileNumber());
  }
  if (!s.ok()) {
    return s;
  }
  return Recover();
}
// Recovers the in-memory state from the manifest named by CURRENT, rolls
// over to a new manifest, and purges files on disk that are no longer
// referenced.
Status VersionSet::Recover() {
  struct LogReporter : public log::Reader::Reporter {
    Status* status;
    // Keep only the first corruption reported while reading the log.
    void Corruption(size_t, const Status& s) override {
      if (status->ok()) *status = s;
    }
  };
  // Reads "CURRENT" file, which contains the name of the current manifest file.
  std::string manifest;
  Status s = ReadFileToString(env_, CurrentFileName(dirname_), &manifest);
  if (!s.ok()) return s;
  if (manifest.empty() || manifest.back() != '\n') {
    return Status::Corruption("CURRENT file does not end with newline");
  }
  manifest.resize(manifest.size() - 1);
  // Opens the current manifest file.
  auto file_name = dirname_ + "/" + manifest;
  std::unique_ptr<SequentialFileReader> file;
  {
    std::unique_ptr<SequentialFile> f;
    s = env_->NewSequentialFile(file_name, &f,
                                env_->OptimizeForManifestRead(env_options_));
    if (!s.ok()) return s;
    file.reset(new SequentialFileReader(std::move(f), file_name));
  }
  bool has_next_file_number = false;
  uint64_t next_file_number = 0;
  // Reads edits from the manifest and applies them one by one.
  {
    LogReporter reporter;
    reporter.status = &s;
    log::Reader reader(nullptr, std::move(file), &reporter, true /*checksum*/,
                       0 /*initial_offset*/, 0);
    Slice record;
    std::string scratch;
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
      VersionEdit edit;
      s = DecodeInto(record, &edit);
      if (!s.ok()) return s;
      Apply(&edit);
      if (edit.has_next_file_number_) {
        // Edits are appended in order, so the counter must only grow.
        assert(edit.next_file_number_ >= next_file_number);
        next_file_number = edit.next_file_number_;
        has_next_file_number = true;
      }
    }
  }
  if (!has_next_file_number) {
    return Status::Corruption("no next file number in manifest file");
  }
  next_file_number_.store(next_file_number);
  auto new_manifest_file_number = NewFileNumber();
  s = OpenManifest(new_manifest_file_number);
  if (!s.ok()) return s;
  // Purge inactive files at start
  std::set<uint64_t> alive_files;
  alive_files.insert(new_manifest_file_number);
  for (const auto& bs : column_families_) {
    autovector<uint64_t> obsolete_files;
    for (const auto& f : bs.second->files_) {
      if (f.second->is_obsolete()) {
        // delete already obsoleted files at reopen
        obsolete_files.push_back(f.second->file_number());
        // Drop the matching pending-deletion entry: the file is removed
        // directly below instead of via GetObsoleteFiles().
        for (auto it = obsolete_files_.blob_files.begin(); it != obsolete_files_.blob_files.end(); ++it) {
          if (std::get<0>(*it) == f.second->file_number()) {
            it = this->obsolete_files_.blob_files.erase(it);
            break;
          }
        }
      } else {
        alive_files.insert(f.second->file_number());
      }
    }
    // Erase after the files_ walk above to avoid invalidating the
    // iterator being traversed.
    for (uint64_t obsolete_file : obsolete_files) {
      bs.second->DeleteBlobFile(obsolete_file);
    }
  }
  std::vector<std::string> files;
  env_->GetChildren(dirname_, &files);
  for (const auto& f : files) {
    uint64_t file_number;
    FileType file_type;
    if (!ParseFileName(f, &file_number, &file_type)) continue;
    if (alive_files.find(file_number) != alive_files.end()) continue;
    // Only blob files and old manifests belong to Titan; leave other
    // file types alone.
    if (file_type != FileType::kBlobFile &&
        file_type != FileType::kDescriptorFile)
      continue;
    env_->DeleteFile(dirname_ + "/" + f);
  }
  // Make sure perform gc on all files at the beginning
  MarkAllFilesForGC();
  return Status::OK();
}
// Creates manifest `file_number`, writes a snapshot of the current state
// into it, syncs it, and points CURRENT at it. On any failure the
// half-written manifest is queued for deletion.
Status VersionSet::OpenManifest(uint64_t file_number) {
  Status s;
  auto file_name = DescriptorFileName(dirname_, file_number);
  std::unique_ptr<WritableFileWriter> file;
  {
    std::unique_ptr<WritableFile> f;
    s = env_->NewWritableFile(file_name, &f, env_options_);
    if (!s.ok()) return s;
    file.reset(new WritableFileWriter(std::move(f), file_name, env_options_));
  }
  manifest_.reset(new log::Writer(std::move(file), 0, false));
  // Saves current snapshot
  s = WriteSnapshot(manifest_.get());
  if (s.ok()) {
    ImmutableDBOptions ioptions(db_options_);
    s = SyncManifest(env_, &ioptions, manifest_->file());
  }
  if (s.ok()) {
    // Makes "CURRENT" file that points to the new manifest file.
    s = SetCurrentFile(env_, dirname_, file_number, nullptr);
  }
  if (!s.ok()) {
    manifest_.reset();
    // The unusable manifest will be removed with the next purge of
    // obsolete files.
    obsolete_files_.manifests.emplace_back(file_name);
  }
  return s;
}
// Writes a full snapshot of the current state into `log`: one edit with
// the global next-file-number, then one edit per column family listing
// its live (non-obsolete) blob files.
Status VersionSet::WriteSnapshot(log::Writer* log) {
  Status s;
  // Saves global information
  {
    VersionEdit edit;
    edit.SetNextFileNumber(next_file_number_.load());
    std::string record;
    edit.EncodeTo(&record);
    s = log->AddRecord(record);
    if (!s.ok()) return s;
  }
  // Saves column families information
  for (auto& it : this->column_families_) {
    VersionEdit edit;
    edit.SetColumnFamilyID(it.first);
    for (auto& file : it.second->files_) {
      // skip obsolete file
      if (file.second->is_obsolete()) {
        continue;
      }
      edit.AddBlobFile(file.second);
    }
    std::string record;
    edit.EncodeTo(&record);
    s = log->AddRecord(record);
    if (!s.ok()) return s;
  }
  return s;
}
// Persists *edit to the manifest (stamped with the current
// next-file-number), syncs it, and only then applies the edit to the
// in-memory state — so memory never runs ahead of durable state.
// TODO(@huachao): write manifest file unlocked
Status VersionSet::LogAndApply(VersionEdit* edit) {
  edit->SetNextFileNumber(next_file_number_.load());
  std::string record;
  edit->EncodeTo(&record);
  Status s = manifest_->AddRecord(record);
  if (s.ok()) {
    ImmutableDBOptions ioptions(db_options_);
    s = SyncManifest(env_, &ioptions, manifest_->file());
  }
  if (s.ok()) {
    Apply(edit);
  }
  return s;
}
// Applies *edit to the in-memory state of its column family: deleted
// files are marked obsolete, added files are registered, then GC scores
// are recomputed. Aborts on impossible transitions (deleting an unknown
// or already-deleted file, re-adding an existing file) since those
// indicate a corrupted manifest or a logic error.
void VersionSet::Apply(VersionEdit* edit) {
  auto cf_id = edit->column_family_id_;
  auto it = column_families_.find(cf_id);
  if (it == column_families_.end()) {
    // Ignore unknown column families.
    return;
  }
  auto& files = it->second->files_;
  for (auto& file : edit->deleted_files_) {
    auto number = file.first;
    auto blob_it = files.find(number);
    if (blob_it == files.end()) {
      fprintf(stderr, "blob file %" PRIu64 " doesn't exist before\n", number);
      abort();
    } else if (blob_it->second->is_obsolete()) {
      fprintf(stderr, "blob file %" PRIu64 " has been deleted before\n", number);
      abort();
    }
    // file.second is the sequence number at which the file became
    // obsolete.
    MarkFileObsolete(blob_it->second, file.second, cf_id);
  }
  for (auto& file : edit->added_files_) {
    auto number = file->file_number();
    auto blob_it = files.find(number);
    if (blob_it != files.end()) {
      if (blob_it->second->is_obsolete()) {
        fprintf(stderr, "blob file %" PRIu64 " has been deleted before\n", number);
      } else {
        fprintf(stderr, "blob file %" PRIu64 " has been added before\n", number);
      }
      abort();
    }
    it->second->AddBlobFile(file);
  }
  it->second->ComputeGCScore();
}
void VersionSet::AddColumnFamilies(const std::map<uint32_t, TitanCFOptions>& column_families) {
for (auto& cf : column_families) {
auto file_cache =
std::make_shared<BlobFileCache>(db_options_, cf.second, file_cache_);
auto blob_storage = std::make_shared<BlobStorage>(cf.second, file_cache);
column_families_.emplace(cf.first, blob_storage);
}
}
// Schedules every blob file of the given column families for deletion
// (guarded by `obsolete_sequence` for snapshot visibility) and remembers
// the CFs as obsolete so they are purged once all their files are gone.
void VersionSet::DropColumnFamilies(const std::vector<uint32_t>& column_families, SequenceNumber obsolete_sequence) {
  for (auto& cf : column_families) {
    auto it = column_families_.find(cf);
    if (it != column_families_.end()) {
      VersionEdit edit;
      edit.SetColumnFamilyID(it->first);
      for (auto& file : it->second->files_) {
        // Use PRIu64 for uint64_t file numbers: "%llu" is only correct
        // where unsigned long long and uint64_t coincide, and the rest
        // of this file already logs file numbers with PRIu64.
        ROCKS_LOG_INFO(db_options_.info_log,
                       "Titan add obsolete file [%" PRIu64 "]",
                       file.second->file_number());
        edit.DeleteBlobFile(file.first, obsolete_sequence);
      }
      // TODO: check status
      LogAndApply(&edit);
    }
    obsolete_columns_.insert(cf);
  }
}
// Records (file_number, obsolete_sequence, cf_id) for deferred deletion
// (see GetObsoleteFiles) and flips the file's state to deleted.
void VersionSet::MarkFileObsolete(std::shared_ptr<BlobFileMeta> file, SequenceNumber obsolete_sequence, uint32_t cf_id) {
  // emplace_back constructs the tuple in place rather than building a
  // temporary with make_tuple and moving it in.
  obsolete_files_.blob_files.emplace_back(file->file_number(),
                                          obsolete_sequence, cf_id);
  file->FileStateTransit(BlobFileMeta::FileEvent::kDelete);
}
// Moves every pending obsolete blob file that is no longer visible to
// any snapshot (obsolete_sequence < oldest_sequence) into
// *obsolete_files, deleting it from its column family's storage, and
// hands over the accumulated obsolete manifest names.
void VersionSet::GetObsoleteFiles(ObsoleteFiles* obsolete_files, SequenceNumber oldest_sequence) {
  for (auto tuple_it = obsolete_files_.blob_files.begin(); tuple_it != obsolete_files_.blob_files.end();) {
    auto& obsolete_sequence = std::get<1>(*tuple_it);
    // We check whether the oldest snapshot is no less than the last sequence
    // by the time the blob file become obsolete. If so, the blob file is not
    // visible to all existing snapshots.
    if (oldest_sequence > obsolete_sequence) {
      auto& file_number = std::get<0>(*tuple_it);
      auto& cf_id = std::get<2>(*tuple_it);
      ROCKS_LOG_INFO(db_options_.info_log,
                     "Obsolete blob file %" PRIu64 " (obsolete at %" PRIu64
                     ") not visible to oldest snapshot %" PRIu64 ", delete it.",
                     file_number, obsolete_sequence, oldest_sequence);
      // Cleanup obsolete column family when all the blob files for that are deleted.
      auto it = column_families_.find(cf_id);
      if (it != column_families_.end()) {
        it->second->DeleteBlobFile(file_number);
        if (it->second->files_.empty() && obsolete_columns_.find(cf_id) != obsolete_columns_.end()) {
          column_families_.erase(it);
          obsolete_columns_.erase(cf_id);
        }
      } else {
        fprintf(stderr, "column %u not found when deleting obsolete file%" PRIu64 "\n",
                cf_id, file_number);
        abort();
      }
      // Advance first, then splice the old node: splice moves the list
      // node without reallocation and keeps tuple_it valid.
      auto now = tuple_it++;
      obsolete_files->blob_files.splice(obsolete_files->blob_files.end(), obsolete_files_.blob_files, now);
    } else {
      ++tuple_it;
    }
  }
  obsolete_files_.manifests.swap(obsolete_files->manifests);
}
} // namespace titandb
} // namespace rocksdb
#pragma once
#include <stdint.h>
#include <atomic>
#include <unordered_map>
#include <unordered_set>
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "port/port_posix.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "util/mutexlock.h"
#include "utilities/titandb/blob_file_cache.h"
#include "utilities/titandb/options.h"
#include "utilities/titandb/version_edit.h"
#include "utilities/titandb/version.h"
namespace rocksdb {
namespace titandb {
// Files pending physical deletion. Non-copyable and non-movable because
// it is passed around by pointer and spliced into in place.
struct ObsoleteFiles {
  ObsoleteFiles() = default;
  ObsoleteFiles(const ObsoleteFiles&) = delete;
  ObsoleteFiles& operator=(const ObsoleteFiles&) = delete;
  ObsoleteFiles(ObsoleteFiles&&) = delete;
  ObsoleteFiles& operator=(ObsoleteFiles&&) = delete;
  // TODO: make it map
  // file_number -> (obsolete_sequence, cf_id)
  std::list<std::tuple<uint64_t, SequenceNumber, uint32_t>> blob_files;
  std::vector<std::string> manifests;
};
// Owns the Titan manifest and the per-column-family blob storages.
// Methods marked "REQUIRES: mutex is held" rely on an external DB mutex
// for synchronization.
class VersionSet {
 public:
  explicit VersionSet(const TitanDBOptions& options);
  // Sets up the storage specified in "options.dirname".
  // If the manifest doesn't exist, it will create one.
  // If the manifest exists, it will recover from the latest one.
  // It is a corruption if the persistent storage contains data
  // outside of the provided column families.
  Status Open(const std::map<uint32_t, TitanCFOptions>& column_families);
  // Applies *edit on the current version to form a new version that is
  // both saved to the manifest and installed as the new current version.
  // REQUIRES: mutex is held
  Status LogAndApply(VersionEdit* edit);
  // Adds some column families with the specified options.
  // REQUIRES: mutex is held
  void AddColumnFamilies(
      const std::map<uint32_t, TitanCFOptions>& column_families);
  // Drops some column families. The obsolete files will be deleted in
  // background when they will not be accessed anymore.
  // REQUIRES: mutex is held
  void DropColumnFamilies(const std::vector<uint32_t>& column_families, SequenceNumber obsolete_sequence);
  // Allocates a new file number.
  uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
  // Returns the blob storage for `cf_id`, or an expired weak_ptr when
  // the column family is unknown.
  // REQUIRES: mutex is held
  std::weak_ptr<BlobStorage> GetBlobStorage(uint32_t cf_id) {
    auto it = column_families_.find(cf_id);
    if (it != column_families_.end()) {
      return it->second;
    }
    return std::weak_ptr<BlobStorage>();
  }
  // Moves files/manifests that are safe to delete (not visible to any
  // snapshot older than `oldest_sequence`) into *obsolete_files.
  // REQUIRES: mutex is held
  void GetObsoleteFiles(ObsoleteFiles* obsolete_files, SequenceNumber oldest_sequence);
  // Flags every blob file of every column family as a GC candidate.
  // REQUIRES: mutex is held
  void MarkAllFilesForGC() {
    for (auto& cf : column_families_) {
      cf.second->MarkAllFilesForGC();
    }
  }

 private:
  friend class BlobFileSizeCollectorTest;
  friend class VersionTest;
  Status Recover();
  Status OpenManifest(uint64_t number);
  Status WriteSnapshot(log::Writer* log);
  void Apply(VersionEdit* edit);
  void MarkFileObsolete(std::shared_ptr<BlobFileMeta> file, SequenceNumber obsolete_sequence, uint32_t cf_id);
  std::string dirname_;
  Env* env_;
  EnvOptions env_options_;
  TitanDBOptions db_options_;
  // LRU cache shared by all column families' BlobFileCaches.
  std::shared_ptr<Cache> file_cache_;
  // Files awaiting physical deletion (see GetObsoleteFiles).
  ObsoleteFiles obsolete_files_;
  // Dropped column families whose files are not all deleted yet.
  std::unordered_set<uint32_t> obsolete_columns_;
  std::unordered_map<uint32_t, std::shared_ptr<BlobStorage>> column_families_;
  std::unique_ptr<log::Writer> manifest_;
  std::atomic<uint64_t> next_file_number_{1};
};
} // namespace titandb
} // namespace rocksdb
#include "util/filename.h"
#include "util/testharness.h"
#include "utilities/titandb/testutil.h"
#include "utilities/titandb/util.h"
#include "utilities/titandb/version_edit.h"
#include "utilities/titandb/version_set.h"
namespace rocksdb {
namespace titandb {
// Deletes every child of `dirname` that parses as a known RocksDB file name,
// then removes the directory itself. Files with unrecognized names are left
// in place (and the final DeleteDir may then fail, which is ignored).
void DeleteDir(Env* env, const std::string& dirname) {
  std::vector<std::string> children;
  env->GetChildren(dirname, &children);
  for (const auto& child : children) {
    uint64_t file_number;
    FileType file_type;
    if (!ParseFileName(child, &file_number, &file_type)) {
      continue;
    }
    ASSERT_OK(env->DeleteFile(dirname + "/" + child));
  }
  env->DeleteDir(dirname);
}
// Test fixture for VersionSet. It keeps a hand-maintained "expected" copy of
// each column family's blob storage in column_families_, mutates it directly
// via AddBlobFiles()/DeleteBlobFiles(), and compares it against the state the
// VersionSet under test (vset_) builds from VersionEdits in BuildAndCheck().
class VersionTest : public testing::Test {
 public:
  TitanDBOptions db_options_;
  TitanCFOptions cf_options_;
  std::shared_ptr<BlobFileCache> file_cache_;
  // Expected per-CF storages, maintained by hand in each test.
  std::map<uint32_t, std::shared_ptr<BlobStorage>> column_families_;
  std::unique_ptr<VersionSet> vset_;
  port::Mutex mutex_;
  std::string dbname_;
  Env* env_;

  VersionTest() : dbname_(test::TmpDir()), env_(Env::Default()) {
    db_options_.dirname = dbname_ + "/titandb";
    db_options_.create_if_missing = true;
    env_->CreateDirIfMissing(dbname_);
    env_->CreateDirIfMissing(db_options_.dirname);
    auto cache = NewLRUCache(db_options_.max_open_files);
    file_cache_.reset(new BlobFileCache(db_options_, cf_options_, cache));
    Reset();
  }

  // Wipes on-disk state and rebuilds both the expected and the actual
  // storages with ten empty column families (ids 0..9).
  void Reset() {
    DeleteDir(env_, dbname_);
    vset_.reset(new VersionSet(db_options_));
    ASSERT_OK(vset_->Open({}));
    column_families_.clear();
    // Sets up some column families.
    for (uint32_t id = 0; id < 10; id++) {
      std::shared_ptr<BlobStorage> storage;
      storage.reset(new BlobStorage(cf_options_, file_cache_));
      column_families_.emplace(id, storage);
      storage.reset(new BlobStorage(cf_options_, file_cache_));
      vset_->column_families_.emplace(id, storage);
    }
  }

  // Adds blob files [start, end) to the EXPECTED storage of `cf_id`.
  void AddBlobFiles(uint32_t cf_id, uint64_t start, uint64_t end) {
    // Bind by reference (consistent with DeleteBlobFiles) to avoid an
    // unnecessary shared_ptr refcount copy.
    auto& storage = column_families_[cf_id];
    for (auto i = start; i < end; i++) {
      auto file = std::make_shared<BlobFileMeta>(i, i);
      storage->files_.emplace(i, file);
    }
  }

  // Removes blob files [start, end) from the EXPECTED storage of `cf_id`.
  void DeleteBlobFiles(uint32_t cf_id, uint64_t start, uint64_t end) {
    auto& storage = column_families_[cf_id];
    for (auto i = start; i < end; i++) {
      storage->files_.erase(i);
    }
  }

  // Applies `edits` to vset_, then asserts that for every column family the
  // set of live (non-obsolete) files matches the expected storage exactly.
  void BuildAndCheck(std::vector<VersionEdit> edits) {
    for (auto& edit : edits) {
      vset_->Apply(&edit);
    }
    for (auto& it : vset_->column_families_) {
      auto& storage = column_families_[it.first];
      // Count only live files; obsolete entries remain in files_ until GC.
      // size_t (not int) to avoid a signed/unsigned mismatch against
      // files_.size() in the ASSERT_EQ below.
      size_t size = 0;
      for (auto& file : it.second->files_) {
        if (!file.second->is_obsolete()) {
          size++;
        }
      }
      ASSERT_EQ(storage->files_.size(), size);
      for (auto& f : storage->files_) {
        auto iter = it.second->files_.find(f.first);
        ASSERT_TRUE(iter != it.second->files_.end());
        ASSERT_EQ(*f.second, *(iter->second));
      }
    }
  }
};
// Round-trips a VersionEdit through encode/decode (CheckCodec) at several
// stages of mutation to verify that the codec preserves every field.
TEST_F(VersionTest, VersionEdit) {
  VersionEdit edit;
  CheckCodec(edit);
  edit.SetNextFileNumber(1);
  edit.SetColumnFamilyID(2);
  CheckCodec(edit);
  auto meta_a = std::make_shared<BlobFileMeta>(3, 4);
  auto meta_b = std::make_shared<BlobFileMeta>(5, 6);
  edit.AddBlobFile(meta_a);
  edit.AddBlobFile(meta_b);
  edit.DeleteBlobFile(7);
  edit.DeleteBlobFile(8);
  CheckCodec(edit);
}
// Builds a VersionEdit for `cf_id` that adds one blob file per number in
// the half-open range [start, end), with file size equal to its number.
VersionEdit AddBlobFilesEdit(uint32_t cf_id, uint64_t start, uint64_t end) {
  VersionEdit result;
  result.SetColumnFamilyID(cf_id);
  for (uint64_t number = start; number < end; number++) {
    result.AddBlobFile(std::make_shared<BlobFileMeta>(number, number));
  }
  return result;
}
// Builds a VersionEdit for `cf_id` that deletes every blob file whose
// number falls in the half-open range [start, end).
VersionEdit DeleteBlobFilesEdit(uint32_t cf_id, uint64_t start, uint64_t end) {
  VersionEdit result;
  result.SetColumnFamilyID(cf_id);
  for (uint64_t number = start; number < end; number++) {
    result.DeleteBlobFile(number);
  }
  return result;
}
// Applies sequences of add/delete edits and checks the resulting live file
// sets. The comments below use the notation {cf1 ranges}, {cf2 ranges},
// where each (a, b) is the half-open file-number range [a, b) expected to
// be live after the preceding BuildAndCheck call.
TEST_F(VersionTest, VersionBuilder) {
  // {(0, 4)}, {}
  auto add1_0_4 = AddBlobFilesEdit(1, 0, 4);
  AddBlobFiles(1, 0, 4);
  BuildAndCheck({add1_0_4});

  // {(0, 8)}, {(4, 8)}
  auto add1_4_8 = AddBlobFilesEdit(1, 4, 8);
  auto add2_4_8 = AddBlobFilesEdit(2, 4, 8);
  AddBlobFiles(1, 4, 8);
  AddBlobFiles(2, 4, 8);
  BuildAndCheck({add1_4_8, add2_4_8});

  // {(0, 4), (6, 8)}, {(4, 8)}
  auto del1_4_6 = DeleteBlobFilesEdit(1, 4, 6);
  DeleteBlobFiles(1, 4, 6);
  BuildAndCheck({del1_4_6});

  // {(0, 4)}, {(4, 6)}
  auto del1_6_8 = DeleteBlobFilesEdit(1, 6, 8);
  auto del2_6_8 = DeleteBlobFilesEdit(2, 6, 8);
  DeleteBlobFiles(1, 6, 8);
  DeleteBlobFiles(2, 6, 8);
  BuildAndCheck({del1_6_8, del2_6_8});

  // Start over and apply the same edits in a single batch, including edits
  // that add and then delete the same files; the end state must match the
  // incremental run above: {(0, 4)}, {(4, 6)}
  Reset();
  AddBlobFiles(1, 0, 4);
  AddBlobFiles(2, 4, 6);
  add1_0_4 = AddBlobFilesEdit(1, 0, 4);
  add1_4_8 = AddBlobFilesEdit(1, 4, 8);
  add2_4_8 = AddBlobFilesEdit(2, 4, 8);
  del1_4_6 = DeleteBlobFilesEdit(1, 4, 6);
  del1_6_8 = DeleteBlobFilesEdit(1, 6, 8);
  del2_6_8 = DeleteBlobFilesEdit(2, 6, 8);
  BuildAndCheck({add1_0_4, add1_4_8, del1_4_6, del1_6_8, add2_4_8, del2_6_8});
}
// Verifies that a blob file deleted via LogAndApply shows up in the
// obsolete-file list, and that merely adding files produces none.
TEST_F(VersionTest, ObsoleteFiles) {
  std::map<uint32_t, TitanCFOptions> cf_opts;
  cf_opts.emplace(1, TitanCFOptions());
  vset_->AddColumnFamilies(cf_opts);
  {
    auto add_edit = AddBlobFilesEdit(1, 0, 4);
    MutexLock guard(&mutex_);
    vset_->LogAndApply(&add_edit);
  }
  ObsoleteFiles collected;
  vset_->GetObsoleteFiles(&collected, kMaxSequenceNumber);
  ASSERT_EQ(collected.blob_files.size(), 0);
  {
    auto delete_edit = DeleteBlobFilesEdit(1, 3, 4);
    MutexLock guard(&mutex_);
    vset_->LogAndApply(&delete_edit);
  }
  vset_->GetObsoleteFiles(&collected, kMaxSequenceNumber);
  ASSERT_EQ(collected.blob_files.size(), 1);
}
} // namespace titandb
} // namespace rocksdb
// Test entry point: runs every TEST_F registered above.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
// Expose the PRI*64-style format macros from <inttypes.h>; some toolchains
// gate them behind __STDC_FORMAT_MACROS.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#ifndef GFLAGS
// db_bench needs gflags for flag parsing; without it, print an error and
// exit with a failure status instead of building a broken tool.
#include <cstdio>
int main() {
  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
  return 1;
}
#else
#include <rocksdb/db_bench_tool.h>
// Delegate directly to the library implementation of the benchmark tool.
int main(int argc, char** argv) { return rocksdb::db_bench_tool(argc, argv); }
#endif  // GFLAGS
This source diff could not be displayed because it is too large. You can view the blob instead.
// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once

namespace rocksdb {
// Runs the db_bench benchmark tool with the given command-line arguments
// and returns its exit status.
int db_bench_tool(int argc, char** argv);
}  // namespace rocksdb
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment