//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include <atomic>
#include <cstdlib>
#include <functional>
#include <memory>

#include "db/db_test_util.h"
#include "db/read_callback.h"
#include "db/version_edit.h"
#include "options/options_helper.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/experimental.h"
#include "rocksdb/iostats_context.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/trace_record.h"
#include "rocksdb/trace_record_result.h"
#include "rocksdb/utilities/replayer.h"
#include "rocksdb/wal_filter.h"
#include "test_util/testutil.h"
#include "util/random.h"
#include "utilities/fault_injection_env.h"

namespace ROCKSDB_NAMESPACE {

class DBTest2 : public DBTestBase {
 public:
  DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
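  // Returns the FileMetaData for every SST file at `level` of column family
  // `cf`, taken from the current Version's storage info.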
  std::vector<FileMetaData*> GetLevelFileMetadatas(int level, int cf = 0) {
    VersionSet* const versions = dbfull()->GetVersionSet();
    assert(versions);
    ColumnFamilyData* const cfd =
        versions->GetColumnFamilySet()->GetColumnFamily(cf);
    assert(cfd);
    Version* const current = cfd->current();
    assert(current);
    VersionStorageInfo* const storage_info = current->storage_info();
    assert(storage_info);
    return storage_info->LevelFiles(level);
  }
};

TEST_F(DBTest2, OpenForReadOnly) {
  DB* db_ptr = nullptr;
  std::string dbname = test::PerThreadDBPath("db_readonly");
  Options options = CurrentOptions();
  options.create_if_missing = true;
  // OpenForReadOnly should fail but will create <dbname> in the file system
  ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
  // Since <dbname> is created, we should be able to delete the dir
  // We first get the list of files under <dbname>
  // There should not be any subdirectories -- this is not checked here
  std::vector<std::string> files;
  ASSERT_OK(env_->GetChildren(dbname, &files));
  for (auto& f : files) {
    ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
  }
  // <dbname> should be empty now and we should be able to delete it
  ASSERT_OK(env_->DeleteDir(dbname));
  options.create_if_missing = false;
  // OpenForReadOnly should fail since <dbname> was successfully deleted
  ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
  // With create_if_missing false, there should not be a dir in the file system
  ASSERT_NOK(env_->FileExists(dbname));
}

TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
  DB* db_ptr = nullptr;
  std::string dbname = test::PerThreadDBPath("db_readonly");
  Options options = CurrentOptions();
  options.create_if_missing = true;

  ColumnFamilyOptions cf_options(options);
  std::vector<ColumnFamilyDescriptor> column_families;
  column_families.push_back(
      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
  column_families.push_back(ColumnFamilyDescriptor("goku", cf_options));
  std::vector<ColumnFamilyHandle*> handles;
  // OpenForReadOnly should fail but will create <dbname> in the file system
  ASSERT_NOK(
      DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
  // Since <dbname> is created, we should be able to delete the dir
  // We first get the list of files under <dbname>
  // There should not be any subdirectories -- this is not checked here
  std::vector<std::string> files;
  ASSERT_OK(env_->GetChildren(dbname, &files));
  for (auto& f : files) {
    ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
  }
  // <dbname> should be empty now and we should be able to delete it
  ASSERT_OK(env_->DeleteDir(dbname));
  options.create_if_missing = false;
  // OpenForReadOnly should fail since <dbname> was successfully deleted
  ASSERT_NOK(
      DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
  // With create_if_missing false, there should not be a dir in the file system
  ASSERT_NOK(env_->FileExists(dbname));
}

class PartitionedIndexTestListener : public EventListener {
 public:
  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
    ASSERT_GT(info.table_properties.index_partitions, 1);
    ASSERT_EQ(info.table_properties.index_key_is_user_key, 0);
  }
};

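// Because a snapshot is taken after every write, each flushed file holds many
// internal versions of the same user keys. The listener above then checks that
// the resulting partitioned index has multiple partitions and stores internal
// keys rather than user keys.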
TEST_F(DBTest2, PartitionedIndexUserToInternalKey) {
  const int kValueSize = 10500;
  const int kNumEntriesPerFile = 1000;
  const int kNumFiles = 3;
  const int kNumDistinctKeys = 30;

  BlockBasedTableOptions table_options;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
  PartitionedIndexTestListener* listener = new PartitionedIndexTestListener();
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.listeners.emplace_back(listener);
  std::vector<const Snapshot*> snapshots;
  Reopen(options);
  Random rnd(301);

  for (int i = 0; i < kNumFiles; i++) {
    for (int j = 0; j < kNumEntriesPerFile; j++) {
      int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys;
      std::string value = rnd.RandomString(kValueSize);
      ASSERT_OK(Put("keykey_" + std::to_string(key_id), value));
      snapshots.push_back(db_->GetSnapshot());
    }
    ASSERT_OK(Flush());
  }

  for (auto s : snapshots) {
    db_->ReleaseSnapshot(s);
  }
}


class PrefixFullBloomWithReverseComparator
    : public DBTestBase,
      public ::testing::WithParamInterface<bool> {
 public:
  PrefixFullBloomWithReverseComparator()
      : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {}
  void SetUp() override { if_cache_filter_ = GetParam(); }
  bool if_cache_filter_;
};

TEST_P(PrefixFullBloomWithReverseComparator,
       PrefixFullBloomWithReverseComparator) {
  Options options = last_options_;
  options.comparator = ReverseBytewiseComparator();
  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  BlockBasedTableOptions bbto;
  if (if_cache_filter_) {
    bbto.no_block_cache = false;
    bbto.cache_index_and_filter_blocks = true;
    bbto.block_cache = NewLRUCache(1);
  }
  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
  bbto.whole_key_filtering = false;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
  ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
  ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));

  ASSERT_OK(dbfull()->Flush(FlushOptions()));

  if (bbto.block_cache) {
    bbto.block_cache->EraseUnRefEntries();
  }

  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
  iter->Seek("bar345");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("bar234", iter->key().ToString());
  ASSERT_EQ("foo2", iter->value().ToString());
  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("bar123", iter->key().ToString());
  ASSERT_EQ("foo", iter->value().ToString());

  iter->Seek("foo234");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("foo123", iter->key().ToString());
  ASSERT_EQ("foo3", iter->value().ToString());

  iter->Seek("bar");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());
}

INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
                        PrefixFullBloomWithReverseComparator, testing::Bool());

TEST_F(DBTest2, IteratorPropertyVersionNumber) {
  ASSERT_OK(Put("", ""));
  Iterator* iter1 = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter1->status());
  std::string prop_value;
  ASSERT_OK(
      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
  uint64_t version_number1 =
      static_cast<uint64_t>(std::atoi(prop_value.c_str()));

  ASSERT_OK(Put("", ""));
  ASSERT_OK(Flush());

  Iterator* iter2 = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter2->status());
  ASSERT_OK(
      iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
  uint64_t version_number2 =
      static_cast<uint64_t>(std::atoi(prop_value.c_str()));

  ASSERT_GT(version_number2, version_number1);

  ASSERT_OK(Put("", ""));

  Iterator* iter3 = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter3->status());
  ASSERT_OK(
      iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
  uint64_t version_number3 =
      static_cast<uint64_t>(std::atoi(prop_value.c_str()));

  ASSERT_EQ(version_number2, version_number3);

  iter1->SeekToFirst();
  ASSERT_OK(
      iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
  uint64_t version_number1_new =
      static_cast<uint64_t>(std::atoi(prop_value.c_str()));
  ASSERT_EQ(version_number1, version_number1_new);

  delete iter1;
  delete iter2;
  delete iter3;
}

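// Flush a column family that caches its index and filter blocks, then reopen
// the DB and read a key back to make sure the table is still readable.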
TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  CreateAndReopenWithCF({"pikachu"}, options);

  ASSERT_OK(Put(1, "a", "begin"));
  ASSERT_OK(Put(1, "z", "end"));
  ASSERT_OK(Flush(1));
  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));

  std::string value;
  value = Get(1, "a");
}

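// Accumulate merge operands under max_successive_merges = 3, then verify that
// recovery still succeeds after lowering the limit to 2 across a reopen.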
TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  options.max_successive_merges = 3;
  options.merge_operator = MergeOperators::CreatePutOperator();
  options.disable_auto_compactions = true;
  DestroyAndReopen(options);
  ASSERT_OK(Put("poi", "Finch"));
  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese"));
  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw"));
  ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root"));
  options.max_successive_merges = 2;
  Reopen(options);
}

class DBTestSharedWriteBufferAcrossCFs
    : public DBTestBase,
      public testing::WithParamInterface<std::tuple<bool, bool>> {
 public:
  DBTestSharedWriteBufferAcrossCFs()
      : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {}
  void SetUp() override {
    use_old_interface_ = std::get<0>(GetParam());
    cost_cache_ = std::get<1>(GetParam());
  }
  bool use_old_interface_;
  bool cost_cache_;
};

TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  auto flush_listener = std::make_shared<FlushCounterListener>();
  options.listeners.push_back(flush_listener);
  // Don't trip the listener at shutdown.
  options.avoid_flush_during_shutdown = true;

  // Avoid nondeterministic values from malloc_usable_size();
  // force the arena block size to 1.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Arena::Arena:0", [&](void* arg) {
        size_t* block_size = static_cast<size_t*>(arg);
        *block_size = 1;
      });

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Arena::AllocateNewBlock:0", [&](void* arg) {
        std::pair<size_t*, size_t*>* pair =
            static_cast<std::pair<size_t*, size_t*>*>(arg);
        *std::get<0>(*pair) = *std::get<1>(*pair);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // The total soft write buffer size is about 105000
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  ASSERT_LT(cache->GetUsage(), 256 * 1024);

  if (use_old_interface_) {
    options.db_write_buffer_size = 120000;  // this is the real limit
  } else if (!cost_cache_) {
    options.write_buffer_manager.reset(new WriteBufferManager(114285));
  } else {
    options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
  }
  options.write_buffer_size = 500000;  // this is never hit
  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);

  WriteOptions wo;
  wo.disableWAL = true;

  std::function<void()> wait_flush = [&]() {
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
    // Ensure background work is fully finished including listener callbacks
    // before accessing listener state.
    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
  };

  // Create some data and flush "default" and "nikitich" so that their
  // memtables are the most recently created ones.
  flush_listener->expected_flush_reason = FlushReason::kManualFlush;
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(3));
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  ASSERT_OK(Flush(0));
  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
            static_cast<uint64_t>(1));
  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
            static_cast<uint64_t>(1));

  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
  ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
  if (cost_cache_) {
    ASSERT_GE(cache->GetUsage(), 256 * 1024);
    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
  }
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
  if (cost_cache_) {
    ASSERT_GE(cache->GetUsage(), 256 * 1024);
    ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
  }
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  // No flush should trigger
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(1));
  }

  // Trigger a flush. Flushing "nikitich".
  ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(2));
  }

  // Without hitting the threshold, no flush should trigger.
  ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(2));
  }

  // Hit the write buffer limit again. "default"
  // will have been flushed.
  ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
  wait_flush();
  ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(2));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(2));
  }

  // Trigger another flush. This time "dobrynia" is flushed. "pikachu" should
  // not be flushed, even though it has never been flushed.
  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
  wait_flush();
  ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();

  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(2));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
              static_cast<uint64_t>(2));
  }
  if (cost_cache_) {
    ASSERT_GE(cache->GetUsage(), 256 * 1024);
    Close();
    options.write_buffer_manager.reset();
    last_options_.write_buffer_manager.reset();
    ASSERT_LT(cache->GetUsage(), 256 * 1024);
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
                        DBTestSharedWriteBufferAcrossCFs,
                        ::testing::Values(std::make_tuple(true, false),
                                          std::make_tuple(false, false),
                                          std::make_tuple(false, true)));

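// Share one WriteBufferManager between two DB instances and verify that
// exceeding the limit triggers a flush in whichever DB/CF holds the most
// memtable data.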
TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
  std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  auto flush_listener = std::make_shared<FlushCounterListener>();
  options.listeners.push_back(flush_listener);
  // Don't trip the listener at shutdown.
  options.avoid_flush_during_shutdown = true;
  // Avoid nondeterministic values from malloc_usable_size();
  // force the arena block size to 1.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Arena::Arena:0", [&](void* arg) {
        size_t* block_size = static_cast<size_t*>(arg);
        *block_size = 1;
      });

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Arena::AllocateNewBlock:0", [&](void* arg) {
        std::pair<size_t*, size_t*>* pair =
            static_cast<std::pair<size_t*, size_t*>*>(arg);
        *std::get<0>(*pair) = *std::get<1>(*pair);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  options.write_buffer_size = 500000;  // this is never hit
  // Use a write buffer total size so that the soft limit is about
  // 105000.
  options.write_buffer_manager.reset(new WriteBufferManager(120000));
  CreateAndReopenWithCF({"cf1", "cf2"}, options);

  ASSERT_OK(DestroyDB(dbname2, options));
  DB* db2 = nullptr;
  ASSERT_OK(DB::Open(options, dbname2, &db2));

  WriteOptions wo;
  wo.disableWAL = true;

  std::function<void()> wait_flush = [&]() {
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
    ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
    // Ensure background work is fully finished including listener callbacks
    // before accessing listener state.
    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
    ASSERT_OK(
        static_cast_with_check<DBImpl>(db2)->TEST_WaitForBackgroundWork());
  };

  // Trigger a flush on cf2
  flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
  ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
  wait_flush();
  ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
  wait_flush();

  // Insert to DB2
  ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
  wait_flush();

  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();
  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
                  GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
                  GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
              static_cast<uint64_t>(0));
  }

  // Trigger a flush of another CF in DB1
  ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
  wait_flush();
  ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
  wait_flush();
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
              static_cast<uint64_t>(0));
  }

  // Trigger a flush in DB2.
  ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
  wait_flush();
  ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
  wait_flush();
  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
  {
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
              static_cast<uint64_t>(0));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
              static_cast<uint64_t>(1));
    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
              static_cast<uint64_t>(1));
  }

  delete db2;
  ASSERT_OK(DestroyDB(dbname2, options));

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
  Options options = CurrentOptions();
  options.arena_block_size = 4096;
  std::shared_ptr<Cache> cache = NewLRUCache(LRUCacheOptions(
      10000000 /* capacity */, 1 /* num_shard_bits */,
      false /* strict_capacity_limit */, 0.0 /* high_pri_pool_ratio */,
      nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
      kDontChargeCacheMetadata));

  options.write_buffer_size = 50000;  // this is never hit
  // A total write buffer size of 0 means no flush limit is enforced; memtable
  // memory is only charged to the block cache.
  options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
  Reopen(options);

  ASSERT_OK(Put("foo", "bar"));
  // One dummy entry is 256KB.
  ASSERT_GT(cache->GetUsage(), 128000);
}

namespace {
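// MultiGets both key lists and asserts that every key in `keys_must_exist` is
// found and that every key in `keys_must_not_exist` returns NotFound.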
void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
                          const std::vector<Slice>& keys_must_not_exist) {
  // Ensure that expected keys exist
  std::vector<std::string> values;
  if (keys_must_exist.size() > 0) {
    std::vector<Status> status_list =
        db->MultiGet(ReadOptions(), keys_must_exist, &values);
    for (size_t i = 0; i < keys_must_exist.size(); i++) {
      ASSERT_OK(status_list[i]);
    }
  }

  // Ensure that given keys don't exist
  if (keys_must_not_exist.size() > 0) {
    std::vector<Status> status_list =
        db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
    for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
      ASSERT_TRUE(status_list[i].IsNotFound());
    }
  }
}

}  // anonymous namespace

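// Replays the WAL through a filter that applies a single WalProcessingOption
// at one record index, and checks which keys survive recovery for each option.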
TEST_F(DBTest2, WalFilterTest) {
  class TestWalFilter : public WalFilter {
   private:
    // Processing option that is requested to be applied at the given index
    WalFilter::WalProcessingOption wal_processing_option_;
    // Index at which to apply wal_processing_option_
    // At other indexes default wal_processing_option::kContinueProcessing is
    // returned.
    size_t apply_option_at_record_index_;
    // Current record index, incremented with each record encountered.
    size_t current_record_index_;

   public:
    TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
                  size_t apply_option_for_record_index)
        : wal_processing_option_(wal_processing_option),
          apply_option_at_record_index_(apply_option_for_record_index),
          current_record_index_(0) {}

    WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
                                  WriteBatch* /*new_batch*/,
                                  bool* /*batch_changed*/) const override {
      WalFilter::WalProcessingOption option_to_return;

      if (current_record_index_ == apply_option_at_record_index_) {
        option_to_return = wal_processing_option_;
      } else {
        option_to_return = WalProcessingOption::kContinueProcessing;
      }

      // Filter is passed as a const object for RocksDB to not modify the
      // object, however we modify it for our own purpose here and hence
      // cast the constness away.
      (const_cast<TestWalFilter*>(this)->current_record_index_)++;

      return option_to_return;
    }

    const char* Name() const override { return "TestWalFilter"; }
  };

  // Create 3 batches with two keys each
  std::vector<std::vector<std::string>> batch_keys(3);

  batch_keys[0].push_back("key1");
  batch_keys[0].push_back("key2");
  batch_keys[1].push_back("key3");
  batch_keys[1].push_back("key4");
  batch_keys[2].push_back("key5");
  batch_keys[2].push_back("key6");

  // Test with all WAL processing options
  for (int option = 0;
       option < static_cast<int>(
                    WalFilter::WalProcessingOption::kWalProcessingOptionMax);
       option++) {
    Options options = OptionsForLogIterTest();
    DestroyAndReopen(options);
    CreateAndReopenWithCF({"pikachu"}, options);

    // Write given keys in given batches
    for (size_t i = 0; i < batch_keys.size(); i++) {
      WriteBatch batch;
      for (size_t j = 0; j < batch_keys[i].size(); j++) {
        ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
      }
      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
    }

    WalFilter::WalProcessingOption wal_processing_option =
        static_cast<WalFilter::WalProcessingOption>(option);

    // Create a test filter that applies wal_processing_option at record
    // index 1
    size_t apply_option_for_record_index = 1;
    TestWalFilter test_wal_filter(wal_processing_option,
                                  apply_option_for_record_index);

    // Reopen database with option to use WAL filter
    options = OptionsForLogIterTest();
    options.wal_filter = &test_wal_filter;
    Status status =
        TryReopenWithColumnFamilies({"default", "pikachu"}, options);
    if (wal_processing_option ==
        WalFilter::WalProcessingOption::kCorruptedRecord) {
      ASSERT_NOK(status);
      // In case of corruption we can turn off paranoid_checks to reopen the
      // database
      options.paranoid_checks = false;
      ReopenWithColumnFamilies({"default", "pikachu"}, options);
    } else {
      ASSERT_OK(status);
    }

    // Compute which keys we expect to be found
    // and which we expect not to be found after recovery.
    std::vector<Slice> keys_must_exist;
    std::vector<Slice> keys_must_not_exist;
    switch (wal_processing_option) {
      case WalFilter::WalProcessingOption::kCorruptedRecord:
      case WalFilter::WalProcessingOption::kContinueProcessing: {
        fprintf(stderr, "Testing with complete WAL processing\n");
        // we expect all records to be processed
        for (size_t i = 0; i < batch_keys.size(); i++) {
          for (size_t j = 0; j < batch_keys[i].size(); j++) {
            keys_must_exist.push_back(Slice(batch_keys[i][j]));
          }
        }
        break;
      }
      case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
        fprintf(stderr,
                "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
                apply_option_for_record_index);
        // We expect the keys in the record at apply_option_for_record_index
        // not to be found.
        for (size_t i = 0; i < batch_keys.size(); i++) {
          for (size_t j = 0; j < batch_keys[i].size(); j++) {
            if (i == apply_option_for_record_index) {
              keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
            } else {
              keys_must_exist.push_back(Slice(batch_keys[i][j]));
            }
          }
        }
        break;
      }
      case WalFilter::WalProcessingOption::kStopReplay: {
        fprintf(stderr,
                "Testing with stopping replay from record %" ROCKSDB_PRIszt
                "\n",
                apply_option_for_record_index);
        // We expect records at and beyond apply_option_for_record_index not
        // to be found.
        for (size_t i = 0; i < batch_keys.size(); i++) {
          for (size_t j = 0; j < batch_keys[i].size(); j++) {
            if (i >= apply_option_for_record_index) {
              keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
            } else {
              keys_must_exist.push_back(Slice(batch_keys[i][j]));
            }
          }
        }
        break;
      }
      default:
        FAIL();  // unhandled case
    }

    bool checked_after_reopen = false;

    while (true) {
      // Ensure that expected keys exist and
      // unexpected keys don't exist after recovery
      ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);

      if (checked_after_reopen) {
        break;
      }

      // Reopen the database again (this time without the WAL filter) to make
      // sure the previous log(s) are not used, even if they were skipped.
      options = OptionsForLogIterTest();
      ReopenWithColumnFamilies({"default", "pikachu"}, options);

      checked_after_reopen = true;
    }
  }
}

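// The WAL filter rewrites every batch from a chosen record index onward,
// keeping only a limited number of keys per rewritten batch; recovery should
// reflect the rewritten batches.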
TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
  class ChangeBatchHandler : public WriteBatch::Handler {
   private:
    // Batch to insert keys in
    WriteBatch* new_write_batch_;
    // Number of keys to add in the new batch
    size_t num_keys_to_add_in_new_batch_;
    // Number of keys added to new batch
    size_t num_keys_added_;

   public:
    ChangeBatchHandler(WriteBatch* new_write_batch,
                       size_t num_keys_to_add_in_new_batch)
        : new_write_batch_(new_write_batch),
          num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
          num_keys_added_(0) {}
    void Put(const Slice& key, const Slice& value) override {
      if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
        ASSERT_OK(new_write_batch_->Put(key, value));
        ++num_keys_added_;
      }
    }
  };

  class TestWalFilterWithChangeBatch : public WalFilter {
   private:
    // Index at which to start changing records
    size_t change_records_from_index_;
    // Number of keys to add in the new batch
    size_t num_keys_to_add_in_new_batch_;
    // Current record index, incremented with each record encountered.
    size_t current_record_index_;

   public:
    TestWalFilterWithChangeBatch(size_t change_records_from_index,
                                 size_t num_keys_to_add_in_new_batch)
        : change_records_from_index_(change_records_from_index),
          num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
          current_record_index_(0) {}

    WalProcessingOption LogRecord(const WriteBatch& batch,
                                  WriteBatch* new_batch,
                                  bool* batch_changed) const override {
      if (current_record_index_ >= change_records_from_index_) {
        ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
        Status s = batch.Iterate(&handler);
        if (s.ok()) {
          *batch_changed = true;
        } else {
          assert(false);
        }
      }

      // Filter is passed as a const object for RocksDB to not modify the
      // object, however we modify it for our own purpose here and hence
      // cast the constness away.
      (const_cast<TestWalFilterWithChangeBatch*>(this)
           ->current_record_index_)++;

      return WalProcessingOption::kContinueProcessing;
    }

    const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
  };

  std::vector<std::vector<std::string>> batch_keys(3);

  batch_keys[0].push_back("key1");
  batch_keys[0].push_back("key2");
  batch_keys[1].push_back("key3");
  batch_keys[1].push_back("key4");
  batch_keys[2].push_back("key5");
  batch_keys[2].push_back("key6");

  Options options = OptionsForLogIterTest();
  DestroyAndReopen(options);
  CreateAndReopenWithCF({"pikachu"}, options);

  // Write given keys in given batches
  for (size_t i = 0; i < batch_keys.size(); i++) {
    WriteBatch batch;
    for (size_t j = 0; j < batch_keys[i].size(); j++) {
      ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
    }
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
  }

  // Create a test filter that rewrites batches starting at record index 1,
  // keeping a single key per rewritten batch
  size_t change_records_from_index = 1;
  size_t num_keys_to_add_in_new_batch = 1;
  TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
      change_records_from_index, num_keys_to_add_in_new_batch);

  // Reopen database with option to use WAL filter
  options = OptionsForLogIterTest();
  options.wal_filter = &test_wal_filter_with_change_batch;
  ReopenWithColumnFamilies({"default", "pikachu"}, options);

  // Ensure that all keys exist before change_records_from_index_, and that
  // after that index only a single key per batch exists,
  // since our filter keeps only a single key for each rewritten batch
  std::vector<Slice> keys_must_exist;
  std::vector<Slice> keys_must_not_exist;

  for (size_t i = 0; i < batch_keys.size(); i++) {
    for (size_t j = 0; j < batch_keys[i].size(); j++) {
      if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
        keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
      } else {
        keys_must_exist.push_back(Slice(batch_keys[i][j]));
      }
    }
  }

  bool checked_after_reopen = false;

  while (true) {
    // Ensure that expected keys exist and
    // unexpected keys don't exist after recovery
    ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);

    if (checked_after_reopen) {
      break;
    }

    // Reopen the database again (this time without the WAL filter) to make
    // sure the previous log(s) are not used, even if they were skipped.
    options = OptionsForLogIterTest();
    ReopenWithColumnFamilies({"default", "pikachu"}, options);

    checked_after_reopen = true;
  }
}

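// A WAL filter that grows a batch by adding an extra key is not supported:
// the reopen must fail with NotSupported and leave the DB contents untouched.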
TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
  class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
   public:
    WalProcessingOption LogRecord(const WriteBatch& batch,
                                  WriteBatch* new_batch,
                                  bool* batch_changed) const override {
      *new_batch = batch;
      Status s = new_batch->Put("key_extra", "value_extra");
      if (s.ok()) {
        *batch_changed = true;
      } else {
        assert(false);
      }
      return WalProcessingOption::kContinueProcessing;
    }

    const char* Name() const override {
      return "WalFilterTestWithChangeBatchExtraKeys";
    }
  };

  std::vector<std::vector<std::string>> batch_keys(3);

  batch_keys[0].push_back("key1");
  batch_keys[0].push_back("key2");
  batch_keys[1].push_back("key3");
  batch_keys[1].push_back("key4");
  batch_keys[2].push_back("key5");
  batch_keys[2].push_back("key6");

  Options options = OptionsForLogIterTest();
  DestroyAndReopen(options);
  CreateAndReopenWithCF({"pikachu"}, options);

  // Write given keys in given batches
  for (size_t i = 0; i < batch_keys.size(); i++) {
    WriteBatch batch;
    for (size_t j = 0; j < batch_keys[i].size(); j++) {
      ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
    }
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
  }

  // Create a test filter that would add extra keys
  TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;

  // Reopen database with option to use WAL filter
  options = OptionsForLogIterTest();
  options.wal_filter = &test_wal_filter_extra_keys;
  Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
  ASSERT_TRUE(status.IsNotSupported());

  // Reopen without filter, now reopen should succeed - previous
  // attempt to open must not have altered the db.
  options = OptionsForLogIterTest();
  ReopenWithColumnFamilies({"default", "pikachu"}, options);

  std::vector<Slice> keys_must_exist;
  std::vector<Slice> keys_must_not_exist;  // empty vector

  for (size_t i = 0; i < batch_keys.size(); i++) {
    for (size_t j = 0; j < batch_keys[i].size(); j++) {
      keys_must_exist.push_back(Slice(batch_keys[i][j]));
    }
  }

  ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
}

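// Exercises ColumnFamilyLogNumberMap(): after flushing only the default CF,
// the filter should see only post-flush records as applicable to "default"
// but both pre- and post-flush records as applicable to "pikachu".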
TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
  class TestWalFilterWithColumnFamilies : public WalFilter {
   private:
    // column_family_id -> log_number map (provided to WALFilter)
    std::map<uint32_t, uint64_t> cf_log_number_map_;
    // column_family_name -> column_family_id map (provided to WALFilter)
    std::map<std::string, uint32_t> cf_name_id_map_;
    // column_family_name -> keys_found_in_wal map
    // We store keys that are applicable to the column_family
    // during recovery (i.e. aren't already flushed to SST file(s))
    // for verification against the keys we expect.
    std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;

   public:
    void ColumnFamilyLogNumberMap(
        const std::map<uint32_t, uint64_t>& cf_lognumber_map,
        const std::map<std::string, uint32_t>& cf_name_id_map) override {
      cf_log_number_map_ = cf_lognumber_map;
      cf_name_id_map_ = cf_name_id_map;
    }

    WalProcessingOption LogRecordFound(unsigned long long log_number,
                                       const std::string& /*log_file_name*/,
                                       const WriteBatch& batch,
                                       WriteBatch* /*new_batch*/,
                                       bool* /*batch_changed*/) override {
      class LogRecordBatchHandler : public WriteBatch::Handler {
       private:
        const std::map<uint32_t, uint64_t>& cf_log_number_map_;
        std::map<uint32_t, std::vector<std::string>>& cf_wal_keys_;
        unsigned long long log_number_;

       public:
        LogRecordBatchHandler(
            unsigned long long current_log_number,
            const std::map<uint32_t, uint64_t>& cf_log_number_map,
            std::map<uint32_t, std::vector<std::string>>& cf_wal_keys)
            : cf_log_number_map_(cf_log_number_map),
              cf_wal_keys_(cf_wal_keys),
              log_number_(current_log_number) {}

        Status PutCF(uint32_t column_family_id, const Slice& key,
                     const Slice& /*value*/) override {
          auto it = cf_log_number_map_.find(column_family_id);
          assert(it != cf_log_number_map_.end());
          unsigned long long log_number_for_cf = it->second;
          // If the current record is applicable for column_family_id
          // (i.e. isn't flushed to SST file(s) for column_family_id)
          // add it to the cf_wal_keys_ map for verification.
          if (log_number_ >= log_number_for_cf) {
            cf_wal_keys_[column_family_id].push_back(
                std::string(key.data(), key.size()));
          }
          return Status::OK();
        }
      } handler(log_number, cf_log_number_map_, cf_wal_keys_);

      Status s = batch.Iterate(&handler);
      if (!s.ok()) {
        // TODO(AR) is this ok?
        return WalProcessingOption::kCorruptedRecord;
      }

      return WalProcessingOption::kContinueProcessing;
    }

    const char* Name() const override {
      return "WalFilterTestWithColumnFamilies";
    }

    const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
      return cf_wal_keys_;
    }

    const std::map<std::string, uint32_t>& GetColumnFamilyNameIdMap() {
      return cf_name_id_map_;
    }
  };

  std::vector<std::vector<std::string>> batch_keys_pre_flush(3);

  batch_keys_pre_flush[0].push_back("key1");
  batch_keys_pre_flush[0].push_back("key2");
  batch_keys_pre_flush[1].push_back("key3");
  batch_keys_pre_flush[1].push_back("key4");
  batch_keys_pre_flush[2].push_back("key5");
  batch_keys_pre_flush[2].push_back("key6");

  Options options = OptionsForLogIterTest();
  DestroyAndReopen(options);
  CreateAndReopenWithCF({"pikachu"}, options);

  // Write given keys in given batches
  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
    WriteBatch batch;
    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
      ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j],
                          DummyString(1024)));
      ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j],
                          DummyString(1024)));
    }
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
  }

  // Flush default column-family
  ASSERT_OK(db_->Flush(FlushOptions(), handles_[0]));

  // Do some more writes
  std::vector<std::vector<std::string>> batch_keys_post_flush(3);

  batch_keys_post_flush[0].push_back("key7");
  batch_keys_post_flush[0].push_back("key8");
  batch_keys_post_flush[1].push_back("key9");
  batch_keys_post_flush[1].push_back("key10");
  batch_keys_post_flush[2].push_back("key11");
  batch_keys_post_flush[2].push_back("key12");

  // Write given keys in given batches
  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
    WriteBatch batch;
    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
      ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j],
                          DummyString(1024)));
      ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j],
                          DummyString(1024)));
    }
    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
  }

  // On recovery we should find only the post-flush batches applicable to the
  // default CF, but both pre- and post-flush batches applicable to pikachu CF

  // Create a test filter that records the WAL keys applicable to each CF
  TestWalFilterWithColumnFamilies test_wal_filter_column_families;

  // Reopen database with option to use WAL filter
  options = OptionsForLogIterTest();
  options.wal_filter = &test_wal_filter_column_families;
  Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
  ASSERT_OK(status);

  // verify that handles_[0] only has post_flush keys
  // while handles_[1] has pre and post flush keys
  auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
  auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
  size_t index = 0;
  auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
  // default column-family, only post_flush keys are expected
  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
      Slice key_from_the_log(keys_cf[index++]);
      Slice batch_key(batch_keys_post_flush[i][j]);
      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
    }
  }
  ASSERT_EQ(index, keys_cf.size());

  index = 0;
  keys_cf = cf_wal_keys[name_id_map["pikachu"]];
  // pikachu column-family, all keys are expected
  for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
    for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
      Slice key_from_the_log(keys_cf[index++]);
      Slice batch_key(batch_keys_pre_flush[i][j]);
      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
    }
  }

  for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
    for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
      Slice key_from_the_log(keys_cf[index++]);
      Slice batch_key(batch_keys_post_flush[i][j]);
      ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
    }
  }
  ASSERT_EQ(index, keys_cf.size());
}

TEST_F(DBTest2, PresetCompressionDict) {
  // Verifies that compression ratio improves when dictionary is enabled, and
  // improves even further when the dictionary is trained by ZSTD.
  const size_t kBlockSizeBytes = 4 << 10;
  const size_t kL0FileBytes = 128 << 10;
  const size_t kApproxPerBlockOverheadBytes = 50;
  const int kNumL0Files = 5;

  Options options;
  // Make sure to use any custom env that the test is configured with.
  options.env = CurrentOptions().env;
  options.allow_concurrent_memtable_write = false;
  options.arena_block_size = kBlockSizeBytes;
  options.create_if_missing = true;
  options.disable_auto_compactions = true;
  options.level0_file_num_compaction_trigger = kNumL0Files;
  options.memtable_factory.reset(
      test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
  options.num_levels = 2;
  options.target_file_size_base = kL0FileBytes;
  options.target_file_size_multiplier = 2;
  options.write_buffer_size = kL0FileBytes;
  BlockBasedTableOptions table_options;
  table_options.block_size = kBlockSizeBytes;
  std::vector<CompressionType> compression_types;
  if (Zlib_Supported()) {
    compression_types.push_back(kZlibCompression);
  }
#if LZ4_VERSION_NUMBER >= 10400  // r124+
  compression_types.push_back(kLZ4Compression);
  compression_types.push_back(kLZ4HCCompression);
#endif  // LZ4_VERSION_NUMBER >= 10400
  if (ZSTD_Supported()) {
    compression_types.push_back(kZSTD);
  }

  enum DictionaryTypes : int {
    kWithoutDict,
    kWithDict,
    kWithZSTDfinalizeDict,
    kWithZSTDTrainedDict,
    kDictEnd,
  };

  for (auto compression_type : compression_types) {
    options.compression = compression_type;
    size_t bytes_without_dict = 0;
    size_t bytes_with_dict = 0;
    size_t bytes_with_zstd_finalize_dict = 0;
    size_t bytes_with_zstd_trained_dict = 0;
    for (int i = kWithoutDict; i < kDictEnd; i++) {
      // First iteration: compress without preset dictionary
      // Second iteration: compress with preset dictionary
      // Third iteration (zstd only): compress with zstd-trained dictionary
      //
      // To make sure the compression dictionary has the intended effect, we
      // verify the compressed size is smaller in successive iterations. Also in
      // the non-first iterations, verify the data we get out is the same data
      // we put in.
      switch (i) {
        case kWithoutDict:
          options.compression_opts.max_dict_bytes = 0;
          options.compression_opts.zstd_max_train_bytes = 0;
          break;
        case kWithDict:
          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
          options.compression_opts.zstd_max_train_bytes = 0;
          break;
        case kWithZSTDfinalizeDict:
          if (compression_type != kZSTD ||
              !ZSTD_FinalizeDictionarySupported()) {
            continue;
          }
          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
          options.compression_opts.use_zstd_dict_trainer = false;
          break;
        case kWithZSTDTrainedDict:
          if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
            continue;
          }
          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
          options.compression_opts.use_zstd_dict_trainer = true;
          break;
        default:
          assert(false);
      }

      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
      CreateAndReopenWithCF({"pikachu"}, options);
      Random rnd(301);
      std::string seq_datas[10];
      for (int j = 0; j < 10; ++j) {
        seq_datas[j] =
            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
      }

      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
      for (int j = 0; j < kNumL0Files; ++j) {
        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
                        seq_datas[(key_num / 10) % 10]));
        }
        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
      }
      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
                                            true /* disallow_trivial_move */));
      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);

      // Get the live sst files size
      size_t total_sst_bytes = TotalSize(1);
      if (i == kWithoutDict) {
        bytes_without_dict = total_sst_bytes;
      } else if (i == kWithDict) {
        bytes_with_dict = total_sst_bytes;
      } else if (i == kWithZSTDfinalizeDict) {
        bytes_with_zstd_finalize_dict = total_sst_bytes;
      } else if (i == kWithZSTDTrainedDict) {
        bytes_with_zstd_trained_dict = total_sst_bytes;
      }

      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
           j++) {
        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
      }
      if (i == kWithDict) {
        ASSERT_GT(bytes_without_dict, bytes_with_dict);
      } else if (i == kWithZSTDfinalizeDict) {
        // In zstd compression, it is sometimes possible that using a finalized
        // dictionary does not get as good a compression ratio as raw content
        // dictionary. But using a dictionary should always get better
        // compression ratio than not using one.
        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
                    bytes_without_dict > bytes_with_zstd_finalize_dict);
      } else if (i == kWithZSTDTrainedDict) {
        // In zstd compression, it is sometimes possible that using a trained
        // dictionary does not get as good a compression ratio as without
        // training.
        // But using a dictionary (with or without training) should always get
        // better compression ratio than not using one.
        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
                    bytes_without_dict > bytes_with_zstd_trained_dict);
      }

      DestroyAndReopen(options);
    }
  }
}

TEST_F(DBTest2, PresetCompressionDictLocality) {
  if (!ZSTD_Supported()) {
    return;
  }
  // Verifies that compression dictionary is generated from local data. The
  // verification simply checks all output SSTs have different compression
  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
  // the future.
  const int kNumEntriesPerFile = 1 << 10;  // 1K entries
  const int kNumBytesPerEntry = 1 << 10;   // 1KB
  const int kNumFiles = 4;
  Options options = CurrentOptions();
  options.compression = kZSTD;
  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  Reopen(options);

  Random rnd(301);
  for (int i = 0; i < kNumFiles; ++i) {
    for (int j = 0; j < kNumEntriesPerFile; ++j) {
      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
                    rnd.RandomString(kNumBytesPerEntry)));
    }
    ASSERT_OK(Flush());
    MoveFilesToLevel(1);
    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
  }

  // Store all the dictionaries generated during a full compaction.
  std::vector<std::string> compression_dicts;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
      [&](void* arg) {
        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  CompactRangeOptions compact_range_opts;
  compact_range_opts.bottommost_level_compaction =
      BottommostLevelCompaction::kForceOptimized;
  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));

  // Dictionary compression should not be so good as to compress four totally
  // random files into one. If it does then there's probably something wrong
  // with the test.
  ASSERT_GT(NumTableFilesAtLevel(1), 1);

  // Furthermore, there should be one compression dictionary generated per
  // file. And they should all be different from each other.
  ASSERT_EQ(NumTableFilesAtLevel(1),
            static_cast<int>(compression_dicts.size()));
  for (size_t i = 1; i < compression_dicts.size(); ++i) {
    std::string& a = compression_dicts[i - 1];
    std::string& b = compression_dicts[i];
    size_t alen = a.size();
    size_t blen = b.size();
    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
  }
}

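// Parameterized over every dictionary-capable compression type and over
// whether the dictionary settings are applied through compression_opts or
// bottommost_compression_opts.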
class PresetCompressionDictTest
    : public DBTestBase,
      public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
 public:
  PresetCompressionDictTest()
      : DBTestBase("db_test2", false /* env_do_fsync */),
        compression_type_(std::get<0>(GetParam())),
        bottommost_(std::get<1>(GetParam())) {}

 protected:
  const CompressionType compression_type_;
  const bool bottommost_;
};

INSTANTIATE_TEST_CASE_P(
    DBTest2, PresetCompressionDictTest,
    ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
                       ::testing::Bool()));

TEST_P(PresetCompressionDictTest, Flush) {
  // Verifies that dictionary is generated and written during flush only when
  // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
  // size of the dictionary is within expectations according to the limit on
  // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
  const size_t kValueLen = 256;
  const size_t kKeysPerFile = 1 << 10;
  const size_t kDictLen = 16 << 10;
  const size_t kBlockLen = 4 << 10;

  Options options = CurrentOptions();
  if (bottommost_) {
    options.bottommost_compression = compression_type_;
    options.bottommost_compression_opts.enabled = true;
    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
  } else {
    options.compression = compression_type_;
    options.compression_opts.max_dict_bytes = kDictLen;
    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
  }
  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions bbto;
  bbto.block_size = kBlockLen;
  bbto.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(options);

  Random rnd(301);
  for (size_t i = 0; i <= kKeysPerFile; ++i) {
    ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
  }
  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());

  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
  // compression dictionary exists since dictionaries would be preloaded when
  // the flush finishes.
  if (bottommost_) {
    // Flush is never considered bottommost. This should change in the future
    // since flushed files may have nothing underneath them, like the one in
    // this test case.
    ASSERT_EQ(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        0);
  } else {
    ASSERT_GT(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        0);
    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
    // number of bytes needs to be adjusted in case the cached block is in
    // ZSTD's digested dictionary format.
    if (compression_type_ != kZSTD &&
        compression_type_ != kZSTDNotFinalCompression) {
      // Although we limited buffering to `kBlockLen`, there may be up to two
      // blocks of data included in the dictionary since we only check limit
      // after each block is built.
      ASSERT_LE(TestGetTickerCount(options,
                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
                2 * kBlockLen);
    }
  }
}

TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
  // Verifies that dictionary is generated and written during compaction to
  // non-bottommost level only when `ColumnFamilyOptions::compression` enables
  // dictionary. Also verifies the size of the dictionary is within expectations
  // according to the limit on buffering set by
  // `CompressionOptions::max_dict_buffer_bytes`.
  const size_t kValueLen = 256;
  const size_t kKeysPerFile = 1 << 10;
  const size_t kDictLen = 16 << 10;
  const size_t kBlockLen = 4 << 10;

  Options options = CurrentOptions();
  if (bottommost_) {
    options.bottommost_compression = compression_type_;
    options.bottommost_compression_opts.enabled = true;
    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
  } else {
    options.compression = compression_type_;
    options.compression_opts.max_dict_bytes = kDictLen;
    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
  }
  options.disable_auto_compactions = true;
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions bbto;
  bbto.block_size = kBlockLen;
  bbto.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(options);

  Random rnd(301);
  for (size_t j = 0; j <= kKeysPerFile; ++j) {
    ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
  }
  ASSERT_OK(Flush());
  MoveFilesToLevel(2);

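  // Create two overlapping L0 files; the L0->L1 compaction below merges them
  // above the existing L2 file.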
  for (int i = 0; i < 2; ++i) {
    for (size_t j = 0; j <= kKeysPerFile; ++j) {
      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
    }
    ASSERT_OK(Flush());
  }
  ASSERT_EQ("2,0,1", FilesPerLevel(0));

  uint64_t prev_compression_dict_bytes_inserted =
      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
  // This L0->L1 compaction merges the two L0 files into L1. The produced L1
  // file is not bottommost due to the existing L2 file covering the same key-
  // range.
  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
  ASSERT_EQ("0,1,1", FilesPerLevel(0));
  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
  // compression dictionary exists since dictionaries would be preloaded when
  // the compaction finishes.
  if (bottommost_) {
    ASSERT_EQ(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        prev_compression_dict_bytes_inserted);
  } else {
    ASSERT_GT(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        prev_compression_dict_bytes_inserted);
    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
    // number of bytes needs to be adjusted in case the cached block is in
    // ZSTD's digested dictionary format.
    if (compression_type_ != kZSTD &&
        compression_type_ != kZSTDNotFinalCompression) {
      // Although we limited buffering to `kBlockLen`, there may be up to two
      // blocks of data included in the dictionary since we only check limit
      // after each block is built.
      ASSERT_LE(TestGetTickerCount(options,
                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
                prev_compression_dict_bytes_inserted + 2 * kBlockLen);
    }
  }
}

TEST_P(PresetCompressionDictTest, CompactBottommost) {
  // Verifies that dictionary is generated and written during compaction to the
  // bottommost level only when either `ColumnFamilyOptions::compression` or
  // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
  // verifies the size of the dictionary is within expectations according to the
  // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
  const size_t kValueLen = 256;
  const size_t kKeysPerFile = 1 << 10;
  const size_t kDictLen = 16 << 10;
  const size_t kBlockLen = 4 << 10;

  Options options = CurrentOptions();
  if (bottommost_) {
    options.bottommost_compression = compression_type_;
    options.bottommost_compression_opts.enabled = true;
    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
  } else {
    options.compression = compression_type_;
    options.compression_opts.max_dict_bytes = kDictLen;
    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
  }
  options.disable_auto_compactions = true;
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions bbto;
  bbto.block_size = kBlockLen;
  bbto.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(options);

  Random rnd(301);
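  // Write two overlapping L0 files; the full-range compaction below merges
  // them into a single bottommost file.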
  for (int i = 0; i < 2; ++i) {
    for (size_t j = 0; j <= kKeysPerFile; ++j) {
      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
    }
    ASSERT_OK(Flush());
  }
  ASSERT_EQ("2", FilesPerLevel(0));

  uint64_t prev_compression_dict_bytes_inserted =
      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
  CompactRangeOptions cro;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
  ASSERT_EQ("0,1", FilesPerLevel(0));
  ASSERT_GT(
      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
      prev_compression_dict_bytes_inserted);
  // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
  // number of bytes needs to be adjusted in case the cached block is in ZSTD's
  // digested dictionary format.
  if (compression_type_ != kZSTD &&
      compression_type_ != kZSTDNotFinalCompression) {
    // Although we limited buffering to `kBlockLen`, there may be up to two
    // blocks of data included in the dictionary since we only check limit after
    // each block is built.
    ASSERT_LE(
        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
        prev_compression_dict_bytes_inserted + 2 * kBlockLen);
  }
}

class CompactionCompressionListener : public EventListener {
 public:
  explicit CompactionCompressionListener(Options* db_options)
      : db_options_(db_options) {}

  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
    // Figure out last level with files
    int bottommost_level = 0;
    for (int level = 0; level < db->NumberLevels(); level++) {
      std::string files_at_level;
      ASSERT_TRUE(
          db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
                          &files_at_level));
      if (files_at_level != "0") {
        bottommost_level = level;
      }
    }

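    // The expected compression type depends on whether the output level is the
    // bottommost level, whether per-level compression is configured, or the
    // column family's default compression otherwise.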
    if (db_options_->bottommost_compression != kDisableCompressionOption &&
        ci.output_level == bottommost_level) {
      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
    } else if (db_options_->compression_per_level.size() != 0) {
      ASSERT_EQ(ci.compression,
                db_options_->compression_per_level[ci.output_level]);
    } else {
      ASSERT_EQ(ci.compression, db_options_->compression);
    }
    max_level_checked = std::max(max_level_checked, ci.output_level);
  }

  int max_level_checked = 0;
  const Options* db_options_;
};

enum CompressionFailureType {
  kTestCompressionFail,
  kTestDecompressionFail,
  kTestDecompressionCorruption
};

class CompressionFailuresTest
    : public DBTest2,
      public testing::WithParamInterface<std::tuple<
          CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
 public:
  CompressionFailuresTest() {
    std::tie(compression_failure_type_, compression_type_,
             compression_max_dict_bytes_, compression_parallel_threads_) =
        GetParam();
  }

  CompressionFailureType compression_failure_type_ = kTestCompressionFail;
  CompressionType compression_type_ = kNoCompression;
  uint32_t compression_max_dict_bytes_ = 0;
  uint32_t compression_parallel_threads_ = 0;
};

INSTANTIATE_TEST_CASE_P(
    DBTest2, CompressionFailuresTest,
    ::testing::Combine(::testing::Values(kTestCompressionFail,
                                         kTestDecompressionFail,
                                         kTestDecompressionCorruption),
                       ::testing::ValuesIn(GetSupportedCompressions()),
                       ::testing::Values(0, 10), ::testing::Values(1, 4)));

TEST_P(CompressionFailuresTest, CompressionFailures) {
  if (compression_type_ == kNoCompression) {
    return;
  }

  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 2;
  options.max_bytes_for_level_base = 1024;
  options.max_bytes_for_level_multiplier = 2;
  options.num_levels = 7;
  options.max_background_compactions = 1;
  options.target_file_size_base = 512;

  BlockBasedTableOptions table_options;
  table_options.block_size = 512;
  table_options.verify_compression = true;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  options.compression = compression_type_;
  options.compression_opts.parallel_threads = compression_parallel_threads_;
  options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
  options.bottommost_compression_opts.parallel_threads =
      compression_parallel_threads_;
  options.bottommost_compression_opts.max_dict_bytes =
      compression_max_dict_bytes_;

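  // Install a sync point callback simulating the selected failure mode:
  // compression reporting failure, decompression returning an error status, or
  // decompression silently producing corrupted output.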
  if (compression_failure_type_ == kTestCompressionFail) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "CompressData:TamperWithReturnValue", [](void* arg) {
          bool* ret = static_cast<bool*>(arg);
          *ret = false;
        });
  } else if (compression_failure_type_ == kTestDecompressionFail) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "UncompressBlockData:TamperWithReturnValue", [](void* arg) {
          Status* ret = static_cast<Status*>(arg);
          ASSERT_OK(*ret);
          *ret = Status::Corruption("kTestDecompressionFail");
        });
  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "UncompressBlockData:"
        "TamperWithDecompressionOutput",
        [](void* arg) {
          BlockContents* contents = static_cast<BlockContents*>(arg);
          // Ensure uncompressed data != original data
          const size_t len = contents->data.size() + 1;
          std::unique_ptr<char[]> fake_data(new char[len]());
          *contents = BlockContents(std::move(fake_data), len);
        });
  }

  std::map<std::string, std::string> key_value_written;

  const int kKeySize = 5;
  const int kValUnitSize = 16;
  const int kValSize = 256;
  Random rnd(405);

  Status s = Status::OK();

  DestroyAndReopen(options);
  // Write 10 random files
  for (int i = 0; i < 10; i++) {
    for (int j = 0; j < 5; j++) {
      std::string key = rnd.RandomString(kKeySize);
      // Ensure good compression ratio
      std::string valueUnit = rnd.RandomString(kValUnitSize);
      std::string value;
      for (int k = 0; k < kValSize; k += kValUnitSize) {
        value += valueUnit;
      }
      s = Put(key, value);
      if (compression_failure_type_ == kTestCompressionFail) {
        key_value_written[key] = value;
        ASSERT_OK(s);
      }
    }
    s = Flush();
    if (compression_failure_type_ == kTestCompressionFail) {
      ASSERT_OK(s);
    }
    s = dbfull()->TEST_WaitForCompact();
    if (compression_failure_type_ == kTestCompressionFail) {
      ASSERT_OK(s);
    }
    if (i == 4) {
      // Make compression fail in the middle of table building
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    }
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  if (compression_failure_type_ == kTestCompressionFail) {
    // Should be kNoCompression, check content consistency
    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
      std::string key = db_iter->key().ToString();
      std::string value = db_iter->value().ToString();
      ASSERT_NE(key_value_written.find(key), key_value_written.end());
      ASSERT_EQ(key_value_written[key], value);
      key_value_written.erase(key);
    }
    ASSERT_EQ(0, key_value_written.size());
  } else if (compression_failure_type_ == kTestDecompressionFail) {
    ASSERT_EQ(std::string(s.getState()),
              "Could not decompress: kTestDecompressionFail");
  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
    ASSERT_EQ(std::string(s.getState()),
              "Decompressed block did not match pre-compression block");
  }
}

TEST_F(DBTest2, CompressionOptions) {
  if (!Zlib_Supported() || !Snappy_Supported()) {
    return;
  }

  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 2;
  options.max_bytes_for_level_base = 100;
  options.max_bytes_for_level_multiplier = 2;
  options.num_levels = 7;
  options.max_background_compactions = 1;

  CompactionCompressionListener* listener =
      new CompactionCompressionListener(&options);
  options.listeners.emplace_back(listener);

  const int kKeySize = 5;
  const int kValSize = 20;
  Random rnd(301);

  std::vector<uint32_t> compression_parallel_threads = {1, 4};

  std::map<std::string, std::string> key_value_written;

  for (int iter = 0; iter <= 2; iter++) {
    listener->max_level_checked = 0;

    if (iter == 0) {
      // Use different compression algorithms for different levels but
      // always use Zlib for bottommost level
      options.compression_per_level = {kNoCompression,     kNoCompression,
                                       kNoCompression,     kSnappyCompression,
                                       kSnappyCompression, kSnappyCompression,
                                       kZlibCompression};
      options.compression = kNoCompression;
      options.bottommost_compression = kZlibCompression;
    } else if (iter == 1) {
      // Use Snappy everywhere except the bottommost level, which uses Zlib
      options.compression_per_level = {};
      options.compression = kSnappyCompression;
      options.bottommost_compression = kZlibCompression;
    } else if (iter == 2) {
      // Use Snappy everywhere
      options.compression_per_level = {};
      options.compression = kSnappyCompression;
      options.bottommost_compression = kDisableCompressionOption;
    }

    for (auto num_threads : compression_parallel_threads) {
      options.compression_opts.parallel_threads = num_threads;
      options.bottommost_compression_opts.parallel_threads = num_threads;

      DestroyAndReopen(options);
      // Write 10 random files
      for (int i = 0; i < 10; i++) {
        for (int j = 0; j < 5; j++) {
          std::string key = rnd.RandomString(kKeySize);
          std::string value = rnd.RandomString(kValSize);
          key_value_written[key] = value;
          ASSERT_OK(Put(key, value));
        }
        ASSERT_OK(Flush());
        ASSERT_OK(dbfull()->TEST_WaitForCompact());
      }

      // Make sure that we wrote enough to check all 7 levels
      ASSERT_EQ(listener->max_level_checked, 6);

      // Make sure database content is the same as key_value_written
      std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
      for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
        std::string key = db_iter->key().ToString();
        std::string value = db_iter->value().ToString();
        ASSERT_NE(key_value_written.find(key), key_value_written.end());
        ASSERT_EQ(key_value_written[key], value);
        key_value_written.erase(key);
      }
      ASSERT_OK(db_iter->status());
      ASSERT_EQ(0, key_value_written.size());
    }
  }
}

class CompactionStallTestListener : public EventListener {
 public:
  CompactionStallTestListener()
      : compacting_files_cnt_(0), compacted_files_cnt_(0) {}

  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
    ASSERT_EQ(ci.cf_name, "default");
    ASSERT_EQ(ci.base_input_level, 0);
    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
    compacting_files_cnt_ += ci.input_files.size();
  }

  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
    ASSERT_EQ(ci.cf_name, "default");
    ASSERT_EQ(ci.base_input_level, 0);
    ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
    compacted_files_cnt_ += ci.input_files.size();
  }

  std::atomic<size_t> compacting_files_cnt_;
  std::atomic<size_t> compacted_files_cnt_;
};

TEST_F(DBTest2, CompactionStall) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
       {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
       {"DBTest2::CompactionStall:2",
        "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
       {"DBTest2::CompactionStall:3",
        "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 4;
  options.max_background_compactions = 40;
  CompactionStallTestListener* listener = new CompactionStallTestListener();
  options.listeners.emplace_back(listener);
  DestroyAndReopen(options);
  // make sure all background compaction jobs can be scheduled
  auto stop_token =
      dbfull()->TEST_write_controler().GetCompactionPressureToken();

  Random rnd(301);

  // 4 Files in L0
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 10; j++) {
      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
    }
    ASSERT_OK(Flush());
  }

  // Wait for compaction to be triggered
  TEST_SYNC_POINT("DBTest2::CompactionStall:0");

  // Clear "DBImpl::BGWorkCompaction" SYNC_POINT since we want to hold it again
  // at DBTest2::CompactionStall::1
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();

  // Another 6 L0 files to trigger compaction again
  for (int i = 0; i < 6; i++) {
    for (int j = 0; j < 10; j++) {
      ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
    }
    ASSERT_OK(Flush());
  }

  // Wait for another compaction to be triggered
  TEST_SYNC_POINT("DBTest2::CompactionStall:1");

  // Hold NotifyOnCompactionBegin in the unlock mutex section
  TEST_SYNC_POINT("DBTest2::CompactionStall:2");

  // Hold NotifyOnCompactionCompleted in the unlock mutex section
  TEST_SYNC_POINT("DBTest2::CompactionStall:3");

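  // After all scheduled compactions finish, most of the 10 L0 files should
  // have been compacted, and the begin/completed listener counters must match.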
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_LT(NumTableFilesAtLevel(0),
            options.level0_file_num_compaction_trigger);
  ASSERT_GT(listener->compacted_files_cnt_.load(),
            10 - options.level0_file_num_compaction_trigger);
  ASSERT_EQ(listener->compacting_files_cnt_.load(),
            listener->compacted_files_cnt_.load());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}


TEST_F(DBTest2, FirstSnapshotTest) {
  Options options;
  options.write_buffer_size = 100000;  // Small write buffer
  options = CurrentOptions(options);
  CreateAndReopenWithCF({"pikachu"}, options);

  // This snapshot will have sequence number 0, which is the expected behaviour.
  const Snapshot* s1 = db_->GetSnapshot();

  ASSERT_OK(Put(1, "k1", std::string(100000, 'x')));  // Fill memtable
  ASSERT_OK(Put(1, "k2", std::string(100000, 'y')));  // Trigger flush

  db_->ReleaseSnapshot(s1);
}

TEST_F(DBTest2, DuplicateSnapshot) {
  Options options;
  options = CurrentOptions(options);
  std::vector<const Snapshot*> snapshots;
  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
  SequenceNumber oldest_ww_snap, first_ww_snap;

  ASSERT_OK(Put("k", "v"));  // inc seq
  snapshots.push_back(db_->GetSnapshot());
  snapshots.push_back(db_->GetSnapshot());
  ASSERT_OK(Put("k", "v"));  // inc seq
  snapshots.push_back(db_->GetSnapshot());
  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
  first_ww_snap = snapshots.back()->GetSequenceNumber();
  ASSERT_OK(Put("k", "v"));  // inc seq
  snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
  snapshots.push_back(db_->GetSnapshot());
  ASSERT_OK(Put("k", "v"));  // inc seq
  snapshots.push_back(db_->GetSnapshot());

  {
    InstrumentedMutexLock l(dbi->mutex());
    auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
    ASSERT_EQ(seqs.size(), 4);  // duplicates are not counted
    ASSERT_EQ(oldest_ww_snap, first_ww_snap);
  }

  for (auto s : snapshots) {
    db_->ReleaseSnapshot(s);
  }
}

class PinL0IndexAndFilterBlocksTest
    : public DBTestBase,
      public testing::WithParamInterface<std::tuple<bool, bool>> {
 public:
  PinL0IndexAndFilterBlocksTest()
      : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {}
  void SetUp() override {
    infinite_max_files_ = std::get<0>(GetParam());
    disallow_preload_ = std::get<1>(GetParam());
  }

  void CreateTwoLevels(Options* options, bool close_afterwards) {
    if (infinite_max_files_) {
      options->max_open_files = -1;
    }
    options->create_if_missing = true;
    options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
    BlockBasedTableOptions table_options;
    table_options.cache_index_and_filter_blocks = true;
    table_options.pin_l0_filter_and_index_blocks_in_cache = true;
    table_options.filter_policy.reset(NewBloomFilterPolicy(20));
    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
    CreateAndReopenWithCF({"pikachu"}, *options);

    ASSERT_OK(Put(1, "a", "begin"));
    ASSERT_OK(Put(1, "z", "end"));
    ASSERT_OK(Flush(1));
    // move this table to L1
    ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
    ASSERT_EQ(1, NumTableFilesAtLevel(1, 1));

    // reset block cache
    table_options.block_cache = NewLRUCache(64 * 1024);
    options->table_factory.reset(NewBlockBasedTableFactory(table_options));
    TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
    // create new table at L0
    ASSERT_OK(Put(1, "a2", "begin2"));
    ASSERT_OK(Put(1, "z2", "end2"));
    ASSERT_OK(Flush(1));

    if (close_afterwards) {
      Close();  // This ensures that there is no ref to block cache entries
    }
    table_options.block_cache->EraseUnRefEntries();
  }

  bool infinite_max_files_;
  bool disallow_preload_;
};

TEST_P(PinL0IndexAndFilterBlocksTest,
       IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
  Options options = CurrentOptions();
  if (infinite_max_files_) {
    options.max_open_files = -1;
  }
  options.create_if_missing = true;
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;
  table_options.pin_l0_filter_and_index_blocks_in_cache = true;
  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  CreateAndReopenWithCF({"pikachu"}, options);

  ASSERT_OK(Put(1, "key", "val"));
  // Create a new table.
  ASSERT_OK(Flush(1));

  // index/filter blocks added to block cache right after table creation.
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));

  // only index/filter were added
  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));

  std::string value;
  // Miss and hit count should remain the same, they're all pinned.
  ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value));
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));

  // Miss and hit count should remain the same, they're all pinned.
  value = Get(1, "key");
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
}

TEST_P(PinL0IndexAndFilterBlocksTest,
       MultiLevelIndexAndFilterBlocksCachedWithPinning) {
  Options options = CurrentOptions();
  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
  // get base cache values
  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);

  std::string value;
  // this should be read from L0
  // so cache values don't change
  value = Get(1, "a2");
  ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));

  // this should be read from L1
  // the file is opened, prefetching results in a cache filter miss
  // the block is loaded and added to the cache,
  // then the get results in a cache hit for L1
  // Even with infinite max_open_files, there is still a cache miss because we
  // have reset the block cache
  value = Get(1, "a");
  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
}

TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
  Options options = CurrentOptions();
  // This ensures that the db does not hold references to anything in the block
  // cache, so EraseUnRefEntries() can clear them up.
  bool close_afterwards = true;
  PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);

  // Get base cache values
  uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
  uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
  uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
  uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);

  if (disallow_preload_) {
    // Now we have two files. We narrow the max open files to allow 3 entries
    // so that preloading SST files won't happen.
    options.max_open_files = 13;
    // RocksDB sanitizes max_open_files to at least 20. Modify it back.
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
          int* max_open_files = static_cast<int*>(arg);
          *max_open_files = 13;
        });
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // Reopen database. If max_open_files is set as -1, table readers will be
  // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
  // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
  TryReopenWithColumnFamilies({"default", "pikachu"}, options);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  if (!disallow_preload_) {
    // After reopen, cache misses are increased by one because we read (and
    // only read) the filter and index on L0
    ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    // If max_open_files is not -1, we do not preload table readers, so there is
    // no change.
    ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }
  std::string value;
  // this should be read from L0
  value = Get(1, "a2");
  // If max_open_files is -1, we have pinned index and filter in Rep, so there
  // will not be changes in index and filter misses or hits. If max_open_files
  // is not -1, Get() will open a TableReader and prefetch index and filter.
  ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
  ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
  ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
  ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));

  // this should be read from L1
  value = Get(1, "a");
  if (!disallow_preload_) {
    // In the infinite max_open_files case, Get() incurs a cache miss because
    // the index and filter were not prefetched beforehand.
    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    // In this case, the cache miss count is increased by one in
    // BlockBasedTable::Open() because this is not the DB::Open() code path, so
    // we will prefetch L1's index and filter. The cache hit count is also
    // increased by one because Get() reads the index and filter from the block
    // cache, where the previous Open() call prefetched them.
    ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }

  // Force a full compaction to a single file. There will be a block cache
  // read for both the index and the filter. If the prefetch doesn't happen
  // explicitly, it will happen when verifying the file.
  Compact(1, "a", "zzzzz");
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  if (!disallow_preload_) {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }

  // Bloom and index hit will happen when a Get() happens.
  value = Get(1, "a");
  if (!disallow_preload_) {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  } else {
    ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
    ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
    ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
    ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
  }
}

INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest,
                        PinL0IndexAndFilterBlocksTest,
                        ::testing::Values(std::make_tuple(true, false),
                                          std::make_tuple(false, false),
                                          std::make_tuple(false, true)));

TEST_F(DBTest2, MaxCompactionBytesTest) {
  Options options = CurrentOptions();
  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
      DBTestBase::kNumKeysByGenerateNewRandomFile));
  options.compaction_style = kCompactionStyleLevel;
  options.write_buffer_size = 200 << 10;
  options.arena_block_size = 4 << 10;
  options.level0_file_num_compaction_trigger = 4;
  options.num_levels = 4;
  options.compression = kNoCompression;
  options.max_bytes_for_level_base = 450 << 10;
  options.target_file_size_base = 100 << 10;
  // Infinite for full compaction.
  options.max_compaction_bytes = options.target_file_size_base * 100;

  Reopen(options);

  Random rnd(301);

  for (int num = 0; num < 8; num++) {
    GenerateNewRandomFile(&rnd);
  }
  CompactRangeOptions cro;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
  ASSERT_EQ("0,0,8", FilesPerLevel(0));

  // When compacting from Ln -> Ln+1, cut an output file if it overlaps with
  // more than three files in Ln+1.
  options.max_compaction_bytes = options.target_file_size_base * 3;
  Reopen(options);

  GenerateNewRandomFile(&rnd);
  // Add three more small files that overlap with the previous file
  for (int i = 0; i < 3; i++) {
    ASSERT_OK(Put("a", "z"));
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // Output files to L1 are cut into 4 pieces, according to
  // options.max_compaction_bytes (300K).
  // There are 8 files on L2 (the grandparent level), each one 100K. The first
  // L1 output file overlaps with grandparents a and b, so max_compaction_bytes
  // stays below 300K; the second overlaps with d and e, which is also below
  // 300K. Including any extra grandparent file would make a future compaction
  // larger than 300K.
  // L1: [  1  ] [  2 ]  [  3  ] [ 4 ]
  // L2: [a] [b] [c] [d] [e] [f] [g] [h]
  ASSERT_EQ("0,4,8", FilesPerLevel(0));
}

static void UniqueIdCallback(void* arg) {
  int* result = reinterpret_cast<int*>(arg);
  if (*result == -1) {
    *result = 0;
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
}

class MockPersistentCache : public PersistentCache {
 public:
  explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
      : is_compressed_(is_compressed), max_size_(max_size) {
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
  }

  ~MockPersistentCache() override {}

  PersistentCache::StatsType Stats() override {
    return PersistentCache::StatsType();
  }

  uint64_t NewId() override {
    return last_id_.fetch_add(1, std::memory_order_relaxed);
  }

  Status Insert(const Slice& page_key, const char* data,
                const size_t size) override {
    MutexLock _(&lock_);

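    // Minimal eviction policy: drop one arbitrary entry once the cache exceeds
    // its size budget.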
    if (size_ > max_size_) {
      size_ -= data_.begin()->second.size();
      data_.erase(data_.begin());
    }

    data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
    size_ += size;
    return Status::OK();
  }

  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
                size_t* size) override {
    MutexLock _(&lock_);
    auto it = data_.find(page_key.ToString());
    if (it == data_.end()) {
      return Status::NotFound();
    }

    assert(page_key.ToString() == it->first);
    data->reset(new char[it->second.size()]);
    memcpy(data->get(), it->second.c_str(), it->second.size());
    *size = it->second.size();
    return Status::OK();
  }

  bool IsCompressed() override { return is_compressed_; }

  std::string GetPrintableOptions() const override {
    return "MockPersistentCache";
  }

  port::Mutex lock_;
  std::map<std::string, std::string> data_;
  const bool is_compressed_ = true;
  size_t size_ = 0;
  const size_t max_size_ = 10 * 1024;  // 10KiB
  std::atomic<uint64_t> last_id_{1};
};

#ifdef OS_LINUX
// Make sure that the CPU time perf context counters use Env::NowCPUNanos()
// rather than the wall-clock Env::NowNanos().
TEST_F(DBTest2, TestPerfContextGetCpuTime) {
  // force resizing table cache so table handle is not preloaded so that
  // we can measure find_table_nanos during Get().
  dbfull()->TEST_table_cache()->SetCapacity(0);
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Flush());
  env_->now_cpu_count_.store(0);
  env_->SetMockSleep();

  // NOTE: Presumed unnecessary and removed: resetting mock time in env

  // CPU timing is not enabled with kEnableTimeExceptForMutex
  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
  ASSERT_EQ("bar", Get("foo"));
  ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
  ASSERT_EQ(0, env_->now_cpu_count_.load());

  constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
  constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;

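  // The mock sleep advances only the wall clock, so the CPU-time counter must
  // stay below the addon while the wall-clock find_table_nanos exceeds it.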
  // Add time to NowNanos() reading.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "TableCache::FindTable:0",
      [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
  ASSERT_EQ("bar", Get("foo"));
  ASSERT_GT(env_->now_cpu_count_.load(), 2);
  ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos);
  ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);

  SetPerfLevel(PerfLevel::kDisable);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, TestPerfContextIterCpuTime) {
  DestroyAndReopen(CurrentOptions());
  // force resizing table cache so table handle is not preloaded so that
  // we can measure find_table_nanos during iteration
  dbfull()->TEST_table_cache()->SetCapacity(0);

  const size_t kNumEntries = 10;
  for (size_t i = 0; i < kNumEntries; ++i) {
    ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i)));
  }
  ASSERT_OK(Flush());
  for (size_t i = 0; i < kNumEntries; ++i) {
    ASSERT_EQ("v" + std::to_string(i), Get("k" + std::to_string(i)));
  }
  std::string last_key = "k" + std::to_string(kNumEntries - 1);
  std::string last_value = "v" + std::to_string(kNumEntries - 1);
  env_->now_cpu_count_.store(0);
  env_->SetMockSleep();

  // NOTE: Presumed unnecessary and removed: resetting mock time in env

  // CPU timing is not enabled with kEnableTimeExceptForMutex
  SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
  Iterator* iter = db_->NewIterator(ReadOptions());
  iter->Seek("k0");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v0", iter->value().ToString());
  iter->SeekForPrev(last_key);
  ASSERT_TRUE(iter->Valid());
  iter->SeekToLast();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(last_value, iter->value().ToString());
  iter->SeekToFirst();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v0", iter->value().ToString());
  ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos);
  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v1", iter->value().ToString());
  ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos);
  iter->Prev();
  ASSERT_TRUE(iter->Valid());
  ASSERT_OK(iter->status());
  ASSERT_EQ("v0", iter->value().ToString());
  ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos);
  ASSERT_EQ(0, env_->now_cpu_count_.load());
  delete iter;

  constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
  constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;

  // Add time to NowNanos() reading.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "TableCache::FindTable:0",
      [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
  iter = db_->NewIterator(ReadOptions());
  iter->Seek("k0");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v0", iter->value().ToString());
  iter->SeekForPrev(last_key);
  ASSERT_TRUE(iter->Valid());
  iter->SeekToLast();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(last_value, iter->value().ToString());
  iter->SeekToFirst();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v0", iter->value().ToString());
  ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0);
  ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos);
  iter->Next();
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("v1", iter->value().ToString());
  ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0);
  ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos);
  iter->Prev();
  ASSERT_TRUE(iter->Valid());
  ASSERT_OK(iter->status());
  ASSERT_EQ("v0", iter->value().ToString());
  ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0);
  ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos);
  ASSERT_GE(env_->now_cpu_count_.load(), 12);
  ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);

  SetPerfLevel(PerfLevel::kDisable);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  delete iter;
}
#endif  // OS_LINUX

#if !defined OS_SOLARIS
TEST_F(DBTest2, PersistentCache) {
  int num_iter = 80;

  Options options;
  options.write_buffer_size = 64 * 1024;  // small write buffer
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  options = CurrentOptions(options);

  auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024};
  auto types = {/*compressed*/ 1, /*uncompressed*/ 0};
  for (auto bsize : bsizes) {
    for (auto type : types) {
      BlockBasedTableOptions table_options;
      table_options.persistent_cache.reset(
          new MockPersistentCache(type, 10 * 1024));
      table_options.no_block_cache = true;
      table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr;
      options.table_factory.reset(NewBlockBasedTableFactory(table_options));

      DestroyAndReopen(options);
      CreateAndReopenWithCF({"pikachu"}, options);
      // default column family doesn't have block cache
      Options no_block_cache_opts;
      no_block_cache_opts.statistics = options.statistics;
      no_block_cache_opts = CurrentOptions(no_block_cache_opts);
      BlockBasedTableOptions table_options_no_bc;
      table_options_no_bc.no_block_cache = true;
      no_block_cache_opts.table_factory.reset(
          NewBlockBasedTableFactory(table_options_no_bc));
      ReopenWithColumnFamilies(
          {"default", "pikachu"},
          std::vector<Options>({no_block_cache_opts, options}));

      Random rnd(301);

      // Write ~80KB (80 values, each ~1KB)
      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
      std::vector<std::string> values;
      std::string str;
      for (int i = 0; i < num_iter; i++) {
        if (i % 4 == 0) {  // high compression ratio
          str = rnd.RandomString(1000);
        }
        values.push_back(str);
        ASSERT_OK(Put(1, Key(i), values[i]));
      }

      // flush all data from memtable so that reads are from block cache
      ASSERT_OK(Flush(1));

      for (int i = 0; i < num_iter; i++) {
        ASSERT_EQ(Get(1, Key(i)), values[i]);
      }

      auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
      auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);

      ASSERT_GT(hit, 0);
      ASSERT_GT(miss, 0);
    }
  }
}
#endif  // !defined OS_SOLARIS

namespace {
void CountSyncPoint() {
  TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
}
}  // anonymous namespace

TEST_F(DBTest2, SyncPointMarker) {
  std::atomic<int> sync_point_called(0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBTest2::MarkedPoint",
      [&](void* /*arg*/) { sync_point_called.fetch_add(1); });

  // The first dependency ensures that Marker is loaded before MarkedPoint.
  // The second checks that thread 1's MarkedPoint should be disabled here.
  // Execution order:
  // |   Thread 1    |  Thread 2   |
  // |               |   Marker    |
  // |  MarkedPoint  |             |
  // | Thread1First  |             |
  // |               | MarkedPoint |
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
      {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
      {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  std::function<void()> func1 = [&]() {
    CountSyncPoint();
    TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
  };

  std::function<void()> func2 = [&]() {
    TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
    CountSyncPoint();
  };

  auto thread1 = port::Thread(func1);
  auto thread2 = port::Thread(func2);
  thread1.join();
  thread2.join();

  // Callback is only executed once
  ASSERT_EQ(sync_point_called.load(), 1);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

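// Size of a data block entry when delta encoding is disabled: varint32 shared
// key length (0), non-shared key length, and value length, followed by the key
// and value bytes.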
size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
  std::string buffer;

  PutVarint32(&buffer, static_cast<uint32_t>(0));
  PutVarint32(&buffer, static_cast<uint32_t>(key_size));
  PutVarint32(&buffer, static_cast<uint32_t>(value_size));

  return buffer.size() + key_size + value_size;
}

TEST_F(DBTest2, ReadAmpBitmap) {
  Options options = CurrentOptions();
  BlockBasedTableOptions bbto;
  uint32_t bytes_per_bit[2] = {1, 16};
  for (size_t k = 0; k < 2; k++) {
    // Disable delta encoding to make it easier to calculate read amplification
    bbto.use_delta_encoding = false;
    // Huge block cache to make it easier to calculate read amplification
    bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
    bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
    DestroyAndReopen(options);

    const size_t kNumEntries = 10000;

    Random rnd(301);
    for (size_t i = 0; i < kNumEntries; i++) {
      ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(100)));
    }
    ASSERT_OK(Flush());

    Close();
    Reopen(options);

    // Read keys/values randomly and verify that reported read amp error
    // is less than 2%
    uint64_t total_useful_bytes = 0;
    std::set<int> read_keys;
    std::string value;
    for (size_t i = 0; i < kNumEntries * 5; i++) {
      int key_idx = rnd.Next() % kNumEntries;
      std::string key = Key(key_idx);
      ASSERT_OK(db_->Get(ReadOptions(), key, &value));

      if (read_keys.find(key_idx) == read_keys.end()) {
        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
        total_useful_bytes +=
            GetEncodedEntrySize(internal_key.size(), value.size());
        read_keys.insert(key_idx);
      }

      double expected_read_amp =
          static_cast<double>(total_useful_bytes) /
          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);

      double read_amp =
          static_cast<double>(options.statistics->getTickerCount(
              READ_AMP_ESTIMATE_USEFUL_BYTES)) /
          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);

      double error_pct = fabs(expected_read_amp - read_amp) * 100;
      // Error between reported read amp and real read amp should be less than
      // 2%
      EXPECT_LE(error_pct, 2);
    }

    // Make sure we read everything in the DB (which is smaller than our cache)
    Iterator* iter = db_->NewIterator(ReadOptions());
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
    }
    ASSERT_OK(iter->status());
    delete iter;

    // Read amp is on average 100% since we read everything we loaded into
    // memory
    if (k == 0) {
      ASSERT_EQ(
          options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
          options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
    } else {
      ASSERT_NEAR(
          options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
              1.0f /
              options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
          1, .01);
    }
  }
}

#ifndef OS_SOLARIS  // GetUniqueIdFromFile is not implemented
TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
  {
    const int kIdBufLen = 100;
    char id_buf[kIdBufLen];
    Status s = Status::NotSupported();
#ifndef OS_WIN
    // You can't open a directory on Windows using a random access file
    std::unique_ptr<RandomAccessFile> file;
    s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions());
    if (s.ok()) {
      if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
        // The fs holding the db directory doesn't support getting a unique
        // file id; this means that running this test would fail because
        // lru_cache would load the blocks again even though they are already
        // in the cache
        return;
      }
    }
#endif
    if (!s.ok()) {
      std::unique_ptr<Directory> dir;
      ASSERT_OK(env_->NewDirectory(dbname_, &dir));
      if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
        // The fs holding the db directory doesn't support getting a unique
        // file id; this means that running this test would fail because
        // lru_cache would load the blocks again even though they are already
        // in the cache
        return;
      }
    }
  }
  uint32_t bytes_per_bit[2] = {1, 16};
  for (size_t k = 0; k < 2; k++) {
    std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();

    Options options = CurrentOptions();
    BlockBasedTableOptions bbto;
    // Disable delta encoding to make it easier to calculate read amplification
    bbto.use_delta_encoding = false;
    // Huge block cache to make it easier to calculate read amplification
    bbto.block_cache = lru_cache;
    bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
    options.statistics = stats;
    DestroyAndReopen(options);

    const int kNumEntries = 10000;

    Random rnd(301);
    for (int i = 0; i < kNumEntries; i++) {
      ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
    }
    ASSERT_OK(Flush());

    Close();
    Reopen(options);

    std::set<int> read_keys;
    std::string value;
    // Iter1: Read half the DB, Read even keys
    // Key(0), Key(2), Key(4), Key(6), Key(8), ...
    for (int i = 0; i < kNumEntries; i += 2) {
      std::string key = Key(i);
      ASSERT_OK(db_->Get(ReadOptions(), key, &value));

      if (read_keys.find(i) == read_keys.end()) {
        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
        read_keys.insert(i);
      }
    }

    size_t total_useful_bytes_iter1 =
        options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
    size_t total_loaded_bytes_iter1 =
        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);

    Close();
    std::shared_ptr<Statistics> new_statistics =
        ROCKSDB_NAMESPACE::CreateDBStatistics();
    // Destroy old statistics obj that the blocks in lru_cache are pointing to
    options.statistics.reset();
    // Use the statistics object that we just created
    options.statistics = new_statistics;
    Reopen(options);

    // Iter2: Read half the DB, Read odd keys
    // Key(1), Key(3), Key(5), Key(7), Key(9), ...
    for (int i = 1; i < kNumEntries; i += 2) {
      std::string key = Key(i);
      ASSERT_OK(db_->Get(ReadOptions(), key, &value));

      if (read_keys.find(i) == read_keys.end()) {
        auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
        read_keys.insert(i);
      }
    }

    size_t total_useful_bytes_iter2 =
        options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
    size_t total_loaded_bytes_iter2 =
        options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);

    // Read amp is on average 100% since we read everything we loaded into
    // memory
    if (k == 0) {
      ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
                total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
    } else {
      ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f /
                      (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
                  1, .01);
    }
  }
}
#endif  // !OS_SOLARIS

TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
  Options options = CurrentOptions();
  options.num_levels = 3;
  options.IncreaseParallelism(20);
  DestroyAndReopen(options);

  ASSERT_OK(Put(Key(0), "a"));
  ASSERT_OK(Put(Key(5), "a"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put(Key(10), "a"));
  ASSERT_OK(Put(Key(15), "a"));
  ASSERT_OK(Flush());

  CompactRangeOptions cro;
  cro.change_level = true;
  cro.target_level = 2;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

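  // Helper to read a single per-level statistic out of the cfstats map
  // property.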
  auto get_stat = [](std::string level_str, LevelStatType type,
                     std::map<std::string, std::string> props) {
    auto prop_str =
        "compaction." + level_str + "." +
        InternalStats::compaction_level_stats.at(type).property_name.c_str();
    auto prop_item = props.find(prop_str);
    return prop_item == props.end() ? 0 : std::stod(prop_item->second);
  };

  // Trivial move 2 files to L2
  ASSERT_EQ("0,0,2", FilesPerLevel());
  // Also test that the stats GetMapProperty API reports the same result
  {
    std::map<std::string, std::string> prop;
    ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
    ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
    ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
    ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
    ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
  }

  // While the compaction is running, we will create 2 new files that can fit
  // in L2. These 2 files will be moved to L2, where they overlap with the
  // running compaction and break the LSM consistency.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run():Start", [&](void* /*arg*/) {
        ASSERT_OK(
            dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
                                  {"max_bytes_for_level_base", "1"}}));
        ASSERT_OK(Put(Key(6), "a"));
        ASSERT_OK(Put(Key(7), "a"));
        ASSERT_OK(Flush());

        ASSERT_OK(Put(Key(8), "a"));
        ASSERT_OK(Put(Key(9), "a"));
        ASSERT_OK(Flush());
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // Run a manual compaction that will compact the 2 files in L2
  // into 1 file in L2
  cro.exclusive_manual_compaction = false;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  // Test that the stats GetMapProperty API reports 1 file in L2
  {
    std::map<std::string, std::string> prop;
    ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
    ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
  }
}

TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
  Options options = CurrentOptions();
  options.num_levels = 2;
  options.IncreaseParallelism(20);
  options.disable_auto_compactions = true;
  DestroyAndReopen(options);

  ASSERT_OK(Put(Key(0), "a"));
  ASSERT_OK(Put(Key(5), "a"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put(Key(10), "a"));
  ASSERT_OK(Put(Key(15), "a"));
  ASSERT_OK(Flush());

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));

  // Trivial move 2 files to L1
  ASSERT_EQ("0,2", FilesPerLevel());

  std::function<void()> bg_manual_compact = [&]() {
    std::string k1 = Key(6);
    std::string k2 = Key(9);
    Slice k1s(k1);
    Slice k2s(k2);
    CompactRangeOptions cro;
    cro.exclusive_manual_compaction = false;
    ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
  };
  ROCKSDB_NAMESPACE::port::Thread bg_thread;

  // While the compaction is running, we will create 2 new files that can fit
  // in L1. These 2 files will be moved to L1, where they overlap with the
  // running compaction and break the LSM consistency.
  std::atomic<bool> flag(false);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run():Start", [&](void* /*arg*/) {
        if (flag.exchange(true)) {
          // We want to make sure to call this callback only once
          return;
        }
        ASSERT_OK(Put(Key(6), "a"));
        ASSERT_OK(Put(Key(7), "a"));
        ASSERT_OK(Flush());

        ASSERT_OK(Put(Key(8), "a"));
        ASSERT_OK(Put(Key(9), "a"));
        ASSERT_OK(Flush());

        // Start a non-exclusive manual compaction in a bg thread
        bg_thread = port::Thread(bg_manual_compact);
        // This manual compaction conflicts with the other manual compaction,
        // so it should wait until the first compaction finishes
        env_->SleepForMicroseconds(1000000);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // Run a manual compaction that will compact the 2 files in L1
  // into 1 file in L1
  CompactRangeOptions cro;
  cro.exclusive_manual_compaction = false;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
  bg_thread.join();

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, PausingManualCompaction1) {
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.num_levels = 7;

  DestroyAndReopen(options);
  Random rnd(301);
  // Generate a file containing 10 keys.
  for (int i = 0; i < 10; i++) {
    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
  }
  ASSERT_OK(Flush());

  // Generate another file containing the same keys
  for (int i = 0; i < 10; i++) {
    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
  }
  ASSERT_OK(Flush());

  int manual_compactions_paused = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
        auto canceled = static_cast<std::atomic<bool>*>(arg);
        // CompactRange triggers manual compaction and cancels the compaction
        // by setting *canceled to true
        if (canceled != nullptr) {
          canceled->store(true, std::memory_order_release);
        }
        manual_compactions_paused += 1;
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
        auto paused = static_cast<std::atomic<int>*>(arg);
        // CompactFiles() relies on manual_compactions_paused to
        // determine if the compaction should be paused or not
        ASSERT_EQ(0, paused->load(std::memory_order_acquire));
        paused->fetch_add(1, std::memory_order_release);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  std::vector<std::string> files_before_compact, files_after_compact;
  // Remember file names before compaction is triggered
  std::vector<LiveFileMetaData> files_meta;
  dbfull()->GetLiveFilesMetaData(&files_meta);
  for (auto file : files_meta) {
    files_before_compact.push_back(file.name);
  }

  // OK, now trigger a manual compaction
  ASSERT_TRUE(dbfull()
                  ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
                  .IsManualCompactionPaused());

  // Wait for compactions to get scheduled and stopped
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // Get file names after compaction is stopped
  files_meta.clear();
  dbfull()->GetLiveFilesMetaData(&files_meta);
  for (auto file : files_meta) {
    files_after_compact.push_back(file.name);
  }

  // Like nothing happened
  ASSERT_EQ(files_before_compact, files_after_compact);
  ASSERT_EQ(manual_compactions_paused, 1);

  manual_compactions_paused = 0;
  // Now make sure CompactFiles also does not run
  ASSERT_TRUE(dbfull()
                  ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
                                 files_before_compact, 0)
                  .IsManualCompactionPaused());
  // Wait for manual compaction to get scheduled and finish
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  files_meta.clear();
  files_after_compact.clear();
  dbfull()->GetLiveFilesMetaData(&files_meta);
  for (auto file : files_meta) {
    files_after_compact.push_back(file.name);
  }

  ASSERT_EQ(files_before_compact, files_after_compact);
  // CompactFiles returns at entry point
  ASSERT_EQ(manual_compactions_paused, 0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// PausingManualCompaction does not affect auto compaction
TEST_F(DBTest2, PausingManualCompaction2) {
  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 2;
  options.disable_auto_compactions = false;

  DestroyAndReopen(options);
  dbfull()->DisableManualCompaction();

  Random rnd(301);
  for (int i = 0; i < 2; i++) {
    // Generate a file containing 100 keys.
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(Key(j), rnd.RandomString(50)));
    }
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  std::vector<LiveFileMetaData> files_meta;
  dbfull()->GetLiveFilesMetaData(&files_meta);
  ASSERT_EQ(files_meta.size(), 1);
}

TEST_F(DBTest2, PausingManualCompaction3) {
  CompactRangeOptions compact_options;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.num_levels = 7;

  Random rnd(301);
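  // Lay out an LSM where L0 holds 2 files and each deeper level holds one more
  // file (8 files at L6), matching the "2,3,4,5,6,7,8" FilesPerLevel() shape
  // asserted below.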
  auto generate_files = [&]() {
    for (int i = 0; i < options.num_levels; i++) {
      for (int j = 0; j < options.num_levels - i + 1; j++) {
        for (int k = 0; k < 1000; k++) {
          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
        }
        ASSERT_OK(Flush());
      }

      for (int l = 1; l < options.num_levels - i; l++) {
        MoveFilesToLevel(l);
      }
    }
  };

  DestroyAndReopen(options);
  generate_files();
  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
  int run_manual_compactions = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run():PausingManualCompaction:1",
      [&](void* /*arg*/) { run_manual_compactions++; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  dbfull()->DisableManualCompaction();
  ASSERT_TRUE(dbfull()
                  ->CompactRange(compact_options, nullptr, nullptr)
                  .IsManualCompactionPaused());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  // Since manual compaction is disabled, we do not even reach the sync point
  ASSERT_EQ(run_manual_compactions, 0);
  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionJob::Run():PausingManualCompaction:1");
  dbfull()->EnableManualCompaction();
  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, PausingManualCompaction4) {
  CompactRangeOptions compact_options;
  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.num_levels = 7;

  Random rnd(301);
  auto generate_files = [&]() {
    for (int i = 0; i < options.num_levels; i++) {
      for (int j = 0; j < options.num_levels - i + 1; j++) {
        for (int k = 0; k < 1000; k++) {
          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
        }
        ASSERT_OK(Flush());
      }

      for (int l = 1; l < options.num_levels - i; l++) {
        MoveFilesToLevel(l);
      }
    }
  };

  DestroyAndReopen(options);
  generate_files();
  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
  int run_manual_compactions = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
        auto canceled = static_cast<std::atomic<bool>*>(arg);
        // CompactRange triggers manual compaction and cancels the compaction
        // by setting *canceled to true
        if (canceled != nullptr) {
          canceled->store(true, std::memory_order_release);
        }
        run_manual_compactions++;
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
        auto paused = static_cast<std::atomic<int>*>(arg);
        // CompactFiles() relies on manual_compactions_paused to
        // determine if the compaction should be paused or not
        ASSERT_EQ(0, paused->load(std::memory_order_acquire));
        paused->fetch_add(1, std::memory_order_release);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_TRUE(dbfull()
                  ->CompactRange(compact_options, nullptr, nullptr)
                  .IsManualCompactionPaused());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ(run_manual_compactions, 1);
  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionJob::Run():PausingManualCompaction:2");
  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, CancelManualCompaction1) {
  CompactRangeOptions compact_options;
  auto canceledPtr =
      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
  compact_options.canceled = canceledPtr.get();

  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.num_levels = 7;

  Random rnd(301);
  auto generate_files = [&]() {
    for (int i = 0; i < options.num_levels; i++) {
      for (int j = 0; j < options.num_levels - i + 1; j++) {
        for (int k = 0; k < 1000; k++) {
          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
        }
        ASSERT_OK(Flush());
      }

      for (int l = 1; l < options.num_levels - i; l++) {
        MoveFilesToLevel(l);
      }
    }
  };

  DestroyAndReopen(options);
  generate_files();
  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());

  int run_manual_compactions = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run():PausingManualCompaction:1",
      [&](void* /*arg*/) { run_manual_compactions++; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // Setup a callback to disable compactions after a couple of levels are
  // compacted
  int compactions_run = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::RunManualCompaction()::1",
      [&](void* /*arg*/) { ++compactions_run; });

  ASSERT_TRUE(dbfull()
                  ->CompactRange(compact_options, nullptr, nullptr)
                  .IsManualCompactionPaused());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // Since the compaction is canceled from the start (*canceled was initialized
  // to true), compaction should never begin, i.e. the compaction function
  // should not be called at all.
  ASSERT_EQ(compactions_run, 0);
  ASSERT_EQ(run_manual_compactions, 0);
  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());

  compactions_run = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "DBImpl::RunManualCompaction()::1");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
        ++compactions_run;
        // After 3 compactions, cancel by setting *canceled
        if (compactions_run == 3) {
          compact_options.canceled->store(true, std::memory_order_release);
        }
      });

  compact_options.canceled->store(false, std::memory_order_release);
  ASSERT_TRUE(dbfull()
                  ->CompactRange(compact_options, nullptr, nullptr)
                  .IsManualCompactionPaused());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ASSERT_EQ(compactions_run, 3);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "DBImpl::RunManualCompaction()::1");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionJob::Run():PausingManualCompaction:1");

  // Compactions should work again if we re-enable them.
  compact_options.canceled->store(false, std::memory_order_relaxed);
  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, CancelManualCompaction2) {
  CompactRangeOptions compact_options;
  auto canceledPtr =
      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
  compact_options.canceled = canceledPtr.get();
  compact_options.max_subcompactions = 1;

  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.num_levels = 7;

  Random rnd(301);
  auto generate_files = [&]() {
    for (int i = 0; i < options.num_levels; i++) {
      for (int j = 0; j < options.num_levels - i + 1; j++) {
        for (int k = 0; k < 1000; k++) {
          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
        }
        ASSERT_OK(Flush());
      }

      for (int l = 1; l < options.num_levels - i; l++) {
        MoveFilesToLevel(l);
      }
    }
  };

  DestroyAndReopen(options);
  generate_files();
  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  int compactions_run = 0;
  std::atomic<int> kv_compactions{0};
  int compactions_stopped_at = 0;
  int kv_compactions_stopped_at = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
        // Count how many manual compactions were started
        ++compactions_run;
      });

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
        int kv_compactions_run =
            kv_compactions.fetch_add(1, std::memory_order_release);
        if (kv_compactions_run == 5) {
          compact_options.canceled->store(true, std::memory_order_release);
          kv_compactions_stopped_at = kv_compactions_run;
          compactions_stopped_at = compactions_run;
        }
      });

  compact_options.canceled->store(false, std::memory_order_release);
  ASSERT_TRUE(dbfull()
                  ->CompactRange(compact_options, nullptr, nullptr)
                  .IsManualCompactionPaused());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // NOTE: as we set compact_options.max_subcompactions = 1, and set the
  // canceled variable to true from the single compacting thread (via callback),
  // this value is deterministically kv_compactions_stopped_at + 1.
  ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1);
  ASSERT_EQ(compactions_run, compactions_stopped_at);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionIterator:ProcessKV");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "DBImpl::RunManualCompaction()::1");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionJob::Run():PausingManualCompaction:1");

  // Compactions should work again if we re-enable them.
  compact_options.canceled->store(false, std::memory_order_relaxed);
  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

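// Listener used by the compaction cancellation tests below. It counts
// compaction begin/end notifications and checks that each completed compaction
// reports the status code/subcode stored in code_/subcode_.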
class CancelCompactionListener : public EventListener {
 public:
  CancelCompactionListener()
      : num_compaction_started_(0), num_compaction_ended_(0) {}

  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
    ASSERT_EQ(ci.cf_name, "default");
    ASSERT_EQ(ci.base_input_level, 0);
    num_compaction_started_++;
  }

  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
    ASSERT_EQ(ci.cf_name, "default");
    ASSERT_EQ(ci.base_input_level, 0);
    ASSERT_EQ(ci.status.code(), code_);
    ASSERT_EQ(ci.status.subcode(), subcode_);
    num_compaction_ended_++;
  }

  std::atomic<size_t> num_compaction_started_;
  std::atomic<size_t> num_compaction_ended_;
  Status::Code code_;
  Status::SubCode subcode_;
};

TEST_F(DBTest2, CancelManualCompactionWithListener) {
  CompactRangeOptions compact_options;
  auto canceledPtr =
      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
  compact_options.canceled = canceledPtr.get();
  compact_options.max_subcompactions = 1;

  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  CancelCompactionListener* listener = new CancelCompactionListener();
  options.listeners.emplace_back(listener);

  DestroyAndReopen(options);

  Random rnd(301);
  for (int i = 0; i < 10; i++) {
    for (int j = 0; j < 10; j++) {
      ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50)));
    }
    ASSERT_OK(Flush());
  }

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
        compact_options.canceled->store(true, std::memory_order_release);
      });

  int running_compaction = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::FinishCompactionOutputFile1",
      [&](void* /*arg*/) { running_compaction++; });

  // Case I: (1) compaction begin is notified, (2) the callback sets *canceled
  // to true to cancel the manual compaction, (3) the compaction does not run,
  // (4) compaction end is notified.
  listener->code_ = Status::kIncomplete;
  listener->subcode_ = Status::SubCode::kManualCompactionPaused;

  compact_options.canceled->store(false, std::memory_order_release);
  ASSERT_TRUE(dbfull()
                  ->CompactRange(compact_options, nullptr, nullptr)
                  .IsManualCompactionPaused());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ASSERT_GT(listener->num_compaction_started_, 0);
  ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
  ASSERT_EQ(running_compaction, 0);

  listener->num_compaction_started_ = 0;
  listener->num_compaction_ended_ = 0;

  // Case II: (1) *canceled is still true from Case I's callback when this
  // manual compaction starts, so (2) the begin-compaction notification returns
  // without notifying, and (3) the end-compaction notification does the same.
  ASSERT_TRUE(dbfull()
                  ->CompactRange(compact_options, nullptr, nullptr)
                  .IsManualCompactionPaused());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ASSERT_EQ(listener->num_compaction_started_, 0);
  ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
  ASSERT_EQ(running_compaction, 0);

  // Case III: (1) compaction begin is notified, (2) the compaction runs, (3)
  // the callback sets *canceled to true near the end of the compaction, and
  // (4) compaction end is notified.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionIterator:ProcessKV");

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) {
        compact_options.canceled->store(true, std::memory_order_release);
      });

  listener->code_ = Status::kOk;
  listener->subcode_ = Status::SubCode::kNone;

  compact_options.canceled->store(false, std::memory_order_release);
  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ASSERT_GT(listener->num_compaction_started_, 0);
  ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);

  // Compaction job will succeed.
  ASSERT_GT(running_compaction, 0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) {
  int num_levels = 3;
  const int kNumFilesTrigger = 4;

  Options options = CurrentOptions();
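  // Route all compactions to the bottom-priority thread pool by leaving no
  // threads in the high- and low-priority pools.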
  env_->SetBackgroundThreads(0, Env::Priority::HIGH);
  env_->SetBackgroundThreads(0, Env::Priority::LOW);
  env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
  options.env = env_;
  options.compaction_style = kCompactionStyleUniversal;
  options.num_levels = num_levels;
  options.write_buffer_size = 100 << 10;     // 100KB
  options.target_file_size_base = 32 << 10;  // 32KB
  options.level0_file_num_compaction_trigger = kNumFilesTrigger;
  // Trigger compaction if size amplification exceeds 110%
  options.compaction_options_universal.max_size_amplification_percent = 110;

  CancelCompactionListener* listener = new CancelCompactionListener();
  options.listeners.emplace_back(listener);

  DestroyAndReopen(options);

  int num_bottom_thread_compaction_scheduled = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
      [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; });

  int num_compaction_jobs = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run():End",
      [&](void* /*arg*/) { num_compaction_jobs++; });

  listener->code_ = Status::kOk;
  listener->subcode_ = Status::SubCode::kNone;

  Random rnd(301);
  for (int i = 0; i < 1; ++i) {
    for (int num = 0; num < kNumFilesTrigger; num++) {
      int key_idx = 0;
      GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
      // Use no_wait above because GenerateNewFile() otherwise waits for both
      // flush and compaction. We don't want to wait for compaction because the
      // full compaction is intentionally blocked while more files are flushed.
      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
    }
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_GT(num_bottom_thread_compaction_scheduled, 0);
  ASSERT_EQ(num_compaction_jobs, 1);
  ASSERT_GT(listener->num_compaction_started_, 0);
  ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, OptimizeForPointLookup) {
  Options options = CurrentOptions();
  Close();
  options.OptimizeForPointLookup(2);
  ASSERT_OK(DB::Open(options, dbname_, &db_));

  ASSERT_OK(Put("foo", "v1"));
  ASSERT_EQ("v1", Get("foo"));
  ASSERT_OK(Flush());
  ASSERT_EQ("v1", Get("foo"));
}

TEST_F(DBTest2, OptimizeForSmallDB) {
  Options options = CurrentOptions();
  Close();
  options.OptimizeForSmallDb();

  // Find the cache object
  ASSERT_TRUE(options.table_factory->IsInstanceOf(
      TableFactory::kBlockBasedTableName()));
  auto table_options =
      options.table_factory->GetOptions<BlockBasedTableOptions>();

  ASSERT_TRUE(table_options != nullptr);
  std::shared_ptr<Cache> cache = table_options->block_cache;

  ASSERT_EQ(0, cache->GetUsage());
  ASSERT_OK(DB::Open(options, dbname_, &db_));
  ASSERT_OK(Put("foo", "v1"));

  // memtable size is charged to the block cache
  ASSERT_NE(0, cache->GetUsage());

  ASSERT_EQ("v1", Get("foo"));
  ASSERT_OK(Flush());

  size_t prev_size = cache->GetUsage();
  // Remember the block cache size so that we can verify it grows after Get().
  // Use a pinnable slice so that the block stays pinned and is not evicted
  // before we check the size.
  PinnableSlice value;
  ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
  ASSERT_GT(cache->GetUsage(), prev_size);
  value.Reset();
}


TEST_F(DBTest2, IterRaceFlush1) {
  ASSERT_OK(Put("foo", "v1"));

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"},
       {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ROCKSDB_NAMESPACE::port::Thread t1([&] {
    TEST_SYNC_POINT("DBTest2::IterRaceFlush:1");
    ASSERT_OK(Put("foo", "v2"));
    ASSERT_OK(Flush());
    TEST_SYNC_POINT("DBTest2::IterRaceFlush:2");
  });

  // The iterator is created after the first Put(), and its snapshot sequence is
  // assigned after the second Put(), so it must see v2.
  {
    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
    it->Seek("foo");
    ASSERT_TRUE(it->Valid());
    ASSERT_OK(it->status());
    ASSERT_EQ("foo", it->key().ToString());
    ASSERT_EQ("v2", it->value().ToString());
  }

  t1.join();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, IterRaceFlush2) {
  ASSERT_OK(Put("foo", "v1"));

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"},
       {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ROCKSDB_NAMESPACE::port::Thread t1([&] {
    TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1");
    ASSERT_OK(Put("foo", "v2"));
    ASSERT_OK(Flush());
    TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2");
  });

  // The iterator is created after the first Put(), and its snapshot sequence is
  // assigned before the second Put(), thus it must see v1.
  {
    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
    it->Seek("foo");
    ASSERT_TRUE(it->Valid());
    ASSERT_OK(it->status());
    ASSERT_EQ("foo", it->key().ToString());
    ASSERT_EQ("v1", it->value().ToString());
  }

  t1.join();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, IterRefreshRaceFlush) {
  ASSERT_OK(Put("foo", "v1"));

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"},
       {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ROCKSDB_NAMESPACE::port::Thread t1([&] {
    TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1");
    ASSERT_OK(Put("foo", "v2"));
    ASSERT_OK(Flush());
    TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2");
  });

  // The iterator is refreshed after the first Put(), and its sequence number is
  // assigned after the second Put(), thus it must see v2.
  {
    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
    ASSERT_OK(it->status());
    ASSERT_OK(it->Refresh());
    it->Seek("foo");
    ASSERT_TRUE(it->Valid());
    ASSERT_OK(it->status());
    ASSERT_EQ("foo", it->key().ToString());
    ASSERT_EQ("v2", it->value().ToString());
  }

  t1.join();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, GetRaceFlush1) {
  ASSERT_OK(Put("foo", "v1"));

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
       {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ROCKSDB_NAMESPACE::port::Thread t1([&] {
    TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
    ASSERT_OK(Put("foo", "v2"));
    ASSERT_OK(Flush());
    TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
  });

  // Get() is issued after the first Put(), so it should see either
  // "v1" or "v2".
  ASSERT_NE("NOT_FOUND", Get("foo"));
  t1.join();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, GetRaceFlush2) {
  ASSERT_OK(Put("foo", "v1"));

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
       {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  port::Thread t1([&] {
    TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
    ASSERT_OK(Put("foo", "v2"));
    ASSERT_OK(Flush());
    TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
  });

  // Get() is issued after the first Put(), so it should see either
  // "v1" or "v2".
  ASSERT_NE("NOT_FOUND", Get("foo"));
  t1.join();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, DirectIO) {
  if (!IsDirectIOSupported()) {
    return;
  }
  Options options = CurrentOptions();
  options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
      true;
  options.allow_mmap_reads = options.allow_mmap_writes = false;
  DestroyAndReopen(options);

  ASSERT_OK(Put(Key(0), "a"));
  ASSERT_OK(Put(Key(5), "a"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put(Key(10), "a"));
  ASSERT_OK(Put(Key(15), "a"));
  ASSERT_OK(Flush());

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  Reopen(options);
}

TEST_F(DBTest2, MemtableOnlyIterator) {
  Options options = CurrentOptions();
  CreateAndReopenWithCF({"pikachu"}, options);

  ASSERT_OK(Put(1, "foo", "first"));
  ASSERT_OK(Put(1, "bar", "second"));

  ReadOptions ropt;
  ropt.read_tier = kMemtableTier;
  std::string value;
  Iterator* it = nullptr;

  // Before flushing
  // point lookups
  ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
  ASSERT_EQ("first", value);
  ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
  ASSERT_EQ("second", value);

  // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet.
  it = db_->NewIterator(ropt, handles_[1]);
  int count = 0;
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    ASSERT_TRUE(it->Valid());
    count++;
  }
  ASSERT_TRUE(!it->Valid());
  ASSERT_EQ(2, count);
  delete it;

  ASSERT_OK(Flush(1));

  // After flushing
  // point lookups
  ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
  ASSERT_EQ("first", value);
  ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
  ASSERT_EQ("second", value);
  // nothing should be returned using memtable-only iterator after flushing.
  it = db_->NewIterator(ropt, handles_[1]);
  ASSERT_OK(it->status());
  count = 0;
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    ASSERT_TRUE(it->Valid());
    count++;
  }
  ASSERT_TRUE(!it->Valid());
  ASSERT_EQ(0, count);
  ASSERT_OK(it->status());
  delete it;

  // Add a key to memtable
  ASSERT_OK(Put(1, "foobar", "third"));
  it = db_->NewIterator(ropt, handles_[1]);
  ASSERT_OK(it->status());
  count = 0;
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    ASSERT_TRUE(it->Valid());
    ASSERT_EQ("foobar", it->key().ToString());
    ASSERT_EQ("third", it->value().ToString());
    count++;
  }
  ASSERT_TRUE(!it->Valid());
  ASSERT_EQ(1, count);
  ASSERT_OK(it->status());
  delete it;
}

TEST_F(DBTest2, LowPriWrite) {
  Options options = CurrentOptions();
  // Compaction pressure should build up since 6 L0 files will be flushed while
  // level0_file_num_compaction_trigger is only 4
  options.level0_file_num_compaction_trigger = 4;
  options.level0_slowdown_writes_trigger = 12;
  options.level0_stop_writes_trigger = 30;
  options.delayed_write_rate = 8 * 1024 * 1024;
  Reopen(options);

  std::atomic<int> rate_limit_count(0);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "GenericRateLimiter::Request:1", [&](void* arg) {
        rate_limit_count.fetch_add(1);
        int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
        ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
      });
  // Block compaction
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
      {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
  });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  WriteOptions wo;
  for (int i = 0; i < 6; i++) {
    wo.low_pri = false;
    ASSERT_OK(Put("", "", wo));
    wo.low_pri = true;
    ASSERT_OK(Put("", "", wo));
    ASSERT_OK(Flush());
  }
  ASSERT_EQ(0, rate_limit_count.load());
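  // With a compaction now pending (blocked by the sync point dependency above),
  // a low-pri write should be charged to the rate limiter, while a normal-pri
  // write should not be.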
  wo.low_pri = true;
  ASSERT_OK(Put("", "", wo));
  ASSERT_EQ(1, rate_limit_count.load());
  wo.low_pri = false;
  ASSERT_OK(Put("", "", wo));
  ASSERT_EQ(1, rate_limit_count.load());

  TEST_SYNC_POINT("DBTest.LowPriWrite:0");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  wo.low_pri = true;
  ASSERT_OK(Put("", "", wo));
  ASSERT_EQ(1, rate_limit_count.load());
  wo.low_pri = false;
  ASSERT_OK(Put("", "", wo));
  ASSERT_EQ(1, rate_limit_count.load());
}

TEST_F(DBTest2, RateLimitedCompactionReads) {
  // compaction input has 512KB data
  const int kNumKeysPerFile = 128;
  const int kBytesPerKey = 1024;
  const int kNumL0Files = 4;

  for (int compaction_readahead_size : {0, 32 << 10}) {
    for (auto use_direct_io : {false, true}) {
      if (use_direct_io && !IsDirectIOSupported()) {
        continue;
      }
      Options options = CurrentOptions();
      options.compaction_readahead_size = compaction_readahead_size;
      options.compression = kNoCompression;
      options.level0_file_num_compaction_trigger = kNumL0Files;
      options.memtable_factory.reset(
          test::NewSpecialSkipListFactory(kNumKeysPerFile));
      // takes roughly one second, split into 100 x 10ms intervals. Each
      // interval permits 5.12KB, which is smaller than the block size, so this
      // test exercises the code for chunking reads.
      options.rate_limiter.reset(NewGenericRateLimiter(
          static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
                               kBytesPerKey) /* rate_bytes_per_sec */,
          10 * 1000 /* refill_period_us */, 10 /* fairness */,
          RateLimiter::Mode::kReadsOnly));
      options.use_direct_reads =
          options.use_direct_io_for_flush_and_compaction = use_direct_io;
      BlockBasedTableOptions bbto;
      bbto.block_size = 16384;
      bbto.no_block_cache = true;
      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
      DestroyAndReopen(options);

      for (int i = 0; i < kNumL0Files; ++i) {
        for (int j = 0; j <= kNumKeysPerFile; ++j) {
          ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
        }
        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
        if (i + 1 < kNumL0Files) {
          ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
        }
      }
      ASSERT_OK(dbfull()->TEST_WaitForCompact());
      ASSERT_EQ(0, NumTableFilesAtLevel(0));

      // should be slightly above 512KB due to non-data blocks read. Arbitrarily
      // chose 1MB as the upper bound on the total bytes read.
      size_t rate_limited_bytes = static_cast<size_t>(
          options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL));
      // The charges can exist for `IO_LOW` and `IO_USER` priorities.
      size_t rate_limited_bytes_by_pri =
          options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
          options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
      ASSERT_EQ(rate_limited_bytes,
                static_cast<size_t>(rate_limited_bytes_by_pri));
      // Include the explicit prefetch of the footer in direct I/O case.
      size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
      ASSERT_GE(
          rate_limited_bytes,
          static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
      ASSERT_LT(
          rate_limited_bytes,
          static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
                              direct_io_extra));

      Iterator* iter = db_->NewIterator(ReadOptions());
      ASSERT_OK(iter->status());
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
      }
      delete iter;
      // bytes read for user iterator shouldn't count against the rate limit.
      rate_limited_bytes_by_pri =
          options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
          options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
      ASSERT_EQ(rate_limited_bytes,
                static_cast<size_t>(rate_limited_bytes_by_pri));
    }
  }
}

// Make sure DB can be reopened with a reduced number of levels, given no file
// is on a level higher than the new num_levels.
TEST_F(DBTest2, ReduceLevel) {
  Options options;
  options.env = env_;
  options.disable_auto_compactions = true;
  options.num_levels = 7;
  Reopen(options);
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Flush());
  MoveFilesToLevel(6);
  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
  CompactRangeOptions compact_options;
  compact_options.change_level = true;
  compact_options.target_level = 1;
  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
  ASSERT_EQ("0,1", FilesPerLevel());
  options.num_levels = 3;
  Reopen(options);
  ASSERT_EQ("0,1", FilesPerLevel());
}

// Test that ReadCallback is actually used in both memtables and SST files
TEST_F(DBTest2, ReadCallbackTest) {
  Options options;
  options.disable_auto_compactions = true;
  options.num_levels = 7;
  options.env = env_;
  Reopen(options);
  std::vector<const Snapshot*> snapshots;
  // Try to create a db with multiple layers and a memtable
  const std::string key = "foo";
  const std::string value = "bar";
  // This test assumes that the seq starts at 1 and is increased by 1 after
  // each write batch of size 1. If that behavior changes, the test needs to be
  // updated as well.
  // TODO(myabandeh): update this test to use the seq number that is returned by
  // the DB instead of assuming what seq the DB used.
  int i = 1;
  for (; i < 10; i++) {
    ASSERT_OK(Put(key, value + std::to_string(i)));
    // Take a snapshot to avoid the value being removed during compaction
    auto snapshot = dbfull()->GetSnapshot();
    snapshots.push_back(snapshot);
  }
  ASSERT_OK(Flush());
  for (; i < 20; i++) {
    ASSERT_OK(Put(key, value + std::to_string(i)));
    // Take a snapshot to avoid the value being removed during compaction
    auto snapshot = dbfull()->GetSnapshot();
    snapshots.push_back(snapshot);
  }
  ASSERT_OK(Flush());
  MoveFilesToLevel(6);
  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
  for (; i < 30; i++) {
    ASSERT_OK(Put(key, value + std::to_string(i)));
    auto snapshot = dbfull()->GetSnapshot();
    snapshots.push_back(snapshot);
  }
  ASSERT_OK(Flush());
  ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
  // And also add some values to the memtable
  for (; i < 40; i++) {
    ASSERT_OK(Put(key, value + std::to_string(i)));
    auto snapshot = dbfull()->GetSnapshot();
    snapshots.push_back(snapshot);
  }

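  // ReadCallback that makes only sequence numbers at or below the given
  // snapshot visible, so each GetImpl() call below reads the value written at
  // that sequence.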
  class TestReadCallback : public ReadCallback {
   public:
    explicit TestReadCallback(SequenceNumber snapshot)
        : ReadCallback(snapshot), snapshot_(snapshot) {}
    bool IsVisibleFullCheck(SequenceNumber seq) override {
      return seq <= snapshot_;
    }

   private:
    SequenceNumber snapshot_;
  };

  for (int seq = 1; seq < i; seq++) {
    PinnableSlice pinnable_val;
    ReadOptions roptions;
    TestReadCallback callback(seq);
    bool dont_care = true;
    DBImpl::GetImplOptions get_impl_options;
    get_impl_options.column_family = dbfull()->DefaultColumnFamily();
    get_impl_options.value = &pinnable_val;
    get_impl_options.value_found = &dont_care;
    get_impl_options.callback = &callback;
    Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
    ASSERT_TRUE(s.ok());
    // Assuming that after each Put the DB increases seq by one, the value and
    // seq number must match since we also increment the value by 1 after each
    // Put.
    ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
  }

  for (auto snapshot : snapshots) {
    dbfull()->ReleaseSnapshot(snapshot);
  }
}


TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
  // Regression test for race condition where an obsolete file is returned to
  // user as a "live file" but then deleted, all while file deletions are
  // disabled.
  //
  // It happened like this:
  //
  // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
  // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and the
  //    latter returned "x.log"
  // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
  // 4. [user thread] Reading "x.log" failed
  //
  // Unfortunately the only regression test I can come up with involves sleep.
  // We cannot set SyncPoints to repro since, once the fix is applied, the
  // SyncPoints would cause a deadlock as the repro's sequence of events is now
  // prohibited.
  //
  // Instead, if we sleep for a second between Find and Purge, and ensure the
  // read attempt happens after purge, then the sequence of events will almost
  // certainly happen on the old code.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
      {"DBImpl::BackgroundCallFlush:FilesFound",
       "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
      {"DBImpl::PurgeObsoleteFiles:End",
       "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
  });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::PurgeObsoleteFiles:Begin",
      [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_OK(Put("key", "val"));
  FlushOptions flush_opts;
  flush_opts.wait = false;
  ASSERT_OK(db_->Flush(flush_opts));
  TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");

  ASSERT_OK(db_->DisableFileDeletions());
  VectorLogPtr log_files;
  ASSERT_OK(db_->GetSortedWalFiles(log_files));
  TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
  for (const auto& log_file : log_files) {
    ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
  }

  ASSERT_OK(db_->EnableFileDeletions());
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, TestNumPread) {
  Options options = CurrentOptions();
  bool prefetch_supported =
      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
  // disable block cache
  BlockBasedTableOptions table_options;
  table_options.no_block_cache = true;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  Reopen(options);
  env_->count_random_reads_ = true;
  env_->random_file_open_counter_.store(0);
  ASSERT_OK(Put("bar", "foo"));
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Flush());
  if (prefetch_supported) {
    // After flush, we'll open the file and read footer, meta block,
    // property block and index block.
    ASSERT_EQ(4, env_->random_read_counter_.Read());
  } else {
    // With prefetch not supported, we will do a single read into a buffer
    ASSERT_EQ(1, env_->random_read_counter_.Read());
  }
  ASSERT_EQ(1, env_->random_file_open_counter_.load());

  // One pread per normal data block read
  env_->random_file_open_counter_.store(0);
  env_->random_read_counter_.Reset();
  ASSERT_EQ("bar", Get("foo"));
  ASSERT_EQ(1, env_->random_read_counter_.Read());
  // All files are already opened.
  ASSERT_EQ(0, env_->random_file_open_counter_.load());

  env_->random_file_open_counter_.store(0);
  env_->random_read_counter_.Reset();
  ASSERT_OK(Put("bar2", "foo2"));
  ASSERT_OK(Put("foo2", "bar2"));
  ASSERT_OK(Flush());
  if (prefetch_supported) {
    // After flush, we'll open the file and read footer, meta block,
    // property block and index block.
    ASSERT_EQ(4, env_->random_read_counter_.Read());
  } else {
    // With prefetch not supported, we will do a single read into a buffer
    ASSERT_EQ(1, env_->random_read_counter_.Read());
  }
  ASSERT_EQ(1, env_->random_file_open_counter_.load());

  env_->random_file_open_counter_.store(0);
  env_->random_read_counter_.Reset();
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  if (prefetch_supported) {
    // Compaction needs two input blocks, which requires 2 preads, and
    // generate a new SST file which needs 4 preads (footer, meta block,
    // property block and index block). In total 6.
    ASSERT_EQ(6, env_->random_read_counter_.Read());
  } else {
    // With prefetch off, compaction needs two input blocks,
    // followed by a single buffered read.  In total 3.
    ASSERT_EQ(3, env_->random_read_counter_.Read());
  }
  // All compaction input files should have already been opened.
  ASSERT_EQ(1, env_->random_file_open_counter_.load());

  // One pread per normal data block read
  env_->random_file_open_counter_.store(0);
  env_->random_read_counter_.Reset();
  ASSERT_EQ("foo2", Get("bar2"));
  ASSERT_EQ(1, env_->random_read_counter_.Read());
  // SST files are already opened.
  ASSERT_EQ(0, env_->random_file_open_counter_.load());
}

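// TraceRecordResult handler used by the trace replay tests. It validates
// record timestamps, accumulates execution latency, and counts the replayed
// writes, gets, iterator seeks, and multigets.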
class TraceExecutionResultHandler : public TraceRecordResult::Handler {
 public:
  TraceExecutionResultHandler() {}
  ~TraceExecutionResultHandler() override {}

  virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override {
    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
      return Status::InvalidArgument("Invalid timestamps.");
    }
    result.GetStatus().PermitUncheckedError();
    switch (result.GetTraceType()) {
      case kTraceWrite: {
        total_latency_ += result.GetLatency();
        cnt_++;
        writes_++;
        break;
      }
      default:
        return Status::Corruption("Type mismatch.");
    }
    return Status::OK();
  }

  virtual Status Handle(
      const SingleValueTraceExecutionResult& result) override {
    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
      return Status::InvalidArgument("Invalid timestamps.");
    }
    result.GetStatus().PermitUncheckedError();
    switch (result.GetTraceType()) {
      case kTraceGet: {
        total_latency_ += result.GetLatency();
        cnt_++;
        gets_++;
        break;
      }
      default:
        return Status::Corruption("Type mismatch.");
    }
    return Status::OK();
  }

  virtual Status Handle(
      const MultiValuesTraceExecutionResult& result) override {
    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
      return Status::InvalidArgument("Invalid timestamps.");
    }
    for (const Status& s : result.GetMultiStatus()) {
      s.PermitUncheckedError();
    }
    switch (result.GetTraceType()) {
      case kTraceMultiGet: {
        total_latency_ += result.GetLatency();
        cnt_++;
        multigets_++;
        break;
      }
      default:
        return Status::Corruption("Type mismatch.");
    }
    return Status::OK();
  }

  virtual Status Handle(const IteratorTraceExecutionResult& result) override {
    if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
      return Status::InvalidArgument("Invalid timestamps.");
    }
    result.GetStatus().PermitUncheckedError();
    switch (result.GetTraceType()) {
      case kTraceIteratorSeek:
      case kTraceIteratorSeekForPrev: {
        total_latency_ += result.GetLatency();
        cnt_++;
        seeks_++;
        break;
      }
      default:
        return Status::Corruption("Type mismatch.");
    }
    return Status::OK();
  }

  void Reset() {
    total_latency_ = 0;
    cnt_ = 0;
    writes_ = 0;
    gets_ = 0;
    seeks_ = 0;
    multigets_ = 0;
  }

  double GetAvgLatency() const {
    return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_;
  }

  int GetNumWrites() const { return writes_; }

  int GetNumGets() const { return gets_; }

  int GetNumIterSeeks() const { return seeks_; }

  int GetNumMultiGets() const { return multigets_; }

 private:
  std::atomic<uint64_t> total_latency_{0};
  std::atomic<uint32_t> cnt_{0};
  std::atomic<int> writes_{0};
  std::atomic<int> gets_{0};
  std::atomic<int> seeks_{0};
  std::atomic<int> multigets_{0};
};

TEST_F(DBTest2, TraceAndReplay) {
  Options options = CurrentOptions();
  options.merge_operator = MergeOperators::CreatePutOperator();
  ReadOptions ro;
  WriteOptions wo;
  TraceOptions trace_opts;
  EnvOptions env_opts;
  CreateAndReopenWithCF({"pikachu"}, options);
  Random rnd(301);
  Iterator* single_iter = nullptr;

  ASSERT_TRUE(db_->EndTrace().IsIOError());

  std::string trace_filename = dbname_ + "/rocksdb.trace";
  std::unique_ptr<TraceWriter> trace_writer;
  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));

  // 5 Writes
  ASSERT_OK(Put(0, "a", "1"));
  ASSERT_OK(Merge(0, "b", "2"));
  ASSERT_OK(Delete(0, "c"));
  ASSERT_OK(SingleDelete(0, "d"));
  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));

  // 6th Write
  WriteBatch batch;
  ASSERT_OK(batch.Put("f", "11"));
  ASSERT_OK(batch.Merge("g", "12"));
  ASSERT_OK(batch.Delete("h"));
  ASSERT_OK(batch.SingleDelete("i"));
  ASSERT_OK(batch.DeleteRange("j", "k"));
  ASSERT_OK(db_->Write(wo, &batch));

  // 2 Seek(ForPrev)s
  single_iter = db_->NewIterator(ro);
  single_iter->Seek("f");  // Seek 1
  single_iter->SeekForPrev("g");
  ASSERT_OK(single_iter->status());
  delete single_iter;

  // 2 Gets
  ASSERT_EQ("1", Get(0, "a"));
  ASSERT_EQ("12", Get(0, "g"));

  // 7th and 8th Write, 3rd Get
  ASSERT_OK(Put(1, "foo", "bar"));
  ASSERT_OK(Put(1, "rocksdb", "rocks"));
  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));

  // Total Write x 8, Get x 3, Seek x 2.
  ASSERT_OK(db_->EndTrace());
  // These should not get into the trace file as it is after EndTrace.
  ASSERT_OK(Put("hello", "world"));
  ASSERT_OK(Merge("foo", "bar"));

  // Open another db, replay, and verify the data
  std::string value;
  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
  ASSERT_OK(DestroyDB(dbname2, options));

  // Using a different name than db2, to pacify infer's use-after-lifetime
  // warnings (http://fbinfer.com).
  DB* db2_init = nullptr;
  options.create_if_missing = true;
  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
  ColumnFamilyHandle* cf;
  ASSERT_OK(
      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
  delete cf;
  delete db2_init;

  DB* db2 = nullptr;
  std::vector<ColumnFamilyDescriptor> column_families;
  ColumnFamilyOptions cf_options;
  cf_options.merge_operator = MergeOperators::CreatePutOperator();
  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
  column_families.push_back(
      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
  std::vector<ColumnFamilyHandle*> handles;
  DBOptions db_opts;
  db_opts.env = env_;
  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));

  env_->SleepForMicroseconds(100);
  // Verify that the keys don't already exist
  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());

  std::unique_ptr<TraceReader> trace_reader;
  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
  std::unique_ptr<Replayer> replayer;
  ASSERT_OK(
      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));

  TraceExecutionResultHandler res_handler;
  std::function<void(Status, std::unique_ptr<TraceRecordResult> &&)> res_cb =
      [&res_handler](Status exec_s, std::unique_ptr<TraceRecordResult>&& res) {
        ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported());
        if (res != nullptr) {
          ASSERT_OK(res->Accept(&res_handler));
          res.reset();
        }
      };

  // Unprepared replay should fail with Status::Incomplete()
  ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
  ASSERT_OK(replayer->Prepare());
  // Ok to repeatedly Prepare().
  ASSERT_OK(replayer->Prepare());
  // Replay using 1 thread, 1x speed.
  ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb));
  ASSERT_GE(res_handler.GetAvgLatency(), 0.0);
  ASSERT_EQ(res_handler.GetNumWrites(), 8);
  ASSERT_EQ(res_handler.GetNumGets(), 3);
  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
  res_handler.Reset();

  ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
  ASSERT_EQ("1", value);
  ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
  ASSERT_EQ("12", value);
  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());

  ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
  ASSERT_EQ("bar", value);
  ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
  ASSERT_EQ("rocks", value);

  // Re-replay should fail with Status::Incomplete() if Prepare() was not
  // called. Currently we don't distinguish between unprepared and trace end.
  ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());

  // Re-replay using 2 threads, 2x speed.
  ASSERT_OK(replayer->Prepare());
  ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb));
  ASSERT_GE(res_handler.GetAvgLatency(), 0.0);
  ASSERT_EQ(res_handler.GetNumWrites(), 8);
  ASSERT_EQ(res_handler.GetNumGets(), 3);
  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
  res_handler.Reset();

  // Re-replay using 2 threads, 1/2 speed.
  ASSERT_OK(replayer->Prepare());
  ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb));
  ASSERT_GE(res_handler.GetAvgLatency(), 0.0);
  ASSERT_EQ(res_handler.GetNumWrites(), 8);
  ASSERT_EQ(res_handler.GetNumGets(), 3);
  ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
  res_handler.Reset();

  replayer.reset();

  for (auto handle : handles) {
    delete handle;
  }
  delete db2;
  ASSERT_OK(DestroyDB(dbname2, options));
}

TEST_F(DBTest2, TraceAndManualReplay) {
  Options options = CurrentOptions();
  options.merge_operator = MergeOperators::CreatePutOperator();
  ReadOptions ro;
  WriteOptions wo;
  TraceOptions trace_opts;
  EnvOptions env_opts;
  CreateAndReopenWithCF({"pikachu"}, options);
  Random rnd(301);
  Iterator* single_iter = nullptr;

  ASSERT_TRUE(db_->EndTrace().IsIOError());

  std::string trace_filename = dbname_ + "/rocksdb.trace";
  std::unique_ptr<TraceWriter> trace_writer;
  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));

  ASSERT_OK(Put(0, "a", "1"));
  ASSERT_OK(Merge(0, "b", "2"));
  ASSERT_OK(Delete(0, "c"));
  ASSERT_OK(SingleDelete(0, "d"));
  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));

  WriteBatch batch;
  ASSERT_OK(batch.Put("f", "11"));
  ASSERT_OK(batch.Merge("g", "12"));
  ASSERT_OK(batch.Delete("h"));
  ASSERT_OK(batch.SingleDelete("i"));
  ASSERT_OK(batch.DeleteRange("j", "k"));
  ASSERT_OK(db_->Write(wo, &batch));

  single_iter = db_->NewIterator(ro);
  single_iter->Seek("f");
  single_iter->SeekForPrev("g");
  ASSERT_OK(single_iter->status());
  delete single_iter;

  // Write some sequenced keys for testing lower/upper bounds of iterator.
  batch.Clear();
  ASSERT_OK(batch.Put("iter-0", "iter-0"));
  ASSERT_OK(batch.Put("iter-1", "iter-1"));
  ASSERT_OK(batch.Put("iter-2", "iter-2"));
  ASSERT_OK(batch.Put("iter-3", "iter-3"));
  ASSERT_OK(batch.Put("iter-4", "iter-4"));
  ASSERT_OK(db_->Write(wo, &batch));

  ReadOptions bounded_ro = ro;
  Slice lower_bound("iter-1");
  Slice upper_bound("iter-3");
  bounded_ro.iterate_lower_bound = &lower_bound;
  bounded_ro.iterate_upper_bound = &upper_bound;
  single_iter = db_->NewIterator(bounded_ro);
  single_iter->Seek("iter-0");
  ASSERT_EQ(single_iter->key().ToString(), "iter-1");
  single_iter->Seek("iter-2");
  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
  single_iter->Seek("iter-4");
  ASSERT_FALSE(single_iter->Valid());
  single_iter->SeekForPrev("iter-0");
  ASSERT_FALSE(single_iter->Valid());
  single_iter->SeekForPrev("iter-2");
  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
  single_iter->SeekForPrev("iter-4");
  ASSERT_EQ(single_iter->key().ToString(), "iter-2");
  ASSERT_OK(single_iter->status());
  delete single_iter;

  ASSERT_EQ("1", Get(0, "a"));
  ASSERT_EQ("12", Get(0, "g"));

  ASSERT_OK(Put(1, "foo", "bar"));
  ASSERT_OK(Put(1, "rocksdb", "rocks"));
  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));

  // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2.
  // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6
  // Seek(ForPrev)s.
  // Total Write x 9, Get x 3, Seek x 8
  ASSERT_OK(db_->EndTrace());
  // These should not get into the trace file as it is after EndTrace.
  ASSERT_OK(Put("hello", "world"));
  ASSERT_OK(Merge("foo", "bar"));

  // Open another db, replay, and verify the data
  std::string value;
  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
  ASSERT_OK(DestroyDB(dbname2, options));

  // Using a different name than db2, to pacify infer's use-after-lifetime
  // warnings (http://fbinfer.com).
  DB* db2_init = nullptr;
  options.create_if_missing = true;
  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
  ColumnFamilyHandle* cf;
  ASSERT_OK(
      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
  delete cf;
  delete db2_init;

  DB* db2 = nullptr;
  std::vector<ColumnFamilyDescriptor> column_families;
  ColumnFamilyOptions cf_options;
  cf_options.merge_operator = MergeOperators::CreatePutOperator();
  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
  column_families.push_back(
      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
  std::vector<ColumnFamilyHandle*> handles;
  DBOptions db_opts;
  db_opts.env = env_;
  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));

  env_->SleepForMicroseconds(100);
  // Verify that the keys don't already exist
  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());

  std::unique_ptr<TraceReader> trace_reader;
  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
  std::unique_ptr<Replayer> replayer;
  ASSERT_OK(
      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));

  TraceExecutionResultHandler res_handler;

  // Manually replay twice. The second pass checks that the replay can
  // restart.
  std::unique_ptr<TraceRecord> record;
  std::unique_ptr<TraceRecordResult> result;
  for (int i = 0; i < 2; i++) {
    // Next should fail if unprepared.
    ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
    ASSERT_OK(replayer->Prepare());
    Status s = Status::OK();
    // Looping until trace end.
    while (s.ok()) {
      s = replayer->Next(&record);
      // Skip unsupported operations.
      if (s.IsNotSupported()) {
        continue;
      }
      if (s.ok()) {
        ASSERT_OK(replayer->Execute(record, &result));
        if (result != nullptr) {
          ASSERT_OK(result->Accept(&res_handler));
          if (record->GetTraceType() == kTraceIteratorSeek ||
              record->GetTraceType() == kTraceIteratorSeekForPrev) {
            IteratorSeekQueryTraceRecord* iter_rec =
                dynamic_cast<IteratorSeekQueryTraceRecord*>(record.get());
            IteratorTraceExecutionResult* iter_res =
                dynamic_cast<IteratorTraceExecutionResult*>(result.get());
            // Check if lower/upper bounds are correctly saved and decoded.
            std::string lower_str = iter_rec->GetLowerBound().ToString();
            std::string upper_str = iter_rec->GetUpperBound().ToString();
            std::string iter_key = iter_res->GetKey().ToString();
            std::string iter_value = iter_res->GetValue().ToString();
            if (!lower_str.empty() && !upper_str.empty()) {
              ASSERT_EQ(lower_str, "iter-1");
              ASSERT_EQ(upper_str, "iter-3");
              if (iter_res->GetValid()) {
                // If iterator is valid, then lower_bound <= key < upper_bound.
                ASSERT_GE(iter_key, lower_str);
                ASSERT_LT(iter_key, upper_str);
              } else {
                // If iterator is invalid, then
                //   key < lower_bound or key >= upper_bound.
                ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str);
              }
            }
            // If iterator is invalid, the key and value should be empty.
            if (!iter_res->GetValid()) {
              ASSERT_TRUE(iter_key.empty());
              ASSERT_TRUE(iter_value.empty());
            }
          }
          result.reset();
        }
      }
    }
    // Status::Incomplete() is returned when manual reading reaches the trace
    // end, or when Prepare() was not called.
    ASSERT_TRUE(s.IsIncomplete());
    ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
    ASSERT_GE(res_handler.GetAvgLatency(), 0.0);
    ASSERT_EQ(res_handler.GetNumWrites(), 9);
    ASSERT_EQ(res_handler.GetNumGets(), 3);
    ASSERT_EQ(res_handler.GetNumIterSeeks(), 8);
    ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
    res_handler.Reset();
  }

  ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
  ASSERT_EQ("1", value);
  ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
  ASSERT_EQ("12", value);
  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());

  ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
  ASSERT_EQ("bar", value);
  ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
  ASSERT_EQ("rocks", value);

  // Test execution of artificially created TraceRecords.
  uint64_t fake_ts = 1U;
  // Write
  batch.Clear();
  ASSERT_OK(batch.Put("trace-record-write1", "write1"));
  ASSERT_OK(batch.Put("trace-record-write2", "write2"));
  record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++));
  ASSERT_OK(replayer->Execute(record, &result));
  ASSERT_TRUE(result != nullptr);
  ASSERT_OK(result->Accept(&res_handler));  // Write x 1
  ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value));
  ASSERT_EQ("write1", value);
  ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value));
  ASSERT_EQ("write2", value);
  ASSERT_GE(res_handler.GetAvgLatency(), 0.0);
  ASSERT_EQ(res_handler.GetNumWrites(), 1);
  ASSERT_EQ(res_handler.GetNumGets(), 0);
  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
  res_handler.Reset();

  // Get related
  // Get an existing key.
  record.reset(new GetQueryTraceRecord(handles[0]->GetID(),
                                       "trace-record-write1", fake_ts++));
  ASSERT_OK(replayer->Execute(record, &result));
  ASSERT_TRUE(result != nullptr);
  ASSERT_OK(result->Accept(&res_handler));  // Get x 1
  // Get a non-existing key; this should still return Status::OK().
  record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get",
                                       fake_ts++));
  ASSERT_OK(replayer->Execute(record, &result));
  ASSERT_TRUE(result != nullptr);
  ASSERT_OK(result->Accept(&res_handler));  // Get x 2
  // Get from an invalid (non-existing) cf_id.
  uint32_t invalid_cf_id = handles[1]->GetID() + 1;
  record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++));
  ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
  ASSERT_TRUE(result == nullptr);
  ASSERT_GE(res_handler.GetAvgLatency(), 0.0);
  ASSERT_EQ(res_handler.GetNumWrites(), 0);
  ASSERT_EQ(res_handler.GetNumGets(), 2);
  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
  res_handler.Reset();

  // Iteration related
  for (IteratorSeekQueryTraceRecord::SeekType seekType :
       {IteratorSeekQueryTraceRecord::kSeek,
        IteratorSeekQueryTraceRecord::kSeekForPrev}) {
    // Seek to an existing key.
    record.reset(new IteratorSeekQueryTraceRecord(
        seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++));
    ASSERT_OK(replayer->Execute(record, &result));
    ASSERT_TRUE(result != nullptr);
    ASSERT_OK(result->Accept(&res_handler));  // Seek x 1 in one iteration
    // Seek to a non-existing key; this should still return Status::OK().
    record.reset(new IteratorSeekQueryTraceRecord(
        seekType, handles[0]->GetID(), "trace-record-get", fake_ts++));
    ASSERT_OK(replayer->Execute(record, &result));
    ASSERT_TRUE(result != nullptr);
    ASSERT_OK(result->Accept(&res_handler));  // Seek x 2 in one iteration
    // Seek from an invalid cf_id.
    record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id,
                                                  "whatever", fake_ts++));
    ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
    ASSERT_TRUE(result == nullptr);
  }
  ASSERT_GE(res_handler.GetAvgLatency(), 0.0);
  ASSERT_EQ(res_handler.GetNumWrites(), 0);
  ASSERT_EQ(res_handler.GetNumGets(), 0);
  ASSERT_EQ(res_handler.GetNumIterSeeks(), 4);  // Seek x 2 in two iterations
  ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
  res_handler.Reset();

  // MultiGet related
  // Get existing keys.
  record.reset(new MultiGetQueryTraceRecord(
      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
      std::vector<std::string>({"a", "foo"}), fake_ts++));
  ASSERT_OK(replayer->Execute(record, &result));
  ASSERT_TRUE(result != nullptr);
  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 1
  // Get all non-existing keys, should still return Status::OK().
  record.reset(new MultiGetQueryTraceRecord(
      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
      std::vector<std::string>({"no1", "no2"}), fake_ts++));
  ASSERT_OK(replayer->Execute(record, &result));
  ASSERT_TRUE(result != nullptr);
  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 2
  // Get a mix of existing and non-existing keys; this should still return
  // Status::OK().
  record.reset(new MultiGetQueryTraceRecord(
      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
      std::vector<std::string>({"a", "no2"}), fake_ts++));
  ASSERT_OK(replayer->Execute(record, &result));
  ASSERT_TRUE(result != nullptr);
  MultiValuesTraceExecutionResult* mvr =
      dynamic_cast<MultiValuesTraceExecutionResult*>(result.get());
  ASSERT_TRUE(mvr != nullptr);
  ASSERT_OK(mvr->GetMultiStatus()[0]);
  ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound());
  ASSERT_EQ(mvr->GetValues()[0], "1");
  ASSERT_EQ(mvr->GetValues()[1], "");
  ASSERT_OK(result->Accept(&res_handler));  // MultiGet x 3
  // Get from an invalid (non-existing) cf_id.
  record.reset(new MultiGetQueryTraceRecord(
      std::vector<uint32_t>(
          {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}),
      std::vector<std::string>({"a", "foo", "whatever"}), fake_ts++));
  ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
  ASSERT_TRUE(result == nullptr);
  // Empty MultiGet
  record.reset(new MultiGetQueryTraceRecord(
      std::vector<uint32_t>(), std::vector<std::string>(), fake_ts++));
  ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
  ASSERT_TRUE(result == nullptr);
  // MultiGet size mismatch
  record.reset(new MultiGetQueryTraceRecord(
      std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
      std::vector<std::string>({"a"}), fake_ts++));
  ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
  ASSERT_TRUE(result == nullptr);
  ASSERT_GE(res_handler.GetAvgLatency(), 0.0);
  ASSERT_EQ(res_handler.GetNumWrites(), 0);
  ASSERT_EQ(res_handler.GetNumGets(), 0);
  ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
  ASSERT_EQ(res_handler.GetNumMultiGets(), 3);
  res_handler.Reset();

  replayer.reset();

  for (auto handle : handles) {
    delete handle;
  }
  delete db2;
  ASSERT_OK(DestroyDB(dbname2, options));
}

TEST_F(DBTest2, TraceWithLimit) {
  Options options = CurrentOptions();
  options.merge_operator = MergeOperators::CreatePutOperator();
  ReadOptions ro;
  WriteOptions wo;
  TraceOptions trace_opts;
  EnvOptions env_opts;
  CreateAndReopenWithCF({"pikachu"}, options);
  Random rnd(301);

  // test the max trace file size options
  trace_opts.max_trace_file_size = 5;
  std::string trace_filename = dbname_ + "/rocksdb.trace1";
  std::unique_ptr<TraceWriter> trace_writer;
  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
  ASSERT_OK(Put(0, "a", "1"));
  ASSERT_OK(Put(0, "b", "1"));
  ASSERT_OK(Put(0, "c", "1"));
  ASSERT_OK(db_->EndTrace());

  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2");
  std::string value;
  ASSERT_OK(DestroyDB(dbname2, options));

  // Using a different name than db2, to pacify infer's use-after-lifetime
  // warnings (http://fbinfer.com).
  DB* db2_init = nullptr;
  options.create_if_missing = true;
  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
  ColumnFamilyHandle* cf;
  ASSERT_OK(
      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
  delete cf;
  delete db2_init;

  DB* db2 = nullptr;
  std::vector<ColumnFamilyDescriptor> column_families;
  ColumnFamilyOptions cf_options;
  cf_options.merge_operator = MergeOperators::CreatePutOperator();
  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
  column_families.push_back(
      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
  std::vector<ColumnFamilyHandle*> handles;
  DBOptions db_opts;
  db_opts.env = env_;
  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));

  env_->SleepForMicroseconds(100);
  // Verify that the keys don't already exist
  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());

  std::unique_ptr<TraceReader> trace_reader;
  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
  std::unique_ptr<Replayer> replayer;
  ASSERT_OK(
      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
  ASSERT_OK(replayer->Prepare());
  ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
  replayer.reset();

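  // With such a tiny max_trace_file_size, the traced writes did not fit into
  // the trace file, so replaying it should not bring back any of the keys.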
  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());

  for (auto handle : handles) {
    delete handle;
  }
  delete db2;
  ASSERT_OK(DestroyDB(dbname2, options));
}

TEST_F(DBTest2, TraceWithSampling) {
  Options options = CurrentOptions();
  ReadOptions ro;
  WriteOptions wo;
  TraceOptions trace_opts;
  EnvOptions env_opts;
  CreateAndReopenWithCF({"pikachu"}, options);
  Random rnd(301);

  // test the trace file sampling options
  trace_opts.sampling_frequency = 2;
  std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
  std::unique_ptr<TraceWriter> trace_writer;
  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
  ASSERT_OK(Put(0, "a", "1"));
  ASSERT_OK(Put(0, "b", "2"));
  ASSERT_OK(Put(0, "c", "3"));
  ASSERT_OK(Put(0, "d", "4"));
  ASSERT_OK(Put(0, "e", "5"));
  ASSERT_OK(db_->EndTrace());

  std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling");
  std::string value;
  ASSERT_OK(DestroyDB(dbname2, options));

  // Using a different name than db2, to pacify infer's use-after-lifetime
  // warnings (http://fbinfer.com).
  DB* db2_init = nullptr;
  options.create_if_missing = true;
  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
  ColumnFamilyHandle* cf;
  ASSERT_OK(
      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
  delete cf;
  delete db2_init;

  DB* db2 = nullptr;
  std::vector<ColumnFamilyDescriptor> column_families;
  ColumnFamilyOptions cf_options;
  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
  column_families.push_back(
      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
  std::vector<ColumnFamilyHandle*> handles;
  DBOptions db_opts;
  db_opts.env = env_;
  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));

  env_->SleepForMicroseconds(100);
  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());

  std::unique_ptr<TraceReader> trace_reader;
  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
  std::unique_ptr<Replayer> replayer;
  ASSERT_OK(
      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
  ASSERT_OK(replayer->Prepare());
  ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
  replayer.reset();

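  // With sampling_frequency = 2 only every other write was traced, so after
  // replay "b" and "d" should exist while "a", "c" and "e" are still missing.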
  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
  ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());

  for (auto handle : handles) {
    delete handle;
  }
  delete db2;
  ASSERT_OK(DestroyDB(dbname2, options));
}

TEST_F(DBTest2, TraceWithFilter) {
  Options options = CurrentOptions();
  options.merge_operator = MergeOperators::CreatePutOperator();
  ReadOptions ro;
  WriteOptions wo;
  TraceOptions trace_opts;
  EnvOptions env_opts;
  CreateAndReopenWithCF({"pikachu"}, options);
  Random rnd(301);
  Iterator* single_iter = nullptr;

  trace_opts.filter = TraceFilterType::kTraceFilterWrite;

  std::string trace_filename = dbname_ + "/rocksdb.trace";
  std::unique_ptr<TraceWriter> trace_writer;
  ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
  ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));

  ASSERT_OK(Put(0, "a", "1"));
  ASSERT_OK(Merge(0, "b", "2"));
  ASSERT_OK(Delete(0, "c"));
  ASSERT_OK(SingleDelete(0, "d"));
  ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));

  WriteBatch batch;
  ASSERT_OK(batch.Put("f", "11"));
  ASSERT_OK(batch.Merge("g", "12"));
  ASSERT_OK(batch.Delete("h"));
  ASSERT_OK(batch.SingleDelete("i"));
  ASSERT_OK(batch.DeleteRange("j", "k"));
  ASSERT_OK(db_->Write(wo, &batch));

  single_iter = db_->NewIterator(ro);
  single_iter->Seek("f");
  single_iter->SeekForPrev("g");
  delete single_iter;

  ASSERT_EQ("1", Get(0, "a"));
  ASSERT_EQ("12", Get(0, "g"));

  ASSERT_OK(Put(1, "foo", "bar"));
  ASSERT_OK(Put(1, "rocksdb", "rocks"));
  ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));

  ASSERT_OK(db_->EndTrace());
  // These should not get into the trace file as it is after EndTrace.
  ASSERT_OK(Put("hello", "world"));
  ASSERT_OK(Merge("foo", "bar"));

  // Open another db, replay, and verify the data
  std::string value;
  std::string dbname2 = test::PerThreadDBPath(env_, "db_replay");
  ASSERT_OK(DestroyDB(dbname2, options));

  // Using a different name than db2, to pacify infer's use-after-lifetime
  // warnings (http://fbinfer.com).
  DB* db2_init = nullptr;
  options.create_if_missing = true;
  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
  ColumnFamilyHandle* cf;
  ASSERT_OK(
      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
  delete cf;
  delete db2_init;

  DB* db2 = nullptr;
  std::vector<ColumnFamilyDescriptor> column_families;
  ColumnFamilyOptions cf_options;
  cf_options.merge_operator = MergeOperators::CreatePutOperator();
  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
  column_families.push_back(
      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
  std::vector<ColumnFamilyHandle*> handles;
  DBOptions db_opts;
  db_opts.env = env_;
  ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));

  env_->SleepForMicroseconds(100);
  // Verify that the keys don't already exist
  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());

  std::unique_ptr<TraceReader> trace_reader;
  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
  std::unique_ptr<Replayer> replayer;
  ASSERT_OK(
      db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
  ASSERT_OK(replayer->Prepare());
  ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
  replayer.reset();

  // None of the key-values should be present since we filtered out the WRITE
  // ops.
  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
  ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());

  for (auto handle : handles) {
    delete handle;
  }
  delete db2;
  ASSERT_OK(DestroyDB(dbname2, options));

  // Set up a new db.
  std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
  ASSERT_OK(DestroyDB(dbname3, options));

  DB* db3_init = nullptr;
  options.create_if_missing = true;
  ColumnFamilyHandle* cf3;
  ASSERT_OK(DB::Open(options, dbname3, &db3_init));
  ASSERT_OK(
      db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
  delete cf3;
  delete db3_init;

  column_families.clear();
  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
  column_families.push_back(
      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
  handles.clear();

  DB* db3 = nullptr;
  ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3));

  env_->SleepForMicroseconds(100);
  // Verify that the keys don't already exist
  ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
  ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());

  // The tracer will not record the Get ops.
  trace_opts.filter = TraceFilterType::kTraceFilterGet;
  std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
  std::unique_ptr<TraceWriter> trace_writer3;
  ASSERT_OK(
      NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
  ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));

  ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
  ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
  ASSERT_OK(db3->Delete(wo, handles[0], "c"));
  ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));

  ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
  ASSERT_EQ(value, "1");
  ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());

  ASSERT_OK(db3->EndTrace());

  for (auto handle : handles) {
    delete handle;
  }
  delete db3;
  ASSERT_OK(DestroyDB(dbname3, options));

  std::unique_ptr<TraceReader> trace_reader3;
  ASSERT_OK(
      NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));

  // Count the number of records in the trace file.
  int count = 0;
  std::string data;
  Status s;
  while (true) {
    s = trace_reader3->Read(&data);
    if (!s.ok()) {
      break;
    }
    count += 1;
  }
  // We also need to count the header and footer
  // 4 WRITE + HEADER + FOOTER = 6
  ASSERT_EQ(count, 6);
}


TEST_F(DBTest2, PinnableSliceAndMmapReads) {
  Options options = CurrentOptions();
  options.env = env_;
  if (!IsMemoryMappedAccessSupported()) {
    ROCKSDB_GTEST_SKIP("Test requires default environment");
    return;
  }
  options.allow_mmap_reads = true;
  options.max_open_files = 100;
  options.compression = kNoCompression;
  Reopen(options);

  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Flush());

  PinnableSlice pinned_value;
  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
  // It is not safe to pin mmap files as they might disappear due to compaction
  ASSERT_FALSE(pinned_value.IsPinned());
  ASSERT_EQ(pinned_value.ToString(), "bar");

  ASSERT_OK(dbfull()->TEST_CompactRange(
      0 /* level */, nullptr /* begin */, nullptr /* end */,
      nullptr /* column_family */, true /* disallow_trivial_move */));

  // Ensure pinned_value doesn't rely on memory munmap'd by the above
  // compaction. It crashes if it does.
  ASSERT_EQ(pinned_value.ToString(), "bar");

  pinned_value.Reset();
  // Unsafe to pin mmap files when they could be kicked out of table cache
  Close();
  ASSERT_OK(ReadOnlyReopen(options));
  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
  ASSERT_FALSE(pinned_value.IsPinned());
  ASSERT_EQ(pinned_value.ToString(), "bar");

  pinned_value.Reset();
  // In read-only mode with unlimited table cache capacity, it should pin the
  // value and avoid the memcpy.
  Close();
  options.max_open_files = -1;
  ASSERT_OK(ReadOnlyReopen(options));
  ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
  ASSERT_TRUE(pinned_value.IsPinned());
  ASSERT_EQ(pinned_value.ToString(), "bar");
}

TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  BlockBasedTableOptions bbto;
  bbto.no_block_cache = false;
  bbto.cache_index_and_filter_blocks = false;
  bbto.block_cache = NewLRUCache(100000);
  bbto.block_size = 400;  // small block size
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  Reopen(options);

  Random rnd(301);
  std::string v = rnd.RandomString(400);

  // Since v is the size of a block, each key should take a block
  // of 400+ bytes.
  ASSERT_OK(Put("1", v));
  ASSERT_OK(Put("3", v));
  ASSERT_OK(Put("5", v));
  ASSERT_OK(Put("7", v));
  ASSERT_OK(Flush());

  ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());

  // Verify that iterators don't pin more than one data block in block cache
  // at each time.
  {
    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
    iter->SeekToFirst();

    for (int i = 0; i < 4; i++) {
      ASSERT_TRUE(iter->Valid());
      // Block cache should contain exactly one block.
      ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
      ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
      iter->Next();
    }
    ASSERT_FALSE(iter->Valid());

    iter->Seek("4");
    ASSERT_TRUE(iter->Valid());

    ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
    ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);

    iter->Seek("3");
    ASSERT_TRUE(iter->Valid());

    ASSERT_OK(iter->status());

    ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
    ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
  }
  ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());

  // Test compaction case
  ASSERT_OK(Put("2", v));
  ASSERT_OK(Put("5", v));
  ASSERT_OK(Put("6", v));
  ASSERT_OK(Put("8", v));
  ASSERT_OK(Flush());

  // Clear existing data in block cache
  bbto.block_cache->SetCapacity(0);
  bbto.block_cache->SetCapacity(100000);

  // Verify compaction input iterators don't hold more than one data block at
  // a time.
  std::atomic<bool> finished(false);
  std::atomic<int> block_newed(0);
  std::atomic<int> block_destroyed(0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Block::Block:0", [&](void* /*arg*/) {
        if (finished) {
          return;
        }
        // Two iterators. At most 2 outstanding blocks.
        EXPECT_GE(block_newed.load(), block_destroyed.load());
        EXPECT_LE(block_newed.load(), block_destroyed.load() + 1);
        block_newed.fetch_add(1);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "Block::~Block", [&](void* /*arg*/) {
        if (finished) {
          return;
        }
        // Two iterators. At most 2 outstanding blocks.
        EXPECT_GE(block_newed.load(), block_destroyed.load() + 1);
        EXPECT_LE(block_newed.load(), block_destroyed.load() + 2);
        block_destroyed.fetch_add(1);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run:BeforeVerify",
      [&](void* /*arg*/) { finished = true; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));

  // Two input files. Each of them has 4 data blocks.
  ASSERT_EQ(8, block_newed.load());
  ASSERT_EQ(8, block_destroyed.load());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
  // Set up a sync point dependency to reproduce the race condition in
  // DBImpl::GetColumnFamilyHandleUnlocked.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
       "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
      {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
       "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
  });
  SyncPoint::GetInstance()->EnableProcessing();
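  // The dependency above forces thread1's lookup to complete before thread2's
  // lookup starts, and thread2's lookup to complete before thread1 reads its
  // handle again, so any stale caching inside GetColumnFamilyHandleUnlocked
  // would surface as a mismatched handle.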

  CreateColumnFamilies({"test1", "test2"}, Options());
  ASSERT_EQ(handles_.size(), 2);

  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
  port::Thread user_thread1([&]() {
    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
    TEST_SYNC_POINT(
        "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
    TEST_SYNC_POINT(
        "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
    ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
  });

  port::Thread user_thread2([&]() {
    TEST_SYNC_POINT(
        "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
    auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
    TEST_SYNC_POINT(
        "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
    ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
  });

  user_thread1.join();
  user_thread2.join();

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}

TEST_F(DBTest2, TestCompactFiles) {
  // Setup sync point dependency to reproduce the race condition of
  // DBImpl::GetColumnFamilyHandleUnlocked
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
      {"TestCompactFiles::IngestExternalFile1",
       "TestCompactFiles::IngestExternalFile2"},
  });
  SyncPoint::GetInstance()->EnableProcessing();

  Options options;
  options.env = env_;
  options.num_levels = 2;
  options.disable_auto_compactions = true;
  Reopen(options);
  auto* handle = db_->DefaultColumnFamily();
  ASSERT_EQ(db_->NumberLevels(handle), 2);

  ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
      ROCKSDB_NAMESPACE::EnvOptions(), options};
  std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
  std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
  std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";

  ASSERT_OK(sst_file_writer.Open(external_file1));
  ASSERT_OK(sst_file_writer.Put("1", "1"));
  ASSERT_OK(sst_file_writer.Put("2", "2"));
  ASSERT_OK(sst_file_writer.Finish());

  ASSERT_OK(sst_file_writer.Open(external_file2));
  ASSERT_OK(sst_file_writer.Put("3", "3"));
  ASSERT_OK(sst_file_writer.Put("4", "4"));
  ASSERT_OK(sst_file_writer.Finish());

  ASSERT_OK(sst_file_writer.Open(external_file3));
  ASSERT_OK(sst_file_writer.Put("5", "5"));
  ASSERT_OK(sst_file_writer.Put("6", "6"));
  ASSERT_OK(sst_file_writer.Finish());

  ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
                                    IngestExternalFileOptions()));
  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
  std::vector<std::string> files;
  GetSstFiles(env_, dbname_, &files);
  ASSERT_EQ(files.size(), 2);

  Status user_thread1_status;
  port::Thread user_thread1([&]() {
    user_thread1_status =
        db_->CompactFiles(CompactionOptions(), handle, files, 1);
  });

  Status user_thread2_status;
  port::Thread user_thread2([&]() {
    user_thread2_status = db_->IngestExternalFile(handle, {external_file2},
                                                  IngestExternalFileOptions());
    TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
  });

  user_thread1.join();
  user_thread2.join();

  ASSERT_OK(user_thread1_status);
  ASSERT_OK(user_thread2_status);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}

TEST_F(DBTest2, MultiDBParallelOpenTest) {
  const int kNumDbs = 2;
  Options options = CurrentOptions();
  std::vector<std::string> dbnames;
  for (int i = 0; i < kNumDbs; ++i) {
    dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + std::to_string(i)));
    ASSERT_OK(DestroyDB(dbnames.back(), options));
  }

  // Verify empty DBs can be created in parallel
  std::vector<std::thread> open_threads;
  std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
  options.create_if_missing = true;
  for (int i = 0; i < kNumDbs; ++i) {
    open_threads.emplace_back(
        [&](int dbnum) {
          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
        },
        i);
  }

  // Now add some data and close, so next we can verify non-empty DBs can be
  // recovered in parallel
  for (int i = 0; i < kNumDbs; ++i) {
    open_threads[i].join();
    ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
    delete dbs[i];
  }

  // Verify non-empty DBs can be recovered in parallel
  open_threads.clear();
  for (int i = 0; i < kNumDbs; ++i) {
    open_threads.emplace_back(
        [&](int dbnum) {
          ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
        },
        i);
  }

  // Wait and cleanup
  for (int i = 0; i < kNumDbs; ++i) {
    open_threads[i].join();
    delete dbs[i];
    ASSERT_OK(DestroyDB(dbnames[i], options));
  }
}

namespace {
class DummyOldStats : public Statistics {
 public:
  const char* Name() const override { return "DummyOldStats"; }
  uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; }
  void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
    num_rt++;
  }
  void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
  uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
    return 0;
  }
  void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
    num_mt++;
  }
  void histogramData(
      uint32_t /*histogram_type*/,
      ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
  std::string getHistogramString(uint32_t /*type*/) const override {
    return "";
  }
  bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
  std::string ToString() const override { return ""; }
  std::atomic<int> num_rt{0};
  std::atomic<int> num_mt{0};
};
}  // anonymous namespace

TEST_F(DBTest2, OldStatsInterface) {
  DummyOldStats* dos = new DummyOldStats();
  std::shared_ptr<Statistics> stats(dos);
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.statistics = stats;
  Reopen(options);

  ASSERT_OK(Put("foo", "bar"));
  ASSERT_EQ("bar", Get("foo"));
  ASSERT_OK(Flush());
  ASSERT_EQ("bar", Get("foo"));

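  // Both legacy entry points, recordTick() and measureTime(), should have
  // been exercised by the writes, reads, and flush above.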
  ASSERT_GT(dos->num_rt, 0);
  ASSERT_GT(dos->num_mt, 0);
}

TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
  const Snapshot* ss = db_->GetSnapshot();

  for (auto h : handles_) {
    db_->DestroyColumnFamilyHandle(h);
  }
  handles_.clear();

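  // Close() should fail while the snapshot is still unreleased.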
  ASSERT_NOK(db_->Close());
  db_->ReleaseSnapshot(ss);
  ASSERT_OK(db_->Close());
  delete db_;
  db_ = nullptr;
}

TEST_F(DBTest2, PrefixBloomReseek) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
  bbto.whole_key_filtering = false;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  // Construct two L1 files with keys:
  // f1:[aaa1 ccc1] f2:[ddd0]
  ASSERT_OK(Put("aaa1", ""));
  ASSERT_OK(Put("ccc1", ""));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("ddd0", ""));
  ASSERT_OK(Flush());
  CompactRangeOptions cro;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

  ASSERT_OK(Put("bbb1", ""));

  Iterator* iter = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter->status());

  // When seeking into f1, the iterator checks the bloom filter, which causes
  // the file iterator to be invalidated, and the cursor is placed in f2, with
  // the next key being "ddd0".
  iter->Seek("bbb1");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("bbb1", iter->key().ToString());

  // Reseek ccc1, the L1 iterator needs to go back to f1 and reseek.
  iter->Seek("ccc1");
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ("ccc1", iter->key().ToString());

  delete iter;
}

TEST_F(DBTest2, PrefixBloomFilteredOut) {
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.prefix_extractor.reset(NewCappedPrefixTransform(3));
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
  bbto.whole_key_filtering = false;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  // Construct two L1 files with keys:
  // f1:[aaa1 ccc1] f2:[ddd0]
  ASSERT_OK(Put("aaa1", ""));
  ASSERT_OK(Put("ccc1", ""));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("ddd0", ""));
  ASSERT_OK(Flush());
  CompactRangeOptions cro;
  cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

  Iterator* iter = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter->status());

  // The seek key's prefix is filtered out by f1's bloom filter.
  // This is just one of several valid positions following the contract.
  // Positioning to ccc1 or ddd0 would also be valid. This is just to validate
  // the behavior of the current implementation. If the underlying
  // implementation changes, the test might fail here.
  iter->Seek("bbb1");
  ASSERT_OK(iter->status());
  ASSERT_FALSE(iter->Valid());

  delete iter;
}

TEST_F(DBTest2, RowCacheSnapshot) {
  Options options = CurrentOptions();
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  options.row_cache = NewLRUCache(8 * 8192);
  DestroyAndReopen(options);

  ASSERT_OK(Put("foo", "bar1"));

  const Snapshot* s1 = db_->GetSnapshot();

  ASSERT_OK(Put("foo", "bar2"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("foo2", "bar"));
  const Snapshot* s2 = db_->GetSnapshot();
  ASSERT_OK(Put("foo3", "bar"));
  const Snapshot* s3 = db_->GetSnapshot();

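  // Repeated reads that see the same version of "foo" should hit the row
  // cache, while the first read at each distinct visible version (the latest
  // "bar2" and s1's "bar1") is a miss that populates it.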
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
  ASSERT_EQ(Get("foo"), "bar2");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
  ASSERT_EQ(Get("foo"), "bar2");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
  ASSERT_EQ(Get("foo", s1), "bar1");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
  ASSERT_EQ(Get("foo", s2), "bar2");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
  ASSERT_EQ(Get("foo", s1), "bar1");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
  ASSERT_EQ(Get("foo", s3), "bar2");
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);

  db_->ReleaseSnapshot(s1);
  db_->ReleaseSnapshot(s2);
  db_->ReleaseSnapshot(s3);
}

// When the DB is reopened with multiple column families, the manifest file
// is written after the first CF is flushed, and it is written again after
// each subsequent flush. If the DB crashes between the flushes, the
// already-flushed CFs will have advanced past the latest log file, and we
// then require that log file not to be corrupted, otherwise a corruption
// is reported. This test exercises failures injected during that recovery.
TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
  const std::vector<std::string> sync_points = {
      "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
      "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
  for (const auto& test_sync_point : sync_points) {
    Options options = CurrentOptions();
    // First destroy original db to ensure a clean start.
    DestroyAndReopen(options);
    options.create_if_missing = true;
    options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
    CreateAndReopenWithCF({"pikachu"}, options);
    ASSERT_OK(Put("foo", "bar"));
    ASSERT_OK(Flush());
    ASSERT_OK(Put(1, "foo", "bar"));
    ASSERT_OK(Flush(1));
    ASSERT_OK(Put("foo", "bar"));
    ASSERT_OK(Put(1, "foo", "bar"));
    // The value is large enough to be divided to two blocks.
    std::string large_value(400, ' ');
    ASSERT_OK(Put("foo1", large_value));
    ASSERT_OK(Put("foo2", large_value));
    Close();

    // Corrupt the log file in the middle, so that it is not corrupted
    // in the tail.
    std::vector<std::string> filenames;
    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
    for (const auto& f : filenames) {
      uint64_t number;
      FileType type;
      if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) {
        std::string fname = dbname_ + "/" + f;
        std::string file_content;
        ASSERT_OK(ReadFileToString(env_, fname, &file_content));
        file_content[400] = 'h';
        file_content[401] = 'a';
        ASSERT_OK(WriteStringToFile(env_, file_content, fname));
        break;
      }
    }

    // Reopen and freeze the file system after the first manifest write.
    FaultInjectionTestEnv fit_env(options.env);
    options.env = &fit_env;
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        test_sync_point,
        [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    ASSERT_NOK(TryReopenWithColumnFamilies(
        {kDefaultColumnFamilyName, "pikachu"}, options));
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

    fit_env.SetFilesystemActive(true);
    // If we continue using the fault injection Env, it complains about
    // something when renaming the CURRENT file, which is not expected.
    // This needs further investigation.
    options.env = env_;
    ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
                                          options));
  }
}

TEST_F(DBTest2, SeekFileRangeDeleteTail) {
  Options options = CurrentOptions();
  options.prefix_extractor.reset(NewCappedPrefixTransform(1));
  options.num_levels = 3;
  DestroyAndReopen(options);

  ASSERT_OK(Put("a", "a"));
  const Snapshot* s1 = db_->GetSnapshot();
  ASSERT_OK(
      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
  ASSERT_OK(Put("b", "a"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("x", "a"));
  ASSERT_OK(Put("z", "a"));
  ASSERT_OK(Flush());

  CompactRangeOptions cro;
  cro.change_level = true;
  cro.target_level = 2;
  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));

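  // "e" is covered by the range tombstone ["a", "f"), so a total-order
  // Seek("e") should land on the next live key, "x".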
  {
    ReadOptions ro;
    ro.total_order_seek = true;
    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
    ASSERT_OK(iter->status());
    iter->Seek("e");
    ASSERT_TRUE(iter->Valid());
    ASSERT_EQ("x", iter->key().ToString());
  }
  db_->ReleaseSnapshot(s1);
}

TEST_F(DBTest2, BackgroundPurgeTest) {
  Options options = CurrentOptions();
  options.write_buffer_manager =
      std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
  options.avoid_unnecessary_blocking_io = true;
  DestroyAndReopen(options);
  size_t base_value = options.write_buffer_manager->memory_usage();

  ASSERT_OK(Put("a", "a"));
  Iterator* iter = db_->NewIterator(ReadOptions());
  ASSERT_OK(iter->status());
  ASSERT_OK(Flush());
  size_t value = options.write_buffer_manager->memory_usage();
  ASSERT_GT(value, base_value);

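  // With avoid_unnecessary_blocking_io, deleting the iterator defers freeing
  // the obsolete memtable to a background purge job. Block the only
  // HIGH-priority thread first so the purge cannot run yet.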
  db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
  test::SleepingBackgroundTask sleeping_task_after;
  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
                          &sleeping_task_after, Env::Priority::HIGH);
  delete iter;

  Env::Default()->SleepForMicroseconds(100000);
  value = options.write_buffer_manager->memory_usage();
  ASSERT_GT(value, base_value);

  sleeping_task_after.WakeUp();
  sleeping_task_after.WaitUntilDone();

  test::SleepingBackgroundTask sleeping_task_after2;
  db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
                          &sleeping_task_after2, Env::Priority::HIGH);
  sleeping_task_after2.WakeUp();
  sleeping_task_after2.WaitUntilDone();

  value = options.write_buffer_manager->memory_usage();
  ASSERT_EQ(base_value, value);
}

TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  options.max_manifest_file_size = 10;
  options.create_if_missing = true;
  CreateAndReopenWithCF({"pikachu"}, options);
  ASSERT_EQ(2, handles_.size());

  ASSERT_OK(Put("foo", "value"));
  const int kL0Files = options.level0_file_num_compaction_trigger;
  for (int i = 0; i < kL0Files; ++i) {
    ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
    ASSERT_OK(Flush(/*cf=*/1));
  }

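  // Flush the default CF (which switches its memtable) while the compactions
  // triggered by the L0 files in CF 1 above may be rolling over the tiny
  // manifest.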
  port::Thread thread([&]() { ASSERT_OK(Flush()); });
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  thread.join();
}

TEST_F(DBTest2, SameSmallestInSameLevel) {
  // This test validates fractional cascading logic when several files at
  // one level contain only the same user key.
  Options options = CurrentOptions();
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
  DestroyAndReopen(options);

  ASSERT_OK(Put("key", "1"));
  ASSERT_OK(Put("key", "2"));
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
  ASSERT_OK(Flush());
  CompactRangeOptions cro;
  cro.change_level = true;
  cro.target_level = 2;
  ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
                                   nullptr));

  ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
  ASSERT_OK(Flush());
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
  ASSERT_OK(Flush());
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
  ASSERT_OK(Flush());
  ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("0,4,1", FilesPerLevel());

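  // Each of the four L1 files contains only merge operands for the single
  // user key "key", so the Get has to combine them with the older data in
  // the L2 file.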
  ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
}

TEST_F(DBTest2, FileConsistencyCheckInOpen) {
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Flush());

  SyncPoint::GetInstance()->SetCallBack(
      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
        Status* ret_s = static_cast<Status*>(arg);
        *ret_s = Status::Corruption("fcc");
      });
  SyncPoint::GetInstance()->EnableProcessing();

  Options options = CurrentOptions();
  options.force_consistency_checks = true;
  ASSERT_NOK(TryReopen(options));

  SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
  // create a DB with block prefix index
  BlockBasedTableOptions table_options;
  Options options = CurrentOptions();
  table_options.block_size = 300;
  table_options.index_type = BlockBasedTableOptions::kHashSearch;
  table_options.index_shortening =
      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.prefix_extractor.reset(NewFixedPrefixTransform(1));

  Reopen(options);

  Random rnd(301);
  std::string large_value = rnd.RandomString(500);

  ASSERT_OK(Put("a1", large_value));
  ASSERT_OK(Put("x1", large_value));
  ASSERT_OK(Put("y1", large_value));
  ASSERT_OK(Flush());

  {
    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
    ASSERT_OK(iterator->status());
    iterator->SeekForPrev("x3");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("x1", iterator->key().ToString());

    iterator->SeekForPrev("a3");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("a1", iterator->key().ToString());

    iterator->SeekForPrev("y3");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("y1", iterator->key().ToString());

    // Query more than one non-existing prefix to cover the case both
    // of empty hash bucket and hash bucket conflict.
    iterator->SeekForPrev("b1");
    // Result should be not valid or "a1".
    if (iterator->Valid()) {
      ASSERT_EQ("a1", iterator->key().ToString());
    }

    iterator->SeekForPrev("c1");
    // Result should be not valid or "a1".
    if (iterator->Valid()) {
      ASSERT_EQ("a1", iterator->key().ToString());
    }

    iterator->SeekForPrev("d1");
    // Result should be not valid or "a1".
    if (iterator->Valid()) {
      ASSERT_EQ("a1", iterator->key().ToString());
    }

    iterator->SeekForPrev("y3");
    ASSERT_TRUE(iterator->Valid());
    ASSERT_EQ("y1", iterator->key().ToString());
  }
}

TEST_F(DBTest2, PartitionedIndexPrefetchFailure) {
  Options options = last_options_;
  options.env = env_;
  options.max_open_files = 20;
  BlockBasedTableOptions bbto;
  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  bbto.metadata_block_size = 128;
  bbto.block_size = 128;
  bbto.block_cache = NewLRUCache(16777216);
  bbto.cache_index_and_filter_blocks = true;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  DestroyAndReopen(options);

  // Force no table cache so every read will preload the SST file.
  dbfull()->TEST_table_cache()->SetCapacity(0);
  bbto.block_cache->SetCapacity(0);

  Random rnd(301);
  for (int i = 0; i < 4096; i++) {
    ASSERT_OK(Put(Key(i), rnd.RandomString(32)));
  }
  ASSERT_OK(Flush());

  // Try different random failures in table open for 300 times.
  for (int i = 0; i < 300; i++) {
    env_->num_reads_fails_ = 0;
    env_->rand_reads_fail_odd_ = 8;
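    // rand_reads_fail_odd_ = 8 injects a failure for roughly one in eight
    // random reads while the table is re-opened for this Get. If any read
    // failed, the Get must fail; otherwise it must succeed.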

    std::string value;
    Status s = dbfull()->Get(ReadOptions(), Key(1), &value);
    if (env_->num_reads_fails_ > 0) {
      ASSERT_NOK(s);
    } else {
      ASSERT_OK(s);
    }
  }

  env_->rand_reads_fail_odd_ = 0;
}

TEST_F(DBTest2, ChangePrefixExtractor) {
  for (bool use_partitioned_filter : {true, false}) {
    // create a DB with block prefix index
    BlockBasedTableOptions table_options;
    Options options = CurrentOptions();

    // Sometimes the filter is checked based on the upper bound. Assert the
    // counters for that case; otherwise, only check data correctness.
    bool expect_filter_check = !use_partitioned_filter;
    table_options.partition_filters = use_partitioned_filter;
    if (use_partitioned_filter) {
      table_options.index_type =
          BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
    }
    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));

    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
    options.statistics = CreateDBStatistics();

    options.prefix_extractor.reset(NewFixedPrefixTransform(2));
    DestroyAndReopen(options);

    Random rnd(301);

    ASSERT_OK(Put("aa", ""));
    ASSERT_OK(Put("xb", ""));
    ASSERT_OK(Put("xx1", ""));
    ASSERT_OK(Put("xz1", ""));
    ASSERT_OK(Put("zz", ""));
    ASSERT_OK(Flush());

    // After reopening the DB with prefix size 2 => 1, the prefix extractor
    // won't take effect unless using it would not change the results for
    // the given upper bound and seek key.
    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
    Reopen(options);

    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
      ASSERT_OK(iterator->status());
      iterator->Seek("xa");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }

      iterator->Seek("xz");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xz1", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }
    }

    std::string ub_str = "xg9";
    Slice ub(ub_str);
    ReadOptions ro;
    ro.iterate_upper_bound = &ub;

    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      ASSERT_OK(iterator->status());

      // SeekForPrev() never uses the prefix bloom filter if the prefix
      // extractor has changed.
      iterator->SeekForPrev("xg0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }
    }

    ub_str = "xx9";
    ub = Slice(ub_str);
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      ASSERT_OK(iterator->status());

      iterator->Seek("x");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }

      iterator->Seek("xx0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xx1", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }
    }

    CompactRangeOptions compact_range_opts;
    compact_range_opts.bottommost_level_compaction =
        BottommostLevelCompaction::kForce;
    ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
    ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));

    // Re-execute similar queries after a full compaction
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));

      iterator->Seek("x");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }

      iterator->Seek("xg");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xx1", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }

      iterator->Seek("xz");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xz1", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }

      ASSERT_OK(iterator->status());
    }
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));

      iterator->SeekForPrev("xx0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }

      iterator->Seek("xx0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xx1", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }

      ASSERT_OK(iterator->status());
    }

    ub_str = "xg9";
    ub = Slice(ub_str);
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      iterator->SeekForPrev("xg0");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("xb", iterator->key().ToString());
      if (expect_filter_check) {
        EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH));
      }
      ASSERT_OK(iterator->status());
    }
  }
}

TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
  // create a DB with block prefix index
  BlockBasedTableOptions table_options;
  Options options = CurrentOptions();
  table_options.block_size = 300;
  table_options.index_type = BlockBasedTableOptions::kHashSearch;
  table_options.index_shortening =
      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
  options.level0_file_num_compaction_trigger = 8;

  Reopen(options);

  ASSERT_OK(Put("b1", "ok"));
  ASSERT_OK(Flush());

  // Flush several files so that the chance that the hash bucket
  // for "b" is empty in at least one of the files is high.
  ASSERT_OK(Put("a1", ""));
  ASSERT_OK(Put("c1", ""));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("a2", ""));
  ASSERT_OK(Put("c2", ""));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("a3", ""));
  ASSERT_OK(Put("c3", ""));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("a4", ""));
  ASSERT_OK(Put("c4", ""));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("a5", ""));
  ASSERT_OK(Put("c5", ""));
  ASSERT_OK(Flush());

  ASSERT_EQ("ok", Get("b1"));
}

TEST_F(DBTest2, AutoPrefixMode1) {
  do {
    // create a DB with block prefix index
    Options options = CurrentOptions();
    BlockBasedTableOptions table_options =
        *options.table_factory->GetOptions<BlockBasedTableOptions>();
    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
    options.prefix_extractor.reset(NewFixedPrefixTransform(1));
    options.statistics = CreateDBStatistics();

    Reopen(options);

    Random rnd(301);
    std::string large_value = rnd.RandomString(500);

    ASSERT_OK(Put("a1", large_value));
    ASSERT_OK(Put("x1", large_value));
    ASSERT_OK(Put("y1", large_value));
    ASSERT_OK(Flush());

    ReadOptions ro;
    ro.total_order_seek = false;
    ro.auto_prefix_mode = true;
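    // auto_prefix_mode lets the iterator use the prefix filter only when the
    // seek key and the upper bound make that safe. The checks below verify
    // when the filter is consulted (hit/miss tickers) and when it is skipped.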

    const auto hit_stat = options.num_levels == 1
                              ? LAST_LEVEL_SEEK_FILTER_MATCH
                              : NON_LAST_LEVEL_SEEK_FILTER_MATCH;
    const auto miss_stat = options.num_levels == 1
                               ? LAST_LEVEL_SEEK_FILTERED
                               : NON_LAST_LEVEL_SEEK_FILTERED;
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      iterator->Seek("b1");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("x1", iterator->key().ToString());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());
    }

    Slice ub;
    ro.iterate_upper_bound = &ub;

    ub = "b9";
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      iterator->Seek("b1");
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());
    }

    ub = "z";
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      iterator->Seek("b1");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("x1", iterator->key().ToString());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());
    }

    ub = "c";
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      iterator->Seek("b1");
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());
    }

    ub = "c1";
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      iterator->Seek("b1");
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());
    }

    // The same queries without recreating iterator
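    // (ReadOptions holds a pointer to ub, so updating ub between Seeks takes
    // effect on the existing iterator)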
    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));

      ub = "b9";
      iterator->Seek("b1");
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());

      ub = "z";
      iterator->Seek("b1");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("x1", iterator->key().ToString());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));

      ub = "c";
      iterator->Seek("b1");
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat));

      ub = "b9";
      iterator->SeekForPrev("b1");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("a1", iterator->key().ToString());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));

      ub = "zz";
      iterator->SeekToLast();
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("y1", iterator->key().ToString());

      iterator->SeekToFirst();
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("a1", iterator->key().ToString());
    }

    // Similar, now with reverse comparator.
    // Technically, we are violating axiom 2 of prefix_extractors, but
    // that axiom should be revised because of major use-cases using
    // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME)
    options.comparator = ReverseBytewiseComparator();
    options.prefix_extractor.reset(NewFixedPrefixTransform(1));

    DestroyAndReopen(options);

    ASSERT_OK(Put("a1", large_value));
    ASSERT_OK(Put("x1", large_value));
    ASSERT_OK(Put("y1", large_value));
    ASSERT_OK(Flush());

    {
      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));

      ub = "b1";
      iterator->Seek("b9");
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());

      ub = "b1";
      iterator->Seek("z");
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("y1", iterator->key().ToString());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));

      ub = "b1";
      iterator->Seek("c");
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));

      ub = "b";
      iterator->Seek("c9");
      ASSERT_FALSE(iterator->Valid());
      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
      // is "correctly" implemented.
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));

      ub = "a";
      iterator->Seek("b9");
      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
      // is "correctly" implemented.
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("a1", iterator->key().ToString());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));

      ub = "b";
      iterator->Seek("a");
      ASSERT_FALSE(iterator->Valid());
      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
      // matches BytewiseComparator::IsSameLengthImmediateSuccessor. The upper
      // bound comparing before the seek key prevents a real bug from
      // surfacing.
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));

      ub = "b1";
      iterator->SeekForPrev("b9");
      ASSERT_TRUE(iterator->Valid());
      // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
      // is "correctly" implemented.
      ASSERT_EQ("x1", iterator->key().ToString());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));

      ub = "a";
      iterator->SeekToLast();
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("a1", iterator->key().ToString());

      iterator->SeekToFirst();
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("y1", iterator->key().ToString());
    }

    // Now something a bit different, related to "short" keys that
    // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode.
    options.comparator = BytewiseComparator();
    for (const auto config : {"fixed:2", "capped:2"}) {
      ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config,
                                                 &options.prefix_extractor));

      // FIXME: kHashSearch, etc. requires all keys be InDomain
      if (StartsWith(config, "fixed") &&
          (table_options.index_type == BlockBasedTableOptions::kHashSearch ||
           StartsWith(options.memtable_factory->Name(), "Hash"))) {
        continue;
      }
      DestroyAndReopen(options);

      const char* a_end_stuff = "a\xffXYZ";
      const char* b_begin_stuff = "b\x00XYZ";
      ASSERT_OK(Put("a", large_value));
      ASSERT_OK(Put("b", large_value));
      ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value));
      ASSERT_OK(Put("c", large_value));
      ASSERT_OK(Flush());

      // control case showing a valid optimization with auto_prefix_mode
      ub = Slice(a_end_stuff, 4);
      ro.iterate_upper_bound = &ub;

      std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
      iterator->Seek(Slice(a_end_stuff, 2));
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());

      // test case that cannot be validly optimized with auto_prefix_mode
      ub = Slice(b_begin_stuff, 2);
      ro.iterate_upper_bound = &ub;

      iterator->Seek(Slice(a_end_stuff, 2));
      // !!! BUG !!! See "BUG" section of auto_prefix_mode.
      ASSERT_FALSE(iterator->Valid());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());

      // To prove that this is the wrong result, use total order seek
      ReadOptions tos_ro = ro;
      tos_ro.total_order_seek = true;
      tos_ro.auto_prefix_mode = false;
      iterator.reset(db_->NewIterator(tos_ro));
      iterator->Seek(Slice(a_end_stuff, 2));
      ASSERT_TRUE(iterator->Valid());
      ASSERT_EQ("b", iterator->key().ToString());
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat));
      EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat));
      ASSERT_OK(iterator->status());
    }
  } while (ChangeOptions(kSkipPlainTable));
}

class RenameCurrentTest : public DBTestBase,
                          public testing::WithParamInterface<std::string> {
 public:
  RenameCurrentTest()
      : DBTestBase("rename_current_test", /*env_do_fsync=*/true),
        sync_point_(GetParam()) {}

  ~RenameCurrentTest() override {}

  void SetUp() override {
    env_->no_file_overwrite_.store(true, std::memory_order_release);
  }

  void TearDown() override {
    env_->no_file_overwrite_.store(false, std::memory_order_release);
  }

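  // Injects an IOError at the parameterized sync point, i.e. either right
  // before or right after the rename that installs the CURRENT file in
  // SetCurrentFile().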
  void SetupSyncPoints() {
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) {
      Status* s = reinterpret_cast<Status*>(arg);
      assert(s);
      *s = Status::IOError("Injected IO error.");
    });
  }

  const std::string sync_point_;
};

INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest,
                        ::testing::Values("SetCurrentFile:BeforeRename",
                                          "SetCurrentFile:AfterRename"));

TEST_P(RenameCurrentTest, Open) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.create_if_missing = true;
  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  Status s = TryReopen(options);
  ASSERT_NOK(s);

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
}

TEST_P(RenameCurrentTest, Flush) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.max_manifest_file_size = 1;
  options.create_if_missing = true;
  Reopen(options);
  ASSERT_OK(Put("key", "value"));
  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_NOK(Flush());

  ASSERT_NOK(Put("foo", "value"));

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
  ASSERT_EQ("value", Get("key"));
  ASSERT_EQ("NOT_FOUND", Get("foo"));
}

TEST_P(RenameCurrentTest, Compaction) {
  Destroy(last_options_);
  Options options = GetDefaultOptions();
  options.max_manifest_file_size = 1;
  options.create_if_missing = true;
  Reopen(options);
  ASSERT_OK(Put("a", "a_value"));
  ASSERT_OK(Put("c", "c_value"));
  ASSERT_OK(Flush());

  ASSERT_OK(Put("b", "b_value"));
  ASSERT_OK(Put("d", "d_value"));
  ASSERT_OK(Flush());

  SetupSyncPoints();
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
                               /*end=*/nullptr));

  ASSERT_NOK(Put("foo", "value"));

  SyncPoint::GetInstance()->DisableProcessing();
  Reopen(options);
  ASSERT_EQ("NOT_FOUND", Get("foo"));
  ASSERT_EQ("d_value", Get("d"));
}

TEST_F(DBTest2, LastLevelTemperature) {
  class TestListener : public EventListener {
   public:
    void OnFileReadFinish(const FileOperationInfo& info) override {
      UpdateFileTemperature(info);
    }

    void OnFileWriteFinish(const FileOperationInfo& info) override {
      UpdateFileTemperature(info);
    }

    void OnFileFlushFinish(const FileOperationInfo& info) override {
      UpdateFileTemperature(info);
    }

    void OnFileSyncFinish(const FileOperationInfo& info) override {
      UpdateFileTemperature(info);
    }

    void OnFileCloseFinish(const FileOperationInfo& info) override {
      UpdateFileTemperature(info);
    }

    bool ShouldBeNotifiedOnFileIO() override { return true; }

    std::unordered_map<uint64_t, Temperature> file_temperatures;

   private:
    void UpdateFileTemperature(const FileOperationInfo& info) {
      auto filename = GetFileName(info.path);
      uint64_t number;
      FileType type;
      ASSERT_TRUE(ParseFileName(filename, &number, &type));
      if (type == kTableFile) {
        MutexLock l(&mutex_);
        auto ret = file_temperatures.insert({number, info.temperature});
        if (!ret.second) {
          // the temperature of a given file should be the same for all events
          ASSERT_TRUE(ret.first->second == info.temperature);
        }
      }
    }

    std::string GetFileName(const std::string& fname) {
      auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
      // workaround for Windows only: the file path could contain both the
      // Windows FilePathSeparator and '/'
      filename = filename.substr(filename.find_last_of('/') + 1);
      return filename;
    }

    port::Mutex mutex_;
  };

  const int kNumLevels = 7;
  const int kLastLevel = kNumLevels - 1;

  auto* listener = new TestListener();

  Options options = CurrentOptions();
  options.bottommost_temperature = Temperature::kWarm;
  options.level0_file_num_compaction_trigger = 2;
  options.level_compaction_dynamic_level_bytes = true;
  options.num_levels = kNumLevels;
  options.statistics = CreateDBStatistics();
  options.listeners.emplace_back(listener);
  Reopen(options);

  auto size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kHot);
  ASSERT_EQ(size, 0);

  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  get_iostats_context()->Reset();
  IOStatsContext* iostats = get_iostats_context();

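  // The compaction above pushed all data to the last level, so the single
  // remaining file should have the configured kWarm temperature.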
  ColumnFamilyMetaData metadata;
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(1, metadata.file_count);
  SstFileMetaData meta = metadata.levels[kLastLevel].files[0];
  ASSERT_EQ(Temperature::kWarm, meta.temperature);
  uint64_t number;
  FileType type;
  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);

  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);

  ASSERT_EQ("bar", Get("foo"));

  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
  ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);

  // non-bottommost file still has unknown temperature
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_EQ("bar", Get("bar"));
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
  ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);

  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  meta = metadata.levels[0].files[0];
  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);

  meta = metadata.levels[kLastLevel].files[0];
  ASSERT_EQ(Temperature::kWarm, meta.temperature);
  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);

  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);

  // reopen and check the information is persisted
  Reopen(options);
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  meta = metadata.levels[0].files[0];
  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);

  meta = metadata.levels[kLastLevel].files[0];
  ASSERT_EQ(Temperature::kWarm, meta.temperature);
  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);

  // check other non-existent temperatures
  size = GetSstSizeHelper(Temperature::kHot);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kCold);
  ASSERT_EQ(size, 0);
  std::string prop;
  ASSERT_TRUE(dbfull()->GetProperty(
      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
      &prop));
  ASSERT_EQ(std::atoi(prop.c_str()), 0);

  Reopen(options);
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  meta = metadata.levels[0].files[0];
  ASSERT_EQ(Temperature::kUnknown, meta.temperature);
  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);

  meta = metadata.levels[kLastLevel].files[0];
  ASSERT_EQ(Temperature::kWarm, meta.temperature);
  ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
  ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
}

TEST_F(DBTest2, LastLevelTemperatureUniversal) {
  const int kTriggerNum = 3;
  const int kNumLevels = 5;
  const int kBottommostLevel = kNumLevels - 1;
  Options options = CurrentOptions();
  options.compaction_style = kCompactionStyleUniversal;
  options.level0_file_num_compaction_trigger = kTriggerNum;
  options.num_levels = kNumLevels;
  options.statistics = CreateDBStatistics();
  DestroyAndReopen(options);

  auto size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kHot);
  ASSERT_EQ(size, 0);
  get_iostats_context()->Reset();
  IOStatsContext* iostats = get_iostats_context();

  for (int i = 0; i < kTriggerNum; i++) {
    ASSERT_OK(Put("foo", "bar"));
    ASSERT_OK(Put("bar", "bar"));
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ColumnFamilyMetaData metadata;
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(1, metadata.file_count);
  ASSERT_EQ(Temperature::kUnknown,
            metadata.levels[kBottommostLevel].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
  ASSERT_EQ("bar", Get("foo"));

  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
  ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
  ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);

  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);

  // Update bottommost temperature
  options.bottommost_temperature = Temperature::kWarm;
  Reopen(options);
  db_->GetColumnFamilyMetaData(&metadata);
  // Should not impact existing ones
  ASSERT_EQ(Temperature::kUnknown,
            metadata.levels[kBottommostLevel].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);

  // newly generated file should have the new setting
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(1, metadata.file_count);
  ASSERT_EQ(Temperature::kWarm,
            metadata.levels[kBottommostLevel].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
  ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
  ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);

  // non-bottommost file still has unknown temperature
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(2, metadata.file_count);
  ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);

  // check other non-existent temperatures
  size = GetSstSizeHelper(Temperature::kHot);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kCold);
  ASSERT_EQ(size, 0);
  std::string prop;
  ASSERT_TRUE(dbfull()->GetProperty(
      DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
      &prop));
  ASSERT_EQ(std::atoi(prop.c_str()), 0);

  // Update bottommost temperature dynamically with SetOptions
  auto s = db_->SetOptions({{"last_level_temperature", "kCold"}});
  ASSERT_OK(s);
  ASSERT_EQ(db_->GetOptions().bottommost_temperature, Temperature::kCold);
  db_->GetColumnFamilyMetaData(&metadata);
  // Should not impact the existing files
  ASSERT_EQ(Temperature::kWarm,
            metadata.levels[kBottommostLevel].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);
  size = GetSstSizeHelper(Temperature::kCold);
  ASSERT_EQ(size, 0);

  // newly generated files should have the new setting
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
  db_->GetColumnFamilyMetaData(&metadata);
  ASSERT_EQ(1, metadata.file_count);
  ASSERT_EQ(Temperature::kCold,
            metadata.levels[kBottommostLevel].files[0].temperature);
  size = GetSstSizeHelper(Temperature::kUnknown);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_EQ(size, 0);
  size = GetSstSizeHelper(Temperature::kCold);
  ASSERT_GT(size, 0);

  // kLastTemperature is an invalid temperature
  options.bottommost_temperature = Temperature::kLastTemperature;
  s = TryReopen(options);
  ASSERT_TRUE(s.IsIOError());
}

TEST_F(DBTest2, LastLevelStatistics) {
  Options options = CurrentOptions();
  options.bottommost_temperature = Temperature::kWarm;
  options.level0_file_num_compaction_trigger = 2;
  options.level_compaction_dynamic_level_bytes = true;
  options.statistics = CreateDBStatistics();
  Reopen(options);

  // generate 1 sst on level 0
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_EQ("bar", Get("bar"));

  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0);
  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0);
  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0);
  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0);

  // 2nd flush to trigger compaction
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("bar", Get("bar"));

  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
            options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
            options.statistics->getTickerCount(WARM_FILE_READ_COUNT));

  auto pre_bytes =
      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES);
  auto pre_count =
      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);

  // 3rd flush to generate 1 sst on level 0
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_EQ("bar", Get("bar"));

  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES),
            pre_bytes);
  ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT),
            pre_count);
  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
            options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
  ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
            options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
}

TEST_F(DBTest2, CheckpointFileTemperature) {
  class NoLinkTestFS : public FileTemperatureTestFS {
    using FileTemperatureTestFS::FileTemperatureTestFS;

    IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
                      IODebugContext*) override {
      // return NotSupported to force the checkpoint to copy the file instead
      // of just linking it
      return IOStatus::NotSupported();
    }
  };
  auto test_fs = std::make_shared<NoLinkTestFS>(env_->GetFileSystem());
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
  Options options = CurrentOptions();
  options.bottommost_temperature = Temperature::kWarm;
  // set level_compaction_dynamic_level_bytes to true so compaction moves the
  // data directly to the last level, which gets the last_level_temperature
  options.level_compaction_dynamic_level_bytes = true;
  options.level0_file_num_compaction_trigger = 2;
  options.env = env.get();
  Reopen(options);

  // generate a bottommost file and a non-bottommost file
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Put("bar", "bar"));
  ASSERT_OK(Flush());
  auto size = GetSstSizeHelper(Temperature::kWarm);
  ASSERT_GT(size, 0);

  std::map<uint64_t, Temperature> temperatures;
  std::vector<LiveFileStorageInfo> infos;
  ASSERT_OK(
      dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos));
  for (auto info : infos) {
    temperatures.emplace(info.file_number, info.temperature);
  }

  test_fs->PopRequestedSstFileTemperatures();
  Checkpoint* checkpoint;
  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
  ASSERT_OK(
      checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp"));

  // check the src_temperature hints for the source files: 2 SST files, one
  // kWarm and the other kUnknown
  std::vector<std::pair<uint64_t, Temperature>> requested_temps;
  test_fs->PopRequestedSstFileTemperatures(&requested_temps);
  // Two requests
  ASSERT_EQ(requested_temps.size(), 2);
  std::set<uint64_t> distinct_requests;
  for (const auto& requested_temp : requested_temps) {
    // Matching manifest temperatures
    ASSERT_EQ(temperatures.at(requested_temp.first), requested_temp.second);
    distinct_requests.insert(requested_temp.first);
  }
  // Each request to distinct file
  ASSERT_EQ(distinct_requests.size(), requested_temps.size());

  delete checkpoint;
  Close();
}

TEST_F(DBTest2, FileTemperatureManifestFixup) {
  auto test_fs = std::make_shared<FileTemperatureTestFS>(env_->GetFileSystem());
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
  Options options = CurrentOptions();
  options.bottommost_temperature = Temperature::kWarm;
  // set level_compaction_dynamic_level_bytes to true so compaction moves the
  // data directly to the last level, which gets the last_level_temperature
  options.level_compaction_dynamic_level_bytes = true;
  options.level0_file_num_compaction_trigger = 2;
  options.env = env.get();
  std::vector<std::string> cfs = {/*"default",*/ "test1", "test2"};
  CreateAndReopenWithCF(cfs, options);
  // Needed for later re-opens (weird)
  cfs.insert(cfs.begin(), kDefaultColumnFamilyName);

  // Generate a bottommost file in all CFs
  for (int cf = 0; cf < 3; ++cf) {
    ASSERT_OK(Put(cf, "a", "val"));
    ASSERT_OK(Put(cf, "c", "val"));
    ASSERT_OK(Flush(cf));
    ASSERT_OK(Put(cf, "b", "val"));
    ASSERT_OK(Put(cf, "d", "val"));
    ASSERT_OK(Flush(cf));
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // verify
  ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);

  // Generate a non-bottommost file in all CFs
  for (int cf = 0; cf < 3; ++cf) {
    ASSERT_OK(Put(cf, "e", "val"));
    ASSERT_OK(Flush(cf));
  }

  // re-verify
  ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
  // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);

  // Now change FS temperature on bottommost file(s) to kCold
  std::map<uint64_t, Temperature> current_temps;
  test_fs->CopyCurrentSstFileTemperatures(&current_temps);
  for (auto e : current_temps) {
    if (e.second == Temperature::kWarm) {
      test_fs->OverrideSstFileTemperature(e.first, Temperature::kCold);
    }
  }
  // Metadata not yet updated
  ASSERT_EQ(Get("a"), "val");
  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);

  // Update with Close and UpdateManifestForFilesState, but first save cf
  // descriptors
  std::vector<ColumnFamilyDescriptor> column_families;
  for (size_t i = 0; i < handles_.size(); ++i) {
    ColumnFamilyDescriptor cfdescriptor;
    handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
    column_families.push_back(cfdescriptor);
  }
  Close();
  experimental::UpdateManifestForFilesStateOptions update_opts;
  update_opts.update_temperatures = true;

  ASSERT_OK(experimental::UpdateManifestForFilesState(
      options, dbname_, column_families, update_opts));

  // Re-open and re-verify after update
  ReopenWithColumnFamilies(cfs, options);
  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
  // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);

  // Change kUnknown to kHot
  test_fs->CopyCurrentSstFileTemperatures(&current_temps);
  for (auto e : current_temps) {
    if (e.second == Temperature::kUnknown) {
      test_fs->OverrideSstFileTemperature(e.first, Temperature::kHot);
    }
  }

  // Update with Close and UpdateManifestForFilesState
  Close();
  ASSERT_OK(experimental::UpdateManifestForFilesState(
      options, dbname_, column_families, update_opts));

  // Re-open and re-verify after update
  ReopenWithColumnFamilies(cfs, options);
  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
  ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0);

  Close();
}

// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  ASSERT_OK(Put("foo", "value0"));
  Close();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  bool should_inject_error = false;
  SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::RecoverLogFiles:BeforeReadWal",
      [&](void* /*arg*/) { should_inject_error = true; });
  SyncPoint::GetInstance()->SetCallBack(
      "LogReader::ReadMore:AfterReadFile", [&](void* arg) {
        if (should_inject_error) {
          ASSERT_NE(nullptr, arg);
          *reinterpret_cast<Status*>(arg) = Status::IOError("Injected IOError");
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();
  options.avoid_flush_during_recovery = true;
  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
  Status s = TryReopen(options);
  ASSERT_TRUE(s.IsIOError());
}

TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::BackgroundCallFlush:Start:1",
        "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
       {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
        "DBImpl::BackgroundCallFlush:Start:2"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  CreateColumnFamilies({"test1"}, Options());
  ASSERT_OK(Put("foo", "bar"));

  // Create a CF while a flush is in progress; the log is synced, but the
  // closed log file is not properly synced and gets corrupted.
  port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
  CreateColumnFamilies({"test2"}, Options());
  env_->corrupt_in_sync_ = true;
  TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
  flush_thread.join();
  env_->corrupt_in_sync_ = false;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  // Reopening the DB should not corrupt anything
  Options options = CurrentOptions();
  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
  ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
}

TEST_F(DBTest2, SortL0FilesByEpochNumber) {
  Options options = CurrentOptions();
  options.num_levels = 1;
  options.compaction_style = kCompactionStyleUniversal;
  DestroyAndReopen(options);

  // Set up L0 files to be sorted by their epoch_number
  ASSERT_OK(Put("key1", "seq1"));

  SstFileWriter sst_file_writer{EnvOptions(), options};
  std::string external_file1 = dbname_ + "/test_files1.sst";
  std::string external_file2 = dbname_ + "/test_files2.sst";
  ASSERT_OK(sst_file_writer.Open(external_file1));
  ASSERT_OK(sst_file_writer.Put("key2", "seq0"));
  ASSERT_OK(sst_file_writer.Finish());
  ASSERT_OK(sst_file_writer.Open(external_file2));
  ASSERT_OK(sst_file_writer.Put("key3", "seq0"));
  ASSERT_OK(sst_file_writer.Finish());

  ASSERT_OK(Put("key4", "seq2"));
  ASSERT_OK(Flush());

  auto* handle = db_->DefaultColumnFamily();
  ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file2},
                                    IngestExternalFileOptions()));

  // To verify L0 files are sorted by epoch_number in descending order
  // instead of largest_seqno
  std::vector<FileMetaData*> level0_files = GetLevelFileMetadatas(0 /* level*/);
  ASSERT_EQ(level0_files.size(), 3);

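  // The flushed file got epoch_number 1; the two ingested files got 2 and 3
  // in ingestion order, so L0 is ordered newest-epoch-first below.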
  EXPECT_EQ(level0_files[0]->epoch_number, 3);
  EXPECT_EQ(level0_files[0]->fd.largest_seqno, 0);
  ASSERT_EQ(level0_files[0]->num_entries, 1);
  ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key3"));

  EXPECT_EQ(level0_files[1]->epoch_number, 2);
  EXPECT_EQ(level0_files[1]->fd.largest_seqno, 0);
  ASSERT_EQ(level0_files[1]->num_entries, 1);
  ASSERT_TRUE(level0_files[1]->largest.user_key() == Slice("key2"));

  EXPECT_EQ(level0_files[2]->epoch_number, 1);
  EXPECT_EQ(level0_files[2]->fd.largest_seqno, 2);
  ASSERT_EQ(level0_files[2]->num_entries, 2);
  ASSERT_TRUE(level0_files[2]->largest.user_key() == Slice("key4"));
  ASSERT_TRUE(level0_files[2]->smallest.user_key() == Slice("key1"));

  // To verify the compacted file is assigned the minimum epoch_number
  // among its input files
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));

  level0_files = GetLevelFileMetadatas(0 /* level*/);
  ASSERT_EQ(level0_files.size(), 1);
  EXPECT_EQ(level0_files[0]->epoch_number, 1);
  ASSERT_EQ(level0_files[0]->num_entries, 4);
  ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key4"));
  ASSERT_TRUE(level0_files[0]->smallest.user_key() == Slice("key1"));
}

TEST_F(DBTest2, SameEpochNumberAfterCompactRangeChangeLevel) {
  Options options = CurrentOptions();
  options.num_levels = 7;
  options.compaction_style = CompactionStyle::kCompactionStyleLevel;
  options.disable_auto_compactions = true;
  DestroyAndReopen(options);

  // Set up the file in L1 to be moved to L0 in later step of CompactRange()
  ASSERT_OK(Put("key1", "seq1"));
  ASSERT_OK(Flush());
  MoveFilesToLevel(1, 0);
  std::vector<FileMetaData*> level0_files = GetLevelFileMetadatas(0 /* level*/);
  ASSERT_EQ(level0_files.size(), 0);
  std::vector<FileMetaData*> level1_files = GetLevelFileMetadatas(1 /* level*/);
  ASSERT_EQ(level1_files.size(), 1);
  std::vector<FileMetaData*> level2_files = GetLevelFileMetadatas(2 /* level*/);
  ASSERT_EQ(level2_files.size(), 0);

  ASSERT_EQ(level1_files[0]->epoch_number, 1);

  // To verify CompactRange() moving file to L0 still keeps the file's
  // epoch_number
  CompactRangeOptions croptions;
  croptions.change_level = true;
  croptions.target_level = 0;
  ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
  level0_files = GetLevelFileMetadatas(0 /* level*/);
  level1_files = GetLevelFileMetadatas(1 /* level*/);
  ASSERT_EQ(level0_files.size(), 1);
  ASSERT_EQ(level1_files.size(), 0);

  EXPECT_EQ(level0_files[0]->epoch_number, 1);

  ASSERT_EQ(level0_files[0]->num_entries, 1);
  ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key1"));
}

TEST_F(DBTest2, RecoverEpochNumber) {
  for (bool allow_ingest_behind : {true, false}) {
    Options options = CurrentOptions();
    options.allow_ingest_behind = allow_ingest_behind;
    options.num_levels = 7;
    options.compaction_style = kCompactionStyleLevel;
    options.disable_auto_compactions = true;
    DestroyAndReopen(options);
    CreateAndReopenWithCF({"cf1"}, options);
    VersionSet* versions = dbfull()->GetVersionSet();
    assert(versions);
    const ColumnFamilyData* default_cf =
        versions->GetColumnFamilySet()->GetDefault();
    const ColumnFamilyData* cf1 =
        versions->GetColumnFamilySet()->GetColumnFamily("cf1");

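    // With allow_ingest_behind, epoch numbers are offset by
    // kReservedEpochNumberForFileIngestedBehind, which is reserved for files
    // ingested behind.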
    // Set up files in default CF to recover in later step
    ASSERT_OK(Put("key1", "epoch1"));
    ASSERT_OK(Flush());
    MoveFilesToLevel(1 /* level*/, 0 /* cf*/);
    ASSERT_OK(Put("key2", "epoch2"));
    ASSERT_OK(Flush());

    std::vector<FileMetaData*> level0_files =
        GetLevelFileMetadatas(0 /* level*/);
    ASSERT_EQ(level0_files.size(), 1);
    ASSERT_EQ(level0_files[0]->epoch_number,
              allow_ingest_behind
                  ? 2 + kReservedEpochNumberForFileIngestedBehind
                  : 2);
    ASSERT_EQ(level0_files[0]->num_entries, 1);
    ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key2"));

    std::vector<FileMetaData*> level1_files =
        GetLevelFileMetadatas(1 /* level*/);
    ASSERT_EQ(level1_files.size(), 1);
    ASSERT_EQ(level1_files[0]->epoch_number,
              allow_ingest_behind
                  ? 1 + kReservedEpochNumberForFileIngestedBehind
                  : 1);
    ASSERT_EQ(level1_files[0]->num_entries, 1);
    ASSERT_TRUE(level1_files[0]->largest.user_key() == Slice("key1"));

    // Set up files in cf1 to recover in later step
    ASSERT_OK(Put(1 /* cf */, "cf1_key1", "epoch1"));
    ASSERT_OK(Flush(1 /* cf */));

    std::vector<FileMetaData*> level0_files_cf1 =
        GetLevelFileMetadatas(0 /* level*/, 1 /* cf*/);
    ASSERT_EQ(level0_files_cf1.size(), 1);
    ASSERT_EQ(level0_files_cf1[0]->epoch_number,
              allow_ingest_behind
                  ? 1 + kReservedEpochNumberForFileIngestedBehind
                  : 1);
    ASSERT_EQ(level0_files_cf1[0]->num_entries, 1);
    ASSERT_TRUE(level0_files_cf1[0]->largest.user_key() == Slice("cf1_key1"));

    ASSERT_EQ(default_cf->GetNextEpochNumber(),
              allow_ingest_behind
                  ? 3 + kReservedEpochNumberForFileIngestedBehind
                  : 3);
    ASSERT_EQ(cf1->GetNextEpochNumber(),
              allow_ingest_behind
                  ? 2 + kReservedEpochNumberForFileIngestedBehind
                  : 2);

    // To verify epoch_number of files of different levels/CFs are
    // persisted and recovered correctly
    ReopenWithColumnFamilies({"default", "cf1"}, options);
    versions = dbfull()->GetVersionSet();
    assert(versions);
    default_cf = versions->GetColumnFamilySet()->GetDefault();
    cf1 = versions->GetColumnFamilySet()->GetColumnFamily("cf1");

    level0_files = GetLevelFileMetadatas(0 /* level*/);
    ASSERT_EQ(level0_files.size(), 1);
    EXPECT_EQ(level0_files[0]->epoch_number,
              allow_ingest_behind
                  ? 2 + kReservedEpochNumberForFileIngestedBehind
                  : 2);
    ASSERT_EQ(level0_files[0]->num_entries, 1);
    ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key2"));

    level1_files = GetLevelFileMetadatas(1 /* level*/);
    ASSERT_EQ(level1_files.size(), 1);
    EXPECT_EQ(level1_files[0]->epoch_number,
              allow_ingest_behind
                  ? 1 + kReservedEpochNumberForFileIngestedBehind
                  : 1);
    ASSERT_EQ(level1_files[0]->num_entries, 1);
    ASSERT_TRUE(level1_files[0]->largest.user_key() == Slice("key1"));

    level0_files_cf1 = GetLevelFileMetadatas(0 /* level*/, 1 /* cf*/);
    ASSERT_EQ(level0_files_cf1.size(), 1);
    EXPECT_EQ(level0_files_cf1[0]->epoch_number,
              allow_ingest_behind
                  ? 1 + kReservedEpochNumberForFileIngestedBehind
                  : 1);
    ASSERT_EQ(level0_files_cf1[0]->num_entries, 1);
    ASSERT_TRUE(level0_files_cf1[0]->largest.user_key() == Slice("cf1_key1"));

    // To verify next epoch number is recovered correctly
    EXPECT_EQ(default_cf->GetNextEpochNumber(),
              allow_ingest_behind
                  ? 3 + kReservedEpochNumberForFileIngestedBehind
                  : 3);
    EXPECT_EQ(cf1->GetNextEpochNumber(),
              allow_ingest_behind
                  ? 2 + kReservedEpochNumberForFileIngestedBehind
                  : 2);
  }
}

TEST_F(DBTest2, RenameDirectory) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  ASSERT_OK(Put("foo", "value0"));
  Close();
  auto old_dbname = dbname_;
  auto new_dbname = dbname_ + "_2";
  EXPECT_OK(env_->RenameFile(dbname_, new_dbname));
  options.create_if_missing = false;
  dbname_ = new_dbname;
  ASSERT_OK(TryReopen(options));
  ASSERT_EQ("value0", Get("foo"));
  Destroy(options);
  dbname_ = old_dbname;
}

TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) {
  const int kNumSst = 3;
  const int kLevel0Trigger = 4;
  auto options = CurrentOptions();
  options.level0_file_num_compaction_trigger = kLevel0Trigger;
  options.statistics = CreateDBStatistics();
  // Skip for now
  options.verify_sst_unique_id_in_manifest = false;
  Reopen(options);

  std::atomic_int skipped = 0;
  std::atomic_int passed = 0;
  SyncPoint::GetInstance()->SetCallBack(
      "BlockBasedTable::Open::SkippedVerifyUniqueId",
      [&](void* /*arg*/) { skipped++; });
  SyncPoint::GetInstance()->SetCallBack(
      "BlockBasedTable::Open::PassedVerifyUniqueId",
      [&](void* /*arg*/) { passed++; });
  SyncPoint::GetInstance()->EnableProcessing();

  // generate a few SSTs
  for (int i = 0; i < kNumSst; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush());
  }

  // Verification has been skipped on files so far
  EXPECT_EQ(skipped, kNumSst);
  EXPECT_EQ(passed, 0);

  // Reopen with verification
  options.verify_sst_unique_id_in_manifest = true;
  skipped = 0;
  passed = 0;
  Reopen(options);
  EXPECT_EQ(skipped, 0);
  EXPECT_EQ(passed, kNumSst);

  // Now simulate no unique id in manifest for next file
  // NOTE: this only works for loading manifest from disk,
  // not in-memory manifest, so we need to re-open below.
  SyncPoint::GetInstance()->SetCallBack(
      "VersionEdit::EncodeTo:UniqueId", [&](void* arg) {
        auto unique_id = static_cast<UniqueId64x2*>(arg);
        // remove id before writing it to manifest
        (*unique_id)[0] = 0;
        (*unique_id)[1] = 0;
      });

  // test compaction-generated SST
  for (int i = kNumSst; i < kLevel0Trigger; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ASSERT_EQ("0,1", FilesPerLevel(0));

  // Reopen (with verification)
  ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
  skipped = 0;
  passed = 0;
  Reopen(options);
  EXPECT_EQ(skipped, 1);
  EXPECT_EQ(passed, 0);
}

TEST_F(DBTest2, SstUniqueIdVerify) {
  const int kNumSst = 3;
  const int kLevel0Trigger = 4;
  auto options = CurrentOptions();
  options.level0_file_num_compaction_trigger = kLevel0Trigger;
  // Allow mismatch for now
  options.verify_sst_unique_id_in_manifest = false;
  Reopen(options);

  SyncPoint::GetInstance()->SetCallBack(
      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
        auto props = static_cast<TableProperties*>(props_vs);
        // update table property session_id to a different one, which
        // changes unique ID
        props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
      });
  SyncPoint::GetInstance()->EnableProcessing();

  // generate a few SSTs
  for (int i = 0; i < kNumSst; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush());
  }

  // Reopen with verification should report corruption
  options.verify_sst_unique_id_in_manifest = true;
  auto s = TryReopen(options);
  ASSERT_TRUE(s.IsCorruption());

  // Reopen without verification should be fine
  options.verify_sst_unique_id_in_manifest = false;
  Reopen(options);

  // test compaction-generated SST
  for (int i = kNumSst; i < kLevel0Trigger; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ASSERT_EQ("0,1", FilesPerLevel(0));

  // Reopen with verification should fail
  options.verify_sst_unique_id_in_manifest = true;
  s = TryReopen(options);
  ASSERT_TRUE(s.IsCorruption());
}

TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
  const int kNumSst = 3;
  const int kLevel0Trigger = 4;
  auto options = CurrentOptions();
  options.level0_file_num_compaction_trigger = kLevel0Trigger;
  // Allow mismatch for now
  options.verify_sst_unique_id_in_manifest = false;

  CreateAndReopenWithCF({"one", "two"}, options);

  // generate good SSTs
  for (int cf_num : {0, 2}) {
    for (int i = 0; i < kNumSst; i++) {
      for (int j = 0; j < 100; j++) {
        ASSERT_OK(Put(cf_num, Key(i * 10 + j), "value"));
      }
      ASSERT_OK(Flush(cf_num));
    }
  }

  // generate SSTs with bad unique id
  SyncPoint::GetInstance()->SetCallBack(
      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
        auto props = static_cast<TableProperties*>(props_vs);
        // update table property session_id to a different one
        props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
      });
  SyncPoint::GetInstance()->EnableProcessing();
  for (int i = 0; i < kNumSst; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(1, Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush(1));
  }

  // Reopen with verification should report corruption
  options.verify_sst_unique_id_in_manifest = true;
  auto s = TryReopenWithColumnFamilies({"default", "one", "two"}, options);
  ASSERT_TRUE(s.IsCorruption());
}

TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) {
  const auto tamper_with_uniq_id = [&](void* arg) {
    auto props = static_cast<TableProperties*>(arg);
    assert(props);
    // update table property session_id to a different one
    props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
  };

  const auto assert_db = [&](size_t expected_count,
                             const std::string& expected_v) {
    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
    size_t cnt = 0;
    for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) {
      ASSERT_EQ(std::to_string(cnt), it->key());
      ASSERT_EQ(expected_v, it->value());
    }
    ASSERT_EQ(expected_count, cnt);
  };

  const int num_l0_compaction_trigger = 8;
  const int num_l0 = num_l0_compaction_trigger - 1;
  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = num_l0_compaction_trigger;

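  // k selects which of the L0 files below gets a tampered unique id, so each
  // iteration corrupts a different file position.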
  for (int k = 0; k < num_l0; ++k) {
    // Allow mismatch for now
    options.verify_sst_unique_id_in_manifest = false;

    DestroyAndReopen(options);

    constexpr size_t num_keys_per_file = 10;
    for (int i = 0; i < num_l0; ++i) {
      for (size_t j = 0; j < num_keys_per_file; ++j) {
        ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i)));
      }
      if (i == k) {
        SyncPoint::GetInstance()->DisableProcessing();
        SyncPoint::GetInstance()->SetCallBack(
            "PropertyBlockBuilder::AddTableProperty:Start",
            tamper_with_uniq_id);
        SyncPoint::GetInstance()->EnableProcessing();
      }
      ASSERT_OK(Flush());
    }

    options.verify_sst_unique_id_in_manifest = true;
    Status s = TryReopen(options);
    ASSERT_TRUE(s.IsCorruption());

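    // Best-efforts recovery drops the corrupted file and everything newer, so
    // only the data written before file k survives.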
    options.best_efforts_recovery = true;
    Reopen(options);
    assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));

    // Reopen with regular recovery
    options.best_efforts_recovery = false;
    Reopen(options);
    assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));

    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->ClearAllCallBacks();

    for (size_t i = 0; i < num_keys_per_file; ++i) {
      ASSERT_OK(Put(std::to_string(i), "v"));
    }
    ASSERT_OK(Flush());
    Reopen(options);
    {
      for (size_t i = 0; i < num_keys_per_file; ++i) {
        ASSERT_EQ("v", Get(std::to_string(i)));
      }
    }
  }
}

TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
  Destroy(last_options_);

  Options options = CurrentOptions();
  options.max_write_buffer_size_to_maintain = 64 << 10;
  options.create_if_missing = true;
  options.disable_auto_compactions = true;
  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
  options.statistics = CreateDBStatistics();

  Reopen(options);

  constexpr uint64_t kTsU64Value = 12;

  for (uint64_t key = 0; key < 100; ++key) {
    std::string ts;
    PutFixed64(&ts, kTsU64Value);

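    // PutFixed64 encodes little-endian; reversing the bytes yields big-endian
    // so the keys sort bytewise in numeric order.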
    std::string key_str;
    PutFixed64(&key_str, key);
    std::reverse(key_str.begin(), key_str.end());
    ASSERT_OK(db_->Put(WriteOptions(), key_str, ts, "value"));
  }

  ASSERT_OK(Flush());

  constexpr bool cache_only = true;
  constexpr SequenceNumber lower_bound_seq = 0;
  auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(
      dbfull()->DefaultColumnFamily());
  assert(cfhi);
  assert(cfhi->cfd());
  SuperVersion* sv = cfhi->cfd()->GetSuperVersion();
  for (uint64_t key = 0; key < 100; ++key) {
    std::string key_str;
    PutFixed64(&key_str, key);
    std::reverse(key_str.begin(), key_str.end());
    std::string ts;
    SequenceNumber seq = kMaxSequenceNumber;
    bool found_record_for_key = false;
    bool is_blob_index = false;

    const Status s = dbfull()->GetLatestSequenceForKey(
        sv, key_str, cache_only, lower_bound_seq, &seq, &ts,
        &found_record_for_key, &is_blob_index);
    ASSERT_OK(s);
    std::string expected_ts;
    PutFixed64(&expected_ts, kTsU64Value);
    ASSERT_EQ(expected_ts, ts);
    ASSERT_TRUE(found_record_for_key);
    ASSERT_FALSE(is_blob_index);
  }

  // Verify that there were no reads to SST files.
  ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0));
}

}  // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  RegisterCustomObjects(argc, argv);
  return RUN_ALL_TESTS();
}