Revision 0bb555630f5e85a1471843f8dc0dabec297c1c49 authored by Siying Dong on 08 April 2019, 20:24:29 UTC, committed by Facebook Github Bot on 08 April 2019, 20:32:06 UTC
Summary:
Create new function NPHash64() and GetSliceNPHash64(), which are currently
implemented using murmurhash.
Replace the current direct call of murmurhash() to use the new functions
if the hash results are not used in on-disk format.
This will make it easier to try out or switch to alternative functions
in the uses where data format compatibility doesn't need to be considered.
This part shouldn't have any performance impact.

Also, the sharded cache hash function is changed to the new format, because
it falls into this category. It doesn't show visible performance impact
in db_bench results. CPU showed by perf is increased from about 0.2% to 0.4%
in an extreme benchmark setting (4KB blocks, no-compression, everything
cached in block cache). We've known that the current hash function used,
our own Hash(), has a serious hash quality problem. It can generate a lot of
conflicts with similar input. In this use case, it means extra lock contention
for reads from the same file. This slight CPU regression is worthy to me
to counter the potential bad performance with hot keys. And hopefully this
will get further improved in the future with a better hash function.

cache_test's condition is relaxed a little bit too. The new hash is slightly
more skewed in this use case, but I manually checked the data and see
the hash results are still in a reasonable range.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5155

Differential Revision: D14834821

Pulled By: siying

fbshipit-source-id: ec9a2c0a2f8ae4b54d08b13a5c2e9cc97aa80cb5
1 parent de00f28
Raw File
env_librados.cc
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "rocksdb/utilities/env_librados.h"
#include "util/random.h"
#include <mutex>
#include <cstdlib>

namespace rocksdb {
/* GLOBAL DEFINE */
// Uncomment the next line to compile in LOG_DEBUG tracing. When DEBUG is
// defined, LOG_DEBUG prints "[tid:file:line:function]" followed by the
// printf-style message to stdout; otherwise it expands to nothing.
// #define DEBUG
#ifdef DEBUG
#include <cstdio>
#include <sys/syscall.h>
#include <unistd.h>
#define LOG_DEBUG(...)  do{\
    printf("[%ld:%s:%i:%s]", syscall(SYS_gettid), __FILE__, __LINE__, __FUNCTION__);\
    printf(__VA_ARGS__);\
  }while(0)
#else
#define LOG_DEBUG(...)
#endif

/* GLOBAL CONSTANT */
const char *default_db_name     = "default_envlibrados_db";
const char *default_pool_name   = "default_envlibrados_pool";
const char *default_config_path = "CEPH_CONFIG_PATH";           // name of the environment variable that holds the ceph config file path
// maximum number of dirs/files that can be stored in the fs;
// passed as the result limit when listing the entries of a directory
const int MAX_ITEMS_IN_FS = 1 << 30;
// metadata key of the root directory
const std::string ROOT_DIR_KEY = "/";
// value stored under a metadata key to mark that key as a directory
const std::string DIR_ID_VALUE = "<DIR>";

/**
 * @brief translate a return code into a Status
 * @details
 *  Mapping:
 *    0                  -> OK
 *    -ENOENT            -> IOError (no sub-code argument)
 *    -ENODATA, -ENOTDIR -> NotFound
 *    -EINVAL            -> InvalidArgument
 *    -EIO               -> IOError
 *    anything else      -> trips the assert when assertions are enabled,
 *                          otherwise NotSupported
 *
 * @param r 0 or a negated errno value
 * @return the Status matching r
 */
Status err_to_status(int r)
{
  if (0 == r) {
    return Status::OK();
  }
  if (-ENOENT == r) {
    return Status::IOError();
  }
  if (-ENODATA == r || -ENOTDIR == r) {
    return Status::NotFound(Status::kNone);
  }
  if (-EINVAL == r) {
    return Status::InvalidArgument(Status::kNone);
  }
  if (-EIO == r) {
    return Status::IOError(Status::kNone);
  }

  // FIXME :(
  assert(0 == "unrecognized error code");
  return Status::NotSupported(Status::kNone);
}

/**
 * @brief split file path into dir path and file name
 * @details
 * Because rocksdb only need a 2-level structure (dir/file), all input path
 * will be shortened to dir/file format. Repeated and trailing '/' are ignored.
 *  For example:
 *    b/c    => dir '/b', file 'c'
 *    /a/b/c => dir '/b', file 'c'
 *    /c     => dir '/',  file 'c'
 *    c      => dir '/',  file 'c'
 *    "" , / => dir '/',  file ''
 *
 * @param fn   path to split
 * @param dir  receives "/" + the last directory component, or "/" if none
 * @param file receives the last path component (empty if fn has none)
 */
void split(const std::string &fn, std::string *dir, std::string *file) {
  LOG_DEBUG("[IN]%s\n", fn.c_str());
  // Results for an empty or all-slash path; overwritten below when the
  // corresponding component is found.
  std::string dir_part("/");
  std::string file_part;

  // 1. last character of the file name (skips trailing '/')
  const size_t file_end = fn.find_last_not_of('/');
  if (std::string::npos != file_end) {
    // 2. file name is everything after the preceding '/', if there is one
    const size_t file_sep = fn.rfind('/', file_end);
    const size_t file_begin =
      (std::string::npos == file_sep) ? 0 : file_sep + 1;
    file_part = fn.substr(file_begin, file_end - file_begin + 1);

    // 3. dir name is the component right before the separator(s)
    if (std::string::npos != file_sep) {
      const size_t dir_end = fn.find_last_not_of('/', file_sep);
      if (std::string::npos != dir_end) {
        const size_t dir_sep = fn.rfind('/', dir_end);
        const size_t dir_begin =
          (std::string::npos == dir_sep) ? 0 : dir_sep + 1;
        dir_part += fn.substr(dir_begin, dir_end - dir_begin + 1);
      }
    }
  }

  *dir = dir_part;
  *file = file_part;

  LOG_DEBUG("[OUT]%s | %s\n", dir->c_str(), file->c_str());
}

// A file abstraction for reading sequentially through a file
class LibradosSequentialFile : public SequentialFile {
  librados::IoCtx * _io_ctx;
  std::string _fid;
  std::string _hint;
  // Current read position. 64-bit so that positions beyond 2 GiB do not
  // overflow (Skip() takes a uint64_t).
  uint64_t _offset;
public:
  LibradosSequentialFile(librados::IoCtx * io_ctx, std::string fid, std::string hint):
    _io_ctx(io_ctx), _fid(fid), _hint(hint), _offset(0) {}

  ~LibradosSequentialFile() {}

  /**
   * @brief read file
   * @details
   *  Read up to "n" bytes from the file.  "scratch[0..n-1]" may be
   *  written by this routine.  Sets "*result" to the data that was
   *  read (including if fewer than "n" bytes were successfully read).
   *  May set "*result" to point at data in "scratch[0..n-1]", so
   *  "scratch[0..n-1]" must be live when "*result" is used.
   *  If an error was encountered, returns a non-OK status.
   *
   *  REQUIRES: External synchronization
   *
   * @param n       maximum number of bytes to read
   * @param result  receives a slice over the bytes copied into scratch
   * @param scratch caller-provided buffer of at least n bytes
   * @return OK on success; an IOError status from err_to_status (e.g. for
   *         -ENOENT) is reported as OK with an empty result
   */
  Status Read(size_t n, Slice* result, char* scratch) {
    LOG_DEBUG("[IN]%i\n", (int)n);
    librados::bufferlist buffer;
    Status s;
    int r = _io_ctx->read(_fid, buffer, n, _offset);
    if (r >= 0) {
      // r is the number of bytes actually read
      buffer.copy(0, r, scratch);
      *result = Slice(scratch, r);
      _offset += r;
      s = Status::OK();
    } else {
      s = err_to_status(r);
      if (s == Status::IOError()) {
        *result = Slice();
        s = Status::OK();
      }
    }
    LOG_DEBUG("[OUT]%s, %i, %s\n", s.ToString().c_str(), (int)r, buffer.c_str());
    return s;
  }

  /**
   * @brief skip "n" bytes from the file
   * @details
   *  Advances the read position by "n" bytes without reading.
   *  NOTE(review): the position is not clamped to the end of the file here.
   *
   *  REQUIRES: External synchronization
   *
   * @param n number of bytes to skip
   * @return always OK
   */
  Status Skip(uint64_t n) {
    _offset += n;
    return Status::OK();
  }

  /**
   * @brief noop
   * @details
   *  rocksdb has it's own caching capabilities that we should be able to use,
   *  without relying on a cache here. This can safely be a no-op.
   *
   * @param offset ignored
   * @param length ignored
   *
   * @return always OK
   */
  Status InvalidateCache(size_t offset, size_t length) {
    return Status::OK();
  }
};

// A file abstraction for randomly reading the contents of a file.
class LibradosRandomAccessFile : public RandomAccessFile {
  librados::IoCtx * _io_ctx;
  std::string _fid;
  std::string _hint;
public:
  LibradosRandomAccessFile(librados::IoCtx * io_ctx, std::string fid, std::string hint):
    _io_ctx(io_ctx), _fid(fid), _hint(hint) {}

  ~LibradosRandomAccessFile() {}

  /**
   * @brief read file
   * @details similar to LibradosSequentialFile::Read, but reads at an
   *          explicit offset and keeps no position state
   *
   * @param offset  byte offset to start reading at
   * @param n       maximum number of bytes to read
   * @param result  receives a slice over the bytes copied into scratch
   * @param scratch caller-provided buffer of at least n bytes
   * @return OK on success; an IOError status from err_to_status (e.g. for
   *         -ENOENT) is reported as OK with an empty result
   */
  Status Read(uint64_t offset, size_t n, Slice* result,
              char* scratch) const {
    LOG_DEBUG("[IN]%i\n", (int)n);
    librados::bufferlist buffer;
    Status s;
    int r = _io_ctx->read(_fid, buffer, n, offset);
    if (r >= 0) {
      buffer.copy(0, r, scratch);
      *result = Slice(scratch, r);
      s = Status::OK();
    } else {
      s = err_to_status(r);
      if (s == Status::IOError()) {
        *result = Slice();
        s = Status::OK();
      }
    }
    LOG_DEBUG("[OUT]%s, %i, %s\n", s.ToString().c_str(), (int)r, buffer.c_str());
    return s;
  }

  /**
   * @brief get unique id of this file
   * @details
   *  Copies the last min(max_size, fid length) characters of the fid into
   *  id and overwrites the final copied byte with '\0'.
   *
   * @param id       output buffer
   * @param max_size max size of id, it should be larger than 16
   *
   * @return number of bytes written to id (including the terminating '\0'),
   *         or 0 if nothing could be written
   */
  size_t GetUniqueId(char* id, size_t max_size) const {
    // All fid has the same db_id prefix, so we need to ignore db_id prefix
    size_t s = std::min(max_size, _fid.size());
    if (0 == s) {
      // no room (or no fid): writing id[s - 1] would be out of bounds
      return 0;
    }
    strncpy(id, _fid.c_str() + (_fid.size() - s), s);
    id[s - 1] = '\0';
    return s;
  }

  //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
  void Hint(AccessPattern pattern) {
    /* Do nothing */
  }

  /**
   * @brief noop
   * @details nothing is cached by this class, so there is nothing to drop
   *
   * @param offset ignored
   * @param length ignored
   *
   * @return always OK
   */
  Status InvalidateCache(size_t offset, size_t length) {
    return Status::OK();
  }
};


// A file abstraction for sequential writing.  The implementation
// must provide buffering since callers may append small fragments
// at a time to the file.
class LibradosWritableFile : public WritableFile {
  librados::IoCtx * _io_ctx;
  std::string _fid;
  std::string _hint;
  const EnvLibrados * const _env;

  std::mutex _mutex;                 // used to protect modification of all following variables
  librados::bufferlist _buffer;      // write buffer
  uint64_t _buffer_size;             // write buffer size
  uint64_t _file_size;               // this file size doesn't include buffer size

  /**
   * @brief append the buffered data to the RADOS object
   * @details assuming caller holds lock. On success the buffer is emptied
   *          and its size is added to the file size.
   * @return return code of the append (0 on success)
   */
  int _SyncLocked() {
    // 1. sync append data to RADOS
    int r = _io_ctx->append(_fid, _buffer, _buffer_size);
    assert(r >= 0);

    // 2. update local variables
    if (0 == r) {
      _buffer.clear();
      _file_size += _buffer_size;
      _buffer_size = 0;
    }

    return r;
  }

public:
  /**
   * @brief open a writable file on object fid
   * @details the initial file size is taken from stat(); if stat() fails
   *          the file is treated as empty
   *
   * @param io_ctx io context of the pool holding the object
   * @param fid    object id
   * @param hint   file path, only used for debug logging
   * @param env    owning env, source of the write buffer size limit
   */
  LibradosWritableFile(librados::IoCtx * io_ctx,
                       std::string fid,
                       std::string hint,
                       const EnvLibrados * const env)
    : _io_ctx(io_ctx), _fid(fid), _hint(hint), _env(env), _buffer(), _buffer_size(0), _file_size(0) {
    int ret = _io_ctx->stat(_fid, &_file_size, nullptr);

    // if file not exist
    if (ret < 0) {
      _file_size = 0;
    }
  }

  ~LibradosWritableFile() {
    // sync before closing writable file
    Sync();
  }

  /**
   * @brief append data to file
   * @details
   *  Append will save all written data in buffer until buffer size
   *  exceeds the env's write buffer size. Then, it will write buffer
   *  into rados
   *
   * @param data bytes to append
   * @return OK, or the status of the write to rados if one was triggered
   */
  Status Append(const Slice& data) {
    // append buffer
    LOG_DEBUG("[IN] %i | %s\n", (int)data.size(), data.data());
    int r = 0;

    std::lock_guard<std::mutex> lock(_mutex);
    _buffer.append(data.data(), data.size());
    _buffer_size += data.size();

    if (_buffer_size > _env->_write_buffer_size) {
      r = _SyncLocked();
    }

    LOG_DEBUG("[OUT] %i\n", r);
    return err_to_status(r);
  }

  /**
   * @brief not supported
   * @return always NotSupported
   */
  Status PositionedAppend(
    const Slice& /* data */,
    uint64_t /* offset */) {
    return Status::NotSupported();
  }

  /**
   * @brief truncate file to assigned size
   * @details
   *  - size below the written size: the object is truncated and the buffer
   *    is dropped
   *  - size equal to the written size: the buffer is dropped
   *  - size inside the buffered tail: the buffer is shortened
   *  - size beyond written + buffered data: rejected, growing a file
   *    through Truncate is not implemented
   *
   * @param size new logical file size
   * @return OK on success, InvalidArgument if size is larger than the
   *         current logical size, otherwise the status of the rados call
   */
  Status Truncate(uint64_t size) {
    LOG_DEBUG("[IN]%lld|%lld|%lld\n", (long long)size, (long long)_file_size, (long long)_buffer_size);
    int r = 0;

    std::lock_guard<std::mutex> lock(_mutex);
    if (_file_size > size) {
      r = _io_ctx->trunc(_fid, size);

      if (r == 0) {
        _buffer.clear();
        _buffer_size = 0;
        _file_size = size;
      }
    } else if (_file_size == size) {
      _buffer.clear();
      _buffer_size = 0;
    } else {
      const uint64_t keep = size - _file_size;   // buffered bytes to retain
      if (keep > _buffer_size) {
        // Asking substr_of() for more bytes than the buffer holds is an
        // error in the buffer library; report it instead.
        r = -EINVAL;
      } else {
        librados::bufferlist tmp;
        tmp.claim(_buffer);
        _buffer.substr_of(tmp, 0, keep);
        _buffer_size = keep;
      }
    }

    LOG_DEBUG("[OUT] %i\n", r);
    return err_to_status(r);
  }

  /**
   * @brief close file
   * @details writes any buffered data to rados
   * @return status of Sync()
   */
  Status Close() {
    LOG_DEBUG("%s | %lld | %lld\n", _hint.c_str(), (long long)_buffer_size, (long long)_file_size);
    return Sync();
  }

  /**
   * @brief flush file,
   * @details initiate an aio write and not wait
   *
   * @return status of submitting the aio append
   */
  Status Flush() {
    librados::AioCompletion *write_completion = librados::Rados::aio_create_completion();
    int r = 0;

    std::lock_guard<std::mutex> lock(_mutex);
    r = _io_ctx->aio_append(_fid, write_completion, _buffer, _buffer_size);

    if (0 == r) {
      _file_size += _buffer_size;
      _buffer.clear();
      _buffer_size = 0;
    }

    // the completion is released without waiting for the write to finish
    write_completion->release();

    return err_to_status(r);
  }

  /**
   * @brief write buffer data to rados
   * @details synchronous append of the buffered data; no-op if the buffer
   *          is empty
   * @return status of the append
   */
  Status Sync() { // sync data
    int r = 0;

    std::lock_guard<std::mutex> lock(_mutex);
    if (_buffer_size > 0) {
      r = _SyncLocked();
    }

    return err_to_status(r);
  }

  /**
   * @brief report whether Sync() may run concurrently with writers
   * @return true if Sync() and Fsync() are safe to call concurrently with Append()and Flush().
   */
  bool IsSyncThreadSafe() const {
    return true;
  }

  /**
   * @brief Indicates the upper layers if the current WritableFile implementation uses direct IO.
   * @return always false
   */
  bool use_direct_io() const {
    return false;
  }

  /**
   * @brief Get file size
   * @details
   *  This API will use cached file_size.
   * @return written size plus the size of the data still buffered
   */
  uint64_t GetFileSize() {
    LOG_DEBUG("%lld|%lld\n", (long long)_buffer_size, (long long)_file_size);

    std::lock_guard<std::mutex> lock(_mutex);
    // keep the full 64-bit width: an int would truncate sizes >= 2 GiB
    const uint64_t file_size = _file_size + _buffer_size;

    return file_size;
  }

  /**
   * @brief For documentation, refer to RandomAccessFile::GetUniqueId()
   * @details
   *  Copies the last min(max_size, fid length) characters of the fid into
   *  id and overwrites the final copied byte with '\0'.
   *
   * @param id       output buffer
   * @param max_size capacity of id
   *
   * @return number of bytes written to id (including the terminating '\0'),
   *         or 0 if nothing could be written
   */
  size_t GetUniqueId(char* id, size_t max_size) const {
    // All fid has the same db_id prefix, so we need to ignore db_id prefix
    size_t s = std::min(max_size, _fid.size());
    if (0 == s) {
      // no room (or no fid): writing id[s - 1] would be out of bounds
      return 0;
    }
    strncpy(id, _fid.c_str() + (_fid.size() - s), s);
    id[s - 1] = '\0';
    return s;
  }

  /**
   * @brief noop
   *
   * @param offset ignored
   * @param length ignored
   *
   * @return always OK
   */
  Status InvalidateCache(size_t offset, size_t length) {
    return Status::OK();
  }

  using WritableFile::RangeSync;
  /**
   * @brief No RangeSync support, just call Sync()
   *
   * @param offset ignored
   * @param nbytes ignored
   *
   * @return status of Sync()
   */
  Status RangeSync(off_t offset, off_t nbytes) {
    return Sync();
  }

protected:
  using WritableFile::Allocate;
  /**
   * @brief noop
   *
   * @param offset ignored
   * @param len    ignored
   *
   * @return always OK
   */
  Status Allocate(off_t offset, off_t len) {
    return Status::OK();
  }
};


// Directory handle. It only stores the io context and the directory key
// it was created with; no per-directory state is kept beyond that.
class LibradosDirectory : public Directory {
  librados::IoCtx * _io_ctx;
  std::string _fid;
public:
  explicit LibradosDirectory(librados::IoCtx * io_ctx, std::string fid)
    : _io_ctx(io_ctx),
      _fid(fid)
  {
  }

  // Nothing to flush: always reports success.
  // Can be called concurrently from multiple threads.
  Status Fsync()
  {
    return Status::OK();
  }
};

// Identifies a locked file.
// This is an exclusive lock and cannot be nested by the same thread.
// The lock is requested once in the constructor (without waiting) and
// released in the destructor if that request succeeded.
class LibradosFileLock : public FileLock {
  librados::IoCtx * _io_ctx;
  const std::string _obj_name;
  const std::string _lock_name;
  const std::string _cookie;
  int lock_state;   // return code of lock_exclusive(); 0 means the lock is held by this object
public:
  LibradosFileLock(
    librados::IoCtx * io_ctx,
    const std::string obj_name):
    _io_ctx(io_ctx),
    _obj_name(obj_name),
    _lock_name("lock_name"),
    _cookie("cookie"),
    lock_state(0) {

    // TODO: the lock will never expire. It may cause problem if the process crash or abnormally exit.
    // lock_exclusive() returns 0 on success and a negative code otherwise,
    // so a single call is enough: repeating it after a success only yields
    // an "already held" error.
    lock_state = _io_ctx->lock_exclusive(
                   _obj_name,
                   _lock_name,
                   _cookie,
                   "description", nullptr, 0);
  }

  ~LibradosFileLock() {
    // Only release what this object acquired, so a failed attempt never
    // drops a lock taken through another handle with the same cookie.
    if (0 == lock_state) {
      _io_ctx->unlock(_obj_name, _lock_name, _cookie);
    }
  }
};


// --------------------
// --- EnvLibrados ----
// --------------------
/**
 * @brief EnvLibrados ctor
 * @details
 *  Convenience constructor. Delegates to the full constructor with client
 *  name "client.admin", cluster name "ceph", flags 0, WAL dir "/wal",
 *  db_pool reused as the WAL pool, and a write buffer size of 1 << 20.
 *
 * @param db_name unique database name
 * @param config_path the configure file path for rados
 * @param db_pool name of the pool for db data (also used for WAL data)
 */
EnvLibrados::EnvLibrados(const std::string& db_name,
                         const std::string& config_path,
                         const std::string& db_pool)
  : EnvLibrados("client.admin",
                "ceph",
                0,
                db_name,
                config_path,
                db_pool,
                "/wal",
                db_pool,
                1 << 20) {}

/**
 * @brief EnvLibrados ctor
 * @details
 *  Connects to the cluster and prepares the io contexts of the db pool and
 *  the WAL pool, then registers the root directory. The steps run in order
 *  and the first failing step aborts the remaining ones after printing a
 *  message to std::cerr; the failure is not reported to the caller.
 *
 * @param client_name       first 3 parameters are for RADOS client init
 * @param cluster_name
 * @param flags
 * @param db_name           unique database name, used as db_id key
 * @param config_path       the configure file path for rados
 * @param db_pool           the pool for db data
 * @param wal_dir           path prefix of files that are stored in the WAL pool
 * @param wal_pool          the pool for WAL data
 * @param write_buffer_size WritableFile buffer max size
 */
EnvLibrados::EnvLibrados(const std::string& client_name,
                         const std::string& cluster_name,
                         const uint64_t flags,
                         const std::string& db_name,
                         const std::string& config_path,
                         const std::string& db_pool,
                         const std::string& wal_dir,
                         const std::string& wal_pool,
                         const uint64_t write_buffer_size)
  : EnvWrapper(Env::Default()),
    _client_name(client_name),
    _cluster_name(cluster_name),
    _flags(flags),
    _db_name(db_name),
    _config_path(config_path),
    _db_pool_name(db_pool),
    _wal_dir(wal_dir),
    _wal_pool_name(wal_pool),
    _write_buffer_size(write_buffer_size) {
  int ret = 0;

  do {
    // 1. create a Rados object and initialize it
    //    (just use the client.admin keyring)
    ret = _rados.init2(_client_name.c_str(), _cluster_name.c_str(), _flags);
    if (ret < 0) { // let's handle any error that might have come back
      std::cerr << "couldn't initialize rados! error " << ret << std::endl;
      ret = EXIT_FAILURE;
      break;
    }

    // 2. read configure file
    ret = _rados.conf_read_file(_config_path.c_str());
    if (ret < 0) {
      // This could fail if the config file is malformed, but it'd be hard.
      std::cerr << "failed to parse config file " << _config_path
                << "! error" << ret << std::endl;
      ret = EXIT_FAILURE;
      break;
    }

    // 3. we actually connect to the cluster
    ret = _rados.connect();
    if (ret < 0) {
      std::cerr << "couldn't connect to cluster! error " << ret << std::endl;
      ret = EXIT_FAILURE;
      break;
    }

    // 4. create db_pool if not exist (-EEXIST and -EPERM are tolerated)
    ret = _rados.pool_create(_db_pool_name.c_str());
    const bool db_pool_usable =
      (ret >= 0) || (-EEXIST == ret) || (-EPERM == ret);
    if (!db_pool_usable) {
      std::cerr << "couldn't create pool! error " << ret << std::endl;
      break;
    }

    // 5. create db_pool_ioctx
    ret = _rados.ioctx_create(_db_pool_name.c_str(), _db_pool_ioctx);
    if (ret < 0) {
      std::cerr << "couldn't set up ioctx! error " << ret << std::endl;
      ret = EXIT_FAILURE;
      break;
    }

    // 6. create wal_pool if not exist (-EEXIST and -EPERM are tolerated)
    ret = _rados.pool_create(_wal_pool_name.c_str());
    const bool wal_pool_usable =
      (ret >= 0) || (-EEXIST == ret) || (-EPERM == ret);
    if (!wal_pool_usable) {
      std::cerr << "couldn't create pool! error " << ret << std::endl;
      break;
    }

    // 7. create wal_pool_ioctx
    ret = _rados.ioctx_create(_wal_pool_name.c_str(), _wal_pool_ioctx);
    if (ret < 0) {
      std::cerr << "couldn't set up ioctx! error " << ret << std::endl;
      ret = EXIT_FAILURE;
      break;
    }

    // 8. add root dir
    _AddFid(ROOT_DIR_KEY, DIR_ID_VALUE);
  } while (0);

  LOG_DEBUG("rados connect result code : %i\n", ret);
}

/****************************************************
  private functions to handle fid operation.
  Dir also have fid, but the value is DIR_ID_VALUE
****************************************************/

/**
 * @brief generate a new fid
 * @details the fid has the form "<db_name>.<unique id>"
 * @return the new fid
 */
std::string EnvLibrados::_CreateFid() {
  std::string fid(_db_name);
  fid += ".";
  fid += GenerateUniqueId();
  return fid;
}

/**
 * @brief get fid
 * @details looks up key fname in the omap of the metadata object
 *
 * @param fname file (or dir) path used as key
 * @param fid   receives the stored value; untouched unless OK is returned
 *
 * @return
 *  Status::OK()
 *  Status::NotFound()
 *  or the status translated from the rados error code
 */
Status EnvLibrados::_GetFid(
  const std::string &fname,
  std::string& fid) {
  std::set<std::string> wanted;
  wanted.insert(fname);
  std::map<std::string, librados::bufferlist> found;
  const int r = _db_pool_ioctx.omap_get_vals_by_keys(_db_name, wanted, &found);

  // the lookup itself failed
  if (0 != r) {
    return err_to_status(r);
  }

  // the lookup worked but the key is absent
  if (0 == found.size()) {
    return Status::NotFound();
  }

  librados::bufferlist& value = found[fname];
  fid.assign(value.c_str(), value.length());
  return Status::OK();
}

/**
 * @brief rename fid
 * @details
 *  Removing the old key and setting the new key are submitted as one
 *  write operation on the metadata object, so this rename operation is
 *  atomic in term of rados
 *
 * @param old_fname existing key
 * @param new_fname key that takes over the fid of old_fname
 *
 * @return the lookup status if old_fname cannot be read, otherwise the
 *         status of the write operation
 */
Status EnvLibrados::_RenameFid(const std::string& old_fname,
                               const std::string& new_fname) {
  std::string fid;
  const Status lookup = _GetFid(old_fname, fid);
  if (!lookup.ok()) {
    return lookup;
  }

  // key to drop
  std::set<std::string> stale_keys;
  stale_keys.insert(old_fname);

  // key to add, carrying the same fid
  std::map<std::string, librados::bufferlist> fresh_kvs;
  fresh_kvs[new_fname].append(fid);

  librados::ObjectWriteOperation op;
  op.omap_rm_keys(stale_keys);
  op.omap_set(fresh_kvs);

  return err_to_status(_db_pool_ioctx.operate(_db_name, &op));
}

/**
 * @brief add <file path, fid> to metadata object. It may overwrite exist key.
 *
 * @param fname key (file or dir path)
 * @param fid   value stored under the key
 *
 * @return status translated from the rados return code
 */
Status EnvLibrados::_AddFid(
  const std::string& fname,
  const std::string& fid) {
  std::map<std::string, librados::bufferlist> entry;
  entry[fname].append(fid);

  const int rc = _db_pool_ioctx.omap_set(_db_name, entry);
  return err_to_status(rc);
}

/**
 * @brief return subfile names of dir.
 * @details
 *  RocksDB has a 2-level structure and files are stored under the key
 *  "<dir>/<file>", so all keys that have "<dir>/" as prefix are subfiles
 *  of dir. The separator is part of the prefix so that a sibling whose
 *  name merely starts with dir (e.g. "/ab" for dir "/a") is not reported.
 *
 * @param dir    directory key
 * @param result receives the names of the subfiles, without the dir prefix;
 *               cleared first
 *
 * @return OK, or the status translated from the rados error code
 */
Status EnvLibrados::_GetSubFnames(
  const std::string& dir,
  std::vector<std::string> * result
) {
  result->clear();

  const std::string start_after(dir);
  const std::string filter_prefix(dir + "/");
  std::map<std::string, librados::bufferlist> kvs;
  const int r = _db_pool_ioctx.omap_get_vals(_db_name,
                                             start_after, filter_prefix,
                                             MAX_ITEMS_IN_FS, &kvs);
  if (r < 0) {
    // do not report a failed listing as an empty directory
    return err_to_status(r);
  }

  for (const auto& kv : kvs) {
    // every returned key starts with filter_prefix; keep only the file name
    result->push_back(kv.first.substr(filter_prefix.size()));
  }
  return Status::OK();
}

/**
 * @brief delete key fname from metadata object
 *
 * @param fname key to remove
 * @return status translated from the rados return code
 */
Status EnvLibrados::_DelFid(
  const std::string& fname) {
  std::set<std::string> doomed;
  doomed.insert(fname);

  return err_to_status(_db_pool_ioctx.omap_rm_keys(_db_name, doomed));
}

/**
 * @brief pick the IoCtx that stores fpath
 * @details
 *  Paths that start with the WAL dir go to the WAL pool,
 *  everything else goes to the db pool.
 *
 * @param fpath file path
 * @return pointer to the matching IoCtx member
 */
librados::IoCtx* EnvLibrados::_GetIoctx(const std::string& fpath) {
  // compare() clamps the length to what fpath has, so a path shorter
  // than _wal_dir never compares equal
  const bool under_wal_dir =
    (0 == fpath.compare(0, _wal_dir.size(), _wal_dir));

  return under_wal_dir ? &_wal_pool_ioctx : &_db_pool_ioctx;
}

/************************************************************
                public functions
************************************************************/
/**
 * @brief generate unique id
 * @details
 *  Combine system time and random number: the id is
 *  "<NowNanos() in hex>-<random 64-bit number in hex>", each part
 *  right-aligned in a field of 16 characters.
 * @return the id string
 */
std::string EnvLibrados::GenerateUniqueId() {
  Random64 r(time(nullptr));
  uint64_t random_uuid_portion =
    r.Uniform(std::numeric_limits<uint64_t>::max());
  uint64_t nanos_uuid_portion = NowNanos();
  char uuid2[200];
  // unsigned long long is at least 64 bits on every platform, whereas
  // unsigned long may be 32 bits and would drop the upper half
  snprintf(uuid2,
           sizeof(uuid2),
           "%16llx-%16llx",
           static_cast<unsigned long long>(nanos_uuid_portion),
           static_cast<unsigned long long>(random_uuid_portion));
  return uuid2;
}

/**
 * @brief create a new sequential read file handler
 * @details it will check the existence of fname and of its directory
 *
 * @param fname   file path
 * @param result  receives the new file handler on success
 * @param options unused
 * @return OK on success; IOError if the directory is missing or not a
 *         directory, or if the file does not exist (errno is set to ENOENT
 *         in the latter case); otherwise the error of the fid lookup
 */
Status EnvLibrados::NewSequentialFile(
  const std::string& fname,
  std::unique_ptr<SequentialFile>* result,
  const EnvOptions& options)
{
  LOG_DEBUG("[IN]%s\n", fname.c_str());
  std::string dir, file, fid;
  split(fname, &dir, &file);
  Status s;
  std::string fpath = dir + "/" + file;
  do {
    s = _GetFid(dir, fid);

    if (!s.ok() || fid != DIR_ID_VALUE) {
      s = Status::IOError();
      break;
    }

    s = _GetFid(fpath, fid);

    if (!s.ok()) {
      // never hand out a file for a fid that could not be read
      if (Status::NotFound() == s) {
        s = Status::IOError();
        errno = ENOENT;
      }
      break;
    }

    result->reset(new LibradosSequentialFile(_GetIoctx(fpath), fid, fpath));
    s = Status::OK();
  } while (0);

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief create a new random access file handler
 * @details it will check the existence of fname and of its directory
 *
 * @param fname   file path
 * @param result  receives the new file handler on success
 * @param options unused
 * @return OK on success; IOError if the directory is missing or not a
 *         directory, or if the file does not exist (errno is set to ENOENT
 *         in the latter case); otherwise the error of the fid lookup
 */
Status EnvLibrados::NewRandomAccessFile(
  const std::string& fname,
  std::unique_ptr<RandomAccessFile>* result,
  const EnvOptions& options)
{
  LOG_DEBUG("[IN]%s\n", fname.c_str());
  std::string dir, file, fid;
  split(fname, &dir, &file);
  Status s;
  std::string fpath = dir + "/" + file;
  do {
    s = _GetFid(dir, fid);

    if (!s.ok() || fid != DIR_ID_VALUE) {
      s = Status::IOError();
      break;
    }

    s = _GetFid(fpath, fid);

    if (!s.ok()) {
      // never hand out a file for a fid that could not be read
      if (Status::NotFound() == s) {
        s = Status::IOError();
        errno = ENOENT;
      }
      break;
    }

    result->reset(new LibradosRandomAccessFile(_GetIoctx(fpath), fid, fpath));
    s = Status::OK();
  } while (0);

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief create a new write file handler
 * @details
 *  The directory of fname must exist. If fname already exists its fid is
 *  reused, otherwise a new fid is created and registered.
 *
 * @param fname   file path
 * @param result  receives the new file handler on success
 * @param options unused
 * @return OK on success; IOError if the directory entry is not a
 *         directory; otherwise the error of the failing metadata operation
 */
Status EnvLibrados::NewWritableFile(
  const std::string& fname,
  std::unique_ptr<WritableFile>* result,
  const EnvOptions& options)
{
  LOG_DEBUG("[IN]%s\n", fname.c_str());
  std::string dir, file, fid;
  split(fname, &dir, &file);
  Status s;
  std::string fpath = dir + "/" + file;

  do {
    // 1. check if dir exist
    s = _GetFid(dir, fid);
    if (!s.ok()) {
      break;
    }

    if (fid != DIR_ID_VALUE) {
      s = Status::IOError();
      break;
    }

    // 2. check if file exist.
    // 2.1 exist, use it
    // 2.2 not exist, create it
    s = _GetFid(fpath, fid);
    if (Status::NotFound() == s) {
      fid = _CreateFid();
      s = _AddFid(fpath, fid);
    }

    // 3. the lookup or the registration failed: there is no usable fid
    if (!s.ok()) {
      break;
    }

    result->reset(new LibradosWritableFile(_GetIoctx(fpath), fid, fpath, this));
    s = Status::OK();
  } while (0);

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief reuse write file handler
 * @details
 *  This function will rename old_fname to new_fname,
 *  then return a handler of new_fname that writes to the object
 *  old_fname was stored in
 *
 * @param new_fname path the file is known by afterwards
 * @param old_fname path of the existing file
 * @param result    receives the new file handler on success
 * @param options   unused
 * @return OK on success, otherwise the error of the fid lookup or rename
 */
Status EnvLibrados::ReuseWritableFile(
  const std::string& new_fname,
  const std::string& old_fname,
  std::unique_ptr<WritableFile>* result,
  const EnvOptions& options)
{
  LOG_DEBUG("[IN]%s => %s\n", old_fname.c_str(), new_fname.c_str());
  std::string src_fid, src_dir, src_file, dst_dir, dst_file;
  split(old_fname, &src_dir, &src_file);
  split(new_fname, &dst_dir, &dst_file);

  std::string src_fpath = src_dir + "/" + src_file;
  std::string dst_fpath = dst_dir + "/" + dst_file;
  Status r = Status::OK();
  do {
    // the rename keeps the fid, so fetch it now: the writable file must be
    // opened on that object and not on an empty object name
    r = _GetFid(src_fpath, src_fid);
    if (!r.ok()) {
      break;
    }

    r = _RenameFid(src_fpath,
                   dst_fpath);
    if (!r.ok()) {
      break;
    }

    result->reset(new LibradosWritableFile(_GetIoctx(dst_fpath), src_fid, dst_fpath, this));
  } while (0);

  LOG_DEBUG("[OUT]%s\n", r.ToString().c_str());
  return r;
}

/**
 * @brief create a new directory handler
 * @details the directory must already exist; it is not created here
 *
 * @param name   directory path
 * @param result receives the new directory handler on success
 *
 * @return OK on success, IOError if name cannot be looked up or is not a
 *         directory
 */
Status EnvLibrados::NewDirectory(
  const std::string& name,
  std::unique_ptr<Directory>* result)
{
  LOG_DEBUG("[IN]%s\n", name.c_str());
  std::string fid, dir, file;
  /* just want to get dir name */
  split(name + "/tmp", &dir, &file);
  Status s = _GetFid(dir, fid);

  if (!s.ok() || DIR_ID_VALUE != fid) {
    // strerror() expects the positive errno value
    s = Status::IOError(name, strerror(ENOENT));
  } else {
    result->reset(new LibradosDirectory(_GetIoctx(dir), dir));
    s = Status::OK();
  }

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief check if fname is exist
 * @details
 *  Looks up the "<dir>/<file>" key of fname in the metadata object and
 *  returns the status of that lookup unchanged.
 *
 * @param fname file path
 * @return OK if the key exists, NotFound if it does not, otherwise the
 *         error of the lookup
 */
Status EnvLibrados::FileExists(const std::string& fname)
{
  LOG_DEBUG("[IN]%s\n", fname.c_str());
  std::string fid, dir, file;
  split(fname, &dir, &file);
  // the stored value is not needed, only whether the key could be read
  Status s = _GetFid(dir + "/" + file, fid);

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief get subfile name of dir_in
 *
 * @param dir_in directory path
 * @param result receives the names of the files in the directory
 *
 * @return the lookup error if dir_in cannot be read, IOError if it is not
 *         a directory, otherwise the status of listing its entries
 */
Status EnvLibrados::GetChildren(
  const std::string& dir_in,
  std::vector<std::string>* result)
{
  LOG_DEBUG("[IN]%s\n", dir_in.c_str());
  std::string fid, dir, file;
  // the appended component only serves to make split() yield the dir name
  split(dir_in + "/temp", &dir, &file);

  Status s = _GetFid(dir, fid);
  if (s.ok()) {
    if (DIR_ID_VALUE == fid) {
      s = _GetSubFnames(dir, result);
    } else {
      s = Status::IOError();
    }
  }

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief delete fname
 * @details removes the key of fname from the metadata object
 *
 * @param fname file path
 * @return status of the key removal, or NotFound if fname cannot be
 *         looked up or refers to a directory
 */
Status EnvLibrados::DeleteFile(const std::string& fname)
{
  LOG_DEBUG("[IN]%s\n", fname.c_str());
  std::string fid, dir, file;
  split(fname, &dir, &file);
  const std::string fpath = dir + "/" + file;

  Status s = _GetFid(fpath, fid);
  const bool is_file = s.ok() && DIR_ID_VALUE != fid;
  if (is_file) {
    s = _DelFid(fpath);
  } else {
    s = Status::NotFound();
  }

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief create new dir
 * @details
 *  Registers dirname as a directory unless a key for it already exists.
 *  The existence check is done on the directory key itself.
 *
 * @param dirname directory path
 * @return OK if the directory exists afterwards or already existed,
 *         otherwise the error of the failing metadata operation
 */
Status EnvLibrados::CreateDir(const std::string& dirname)
{
  LOG_DEBUG("[IN]%s\n", dirname.c_str());
  std::string fid, dir, file;
  // the appended component only serves to make split() yield the dir name
  split(dirname + "/temp", &dir, &file);
  Status s = _GetFid(dir, fid);

  // Only a missing key leads to a write; an existing entry or a lookup
  // error is returned as is.
  if (Status::NotFound() == s) {
    s = _AddFid(dir, DIR_ID_VALUE);
  }

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief Create a directory entry unless something is already there
 * @details The directory key is obtained by calling split() on
 *          dirname + "/temp" and keeping the directory part. A DIR_ID_VALUE
 *          entry is added only when _GetFid() reports NotFound; any other
 *          lookup result (success or error) is returned as is, without
 *          checking that an existing fid is DIR_ID_VALUE.
 *
 * @param dirname directory to create
 * @return the _GetFid() status, or the _AddFid() status when an entry had
 *         to be added
 */
Status EnvLibrados::CreateDirIfMissing(const std::string& dirname)
{
  LOG_DEBUG("[IN]%s\n", dirname.c_str());
  std::string dir_key, unused_leaf;
  split(dirname + "/temp", &dir_key, &unused_leaf);

  std::string fid;
  Status s = _GetFid(dir_key, fid);
  if (Status::NotFound() == s) {
    s = _AddFid(dir_key, DIR_ID_VALUE);
  }

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief Delete an empty directory entry
 * @details The directory key is obtained by calling split() on
 *          dirname + "/temp" and keeping the directory part. The entry is
 *          removed only when it exists with fid DIR_ID_VALUE and
 *          _GetSubFnames() succeeds and reports no children.
 *
 * @param dirname directory to delete
 * @return - NotFound when the lookup fails for any reason or the entry is
 *           not a directory
 *         - the _GetSubFnames() error when the children cannot be listed
 *           (the directory is left in place)
 *         - IOError when the directory still has children
 *         - otherwise the status of _DelFid()
 */
Status EnvLibrados::DeleteDir(const std::string& dirname)
{
  LOG_DEBUG("[IN]%s\n", dirname.c_str());
  std::string fid, dir, file;
  split(dirname + "/temp", &dir, &file);

  Status s = _GetFid(dir, fid);

  if (s.ok() && DIR_ID_VALUE == fid) {
    std::vector<std::string> subs;
    s = _GetSubFnames(dir, &subs);
    // Only act on the listing if it could actually be obtained; an empty
    // vector after a failed listing does not prove the directory is empty.
    if (s.ok()) {
      // if subfiles exist, can't delete dir
      s = subs.empty() ? _DelFid(dir) : Status::IOError();
    }
  } else {
    s = Status::NotFound();
  }

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief Return the size of fname
 * @details The fid of "<dir>/<file>" (as produced by split()) is looked up
 *          and stat() is called on it through the IoCtx returned by
 *          _GetIoctx(). A stat() result of -ENOENT is treated as an empty
 *          file: size 0 and OK (presumably the entry is registered but
 *          nothing has been written yet - confirm).
 *
 *          *file_size is set to 0 before anything else so that it holds a
 *          defined value on every failure path, including the exit log
 *          below.
 *
 * @param fname path of the file
 * @param file_size receives the size; 0 if the fid lookup fails
 *
 * @return the _GetFid() error, the converted stat() error, or OK
 */
Status EnvLibrados::GetFileSize(
  const std::string& fname,
  uint64_t* file_size)
{
  LOG_DEBUG("[IN]%s\n", fname.c_str());
  std::string fid, dir, file;
  split(fname, &dir, &file);
  time_t mtime = 0;  // required by stat(), otherwise unused
  Status s;

  *file_size = 0;

  do {
    std::string fpath = dir + "/" + file;
    s = _GetFid(fpath, fid);

    if (!s.ok()) {
      break;
    }

    int ret = _GetIoctx(fpath)->stat(fid, file_size, &mtime);
    if (ret < 0) {
      LOG_DEBUG("%i\n", ret);
      if (-ENOENT == ret) {
        *file_size = 0;
        s = Status::OK();
      } else {
        s = err_to_status(ret);
      }
    } else {
      s = Status::OK();
    }
  } while (0);

  LOG_DEBUG("[OUT]%s|%lld\n", s.ToString().c_str(),
            static_cast<long long>(*file_size));
  return s;
}

/**
 * @brief Get the modification time of fname
 * @details The fid of "<dir>/<file>" (as produced by split()) is looked up
 *          and stat() is called on it through the IoCtx returned by
 *          _GetIoctx().
 *
 *          - stat() succeeds: *file_mtime receives the reported mtime.
 *          - stat() fails with an error that err_to_status() maps to
 *            NotFound: OK is returned and *file_mtime is set to 0, in line
 *            with GetFileSize() reporting size 0 for -ENOENT.
 *          - any other stat() failure: the converted error is returned and
 *            *file_mtime is not written.
 *
 * @param fname path of the file
 * @param file_mtime receives the modification time
 *
 * @return the _GetFid() error, the converted stat() error, or OK
 */
Status EnvLibrados::GetFileModificationTime(const std::string& fname,
    uint64_t* file_mtime)
{
  LOG_DEBUG("[IN]%s\n", fname.c_str());
  std::string fid, dir, file;
  split(fname, &dir, &file);
  time_t mtime = 0;
  uint64_t file_size = 0;  // required by stat(), otherwise unused
  Status s = Status::OK();
  do {
    std::string fpath = dir + "/" + file;
    s = _GetFid(fpath, fid);

    if (!s.ok()) {
      break;
    }

    const int ret = _GetIoctx(fpath)->stat(fid, &file_size, &mtime);
    if (ret >= 0) {
      // mtime is only meaningful when stat() succeeded
      *file_mtime = static_cast<uint64_t>(mtime);
      s = Status::OK();
    } else if (Status::NotFound() == err_to_status(ret)) {
      // no object to stat: report a defined value instead of whatever
      // happened to be in mtime
      *file_mtime = 0;
      s = Status::OK();
    } else {
      s = err_to_status(ret);
    }
  } while (0);

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief Rename src to target_in
 * @details Both paths are broken up with split() and rebuilt as
 *          "<dir>/<file>"; the actual rename is delegated to _RenameFid().
 *
 * @param src current path
 * @param target_in new path
 *
 * @return the result of _RenameFid()
 */
Status EnvLibrados::RenameFile(
  const std::string& src,
  const std::string& target_in)
{
  LOG_DEBUG("[IN]%s => %s\n", src.c_str(), target_in.c_str());

  std::string from_dir, from_file;
  split(src, &from_dir, &from_file);
  const std::string from_path = from_dir + "/" + from_file;

  std::string to_dir, to_file;
  split(target_in, &to_dir, &to_file);
  const std::string to_path = to_dir + "/" + to_file;

  auto s = _RenameFid(from_path, to_path);
  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief Hard links are not supported
 * @details The arguments are only logged; nothing is created or modified.
 *
 * @param src existing path (logged only)
 * @param target_in requested link path (logged only)
 *
 * @return always NotSupported
 */
Status EnvLibrados::LinkFile(
  const std::string& src,
  const std::string& target_in)
{
  LOG_DEBUG("[IO]%s => %s\n", src.c_str(), target_in.c_str());
  Status unsupported = Status::NotSupported();
  return unsupported;
}

/**
 * @brief Lock a file, creating its entry if it is missing
 * @details LockFile appears to be used to prevent another RocksDB instance
 *          from opening the same database at the same time. In the RocksDB
 *          sources it is called from the following places, i.e. during DB
 *          recovery and DB destruction:
 *
 *  ./db/db_impl.cc:1159:    s = env_->LockFile(LockFileName(dbname_), &db_lock_);    // DBImpl::Recover
 *  ./db/db_impl.cc:5839:  Status result = env->LockFile(lockname, &lock);            // Status DestroyDB
 *
 *          Steps: look up the fid of "<dir>/<file>" (as produced by
 *          split()); if it is NotFound, register a new fid from
 *          _CreateFid(); if it is a directory (DIR_ID_VALUE), fail with
 *          IOError; otherwise hand out a new LibradosFileLock for the path.
 *
 *          *lock is set to nullptr up front so that it is well defined on
 *          every failure path, as the Env::LockFile contract requires.
 *
 * @param fname path of the lock file
 * @param lock receives a newly allocated LibradosFileLock on success,
 *             nullptr on failure; release it with UnlockFile()
 *
 * @return OK on success, IOError if fname is a directory, otherwise the
 *         error from _GetFid() or _AddFid()
 */
Status EnvLibrados::LockFile(
  const std::string& fname,
  FileLock** lock)
{
  LOG_DEBUG("[IN]%s\n", fname.c_str());
  std::string fid, dir, file;
  split(fname, &dir, &file);
  Status s = Status::OK();

  *lock = nullptr;

  do {
    std::string fpath = dir + "/" + file;
    s = _GetFid(fpath, fid);

    if (Status::NotFound() == s) {
      // no entry yet: create one so there is something to lock
      s = _AddFid(fpath, _CreateFid());
    } else if (s.ok() && DIR_ID_VALUE == fid) {
      // a directory cannot be used as a lock file
      s = Status::IOError();
    }

    // covers lookup errors, a failed _AddFid() and the directory case
    if (!s.ok()) {
      break;
    }

    *lock = new LibradosFileLock(_GetIoctx(fpath), fpath);
  } while (0);

  LOG_DEBUG("[OUT]%s\n", s.ToString().c_str());
  return s;
}

/**
 * @brief Release a lock
 * @details The lock object is destroyed with delete; a null pointer is
 *          accepted.
 *
 * @param lock lock object to destroy, may be nullptr
 * @return always OK
 */
Status EnvLibrados::UnlockFile(FileLock* lock)
{
  LOG_DEBUG("[IO]%p\n", lock);
  // delete on a null pointer is a no-op, so no explicit check is needed
  delete lock;
  return Status::OK();
}


/**
 * @brief Absolute paths are not supported
 * @details db_path is only logged; output_path is never written.
 *
 * @param db_path input path (logged only)
 * @param output_path not touched
 *
 * @return always NotSupported
 */
Status EnvLibrados::GetAbsolutePath(
  const std::string& db_path,
  std::string* output_path)
{
  LOG_DEBUG("[IO]%s\n", db_path.c_str());
  Status unsupported = Status::NotSupported();
  return unsupported;
}

/**
 * @brief Get the default EnvLibrados
 * @details The instance is a function-local static built from
 *          default_db_name, the value of the environment variable named by
 *          default_config_path, and default_pool_name. Every call returns
 *          the address of that same object.
 *
 *          NOTE(review): std::getenv() returns a null pointer when the
 *          variable is not set - confirm the constructor tolerates that.
 *
 * @return pointer to the shared default instance
 */
EnvLibrados* EnvLibrados::Default() {
  static EnvLibrados shared_env(default_db_name,
                                std::getenv(default_config_path),
                                default_pool_name);
  EnvLibrados* const env = &shared_env;
  return env;
}
// @lint-ignore TXT4 T25377293 Grandfathered in
}
back to top