Revision 499ebb3ab5ea4207950fc95acf102b8f58add1c5 authored by Maysam Yabandeh on 24 June 2017, 21:06:43 UTC, committed by Facebook Github Bot on 24 June 2017, 21:11:29 UTC
Summary:
Throughput: 46k tps in our sysbench settings (filling the details later)

The idea is to have the simplest change that gives us a reasonable boost
in 2PC throughput.

Major design changes:
1. The WAL file internal buffer is not flushed after each write. Instead
it is flushed before critical operations (WAL copy via fs) or when
FlushWAL is called by MySQL. Flushing the WAL buffer is also protected
via mutex_.
2. Use two sequence numbers: last seq, and last seq for write. Last seq
is the last visible sequence number for reads. Last seq for write is the
next sequence number that should be used to write to WAL/memtable. This
allows to have a memtable write be in parallel to WAL writes.
3. BatchGroup is not used for writes. This means that we can have
parallel writers which changes a major assumption in the code base. To
accommodate for that i) allow only 1 WriteImpl that intends to write to
memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes
come via group commit phase which is serial anyway, ii) make all the
parts in the code base that assumed to be the only writer (via
EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are
protected via a stat_mutex_.

Note: the first commit has the approach figured out but is not clean.
Submitting the PR anyway to get the early feedback on the approach. If
we are ok with the approach I will go ahead with this updates:
0) Rebase with Yi's pipelining changes
1) Currently batching is disabled by default to make sure that it will be
consistent with all unit tests. Will make this optional via a config.
2) A couple of unit tests are disabled. They need to be updated with the
serial commit of 2PC taken into account.
3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires
releasing mutex_ beforehand (the same way EnterUnbatched does). This
needs to be cleaned up.
Closes https://github.com/facebook/rocksdb/pull/2345

Differential Revision: D5210732

Pulled By: maysamyabandeh

fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
1 parent 0ac4afb
Raw File
rocksdb_option_file_example.ini
# This is a RocksDB option file.
#
# A typical RocksDB options file has four sections, which are
# Version section, DBOptions section, at least one CFOptions
# section, and one TableOptions section for each column family.
# The RocksDB options file in general follows the basic INI
# file format with the following extensions / modifications:
#
#  * Escaped characters
#    We escaped the following characters:
#     - \n -- line feed - new line
#     - \r -- carriage return
#     - \\ -- backslash \
#     - \: -- colon symbol :
#     - \# -- hash tag #
#  * Comments
#    We support # style comments.  Comments can appear at the ending
#    part of a line.
#  * Statements
#    A statement is of the form option_name = value.
#    Each statement contains a '=', where extra white-spaces
#    are supported. However, we don't support multi-lined statement.
#    Furthermore, each line can only contain at most one statement.
#  * Sections
#    Sections are of the form [SecitonTitle "SectionArgument"],
#    where section argument is optional.
#  * List
#    We use colon-separated string to represent a list.
#    For instance, n1:n2:n3:n4 is a list containing four values.
#
# Below is an example of a RocksDB options file:
[Version]
  rocksdb_version=4.3.0
  options_file_version=1.1

[DBOptions]
  stats_dump_period_sec=600
  max_manifest_file_size=18446744073709551615
  bytes_per_sync=8388608
  delayed_write_rate=2097152
  WAL_ttl_seconds=0
  WAL_size_limit_MB=0
  max_subcompactions=1
  wal_dir=
  wal_bytes_per_sync=0
  db_write_buffer_size=0
  keep_log_file_num=1000
  table_cache_numshardbits=4
  max_file_opening_threads=1
  writable_file_max_buffer_size=1048576
  random_access_max_buffer_size=1048576
  use_fsync=false
  max_total_wal_size=0
  max_open_files=-1
  skip_stats_update_on_db_open=false
  max_background_compactions=16
  manifest_preallocation_size=4194304
  max_background_flushes=7
  is_fd_close_on_exec=true
  max_log_file_size=0
  advise_random_on_open=true
  create_missing_column_families=false
  paranoid_checks=true
  delete_obsolete_files_period_micros=21600000000
  log_file_time_to_roll=0
  compaction_readahead_size=0
  create_if_missing=false
  use_adaptive_mutex=false
  enable_thread_tracking=false
  allow_fallocate=true
  error_if_exists=false
  recycle_log_file_num=0
  skip_log_error_on_recovery=false
  db_log_dir=
  new_table_reader_for_compaction_inputs=true
  allow_mmap_reads=false
  allow_mmap_writes=false
  use_direct_reads=false
  use_direct_writes=false


[CFOptions "default"]
  compaction_style=kCompactionStyleLevel
  compaction_filter=nullptr
  num_levels=6
  table_factory=BlockBasedTable
  comparator=leveldb.BytewiseComparator
  max_sequential_skip_in_iterations=8
  soft_rate_limit=0.000000
  max_bytes_for_level_base=1073741824
  memtable_prefix_bloom_probes=6
  memtable_prefix_bloom_bits=0
  memtable_prefix_bloom_huge_page_tlb_size=0
  max_successive_merges=0
  arena_block_size=16777216
  min_write_buffer_number_to_merge=1
  target_file_size_multiplier=1
  source_compaction_factor=1
  max_bytes_for_level_multiplier=8
  max_bytes_for_level_multiplier_additional=2:3:5
  compaction_filter_factory=nullptr
  max_write_buffer_number=8
  level0_stop_writes_trigger=20
  compression=kSnappyCompression
  level0_file_num_compaction_trigger=4
  purge_redundant_kvs_while_flush=true
  max_write_buffer_number_to_maintain=0
  memtable_factory=SkipListFactory
  max_grandparent_overlap_factor=8
  expanded_compaction_factor=25
  hard_pending_compaction_bytes_limit=137438953472
  inplace_update_num_locks=10000
  level_compaction_dynamic_level_bytes=true
  level0_slowdown_writes_trigger=12
  filter_deletes=false
  verify_checksums_in_compaction=true
  min_partial_merge_operands=2
  paranoid_file_checks=false
  target_file_size_base=134217728
  optimize_filters_for_hits=false
  merge_operator=PutOperator
  compression_per_level=kNoCompression:kNoCompression:kNoCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression
  compaction_measure_io_stats=false
  prefix_extractor=nullptr
  bloom_locality=0
  write_buffer_size=134217728
  disable_auto_compactions=false
  inplace_update_support=false

[TableOptions/BlockBasedTable "default"]
  format_version=2
  whole_key_filtering=true
  no_block_cache=false
  checksum=kCRC32c
  filter_policy=rocksdb.BuiltinBloomFilter
  block_size_deviation=10
  block_size=8192
  block_restart_interval=16
  cache_index_and_filter_blocks=false
  pin_l0_filter_and_index_blocks_in_cache=false
  index_type=kBinarySearch
  hash_index_allow_collision=true
  flush_block_policy_factory=FlushBlockBySizePolicyFactory
back to top