Revision 499ebb3ab5ea4207950fc95acf102b8f58add1c5 authored by Maysam Yabandeh on 24 June 2017, 21:06:43 UTC, committed by Facebook Github Bot on 24 June 2017, 21:11:29 UTC
Summary: Throughput: 46k tps in our sysbench settings (filling the details later) The idea is to have the simplest change that gives us a reasonable boost in 2PC throughput. Major design changes: 1. The WAL file internal buffer is not flushed after each write. Instead it is flushed before critical operations (WAL copy via fs) or when FlushWAL is called by MySQL. Flushing the WAL buffer is also protected via mutex_. 2. Use two sequence numbers: last seq, and last seq for write. Last seq is the last visible sequence number for reads. Last seq for write is the next sequence number that should be used to write to WAL/memtable. This allows to have a memtable write be in parallel to WAL writes. 3. BatchGroup is not used for writes. This means that we can have parallel writers which changes a major assumption in the code base. To accommodate for that i) allow only 1 WriteImpl that intends to write to memtable via mem_mutex_--which is fine since in 2PC almost all of the memtable writes come via group commit phase which is serial anyway, ii) make all the parts in the code base that assumed to be the only writer (via EnterUnbatched) to also acquire mem_mutex_, iii) stat updates are protected via a stat_mutex_. Note: the first commit has the approach figured out but is not clean. Submitting the PR anyway to get the early feedback on the approach. If we are ok with the approach I will go ahead with this updates: 0) Rebase with Yi's pipelining changes 1) Currently batching is disabled by default to make sure that it will be consistent with all unit tests. Will make this optional via a config. 2) A couple of unit tests are disabled. They need to be updated with the serial commit of 2PC taken into account. 3) Replacing BatchGroup with mem_mutex_ got a bit ugly as it requires releasing mutex_ beforehand (the same way EnterUnbatched does). This needs to be cleaned up. Closes https://github.com/facebook/rocksdb/pull/2345 Differential Revision: D5210732 Pulled By: maysamyabandeh fbshipit-source-id: 78653bd95a35cd1e831e555e0e57bdfd695355a4
1 parent 0ac4afb
rocksdb_option_file_example.ini
# This is a RocksDB option file.
#
# A typical RocksDB options file has four sections, which are
# Version section, DBOptions section, at least one CFOptions
# section, and one TableOptions section for each column family.
# The RocksDB options file in general follows the basic INI
# file format with the following extensions / modifications:
#
# * Escaped characters
# We escaped the following characters:
# - \n -- line feed - new line
# - \r -- carriage return
# - \\ -- backslash \
# - \: -- colon symbol :
# - \# -- hash tag #
# * Comments
# We support # style comments. Comments can appear at the ending
# part of a line.
# * Statements
# A statement is of the form option_name = value.
# Each statement contains a '=', where extra white-spaces
# are supported. However, we don't support multi-lined statement.
# Furthermore, each line can only contain at most one statement.
# * Sections
# Sections are of the form [SecitonTitle "SectionArgument"],
# where section argument is optional.
# * List
# We use colon-separated string to represent a list.
# For instance, n1:n2:n3:n4 is a list containing four values.
#
# Below is an example of a RocksDB options file:
[Version]
rocksdb_version=4.3.0
options_file_version=1.1
[DBOptions]
stats_dump_period_sec=600
max_manifest_file_size=18446744073709551615
bytes_per_sync=8388608
delayed_write_rate=2097152
WAL_ttl_seconds=0
WAL_size_limit_MB=0
max_subcompactions=1
wal_dir=
wal_bytes_per_sync=0
db_write_buffer_size=0
keep_log_file_num=1000
table_cache_numshardbits=4
max_file_opening_threads=1
writable_file_max_buffer_size=1048576
random_access_max_buffer_size=1048576
use_fsync=false
max_total_wal_size=0
max_open_files=-1
skip_stats_update_on_db_open=false
max_background_compactions=16
manifest_preallocation_size=4194304
max_background_flushes=7
is_fd_close_on_exec=true
max_log_file_size=0
advise_random_on_open=true
create_missing_column_families=false
paranoid_checks=true
delete_obsolete_files_period_micros=21600000000
log_file_time_to_roll=0
compaction_readahead_size=0
create_if_missing=false
use_adaptive_mutex=false
enable_thread_tracking=false
allow_fallocate=true
error_if_exists=false
recycle_log_file_num=0
skip_log_error_on_recovery=false
db_log_dir=
new_table_reader_for_compaction_inputs=true
allow_mmap_reads=false
allow_mmap_writes=false
use_direct_reads=false
use_direct_writes=false
[CFOptions "default"]
compaction_style=kCompactionStyleLevel
compaction_filter=nullptr
num_levels=6
table_factory=BlockBasedTable
comparator=leveldb.BytewiseComparator
max_sequential_skip_in_iterations=8
soft_rate_limit=0.000000
max_bytes_for_level_base=1073741824
memtable_prefix_bloom_probes=6
memtable_prefix_bloom_bits=0
memtable_prefix_bloom_huge_page_tlb_size=0
max_successive_merges=0
arena_block_size=16777216
min_write_buffer_number_to_merge=1
target_file_size_multiplier=1
source_compaction_factor=1
max_bytes_for_level_multiplier=8
max_bytes_for_level_multiplier_additional=2:3:5
compaction_filter_factory=nullptr
max_write_buffer_number=8
level0_stop_writes_trigger=20
compression=kSnappyCompression
level0_file_num_compaction_trigger=4
purge_redundant_kvs_while_flush=true
max_write_buffer_number_to_maintain=0
memtable_factory=SkipListFactory
max_grandparent_overlap_factor=8
expanded_compaction_factor=25
hard_pending_compaction_bytes_limit=137438953472
inplace_update_num_locks=10000
level_compaction_dynamic_level_bytes=true
level0_slowdown_writes_trigger=12
filter_deletes=false
verify_checksums_in_compaction=true
min_partial_merge_operands=2
paranoid_file_checks=false
target_file_size_base=134217728
optimize_filters_for_hits=false
merge_operator=PutOperator
compression_per_level=kNoCompression:kNoCompression:kNoCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression
compaction_measure_io_stats=false
prefix_extractor=nullptr
bloom_locality=0
write_buffer_size=134217728
disable_auto_compactions=false
inplace_update_support=false
[TableOptions/BlockBasedTable "default"]
format_version=2
whole_key_filtering=true
no_block_cache=false
checksum=kCRC32c
filter_policy=rocksdb.BuiltinBloomFilter
block_size_deviation=10
block_size=8192
block_restart_interval=16
cache_index_and_filter_blocks=false
pin_l0_filter_and_index_blocks_in_cache=false
index_type=kBinarySearch
hash_index_allow_collision=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory
Computing file changes ...