Revision a7061474f905fe60a9581dbf4e14431d00b130a2 authored by Felix GV on 12 December 2015, 01:26:55 UTC, committed by Felix GV on 15 December 2015, 02:45:06 UTC
Summary: This new mode provides the capability of pushing to multiple clusters with different number of nodes and different partition assignments. Compatibility: Although this new mode only works if both the BnP job and the Voldemort servers are upgraded, the change can be rolled out gradually without breaking anything. There is a negotiation phase at the beginning of the BnP job which determines if all servers of all clusters are capable and willing (i.e.: configured) of using the new mode. If not all servers are upgraded and enabled, then the BnP job falls back to its old behavior. Likewise, if a server gets a fetch request from a non-upgraded BnP job, it will work just like before. By default, servers answer the negotiation by saying they support the new mode. The old behavior can be forced with the following server-side configuration: readonly.build.primary.replicas.only=false Running in this new mode has several implications: 1. When running in the new mode, store files are stored in the BnP output directory under nested partition directories, rather than in nested node directories. 2. The MR job uses half as many reducers and half as much shuffle bandwidth compared to before. 3. The meta checksum is now done per partition, rather than per node. 4. Instead of having one .metadata file per partition, there is now only a single full-store.metadata file at the root of the output directory. 5. The server-side HdfsFetcher code inspects the metadata file and determines if it should operate in 'build.primary.replicas.only' mode or not. If yes, then the server determines which partitions it needs to fetch on its own, rather than relying on what the BnP job placed in a node-specific directory. 6. The replica type number contained in Read-Only V2 file names is now useless, but we are keeping it in there just to avoid unnecessary changes. 7. 
When initializing a Read-Only V2 store directory, the server now looks for files named with the incorrect replica type, and if it finds any, it renames them to the replica type expected by this server. Other changes: 1. Added socket port to Node's toString functions. Also made the output of the Node's toString(), briefToString() and getStateString() functions more consistent. 2. Introduced new Protobuf message for the GetConfig admin request. This new message is intended to be a generic way to retrieve any of server config. 3. Refactored VoldemortConfig to provide access to any config by its string key. Also cleaned up a lot of hard-coded strings, which are constants now. 4. Various minor refactorings in BnP code.
1 parent 69fcd3f
rebalance-new-zoned-cluster.sh
#!/bin/bash
#
# Copyright 2013 LinkedIn, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Exit on the first failing command. Set here rather than in the shebang so
# the option is also honored when the script is run as 'bash <script>'.
set -e

# This script generates a final-cluster.xml for spinning up a new cluster.
# Argument = -c current_cluster -s current_stores -o output_dir
# The final cluster is placed in output_dir/
# This script uses getopts which means only single character switches are allowed.
# Using getopt would allow for multi character switch names but would come at a
# cost of not being cross compatible.
# Function to display usage
# Print an optional error message followed by the usage text, then exit 1.
# All output goes to stderr, since this is diagnostic text.
# Arguments: $1 - (optional) error description. The -h flag calls this
#                 function with no argument, so only print the ERROR line
#                 when a message was actually supplied (the original printed
#                 a bare "ERROR: ." in that case).
usage_and_exit() {
  if [ -n "${1:-}" ]; then
    echo "ERROR: $1." >&2
  fi
  cat <<EOF >&2
Usage: $0 options
OPTIONS:
-h Show this message
-c Current cluster that describes the cluster.
-s Current stores that describes the store. If you do not have info about the stores yet, look
under 'voldemort_home/config/tools/' for some store examples.
-o Output dir where all interim and final files will be stored.
The directory will be created if it does not exist yet.
EOF
  exit 1
}
# Start every option value out empty so missing arguments can be detected
# after option parsing.
current_cluster=""
current_stores=""
output_dir=""

# Locate the directory holding this script, then step up one level to get
# the Voldemort home directory.
dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
vold_home="$(dirname "$dir")"
# Parse options.
# NOTE: the option string must use plain ASCII double quotes; the original
# used typographic quotes (“hc:s:o:”), which getopts treated as literal
# option characters, so real options fell through to the '?' branch.
while getopts "hc:s:o:" OPTION
do
  case "$OPTION" in
    h)
      usage_and_exit
      ;;
    c)
      current_cluster="$OPTARG"
      echo "[rebalance-new-zoned-cluster] Will rebalance on the cluster described in '$current_cluster'."
      ;;
    s)
      current_stores="$OPTARG"
      echo "[rebalance-new-zoned-cluster] Will rebalance on the stores described in '$current_stores'."
      ;;
    o)
      output_dir="$OPTARG"
      mkdir -p "$output_dir"
      echo "[rebalance-new-zoned-cluster] Using '$output_dir' for all interim and final files generated."
      ;;
    ?)
      usage_and_exit
      ;;
  esac
done
# All three options are mandatory; bail out with usage if any is missing.
# (usage_and_exit terminates the script, so no explicit 'exit' is needed.)
if [[ -z "$current_cluster" ]] || [[ -z "$current_stores" ]] || [[ -z "$output_dir" ]]
then
  printf "\n"
  echo "[rebalance-new-zoned-cluster] Missing argument. Check again."
  usage_and_exit
fi
# Both input files must exist before we hand them to the repartitioner.
# Expansions are quoted so empty values or paths with spaces do not break
# the test expressions.
if [ ! -e "$current_cluster" ]; then
  usage_and_exit "File '$current_cluster' does not exist."
fi
if [ ! -e "$current_stores" ]; then
  usage_and_exit "File '$current_stores' does not exist."
fi
# The final cluster.xml for new cluster is generated in three steps.
# Step 1 : Repartitioner is executed to limit the max contiguous partition. Four such
# runs are attempted and the best cluster.xml from this step is piped to step 2.
# Step 2: Cluster.xml from step 1 is fed to the repartitioner along with random swap
# attempts. The repartitioner randomly swaps the partitions and tries to balance the ring.
# Step 3: Similar to step1, to improve the balance.
#
# Random-swap budget for the step 2 repartitioner run; marked readonly since
# these are tuning constants and are never reassigned below.
readonly step2_swap_attempts=1000
readonly step2_swap_successes=1000
# Step 1: repartition to limit the max number of contiguous partitions.
mkdir -p "$output_dir/step1/"
"$vold_home/bin/run-class.sh" voldemort.tools.RepartitionerCLI \
  --current-cluster "$current_cluster" \
  --current-stores "$current_stores" \
  --max-contiguous-partitions 3 \
  --attempts 4 \
  --output-dir "$output_dir/step1/"
if [ ! -e "$output_dir/step1/final-cluster.xml" ]; then
  # The original message interpolated the undefined variable '$final',
  # which rendered as "File '-cluster.xml' does not exist."
  usage_and_exit "File '$output_dir/step1/final-cluster.xml' does not exist."
fi
# Step 2: feed step 1's output to the repartitioner with random swaps to
# balance the ring.
mkdir -p "$output_dir/step2"
"$vold_home/bin/run-class.sh" voldemort.tools.RepartitionerCLI \
  --current-cluster "$output_dir/step1/final-cluster.xml" \
  --current-stores "$current_stores" \
  --output-dir "$output_dir/step2/" \
  --enable-random-swaps \
  --random-swap-attempts "$step2_swap_attempts" \
  --random-swap-successes "$step2_swap_successes"
if [ ! -e "$output_dir/step2/final-cluster.xml" ]; then
  # The original message interpolated the undefined variable '$final',
  # which rendered as "File '-cluster.xml' does not exist."
  usage_and_exit "File '$output_dir/step2/final-cluster.xml' does not exist."
fi
# Step 3: another contiguous-partition cleanup pass, like step 1, to
# further improve the balance.
mkdir -p "$output_dir/step3/"
"$vold_home/bin/run-class.sh" voldemort.tools.RepartitionerCLI \
  --current-cluster "$output_dir/step2/final-cluster.xml" \
  --current-stores "$current_stores" \
  --max-contiguous-partitions 3 \
  --attempts 4 \
  --output-dir "$output_dir/step3/"
# NOTE: the original had a stray trailing backslash after --output-dir,
# which made the following echo line part of the RepartitionerCLI command
# (the echo never ran, and the CLI received its text as extra arguments).
echo "[rebalance-new-zoned-cluster] Placing final-cluster.xml in '$output_dir'"
cp "$output_dir/step3/final-cluster.xml" "$output_dir/final-cluster.xml"
echo "[rebalance-new-zoned-cluster] Placing final-cluster.xml.analysis in '$output_dir'"
cp "$output_dir/step2/final-cluster.xml.analysis" "$output_dir/final-cluster.xml.analysis"
Computing file changes ...