Revision 8e12638f3d0d91791cf06253493b8b15827f4b6c authored by sdong on 19 August 2019, 17:50:25 UTC, committed by Facebook Github Bot on 19 August 2019, 17:51:59 UTC
Summary:
Atomic white box test's kill odd is the same as normal test. However, in the scenario that only WritableFileWriter::Append() is blacklisted, WritableFileWriter::Flush() dominates the killing odds. Normally, most of WritableFileWriter::Flush() are called in WAL writes, where every write triggers a WAL flush. In atomic test, WAL is disabled, so the kill happens less frequently than we antipated. In some rare cases, the kill didn't end up with happening (for reasons I still don't fully understand) and cause the stress test timeout.

If WAL is disabled, make the odds 5x likely to trigger.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5717

Test Plan: Run whitebox_crash_test_with_atomic_flush and whitebox_crash_test and observe the kill odds printed out.

Differential Revision: D16897237

fbshipit-source-id: cbf5d96f6fc0e980523d0f1f94bf4e72cdb82d1c
1 parent e1c468d
Raw File
error_handler.h
//  Copyright (c) 2018-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
#pragma once

#include "monitoring/instrumented_mutex.h"
#include "options/db_options.h"
#include "rocksdb/listener.h"
#include "rocksdb/status.h"

namespace rocksdb {

class DBImpl;

class ErrorHandler {
  public:
   ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
                InstrumentedMutex* db_mutex)
       : db_(db),
         db_options_(db_options),
         bg_error_(Status::OK()),
         recovery_error_(Status::OK()),
         db_mutex_(db_mutex),
         auto_recovery_(false),
         recovery_in_prog_(false) {}
   ~ErrorHandler() {}

   void EnableAutoRecovery() { auto_recovery_ = true; }

   Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
                                     Status::Code code,
                                     Status::SubCode subcode);

   Status SetBGError(const Status& bg_err, BackgroundErrorReason reason);

   Status GetBGError() { return bg_error_; }

   Status GetRecoveryError() { return recovery_error_; }

   Status ClearBGError();

   bool IsDBStopped() {
     return !bg_error_.ok() &&
            bg_error_.severity() >= Status::Severity::kHardError;
    }

    bool IsBGWorkStopped() {
      return !bg_error_.ok() &&
             (bg_error_.severity() >= Status::Severity::kHardError ||
              !auto_recovery_);
    }

    bool IsRecoveryInProgress() { return recovery_in_prog_; }

    Status RecoverFromBGError(bool is_manual = false);
    void CancelErrorRecovery();

   private:
    DBImpl* db_;
    const ImmutableDBOptions& db_options_;
    Status bg_error_;
    // A separate Status variable used to record any errors during the
    // recovery process from hard errors
    Status recovery_error_;
    InstrumentedMutex* db_mutex_;
    // A flag indicating whether automatic recovery from errors is enabled
    bool auto_recovery_;
    bool recovery_in_prog_;

    Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery);
    void RecoverFromNoSpace();
};

}
back to top