0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 09:32:32 +01:00

SERVER-44061: Fix race between setting replication maintenance mode and concurrent election

This commit is contained in:
Lingzhi Deng 2019-12-23 16:07:47 +00:00 committed by evergreen
parent 74e2b22bcd
commit d3546ccb50
3 changed files with 16 additions and 5 deletions

View File

@@ -122,7 +122,12 @@ replTest.waitForState(replTest.nodes[2], ReplSetTest.State.RECOVERING);
 jsTestLog("7: Stop and restart Node 2.");
 replTest.stop(2);
-replTest.restart(2);
+replTest.restart(2, {
+// Set the failpoint to fail the transition to maintenance mode once. Make sure transitioning to
+// maintenance mode is resilient to errors (e.g. race with a concurrent election) and will
+// eventually succeed.
+setParameter: {'failpoint.setMaintenanceModeFailsWithNotSecondary': tojson({mode: {times: 1}})}
+});
 jsTestLog(
 "8: Wait for Node 2 to transition to RECOVERING (its oplog should remain stale after restart)");

View File

@@ -313,9 +313,6 @@ void BackgroundSync::_produce() {
 return;
 }
-// Mark yourself as too stale.
-_tooStale = true;
 // Need to take the RSTL in mode X to transition out of SECONDARY.
 auto opCtx = cc().makeOperationContext();
 ReplicationStateTransitionLockGuard transitionGuard(opCtx.get(), MODE_X);
@@ -329,12 +326,18 @@ void BackgroundSync::_produce() {
 auto status = _replCoord->setMaintenanceMode(true);
 if (!status.isOK()) {
 warning() << "Failed to transition into maintenance mode: " << status;
+// Do not mark ourselves too stale on errors so we can try again next time.
+return;
 }
 status = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
 if (!status.isOK()) {
 warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
 << ". Current state: " << _replCoord->getMemberState() << causedBy(status);
+// Do not mark ourselves too stale on errors so we can try again next time.
+return;
 }
+// Mark yourself as too stale.
+_tooStale = true;
 return;
 } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
 {

View File

@@ -99,6 +99,8 @@ namespace repl {
 MONGO_FAIL_POINT_DEFINE(stepdownHangBeforePerformingPostMemberStateUpdateActions);
 MONGO_FAIL_POINT_DEFINE(holdStableTimestampAtSpecificTimestamp);
 MONGO_FAIL_POINT_DEFINE(stepdownHangBeforeRSTLEnqueue);
+// Fail setMaintenanceMode with ErrorCodes::NotSecondary to simulate a concurrent election.
+MONGO_FAIL_POINT_DEFINE(setMaintenanceModeFailsWithNotSecondary);
 // Tracks the number of operations killed on step down.
 Counter64 userOpsKilled;
@@ -2466,7 +2468,8 @@ Status ReplicationCoordinatorImpl::setMaintenanceMode(bool activate) {
 }
 stdx::unique_lock<Latch> lk(_mutex);
-if (_topCoord->getRole() == TopologyCoordinator::Role::kCandidate) {
+if (_topCoord->getRole() == TopologyCoordinator::Role::kCandidate ||
+MONGO_unlikely(setMaintenanceModeFailsWithNotSecondary.shouldFail())) {
 return Status(ErrorCodes::NotSecondary, "currently running for election");
 }