mirror of
https://github.com/mongodb/mongo.git
synced 2024-12-01 09:32:32 +01:00
SERVER-44061: Fix race between setting replication maintenance mode and concurrent election
This commit is contained in:
parent
74e2b22bcd
commit
d3546ccb50
@@ -122,7 +122,12 @@ replTest.waitForState(replTest.nodes[2], ReplSetTest.State.RECOVERING);
|
||||
|
||||
jsTestLog("7: Stop and restart Node 2.");
|
||||
replTest.stop(2);
|
||||
replTest.restart(2);
|
||||
replTest.restart(2, {
|
||||
// Set the failpoint to fail the transition to maintenance mode once. Make sure transitioning to
|
||||
// maintenance mode is resilient to errors (e.g. race with a concurrent election) and will
|
||||
// eventually succeed.
|
||||
setParameter: {'failpoint.setMaintenanceModeFailsWithNotSecondary': tojson({mode: {times: 1}})}
|
||||
});
|
||||
|
||||
jsTestLog(
|
||||
"8: Wait for Node 2 to transition to RECOVERING (its oplog should remain stale after restart)");
|
||||
|
@@ -313,9 +313,6 @@ void BackgroundSync::_produce() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Mark yourself as too stale.
|
||||
_tooStale = true;
|
||||
|
||||
// Need to take the RSTL in mode X to transition out of SECONDARY.
|
||||
auto opCtx = cc().makeOperationContext();
|
||||
ReplicationStateTransitionLockGuard transitionGuard(opCtx.get(), MODE_X);
|
||||
@@ -329,12 +326,18 @@ void BackgroundSync::_produce() {
|
||||
auto status = _replCoord->setMaintenanceMode(true);
|
||||
if (!status.isOK()) {
|
||||
warning() << "Failed to transition into maintenance mode: " << status;
|
||||
// Do not mark ourselves too stale on errors so we can try again next time.
|
||||
return;
|
||||
}
|
||||
status = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
|
||||
if (!status.isOK()) {
|
||||
warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
|
||||
<< ". Current state: " << _replCoord->getMemberState() << causedBy(status);
|
||||
// Do not mark ourselves too stale on errors so we can try again next time.
|
||||
return;
|
||||
}
|
||||
// Mark yourself as too stale.
|
||||
_tooStale = true;
|
||||
return;
|
||||
} else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
|
||||
{
|
||||
|
@@ -99,6 +99,8 @@ namespace repl {
|
||||
MONGO_FAIL_POINT_DEFINE(stepdownHangBeforePerformingPostMemberStateUpdateActions);
|
||||
MONGO_FAIL_POINT_DEFINE(holdStableTimestampAtSpecificTimestamp);
|
||||
MONGO_FAIL_POINT_DEFINE(stepdownHangBeforeRSTLEnqueue);
|
||||
// Fail setMaintenanceMode with ErrorCodes::NotSecondary to simulate a concurrent election.
|
||||
MONGO_FAIL_POINT_DEFINE(setMaintenanceModeFailsWithNotSecondary);
|
||||
|
||||
// Tracks the number of operations killed on step down.
|
||||
Counter64 userOpsKilled;
|
||||
@@ -2466,7 +2468,8 @@ Status ReplicationCoordinatorImpl::setMaintenanceMode(bool activate) {
|
||||
}
|
||||
|
||||
stdx::unique_lock<Latch> lk(_mutex);
|
||||
if (_topCoord->getRole() == TopologyCoordinator::Role::kCandidate) {
|
||||
if (_topCoord->getRole() == TopologyCoordinator::Role::kCandidate ||
|
||||
MONGO_unlikely(setMaintenanceModeFailsWithNotSecondary.shouldFail())) {
|
||||
return Status(ErrorCodes::NotSecondary, "currently running for election");
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user