0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 09:32:32 +01:00

SERVER-57262 Relax vote constraint to allow voting for candidates with newer config.

This commit is contained in:
Wenbin Zhu 2021-08-02 19:52:21 +00:00 committed by Evergreen Agent
parent a33a04b618
commit e793725604
5 changed files with 151 additions and 15 deletions

View File

@ -214,6 +214,8 @@ last-lts:
test_file: jstests/core/timeseries/timeseries_bucket_drop.js
- ticket: SERVER-56800
test_file: jstests/sharding/cwwc_conflict_add_shard.js
- ticket: SERVER-57262
test_file: jstests/replsets/catchup_takeover_with_higher_config.js
# Tests that should only be excluded from particular suites should be listed under that suite.
suites:

View File

@ -0,0 +1,125 @@
/**
* Test that when a primary is blocked in drain mode, catchup takeover can work even
* if the primary has a lower config than the takeover node. The test starts a 3-node
* replica set and then steps up node1 but blocks it in drain mode before it can bump
* the config term. Next it steps up node2 and also blocks it in drain mode and later
* unblocks node1 to let it finish config term bump so that it has higher config than
* node2. Eventually after catchUpTakeoverDelayMillis has passed, node1 should be able
* get the vote from node2 which has a lower config, and finish the catchup takeover.
*/
(function() {
'use strict';
load("jstests/libs/write_concern_util.js");
load('jstests/replsets/libs/election_metrics.js');
// Get the current config from the node and compare it with the provided config.
const getNodeConfigAndCompare = function(node, config, cmp) {
const currentConfig = assert.commandWorked(node.adminCommand({replSetGetConfig: 1})).config;
if (cmp === '=') {
return currentConfig.term === config.term && currentConfig.version === config.version;
} else if (cmp === '>') {
return currentConfig.term > config.term ||
(currentConfig.term === config.term && currentConfig.version > config.version);
} else if (cmp === '<') {
return currentConfig.term < config.term ||
(currentConfig.term === config.term && currentConfig.version < config.version);
} else {
assert(false);
}
};
// Wait for all nodes to acknowledge that the node at nodeIndex is in PRIMARY state.
// A node may see multiple nodes claiming primary, but only checks the provided one.
const waitForPrimaryState = function(nodes, nodeIndex, timeout) {
assert.soon(() => {
for (const node of nodes) {
const status = assert.commandWorked(node.adminCommand({replSetGetStatus: 1}));
if (status.members[nodeIndex].state !== ReplSetTest.State.PRIMARY) {
return false;
}
}
return true;
}, `Failed to wait for primary state for node ${nodes[nodeIndex].host}`, timeout);
};
const replSet = new ReplSetTest({name: jsTestName(), nodes: 3});
const nodes = replSet.startSet();
let config = replSet.getReplSetConfig();
// Prevent nodes from syncing from other secondaries.
config.settings = {
chainingAllowed: false,
};
replSet.initiateWithHighElectionTimeout(config);
replSet.awaitReplication();
assert.eq(replSet.getPrimary(), nodes[0]);
// Failpoint to hang node1 before the automatic reconfig on stepup bumps the config term.
const hangBeforeTermBumpFpNode1 = configureFailPoint(nodes[1], "hangBeforeReconfigOnDrainComplete");
const initialConfig = assert.commandWorked(nodes[0].adminCommand({replSetGetConfig: 1})).config;
// Stepup node1 and wait to hang before bumping the config term.
assert.commandWorked(nodes[1].adminCommand({replSetStepUp: 1}));
hangBeforeTermBumpFpNode1.wait();
// Wait for all nodes to acknowledge that node1 has become primary.
jsTestLog(`Waiting for all nodes to agree on ${nodes[1].host} being primary`);
replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes, nodes[1]);
// Check that the failpoint worked and the config has not changed.
assert(getNodeConfigAndCompare(nodes[1], initialConfig, '='));
// Stepup node2 and wait to hang before bumping the config term as well.
const hangBeforeTermBumpFpNode2 = configureFailPoint(nodes[2], "hangBeforeReconfigOnDrainComplete");
assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1}));
hangBeforeTermBumpFpNode2.wait();
// Wait for all nodes to acknowledge that node2 has become primary. Cannot use
// awaitNodesAgreeOnPrimary() or getPrimary() here which do not allow a node to
// see multiple primaries.
jsTestLog(`Waiting for all nodes to agree on ${nodes[2].host} being primary`);
waitForPrimaryState(nodes, 2, 30 * 1000);
// Wait for node0 to change its sync source to node2. Later when the failpoint on node 1
// is lifted, it will do a no-op write and finish the stepup process, so its lastApplied
// opTime will be greater than the other two nodes. By waiting for sync source change we
// make sure node0 will not pull new entries from node1, making node1 the only eligible
// candidate to catchup takeover node2.
assert.soon(() => {
const status = assert.commandWorked(nodes[0].adminCommand({replSetGetStatus: 1}));
return status.syncSourceHost === nodes[2].host;
});
const statusBeforeTakeover = assert.commandWorked(nodes[1].adminCommand({serverStatus: 1}));
// Lift the failpoint on node1 to let it finish reconfig and bump the config term.
hangBeforeTermBumpFpNode1.off();
jsTestLog(
`Waiting for ${nodes[1].host} to finish config term bump and propagate to ${nodes[0].host}`);
assert.soon(() => getNodeConfigAndCompare(nodes[0], initialConfig, '>'));
assert.soon(() => getNodeConfigAndCompare(nodes[1], initialConfig, '>'));
// Check that node2 is still in catchup mode, so it cannot install a new config.
assert(getNodeConfigAndCompare(nodes[2], initialConfig, '='));
// Wait for node1 to catchup takeover node2 after the default catchup takeover delay.
jsTestLog(`Waiting for ${nodes[1].host} to catchup takeover ${nodes[2].host}`);
waitForPrimaryState(nodes, 1, 60 * 1000);
// Check again that node2 is still in catchup mode and has not installed a new config.
assert(getNodeConfigAndCompare(nodes[2], initialConfig, '='));
// Lift the failpoint on node2 and wait for all nodes to see node1 as the only primary.
hangBeforeTermBumpFpNode2.off();
replSet.awaitNodesAgreeOnPrimary(replSet.kDefaultTimeoutMS, nodes, nodes[1]);
// Check that election metrics has been updated with the new reason counter.
const statusAfterTakeover = assert.commandWorked(nodes[1].adminCommand({serverStatus: 1}));
verifyServerStatusElectionReasonCounterChange(statusBeforeTakeover.electionMetrics,
statusAfterTakeover.electionMetrics,
"catchUpTakeover",
1);
replSet.stopSet();
})();

View File

@ -132,7 +132,9 @@ MONGO_FAIL_POINT_DEFINE(skipDurableTimestampUpdates);
MONGO_FAIL_POINT_DEFINE(omitConfigQuorumCheck);
// Will cause signal drain complete to hang right before acquiring the RSTL.
MONGO_FAIL_POINT_DEFINE(hangBeforeRSTLOnDrainComplete);
// Will cause signal drain complete to hang after reconfig
// Will cause signal drain complete to hang before reconfig.
MONGO_FAIL_POINT_DEFINE(hangBeforeReconfigOnDrainComplete);
// Will cause signal drain complete to hang after reconfig.
MONGO_FAIL_POINT_DEFINE(hangAfterReconfigOnDrainComplete);
MONGO_FAIL_POINT_DEFINE(doNotRemoveNewlyAddedOnHeartbeats);
// Will hang right after setting the currentOp info associated with an automatic reconfig.
@ -1160,6 +1162,10 @@ void ReplicationCoordinatorImpl::signalDrainComplete(OperationContext* opCtx,
lk.unlock();
if (needBumpConfigTerm) {
if (MONGO_unlikely(hangBeforeReconfigOnDrainComplete.shouldFail())) {
LOGV2(5726200, "Hanging due to hangBeforeReconfigOnDrainComplete failpoint");
hangBeforeReconfigOnDrainComplete.pauseWhileSet(opCtx);
}
// We re-write the term but keep version the same. This conceptually a no-op
// in the config consensus group, analogous to writing a new oplog entry
// in Raft log state machine on step up.

View File

@ -3268,9 +3268,9 @@ void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesAr
return;
}
if (args.getConfigVersionAndTerm() != _rsConfig.getConfigVersionAndTerm()) {
if (args.getConfigVersionAndTerm() < _rsConfig.getConfigVersionAndTerm()) {
response->setVoteGranted(false);
response->setReason("candidate's config with {} differs from mine with {}"_format(
response->setReason("candidate's config with {} is older than mine with {}"_format(
args.getConfigVersionAndTerm(), _rsConfig.getConfigVersionAndTerm()));
} else if (args.getTerm() < _term) {
response->setVoteGranted(false);
@ -3292,8 +3292,15 @@ void TopologyCoordinator::processReplSetRequestVotes(const ReplSetRequestVotesAr
_rsConfig.getMemberAt(_lastVote.getCandidateIndex()).getHostAndPort(),
_lastVote.getTerm()));
} else {
bool isSameConfig = args.getConfigVersionAndTerm() == _rsConfig.getConfigVersionAndTerm();
int betterPrimary = _findHealthyPrimaryOfEqualOrGreaterPriority(args.getCandidateIndex());
if (_selfConfig().isArbiter() && betterPrimary >= 0) {
// Do not grant vote if we are arbiter and can see a healthy primary of greater or equal
// priority, to prevent primary flapping when there are two nodes that can't talk to each
// other but we that can talk to both as arbiter. We only do this if the voter's config
// is same as ours, otherwise the primary information might be stale and we might not be
// arbiter in the candidate's newer config. We might also hit an invariant described in
// SERVER-46387 without the check for same config.
if (isSameConfig && _selfConfig().isArbiter() && betterPrimary >= 0) {
response->setVoteGranted(false);
response->setReason(
"can see a healthy primary ({}) of equal or greater priority"_format(

View File

@ -3261,7 +3261,7 @@ public:
TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigVersionIsLower) {
auto response = testWithArbiter(false, 1, 2);
ASSERT_EQUALS(
"candidate's config with {version: 1, term: 2} differs from mine with"
"candidate's config with {version: 1, term: 2} is older than mine with"
" {version: 2, term: 2}",
response.getReason());
}
@ -3269,7 +3269,7 @@ TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigVersionIs
TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigVersionIsLower) {
auto response = testWithArbiter(true, 1, 2);
ASSERT_EQUALS(
"candidate's config with {version: 1, term: 2} differs from mine with"
"candidate's config with {version: 1, term: 2} is older than mine with"
" {version: 2, term: 2}",
response.getReason());
}
@ -3277,7 +3277,7 @@ TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigVersionIsL
TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigTermIsLower) {
auto response = testWithArbiter(false, 2, 1);
ASSERT_EQUALS(
"candidate's config with {version: 2, term: 1} differs from mine with"
"candidate's config with {version: 2, term: 1} is older than mine with"
" {version: 2, term: 2}",
response.getReason());
}
@ -3285,7 +3285,7 @@ TEST_F(ConfigTermAndVersionVoteTest, DataNodeDoesNotGrantVoteWhenConfigTermIsLow
TEST_F(ConfigTermAndVersionVoteTest, ArbiterDoesNotGrantVoteWhenConfigTermIsLower) {
auto response = testWithArbiter(true, 2, 1);
ASSERT_EQUALS(
"candidate's config with {version: 2, term: 1} differs from mine with"
"candidate's config with {version: 2, term: 1} is older than mine with"
" {version: 2, term: 2}",
response.getReason());
}
@ -3453,7 +3453,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantDryRunVoteWhenConfigVersionIsLower) {
getTopoCoord().processReplSetRequestVotes(args, &response);
ASSERT_EQUALS(
"candidate's config with {version: 0, term: 1} differs from mine with {version: 1, term: "
"candidate's config with {version: 0, term: 1} is older than mine with {version: 1, term: "
"1}",
response.getReason());
ASSERT_EQUALS(1, response.getTerm());
@ -3508,7 +3508,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantDryRunVoteWhenTermIsStale) {
ASSERT_FALSE(response.getVoteGranted());
}
TEST_F(TopoCoordTest, NodeDoesNotGrantVoteWhenTermIsHigherButConfigVersionIsLower) {
TEST_F(TopoCoordTest, NodeGrantsVoteWhenTermIsHigherButConfigVersionIsLower) {
updateConfig(BSON("_id"
<< "rs0"
<< "version" << 2 << "term" << 1LL << "members"
@ -3539,11 +3539,7 @@ TEST_F(TopoCoordTest, NodeDoesNotGrantVoteWhenTermIsHigherButConfigVersionIsLowe
getTopoCoord().processReplSetRequestVotes(args, &response);
// Candidates config(t, v) is (2, 1) and our config is (1, 2). Even though the candidate's
// config version is lower, we grant our vote because the candidate's config term is higher.
ASSERT_FALSE(response.getVoteGranted());
ASSERT_EQ(
"candidate's config with {version: 1, term: 2} differs from mine with {version: 2, term: "
"1}",
response.getReason());
ASSERT_TRUE(response.getVoteGranted());
}
TEST_F(TopoCoordTest, GrantDryRunVoteEvenWhenTermHasBeenSeen) {