0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-11-24 16:46:00 +01:00

SERVER-85390 Retry conflicting index build during oplog application (#25891)

GitOrigin-RevId: b82a13bfca57c2c06c5508770603306554d7d3cc
This commit is contained in:
Shin Yee Tan 2024-08-14 07:43:12 -07:00 committed by MongoDB Bot
parent db7a8e109c
commit db80281e66
6 changed files with 95 additions and 9 deletions

View File

@ -0,0 +1,66 @@
/**
* Tests if there is an index build conflict after a node steps down and the existing index build
* fails, that the index build from the new primary is not missing index build on the old
* primary/secondary. The index build conflicts when the new primary independently creates an index
* with the same name as the old primary.
*
* @tags: [
* requires_replication,
* ]
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {IndexBuildTest} from "jstests/noPassthrough/libs/index_build.js";
// The index build conflict is only an issue when oplogApplicationEnforcesSteadyStateConstraints is
// false. This is false by default outside of our testing.
const rst = new ReplSetTest({
nodes: 3,
nodeOptions: {setParameter: {oplogApplicationEnforcesSteadyStateConstraints: false}}
});
rst.startSet();
rst.initiate();
const dbName = 'test';
const collName = 'coll';
const primary = rst.getPrimary();
const primaryDB = primary.getDB(dbName);
const primaryColl = primaryDB.getCollection(collName);
assert.commandWorked(primaryColl.insert({a: 1}));
rst.awaitReplication();
const secondary = rst.getSecondary();
const secondaryDB = secondary.getDB(dbName);
const secondaryColl = secondaryDB.getCollection(collName);
const hangFpOnSetup = configureFailPoint(primary, 'hangIndexBuildOnSetupBeforeTakingLocks');
const hangFpOnConflict = configureFailPoint(primary, 'hangAfterIndexBuildConflict');
jsTestLog("Starting index build");
let awaitIndexBuild = IndexBuildTest.startIndexBuild(
primary, primaryColl.getFullName(), {a: 1}, null, [ErrorCodes.InterruptedDueToReplStateChange]);
jsTestLog("Waiting for primary to register the index build");
hangFpOnSetup.wait();
jsTestLog("Stepping up the secondary");
assert.commandWorked(secondary.adminCommand({replSetStepUp: 1}));
rst.waitForState(secondary, ReplSetTest.State.PRIMARY);
jsTestLog("Waiting for new primary to start index build with the same name");
let awaitSecondaryIndexBuild =
IndexBuildTest.startIndexBuild(secondary, secondaryColl.getFullName(), {a: 1}, null, null, 2);
IndexBuildTest.waitForIndexBuildToStart(primaryDB, collName, "a_1");
// Wait for the index builds to conflict on the old primary.
hangFpOnConflict.wait();
// Allow first index build to be cleaned up and index build should no longer have conflict.
hangFpOnSetup.off();
awaitIndexBuild();
hangFpOnConflict.off();
awaitSecondaryIndexBuild();
rst.stopSet();

View File

@ -98,14 +98,14 @@ void ActiveIndexBuilds::assertNoIndexBuildInProgress() const {
}
}
void ActiveIndexBuilds::waitUntilAnIndexBuildFinishes(OperationContext* opCtx) {
void ActiveIndexBuilds::waitUntilAnIndexBuildFinishes(OperationContext* opCtx, Date_t deadline) {
stdx::unique_lock<Latch> lk(_mutex);
if (_allIndexBuilds.empty()) {
return;
}
const auto generation = _indexBuildsCompletedGen;
opCtx->waitForConditionOrInterrupt(
_indexBuildsCondVar, lk, [&] { return _indexBuildsCompletedGen != generation; });
opCtx->waitForConditionOrInterruptUntil(
_indexBuildsCondVar, lk, deadline, [&] { return _indexBuildsCompletedGen != generation; });
}
void ActiveIndexBuilds::sleepIndexBuilds_forTestOnly(bool sleep) {

View File

@ -85,7 +85,7 @@ public:
void assertNoIndexBuildInProgress() const;
void waitUntilAnIndexBuildFinishes(OperationContext* opCtx);
void waitUntilAnIndexBuildFinishes(OperationContext* opCtx, Date_t deadline);
void sleepIndexBuilds_forTestOnly(bool sleep);

View File

@ -2126,8 +2126,9 @@ void IndexBuildsCoordinator::awaitNoBgOpInProgForDb(OperationContext* opCtx,
activeIndexBuilds.awaitNoBgOpInProgForDb(opCtx, dbName);
}
void IndexBuildsCoordinator::waitUntilAnIndexBuildFinishes(OperationContext* opCtx) {
activeIndexBuilds.waitUntilAnIndexBuildFinishes(opCtx);
void IndexBuildsCoordinator::waitUntilAnIndexBuildFinishes(OperationContext* opCtx,
Date_t deadline) {
activeIndexBuilds.waitUntilAnIndexBuildFinishes(opCtx, deadline);
}
void IndexBuildsCoordinator::appendBuildInfo(const UUID& buildUUID, BSONObjBuilder* builder) const {

View File

@ -486,10 +486,10 @@ public:
void awaitNoBgOpInProgForDb(OperationContext* opCtx, const DatabaseName& dbName);
/**
* Waits until an index build completes. If there are no index builds in progress, returns
* immediately.
* Waits until an index build completes or the deadline expires. If there are no index builds in
* progress, returns immediately.
*/
void waitUntilAnIndexBuildFinishes(OperationContext* opCtx);
void waitUntilAnIndexBuildFinishes(OperationContext* opCtx, Date_t deadline = Date_t::max());
/**

View File

@ -175,6 +175,8 @@ namespace {
// Failpoint to block after a write and its oplog entry have been written to the storage engine and
// are visible, but before we have advanced 'lastApplied' for the write.
MONGO_FAIL_POINT_DEFINE(hangBeforeLogOpAdvancesLastApplied);
// Failpoint to block oplog application after receiving an IndexBuildAlreadyInProgress error.
MONGO_FAIL_POINT_DEFINE(hangAfterIndexBuildConflict);
void abortIndexBuilds(OperationContext* opCtx,
const OplogEntry::CommandType& commandType,
@ -2499,6 +2501,23 @@ Status applyCommand_inlock(OperationContext* opCtx,
"command",
opObj,
status);
// Even though steady state constraints are not enforced, we need to retry index
// builds that conflict in case the existing index build fails.
if (status.code() == ErrorCodes::IndexBuildAlreadyInProgress) {
if (MONGO_unlikely(hangAfterIndexBuildConflict.shouldFail())) {
LOGV2(8539001, "Hanging due to hangAfterIndexBuildConflict failpoint");
hangAfterIndexBuildConflict.pauseWhileSet();
}
auto deadline = Date_t::now() + Milliseconds(1000);
LOGV2(8539000,
"Waiting for existing index build to complete before retrying the "
"conflicting operation");
IndexBuildsCoordinator::get(opCtx)->waitUntilAnIndexBuildFinishes(opCtx,
deadline);
break;
}
} else {
LOGV2_DEBUG(51776,
1,