// Mirror of https://github.com/mongodb/mongo.git
// Synced 2024-11-27 15:06:34 +01:00 (1e76016aa7)
// GitOrigin-RevId: 4f1026a5cae9badd1c9c9ec7a74635f5ca8c8d2e
// 271 lines, 10 KiB, JavaScript
// Tests mongos behavior on stale shard version errors received in a transaction.
//
// @tags: [
//   requires_sharding,
//   uses_transactions,
//   uses_multi_shard_transaction,
//   assumes_balancer_off
// ]
import "jstests/multiVersion/libs/verify_versions.js";

import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
import {findChunksUtil} from "jstests/sharding/libs/find_chunks_util.js";
import {
    assertNoSuchTransactionOnAllShards,
    disableStaleVersionAndSnapshotRetriesWithinTransactions,
    enableStaleVersionAndSnapshotRetriesWithinTransactions,
    kShardOptionsForDisabledStaleShardVersionRetries
} from "jstests/sharding/libs/sharded_transactions_helpers.js";

/**
 * Asserts that each shard owns the expected number of chunks of a namespace.
 *
 * @param st Sharding fixture; read through `st.s` (used to reach the "config" database) and
 *     `st.shard<i>.shardName`.
 * @param ns Namespace whose chunks are counted via findChunksUtil.countChunksForNs.
 * @param chunks Array where chunks[i] is the expected number of chunks on `st.shard<i>`.
 */
function expectChunks(st, ns, chunks) {
    // The config database handle does not depend on the shard index, so fetch it once.
    const configDB = st.s.getDB("config");
    chunks.forEach((expectedCount, i) => {
        const actualCount =
            findChunksUtil.countChunksForNs(configDB, ns, {shard: st[`shard${i}`].shardName});
        assert.eq(expectedCount, actualCount, `unexpected number of chunks on shard ${i}`);
    });
}

// Database, collection and full namespace of the first collection used by the test.
const dbName = "test";
const collName = "foo";
const ns = `${dbName}.${collName}`;

// Disable checking for index consistency to ensure that the config server doesn't trigger a
// StaleShardVersion exception on shards and cause them to refresh their sharding metadata.
const configOptions = {
    setParameter: {enableShardedIndexConsistencyCheck: false}
};

// Cluster with three shards and two routers; the balancer is turned off.
const st = new ShardingTest({
    shards: 3,
    mongos: 2,
    other: {
        shardOptions: kShardOptionsForDisabledStaleShardVersionRetries,
        configOptions,
        enableBalancer: false
    }
});

enableStaleVersionAndSnapshotRetriesWithinTransactions(st);

// Disable the best-effort recipient metadata refresh after migrations to simplify simulating
// stale shard version errors. The failpoint is enabled on the primary of every shard.
for (const shardRs of [st.rs0, st.rs1, st.rs2]) {
    assert.commandWorked(shardRs.getPrimary().adminCommand(
        {configureFailPoint: "migrationRecipientFailPostCommitRefresh", mode: "alwaysOn"}));
}

// Shard two collections in the same database, each with 2 chunks, [minKey, 0), [0, maxKey),
// with one document each, all on Shard0.

const otherCollName = "bar";
const otherNs = `${dbName}.${otherCollName}`;

assert.commandWorked(
    st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard0.shardName}));

// Both collections get the same setup, first collection first: insert one document on each side
// of the split point, shard on _id, split at 0 and verify both chunks are on Shard0.
for (const [coll, fullNs] of [[collName, ns], [otherCollName, otherNs]]) {
    for (const id of [-5, 5]) {
        assert.commandWorked(
            st.s.getDB(dbName)[coll].insert({_id: id}, {writeConcern: {w: "majority"}}));
    }

    assert.commandWorked(st.s.adminCommand({shardCollection: fullNs, key: {_id: 1}}));
    assert.commandWorked(st.s.adminCommand({split: fullNs, middle: {_id: 0}}));

    expectChunks(st, fullNs, [2, 0, 0]);
}

// Session opened through the main mongos; every transaction below runs on it.
const session = st.s.startSession();
const sessionDB = session.getDatabase(dbName);

//
// Stale shard version on first overall command should succeed.
//

// Move a chunk in the first collection from Shard0 to Shard1 through the main mongos, so Shard1
// is stale but not the router.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard1.shardName}));
expectChunks(st, ns, [1, 1, 0]);

session.startTransaction();
// Targets Shard1, which is stale.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

assert.commandWorked(session.commitTransaction_forTesting());

//
// Stale shard version on second command to a shard should fail.
//

expectChunks(st, ns, [1, 1, 0]);

// Move a chunk in the other collection from Shard0 to Shard1 through the main mongos, so Shard1
// is stale for the other collection but not the router.
assert.commandWorked(
    st.s.adminCommand({moveChunk: otherNs, find: {_id: 5}, to: st.shard1.shardName}));
expectChunks(st, otherNs, [1, 1, 0]);

session.startTransaction();

// Targets Shard1 for the first ns, which is not stale.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

// Targets the other sharded collection on Shard1, which is stale. Because a previous statement
// has executed on Shard1, the retry will not restart the transaction, and will fail when it
// finds the transaction has aborted because of the stale shard version.
// `res` is declared with `let` because later sections of this file reassign it.
let res = assert.commandFailedWithCode(
    sessionDB.runCommand({find: otherCollName, filter: {_id: 5}}), ErrorCodes.NoSuchTransaction);
assert.eq(res.errorLabels, ["TransientTransactionError"]);

assertNoSuchTransactionOnAllShards(st, session.getSessionId(), session.getTxnNumber_forTesting());
assert.commandFailedWithCode(session.abortTransaction_forTesting(), ErrorCodes.NoSuchTransaction);

//
// Stale shard version on first command to a new shard should succeed.
//

expectChunks(st, ns, [1, 1, 0]);

// Move a chunk for the other collection from Shard1 to Shard0 through the main mongos, so
// Shard0 is stale for it and the router is not.
assert.commandWorked(
    st.s.adminCommand({moveChunk: otherNs, find: {_id: 5}, to: st.shard0.shardName}));
expectChunks(st, otherNs, [2, 0, 0]);

session.startTransaction();

// Targets Shard1 for the first ns, which is not stale.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

// Targets Shard0 for the other ns, which is stale.
assert.commandWorked(sessionDB.runCommand({find: otherCollName, filter: {_id: 5}}));

assert.commandWorked(session.commitTransaction_forTesting());

//
// Stale mongos aborts on old shard.
//

// Move a chunk in the first collection from Shard1 to Shard0 through the other mongos, so
// Shard1 and the main mongos are stale for it.
const otherMongos = st.s1;
assert.commandWorked(
    otherMongos.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard0.shardName}));
expectChunks(st, ns, [2, 0, 0]);

session.startTransaction();

// Targets Shard1, which hits a stale version error, then re-targets Shard0, which is also
// stale but should succeed.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

assert.commandWorked(session.commitTransaction_forTesting());

// Verify there is no in-progress transaction on Shard1.
res = assert.commandFailedWithCode(st.rs1.getPrimary().getDB(dbName).runCommand({
    find: collName,
    lsid: session.getSessionId(),
    txnNumber: NumberLong(session.getTxnNumber_forTesting()),
    autocommit: false,
}),
                                   ErrorCodes.NoSuchTransaction);
assert.eq(res.errorLabels, ["TransientTransactionError"]);

//
// More than one stale shard version error.
//

// Move chunks for the first ns from Shard0 to Shard1 and Shard2 through the main mongos, so
// both are stale but not the router.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard2.shardName}));
expectChunks(st, ns, [1, 0, 1]);

assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: -5}, to: st.shard1.shardName}));
expectChunks(st, ns, [0, 1, 1]);

session.startTransaction();

// Targets all shards, two of which are stale.
assert.commandWorked(sessionDB.runCommand({find: collName}));

assert.commandWorked(session.commitTransaction_forTesting());

//
// Can retry a stale write on the first statement.
//

// Move a chunk to Shard1 to make it stale.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard1.shardName}));
expectChunks(st, ns, [0, 2, 0]);

session.startTransaction();

// Targets Shard1, which is stale.
assert.commandWorked(sessionDB.runCommand({insert: collName, documents: [{_id: 6}]}));

assert.commandWorked(session.commitTransaction_forTesting());

//
// Can retry a stale write past the first statement if the write has been sent to only new
// participant shard(s).
//
// TODO SERVER-37207: Change batch writes to retry only the failed writes in a batch, to allow
// retrying writes beyond the first overall statement.
//

// Move a chunk to Shard2 to make it stale.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard2.shardName}));
expectChunks(st, ns, [0, 1, 1]);

session.startTransaction();

// Targets Shard1, which is not stale.
assert.commandWorked(sessionDB.runCommand({insert: collName, documents: [{_id: -4}]}));

// Targets Shard2, which is stale.
assert.commandWorked(sessionDB.runCommand({insert: collName, documents: [{_id: 7}]}));

assert.commandWorked(session.commitTransaction_forTesting());

//
// The final StaleConfig error should be returned if the router exhausts its retries.
//

// Move a chunk to Shard0 to make it stale.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: -5}, to: st.shard0.shardName}));
expectChunks(st, ns, [1, 0, 1]);

session.startTransaction();

// Target Shard2, to verify the transaction on it is aborted implicitly later.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));

// Make metadata refreshes on the stale shard indefinitely return StaleConfig.
const fp = configureFailPoint(st.rs0.getPrimary(), "alwaysThrowStaleConfigInfo");

// Targets all shards. Shard0 is stale and won't refresh its metadata, so mongos should exhaust
// its retries and implicitly abort the transaction.
res = assert.commandFailedWithCode(sessionDB.runCommand({find: collName}), ErrorCodes.StaleConfig);
assert.eq(res.errorLabels, ["TransientTransactionError"]);

// Verify the shards that did not return an error were also aborted.
assertNoSuchTransactionOnAllShards(st, session.getSessionId(), session.getTxnNumber_forTesting());
assert.commandFailedWithCode(session.abortTransaction_forTesting(), ErrorCodes.NoSuchTransaction);

// Teardown: turn the StaleConfig failpoint off, undo the retry setting enabled at the start of
// the test and shut the cluster down.
fp.off();

disableStaleVersionAndSnapshotRetriesWithinTransactions(st);

st.stop();