0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-11-27 15:06:34 +01:00
mongodb/jstests/sharding/transactions_stale_shard_version_errors.js
2024-10-09 17:22:09 +00:00

271 lines
10 KiB
JavaScript

// Tests mongos behavior on stale shard version errors received in a transaction.
//
// @tags: [
// requires_sharding,
// uses_transactions,
// uses_multi_shard_transaction,
// assumes_balancer_off
// ]
import "jstests/multiVersion/libs/verify_versions.js";
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
import {findChunksUtil} from "jstests/sharding/libs/find_chunks_util.js";
import {
assertNoSuchTransactionOnAllShards,
disableStaleVersionAndSnapshotRetriesWithinTransactions,
enableStaleVersionAndSnapshotRetriesWithinTransactions,
kShardOptionsForDisabledStaleShardVersionRetries
} from "jstests/sharding/libs/sharded_transactions_helpers.js";
// Asserts that, for namespace 'ns', each shard of the ShardingTest 'st' owns the
// number of chunks listed at its index in 'chunks' (chunks[i] -> st.shardI),
// according to the config server's chunk metadata.
function expectChunks(st, ns, chunks) {
    chunks.forEach((expectedCount, shardIdx) => {
        const shardName = st["shard" + shardIdx].shardName;
        const actualCount =
            findChunksUtil.countChunksForNs(st.s.getDB("config"), ns, {shard: shardName});
        assert.eq(expectedCount, actualCount, "unexpected number of chunks on shard " + shardIdx);
    });
}
// Namespace under test: "test.foo".
const dbName = "test";
const collName = "foo";
const ns = dbName + '.' + collName;
// Disable checking for index consistency to ensure that the config server doesn't trigger a
// StaleShardVersion exception on shards and cause them to refresh their sharding metadata.
const configOptions = {
setParameter: {enableShardedIndexConsistencyCheck: false}
};
// Three shards so later cases can involve multiple stale participants, and two
// mongos routers so one can be left stale while the other performs migrations.
// The balancer is disabled so chunk placement changes only through the explicit
// moveChunk commands issued by this test.
const st = new ShardingTest({
shards: 3,
mongos: 2,
other: {
shardOptions: kShardOptionsForDisabledStaleShardVersionRetries,
configOptions: configOptions,
enableBalancer: false
}
});
enableStaleVersionAndSnapshotRetriesWithinTransactions(st);
// Disable the best-effort recipient metadata refresh after migrations to simplify simulating
// stale shard version errors. Applied on all three shards so any recipient of the
// moveChunk commands below remains stale afterwards.
assert.commandWorked(st.rs0.getPrimary().adminCommand(
{configureFailPoint: "migrationRecipientFailPostCommitRefresh", mode: "alwaysOn"}));
assert.commandWorked(st.rs1.getPrimary().adminCommand(
{configureFailPoint: "migrationRecipientFailPostCommitRefresh", mode: "alwaysOn"}));
assert.commandWorked(st.rs2.getPrimary().adminCommand(
{configureFailPoint: "migrationRecipientFailPostCommitRefresh", mode: "alwaysOn"}));
// Shard two collections in the same database, each with 2 chunks, [minKey, 0), [0, maxKey),
// with one document each, all on Shard0.
assert.commandWorked(
st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard0.shardName}));
assert.commandWorked(
st.s.getDB(dbName)[collName].insert({_id: -5}, {writeConcern: {w: "majority"}}));
assert.commandWorked(
st.s.getDB(dbName)[collName].insert({_id: 5}, {writeConcern: {w: "majority"}}));
assert.commandWorked(st.s.adminCommand({shardCollection: ns, key: {_id: 1}}));
assert.commandWorked(st.s.adminCommand({split: ns, middle: {_id: 0}}));
expectChunks(st, ns, [2, 0, 0]);
// Second sharded collection ("test.bar") with the same initial layout, used by cases
// that need a shard to be stale for one collection but fresh for the other.
const otherCollName = "bar";
const otherNs = dbName + "." + otherCollName;
assert.commandWorked(
st.s.getDB(dbName)[otherCollName].insert({_id: -5}, {writeConcern: {w: "majority"}}));
assert.commandWorked(
st.s.getDB(dbName)[otherCollName].insert({_id: 5}, {writeConcern: {w: "majority"}}));
assert.commandWorked(st.s.adminCommand({shardCollection: otherNs, key: {_id: 1}}));
assert.commandWorked(st.s.adminCommand({split: otherNs, middle: {_id: 0}}));
expectChunks(st, otherNs, [2, 0, 0]);
// All transactions below run on this session through the main mongos (st.s).
const session = st.s.startSession();
const sessionDB = session.getDatabase(dbName);
//
// Stale shard version on first overall command should succeed.
//
// Move a chunk in the first collection from Shard0 to Shard1 through the main mongos, so Shard1
// is stale but not the router.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard1.shardName}));
expectChunks(st, ns, [1, 1, 0]);
session.startTransaction();
// Targets Shard1, which is stale. As the first statement in the transaction, the
// stale version error is retryable, so the command succeeds.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));
assert.commandWorked(session.commitTransaction_forTesting());
//
// Stale shard version on second command to a shard should fail.
//
expectChunks(st, ns, [1, 1, 0]);
// Move a chunk in the other collection from Shard0 to Shard1 through the main mongos, so Shard1
// is stale for the other collection but not the router.
assert.commandWorked(
st.s.adminCommand({moveChunk: otherNs, find: {_id: 5}, to: st.shard1.shardName}));
expectChunks(st, otherNs, [1, 1, 0]);
session.startTransaction();
// Targets Shard1 for the first ns, which is not stale.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));
// Targets the other sharded collection on Shard1, which is stale. Because a previous statement
// has executed on Shard1, the retry will not restart the transaction, and will fail when it
// finds the transaction has aborted because of the stale shard version.
let res = assert.commandFailedWithCode(
sessionDB.runCommand({find: otherCollName, filter: {_id: 5}}), ErrorCodes.NoSuchTransaction);
// The error must carry the transient label so callers know the whole transaction is retryable.
assert.eq(res.errorLabels, ["TransientTransactionError"]);
assertNoSuchTransactionOnAllShards(st, session.getSessionId(), session.getTxnNumber_forTesting());
assert.commandFailedWithCode(session.abortTransaction_forTesting(), ErrorCodes.NoSuchTransaction);
//
// Stale shard version on first command to a new shard should succeed.
//
expectChunks(st, ns, [1, 1, 0]);
// Move a chunk for the other collection from Shard1 to Shard0 through the main mongos, so
// Shard0 is stale for it and the router is not.
assert.commandWorked(
st.s.adminCommand({moveChunk: otherNs, find: {_id: 5}, to: st.shard0.shardName}));
expectChunks(st, otherNs, [2, 0, 0]);
session.startTransaction();
// Targets Shard1 for the first ns, which is not stale.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));
// Targets Shard0 for the other ns, which is stale. Unlike the previous case, Shard0
// has not yet executed a statement in this transaction, so the retry can succeed.
assert.commandWorked(sessionDB.runCommand({find: otherCollName, filter: {_id: 5}}));
assert.commandWorked(session.commitTransaction_forTesting());
//
// Stale mongos aborts on old shard.
//
// Move a chunk in the first collection from Shard1 to Shard0 through the other mongos, so
// Shard1 and the main mongos are stale for it.
const otherMongos = st.s1;
assert.commandWorked(
otherMongos.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard0.shardName}));
expectChunks(st, ns, [2, 0, 0]);
session.startTransaction();
// Targets Shard1, which hits a stale version error, then re-targets Shard0, which is also
// stale but should succeed.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));
assert.commandWorked(session.commitTransaction_forTesting());
// Verify there is no in-progress transaction on Shard1: continuing the (now finished)
// transaction directly against its primary must fail with NoSuchTransaction.
res = assert.commandFailedWithCode(st.rs1.getPrimary().getDB(dbName).runCommand({
find: collName,
lsid: session.getSessionId(),
txnNumber: NumberLong(session.getTxnNumber_forTesting()),
autocommit: false,
}),
ErrorCodes.NoSuchTransaction);
assert.eq(res.errorLabels, ["TransientTransactionError"]);
//
// More than one stale shard version error.
//
// Move chunks for the first ns from Shard0 to Shard1 and Shard2 through the main mongos, so
// both are stale but not the router.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard2.shardName}));
expectChunks(st, ns, [1, 0, 1]);
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: -5}, to: st.shard1.shardName}));
expectChunks(st, ns, [0, 1, 1]);
session.startTransaction();
// Targets all shards (unfiltered find), two of which are stale.
assert.commandWorked(sessionDB.runCommand({find: collName}));
assert.commandWorked(session.commitTransaction_forTesting());
//
// Can retry a stale write on the first statement.
//
// Move a chunk to Shard1 to make it stale.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard1.shardName}));
expectChunks(st, ns, [0, 2, 0]);
session.startTransaction();
// Targets Shard1, which is stale. Writes, like reads, are retryable on the first statement.
assert.commandWorked(sessionDB.runCommand({insert: collName, documents: [{_id: 6}]}));
assert.commandWorked(session.commitTransaction_forTesting());
//
// Can retry a stale write past the first statement if the write has been sent to only new
// participant shard(s).
//
// TODO SERVER-37207: Change batch writes to retry only the failed writes in a batch, to allow
// retrying writes beyond the first overall statement.
//
// Move a chunk to Shard2 to make it stale.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: 5}, to: st.shard2.shardName}));
expectChunks(st, ns, [0, 1, 1]);
session.startTransaction();
// Targets Shard1, which is not stale.
assert.commandWorked(sessionDB.runCommand({insert: collName, documents: [{_id: -4}]}));
// Targets Shard2, which is stale but has not yet participated in this transaction.
assert.commandWorked(sessionDB.runCommand({insert: collName, documents: [{_id: 7}]}));
assert.commandWorked(session.commitTransaction_forTesting());
//
// The final StaleConfig error should be returned if the router exhausts its retries.
//
// Move a chunk to Shard0 to make it stale.
assert.commandWorked(st.s.adminCommand({moveChunk: ns, find: {_id: -5}, to: st.shard0.shardName}));
expectChunks(st, ns, [1, 0, 1]);
session.startTransaction();
// Target Shard2, to verify the transaction on it is aborted implicitly later.
assert.commandWorked(sessionDB.runCommand({find: collName, filter: {_id: 5}}));
// Make metadata refreshes on the stale shard indefinitely return StaleConfig.
const fp = configureFailPoint(st.rs0.getPrimary(), "alwaysThrowStaleConfigInfo");
// Targets all shards. Shard0 is stale and won't refresh its metadata, so mongos should exhaust
// its retries and implicitly abort the transaction, surfacing the last StaleConfig error.
res = assert.commandFailedWithCode(sessionDB.runCommand({find: collName}), ErrorCodes.StaleConfig);
assert.eq(res.errorLabels, ["TransientTransactionError"]);
// Verify the shards that did not return an error were also aborted.
assertNoSuchTransactionOnAllShards(st, session.getSessionId(), session.getTxnNumber_forTesting());
assert.commandFailedWithCode(session.abortTransaction_forTesting(), ErrorCodes.NoSuchTransaction);
// Clean up: release the failpoint and restore default retry behavior before shutdown.
fp.off();
disableStaleVersionAndSnapshotRetriesWithinTransactions(st);
st.stop();