SERVER-47672: Add minSnapshotHistoryWindowInSeconds and remove snapshot window adjustment logic

Lingzhi Deng 2020-05-04 20:15:44 -04:00
21 changed files with 96 additions and 870 deletions

* Tests that minSnapshotHistoryWindowInSeconds can be set at runtime and that the server keeps
* snapshot history for up to minSnapshotHistoryWindowInSeconds.
* @tags: [requires_majority_read_concern, requires_replication]
(function() {
"use strict";
const replSet = new ReplSetTest({nodes: 1});
const collName = "coll";
const primary = replSet.getPrimary();
const primaryDB = primary.getDB('test');
const historyWindowSecs = 10;
{setParameter: 1, minSnapshotHistoryWindowInSeconds: historyWindowSecs}));
const startTime = Date.now();
const insertTimestamp =
assert.commandWorked(primaryDB.runCommand({insert: collName, documents: [{_id: 0}]}))
let nextId = 1;
// Test snapshot window with 1s margin.
const testMarginMS = 1000;
// Test that reading from a snapshot at insertTimestamp is valid for up to historyWindowSecs minus
// the testMarginMS (as a buffer) to avoid races between the client's snapshot read and the update
// of the oldest timestamp in the server.
const testWindowMS = historyWindowSecs * 1000 - testMarginMS;
while (Date.now() - startTime < testWindowMS) {
// Test that reading from a snapshot at insertTimestamp is still valid.
{find: collName, readConcern: {level: "snapshot", atClusterTime: insertTimestamp}}));
// Perform writes to advance stable timestamp and oldest timestamp. We use majority writeConcern
// so that we can make sure the stable timestamp and the oldest timestamp are updated after each
// insert.
{insert: collName, documents: [{_id: nextId}], writeConcern: {w: "majority"}}));
// Sleep enough to make sure the insertTimestamp falls off the snapshot window.
sleep(testMarginMS * 2);
// Perform another majority write to advance the stable timestamp and the oldest timestamp again.
{insert: collName, documents: [{_id: nextId}], writeConcern: {w: "majority"}}));
// Test that reading from a snapshot at insertTimestamp returns SnapshotTooOld.
{find: collName, readConcern: {level: "snapshot", atClusterTime: insertTimestamp}}),

@ -11,9 +11,9 @@
// TODO(SERVER-47672): Use minSnapshotHistoryWindowInSeconds instead.
const options = {
setParameter: {maxTargetSnapshotHistoryWindowInSeconds: 600}
// Set a large snapshot window of 10 minutes for the test.
setParameter: {minSnapshotHistoryWindowInSeconds: 600}
const replSet = new ReplSetTest({nodes: 3, nodeOptions: options});
@ -30,7 +30,7 @@ snapshotReadsTest({
// Ensure "atClusterTime" is omitted from a regular (non-snapshot) reads.
// Ensure "atClusterTime" is omitted from a regular (non-snapshot) read.
const cursor = assert.commandWorked(primaryDB.runCommand({find: "test"})).cursor;

@ -14,9 +14,9 @@
// TODO(SERVER-47672): Use minSnapshotHistoryWindowInSeconds instead.
const configOptions = {
setParameter: {maxTargetSnapshotHistoryWindowInSeconds: 600}
// Set a large snapshot window of 10 minutes for the test.
setParameter: {minSnapshotHistoryWindowInSeconds: 600}
const dbName = "test";

@ -4,7 +4,7 @@
* With majority reads disabled, we are not guaranteed to be able to service reads at the majority
* commit point. We can only provide reads within a window behind the primary's 'lastApplied'. The
* size of that window is controlled by 'maxTargetSnapshotHistoryWindowInSeconds', which is 5
* size of that window is controlled by 'minSnapshotHistoryWindowInSeconds', which is 5
* seconds by default. If the commit point lag is greater than that amount, reading at that time
* fails with a SnapshotTooOld error. Therefore, in order for the transaction to succeed, mongos
* needs to pick a read timestamp that is not derived from the commit point, but rather from the
@ -49,10 +49,10 @@ assert.commandWorked(mongosColl.insert({_id: 1, x: 10}, {writeConcern: {w: "majo
// We want the secondary to lag for an amount generously greater than the history window.
const secondary = rst.getSecondary();
const maxWindowResult = assert.commandWorked(secondary.getDB("admin").runCommand(
{"getParameter": 1, "maxTargetSnapshotHistoryWindowInSeconds": 1}));
{"getParameter": 1, "minSnapshotHistoryWindowInSeconds": 1}));
const maxWindowInMS = maxWindowResult.maxTargetSnapshotHistoryWindowInSeconds * 1000;
const maxWindowInMS = maxWindowResult.minSnapshotHistoryWindowInSeconds * 1000;
const lagTimeMS = maxWindowInMS * 2;
const startTime = Date.now();
let nextId = 1000;

@ -401,7 +401,6 @@ mongod = env.Program(

View File

@ -815,7 +815,7 @@ env.Library(
@ -1463,19 +1463,6 @@ env.Library(
@ -1487,18 +1474,6 @@ env.Library(
@ -1852,7 +1827,6 @@ envWithAsio.CppUnitTest(
@ -1928,7 +1902,6 @@ envWithAsio.CppUnitTest(

@ -104,7 +104,6 @@
#include "mongo/db/op_observer_registry.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/periodic_runner_job_abort_expired_transactions.h"
#include "mongo/db/periodic_runner_job_decrease_snapshot_cache_pressure.h"
#include "mongo/db/pipeline/process_interface/replica_set_node_process_interface.h"
#include "mongo/db/query/internal_plans.h"
#include "mongo/db/read_write_concern_defaults_cache_lookup_mongod.h"
@ -711,11 +710,6 @@ ExitCode _initAndListen(ServiceContext* serviceContext, int listenPort) {
if (storageEngine->supportsReadConcernSnapshot()) {
try {
// The inMemory engine is not yet used for replica or sharded transactions in production
// so it does not currently maintain snapshot history. It is live in testing, however.
if (!storageEngine->isEphemeral() || getTestCommandsEnabled()) {
} catch (ExceptionFor<ErrorCodes::PeriodicJobIsStopped>&) {
LOGV2_WARNING(4747501, "Not starting periodic jobs as shutdown is in progress");
// Shutdown has already started before initialization is complete. Wait for the
@ -1143,7 +1137,6 @@ void shutdownTask(const ShutdownTaskArgs& shutdownArgs) {
if (auto storageEngine = serviceContext->getStorageEngine()) {
if (storageEngine->supportsReadConcernSnapshot()) {
ServiceContext::UniqueOperationContext uniqueOpCtx;

@ -76,7 +76,7 @@
#include "mongo/db/s/transaction_coordinator_factory.h"
#include "mongo/db/service_entry_point_common.h"
#include "mongo/db/session_catalog_mongod.h"
#include "mongo/db/snapshot_window_util.h"
#include "mongo/db/snapshot_window_options.h"
#include "mongo/db/stats/counters.h"
#include "mongo/db/stats/server_read_concern_metrics.h"
#include "mongo/db/stats/top.h"
@ -1235,8 +1235,7 @@ void execCommandDatabase(OperationContext* opCtx,
// snapshot at their specified atClusterTime. Therefore, we'll try to increase the
// snapshot history window that the storage engine maintains in order to increase
// the likelihood of successful future PIT atClusterTime requests.
} else {
behaviors.handleException(e, opCtx);

#include "mongo/db/snapshot_window_options.h"
#include "mongo/platform/compiler.h"
#include "mongo/util/options_parser/startup_option_init.h"
namespace mongo {
SnapshotWindowParams snapshotWindowParams;
* After startup parameters have been initialized, set targetSnapshotHistoryWindowInSeconds to the
* value of maxTargetSnapshotHistoryWindowInSeconds, in case the max has been altered. The cache
* pressure is zero to begin with, so the user should not have to wait for the target to slowly
* adjust to max.
(InitializerContext* context) {
return Status::OK();
} // namespace mongo

@ -37,79 +37,25 @@ namespace mongo {
* These are parameters that affect how much snapshot history the storage engine will keep to
* support point-in-time transactions (read or write). This is referred to as the snapshot window.
* The window is between the stable timestamp and the oldest timestamp.
* support snapshot reads. This is referred to as the snapshot window. The window is between the
* stable timestamp and the oldest timestamp.
struct SnapshotWindowParams {
// maxTargetSnapshotHistoryWindowInSeconds (startup & runtime server parameter, range 0+).
// minSnapshotHistoryWindowInSeconds (startup & runtime server parameter, range 0+).
// Dictates the maximum lag in seconds oldest_timestamp should be behind stable_timestamp.
// targetSnapshotHistoryWindowInSeconds below is the actual active lag setting target.
// Dictates the lag in seconds oldest_timestamp should be set behind stable_timestamp.
// Note that the window size can become greater than this if an ongoing operation is holding an
// older snapshot open.
AtomicWord<int> maxTargetSnapshotHistoryWindowInSeconds{5};
// targetSnapshotHistoryWindowInSeconds (not a server parameter, range 0+).
// Dictates the target lag in seconds oldest_timestamp should be set behind stable_timestamp.
// Should only be set in the range [0, maxTargetSnapshotHistoryWindowInSeconds].
// Note that this is the history window we attempt to maintain, but our current system state may
// not always reflect it: the window can only change as more writes come in, so it can take time
// for the actual window size to catch up with a change. This value guides actions whenever the
// system goes to update the oldest_timestamp value (usually when the stable_timestamp is
// updated).
AtomicWord<int> targetSnapshotHistoryWindowInSeconds{
AtomicWord<int> minSnapshotHistoryWindowInSeconds{5};
// cachePressureThreshold (startup & runtime server parameter, range [0, 100]).
// Compares against a storage engine cache pressure indicator that ranges from 0 to 100.
// Currently, the only indicator is the WT lookaside score.
// This setting helps preempt storage cache pressure immobilizing the system. Attempts to
// increase targetSnapshotHistoryWindowInSeconds will be ignored when the cache pressure reaches
// this threshold. Additionally, a periodic task will decrease
// targetSnapshotHistoryWindowInSeconds when cache pressure exceeds the threshold.
AtomicWord<int> cachePressureThreshold{95};
// snapshotWindowMultiplicativeDecrease (startup & runtime server parameter, range (0,1)).
// Controls by what multiplier the target snapshot history window setting is decreased when
// cache pressure becomes too high, per the cachePressureThreshold setting.
AtomicDouble snapshotWindowMultiplicativeDecrease{0.75};
// snapshotWindowAdditiveIncreaseSeconds (startup & runtime server parameter, range 1+).
// Controls by how much the target snapshot history window setting is increased when cache
// pressure is OK, per cachePressureThreshold, and we need to service older snapshots for global
// point-in-time reads.
AtomicWord<int> snapshotWindowAdditiveIncreaseSeconds{2};
// minMillisBetweenSnapshotWindowInc (startup & runtime server parameter, range 0+).
// minMillisBetweenSnapshotWindowDec (startup & runtime server parameter, range 0+).
// Controls how often attempting to increase/decrease the target snapshot window will have an
// effect. Multiple callers within minMillisBetweenSnapshotWindowInc will have the same effect
// as one. This protects the system because it takes time for the target snapshot window to
// affect the actual storage engine snapshot window. The stable timestamp must move forward for
// the window between it and oldest timestamp to grow or shrink.
AtomicWord<int> minMillisBetweenSnapshotWindowInc{500};
AtomicWord<int> minMillisBetweenSnapshotWindowDec{500};
// checkCachePressurePeriodSeconds (startup & runtime server parameter, range 1+).
// Controls the period of the task that checks for cache pressure and decreases
// targetSnapshotHistoryWindowInSeconds if the pressure is above cachePressureThreshold. The
// target window size setting must not be decreased too fast because time must be allowed for
// the storage engine to attempt to act on the new setting.
AtomicWord<int> checkCachePressurePeriodSeconds{5};
static inline MutableObserverRegistry<decltype(checkCachePressurePeriodSeconds)::WordType>
AtomicWord<std::int64_t> snapshotTooOldErrorCount{0};

View File

@ -35,10 +35,10 @@ imports:
- "mongo/idl/basic_types.idl"
description: "Maximum target snapshot history window, in seconds"
description: "Minimum snapshot history to keep, in seconds"
set_at: [ startup, runtime ]
cpp_varname: "snapshotWindowParams.maxTargetSnapshotHistoryWindowInSeconds"
cpp_varname: "snapshotWindowParams.minSnapshotHistoryWindowInSeconds"
validator: { gte: 0 }
@ -48,36 +48,3 @@ server_parameters:
gte: 0
lte: 100
description: "Snapshot window multiplicative decrease"
set_at: [ startup, runtime ]
cpp_varname: "snapshotWindowParams.snapshotWindowMultiplicativeDecrease"
gt: 0.0
lt: 1.0
description: "Snapshot window multiplicative increase, in seconds"
set_at: [ startup, runtime ]
cpp_varname: "snapshotWindowParams.snapshotWindowAdditiveIncreaseSeconds"
validator: { gte: 1 }
description: "Minimum duration between snapshot window increment, in milliseconds"
set_at: [ startup, runtime ]
cpp_varname: "snapshotWindowParams.minMillisBetweenSnapshotWindowInc"
validator: { gte: 1 }
description: "Minimum duration between snapshot window decrement, in milliseconds"
set_at: [ startup, runtime ]
cpp_varname: "snapshotWindowParams.minMillisBetweenSnapshotWindowDec"
validator: { gte: 1 }
description: "Check cache pressure period, in seconds"
set_at: [ startup, runtime ]
cpp_varname: "snapshotWindowParams.checkCachePressurePeriodSeconds"
validator: { gte: 1 }
on_update: std::ref(SnapshotWindowParams::observeCheckCachePressurePeriodSeconds)

@ -479,7 +479,7 @@ public:
* must maintain snapshot history through.
* oldest_timestamp will be set to stable_timestamp adjusted by
* 'targetSnapshotHistoryWindowInSeconds' to create a window of available snapshots on the
* 'minSnapshotHistoryWindowInSeconds' to create a window of available snapshots on the
* storage engine from oldest to stable. Furthermore, oldest_timestamp will never be set ahead
* of the oplog read timestamp, ensuring the oplog reader's 'read_timestamp' can always be
* serviced.

@ -764,7 +764,7 @@ WiredTigerKVEngine::WiredTigerKVEngine(const std::string& canonicalName,
// We do not maintain any snapshot history for the ephemeral storage engine in production
// because replication and sharded transactions do not currently run on the inMemory engine.
// It is live in testing, however.
_sizeStorerUri = _uri("sizeStorer");
@ -1861,7 +1861,7 @@ void WiredTigerKVEngine::setOldestTimestampFromStable() {
// Calculate what the oldest_timestamp should be from the stable_timestamp. The oldest
// timestamp should lag behind stable by 'targetSnapshotHistoryWindowInSeconds' to create a
// timestamp should lag behind stable by 'minSnapshotHistoryWindowInSeconds' to create a
// window of available snapshots. If the lag window is not yet large enough, we will not
// update/forward the oldest_timestamp yet and instead return early.
Timestamp newOldestTimestamp = _calculateHistoryLagFromStableTimestamp(stableTimestamp);
@ -1904,23 +1904,23 @@ void WiredTigerKVEngine::setOldestTimestamp(Timestamp newOldestTimestamp, bool f
Timestamp WiredTigerKVEngine::_calculateHistoryLagFromStableTimestamp(Timestamp stableTimestamp) {
// The oldest_timestamp should lag behind the stable_timestamp by
// 'targetSnapshotHistoryWindowInSeconds' seconds.
// 'minSnapshotHistoryWindowInSeconds' seconds.
if (_ephemeral && !getTestCommandsEnabled()) {
// No history should be maintained for the inMemory engine because it is not used yet.
invariant(snapshotWindowParams.targetSnapshotHistoryWindowInSeconds.load() == 0);
invariant(snapshotWindowParams.minSnapshotHistoryWindowInSeconds.load() == 0);
if (stableTimestamp.getSecs() <
static_cast<unsigned>(snapshotWindowParams.targetSnapshotHistoryWindowInSeconds.load())) {
static_cast<unsigned>(snapshotWindowParams.minSnapshotHistoryWindowInSeconds.load())) {
// The history window is larger than the timestamp history thus far. We must wait for
// the history to reach the window size before moving oldest_timestamp forward.
// the history to reach the window size before moving oldest_timestamp forward. This should
// only happen in unit tests.
return Timestamp();
Timestamp calculatedOldestTimestamp(
stableTimestamp.getSecs() -
stableTimestamp.getSecs() - snapshotWindowParams.minSnapshotHistoryWindowInSeconds.load(),
if (calculatedOldestTimestamp.asULL() <= _oldestTimestamp.load()) {

@ -376,7 +376,7 @@ private:
std::string _uri(StringData ident) const;
* Uses the 'stableTimestamp', the 'targetSnapshotHistoryWindowInSeconds' setting and the
* Uses the 'stableTimestamp', the 'minSnapshotHistoryWindowInSeconds' setting and the
* current _oldestTimestamp to calculate what the new oldest_timestamp should be, in order to
* maintain a window of available snapshots on the storage engine from oldest to stable
* timestamp.

@ -748,11 +748,9 @@ void WiredTigerUtil::appendSnapshotWindowSettings(WiredTigerKVEngine* engine,
settings.append("current cache pressure percentage", score);
settings.append("total number of SnapshotTooOld errors", totalNumberOfSnapshotTooOldErrors);
settings.append("max target available snapshots window size in seconds",
settings.append("target available snapshots window size in seconds",
settings.append("current available snapshots window size in seconds",
settings.append("minimum target snapshot window size in seconds",
settings.append("current available snapshot window size in seconds",
settings.append("latest majority snapshot timestamp available",