0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 09:32:32 +01:00

Import wiredtiger: a5fd80d29c69f12c01f412fb6d8d7930cecc8758 from branch mongodb-5.0

ref: 563ccc601f..a5fd80d29c
for: 4.9.0

WT-7164       Merge "HS cursor restructure" feature branch into develop
This commit is contained in:
Luke Chen 2021-03-04 14:45:58 +11:00 committed by Evergreen Agent
parent 2f11ef616e
commit ff9995ed5c
33 changed files with 1300 additions and 1637 deletions

View File

@ -565,6 +565,7 @@ calloc
cas
catfmt
cb
cbt
ccc
ccr
cd

View File

@ -135,7 +135,9 @@ func_ok()
-e '/int zlib_terminate$/d' \
-e '/int zstd_error$/d' \
-e '/int zstd_pre_size$/d' \
-e '/int zstd_terminate$/d'
-e '/int zstd_terminate$/d' \
-e '/int __wt_curhs_search_near_after$/d' \
-e '/int __wt_curhs_search_near_before$/d'
}
for f in `find bench ext src test -name '*.c' -o -name '*_inline.h'`; do

View File

@ -303,9 +303,7 @@ connection_stats = [
CursorStat('cursor_modify_bytes', 'cursor modify key and value bytes affected', 'size'),
CursorStat('cursor_modify_bytes_touch', 'cursor modify value bytes modified', 'size'),
CursorStat('cursor_next', 'cursor next calls'),
CursorStat('cursor_next_hs_tombstone_rts', 'cursor next calls that skip due to a globally visible history store tombstone in rollback to stable'),
CursorStat('cursor_prev', 'cursor prev calls'),
CursorStat('cursor_prev_hs_tombstone_rts', 'cursor prev calls that skip due to a globally visible history store tombstone in rollback to stable'),
CursorStat('cursor_remove', 'cursor remove calls'),
CursorStat('cursor_remove_bytes', 'cursor remove key bytes removed', 'size'),
CursorStat('cursor_reopen', 'cursors reused from cache'),
@ -874,7 +872,7 @@ conn_dsrc_stats = [
TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'),
TxnStat('txn_rts_hs_restore_updates', 'rollback to stable restored updates from history store'),
TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'),
TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'),
TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable history store records with stop timestamps older than newer records'),
TxnStat('txn_rts_inconsistent_ckpt', 'rollback to stable inconsistent checkpoint'),
TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'),
TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'),

View File

@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.0",
"commit": "563ccc601f5689a16a3f41743398329b8a3aedf7"
"commit": "a5fd80d29c69f12c01f412fb6d8d7930cecc8758"
}

View File

@ -40,18 +40,18 @@ struct __wt_dbg {
static const /* Output separator */
char *const sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool);
static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool, WT_CURSOR *);
static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
static int __debug_modify(WT_DBG *, const uint8_t *);
static int __debug_page(WT_DBG *, WT_REF *, uint32_t);
static int __debug_page_col_fix(WT_DBG *, WT_REF *);
static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t);
static int __debug_page_col_var(WT_DBG *, WT_REF *);
static int __debug_page_col_var(WT_DBG *, WT_REF *, WT_CURSOR *);
static int __debug_page_metadata(WT_DBG *, WT_REF *);
static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *, WT_CURSOR *);
static int __debug_ref(WT_DBG *, WT_REF *);
static int __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
static int __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *, WT_CURSOR *);
static int __debug_tree(WT_SESSION_IMPL *, WT_REF *, const char *, uint32_t);
static int __debug_update(WT_DBG *, WT_UPDATE *, bool);
static int __debug_wrapup(WT_DBG *);
@ -285,9 +285,6 @@ __debug_wrapup(WT_DBG *ds)
session = ds->session;
msg = ds->msg;
if (session->hs_cursor != NULL)
WT_TRET(__wt_hs_cursor_close(session));
__wt_scr_free(session, &ds->key);
__wt_scr_free(session, &ds->hs_key);
__wt_scr_free(session, &ds->hs_value);
@ -421,7 +418,7 @@ __debug_hs_cursor(WT_DBG *ds, WT_CURSOR *hs_cursor)
uint32_t hs_btree_id;
char time_string[WT_TIME_STRING_SIZE];
cbt = (WT_CURSOR_BTREE *)hs_cursor;
cbt = __wt_curhs_get_cbt(hs_cursor);
session = ds->session;
WT_TIME_WINDOW_INIT(&tw);
@ -463,16 +460,12 @@ __debug_hs_cursor(WT_DBG *ds, WT_CURSOR *hs_cursor)
* Dump any HS records associated with the key.
*/
static int
__debug_hs_key(WT_DBG *ds)
__debug_hs_key(WT_DBG *ds, WT_CURSOR *hs_cursor)
{
WT_BTREE *btree;
WT_CURSOR *hs_cursor;
WT_DECL_RET;
WT_SESSION_IMPL *session;
wt_timestamp_t older_start_ts;
uint64_t hs_counter;
uint32_t hs_btree_id;
int cmp, exact;
session = ds->session;
btree = S2BT(session);
@ -482,26 +475,12 @@ __debug_hs_key(WT_DBG *ds)
* Open a history store cursor positioned at the end of the data store key (the newest record)
* and iterate backwards until we reach a different key or btree.
*/
hs_cursor = session->hs_cursor;
hs_cursor->set_key(hs_cursor, hs_btree_id, ds->key, WT_TS_MAX, WT_TXN_MAX);
ret = hs_cursor->search_near(hs_cursor, &exact);
/* If we jumped to the next key, go back to the previous key. */
if (ret == 0 && exact > 0)
ret = hs_cursor->prev(hs_cursor);
for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
WT_RET(hs_cursor->get_key(hs_cursor, &hs_btree_id, ds->t1, &older_start_ts, &hs_counter));
if (hs_btree_id != btree->id)
break;
WT_RET(__wt_compare(session, NULL, ds->key, ds->t1, &cmp));
if (cmp != 0)
break;
hs_cursor->set_key(hs_cursor, 4, hs_btree_id, ds->key, WT_TS_MAX, WT_TXN_MAX);
ret = __wt_curhs_search_near_before(session, hs_cursor);
for (; ret == 0; ret = hs_cursor->prev(hs_cursor))
WT_RET(__debug_hs_cursor(ds, hs_cursor));
}
return (ret == WT_NOTFOUND ? 0 : ret);
}
@ -970,19 +949,19 @@ __wt_debug_cursor_page(void *cursor_arg, const char *ofile)
* Dump the history store tree given a user cursor.
*/
int
__wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile)
__wt_debug_cursor_tree_hs(void *session_arg, const char *ofile)
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
{
WT_CURSOR_BTREE *cbt;
WT_BTREE *hs_btree;
WT_CURSOR *hs_cursor;
WT_DECL_RET;
WT_SESSION_IMPL *session;
session = CUR2S(cursor_arg);
WT_RET(__wt_hs_cursor_open(session));
cbt = (WT_CURSOR_BTREE *)session->hs_cursor;
WT_WITH_BTREE(session, CUR2BT(cbt), ret = __wt_debug_tree_all(session, NULL, NULL, ofile));
WT_TRET(__wt_hs_cursor_close(session));
session = (WT_SESSION_IMPL *)session_arg;
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
hs_btree = __wt_curhs_get_btree(hs_cursor);
WT_WITH_BTREE(session, hs_btree, ret = __wt_debug_tree_all(session, NULL, NULL, ofile));
WT_TRET(hs_cursor->close(hs_cursor));
return (ret);
}
@ -1017,9 +996,11 @@ __debug_tree(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile, uint32_t
static int
__debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags)
{
WT_CURSOR *hs_cursor;
WT_DECL_RET;
WT_SESSION_IMPL *session;
hs_cursor = NULL;
session = ds->session;
WT_RET(__wt_scr_alloc(session, 100, &ds->key));
@ -1028,43 +1009,47 @@ __debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags)
* doesn't work, we may be running in-memory.
*/
if (!WT_IS_HS(session->dhandle)) {
if (session->hs_cursor != NULL || __wt_hs_cursor_open(session) == 0) {
WT_RET(__wt_scr_alloc(session, 0, &ds->hs_key));
WT_RET(__wt_scr_alloc(session, 0, &ds->hs_value));
}
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
WT_ERR(__wt_scr_alloc(session, 0, &ds->hs_key));
WT_ERR(__wt_scr_alloc(session, 0, &ds->hs_value));
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
}
/* Dump the page metadata. */
WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, ref));
WT_RET(ret);
WT_ERR(ret);
/* Dump the page. */
switch (ref->page->type) {
case WT_PAGE_COL_FIX:
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
WT_RET(__debug_page_col_fix(ds, ref));
WT_ERR(__debug_page_col_fix(ds, ref));
break;
case WT_PAGE_COL_INT:
WT_WITH_PAGE_INDEX(session, ret = __debug_page_col_int(ds, ref->page, flags));
WT_RET(ret);
WT_ERR(ret);
break;
case WT_PAGE_COL_VAR:
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
WT_RET(__debug_page_col_var(ds, ref));
WT_ERR(__debug_page_col_var(ds, ref, hs_cursor));
break;
case WT_PAGE_ROW_INT:
WT_WITH_PAGE_INDEX(session, ret = __debug_page_row_int(ds, ref->page, flags));
WT_RET(ret);
WT_ERR(ret);
break;
case WT_PAGE_ROW_LEAF:
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
WT_RET(__debug_page_row_leaf(ds, ref->page));
WT_ERR(__debug_page_row_leaf(ds, ref->page, hs_cursor));
break;
default:
return (__wt_illegal_value(session, ref->page->type));
WT_ERR(__wt_illegal_value(session, ref->page->type));
}
return (0);
err:
if (hs_cursor != NULL)
WT_TRET(hs_cursor->close(hs_cursor));
return (ret);
}
/*
@ -1209,11 +1194,11 @@ __debug_page_col_fix(WT_DBG *ds, WT_REF *ref)
if (WT_COL_UPDATE_SINGLE(page) != NULL) {
WT_RET(ds->f(ds, "%s", sep));
WT_RET(__debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", true));
WT_RET(__debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", true, NULL));
}
if (WT_COL_APPEND(page) != NULL) {
WT_RET(ds->f(ds, "%s", sep));
WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", true));
WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", true, NULL));
}
return (0);
}
@ -1254,7 +1239,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
* Dump an in-memory WT_PAGE_COL_VAR page.
*/
static int
__debug_page_col_var(WT_DBG *ds, WT_REF *ref)
__debug_page_col_var(WT_DBG *ds, WT_REF *ref, WT_CURSOR *hs_cursor)
{
WT_CELL *cell;
WT_CELL_UNPACK_KV *unpack, _unpack;
@ -1283,17 +1268,17 @@ __debug_page_col_var(WT_DBG *ds, WT_REF *ref)
p = ds->key->mem;
WT_RET(__wt_vpack_uint(&p, 0, recno));
ds->key->size = WT_PTRDIFF(p, ds->key->mem);
WT_RET(__debug_hs_key(ds));
WT_RET(__debug_hs_key(ds, hs_cursor));
}
if ((update = WT_COL_UPDATE(page, cip)) != NULL)
WT_RET(__debug_col_skip(ds, update, "update", false));
WT_RET(__debug_col_skip(ds, update, "update", false, hs_cursor));
recno += rle;
}
if (WT_COL_APPEND(page) != NULL) {
WT_RET(ds->f(ds, "%s", sep));
WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", false));
WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", false, hs_cursor));
}
return (0);
@ -1337,7 +1322,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
* Dump an in-memory WT_PAGE_ROW_LEAF page.
*/
static int
__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page, WT_CURSOR *hs_cursor)
{
WT_CELL_UNPACK_KV *unpack, _unpack;
WT_INSERT_HEAD *insert;
@ -1353,7 +1338,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
* Dump any K/V pairs inserted into the page before the first from-disk key on the page.
*/
if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
WT_RET(__debug_row_skip(ds, insert));
WT_RET(__debug_row_skip(ds, insert, hs_cursor));
/* Dump the page's K/V pairs. */
WT_ROW_FOREACH (page, rip, i) {
@ -1366,11 +1351,11 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
WT_RET(__debug_update(ds, upd, false));
if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL)
WT_RET(__debug_hs_key(ds));
if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL)
WT_RET(__debug_hs_key(ds, hs_cursor));
if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
WT_RET(__debug_row_skip(ds, insert));
WT_RET(__debug_row_skip(ds, insert, hs_cursor));
}
return (0);
}
@ -1380,7 +1365,8 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
* Dump a column-store skiplist.
*/
static int
__debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte)
__debug_col_skip(
WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte, WT_CURSOR *hs_cursor)
{
WT_INSERT *ins;
WT_SESSION_IMPL *session;
@ -1392,11 +1378,11 @@ __debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte
WT_RET(ds->f(ds, "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins)));
WT_RET(__debug_update(ds, ins->upd, hexbyte));
if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL) {
if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL) {
p = ds->key->mem;
WT_RET(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(ins)));
ds->key->size = WT_PTRDIFF(p, ds->key->mem);
WT_RET(__debug_hs_key(ds));
WT_RET(__debug_hs_key(ds, hs_cursor));
}
}
return (0);
@ -1407,7 +1393,7 @@ __debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte
* Dump an insert list.
*/
static int
__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head, WT_CURSOR *hs_cursor)
{
WT_INSERT *ins;
WT_SESSION_IMPL *session;
@ -1418,9 +1404,9 @@ __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
WT_RET(__debug_item_key(ds, "insert", WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)));
WT_RET(__debug_update(ds, ins->upd, false));
if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL) {
if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL) {
WT_RET(__wt_buf_set(session, ds->key, WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)));
WT_RET(__debug_hs_key(ds));
WT_RET(__debug_hs_key(ds, hs_cursor));
}
}
return (0);

View File

@ -71,7 +71,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
return (0);
}
WT_RET(__wt_hs_cursor_cache(session));
WT_RET(__wt_curhs_cache(session));
(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
ret = __wt_evict(session, ref, previous_state, 0);
(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);

View File

@ -278,9 +278,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
*/
if (ret == 0 && (ckpt + 1)->name == NULL && !skip_hs) {
/* Open a history store cursor. */
WT_ERR(__wt_hs_cursor_open(session));
WT_TRET(__wt_hs_verify_one(session));
WT_TRET(__wt_hs_cursor_close(session));
/*
* We cannot error out here. If we got an error verifying the history store, we need
* to follow through with reacquiring the exclusive call below. We'll error out
@ -778,11 +776,12 @@ __verify_key_hs(
wt_timestamp_t older_start_ts, older_stop_ts;
uint64_t hs_counter;
uint32_t hs_btree_id;
int cmp, exact;
char ts_string[2][WT_TS_INT_STRING_SIZE];
btree = S2BT(session);
hs_btree_id = btree->id;
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
/*
* Set the data store timestamp and transactions to initiate timestamp range verification. Since
@ -795,36 +794,23 @@ __verify_key_hs(
* Open a history store cursor positioned at the end of the data store key (the newest record)
* and iterate backwards until we reach a different key or btree.
*/
hs_cursor = session->hs_cursor;
hs_cursor->set_key(hs_cursor, hs_btree_id, tmp1, WT_TS_MAX, WT_TXN_MAX);
ret = hs_cursor->search_near(hs_cursor, &exact);
/* If we jumped to the next key, go back to the previous key. */
if (ret == 0 && exact > 0)
ret = hs_cursor->prev(hs_cursor);
hs_cursor->set_key(hs_cursor, 4, hs_btree_id, tmp1, WT_TS_MAX, UINT64_MAX);
ret = __wt_curhs_search_near_before(session, hs_cursor);
for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
WT_RET(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp2, &older_start_ts, &hs_counter));
if (hs_btree_id != btree->id)
break;
WT_RET(__wt_compare(session, NULL, tmp1, vs->tmp2, &cmp));
if (cmp != 0)
break;
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp2, &older_start_ts, &hs_counter));
/* Verify the newer record's start is later than the older record's stop. */
if (newer_start_ts < older_stop_ts) {
WT_RET_MSG(session, WT_ERROR,
WT_ERR_MSG(session, WT_ERROR,
"key %s has a overlap of timestamp ranges between history store stop timestamp %s "
"being newer than a more recent timestamp range having start timestamp %s",
__wt_buf_set_printable(session, tmp1->data, tmp1->size, vs->tmp2),
__verify_timestamp_to_pretty_string(older_stop_ts, ts_string[0]),
__verify_timestamp_to_pretty_string(newer_start_ts, ts_string[1]));
__wt_timestamp_to_string(older_stop_ts, ts_string[0]),
__wt_timestamp_to_string(newer_start_ts, ts_string[1]));
}
if (vs->stable_timestamp != WT_TS_NONE)
WT_RET(
WT_ERR(
__verify_ts_stable_cmp(session, tmp1, NULL, 0, older_start_ts, older_stop_ts, vs));
/*
@ -833,7 +819,8 @@ __verify_key_hs(
*/
newer_start_ts = older_start_ts;
}
err:
WT_TRET(hs_cursor->close(hs_cursor));
return (ret == WT_NOTFOUND ? 0 : ret);
#else
WT_UNUSED(session);

View File

@ -8,15 +8,19 @@
#include "wt_internal.h"
static int __curhs_file_cursor_next(WT_SESSION_IMPL *, WT_CURSOR *);
static int __curhs_file_cursor_open(WT_SESSION_IMPL *, WT_CURSOR **);
static int __curhs_file_cursor_prev(WT_SESSION_IMPL *, WT_CURSOR *);
static int __curhs_file_cursor_search_near(WT_SESSION_IMPL *, WT_CURSOR *, int *);
static int __curhs_prev_visible(WT_SESSION_IMPL *, WT_CURSOR_HS *);
static int __curhs_next_visible(WT_SESSION_IMPL *, WT_CURSOR_HS *);
static int __curhs_search_near_helper(WT_SESSION_IMPL *, WT_CURSOR *, bool);
/*
* __hs_cursor_open_int --
* __curhs_file_cursor_open --
* Open a new history store table cursor, internal function.
*/
static int
__hs_cursor_open_int(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
__curhs_file_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@ -34,12 +38,12 @@ __hs_cursor_open_int(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
}
/*
* __wt_hs_cursor_cache --
* __wt_curhs_cache --
* Cache a new history store table cursor. Open and then close a history store cursor without
* saving it in the session.
*/
int
__wt_hs_cursor_cache(WT_SESSION_IMPL *session)
__wt_curhs_cache(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
WT_CURSOR *cursor;
@ -70,45 +74,17 @@ __wt_hs_cursor_cache(WT_SESSION_IMPL *session)
(session->dhandle != NULL && WT_IS_METADATA(S2BT(session)->dhandle)) ||
session == conn->default_session)
return (0);
WT_RET(__hs_cursor_open_int(session, &cursor));
WT_RET(__curhs_file_cursor_open(session, &cursor));
WT_RET(cursor->close(cursor));
return (0);
}
/*
* __wt_hs_cursor_open --
* Open a new history store table cursor wrapper function.
*/
int
__wt_hs_cursor_open(WT_SESSION_IMPL *session)
{
/* Not allowed to open a cursor if you already have one */
WT_ASSERT(session, session->hs_cursor == NULL);
return (__hs_cursor_open_int(session, &session->hs_cursor));
}
/*
* __wt_hs_cursor_close --
* Discard a history store cursor.
*/
int
__wt_hs_cursor_close(WT_SESSION_IMPL *session)
{
/* Should only be called when session has an open history store cursor */
WT_ASSERT(session, session->hs_cursor != NULL);
WT_RET(session->hs_cursor->close(session->hs_cursor));
session->hs_cursor = NULL;
return (0);
}
/*
* __wt_hs_cursor_next --
* __curhs_file_cursor_next --
* Execute a next operation on a history store cursor with the appropriate isolation level.
*/
int
__wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
static int
__curhs_file_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
{
WT_DECL_RET;
@ -117,11 +93,11 @@ __wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
}
/*
* __wt_hs_cursor_prev --
* __curhs_file_cursor_prev --
* Execute a prev operation on a history store cursor with the appropriate isolation level.
*/
int
__wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
static int
__curhs_file_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
{
WT_DECL_RET;
@ -130,12 +106,12 @@ __wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
}
/*
* __wt_hs_cursor_search_near --
* __curhs_file_cursor_search_near --
* Execute a search near operation on a history store cursor with the appropriate isolation
* level.
*/
int
__wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp)
static int
__curhs_file_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp)
{
WT_DECL_RET;
@ -144,9 +120,35 @@ __wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exa
return (ret);
}
/*
 * __curhs_set_key_ptr --
 *     Make the history store cursor's key reference the file cursor's key buffer. Only the data
 *     pointer and size are copied, not the bytes they describe.
 */
static inline void
__curhs_set_key_ptr(WT_CURSOR *hs_cursor, WT_CURSOR *file_cursor)
{
    const WT_ITEM *src_key;

    src_key = &file_cursor->key;
    hs_cursor->key.data = src_key->data;
    hs_cursor->key.size = src_key->size;

    /* The file cursor must already have a key; mirror its key-set flag bits onto our cursor. */
    WT_ASSERT(CUR2S(file_cursor), F_ISSET(file_cursor, WT_CURSTD_KEY_SET));
    F_SET(hs_cursor, F_MASK(file_cursor, WT_CURSTD_KEY_SET));
}
/*
 * __curhs_set_value_ptr --
 *     Make the history store cursor's value reference the file cursor's value buffer. Only the
 *     data pointer and size are copied, not the bytes they describe.
 */
static inline void
__curhs_set_value_ptr(WT_CURSOR *hs_cursor, WT_CURSOR *file_cursor)
{
    const WT_ITEM *src_value;

    src_value = &file_cursor->value;
    hs_cursor->value.data = src_value->data;
    hs_cursor->value.size = src_value->size;

    /* The file cursor must already have a value; mirror its value-set flag bits onto our cursor. */
    WT_ASSERT(CUR2S(file_cursor), F_ISSET(file_cursor, WT_CURSTD_VALUE_SET));
    F_SET(hs_cursor, F_MASK(file_cursor, WT_CURSTD_VALUE_SET));
}
/*
* __curhs_next --
* WT_CURSOR->next method for the hs cursor type.
* WT_CURSOR->next method for the history store cursor type.
*/
static int
__curhs_next(WT_CURSOR *cursor)
@ -160,7 +162,7 @@ __curhs_next(WT_CURSOR *cursor)
file_cursor = hs_cursor->file_cursor;
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, next, CUR2BT(file_cursor));
WT_ERR(__wt_hs_cursor_next(session, file_cursor));
WT_ERR(__curhs_file_cursor_next(session, file_cursor));
/*
* We need to check if the history store record is visible to the current session. If not, the
* __curhs_next_visible() will also keep iterating forward through the records until it finds a
@ -168,6 +170,9 @@ __curhs_next(WT_CURSOR *cursor)
*/
WT_ERR(__curhs_next_visible(session, hs_cursor));
__curhs_set_key_ptr(cursor, file_cursor);
__curhs_set_value_ptr(cursor, file_cursor);
if (0) {
err:
WT_TRET(cursor->reset(cursor));
@ -177,7 +182,7 @@ err:
/*
* __curhs_prev --
* WT_CURSOR->prev method for the hs cursor type.
* WT_CURSOR->prev method for the history store cursor type.
*/
static int
__curhs_prev(WT_CURSOR *cursor)
@ -191,7 +196,7 @@ __curhs_prev(WT_CURSOR *cursor)
file_cursor = hs_cursor->file_cursor;
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, prev, CUR2BT(file_cursor));
WT_ERR(__wt_hs_cursor_prev(session, file_cursor));
WT_ERR(__curhs_file_cursor_prev(session, file_cursor));
/*
* We need to check if the history store record is visible to the current session. If not, the
* __curhs_prev_visible() will also keep iterating backwards through the records until it finds
@ -199,6 +204,9 @@ __curhs_prev(WT_CURSOR *cursor)
*/
WT_ERR(__curhs_prev_visible(session, hs_cursor));
__curhs_set_key_ptr(cursor, file_cursor);
__curhs_set_value_ptr(cursor, file_cursor);
if (0) {
err:
WT_TRET(cursor->reset(cursor));
@ -208,7 +216,7 @@ err:
/*
* __curhs_close --
* WT_CURSOR->close method for the hs cursor type.
* WT_CURSOR->close method for the history store cursor type.
*/
static int
__curhs_close(WT_CURSOR *cursor)
@ -216,7 +224,6 @@ __curhs_close(WT_CURSOR *cursor)
WT_CURSOR *file_cursor;
WT_CURSOR_HS *hs_cursor;
WT_DECL_RET;
WT_ITEM *datastore_key;
WT_SESSION_IMPL *session;
hs_cursor = (WT_CURSOR_HS *)cursor;
@ -224,11 +231,11 @@ __curhs_close(WT_CURSOR *cursor)
CURSOR_API_CALL_PREPARE_ALLOWED(
cursor, session, close, file_cursor == NULL ? NULL : CUR2BT(file_cursor));
err:
__wt_scr_free(session, &hs_cursor->datastore_key);
if (file_cursor != NULL)
WT_TRET(file_cursor->close(file_cursor));
datastore_key = &hs_cursor->datastore_key;
__wt_scr_free(session, &datastore_key);
__wt_cursor_close(cursor);
--session->hs_cursor_counter;
API_END_RET(session, ret);
}
@ -252,9 +259,15 @@ __curhs_reset(WT_CURSOR *cursor)
ret = file_cursor->reset(file_cursor);
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
hs_cursor->btree_id = 0;
hs_cursor->datastore_key.data = NULL;
hs_cursor->datastore_key.size = 0;
hs_cursor->datastore_key->data = NULL;
hs_cursor->datastore_key->size = 0;
hs_cursor->flags = 0;
cursor->key.data = NULL;
cursor->key.size = 0;
cursor->value.data = NULL;
cursor->value.size = 0;
F_CLR(cursor, WT_CURSTD_KEY_SET);
F_CLR(cursor, WT_CURSTD_VALUE_SET);
err:
API_END_RET(session, ret);
@ -262,7 +275,7 @@ err:
/*
* __curhs_set_key --
* WT_CURSOR->set_key method for the hs cursor type.
* WT_CURSOR->set_key method for the history store cursor type.
*/
static void
__curhs_set_key(WT_CURSOR *cursor, ...)
@ -282,6 +295,7 @@ __curhs_set_key(WT_CURSOR *cursor, ...)
start_ts = WT_TS_NONE;
counter = 0;
hs_cursor->flags = 0;
va_start(ap, cursor);
arg_count = va_arg(ap, uint32_t);
@ -292,11 +306,11 @@ __curhs_set_key(WT_CURSOR *cursor, ...)
if (arg_count > 1) {
datastore_key = va_arg(ap, WT_ITEM *);
WT_IGNORE_RET(__wt_buf_set(
session, &hs_cursor->datastore_key, datastore_key->data, datastore_key->size));
session, hs_cursor->datastore_key, datastore_key->data, datastore_key->size));
F_SET(hs_cursor, WT_HS_CUR_KEY_SET);
} else {
hs_cursor->datastore_key.data = NULL;
hs_cursor->datastore_key.size = 0;
hs_cursor->datastore_key->data = NULL;
hs_cursor->datastore_key->size = 0;
F_CLR(hs_cursor, WT_HS_CUR_KEY_SET);
}
@ -315,7 +329,9 @@ __curhs_set_key(WT_CURSOR *cursor, ...)
va_end(ap);
file_cursor->set_key(
file_cursor, hs_cursor->btree_id, &hs_cursor->datastore_key, start_ts, counter);
file_cursor, hs_cursor->btree_id, hs_cursor->datastore_key, start_ts, counter);
__curhs_set_key_ptr(cursor, file_cursor);
}
/*
@ -342,8 +358,8 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
for (; ret == 0; ret = __wt_hs_cursor_prev(session, file_cursor)) {
WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter));
for (; ret == 0; ret = __curhs_file_cursor_prev(session, file_cursor)) {
WT_ERR(file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter));
/* Stop before crossing over to the next btree. */
if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) {
@ -356,7 +372,7 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
* have crossed over the desired key and not found the record we are looking for.
*/
if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp));
WT_ERR(__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp));
if (cmp != 0) {
ret = WT_NOTFOUND;
goto err;
@ -379,6 +395,12 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED))
break;
/*
* If we are using a history store cursor and haven't set the WT_CURSTD_HS_READ_COMMITTED
* flag then we must have a snapshot, assert that we do.
*/
WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT));
if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw)) {
/*
* If the stop time point of a record is visible to us, we won't be able to see anything
@ -425,8 +447,8 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
for (; ret == 0; ret = __wt_hs_cursor_next(session, file_cursor)) {
WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter));
for (; ret == 0; ret = __curhs_file_cursor_next(session, file_cursor)) {
WT_ERR(file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter));
/* Stop before crossing over to the next btree. */
if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) {
@ -439,7 +461,7 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
* have crossed over the desired key and not found the record we are looking for.
*/
if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp));
WT_ERR(__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp));
if (cmp != 0) {
ret = WT_NOTFOUND;
goto err;
@ -462,6 +484,12 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED))
break;
/*
* If we are using a history store cursor and haven't set the WT_CURSTD_HS_READ_COMMITTED
* flag then we must have a snapshot, assert that we do.
*/
WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT));
/*
* If the stop time point of a record is visible to us, check the next one.
*/
@ -478,171 +506,268 @@ err:
return (ret);
}
/*
 * __wt_curhs_search_near_before --
 *     Position the cursor at the requested position or, if that is not possible, before it. Thin
 *     wrapper that runs the shared search-near helper in its "before" mode.
 */
int
__wt_curhs_search_near_before(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
{
    const bool before = true;

    return (__curhs_search_near_helper(session, cursor, before));
}
/*
 * __wt_curhs_search_near_after --
 *     Position the cursor at the requested position or, if that is not possible, after it. Thin
 *     wrapper that runs the shared search-near helper in its "after" mode.
 */
int
__wt_curhs_search_near_after(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
{
    const bool before = false;

    return (__curhs_search_near_helper(session, cursor, before));
}
/*
 * __curhs_search_near_helper --
 *     Shared implementation of the "search near before/after" entry points. A copy of the key
 *     currently set on the cursor is taken, search_near is called, and if the cursor landed on the
 *     wrong side of that key it is stepped (prev when "before" is true, next otherwise) until the
 *     comparison against the saved key says we are on or past it in the wanted direction.
 *
 *     Returns zero on success; otherwise the error from whichever call failed, including the one
 *     returned by prev/next when the walk cannot continue.
 */
static int
__curhs_search_near_helper(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool before)
{
    WT_DECL_ITEM(srch_key);
    WT_DECL_RET;
    int (*step)(WT_CURSOR *);
    int cmp;
    bool wrong_side;

    WT_RET(__wt_scr_alloc(session, 0, &srch_key));

    /* Remember the key we were asked for: the cursor's own key changes as it moves. */
    WT_ERR(__wt_buf_set(session, srch_key, cursor->key.data, cursor->key.size));
    WT_ERR(cursor->search_near(cursor, &cmp));

    /*
     * To land on a key smaller than or equal to the search key we walk backwards; to land on a key
     * larger than or equal to it we walk forwards. Either way we keep walking rather than taking a
     * single step, as there may be content inserted concurrently.
     */
    step = before ? cursor->prev : cursor->next;
    wrong_side = before ? cmp > 0 : cmp < 0;

    while (wrong_side && (ret = step(cursor)) == 0) {
        WT_STAT_CONN_INCR(session, cursor_skip_hs_cur_position);
        WT_STAT_DATA_INCR(session, cursor_skip_hs_cur_position);

        WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp));

        /* Stop as soon as the current key is on the requested side of (or equal to) the target. */
        wrong_side = before ? cmp > 0 : cmp < 0;
    }

err:
    __wt_scr_free(session, &srch_key);
    return (ret);
}
/*
* __curhs_search_near --
* WT_CURSOR->search_near method for the history store cursor type. Searches the underlying file
* cursor, uses __curhs_next_visible/__curhs_prev_visible to settle on a record, and reports
* through exactp how the final position compares with the search key. On error the cursor is
* reset before returning.
*
* NOTE(review): this span appears to carry two revisions of the routine at once (for example
* "int cmp; int exact;" next to "int exact, cmp;", and two consecutive file-cursor search_near
* calls). Confirm which statements are current before changing any logic here.
*/
static int
__curhs_search_near(WT_CURSOR *cursor, int *exactp)
{
WT_CURSOR *file_cursor;
WT_CURSOR_HS *hs_cursor;
WT_DECL_ITEM(datastore_key);
WT_DECL_ITEM(srch_key);
WT_DECL_RET;
WT_SESSION_IMPL *session;
int cmp;
int exact;
wt_timestamp_t start_ts;
uint64_t counter;
uint32_t btree_id;
int exact, cmp;
hs_cursor = (WT_CURSOR_HS *)cursor;
file_cursor = hs_cursor->file_cursor;
*exactp = 0;
cmp = 0;
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, search_near, CUR2BT(file_cursor));
WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
WT_ERR(__wt_scr_alloc(session, 0, &srch_key));
/* At least we have the btree id set. */
WT_ASSERT(session, F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET));
/* Keep a copy of the raw search key; later cursor positions are compared against it. */
WT_ERR(__wt_buf_set(session, srch_key, file_cursor->key.data, file_cursor->key.size));
/* Reset cursor if we get WT_NOTFOUND. */
WT_ERR(__wt_hs_cursor_search_near(session, file_cursor, &exact));
WT_ERR(__curhs_file_cursor_search_near(session, file_cursor, &exact));
/*
* There are some key fields missing so we are searching a range of keys. Place the cursor at
* the start of the range.
*/
if (!F_ISSET(hs_cursor, WT_HS_CUR_COUNTER_SET)) {
if (exact >= 0) {
/*
* If we raced with a history store insert, we may be two or more records away from our
* target. Keep iterating forwards until we are on or past our target key.
*
* We can't use the cursor positioning helper that we use for regular reads since that will
* place us at the end of a particular key/timestamp range whereas we want to be placed at
* the beginning.
* We placed the file cursor before the search key. Try first to walk forwards to see if we
* can find a visible record. If nothing is visible, try to walk backwards.
*/
if (exact < 0) {
while ((ret = __wt_hs_cursor_next(session, file_cursor)) == 0) {
WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
if (cmp >= 0)
break;
}
WT_ERR_NOTFOUND_OK(__curhs_next_visible(session, hs_cursor), true);
if (ret == WT_NOTFOUND) {
/*
* No entries greater than or equal to the key we searched for. Reset cursor if we get
* WT_NOTFOUND.
* When walking backwards, first ensure we walk back to the specified btree or key space
* as we may have crossed the boundary. Do that in a loop as there may be content
* inserted concurrently.
*/
while ((ret = __curhs_file_cursor_prev(session, file_cursor)) == 0) {
WT_ERR(
file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter));
/* We are back in the specified btree range. */
if (btree_id == hs_cursor->btree_id && F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
WT_ERR(
__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp));
/* We are back in the specified key range. */
if (cmp == 0)
break;
/*
* We are now smaller than the key range, which indicates nothing is visible to
* us in the specified key range.
*/
if (cmp < 0) {
ret = WT_NOTFOUND;
goto err;
}
}
/*
* We are now smaller than the btree range, which indicates nothing is visible to us
* in the specified btree range.
*/
if (btree_id < hs_cursor->btree_id) {
ret = WT_NOTFOUND;
goto err;
}
}
WT_ERR(ret);
*exactp = cmp;
} else
*exactp = 1;
WT_ERR(__curhs_next_visible(session, hs_cursor));
}
/* Search the closest match that is smaller or equal to the search key. */
else {
/*
* Because of the special visibility rules for the history store, a new key can appear in
* between our search and the set of updates that we're interested in. Keep trying until we
* find it.
*
* There may be no history store entries for the given btree id and record key if they have
* been removed by rollback to stable.
*
* Note that we need to compare the raw key off the cursor to determine where we are in the
* history store as opposed to comparing the embedded data store key since the ordering is
* not guaranteed to be the same.
*/
if (exact > 0) {
/*
* It's possible that we may race with a history store insert for another key. So we may
* be more than one record away from the end of our target key/timestamp range. Keep
* iterating backwards until we land on our key.
* Keep looking for the first visible update in the specified range when walking
* backwards.
*/
WT_ERR(__curhs_prev_visible(session, hs_cursor));
/*
* We can't find anything visible when first walking forwards so we must have found an
* update that is smaller than the specified key.
*/
while ((ret = __wt_hs_cursor_prev(session, file_cursor)) == 0) {
WT_STAT_CONN_DATA_INCR(session, cursor_skip_hs_cur_position);
WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
if (cmp <= 0)
break;
}
*exactp = cmp;
} else
*exactp = -1;
#ifdef HAVE_DIAGNOSTIC
if (ret == 0) {
WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
WT_ASSERT(session, cmp <= 0);
} else {
WT_ERR(ret);
/*
* We find an update when walking forwards. If initially we land on the same key as the
* specified key, exact will be 0 and we should return that. If it is not visible, we
* must have found a key that is larger than the specified key.
*/
*exactp = exact;
}
} else {
/*
* We placed the file cursor after the search key. Try first to walk backwards to see if we
* can find a visible record. If nothing is visible, try to walk forwards.
*/
WT_ERR_NOTFOUND_OK(__curhs_prev_visible(session, hs_cursor), true);
if (ret == WT_NOTFOUND) {
/*
* When walking forwards, first ensure we walk back to the specified btree or key space
* as we may have crossed the boundary. Do that in a loop as there may be content
* inserted concurrently.
*/
while ((ret = __curhs_file_cursor_next(session, file_cursor)) == 0) {
WT_ERR(
file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter));
/* We are back in the specified btree range. */
if (btree_id == hs_cursor->btree_id && F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
WT_ERR(
__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp));
/* We are back in the specified key range. */
if (cmp == 0)
break;
/*
* We are now larger than the key range, which indicates nothing is visible to
* us in the specified key range.
*/
if (cmp > 0) {
ret = WT_NOTFOUND;
goto err;
}
}
/*
* We are now larger than the btree range, which indicates nothing is visible to us
* in the specified btree range.
*/
if (btree_id > hs_cursor->btree_id) {
ret = WT_NOTFOUND;
goto err;
}
}
WT_ERR(ret);
/*
* Keep looking for the first visible update in the specified range when walking
* forwards.
*/
WT_ERR(__curhs_next_visible(session, hs_cursor));
/*
* We can't find anything visible when first walking backwards so we must have found an
* update that is larger than the specified key.
*/
*exactp = 1;
} else {
WT_ERR(ret);
*exactp = exact;
}
}
#ifdef HAVE_DIAGNOSTIC
/* Diagnostic builds check that the sign of *exactp agrees with the raw key comparison. */
WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
WT_ASSERT(
session, (cmp == 0 && *exactp == 0) || (cmp < 0 && *exactp < 0) || (cmp > 0 && *exactp > 0));
#endif
WT_ERR(__curhs_prev_visible(session, hs_cursor));
}
__curhs_set_key_ptr(cursor, file_cursor);
__curhs_set_value_ptr(cursor, file_cursor);
if (0) {
err:
/* On any error, reset the cursor before returning. */
WT_TRET(cursor->reset(cursor));
}
/* Both scratch buffers are released on the success and error paths alike. */
__wt_scr_free(session, &datastore_key);
__wt_scr_free(session, &srch_key);
API_END_RET(session, ret);
}
/*
 * __curhs_get_key --
 *     WT_CURSOR->get_key method for the hs cursor type. Forwards the caller's four output
 *     arguments (btree id, data store key, start timestamp, counter) to the underlying file
 *     cursor and returns its result.
 */
static int
__curhs_get_key(WT_CURSOR *cursor, ...)
{
    WT_CURSOR *file_cursor;
    WT_CURSOR_HS *hs_cursor;
    WT_DECL_RET;
    WT_ITEM *datastore_key;
    wt_timestamp_t *start_tsp;
    uint64_t *counterp;
    uint32_t *btree_idp;
    va_list ap;

    hs_cursor = (WT_CURSOR_HS *)cursor;
    file_cursor = hs_cursor->file_cursor;

    /*
     * Pull the arguments off the list one statement at a time. C does not define the order in
     * which the arguments of a call are evaluated, so expanding va_arg directly inside the
     * argument list could hand the pointers to the file cursor in the wrong order.
     *
     * The key item is read as WT_ITEM *, which is what callers pass for this slot.
     */
    va_start(ap, cursor);
    btree_idp = va_arg(ap, uint32_t *);
    datastore_key = va_arg(ap, WT_ITEM *);
    start_tsp = va_arg(ap, wt_timestamp_t *);
    counterp = va_arg(ap, uint64_t *);
    va_end(ap);

    ret = file_cursor->get_key(file_cursor, btree_idp, datastore_key, start_tsp, counterp);

    return (ret);
}
/*
 * __curhs_get_value --
 *     WT_CURSOR->get_value method for the hs cursor type. Forwards the caller's four output
 *     arguments (two timestamps, the update type and the value item) to the underlying file
 *     cursor and returns its result.
 */
static int
__curhs_get_value(WT_CURSOR *cursor, ...)
{
    WT_CURSOR *file_cursor;
    WT_CURSOR_HS *hs_cursor;
    WT_DECL_RET;
    WT_ITEM *hs_value;
    wt_timestamp_t *first_tsp, *second_tsp;
    uint64_t *upd_typep;
    va_list ap;

    hs_cursor = (WT_CURSOR_HS *)cursor;
    file_cursor = hs_cursor->file_cursor;

    /*
     * Pull the arguments off the list one statement at a time. C does not define the order in
     * which the arguments of a call are evaluated, so expanding va_arg directly inside the
     * argument list could hand the pointers to the file cursor in the wrong order. The two
     * timestamp pointers share a type, so a swap there would go completely unnoticed.
     *
     * The value item is read as WT_ITEM *, which is what callers pass for this slot.
     */
    va_start(ap, cursor);
    first_tsp = va_arg(ap, wt_timestamp_t *);
    second_tsp = va_arg(ap, wt_timestamp_t *);
    upd_typep = va_arg(ap, uint64_t *);
    hs_value = va_arg(ap, WT_ITEM *);
    va_end(ap);

    ret = file_cursor->get_value(file_cursor, first_tsp, second_tsp, upd_typep, hs_value);

    return (ret);
}
/*
* __curhs_set_value --
* WT_CURSOR->set_value method for the hs cursor type.
* WT_CURSOR->set_value method for the history store cursor type.
*/
static void
__curhs_set_value(WT_CURSOR *cursor, ...)
{
WT_CURSOR *file_cursor;
WT_CURSOR_HS *hs_cursor;
WT_ITEM *hs_val;
wt_timestamp_t start_ts;
wt_timestamp_t stop_ts;
uint64_t type;
va_list ap;
hs_cursor = (WT_CURSOR_HS *)cursor;
@ -650,14 +775,20 @@ __curhs_set_value(WT_CURSOR *cursor, ...)
va_start(ap, cursor);
hs_cursor->time_window = *va_arg(ap, WT_TIME_WINDOW *);
file_cursor->set_value(file_cursor, va_arg(ap, wt_timestamp_t), va_arg(ap, wt_timestamp_t),
va_arg(ap, uint64_t), va_arg(ap, WT_ITEM *));
stop_ts = va_arg(ap, wt_timestamp_t);
start_ts = va_arg(ap, wt_timestamp_t);
type = va_arg(ap, uint64_t);
hs_val = va_arg(ap, WT_ITEM *);
file_cursor->set_value(file_cursor, stop_ts, start_ts, type, hs_val);
va_end(ap);
__curhs_set_value_ptr(cursor, file_cursor);
}
/*
* __curhs_insert --
* WT_CURSOR->insert method for the hs cursor type.
* WT_CURSOR->insert method for the history store cursor type.
*/
static int
__curhs_insert(WT_CURSOR *cursor)
@ -676,6 +807,12 @@ __curhs_insert(WT_CURSOR *cursor)
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, insert, CUR2BT(file_cursor));
/*
* Disable bulk loads into history store. This would normally occur when updating a record with
* a cursor however the history store doesn't use cursor update, so we do it here.
*/
__wt_cursor_disable_bulk(session);
/* Allocate a tombstone only when there is a valid stop time point. */
if (WT_TIME_WINDOW_HAS_STOP(&hs_cursor->time_window)) {
/*
@ -701,7 +838,6 @@ __curhs_insert(WT_CURSOR *cursor)
if (hs_tombstone != NULL) {
hs_tombstone->next = hs_upd;
hs_upd = hs_tombstone;
hs_tombstone = NULL;
}
retry:
@ -725,7 +861,7 @@ err:
/*
* __curhs_remove --
* WT_CURSOR->remove method for the hs cursor type.
* WT_CURSOR->remove method for the history store cursor type.
*/
static int
__curhs_remove(WT_CURSOR *cursor)
@ -734,9 +870,14 @@ __curhs_remove(WT_CURSOR *cursor)
WT_CURSOR_BTREE *cbt;
WT_CURSOR_HS *hs_cursor;
WT_DECL_RET;
WT_ITEM hs_key;
WT_SESSION_IMPL *session;
WT_UPDATE *hs_tombstone;
wt_timestamp_t hs_start_ts;
uint64_t hs_counter;
uint32_t hs_btree_id;
WT_CLEAR(hs_key);
hs_cursor = (WT_CURSOR_HS *)cursor;
file_cursor = hs_cursor->file_cursor;
cbt = (WT_CURSOR_BTREE *)file_cursor;
@ -745,7 +886,9 @@ __curhs_remove(WT_CURSOR *cursor)
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, insert, CUR2BT(file_cursor));
/* Remove must be called with cursor positioned. */
WT_ASSERT(session, F_ISSET(file_cursor, WT_CURSTD_KEY_INT));
WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_KEY_INT));
WT_ERR(cursor->get_key(cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
/*
* Since we're using internal functions to modify the row structure, we need to manually set the
@ -765,6 +908,7 @@ __curhs_remove(WT_CURSOR *cursor)
/* Invalidate the previous value but we will hold on to the position of the key. */
F_CLR(file_cursor, WT_CURSTD_VALUE_SET);
F_CLR(cursor, WT_CURSTD_VALUE_SET);
if (0) {
err:
@ -777,7 +921,7 @@ err:
/*
* __curhs_update --
* WT_CURSOR->update method for the hs cursor type.
* WT_CURSOR->update method for the history store cursor type.
*/
static int
__curhs_update(WT_CURSOR *cursor)
@ -785,15 +929,11 @@ __curhs_update(WT_CURSOR *cursor)
WT_CURSOR *file_cursor;
WT_CURSOR_BTREE *cbt;
WT_CURSOR_HS *hs_cursor;
WT_DECL_ITEM(hs_value);
WT_DECL_RET;
WT_SESSION_IMPL *session;
WT_UPDATE *hs_tombstone, *hs_upd;
bool retry;
uint64_t hs_upd_type;
wt_timestamp_t hs_durable_ts, hs_stop_durable_ts;
hs_cursor = (WT_CURSOR_HS *)cursor;
file_cursor = hs_cursor->file_cursor;
cbt = (WT_CURSOR_BTREE *)file_cursor;
@ -814,34 +954,12 @@ __curhs_update(WT_CURSOR *cursor)
WT_ASSERT(session, !WT_TIME_WINDOW_IS_EMPTY(&hs_cursor->time_window));
WT_ASSERT(session, WT_TIME_WINDOW_HAS_STOP(&hs_cursor->time_window));
/*
* Ideally we want to check if we are positioned on the newest value for user key. However, we
* can't check if the timestamp was set to WT_TS_MAX when we searched for the key. We can call
* next() on cursor to confirm there is no newer value but that would disturb our cursor. A more
* expensive method would be to search again and verify.
*/
/* The tombstone to represent the stop time window. */
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_tombstone, NULL));
hs_tombstone->start_ts = hs_cursor->time_window.stop_ts;
hs_tombstone->durable_ts = hs_cursor->time_window.durable_stop_ts;
hs_tombstone->txnid = hs_cursor->time_window.stop_txn;
/* Modify the existing value with a new stop timestamp. */
/* Allocate a buffer for the history store value. */
WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
/* Retrieve the existing update value and stop timestamp. */
WT_ERR(file_cursor->get_value(
file_cursor, &hs_stop_durable_ts, &hs_durable_ts, &hs_upd_type, hs_value));
WT_ASSERT(session, hs_stop_durable_ts == WT_TS_MAX);
WT_ASSERT(session, (uint8_t)hs_upd_type == WT_UPDATE_STANDARD);
/* Use set_value method to pack the new value. */
file_cursor->set_value(
file_cursor, hs_cursor->time_window.stop_ts, hs_durable_ts, hs_upd_type, hs_value);
WT_ERR(__wt_upd_alloc(session, &file_cursor->value, WT_UPDATE_STANDARD, &hs_upd, NULL));
hs_upd->start_ts = hs_cursor->time_window.start_ts;
hs_upd->durable_ts = hs_cursor->time_window.durable_start_ts;
@ -850,6 +968,11 @@ __curhs_update(WT_CURSOR *cursor)
/* Connect the tombstone to the update. */
hs_tombstone->next = hs_upd;
/*
* Since we're using internal functions to modify the row structure, we need to manually set the
* comparison to an exact match.
*/
cbt->compare = 0;
/* Make the updates and if we fail, search and try again. */
while ((ret = __wt_hs_modify(cbt, hs_tombstone)) == WT_RESTART) {
WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(cbt, &file_cursor->key, false));
@ -863,11 +986,13 @@ __curhs_update(WT_CURSOR *cursor)
WT_TRET(ret);
}
__curhs_set_key_ptr(cursor, file_cursor);
__curhs_set_value_ptr(cursor, file_cursor);
if (0) {
err:
__wt_free(session, hs_tombstone);
__wt_free(session, hs_upd);
__wt_scr_free(session, &hs_value);
WT_TRET(cursor->reset(cursor));
}
API_END_RET(session, ret);
@ -880,53 +1005,54 @@ err:
int
__wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
{
WT_CURSOR_STATIC_INIT(iface, __curhs_get_key, /* get-key */
__curhs_get_value, /* get-value */
__curhs_set_key, /* set-key */
__curhs_set_value, /* set-value */
__wt_cursor_compare_notsup, /* compare */
__wt_cursor_equals_notsup, /* equals */
__curhs_next, /* next */
__curhs_prev, /* prev */
__curhs_reset, /* reset */
__wt_cursor_notsup, /* search */
__curhs_search_near, /* search-near */
__curhs_insert, /* insert */
__wt_cursor_modify_value_format_notsup, /* modify */
__curhs_update, /* update */
__curhs_remove, /* remove */
__wt_cursor_notsup, /* reserve */
__wt_cursor_reconfigure_notsup, /* reconfigure */
__wt_cursor_notsup, /* cache */
__wt_cursor_reopen_notsup, /* reopen */
__curhs_close); /* close */
WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */
__wt_cursor_get_value, /* get-value */
__curhs_set_key, /* set-key */
__curhs_set_value, /* set-value */
__wt_cursor_compare_notsup, /* compare */
__wt_cursor_equals_notsup, /* equals */
__curhs_next, /* next */
__curhs_prev, /* prev */
__curhs_reset, /* reset */
__wt_cursor_notsup, /* search */
__curhs_search_near, /* search-near */
__curhs_insert, /* insert */
__wt_cursor_modify_value_format_notsup, /* modify */
__curhs_update, /* update */
__curhs_remove, /* remove */
__wt_cursor_notsup, /* reserve */
__wt_cursor_reconfigure_notsup, /* reconfigure */
__wt_cursor_notsup, /* cache */
__wt_cursor_reopen_notsup, /* reopen */
__curhs_close); /* close */
WT_CURSOR *cursor;
WT_CURSOR_HS *hs_cursor;
WT_DECL_RET;
WT_ITEM *datastore_key;
*cursorp = NULL;
WT_RET(__wt_calloc_one(session, &hs_cursor));
++session->hs_cursor_counter;
cursor = (WT_CURSOR *)hs_cursor;
*cursor = iface;
cursor->session = (WT_SESSION *)session;
cursor->key_format = WT_HS_KEY_FORMAT;
cursor->value_format = WT_HS_VALUE_FORMAT;
WT_ERR(__wt_strdup(session, WT_HS_URI, &cursor->uri));
/* Open the file cursor for operations on the regular history store. */
WT_ERR(__hs_cursor_open_int(session, &hs_cursor->file_cursor));
WT_ERR(__curhs_file_cursor_open(session, &hs_cursor->file_cursor));
WT_ERR(__wt_cursor_init(cursor, WT_HS_URI, owner, NULL, cursorp));
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
hs_cursor->btree_id = 0;
datastore_key = &hs_cursor->datastore_key;
WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
WT_ERR(__wt_scr_alloc(session, 0, &hs_cursor->datastore_key));
hs_cursor->flags = 0;
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
if (0) {
err:
WT_TRET(__curhs_close(cursor));
WT_TRET(cursor->close(cursor));
*cursorp = NULL;
}
return (ret);

View File

@ -285,7 +285,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
* busy and then opens a different file (in this case, the HS file), it can deadlock with a
* thread waiting for the first file to drain from the eviction queue. See WT-5946 for details.
*/
WT_RET(__wt_hs_cursor_cache(session));
WT_RET(__wt_curhs_cache(session));
if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) {
/*
* Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We
@ -2330,7 +2330,6 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_CURSOR *hs_cursor_saved;
WT_DECL_RET;
WT_TRACK_OP_DECL;
WT_TXN_GLOBAL *txn_global;
@ -2348,22 +2347,13 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
txn_global = &conn->txn_global;
txn_shared = WT_SESSION_TXN_SHARED(session);
/*
* If we have a history store cursor, save it. This ensures that if eviction needs to access the
* history store, it will get its own cursor, avoiding potential problems if it were to
* reposition or reset a history store cursor that we're in the middle of using for something
* else.
*/
hs_cursor_saved = session->hs_cursor;
session->hs_cursor = NULL;
/*
* Before we enter the eviction generation, make sure this session has a cached history store
* cursor, otherwise we can deadlock with a session wanting exclusive access to a handle: that
* session will have a handle list write lock and will be waiting on eviction to drain, we'll be
* inside eviction waiting on a handle list read lock to open a history store cursor.
*/
WT_ERR(__wt_hs_cursor_cache(session));
WT_ERR(__wt_curhs_cache(session));
/*
* It is not safe to proceed if the eviction server threads aren't setup yet.
@ -2464,12 +2454,6 @@ err:
done:
WT_TRACK_OP_END(session);
/* If the caller was using a history store cursor they should have closed it by now. */
WT_ASSERT(session, session->hs_cursor == NULL);
/* Restore the caller's history store cursor. */
session->hs_cursor = hs_cursor_saved;
return (ret);
}

View File

@ -76,7 +76,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
evict_flags = LF_ISSET(WT_READ_NO_SPLIT) ? WT_EVICT_CALL_NO_SPLIT : 0;
FLD_SET(evict_flags, WT_EVICT_CALL_URGENT);
WT_RET(__wt_hs_cursor_cache(session));
WT_RET(__wt_curhs_cache(session));
(void)__wt_atomic_addv32(&btree->evict_busy, 1);
ret = __wt_evict(session, ref, previous_state, evict_flags);
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
@ -131,7 +131,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32
/*
* Track history store pages being force evicted while holding a history store cursor open.
*/
if (session->hs_cursor != NULL && WT_IS_HS(session->dhandle)) {
if (session->hs_cursor_counter > 0 && WT_IS_HS(session->dhandle)) {
force_evict_hs = true;
WT_STAT_CONN_INCR(session, cache_eviction_force_hs);
}

View File

@ -55,22 +55,20 @@ __hs_cleanup_las(WT_SESSION_IMPL *session)
/*
* __wt_hs_get_btree --
* Get the history store btree by opening a history store cursor. *hs_btreep is cleared on entry
* and is asserted to be non-NULL once a cursor has been opened; the cursor close results are
* passed through WT_TRET before ret is returned.
*
* NOTE(review): this span appears to carry two revisions of the routine at once, one going
* through the session's hs_cursor (__wt_hs_cursor_open/__wt_hs_cursor_close) and one through a
* local cursor (__wt_curhs_open/close). As written the second assignment to *hs_btreep
* overwrites the first; confirm which statements are current before changing any logic here.
*/
int
__wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep)
{
WT_CURSOR *hs_cursor;
WT_DECL_RET;
/* Give the caller a defined value even if opening a cursor fails below. */
*hs_btreep = NULL;
WT_RET(__wt_hs_cursor_open(session));
*hs_btreep = CUR2BT(session->hs_cursor);
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
*hs_btreep = __wt_curhs_get_btree(hs_cursor);
WT_ASSERT(session, *hs_btreep != NULL);
/* The cursor is only needed to look up the btree, so close it before returning. */
WT_TRET(__wt_hs_cursor_close(session));
WT_TRET(hs_cursor->close(hs_cursor));
return (ret);
}

View File

@ -87,117 +87,39 @@ __wt_hs_modify(WT_CURSOR_BTREE *hs_cbt, WT_UPDATE *hs_upd)
}
/*
* __hs_cursor_position_int --
* Internal function to position a history store cursor at the end of a set of updates for a
* given btree id, record key and timestamp.
* __wt_hs_upd_time_window --
* Get the underlying time window of the update the history store cursor is positioned at.
*/
static int
__hs_cursor_position_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id,
const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key)
void
__wt_hs_upd_time_window(WT_CURSOR *hs_cursor, WT_TIME_WINDOW **twp)
{
WT_DECL_ITEM(srch_key);
WT_DECL_RET;
int cmp, exact;
WT_CURSOR_BTREE *hs_cbt;
/* The session should be pointing at the history store btree. */
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
if (user_srch_key == NULL)
WT_RET(__wt_scr_alloc(session, 0, &srch_key));
else
srch_key = user_srch_key;
/*
* Because of the special visibility rules for the history store, a new key can appear in
* between our search and the set of updates that we're interested in. Keep trying until we find
* it.
*
* There may be no history store entries for the given btree id and record key if they have been
* removed by WT_CONNECTION::rollback_to_stable.
*
* Note that we need to compare the raw key off the cursor to determine where we are in the
* history store as opposed to comparing the embedded data store key since the ordering is not
* guaranteed to be the same.
*/
cursor->set_key(cursor, btree_id, key, timestamp, UINT64_MAX);
/* Copy the raw key before searching as a basis for comparison. */
WT_ERR(__wt_buf_set(session, srch_key, cursor->key.data, cursor->key.size));
WT_ERR(cursor->search_near(cursor, &exact));
if (exact > 0) {
/*
* It's possible that we may race with a history store insert for another key. So we may be
* more than one record away from the end of our target key/timestamp range. Keep iterating
* backwards until we land on our key.
*/
while ((ret = cursor->prev(cursor)) == 0) {
WT_STAT_CONN_DATA_INCR(session, cursor_skip_hs_cur_position);
WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp));
if (cmp <= 0)
break;
}
}
#ifdef HAVE_DIAGNOSTIC
if (ret == 0) {
WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp));
WT_ASSERT(session, cmp <= 0);
}
#endif
err:
if (user_srch_key == NULL)
__wt_scr_free(session, &srch_key);
return (ret);
hs_cbt = __wt_curhs_get_cbt(hs_cursor);
*twp = &hs_cbt->upd_value->tw;
}
/*
* __wt_hs_cursor_position --
* Position a history store cursor at the end of a set of updates for a given btree id, record
* key and timestamp. There may be no history store entries for the given btree id and record
* key if they have been removed by WT_CONNECTION::rollback_to_stable. There is an optional
* argument to store the key that we used to position the cursor which can be used to assess
* where the cursor is relative to it. The function executes with isolation level set as
* WT_ISO_READ_UNCOMMITTED.
* __wt_hs_find_upd --
* Scan the history store for a record the btree cursor wants to position on. Create an update
* for the record and return to the caller.
*/
int
__wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id,
const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key)
{
WT_DECL_RET;
WT_WITH_BTREE(session, CUR2BT(cursor),
WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
ret = __hs_cursor_position_int(session, cursor, btree_id, key, timestamp, user_srch_key)));
return (ret);
}
/*
* __hs_find_upd_int --
* Internal helper to scan the history store for a record the btree cursor wants to position on.
* Create an update for the record and return to the caller. The caller may choose to optionally
* allow prepared updates to be returned regardless of whether prepare is being ignored
* globally. Otherwise, a prepare conflict will be returned upon reading a prepared update.
*/
static int
__hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare,
WT_ITEM *base_value_buf)
__wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf)
{
WT_CURSOR *hs_cursor;
WT_CURSOR_BTREE *hs_cbt;
WT_DECL_ITEM(hs_value);
WT_DECL_ITEM(orig_hs_value_buf);
WT_DECL_RET;
WT_ITEM hs_key, recno_key;
WT_MODIFY_VECTOR modifies;
WT_TXN *txn;
WT_TXN_SHARED *txn_shared;
WT_UPDATE *mod_upd;
wt_timestamp_t durable_timestamp, durable_timestamp_tmp, hs_start_ts, hs_start_ts_tmp;
wt_timestamp_t durable_timestamp, durable_timestamp_tmp;
wt_timestamp_t hs_stop_durable_ts, hs_stop_durable_ts_tmp, read_timestamp;
uint64_t hs_counter, hs_counter_tmp, upd_type_full;
uint32_t hs_btree_id;
uint64_t upd_type_full;
uint8_t *p, recno_key_buf[WT_INTPACK64_MAXSIZE], upd_type;
int cmp;
bool upd_found;
hs_cursor = NULL;
@ -205,15 +127,11 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
orig_hs_value_buf = NULL;
WT_CLEAR(hs_key);
__wt_modify_vector_init(session, &modifies);
txn = session->txn;
txn_shared = WT_SESSION_TXN_SHARED(session);
upd_found = false;
WT_STAT_CONN_DATA_INCR(session, cursor_search_hs);
hs_cursor = session->hs_cursor;
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
/* Row-store key is as passed to us, create the column-store key as needed. */
WT_ASSERT(
session, (key == NULL && recno != WT_RECNO_OOB) || (key != NULL && recno == WT_RECNO_OOB));
@ -226,70 +144,29 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
key->size = WT_PTRDIFF(p, recno_key_buf);
}
/* Allocate buffer for the history store value. */
WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
/*
* After positioning our cursor, we're stepping backwards to find the correct update. Since the
* timestamp is part of the key, our cursor needs to go from the newest record (further in the
* history store) to the oldest (earlier in the history store) for a given key.
*/
read_timestamp = allow_prepare ? txn->prepare_timestamp : txn_shared->read_timestamp;
/*
*
* A reader without a timestamp should read the largest timestamp in the range, however cursor
* search near if given a 0 timestamp will place at the top of the range and hide the records
* below it. As such we need to adjust a 0 timestamp to the timestamp max value.
*/
if (read_timestamp == WT_TS_NONE)
read_timestamp = WT_TS_MAX;
read_timestamp =
txn_shared->read_timestamp == WT_TS_NONE ? WT_TS_MAX : txn_shared->read_timestamp;
WT_ERR_NOTFOUND_OK(
__wt_hs_cursor_position(session, hs_cursor, btree_id, key, read_timestamp, NULL), true);
hs_cursor->set_key(hs_cursor, 4, btree_id, key, read_timestamp, UINT64_MAX);
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true);
if (ret == WT_NOTFOUND) {
ret = 0;
goto done;
}
for (;; ret = __wt_hs_cursor_prev(session, hs_cursor)) {
WT_ERR_NOTFOUND_OK(ret, true);
/* If we hit the end of the table, let's get out of here. */
if (ret == WT_NOTFOUND) {
ret = 0;
goto done;
}
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
/* Stop before crossing over to the next btree */
if (hs_btree_id != btree_id)
goto done;
/*
* Keys are sorted in an order, skip the ones before the desired key, and bail out if we
* have crossed over the desired key and not found the record we are looking for.
*/
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
if (cmp != 0)
goto done;
/*
* If the stop time pair on the tombstone in the history store is already globally visible
* we can skip it.
*/
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
WT_STAT_CONN_DATA_INCR(session, cursor_prev_hs_tombstone);
continue;
}
/*
* If the stop time point of a record is visible to us, we won't be able to see anything for
* this entire key. Just jump straight to the end.
*/
if (__wt_txn_tw_stop_visible(session, &hs_cbt->upd_value->tw))
goto done;
/* If the start time point is visible to us, let's return that record. */
if (__wt_txn_tw_start_visible(session, &hs_cbt->upd_value->tw))
break;
}
/* Allocate buffer for the history store value. */
WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
WT_ERR(hs_cursor->get_value(
hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &upd_type_full, hs_value));
upd_type = (uint8_t)upd_type_full;
@ -320,6 +197,8 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
* visibility checks when reading in order to construct the modify chain, so we can create
* the value we expect.
*/
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
while (upd_type == WT_UPDATE_MODIFY) {
WT_ERR(__wt_upd_alloc(session, hs_value, upd_type, &mod_upd, NULL));
WT_ERR(__wt_modify_vector_push(&modifies, mod_upd));
@ -330,7 +209,7 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
* update here we fall back to the datastore version. If its timestamp doesn't match our
* timestamp then we return not found.
*/
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true);
WT_ERR_NOTFOUND_OK(hs_cursor->next(hs_cursor), true);
if (ret == WT_NOTFOUND) {
/*
* Fallback to the provided value as the base value.
@ -344,47 +223,6 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
upd_type = WT_UPDATE_STANDARD;
break;
}
hs_start_ts_tmp = WT_TS_NONE;
/*
* Make sure we use the temporary variants of these variables. We need to retain the
* timestamps of the original modify we saw.
*
* We keep looking back into history store until we find a base update to apply the
* reverse deltas on top of.
*/
WT_ERR(hs_cursor->get_key(
hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts_tmp, &hs_counter_tmp));
if (hs_btree_id != btree_id) {
/* Fallback to the provided value as the base value. */
orig_hs_value_buf = hs_value;
hs_value = base_value_buf;
upd_type = WT_UPDATE_STANDARD;
break;
}
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
if (cmp != 0) {
/* Fallback to the provided value as the base value. */
orig_hs_value_buf = hs_value;
hs_value = base_value_buf;
upd_type = WT_UPDATE_STANDARD;
break;
}
/*
* If the stop time pair on the tombstone in the history store is already globally
* visible fall back to the base value. This is possible in scenarios where the latest
* updates are aborted by RTS according to stable timestamp.
*/
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
/* Fallback to the provided value as the base value. */
orig_hs_value_buf = hs_value;
hs_value = base_value_buf;
upd_type = WT_UPDATE_STANDARD;
break;
}
WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_durable_ts_tmp, &durable_timestamp_tmp,
&upd_type_full, hs_value));
@ -440,26 +278,8 @@ err:
WT_ASSERT(session, ret != WT_NOTFOUND);
return (ret);
}
/*
* __wt_hs_find_upd --
* Scan the history store for a record.
*/
int
__wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, uint64_t recno,
WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf)
{
WT_BTREE *btree;
WT_DECL_RET;
btree = S2BT(session);
WT_RET(__wt_hs_cursor_open(session));
WT_WITH_BTREE(session, CUR2BT(session->hs_cursor),
(ret = __hs_find_upd_int(
session, btree->id, key, value_format, recno, upd_value, allow_prepare, base_value_buf)));
WT_TRET(__wt_hs_cursor_close(session));
if (hs_cursor != NULL)
WT_TRET(hs_cursor->close(hs_cursor));
return (ret);
}

View File

@ -11,8 +11,7 @@
static int __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
uint32_t btree_id, const WT_ITEM *key, bool reinsert);
static int __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
WT_BTREE *btree, const WT_ITEM *key, wt_timestamp_t ts, uint64_t *hs_counter,
const WT_ITEM *srch_key);
WT_BTREE *btree, const WT_ITEM *key, wt_timestamp_t ts, uint64_t *hs_counter);
/*
* __hs_verbose_cache_stats --
@ -61,100 +60,17 @@ __hs_verbose_cache_stats(WT_SESSION_IMPL *session, WT_BTREE *btree)
}
/*
* __hs_insert_record_with_btree_int --
* Internal helper for inserting history store records. If this call is successful, the cursor
* parameter will be positioned on the newly inserted record. Otherwise, it will be reset.
*
* The history store key is built from (btree_id, key, tw->start_ts, counter) and the value from
* (tw->durable_stop_ts, tw->durable_start_ts, type, hs_value). The record is inserted as an update
* chain: an optional tombstone carrying the stop time point, followed by a standard update
* carrying the start time point.
*
* Returns 0 on success. On any failure the update chain is freed and the cursor is reset. Callers
* in this file retry the call while it returns WT_RESTART.
*/
static int
__hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint64_t btree_id,
const WT_ITEM *key, const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw,
uint64_t counter)
{
WT_CURSOR_BTREE *cbt;
WT_DECL_RET;
WT_UPDATE *hs_upd, *upd_local;
cbt = (WT_CURSOR_BTREE *)cursor;
/* Both start as NULL so the error path can free whatever has been allocated so far. */
hs_upd = upd_local = NULL;
/* The session should be pointing at the history store btree. */
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
/*
* Use WT_CURSOR.set_key and WT_CURSOR.set_value to create key and value items, then use them to
* create an update chain for a direct insertion onto the history store page.
*/
cursor->set_key(cursor, btree_id, key, tw->start_ts, counter);
cursor->set_value(cursor, tw->durable_stop_ts, tw->durable_start_ts, (uint64_t)type, hs_value);
/* Allocate a tombstone only when there is a valid stop time point. */
if (WT_TIME_WINDOW_HAS_STOP(tw)) {
/*
* Insert a delete record to represent stop time point for the actual record to be inserted.
* Set the stop time point as the commit time point of the history store delete record.
*/
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
hs_upd->start_ts = tw->stop_ts;
hs_upd->durable_ts = tw->durable_stop_ts;
hs_upd->txnid = tw->stop_txn;
}
/*
* Append to the delete record, the actual record to be inserted into the history store. Set the
* current update start time point as the commit time point to the history store record.
*/
WT_ERR(__wt_upd_alloc(session, &cursor->value, WT_UPDATE_STANDARD, &upd_local, NULL));
upd_local->start_ts = tw->start_ts;
upd_local->durable_ts = tw->durable_start_ts;
upd_local->txnid = tw->start_txn;
/* Insert the standard update as next update if there is a tombstone. */
if (hs_upd != NULL)
hs_upd->next = upd_local;
else
hs_upd = upd_local;
/* Search the page and insert the updates. */
WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(cbt, &cursor->key, true));
/* The search result is assigned inside the macro argument, so it is checked separately here. */
WT_ERR(ret);
WT_ERR(__wt_hs_modify(cbt, hs_upd));
/*
* Since the two updates (tombstone and the standard) will reconcile into a single entry, we are
* incrementing the history store insert statistic by one.
*/
WT_STAT_CONN_DATA_INCR(session, cache_hs_insert);
err:
/* Reached on success as well; the cleanup below only runs when ret is non-zero. */
if (ret != 0) {
/* hs_upd is the head of the chain, so this frees the tombstone and the standard update. */
__wt_free_update_list(session, &hs_upd);
/*
* We did a row search, release the cursor so that the page doesn't continue being held.
*
* If we were successful, do NOT reset the cursor. We may want to make use of its position
* later to remove timestamped entries.
*/
cursor->reset(cursor);
}
return (ret);
}
/*
* __hs_insert_record_with_btree --
* __hs_insert_record --
* A helper function to insert the record into the history store including stop time point.
* Should be called with session's btree switched to the history store.
*/
static int
__hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree,
const WT_ITEM *key, const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw)
__hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key,
const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw)
{
#ifdef HAVE_DIAGNOSTIC
WT_CURSOR_BTREE *hs_cbt;
#endif
WT_DECL_ITEM(hs_key);
WT_DECL_ITEM(srch_key);
#ifdef HAVE_DIAGNOSTIC
WT_DECL_ITEM(existing_val);
#endif
@ -164,37 +80,24 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
wt_timestamp_t durable_timestamp_diag;
wt_timestamp_t hs_stop_durable_ts_diag;
uint64_t upd_type_full_diag;
int cmp;
#endif
uint64_t counter, hs_counter;
uint32_t hs_btree_id;
int cmp;
counter = 0;
/* Allocate buffers for the history store and search key. */
WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
WT_ERR(__wt_scr_alloc(session, 0, &srch_key));
#ifdef HAVE_DIAGNOSTIC
/* Allocate buffer for the existing history store value for the same key. */
WT_ERR(__wt_scr_alloc(session, 0, &existing_val));
hs_cbt = (WT_CURSOR_BTREE *)cursor;
hs_cbt = __wt_curhs_get_cbt(cursor);
#endif
/*
* The session should be pointing at the history store btree since this is the one that we'll be
* inserting into. The btree parameter that we're passing in should be the btree that the
* history store content is associated with (this is where the btree id part of the history
* store key comes from).
*/
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
WT_ASSERT(session, !WT_IS_HS(btree->dhandle));
/*
* Disable bulk loads into history store. This would normally occur when updating a record with
* a cursor however the history store doesn't use cursor update, so we do it here.
*/
__wt_cursor_disable_bulk(session);
/* Sanity check that the btree is not a history store btree. */
WT_ASSERT(session, !WT_IS_HS(btree));
/*
* Only deltas or full updates should be written to the history store. More specifically, we
@ -207,43 +110,33 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
* timestamp. Otherwise the newly inserted history store record may fall behind the existing
* one, which can lead to wrong order.
*/
WT_ERR_NOTFOUND_OK(
__wt_hs_cursor_position(session, cursor, btree->id, key, tw->start_ts, srch_key), true);
cursor->set_key(cursor, 4, btree->id, key, tw->start_ts, UINT64_MAX);
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, cursor), true);
if (ret == 0) {
WT_ERR(cursor->get_key(cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
/*
* Check whether the existing record is also from the same timestamp.
*
* Verify simple checks first to confirm whether the retrieved update is the same or not before
* performing the expensive key comparison.
*/
if (hs_btree_id == btree->id && tw->start_ts == hs_start_ts) {
WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp));
#ifdef HAVE_DIAGNOSTIC
if (cmp == 0) {
WT_ERR(cursor->get_value(cursor, &hs_stop_durable_ts_diag, &durable_timestamp_diag,
&upd_type_full_diag, existing_val));
WT_ERR(__wt_compare(session, NULL, existing_val, hs_value, &cmp));
/*
* Check if the existing HS value is same as the new value we are about to insert.
* We can skip this check if the existing value has a globally visible stop time,
* i.e., the value has been deleted from the HS.
*/
if (cmp == 0)
WT_ASSERT(session,
(WT_TIME_WINDOW_HAS_STOP(&hs_cbt->upd_value->tw) &&
__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) ||
tw->start_txn == WT_TXN_NONE ||
tw->start_txn != hs_cbt->upd_value->tw.start_txn ||
tw->start_ts != hs_cbt->upd_value->tw.start_ts);
counter = hs_counter + 1;
}
#else
if (tw->start_ts == hs_start_ts) {
WT_ERR(cursor->get_value(cursor, &hs_stop_durable_ts_diag, &durable_timestamp_diag,
&upd_type_full_diag, existing_val));
WT_ERR(__wt_compare(session, NULL, existing_val, hs_value, &cmp));
/*
* We shouldn't be inserting the same value again for the key unless coming from a
* different transaction. If the updates are from the same transaction, the start
* timestamp for each update should be different.
*/
if (cmp == 0)
counter = hs_counter + 1;
#endif
WT_ASSERT(session,
tw->start_txn == WT_TXN_NONE ||
tw->start_txn != hs_cbt->upd_value->tw.start_txn ||
tw->start_ts != hs_cbt->upd_value->tw.start_ts);
counter = hs_counter + 1;
}
#else
if (tw->start_ts == hs_start_ts)
counter = hs_counter + 1;
#endif
}
/*
@ -251,10 +144,20 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
* updates, we should remove them and reinsert them at the current timestamp.
*/
if (tw->start_ts != WT_TS_NONE) {
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, cursor), true);
/*
* If there were no keys equal to or less than our target key, we would have received
* WT_NOTFOUND. In that case we need to search again with a higher timestamp as the cursor
* would not be positioned correctly.
*/
if (ret == 0)
WT_ERR_NOTFOUND_OK(cursor->next(cursor), true);
else {
cursor->set_key(cursor, 3, btree->id, key, tw->start_ts + 1);
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, cursor), true);
}
if (ret == 0)
WT_ERR(__hs_fixup_out_of_order_from_pos(
session, cursor, btree, key, tw->start_ts, &counter, srch_key));
session, cursor, btree, key, tw->start_ts, &counter));
}
#ifdef HAVE_DIAGNOSTIC
@ -270,36 +173,20 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
}
}
#endif
/* The tree structure can change while we try to insert the mod list, retry if that happens. */
while ((ret = __hs_insert_record_with_btree_int(
session, cursor, btree->id, key, type, hs_value, tw, counter)) == WT_RESTART)
WT_STAT_CONN_DATA_INCR(session, cache_hs_insert_restart);
/* Insert the new record now. */
cursor->set_key(cursor, 4, btree->id, key, tw->start_ts, counter);
cursor->set_value(
cursor, tw, tw->durable_stop_ts, tw->durable_start_ts, (uint64_t)type, hs_value);
WT_ERR(cursor->insert(cursor));
WT_STAT_CONN_INCR(session, cache_hs_insert);
WT_STAT_DATA_INCR(session, cache_hs_insert);
err:
#ifdef HAVE_DIAGNOSTIC
__wt_scr_free(session, &existing_val);
#endif
__wt_scr_free(session, &hs_key);
__wt_scr_free(session, &srch_key);
/* We did a row search, release the cursor so that the page doesn't continue being held. */
cursor->reset(cursor);
return (ret);
}
/*
* __hs_insert_record --
* Temporarily switches to history store btree and calls the helper routine to insert records.
*
* The cursor argument supplies the btree that is switched to; the btree argument is passed
* through unchanged to __hs_insert_record_with_btree together with the key, update type, value
* and time window. Returns whatever the helper returns.
*/
static int
__hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key,
const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw)
{
WT_CURSOR_BTREE *cbt;
WT_DECL_RET;
/* View the cursor as a btree cursor so CUR2BT can be applied to it. */
cbt = (WT_CURSOR_BTREE *)cursor;
WT_WITH_BTREE(session, CUR2BT(cbt),
ret = __hs_insert_record_with_btree(session, cursor, btree, key, type, hs_value, tw));
return (ret);
}
@ -346,8 +233,8 @@ __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_MODIFY_VECTOR *modifies,
int
__wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
{
WT_BTREE *btree;
WT_CURSOR *cursor;
WT_BTREE *btree, *hs_btree;
WT_CURSOR *hs_cursor;
WT_DECL_ITEM(full_value);
WT_DECL_ITEM(key);
WT_DECL_ITEM(modify_value);
@ -372,10 +259,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
bool enable_reverse_modify, hs_inserted, squashed, ts_updates_in_hs;
btree = S2BT(session);
cursor = session->hs_cursor;
prev_upd = NULL;
insert_cnt = 0;
WT_TIME_WINDOW_INIT(&tw);
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
__wt_modify_vector_init(session, &modifies);
if (!btree->hs_entries)
@ -560,13 +450,15 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
if (oldest_upd->type == WT_UPDATE_TOMBSTONE && oldest_upd == first_non_ts_upd &&
!F_ISSET(first_non_ts_upd, WT_UPDATE_CLEARED_HS)) {
/* We can only delete history store entries that have timestamps. */
WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1, true));
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_non_ts);
WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, key, 1, true));
WT_STAT_CONN_INCR(session, cache_hs_key_truncate_non_ts);
WT_STAT_DATA_INCR(session, cache_hs_key_truncate_non_ts);
F_SET(first_non_ts_upd, WT_UPDATE_CLEARED_HS);
} else if (first_non_ts_upd != NULL && !F_ISSET(first_non_ts_upd, WT_UPDATE_CLEARED_HS) &&
(list->ins == NULL || ts_updates_in_hs)) {
WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1, true));
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_non_ts);
WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, key, 1, true));
WT_STAT_CONN_INCR(session, cache_hs_key_truncate_non_ts);
WT_STAT_DATA_INCR(session, cache_hs_key_truncate_non_ts);
F_SET(first_non_ts_upd, WT_UPDATE_CLEARED_HS);
}
@ -704,13 +596,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
enable_reverse_modify &&
__wt_calc_modify(session, prev_full_value, full_value, prev_full_value->size / 10,
entries, &nentries) == 0) {
WT_ERR(__wt_modify_pack(cursor, entries, nentries, &modify_value));
WT_ERR(__wt_modify_pack(hs_cursor, entries, nentries, &modify_value));
WT_ERR(__hs_insert_record(
session, cursor, btree, key, WT_UPDATE_MODIFY, modify_value, &tw));
session, hs_cursor, btree, key, WT_UPDATE_MODIFY, modify_value, &tw));
__wt_scr_free(session, &modify_value);
} else
WT_ERR(__hs_insert_record(
session, cursor, btree, key, WT_UPDATE_STANDARD, full_value, &tw));
session, hs_cursor, btree, key, WT_UPDATE_STANDARD, full_value, &tw));
/* Flag the update as now in the history store. */
F_SET(upd, WT_UPDATE_HS);
@ -730,7 +622,8 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size));
WT_STAT_CONN_SET(session, cache_hs_ondisk, hs_size);
max_hs_size = CUR2BT(cursor)->file_max;
hs_btree = __wt_curhs_get_btree(hs_cursor);
max_hs_size = hs_btree->file_max;
if (max_hs_size != 0 && (uint64_t)hs_size > max_hs_size)
WT_ERR_PANIC(session, WT_PANIC,
"WiredTigerHS: file size of %" PRIu64 " exceeds maximum size %" PRIu64, (uint64_t)hs_size,
@ -747,71 +640,8 @@ err:
__wt_modify_vector_free(&modifies);
__wt_scr_free(session, &full_value);
__wt_scr_free(session, &prev_full_value);
return (ret);
}
/*
* __hs_delete_key_from_ts_int --
* Internal helper for deleting history store content of a given key from a timestamp.
*
* Searches near (btree_id, key, ts, counter 0) with the session's history store cursor, steps
* forward if the search landed before that position, and, when the cursor ends up on a record
* with the same btree id and key, hands off to __hs_delete_key_from_pos. Finding nothing to
* delete is not an error: every "goto done" path returns 0.
*/
static int
__hs_delete_key_from_ts_int(
WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
{
WT_CURSOR *hs_cursor;
WT_DECL_ITEM(srch_key);
WT_DECL_RET;
WT_ITEM hs_key;
wt_timestamp_t hs_start_ts;
uint64_t hs_counter;
uint32_t hs_btree_id;
int cmp, exact;
/* The session should be pointing at the history store btree. */
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
hs_cursor = session->hs_cursor;
WT_RET(__wt_scr_alloc(session, 0, &srch_key));
hs_cursor->set_key(hs_cursor, btree_id, key, ts, 0);
/* Save the packed search key so the positions visited below can be compared against it. */
WT_ERR(__wt_buf_set(session, srch_key, hs_cursor->key.data, hs_cursor->key.size));
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_search_near(session, hs_cursor, &exact), true);
/* Empty history store is fine. */
if (ret == WT_NOTFOUND)
goto done;
/*
* If we raced with a history store insert, we may be two or more records away from our target.
* Keep iterating forwards until we are on or past our target key.
*
* We can't use the cursor positioning helper that we use for regular reads since that will
* place us at the end of a particular key/timestamp range whereas we want to be placed at the
* beginning.
*/
if (exact < 0) {
while ((ret = __wt_hs_cursor_next(session, hs_cursor)) == 0) {
WT_ERR(__wt_compare(session, NULL, &hs_cursor->key, srch_key, &cmp));
if (cmp >= 0)
break;
}
/* No entries greater than or equal to the key we searched for. */
WT_ERR_NOTFOUND_OK(ret, true);
if (ret == WT_NOTFOUND)
goto done;
}
/* Bailing out here also means we have no history store records for our key. */
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
if (hs_btree_id != btree_id)
goto done;
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
if (cmp != 0)
goto done;
WT_ASSERT(session, ts == WT_TS_NONE || hs_start_ts != WT_TS_NONE);
WT_ERR(__hs_delete_key_from_pos(session, hs_cursor, btree_id, key, reinsert));
done:
/* Reached after a successful delete or when there was nothing to delete; clear WT_NOTFOUND. */
ret = 0;
err:
__wt_scr_free(session, &srch_key);
/*
* NOTE(review): as shown, this closes the session's history store cursor taken from
* session->hs_cursor above. Confirm this line belongs to this function and is not residue
* from a neighbouring definition in this diff view.
*/
WT_TRET(hs_cursor->close(hs_cursor));
return (ret);
}
@ -820,22 +650,29 @@ err:
* Delete history store content of a given key from a timestamp.
*/
int
__wt_hs_delete_key_from_ts(
WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
__wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id,
const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
{
WT_DECL_RET;
bool hs_read_committed;
/* If the operation can't open new handles, it should have figured that out before here. */
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
hs_read_committed = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
if (!hs_read_committed)
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
/* The tree structure can change while we try to insert the mod list, retry if that happens. */
do {
WT_WITH_BTREE(session, CUR2BT(session->hs_cursor),
(ret = __hs_delete_key_from_ts_int(session, btree_id, key, ts, reinsert)));
if (ret == WT_RESTART)
WT_STAT_CONN_DATA_INCR(session, cache_hs_insert_restart);
} while (ret == WT_RESTART);
hs_cursor->set_key(hs_cursor, 3, btree_id, key, ts);
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor), true);
/* Empty history store is fine. */
if (ret == WT_NOTFOUND) {
ret = 0;
goto done;
}
WT_ERR(__hs_delete_key_from_pos(session, hs_cursor, btree_id, key, reinsert));
done:
err:
if (!hs_read_committed)
F_CLR(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
return (ret);
}
@ -847,31 +684,29 @@ __wt_hs_delete_key_from_ts(
*/
static int
__hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_BTREE *btree,
const WT_ITEM *key, wt_timestamp_t ts, uint64_t *counter, const WT_ITEM *srch_key)
const WT_ITEM *key, wt_timestamp_t ts, uint64_t *counter)
{
WT_CURSOR *insert_cursor;
WT_CURSOR *hs_insert_cursor;
WT_CURSOR_BTREE *hs_cbt;
WT_DECL_RET;
WT_ITEM hs_key, hs_value;
WT_TIME_WINDOW tw;
WT_UPDATE *tombstone;
wt_timestamp_t hs_ts, hs_start_durable_ts, hs_stop_durable_ts;
WT_TIME_WINDOW tw, hs_insert_tw;
wt_timestamp_t hs_ts;
uint64_t hs_counter, hs_upd_type;
uint32_t hs_btree_id;
#ifdef HAVE_DIAGNOSTIC
int cmp;
#endif
char ts_string[5][WT_TS_INT_STRING_SIZE];
const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL};
insert_cursor = NULL;
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
hs_insert_cursor = NULL;
hs_cbt = __wt_curhs_get_cbt(hs_cursor);
WT_CLEAR(hs_key);
WT_CLEAR(hs_value);
WT_TIME_WINDOW_INIT(&tw);
tombstone = NULL;
/* The session should be pointing at the history store btree. */
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
#ifndef HAVE_DIAGNOSTIC
WT_UNUSED(key);
#endif
/*
* Position ourselves at the beginning of the key range that we may have to fixup. Prior to
* getting here, we've positioned our cursor at the end of a key/timestamp range and then done a
@ -881,15 +716,15 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
* to keep doing "next" until we've got a key greater than the one we attempted to position
* ourselves with.
*/
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
/*
* Prior to getting here, we've done a "search near" on our key for the timestamp we're
* inserting and then a "next". In the regular case, our cursor will be positioned on the
* next key and we'll break out of the first iteration in one of the conditions below.
*/
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
/* We shouldn't have crossed the btree and user key search space. */
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
WT_ERR(__wt_compare(session, NULL, &hs_cursor->key, srch_key, &cmp));
if (cmp > 0)
WT_ASSERT(session, hs_btree_id == btree->id);
#ifdef HAVE_DIAGNOSTIC
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
WT_ASSERT(session, cmp == 0);
#endif
if (hs_ts > ts)
break;
}
if (ret == WT_NOTFOUND)
@ -916,27 +751,14 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
* 2 foo 3 2 ccc
* 2 foo 3 3 ddd
*/
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
/*
* Prior to getting here, we've done a "search near" on our key for the timestamp we're
* inserting and then a "next". In the regular case, our cursor will be positioned on the
* next key and we'll break out of the first iteration in one of the conditions below.
*/
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
/* We shouldn't have crossed the btree and user key search space. */
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
if (hs_btree_id != btree->id)
break;
WT_ASSERT(session, hs_btree_id == btree->id);
#ifdef HAVE_DIAGNOSTIC
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
if (cmp != 0)
break;
/*
* If the stop time pair on the tombstone in the history store is already globally visible
* we can skip it.
*/
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
WT_STAT_CONN_DATA_INCR(session, cursor_next_hs_tombstone);
continue;
}
WT_ASSERT(session, cmp == 0);
#endif
/*
* If we got here, we've got out-of-order updates in the history store.
*
@ -950,11 +772,8 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
* Don't incur the overhead of opening this new cursor unless we need it. In the regular
* case, we'll never get here.
*/
if (insert_cursor == NULL) {
WT_WITHOUT_DHANDLE(session,
ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &insert_cursor));
WT_ERR(ret);
}
if (hs_insert_cursor == NULL)
WT_ERR(__wt_curhs_open(session, NULL, &hs_insert_cursor));
/*
* If these history store records are resolved prepared updates, their durable timestamps
@ -973,47 +792,38 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
__wt_timestamp_to_string(hs_cbt->upd_value->tw.durable_stop_ts, ts_string[3]),
__wt_timestamp_to_string(ts, ts_string[4]));
tw.start_ts = tw.durable_start_ts = ts;
tw.start_txn = hs_cbt->upd_value->tw.start_txn;
hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ts;
hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn;
/*
* We're going to be inserting something immediately after with the same timestamp. Either
* another moved update OR the update itself that triggered the correction. In either case,
* we should preserve the stop transaction id.
*/
tw.stop_ts = tw.durable_stop_ts = ts;
tw.stop_txn = hs_cbt->upd_value->tw.stop_txn;
hs_insert_tw.stop_ts = hs_insert_tw.durable_stop_ts = ts;
hs_insert_tw.stop_txn = hs_cbt->upd_value->tw.stop_txn;
/* Extract the underlying value for reinsertion. */
WT_ERR(hs_cursor->get_value(
hs_cursor, &hs_stop_durable_ts, &hs_start_durable_ts, &hs_upd_type, &hs_value));
hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value));
/* Reinsert entry with earlier timestamp. */
while ((ret = __hs_insert_record_with_btree_int(session, insert_cursor, btree->id, key,
(uint8_t)hs_upd_type, &hs_value, &tw, *counter)) == WT_RESTART)
;
WT_ERR(ret);
/* Insert the value back with different timestamps. */
hs_insert_cursor->set_key(hs_insert_cursor, 4, btree->id, &hs_key, ts, *counter);
hs_insert_cursor->set_value(hs_insert_cursor, &hs_insert_tw, hs_insert_tw.durable_stop_ts,
hs_insert_tw.durable_start_ts, (uint64_t)hs_upd_type, &hs_value);
WT_ERR(hs_insert_cursor->insert(hs_insert_cursor));
++(*counter);
/* Delete entry with higher timestamp. */
hs_cbt->compare = 0;
WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL));
tombstone->txnid = WT_TXN_NONE;
tombstone->start_ts = tombstone->durable_ts = WT_TS_NONE;
while ((ret = __wt_hs_modify(hs_cbt, tombstone)) == WT_RESTART) {
WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(hs_cbt, &hs_cursor->key, false));
WT_ERR(ret);
}
WT_ERR(ret);
tombstone = NULL;
WT_STAT_CONN_DATA_INCR(session, cache_hs_order_fixup_move);
/* Delete the entry with higher timestamp. */
WT_ERR(hs_cursor->remove(hs_cursor));
WT_STAT_CONN_INCR(session, cache_hs_order_fixup_move);
WT_STAT_DATA_INCR(session, cache_hs_order_fixup_move);
}
if (ret == WT_NOTFOUND)
ret = 0;
err:
__wt_free(session, tombstone);
if (insert_cursor != NULL)
insert_cursor->close(insert_cursor);
if (hs_insert_cursor != NULL)
hs_insert_cursor->close(hs_insert_cursor);
return (ret);
}
@ -1027,26 +837,21 @@ static int
__hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id,
const WT_ITEM *key, bool reinsert)
{
WT_CURSOR *insert_cursor;
WT_CURSOR *hs_insert_cursor;
WT_CURSOR_BTREE *hs_cbt;
WT_DECL_RET;
WT_ITEM hs_key, hs_value;
WT_TIME_WINDOW tw;
WT_UPDATE *upd;
WT_TIME_WINDOW hs_insert_tw;
wt_timestamp_t durable_timestamp, hs_start_ts, hs_stop_durable_ts;
uint64_t hs_counter, hs_insert_counter, hs_upd_type;
uint32_t hs_btree_id;
int cmp;
const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL};
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
hs_cbt = __wt_curhs_get_cbt(hs_cursor);
hs_insert_counter = 0;
WT_CLEAR(hs_key);
WT_CLEAR(hs_value);
WT_TIME_WINDOW_INIT(&tw);
upd = NULL;
insert_cursor = NULL;
hs_insert_cursor = NULL;
if (reinsert) {
/*
* Determine the starting value of our counter, i.e. highest counter value of the timestamp
@ -1056,90 +861,60 @@ __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_
* The cursor will also be positioned at the start of the range that we wish to start
* inserting.
*/
WT_WITHOUT_DHANDLE(session,
ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &insert_cursor));
WT_WITHOUT_DHANDLE(session, ret = __wt_curhs_open(session, NULL, &hs_insert_cursor));
WT_ERR(ret);
F_SET(insert_cursor, WT_CURSTD_IGNORE_TOMBSTONE);
WT_ERR_NOTFOUND_OK(
__wt_hs_cursor_position(session, insert_cursor, btree_id, key, WT_TS_NONE, NULL), true);
F_SET(hs_insert_cursor, WT_CURSTD_HS_READ_COMMITTED);
hs_insert_cursor->set_key(hs_insert_cursor, 4, btree_id, key, WT_TS_NONE, UINT64_MAX);
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_insert_cursor), true);
if (ret == WT_NOTFOUND) {
hs_insert_counter = 0;
ret = 0;
} else {
WT_ERR(insert_cursor->get_key(
insert_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_insert_counter));
WT_ERR(hs_insert_cursor->get_key(
hs_insert_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_insert_counter));
WT_ASSERT(session, hs_start_ts == WT_TS_NONE);
/*
* Increment the hs counter that we'll be using to insert with to avoid overwriting the
* record we just found.
* Increment the history store counter that we'll be using to insert with to avoid
* overwriting the record we just found.
*/
hs_insert_counter++;
}
}
/* Begin iterating over the range of entries we expect to replace. */
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
/*
* If the btree id or key isn't ours, that means that we've hit the end of the key range and
* that there is no more history store content for this key.
*/
if (hs_btree_id != btree_id)
break;
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
if (cmp != 0)
break;
/*
* If the stop time pair on the tombstone in the history store is already globally visible
* we can skip it.
*/
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
WT_STAT_CONN_DATA_INCR(session, cursor_next_hs_tombstone);
continue;
}
/*
* Once we reinsert the entry below, we're not allowed to fail, otherwise we'll be leaving
* our history store in an invalid state. Anything that can potentially fail, such as heap
* allocation of the tombstone that we'll be using to remove the old value, should be
* performed before reinsertion.
*/
WT_ERR(__wt_upd_alloc_tombstone(session, &upd, NULL));
if (reinsert) {
WT_ERR(hs_cursor->get_value(
hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &hs_upd_type, &hs_value));
tw.start_ts = tw.durable_start_ts = WT_TS_NONE;
tw.start_txn = hs_cbt->upd_value->tw.start_txn;
tw.stop_ts = tw.durable_stop_ts = WT_TS_NONE;
tw.stop_txn = hs_cbt->upd_value->tw.stop_txn;
/* Reinsert entry with zero timestamp. */
while (
(ret = __hs_insert_record_with_btree_int(session, insert_cursor, btree_id, &hs_key,
(uint8_t)hs_upd_type, &hs_value, &tw, hs_insert_counter)) == WT_RESTART)
;
hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = WT_TS_NONE;
hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn;
hs_insert_tw.stop_ts = hs_insert_tw.durable_stop_ts = WT_TS_NONE;
hs_insert_tw.stop_txn = hs_cbt->upd_value->tw.stop_txn;
hs_insert_cursor->set_key(
hs_insert_cursor, 4, btree_id, key, WT_TS_NONE, hs_insert_counter);
hs_insert_cursor->set_value(hs_insert_cursor, &hs_insert_tw, WT_TS_NONE, WT_TS_NONE,
(uint64_t)hs_upd_type, &hs_value);
WT_ERR(hs_insert_cursor->insert(hs_insert_cursor));
WT_STAT_CONN_INCR(session, cache_hs_insert);
WT_STAT_DATA_INCR(session, cache_hs_insert);
hs_insert_counter++;
WT_ERR(ret);
}
/*
* Since we're using internal functions to modify the row structure, we need to manually set
* the comparison to an exact match.
*/
hs_cbt->compare = 0;
/*
* Append a globally visible tombstone to the update list. This will effectively make the
* value invisible and the key itself will eventually get removed during reconciliation.
* Remove the key using history store cursor interface.
*
* If anything fails after this point and we're reinserting we need to panic as it will
* leave our history store in an unexpected state with duplicate entries.
*/
upd->txnid = WT_TXN_NONE;
upd->start_ts = upd->durable_ts = WT_TS_NONE;
if ((ret = __wt_hs_modify(hs_cbt, upd)) != 0) {
if ((ret = hs_cursor->remove(hs_cursor)) != 0) {
if (reinsert)
WT_ERR_PANIC(session, WT_PANIC,
"Failed to insert tombstone, history store now "
@ -1147,14 +922,13 @@ __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_
else
WT_ERR(ret);
}
upd = NULL;
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate);
WT_STAT_CONN_INCR(session, cache_hs_key_truncate);
WT_STAT_DATA_INCR(session, cache_hs_key_truncate);
}
if (ret == WT_NOTFOUND)
ret = 0;
err:
__wt_free(session, upd);
if (insert_cursor != NULL)
insert_cursor->close(insert_cursor);
if (hs_insert_cursor != NULL)
hs_insert_cursor->close(hs_insert_cursor);
return (ret);
}

View File

@ -15,10 +15,9 @@
* store.
*/
static int
__hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_btree_id)
__hs_verify_id(
WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_CURSOR_BTREE *ds_cbt, uint32_t this_btree_id)
{
WT_CURSOR *hs_cursor;
WT_CURSOR_BTREE *hs_cbt;
WT_DECL_ITEM(prev_key);
WT_DECL_RET;
WT_ITEM key;
@ -27,12 +26,14 @@ __hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_
uint32_t btree_id;
int cmp;
hs_cursor = session->hs_cursor;
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
WT_CLEAR(key);
WT_ERR(__wt_scr_alloc(session, 0, &prev_key));
#ifndef HAVE_DIAGNOSTIC
WT_UNUSED(this_btree_id);
#endif
/*
* If using standard cursors, we need to skip the non-globally visible tombstones in the data
* table to verify the corresponding entries in the history store are also present in the data
@ -46,27 +47,18 @@ __hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_
* verify. When we return after moving to a new key the caller is responsible for keeping the
* cursor there or deciding they're done.
*/
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
WT_ERR(hs_cursor->get_key(hs_cursor, &btree_id, &key, &hs_start_ts, &hs_counter));
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
/*
* If the btree id does not match the previous one, we're done. It is up to the caller to set
* up for the next tree and call us, if they choose. For a full history store walk, the
* caller sends in WT_BTREE_ID_INVALID and this function will set and use the first btree id
* it finds and will return once it walks off that tree, leaving the cursor set to the first
* key of that new tree.
*
* We should never cross the btree id, assert if we do so.
*/
if (btree_id != this_btree_id)
break;
/*
* If the stop time pair on the tombstone in the history store is already globally visible
* we can skip it.
*/
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone);
continue;
}
WT_ERR(hs_cursor->get_key(hs_cursor, &btree_id, &key, &hs_start_ts, &hs_counter));
WT_ASSERT(session, btree_id == this_btree_id);
/*
* If we have already checked against this key, keep going to the next key. We only need to
@ -114,22 +106,14 @@ __wt_hs_verify_one(WT_SESSION_IMPL *session)
WT_CURSOR *hs_cursor;
WT_CURSOR_BTREE ds_cbt;
WT_DECL_RET;
WT_ITEM hs_key;
uint32_t btree_id;
int exact;
hs_cursor = session->hs_cursor;
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
btree_id = S2BT(session)->id;
/*
* We are required to position the history store cursor. Set it to the first record of our btree
* in the history store.
*/
memset(&hs_key, 0, sizeof(hs_key));
hs_cursor->set_key(hs_cursor, btree_id, &hs_key, 0, 0);
ret = __wt_hs_cursor_search_near(session, hs_cursor, &exact);
if (ret == 0 && exact < 0)
ret = __wt_hs_cursor_next(session, hs_cursor);
hs_cursor->set_key(hs_cursor, 1, btree_id);
WT_ERR(__wt_curhs_search_near_after(session, hs_cursor));
/*
* If we positioned the cursor there is something to verify.
@ -141,9 +125,12 @@ __wt_hs_verify_one(WT_SESSION_IMPL *session)
if (ret == 0) {
__wt_btcur_init(session, &ds_cbt);
__wt_btcur_open(&ds_cbt);
ret = __hs_verify_id(session, &ds_cbt, btree_id);
ret = __hs_verify_id(session, hs_cursor, &ds_cbt, btree_id);
WT_TRET(__wt_btcur_close(&ds_cbt, false));
}
err:
WT_TRET(hs_cursor->close(hs_cursor));
return (ret == WT_NOTFOUND ? 0 : ret);
}
@ -173,10 +160,10 @@ __wt_hs_verify(WT_SESSION_IMPL *session)
btree_id = WT_BTREE_ID_INVALID;
uri_data = NULL;
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
WT_ERR(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_hs_cursor_open(session));
hs_cursor = session->hs_cursor;
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true);
WT_ERR_NOTFOUND_OK(hs_cursor->next(hs_cursor), true);
stop = ret == WT_NOTFOUND ? true : false;
ret = 0;
@ -198,17 +185,16 @@ __wt_hs_verify(WT_SESSION_IMPL *session)
}
WT_ERR(__wt_open_cursor(session, uri_data, NULL, NULL, &ds_cursor));
F_SET(ds_cursor, WT_CURSOR_RAW_OK);
ret = __hs_verify_id(session, (WT_CURSOR_BTREE *)ds_cursor, btree_id);
ret = __hs_verify_id(session, hs_cursor, (WT_CURSOR_BTREE *)ds_cursor, btree_id);
if (ret == WT_NOTFOUND)
stop = true;
WT_TRET(ds_cursor->close(ds_cursor));
WT_ERR_NOTFOUND_OK(ret, false);
}
err:
WT_TRET(__wt_hs_cursor_close(session));
__wt_scr_free(session, &buf);
WT_ASSERT(session, key.mem == NULL && key.memsize == 0);
__wt_free(session, uri_data);
WT_TRET(hs_cursor->close(hs_cursor));
return (ret);
}

View File

@ -36,32 +36,32 @@
WT_DATA_HANDLE *__olddh = (s)->dhandle; \
const char *__oldname; \
/* If this isn't an API reentry, the name should be NULL and the counter should be 0. */ \
WT_ASSERT(session, (s)->name != NULL || s->api_call_counter == 0); \
WT_ASSERT(session, (s)->name != NULL || (s)->api_call_counter == 0); \
__oldname = (s)->name; \
++s->api_call_counter; \
++(s)->api_call_counter; \
(s)->dhandle = (dh); \
(s)->name = (s)->lastop = #h "." #n
#define API_SESSION_POP(s) \
(s)->dhandle = __olddh; \
(s)->name = __oldname; \
--s->api_call_counter
--(s)->api_call_counter
/* Standard entry points to the API: declares/initializes local variables. */
#define API_SESSION_INIT(s, h, n, dh) \
WT_TRACK_OP_DECL; \
API_SESSION_PUSH(s, h, n, dh); \
/* \
* No code before this line, otherwise error handling won't be \
* correct. \
*/ \
WT_ERR(WT_SESSION_CHECK_PANIC(s)); \
WT_SINGLE_THREAD_CHECK_START(s); \
WT_TRACK_OP_INIT(s); \
if (s->api_call_counter == 1 && !F_ISSET(s, WT_SESSION_INTERNAL)) \
__wt_op_timer_start(s); \
/* Reset wait time if this isn't an API reentry. */ \
if (s->api_call_counter == 1) \
(s)->cache_wait_us = 0; \
#define API_SESSION_INIT(s, h, n, dh) \
WT_TRACK_OP_DECL; \
API_SESSION_PUSH(s, h, n, dh); \
/* \
* No code before this line, otherwise error handling won't be \
* correct. \
*/ \
WT_ERR(WT_SESSION_CHECK_PANIC(s)); \
WT_SINGLE_THREAD_CHECK_START(s); \
WT_TRACK_OP_INIT(s); \
if ((s)->api_call_counter == 1 && !F_ISSET(s, WT_SESSION_INTERNAL)) \
__wt_op_timer_start(s); \
/* Reset wait time if this isn't an API reentry. */ \
if ((s)->api_call_counter == 1) \
(s)->cache_wait_us = 0; \
__wt_verbose((s), WT_VERB_API, "%s", "CALL: " #h ":" #n)
#define API_CALL_NOCONF(s, h, n, dh) \
@ -75,21 +75,26 @@
if ((config) != NULL) \
WT_ERR(__wt_config_check((s), WT_CONFIG_REF(session, h##_##n), (config), 0))
#define API_END(s, ret) \
if ((s) != NULL) { \
WT_TRACK_OP_END(s); \
WT_SINGLE_THREAD_CHECK_STOP(s); \
if ((ret) != 0) \
__wt_txn_err_set(s, ret); \
if (s->api_call_counter == 1 && !F_ISSET(session, WT_SESSION_INTERNAL)) \
__wt_op_timer_stop(s); \
/* \
* No code after this line, otherwise error handling \
* won't be correct. \
*/ \
API_SESSION_POP(s); \
} \
} \
#define API_END(s, ret) \
if ((s) != NULL) { \
WT_TRACK_OP_END(s); \
WT_SINGLE_THREAD_CHECK_STOP(s); \
if ((ret) != 0) \
__wt_txn_err_set(s, ret); \
if ((s)->api_call_counter == 1 && !F_ISSET(session, WT_SESSION_INTERNAL)) \
__wt_op_timer_stop(s); \
/* \
* We should not leave any history store cursor open when returning from an API call. \
* However, we cannot do a stricter check before WT-7247 is resolved. \
*/ \
WT_ASSERT(s, (s)->api_call_counter > 1 || (s)->hs_cursor_counter <= 2); \
/* \
* No code after this line, otherwise error handling \
* won't be correct. \
*/ \
API_SESSION_POP(s); \
} \
} \
while (0)
/* An API call wrapped in a transaction if necessary. */
@ -188,13 +193,15 @@
SESSION_API_PREPARE_CHECK(s, WT_SESSION, n); \
API_CALL_NOCONF(s, WT_SESSION, n, NULL)
#define SESSION_API_PREPARE_CHECK(s, h, n) \
do { \
int __prepare_ret; \
API_SESSION_PUSH(s, WT_SESSION, n, NULL); \
__prepare_ret = __wt_txn_context_prepare_check(s); \
API_SESSION_POP(s); \
WT_RET(__prepare_ret); \
#define SESSION_API_PREPARE_CHECK(s, h, n) \
do { \
if ((s)->api_call_counter == 0) { \
int __prepare_ret; \
API_SESSION_PUSH(s, WT_SESSION, n, NULL); \
__prepare_ret = __wt_txn_context_prepare_check(s); \
API_SESSION_POP(s); \
WT_RET(__prepare_ret); \
} \
} while (0)
#define SESSION_API_CALL(s, n, config, cfg) \
@ -209,8 +216,7 @@
#define CURSOR_API_CALL(cur, s, n, bt) \
(s) = (WT_SESSION_IMPL *)(cur)->session; \
if ((s)->hs_cursor == NULL) \
SESSION_API_PREPARE_CHECK(s, WT_CURSOR, n); \
SESSION_API_PREPARE_CHECK(s, WT_CURSOR, n); \
API_CALL_NOCONF(s, WT_CURSOR, n, ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \
if (F_ISSET(cur, WT_CURSTD_CACHED)) \
WT_ERR(__wt_cursor_cached(cur))

View File

@ -288,7 +288,7 @@ struct __wt_cursor_hs {
WT_CURSOR *file_cursor; /* Queries of regular history store data */
WT_TIME_WINDOW time_window;
uint32_t btree_id;
WT_ITEM datastore_key;
WT_ITEM *datastore_key;
/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_HS_CUR_BTREE_ID_SET 0x1u

View File

@ -6,6 +6,32 @@
* See the file LICENSE for redistribution information.
*/
/*
 * __wt_curhs_get_btree --
 *     Return the btree behind a history store cursor. The cursor is viewed as a WT_CURSOR_HS and
 *     its file_cursor member is handed to CUR2BT; the cast is unchecked, so the caller must pass
 *     a history store cursor.
 */
static inline WT_BTREE *
__wt_curhs_get_btree(WT_CURSOR *cursor)
{
    WT_CURSOR *file_cursor;

    file_cursor = ((WT_CURSOR_HS *)cursor)->file_cursor;
    return (CUR2BT(file_cursor));
}
/*
 * __wt_curhs_get_cbt --
 *     Return the btree cursor behind a history store cursor. The cursor is viewed as a
 *     WT_CURSOR_HS and its file_cursor member is returned cast to WT_CURSOR_BTREE; the casts are
 *     unchecked, so the caller must pass a history store cursor.
 */
static inline WT_CURSOR_BTREE *
__wt_curhs_get_cbt(WT_CURSOR *cursor)
{
    WT_CURSOR *file_cursor;

    file_cursor = ((WT_CURSOR_HS *)cursor)->file_cursor;
    return ((WT_CURSOR_BTREE *)file_cursor);
}
/*
* __cursor_set_recno --
* The cursor value in the interface has to track the value in the underlying cursor, update

View File

@ -495,8 +495,14 @@ extern int __wt_curfile_next_random(WT_CURSOR *cursor)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_curhs_cache(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_curhs_search_near_after(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_curhs_search_near_before(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx,
@ -596,7 +602,7 @@ extern int __wt_debug_addr_print(WT_SESSION_IMPL *session, const uint8_t *addr,
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE(
(visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile)
extern int __wt_debug_cursor_tree_hs(void *session_arg, const char *ofile)
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")))
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_debug_disk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
@ -750,26 +756,11 @@ extern int __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_cursor_cache(WT_SESSION_IMPL *session)
extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_cursor_close(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_cursor_open(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id,
const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_delete_key_from_ts(
WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format,
uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf)
extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
@ -1716,6 +1707,7 @@ extern void __wt_gen_next(WT_SESSION_IMPL *session, int which, uint64_t *genp);
extern void __wt_gen_next_drain(WT_SESSION_IMPL *session, int which);
extern void __wt_hazard_close(WT_SESSION_IMPL *session);
extern void __wt_hs_close(WT_SESSION_IMPL *session);
extern void __wt_hs_upd_time_window(WT_CURSOR *hs_cursor, WT_TIME_WINDOW **twp);
extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg);
extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor);
extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
@ -1820,8 +1812,12 @@ extern void __wt_verbose_worker(WT_SESSION_IMPL *session, const char *fmt, ...)
WT_GCC_FUNC_DECL_ATTRIBUTE((format(printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((cold));
extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
static inline WT_BTREE *__wt_curhs_get_btree(WT_CURSOR *cursor)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline WT_CELL *__wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline WT_CURSOR_BTREE *__wt_curhs_get_cbt(WT_CURSOR *cursor)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline WT_IKEY *__wt_ref_key_instantiated(WT_REF *ref)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static inline WT_VISIBLE_TYPE __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)

View File

@ -92,7 +92,7 @@ struct __wt_session_impl {
WT_COMPACT_STATE *compact; /* Compaction information */
enum { WT_COMPACT_NONE = 0, WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state;
WT_CURSOR *hs_cursor; /* History store table cursor */
u_int hs_cursor_counter; /* Number of open history store cursors */
WT_CURSOR *meta_cursor; /* Metadata file */
void *meta_track; /* Metadata operation tracking */

View File

@ -451,10 +451,8 @@ struct __wt_connection_stats {
int64_t cursor_modify_bytes;
int64_t cursor_modify_bytes_touch;
int64_t cursor_next;
int64_t cursor_next_hs_tombstone_rts;
int64_t cursor_restart;
int64_t cursor_prev;
int64_t cursor_prev_hs_tombstone_rts;
int64_t cursor_remove;
int64_t cursor_remove_bytes;
int64_t cursor_reserve;

View File

@ -1044,8 +1044,8 @@ retry:
/* If there's no visible update in the update chain or ondisk, check the history store file. */
if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(session->dhandle, WT_DHANDLE_HS)) {
__wt_timing_stress(session, WT_TIMING_STRESS_HS_SEARCH);
WT_RET(__wt_hs_find_upd(session, key, cbt->iface.value_format, recno, cbt->upd_value, false,
&cbt->upd_value->buf));
WT_RET(__wt_hs_find_upd(session, S2BT(session)->id, key, cbt->iface.value_format, recno,
cbt->upd_value, &cbt->upd_value->buf));
}
/*

File diff suppressed because it is too large Load Diff

View File

@ -703,6 +703,7 @@ __wt_rec_row_leaf(
WT_BTREE *btree;
WT_CELL *cell;
WT_CELL_UNPACK_KV *kpack, _kpack, *vpack, _vpack;
WT_CURSOR *hs_cursor;
WT_CURSOR_BTREE *cbt;
WT_DECL_ITEM(tmpkey);
WT_DECL_RET;
@ -720,6 +721,7 @@ __wt_rec_row_leaf(
void *copy;
btree = S2BT(session);
hs_cursor = NULL;
page = pageref->page;
slvg_skip = salvage == NULL ? 0 : salvage->skip;
WT_TIME_WINDOW_INIT(&tw);
@ -914,11 +916,19 @@ __wt_rec_row_leaf(
* ever need to blow away history store content, so we can skip this.
*/
if (!F_ISSET(session, WT_SESSION_NO_DATA_HANDLES)) {
WT_ERR(__wt_hs_cursor_open(session));
/*
* FIXME-WT-7053: we will hit the dhandle deadlock if we open multiple
* history store cursors in reconciliation. Once it is fixed, we can move
* the open and close of the history store cursor inside the delete key
* function.
*/
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
WT_ERR(__wt_hs_delete_key_from_ts(
session, btree->id, tmpkey, WT_TS_NONE, false));
WT_ERR(__wt_hs_cursor_close(session));
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_onpage_removal);
session, hs_cursor, btree->id, tmpkey, WT_TS_NONE, false));
WT_ERR(hs_cursor->close(hs_cursor));
hs_cursor = NULL;
WT_STAT_CONN_INCR(session, cache_hs_key_truncate_onpage_removal);
WT_STAT_DATA_INCR(session, cache_hs_key_truncate_onpage_removal);
}
}
@ -1034,6 +1044,8 @@ leaf_insert:
ret = __wt_rec_split_finish(session, r);
err:
if (hs_cursor != NULL)
WT_TRET(hs_cursor->close(hs_cursor));
__wt_scr_free(session, &tmpkey);
return (ret);
}

View File

@ -2289,8 +2289,6 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
if (i == r->multi_next)
return (0);
WT_RET(__wt_hs_cursor_open(session));
for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i)
if (multi->supd != NULL) {
WT_ERR(__wt_hs_insert_updates(session, r->page, multi));
@ -2302,7 +2300,6 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
err:
WT_TRET(__wt_hs_cursor_close(session));
return (ret);
}

View File

@ -542,6 +542,9 @@ __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, co
{
WT_DECL_RET;
/* We should not open other cursors when there are open history store cursors in the session. */
WT_ASSERT(session, strcmp(uri, WT_HS_URI) == 0 || session->hs_cursor_counter == 0);
/* We do not cache any subordinate tables/files cursors. */
if (owner == NULL) {
if ((ret = __wt_cursor_cache_get(session, uri, NULL, cfg, cursorp)) == 0)

View File

@ -208,7 +208,8 @@ static const char *const __stats_dsrc_desc[] = {
"session: flush_tier operation calls",
"session: tiered storage local retention time (secs)",
"transaction: race to read prepared update retry",
"transaction: rollback to stable hs records with stop timestamps older than newer records",
"transaction: rollback to stable history store records with stop timestamps older than newer "
"records",
"transaction: rollback to stable inconsistent checkpoint",
"transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored",
@ -1098,12 +1099,8 @@ static const char *const __stats_connection_desc[] = {
"cursor: cursor modify key and value bytes affected",
"cursor: cursor modify value bytes modified",
"cursor: cursor next calls",
"cursor: cursor next calls that skip due to a globally visible history store tombstone in "
"rollback to stable",
"cursor: cursor operation restarted",
"cursor: cursor prev calls",
"cursor: cursor prev calls that skip due to a globally visible history store tombstone in "
"rollback to stable",
"cursor: cursor remove calls",
"cursor: cursor remove key bytes removed",
"cursor: cursor reserve calls",
@ -1437,7 +1434,8 @@ static const char *const __stats_connection_desc[] = {
"session: flush_tier operation calls",
"session: tiered storage local retention time (secs)",
"transaction: race to read prepared update retry",
"transaction: rollback to stable hs records with stop timestamps older than newer records",
"transaction: rollback to stable history store records with stop timestamps older than newer "
"records",
"transaction: rollback to stable inconsistent checkpoint",
"transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored",
@ -1625,10 +1623,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cursor_modify_bytes = 0;
stats->cursor_modify_bytes_touch = 0;
stats->cursor_next = 0;
stats->cursor_next_hs_tombstone_rts = 0;
stats->cursor_restart = 0;
stats->cursor_prev = 0;
stats->cursor_prev_hs_tombstone_rts = 0;
stats->cursor_remove = 0;
stats->cursor_remove_bytes = 0;
stats->cursor_reserve = 0;
@ -2139,10 +2135,8 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->cursor_modify_bytes += WT_STAT_READ(from, cursor_modify_bytes);
to->cursor_modify_bytes_touch += WT_STAT_READ(from, cursor_modify_bytes_touch);
to->cursor_next += WT_STAT_READ(from, cursor_next);
to->cursor_next_hs_tombstone_rts += WT_STAT_READ(from, cursor_next_hs_tombstone_rts);
to->cursor_restart += WT_STAT_READ(from, cursor_restart);
to->cursor_prev += WT_STAT_READ(from, cursor_prev);
to->cursor_prev_hs_tombstone_rts += WT_STAT_READ(from, cursor_prev_hs_tombstone_rts);
to->cursor_remove += WT_STAT_READ(from, cursor_remove);
to->cursor_remove_bytes += WT_STAT_READ(from, cursor_remove_bytes);
to->cursor_reserve += WT_STAT_READ(from, cursor_reserve);

View File

@ -721,76 +721,27 @@ __wt_txn_release(WT_SESSION_IMPL *session)
* Append the update older than the prepared update to the update chain
*/
static int
__txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *key, WT_PAGE *page,
__txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_PAGE *page,
WT_UPDATE *chain, bool commit, WT_UPDATE **fix_updp, bool *upd_appended)
{
WT_CURSOR_BTREE *hs_cbt;
WT_DECL_ITEM(hs_key);
WT_DECL_ITEM(hs_value);
WT_DECL_RET;
WT_TIME_WINDOW *hs_tw;
WT_UPDATE *tombstone, *upd;
wt_timestamp_t durable_ts, hs_start_ts, hs_stop_durable_ts;
wt_timestamp_t durable_ts, hs_stop_durable_ts;
size_t size, total_size;
uint64_t hs_counter, type_full;
uint32_t hs_btree_id;
int cmp;
uint64_t type_full;
char ts_string[2][WT_TS_INT_STRING_SIZE];
WT_ASSERT(session, chain != NULL);
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
*fix_updp = NULL;
*upd_appended = false;
size = total_size = 0;
tombstone = upd = NULL;
/* Allocate buffers for the data store and history store key. */
WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
for (; ret == 0; ret = __wt_hs_cursor_prev(session, hs_cursor)) {
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
/* Stop before crossing over to the next btree */
if (hs_btree_id != S2BT(session)->id) {
ret = WT_NOTFOUND;
goto done;
}
/*
* Keys are sorted in an order, skip the ones before the desired key, and bail out if we
* have crossed over the desired key and not found the record we are looking for.
*/
WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp));
if (cmp != 0) {
ret = WT_NOTFOUND;
goto done;
}
/*
* If the stop time pair on the tombstone in the history store is already globally visible
* we can skip it.
*/
if (!__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw))
break;
else
WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone);
}
/* We walked off the top of the history store. */
if (ret == WT_NOTFOUND)
goto done;
WT_ERR(ret);
/*
* As part of the history store search, we never get an exact match based on our search criteria
* as we always search for a maximum record for that key. Make sure that we set the comparison
* result as an exact match to remove this key as part of rollback to stable. In case if we
* don't mark the comparison result as same, later the __wt_row_modify function will not
* properly remove the update from history store.
*/
hs_cbt->compare = 0;
/* Get current value. */
WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_durable_ts, &durable_ts, &type_full, hs_value));
@ -799,15 +750,16 @@ __txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *
/*
* If the history update already has a stop time point and we are committing the prepared update
* there is no work to do.
* there is no work to do. This happens if a deleted key is reinserted by a prepared update.
*/
if (hs_stop_durable_ts != WT_TS_MAX && commit)
goto done;
__wt_hs_upd_time_window(hs_cursor, &hs_tw);
WT_ERR(__wt_upd_alloc(session, hs_value, WT_UPDATE_STANDARD, &upd, &size));
upd->txnid = hs_cbt->upd_value->tw.start_txn;
upd->durable_ts = hs_cbt->upd_value->tw.durable_start_ts;
upd->start_ts = hs_cbt->upd_value->tw.start_ts;
upd->txnid = hs_tw->start_txn;
upd->durable_ts = hs_tw->durable_start_ts;
upd->start_ts = hs_tw->start_ts;
*fix_updp = upd;
/*
@ -831,11 +783,11 @@ __txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *
/* If the history store record has a valid stop time point, append it. */
if (hs_stop_durable_ts != WT_TS_MAX) {
WT_ASSERT(session, hs_cbt->upd_value->tw.stop_ts != WT_TS_MAX);
WT_ASSERT(session, hs_tw->stop_ts != WT_TS_MAX);
WT_ERR(__wt_upd_alloc(session, NULL, WT_UPDATE_TOMBSTONE, &tombstone, &size));
tombstone->durable_ts = hs_cbt->upd_value->tw.durable_stop_ts;
tombstone->start_ts = hs_cbt->upd_value->tw.stop_ts;
tombstone->txnid = hs_cbt->upd_value->tw.stop_txn;
tombstone->durable_ts = hs_tw->durable_stop_ts;
tombstone->start_ts = hs_tw->stop_ts;
tombstone->txnid = hs_tw->stop_txn;
tombstone->next = upd;
/*
* Set the flag to indicate that this update has been restored from history store for the
@ -873,7 +825,6 @@ err:
__wt_free_update_list(session, &upd);
}
done:
__wt_scr_free(session, &hs_key);
__wt_scr_free(session, &hs_value);
return (ret);
}
@ -958,15 +909,18 @@ static int
__txn_fixup_prepared_update(
WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_UPDATE *fix_upd, bool commit)
{
WT_CURSOR_BTREE *hs_cbt;
WT_DECL_RET;
WT_ITEM hs_value;
WT_TIME_WINDOW tw;
WT_TXN *txn;
WT_UPDATE *hs_upd;
uint32_t txn_flags;
#ifdef HAVE_DIAGNOSTIC
uint64_t hs_upd_type;
wt_timestamp_t hs_durable_ts, hs_stop_durable_ts;
#endif
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
txn = session->txn;
WT_TIME_WINDOW_INIT(&tw);
/*
* Transaction error and prepare are cleared temporarily as cursor functions are not allowed
@ -982,33 +936,34 @@ __txn_fixup_prepared_update(
* If the history update already has a stop time point and we are committing the prepared update
* there is no work to do.
*/
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
if (commit) {
hs_upd->start_ts = txn->commit_timestamp;
hs_upd->durable_ts = txn->durable_timestamp;
hs_upd->txnid = txn->id;
tw.stop_ts = txn->commit_timestamp;
tw.durable_stop_ts = txn->durable_timestamp;
tw.stop_txn = txn->id;
WT_TIME_WINDOW_SET_START(&tw, fix_upd);
hs_value.data = fix_upd->data;
hs_value.size = fix_upd->size;
#ifdef HAVE_DIAGNOSTIC
/* Retrieve the existing update value and stop timestamp. */
WT_ERR(hs_cursor->get_value(
hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &hs_upd_type, &hs_value));
WT_ASSERT(session, hs_stop_durable_ts == WT_TS_MAX);
WT_ASSERT(session, (uint8_t)hs_upd_type == WT_UPDATE_STANDARD);
#endif
/*
* We need to update the stop durable timestamp stored in the history store value.
*
* Pack the value using cursor api.
*/
hs_cursor->set_value(hs_cursor, txn->durable_timestamp, fix_upd->durable_ts,
(uint64_t)fix_upd->type, &hs_value);
WT_ERR(__wt_upd_alloc(session, &hs_cursor->value, WT_UPDATE_STANDARD, &hs_upd->next, NULL));
hs_upd->next->durable_ts = fix_upd->durable_ts;
hs_upd->next->start_ts = fix_upd->start_ts;
hs_upd->next->txnid = fix_upd->txnid;
hs_value.data = fix_upd->data;
hs_value.size = fix_upd->size;
hs_cursor->set_value(hs_cursor, &tw, tw.durable_stop_ts, tw.durable_start_ts,
(uint64_t)WT_UPDATE_STANDARD, &hs_value);
WT_ERR(hs_cursor->update(hs_cursor));
} else {
WT_ERR(hs_cursor->remove(hs_cursor));
}
WT_ERR(__wt_hs_modify(hs_cbt, hs_upd));
if (0) {
err:
__wt_free_update_list(session, &hs_upd);
}
F_SET(txn, txn_flags);
return (ret);
@ -1128,22 +1083,15 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
cbt = (WT_CURSOR_BTREE *)(*cursorp);
hs_btree_id = S2BT(session)->id;
/* Open a history store table cursor. */
WT_ERR(__wt_hs_cursor_open(session));
hs_cursor = session->hs_cursor;
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
/*
* Scan the history store for the given btree and key with maximum start timestamp to let
* the search point to the last version of the key.
*/
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_position(
session, hs_cursor, hs_btree_id, &op->u.op_row.key, WT_TS_MAX, NULL),
true);
if (ret == 0)
/* Not found if we cross the tree or key boundary. */
WT_ERR_NOTFOUND_OK(__txn_append_hs_record(session, hs_cursor, &op->u.op_row.key,
cbt->ref->page, upd, commit, &fix_upd, &upd_appended),
true);
hs_cursor->set_key(hs_cursor, 4, hs_btree_id, &op->u.op_row.key, WT_TS_MAX, UINT64_MAX);
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true);
if (ret == WT_NOTFOUND && !commit) {
/*
* Allocate a tombstone and prepend it to the row so when we reconcile the update chain
@ -1156,7 +1104,10 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
__wt_row_modify(cbt, &cbt->iface.key, NULL, tombstone, WT_UPDATE_INVALID, false));
WT_ERR(ret);
tombstone = NULL;
} else
} else if (ret == 0)
WT_ERR(__txn_append_hs_record(
session, hs_cursor, cbt->ref->page, upd, commit, &fix_upd, &upd_appended));
else
ret = 0;
}
@ -1212,15 +1163,14 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
* Fix the history store contents if they exist, when there are no more updates in the update
* list. Only in eviction, it is possible to write an unfinished history store update when the
* prepared updates are written to the data store. When the page is read back into memory, there
* will be only one uncommitted prepared update. There can be a false positive of fixing history
* store when handling prepared inserts, but it doesn't cost much.
* will be only one uncommitted prepared update.
*/
if (fix_upd != NULL)
WT_ERR(__txn_fixup_prepared_update(session, hs_cursor, fix_upd, commit));
err:
if (hs_cursor != NULL)
WT_TRET(__wt_hs_cursor_close(session));
WT_TRET(hs_cursor->close(hs_cursor));
if (!upd_appended)
__wt_free(session, fix_upd);
__wt_free(session, tombstone);

View File

@ -276,18 +276,17 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
{
WT_CELL_UNPACK_KV *unpack, _unpack;
WT_CURSOR *hs_cursor;
WT_CURSOR_BTREE *cbt;
WT_DECL_ITEM(hs_key);
WT_DECL_ITEM(hs_value);
WT_DECL_ITEM(key);
WT_DECL_RET;
WT_ITEM full_value;
WT_UPDATE *hs_upd, *tombstone, *upd;
WT_TIME_WINDOW *hs_tw;
WT_UPDATE *tombstone, *upd;
wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts;
uint64_t hs_counter, type_full;
uint32_t hs_btree_id;
uint8_t type;
int cmp;
char ts_string[4][WT_TS_INT_STRING_SIZE];
bool valid_update_found;
#ifdef HAVE_DIAGNOSTIC
@ -295,7 +294,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
#endif
hs_cursor = NULL;
hs_upd = tombstone = upd = NULL;
tombstone = upd = NULL;
hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE;
hs_btree_id = S2BT(session)->id;
WT_CLEAR(full_value);
@ -319,9 +318,13 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
newer_hs_durable_ts = unpack->tw.durable_start_ts;
/* Open a history store table cursor. */
WT_ERR(__wt_hs_cursor_open(session));
hs_cursor = session->hs_cursor;
cbt = (WT_CURSOR_BTREE *)hs_cursor;
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
/*
* Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system)
* outside the constraints of transactions. Therefore, there is no need for snapshot based
* visibility checks.
*/
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
/*
* Scan the history store for the given btree and key with maximum start timestamp to let the
@ -330,40 +333,11 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* into data store and removed from history store. If none of the history store records satisfy
* the given timestamp, the key is removed from data store.
*/
ret = __wt_hs_cursor_position(session, hs_cursor, hs_btree_id, key, WT_TS_MAX, NULL);
for (; ret == 0; ret = __wt_hs_cursor_prev(session, hs_cursor)) {
hs_cursor->set_key(hs_cursor, 4, hs_btree_id, key, WT_TS_MAX, UINT64_MAX);
ret = __wt_curhs_search_near_before(session, hs_cursor);
for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
/* Stop before crossing over to the next btree */
if (hs_btree_id != S2BT(session)->id)
break;
/*
* Keys are sorted in an order, skip the ones before the desired key, and bail out if we
* have crossed over the desired key and not found the record we are looking for.
*/
WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp));
if (cmp != 0)
break;
/*
* If the stop time pair on the tombstone in the history store is already globally visible
* we can skip it.
*/
if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone_rts);
continue;
}
/*
* As part of the history store search, we never get an exact match based on our search
* criteria as we always search for a maximum record for that key. Make sure that we set the
* comparison result as an exact match to remove this key as part of rollback to stable. In
* case if we don't mark the comparison result as same, later the __wt_row_modify function
* will not properly remove the update from history store.
*/
cbt->compare = 0;
/* Get current value and convert to full update if it is a modify. */
WT_ERR(hs_cursor->get_value(
hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value));
@ -416,16 +390,17 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* selected update to the update chain. Also it confirms that history store doesn't contain
* any newer version than the current version for the key.
*/
/* Retrieve the time window from the history cursor. */
__wt_hs_upd_time_window(hs_cursor, &hs_tw);
if (!replace &&
(hs_stop_durable_ts != WT_TS_NONE ||
!__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.stop_txn)) &&
!__rollback_check_if_txnid_non_committed(session, hs_tw->stop_txn)) &&
(hs_stop_durable_ts <= rollback_timestamp)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update valid with stop timestamp: %s, stable timestamp: %s, txnid: "
"%" PRIu64 " and type: %" PRIu8,
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]),
__wt_timestamp_to_string(rollback_timestamp, ts_string[1]),
cbt->upd_value->tw.stop_txn, type);
__wt_timestamp_to_string(rollback_timestamp, ts_string[1]), hs_tw->stop_txn, type);
break;
}
@ -434,7 +409,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* transaction id.
*/
if ((hs_durable_ts != WT_TS_NONE ||
!__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.start_txn)) &&
!__rollback_check_if_txnid_non_committed(session, hs_tw->start_txn)) &&
(hs_durable_ts <= rollback_timestamp)) {
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"history store update valid with start timestamp: %s, durable timestamp: %s, stop "
@ -442,8 +417,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
__wt_timestamp_to_string(rollback_timestamp, ts_string[3]),
cbt->upd_value->tw.start_txn, type);
__wt_timestamp_to_string(rollback_timestamp, ts_string[3]), hs_tw->start_txn, type);
WT_ASSERT(session, hs_tw->start_ts < unpack->tw.start_ts);
valid_update_found = true;
break;
}
@ -455,8 +430,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
__wt_timestamp_to_string(rollback_timestamp, ts_string[3]), cbt->upd_value->tw.start_txn,
cbt->upd_value->tw.stop_txn, type);
__wt_timestamp_to_string(rollback_timestamp, ts_string[3]), hs_tw->start_txn,
hs_tw->stop_txn, type);
/*
* Start time point of the current record may be used as stop time point of the previous
@ -468,8 +443,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
first_record = false;
#endif
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
WT_ERR(__wt_hs_modify(cbt, hs_upd));
WT_ERR(hs_cursor->remove(hs_cursor));
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable);
}
@ -480,9 +454,10 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
* list. Otherwise remove the key by adding a tombstone.
*/
if (valid_update_found) {
/* Retrieve the time window from the history cursor. */
__wt_hs_upd_time_window(hs_cursor, &hs_tw);
WT_ASSERT(session,
cbt->upd_value->tw.start_ts < unpack->tw.start_ts ||
cbt->upd_value->tw.start_txn < unpack->tw.start_txn);
hs_tw->start_ts < unpack->tw.start_ts || hs_tw->start_txn < unpack->tw.start_txn);
WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL));
/*
@ -494,9 +469,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
upd->txnid = WT_TXN_NONE;
else
upd->txnid = cbt->upd_value->tw.start_txn;
upd->durable_ts = cbt->upd_value->tw.durable_start_ts;
upd->start_ts = cbt->upd_value->tw.start_ts;
upd->txnid = hs_tw->start_txn;
upd->durable_ts = hs_tw->durable_start_ts;
upd->start_ts = hs_tw->start_ts;
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"update restored from history store txnid: %" PRIu64
", start_ts: %s and durable_ts: %s",
@ -527,9 +502,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
tombstone->txnid = WT_TXN_NONE;
else
tombstone->txnid = cbt->upd_value->tw.stop_txn;
tombstone->durable_ts = cbt->upd_value->tw.durable_stop_ts;
tombstone->start_ts = cbt->upd_value->tw.stop_ts;
tombstone->txnid = hs_tw->stop_txn;
tombstone->durable_ts = hs_tw->durable_stop_ts;
tombstone->start_ts = hs_tw->stop_ts;
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"tombstone restored from history store txnid: %" PRIu64
", start_ts: %s, durable_ts: %s",
@ -557,8 +532,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
/* Finally remove that update from history store. */
if (valid_update_found) {
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
WT_ERR(__wt_hs_modify(cbt, hs_upd));
WT_ERR(hs_cursor->remove(hs_cursor));
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
}
@ -567,13 +541,13 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
err:
WT_ASSERT(session, tombstone == NULL || upd == tombstone);
__wt_free_update_list(session, &upd);
__wt_free_update_list(session, &hs_upd);
}
__wt_scr_free(session, &hs_key);
__wt_scr_free(session, &hs_value);
__wt_scr_free(session, &key);
__wt_buf_free(session, &full_value);
WT_TRET(__wt_hs_cursor_close(session));
if (hs_cursor != NULL)
WT_TRET(hs_cursor->close(hs_cursor));
return (ret);
}
@ -1305,74 +1279,44 @@ static int
__rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id)
{
/*
* Walk the history store records that belong to btree_id and remove each one, stopping at the
* first record whose btree id differs from btree_id. Returns ret after the cleanup at the err
* label has run.
*
* NOTE(review): this listing is a diff rendered without its +/- markers, so removed and added
* lines sit side by side (for example both __wt_hs_cursor_open and __wt_curhs_open, two loop
* headers, both __wt_hs_modify and hs_cursor->remove). Presumably the __wt_curhs_* calls and the
* hs_cursor->next/remove/close method calls are the post-change version -- confirm against the
* repository before treating this text as compilable source.
*/
WT_CURSOR *hs_cursor;
WT_CURSOR_BTREE *cbt;
WT_DECL_ITEM(hs_key);
WT_DECL_RET;
WT_ITEM key;
WT_UPDATE *hs_upd;
wt_timestamp_t hs_start_ts;
uint64_t hs_counter;
uint32_t hs_btree_id;
int exact;
char ts_string[WT_TS_INT_STRING_SIZE];
hs_cursor = NULL;
WT_CLEAR(key);
hs_upd = NULL;
/* Scratch buffer that receives the key portion of each history store record in the loop. */
WT_RET(__wt_scr_alloc(session, 0, &hs_key));
/* Open a history store table cursor. */
WT_ERR(__wt_hs_cursor_open(session));
hs_cursor = session->hs_cursor;
cbt = (WT_CURSOR_BTREE *)hs_cursor;
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
/* Walk the history store for the given btree. */
hs_cursor->set_key(hs_cursor, btree_id, &key, WT_TS_NONE, 0);
ret = __wt_hs_cursor_search_near(session, hs_cursor, &exact);
/* NOTE(review): the leading 1 looks like a count of key fields supplied -- confirm. */
hs_cursor->set_key(hs_cursor, 1, btree_id);
ret = __wt_curhs_search_near_after(session, hs_cursor);
/*
* The search should always end up pointing to the start of the required btree or end of the
* previous btree on success. Move the cursor based on the result.
*/
WT_ASSERT(session, (ret != 0 || exact != 0));
if (ret == 0 && exact < 0)
ret = __wt_hs_cursor_next(session, hs_cursor);
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
/* Stop crossing into the next btree boundary. */
if (btree_id != hs_btree_id)
break;
/* We shouldn't cross the btree search space. */
WT_ASSERT(session, btree_id == hs_btree_id);
/*
* If the stop time pair on the tombstone in the history store is already globally visible
* we can skip it.
*/
if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone_rts);
continue;
}
/* Set this comparison as exact match of the search for later use. */
cbt->compare = 0;
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
"rollback to stable history store cleanup of update with start timestamp: %s",
__wt_timestamp_to_string(hs_start_ts, ts_string));
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
WT_ERR(__wt_hs_modify(cbt, hs_upd));
WT_ERR(hs_cursor->remove(hs_cursor));
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
hs_upd = NULL;
}
/* Presumably treats a WT_NOTFOUND result from the walk as success -- confirm macro semantics. */
WT_ERR_NOTFOUND_OK(ret, false);
/* Reached on both the success and the error paths. */
err:
__wt_scr_free(session, &hs_key);
__wt_free(session, hs_upd);
WT_TRET(__wt_hs_cursor_close(session));
/* The cursor pointer is still NULL if the open call failed, so only close it when set. */
if (hs_cursor != NULL)
WT_TRET(hs_cursor->close(hs_cursor));
return (ret);
}

View File

@ -378,7 +378,7 @@ format_die(void)
testutil_check(__wt_debug_cursor_page(g.page_dump_cursor, g.home_pagedump));
fprintf(stderr, "snapshot-isolation error: Dumping HS to %s\n", g.home_hsdump);
#if WIREDTIGER_VERSION_MAJOR >= 10
testutil_check(__wt_debug_cursor_tree_hs(g.page_dump_cursor, g.home_hsdump));
testutil_check(__wt_debug_cursor_tree_hs(CUR2S(g.page_dump_cursor), g.home_hsdump));
#endif
}
#endif

View File

@ -71,7 +71,7 @@ class test_cursor13_base(wttest.WiredTigerTestCase):
if hs_before[0] == hs_after[0] and hs_before[1] == hs_after[1]:
break
# Fail if we haven't been able to get stable hs stats after too many attempts.
# Fail if we haven't been able to get stable history store stats after too many attempts.
# Seems impossible, but better to check than to have an accidental infinite loop.
self.assertNotEqual(i, max_tries - 1)

View File

@ -72,7 +72,7 @@ class test_hs05(wttest.WiredTigerTestCase):
score_diff = score_end - score_start
self.pr("After large updates score start: " + str(score_start))
self.pr("After large updates score end: " + str(score_end))
self.pr("After large updates hs score diff: " + str(score_diff))
self.pr("After large updates history store score diff: " + str(score_diff))
def test_checkpoint_hs_reads(self):
# Create a small table.

View File

@ -37,7 +37,7 @@ def timestamp_str(t):
return '%x' % t
# test_rollback_to_stable11.py
# Test the rollback to stable is retrieving the proper hs update.
# Test the rollback to stable is retrieving the proper history store update.
class test_rollback_to_stable11(test_rollback_to_stable_base):
session_config = 'isolation=snapshot'

View File

@ -0,0 +1,85 @@
#!/usr/bin/env python
#
# Public Domain 2014-2021 MongoDB, Inc.
# Public Domain 2008-2014 WiredTiger, Inc.
#
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
import wttest
from suite_subprocess import suite_subprocess
from helper import compare_files
def timestamp_str(t):
    # Format the integer timestamp t as lowercase hexadecimal without a '0x' prefix.
    return '%x' % t
# test_util21.py
# Ensure that wt dump can dump obsolete data in the history store.
class test_util21(wttest.WiredTigerTestCase, suite_subprocess):
    # Configuration strings; presumably read by wttest.WiredTigerTestCase when it opens the
    # connection and session -- confirm against the harness.
    conn_config = 'cache_size=50MB'
    session_config = 'isolation=snapshot'

    def add_data_with_timestamp(self, uri, value, ts):
        # Write value to keys '1' through '4' of uri, one transaction per key, each committed
        # at timestamp ts.
        cursor = self.session.open_cursor(uri)
        for i in range(1, 5):
            self.session.begin_transaction()
            cursor[str(i)] = value
            self.session.commit_transaction('commit_timestamp=' + timestamp_str(ts))
        cursor.close()

    def test_dump_obsolete_data(self):
        # Dump the history store file before and after advancing the oldest timestamp and
        # check that the two dumps are identical.
        uri = 'table:test_util21'
        create_params = 'key_format=S,value_format=S'
        self.session.create(uri, create_params)

        value1 = 'a' * 100
        value2 = 'b' * 100
        value3 = 'c' * 100
        value4 = 'd' * 100

        # Four rounds of updates to the same keys at increasing commit timestamps.
        self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1))
        self.add_data_with_timestamp(uri, value1, 2)
        self.add_data_with_timestamp(uri, value2, 3)
        self.add_data_with_timestamp(uri, value3, 5)
        self.add_data_with_timestamp(uri, value4, 7)

        # Perform checkpoint, to clean the dirty pages and place values on disk.
        self.session.checkpoint()

        # Set stable timestamp, so we don't lose data when closing/opening connection when using wt dump.
        self.conn.set_timestamp('stable_timestamp=' + timestamp_str(10))

        # Call dump on the values before the oldest timestamp is set.
        self.runWt(['dump', 'file:WiredTigerHS.wt'], outfilename="before_oldest")

        # Set oldest timestamp, and checkpoint; the obsolete data should not be removed as
        # the pages are clean.
        self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(6))
        self.session.checkpoint()
        self.runWt(['dump', 'file:WiredTigerHS.wt'], outfilename="after_oldest")

        self.assertEqual(True, compare_files(self, "before_oldest", "after_oldest"))
# Run the suite through the wttest runner when this file is executed directly.
if __name__ == '__main__':
    wttest.run()