mirror of
https://github.com/mongodb/mongo.git
synced 2024-12-01 09:32:32 +01:00
Import wiredtiger: a5fd80d29c69f12c01f412fb6d8d7930cecc8758 from branch mongodb-5.0
ref: 563ccc601f..a5fd80d29c for: 4.9.0 WT-7164 Merge "HS cursor restructure" feature branch into develop
This commit is contained in:
parent
2f11ef616e
commit
ff9995ed5c
1
src/third_party/wiredtiger/dist/s_string.ok
vendored
1
src/third_party/wiredtiger/dist/s_string.ok
vendored
@ -565,6 +565,7 @@ calloc
|
||||
cas
|
||||
catfmt
|
||||
cb
|
||||
cbt
|
||||
ccc
|
||||
ccr
|
||||
cd
|
||||
|
4
src/third_party/wiredtiger/dist/s_void
vendored
4
src/third_party/wiredtiger/dist/s_void
vendored
@ -135,7 +135,9 @@ func_ok()
|
||||
-e '/int zlib_terminate$/d' \
|
||||
-e '/int zstd_error$/d' \
|
||||
-e '/int zstd_pre_size$/d' \
|
||||
-e '/int zstd_terminate$/d'
|
||||
-e '/int zstd_terminate$/d' \
|
||||
-e '/int __wt_curhs_search_near_after$/d' \
|
||||
-e '/int __wt_curhs_search_near_before$/d'
|
||||
}
|
||||
|
||||
for f in `find bench ext src test -name '*.c' -o -name '*_inline.h'`; do
|
||||
|
4
src/third_party/wiredtiger/dist/stat_data.py
vendored
4
src/third_party/wiredtiger/dist/stat_data.py
vendored
@ -303,9 +303,7 @@ connection_stats = [
|
||||
CursorStat('cursor_modify_bytes', 'cursor modify key and value bytes affected', 'size'),
|
||||
CursorStat('cursor_modify_bytes_touch', 'cursor modify value bytes modified', 'size'),
|
||||
CursorStat('cursor_next', 'cursor next calls'),
|
||||
CursorStat('cursor_next_hs_tombstone_rts', 'cursor next calls that skip due to a globally visible history store tombstone in rollback to stable'),
|
||||
CursorStat('cursor_prev', 'cursor prev calls'),
|
||||
CursorStat('cursor_prev_hs_tombstone_rts', 'cursor prev calls that skip due to a globally visible history store tombstone in rollback to stable'),
|
||||
CursorStat('cursor_remove', 'cursor remove calls'),
|
||||
CursorStat('cursor_remove_bytes', 'cursor remove key bytes removed', 'size'),
|
||||
CursorStat('cursor_reopen', 'cursors reused from cache'),
|
||||
@ -874,7 +872,7 @@ conn_dsrc_stats = [
|
||||
TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'),
|
||||
TxnStat('txn_rts_hs_restore_updates', 'rollback to stable restored updates from history store'),
|
||||
TxnStat('txn_rts_hs_restore_tombstones', 'rollback to stable restored tombstones from history store'),
|
||||
TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable hs records with stop timestamps older than newer records'),
|
||||
TxnStat('txn_rts_hs_stop_older_than_newer_start', 'rollback to stable history store records with stop timestamps older than newer records'),
|
||||
TxnStat('txn_rts_inconsistent_ckpt', 'rollback to stable inconsistent checkpoint'),
|
||||
TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'),
|
||||
TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'),
|
||||
|
2
src/third_party/wiredtiger/import.data
vendored
2
src/third_party/wiredtiger/import.data
vendored
@ -2,5 +2,5 @@
|
||||
"vendor": "wiredtiger",
|
||||
"github": "wiredtiger/wiredtiger.git",
|
||||
"branch": "mongodb-5.0",
|
||||
"commit": "563ccc601f5689a16a3f41743398329b8a3aedf7"
|
||||
"commit": "a5fd80d29c69f12c01f412fb6d8d7930cecc8758"
|
||||
}
|
||||
|
122
src/third_party/wiredtiger/src/btree/bt_debug.c
vendored
122
src/third_party/wiredtiger/src/btree/bt_debug.c
vendored
@ -40,18 +40,18 @@ struct __wt_dbg {
|
||||
static const /* Output separator */
|
||||
char *const sep = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n";
|
||||
|
||||
static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool);
|
||||
static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool, WT_CURSOR *);
|
||||
static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
|
||||
static int __debug_modify(WT_DBG *, const uint8_t *);
|
||||
static int __debug_page(WT_DBG *, WT_REF *, uint32_t);
|
||||
static int __debug_page_col_fix(WT_DBG *, WT_REF *);
|
||||
static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t);
|
||||
static int __debug_page_col_var(WT_DBG *, WT_REF *);
|
||||
static int __debug_page_col_var(WT_DBG *, WT_REF *, WT_CURSOR *);
|
||||
static int __debug_page_metadata(WT_DBG *, WT_REF *);
|
||||
static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
|
||||
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
|
||||
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *, WT_CURSOR *);
|
||||
static int __debug_ref(WT_DBG *, WT_REF *);
|
||||
static int __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
|
||||
static int __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *, WT_CURSOR *);
|
||||
static int __debug_tree(WT_SESSION_IMPL *, WT_REF *, const char *, uint32_t);
|
||||
static int __debug_update(WT_DBG *, WT_UPDATE *, bool);
|
||||
static int __debug_wrapup(WT_DBG *);
|
||||
@ -285,9 +285,6 @@ __debug_wrapup(WT_DBG *ds)
|
||||
session = ds->session;
|
||||
msg = ds->msg;
|
||||
|
||||
if (session->hs_cursor != NULL)
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
|
||||
__wt_scr_free(session, &ds->key);
|
||||
__wt_scr_free(session, &ds->hs_key);
|
||||
__wt_scr_free(session, &ds->hs_value);
|
||||
@ -421,7 +418,7 @@ __debug_hs_cursor(WT_DBG *ds, WT_CURSOR *hs_cursor)
|
||||
uint32_t hs_btree_id;
|
||||
char time_string[WT_TIME_STRING_SIZE];
|
||||
|
||||
cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
cbt = __wt_curhs_get_cbt(hs_cursor);
|
||||
session = ds->session;
|
||||
|
||||
WT_TIME_WINDOW_INIT(&tw);
|
||||
@ -463,16 +460,12 @@ __debug_hs_cursor(WT_DBG *ds, WT_CURSOR *hs_cursor)
|
||||
* Dump any HS records associated with the key.
|
||||
*/
|
||||
static int
|
||||
__debug_hs_key(WT_DBG *ds)
|
||||
__debug_hs_key(WT_DBG *ds, WT_CURSOR *hs_cursor)
|
||||
{
|
||||
WT_BTREE *btree;
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
WT_SESSION_IMPL *session;
|
||||
wt_timestamp_t older_start_ts;
|
||||
uint64_t hs_counter;
|
||||
uint32_t hs_btree_id;
|
||||
int cmp, exact;
|
||||
|
||||
session = ds->session;
|
||||
btree = S2BT(session);
|
||||
@ -482,26 +475,12 @@ __debug_hs_key(WT_DBG *ds)
|
||||
* Open a history store cursor positioned at the end of the data store key (the newest record)
|
||||
* and iterate backwards until we reach a different key or btree.
|
||||
*/
|
||||
hs_cursor = session->hs_cursor;
|
||||
hs_cursor->set_key(hs_cursor, hs_btree_id, ds->key, WT_TS_MAX, WT_TXN_MAX);
|
||||
ret = hs_cursor->search_near(hs_cursor, &exact);
|
||||
|
||||
/* If we jumped to the next key, go back to the previous key. */
|
||||
if (ret == 0 && exact > 0)
|
||||
ret = hs_cursor->prev(hs_cursor);
|
||||
|
||||
for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
|
||||
WT_RET(hs_cursor->get_key(hs_cursor, &hs_btree_id, ds->t1, &older_start_ts, &hs_counter));
|
||||
|
||||
if (hs_btree_id != btree->id)
|
||||
break;
|
||||
|
||||
WT_RET(__wt_compare(session, NULL, ds->key, ds->t1, &cmp));
|
||||
if (cmp != 0)
|
||||
break;
|
||||
hs_cursor->set_key(hs_cursor, 4, hs_btree_id, ds->key, WT_TS_MAX, WT_TXN_MAX);
|
||||
ret = __wt_curhs_search_near_before(session, hs_cursor);
|
||||
|
||||
for (; ret == 0; ret = hs_cursor->prev(hs_cursor))
|
||||
WT_RET(__debug_hs_cursor(ds, hs_cursor));
|
||||
}
|
||||
|
||||
return (ret == WT_NOTFOUND ? 0 : ret);
|
||||
}
|
||||
|
||||
@ -970,19 +949,19 @@ __wt_debug_cursor_page(void *cursor_arg, const char *ofile)
|
||||
* Dump the history store tree given a user cursor.
|
||||
*/
|
||||
int
|
||||
__wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile)
|
||||
__wt_debug_cursor_tree_hs(void *session_arg, const char *ofile)
|
||||
WT_GCC_FUNC_ATTRIBUTE((visibility("default")))
|
||||
{
|
||||
WT_CURSOR_BTREE *cbt;
|
||||
WT_BTREE *hs_btree;
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
WT_SESSION_IMPL *session;
|
||||
|
||||
session = CUR2S(cursor_arg);
|
||||
|
||||
WT_RET(__wt_hs_cursor_open(session));
|
||||
cbt = (WT_CURSOR_BTREE *)session->hs_cursor;
|
||||
WT_WITH_BTREE(session, CUR2BT(cbt), ret = __wt_debug_tree_all(session, NULL, NULL, ofile));
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
session = (WT_SESSION_IMPL *)session_arg;
|
||||
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
hs_btree = __wt_curhs_get_btree(hs_cursor);
|
||||
WT_WITH_BTREE(session, hs_btree, ret = __wt_debug_tree_all(session, NULL, NULL, ofile));
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
|
||||
return (ret);
|
||||
}
|
||||
@ -1017,9 +996,11 @@ __debug_tree(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile, uint32_t
|
||||
static int
|
||||
__debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags)
|
||||
{
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
WT_SESSION_IMPL *session;
|
||||
|
||||
hs_cursor = NULL;
|
||||
session = ds->session;
|
||||
WT_RET(__wt_scr_alloc(session, 100, &ds->key));
|
||||
|
||||
@ -1028,43 +1009,47 @@ __debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags)
|
||||
* doesn't work, we may be running in-memory.
|
||||
*/
|
||||
if (!WT_IS_HS(session->dhandle)) {
|
||||
if (session->hs_cursor != NULL || __wt_hs_cursor_open(session) == 0) {
|
||||
WT_RET(__wt_scr_alloc(session, 0, &ds->hs_key));
|
||||
WT_RET(__wt_scr_alloc(session, 0, &ds->hs_value));
|
||||
}
|
||||
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &ds->hs_key));
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &ds->hs_value));
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
}
|
||||
|
||||
/* Dump the page metadata. */
|
||||
WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, ref));
|
||||
WT_RET(ret);
|
||||
WT_ERR(ret);
|
||||
|
||||
/* Dump the page. */
|
||||
switch (ref->page->type) {
|
||||
case WT_PAGE_COL_FIX:
|
||||
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
|
||||
WT_RET(__debug_page_col_fix(ds, ref));
|
||||
WT_ERR(__debug_page_col_fix(ds, ref));
|
||||
break;
|
||||
case WT_PAGE_COL_INT:
|
||||
WT_WITH_PAGE_INDEX(session, ret = __debug_page_col_int(ds, ref->page, flags));
|
||||
WT_RET(ret);
|
||||
WT_ERR(ret);
|
||||
break;
|
||||
case WT_PAGE_COL_VAR:
|
||||
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
|
||||
WT_RET(__debug_page_col_var(ds, ref));
|
||||
WT_ERR(__debug_page_col_var(ds, ref, hs_cursor));
|
||||
break;
|
||||
case WT_PAGE_ROW_INT:
|
||||
WT_WITH_PAGE_INDEX(session, ret = __debug_page_row_int(ds, ref->page, flags));
|
||||
WT_RET(ret);
|
||||
WT_ERR(ret);
|
||||
break;
|
||||
case WT_PAGE_ROW_LEAF:
|
||||
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
|
||||
WT_RET(__debug_page_row_leaf(ds, ref->page));
|
||||
WT_ERR(__debug_page_row_leaf(ds, ref->page, hs_cursor));
|
||||
break;
|
||||
default:
|
||||
return (__wt_illegal_value(session, ref->page->type));
|
||||
WT_ERR(__wt_illegal_value(session, ref->page->type));
|
||||
}
|
||||
|
||||
return (0);
|
||||
err:
|
||||
if (hs_cursor != NULL)
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1209,11 +1194,11 @@ __debug_page_col_fix(WT_DBG *ds, WT_REF *ref)
|
||||
|
||||
if (WT_COL_UPDATE_SINGLE(page) != NULL) {
|
||||
WT_RET(ds->f(ds, "%s", sep));
|
||||
WT_RET(__debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", true));
|
||||
WT_RET(__debug_col_skip(ds, WT_COL_UPDATE_SINGLE(page), "update", true, NULL));
|
||||
}
|
||||
if (WT_COL_APPEND(page) != NULL) {
|
||||
WT_RET(ds->f(ds, "%s", sep));
|
||||
WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", true));
|
||||
WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", true, NULL));
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
@ -1254,7 +1239,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
|
||||
* Dump an in-memory WT_PAGE_COL_VAR page.
|
||||
*/
|
||||
static int
|
||||
__debug_page_col_var(WT_DBG *ds, WT_REF *ref)
|
||||
__debug_page_col_var(WT_DBG *ds, WT_REF *ref, WT_CURSOR *hs_cursor)
|
||||
{
|
||||
WT_CELL *cell;
|
||||
WT_CELL_UNPACK_KV *unpack, _unpack;
|
||||
@ -1283,17 +1268,17 @@ __debug_page_col_var(WT_DBG *ds, WT_REF *ref)
|
||||
p = ds->key->mem;
|
||||
WT_RET(__wt_vpack_uint(&p, 0, recno));
|
||||
ds->key->size = WT_PTRDIFF(p, ds->key->mem);
|
||||
WT_RET(__debug_hs_key(ds));
|
||||
WT_RET(__debug_hs_key(ds, hs_cursor));
|
||||
}
|
||||
|
||||
if ((update = WT_COL_UPDATE(page, cip)) != NULL)
|
||||
WT_RET(__debug_col_skip(ds, update, "update", false));
|
||||
WT_RET(__debug_col_skip(ds, update, "update", false, hs_cursor));
|
||||
recno += rle;
|
||||
}
|
||||
|
||||
if (WT_COL_APPEND(page) != NULL) {
|
||||
WT_RET(ds->f(ds, "%s", sep));
|
||||
WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", false));
|
||||
WT_RET(__debug_col_skip(ds, WT_COL_APPEND(page), "append", false, hs_cursor));
|
||||
}
|
||||
|
||||
return (0);
|
||||
@ -1337,7 +1322,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
|
||||
* Dump an in-memory WT_PAGE_ROW_LEAF page.
|
||||
*/
|
||||
static int
|
||||
__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
|
||||
__debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page, WT_CURSOR *hs_cursor)
|
||||
{
|
||||
WT_CELL_UNPACK_KV *unpack, _unpack;
|
||||
WT_INSERT_HEAD *insert;
|
||||
@ -1353,7 +1338,7 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
|
||||
* Dump any K/V pairs inserted into the page before the first from-disk key on the page.
|
||||
*/
|
||||
if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
|
||||
WT_RET(__debug_row_skip(ds, insert));
|
||||
WT_RET(__debug_row_skip(ds, insert, hs_cursor));
|
||||
|
||||
/* Dump the page's K/V pairs. */
|
||||
WT_ROW_FOREACH (page, rip, i) {
|
||||
@ -1366,11 +1351,11 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
|
||||
if ((upd = WT_ROW_UPDATE(page, rip)) != NULL)
|
||||
WT_RET(__debug_update(ds, upd, false));
|
||||
|
||||
if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL)
|
||||
WT_RET(__debug_hs_key(ds));
|
||||
if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL)
|
||||
WT_RET(__debug_hs_key(ds, hs_cursor));
|
||||
|
||||
if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
|
||||
WT_RET(__debug_row_skip(ds, insert));
|
||||
WT_RET(__debug_row_skip(ds, insert, hs_cursor));
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
@ -1380,7 +1365,8 @@ __debug_page_row_leaf(WT_DBG *ds, WT_PAGE *page)
|
||||
* Dump a column-store skiplist.
|
||||
*/
|
||||
static int
|
||||
__debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte)
|
||||
__debug_col_skip(
|
||||
WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte, WT_CURSOR *hs_cursor)
|
||||
{
|
||||
WT_INSERT *ins;
|
||||
WT_SESSION_IMPL *session;
|
||||
@ -1392,11 +1378,11 @@ __debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte
|
||||
WT_RET(ds->f(ds, "\t%s %" PRIu64 "\n", tag, WT_INSERT_RECNO(ins)));
|
||||
WT_RET(__debug_update(ds, ins->upd, hexbyte));
|
||||
|
||||
if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL) {
|
||||
if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL) {
|
||||
p = ds->key->mem;
|
||||
WT_RET(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(ins)));
|
||||
ds->key->size = WT_PTRDIFF(p, ds->key->mem);
|
||||
WT_RET(__debug_hs_key(ds));
|
||||
WT_RET(__debug_hs_key(ds, hs_cursor));
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
@ -1407,7 +1393,7 @@ __debug_col_skip(WT_DBG *ds, WT_INSERT_HEAD *head, const char *tag, bool hexbyte
|
||||
* Dump an insert list.
|
||||
*/
|
||||
static int
|
||||
__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
|
||||
__debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head, WT_CURSOR *hs_cursor)
|
||||
{
|
||||
WT_INSERT *ins;
|
||||
WT_SESSION_IMPL *session;
|
||||
@ -1418,9 +1404,9 @@ __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
|
||||
WT_RET(__debug_item_key(ds, "insert", WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)));
|
||||
WT_RET(__debug_update(ds, ins->upd, false));
|
||||
|
||||
if (!WT_IS_HS(session->dhandle) && session->hs_cursor != NULL) {
|
||||
if (!WT_IS_HS(session->dhandle) && hs_cursor != NULL) {
|
||||
WT_RET(__wt_buf_set(session, ds->key, WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins)));
|
||||
WT_RET(__debug_hs_key(ds));
|
||||
WT_RET(__debug_hs_key(ds, hs_cursor));
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
|
@ -71,7 +71,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
|
||||
return (0);
|
||||
}
|
||||
|
||||
WT_RET(__wt_hs_cursor_cache(session));
|
||||
WT_RET(__wt_curhs_cache(session));
|
||||
(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
|
||||
ret = __wt_evict(session, ref, previous_state, 0);
|
||||
(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
|
||||
|
35
src/third_party/wiredtiger/src/btree/bt_vrfy.c
vendored
35
src/third_party/wiredtiger/src/btree/bt_vrfy.c
vendored
@ -278,9 +278,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
|
||||
*/
|
||||
if (ret == 0 && (ckpt + 1)->name == NULL && !skip_hs) {
|
||||
/* Open a history store cursor. */
|
||||
WT_ERR(__wt_hs_cursor_open(session));
|
||||
WT_TRET(__wt_hs_verify_one(session));
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
/*
|
||||
* We cannot error out here. If we got an error verifying the history store, we need
|
||||
* to follow through with reacquiring the exclusive call below. We'll error out
|
||||
@ -778,11 +776,12 @@ __verify_key_hs(
|
||||
wt_timestamp_t older_start_ts, older_stop_ts;
|
||||
uint64_t hs_counter;
|
||||
uint32_t hs_btree_id;
|
||||
int cmp, exact;
|
||||
char ts_string[2][WT_TS_INT_STRING_SIZE];
|
||||
|
||||
btree = S2BT(session);
|
||||
hs_btree_id = btree->id;
|
||||
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
|
||||
/*
|
||||
* Set the data store timestamp and transactions to initiate timestamp range verification. Since
|
||||
@ -795,36 +794,23 @@ __verify_key_hs(
|
||||
* Open a history store cursor positioned at the end of the data store key (the newest record)
|
||||
* and iterate backwards until we reach a different key or btree.
|
||||
*/
|
||||
hs_cursor = session->hs_cursor;
|
||||
hs_cursor->set_key(hs_cursor, hs_btree_id, tmp1, WT_TS_MAX, WT_TXN_MAX);
|
||||
ret = hs_cursor->search_near(hs_cursor, &exact);
|
||||
|
||||
/* If we jumped to the next key, go back to the previous key. */
|
||||
if (ret == 0 && exact > 0)
|
||||
ret = hs_cursor->prev(hs_cursor);
|
||||
hs_cursor->set_key(hs_cursor, 4, hs_btree_id, tmp1, WT_TS_MAX, UINT64_MAX);
|
||||
ret = __wt_curhs_search_near_before(session, hs_cursor);
|
||||
|
||||
for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
|
||||
WT_RET(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp2, &older_start_ts, &hs_counter));
|
||||
|
||||
if (hs_btree_id != btree->id)
|
||||
break;
|
||||
|
||||
WT_RET(__wt_compare(session, NULL, tmp1, vs->tmp2, &cmp));
|
||||
if (cmp != 0)
|
||||
break;
|
||||
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, vs->tmp2, &older_start_ts, &hs_counter));
|
||||
/* Verify the newer record's start is later than the older record's stop. */
|
||||
if (newer_start_ts < older_stop_ts) {
|
||||
WT_RET_MSG(session, WT_ERROR,
|
||||
WT_ERR_MSG(session, WT_ERROR,
|
||||
"key %s has a overlap of timestamp ranges between history store stop timestamp %s "
|
||||
"being newer than a more recent timestamp range having start timestamp %s",
|
||||
__wt_buf_set_printable(session, tmp1->data, tmp1->size, vs->tmp2),
|
||||
__verify_timestamp_to_pretty_string(older_stop_ts, ts_string[0]),
|
||||
__verify_timestamp_to_pretty_string(newer_start_ts, ts_string[1]));
|
||||
__wt_timestamp_to_string(older_stop_ts, ts_string[0]),
|
||||
__wt_timestamp_to_string(newer_start_ts, ts_string[1]));
|
||||
}
|
||||
|
||||
if (vs->stable_timestamp != WT_TS_NONE)
|
||||
WT_RET(
|
||||
WT_ERR(
|
||||
__verify_ts_stable_cmp(session, tmp1, NULL, 0, older_start_ts, older_stop_ts, vs));
|
||||
|
||||
/*
|
||||
@ -833,7 +819,8 @@ __verify_key_hs(
|
||||
*/
|
||||
newer_start_ts = older_start_ts;
|
||||
}
|
||||
|
||||
err:
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
return (ret == WT_NOTFOUND ? 0 : ret);
|
||||
#else
|
||||
WT_UNUSED(session);
|
||||
|
596
src/third_party/wiredtiger/src/cursor/cur_hs.c
vendored
596
src/third_party/wiredtiger/src/cursor/cur_hs.c
vendored
@ -8,15 +8,19 @@
|
||||
|
||||
#include "wt_internal.h"
|
||||
|
||||
static int __curhs_file_cursor_next(WT_SESSION_IMPL *, WT_CURSOR *);
|
||||
static int __curhs_file_cursor_open(WT_SESSION_IMPL *, WT_CURSOR **);
|
||||
static int __curhs_file_cursor_prev(WT_SESSION_IMPL *, WT_CURSOR *);
|
||||
static int __curhs_file_cursor_search_near(WT_SESSION_IMPL *, WT_CURSOR *, int *);
|
||||
static int __curhs_prev_visible(WT_SESSION_IMPL *, WT_CURSOR_HS *);
|
||||
static int __curhs_next_visible(WT_SESSION_IMPL *, WT_CURSOR_HS *);
|
||||
|
||||
static int __curhs_search_near_helper(WT_SESSION_IMPL *, WT_CURSOR *, bool);
|
||||
/*
|
||||
* __hs_cursor_open_int --
|
||||
* __curhs_file_cursor_open --
|
||||
* Open a new history store table cursor, internal function.
|
||||
*/
|
||||
static int
|
||||
__hs_cursor_open_int(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
|
||||
__curhs_file_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
|
||||
{
|
||||
WT_CURSOR *cursor;
|
||||
WT_DECL_RET;
|
||||
@ -34,12 +38,12 @@ __hs_cursor_open_int(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_hs_cursor_cache --
|
||||
* __wt_curhs_cache --
|
||||
* Cache a new history store table cursor. Open and then close a history store cursor without
|
||||
* saving it in the session.
|
||||
*/
|
||||
int
|
||||
__wt_hs_cursor_cache(WT_SESSION_IMPL *session)
|
||||
__wt_curhs_cache(WT_SESSION_IMPL *session)
|
||||
{
|
||||
WT_CONNECTION_IMPL *conn;
|
||||
WT_CURSOR *cursor;
|
||||
@ -70,45 +74,17 @@ __wt_hs_cursor_cache(WT_SESSION_IMPL *session)
|
||||
(session->dhandle != NULL && WT_IS_METADATA(S2BT(session)->dhandle)) ||
|
||||
session == conn->default_session)
|
||||
return (0);
|
||||
WT_RET(__hs_cursor_open_int(session, &cursor));
|
||||
WT_RET(__curhs_file_cursor_open(session, &cursor));
|
||||
WT_RET(cursor->close(cursor));
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_hs_cursor_open --
|
||||
* Open a new history store table cursor wrapper function.
|
||||
*/
|
||||
int
|
||||
__wt_hs_cursor_open(WT_SESSION_IMPL *session)
|
||||
{
|
||||
/* Not allowed to open a cursor if you already have one */
|
||||
WT_ASSERT(session, session->hs_cursor == NULL);
|
||||
|
||||
return (__hs_cursor_open_int(session, &session->hs_cursor));
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_hs_cursor_close --
|
||||
* Discard a history store cursor.
|
||||
*/
|
||||
int
|
||||
__wt_hs_cursor_close(WT_SESSION_IMPL *session)
|
||||
{
|
||||
/* Should only be called when session has an open history store cursor */
|
||||
WT_ASSERT(session, session->hs_cursor != NULL);
|
||||
|
||||
WT_RET(session->hs_cursor->close(session->hs_cursor));
|
||||
session->hs_cursor = NULL;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_hs_cursor_next --
|
||||
* __curhs_file_cursor_next --
|
||||
* Execute a next operation on a history store cursor with the appropriate isolation level.
|
||||
*/
|
||||
int
|
||||
__wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
static int
|
||||
__curhs_file_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
|
||||
@ -117,11 +93,11 @@ __wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_hs_cursor_prev --
|
||||
* __curhs_file_cursor_prev --
|
||||
* Execute a prev operation on a history store cursor with the appropriate isolation level.
|
||||
*/
|
||||
int
|
||||
__wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
static int
|
||||
__curhs_file_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
|
||||
@ -130,12 +106,12 @@ __wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_hs_cursor_search_near --
|
||||
* __curhs_file_cursor_search_near --
|
||||
* Execute a search near operation on a history store cursor with the appropriate isolation
|
||||
* level.
|
||||
*/
|
||||
int
|
||||
__wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp)
|
||||
static int
|
||||
__curhs_file_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
|
||||
@ -144,9 +120,35 @@ __wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exa
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_set_key_ptr --
|
||||
* Copy the key buffer pointer from file cursor to the history store cursor.
|
||||
*/
|
||||
static inline void
|
||||
__curhs_set_key_ptr(WT_CURSOR *hs_cursor, WT_CURSOR *file_cursor)
|
||||
{
|
||||
hs_cursor->key.data = file_cursor->key.data;
|
||||
hs_cursor->key.size = file_cursor->key.size;
|
||||
WT_ASSERT(CUR2S(file_cursor), F_ISSET(file_cursor, WT_CURSTD_KEY_SET));
|
||||
F_SET(hs_cursor, F_MASK(file_cursor, WT_CURSTD_KEY_SET));
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_set_value_ptr --
|
||||
* Copy the value buffer pointer from file cursor to the history store cursor.
|
||||
*/
|
||||
static inline void
|
||||
__curhs_set_value_ptr(WT_CURSOR *hs_cursor, WT_CURSOR *file_cursor)
|
||||
{
|
||||
hs_cursor->value.data = file_cursor->value.data;
|
||||
hs_cursor->value.size = file_cursor->value.size;
|
||||
WT_ASSERT(CUR2S(file_cursor), F_ISSET(file_cursor, WT_CURSTD_VALUE_SET));
|
||||
F_SET(hs_cursor, F_MASK(file_cursor, WT_CURSTD_VALUE_SET));
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_next --
|
||||
* WT_CURSOR->next method for the hs cursor type.
|
||||
* WT_CURSOR->next method for the history store cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_next(WT_CURSOR *cursor)
|
||||
@ -160,7 +162,7 @@ __curhs_next(WT_CURSOR *cursor)
|
||||
file_cursor = hs_cursor->file_cursor;
|
||||
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, next, CUR2BT(file_cursor));
|
||||
|
||||
WT_ERR(__wt_hs_cursor_next(session, file_cursor));
|
||||
WT_ERR(__curhs_file_cursor_next(session, file_cursor));
|
||||
/*
|
||||
* We need to check if the history store record is visible to the current session. If not, the
|
||||
* __curhs_next_visible() will also keep iterating forward through the records until it finds a
|
||||
@ -168,6 +170,9 @@ __curhs_next(WT_CURSOR *cursor)
|
||||
*/
|
||||
WT_ERR(__curhs_next_visible(session, hs_cursor));
|
||||
|
||||
__curhs_set_key_ptr(cursor, file_cursor);
|
||||
__curhs_set_value_ptr(cursor, file_cursor);
|
||||
|
||||
if (0) {
|
||||
err:
|
||||
WT_TRET(cursor->reset(cursor));
|
||||
@ -177,7 +182,7 @@ err:
|
||||
|
||||
/*
|
||||
* __curhs_prev --
|
||||
* WT_CURSOR->prev method for the hs cursor type.
|
||||
* WT_CURSOR->prev method for the history store cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_prev(WT_CURSOR *cursor)
|
||||
@ -191,7 +196,7 @@ __curhs_prev(WT_CURSOR *cursor)
|
||||
file_cursor = hs_cursor->file_cursor;
|
||||
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, prev, CUR2BT(file_cursor));
|
||||
|
||||
WT_ERR(__wt_hs_cursor_prev(session, file_cursor));
|
||||
WT_ERR(__curhs_file_cursor_prev(session, file_cursor));
|
||||
/*
|
||||
* We need to check if the history store record is visible to the current session. If not, the
|
||||
* __curhs_prev_visible() will also keep iterating backwards through the records until it finds
|
||||
@ -199,6 +204,9 @@ __curhs_prev(WT_CURSOR *cursor)
|
||||
*/
|
||||
WT_ERR(__curhs_prev_visible(session, hs_cursor));
|
||||
|
||||
__curhs_set_key_ptr(cursor, file_cursor);
|
||||
__curhs_set_value_ptr(cursor, file_cursor);
|
||||
|
||||
if (0) {
|
||||
err:
|
||||
WT_TRET(cursor->reset(cursor));
|
||||
@ -208,7 +216,7 @@ err:
|
||||
|
||||
/*
|
||||
* __curhs_close --
|
||||
* WT_CURSOR->close method for the hs cursor type.
|
||||
* WT_CURSOR->close method for the history store cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_close(WT_CURSOR *cursor)
|
||||
@ -216,7 +224,6 @@ __curhs_close(WT_CURSOR *cursor)
|
||||
WT_CURSOR *file_cursor;
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
WT_ITEM *datastore_key;
|
||||
WT_SESSION_IMPL *session;
|
||||
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
@ -224,11 +231,11 @@ __curhs_close(WT_CURSOR *cursor)
|
||||
CURSOR_API_CALL_PREPARE_ALLOWED(
|
||||
cursor, session, close, file_cursor == NULL ? NULL : CUR2BT(file_cursor));
|
||||
err:
|
||||
__wt_scr_free(session, &hs_cursor->datastore_key);
|
||||
if (file_cursor != NULL)
|
||||
WT_TRET(file_cursor->close(file_cursor));
|
||||
datastore_key = &hs_cursor->datastore_key;
|
||||
__wt_scr_free(session, &datastore_key);
|
||||
__wt_cursor_close(cursor);
|
||||
--session->hs_cursor_counter;
|
||||
|
||||
API_END_RET(session, ret);
|
||||
}
|
||||
@ -252,9 +259,15 @@ __curhs_reset(WT_CURSOR *cursor)
|
||||
ret = file_cursor->reset(file_cursor);
|
||||
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
|
||||
hs_cursor->btree_id = 0;
|
||||
hs_cursor->datastore_key.data = NULL;
|
||||
hs_cursor->datastore_key.size = 0;
|
||||
hs_cursor->datastore_key->data = NULL;
|
||||
hs_cursor->datastore_key->size = 0;
|
||||
hs_cursor->flags = 0;
|
||||
cursor->key.data = NULL;
|
||||
cursor->key.size = 0;
|
||||
cursor->value.data = NULL;
|
||||
cursor->value.size = 0;
|
||||
F_CLR(cursor, WT_CURSTD_KEY_SET);
|
||||
F_CLR(cursor, WT_CURSTD_VALUE_SET);
|
||||
|
||||
err:
|
||||
API_END_RET(session, ret);
|
||||
@ -262,7 +275,7 @@ err:
|
||||
|
||||
/*
|
||||
* __curhs_set_key --
|
||||
* WT_CURSOR->set_key method for the hs cursor type.
|
||||
* WT_CURSOR->set_key method for the history store cursor type.
|
||||
*/
|
||||
static void
|
||||
__curhs_set_key(WT_CURSOR *cursor, ...)
|
||||
@ -282,6 +295,7 @@ __curhs_set_key(WT_CURSOR *cursor, ...)
|
||||
start_ts = WT_TS_NONE;
|
||||
counter = 0;
|
||||
|
||||
hs_cursor->flags = 0;
|
||||
va_start(ap, cursor);
|
||||
arg_count = va_arg(ap, uint32_t);
|
||||
|
||||
@ -292,11 +306,11 @@ __curhs_set_key(WT_CURSOR *cursor, ...)
|
||||
if (arg_count > 1) {
|
||||
datastore_key = va_arg(ap, WT_ITEM *);
|
||||
WT_IGNORE_RET(__wt_buf_set(
|
||||
session, &hs_cursor->datastore_key, datastore_key->data, datastore_key->size));
|
||||
session, hs_cursor->datastore_key, datastore_key->data, datastore_key->size));
|
||||
F_SET(hs_cursor, WT_HS_CUR_KEY_SET);
|
||||
} else {
|
||||
hs_cursor->datastore_key.data = NULL;
|
||||
hs_cursor->datastore_key.size = 0;
|
||||
hs_cursor->datastore_key->data = NULL;
|
||||
hs_cursor->datastore_key->size = 0;
|
||||
F_CLR(hs_cursor, WT_HS_CUR_KEY_SET);
|
||||
}
|
||||
|
||||
@ -315,7 +329,9 @@ __curhs_set_key(WT_CURSOR *cursor, ...)
|
||||
va_end(ap);
|
||||
|
||||
file_cursor->set_key(
|
||||
file_cursor, hs_cursor->btree_id, &hs_cursor->datastore_key, start_ts, counter);
|
||||
file_cursor, hs_cursor->btree_id, hs_cursor->datastore_key, start_ts, counter);
|
||||
|
||||
__curhs_set_key_ptr(cursor, file_cursor);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -342,8 +358,8 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
|
||||
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
|
||||
|
||||
for (; ret == 0; ret = __wt_hs_cursor_prev(session, file_cursor)) {
|
||||
WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter));
|
||||
for (; ret == 0; ret = __curhs_file_cursor_prev(session, file_cursor)) {
|
||||
WT_ERR(file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter));
|
||||
|
||||
/* Stop before crossing over to the next btree. */
|
||||
if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) {
|
||||
@ -356,7 +372,7 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
|
||||
* have crossed over the desired key and not found the record we are looking for.
|
||||
*/
|
||||
if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
|
||||
WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp));
|
||||
WT_ERR(__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp));
|
||||
if (cmp != 0) {
|
||||
ret = WT_NOTFOUND;
|
||||
goto err;
|
||||
@ -379,6 +395,12 @@ __curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
|
||||
if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED))
|
||||
break;
|
||||
|
||||
/*
|
||||
* If we are using a history store cursor and haven't set the WT_CURSTD_HS_READ_COMMITTED
|
||||
* flag then we must have a snapshot, assert that we do.
|
||||
*/
|
||||
WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT));
|
||||
|
||||
if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw)) {
|
||||
/*
|
||||
* If the stop time point of a record is visible to us, we won't be able to see anything
|
||||
@ -425,8 +447,8 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
|
||||
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
|
||||
|
||||
for (; ret == 0; ret = __wt_hs_cursor_next(session, file_cursor)) {
|
||||
WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter));
|
||||
for (; ret == 0; ret = __curhs_file_cursor_next(session, file_cursor)) {
|
||||
WT_ERR(file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter));
|
||||
|
||||
/* Stop before crossing over to the next btree. */
|
||||
if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) {
|
||||
@ -439,7 +461,7 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
|
||||
* have crossed over the desired key and not found the record we are looking for.
|
||||
*/
|
||||
if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
|
||||
WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp));
|
||||
WT_ERR(__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp));
|
||||
if (cmp != 0) {
|
||||
ret = WT_NOTFOUND;
|
||||
goto err;
|
||||
@ -462,6 +484,12 @@ __curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
|
||||
if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED))
|
||||
break;
|
||||
|
||||
/*
|
||||
* If we are using a history store cursor and haven't set the WT_CURSTD_HS_READ_COMMITTED
|
||||
* flag then we must have a snapshot, assert that we do.
|
||||
*/
|
||||
WT_ASSERT(session, F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT));
|
||||
|
||||
/*
|
||||
* If the stop time point of a record is visible to us, check the next one.
|
||||
*/
|
||||
@ -478,171 +506,268 @@ err:
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_curhs_search_near_before --
|
||||
* Set the cursor position at the requested position or before it.
|
||||
*/
|
||||
int
|
||||
__wt_curhs_search_near_before(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
{
|
||||
return (__curhs_search_near_helper(session, cursor, true));
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_curhs_search_near_after --
|
||||
* Set the cursor position at the requested position or after it.
|
||||
*/
|
||||
int
|
||||
__wt_curhs_search_near_after(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
{
|
||||
return (__curhs_search_near_helper(session, cursor, false));
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_search_near_helper --
|
||||
* Helper function to set the cursor position based on search criteria.
|
||||
*/
|
||||
static int
|
||||
__curhs_search_near_helper(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool before)
|
||||
{
|
||||
WT_DECL_ITEM(srch_key);
|
||||
WT_DECL_RET;
|
||||
int cmp;
|
||||
|
||||
WT_RET(__wt_scr_alloc(session, 0, &srch_key));
|
||||
WT_ERR(__wt_buf_set(session, srch_key, cursor->key.data, cursor->key.size));
|
||||
WT_ERR(cursor->search_near(cursor, &cmp));
|
||||
if (before) {
|
||||
/*
|
||||
* If we want to land on a key that is smaller or equal to the specified key, keep walking
|
||||
* backwards as there may be content inserted concurrently.
|
||||
*/
|
||||
if (cmp > 0) {
|
||||
while ((ret = cursor->prev(cursor)) == 0) {
|
||||
WT_STAT_CONN_INCR(session, cursor_skip_hs_cur_position);
|
||||
WT_STAT_DATA_INCR(session, cursor_skip_hs_cur_position);
|
||||
WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp));
|
||||
/*
|
||||
* Exit if we have found a key that is smaller than or equal to the specified key.
|
||||
*/
|
||||
if (cmp <= 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If we want to land on a key that is larger or equal to the specified key, keep walking
|
||||
* forwards as there may be content inserted concurrently.
|
||||
*/
|
||||
if (cmp < 0) {
|
||||
while ((ret = cursor->next(cursor)) == 0) {
|
||||
WT_STAT_CONN_INCR(session, cursor_skip_hs_cur_position);
|
||||
WT_STAT_DATA_INCR(session, cursor_skip_hs_cur_position);
|
||||
WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp));
|
||||
/* Exit if we have found a key that is larger than or equal to the specified key. */
|
||||
if (cmp >= 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
err:
|
||||
__wt_scr_free(session, &srch_key);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_search_near --
|
||||
* WT_CURSOR->search_near method for the hs cursor type.
|
||||
* WT_CURSOR->search_near method for the history store cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_search_near(WT_CURSOR *cursor, int *exactp)
|
||||
{
|
||||
WT_CURSOR *file_cursor;
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
WT_DECL_ITEM(datastore_key);
|
||||
WT_DECL_ITEM(srch_key);
|
||||
WT_DECL_RET;
|
||||
WT_SESSION_IMPL *session;
|
||||
int cmp;
|
||||
int exact;
|
||||
wt_timestamp_t start_ts;
|
||||
uint64_t counter;
|
||||
uint32_t btree_id;
|
||||
int exact, cmp;
|
||||
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
file_cursor = hs_cursor->file_cursor;
|
||||
*exactp = 0;
|
||||
cmp = 0;
|
||||
|
||||
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, search_near, CUR2BT(file_cursor));
|
||||
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &srch_key));
|
||||
/* At least we have the btree id set. */
|
||||
WT_ASSERT(session, F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET));
|
||||
WT_ERR(__wt_buf_set(session, srch_key, file_cursor->key.data, file_cursor->key.size));
|
||||
/* Reset cursor if we get WT_NOTFOUND. */
|
||||
WT_ERR(__wt_hs_cursor_search_near(session, file_cursor, &exact));
|
||||
WT_ERR(__curhs_file_cursor_search_near(session, file_cursor, &exact));
|
||||
|
||||
/*
|
||||
* There are some key fields missing so we are searching a range of keys. Place the cursor at
|
||||
* the start of the range.
|
||||
*/
|
||||
if (!F_ISSET(hs_cursor, WT_HS_CUR_COUNTER_SET)) {
|
||||
if (exact >= 0) {
|
||||
/*
|
||||
* If we raced with a history store insert, we may be two or more records away from our
|
||||
* target. Keep iterating forwards until we are on or past our target key.
|
||||
*
|
||||
* We can't use the cursor positioning helper that we use for regular reads since that will
|
||||
* place us at the end of a particular key/timestamp range whereas we want to be placed at
|
||||
* the beginning.
|
||||
* We placed the file cursor on or after the search key. Try first to walk forwards to see if we
|
||||
* can find a visible record. If nothing is visible, try to walk backwards.
|
||||
*/
|
||||
if (exact < 0) {
|
||||
while ((ret = __wt_hs_cursor_next(session, file_cursor)) == 0) {
|
||||
WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
|
||||
if (cmp >= 0)
|
||||
break;
|
||||
}
|
||||
WT_ERR_NOTFOUND_OK(__curhs_next_visible(session, hs_cursor), true);
|
||||
if (ret == WT_NOTFOUND) {
|
||||
/*
|
||||
* No entries greater than or equal to the key we searched for. Reset cursor if we get
|
||||
* WT_NOTFOUND.
|
||||
* When walking backwards, first ensure we walk back to the specified btree or key space
|
||||
* as we may have crossed the boundary. Do that in a loop as there may be content
|
||||
* inserted concurrently.
|
||||
*/
|
||||
while ((ret = __curhs_file_cursor_prev(session, file_cursor)) == 0) {
|
||||
WT_ERR(
|
||||
file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter));
|
||||
|
||||
/* We are back in the specified btree range. */
|
||||
if (btree_id == hs_cursor->btree_id && F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
|
||||
WT_ERR(
|
||||
__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp));
|
||||
|
||||
/* We are back in the specified key range. */
|
||||
if (cmp == 0)
|
||||
break;
|
||||
|
||||
/*
|
||||
* We are now smaller than the key range, which indicates nothing is visible to
|
||||
* us in the specified key range.
|
||||
*/
|
||||
if (cmp < 0) {
|
||||
ret = WT_NOTFOUND;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We are now smaller than the btree range, which indicates nothing is visible to us
|
||||
* in the specified btree range.
|
||||
*/
|
||||
if (btree_id < hs_cursor->btree_id) {
|
||||
ret = WT_NOTFOUND;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
WT_ERR(ret);
|
||||
|
||||
*exactp = cmp;
|
||||
} else
|
||||
*exactp = 1;
|
||||
|
||||
WT_ERR(__curhs_next_visible(session, hs_cursor));
|
||||
}
|
||||
/* Search the closest match that is smaller or equal to the search key. */
|
||||
else {
|
||||
/*
|
||||
* Because of the special visibility rules for the history store, a new key can appear in
|
||||
* between our search and the set of updates that we're interested in. Keep trying until we
|
||||
* find it.
|
||||
*
|
||||
* There may be no history store entries for the given btree id and record key if they have
|
||||
* been removed by rollback to stable.
|
||||
*
|
||||
* Note that we need to compare the raw key off the cursor to determine where we are in the
|
||||
* history store as opposed to comparing the embedded data store key since the ordering is
|
||||
* not guaranteed to be the same.
|
||||
*/
|
||||
if (exact > 0) {
|
||||
/*
|
||||
* It's possible that we may race with a history store insert for another key. So we may
|
||||
* be more than one record away from the end of our target key/timestamp range. Keep
|
||||
* iterating backwards until we land on our key.
|
||||
* Keep looking for the first visible update in the specified range when walking
|
||||
* backwards.
|
||||
*/
|
||||
WT_ERR(__curhs_prev_visible(session, hs_cursor));
|
||||
/*
|
||||
* We can't find anything visible when first walking forwards so we must have found an
|
||||
* update that is smaller than the specified key.
|
||||
*/
|
||||
while ((ret = __wt_hs_cursor_prev(session, file_cursor)) == 0) {
|
||||
WT_STAT_CONN_DATA_INCR(session, cursor_skip_hs_cur_position);
|
||||
|
||||
WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
|
||||
if (cmp <= 0)
|
||||
break;
|
||||
}
|
||||
|
||||
*exactp = cmp;
|
||||
} else
|
||||
*exactp = -1;
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
if (ret == 0) {
|
||||
WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
|
||||
WT_ASSERT(session, cmp <= 0);
|
||||
} else {
|
||||
WT_ERR(ret);
|
||||
/*
|
||||
* We find an update when walking forwards. If initially we land on the same key as the
|
||||
* specified key, exact will be 0 and we should return that. If it is not visible, we
|
||||
* must have found a key that is larger than the specified key.
|
||||
*/
|
||||
*exactp = exact;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* We placed the file cursor before the search key. Try first to walk backwards to see if we
|
||||
* can find a visible record. If nothing is visible, try to walk forwards.
|
||||
*/
|
||||
WT_ERR_NOTFOUND_OK(__curhs_prev_visible(session, hs_cursor), true);
|
||||
if (ret == WT_NOTFOUND) {
|
||||
/*
|
||||
* When walking forwards, first ensure we walk back to the specified btree or key space
|
||||
* as we may have crossed the boundary. Do that in a loop as there may be content
|
||||
* inserted concurrently.
|
||||
*/
|
||||
while ((ret = __curhs_file_cursor_next(session, file_cursor)) == 0) {
|
||||
WT_ERR(
|
||||
file_cursor->get_key(file_cursor, &btree_id, datastore_key, &start_ts, &counter));
|
||||
|
||||
/* We are back in the specified btree range. */
|
||||
if (btree_id == hs_cursor->btree_id && F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
|
||||
WT_ERR(
|
||||
__wt_compare(session, NULL, datastore_key, hs_cursor->datastore_key, &cmp));
|
||||
|
||||
/* We are back in the specified key range. */
|
||||
if (cmp == 0)
|
||||
break;
|
||||
|
||||
/*
|
||||
* We are now larger than the key range, which indicates nothing is visible to
|
||||
* us in the specified key range.
|
||||
*/
|
||||
if (cmp > 0) {
|
||||
ret = WT_NOTFOUND;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We are now larger than the btree range, which indicates nothing is visible to us
|
||||
* in the specified btree range.
|
||||
*/
|
||||
if (btree_id > hs_cursor->btree_id) {
|
||||
ret = WT_NOTFOUND;
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
WT_ERR(ret);
|
||||
/*
|
||||
* Keep looking for the first visible update in the specified range when walking
|
||||
* forwards.
|
||||
*/
|
||||
WT_ERR(__curhs_next_visible(session, hs_cursor));
|
||||
/*
|
||||
* We can't find anything visible when first walking backwards so we must have found an
|
||||
* update that is larger than the specified key.
|
||||
*/
|
||||
*exactp = 1;
|
||||
} else {
|
||||
WT_ERR(ret);
|
||||
*exactp = exact;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
|
||||
WT_ASSERT(
|
||||
session, (cmp == 0 && *exactp == 0) || (cmp < 0 && *exactp < 0) || (cmp > 0 && *exactp > 0));
|
||||
#endif
|
||||
|
||||
WT_ERR(__curhs_prev_visible(session, hs_cursor));
|
||||
}
|
||||
__curhs_set_key_ptr(cursor, file_cursor);
|
||||
__curhs_set_value_ptr(cursor, file_cursor);
|
||||
|
||||
if (0) {
|
||||
err:
|
||||
WT_TRET(cursor->reset(cursor));
|
||||
}
|
||||
|
||||
__wt_scr_free(session, &datastore_key);
|
||||
__wt_scr_free(session, &srch_key);
|
||||
API_END_RET(session, ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_get_key --
|
||||
* WT_CURSOR->get_key method for the hs cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_get_key(WT_CURSOR *cursor, ...)
|
||||
{
|
||||
WT_CURSOR *file_cursor;
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
va_list ap;
|
||||
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
file_cursor = hs_cursor->file_cursor;
|
||||
|
||||
va_start(ap, cursor);
|
||||
ret = file_cursor->get_key(file_cursor, va_arg(ap, uint32_t *), va_arg(ap, WT_ITEM **),
|
||||
va_arg(ap, wt_timestamp_t *), va_arg(ap, uint64_t *));
|
||||
va_end(ap);
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_get_value --
|
||||
* WT_CURSOR->get_value method for the hs cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_get_value(WT_CURSOR *cursor, ...)
|
||||
{
|
||||
WT_CURSOR *file_cursor;
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
va_list ap;
|
||||
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
file_cursor = hs_cursor->file_cursor;
|
||||
|
||||
va_start(ap, cursor);
|
||||
ret = file_cursor->get_value(file_cursor, va_arg(ap, wt_timestamp_t *),
|
||||
va_arg(ap, wt_timestamp_t *), va_arg(ap, uint64_t *), va_arg(ap, WT_ITEM **));
|
||||
va_end(ap);
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_set_value --
|
||||
* WT_CURSOR->set_value method for the hs cursor type.
|
||||
* WT_CURSOR->set_value method for the history store cursor type.
|
||||
*/
|
||||
static void
|
||||
__curhs_set_value(WT_CURSOR *cursor, ...)
|
||||
{
|
||||
WT_CURSOR *file_cursor;
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
WT_ITEM *hs_val;
|
||||
wt_timestamp_t start_ts;
|
||||
wt_timestamp_t stop_ts;
|
||||
uint64_t type;
|
||||
va_list ap;
|
||||
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
@ -650,14 +775,20 @@ __curhs_set_value(WT_CURSOR *cursor, ...)
|
||||
va_start(ap, cursor);
|
||||
hs_cursor->time_window = *va_arg(ap, WT_TIME_WINDOW *);
|
||||
|
||||
file_cursor->set_value(file_cursor, va_arg(ap, wt_timestamp_t), va_arg(ap, wt_timestamp_t),
|
||||
va_arg(ap, uint64_t), va_arg(ap, WT_ITEM *));
|
||||
stop_ts = va_arg(ap, wt_timestamp_t);
|
||||
start_ts = va_arg(ap, wt_timestamp_t);
|
||||
type = va_arg(ap, uint64_t);
|
||||
hs_val = va_arg(ap, WT_ITEM *);
|
||||
|
||||
file_cursor->set_value(file_cursor, stop_ts, start_ts, type, hs_val);
|
||||
va_end(ap);
|
||||
|
||||
__curhs_set_value_ptr(cursor, file_cursor);
|
||||
}
|
||||
|
||||
/*
|
||||
* __curhs_insert --
|
||||
* WT_CURSOR->insert method for the hs cursor type.
|
||||
* WT_CURSOR->insert method for the history store cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_insert(WT_CURSOR *cursor)
|
||||
@ -676,6 +807,12 @@ __curhs_insert(WT_CURSOR *cursor)
|
||||
|
||||
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, insert, CUR2BT(file_cursor));
|
||||
|
||||
/*
|
||||
* Disable bulk loads into history store. This would normally occur when updating a record with
|
||||
* a cursor however the history store doesn't use cursor update, so we do it here.
|
||||
*/
|
||||
__wt_cursor_disable_bulk(session);
|
||||
|
||||
/* Allocate a tombstone only when there is a valid stop time point. */
|
||||
if (WT_TIME_WINDOW_HAS_STOP(&hs_cursor->time_window)) {
|
||||
/*
|
||||
@ -701,7 +838,6 @@ __curhs_insert(WT_CURSOR *cursor)
|
||||
if (hs_tombstone != NULL) {
|
||||
hs_tombstone->next = hs_upd;
|
||||
hs_upd = hs_tombstone;
|
||||
hs_tombstone = NULL;
|
||||
}
|
||||
|
||||
retry:
|
||||
@ -725,7 +861,7 @@ err:
|
||||
|
||||
/*
|
||||
* __curhs_remove --
|
||||
* WT_CURSOR->remove method for the hs cursor type.
|
||||
* WT_CURSOR->remove method for the history store cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_remove(WT_CURSOR *cursor)
|
||||
@ -734,9 +870,14 @@ __curhs_remove(WT_CURSOR *cursor)
|
||||
WT_CURSOR_BTREE *cbt;
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
WT_ITEM hs_key;
|
||||
WT_SESSION_IMPL *session;
|
||||
WT_UPDATE *hs_tombstone;
|
||||
wt_timestamp_t hs_start_ts;
|
||||
uint64_t hs_counter;
|
||||
uint32_t hs_btree_id;
|
||||
|
||||
WT_CLEAR(hs_key);
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
file_cursor = hs_cursor->file_cursor;
|
||||
cbt = (WT_CURSOR_BTREE *)file_cursor;
|
||||
@ -745,7 +886,9 @@ __curhs_remove(WT_CURSOR *cursor)
|
||||
CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, insert, CUR2BT(file_cursor));
|
||||
|
||||
/* Remove must be called with cursor positioned. */
|
||||
WT_ASSERT(session, F_ISSET(file_cursor, WT_CURSTD_KEY_INT));
|
||||
WT_ASSERT(session, F_ISSET(cursor, WT_CURSTD_KEY_INT));
|
||||
|
||||
WT_ERR(cursor->get_key(cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
|
||||
|
||||
/*
|
||||
* Since we're using internal functions to modify the row structure, we need to manually set the
|
||||
@ -765,6 +908,7 @@ __curhs_remove(WT_CURSOR *cursor)
|
||||
|
||||
/* Invalidate the previous value but we will hold on to the position of the key. */
|
||||
F_CLR(file_cursor, WT_CURSTD_VALUE_SET);
|
||||
F_CLR(cursor, WT_CURSTD_VALUE_SET);
|
||||
|
||||
if (0) {
|
||||
err:
|
||||
@ -777,7 +921,7 @@ err:
|
||||
|
||||
/*
|
||||
* __curhs_update --
|
||||
* WT_CURSOR->update method for the hs cursor type.
|
||||
* WT_CURSOR->update method for the history store cursor type.
|
||||
*/
|
||||
static int
|
||||
__curhs_update(WT_CURSOR *cursor)
|
||||
@ -785,15 +929,11 @@ __curhs_update(WT_CURSOR *cursor)
|
||||
WT_CURSOR *file_cursor;
|
||||
WT_CURSOR_BTREE *cbt;
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
WT_DECL_ITEM(hs_value);
|
||||
WT_DECL_RET;
|
||||
WT_SESSION_IMPL *session;
|
||||
WT_UPDATE *hs_tombstone, *hs_upd;
|
||||
bool retry;
|
||||
|
||||
uint64_t hs_upd_type;
|
||||
wt_timestamp_t hs_durable_ts, hs_stop_durable_ts;
|
||||
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
file_cursor = hs_cursor->file_cursor;
|
||||
cbt = (WT_CURSOR_BTREE *)file_cursor;
|
||||
@ -814,34 +954,12 @@ __curhs_update(WT_CURSOR *cursor)
|
||||
WT_ASSERT(session, !WT_TIME_WINDOW_IS_EMPTY(&hs_cursor->time_window));
|
||||
WT_ASSERT(session, WT_TIME_WINDOW_HAS_STOP(&hs_cursor->time_window));
|
||||
|
||||
/*
|
||||
* Ideally we want to check if we are positioned on the newest value for user key. However, we
|
||||
* can't check if the timestamp was set to WT_TS_MAX when we searched for the key. We can call
|
||||
* next() on cursor to confirm there is no newer value but that would disturb our cursor. A more
|
||||
* expensive method would be to search again and verify.
|
||||
*/
|
||||
|
||||
/* The tombstone to represent the stop time window. */
|
||||
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_tombstone, NULL));
|
||||
hs_tombstone->start_ts = hs_cursor->time_window.stop_ts;
|
||||
hs_tombstone->durable_ts = hs_cursor->time_window.durable_stop_ts;
|
||||
hs_tombstone->txnid = hs_cursor->time_window.stop_txn;
|
||||
|
||||
/* Modify the existing value with a new stop timestamp. */
|
||||
|
||||
/* Allocate a buffer for the history store value. */
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
|
||||
|
||||
/* Retrieve the existing update value and stop timestamp. */
|
||||
WT_ERR(file_cursor->get_value(
|
||||
file_cursor, &hs_stop_durable_ts, &hs_durable_ts, &hs_upd_type, hs_value));
|
||||
WT_ASSERT(session, hs_stop_durable_ts == WT_TS_MAX);
|
||||
WT_ASSERT(session, (uint8_t)hs_upd_type == WT_UPDATE_STANDARD);
|
||||
|
||||
/* Use set_value method to pack the new value. */
|
||||
file_cursor->set_value(
|
||||
file_cursor, hs_cursor->time_window.stop_ts, hs_durable_ts, hs_upd_type, hs_value);
|
||||
|
||||
WT_ERR(__wt_upd_alloc(session, &file_cursor->value, WT_UPDATE_STANDARD, &hs_upd, NULL));
|
||||
hs_upd->start_ts = hs_cursor->time_window.start_ts;
|
||||
hs_upd->durable_ts = hs_cursor->time_window.durable_start_ts;
|
||||
@ -850,6 +968,11 @@ __curhs_update(WT_CURSOR *cursor)
|
||||
/* Connect the tombstone to the update. */
|
||||
hs_tombstone->next = hs_upd;
|
||||
|
||||
/*
|
||||
* Since we're using internal functions to modify the row structure, we need to manually set the
|
||||
* comparison to an exact match.
|
||||
*/
|
||||
cbt->compare = 0;
|
||||
/* Make the updates and if we fail, search and try again. */
|
||||
while ((ret = __wt_hs_modify(cbt, hs_tombstone)) == WT_RESTART) {
|
||||
WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(cbt, &file_cursor->key, false));
|
||||
@ -863,11 +986,13 @@ __curhs_update(WT_CURSOR *cursor)
|
||||
WT_TRET(ret);
|
||||
}
|
||||
|
||||
__curhs_set_key_ptr(cursor, file_cursor);
|
||||
__curhs_set_value_ptr(cursor, file_cursor);
|
||||
|
||||
if (0) {
|
||||
err:
|
||||
__wt_free(session, hs_tombstone);
|
||||
__wt_free(session, hs_upd);
|
||||
__wt_scr_free(session, &hs_value);
|
||||
WT_TRET(cursor->reset(cursor));
|
||||
}
|
||||
API_END_RET(session, ret);
|
||||
@ -880,53 +1005,54 @@ err:
|
||||
int
|
||||
__wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
|
||||
{
|
||||
WT_CURSOR_STATIC_INIT(iface, __curhs_get_key, /* get-key */
|
||||
__curhs_get_value, /* get-value */
|
||||
__curhs_set_key, /* set-key */
|
||||
__curhs_set_value, /* set-value */
|
||||
__wt_cursor_compare_notsup, /* compare */
|
||||
__wt_cursor_equals_notsup, /* equals */
|
||||
__curhs_next, /* next */
|
||||
__curhs_prev, /* prev */
|
||||
__curhs_reset, /* reset */
|
||||
__wt_cursor_notsup, /* search */
|
||||
__curhs_search_near, /* search-near */
|
||||
__curhs_insert, /* insert */
|
||||
__wt_cursor_modify_value_format_notsup, /* modify */
|
||||
__curhs_update, /* update */
|
||||
__curhs_remove, /* remove */
|
||||
__wt_cursor_notsup, /* reserve */
|
||||
__wt_cursor_reconfigure_notsup, /* reconfigure */
|
||||
__wt_cursor_notsup, /* cache */
|
||||
__wt_cursor_reopen_notsup, /* reopen */
|
||||
__curhs_close); /* close */
|
||||
WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */
|
||||
__wt_cursor_get_value, /* get-value */
|
||||
__curhs_set_key, /* set-key */
|
||||
__curhs_set_value, /* set-value */
|
||||
__wt_cursor_compare_notsup, /* compare */
|
||||
__wt_cursor_equals_notsup, /* equals */
|
||||
__curhs_next, /* next */
|
||||
__curhs_prev, /* prev */
|
||||
__curhs_reset, /* reset */
|
||||
__wt_cursor_notsup, /* search */
|
||||
__curhs_search_near, /* search-near */
|
||||
__curhs_insert, /* insert */
|
||||
__wt_cursor_modify_value_format_notsup, /* modify */
|
||||
__curhs_update, /* update */
|
||||
__curhs_remove, /* remove */
|
||||
__wt_cursor_notsup, /* reserve */
|
||||
__wt_cursor_reconfigure_notsup, /* reconfigure */
|
||||
__wt_cursor_notsup, /* cache */
|
||||
__wt_cursor_reopen_notsup, /* reopen */
|
||||
__curhs_close); /* close */
|
||||
WT_CURSOR *cursor;
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
WT_ITEM *datastore_key;
|
||||
|
||||
*cursorp = NULL;
|
||||
WT_RET(__wt_calloc_one(session, &hs_cursor));
|
||||
++session->hs_cursor_counter;
|
||||
cursor = (WT_CURSOR *)hs_cursor;
|
||||
*cursor = iface;
|
||||
cursor->session = (WT_SESSION *)session;
|
||||
cursor->key_format = WT_HS_KEY_FORMAT;
|
||||
cursor->value_format = WT_HS_VALUE_FORMAT;
|
||||
WT_ERR(__wt_strdup(session, WT_HS_URI, &cursor->uri));
|
||||
|
||||
/* Open the file cursor for operations on the regular history store. */
|
||||
WT_ERR(__hs_cursor_open_int(session, &hs_cursor->file_cursor));
|
||||
WT_ERR(__curhs_file_cursor_open(session, &hs_cursor->file_cursor));
|
||||
|
||||
WT_ERR(__wt_cursor_init(cursor, WT_HS_URI, owner, NULL, cursorp));
|
||||
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
|
||||
hs_cursor->btree_id = 0;
|
||||
datastore_key = &hs_cursor->datastore_key;
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &hs_cursor->datastore_key));
|
||||
hs_cursor->flags = 0;
|
||||
|
||||
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
|
||||
|
||||
if (0) {
|
||||
err:
|
||||
WT_TRET(__curhs_close(cursor));
|
||||
WT_TRET(cursor->close(cursor));
|
||||
*cursorp = NULL;
|
||||
}
|
||||
return (ret);
|
||||
|
20
src/third_party/wiredtiger/src/evict/evict_lru.c
vendored
20
src/third_party/wiredtiger/src/evict/evict_lru.c
vendored
@ -285,7 +285,7 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread)
|
||||
* busy and then opens a different file (in this case, the HS file), it can deadlock with a
|
||||
* thread waiting for the first file to drain from the eviction queue. See WT-5946 for details.
|
||||
*/
|
||||
WT_RET(__wt_hs_cursor_cache(session));
|
||||
WT_RET(__wt_curhs_cache(session));
|
||||
if (conn->evict_server_running && __wt_spin_trylock(session, &cache->evict_pass_lock) == 0) {
|
||||
/*
|
||||
* Cannot use WT_WITH_PASS_LOCK because this is a try lock. Fix when that is supported. We
|
||||
@ -2330,7 +2330,6 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
|
||||
{
|
||||
WT_CACHE *cache;
|
||||
WT_CONNECTION_IMPL *conn;
|
||||
WT_CURSOR *hs_cursor_saved;
|
||||
WT_DECL_RET;
|
||||
WT_TRACK_OP_DECL;
|
||||
WT_TXN_GLOBAL *txn_global;
|
||||
@ -2348,22 +2347,13 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, d
|
||||
txn_global = &conn->txn_global;
|
||||
txn_shared = WT_SESSION_TXN_SHARED(session);
|
||||
|
||||
/*
|
||||
* If we have a history store cursor, save it. This ensures that if eviction needs to access the
|
||||
* history store, it will get its own cursor, avoiding potential problems if it were to
|
||||
* reposition or reset a history store cursor that we're in the middle of using for something
|
||||
* else.
|
||||
*/
|
||||
hs_cursor_saved = session->hs_cursor;
|
||||
session->hs_cursor = NULL;
|
||||
|
||||
/*
|
||||
* Before we enter the eviction generation, make sure this session has a cached history store
|
||||
* cursor, otherwise we can deadlock with a session wanting exclusive access to a handle: that
|
||||
* session will have a handle list write lock and will be waiting on eviction to drain, we'll be
|
||||
* inside eviction waiting on a handle list read lock to open a history store cursor.
|
||||
*/
|
||||
WT_ERR(__wt_hs_cursor_cache(session));
|
||||
WT_ERR(__wt_curhs_cache(session));
|
||||
|
||||
/*
|
||||
* It is not safe to proceed if the eviction server threads aren't setup yet.
|
||||
@ -2464,12 +2454,6 @@ err:
|
||||
done:
|
||||
WT_TRACK_OP_END(session);
|
||||
|
||||
/* If the caller was using a history store cursor they should have closed it by now. */
|
||||
WT_ASSERT(session, session->hs_cursor == NULL);
|
||||
|
||||
/* Restore the caller's history store cursor. */
|
||||
session->hs_cursor = hs_cursor_saved;
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
|
@ -76,7 +76,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
|
||||
evict_flags = LF_ISSET(WT_READ_NO_SPLIT) ? WT_EVICT_CALL_NO_SPLIT : 0;
|
||||
FLD_SET(evict_flags, WT_EVICT_CALL_URGENT);
|
||||
|
||||
WT_RET(__wt_hs_cursor_cache(session));
|
||||
WT_RET(__wt_curhs_cache(session));
|
||||
(void)__wt_atomic_addv32(&btree->evict_busy, 1);
|
||||
ret = __wt_evict(session, ref, previous_state, evict_flags);
|
||||
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
|
||||
@ -131,7 +131,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, uint8_t previous_state, uint32
|
||||
/*
|
||||
* Track history store pages being force evicted while holding a history store cursor open.
|
||||
*/
|
||||
if (session->hs_cursor != NULL && WT_IS_HS(session->dhandle)) {
|
||||
if (session->hs_cursor_counter > 0 && WT_IS_HS(session->dhandle)) {
|
||||
force_evict_hs = true;
|
||||
WT_STAT_CONN_INCR(session, cache_eviction_force_hs);
|
||||
}
|
||||
|
12
src/third_party/wiredtiger/src/history/hs_conn.c
vendored
12
src/third_party/wiredtiger/src/history/hs_conn.c
vendored
@ -55,22 +55,20 @@ __hs_cleanup_las(WT_SESSION_IMPL *session)
|
||||
|
||||
/*
|
||||
* __wt_hs_get_btree --
|
||||
* Get the history store btree. Open a history store cursor if needed to get the btree.
|
||||
* Get the history store btree by opening a history store cursor.
|
||||
*/
|
||||
int
|
||||
__wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep)
|
||||
{
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_DECL_RET;
|
||||
|
||||
*hs_btreep = NULL;
|
||||
|
||||
WT_RET(__wt_hs_cursor_open(session));
|
||||
|
||||
*hs_btreep = CUR2BT(session->hs_cursor);
|
||||
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
*hs_btreep = __wt_curhs_get_btree(hs_cursor);
|
||||
WT_ASSERT(session, *hs_btreep != NULL);
|
||||
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
|
236
src/third_party/wiredtiger/src/history/hs_cursor.c
vendored
236
src/third_party/wiredtiger/src/history/hs_cursor.c
vendored
@ -87,117 +87,39 @@ __wt_hs_modify(WT_CURSOR_BTREE *hs_cbt, WT_UPDATE *hs_upd)
|
||||
}
|
||||
|
||||
/*
|
||||
* __hs_cursor_position_int --
|
||||
* Internal function to position a history store cursor at the end of a set of updates for a
|
||||
* given btree id, record key and timestamp.
|
||||
* __wt_hs_upd_time_window --
|
||||
* Get the underlying time window of the update the history store cursor is positioned at.
|
||||
*/
|
||||
static int
|
||||
__hs_cursor_position_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id,
|
||||
const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key)
|
||||
void
|
||||
__wt_hs_upd_time_window(WT_CURSOR *hs_cursor, WT_TIME_WINDOW **twp)
|
||||
{
|
||||
WT_DECL_ITEM(srch_key);
|
||||
WT_DECL_RET;
|
||||
int cmp, exact;
|
||||
WT_CURSOR_BTREE *hs_cbt;
|
||||
|
||||
/* The session should be pointing at the history store btree. */
|
||||
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
|
||||
|
||||
if (user_srch_key == NULL)
|
||||
WT_RET(__wt_scr_alloc(session, 0, &srch_key));
|
||||
else
|
||||
srch_key = user_srch_key;
|
||||
|
||||
/*
|
||||
* Because of the special visibility rules for the history store, a new key can appear in
|
||||
* between our search and the set of updates that we're interested in. Keep trying until we find
|
||||
* it.
|
||||
*
|
||||
* There may be no history store entries for the given btree id and record key if they have been
|
||||
* removed by WT_CONNECTION::rollback_to_stable.
|
||||
*
|
||||
* Note that we need to compare the raw key off the cursor to determine where we are in the
|
||||
* history store as opposed to comparing the embedded data store key since the ordering is not
|
||||
* guaranteed to be the same.
|
||||
*/
|
||||
cursor->set_key(cursor, btree_id, key, timestamp, UINT64_MAX);
|
||||
/* Copy the raw key before searching as a basis for comparison. */
|
||||
WT_ERR(__wt_buf_set(session, srch_key, cursor->key.data, cursor->key.size));
|
||||
WT_ERR(cursor->search_near(cursor, &exact));
|
||||
if (exact > 0) {
|
||||
/*
|
||||
* It's possible that we may race with a history store insert for another key. So we may be
|
||||
* more than one record away the end of our target key/timestamp range. Keep iterating
|
||||
* backwards until we land on our key.
|
||||
*/
|
||||
while ((ret = cursor->prev(cursor)) == 0) {
|
||||
WT_STAT_CONN_DATA_INCR(session, cursor_skip_hs_cur_position);
|
||||
|
||||
WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp));
|
||||
if (cmp <= 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
if (ret == 0) {
|
||||
WT_ERR(__wt_compare(session, NULL, &cursor->key, srch_key, &cmp));
|
||||
WT_ASSERT(session, cmp <= 0);
|
||||
}
|
||||
#endif
|
||||
err:
|
||||
if (user_srch_key == NULL)
|
||||
__wt_scr_free(session, &srch_key);
|
||||
return (ret);
|
||||
hs_cbt = __wt_curhs_get_cbt(hs_cursor);
|
||||
*twp = &hs_cbt->upd_value->tw;
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_hs_cursor_position --
|
||||
* Position a history store cursor at the end of a set of updates for a given btree id, record
|
||||
* key and timestamp. There may be no history store entries for the given btree id and record
|
||||
* key if they have been removed by WT_CONNECTION::rollback_to_stable. There is an optional
|
||||
* argument to store the key that we used to position the cursor which can be used to assess
|
||||
* where the cursor is relative to it. The function executes with isolation level set as
|
||||
* WT_ISO_READ_UNCOMMITTED.
|
||||
* __wt_hs_find_upd --
|
||||
* Scan the history store for a record the btree cursor wants to position on. Create an update
|
||||
* for the record and return to the caller.
|
||||
*/
|
||||
int
|
||||
__wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id,
|
||||
const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
|
||||
WT_WITH_BTREE(session, CUR2BT(cursor),
|
||||
WT_WITH_TXN_ISOLATION(session, WT_ISO_READ_UNCOMMITTED,
|
||||
ret = __hs_cursor_position_int(session, cursor, btree_id, key, timestamp, user_srch_key)));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __hs_find_upd_int --
|
||||
* Internal helper to scan the history store for a record the btree cursor wants to position on.
|
||||
* Create an update for the record and return to the caller. The caller may choose to optionally
|
||||
* allow prepared updates to be returned regardless of whether prepare is being ignored
|
||||
* globally. Otherwise, a prepare conflict will be returned upon reading a prepared update.
|
||||
*/
|
||||
static int
|
||||
__hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
|
||||
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare,
|
||||
WT_ITEM *base_value_buf)
|
||||
__wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
|
||||
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf)
|
||||
{
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_CURSOR_BTREE *hs_cbt;
|
||||
WT_DECL_ITEM(hs_value);
|
||||
WT_DECL_ITEM(orig_hs_value_buf);
|
||||
WT_DECL_RET;
|
||||
WT_ITEM hs_key, recno_key;
|
||||
WT_MODIFY_VECTOR modifies;
|
||||
WT_TXN *txn;
|
||||
WT_TXN_SHARED *txn_shared;
|
||||
WT_UPDATE *mod_upd;
|
||||
wt_timestamp_t durable_timestamp, durable_timestamp_tmp, hs_start_ts, hs_start_ts_tmp;
|
||||
wt_timestamp_t durable_timestamp, durable_timestamp_tmp;
|
||||
wt_timestamp_t hs_stop_durable_ts, hs_stop_durable_ts_tmp, read_timestamp;
|
||||
uint64_t hs_counter, hs_counter_tmp, upd_type_full;
|
||||
uint32_t hs_btree_id;
|
||||
uint64_t upd_type_full;
|
||||
uint8_t *p, recno_key_buf[WT_INTPACK64_MAXSIZE], upd_type;
|
||||
int cmp;
|
||||
bool upd_found;
|
||||
|
||||
hs_cursor = NULL;
|
||||
@ -205,15 +127,11 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
|
||||
orig_hs_value_buf = NULL;
|
||||
WT_CLEAR(hs_key);
|
||||
__wt_modify_vector_init(session, &modifies);
|
||||
txn = session->txn;
|
||||
txn_shared = WT_SESSION_TXN_SHARED(session);
|
||||
upd_found = false;
|
||||
|
||||
WT_STAT_CONN_DATA_INCR(session, cursor_search_hs);
|
||||
|
||||
hs_cursor = session->hs_cursor;
|
||||
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
|
||||
/* Row-store key is as passed to us, create the column-store key as needed. */
|
||||
WT_ASSERT(
|
||||
session, (key == NULL && recno != WT_RECNO_OOB) || (key != NULL && recno == WT_RECNO_OOB));
|
||||
@ -226,70 +144,29 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
|
||||
key->size = WT_PTRDIFF(p, recno_key_buf);
|
||||
}
|
||||
|
||||
/* Allocate buffer for the history store value. */
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
|
||||
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
|
||||
/*
|
||||
* After positioning our cursor, we're stepping backwards to find the correct update. Since the
|
||||
* timestamp is part of the key, our cursor needs to go from the newest record (further in the
|
||||
* history store) to the oldest (earlier in the history store) for a given key.
|
||||
*/
|
||||
read_timestamp = allow_prepare ? txn->prepare_timestamp : txn_shared->read_timestamp;
|
||||
|
||||
/*
|
||||
*
|
||||
* A reader without a timestamp should read the largest timestamp in the range, however cursor
|
||||
* search near if given a 0 timestamp will place at the top of the range and hide the records
|
||||
* below it. As such we need to adjust a 0 timestamp to the timestamp max value.
|
||||
*/
|
||||
if (read_timestamp == WT_TS_NONE)
|
||||
read_timestamp = WT_TS_MAX;
|
||||
read_timestamp =
|
||||
txn_shared->read_timestamp == WT_TS_NONE ? WT_TS_MAX : txn_shared->read_timestamp;
|
||||
|
||||
WT_ERR_NOTFOUND_OK(
|
||||
__wt_hs_cursor_position(session, hs_cursor, btree_id, key, read_timestamp, NULL), true);
|
||||
hs_cursor->set_key(hs_cursor, 4, btree_id, key, read_timestamp, UINT64_MAX);
|
||||
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true);
|
||||
if (ret == WT_NOTFOUND) {
|
||||
ret = 0;
|
||||
goto done;
|
||||
}
|
||||
for (;; ret = __wt_hs_cursor_prev(session, hs_cursor)) {
|
||||
WT_ERR_NOTFOUND_OK(ret, true);
|
||||
/* If we hit the end of the table, let's get out of here. */
|
||||
if (ret == WT_NOTFOUND) {
|
||||
ret = 0;
|
||||
goto done;
|
||||
}
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
|
||||
|
||||
/* Stop before crossing over to the next btree */
|
||||
if (hs_btree_id != btree_id)
|
||||
goto done;
|
||||
|
||||
/*
|
||||
* Keys are sorted in an order, skip the ones before the desired key, and bail out if we
|
||||
* have crossed over the desired key and not found the record we are looking for.
|
||||
*/
|
||||
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
|
||||
if (cmp != 0)
|
||||
goto done;
|
||||
|
||||
/*
|
||||
* If the stop time pair on the tombstone in the history store is already globally visible
|
||||
* we can skip it.
|
||||
*/
|
||||
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
|
||||
WT_STAT_CONN_DATA_INCR(session, cursor_prev_hs_tombstone);
|
||||
continue;
|
||||
}
|
||||
/*
|
||||
* If the stop time point of a record is visible to us, we won't be able to see anything for
|
||||
* this entire key. Just jump straight to the end.
|
||||
*/
|
||||
if (__wt_txn_tw_stop_visible(session, &hs_cbt->upd_value->tw))
|
||||
goto done;
|
||||
/* If the start time point is visible to us, let's return that record. */
|
||||
if (__wt_txn_tw_start_visible(session, &hs_cbt->upd_value->tw))
|
||||
break;
|
||||
}
|
||||
|
||||
/* Allocate buffer for the history store value. */
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
|
||||
WT_ERR(hs_cursor->get_value(
|
||||
hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &upd_type_full, hs_value));
|
||||
upd_type = (uint8_t)upd_type_full;
|
||||
@ -320,6 +197,8 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
|
||||
* visibility checks when reading in order to construct the modify chain, so we can create
|
||||
* the value we expect.
|
||||
*/
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
|
||||
while (upd_type == WT_UPDATE_MODIFY) {
|
||||
WT_ERR(__wt_upd_alloc(session, hs_value, upd_type, &mod_upd, NULL));
|
||||
WT_ERR(__wt_modify_vector_push(&modifies, mod_upd));
|
||||
@ -330,7 +209,7 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
|
||||
* update here we fall back to the datastore version. If its timestamp doesn't match our
|
||||
* timestamp then we return not found.
|
||||
*/
|
||||
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true);
|
||||
WT_ERR_NOTFOUND_OK(hs_cursor->next(hs_cursor), true);
|
||||
if (ret == WT_NOTFOUND) {
|
||||
/*
|
||||
* Fallback to the provided value as the base value.
|
||||
@ -344,47 +223,6 @@ __hs_find_upd_int(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
|
||||
upd_type = WT_UPDATE_STANDARD;
|
||||
break;
|
||||
}
|
||||
hs_start_ts_tmp = WT_TS_NONE;
|
||||
/*
|
||||
* Make sure we use the temporary variants of these variables. We need to retain the
|
||||
* timestamps of the original modify we saw.
|
||||
*
|
||||
* We keep looking back into history store until we find a base update to apply the
|
||||
* reverse deltas on top of.
|
||||
*/
|
||||
WT_ERR(hs_cursor->get_key(
|
||||
hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts_tmp, &hs_counter_tmp));
|
||||
|
||||
if (hs_btree_id != btree_id) {
|
||||
/* Fallback to the provided value as the base value. */
|
||||
orig_hs_value_buf = hs_value;
|
||||
hs_value = base_value_buf;
|
||||
upd_type = WT_UPDATE_STANDARD;
|
||||
break;
|
||||
}
|
||||
|
||||
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
|
||||
|
||||
if (cmp != 0) {
|
||||
/* Fallback to the provided value as the base value. */
|
||||
orig_hs_value_buf = hs_value;
|
||||
hs_value = base_value_buf;
|
||||
upd_type = WT_UPDATE_STANDARD;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the stop time pair on the tombstone in the history store is already globally
|
||||
* visible fall back to the base value. This is possible in scenarios where the latest
|
||||
* updates are aborted by RTS according to stable timestamp.
|
||||
*/
|
||||
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
|
||||
/* Fallback to the provided value as the base value. */
|
||||
orig_hs_value_buf = hs_value;
|
||||
hs_value = base_value_buf;
|
||||
upd_type = WT_UPDATE_STANDARD;
|
||||
break;
|
||||
}
|
||||
|
||||
WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_durable_ts_tmp, &durable_timestamp_tmp,
|
||||
&upd_type_full, hs_value));
|
||||
@ -440,26 +278,8 @@ err:
|
||||
|
||||
WT_ASSERT(session, ret != WT_NOTFOUND);
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_hs_find_upd --
|
||||
* Scan the history store for a record.
|
||||
*/
|
||||
int
|
||||
__wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format, uint64_t recno,
|
||||
WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf)
|
||||
{
|
||||
WT_BTREE *btree;
|
||||
WT_DECL_RET;
|
||||
|
||||
btree = S2BT(session);
|
||||
|
||||
WT_RET(__wt_hs_cursor_open(session));
|
||||
WT_WITH_BTREE(session, CUR2BT(session->hs_cursor),
|
||||
(ret = __hs_find_upd_int(
|
||||
session, btree->id, key, value_format, recno, upd_value, allow_prepare, base_value_buf)));
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
if (hs_cursor != NULL)
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
554
src/third_party/wiredtiger/src/history/hs_rec.c
vendored
554
src/third_party/wiredtiger/src/history/hs_rec.c
vendored
@ -11,8 +11,7 @@
|
||||
static int __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
|
||||
uint32_t btree_id, const WT_ITEM *key, bool reinsert);
|
||||
static int __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
|
||||
WT_BTREE *btree, const WT_ITEM *key, wt_timestamp_t ts, uint64_t *hs_counter,
|
||||
const WT_ITEM *srch_key);
|
||||
WT_BTREE *btree, const WT_ITEM *key, wt_timestamp_t ts, uint64_t *hs_counter);
|
||||
|
||||
/*
|
||||
* __hs_verbose_cache_stats --
|
||||
@ -61,100 +60,17 @@ __hs_verbose_cache_stats(WT_SESSION_IMPL *session, WT_BTREE *btree)
|
||||
}
|
||||
|
||||
/*
|
||||
* __hs_insert_record_with_btree_int --
|
||||
* Internal helper for inserting history store records. If this call is successful, the cursor
|
||||
* parameter will be positioned on the newly inserted record. Otherwise, it will be reset.
|
||||
*/
|
||||
static int
|
||||
__hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint64_t btree_id,
|
||||
const WT_ITEM *key, const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw,
|
||||
uint64_t counter)
|
||||
{
|
||||
WT_CURSOR_BTREE *cbt;
|
||||
WT_DECL_RET;
|
||||
WT_UPDATE *hs_upd, *upd_local;
|
||||
|
||||
cbt = (WT_CURSOR_BTREE *)cursor;
|
||||
hs_upd = upd_local = NULL;
|
||||
|
||||
/* The session should be pointing at the history store btree. */
|
||||
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
|
||||
|
||||
/*
|
||||
* Use WT_CURSOR.set_key and WT_CURSOR.set_value to create key and value items, then use them to
|
||||
* create an update chain for a direct insertion onto the history store page.
|
||||
*/
|
||||
cursor->set_key(cursor, btree_id, key, tw->start_ts, counter);
|
||||
cursor->set_value(cursor, tw->durable_stop_ts, tw->durable_start_ts, (uint64_t)type, hs_value);
|
||||
|
||||
/* Allocate a tombstone only when there is a valid stop time point. */
|
||||
if (WT_TIME_WINDOW_HAS_STOP(tw)) {
|
||||
/*
|
||||
* Insert a delete record to represent stop time point for the actual record to be inserted.
|
||||
* Set the stop time point as the commit time point of the history store delete record.
|
||||
*/
|
||||
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
|
||||
hs_upd->start_ts = tw->stop_ts;
|
||||
hs_upd->durable_ts = tw->durable_stop_ts;
|
||||
hs_upd->txnid = tw->stop_txn;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append to the delete record, the actual record to be inserted into the history store. Set the
|
||||
* current update start time point as the commit time point to the history store record.
|
||||
*/
|
||||
WT_ERR(__wt_upd_alloc(session, &cursor->value, WT_UPDATE_STANDARD, &upd_local, NULL));
|
||||
upd_local->start_ts = tw->start_ts;
|
||||
upd_local->durable_ts = tw->durable_start_ts;
|
||||
upd_local->txnid = tw->start_txn;
|
||||
|
||||
/* Insert the standard update as next update if there is a tombstone. */
|
||||
if (hs_upd != NULL)
|
||||
hs_upd->next = upd_local;
|
||||
else
|
||||
hs_upd = upd_local;
|
||||
|
||||
/* Search the page and insert the updates. */
|
||||
WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(cbt, &cursor->key, true));
|
||||
WT_ERR(ret);
|
||||
WT_ERR(__wt_hs_modify(cbt, hs_upd));
|
||||
|
||||
/*
|
||||
* Since the two updates (tombstone and the standard) will reconcile into a single entry, we are
|
||||
* incrementing the history store insert statistic by one.
|
||||
*/
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_insert);
|
||||
|
||||
err:
|
||||
if (ret != 0) {
|
||||
__wt_free_update_list(session, &hs_upd);
|
||||
|
||||
/*
|
||||
* We did a row search, release the cursor so that the page doesn't continue being held.
|
||||
*
|
||||
* If we were successful, do NOT reset the cursor. We may want to make use of its position
|
||||
* later to remove timestamped entries.
|
||||
*/
|
||||
cursor->reset(cursor);
|
||||
}
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __hs_insert_record_with_btree --
|
||||
* __hs_insert_record --
|
||||
* A helper function to insert the record into the history store including stop time point.
|
||||
* Should be called with session's btree switched to the history store.
|
||||
*/
|
||||
static int
|
||||
__hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree,
|
||||
const WT_ITEM *key, const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw)
|
||||
__hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key,
|
||||
const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw)
|
||||
{
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
WT_CURSOR_BTREE *hs_cbt;
|
||||
#endif
|
||||
WT_DECL_ITEM(hs_key);
|
||||
WT_DECL_ITEM(srch_key);
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
WT_DECL_ITEM(existing_val);
|
||||
#endif
|
||||
@ -164,37 +80,24 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
|
||||
wt_timestamp_t durable_timestamp_diag;
|
||||
wt_timestamp_t hs_stop_durable_ts_diag;
|
||||
uint64_t upd_type_full_diag;
|
||||
int cmp;
|
||||
#endif
|
||||
uint64_t counter, hs_counter;
|
||||
uint32_t hs_btree_id;
|
||||
int cmp;
|
||||
|
||||
counter = 0;
|
||||
|
||||
/* Allocate buffers for the history store and search key. */
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &srch_key));
|
||||
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
/* Allocate buffer for the existing history store value for the same key. */
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &existing_val));
|
||||
hs_cbt = (WT_CURSOR_BTREE *)cursor;
|
||||
hs_cbt = __wt_curhs_get_cbt(cursor);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The session should be pointing at the history store btree since this is the one that we'll be
|
||||
* inserting into. The btree parameter that we're passing in should is the btree that the
|
||||
* history store content is associated with (this is where the btree id part of the history
|
||||
* store key comes from).
|
||||
*/
|
||||
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
|
||||
WT_ASSERT(session, !WT_IS_HS(btree->dhandle));
|
||||
|
||||
/*
|
||||
* Disable bulk loads into history store. This would normally occur when updating a record with
|
||||
* a cursor however the history store doesn't use cursor update, so we do it here.
|
||||
*/
|
||||
__wt_cursor_disable_bulk(session);
|
||||
/* Sanity check that the btree is not a history store btree. */
|
||||
WT_ASSERT(session, !WT_IS_HS(btree));
|
||||
|
||||
/*
|
||||
* Only deltas or full updates should be written to the history store. More specifically, we
|
||||
@ -207,43 +110,33 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
|
||||
* timestamp. Otherwise the newly inserting history store record may fall behind the existing
|
||||
* one, which can lead to the wrong order.
|
||||
*/
|
||||
WT_ERR_NOTFOUND_OK(
|
||||
__wt_hs_cursor_position(session, cursor, btree->id, key, tw->start_ts, srch_key), true);
|
||||
cursor->set_key(cursor, 4, btree->id, key, tw->start_ts, UINT64_MAX);
|
||||
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, cursor), true);
|
||||
|
||||
if (ret == 0) {
|
||||
WT_ERR(cursor->get_key(cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
|
||||
/*
|
||||
* Check whether the existing record is also from the same timestamp.
|
||||
*
|
||||
* Verify simple checks first to confirm whether the retrieved update is the same before
|
||||
* performing the expensive key comparison.
|
||||
*/
|
||||
if (hs_btree_id == btree->id && tw->start_ts == hs_start_ts) {
|
||||
WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp));
|
||||
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
if (cmp == 0) {
|
||||
WT_ERR(cursor->get_value(cursor, &hs_stop_durable_ts_diag, &durable_timestamp_diag,
|
||||
&upd_type_full_diag, existing_val));
|
||||
WT_ERR(__wt_compare(session, NULL, existing_val, hs_value, &cmp));
|
||||
/*
|
||||
* Check if the existing HS value is same as the new value we are about to insert.
|
||||
* We can skip this check if the existing value has a globally visible stop time,
|
||||
* i.e., the value has been deleted from the HS.
|
||||
*/
|
||||
if (cmp == 0)
|
||||
WT_ASSERT(session,
|
||||
(WT_TIME_WINDOW_HAS_STOP(&hs_cbt->upd_value->tw) &&
|
||||
__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) ||
|
||||
tw->start_txn == WT_TXN_NONE ||
|
||||
tw->start_txn != hs_cbt->upd_value->tw.start_txn ||
|
||||
tw->start_ts != hs_cbt->upd_value->tw.start_ts);
|
||||
counter = hs_counter + 1;
|
||||
}
|
||||
#else
|
||||
if (tw->start_ts == hs_start_ts) {
|
||||
WT_ERR(cursor->get_value(cursor, &hs_stop_durable_ts_diag, &durable_timestamp_diag,
|
||||
&upd_type_full_diag, existing_val));
|
||||
WT_ERR(__wt_compare(session, NULL, existing_val, hs_value, &cmp));
|
||||
/*
|
||||
* We shouldn't be inserting the same value again for the key unless coming from a
|
||||
* different transaction. If the updates are from the same transaction, the start
|
||||
* timestamp for each update should be different.
|
||||
*/
|
||||
if (cmp == 0)
|
||||
counter = hs_counter + 1;
|
||||
#endif
|
||||
WT_ASSERT(session,
|
||||
tw->start_txn == WT_TXN_NONE ||
|
||||
tw->start_txn != hs_cbt->upd_value->tw.start_txn ||
|
||||
tw->start_ts != hs_cbt->upd_value->tw.start_ts);
|
||||
counter = hs_counter + 1;
|
||||
}
|
||||
#else
|
||||
if (tw->start_ts == hs_start_ts)
|
||||
counter = hs_counter + 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@ -251,10 +144,20 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
|
||||
* updates, we should remove them and reinsert them at the current timestamp.
|
||||
*/
|
||||
if (tw->start_ts != WT_TS_NONE) {
|
||||
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, cursor), true);
|
||||
/*
|
||||
* If there were no keys equal to or less than our target key, we would have received
|
||||
* WT_NOTFOUND. In that case we need to search again with a higher timestamp as the cursor
|
||||
* would not be positioned correctly.
|
||||
*/
|
||||
if (ret == 0)
|
||||
WT_ERR_NOTFOUND_OK(cursor->next(cursor), true);
|
||||
else {
|
||||
cursor->set_key(cursor, 3, btree->id, key, tw->start_ts + 1);
|
||||
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, cursor), true);
|
||||
}
|
||||
if (ret == 0)
|
||||
WT_ERR(__hs_fixup_out_of_order_from_pos(
|
||||
session, cursor, btree, key, tw->start_ts, &counter, srch_key));
|
||||
session, cursor, btree, key, tw->start_ts, &counter));
|
||||
}
|
||||
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
@ -270,36 +173,20 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* The tree structure can change while we try to insert the mod list, retry if that happens. */
|
||||
while ((ret = __hs_insert_record_with_btree_int(
|
||||
session, cursor, btree->id, key, type, hs_value, tw, counter)) == WT_RESTART)
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_insert_restart);
|
||||
|
||||
/* Insert the new record now. */
|
||||
cursor->set_key(cursor, 4, btree->id, key, tw->start_ts, counter);
|
||||
cursor->set_value(
|
||||
cursor, tw, tw->durable_stop_ts, tw->durable_start_ts, (uint64_t)type, hs_value);
|
||||
WT_ERR(cursor->insert(cursor));
|
||||
WT_STAT_CONN_INCR(session, cache_hs_insert);
|
||||
WT_STAT_DATA_INCR(session, cache_hs_insert);
|
||||
|
||||
err:
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
__wt_scr_free(session, &existing_val);
|
||||
#endif
|
||||
__wt_scr_free(session, &hs_key);
|
||||
__wt_scr_free(session, &srch_key);
|
||||
/* We did a row search, release the cursor so that the page doesn't continue being held. */
|
||||
cursor->reset(cursor);
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __hs_insert_record --
|
||||
* Temporarily switches to history store btree and calls the helper routine to insert records.
|
||||
*/
|
||||
static int
|
||||
__hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key,
|
||||
const uint8_t type, const WT_ITEM *hs_value, WT_TIME_WINDOW *tw)
|
||||
{
|
||||
WT_CURSOR_BTREE *cbt;
|
||||
WT_DECL_RET;
|
||||
|
||||
cbt = (WT_CURSOR_BTREE *)cursor;
|
||||
WT_WITH_BTREE(session, CUR2BT(cbt),
|
||||
ret = __hs_insert_record_with_btree(session, cursor, btree, key, type, hs_value, tw));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@ -346,8 +233,8 @@ __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_MODIFY_VECTOR *modifies,
|
||||
int
|
||||
__wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
|
||||
{
|
||||
WT_BTREE *btree;
|
||||
WT_CURSOR *cursor;
|
||||
WT_BTREE *btree, *hs_btree;
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_DECL_ITEM(full_value);
|
||||
WT_DECL_ITEM(key);
|
||||
WT_DECL_ITEM(modify_value);
|
||||
@ -372,10 +259,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
|
||||
bool enable_reverse_modify, hs_inserted, squashed, ts_updates_in_hs;
|
||||
|
||||
btree = S2BT(session);
|
||||
cursor = session->hs_cursor;
|
||||
prev_upd = NULL;
|
||||
insert_cnt = 0;
|
||||
WT_TIME_WINDOW_INIT(&tw);
|
||||
|
||||
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
|
||||
__wt_modify_vector_init(session, &modifies);
|
||||
|
||||
if (!btree->hs_entries)
|
||||
@ -560,13 +450,15 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
|
||||
if (oldest_upd->type == WT_UPDATE_TOMBSTONE && oldest_upd == first_non_ts_upd &&
|
||||
!F_ISSET(first_non_ts_upd, WT_UPDATE_CLEARED_HS)) {
|
||||
/* We can only delete history store entries that have timestamps. */
|
||||
WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1, true));
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_non_ts);
|
||||
WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, key, 1, true));
|
||||
WT_STAT_CONN_INCR(session, cache_hs_key_truncate_non_ts);
|
||||
WT_STAT_DATA_INCR(session, cache_hs_key_truncate_non_ts);
|
||||
F_SET(first_non_ts_upd, WT_UPDATE_CLEARED_HS);
|
||||
} else if (first_non_ts_upd != NULL && !F_ISSET(first_non_ts_upd, WT_UPDATE_CLEARED_HS) &&
|
||||
(list->ins == NULL || ts_updates_in_hs)) {
|
||||
WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1, true));
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_non_ts);
|
||||
WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, key, 1, true));
|
||||
WT_STAT_CONN_INCR(session, cache_hs_key_truncate_non_ts);
|
||||
WT_STAT_DATA_INCR(session, cache_hs_key_truncate_non_ts);
|
||||
F_SET(first_non_ts_upd, WT_UPDATE_CLEARED_HS);
|
||||
}
|
||||
|
||||
@ -704,13 +596,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
|
||||
enable_reverse_modify &&
|
||||
__wt_calc_modify(session, prev_full_value, full_value, prev_full_value->size / 10,
|
||||
entries, &nentries) == 0) {
|
||||
WT_ERR(__wt_modify_pack(cursor, entries, nentries, &modify_value));
|
||||
WT_ERR(__wt_modify_pack(hs_cursor, entries, nentries, &modify_value));
|
||||
WT_ERR(__hs_insert_record(
|
||||
session, cursor, btree, key, WT_UPDATE_MODIFY, modify_value, &tw));
|
||||
session, hs_cursor, btree, key, WT_UPDATE_MODIFY, modify_value, &tw));
|
||||
__wt_scr_free(session, &modify_value);
|
||||
} else
|
||||
WT_ERR(__hs_insert_record(
|
||||
session, cursor, btree, key, WT_UPDATE_STANDARD, full_value, &tw));
|
||||
session, hs_cursor, btree, key, WT_UPDATE_STANDARD, full_value, &tw));
|
||||
|
||||
/* Flag the update as now in the history store. */
|
||||
F_SET(upd, WT_UPDATE_HS);
|
||||
@ -730,7 +622,8 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi)
|
||||
|
||||
WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size));
|
||||
WT_STAT_CONN_SET(session, cache_hs_ondisk, hs_size);
|
||||
max_hs_size = CUR2BT(cursor)->file_max;
|
||||
hs_btree = __wt_curhs_get_btree(hs_cursor);
|
||||
max_hs_size = hs_btree->file_max;
|
||||
if (max_hs_size != 0 && (uint64_t)hs_size > max_hs_size)
|
||||
WT_ERR_PANIC(session, WT_PANIC,
|
||||
"WiredTigerHS: file size of %" PRIu64 " exceeds maximum size %" PRIu64, (uint64_t)hs_size,
|
||||
@ -747,71 +640,8 @@ err:
|
||||
__wt_modify_vector_free(&modifies);
|
||||
__wt_scr_free(session, &full_value);
|
||||
__wt_scr_free(session, &prev_full_value);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __hs_delete_key_from_ts_int --
|
||||
* Internal helper for deleting history store content of a given key from a timestamp.
|
||||
*/
|
||||
static int
|
||||
__hs_delete_key_from_ts_int(
|
||||
WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
|
||||
{
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_DECL_ITEM(srch_key);
|
||||
WT_DECL_RET;
|
||||
WT_ITEM hs_key;
|
||||
wt_timestamp_t hs_start_ts;
|
||||
uint64_t hs_counter;
|
||||
uint32_t hs_btree_id;
|
||||
int cmp, exact;
|
||||
|
||||
/* The session should be pointing at the history store btree. */
|
||||
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
|
||||
|
||||
hs_cursor = session->hs_cursor;
|
||||
WT_RET(__wt_scr_alloc(session, 0, &srch_key));
|
||||
|
||||
hs_cursor->set_key(hs_cursor, btree_id, key, ts, 0);
|
||||
WT_ERR(__wt_buf_set(session, srch_key, hs_cursor->key.data, hs_cursor->key.size));
|
||||
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_search_near(session, hs_cursor, &exact), true);
|
||||
/* Empty history store is fine. */
|
||||
if (ret == WT_NOTFOUND)
|
||||
goto done;
|
||||
/*
|
||||
* If we raced with a history store insert, we may be two or more records away from our target.
|
||||
* Keep iterating forwards until we are on or past our target key.
|
||||
*
|
||||
* We can't use the cursor positioning helper that we use for regular reads since that will
|
||||
* place us at the end of a particular key/timestamp range whereas we want to be placed at the
|
||||
* beginning.
|
||||
*/
|
||||
if (exact < 0) {
|
||||
while ((ret = __wt_hs_cursor_next(session, hs_cursor)) == 0) {
|
||||
WT_ERR(__wt_compare(session, NULL, &hs_cursor->key, srch_key, &cmp));
|
||||
if (cmp >= 0)
|
||||
break;
|
||||
}
|
||||
/* No entries greater than or equal to the key we searched for. */
|
||||
WT_ERR_NOTFOUND_OK(ret, true);
|
||||
if (ret == WT_NOTFOUND)
|
||||
goto done;
|
||||
}
|
||||
/* Bailing out here also means we have no history store records for our key. */
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
|
||||
if (hs_btree_id != btree_id)
|
||||
goto done;
|
||||
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
|
||||
if (cmp != 0)
|
||||
goto done;
|
||||
|
||||
WT_ASSERT(session, ts == WT_TS_NONE || hs_start_ts != WT_TS_NONE);
|
||||
WT_ERR(__hs_delete_key_from_pos(session, hs_cursor, btree_id, key, reinsert));
|
||||
done:
|
||||
ret = 0;
|
||||
err:
|
||||
__wt_scr_free(session, &srch_key);
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@ -820,22 +650,29 @@ err:
|
||||
* Delete history store content of a given key from a timestamp.
|
||||
*/
|
||||
int
|
||||
__wt_hs_delete_key_from_ts(
|
||||
WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
|
||||
__wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id,
|
||||
const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
bool hs_read_committed;
|
||||
|
||||
/* If the operation can't open new handles, it should have figured that out before here. */
|
||||
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
|
||||
hs_read_committed = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
if (!hs_read_committed)
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
|
||||
/* The tree structure can change while we try to insert the mod list, retry if that happens. */
|
||||
do {
|
||||
WT_WITH_BTREE(session, CUR2BT(session->hs_cursor),
|
||||
(ret = __hs_delete_key_from_ts_int(session, btree_id, key, ts, reinsert)));
|
||||
if (ret == WT_RESTART)
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_insert_restart);
|
||||
} while (ret == WT_RESTART);
|
||||
hs_cursor->set_key(hs_cursor, 3, btree_id, key, ts);
|
||||
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_after(session, hs_cursor), true);
|
||||
/* Empty history store is fine. */
|
||||
if (ret == WT_NOTFOUND) {
|
||||
ret = 0;
|
||||
goto done;
|
||||
}
|
||||
|
||||
WT_ERR(__hs_delete_key_from_pos(session, hs_cursor, btree_id, key, reinsert));
|
||||
done:
|
||||
err:
|
||||
if (!hs_read_committed)
|
||||
F_CLR(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@ -847,31 +684,29 @@ __wt_hs_delete_key_from_ts(
|
||||
*/
|
||||
static int
|
||||
__hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_BTREE *btree,
|
||||
const WT_ITEM *key, wt_timestamp_t ts, uint64_t *counter, const WT_ITEM *srch_key)
|
||||
const WT_ITEM *key, wt_timestamp_t ts, uint64_t *counter)
|
||||
{
|
||||
WT_CURSOR *insert_cursor;
|
||||
WT_CURSOR *hs_insert_cursor;
|
||||
WT_CURSOR_BTREE *hs_cbt;
|
||||
WT_DECL_RET;
|
||||
WT_ITEM hs_key, hs_value;
|
||||
WT_TIME_WINDOW tw;
|
||||
WT_UPDATE *tombstone;
|
||||
wt_timestamp_t hs_ts, hs_start_durable_ts, hs_stop_durable_ts;
|
||||
WT_TIME_WINDOW tw, hs_insert_tw;
|
||||
wt_timestamp_t hs_ts;
|
||||
uint64_t hs_counter, hs_upd_type;
|
||||
uint32_t hs_btree_id;
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
int cmp;
|
||||
#endif
|
||||
char ts_string[5][WT_TS_INT_STRING_SIZE];
|
||||
const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL};
|
||||
|
||||
insert_cursor = NULL;
|
||||
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
hs_insert_cursor = NULL;
|
||||
hs_cbt = __wt_curhs_get_cbt(hs_cursor);
|
||||
WT_CLEAR(hs_key);
|
||||
WT_CLEAR(hs_value);
|
||||
WT_TIME_WINDOW_INIT(&tw);
|
||||
tombstone = NULL;
|
||||
|
||||
/* The session should be pointing at the history store btree. */
|
||||
WT_ASSERT(session, WT_IS_HS((S2BT(session))->dhandle));
|
||||
|
||||
#ifndef HAVE_DIAGNOSTIC
|
||||
WT_UNUSED(key);
|
||||
#endif
|
||||
/*
|
||||
* Position ourselves at the beginning of the key range that we may have to fixup. Prior to
|
||||
* getting here, we've positioned our cursor at the end of a key/timestamp range and then done a
|
||||
@ -881,15 +716,15 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
|
||||
* to keep doing "next" until we've got a key greater than the one we attempted to position
|
||||
* ourselves with.
|
||||
*/
|
||||
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
|
||||
/*
|
||||
* Prior to getting here, we've done a "search near" on our key for the timestamp we're
|
||||
* inserting and then a "next". In the regular case, our cursor will be positioned on the
|
||||
* next key and we'll break out of the first iteration in one of the conditions below.
|
||||
*/
|
||||
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
|
||||
/* We shouldn't have crossed the btree and user key search space. */
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
|
||||
WT_ERR(__wt_compare(session, NULL, &hs_cursor->key, srch_key, &cmp));
|
||||
if (cmp > 0)
|
||||
WT_ASSERT(session, hs_btree_id == btree->id);
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
|
||||
WT_ASSERT(session, cmp == 0);
|
||||
#endif
|
||||
if (hs_ts > ts)
|
||||
break;
|
||||
}
|
||||
if (ret == WT_NOTFOUND)
|
||||
@ -916,27 +751,14 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
|
||||
* 2 foo 3 2 ccc
|
||||
* 2 foo 3 3 ddd
|
||||
*/
|
||||
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
|
||||
/*
|
||||
* Prior to getting here, we've done a "search near" on our key for the timestamp we're
|
||||
* inserting and then a "next". In the regular case, our cursor will be positioned on the
|
||||
* next key and we'll break out of the first iteration in one of the conditions below.
|
||||
*/
|
||||
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
|
||||
/* We shouldn't have crossed the btree and user key search space. */
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter));
|
||||
if (hs_btree_id != btree->id)
|
||||
break;
|
||||
|
||||
WT_ASSERT(session, hs_btree_id == btree->id);
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
|
||||
if (cmp != 0)
|
||||
break;
|
||||
/*
|
||||
* If the stop time pair on the tombstone in the history store is already globally visible
|
||||
* we can skip it.
|
||||
*/
|
||||
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
|
||||
WT_STAT_CONN_DATA_INCR(session, cursor_next_hs_tombstone);
|
||||
continue;
|
||||
}
|
||||
WT_ASSERT(session, cmp == 0);
|
||||
#endif
|
||||
/*
|
||||
* If we got here, we've got out-of-order updates in the history store.
|
||||
*
|
||||
@ -950,11 +772,8 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
|
||||
* Don't incur the overhead of opening this new cursor unless we need it. In the regular
|
||||
* case, we'll never get here.
|
||||
*/
|
||||
if (insert_cursor == NULL) {
|
||||
WT_WITHOUT_DHANDLE(session,
|
||||
ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &insert_cursor));
|
||||
WT_ERR(ret);
|
||||
}
|
||||
if (hs_insert_cursor == NULL)
|
||||
WT_ERR(__wt_curhs_open(session, NULL, &hs_insert_cursor));
|
||||
|
||||
/*
|
||||
* If these history store records are resolved prepared updates, their durable timestamps
|
||||
@ -973,47 +792,38 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
|
||||
__wt_timestamp_to_string(hs_cbt->upd_value->tw.durable_stop_ts, ts_string[3]),
|
||||
__wt_timestamp_to_string(ts, ts_string[4]));
|
||||
|
||||
tw.start_ts = tw.durable_start_ts = ts;
|
||||
tw.start_txn = hs_cbt->upd_value->tw.start_txn;
|
||||
hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ts;
|
||||
hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn;
|
||||
|
||||
/*
|
||||
* We're going to be inserting something immediately after with the same timestamp. Either
|
||||
* another moved update OR the update itself that triggered the correction. In either case,
|
||||
* we should preserve the stop transaction id.
|
||||
*/
|
||||
tw.stop_ts = tw.durable_stop_ts = ts;
|
||||
tw.stop_txn = hs_cbt->upd_value->tw.stop_txn;
|
||||
hs_insert_tw.stop_ts = hs_insert_tw.durable_stop_ts = ts;
|
||||
hs_insert_tw.stop_txn = hs_cbt->upd_value->tw.stop_txn;
|
||||
|
||||
/* Extract the underlying value for reinsertion. */
|
||||
WT_ERR(hs_cursor->get_value(
|
||||
hs_cursor, &hs_stop_durable_ts, &hs_start_durable_ts, &hs_upd_type, &hs_value));
|
||||
hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value));
|
||||
|
||||
/* Reinsert entry with earlier timestamp. */
|
||||
while ((ret = __hs_insert_record_with_btree_int(session, insert_cursor, btree->id, key,
|
||||
(uint8_t)hs_upd_type, &hs_value, &tw, *counter)) == WT_RESTART)
|
||||
;
|
||||
WT_ERR(ret);
|
||||
/* Insert the value back with different timestamps. */
|
||||
hs_insert_cursor->set_key(hs_insert_cursor, 4, btree->id, &hs_key, ts, *counter);
|
||||
hs_insert_cursor->set_value(hs_insert_cursor, &hs_insert_tw, hs_insert_tw.durable_stop_ts,
|
||||
hs_insert_tw.durable_start_ts, (uint64_t)hs_upd_type, &hs_value);
|
||||
WT_ERR(hs_insert_cursor->insert(hs_insert_cursor));
|
||||
++(*counter);
|
||||
|
||||
/* Delete entry with higher timestamp. */
|
||||
hs_cbt->compare = 0;
|
||||
WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, NULL));
|
||||
tombstone->txnid = WT_TXN_NONE;
|
||||
tombstone->start_ts = tombstone->durable_ts = WT_TS_NONE;
|
||||
while ((ret = __wt_hs_modify(hs_cbt, tombstone)) == WT_RESTART) {
|
||||
WT_WITH_PAGE_INDEX(session, ret = __wt_hs_row_search(hs_cbt, &hs_cursor->key, false));
|
||||
WT_ERR(ret);
|
||||
}
|
||||
WT_ERR(ret);
|
||||
tombstone = NULL;
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_order_fixup_move);
|
||||
/* Delete the entry with higher timestamp. */
|
||||
WT_ERR(hs_cursor->remove(hs_cursor));
|
||||
WT_STAT_CONN_INCR(session, cache_hs_order_fixup_move);
|
||||
WT_STAT_DATA_INCR(session, cache_hs_order_fixup_move);
|
||||
}
|
||||
if (ret == WT_NOTFOUND)
|
||||
ret = 0;
|
||||
err:
|
||||
__wt_free(session, tombstone);
|
||||
if (insert_cursor != NULL)
|
||||
insert_cursor->close(insert_cursor);
|
||||
if (hs_insert_cursor != NULL)
|
||||
hs_insert_cursor->close(hs_insert_cursor);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@ -1027,26 +837,21 @@ static int
|
||||
__hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id,
|
||||
const WT_ITEM *key, bool reinsert)
|
||||
{
|
||||
WT_CURSOR *insert_cursor;
|
||||
WT_CURSOR *hs_insert_cursor;
|
||||
WT_CURSOR_BTREE *hs_cbt;
|
||||
WT_DECL_RET;
|
||||
WT_ITEM hs_key, hs_value;
|
||||
WT_TIME_WINDOW tw;
|
||||
WT_UPDATE *upd;
|
||||
WT_TIME_WINDOW hs_insert_tw;
|
||||
wt_timestamp_t durable_timestamp, hs_start_ts, hs_stop_durable_ts;
|
||||
uint64_t hs_counter, hs_insert_counter, hs_upd_type;
|
||||
uint32_t hs_btree_id;
|
||||
int cmp;
|
||||
const char *open_cursor_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL};
|
||||
|
||||
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
hs_cbt = __wt_curhs_get_cbt(hs_cursor);
|
||||
hs_insert_counter = 0;
|
||||
WT_CLEAR(hs_key);
|
||||
WT_CLEAR(hs_value);
|
||||
WT_TIME_WINDOW_INIT(&tw);
|
||||
upd = NULL;
|
||||
insert_cursor = NULL;
|
||||
|
||||
hs_insert_cursor = NULL;
|
||||
if (reinsert) {
|
||||
/*
|
||||
* Determine the starting value of our counter, i.e. highest counter value of the timestamp
|
||||
@ -1056,90 +861,60 @@ __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_
|
||||
* The cursor will also be positioned at the start of the range that we wish to start
|
||||
* inserting.
|
||||
*/
|
||||
WT_WITHOUT_DHANDLE(session,
|
||||
ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &insert_cursor));
|
||||
WT_WITHOUT_DHANDLE(session, ret = __wt_curhs_open(session, NULL, &hs_insert_cursor));
|
||||
WT_ERR(ret);
|
||||
F_SET(insert_cursor, WT_CURSTD_IGNORE_TOMBSTONE);
|
||||
WT_ERR_NOTFOUND_OK(
|
||||
__wt_hs_cursor_position(session, insert_cursor, btree_id, key, WT_TS_NONE, NULL), true);
|
||||
F_SET(hs_insert_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
hs_insert_cursor->set_key(hs_insert_cursor, 4, btree_id, key, WT_TS_NONE, UINT64_MAX);
|
||||
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_insert_cursor), true);
|
||||
|
||||
if (ret == WT_NOTFOUND) {
|
||||
hs_insert_counter = 0;
|
||||
ret = 0;
|
||||
} else {
|
||||
WT_ERR(insert_cursor->get_key(
|
||||
insert_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_insert_counter));
|
||||
WT_ERR(hs_insert_cursor->get_key(
|
||||
hs_insert_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_insert_counter));
|
||||
WT_ASSERT(session, hs_start_ts == WT_TS_NONE);
|
||||
/*
|
||||
* Increment the hs counter that we'll be using to insert with to avoid overwriting the
|
||||
* record we just found.
|
||||
* Increment the history store counter that we'll be using to insert with to avoid
|
||||
* overwriting the record we just found.
|
||||
*/
|
||||
hs_insert_counter++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Begin iterating over the range of entries we expect to replace. */
|
||||
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
|
||||
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter));
|
||||
/*
|
||||
* If the btree id or key isn't ours, that means that we've hit the end of the key range and
|
||||
* that there is no more history store content for this key.
|
||||
*/
|
||||
if (hs_btree_id != btree_id)
|
||||
break;
|
||||
WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp));
|
||||
if (cmp != 0)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If the stop time pair on the tombstone in the history store is already globally visible
|
||||
* we can skip it.
|
||||
*/
|
||||
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
|
||||
WT_STAT_CONN_DATA_INCR(session, cursor_next_hs_tombstone);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Once we reinsert the entry below, we're not allowed to fail otherwise we'll be leaving
|
||||
* our history store in an invalid state. Anything that can potentially fail, such as heap
|
||||
* allocation of the tombstone that we'll be using to remove the old value, should be
|
||||
* performed before reinsertion.
|
||||
*/
|
||||
WT_ERR(__wt_upd_alloc_tombstone(session, &upd, NULL));
|
||||
|
||||
if (reinsert) {
|
||||
WT_ERR(hs_cursor->get_value(
|
||||
hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &hs_upd_type, &hs_value));
|
||||
|
||||
tw.start_ts = tw.durable_start_ts = WT_TS_NONE;
|
||||
tw.start_txn = hs_cbt->upd_value->tw.start_txn;
|
||||
|
||||
tw.stop_ts = tw.durable_stop_ts = WT_TS_NONE;
|
||||
tw.stop_txn = hs_cbt->upd_value->tw.stop_txn;
|
||||
|
||||
/* Reinsert entry with zero timestamp. */
|
||||
while (
|
||||
(ret = __hs_insert_record_with_btree_int(session, insert_cursor, btree_id, &hs_key,
|
||||
(uint8_t)hs_upd_type, &hs_value, &tw, hs_insert_counter)) == WT_RESTART)
|
||||
;
|
||||
hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = WT_TS_NONE;
|
||||
hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn;
|
||||
|
||||
hs_insert_tw.stop_ts = hs_insert_tw.durable_stop_ts = WT_TS_NONE;
|
||||
hs_insert_tw.stop_txn = hs_cbt->upd_value->tw.stop_txn;
|
||||
|
||||
hs_insert_cursor->set_key(
|
||||
hs_insert_cursor, 4, btree_id, key, WT_TS_NONE, hs_insert_counter);
|
||||
hs_insert_cursor->set_value(hs_insert_cursor, &hs_insert_tw, WT_TS_NONE, WT_TS_NONE,
|
||||
(uint64_t)hs_upd_type, &hs_value);
|
||||
WT_ERR(hs_insert_cursor->insert(hs_insert_cursor));
|
||||
WT_STAT_CONN_INCR(session, cache_hs_insert);
|
||||
WT_STAT_DATA_INCR(session, cache_hs_insert);
|
||||
|
||||
hs_insert_counter++;
|
||||
WT_ERR(ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* Since we're using internal functions to modify the row structure, we need to manually set
|
||||
* the comparison to an exact match.
|
||||
*/
|
||||
hs_cbt->compare = 0;
|
||||
/*
|
||||
* Append a globally visible tombstone to the update list. This will effectively make the
|
||||
* value invisible and the key itself will eventually get removed during reconciliation.
|
||||
* Remove the key using history store cursor interface.
|
||||
*
|
||||
* If anything fails after this point and we're reinserting we need to panic as it will
|
||||
* leave our history store in an unexpected state with duplicate entries.
|
||||
*/
|
||||
upd->txnid = WT_TXN_NONE;
|
||||
upd->start_ts = upd->durable_ts = WT_TS_NONE;
|
||||
if ((ret = __wt_hs_modify(hs_cbt, upd)) != 0) {
|
||||
if ((ret = hs_cursor->remove(hs_cursor)) != 0) {
|
||||
if (reinsert)
|
||||
WT_ERR_PANIC(session, WT_PANIC,
|
||||
"Failed to insert tombstone, history store now "
|
||||
@ -1147,14 +922,13 @@ __hs_delete_key_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_
|
||||
else
|
||||
WT_ERR(ret);
|
||||
}
|
||||
upd = NULL;
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate);
|
||||
WT_STAT_CONN_INCR(session, cache_hs_key_truncate);
|
||||
WT_STAT_DATA_INCR(session, cache_hs_key_truncate);
|
||||
}
|
||||
if (ret == WT_NOTFOUND)
|
||||
ret = 0;
|
||||
err:
|
||||
__wt_free(session, upd);
|
||||
if (insert_cursor != NULL)
|
||||
insert_cursor->close(insert_cursor);
|
||||
if (hs_insert_cursor != NULL)
|
||||
hs_insert_cursor->close(hs_insert_cursor);
|
||||
return (ret);
|
||||
}
|
||||
|
@ -15,10 +15,9 @@
|
||||
* store.
|
||||
*/
|
||||
static int
|
||||
__hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_btree_id)
|
||||
__hs_verify_id(
|
||||
WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_CURSOR_BTREE *ds_cbt, uint32_t this_btree_id)
|
||||
{
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_CURSOR_BTREE *hs_cbt;
|
||||
WT_DECL_ITEM(prev_key);
|
||||
WT_DECL_RET;
|
||||
WT_ITEM key;
|
||||
@ -27,12 +26,14 @@ __hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_
|
||||
uint32_t btree_id;
|
||||
int cmp;
|
||||
|
||||
hs_cursor = session->hs_cursor;
|
||||
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
WT_CLEAR(key);
|
||||
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &prev_key));
|
||||
|
||||
#ifndef HAVE_DIAGNOSTIC
|
||||
WT_UNUSED(this_btree_id);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If using standard cursors, we need to skip the non-globally visible tombstones in the data
|
||||
* table to verify the corresponding entries in the history store are also present in the data
|
||||
@ -46,27 +47,18 @@ __hs_verify_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *ds_cbt, uint32_t this_
|
||||
* verify. When we return after moving to a new key the caller is responsible for keeping the
|
||||
* cursor there or deciding they're done.
|
||||
*/
|
||||
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &btree_id, &key, &hs_start_ts, &hs_counter));
|
||||
|
||||
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
|
||||
/*
|
||||
* If the btree id does not match the previous one, we're done. It is up to the caller to set
|
||||
* up for the next tree and call us, if they choose. For a full history store walk, the
|
||||
* caller sends in WT_BTREE_ID_INVALID and this function will set and use the first btree id
|
||||
* it finds and will return once it walks off that tree, leaving the cursor set to the first
|
||||
* key of that new tree.
|
||||
*
|
||||
* We should never cross the btree id, assert if we do so.
|
||||
*/
|
||||
if (btree_id != this_btree_id)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If the stop time pair on the tombstone in the history store is already globally visible
|
||||
* we can skip it.
|
||||
*/
|
||||
if (__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw)) {
|
||||
WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone);
|
||||
continue;
|
||||
}
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &btree_id, &key, &hs_start_ts, &hs_counter));
|
||||
WT_ASSERT(session, btree_id == this_btree_id);
|
||||
|
||||
/*
|
||||
* If we have already checked against this key, keep going to the next key. We only need to
|
||||
@ -114,22 +106,14 @@ __wt_hs_verify_one(WT_SESSION_IMPL *session)
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_CURSOR_BTREE ds_cbt;
|
||||
WT_DECL_RET;
|
||||
WT_ITEM hs_key;
|
||||
uint32_t btree_id;
|
||||
int exact;
|
||||
|
||||
hs_cursor = session->hs_cursor;
|
||||
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
btree_id = S2BT(session)->id;
|
||||
|
||||
/*
|
||||
* We are required to position the history store cursor. Set it to the first record of our btree
|
||||
* in the history store.
|
||||
*/
|
||||
memset(&hs_key, 0, sizeof(hs_key));
|
||||
hs_cursor->set_key(hs_cursor, btree_id, &hs_key, 0, 0);
|
||||
ret = __wt_hs_cursor_search_near(session, hs_cursor, &exact);
|
||||
if (ret == 0 && exact < 0)
|
||||
ret = __wt_hs_cursor_next(session, hs_cursor);
|
||||
hs_cursor->set_key(hs_cursor, 1, btree_id);
|
||||
WT_ERR(__wt_curhs_search_near_after(session, hs_cursor));
|
||||
|
||||
/*
|
||||
* If we positioned the cursor there is something to verify.
|
||||
@ -141,9 +125,12 @@ __wt_hs_verify_one(WT_SESSION_IMPL *session)
|
||||
if (ret == 0) {
|
||||
__wt_btcur_init(session, &ds_cbt);
|
||||
__wt_btcur_open(&ds_cbt);
|
||||
ret = __hs_verify_id(session, &ds_cbt, btree_id);
|
||||
ret = __hs_verify_id(session, hs_cursor, &ds_cbt, btree_id);
|
||||
WT_TRET(__wt_btcur_close(&ds_cbt, false));
|
||||
}
|
||||
|
||||
err:
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
return (ret == WT_NOTFOUND ? 0 : ret);
|
||||
}
|
||||
|
||||
@ -173,10 +160,10 @@ __wt_hs_verify(WT_SESSION_IMPL *session)
|
||||
btree_id = WT_BTREE_ID_INVALID;
|
||||
uri_data = NULL;
|
||||
|
||||
WT_RET(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &buf));
|
||||
WT_ERR(__wt_hs_cursor_open(session));
|
||||
hs_cursor = session->hs_cursor;
|
||||
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_next(session, hs_cursor), true);
|
||||
WT_ERR_NOTFOUND_OK(hs_cursor->next(hs_cursor), true);
|
||||
stop = ret == WT_NOTFOUND ? true : false;
|
||||
ret = 0;
|
||||
|
||||
@ -198,17 +185,16 @@ __wt_hs_verify(WT_SESSION_IMPL *session)
|
||||
}
|
||||
WT_ERR(__wt_open_cursor(session, uri_data, NULL, NULL, &ds_cursor));
|
||||
F_SET(ds_cursor, WT_CURSOR_RAW_OK);
|
||||
ret = __hs_verify_id(session, (WT_CURSOR_BTREE *)ds_cursor, btree_id);
|
||||
ret = __hs_verify_id(session, hs_cursor, (WT_CURSOR_BTREE *)ds_cursor, btree_id);
|
||||
if (ret == WT_NOTFOUND)
|
||||
stop = true;
|
||||
WT_TRET(ds_cursor->close(ds_cursor));
|
||||
WT_ERR_NOTFOUND_OK(ret, false);
|
||||
}
|
||||
err:
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
|
||||
__wt_scr_free(session, &buf);
|
||||
WT_ASSERT(session, key.mem == NULL && key.memsize == 0);
|
||||
__wt_free(session, uri_data);
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
return (ret);
|
||||
}
|
||||
|
90
src/third_party/wiredtiger/src/include/api.h
vendored
90
src/third_party/wiredtiger/src/include/api.h
vendored
@ -36,32 +36,32 @@
|
||||
WT_DATA_HANDLE *__olddh = (s)->dhandle; \
|
||||
const char *__oldname; \
|
||||
/* If this isn't an API reentry, the name should be NULL and the counter should be 0. */ \
|
||||
WT_ASSERT(session, (s)->name != NULL || s->api_call_counter == 0); \
|
||||
WT_ASSERT(session, (s)->name != NULL || (s)->api_call_counter == 0); \
|
||||
__oldname = (s)->name; \
|
||||
++s->api_call_counter; \
|
||||
++(s)->api_call_counter; \
|
||||
(s)->dhandle = (dh); \
|
||||
(s)->name = (s)->lastop = #h "." #n
|
||||
#define API_SESSION_POP(s) \
|
||||
(s)->dhandle = __olddh; \
|
||||
(s)->name = __oldname; \
|
||||
--s->api_call_counter
|
||||
--(s)->api_call_counter
|
||||
|
||||
/* Standard entry points to the API: declares/initializes local variables. */
|
||||
#define API_SESSION_INIT(s, h, n, dh) \
|
||||
WT_TRACK_OP_DECL; \
|
||||
API_SESSION_PUSH(s, h, n, dh); \
|
||||
/* \
|
||||
* No code before this line, otherwise error handling won't be \
|
||||
* correct. \
|
||||
*/ \
|
||||
WT_ERR(WT_SESSION_CHECK_PANIC(s)); \
|
||||
WT_SINGLE_THREAD_CHECK_START(s); \
|
||||
WT_TRACK_OP_INIT(s); \
|
||||
if (s->api_call_counter == 1 && !F_ISSET(s, WT_SESSION_INTERNAL)) \
|
||||
__wt_op_timer_start(s); \
|
||||
/* Reset wait time if this isn't an API reentry. */ \
|
||||
if (s->api_call_counter == 1) \
|
||||
(s)->cache_wait_us = 0; \
|
||||
#define API_SESSION_INIT(s, h, n, dh) \
|
||||
WT_TRACK_OP_DECL; \
|
||||
API_SESSION_PUSH(s, h, n, dh); \
|
||||
/* \
|
||||
* No code before this line, otherwise error handling won't be \
|
||||
* correct. \
|
||||
*/ \
|
||||
WT_ERR(WT_SESSION_CHECK_PANIC(s)); \
|
||||
WT_SINGLE_THREAD_CHECK_START(s); \
|
||||
WT_TRACK_OP_INIT(s); \
|
||||
if ((s)->api_call_counter == 1 && !F_ISSET(s, WT_SESSION_INTERNAL)) \
|
||||
__wt_op_timer_start(s); \
|
||||
/* Reset wait time if this isn't an API reentry. */ \
|
||||
if ((s)->api_call_counter == 1) \
|
||||
(s)->cache_wait_us = 0; \
|
||||
__wt_verbose((s), WT_VERB_API, "%s", "CALL: " #h ":" #n)
|
||||
|
||||
#define API_CALL_NOCONF(s, h, n, dh) \
|
||||
@ -75,21 +75,26 @@
|
||||
if ((config) != NULL) \
|
||||
WT_ERR(__wt_config_check((s), WT_CONFIG_REF(session, h##_##n), (config), 0))
|
||||
|
||||
#define API_END(s, ret) \
|
||||
if ((s) != NULL) { \
|
||||
WT_TRACK_OP_END(s); \
|
||||
WT_SINGLE_THREAD_CHECK_STOP(s); \
|
||||
if ((ret) != 0) \
|
||||
__wt_txn_err_set(s, ret); \
|
||||
if (s->api_call_counter == 1 && !F_ISSET(session, WT_SESSION_INTERNAL)) \
|
||||
__wt_op_timer_stop(s); \
|
||||
/* \
|
||||
* No code after this line, otherwise error handling \
|
||||
* won't be correct. \
|
||||
*/ \
|
||||
API_SESSION_POP(s); \
|
||||
} \
|
||||
} \
|
||||
#define API_END(s, ret) \
|
||||
if ((s) != NULL) { \
|
||||
WT_TRACK_OP_END(s); \
|
||||
WT_SINGLE_THREAD_CHECK_STOP(s); \
|
||||
if ((ret) != 0) \
|
||||
__wt_txn_err_set(s, ret); \
|
||||
if ((s)->api_call_counter == 1 && !F_ISSET(session, WT_SESSION_INTERNAL)) \
|
||||
__wt_op_timer_stop(s); \
|
||||
/* \
|
||||
* We should not leave any history store cursor open when returning from an API call. \
|
||||
* However, we cannot do a stricter check before WT-7247 is resolved. \
|
||||
*/ \
|
||||
WT_ASSERT(s, (s)->api_call_counter > 1 || (s)->hs_cursor_counter <= 2); \
|
||||
/* \
|
||||
* No code after this line, otherwise error handling \
|
||||
* won't be correct. \
|
||||
*/ \
|
||||
API_SESSION_POP(s); \
|
||||
} \
|
||||
} \
|
||||
while (0)
|
||||
|
||||
/* An API call wrapped in a transaction if necessary. */
|
||||
@ -188,13 +193,15 @@
|
||||
SESSION_API_PREPARE_CHECK(s, WT_SESSION, n); \
|
||||
API_CALL_NOCONF(s, WT_SESSION, n, NULL)
|
||||
|
||||
#define SESSION_API_PREPARE_CHECK(s, h, n) \
|
||||
do { \
|
||||
int __prepare_ret; \
|
||||
API_SESSION_PUSH(s, WT_SESSION, n, NULL); \
|
||||
__prepare_ret = __wt_txn_context_prepare_check(s); \
|
||||
API_SESSION_POP(s); \
|
||||
WT_RET(__prepare_ret); \
|
||||
#define SESSION_API_PREPARE_CHECK(s, h, n) \
|
||||
do { \
|
||||
if ((s)->api_call_counter == 0) { \
|
||||
int __prepare_ret; \
|
||||
API_SESSION_PUSH(s, WT_SESSION, n, NULL); \
|
||||
__prepare_ret = __wt_txn_context_prepare_check(s); \
|
||||
API_SESSION_POP(s); \
|
||||
WT_RET(__prepare_ret); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define SESSION_API_CALL(s, n, config, cfg) \
|
||||
@ -209,8 +216,7 @@
|
||||
|
||||
#define CURSOR_API_CALL(cur, s, n, bt) \
|
||||
(s) = (WT_SESSION_IMPL *)(cur)->session; \
|
||||
if ((s)->hs_cursor == NULL) \
|
||||
SESSION_API_PREPARE_CHECK(s, WT_CURSOR, n); \
|
||||
SESSION_API_PREPARE_CHECK(s, WT_CURSOR, n); \
|
||||
API_CALL_NOCONF(s, WT_CURSOR, n, ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \
|
||||
if (F_ISSET(cur, WT_CURSTD_CACHED)) \
|
||||
WT_ERR(__wt_cursor_cached(cur))
|
||||
|
@ -288,7 +288,7 @@ struct __wt_cursor_hs {
|
||||
WT_CURSOR *file_cursor; /* Queries of regular history store data */
|
||||
WT_TIME_WINDOW time_window;
|
||||
uint32_t btree_id;
|
||||
WT_ITEM datastore_key;
|
||||
WT_ITEM *datastore_key;
|
||||
|
||||
/* AUTOMATIC FLAG VALUE GENERATION START */
|
||||
#define WT_HS_CUR_BTREE_ID_SET 0x1u
|
||||
|
@ -6,6 +6,32 @@
|
||||
* See the file LICENSE for redistribution information.
|
||||
*/
|
||||
|
||||
/*
|
||||
* __wt_curhs_get_btree --
|
||||
* Convert a history store cursor to the underlying btree.
|
||||
*/
|
||||
static inline WT_BTREE *
|
||||
__wt_curhs_get_btree(WT_CURSOR *cursor)
|
||||
{
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
|
||||
return (CUR2BT(hs_cursor->file_cursor));
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_curhs_get_cbt --
|
||||
* Convert a history store cursor to the underlying btree cursor.
|
||||
*/
|
||||
static inline WT_CURSOR_BTREE *
|
||||
__wt_curhs_get_cbt(WT_CURSOR *cursor)
|
||||
{
|
||||
WT_CURSOR_HS *hs_cursor;
|
||||
hs_cursor = (WT_CURSOR_HS *)cursor;
|
||||
|
||||
return ((WT_CURSOR_BTREE *)hs_cursor->file_cursor);
|
||||
}
|
||||
|
||||
/*
|
||||
* __cursor_set_recno --
|
||||
* The cursor value in the interface has to track the value in the underlying cursor, update
|
||||
|
36
src/third_party/wiredtiger/src/include/extern.h
vendored
36
src/third_party/wiredtiger/src/include/extern.h
vendored
@ -495,8 +495,14 @@ extern int __wt_curfile_next_random(WT_CURSOR *cursor)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
|
||||
const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_curhs_cache(WT_SESSION_IMPL *session)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_curhs_search_near_after(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_curhs_search_near_before(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
|
||||
const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx,
|
||||
@ -596,7 +602,7 @@ extern int __wt_debug_addr_print(WT_SESSION_IMPL *session, const uint8_t *addr,
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_debug_cursor_page(void *cursor_arg, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE(
|
||||
(visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_debug_cursor_tree_hs(void *cursor_arg, const char *ofile)
|
||||
extern int __wt_debug_cursor_tree_hs(void *session_arg, const char *ofile)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default")))
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_debug_disk(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
|
||||
@ -750,26 +756,11 @@ extern int __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_cursor_cache(WT_SESSION_IMPL *session)
|
||||
extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor,
|
||||
uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_cursor_close(WT_SESSION_IMPL *session)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_cursor_next(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_cursor_open(WT_SESSION_IMPL *session)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id,
|
||||
const WT_ITEM *key, wt_timestamp_t timestamp, WT_ITEM *user_srch_key)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_cursor_prev(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_cursor_search_near(WT_SESSION_IMPL *session, WT_CURSOR *cursor, int *exactp)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_delete_key_from_ts(
|
||||
WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_format,
|
||||
uint64_t recno, WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *base_value_buf)
|
||||
extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key,
|
||||
const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
@ -1716,6 +1707,7 @@ extern void __wt_gen_next(WT_SESSION_IMPL *session, int which, uint64_t *genp);
|
||||
extern void __wt_gen_next_drain(WT_SESSION_IMPL *session, int which);
|
||||
extern void __wt_hazard_close(WT_SESSION_IMPL *session);
|
||||
extern void __wt_hs_close(WT_SESSION_IMPL *session);
|
||||
extern void __wt_hs_upd_time_window(WT_CURSOR *hs_cursor, WT_TIME_WINDOW **twp);
|
||||
extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg);
|
||||
extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor);
|
||||
extern void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
|
||||
@ -1820,8 +1812,12 @@ extern void __wt_verbose_worker(WT_SESSION_IMPL *session, const char *fmt, ...)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((format(printf, 2, 3))) WT_GCC_FUNC_DECL_ATTRIBUTE((cold));
|
||||
extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
|
||||
extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
|
||||
static inline WT_BTREE *__wt_curhs_get_btree(WT_CURSOR *cursor)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
static inline WT_CELL *__wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
static inline WT_CURSOR_BTREE *__wt_curhs_get_cbt(WT_CURSOR *cursor)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
static inline WT_IKEY *__wt_ref_key_instantiated(WT_REF *ref)
|
||||
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
|
||||
static inline WT_VISIBLE_TYPE __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
|
||||
|
@ -92,7 +92,7 @@ struct __wt_session_impl {
|
||||
WT_COMPACT_STATE *compact; /* Compaction information */
|
||||
enum { WT_COMPACT_NONE = 0, WT_COMPACT_RUNNING, WT_COMPACT_SUCCESS } compact_state;
|
||||
|
||||
WT_CURSOR *hs_cursor; /* History store table cursor */
|
||||
u_int hs_cursor_counter; /* Number of open history store cursors */
|
||||
|
||||
WT_CURSOR *meta_cursor; /* Metadata file */
|
||||
void *meta_track; /* Metadata operation tracking */
|
||||
|
@ -451,10 +451,8 @@ struct __wt_connection_stats {
|
||||
int64_t cursor_modify_bytes;
|
||||
int64_t cursor_modify_bytes_touch;
|
||||
int64_t cursor_next;
|
||||
int64_t cursor_next_hs_tombstone_rts;
|
||||
int64_t cursor_restart;
|
||||
int64_t cursor_prev;
|
||||
int64_t cursor_prev_hs_tombstone_rts;
|
||||
int64_t cursor_remove;
|
||||
int64_t cursor_remove_bytes;
|
||||
int64_t cursor_reserve;
|
||||
|
@ -1044,8 +1044,8 @@ retry:
|
||||
/* If there's no visible update in the update chain or ondisk, check the history store file. */
|
||||
if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(session->dhandle, WT_DHANDLE_HS)) {
|
||||
__wt_timing_stress(session, WT_TIMING_STRESS_HS_SEARCH);
|
||||
WT_RET(__wt_hs_find_upd(session, key, cbt->iface.value_format, recno, cbt->upd_value, false,
|
||||
&cbt->upd_value->buf));
|
||||
WT_RET(__wt_hs_find_upd(session, S2BT(session)->id, key, cbt->iface.value_format, recno,
|
||||
cbt->upd_value, &cbt->upd_value->buf));
|
||||
}
|
||||
|
||||
/*
|
||||
|
700
src/third_party/wiredtiger/src/include/wiredtiger.in
vendored
700
src/third_party/wiredtiger/src/include/wiredtiger.in
vendored
File diff suppressed because it is too large
Load Diff
@ -703,6 +703,7 @@ __wt_rec_row_leaf(
|
||||
WT_BTREE *btree;
|
||||
WT_CELL *cell;
|
||||
WT_CELL_UNPACK_KV *kpack, _kpack, *vpack, _vpack;
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_CURSOR_BTREE *cbt;
|
||||
WT_DECL_ITEM(tmpkey);
|
||||
WT_DECL_RET;
|
||||
@ -720,6 +721,7 @@ __wt_rec_row_leaf(
|
||||
void *copy;
|
||||
|
||||
btree = S2BT(session);
|
||||
hs_cursor = NULL;
|
||||
page = pageref->page;
|
||||
slvg_skip = salvage == NULL ? 0 : salvage->skip;
|
||||
WT_TIME_WINDOW_INIT(&tw);
|
||||
@ -914,11 +916,19 @@ __wt_rec_row_leaf(
|
||||
* ever need to blow away history store content, so we can skip this.
|
||||
*/
|
||||
if (!F_ISSET(session, WT_SESSION_NO_DATA_HANDLES)) {
|
||||
WT_ERR(__wt_hs_cursor_open(session));
|
||||
/*
|
||||
* FIXME-WT-7053: we will hit the dhandle deadlock if we open multiple
|
||||
* history store cursors in reconciliation. Once it is fixed, we can move
|
||||
* the open and close of the history store cursor inside the delete key
|
||||
* function.
|
||||
*/
|
||||
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
WT_ERR(__wt_hs_delete_key_from_ts(
|
||||
session, btree->id, tmpkey, WT_TS_NONE, false));
|
||||
WT_ERR(__wt_hs_cursor_close(session));
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_onpage_removal);
|
||||
session, hs_cursor, btree->id, tmpkey, WT_TS_NONE, false));
|
||||
WT_ERR(hs_cursor->close(hs_cursor));
|
||||
hs_cursor = NULL;
|
||||
WT_STAT_CONN_INCR(session, cache_hs_key_truncate_onpage_removal);
|
||||
WT_STAT_DATA_INCR(session, cache_hs_key_truncate_onpage_removal);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1034,6 +1044,8 @@ leaf_insert:
|
||||
ret = __wt_rec_split_finish(session, r);
|
||||
|
||||
err:
|
||||
if (hs_cursor != NULL)
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
__wt_scr_free(session, &tmpkey);
|
||||
return (ret);
|
||||
}
|
||||
|
@ -2289,8 +2289,6 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
|
||||
if (i == r->multi_next)
|
||||
return (0);
|
||||
|
||||
WT_RET(__wt_hs_cursor_open(session));
|
||||
|
||||
for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i)
|
||||
if (multi->supd != NULL) {
|
||||
WT_ERR(__wt_hs_insert_updates(session, r->page, multi));
|
||||
@ -2302,7 +2300,6 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r)
|
||||
}
|
||||
|
||||
err:
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
|
@ -542,6 +542,9 @@ __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, co
|
||||
{
|
||||
WT_DECL_RET;
|
||||
|
||||
/* We should not open other cursors when there are open history store cursors in the session. */
|
||||
WT_ASSERT(session, strcmp(uri, WT_HS_URI) == 0 || session->hs_cursor_counter == 0);
|
||||
|
||||
/* We do not cache cursors on subordinate tables/files. */
|
||||
if (owner == NULL) {
|
||||
if ((ret = __wt_cursor_cache_get(session, uri, NULL, cfg, cursorp)) == 0)
|
||||
|
14
src/third_party/wiredtiger/src/support/stat.c
vendored
14
src/third_party/wiredtiger/src/support/stat.c
vendored
@ -208,7 +208,8 @@ static const char *const __stats_dsrc_desc[] = {
|
||||
"session: flush_tier operation calls",
|
||||
"session: tiered storage local retention time (secs)",
|
||||
"transaction: race to read prepared update retry",
|
||||
"transaction: rollback to stable hs records with stop timestamps older than newer records",
|
||||
"transaction: rollback to stable history store records with stop timestamps older than newer "
|
||||
"records",
|
||||
"transaction: rollback to stable inconsistent checkpoint",
|
||||
"transaction: rollback to stable keys removed",
|
||||
"transaction: rollback to stable keys restored",
|
||||
@ -1098,12 +1099,8 @@ static const char *const __stats_connection_desc[] = {
|
||||
"cursor: cursor modify key and value bytes affected",
|
||||
"cursor: cursor modify value bytes modified",
|
||||
"cursor: cursor next calls",
|
||||
"cursor: cursor next calls that skip due to a globally visible history store tombstone in "
|
||||
"rollback to stable",
|
||||
"cursor: cursor operation restarted",
|
||||
"cursor: cursor prev calls",
|
||||
"cursor: cursor prev calls that skip due to a globally visible history store tombstone in "
|
||||
"rollback to stable",
|
||||
"cursor: cursor remove calls",
|
||||
"cursor: cursor remove key bytes removed",
|
||||
"cursor: cursor reserve calls",
|
||||
@ -1437,7 +1434,8 @@ static const char *const __stats_connection_desc[] = {
|
||||
"session: flush_tier operation calls",
|
||||
"session: tiered storage local retention time (secs)",
|
||||
"transaction: race to read prepared update retry",
|
||||
"transaction: rollback to stable hs records with stop timestamps older than newer records",
|
||||
"transaction: rollback to stable history store records with stop timestamps older than newer "
|
||||
"records",
|
||||
"transaction: rollback to stable inconsistent checkpoint",
|
||||
"transaction: rollback to stable keys removed",
|
||||
"transaction: rollback to stable keys restored",
|
||||
@ -1625,10 +1623,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
|
||||
stats->cursor_modify_bytes = 0;
|
||||
stats->cursor_modify_bytes_touch = 0;
|
||||
stats->cursor_next = 0;
|
||||
stats->cursor_next_hs_tombstone_rts = 0;
|
||||
stats->cursor_restart = 0;
|
||||
stats->cursor_prev = 0;
|
||||
stats->cursor_prev_hs_tombstone_rts = 0;
|
||||
stats->cursor_remove = 0;
|
||||
stats->cursor_remove_bytes = 0;
|
||||
stats->cursor_reserve = 0;
|
||||
@ -2139,10 +2135,8 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
|
||||
to->cursor_modify_bytes += WT_STAT_READ(from, cursor_modify_bytes);
|
||||
to->cursor_modify_bytes_touch += WT_STAT_READ(from, cursor_modify_bytes_touch);
|
||||
to->cursor_next += WT_STAT_READ(from, cursor_next);
|
||||
to->cursor_next_hs_tombstone_rts += WT_STAT_READ(from, cursor_next_hs_tombstone_rts);
|
||||
to->cursor_restart += WT_STAT_READ(from, cursor_restart);
|
||||
to->cursor_prev += WT_STAT_READ(from, cursor_prev);
|
||||
to->cursor_prev_hs_tombstone_rts += WT_STAT_READ(from, cursor_prev_hs_tombstone_rts);
|
||||
to->cursor_remove += WT_STAT_READ(from, cursor_remove);
|
||||
to->cursor_remove_bytes += WT_STAT_READ(from, cursor_remove_bytes);
|
||||
to->cursor_reserve += WT_STAT_READ(from, cursor_reserve);
|
||||
|
144
src/third_party/wiredtiger/src/txn/txn.c
vendored
144
src/third_party/wiredtiger/src/txn/txn.c
vendored
@ -721,76 +721,27 @@ __wt_txn_release(WT_SESSION_IMPL *session)
|
||||
* Append the update older than the prepared update to the update chain
|
||||
*/
|
||||
static int
|
||||
__txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *key, WT_PAGE *page,
|
||||
__txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_PAGE *page,
|
||||
WT_UPDATE *chain, bool commit, WT_UPDATE **fix_updp, bool *upd_appended)
|
||||
{
|
||||
WT_CURSOR_BTREE *hs_cbt;
|
||||
WT_DECL_ITEM(hs_key);
|
||||
WT_DECL_ITEM(hs_value);
|
||||
WT_DECL_RET;
|
||||
WT_TIME_WINDOW *hs_tw;
|
||||
WT_UPDATE *tombstone, *upd;
|
||||
wt_timestamp_t durable_ts, hs_start_ts, hs_stop_durable_ts;
|
||||
wt_timestamp_t durable_ts, hs_stop_durable_ts;
|
||||
size_t size, total_size;
|
||||
uint64_t hs_counter, type_full;
|
||||
uint32_t hs_btree_id;
|
||||
int cmp;
|
||||
uint64_t type_full;
|
||||
char ts_string[2][WT_TS_INT_STRING_SIZE];
|
||||
|
||||
WT_ASSERT(session, chain != NULL);
|
||||
|
||||
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
*fix_updp = NULL;
|
||||
*upd_appended = false;
|
||||
size = total_size = 0;
|
||||
tombstone = upd = NULL;
|
||||
|
||||
/* Allocate buffers for the data store and history store key. */
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
|
||||
WT_ERR(__wt_scr_alloc(session, 0, &hs_value));
|
||||
|
||||
for (; ret == 0; ret = __wt_hs_cursor_prev(session, hs_cursor)) {
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
|
||||
|
||||
/* Stop before crossing over to the next btree */
|
||||
if (hs_btree_id != S2BT(session)->id) {
|
||||
ret = WT_NOTFOUND;
|
||||
goto done;
|
||||
}
|
||||
|
||||
/*
|
||||
* Keys are sorted in an order, skip the ones before the desired key, and bail out if we
|
||||
* have crossed over the desired key and not found the record we are looking for.
|
||||
*/
|
||||
WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp));
|
||||
if (cmp != 0) {
|
||||
ret = WT_NOTFOUND;
|
||||
goto done;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the stop time pair on the tombstone in the history store is already globally visible
|
||||
* we can skip it.
|
||||
*/
|
||||
if (!__wt_txn_tw_stop_visible_all(session, &hs_cbt->upd_value->tw))
|
||||
break;
|
||||
else
|
||||
WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone);
|
||||
}
|
||||
|
||||
/* We walked off the top of the history store. */
|
||||
if (ret == WT_NOTFOUND)
|
||||
goto done;
|
||||
WT_ERR(ret);
|
||||
|
||||
/*
|
||||
* As part of the history store search, we never get an exact match based on our search criteria
|
||||
* as we always search for a maximum record for that key. Make sure that we set the comparison
|
||||
* result as an exact match to remove this key as part of rollback to stable. In case we
|
||||
* don't mark the comparison result as same, later the __wt_row_modify function will not
|
||||
* properly remove the update from history store.
|
||||
*/
|
||||
hs_cbt->compare = 0;
|
||||
|
||||
/* Get current value. */
|
||||
WT_ERR(hs_cursor->get_value(hs_cursor, &hs_stop_durable_ts, &durable_ts, &type_full, hs_value));
|
||||
|
||||
@ -799,15 +750,16 @@ __txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *
|
||||
|
||||
/*
|
||||
* If the history update already has a stop time point and we are committing the prepared update
|
||||
* there is no work to do.
|
||||
* there is no work to do. This happens if a deleted key is reinserted by a prepared update.
|
||||
*/
|
||||
if (hs_stop_durable_ts != WT_TS_MAX && commit)
|
||||
goto done;
|
||||
|
||||
__wt_hs_upd_time_window(hs_cursor, &hs_tw);
|
||||
WT_ERR(__wt_upd_alloc(session, hs_value, WT_UPDATE_STANDARD, &upd, &size));
|
||||
upd->txnid = hs_cbt->upd_value->tw.start_txn;
|
||||
upd->durable_ts = hs_cbt->upd_value->tw.durable_start_ts;
|
||||
upd->start_ts = hs_cbt->upd_value->tw.start_ts;
|
||||
upd->txnid = hs_tw->start_txn;
|
||||
upd->durable_ts = hs_tw->durable_start_ts;
|
||||
upd->start_ts = hs_tw->start_ts;
|
||||
*fix_updp = upd;
|
||||
|
||||
/*
|
||||
@ -831,11 +783,11 @@ __txn_append_hs_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *
|
||||
|
||||
/* If the history store record has a valid stop time point, append it. */
|
||||
if (hs_stop_durable_ts != WT_TS_MAX) {
|
||||
WT_ASSERT(session, hs_cbt->upd_value->tw.stop_ts != WT_TS_MAX);
|
||||
WT_ASSERT(session, hs_tw->stop_ts != WT_TS_MAX);
|
||||
WT_ERR(__wt_upd_alloc(session, NULL, WT_UPDATE_TOMBSTONE, &tombstone, &size));
|
||||
tombstone->durable_ts = hs_cbt->upd_value->tw.durable_stop_ts;
|
||||
tombstone->start_ts = hs_cbt->upd_value->tw.stop_ts;
|
||||
tombstone->txnid = hs_cbt->upd_value->tw.stop_txn;
|
||||
tombstone->durable_ts = hs_tw->durable_stop_ts;
|
||||
tombstone->start_ts = hs_tw->stop_ts;
|
||||
tombstone->txnid = hs_tw->stop_txn;
|
||||
tombstone->next = upd;
|
||||
/*
|
||||
* Set the flag to indicate that this update has been restored from history store for the
|
||||
@ -873,7 +825,6 @@ err:
|
||||
__wt_free_update_list(session, &upd);
|
||||
}
|
||||
done:
|
||||
__wt_scr_free(session, &hs_key);
|
||||
__wt_scr_free(session, &hs_value);
|
||||
return (ret);
|
||||
}
|
||||
@ -958,15 +909,18 @@ static int
|
||||
__txn_fixup_prepared_update(
|
||||
WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_UPDATE *fix_upd, bool commit)
|
||||
{
|
||||
WT_CURSOR_BTREE *hs_cbt;
|
||||
WT_DECL_RET;
|
||||
WT_ITEM hs_value;
|
||||
WT_TIME_WINDOW tw;
|
||||
WT_TXN *txn;
|
||||
WT_UPDATE *hs_upd;
|
||||
uint32_t txn_flags;
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
uint64_t hs_upd_type;
|
||||
wt_timestamp_t hs_durable_ts, hs_stop_durable_ts;
|
||||
#endif
|
||||
|
||||
hs_cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
txn = session->txn;
|
||||
WT_TIME_WINDOW_INIT(&tw);
|
||||
|
||||
/*
|
||||
* Transaction error and prepare are cleared temporarily as cursor functions are not allowed
|
||||
@ -982,33 +936,34 @@ __txn_fixup_prepared_update(
|
||||
* If the history update already has a stop time point and we are committing the prepared update
|
||||
* there is no work to do.
|
||||
*/
|
||||
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
|
||||
if (commit) {
|
||||
hs_upd->start_ts = txn->commit_timestamp;
|
||||
hs_upd->durable_ts = txn->durable_timestamp;
|
||||
hs_upd->txnid = txn->id;
|
||||
tw.stop_ts = txn->commit_timestamp;
|
||||
tw.durable_stop_ts = txn->durable_timestamp;
|
||||
tw.stop_txn = txn->id;
|
||||
WT_TIME_WINDOW_SET_START(&tw, fix_upd);
|
||||
|
||||
hs_value.data = fix_upd->data;
|
||||
hs_value.size = fix_upd->size;
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
/* Retrieve the existing update value and stop timestamp. */
|
||||
WT_ERR(hs_cursor->get_value(
|
||||
hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &hs_upd_type, &hs_value));
|
||||
WT_ASSERT(session, hs_stop_durable_ts == WT_TS_MAX);
|
||||
WT_ASSERT(session, (uint8_t)hs_upd_type == WT_UPDATE_STANDARD);
|
||||
#endif
|
||||
/*
|
||||
* We need to update the stop durable timestamp stored in the history store value.
|
||||
*
|
||||
* Pack the value using cursor api.
|
||||
*/
|
||||
hs_cursor->set_value(hs_cursor, txn->durable_timestamp, fix_upd->durable_ts,
|
||||
(uint64_t)fix_upd->type, &hs_value);
|
||||
WT_ERR(__wt_upd_alloc(session, &hs_cursor->value, WT_UPDATE_STANDARD, &hs_upd->next, NULL));
|
||||
hs_upd->next->durable_ts = fix_upd->durable_ts;
|
||||
hs_upd->next->start_ts = fix_upd->start_ts;
|
||||
hs_upd->next->txnid = fix_upd->txnid;
|
||||
hs_value.data = fix_upd->data;
|
||||
hs_value.size = fix_upd->size;
|
||||
hs_cursor->set_value(hs_cursor, &tw, tw.durable_stop_ts, tw.durable_start_ts,
|
||||
(uint64_t)WT_UPDATE_STANDARD, &hs_value);
|
||||
WT_ERR(hs_cursor->update(hs_cursor));
|
||||
} else {
|
||||
WT_ERR(hs_cursor->remove(hs_cursor));
|
||||
}
|
||||
|
||||
WT_ERR(__wt_hs_modify(hs_cbt, hs_upd));
|
||||
|
||||
if (0) {
|
||||
err:
|
||||
__wt_free_update_list(session, &hs_upd);
|
||||
}
|
||||
F_SET(txn, txn_flags);
|
||||
|
||||
return (ret);
|
||||
@ -1128,22 +1083,15 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
|
||||
cbt = (WT_CURSOR_BTREE *)(*cursorp);
|
||||
hs_btree_id = S2BT(session)->id;
|
||||
/* Open a history store table cursor. */
|
||||
WT_ERR(__wt_hs_cursor_open(session));
|
||||
hs_cursor = session->hs_cursor;
|
||||
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
|
||||
/*
|
||||
* Scan the history store for the given btree and key with maximum start timestamp to let
|
||||
* the search point to the last version of the key.
|
||||
*/
|
||||
WT_ERR_NOTFOUND_OK(__wt_hs_cursor_position(
|
||||
session, hs_cursor, hs_btree_id, &op->u.op_row.key, WT_TS_MAX, NULL),
|
||||
true);
|
||||
|
||||
if (ret == 0)
|
||||
/* Not found if we cross the tree or key boundary. */
|
||||
WT_ERR_NOTFOUND_OK(__txn_append_hs_record(session, hs_cursor, &op->u.op_row.key,
|
||||
cbt->ref->page, upd, commit, &fix_upd, &upd_appended),
|
||||
true);
|
||||
hs_cursor->set_key(hs_cursor, 4, hs_btree_id, &op->u.op_row.key, WT_TS_MAX, UINT64_MAX);
|
||||
WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true);
|
||||
if (ret == WT_NOTFOUND && !commit) {
|
||||
/*
|
||||
* Allocate a tombstone and prepend it to the row so when we reconcile the update chain
|
||||
@ -1156,7 +1104,10 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
|
||||
__wt_row_modify(cbt, &cbt->iface.key, NULL, tombstone, WT_UPDATE_INVALID, false));
|
||||
WT_ERR(ret);
|
||||
tombstone = NULL;
|
||||
} else
|
||||
} else if (ret == 0)
|
||||
WT_ERR(__txn_append_hs_record(
|
||||
session, hs_cursor, cbt->ref->page, upd, commit, &fix_upd, &upd_appended));
|
||||
else
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
@ -1212,15 +1163,14 @@ __txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_TXN_OP *op, bool commit,
|
||||
* Fix the history store contents if they exist, when there are no more updates in the update
|
||||
* list. Only in eviction, it is possible to write an unfinished history store update when the
|
||||
* prepared updates are written to the data store. When the page is read back into memory, there
|
||||
* will be only one uncommitted prepared update. There can be a false positive of fixing history
|
||||
* store when handling prepared inserts, but it doesn't cost much.
|
||||
* will be only one uncommitted prepared update.
|
||||
*/
|
||||
if (fix_upd != NULL)
|
||||
WT_ERR(__txn_fixup_prepared_update(session, hs_cursor, fix_upd, commit));
|
||||
|
||||
err:
|
||||
if (hs_cursor != NULL)
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
if (!upd_appended)
|
||||
__wt_free(session, fix_upd);
|
||||
__wt_free(session, tombstone);
|
||||
|
@ -276,18 +276,17 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
{
|
||||
WT_CELL_UNPACK_KV *unpack, _unpack;
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_CURSOR_BTREE *cbt;
|
||||
WT_DECL_ITEM(hs_key);
|
||||
WT_DECL_ITEM(hs_value);
|
||||
WT_DECL_ITEM(key);
|
||||
WT_DECL_RET;
|
||||
WT_ITEM full_value;
|
||||
WT_UPDATE *hs_upd, *tombstone, *upd;
|
||||
WT_TIME_WINDOW *hs_tw;
|
||||
WT_UPDATE *tombstone, *upd;
|
||||
wt_timestamp_t hs_durable_ts, hs_start_ts, hs_stop_durable_ts, newer_hs_durable_ts;
|
||||
uint64_t hs_counter, type_full;
|
||||
uint32_t hs_btree_id;
|
||||
uint8_t type;
|
||||
int cmp;
|
||||
char ts_string[4][WT_TS_INT_STRING_SIZE];
|
||||
bool valid_update_found;
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
@ -295,7 +294,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
#endif
|
||||
|
||||
hs_cursor = NULL;
|
||||
hs_upd = tombstone = upd = NULL;
|
||||
tombstone = upd = NULL;
|
||||
hs_durable_ts = hs_start_ts = hs_stop_durable_ts = WT_TS_NONE;
|
||||
hs_btree_id = S2BT(session)->id;
|
||||
WT_CLEAR(full_value);
|
||||
@ -319,9 +318,13 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
newer_hs_durable_ts = unpack->tw.durable_start_ts;
|
||||
|
||||
/* Open a history store table cursor. */
|
||||
WT_ERR(__wt_hs_cursor_open(session));
|
||||
hs_cursor = session->hs_cursor;
|
||||
cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
/*
|
||||
* Rollback-to-stable operates exclusively (i.e., it is the only active operation in the system)
|
||||
* outside the constraints of transactions. Therefore, there is no need for snapshot based
|
||||
* visibility checks.
|
||||
*/
|
||||
F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED);
|
||||
|
||||
/*
|
||||
* Scan the history store for the given btree and key with maximum start timestamp to let the
|
||||
@ -330,40 +333,11 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
* into data store and removed from history store. If none of the history store records satisfy
|
||||
* the given timestamp, the key is removed from data store.
|
||||
*/
|
||||
ret = __wt_hs_cursor_position(session, hs_cursor, hs_btree_id, key, WT_TS_MAX, NULL);
|
||||
for (; ret == 0; ret = __wt_hs_cursor_prev(session, hs_cursor)) {
|
||||
hs_cursor->set_key(hs_cursor, 4, hs_btree_id, key, WT_TS_MAX, UINT64_MAX);
|
||||
ret = __wt_curhs_search_near_before(session, hs_cursor);
|
||||
for (; ret == 0; ret = hs_cursor->prev(hs_cursor)) {
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
|
||||
|
||||
/* Stop before crossing over to the next btree */
|
||||
if (hs_btree_id != S2BT(session)->id)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Keys are sorted in an order, skip the ones before the desired key, and bail out if we
|
||||
* have crossed over the desired key and not found the record we are looking for.
|
||||
*/
|
||||
WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp));
|
||||
if (cmp != 0)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If the stop time pair on the tombstone in the history store is already globally visible
|
||||
* we can skip it.
|
||||
*/
|
||||
if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
|
||||
WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone_rts);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* As part of the history store search, we never get an exact match based on our search
|
||||
* criteria as we always search for a maximum record for that key. Make sure that we set the
|
||||
* comparison result as an exact match to remove this key as part of rollback to stable. In
|
||||
* case we don't mark the comparison result as same, later the __wt_row_modify function
|
||||
* will not properly remove the update from history store.
|
||||
*/
|
||||
cbt->compare = 0;
|
||||
|
||||
/* Get current value and convert to full update if it is a modify. */
|
||||
WT_ERR(hs_cursor->get_value(
|
||||
hs_cursor, &hs_stop_durable_ts, &hs_durable_ts, &type_full, hs_value));
|
||||
@ -416,16 +390,17 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
* selected update to the update chain. Also it confirms that history store doesn't contain
|
||||
* any newer version than the current version for the key.
|
||||
*/
|
||||
/* Retrieve the time window from the history cursor. */
|
||||
__wt_hs_upd_time_window(hs_cursor, &hs_tw);
|
||||
if (!replace &&
|
||||
(hs_stop_durable_ts != WT_TS_NONE ||
|
||||
!__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.stop_txn)) &&
|
||||
!__rollback_check_if_txnid_non_committed(session, hs_tw->stop_txn)) &&
|
||||
(hs_stop_durable_ts <= rollback_timestamp)) {
|
||||
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
|
||||
"history store update valid with stop timestamp: %s, stable timestamp: %s, txnid: "
|
||||
"%" PRIu64 " and type: %" PRIu8,
|
||||
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[0]),
|
||||
__wt_timestamp_to_string(rollback_timestamp, ts_string[1]),
|
||||
cbt->upd_value->tw.stop_txn, type);
|
||||
__wt_timestamp_to_string(rollback_timestamp, ts_string[1]), hs_tw->stop_txn, type);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -434,7 +409,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
* transaction id.
|
||||
*/
|
||||
if ((hs_durable_ts != WT_TS_NONE ||
|
||||
!__rollback_check_if_txnid_non_committed(session, cbt->upd_value->tw.start_txn)) &&
|
||||
!__rollback_check_if_txnid_non_committed(session, hs_tw->start_txn)) &&
|
||||
(hs_durable_ts <= rollback_timestamp)) {
|
||||
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
|
||||
"history store update valid with start timestamp: %s, durable timestamp: %s, stop "
|
||||
@ -442,8 +417,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
|
||||
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
|
||||
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
|
||||
__wt_timestamp_to_string(rollback_timestamp, ts_string[3]),
|
||||
cbt->upd_value->tw.start_txn, type);
|
||||
__wt_timestamp_to_string(rollback_timestamp, ts_string[3]), hs_tw->start_txn, type);
|
||||
WT_ASSERT(session, hs_tw->start_ts < unpack->tw.start_ts);
|
||||
valid_update_found = true;
|
||||
break;
|
||||
}
|
||||
@ -455,8 +430,8 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
__wt_timestamp_to_string(hs_start_ts, ts_string[0]),
|
||||
__wt_timestamp_to_string(hs_durable_ts, ts_string[1]),
|
||||
__wt_timestamp_to_string(hs_stop_durable_ts, ts_string[2]),
|
||||
__wt_timestamp_to_string(rollback_timestamp, ts_string[3]), cbt->upd_value->tw.start_txn,
|
||||
cbt->upd_value->tw.stop_txn, type);
|
||||
__wt_timestamp_to_string(rollback_timestamp, ts_string[3]), hs_tw->start_txn,
|
||||
hs_tw->stop_txn, type);
|
||||
|
||||
/*
|
||||
* Start time point of the current record may be used as stop time point of the previous
|
||||
@ -468,8 +443,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
first_record = false;
|
||||
#endif
|
||||
|
||||
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
|
||||
WT_ERR(__wt_hs_modify(cbt, hs_upd));
|
||||
WT_ERR(hs_cursor->remove(hs_cursor));
|
||||
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts_unstable);
|
||||
}
|
||||
@ -480,9 +454,10 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
* list. Otherwise remove the key by adding a tombstone.
|
||||
*/
|
||||
if (valid_update_found) {
|
||||
/* Retrieve the time window from the history cursor. */
|
||||
__wt_hs_upd_time_window(hs_cursor, &hs_tw);
|
||||
WT_ASSERT(session,
|
||||
cbt->upd_value->tw.start_ts < unpack->tw.start_ts ||
|
||||
cbt->upd_value->tw.start_txn < unpack->tw.start_txn);
|
||||
hs_tw->start_ts < unpack->tw.start_ts || hs_tw->start_txn < unpack->tw.start_txn);
|
||||
WT_ERR(__wt_upd_alloc(session, &full_value, WT_UPDATE_STANDARD, &upd, NULL));
|
||||
|
||||
/*
|
||||
@ -494,9 +469,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
|
||||
upd->txnid = WT_TXN_NONE;
|
||||
else
|
||||
upd->txnid = cbt->upd_value->tw.start_txn;
|
||||
upd->durable_ts = cbt->upd_value->tw.durable_start_ts;
|
||||
upd->start_ts = cbt->upd_value->tw.start_ts;
|
||||
upd->txnid = hs_tw->start_txn;
|
||||
upd->durable_ts = hs_tw->durable_start_ts;
|
||||
upd->start_ts = hs_tw->start_ts;
|
||||
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
|
||||
"update restored from history store txnid: %" PRIu64
|
||||
", start_ts: %s and durable_ts: %s",
|
||||
@ -527,9 +502,9 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
|
||||
tombstone->txnid = WT_TXN_NONE;
|
||||
else
|
||||
tombstone->txnid = cbt->upd_value->tw.stop_txn;
|
||||
tombstone->durable_ts = cbt->upd_value->tw.durable_stop_ts;
|
||||
tombstone->start_ts = cbt->upd_value->tw.stop_ts;
|
||||
tombstone->txnid = hs_tw->stop_txn;
|
||||
tombstone->durable_ts = hs_tw->durable_stop_ts;
|
||||
tombstone->start_ts = hs_tw->stop_ts;
|
||||
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
|
||||
"tombstone restored from history store txnid: %" PRIu64
|
||||
", start_ts: %s, durable_ts: %s",
|
||||
@ -557,8 +532,7 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
|
||||
/* Finally remove that update from history store. */
|
||||
if (valid_update_found) {
|
||||
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
|
||||
WT_ERR(__wt_hs_modify(cbt, hs_upd));
|
||||
WT_ERR(hs_cursor->remove(hs_cursor));
|
||||
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
|
||||
}
|
||||
@ -567,13 +541,13 @@ __rollback_row_ondisk_fixup_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW
|
||||
err:
|
||||
WT_ASSERT(session, tombstone == NULL || upd == tombstone);
|
||||
__wt_free_update_list(session, &upd);
|
||||
__wt_free_update_list(session, &hs_upd);
|
||||
}
|
||||
__wt_scr_free(session, &hs_key);
|
||||
__wt_scr_free(session, &hs_value);
|
||||
__wt_scr_free(session, &key);
|
||||
__wt_buf_free(session, &full_value);
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
if (hs_cursor != NULL)
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@ -1305,74 +1279,44 @@ static int
|
||||
__rollback_to_stable_btree_hs_truncate(WT_SESSION_IMPL *session, uint32_t btree_id)
|
||||
{
|
||||
WT_CURSOR *hs_cursor;
|
||||
WT_CURSOR_BTREE *cbt;
|
||||
WT_DECL_ITEM(hs_key);
|
||||
WT_DECL_RET;
|
||||
WT_ITEM key;
|
||||
WT_UPDATE *hs_upd;
|
||||
wt_timestamp_t hs_start_ts;
|
||||
uint64_t hs_counter;
|
||||
uint32_t hs_btree_id;
|
||||
int exact;
|
||||
char ts_string[WT_TS_INT_STRING_SIZE];
|
||||
|
||||
hs_cursor = NULL;
|
||||
WT_CLEAR(key);
|
||||
hs_upd = NULL;
|
||||
|
||||
WT_RET(__wt_scr_alloc(session, 0, &hs_key));
|
||||
|
||||
/* Open a history store table cursor. */
|
||||
WT_ERR(__wt_hs_cursor_open(session));
|
||||
hs_cursor = session->hs_cursor;
|
||||
cbt = (WT_CURSOR_BTREE *)hs_cursor;
|
||||
WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor));
|
||||
|
||||
/* Walk the history store for the given btree. */
|
||||
hs_cursor->set_key(hs_cursor, btree_id, &key, WT_TS_NONE, 0);
|
||||
ret = __wt_hs_cursor_search_near(session, hs_cursor, &exact);
|
||||
hs_cursor->set_key(hs_cursor, 1, btree_id);
|
||||
ret = __wt_curhs_search_near_after(session, hs_cursor);
|
||||
|
||||
/*
|
||||
* The search should always end up pointing to the start of the required btree or end of the
|
||||
* previous btree on success. Move the cursor based on the result.
|
||||
*/
|
||||
WT_ASSERT(session, (ret != 0 || exact != 0));
|
||||
if (ret == 0 && exact < 0)
|
||||
ret = __wt_hs_cursor_next(session, hs_cursor);
|
||||
|
||||
for (; ret == 0; ret = __wt_hs_cursor_next(session, hs_cursor)) {
|
||||
for (; ret == 0; ret = hs_cursor->next(hs_cursor)) {
|
||||
WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
|
||||
|
||||
/* Stop crossing into the next btree boundary. */
|
||||
if (btree_id != hs_btree_id)
|
||||
break;
|
||||
/* We shouldn't cross the btree search space. */
|
||||
WT_ASSERT(session, btree_id == hs_btree_id);
|
||||
|
||||
/*
|
||||
* If the stop time pair on the tombstone in the history store is already globally visible
|
||||
* we can skip it.
|
||||
*/
|
||||
if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
|
||||
WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone_rts);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Set this comparison as exact match of the search for later use. */
|
||||
cbt->compare = 0;
|
||||
__wt_verbose(session, WT_VERB_RECOVERY_RTS(session),
|
||||
"rollback to stable history store cleanup of update with start timestamp: %s",
|
||||
__wt_timestamp_to_string(hs_start_ts, ts_string));
|
||||
|
||||
WT_ERR(__wt_upd_alloc_tombstone(session, &hs_upd, NULL));
|
||||
WT_ERR(__wt_hs_modify(cbt, hs_upd));
|
||||
WT_ERR(hs_cursor->remove(hs_cursor));
|
||||
WT_STAT_CONN_DATA_INCR(session, txn_rts_hs_removed);
|
||||
WT_STAT_CONN_DATA_INCR(session, cache_hs_key_truncate_rts);
|
||||
hs_upd = NULL;
|
||||
}
|
||||
WT_ERR_NOTFOUND_OK(ret, false);
|
||||
|
||||
err:
|
||||
__wt_scr_free(session, &hs_key);
|
||||
__wt_free(session, hs_upd);
|
||||
WT_TRET(__wt_hs_cursor_close(session));
|
||||
if (hs_cursor != NULL)
|
||||
WT_TRET(hs_cursor->close(hs_cursor));
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
2
src/third_party/wiredtiger/test/format/t.c
vendored
2
src/third_party/wiredtiger/test/format/t.c
vendored
@ -378,7 +378,7 @@ format_die(void)
|
||||
testutil_check(__wt_debug_cursor_page(g.page_dump_cursor, g.home_pagedump));
|
||||
fprintf(stderr, "snapshot-isolation error: Dumping HS to %s\n", g.home_hsdump);
|
||||
#if WIREDTIGER_VERSION_MAJOR >= 10
|
||||
testutil_check(__wt_debug_cursor_tree_hs(g.page_dump_cursor, g.home_hsdump));
|
||||
testutil_check(__wt_debug_cursor_tree_hs(CUR2S(g.page_dump_cursor), g.home_hsdump));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
@ -71,7 +71,7 @@ class test_cursor13_base(wttest.WiredTigerTestCase):
|
||||
if hs_before[0] == hs_after[0] and hs_before[1] == hs_after[1]:
|
||||
break
|
||||
|
||||
# Fail if we haven't been able to get stable hs stats after too many attempts.
|
||||
# Fail if we haven't been able to get stable history store stats after too many attempts.
|
||||
# Seems impossible, but better to check than to have an accidental infinite loop.
|
||||
self.assertNotEqual(i, max_tries - 1)
|
||||
|
||||
|
@ -72,7 +72,7 @@ class test_hs05(wttest.WiredTigerTestCase):
|
||||
score_diff = score_end - score_start
|
||||
self.pr("After large updates score start: " + str(score_start))
|
||||
self.pr("After large updates score end: " + str(score_end))
|
||||
self.pr("After large updates hs score diff: " + str(score_diff))
|
||||
self.pr("After large updates history store score diff: " + str(score_diff))
|
||||
|
||||
def test_checkpoint_hs_reads(self):
|
||||
# Create a small table.
|
||||
|
@ -37,7 +37,7 @@ def timestamp_str(t):
|
||||
return '%x' % t
|
||||
|
||||
# test_rollback_to_stable11.py
|
||||
# Test the rollback to stable is retrieving the proper hs update.
|
||||
# Test the rollback to stable is retrieving the proper history store update.
|
||||
class test_rollback_to_stable11(test_rollback_to_stable_base):
|
||||
session_config = 'isolation=snapshot'
|
||||
|
||||
|
85
src/third_party/wiredtiger/test/suite/test_util21.py
vendored
Normal file
85
src/third_party/wiredtiger/test/suite/test_util21.py
vendored
Normal file
@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# Public Domain 2014-2021 MongoDB, Inc.
|
||||
# Public Domain 2008-2014 WiredTiger, Inc.
|
||||
#
|
||||
# This is free and unencumbered software released into the public domain.
|
||||
#
|
||||
# Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
# distribute this software, either in source code form or as a compiled
|
||||
# binary, for any purpose, commercial or non-commercial, and by any
|
||||
# means.
|
||||
#
|
||||
# In jurisdictions that recognize copyright laws, the author or authors
|
||||
# of this software dedicate any and all copyright interest in the
|
||||
# software to the public domain. We make this dedication for the benefit
|
||||
# of the public at large and to the detriment of our heirs and
|
||||
# successors. We intend this dedication to be an overt act of
|
||||
# relinquishment in perpetuity of all present and future rights to this
|
||||
# software under copyright law.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
# OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import wttest
|
||||
from suite_subprocess import suite_subprocess
|
||||
from helper import compare_files
|
||||
|
||||
def timestamp_str(t):
|
||||
return '%x' % t
|
||||
|
||||
# test_util21.py
|
||||
# Ensure that wt dump can dump obsolete data in the history store.
|
||||
class test_util21(wttest.WiredTigerTestCase, suite_subprocess):
|
||||
conn_config = 'cache_size=50MB'
|
||||
session_config = 'isolation=snapshot'
|
||||
|
||||
def add_data_with_timestamp(self, uri, value, ts):
|
||||
# Apply a series of updates with commit timestamp.
|
||||
cursor = self.session.open_cursor(uri)
|
||||
for i in range(1, 5):
|
||||
self.session.begin_transaction()
|
||||
cursor[str(i)] = value
|
||||
self.session.commit_transaction('commit_timestamp=' + timestamp_str(ts))
|
||||
cursor.close()
|
||||
|
||||
def test_dump_obsolete_data(self):
|
||||
uri = 'table:test_util21'
|
||||
create_params = 'key_format=S,value_format=S'
|
||||
self.session.create(uri, create_params)
|
||||
|
||||
value1 = 'a' * 100
|
||||
value2 = 'b' * 100
|
||||
value3 = 'c' * 100
|
||||
value4 = 'd' * 100
|
||||
|
||||
self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1))
|
||||
|
||||
self.add_data_with_timestamp(uri, value1, 2)
|
||||
self.add_data_with_timestamp(uri, value2, 3)
|
||||
self.add_data_with_timestamp(uri, value3, 5)
|
||||
self.add_data_with_timestamp(uri, value4, 7)
|
||||
# Perform checkpoint, to clean the dirty pages and place values on disk.
|
||||
self.session.checkpoint()
|
||||
|
||||
# Set stable timestamp, so we don't lose data when closing/opening connection when using wt dump.
|
||||
self.conn.set_timestamp('stable_timestamp=' + timestamp_str(10))
|
||||
|
||||
# Call dump on the values before the oldest timestamp is set
|
||||
self.runWt(['dump', 'file:WiredTigerHS.wt'], outfilename="before_oldest")
|
||||
|
||||
# Set oldest timestamp, and checkpoint, the obsolete data should not be removed as
|
||||
# the pages are clean.
|
||||
self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(6))
|
||||
self.session.checkpoint()
|
||||
self.runWt(['dump', 'file:WiredTigerHS.wt'], outfilename="after_oldest")
|
||||
|
||||
self.assertEqual(True, compare_files(self, "before_oldest", "after_oldest"))
|
||||
|
||||
if __name__ == '__main__':
|
||||
wttest.run()
|
Loading…
Reference in New Issue
Block a user