mirror of
https://github.com/sqlite/sqlite.git
synced 2024-12-01 17:23:42 +01:00
34aebb88da
and "LsmPgno". This allows the LSM code to be compiled as a single unit with sqlite3.c. FossilOrigin-Name: 30bf38d589adf0b2eb613e184ffb03ed7e625736cb04fd5c341328f72bc5ef4d
1240 lines
38 KiB
C
1240 lines
38 KiB
C
/*
|
|
** 2011-09-11
|
|
**
|
|
** The author disclaims copyright to this source code. In place of
|
|
** a legal notice, here is a blessing:
|
|
**
|
|
** May you do good and not evil.
|
|
** May you find forgiveness for yourself and forgive others.
|
|
** May you share freely, never taking more than you give.
|
|
**
|
|
*************************************************************************
|
|
**
|
|
** This file contains code to read and write checkpoints.
|
|
**
|
|
** A checkpoint represents the database layout at a single point in time.
|
|
** It includes a log offset. When an existing database is opened, the
|
|
** current state is determined by reading the newest checkpoint and updating
|
|
** it with all committed transactions from the log that follow the specified
|
|
** offset.
|
|
*/
|
|
#include "lsmInt.h"
|
|
|
|
/*
|
|
** CHECKPOINT BLOB FORMAT:
|
|
**
|
|
** A checkpoint blob is a series of unsigned 32-bit integers stored in
|
|
** big-endian byte order. As follows:
|
|
**
|
|
** Checkpoint header (see the CKPT_HDR_XXX #defines):
|
|
**
|
|
** 1. The checkpoint id MSW.
|
|
** 2. The checkpoint id LSW.
|
|
** 3. The number of integer values in the entire checkpoint, including
|
|
** the two checksum values.
|
|
** 4. The compression scheme id.
|
|
** 5. The total number of blocks in the database.
|
|
** 6. The block size.
|
|
** 7. The number of levels.
|
|
** 8. The nominal database page size.
|
|
** 9. The number of pages (in total) written to the database file.
|
|
**
|
|
** Log pointer:
|
|
**
|
|
** 1. The log offset MSW.
|
|
** 2. The log offset LSW.
|
|
** 3. Log checksum 0.
|
|
** 4. Log checksum 1.
|
|
**
|
|
** Note that the "log offset" is not the literal byte offset. Instead,
|
|
** it is the byte offset multiplied by 2, with least significant bit
|
|
** toggled each time the log pointer value is changed. This is to make
|
|
** sure that this field changes each time the log pointer is updated,
|
|
** even if the log file itself is disabled. See lsmTreeMakeOld().
|
|
**
|
|
** See ckptExportLog() and ckptImportLog().
|
|
**
|
|
** Append points:
|
|
**
|
|
** 8 integers (4 * 64-bit page numbers). See ckptExportAppendlist().
|
|
**
|
|
** For each level in the database, a level record. Formatted as follows:
|
|
**
|
|
** 0. Age of the level (least significant 16-bits). And flags mask (most
|
|
** significant 16-bits).
|
|
** 1. The number of right-hand segments (nRight, possibly 0),
|
|
** 2. Segment record for left-hand segment (8 integers defined below),
|
|
** 3. Segment record for each right-hand segment (8 integers defined below),
|
|
** 4. If nRight>0, The number of segments involved in the merge
|
|
** 5. if nRight>0, Current nSkip value (see Merge structure defn.),
|
|
** 6. For each segment in the merge:
|
|
** 6a. Page number of next cell to read during merge (this field
|
|
** is 64-bits - 2 integers)
|
|
** 6b. Cell number of next cell to read during merge
|
|
** 7. Page containing current split-key (64-bits - 2 integers).
|
|
** 8. Cell within page containing current split-key.
|
|
** 9. Current pointer value (64-bits - 2 integers).
|
|
**
|
|
** The block redirect array:
|
|
**
|
|
** 1. Number of redirections (maximum LSM_MAX_BLOCK_REDIRECTS).
|
|
** 2. For each redirection:
|
|
** a. "from" block number
|
|
** b. "to" block number
|
|
**
|
|
** The in-memory freelist entries. Each entry is either an insert or a
|
|
** delete. The in-memory freelist is to the free-block-list as the
|
|
** in-memory tree is to the users database content.
|
|
**
|
|
** 1. Number of free-list entries stored in checkpoint header.
|
|
** 2. Number of free blocks (in total).
|
|
** 3. Total number of blocks freed during database lifetime.
|
|
** 4. For each entry:
|
|
** 4a. Block number of free block.
|
|
** 4b. A 64-bit integer (MSW followed by LSW). -1 for a delete entry,
|
|
** or the associated checkpoint id for an insert.
|
|
**
|
|
** The checksum:
|
|
**
|
|
** 1. Checksum value 1.
|
|
** 2. Checksum value 2.
|
|
**
|
|
** In the above, a segment record consists of the following four 64-bit
|
|
** fields (converted to 2 * u32 by storing the MSW followed by LSW):
|
|
**
|
|
** 1. First page of array,
|
|
** 2. Last page of array,
|
|
** 3. Root page of array (or 0),
|
|
** 4. Size of array in pages.
|
|
*/
|
|
|
|
/*
|
|
** LARGE NUMBERS OF LEVEL RECORDS:
|
|
**
|
|
** A limit on the number of rhs segments that may be present in the database
|
|
** file. Defining this limit ensures that all level records fit within
|
|
** the 4096 byte limit for checkpoint blobs.
|
|
**
|
|
** The number of right-hand-side segments in a database is counted as
|
|
** follows:
|
|
**
|
|
** * For each level in the database not undergoing a merge, add 1.
|
|
**
|
|
** * For each level in the database that is undergoing a merge, add
|
|
** the number of segments on the rhs of the level.
|
|
**
|
|
** A level record not undergoing a merge is 10 integers. A level record
|
|
** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the
|
|
** separators from the next level) is (11*nRhs+20) integers. The maximum
|
|
** per right-hand-side level is therefore 21 integers. So the maximum
|
|
** size of all level records in a checkpoint is 21*40=840 integers.
|
|
**
|
|
** TODO: Before pointer values were changed from 32 to 64 bits, the above
|
|
** used to come to 420 bytes - leaving significant space for a free-list
|
|
** prefix. No more. To fix this, reduce the size of the level records in
|
|
** a db snapshot, and improve management of the free-list tail in
|
|
** lsm_sorted.c.
|
|
*/
|
|
#define LSM_MAX_RHS_SEGMENTS 40  /* Cap on rhs segments; counting rules are in the comment above */
|
|
|
|
/*
|
|
** LARGE NUMBERS OF FREELIST ENTRIES:
|
|
**
|
|
** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h)
|
|
** on the number of free-list entries stored in a checkpoint. Since each
|
|
** free-list entry consists of 3 integers, the maximum free-list size is
|
|
** 3*100=300 integers. Combined with the limit on rhs segments defined
|
|
** above, this ensures that a checkpoint always fits within a 4096 byte
|
|
** meta page.
|
|
**
|
|
** If the database contains more than 100 free blocks, the "overflow" flag
|
|
** in the checkpoint header is set and the remainder are stored in the
|
|
** system FREELIST entry in the LSM (along with user data). The value
|
|
** accompanying the FREELIST key in the LSM is, like a checkpoint, an array
|
|
** of 32-bit big-endian integers. As follows:
|
|
**
|
|
** For each entry:
|
|
** a. Block number of free block.
|
|
** b. MSW of associated checkpoint id.
|
|
** c. LSW of associated checkpoint id.
|
|
**
|
|
** The number of entries is not required - it is implied by the size of the
|
|
** value blob containing the integer array.
|
|
**
|
|
** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit.
|
|
** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST.
|
|
*/
|
|
|
|
/*
** Reverse the order of the four bytes of a 32-bit value. The argument
** must be of type u32 and is evaluated more than once, so it must not
** have side effects.
**
** On a little-endian architecture the result is the u32 obtained by
** interpreting the 4 bytes of the input as a big-endian value. On a
** big-endian architecture it is the value obtained by interpreting the
** 4 bytes of the input as a little-endian integer.
*/
#define BYTESWAP32(x) (                 \
    (((x) & 0x000000FF) << 24)          \
  | (((x) & 0x0000FF00) << 8)           \
  | (((x) & 0x00FF0000) >> 8)           \
  | (((x) & 0xFF000000) >> 24)          \
)
|
|
|
|
/*
** LSM_LITTLE_ENDIAN evaluates to non-zero if the first byte in memory of
** the integer value 1 is 1 (i.e. the host is little-endian), or to zero
** otherwise. The byte is read through a pointer-to-const so that the
** const qualifier on "one" is not cast away.
*/
static const int one = 1;
#define LSM_LITTLE_ENDIAN (*(const u8 *)(&one))
|
|
|
|
/*
** Sizes, measured in 32-bit integers, of the fixed-size leading parts of
** a checkpoint: the header, the log pointer and the append-point list.
*/
#define CKPT_HDR_SIZE         9
#define CKPT_LOGPTR_SIZE      4
#define CKPT_APPENDLIST_SIZE  (LSM_APPLIST_SZ * 2)

/*
** Index of each integer within the checkpoint header. See the
** "CHECKPOINT BLOB FORMAT" comment for the meaning of each field.
*/
#define CKPT_HDR_ID_MSW   0      /* Checkpoint id, most significant word */
#define CKPT_HDR_ID_LSW   1      /* Checkpoint id, least significant word */
#define CKPT_HDR_NCKPT    2      /* Total integers, including checksums */
#define CKPT_HDR_CMPID    3      /* Compression scheme id */
#define CKPT_HDR_NBLOCK   4      /* Total number of blocks in database */
#define CKPT_HDR_BLKSZ    5      /* Block size */
#define CKPT_HDR_NLEVEL   6      /* Number of levels */
#define CKPT_HDR_PGSZ     7      /* Nominal page size */
#define CKPT_HDR_NWRITE   8      /* Total pages written to database file */

/*
** Index of each integer of the log pointer, which immediately follows
** the CKPT_HDR_SIZE header integers.
*/
#define CKPT_HDR_LO_MSW     (CKPT_HDR_SIZE + 0)   /* Log offset MSW */
#define CKPT_HDR_LO_LSW     (CKPT_HDR_SIZE + 1)   /* Log offset LSW */
#define CKPT_HDR_LO_CKSUM1  (CKPT_HDR_SIZE + 2)   /* Log checksum 0 */
#define CKPT_HDR_LO_CKSUM2  (CKPT_HDR_SIZE + 3)   /* Log checksum 1 */
|
|
|
|
typedef struct CkptBuffer CkptBuffer;
|
|
|
|
/*
|
|
** Dynamic buffer used to accumulate data for a checkpoint.
|
|
*/
|
|
struct CkptBuffer {
|
|
lsm_env *pEnv;
|
|
int nAlloc;
|
|
u32 *aCkpt;
|
|
};
|
|
|
|
/*
|
|
** Calculate the checksum of the checkpoint specified by arguments aCkpt and
|
|
** nCkpt. Store the checksum in *piCksum1 and *piCksum2 before returning.
|
|
**
|
|
** The value of the nCkpt parameter includes the two checksum values at
|
|
** the end of the checkpoint. They are not used as inputs to the checksum
|
|
** calculation. The checksum is based on the array of (nCkpt-2) integers
|
|
** at aCkpt[].
|
|
*/
|
|
static void ckptChecksum(u32 *aCkpt, u32 nCkpt, u32 *piCksum1, u32 *piCksum2){
|
|
u32 i;
|
|
u32 cksum1 = 1;
|
|
u32 cksum2 = 2;
|
|
|
|
if( nCkpt % 2 ){
|
|
cksum1 += aCkpt[nCkpt-3] & 0x0000FFFF;
|
|
cksum2 += aCkpt[nCkpt-3] & 0xFFFF0000;
|
|
}
|
|
|
|
for(i=0; (i+3)<nCkpt; i+=2){
|
|
cksum1 += cksum2 + aCkpt[i];
|
|
cksum2 += cksum1 + aCkpt[i+1];
|
|
}
|
|
|
|
*piCksum1 = cksum1;
|
|
*piCksum2 = cksum2;
|
|
}
|
|
|
|
/*
|
|
** Set integer iIdx of the checkpoint accumulating in buffer *p to iVal.
|
|
*/
|
|
static void ckptSetValue(CkptBuffer *p, int iIdx, u32 iVal, int *pRc){
|
|
if( *pRc ) return;
|
|
if( iIdx>=p->nAlloc ){
|
|
int nNew = LSM_MAX(8, iIdx*2);
|
|
p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32));
|
|
if( !p->aCkpt ){
|
|
*pRc = LSM_NOMEM_BKPT;
|
|
return;
|
|
}
|
|
p->nAlloc = nNew;
|
|
}
|
|
p->aCkpt[iIdx] = iVal;
|
|
}
|
|
|
|
/*
|
|
** Argument aInt points to an array nInt elements in size. Switch the
|
|
** endian-ness of each element of the array.
|
|
*/
|
|
static void ckptChangeEndianness(u32 *aInt, int nInt){
|
|
if( LSM_LITTLE_ENDIAN ){
|
|
int i;
|
|
for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]);
|
|
}
|
|
}
|
|
|
|
/*
|
|
** Object *p contains a checkpoint in native byte-order. The checkpoint is
|
|
** nCkpt integers in size, not including any checksum. This function sets
|
|
** the two checksum elements of the checkpoint accordingly.
|
|
*/
|
|
static void ckptAddChecksum(CkptBuffer *p, int nCkpt, int *pRc){
|
|
if( *pRc==LSM_OK ){
|
|
u32 aCksum[2] = {0, 0};
|
|
ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]);
|
|
ckptSetValue(p, nCkpt, aCksum[0], pRc);
|
|
ckptSetValue(p, nCkpt+1, aCksum[1], pRc);
|
|
}
|
|
}
|
|
|
|
/*
** Write 64-bit value iVal into checkpoint buffer *p as two 32-bit
** integers - most significant word first - at offsets *piOut and
** *piOut+1. Then advance *piOut by two.
*/
static void ckptAppend64(CkptBuffer *p, int *piOut, i64 iVal, int *pRc){
  int iSlot = *piOut;
  u32 iMsw = (u32)((iVal >> 32) & 0xFFFFFFFF);
  u32 iLsw = (u32)(iVal & 0xFFFFFFFF);

  ckptSetValue(p, iSlot, iMsw, pRc);
  ckptSetValue(p, iSlot+1, iLsw, pRc);
  *piOut = iSlot + 2;
}
|
|
|
|
/*
** Combine the two 32-bit integers a[0] (most significant word) and a[1]
** (least significant word) into a single 64-bit value and return it.
*/
static i64 ckptRead64(u32 *a){
  i64 iMsw = (i64)a[0];
  i64 iLsw = (i64)a[1];
  return (iMsw << 32) + iLsw;
}
|
|
|
|
/*
** Read the 64-bit value stored as two integers starting at a[*piIn]
** and advance *piIn past it.
*/
static i64 ckptGobble64(u32 *a, int *piIn){
  u32 *aPair = &a[*piIn];
  *piIn += 2;
  return ckptRead64(aPair);
}
|
|
|
|
|
|
/*
|
|
** Append a 6-value segment record corresponding to pSeg to the checkpoint
|
|
** buffer passed as the third argument.
|
|
*/
|
|
static void ckptExportSegment(
|
|
Segment *pSeg,
|
|
CkptBuffer *p,
|
|
int *piOut,
|
|
int *pRc
|
|
){
|
|
ckptAppend64(p, piOut, pSeg->iFirst, pRc);
|
|
ckptAppend64(p, piOut, pSeg->iLastPg, pRc);
|
|
ckptAppend64(p, piOut, pSeg->iRoot, pRc);
|
|
ckptAppend64(p, piOut, pSeg->nSize, pRc);
|
|
}
|
|
|
|
static void ckptExportLevel(
|
|
Level *pLevel, /* Level object to serialize */
|
|
CkptBuffer *p, /* Append new level record to this ckpt */
|
|
int *piOut, /* IN/OUT: Size of checkpoint so far */
|
|
int *pRc /* IN/OUT: Error code */
|
|
){
|
|
int iOut = *piOut;
|
|
Merge *pMerge;
|
|
|
|
pMerge = pLevel->pMerge;
|
|
ckptSetValue(p, iOut++, (u32)pLevel->iAge + (u32)(pLevel->flags<<16), pRc);
|
|
ckptSetValue(p, iOut++, pLevel->nRight, pRc);
|
|
ckptExportSegment(&pLevel->lhs, p, &iOut, pRc);
|
|
|
|
assert( (pLevel->nRight>0)==(pMerge!=0) );
|
|
if( pMerge ){
|
|
int i;
|
|
for(i=0; i<pLevel->nRight; i++){
|
|
ckptExportSegment(&pLevel->aRhs[i], p, &iOut, pRc);
|
|
}
|
|
assert( pMerge->nInput==pLevel->nRight
|
|
|| pMerge->nInput==pLevel->nRight+1
|
|
);
|
|
ckptSetValue(p, iOut++, pMerge->nInput, pRc);
|
|
ckptSetValue(p, iOut++, pMerge->nSkip, pRc);
|
|
for(i=0; i<pMerge->nInput; i++){
|
|
ckptAppend64(p, &iOut, pMerge->aInput[i].iPg, pRc);
|
|
ckptSetValue(p, iOut++, pMerge->aInput[i].iCell, pRc);
|
|
}
|
|
ckptAppend64(p, &iOut, pMerge->splitkey.iPg, pRc);
|
|
ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc);
|
|
ckptAppend64(p, &iOut, pMerge->iCurrentPtr, pRc);
|
|
}
|
|
|
|
*piOut = iOut;
|
|
}
|
|
|
|
/*
|
|
** Populate the log offset fields of the checkpoint buffer. 4 values.
|
|
*/
|
|
static void ckptExportLog(
|
|
lsm_db *pDb,
|
|
int bFlush,
|
|
CkptBuffer *p,
|
|
int *piOut,
|
|
int *pRc
|
|
){
|
|
int iOut = *piOut;
|
|
|
|
assert( iOut==CKPT_HDR_LO_MSW );
|
|
|
|
if( bFlush ){
|
|
i64 iOff = pDb->treehdr.iOldLog;
|
|
ckptAppend64(p, &iOut, iOff, pRc);
|
|
ckptSetValue(p, iOut++, pDb->treehdr.oldcksum0, pRc);
|
|
ckptSetValue(p, iOut++, pDb->treehdr.oldcksum1, pRc);
|
|
}else{
|
|
for(; iOut<=CKPT_HDR_LO_CKSUM2; iOut++){
|
|
ckptSetValue(p, iOut, pDb->pShmhdr->aSnap2[iOut], pRc);
|
|
}
|
|
}
|
|
|
|
assert( *pRc || iOut==CKPT_HDR_LO_CKSUM2+1 );
|
|
*piOut = iOut;
|
|
}
|
|
|
|
static void ckptExportAppendlist(
|
|
lsm_db *db, /* Database connection */
|
|
CkptBuffer *p, /* Checkpoint buffer to write to */
|
|
int *piOut, /* IN/OUT: Offset within checkpoint buffer */
|
|
int *pRc /* IN/OUT: Error code */
|
|
){
|
|
int i;
|
|
LsmPgno *aiAppend = db->pWorker->aiAppend;
|
|
|
|
for(i=0; i<LSM_APPLIST_SZ; i++){
|
|
ckptAppend64(p, piOut, aiAppend[i], pRc);
|
|
}
|
|
};
|
|
|
|
static int ckptExportSnapshot(
|
|
lsm_db *pDb, /* Connection handle */
|
|
int bLog, /* True to update log-offset fields */
|
|
i64 iId, /* Checkpoint id */
|
|
int bCksum, /* If true, include checksums */
|
|
void **ppCkpt, /* OUT: Buffer containing checkpoint */
|
|
int *pnCkpt /* OUT: Size of checkpoint in bytes */
|
|
){
|
|
int rc = LSM_OK; /* Return Code */
|
|
FileSystem *pFS = pDb->pFS; /* File system object */
|
|
Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */
|
|
int nLevel = 0; /* Number of levels in checkpoint */
|
|
int iLevel; /* Used to count out nLevel levels */
|
|
int iOut = 0; /* Current offset in aCkpt[] */
|
|
Level *pLevel; /* Level iterator */
|
|
int i; /* Iterator used while serializing freelist */
|
|
CkptBuffer ckpt;
|
|
|
|
/* Initialize the output buffer */
|
|
memset(&ckpt, 0, sizeof(CkptBuffer));
|
|
ckpt.pEnv = pDb->pEnv;
|
|
iOut = CKPT_HDR_SIZE;
|
|
|
|
/* Write the log offset into the checkpoint. */
|
|
ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc);
|
|
|
|
/* Write the append-point list */
|
|
ckptExportAppendlist(pDb, &ckpt, &iOut, &rc);
|
|
|
|
/* Figure out how many levels will be written to the checkpoint. */
|
|
for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++;
|
|
|
|
/* Serialize nLevel levels. */
|
|
iLevel = 0;
|
|
for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){
|
|
ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
|
|
iLevel++;
|
|
}
|
|
|
|
/* Write the block-redirect list */
|
|
ckptSetValue(&ckpt, iOut++, pSnap->redirect.n, &rc);
|
|
for(i=0; i<pSnap->redirect.n; i++){
|
|
ckptSetValue(&ckpt, iOut++, pSnap->redirect.a[i].iFrom, &rc);
|
|
ckptSetValue(&ckpt, iOut++, pSnap->redirect.a[i].iTo, &rc);
|
|
}
|
|
|
|
/* Write the freelist */
|
|
assert( pSnap->freelist.nEntry<=pDb->nMaxFreelist );
|
|
if( rc==LSM_OK ){
|
|
int nFree = pSnap->freelist.nEntry;
|
|
ckptSetValue(&ckpt, iOut++, nFree, &rc);
|
|
for(i=0; i<nFree; i++){
|
|
FreelistEntry *p = &pSnap->freelist.aEntry[i];
|
|
ckptSetValue(&ckpt, iOut++, p->iBlk, &rc);
|
|
ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc);
|
|
ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc);
|
|
}
|
|
}
|
|
|
|
/* Write the checkpoint header */
|
|
assert( iId>=0 );
|
|
assert( pSnap->iCmpId==pDb->compress.iId
|
|
|| pSnap->iCmpId==LSM_COMPRESSION_EMPTY
|
|
);
|
|
ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
|
|
ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
|
|
ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
|
|
ckptSetValue(&ckpt, CKPT_HDR_CMPID, pDb->compress.iId, &rc);
|
|
ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc);
|
|
ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
|
|
ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc);
|
|
ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
|
|
ckptSetValue(&ckpt, CKPT_HDR_NWRITE, pSnap->nWrite, &rc);
|
|
|
|
if( bCksum ){
|
|
ckptAddChecksum(&ckpt, iOut, &rc);
|
|
}else{
|
|
ckptSetValue(&ckpt, iOut, 0, &rc);
|
|
ckptSetValue(&ckpt, iOut+1, 0, &rc);
|
|
}
|
|
iOut += 2;
|
|
assert( iOut<=1024 );
|
|
|
|
#ifdef LSM_LOG_FREELIST
|
|
lsmLogMessage(pDb, rc,
|
|
"ckptExportSnapshot(): id=%lld freelist: %d", iId, pSnap->freelist.nEntry
|
|
);
|
|
for(i=0; i<pSnap->freelist.nEntry; i++){
|
|
lsmLogMessage(pDb, rc,
|
|
"ckptExportSnapshot(): iBlk=%d id=%lld",
|
|
pSnap->freelist.aEntry[i].iBlk,
|
|
pSnap->freelist.aEntry[i].iId
|
|
);
|
|
}
|
|
#endif
|
|
|
|
*ppCkpt = (void *)ckpt.aCkpt;
|
|
if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
** Helper function for ckptImport().
|
|
*/
|
|
static void ckptNewSegment(
|
|
u32 *aIn,
|
|
int *piIn,
|
|
Segment *pSegment /* Populate this structure */
|
|
){
|
|
assert( pSegment->iFirst==0 && pSegment->iLastPg==0 );
|
|
assert( pSegment->nSize==0 && pSegment->iRoot==0 );
|
|
pSegment->iFirst = ckptGobble64(aIn, piIn);
|
|
pSegment->iLastPg = ckptGobble64(aIn, piIn);
|
|
pSegment->iRoot = ckptGobble64(aIn, piIn);
|
|
pSegment->nSize = (int)ckptGobble64(aIn, piIn);
|
|
assert( pSegment->iFirst );
|
|
}
|
|
|
|
/*
** Allocate a Merge object for pLevel and populate it from the merge
** section of a level record, which begins at aInt[*piIn]. The section
** is read in this order: the number of inputs, nSkip, a (page, cell)
** pair for each input, the (page, cell) of the split-key and finally
** the current pointer value.
**
** The Merge object and its aInput[] array are obtained with a single
** allocation. If successful, pLevel->pMerge is set, *piIn is advanced
** past the data read and LSM_OK is returned. If the allocation fails,
** LSM_NOMEM_BKPT is returned and *piIn is left unchanged.
*/
static int ckptSetupMerge(lsm_db *pDb, u32 *aInt, int *piIn, Level *pLevel){
  int iRead = *piIn;              /* Next value to read from aInt[] */
  int nInput;                     /* Number of input segments in merge */
  int nByte;                      /* Number of bytes to allocate */
  int iInput;                     /* Used to iterate through inputs */
  Merge *pNew;                    /* The new Merge object */

  /* Allocate the Merge object, with the aInput[] array appended. */
  nInput = (int)aInt[iRead++];
  nByte = sizeof(Merge) + sizeof(MergeInput) * nInput;
  pNew = (Merge *)lsmMallocZero(pDb->pEnv, nByte);
  if( pNew==0 ){
    return LSM_NOMEM_BKPT;
  }
  pLevel->pMerge = pNew;

  /* Populate the new object from the serialized record. */
  pNew->aInput = (MergeInput *)&pNew[1];
  pNew->nInput = nInput;
  pNew->iOutputOff = -1;
  pNew->nSkip = (int)aInt[iRead++];
  iInput = 0;
  while( iInput<nInput ){
    MergeInput *pInput = &pNew->aInput[iInput];
    pInput->iPg = ckptGobble64(aInt, &iRead);
    pInput->iCell = (int)aInt[iRead++];
    iInput++;
  }
  pNew->splitkey.iPg = ckptGobble64(aInt, &iRead);
  pNew->splitkey.iCell = (int)aInt[iRead++];
  pNew->iCurrentPtr = ckptGobble64(aInt, &iRead);

  *piIn = iRead;
  return LSM_OK;
}
|
|
|
|
|
|
static int ckptLoadLevels(
|
|
lsm_db *pDb,
|
|
u32 *aIn,
|
|
int *piIn,
|
|
int nLevel,
|
|
Level **ppLevel
|
|
){
|
|
int i;
|
|
int rc = LSM_OK;
|
|
Level *pRet = 0;
|
|
Level **ppNext;
|
|
int iIn = *piIn;
|
|
|
|
ppNext = &pRet;
|
|
for(i=0; rc==LSM_OK && i<nLevel; i++){
|
|
int iRight;
|
|
Level *pLevel;
|
|
|
|
/* Allocate space for the Level structure and Level.apRight[] array */
|
|
pLevel = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
|
|
if( rc==LSM_OK ){
|
|
pLevel->iAge = (u16)(aIn[iIn] & 0x0000FFFF);
|
|
pLevel->flags = (u16)((aIn[iIn]>>16) & 0x0000FFFF);
|
|
iIn++;
|
|
pLevel->nRight = aIn[iIn++];
|
|
if( pLevel->nRight ){
|
|
int nByte = sizeof(Segment) * pLevel->nRight;
|
|
pLevel->aRhs = (Segment *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
|
|
}
|
|
if( rc==LSM_OK ){
|
|
*ppNext = pLevel;
|
|
ppNext = &pLevel->pNext;
|
|
|
|
/* Allocate the main segment */
|
|
ckptNewSegment(aIn, &iIn, &pLevel->lhs);
|
|
|
|
/* Allocate each of the right-hand segments, if any */
|
|
for(iRight=0; iRight<pLevel->nRight; iRight++){
|
|
ckptNewSegment(aIn, &iIn, &pLevel->aRhs[iRight]);
|
|
}
|
|
|
|
/* Set up the Merge object, if required */
|
|
if( pLevel->nRight>0 ){
|
|
rc = ckptSetupMerge(pDb, aIn, &iIn, pLevel);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if( rc!=LSM_OK ){
|
|
/* An OOM must have occurred. Free any level structures allocated and
|
|
** return the error to the caller. */
|
|
lsmSortedFreeLevel(pDb->pEnv, pRet);
|
|
pRet = 0;
|
|
}
|
|
|
|
*ppLevel = pRet;
|
|
*piIn = iIn;
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
** Buffer pVal (nVal bytes in size) contains a LEVELS blob: an array of
** big-endian 32-bit integers consisting of a level count followed by
** that many level records. Deserialize the levels and append them to
** the end of the level list of the worker snapshot (pDb->pWorker).
**
** This is a no-op returning LSM_OK if nVal is zero or negative. The
** input buffer itself is not modified - a private copy is converted to
** native byte order and parsed.
*/
int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){
  int rc = LSM_OK;
  u32 *aIn;                       /* Native byte-order copy of pVal */
  int nIn;                        /* Number of integers in aIn[] */
  int nLevel;                     /* Number of levels in the blob */
  int iIn = 1;                    /* Level records start at integer 1 */
  Level *pLevel = 0;              /* List of deserialized levels */

  if( nVal<=0 ) return rc;

  aIn = lsmMallocRc(pDb->pEnv, nVal, &rc);
  if( aIn==0 ) return rc;

  memcpy(aIn, pVal, nVal);
  nIn = nVal / sizeof(u32);
  ckptChangeEndianness(aIn, nIn);

  nLevel = aIn[0];
  rc = ckptLoadLevels(pDb, aIn, &iIn, nLevel, &pLevel);
  lsmFree(pDb->pEnv, aIn);
  assert( rc==LSM_OK || pLevel==0 );

  if( rc==LSM_OK ){
    /* Find the last level currently in the worker snapshot and link
    ** the new levels in after it. */
    Level *pTail = lsmDbSnapshotLevel(pDb->pWorker);
    assert( pTail );
    while( pTail->pNext ){
      pTail = pTail->pNext;
    }
    pTail->pNext = pLevel;
  }

  return rc;
}
|
|
|
|
/*
|
|
** Return the data for the LEVELS record.
|
|
**
|
|
** The size of the checkpoint that can be stored in the database header
|
|
** must not exceed 1024 32-bit integers. Normally, it does not. However,
|
|
** if it does, part of the checkpoint must be stored in the LSM. This
|
|
** routine returns that part.
|
|
*/
|
|
int lsmCheckpointLevels(
|
|
lsm_db *pDb, /* Database handle */
|
|
int nLevel, /* Number of levels to write to blob */
|
|
void **paVal, /* OUT: Pointer to LEVELS blob */
|
|
int *pnVal /* OUT: Size of LEVELS blob in bytes */
|
|
){
|
|
Level *p; /* Used to iterate through levels */
|
|
int nAll= 0;
|
|
int rc;
|
|
int i;
|
|
int iOut;
|
|
CkptBuffer ckpt;
|
|
assert( nLevel>0 );
|
|
|
|
for(p=lsmDbSnapshotLevel(pDb->pWorker); p; p=p->pNext) nAll++;
|
|
|
|
assert( nAll>nLevel );
|
|
nAll -= nLevel;
|
|
for(p=lsmDbSnapshotLevel(pDb->pWorker); p && nAll>0; p=p->pNext) nAll--;
|
|
|
|
memset(&ckpt, 0, sizeof(CkptBuffer));
|
|
ckpt.pEnv = pDb->pEnv;
|
|
|
|
ckptSetValue(&ckpt, 0, nLevel, &rc);
|
|
iOut = 1;
|
|
for(i=0; rc==LSM_OK && i<nLevel; i++){
|
|
ckptExportLevel(p, &ckpt, &iOut, &rc);
|
|
p = p->pNext;
|
|
}
|
|
assert( rc!=LSM_OK || p==0 );
|
|
|
|
if( rc==LSM_OK ){
|
|
ckptChangeEndianness(ckpt.aCkpt, iOut);
|
|
*paVal = (void *)ckpt.aCkpt;
|
|
*pnVal = iOut * sizeof(u32);
|
|
}else{
|
|
*pnVal = 0;
|
|
*paVal = 0;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Read the checkpoint id from meta-page pPg.
|
|
*/
|
|
static i64 ckptLoadId(MetaPage *pPg){
|
|
i64 ret = 0;
|
|
if( pPg ){
|
|
int nData;
|
|
u8 *aData = lsmFsMetaPageData(pPg, &nData);
|
|
ret = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32) +
|
|
((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]));
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
** Return true if the buffer passed as an argument contains a valid
|
|
** checkpoint.
|
|
*/
|
|
static int ckptChecksumOk(u32 *aCkpt){
|
|
u32 nCkpt = aCkpt[CKPT_HDR_NCKPT];
|
|
u32 cksum1;
|
|
u32 cksum2;
|
|
|
|
if( nCkpt<CKPT_HDR_NCKPT || nCkpt>(LSM_META_RW_PAGE_SIZE)/sizeof(u32) ){
|
|
return 0;
|
|
}
|
|
ckptChecksum(aCkpt, nCkpt, &cksum1, &cksum2);
|
|
return (cksum1==aCkpt[nCkpt-2] && cksum2==aCkpt[nCkpt-1]);
|
|
}
|
|
|
|
/*
|
|
** Attempt to load a checkpoint from meta page iMeta.
|
|
**
|
|
** This function is a no-op if *pRc is set to any value other than LSM_OK
|
|
** when it is called. If an error occurs, *pRc is set to an LSM error code
|
|
** before returning.
|
|
**
|
|
** If no error occurs and the checkpoint is successfully loaded, copy it to
|
|
** ShmHeader.aSnap1[] and ShmHeader.aSnap2[], and set ShmHeader.iMetaPage
|
|
** to indicate its origin. In this case return 1. Or, if the checkpoint
|
|
** cannot be loaded (because the checksum does not compute), return 0.
|
|
*/
|
|
static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){
|
|
int bLoaded = 0; /* Return value */
|
|
if( *pRc==LSM_OK ){
|
|
int rc = LSM_OK; /* Error code */
|
|
u32 *aCkpt = 0; /* Pointer to buffer containing checkpoint */
|
|
u32 nCkpt; /* Number of elements in aCkpt[] */
|
|
int nData; /* Bytes of data in aData[] */
|
|
u8 *aData; /* Meta page data */
|
|
|
|
aData = lsmFsMetaPageData(pPg, &nData);
|
|
nCkpt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
|
|
if( nCkpt<=nData/sizeof(u32) && nCkpt>CKPT_HDR_NCKPT ){
|
|
aCkpt = (u32 *)lsmMallocRc(pDb->pEnv, nCkpt*sizeof(u32), &rc);
|
|
}
|
|
if( aCkpt ){
|
|
memcpy(aCkpt, aData, nCkpt*sizeof(u32));
|
|
ckptChangeEndianness(aCkpt, nCkpt);
|
|
if( ckptChecksumOk(aCkpt) ){
|
|
ShmHeader *pShm = pDb->pShmhdr;
|
|
memcpy(pShm->aSnap1, aCkpt, nCkpt*sizeof(u32));
|
|
memcpy(pShm->aSnap2, aCkpt, nCkpt*sizeof(u32));
|
|
memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
|
|
pShm->iMetaPage = iMeta;
|
|
bLoaded = 1;
|
|
}
|
|
}
|
|
|
|
lsmFree(pDb->pEnv, aCkpt);
|
|
*pRc = rc;
|
|
}
|
|
return bLoaded;
|
|
}
|
|
|
|
/*
|
|
** Initialize the shared-memory header with an empty snapshot. This function
|
|
** is called when no valid snapshot can be found in the database header.
|
|
*/
|
|
static void ckptLoadEmpty(lsm_db *pDb){
|
|
u32 aCkpt[] = {
|
|
0, /* CKPT_HDR_ID_MSW */
|
|
10, /* CKPT_HDR_ID_LSW */
|
|
0, /* CKPT_HDR_NCKPT */
|
|
LSM_COMPRESSION_EMPTY, /* CKPT_HDR_CMPID */
|
|
0, /* CKPT_HDR_NBLOCK */
|
|
0, /* CKPT_HDR_BLKSZ */
|
|
0, /* CKPT_HDR_NLEVEL */
|
|
0, /* CKPT_HDR_PGSZ */
|
|
0, /* CKPT_HDR_NWRITE */
|
|
0, 0, 1234, 5678, /* The log pointer and initial checksum */
|
|
0,0,0,0, 0,0,0,0, /* The append list */
|
|
0, /* The redirected block list */
|
|
0, /* The free block list */
|
|
0, 0 /* Space for checksum values */
|
|
};
|
|
u32 nCkpt = array_size(aCkpt);
|
|
ShmHeader *pShm = pDb->pShmhdr;
|
|
|
|
aCkpt[CKPT_HDR_NCKPT] = nCkpt;
|
|
aCkpt[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz;
|
|
aCkpt[CKPT_HDR_PGSZ] = pDb->nDfltPgsz;
|
|
ckptChecksum(aCkpt, array_size(aCkpt), &aCkpt[nCkpt-2], &aCkpt[nCkpt-1]);
|
|
|
|
memcpy(pShm->aSnap1, aCkpt, nCkpt*sizeof(u32));
|
|
memcpy(pShm->aSnap2, aCkpt, nCkpt*sizeof(u32));
|
|
memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
|
|
}
|
|
|
|
/*
|
|
** This function is called as part of database recovery to initialize the
|
|
** ShmHeader.aSnap1[] and ShmHeader.aSnap2[] snapshots.
|
|
*/
|
|
int lsmCheckpointRecover(lsm_db *pDb){
|
|
int rc = LSM_OK; /* Return Code */
|
|
i64 iId1; /* Id of checkpoint on meta-page 1 */
|
|
i64 iId2; /* Id of checkpoint on meta-page 2 */
|
|
int bLoaded = 0; /* True once checkpoint has been loaded */
|
|
int cmp; /* True if (iId2>iId1) */
|
|
MetaPage *apPg[2] = {0, 0}; /* Meta-pages 1 and 2 */
|
|
|
|
rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]);
|
|
if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]);
|
|
|
|
iId1 = ckptLoadId(apPg[0]);
|
|
iId2 = ckptLoadId(apPg[1]);
|
|
cmp = (iId2 > iId1);
|
|
bLoaded = ckptTryLoad(pDb, apPg[cmp?1:0], (cmp?2:1), &rc);
|
|
if( bLoaded==0 ){
|
|
bLoaded = ckptTryLoad(pDb, apPg[cmp?0:1], (cmp?1:2), &rc);
|
|
}
|
|
|
|
/* The database does not contain a valid checkpoint. Initialize the shared
|
|
** memory header with an empty checkpoint. */
|
|
if( bLoaded==0 ){
|
|
ckptLoadEmpty(pDb);
|
|
}
|
|
|
|
lsmFsMetaPageRelease(apPg[0]);
|
|
lsmFsMetaPageRelease(apPg[1]);
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta.
|
|
*/
|
|
int lsmCheckpointStore(lsm_db *pDb, int iMeta){
|
|
MetaPage *pPg = 0;
|
|
int rc;
|
|
|
|
assert( iMeta==1 || iMeta==2 );
|
|
rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg);
|
|
if( rc==LSM_OK ){
|
|
u8 *aData;
|
|
int nData;
|
|
int nCkpt;
|
|
|
|
nCkpt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT];
|
|
aData = lsmFsMetaPageData(pPg, &nData);
|
|
memcpy(aData, pDb->aSnapshot, nCkpt*sizeof(u32));
|
|
ckptChangeEndianness((u32 *)aData, nCkpt);
|
|
rc = lsmFsMetaPageRelease(pPg);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Copy the current client snapshot from shared-memory to pDb->aSnapshot[].
|
|
*/
|
|
int lsmCheckpointLoad(lsm_db *pDb, int *piRead){
|
|
int nRem = LSM_ATTEMPTS_BEFORE_PROTOCOL;
|
|
ShmHeader *pShm = pDb->pShmhdr;
|
|
while( (nRem--)>0 ){
|
|
int nInt;
|
|
|
|
nInt = pShm->aSnap1[CKPT_HDR_NCKPT];
|
|
if( nInt<=(LSM_META_RW_PAGE_SIZE / sizeof(u32)) ){
|
|
memcpy(pDb->aSnapshot, pShm->aSnap1, nInt*sizeof(u32));
|
|
if( ckptChecksumOk(pDb->aSnapshot) ){
|
|
if( piRead ) *piRead = 1;
|
|
return LSM_OK;
|
|
}
|
|
}
|
|
|
|
nInt = pShm->aSnap2[CKPT_HDR_NCKPT];
|
|
if( nInt<=(LSM_META_RW_PAGE_SIZE / sizeof(u32)) ){
|
|
memcpy(pDb->aSnapshot, pShm->aSnap2, nInt*sizeof(u32));
|
|
if( ckptChecksumOk(pDb->aSnapshot) ){
|
|
if( piRead ) *piRead = 2;
|
|
return LSM_OK;
|
|
}
|
|
}
|
|
|
|
lsmShmBarrier(pDb);
|
|
}
|
|
return LSM_PROTOCOL_BKPT;
|
|
}
|
|
|
|
/*
** Load the current snapshot from shared-memory and set *piCmpId to the
** compression id stored in its header. The connection must not
** currently hold a client or worker snapshot.
**
** Return LSM_OK if successful. If the snapshot cannot be loaded, return
** the error code from lsmCheckpointLoad() and leave *piCmpId unchanged.
*/
int lsmInfoCompressionId(lsm_db *db, u32 *piCmpId){
  int rc;

  assert( db->pClient==0 && db->pWorker==0 );

  rc = lsmCheckpointLoad(db, 0);
  if( rc!=LSM_OK ){
    return rc;
  }

  *piCmpId = db->aSnapshot[CKPT_HDR_CMPID];
  return rc;
}
|
|
|
|
/*
** Return true if the checkpoint id of the snapshot cached in
** pDb->aSnapshot[] is the same as that of shared-memory snapshot iSnap
** (1 for ShmHeader.aSnap1[], 2 for ShmHeader.aSnap2[]).
*/
int lsmCheckpointLoadOk(lsm_db *pDb, int iSnap){
  ShmHeader *pShm = pDb->pShmhdr;
  u32 *aShm;

  assert( iSnap==1 || iSnap==2 );
  if( iSnap==1 ){
    aShm = pShm->aSnap1;
  }else{
    aShm = pShm->aSnap2;
  }

  return (lsmCheckpointId(pDb->aSnapshot, 0)==lsmCheckpointId(aShm, 0) );
}
|
|
|
|
/*
** Return true if the connection has a client snapshot (pDb->pClient)
** and its id matches the checkpoint id of all three of
** pDb->aSnapshot[], ShmHeader.aSnap1[] and ShmHeader.aSnap2[]. Return
** false otherwise.
*/
int lsmCheckpointClientCacheOk(lsm_db *pDb){
  if( !pDb->pClient ){
    return 0;
  }
  if( pDb->pClient->iId!=lsmCheckpointId(pDb->aSnapshot, 0) ){
    return 0;
  }
  if( pDb->pClient->iId!=lsmCheckpointId(pDb->pShmhdr->aSnap1, 0) ){
    return 0;
  }
  return ( pDb->pClient->iId==lsmCheckpointId(pDb->pShmhdr->aSnap2, 0) );
}
|
|
|
|
/*
** Deserialize the snapshot in shared-memory into a new Snapshot object
** and store it in pDb->pWorker.
**
** Before doing so, check that ShmHeader.aSnap1[] and ShmHeader.aSnap2[]
** are identical. If they are not, whichever of them has a valid
** checksum (aSnap1[] is tested first) is copied over the other. If
** neither does, LSM_PROTOCOL_BKPT is returned.
**
** The caller must hold an exclusive lock on either LSM_LOCK_WORKER or
** LSM_LOCK_DMS1, as checked by the assert() below.
*/
int lsmCheckpointLoadWorker(lsm_db *pDb){
  ShmHeader *pShm = pDb->pShmhdr;
  int nInt1;                      /* Size field of aSnap1[] */
  int nInt2;                      /* Size field of aSnap2[] */
  int bSame;                      /* True if the two snapshots match */
  int rc;

  assert( 
      lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) 
   || lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL) 
  );

  /* Check that the two snapshots match. If not, repair them. */
  nInt1 = pShm->aSnap1[CKPT_HDR_NCKPT];
  nInt2 = pShm->aSnap2[CKPT_HDR_NCKPT];
  bSame = (nInt1==nInt2)
       && 0==memcmp(pShm->aSnap1, pShm->aSnap2, nInt2*sizeof(u32));
  if( !bSame ){
    if( ckptChecksumOk(pShm->aSnap1) ){
      memcpy(pShm->aSnap2, pShm->aSnap1, sizeof(u32)*nInt1);
    }else if( ckptChecksumOk(pShm->aSnap2) ){
      memcpy(pShm->aSnap1, pShm->aSnap2, sizeof(u32)*nInt2);
    }else{
      return LSM_PROTOCOL_BKPT;
    }
  }

  rc = lsmCheckpointDeserialize(pDb, 1, pShm->aSnap1, &pDb->pWorker);
  if( pDb->pWorker ){
    pDb->pWorker->pDatabase = pDb->pDatabase;
  }

  if( rc==LSM_OK ){
    rc = lsmCheckCompressionId(pDb, pDb->pWorker->iCmpId);
  }

#if 0
  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
#endif
  return rc;
}
|
|
|
|
int lsmCheckpointDeserialize(
|
|
lsm_db *pDb,
|
|
int bInclFreelist, /* If true, deserialize free-list */
|
|
u32 *aCkpt,
|
|
Snapshot **ppSnap
|
|
){
|
|
int rc = LSM_OK;
|
|
Snapshot *pNew;
|
|
|
|
pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc);
|
|
if( rc==LSM_OK ){
|
|
Level *pLvl;
|
|
int nFree;
|
|
int i;
|
|
int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL];
|
|
int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE;
|
|
|
|
pNew->iId = lsmCheckpointId(aCkpt, 0);
|
|
pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK];
|
|
pNew->nWrite = aCkpt[CKPT_HDR_NWRITE];
|
|
rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel);
|
|
pNew->iLogOff = lsmCheckpointLogOffset(aCkpt);
|
|
pNew->iCmpId = aCkpt[CKPT_HDR_CMPID];
|
|
|
|
/* Make a copy of the append-list */
|
|
for(i=0; i<LSM_APPLIST_SZ; i++){
|
|
u32 *a = &aCkpt[CKPT_HDR_SIZE + CKPT_LOGPTR_SIZE + i*2];
|
|
pNew->aiAppend[i] = ckptRead64(a);
|
|
}
|
|
|
|
/* Read the block-redirect list */
|
|
pNew->redirect.n = aCkpt[iIn++];
|
|
if( pNew->redirect.n ){
|
|
pNew->redirect.a = lsmMallocZeroRc(pDb->pEnv,
|
|
(sizeof(struct RedirectEntry) * LSM_MAX_BLOCK_REDIRECTS), &rc
|
|
);
|
|
if( rc==LSM_OK ){
|
|
for(i=0; i<pNew->redirect.n; i++){
|
|
pNew->redirect.a[i].iFrom = aCkpt[iIn++];
|
|
pNew->redirect.a[i].iTo = aCkpt[iIn++];
|
|
}
|
|
}
|
|
for(pLvl=pNew->pLevel; pLvl->pNext; pLvl=pLvl->pNext);
|
|
if( pLvl->nRight ){
|
|
pLvl->aRhs[pLvl->nRight-1].pRedirect = &pNew->redirect;
|
|
}else{
|
|
pLvl->lhs.pRedirect = &pNew->redirect;
|
|
}
|
|
}
|
|
|
|
/* Copy the free-list */
|
|
if( rc==LSM_OK && bInclFreelist ){
|
|
nFree = aCkpt[iIn++];
|
|
if( nFree ){
|
|
pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc(
|
|
pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc
|
|
);
|
|
if( rc==LSM_OK ){
|
|
int j;
|
|
for(j=0; j<nFree; j++){
|
|
FreelistEntry *p = &pNew->freelist.aEntry[j];
|
|
p->iBlk = aCkpt[iIn++];
|
|
p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1];
|
|
iIn += 2;
|
|
}
|
|
pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if( rc!=LSM_OK ){
|
|
lsmFreeSnapshot(pDb->pEnv, pNew);
|
|
pNew = 0;
|
|
}
|
|
|
|
*ppSnap = pNew;
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Connection pDb must be the worker connection in order to call this
|
|
** function. It returns true if the database already contains the maximum
|
|
** number of levels or false otherwise.
|
|
**
|
|
** This is used when flushing the in-memory tree to disk. If the database
|
|
** is already full, then the caller should invoke lsm_work() or similar
|
|
** until it is not full before creating a new level by flushing the in-memory
|
|
** tree to disk. Limiting the number of levels in the database ensures that
|
|
** the records describing them always fit within the checkpoint blob.
|
|
*/
|
|
int lsmDatabaseFull(lsm_db *pDb){
|
|
Level *p;
|
|
int nRhs = 0;
|
|
|
|
assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );
|
|
assert( pDb->pWorker );
|
|
|
|
for(p=pDb->pWorker->pLevel; p; p=p->pNext){
|
|
nRhs += (p->nRight ? p->nRight : 1);
|
|
}
|
|
|
|
return (nRhs >= LSM_MAX_RHS_SEGMENTS);
|
|
}
|
|
|
|
/*
|
|
** The connection passed as the only argument is currently the worker
|
|
** connection. Some work has been performed on the database by the connection,
|
|
** but no new snapshot has been written into shared memory.
|
|
**
|
|
** This function updates the shared-memory worker and client snapshots with
|
|
** the new snapshot produced by the work performed by pDb.
|
|
**
|
|
** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM
|
|
** error code is returned.
|
|
*/
|
|
int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush){
|
|
Snapshot *pSnap = pDb->pWorker;
|
|
ShmHeader *pShm = pDb->pShmhdr;
|
|
void *p = 0;
|
|
int n = 0;
|
|
int rc;
|
|
|
|
pSnap->iId++;
|
|
rc = ckptExportSnapshot(pDb, bFlush, pSnap->iId, 1, &p, &n);
|
|
if( rc!=LSM_OK ) return rc;
|
|
assert( ckptChecksumOk((u32 *)p) );
|
|
|
|
assert( n<=LSM_META_RW_PAGE_SIZE );
|
|
memcpy(pShm->aSnap2, p, n);
|
|
lsmShmBarrier(pDb);
|
|
memcpy(pShm->aSnap1, p, n);
|
|
lsmFree(pDb->pEnv, p);
|
|
|
|
/* assert( lsmFsIntegrityCheck(pDb) ); */
|
|
return LSM_OK;
|
|
}
|
|
|
|
/*
|
|
** This function is used to determine the snapshot-id of the most recently
|
|
** checkpointed snapshot. Variable ShmHeader.iMetaPage indicates which of
|
|
** the two meta-pages said snapshot resides on (if any).
|
|
**
|
|
** If successful, this function loads the snapshot from the meta-page,
|
|
** verifies its checksum and sets *piId to the snapshot-id before returning
|
|
** LSM_OK. Or, if the checksum attempt fails, *piId is set to zero and
|
|
** LSM_OK returned. If an error occurs, an LSM error code is returned and
|
|
** the final value of *piId is undefined.
|
|
*/
|
|
int lsmCheckpointSynced(lsm_db *pDb, i64 *piId, i64 *piLog, u32 *pnWrite){
|
|
int rc = LSM_OK;
|
|
MetaPage *pPg;
|
|
u32 iMeta;
|
|
|
|
iMeta = pDb->pShmhdr->iMetaPage;
|
|
if( iMeta==1 || iMeta==2 ){
|
|
rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
|
|
if( rc==LSM_OK ){
|
|
int nCkpt;
|
|
int nData;
|
|
u8 *aData;
|
|
|
|
aData = lsmFsMetaPageData(pPg, &nData);
|
|
assert( nData==LSM_META_RW_PAGE_SIZE );
|
|
nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
|
|
if( nCkpt<(LSM_META_RW_PAGE_SIZE/sizeof(u32)) ){
|
|
u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc);
|
|
if( aCopy ){
|
|
memcpy(aCopy, aData, nCkpt*sizeof(u32));
|
|
ckptChangeEndianness(aCopy, nCkpt);
|
|
if( ckptChecksumOk(aCopy) ){
|
|
if( piId ) *piId = lsmCheckpointId(aCopy, 0);
|
|
if( piLog ) *piLog = (lsmCheckpointLogOffset(aCopy) >> 1);
|
|
if( pnWrite ) *pnWrite = aCopy[CKPT_HDR_NWRITE];
|
|
}
|
|
lsmFree(pDb->pEnv, aCopy);
|
|
}
|
|
}
|
|
lsmFsMetaPageRelease(pPg);
|
|
}
|
|
}
|
|
|
|
if( (iMeta!=1 && iMeta!=2) || rc!=LSM_OK || pDb->pShmhdr->iMetaPage!=iMeta ){
|
|
if( piId ) *piId = 0;
|
|
if( piLog ) *piLog = 0;
|
|
if( pnWrite ) *pnWrite = 0;
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Return the checkpoint-id of the checkpoint array passed as the first
|
|
** argument to this function. If the second argument is true, then assume
|
|
** that the checkpoint is made up of 32-bit big-endian integers. If it
|
|
** is false, assume that the integers are in machine byte order.
|
|
*/
|
|
i64 lsmCheckpointId(u32 *aCkpt, int bDisk){
|
|
i64 iId;
|
|
if( bDisk ){
|
|
u8 *aData = (u8 *)aCkpt;
|
|
iId = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32);
|
|
iId += ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]));
|
|
}else{
|
|
iId = ((i64)aCkpt[CKPT_HDR_ID_MSW] << 32) + (i64)aCkpt[CKPT_HDR_ID_LSW];
|
|
}
|
|
return iId;
|
|
}
|
|
|
|
/*
** Return the value of the CKPT_HDR_NBLOCK header field of the checkpoint
** array passed as the only argument. The array is read in machine byte
** order.
*/
u32 lsmCheckpointNBlock(u32 *aCkpt){
  u32 nBlock = aCkpt[CKPT_HDR_NBLOCK];
  return nBlock;
}
|
|
|
|
/*
** Return the value of the CKPT_HDR_NWRITE header field of the checkpoint
** array passed as the first argument. If bDisk is true, the field is
** decoded from its serialized form using lsmGetU32(). Otherwise, it is
** read directly in machine byte order.
*/
u32 lsmCheckpointNWrite(u32 *aCkpt, int bDisk){
  u32 *pField = &aCkpt[CKPT_HDR_NWRITE];
  u32 nWrite;

  if( bDisk ){
    nWrite = lsmGetU32((u8 *)pField);
  }else{
    nWrite = *pField;
  }
  return nWrite;
}
|
|
|
|
/*
** Return the 64-bit log offset value stored in the checkpoint array passed
** as the only argument. The value is assembled from header fields
** CKPT_HDR_LO_MSW and CKPT_HDR_LO_LSW, both read in machine byte order.
*/
i64 lsmCheckpointLogOffset(u32 *aCkpt){
  i64 iMsw = (i64)aCkpt[CKPT_HDR_LO_MSW];
  i64 iLsw = (i64)aCkpt[CKPT_HDR_LO_LSW];
  return (iMsw << 32) + iLsw;
}
|
|
|
|
/*
** Return the CKPT_HDR_PGSZ header field of checkpoint aCkpt[] as an int.
*/
int lsmCheckpointPgsz(u32 *aCkpt){
  int nPgsz = (int)aCkpt[CKPT_HDR_PGSZ];
  return nPgsz;
}
|
|
|
|
/*
** Return the CKPT_HDR_BLKSZ header field of checkpoint aCkpt[] as an int.
*/
int lsmCheckpointBlksz(u32 *aCkpt){
  int nBlksz = (int)aCkpt[CKPT_HDR_BLKSZ];
  return nBlksz;
}
|
|
|
|
void lsmCheckpointLogoffset(
|
|
u32 *aCkpt,
|
|
DbLog *pLog
|
|
){
|
|
pLog->aRegion[2].iStart = (lsmCheckpointLogOffset(aCkpt) >> 1);
|
|
|
|
pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1];
|
|
pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2];
|
|
pLog->iSnapshotId = lsmCheckpointId(aCkpt, 0);
|
|
}
|
|
|
|
/*
** Set the log offset fields (CKPT_HDR_LO_MSW and CKPT_HDR_LO_LSW) of the
** connection's private snapshot copy pDb->aSnapshot to zero, recompute the
** two checksum values stored in its final two integers, then copy the
** result over both shared-memory snapshots (aSnap1 first, then aSnap2).
**
** On entry, pDb->aSnapshot must be identical to both shared-memory
** snapshots. This is checked by assert() statements only.
*/
void lsmCheckpointZeroLogoffset(lsm_db *pDb){
  u32 *aSnap = pDb->aSnapshot;
  ShmHeader *pHdr = pDb->pShmhdr;
  u32 nCkpt;                      /* Size of snapshot in 32-bit integers */

  nCkpt = aSnap[CKPT_HDR_NCKPT];
  assert( nCkpt>CKPT_HDR_NCKPT );
  assert( nCkpt==pHdr->aSnap1[CKPT_HDR_NCKPT] );
  assert( 0==memcmp(aSnap, pHdr->aSnap1, nCkpt*sizeof(u32)) );
  assert( 0==memcmp(aSnap, pHdr->aSnap2, nCkpt*sizeof(u32)) );

  aSnap[CKPT_HDR_LO_MSW] = 0;
  aSnap[CKPT_HDR_LO_LSW] = 0;
  ckptChecksum(aSnap, nCkpt, &aSnap[nCkpt-2], &aSnap[nCkpt-1]);

  memcpy(pHdr->aSnap1, aSnap, nCkpt*sizeof(u32));
  memcpy(pHdr->aSnap2, aSnap, nCkpt*sizeof(u32));
}
|
|
|
|
/*
|
|
** Set the output variable to the number of KB of data written into the
|
|
** database file since the most recent checkpoint.
|
|
*/
|
|
int lsmCheckpointSize(lsm_db *db, int *pnKB){
|
|
int rc = LSM_OK;
|
|
u32 nSynced;
|
|
|
|
/* Set nSynced to the number of pages that had been written when the
|
|
** database was last checkpointed. */
|
|
rc = lsmCheckpointSynced(db, 0, 0, &nSynced);
|
|
|
|
if( rc==LSM_OK ){
|
|
u32 nPgsz = db->pShmhdr->aSnap1[CKPT_HDR_PGSZ];
|
|
u32 nWrite = db->pShmhdr->aSnap1[CKPT_HDR_NWRITE];
|
|
*pnKB = (int)(( ((i64)(nWrite - nSynced) * nPgsz) + 1023) / 1024);
|
|
}
|
|
|
|
return rc;
|
|
}
|