// btree.h

/**
 * Copyright (C) 2008 10gen Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "../pch.h"
#include "jsobj.h"
#include "diskloc.h"
#include "pdfile.h"
#include "key.h"

namespace mongo {

    /**
     * Our btree implementation generally follows the standard btree algorithm,
     * which is described in many places. The nodes of our btree are referred to
     * as buckets below. These buckets are of size BucketSize and their body is
     * an ordered array of <bson key, disk loc> pairs, where disk loc is the disk
     * location of a document and bson key is a projection of this document into
     * the schema of the index for this btree. Ordering is determined on the
     * basis of bson key first and then disk loc in case of a tie. All bson keys
     * for a btree have identical schemas with empty string field names and may
     * not have an objsize() exceeding KeyMax. The btree's buckets are
     * themselves organized into an ordered tree. Although there are exceptions,
     * generally buckets with n keys have n+1 children and the body of a bucket is
     * at least lowWaterMark bytes. A more strictly enforced requirement is that
     * a non root bucket must have at least one key except in certain transient
     * states.
     *
     * Our btrees support the following primary read operations: finding a
     * specified key; iterating from a starting key to the next or previous
     * ordered key; and skipping from a starting key to another specified key
     * without checking every intermediate key. The primary write operations
     * are insertion and deletion of keys. Insertion may trigger a bucket split
     * if necessary to avoid bucket overflow. In such a case, subsequent splits
     * will occur recursively as necessary. Deletion may trigger a bucket
     * rebalance, in which a size deficient bucket is filled with keys from an
     * adjacent bucket. In this case, splitting may potentially occur in the
     * parent. Deletion may alternatively trigger a merge, in which the keys
     * from two buckets and a key from their shared parent are combined into the
     * same bucket. In such a case, rebalancing or merging may proceed
     * recursively from the parent.
     *
     * While the btree data format has been relatively constant over time, btrees
     * initially created by versions of mongo earlier than the current version
     * may embody different properties than freshly created btrees (while
     * following the same data format). These older btrees are referred to
     * below as legacy btrees.
     */

    const int OldBucketSize = 8192;

    // largest key size we allow. note we very much need to support bigger keys (somehow) in the future.
    const int KeyMax = OldBucketSize / 10;
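
    /*
     * Illustrative sketch (not part of the original header): the ordering
     * described above compares the bson key first and falls back to the
     * record's DiskLoc only to break ties. Assuming woCompare() and
     * DiskLoc::compare() behave as they do elsewhere in this codebase, the
     * rule is roughly:
     *
     *     int cmpKeyAndLoc( const BSONObj &lKey, const DiskLoc lLoc,
     *                       const BSONObj &rKey, const DiskLoc rLoc,
     *                       const Ordering &order ) {
     *         int c = lKey.woCompare( rKey, order );  // bson key first
     *         if ( c )
     *             return c;
     *         return lLoc.compare( rLoc );            // disk loc breaks the tie
     *     }
     */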

#pragma pack(1)
    template< class Version > class BucketBasics;

    /**
     * This is the fixed width data component for storage of a key within a
     * bucket. It contains an offset pointer to the variable width bson
     * data component. A _KeyNode may be 'unused', please see below.
     */
    struct _KeyNode {
        /** Signals that we are writing this _KeyNode and casts away const */
        _KeyNode& writing() const;
        /**
         * The 'left' child bucket of this key. If this is the i-th key, it
         * points to the i index child bucket.
         */
        DiskLoc prevChildBucket;
        /** The location of the record associated with this key. */
        DiskLoc recordLoc;
        short keyDataOfs() const {
            return (short) _kdo;
        }
        /** Offset within current bucket of the variable width bson key for this _KeyNode. */
        unsigned short _kdo;
        void setKeyDataOfs(short s) {
            _kdo = s;
            assert(s>=0);
        }
        /** Seems to be redundant. */
        void setKeyDataOfsSavingUse(short s) {
            _kdo = s;
            assert(s>=0);
        }
        /**
         * Unused keys are not returned by read operations. Keys may be marked
         * as unused in cases where it is difficult to delete them while
         * maintaining the constraints required of a btree.
         *
         * Setting ofs to odd is the sentinel for unused, as real recordLoc's
         * are always even numbers. Note we need to keep its value basically
         * the same as we use the recordLoc as part of the key in the index
         * (to handle duplicate keys efficiently).
         *
         * Flagging keys as unused is a feature that is being phased out in favor
         * of deleting the keys outright. The current btree implementation is
         * not expected to mark a key as unused in a non legacy btree.
         */
        void setUnused() {
            recordLoc.GETOFS() |= 1;
        }
        void setUsed() { recordLoc.GETOFS() &= ~1; }
        int isUnused() const {
            return recordLoc.getOfs() & 1;
        }
        int isUsed() const {
            return !isUnused();
        }
    };
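
    /*
     * Illustrative sketch (not part of the original header): because a real
     * recordLoc offset is always even, the unused flag can ride in its low
     * bit without losing the record location. Assuming a key node whose
     * record offset starts at 0x1000:
     *
     *     _KeyNode kn = ...;   // hypothetical key node
     *     kn.setUnused();      // offset becomes 0x1001, isUnused() != 0
     *     kn.setUsed();        // offset back to 0x1000, isUsed() is true
     *
     * Read operations simply skip keys for which isUnused() is nonzero.
     */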

    /**
     * This structure represents header data for a btree bucket. An object of
     * this type is typically allocated inside of a buffer of size BucketSize,
     * resulting in a full bucket with an appropriate header.
     *
     * The body of a btree bucket contains an array of _KeyNode objects starting
     * from its lowest indexed bytes and growing to higher indexed bytes. The
     * body also contains variable width bson keys, which are allocated from the
     * highest indexed bytes toward lower indexed bytes.
     *
     * |hhhh|kkkkkkk--------bbbbbbbbbbbuuubbbuubbb|
     * h = header data
     * k = KeyNode data
     * - = empty space
     * b = bson key data
     * u = unused (old) bson key data, that may be garbage collected
     */
    class BtreeData_V0 {
    protected:
        /** Parent bucket of this bucket, which isNull() for the root bucket. */
        DiskLoc parent;
        /** Given that there are n keys, this is the n index child. */
        DiskLoc nextChild;
        /** can be reused, value is 8192 in current pdfile version Apr2010 */
        unsigned short _wasSize;
        /** zero */
        unsigned short _reserved1;
        int flags;

        void _init() {
            _reserved1 = 0;
            _wasSize = BucketSize;
            reserved = 0;
        }

        /** basicInsert() assumes the next three members are consecutive and in this order: */

        /** Size of the empty region. */
        int emptySize;
        /** Size used for bson storage, including storage of old keys. */
        int topSize;
        /* Number of keys in the bucket. */
        int n;

        int reserved;
        /* Beginning of the bucket's body */
        char data[4];

    public:
        typedef KeyBson Key;
        typedef KeyBson KeyOwned;
        enum { BucketSize = 8192 };
    };

    class BtreeData_V1 {
    protected:
        /** Parent bucket of this bucket, which isNull() for the root bucket. */
        DiskLoc parent;
        /** Given that there are n keys, this is the n index child. */
        DiskLoc nextChild;

        unsigned short flags;

        /** basicInsert() assumes the next three members are consecutive and in this order: */

        /** Size of the empty region. */
        unsigned short emptySize;
        /** Size used for bson storage, including storage of old keys. */
        unsigned short topSize;
        /* Number of keys in the bucket. */
        unsigned short n;

        /* Beginning of the bucket's body */
        char data[4];

        void _init() { }

    public:
        typedef KeyV1 Key;
        typedef KeyV1Owned KeyOwned;
        enum { BucketSize = 8192-16 }; // leave room for Record header
    };

    typedef BtreeData_V0 V0;
    typedef BtreeData_V1 V1;

    /**
     * This class adds functionality to BtreeData for managing a single bucket.
     * The following policies are used in an attempt to encourage simplicity:
     *
     * Const member functions of this class are those which may be called on
     * an object for which writing has not been signaled. Non const member
     * functions may only be called on objects for which writing has been
     * signaled. Note that currently some const functions write to the
     * underlying memory representation of this bucket using optimized methods
     * to signal write operations.
     *
     * DiskLoc parameters that may shadow references within the btree should
     * be passed by value rather than by reference to non const member
     * functions or to const member functions which may perform writes. This way
     * a callee need not worry that write operations will change or invalidate
     * its arguments.
     *
     * The current policy for dealing with bson arguments is the opposite of
     * what is described above for DiskLoc arguments. We do not want to copy
     * bson into memory as an intermediate step for btree changes, and if bson
     * is to be moved it must be copied to the new location before the old
     * location is invalidated. Care should be taken in cases where that invalid
     * memory may be implicitly referenced by function arguments.
     *
     * A number of functions below require a thisLoc argument, which must be the
     * disk location of the bucket mapped to 'this'.
     */
    template< class Version >
    class BucketBasics : public Version {
    public:
        template <class U> friend class BtreeBuilder;
        typedef typename Version::Key Key;

        int getN() const { return this->n; }

        /**
         * This is an in memory wrapper for a _KeyNode, and not itself part of btree
         * storage. This object and its BSONObj 'key' will become invalid if the
         * _KeyNode data that generated it is moved within the btree. In general,
         * a KeyNode should not be expected to be valid after a write.
         */
        class KeyNode {
        public:
            KeyNode(const BucketBasics<Version>& bb, const _KeyNode &k);
            const DiskLoc& prevChildBucket;
            const DiskLoc& recordLoc;
            /* Points to the bson key storage for a _KeyNode */
            Key key;
        };
        friend class KeyNode;

        /** Assert write intent declared for this bucket already. */
        void assertWritable();

        void assertValid(const Ordering &order, bool force = false) const;
        void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); }

        /**
         * @return KeyNode for key at index i. The KeyNode will become invalid
         * if the key is moved or reassigned, or if the node is packed. In general
         * a KeyNode should not be expected to be valid after a write.
         */
        const KeyNode keyNode(int i) const {
            if ( i >= this->n ) {
                massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << this->n ).jsonString() , i < this->n );
            }
            return KeyNode(*this, k(i));
        }

        static int headerSize() {
            const BucketBasics *d = 0;
            return (char*)&(d->data) - (char*)&(d->parent);
        }
        static int bodySize() { return Version::BucketSize - headerSize(); }
        static int lowWaterMark() { return bodySize() / 2 - KeyMax - sizeof( _KeyNode ) + 1; } // see comment in btree.cpp
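
        /*
         * Illustrative sketch (not part of the original header): headerSize()
         * spans the members from 'parent' up to (but not including) 'data',
         * so bodySize() is whatever is left of the bucket for _KeyNodes and
         * bson key bytes. Under that assumption, each stored key costs
         * roughly:
         *
         *     int approxKeyCost( int bsonKeyDataSize ) {
         *         // hypothetical helper: fixed-width slot plus variable-width bson
         *         return (int)sizeof( _KeyNode ) + bsonKeyDataSize;
         *     }
         *
         * lowWaterMark() above is derived from bodySize(); a bucket whose body
         * holds fewer than that many bytes is considered deficient and becomes
         * a candidate for rebalance or merge (see the deletion functions below).
         */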

        // for testing
        int nKeys() const { return this->n; }
        const DiskLoc getNextChild() const { return this->nextChild; }

    protected:
        char * dataAt(short ofs) { return this->data + ofs; }

        /** Initialize the header for a new node. */
        void init();

        /**
         * Preconditions:
         *  - 0 <= keypos <= n
         *  - If key is inserted at position keypos, the bucket's keys will still be
         *    in order.
         * Postconditions:
         *  - If key can fit in the bucket, the bucket may be packed and keypos
         *    may be decreased to reflect deletion of earlier indexed keys during
         *    packing, the key will be inserted at the updated keypos index with
         *    a null prevChildBucket, the subsequent keys shifted to the right,
         *    and the function will return true.
         *  - If key cannot fit in the bucket, the bucket will be packed and
         *    the function will return false.
         * Although this function is marked const, it modifies the underlying
         * btree representation through an optimized write intent mechanism.
         */
        bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const Key& key, const Ordering &order) const;

        /**
         * Preconditions:
         *  - key / recordLoc are > all existing keys
         *  - The keys in prevChild and their descendants are between all existing
         *    keys and 'key'.
         * Postconditions:
         *  - If there is space for key without packing, it is inserted as the
         *    last key with specified prevChild and true is returned.
         *    Importantly, nextChild is not updated!
         *  - Otherwise false is returned and there is no change.
         */
        bool _pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild);
        void pushBack(const DiskLoc recordLoc, const Key& key, const Ordering &order, const DiskLoc prevChild) {
            bool ok = _pushBack( recordLoc , key , order , prevChild );
            assert(ok);
        }

        /**
         * This is a special purpose function used by BtreeBuilder. The
         * interface is quite dangerous if you're not careful. The bson key
         * returned here points to bucket memory that has been invalidated but
         * not yet reclaimed.
         *
         * TODO Maybe this could be replaced with two functions, one which
         * returns the last key without deleting it and another which simply
         * deletes the last key. Then the caller would have enough control to
         * ensure proper memory integrity.
         *
         * Preconditions:
         *  - bucket is not empty
         *  - last key of bucket is used (not unused)
         *  - nextChild isNull()
         *  - _unalloc will work correctly as used - see code
         * Postconditions:
         *  - The last key of the bucket is removed, and its key and recLoc are
         *    returned. As mentioned above, the key points to unallocated memory.
         */
        void popBack(DiskLoc& recLoc, Key& key);

        /**
         * Preconditions:
         *  - 0 <= keypos < n
         *  - there is no child bucket at keypos
         *  - n > 1
         *  - if mayEmpty == false or nextChild.isNull(), n > 0
         * Postconditions:
         *  - The key at keypos is removed, and remaining keys are shifted over.
         *  - The bucket becomes unpacked.
         *  - if mayEmpty is true and nextChild.isNull(), the bucket may have no keys.
         */
        void _delKeyAtPos(int keypos, bool mayEmpty = false);

        /* !Packed means there is deleted fragment space within the bucket.
           We "repack" when we run out of space before considering the node
           to be full.
        */
        enum Flags { Packed=1 };

        /** n == 0 is ok */
        const DiskLoc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
        DiskLoc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
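
        /*
         * Illustrative sketch (not part of the original header): childForPos()
         * encodes the "n keys, n+1 children" layout, where child i sits to the
         * left of key i and nextChild is the rightmost (n index) child. An
         * in-order walk over a single bucket would look roughly like the
         * following; visitChild()/visitKey() are hypothetical helpers.
         *
         *     template< class V >
         *     void walk( const BucketBasics<V> *b ) {
         *         for( int i = 0; i < b->getN(); i++ ) {
         *             visitChild( b->childForPos( i ) );   // left child of key i
         *             visitKey( b->keyNode( i ) );
         *         }
         *         visitChild( b->getNextChild() );         // the n index child
         *     }
         */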

        /** Same as bodySize(). */
        int totalDataSize() const;
        /**
         * @return true when a key may be dropped by pack()
         * @param index index of the key that may be dropped
         * @param refPos index of a particular key of interest, which must not
         *  be dropped; = 0 to safely ignore
         */
        bool mayDropKey( int index, int refPos ) const;

        /**
         * Pack the bucket to reclaim space from invalidated memory.
         * @refPos is an index in the bucket which may be updated if we
         *  delete keys from the bucket
         * This function may cast away const and perform a write.
         * Preconditions: none
         * Postconditions:
         *  - Bucket will be packed
         *  - Some unused nodes may be dropped, but not ones at index 0 or refPos
         *  - Some used nodes may be moved
         *  - If refPos is the index of an existing key, it will be updated to that
         *    key's new index if the key is moved.
         */
        void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
        /** Pack when already writable */
        void _packReadyForMod(const Ordering &order, int &refPos);

        /** @return the size the bucket's body would have if we were to call pack() */
        int packedDataSize( int refPos ) const;
        void setNotPacked() { this->flags &= ~Packed; }
        void setPacked() { this->flags |= Packed; }
        /**
         * Preconditions: 'bytes' is <= emptySize
         * Postconditions: A buffer of size 'bytes' is allocated on the top side,
         * and its offset is returned.
         */
        int _alloc(int bytes);
        /**
         * This function can be used to deallocate the lowest byte index bson
         * buffer in the top region, which in some but not all cases is for the
         * n - 1 index key. This function only works correctly in certain
         * special cases, please be careful.
         * Preconditions: 'bytes' <= topSize
         * Postconditions: The top region is decreased
         */
        void _unalloc(int bytes);
        /**
         * Preconditions: 'N' <= n
         * Postconditions:
         *  - All keys after the N index key are dropped.
         *  - The bucket is then packed, without dropping refPos if refPos < N.
         */
        void truncateTo(int N, const Ordering &order, int &refPos);
        /**
         * Preconditions:
         *  - 'nDrop' < n
         *  - for now, refPos should be zero.
         * Postconditions:
         *  - All keys before the nDrop index key are dropped.
         *  - The bucket is packed.
         */
        void dropFront(int nDrop, const Ordering &order, int &refPos);
        /**
         * Preconditions: 0 <= keypos < n
         * Postconditions: keypos indexed key is marked unused.
         */
        void markUnused(int keypos);

        /**
         * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
         * we use tempNext() when we do that to be less confusing. (one might have written a union in C)
         */
        const DiskLoc& tempNext() const { return this->parent; }
        DiskLoc& tempNext() { return this->parent; }

        void _shape(int level, stringstream&) const;
        int Size() const;

        /** @return i-indexed _KeyNode, without bounds checking */
    public:
        const _KeyNode& k(int i) const { return ((const _KeyNode*)this->data)[i]; }
        _KeyNode& _k(int i) { return ((_KeyNode*)this->data)[i]; }
    protected:
        _KeyNode& k(int i) { return ((_KeyNode*)this->data)[i]; }

        /**
         * Preconditions: 'this' is packed
         * @return the key index to be promoted on split
         * @param keypos The requested index of a key to insert, which may affect
         *  the choice of split position.
         */
        int splitPos( int keypos ) const;

        /**
         * Preconditions: nAdd * sizeof( _KeyNode ) <= emptySize
         * Postconditions:
         *  - Increases indexes of existing _KeyNode objects by nAdd, reserving
         *    space for additional _KeyNode objects at front.
         *  - Does not initialize ofs values for the bson data of these
         *    _KeyNode objects.
         */
        void reserveKeysFront( int nAdd );

        /**
         * Preconditions:
         *  - 0 <= i < n
         *  - The bson 'key' must fit in the bucket without packing.
         *  - If 'key' and 'prevChildBucket' are set at index i, the btree
         *    ordering properties will be maintained.
         * Postconditions:
         *  - The specified key is set at index i, replacing the existing
         *    _KeyNode data and without shifting any other _KeyNode objects.
         */
        void setKey( int i, const DiskLoc recordLoc, const Key& key, const DiskLoc prevChildBucket );
    };

    /**
     * This class adds functionality for manipulating buckets that are assembled
     * in a tree. The requirements for const and non const functions and
     * arguments are generally the same as in BucketBasics. Because this class
     * deals with tree structure, some functions that are marked const may
     * trigger modification of another node in the btree or potentially of the
     * current node. In such cases, the function's implementation explicitly
     * casts away const when indicating an intent to write to the durability
     * layer. The DiskLocs provided to such functions should be passed by
     * value if they shadow pointers within the btree.
     *
     * To clarify enforcement of referential integrity in this implementation,
     * we use the following pattern when deleting data we have a persistent
     * pointer to. The pointer is cleared or removed explicitly, then the data
     * it pointed to is cleaned up with a helper function.
     *
     * TODO It might make sense to put some of these functions in a class
     * representing a full btree instead of a single btree bucket. That would
     * allow us to use the const qualifier in a manner more consistent with
     * standard usage. Right now the interface is for both a node and a tree,
     * so assignment of const is sometimes nonideal.
     *
     * TODO There are several cases in which the 'this' pointer is invalidated
     * as a result of deallocation. A separate class representing a btree would
     * alleviate some fragile cases where the implementation must currently
     * behave correctly if the 'this' pointer is suddenly invalidated by a
     * callee.
     */
    template< class V >
    class BtreeBucket : public BucketBasics<V> {
        friend class BtreeCursor;
    public:
        // make compiler happy:
        typedef typename V::Key Key;
        typedef typename V::KeyOwned KeyOwned;
        typedef typename BucketBasics<V>::KeyNode KeyNode;
        const _KeyNode& k(int i) const { return static_cast< const BucketBasics<V> * >(this)->k(i); }
    protected:
        _KeyNode& k(int i) { return static_cast< BucketBasics<V> * >(this)->_k(i); }
    public:
        const KeyNode keyNode(int i) const { return static_cast< const BucketBasics<V> * >(this)->keyNode(i); }

        bool isHead() const { return this->parent.isNull(); }
        void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
        long long fullValidate(const DiskLoc& thisLoc, const BSONObj &order, long long *unusedCount = 0, bool strict = false) const; /* traverses everything */

        bool isUsed( int i ) const { return this->k(i).isUsed(); }
        string bucketSummary() const;
        void dump() const;

        /**
         * @return true if key exists in index
         *
         * @param order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
         *    BSONObj order = ((IndexDetails&)idx).keyPattern();
         *    likewise below in bt_insert() etc.
         */
    private:
        bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const Key& key, const Ordering& order) const;
    public:

        /**
         * @param self - Don't complain about ourself already being in the index case.
         * @return true = There is a duplicate used key.
         */
        bool wouldCreateDup(
            const IndexDetails& idx, const DiskLoc &thisLoc,
            const Key& key, const Ordering& order,
            const DiskLoc &self) const;

        /**
         * Preconditions: none
         * Postconditions: @return a new bucket allocated from pdfile storage
         * and init()-ed. This bucket is suitable for use as a new root
         * or any other new node in the tree.
         */
        static DiskLoc addBucket(const IndexDetails&);

        /**
         * Preconditions: none
         * Postconditions:
         *  - Some header values in this bucket are cleared, and the bucket is
         *    deallocated from pdfile storage.
         *  - The memory at thisLoc is invalidated, and 'this' is invalidated.
         */
        void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);

        /**
         * Preconditions:
         *  - 'key' has a valid schema for this index.
         *  - All other parameters are valid and consistent with this index if applicable.
         * Postconditions:
         *  - If key is bigger than KeyMax, @return 2 or 3 and no change.
         *  - If key / recordLoc exist in the btree as an unused key, set them
         *    as used and @return 0
         *  - If key / recordLoc exist in the btree as a used key, @throw
         *    exception 10287 and no change.
         *  - If key / recordLoc do not exist in the btree, they are inserted
         *    and @return 0. The root of the btree may be changed, so
         *    'this'/thisLoc may no longer be the root upon return.
         */
        int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
                      const BSONObj& key, const Ordering &order, bool dupsAllowed,
                      IndexDetails& idx, bool toplevel = true) const;
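
        /*
         * Illustrative sketch (not part of the original header): inserting one
         * key into an index, assuming an IndexDetails 'idx' whose root bucket
         * is idx.head and a document located at 'recordLoc'. Only bt_insert(),
         * keyPattern(), btree<V>() and Ordering::make() come from this
         * codebase; the surrounding names are hypothetical.
         *
         *     BSONObj indexKey = ...;   // projection of the document into the index schema
         *     Ordering order = Ordering::make( idx.keyPattern() );
         *     int rc = idx.head.btree<V1>()->bt_insert( idx.head, recordLoc,
         *                                               indexKey, order,
         *                                               true, idx );  // dupsAllowed = true
         *     // rc == 0 on success; oversized keys return 2 or 3 (see above)
         */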

        /**
         * Preconditions:
         *  - 'key' has a valid schema for this index, and may have objsize() > KeyMax.
         * Postconditions:
         *  - If key / recordLoc are in the btree, they are removed (possibly
         *    by being marked as an unused key), @return true, and potentially
         *    invalidate 'this' / thisLoc and change the head.
         *  - If key / recordLoc are not in the btree, @return false and do nothing.
         */
        bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;

        /**
         * locate may return an "unused" key that is just a marker. so be careful.
         * looks for a key:recordloc pair.
         *
         * @param found - returns true if exact match found. note you can get back a position
         * result even if found is false.
         */
        DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
                       int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
        DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const Key& key, const Ordering &order,
                       int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
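
        /*
         * Illustrative sketch (not part of the original header): locating an
         * exact key/recordLoc pair starting from the head bucket. All names
         * other than locate() and the members documented above are
         * hypothetical.
         *
         *     int pos;
         *     bool found;
         *     DiskLoc bucketLoc = head.btree<V1>()->locate( idx, head, indexKey, order,
         *                                                   pos, found, recordLoc );
         *     if ( found ) {
         *         // bucketLoc / pos identify the matching key; note that it may
         *         // still be flagged unused (see the comment above).
         *     }
         */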

        /**
         * find the first instance of the key
         * does not handle dups
         * WARNING: findSingle may not be compound index safe. this may need to change. see notes in
         * findSingle code.
         * @return the record location of the first match
         */
        DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;

        /**
         * Advance to next or previous key in the index.
         * @param direction to advance.
         */
        DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;

        /** Advance in specified direction to the specified key */
        void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;

        /** Locate a key with fields comprised of a combination of keyBegin fields and keyEnd fields. */
        void customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const;

        /** @return head of the btree by traversing from current bucket. */
        const DiskLoc getHead(const DiskLoc& thisLoc) const;

        /** get tree shape */
        void shape(stringstream&) const;

        static void a_test(IndexDetails&);

        static int getKeyMax();

    protected:
        /**
         * Preconditions:
         *  - 0 <= firstIndex <= n
         *  - -1 <= lastIndex <= n ( -1 is equivalent to n )
         * Postconditions:
         *  - Any children at indexes firstIndex through lastIndex (inclusive)
         *    will have their parent pointers set to thisLoc.
         */
        void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;

        /**
         * Preconditions:
         *  - thisLoc is not the btree head.
         *  - n == 0 is ok
         * Postconditions:
         *  - All cursors pointing to this bucket will be updated.
         *  - This bucket's parent's child pointer is set to null.
         *  - This bucket is deallocated from pdfile storage.
         *  - 'this' and thisLoc are invalidated.
         */
        void delBucket(const DiskLoc thisLoc, const IndexDetails&);

        /**
         * Preconditions: 0 <= p < n
         * Postconditions:
         *  - The key at index p is removed from the btree.
         *  - 'this' and thisLoc may be invalidated.
         *  - The tree head may change.
         */
        void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);

        /**
         * Preconditions:
         *  - n == 0 is ok
         * Postconditions:
         *  - If thisLoc is head, or if its body has at least lowWaterMark bytes,
         *    return false and do nothing.
         *  - Otherwise, if thisLoc has left or right neighbors, either balance
         *    or merge with them and return true. Also, 'this' and thisLoc may
         *    be invalidated and the tree head may change.
         */
        bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;

        /**
         * Preconditions:
         *  - 0 <= leftIndex < n
         *  - The child at leftIndex or the child at leftIndex + 1 contains
         *    fewer than lowWaterMark bytes.
         * Postconditions:
         *  - If the child bucket at leftIndex can merge with the child index
         *    at leftIndex + 1, do nothing and return false.
         *  - Otherwise, balance keys between the leftIndex child and the
         *    leftIndex + 1 child, return true, and possibly change the tree head.
         */
        bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;

        /**
         * Preconditions:
         *  - All preconditions of tryBalanceChildren.
         *  - The leftIndex child and leftIndex + 1 child cannot be merged.
         * Postconditions:
         *  - Keys are moved between the leftIndex child and the leftIndex + 1
         *    child such that neither child has fewer than lowWaterMark bytes.
         *    The tree head may change.
         */
        void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );

        /**
         * Preconditions:
         *  - All preconditions of doBalanceChildren
         *  - The leftIndex and leftIndex + 1 children are packed.
         *  - The leftIndex + 1 child has fewer than lowWaterMark bytes.
         *  - split returned by rebalancedSeparatorPos()
         * Postconditions:
         *  - The key in lchild at index split is set as thisLoc's key at index
         *    leftIndex, which may trigger a split and change the tree head.
         *    The previous key in thisLoc at index leftIndex and all keys with
         *    indexes greater than split in lchild are moved to rchild.
         */
        void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
                                   BtreeBucket<V> *l, const DiskLoc lchild,
                                   BtreeBucket<V> *r, const DiskLoc rchild,
                                   IndexDetails &id, const Ordering &order );
        /**
         * Preconditions:
         *  - All preconditions of doBalanceChildren
         *  - The leftIndex and leftIndex + 1 children are packed.
         *  - The leftIndex child has fewer than lowWaterMark bytes.
         *  - split returned by rebalancedSeparatorPos()
         * Postconditions:
         *  - The key in rchild at index split - l->n - 1 is set as thisLoc's key
         *    at index leftIndex, which may trigger a split and change the tree
         *    head. The previous key in thisLoc at index leftIndex and all keys
         *    with indexes less than split - l->n - 1 in rchild are moved to
         *    lchild.
         */
        void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
                                   BtreeBucket<V> *l, const DiskLoc lchild,
                                   BtreeBucket<V> *r, const DiskLoc rchild,
                                   IndexDetails &id, const Ordering &order );

        /**
         * Preconditions:
         *  - 0 <= leftIndex < n
         *  - this->canMergeChildren( thisLoc, leftIndex ) == true
         * Postconditions:
         *  - All of the above mentioned keys will be placed in the left child.
         *  - The tree may be updated recursively, resulting in 'this' and
         *    thisLoc being invalidated and the tree head being changed.
         */
        void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);

        /**
         * Preconditions:
         *  - n == 0
         *  - !nextChild.isNull()
         * Postconditions:
         *  - 'this' and thisLoc are deallocated (and invalidated), any cursors
         *    to them are updated, and the tree head may change.
         *  - nextChild replaces thisLoc in the btree structure.
         */
        void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );

        /**
         * @return true iff the leftIndex and leftIndex + 1 children both exist,
         * and if their body sizes when packed and the thisLoc key at leftIndex
         * would fit in a single bucket body.
         */
        bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;

        /**
         * Preconditions:
         *  - leftIndex and leftIndex + 1 children are packed
         *  - leftIndex or leftIndex + 1 child is below lowWaterMark
         * @return index of the rebalanced separator; the index value is
         * determined as if we had a bucket with body
         * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
         * and called splitPos( 0 ) on it.
         */
        int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;

        /**
         * Preconditions: thisLoc has a parent
         * @return parent's index of thisLoc.
         */
        int indexInParent( const DiskLoc &thisLoc ) const;

        Key keyAt(int keyOfs) const {
            return keyOfs >= this->n ? Key() : this->keyNode(keyOfs).key;
        }

        /**
         * Allocate a temporary btree bucket in ram rather than in memory mapped
         * storage. The caller must release this bucket with free().
         */
        static BtreeBucket<V> * allocTemp();

        /**
         * Preconditions:
         *  - This bucket is packed.
         *  - Cannot add a key of size KeyMax to this bucket.
         *  - 0 <= keypos <= n is the position of a new key that will be inserted
         *  - lchild is equal to the existing child at index keypos.
         * Postconditions:
         *  - The thisLoc bucket is split into two packed buckets, possibly
         *    invalidating the initial position of keypos, with a split key
         *    promoted to the parent. The new key key/recordLoc will be inserted
         *    into one of the split buckets, and lchild/rchild set appropriately.
         *    Splitting may occur recursively, possibly changing the tree head.
         */
        void split(const DiskLoc thisLoc, int keypos,
                   const DiskLoc recordLoc, const Key& key,
                   const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);

        /**
         * Preconditions:
         *  - 0 <= keypos <= n
         *  - If key / recordLoc are inserted at position keypos, with provided
         *    lchild and rchild, the btree ordering requirements will be
         *    maintained.
         *  - lchild is equal to the existing child at index keypos.
         *  - n == 0 is ok.
         * Postconditions:
         *  - The key / recordLoc are inserted at position keypos, and the
         *    bucket is split if necessary, which may change the tree head.
         *  - The bucket may be packed or split, invalidating the specified value
         *    of keypos.
         * This function will always modify thisLoc, but it's marked const because
         * it commonly relies on the specialized write intent mechanism of basicInsert().
         */
        void insertHere(const DiskLoc thisLoc, int keypos,
                        const DiskLoc recordLoc, const Key& key, const Ordering &order,
                        const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;

        /** bt_insert() is basically just a wrapper around this. */
        int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
                    const Key& key, const Ordering &order, bool dupsAllowed,
                    const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;

        bool find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
        bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const;
        static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
        static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );

        /** If child is non null, set its parent to thisLoc */
        static void fix(const DiskLoc thisLoc, const DiskLoc child);

        /**
         * Preconditions:
         *  - 0 <= keypos < n
         *  - If the specified key and recordLoc are placed in keypos of thisLoc,
         *    and lchild and rchild are set, the btree ordering properties will
         *    be maintained.
         *  - rchild == childForPos( keypos + 1 )
         *  - childForPos( keypos ) is referenced elsewhere if nonnull.
         * Postconditions:
         *  - The key at keypos will be replaced with the specified key and
         *    lchild, potentially splitting this bucket and changing the tree
         *    head.
         *  - childForPos( keypos ) will be orphaned.
         */
        void setInternalKey( const DiskLoc thisLoc, int keypos,
                             const DiskLoc recordLoc, const Key &key, const Ordering &order,
                             const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);

        /**
         * Preconditions:
         *  - 0 <= keypos < n
         *  - The keypos or keypos+1 indexed child is non null.
         * Postconditions:
         *  - The specified key is deleted by replacing it with another key if
         *    possible. This replacement may cause a split and change the tree
         *    head. The replacement key will be deleted from its original
         *    location, potentially causing merges and splits that may invalidate
         *    'this' and thisLoc and change the tree head.
         *  - If the key cannot be replaced, it will be marked as unused. This
         *    is only expected in legacy btrees.
         */
        void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
    public:
        /** simply builds and returns a dup key error message string */
        static string dupKeyError( const IndexDetails& idx , const Key& key );
    };
#pragma pack()

    class FieldRangeVector;
    class FieldRangeVectorIterator;

    class BtreeCursor : public Cursor {
    protected:
        BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
        BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
    public:
        virtual ~BtreeCursor();
        /** makes an appropriate subclass depending on the index version */
        static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
        static BtreeCursor* make( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );

        virtual bool ok() { return !bucket.isNull(); }
        virtual bool advance();
        virtual void noteLocation(); // updates keyAtKeyOfs...
        virtual void checkLocation();
        virtual bool supportGetMore() { return true; }
        virtual bool supportYields() { return true; }

        /**
         * used for multikey index traversal to avoid sending back dups. see Matcher::matches().
         * if a multikey index traversal:
         *   if loc has already been sent, returns true.
         *   otherwise, marks loc as sent.
         * @return false if the loc has not been seen
         */
        virtual bool getsetdup(DiskLoc loc) {
            if( _multikey ) {
                pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
                return !p.second;
            }
            return false;
        }
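
        /*
         * Illustrative sketch (not part of the original header): a typical
         * scan loop over this cursor, using getsetdup() to suppress documents
         * that a multikey index would otherwise return more than once. The
         * process() helper is hypothetical.
         *
         *     shared_ptr<BtreeCursor> c( BtreeCursor::make( d, idxNo, id,
         *                                                   startKey, endKey,
         *                                                   true, 1 ) );
         *     while( c->ok() ) {
         *         if( !c->getsetdup( c->currLoc() ) )
         *             process( c->current() );
         *         c->advance();
         *     }
         */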

        virtual bool modifiedKeys() const { return _multikey; }
        virtual bool isMultiKey() const { return _multikey; }

        const _KeyNode& _currKeyNode() const {
            assert( !bucket.isNull() );
            const _KeyNode& kn = keyNode(keyOfs);
            assert( kn.isUsed() );
            return kn;
        }

        /** returns BSONObj() if ofs is out of range */
        virtual BSONObj keyAt(int ofs) const = 0;

        virtual BSONObj currKey() const = 0;
        virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }

        virtual void aboutToDeleteBucket(const DiskLoc& b) {
            if ( bucket == b )
                keyOfs = -1;
        }

        virtual DiskLoc currLoc() = 0; // { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
        virtual DiskLoc refLoc() { return currLoc(); }
        virtual Record* _current() { return currLoc().rec(); }
        virtual BSONObj current() { return BSONObj(_current()); }
        virtual string toString();

        BSONObj prettyKey( const BSONObj &key ) const {
            return key.replaceFieldNames( indexDetails.keyPattern() ).clientReadable();
        }

        virtual BSONObj prettyIndexBounds() const;

        void forgetEndKey() { endKey = BSONObj(); }

        virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
        virtual shared_ptr< CoveredIndexMatcher > matcherPtr() const { return _matcher; }

        virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }

        virtual long long nscanned() { return _nscanned; }

        /** for debugging only */
        const DiskLoc getBucket() const { return bucket; }

    protected:
        /**
         * Our btrees may (rarely) have "unused" keys when items are deleted.
         * Skip past them.
         */
        bool skipUnusedKeys( bool mayJump );
        bool skipOutOfRangeKeysAndCheckEnd();
        void skipAndCheck();
        void checkEnd();

        /** selective audits on construction */
        void audit();

        virtual void _audit() = 0;
        virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0;
        virtual const _KeyNode& keyNode(int keyOfs) const = 0;
        virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
        virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0;

        /** set initial bucket */
        void init();

        /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
        void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );

        set<DiskLoc> _dups;
        NamespaceDetails * const d;
        const int idxNo;
        BSONObj startKey;
        BSONObj endKey;
        bool _endKeyInclusive;
        bool _multikey; // this must be updated every getmore batch in case someone added a multikey
        const IndexDetails& indexDetails;
        const BSONObj _order;
        const Ordering _ordering;
        DiskLoc bucket;
        int keyOfs;
        const int _direction; // 1=fwd,-1=reverse
        BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
        DiskLoc locAtKeyOfs;
        const shared_ptr< FieldRangeVector > _bounds;
        auto_ptr< FieldRangeVectorIterator > _boundsIterator;
        const IndexSpec& _spec;
        shared_ptr< CoveredIndexMatcher > _matcher;
        bool _independentFieldRanges;
        long long _nscanned;
    };

    /** Renames the index namespace for this btree's index. */
    void renameIndexNamespace(const char *oldNs, const char *newNs);

    /**
     * give us a writable version of the btree bucket (declares write intent).
     * note it is likely more efficient to declare write intent on something smaller when you can.
     */
    template< class V >
    BtreeBucket<V> * DiskLoc::btreemod() const {
        assert( _a != -1 );
        BtreeBucket<V> *b = const_cast< BtreeBucket<V> * >( btree<V>() );
        return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) );
    }
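
    /*
     * Illustrative sketch (not part of the original header): the intended
     * pattern is to work with the read-only mapping and only declare write
     * intent on a bucket when it will actually be modified. The DiskLoc
     * 'bucketLoc' and the predicate are hypothetical.
     *
     *     const BtreeBucket<V1> *ro = bucketLoc.btree<V1>();   // read-only view
     *     if ( needToModify( ro ) ) {
     *         BtreeBucket<V1> *w = bucketLoc.btreemod<V1>();   // declares write intent via getDur()
     *         // ... mutate *w; the durability layer journals the whole bucket
     *     }
     */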

} // namespace mongo;