56 bit disklocs in btree v1

2024-12-01 09:32:32 +01:00 · 2011-06-20 10:17:42 -04:00 · 2011-06-20 10:17:42 -04:00 · 8e1b5c26ab
commit 8e1b5c26ab
parent e95cab5429
10 changed files with 286 additions and 161 deletions
--- a/db/btree.cpp
+++ b/db/btree.cpp
@ -28,6 +28,7 @@
 #include "stats/counters.h"
 #include "dur_commitjob.h"
 #include "btreebuilder.h"
+#include "../util/unittest.h"

 namespace mongo {

@ -36,16 +37,11 @@ namespace mongo {

 #define VERIFYTHISLOC dassert( thisLoc.btree<V>() == this );

-    _KeyNode& _KeyNode::writing() const {
-        return *getDur().writing( const_cast< _KeyNode* >( this ) );
+    template< class Loc >
+    __KeyNode<Loc> & __KeyNode<Loc>::writing() const {
+        return *getDur().writing( const_cast< __KeyNode<Loc> * >( this ) );
    }

-    template< class V >
-    BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) :
-        prevChildBucket(k.prevChildBucket),
-        recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
-    { }
-
    // BucketBasics::lowWaterMark()
    //
    // We define this value as the maximum number of bytes such that, if we have
@ -227,9 +223,9 @@ namespace mongo {
                }
                else if ( z == 0 ) {
                    if ( !(k(i).recordLoc < k(i+1).recordLoc) ) {
-                        out() << "ERROR: btree key order corrupt (recordloc's wrong).  Keys:" << endl;
-                        out() << " k(" << i << "):" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl;
-                        out() << " k(" << i+1 << "):" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl;
+                        out() << "ERROR: btree key order corrupt (recordloc's wrong):" << endl;
+                        out() << " k(" << i << ")" << keyNode(i).key.toString() << " RL:" << k(i).recordLoc.toString() << endl;
+                        out() << " k(" << i+1 << ")" << keyNode(i+1).key.toString() << " RL:" << k(i+1).recordLoc.toString() << endl;
                        wassert( k(i).recordLoc < k(i+1).recordLoc );
                    }
                }
@ -734,8 +730,10 @@ namespace mongo {
     * note result might be an Unused location!
     */
    template< class V >
-    bool BtreeBucket<V>::find(const IndexDetails& idx, const Key& key, const DiskLoc &recordLoc, 
+    bool BtreeBucket<V>::find(const IndexDetails& idx, const Key& key, const DiskLoc &rl, 
 			      const Ordering &order, int& pos, bool assertIfDup) const {
+        Loc recordLoc;
+        recordLoc = rl;
        globalIndexCounters.btree( (char*)this );

        // binary search for this key
@ -744,7 +742,7 @@ namespace mongo {
        int h=this->n-1;
        while ( l <= h ) {
            int m = (l+h)/2;
-	    KeyNode M = this->keyNode(m);
+            KeyNode M = this->keyNode(m);
            int x = key.woCompare(M.key, order);
            if ( x == 0 ) {
                if( assertIfDup ) {
@ -770,7 +768,7 @@ namespace mongo {
                }

                // dup keys allowed.  use recordLoc as if it is part of the key
-                DiskLoc unusedRL = M.recordLoc;
+                Loc unusedRL = M.recordLoc;
                unusedRL.GETOFS() &= ~1; // so we can test equality without the used bit messing us up
                x = recordLoc.compare(unusedRL);
            }
@ -914,8 +912,10 @@ namespace mongo {
        advanceLoc.btreemod<V>()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order );
    }

-#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>())
-#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>())
+//#define BTREE(loc) (static_cast<DiskLoc>(loc).btree<V>())
+#define BTREE(loc) (loc.btree<V>())
+//#define BTREEMOD(loc) (static_cast<DiskLoc>(loc).btreemod<V>())
+#define BTREEMOD(loc) (loc.btreemod<V>())

    template< class V >
    void BtreeBucket<V>::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) {
@ -929,7 +929,9 @@ namespace mongo {
            ll.btree<V>()->childForPos( indexInParent( thisLoc ) ).writing() = this->nextChild;
        }
        BTREE(this->nextChild)->parent.writing() = this->parent;
-        (static_cast<DiskLoc>(this->nextChild).btree<V>())->parent.writing() = this->parent;
+
+        (this->nextChild.btree<V>())->parent.writing() = this->parent;
+        //(static_cast<DiskLoc>(this->nextChild).btree<V>())->parent.writing() = this->parent;
        ClientCursor::informAboutToDeleteBucket( thisLoc );
        deallocBucket( thisLoc, id );
    }
@ -1214,15 +1216,13 @@ namespace mongo {
    bool BtreeBucket<V>::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const {
        int pos;
        bool found;
-        DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1);
+        const Ordering ord = Ordering::make(id.keyPattern());
+        DiskLoc loc = locate(id, thisLoc, key, ord, pos, found, recordLoc, 1);
        if ( found ) {
-
            if ( key.objsize() > KeyMax ) {
                OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl;
-            }
-            
-            loc.btreemod<V>()->delKeyAtPos(loc, id, pos, Ordering::make(id.keyPattern()));
-            
+            }            
+            loc.btreemod<V>()->delKeyAtPos(loc, id, pos, ord);            
            return true;
        }
        return false;
@ -1239,7 +1239,7 @@ namespace mongo {
    inline void BtreeBucket<V>::fix(const DiskLoc thisLoc, const DiskLoc child) {
        if ( !child.isNull() ) {
            if ( insert_debug )
-                out() << "      " << child.toString() << ".parent=" << thisLoc.toString() << endl;
+                out() << "     fix " << child.toString() << ".parent=" << thisLoc.toString() << endl;
            child.btree<V>()->parent.writing() = thisLoc;
        }
    }
@ -1335,8 +1335,8 @@ namespace mongo {
                    dump();
                    assert(false);
                }
-                const DiskLoc *pc = &k(keypos+1).prevChildBucket;
-                *getDur().alreadyDeclared((DiskLoc*) pc) = rchild; // declared in basicInsert()
+                const Loc *pc = &k(keypos+1).prevChildBucket;
+                *getDur().alreadyDeclared( const_cast<Loc*>(pc) ) = rchild; // declared in basicInsert()
                if ( !rchild.isNull() )
                    rchild.btree<V>()->parent.writing() = thisLoc;
            }
@ -1513,7 +1513,7 @@ namespace mongo {
        DiskLoc child = this->childForPos(p);

        if ( !child.isNull() ) {
-	  DiskLoc l = BTREE(child)->locate(idx, child, key, order, pos, found, recordLoc, direction);
+            DiskLoc l = BTREE(child)->locate(idx, child, key, order, pos, found, recordLoc, direction);
            if ( !l.isNull() )
                return l;
        }
@ -1716,7 +1716,8 @@ namespace mongo {
        }

        DEBUGGING out() << "TEMP: key: " << key.toString() << endl;
-        DiskLoc child = this->childForPos(pos);
+        Loc ch = this->childForPos(pos);
+        DiskLoc child = ch;
        if ( insert_debug )
            out() << "    getChild(" << pos << "): " << child.toString() << endl;
        // In current usage, rChild isNull() for a this->new key and false when we are
@ -1841,20 +1842,17 @@ namespace mongo {

        b->dumpTree(id.head, orderObj);

-        /*        b->bt_insert(id.head, B, key, order, false, id);
+        /* b->bt_insert(id.head, B, key, order, false, id);
        b->k(1).setUnused();
-
        b->dumpTree(id.head, order);
-
        b->bt_insert(id.head, A, key, order, false, id);
-
        b->dumpTree(id.head, order);
        */

        // this should assert.  does it? (it might "accidentally" though, not asserting proves a problem, asserting proves nothing)
        b->bt_insert(id.head, C, key, order, false, id);

-//        b->dumpTree(id.head, order);
+        // b->dumpTree(id.head, order);
    }

    template class BucketBasics<V0>;
@ -1862,4 +1860,27 @@ namespace mongo {
    template class BtreeBucket<V0>;
    template class BtreeBucket<V1>;

+    struct BTUnitTest : public UnitTest {
+        void run() {
+            DiskLoc big(0xf12312, 0x70001234);
+            DiskLoc56Bit bigl;
+            {
+                bigl = big;
+                assert( big == bigl );
+                DiskLoc e = bigl;
+                assert( big == e );
+            }
+            {
+                DiskLoc d;
+                assert( d.isNull() );
+                DiskLoc56Bit l;
+                l = d;
+                assert( l.isNull() );
+                d = l;
+                assert( d.isNull() );
+                assert( l < bigl );
+            }
+        }
+    } btunittest;
+
 }
--- a/db/btree.h
+++ b/db/btree.h
@ -66,7 +66,7 @@ namespace mongo {
    const int OldBucketSize = 8192;

    // largest key size we allow.  note we very much need to support bigger keys (somehow) in the future.
-    const int KeyMax = OldBucketSize / 10;\
+    const int KeyMax = OldBucketSize / 10;

 #pragma pack(1)
    template< class Version > class BucketBasics;
@ -76,19 +76,19 @@ namespace mongo {
     * bucket.  It contains an offset pointer to the variable width bson
     * data component.  A _KeyNode may be 'unused', please see below.
     */
-    struct _KeyNode {
+    template< class Loc >
+    struct __KeyNode {
        /** Signals that we are writing this _KeyNode and casts away const */
-        _KeyNode& writing() const;
+        __KeyNode<Loc> & writing() const;
        /**
         * The 'left' child bucket of this key.  If this is the i-th key, it
         * points to the i index child bucket.
         */
-        DiskLoc prevChildBucket;
+        Loc prevChildBucket;
        /** The location of the record associated with this key. */
-        DiskLoc recordLoc;
-        short keyDataOfs() const {
-            return (short) _kdo;
-        }
+        Loc recordLoc;
+        short keyDataOfs() const { return (short) _kdo; }
+
        /** Offset within current bucket of the variable width bson key for this _KeyNode. */
        unsigned short _kdo;
        void setKeyDataOfs(short s) {
@ -175,17 +175,96 @@ namespace mongo {
        char data[4];

    public:
+        typedef __KeyNode<DiskLoc> _KeyNode;
+        typedef DiskLoc Loc;
        typedef KeyBson Key;
        typedef KeyBson KeyOwned;
        enum { BucketSize = 8192 };
    };

+    // 7 bytes. as a 56 bit number (see Z(); little endian assumed), high to low: a a a ofs ofs ofs ofs
+    class DiskLoc56Bit {
+        int ofs;
+        unsigned char _a[3];
+        unsigned long long Z() const { 
+            // endian
+            return *((unsigned long long*)this) & 0x00ffffffffffffffULL;
+        }
+        enum { 
+            // the low bit of the offset is used as a flag in _KeyNode, so we don't use -1 here.
+            OurNullOfs = -2
+        };
+    public:
+        template< class V >
+        const BtreeBucket<V> * btree() const { 
+            return DiskLoc(*this).btree<V>();
+        }
+        template< class V >
+        BtreeBucket<V> * btreemod() const { 
+            return DiskLoc(*this).btreemod<V>();
+        }
+        operator DiskLoc() const { 
+            // endian
+            if( isNull() ) return DiskLoc();
+            unsigned a = *((unsigned *) (_a-1));
+            return DiskLoc(a >> 8, ofs);
+        }
+        int& GETOFS()      { return ofs; }
+        int getOfs() const { return ofs; }
+        bool operator<(const DiskLoc56Bit& rhs) const {
+            // the ordering of dup keys in btrees isn't too critical, but we'd like to put items that are
+            // close together on disk close together in the tree, so we do want the file # to be the most significant
+            // bytes
+            return Z() < rhs.Z();
+        }
+        int compare(const DiskLoc56Bit& rhs) const {
+            unsigned long long a = Z();
+            unsigned long long b = rhs.Z();
+            if( a < b ) return -1;
+            return a == b ? 0 : 1;
+        }
+        bool operator==(const DiskLoc56Bit& rhs) const { return Z() == rhs.Z(); }
+        bool operator!=(const DiskLoc56Bit& rhs) const { return Z() != rhs.Z(); }
+        bool operator==(const DiskLoc& rhs) const {
+            return DiskLoc(*this) == rhs;
+        }
+        bool operator!=(const DiskLoc& rhs) const { return !(*this==rhs); }
+        bool isNull() const { return ofs < 0; }
+        void Null() { 
+            ofs = OurNullOfs; 
+            _a[0] = _a[1] = _a[2] = 0;
+        }
+        string toString() const { return DiskLoc(*this).toString(); }
+        void operator=(const DiskLoc& loc) {
+            ofs = loc.getOfs();
+            int la = loc.a();
+            assert( la <= 0xffffff ); // must fit in 3 bytes
+            if( la < 0 ) {
+                assert( la == -1 );
+                la = 0;
+                ofs = OurNullOfs;
+            }
+            memcpy(_a, &la, 3); // endian
+            dassert( ofs != 0 );
+        }
+        DiskLoc56Bit& writing() const { 
+            return *((DiskLoc56Bit*) getDur().writingPtr((void*)this, 7));
+        }
+    };
+
    class BtreeData_V1 {
+    public:
+        typedef DiskLoc56Bit Loc;
+        //typedef DiskLoc Loc;
+        typedef __KeyNode<Loc> _KeyNode;
+        typedef KeyV1 Key;
+        typedef KeyV1Owned KeyOwned;
+        enum { BucketSize = 8192-16 }; // leave room for Record header
    protected:
        /** Parent bucket of this bucket, which isNull() for the root bucket. */
-        DiskLoc parent;
+        Loc parent;
        /** Given that there are n keys, this is the n index child. */
-        DiskLoc nextChild;
+        Loc nextChild;

        unsigned short flags;

@ -202,11 +281,6 @@ namespace mongo {
        char data[4];

        void _init() { }
-
-    public:
-        typedef KeyV1 Key;
-        typedef KeyV1Owned KeyOwned;
-        enum { BucketSize = 8192-16 }; // leave room for Record header
    };

    typedef BtreeData_V0 V0;
@ -244,6 +318,8 @@ namespace mongo {
    public:
        template <class U> friend class BtreeBuilder;
        typedef typename Version::Key Key;
+        typedef typename Version::_KeyNode _KeyNode;
+        typedef typename Version::Loc Loc;

        int getN() const { return this->n; }

@ -256,8 +332,8 @@ namespace mongo {
        class KeyNode {
        public:
            KeyNode(const BucketBasics<Version>& bb, const _KeyNode &k);
-            const DiskLoc& prevChildBucket;
-            const DiskLoc& recordLoc;
+            const Loc& prevChildBucket;
+            const Loc& recordLoc;
            /* Points to the bson key storage for a _KeyNode */
            Key key;
        };
@ -375,8 +451,8 @@ namespace mongo {
        enum Flags { Packed=1 };

        /** n == 0 is ok */
-        const DiskLoc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
-        DiskLoc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
+        const Loc& childForPos(int p) const { return p == this->n ? this->nextChild : k(p).prevChildBucket; }
+        Loc& childForPos(int p) { return p == this->n ? this->nextChild : k(p).prevChildBucket; }

        /** Same as bodySize(). */
        int totalDataSize() const;
@ -450,8 +526,8 @@ namespace mongo {
         * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
         *   we use tempNext() when we do that to be less confusing. (one might have written a union in C)
         */
-        const DiskLoc& tempNext() const { return this->parent; }
-        DiskLoc& tempNext() { return this->parent; }
+        DiskLoc tempNext() const { return this->parent; }
+        void setTempNext(DiskLoc l) { this->parent = l; }

        void _shape(int level, stringstream&) const;
        int Size() const;
@ -915,7 +991,7 @@ namespace mongo {
        virtual bool ok() { return !bucket.isNull(); }
        virtual bool advance();
        virtual void noteLocation(); // updates keyAtKeyOfs...
-        virtual void checkLocation();
+        virtual void checkLocation() = 0;
        virtual bool supportGetMore() { return true; }
        virtual bool supportYields() { return true; }

@ -937,12 +1013,12 @@ namespace mongo {
        virtual bool modifiedKeys() const { return _multikey; }
        virtual bool isMultiKey() const { return _multikey; }

-        const _KeyNode& _currKeyNode() const {
+        /*const _KeyNode& _currKeyNode() const {
            assert( !bucket.isNull() );
            const _KeyNode& kn = keyNode(keyOfs);
            assert( kn.isUsed() );
            return kn;
-        }
+        }*/

        /** returns BSONObj() if ofs is out of range */
        virtual BSONObj keyAt(int ofs) const = 0;
@ -979,12 +1055,16 @@ namespace mongo {
        /** for debugging only */
        const DiskLoc getBucket() const { return bucket; }

+        // just for unit tests
+        virtual bool curKeyHasChild() = 0;
+
    protected:
        /**
         * Our btrees may (rarely) have "unused" keys when items are deleted.
         * Skip past them.
         */
-        bool skipUnusedKeys( bool mayJump );
+        virtual bool skipUnusedKeys( bool mayJump ) = 0;
+
        bool skipOutOfRangeKeysAndCheckEnd();
        void skipAndCheck();
        void checkEnd();
@ -994,7 +1074,6 @@ namespace mongo {

        virtual void _audit() = 0;
        virtual DiskLoc _locate(const BSONObj& key, const DiskLoc& loc) = 0;
-        virtual const _KeyNode& keyNode(int keyOfs) const = 0;
        virtual DiskLoc _advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) = 0;
        virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) = 0;

@ -1041,4 +1120,10 @@ namespace mongo {
        return static_cast< BtreeBucket<V>* >( getDur().writingPtr( b, V::BucketSize ) );
    }

+    template< class V >
+    BucketBasics<V>::KeyNode::KeyNode(const BucketBasics<V>& bb, const _KeyNode &k) :
+        prevChildBucket(k.prevChildBucket),
+        recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs())
+    { }
+
 } // namespace mongo;
--- a/db/btreebuilder.cpp
+++ b/db/btreebuilder.cpp
@ -48,7 +48,7 @@ namespace mongo {
    template<class V>
    void BtreeBuilder<V>::newBucket() {
        DiskLoc L = BtreeBucket<V>::addBucket(idx);
-        b->tempNext() = L;
+        b->setTempNext(L);
        cur = L;
        b = cur.btreemod<V>();
    }
@ -123,7 +123,7 @@ namespace mongo {
                if ( ! up->_pushBack(r, k, ordering, keepLoc) ) {
                    // current bucket full
                    DiskLoc n = BtreeBucket<V>::addBucket(idx);
-                    up->tempNext() = n;
+                    up->setTempNext(n);
                    upLoc = n;
                    up = upLoc.btreemod<V>();
                    up->pushBack(r, k, ordering, keepLoc);
--- a/db/btreecursor.cpp
+++ b/db/btreecursor.cpp
@ -32,6 +32,7 @@ namespace mongo {
    public:
        typedef typename BucketBasics<V>::KeyNode KeyNode;
        typedef typename V::Key Key;
+        typedef typename V::_KeyNode _KeyNode;

        BtreeCursorImpl(NamespaceDetails *a, int b, const IndexDetails& c, const BSONObj &d, const BSONObj &e, bool f, int g) : 
          BtreeCursor(a,b,c,d,e,f,g) { }
@ -60,6 +61,87 @@ namespace mongo {
            return bucket.btree<V>()->keyNode(keyOfs).key.toBson();
        }

+        virtual bool curKeyHasChild() { 
+            return !currKeyNode().prevChildBucket.isNull();
+        }
+
+        bool skipUnusedKeys( bool mayJump ) {
+            int u = 0;
+            while ( 1 ) {
+                if ( !ok() )
+                    break;
+                const _KeyNode& kn = keyNode(keyOfs);
+                if ( kn.isUsed() )
+                    break;
+                bucket = _advance(bucket, keyOfs, _direction, "skipUnusedKeys");
+                u++;
+                //don't include unused keys in nscanned
+                //++_nscanned;
+                if ( mayJump && ( u % 10 == 0 ) ) {
+                    skipOutOfRangeKeysAndCheckEnd();
+                }
+            }
+            if ( u > 10 )
+                OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
+            return u;
+        }
+
+        /* Since the last noteLocation(), our key may have moved around, and that old cached
+           information may thus be stale and wrong (although often it is right).  We check
+           that here; if we have moved, we have to search back for where we were at.
+
+           i.e., after operations on the index, the BtreeCursor's cached location info may
+           be invalid.  This function ensures validity, so you should call it before using
+           the cursor if other writers have used the database since the last noteLocation
+           call.
+        */
+        void checkLocation() {
+            if ( eof() )
+                return;
+
+            _multikey = d->isMultikey(idxNo);
+
+            if ( keyOfs >= 0 ) {
+                assert( !keyAtKeyOfs.isEmpty() );
+
+                // Note keyAt() returns an empty BSONObj if keyOfs is now out of range,
+                // which is possible as keys may have been deleted.
+                int x = 0;
+                while( 1 ) {
+                    //  if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
+                    //       b->k(keyOfs).recordLoc == locAtKeyOfs ) {
+                    if ( keyAt(keyOfs).shallowEqual(keyAtKeyOfs) ) {
+                        const _KeyNode& kn = keyNode(keyOfs);
+                        if( kn.recordLoc == locAtKeyOfs ) {
+                            if ( !kn.isUsed() ) {
+                                // we were deleted but still exist as an unused
+                                // marker key. advance.
+                                skipUnusedKeys( false );
+                            }
+                            return;
+                        }
+                    }
+
+                    // we check one key earlier too, in case a key was just deleted.  this is
+                    // important so that multi updates are reasonably fast.
+                    if( keyOfs == 0 || x++ )
+                        break;
+                    keyOfs--;
+                }
+            }
+
+            /* normally we don't get to here.  when we do, old position is no longer
+                valid and we must refind where we left off (which is expensive)
+            */
+
+            /* TODO: Switch to keep indexdetails and do idx.head! */
+            bucket = _locate(keyAtKeyOfs, locAtKeyOfs);
+            RARELY log() << "key seems to have moved in the index, refinding. " << bucket.toString() << endl;
+            if ( ! bucket.isNull() )
+                skipUnusedKeys( false );
+
+        }
+    
    protected:
        virtual void _advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) {
            thisLoc.btree<V>()->advanceTo(thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction);
@ -185,6 +267,10 @@ namespace mongo {
        if( v == 0 )
            return new BtreeCursorImpl<V0>(_d,_idxNo,_id,_bounds,_direction);
        uasserted(14801, str::stream() << "unsupported index version " << v);
+
+        // just check we are in sync with this method
+        dassert( IndexDetails::isASupportedIndexVersionNumber(v) );
+
        return 0;
    }

@ -282,28 +368,6 @@ namespace mongo {
        return true;
    }

-    /* skip unused keys. */
-    bool BtreeCursor::skipUnusedKeys( bool mayJump ) {
-        int u = 0;
-        while ( 1 ) {
-            if ( !ok() )
-                break;
-            const _KeyNode& kn = keyNode(keyOfs);
-            if ( kn.isUsed() )
-                break;
-            bucket = _advance(bucket, keyOfs, _direction, "skipUnusedKeys");
-            u++;
-            //don't include unused keys in nscanned
-            //++_nscanned;
-            if ( mayJump && ( u % 10 == 0 ) ) {
-                skipOutOfRangeKeysAndCheckEnd();
-            }
-        }
-        if ( u > 10 )
-            OCCASIONALLY log() << "btree unused skipped:" << u << '\n';
-        return u;
-    }
-
    // Return a value in the set {-1, 0, 1} to represent the sign of parameter i.
    int sgn( int i ) {
        if ( i == 0 )
@ -355,62 +419,6 @@ namespace mongo {
        }
    }

-    /* Since the last noteLocation(), our key may have moved around, and that old cached
-       information may thus be stale and wrong (although often it is right).  We check
-       that here; if we have moved, we have to search back for where we were at.
-
-       i.e., after operations on the index, the BtreeCursor's cached location info may
-       be invalid.  This function ensures validity, so you should call it before using
-       the cursor if other writers have used the database since the last noteLocation
-       call.
-    */
-    void BtreeCursor::checkLocation() {
-        if ( eof() )
-            return;
-
-        _multikey = d->isMultikey(idxNo);
-
-        if ( keyOfs >= 0 ) {
-            assert( !keyAtKeyOfs.isEmpty() );
-
-            // Note keyAt() returns an empty BSONObj if keyOfs is now out of range,
-            // which is possible as keys may have been deleted.
-            int x = 0;
-            while( 1 ) {
-                //  if ( b->keyAt(keyOfs).woEqual(keyAtKeyOfs) &&
-                //       b->k(keyOfs).recordLoc == locAtKeyOfs ) {
-                if ( keyAt(keyOfs).shallowEqual(keyAtKeyOfs) ) {
-                    const _KeyNode& kn = keyNode(keyOfs);
-                    if( kn.recordLoc == locAtKeyOfs ) {
-                        if ( !kn.isUsed() ) {
-                            // we were deleted but still exist as an unused
-                            // marker key. advance.
-                            skipUnusedKeys( false );
-                        }
-                        return;
-                    }
-                }
-
-                // we check one key earlier too, in case a key was just deleted.  this is
-                // important so that multi updates are reasonably fast.
-                if( keyOfs == 0 || x++ )
-                    break;
-                keyOfs--;
-            }
-        }
-
-        /* normally we don't get to here.  when we do, old position is no longer
-            valid and we must refind where we left off (which is expensive)
-        */
-
-        /* TODO: Switch to keep indexdetails and do idx.head! */
-        bucket = _locate(keyAtKeyOfs, locAtKeyOfs);
-        RARELY log() << "key seems to have moved in the index, refinding. " << bucket.toString() << endl;
-        if ( ! bucket.isNull() )
-            skipUnusedKeys( false );
-
-    }
-    
    string BtreeCursor::toString() {
        string s = string("BtreeCursor ") + indexDetails.indexName();
        if ( _direction < 0 ) s += " reverse";
--- a/db/diskloc.h
+++ b/db/diskloc.h
@ -39,17 +39,18 @@ namespace mongo {
        (such as adding a virtual function)
     */
    class DiskLoc {
-        int _a;     // this will be volume, file #, etc. but is a logical value could be anything depending on storage engine
+        int _a;     // this will be volume, file #, etsc. but is a logical value could be anything depending on storage engine
        int ofs;

    public:

        enum SentinelValues {
+            /* note NullOfs is different. todo clean up.  see refs to NullOfs in code - use is valid but outside DiskLoc context so confusing as-is. */
            NullOfs = -1,
            MaxFiles=16000 // thus a limit of about 32TB of data per db
        };

-        DiskLoc(int a, int b) : _a(a), ofs(b) { }
+        DiskLoc(int a, int Ofs) : _a(a), ofs(Ofs) { }
        DiskLoc() { Null(); }
        DiskLoc(const DiskLoc& l) {
            _a=l._a;
--- a/db/index.h
+++ b/db/index.h
@ -152,6 +152,7 @@ namespace mongo {
            return v;
        }

+        /** @return true if index has unique constraint */
        bool unique() const {
            BSONObj io = info.obj();
            return io["unique"].trueValue() ||
@ -159,13 +160,13 @@ namespace mongo {
                   isIdIndex();
        }

-        /* if set, when building index, if any duplicates, drop the duplicating object */
+        /** @return true if dropDups was set when building the index (if there are any duplicates, dropDups drops the duplicating objects) */
        bool dropDups() const {
            return info.obj().getBoolField( "dropDups" );
        }

-        /* delete this index.  does NOT clean up the system catalog
-           (system.indexes or system.namespaces) -- only NamespaceIndex.
+        /** delete this index.  does NOT clean up the system catalog
+            (system.indexes or system.namespaces) -- only NamespaceIndex.
        */
        void kill_idx();

@ -179,8 +180,11 @@ namespace mongo {
                    it may not mean we can build the index version in question: we may not maintain building 
                    of indexes in old formats in the future.
        */
-        static bool isASupportedIndexVersionNumber(int v) { return v == 0 || v == 1; }
+        static bool isASupportedIndexVersionNumber(int v) { return (v&1)==v; } // v == 0 || v == 1

+        /** @return the interface for this index, which varies with the index version.
+            used for backward compatibility of index versions/formats.
+        */
        IndexInterface& idxInterface() { 
            int v = version();
            dassert( isASupportedIndexVersionNumber(v) );
--- a/db/indexkey.h
+++ b/db/indexkey.h
@ -166,28 +166,21 @@ namespace mongo {
        void _getKeys( vector<const char*> fieldNames , vector<BSONElement> fixed , const BSONObj &obj, BSONObjSet &keys ) const;

        BSONSizeTracker _sizeTracker;
-
        vector<const char*> _fieldNames;
        vector<BSONElement> _fixed;
-
        BSONObj _nullKey; // a full key with all fields null
-
        BSONObj _nullObj; // only used for _nullElt
        BSONElement _nullElt; // jstNull
-
        int _nFields; // number of fields in the index
        bool _sparse; // if the index is sparse
-
        shared_ptr<IndexType> _indexType;
-
        const IndexDetails * _details;

        void _init();

+        friend class IndexType;
    public:
        bool _finishedInit;
-
-        friend class IndexType;
    };


--- a/dbtests/btreetests.inl
+++ b/dbtests/btreetests.inl
@ -1,3 +1,5 @@
+    typedef BtreeBucket::_KeyNode _KeyNode;
+ 
    const char* ns() {
        return "unittests.btreetests";
    }
@ -104,7 +106,7 @@
                d = b->getNextChild();
            }
            else {
-                d = const_cast< DiskLoc& >( b->keyNode( i ).prevChildBucket );
+                d = b->keyNode( i ).prevChildBucket;
            }
            assert( !d.isNull() );
            return d.btree();
@ -323,7 +325,7 @@
            end.appendMaxKey( "a" );
            auto_ptr< BtreeCursor > c( BtreeCursor::make( nsdetails( ns() ), 1, id(), start.done(), end.done(), false, 1 ) );
            while( c->ok() ) {
-                if ( !c->_currKeyNode().prevChildBucket.isNull() ) {
+                if ( c->curKeyHasChild() ) {
                    toDel.push_back( c->currKey().firstElement().valuestr() );
                }
                else {
@ -388,8 +390,9 @@
            }
            // too much work to try to make this happen through inserts and deletes
            // we are intentionally manipulating the btree bucket directly here
-            getDur().writingDiskLoc( const_cast< DiskLoc& >( bt()->keyNode( 1 ).prevChildBucket ) ) = DiskLoc();
-            getDur().writingInt( const_cast< DiskLoc& >( bt()->keyNode( 1 ).recordLoc ).GETOFS() ) |= 1; // make unused
+            BtreeBucket::Loc* L = const_cast< BtreeBucket::Loc* >( &bt()->keyNode( 1 ).prevChildBucket );
+            getDur().writing(L)->Null();
+            getDur().writingInt( const_cast< BtreeBucket::Loc& >( bt()->keyNode( 1 ).recordLoc ).GETOFS() ) |= 1; // make unused
            BSONObj k = BSON( "a" << toInsert );
            Base::insert( k );
        }
@ -820,13 +823,23 @@
            ArtificialTree::setTree( "{h:{e:{b:{a:null},c:null,d:null},_:{f:null}},_:{i:null}}", id() );
 //            dump();
            string ns = id().indexNamespace();
+            const BtreeBucket* b = bt();
            ASSERT_EQUALS( 8, bt()->fullValidate( dl(), order(), 0, true ) );
            ASSERT_EQUALS( 6, nsdetails( ns.c_str() )->stats.nrecords );

+//            cout << "---------------------" << endl;
+
+//            dump();
+
+//            cout << "---------------------" << endl;
+
            BSONObj k = BSON( "" << "c" );
            assert( unindex( k ) );
+
 //            dump();
-            ASSERT_EQUALS( 7, bt()->fullValidate( dl(), order(), 0, true ) );
+
+            long long keyCount = bt()->fullValidate( dl(), order(), 0, true );
+            ASSERT_EQUALS( 7, keyCount );
            ASSERT_EQUALS( 4, nsdetails( ns.c_str() )->stats.nrecords );
            // no recursion currently in this case
            ArtificialTree::checkStructure( "{h:{b:{a:null},d:null,e:null,f:null},_:{i:null}}", id() );
@ -1435,7 +1448,7 @@
        void run() {
            string ns = id().indexNamespace();
            ArtificialTree::setTree( "{a:null,c:{b:null},d:null}", id() );
-            getDur().writingInt( const_cast< DiskLoc& >( bt()->keyNode( 1 ).prevChildBucket.btree()->keyNode( 0 ).recordLoc ).GETOFS() ) |= 1; // make unused
+            getDur().writingInt( const_cast< BtreeBucket::Loc& >( bt()->keyNode( 1 ).prevChildBucket.btree()->keyNode( 0 ).recordLoc ).GETOFS() ) |= 1; // make unused
            long long unused = 0;
            ASSERT_EQUALS( 3, bt()->fullValidate( dl(), order(), &unused, true ) );
            ASSERT_EQUALS( 1, unused );
--- a/dbtests/dbtests.cpp
+++ b/dbtests/dbtests.cpp
@ -19,9 +19,11 @@

 #include "pch.h"
 #include "dbtests.h"
+#include "../util/unittest.h"

 int main( int argc, char** argv ) {
    static StaticObserver StaticObserver;
    doPreServerStatupInits();
+    UnitTest::runTests();
    return Suite::run(argc, argv, "/tmp/unittest");
 }
--- a/dbtests/framework.cpp
+++ b/dbtests/framework.cpp
@ -357,8 +357,6 @@ namespace mongo {

            Logstream::get().flush();

-            cout << "**************************************************" << endl;
-            cout << "**************************************************" << endl;
            cout << "**************************************************" << endl;

            int rc = 0;