mongodb/db/pdfile.cpp

// pdfile.cpp

/**
*    Copyright (C) 2008 10gen Inc.
*
*    This program is free software: you can redistribute it and/or  modify
*    it under the terms of the GNU Affero General Public License, version 3,
*    as published by the Free Software Foundation.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU Affero General Public License for more details.
*
*    You should have received a copy of the GNU Affero General Public License
*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/*
todo:
_ table scans must be sequential, not next/prev pointers
_ coalesce deleted
_ disallow system* manipulations from the database.
*/

#include "pch.h"
#include "pdfile.h"
#include "db.h"
#include "../util/mmap.h"
#include "../util/hashtab.h"
#include "../util/file_allocator.h"
#include "../util/processinfo.h"
#include "btree.h"
#include <algorithm>
#include <list>
#include "query.h"
#include "repl.h"
#include "dbhelpers.h"
#include "namespace-inl.h"
#include "queryutil.h"
#include "extsort.h"
#include "curop-inl.h"
#include "background.h"

namespace mongo {

    // True while a doingRepair object is alive; false otherwise.
    bool inDBRepair = false;

    /** Scope marker for a database repair.
        Constructing one asserts that no repair is already flagged and raises inDBRepair;
        destroying it lowers the flag again.
    */
    struct doingRepair {
        doingRepair() {
            // a second marker while one is alive is a programming error
            assert( ! inDBRepair );
            inDBRepair = true;
        }

        ~doingRepair() { inDBRepair = false; }
    };

    // Number of background operations currently registered per database name
    // (incremented by the BackgroundOperation constructor, decremented by its destructor).
    map<string, unsigned> BackgroundOperation::dbsInProg;
    // Full namespaces that currently have a background operation registered.
    set<string> BackgroundOperation::nsInProg;

    bool BackgroundOperation::inProgForDb(const char *db) {
        assertInWriteLock();
        return dbsInProg[db] != 0;
    }

    /** @param ns full namespace
        @return true if a background operation is registered for exactly this namespace.
        Asserts that the caller is in the write lock.
    */
    bool BackgroundOperation::inProgForNs(const char *ns) {
        assertInWriteLock();
        return nsInProg.find(ns) != nsInProg.end();
    }

    void BackgroundOperation::assertNoBgOpInProgForDb(const char *db) {
        uassert(12586, "cannot perform operation: a background operation is currently running for this database",
                !inProgForDb(db));
    }

    void BackgroundOperation::assertNoBgOpInProgForNs(const char *ns) {
        uassert(12587, "cannot perform operation: a background operation is currently running for this collection",
                !inProgForNs(ns));
    }

    /** Register a background operation on namespace `ns`.
        Asserts that the caller is in the write lock and that no other background
        operation is already registered for the same namespace.
    */
    BackgroundOperation::BackgroundOperation(const char *ns) : _ns(ns) {
        assertInWriteLock();
        // Check for a duplicate BEFORE touching the per-database counter.  If this assertion
        // raises an exception out of the constructor, the destructor never runs, so a counter
        // incremented earlier would stay raised and keep the database marked as busy.
        // NOTE(review): assumes the project's assert throws rather than aborts -- confirm.
        assert( nsInProg.count(_ns.ns()) == 0 );
        dbsInProg[_ns.db]++;
        nsInProg.insert(_ns.ns());
    }

    /** Unregister the operation: lowers the per-database count and removes the namespace
        from the in-progress set.  Asserts that the caller is in the write lock.
    */
    BackgroundOperation::~BackgroundOperation() {
        assertInWriteLock();
        unsigned& runningForDb = dbsInProg[_ns.db];
        --runningForDb;
        nsInProg.erase(_ns.ns());
    }

    /** Append a report of registered background operations to `ss`:
        a heading plus one line per namespace (only when there is at least one), followed by
        one line for each database whose counter is non-zero.
    */
    void BackgroundOperation::dump(stringstream& ss) {
        if( !nsInProg.empty() ) {
            ss << "\n<b>Background Jobs in Progress</b>\n";
            set<string>::iterator job = nsInProg.begin();
            while( job != nsInProg.end() ) {
                ss << "  " << *job << '\n';
                ++job;
            }
        }
        for( map<string,unsigned>::iterator db = dbsInProg.begin(); db != dbsInProg.end(); ++db ) {
            if( db->second == 0 )
                continue; // nothing running for this database
            ss << "database " << db->first << ": " << db->second << '\n';
        }
    }

    /* ----------------------------------------- */

    // Root directory for data files; default `path` argument of _applyOpToDataFiles()
    // and the base directory used by _deleteDataFiles().
    string dbpath = "/data/db/";
    // When true, _deleteDataFiles() removes the directory dbpath/<database> as a whole.
    bool directoryperdb = false;
    // NOTE(review): repairpath and pidfilepath are not used in this part of the file;
    // presumably set from configuration elsewhere -- confirm.
    string repairpath;
    string pidfilepath;

    // Process-wide instances.
    DataFileMgr theDataFileMgr;
    DatabaseHolder dbHolder;
    // NOTE(review): not referenced in this part of the file; meaning of 0x1000 not shown here.
    int MAGIC = 0x1000;

    // Defined elsewhere; _unindexRecord() prints extra diagnostics when this is >= 5.
    extern int otherTraceLevel;
    // Defined elsewhere; _userCreateNS() calls it to record a new namespace
    // (options may be null, which is what the default argument supplies).
    void addNewNamespaceToCatalog(const char *ns, const BSONObj *options = 0);
    /** Create the _id index for a newly created namespace, unless the namespace is a
        ".system." namespace that legalClientSystemNS() rejects, or is a ".$freelist" namespace.
    */
    void ensureIdIndexForNewNs(const char *ns) {
        const bool eligible = strstr( ns, ".system." ) == 0 || legalClientSystemNS( ns , false );
        if ( !eligible )
            return;
        if ( strstr( ns, ".$freelist" ) != 0 )
            return;

        log( 1 ) << "adding _id index for collection " << ns << endl;
        ensureHaveIdIndex( ns );
    }

    /** @return "<database name> <namespace> " for the current client's context, or an empty
        string when there is no current client, no context, or no database on the context.
    */
    string getDbContext() {
        Client *client = currentClient.get();
        if ( !client )
            return string();

        Client::Context *ctx = client->getContext();
        if ( !ctx )
            return string();

        Database *db = ctx->db();
        if ( !db )
            return string();

        stringstream buf;
        buf << db->name << ' ' << ctx->ns() << ' ';
        return buf.str();
    }

    /*---------------------------------------------------------------------*/

    // Abstract base class for an operation that may be applied to all
    // files in a database using _applyOpToDataFiles().
    class FileOp {
    public:
        virtual ~FileOp() {}
        // Apply the operation to the file at path p.
        // Return true if file exists and operation successful
        virtual bool apply( const boost::filesystem::path &p ) = 0;
        // Short name of the operation (e.g. the deleter in _deleteDataFiles() returns "remove").
        virtual const char * op() const = 0;
    };

    // Defined elsewhere: applies `fo` to the data files of `database`.  `path` defaults to dbpath.
    // NOTE(review): exact meaning of afterAllocator is not visible in this part of the file.
    void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator = false, const string& path = dbpath );

    /** Remove the on-disk files of `database`.
        With directoryperdb the directory dbpath/<database> is removed recursively; otherwise a
        "remove" FileOp is applied to each data file via _applyOpToDataFiles().
    */
    void _deleteDataFiles(const char *database) {
        if ( !directoryperdb ) {
            // FileOp that deletes the file it is handed
            struct DataFileRemover : public FileOp {
                virtual bool apply( const boost::filesystem::path &p ) {
                    return boost::filesystem::remove( p );
                }
                virtual const char * op() const {
                    return "remove";
                }
            } remover;
            _applyOpToDataFiles( database, remover, true );
            return;
        }

        BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( boost::filesystem::path( dbpath ) / database ) );
    }

    /** Pick the size of the first extent for records of roughly `len` bytes.
        @param len representative record length
        @return len * 64 when len < 1000, otherwise len * 16; capped at 1,000,000,000 and
                rounded down to a multiple of 256.  Asserts that the result exceeds len.
    */
    int Extent::initialSize(int len) {
        // widen BEFORE multiplying: the product of two ints can overflow for large len
        const long long factor = ( len < 1000 ) ? 64 : 16;
        long long sz = (long long) len * factor;
        if ( sz > 1000000000 )
            sz = 1000000000;
        int z = ((int)sz) & 0xffffff00;
        assert( z > len );
        return z;
    }

    bool _userCreateNS(const char *ns, const BSONObj& options, string& err, bool *deferIdIndex) {
        if ( nsdetails(ns) ) {
            err = "collection already exists";
            return false;
        }

        log(1) << "create collection " << ns << ' ' << options << endl;

        /* todo: do this only when we have allocated space successfully? or we could insert with a { ok: 0 } field
           and then go back and set to ok : 1 after we are done.
        */
        bool isFreeList = strstr(ns, ".$freelist") != 0;
        if( !isFreeList )
            addNewNamespaceToCatalog(ns, options.isEmpty() ? 0 : &options);

        long long size = Extent::initialSize(128);
        {
            BSONElement e = options.getField("size");
            if ( e.isNumber() ) {
                size = e.numberLong();
                size += 256;
                size &= 0xffffffffffffff00LL;
            }
        }

        uassert( 10083 ,  "invalid size spec", size > 0 );

        bool newCapped = false;
        int mx = 0;
        if( options.getBoolField("capped") ) {
            newCapped = true;
            BSONElement e = options.getField("max");
            if ( e.isNumber() ) {
                mx = e.numberInt();
            }
        }

        // $nExtents just for debug/testing.
        BSONElement e = options.getField( "$nExtents" );
        Database *database = cc().database();
        if ( e.type() == Array ) {
            // We create one extent per array entry, with size specified
            // by the array value.
            BSONObjIterator i( e.embeddedObject() );
            while( i.more() ) {
                BSONElement e = i.next();
                int size = int( e.number() );
                assert( size <= 0x7fffffff );
                // $nExtents is just for testing - always allocate new extents
                // rather than reuse existing extents so we have some predictibility
                // in the extent size used by our tests
                database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
            }
        }
        else if ( int( e.number() ) > 0 ) {
            // We create '$nExtents' extents, each of size 'size'.
            int nExtents = int( e.number() );
            assert( size <= 0x7fffffff );
            for ( int i = 0; i < nExtents; ++i ) {
                assert( size <= 0x7fffffff );
                // $nExtents is just for testing - always allocate new extents
                // rather than reuse existing extents so we have some predictibility
                // in the extent size used by our tests
                database->suitableFile( (int) size, false )->createExtent( ns, (int) size, newCapped );
            }
        }
        else {
            // This is the non test case, where we don't have a $nExtents spec.
            while ( size > 0 ) {
                int max = MongoDataFile::maxSize() - DataFileHeader::HeaderSize;
                int desiredExtentSize = (int) (size > max ? max : size);
                Extent *e = database->allocExtent( ns, desiredExtentSize, newCapped );
                size -= e->length;
            }
        }

        NamespaceDetails *d = nsdetails(ns);
        assert(d);

        bool ensure = false;
        if ( options.getField( "autoIndexId" ).type() ) {
            if ( options["autoIndexId"].trueValue() ) {
                ensure = true;
            }
        }
        else {
            if ( !newCapped ) {
                ensure=true;
            }
        }
        if( ensure ) {
            if( deferIdIndex )
                *deferIdIndex = true;
            else
                ensureIdIndexForNewNs( ns );
        }

        if ( mx > 0 )
            getDur().writingInt( d->max ) = mx;

        return true;
    }

    /** Create a collection and, optionally, log the creation for replication.
        options: { ..., capped: true, size: ..., max: ... }
        @param ns full namespace; must contain a '.' followed by a non-empty collection name,
               otherwise assertion 10356 is raised
        @param logForReplication if true and creation succeeded, a "create" command is logged
               against <db>.$cmd
        @param deferIdIndex - if not null, defers id index creation.  sets the bool value to true if we wanted to create the id index.
        @return true if successful
    */
    bool userCreateNS(const char *ns, BSONObj options, string& err, bool logForReplication, bool *deferIdIndex) {
        // Validate the strchr() result itself: adding 1 to a null result would yield a
        // non-null garbage pointer that the check below could not detect.
        const char *dot = strchr( ns, '.' );
        massert( 10356 ,  str::stream() << "invalid ns: " << ns , dot && dot[1] );
        const char *coll = dot + 1;
        char cl[ 256 ];
        nsToDatabase( ns, cl );
        bool ok = _userCreateNS(ns, options, err, deferIdIndex);
        if ( logForReplication && ok ) {
            if ( options.getField( "create" ).eoo() ) {
                // make the logged object a complete create command: { create: <coll>, ...options }
                BSONObjBuilder b;
                b << "create" << coll;
                b.appendElements( options );
                options = b.obj();
            }
            string logNs = string( cl ) + ".$cmd";
            logOp("c", logNs.c_str(), options);
        }
        return ok;
    }

    /*---------------------------------------------------------------------*/

    /** @return the largest data file size in bytes: 512MB when pointers are 4 bytes wide;
        otherwise 0x7ff00000, or a quarter of that when cmdLine.smallfiles is set.
    */
    int MongoDataFile::maxSize() {
        if ( sizeof( int* ) == 4 )
            return 512 * 1024 * 1024;

        return cmdLine.smallfiles ? ( 0x7ff00000 >> 2 ) : 0x7ff00000;
    }

    void MongoDataFile::badOfs2(int ofs) const {
        stringstream ss;
        ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
        uasserted(13441, ss.str());
    }

    void MongoDataFile::badOfs(int ofs) const {
        stringstream ss;
        ss << "bad offset:" << ofs << " accessing file: " << mmf.filename() << " - consider repairing database";
        uasserted(13440, ss.str());
    }

    /** Default size in bytes for this data file, based on its file number.
        - file names containing "_hudsonSmall": 512KB times fileNo (times 1 unless 1 < fileNo < 1000)
        - otherwise 64MB << fileNo for fileNo <= 4 and 0x7ff00000 beyond that,
          divided by four when cmdLine.smallfiles is set
    */
    int MongoDataFile::defaultSize( const char *filename ) const {
        if ( strstr(filename, "_hudsonSmall") ) {
            const int mult = ( fileNo > 1 && fileNo < 1000 ) ? fileNo : 1;
            log() << "Warning : using small files for _hudsonSmall" << endl;
            return 1024 * 512 * mult;
        }

        int size = ( fileNo <= 4 ) ? ( (64*1024*1024) << fileNo ) : 0x7ff00000;
        if ( cmdLine.smallfiles )
            size >>= 2;
        return size;
    }

    /** Open (creating if needed) the data file `filename` and map it.
        @param minSize the file size is grown from defaultSize() by doubling until it is at least
               this large, but never beyond maxSize()
        @param preallocateOnly if true, only request allocation from the FileAllocator (and only
               when cmdLine.prealloc is set) and return without mapping the file
        Raises 12501 when the quota is exceeded and 10084/10085 when the file cannot be mapped.
    */
    void MongoDataFile::open( const char *filename, int minSize, bool preallocateOnly ) {
        {
            /* check quotas
               very simple temporary implementation for now
            */
            // only files that do not exist yet count against the quota
            if ( cmdLine.quota && fileNo > cmdLine.quotaFiles && !MMF::exists(filename) ) {
                /* todo: if we were adding / changing keys in an index did we do some
                   work previously that needs cleaning up?  Possible.  We should
                   check code like that and have it catch the exception and do
                   something reasonable.
                */
                string s = "db disk space quota exceeded ";
                Database *database = cc().database();
                if ( database )
                    s += database->name;
                uasserted(12501,s);
            }
        }

        // start at the default size and double until minSize fits, clamping at maxSize()
        long size = defaultSize( filename );
        while ( size < minSize ) {
            if ( size < maxSize() / 2 )
                size *= 2;
            else {
                size = maxSize();
                break;
            }
        }
        if ( size > maxSize() )
            size = maxSize();

        assert( size >= 64*1024*1024 || cmdLine.smallfiles );
        assert( size % 4096 == 0 );

        if ( preallocateOnly ) {
            if ( cmdLine.prealloc ) {
                FileAllocator::get()->requestAllocation( filename, size );
            }
            return;
        }

        {
            assert( _mb == 0 );
            // sz is passed to create() by non-const value and read back afterwards
            // NOTE(review): presumably create() may adjust it to the actual file length -- confirm.
            unsigned long long sz = size;
            if( mmf.create(filename, sz, false) )
                _mb = mmf.getView();
            assert( sz <= 0x7fffffff );
            size = (int) sz;
        }
        //header = (DataFileHeader *) _p;
        // _mb is still null here if the mapping failed; the message depends on pointer width
        if( sizeof(char *) == 4 )
            uassert( 10084 , "can't map file memory - mongo requires 64 bit build for larger datasets", _mb != 0);
        else
            uassert( 10085 , "can't map file memory", _mb != 0);
        header()->init(fileNo, size);
    }

    /** Flush the underlying mapped file, forwarding `sync` to it. */
    void MongoDataFile::flush( bool sync ) {
        mmf.flush( sync );
    }

    void addNewExtentToNamespace(const char *ns, Extent *e, DiskLoc eloc, DiskLoc emptyLoc, bool capped) {
        NamespaceIndex *ni = nsindex(ns);
        NamespaceDetails *details = ni->details(ns);
        if ( details ) {
            assert( !details->lastExtent.isNull() );
            assert( !details->firstExtent.isNull() );
            getDur().writingDiskLoc(e->xprev) = details->lastExtent;
            getDur().writingDiskLoc(details->lastExtent.ext()->xnext) = eloc;
            assert( !eloc.isNull() );
            getDur().writingDiskLoc(details->lastExtent) = eloc;
        }
        else {
            ni->add_ns(ns, eloc, capped);
            details = ni->details(ns);
        }

        {
            NamespaceDetails *dw = details->writingWithoutExtra();
            dw->lastExtentSize = e->length;
        }
        details->addDeletedRec(emptyLoc.drec(), emptyLoc);
    }

    /** Carve a new extent for namespace `ns` out of this file's unused space.
        @param approxSize desired extent size; the extent is shrunk to the file's remaining unused
               length when less than that is left
        @param newCapped forwarded to addNewExtentToNamespace()
        @param loops number of files already tried; incremented on each hop to a newly added file
        @return the new extent.  When this file has no unused space left, a new file is added to
                the current database and the extent is created there instead.
    */
    Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
        massert( 10357 ,  "shutdown in progress", ! inShutdown() );
        massert( 10358 ,  "bad new extent size", approxSize >= 0 && approxSize <= Extent::maxSize() );
        massert( 10359 ,  "header==0 on new extent: 32 bit mmap space exceeded?", header() ); // null if file open failed
        // take what was asked for, or whatever is left in this file if that is less
        int ExtentSize = approxSize <= header()->unusedLength ? approxSize : header()->unusedLength;
        DiskLoc loc;
        if ( ExtentSize <= 0 ) {
            /* note: there could be a lot of looping here if the db just started and
               no files are open yet.  we might want to do something about that. */
            if ( loops > 8 ) {
                assert( loops < 10000 );
                out() << "warning: loops=" << loops << " fileno:" << fileNo << ' ' << ns << '\n';
            }
            log() << "newExtent: " << ns << " file " << fileNo << " full, adding a new file\n";
            return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
        }
        int offset = header()->unused.getOfs();

        // advance the header's unused pointer past the new extent
        DataFileHeader *h = getDur().writing(header());
        h->unused.set( fileNo, offset + ExtentSize );
        h->unusedLength -= ExtentSize;
        loc.set(fileNo, offset);
        Extent *e = _getExtent(loc);
        DiskLoc emptyLoc = getDur().writing(e)->init(ns, ExtentSize, fileNo, offset);

        addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);

        DEV tlog(1) << "new extent " << ns << " size: 0x" << hex << ExtentSize << " loc: 0x" << hex << offset
                    << " emptyLoc:" << hex << emptyLoc.getOfs() << dec << endl;
        return e;
    }

    /** Try to satisfy an extent request for `ns` from the current database's "<db>.$freelist"
        namespace.
        Scans the free list for the extent whose length is closest to approxSize within an
        accepted [low, high] range, unlinks it from the free list, resets it for `ns` and attaches
        it to the namespace.
        @param capped capped collections use a tight size range; others accept 0.8x .. 1.4x
        @return the reused extent, or 0 if there is no free list or no extent of suitable size
    */
    Extent* DataFileMgr::allocFromFreeList(const char *ns, int approxSize, bool capped) {
        string s = cc().database()->name + ".$freelist";
        NamespaceDetails *f = nsdetails(s.c_str());
        if( f ) {
            int low, high;
            if( capped ) {
                // be strict about the size
                low = approxSize;
                if( low > 2048 ) low -= 256;
                high = (int) (approxSize * 1.05) + 256;
            }
            else {
                low = (int) (approxSize * 0.8);
                high = (int) (approxSize * 1.4);
            }
            // a negative upper bound means the computation above wrapped; fall back to the exact size
            if( high < 0 ) high = approxSize;
            int n = 0; // number of free extents examined
            Extent *best = 0;
            int bestDiff = 0x7fffffff;
            {
                DiskLoc L = f->firstExtent;
                while( !L.isNull() ) {
                    Extent * e = L.ext();
                    if( e->length >= low && e->length <= high ) {
                        int diff = abs(e->length - approxSize);
                        if( diff < bestDiff ) {
                            bestDiff = diff;
                            best = e;
                            if( diff == 0 )
                                break; // exact fit, stop scanning
                        }
                    }
                    L = e->xnext;
                    ++n;

                }
            }
            OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
            if( best ) {
                Extent *e = best;
                // remove from the free list
                if( !e->xprev.isNull() )
                    e->xprev.ext()->xnext.writing() = e->xnext;
                if( !e->xnext.isNull() )
                    e->xnext.ext()->xprev.writing() = e->xprev;
                if( f->firstExtent == e->myLoc )
                    f->firstExtent.writing() = e->xnext;
                if( f->lastExtent == e->myLoc )
                    f->lastExtent.writing() = e->xprev;

                // use it
                // NOTE(review): same warning as the one issued just before this branch -- looks redundant.
                OCCASIONALLY if( n > 512 ) log() << "warning: newExtent " << n << " scanned\n";
                DiskLoc emptyLoc = e->reuse(ns);
                addNewExtentToNamespace(ns, e, e->myLoc, emptyLoc, capped);
                return e;
            }
        }

        // nothing suitable: the caller has to allocate a fresh extent
        return 0;
        //        return createExtent(ns, approxSize, capped);
    }

    /*---------------------------------------------------------------------*/

    /** Reset this extent for use by namespace `nsname`: obtains a writable view of the extent
        through getDur().writing() and runs _reuse() on it.
        @return location of the single empty record that spans the extent's data area
    */
    DiskLoc Extent::reuse(const char *nsname) {
        return getDur().writing(this)->_reuse(nsname);
    }
    /** Reset an existing extent so it can serve namespace `nsname`.
        Verifies the magic value (assertion 10360), clears the extent's links and record
        pointers, and lays down one deleted record covering everything after the extent header.
        @return location of that deleted record
    */
    DiskLoc Extent::_reuse(const char *nsname) {
        log(3) << "reset extent was:" << nsDiagnostic.toString() << " now:" << nsname << '\n';
        massert( 10360 ,  "Extent::reset bad magic value", magic == 0x41424344 );

        xnext.Null();
        xprev.Null();
        nsDiagnostic = nsname;
        firstRecord.Null();
        lastRecord.Null();

        // bytes occupied by the extent header, i.e. the offset of the data area within the extent
        const int headerBytes = (int) (_extentData-(char*)this);

        DiskLoc emptyLoc = myLoc;
        emptyLoc.inc( headerBytes );

        const int delRecLength = length - headerBytes;

        DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, delRecLength) );
        empty->lengthWithHeaders = delRecLength;
        empty->extentOfs = myLoc.getOfs();
        empty->nextDeleted.Null();

        return emptyLoc;
    }

    /** Initialise a freshly allocated extent of `_length` bytes at (_fileNo, _offset) for
        namespace `nsname`, and lay down one deleted record covering everything after the header.
        assumes already zeroed -- insufficient for block 'reuse' perhaps
        @return location of that deleted record
    */
    DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset) {
        magic = 0x41424344;
        myLoc.set(_fileNo, _offset);
        xnext.Null();
        xprev.Null();
        nsDiagnostic = nsname;
        length = _length;
        firstRecord.Null();
        lastRecord.Null();

        // bytes occupied by the extent header, i.e. the offset of the data area within the extent
        const int headerBytes = (int) (_extentData-(char*)this);

        DiskLoc emptyLoc = myLoc;
        emptyLoc.inc( headerBytes );

        const int emptyLen = _length - headerBytes;
        DeletedRecord *empty = getDur().writing( DataFileMgr::makeDeletedRecord(emptyLoc, emptyLen) );
        empty->lengthWithHeaders = emptyLen;
        empty->extentOfs = myLoc.getOfs();
        return emptyLoc;
    }

    /*
      Record* Extent::newRecord(int len) {
      if( firstEmptyRegion.isNull() )
      return 0;

      assert(len > 0);
      int newRecSize = len + Record::HeaderSize;
      DiskLoc newRecordLoc = firstEmptyRegion;
      Record *r = getRecord(newRecordLoc);
      int left = r->netLength() - len;
      if( left < 0 ) {
      //
      firstEmptyRegion.Null();
      return 0;
      }

      DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
      r->lengthWithHeaders = newRecSize;
      r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
      if( !lastRecord.isNull() ) {
      assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
      getRecord(lastRecord)->next.set(newRecordLoc); // until now
      r->prev.set(lastRecord);
      }
      else {
      r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
      assert( firstRecord.isNull() );
      firstRecord = newRecordLoc;
      }
      lastRecord = newRecordLoc;

      if( left < Record::HeaderSize + 32 ) {
      firstEmptyRegion.Null();
      }
      else {
      firstEmptyRegion.inc(newRecSize);
      Record *empty = getRecord(firstEmptyRegion);
      empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null.
      empty->prev.Null();
      empty->lengthWithHeaders = left;
      }

      return r;
      }
    */

    /** @return the largest allowed extent size: 0x7ff00000 bytes, or a quarter of that when
        cmdLine.smallfiles is set.
    */
    int Extent::maxSize() {
        const int fullSize = 0x7ff00000;
        return cmdLine.smallfiles ? ( fullSize >> 2 ) : fullSize;
    }

    /*---------------------------------------------------------------------*/

    /** Get a forward table-scan cursor over namespace `ns`.
        @param startLoc if not null, scanning starts at this record
        @return an empty BasicCursor if the namespace does not exist; a ForwardCappedCursor for
                capped collections; otherwise a BasicCursor positioned at startLoc or, when
                startLoc is null, at the first record of the first non-empty extent
    */
    shared_ptr<Cursor> DataFileMgr::findAll(const char *ns, const DiskLoc &startLoc) {
        NamespaceDetails * d = nsdetails( ns );
        if ( ! d )
            return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));

        DiskLoc loc = d->firstExtent;
        Extent *e = getExtent(loc);

        // diagnostic dump of the extent chain and deleted lists
        DEBUGGING {
            out() << "listing extents for " << ns << endl;
            DiskLoc tmp = loc;
            set<DiskLoc> extents;

            while ( 1 ) {
                Extent *f = getExtent(tmp);
                out() << "extent: " << tmp.toString() << endl;
                extents.insert(tmp);
                tmp = f->xnext;
                if ( tmp.isNull() )
                    break;
                // NOTE(review): f is reassigned from tmp at the top of the next iteration,
                // so this value is never read.
                f = f->getNextExtent();
            }

            out() << endl;
            d->dumpDeleted(&extents);
        }

        if ( d->capped )
            return shared_ptr<Cursor>( new ForwardCappedCursor( d , startLoc ) );

        if ( !startLoc.isNull() )
            return shared_ptr<Cursor>(new BasicCursor( startLoc ));

        // skip leading extents that hold no records
        while ( e->firstRecord.isNull() && !e->xnext.isNull() ) {
            /* todo: if extent is empty, free it for reuse elsewhere.
               that is a bit complicated have to clean up the freelists.
            */
            // NOTE(review): loc is not advanced in this loop, so the message always names the first extent.
            RARELY out() << "info DFM::findAll(): extent " << loc.toString() << " was empty, skipping ahead " << ns << endl;
            // find a nonempty extent
            // it might be nice to free the whole extent here!  but have to clean up free recs then.
            e = e->getNextExtent();
        }
        return shared_ptr<Cursor>(new BasicCursor( e->firstRecord ));
    }

    /* Get a table scan cursor that runs either forward or backward.
       order.$natural - a value >= 0 (or an absent field) means forward (asc), < 0 backward (desc).
       startLoc       - if not null, the record to start from.
       Returns an empty cursor when scanning backward over a namespace that does not exist.
    */
    shared_ptr<Cursor> findTableScan(const char *ns, const BSONObj& order, const DiskLoc &startLoc) {
        // e.g., { $natural : -1 }
        if ( order.getField("$natural").number() >= 0 )
            return DataFileMgr::findAll(ns, startLoc);

        // "reverse natural order"
        NamespaceDetails *d = nsdetails(ns);
        if ( !d )
            return shared_ptr<Cursor>(new BasicCursor(DiskLoc()));

        if ( d->capped )
            return shared_ptr<Cursor>( new ReverseCappedCursor( d, startLoc ) );

        if ( !startLoc.isNull() )
            return shared_ptr<Cursor>(new ReverseCursor( startLoc ));

        // walk back from the last extent to the nearest one that holds a record
        Extent *ext = d->lastExtent.ext();
        while ( ext->lastRecord.isNull() && !ext->xprev.isNull() ) {
            OCCASIONALLY out() << "  findTableScan: extent empty, skipping ahead" << endl;
            ext = ext->getPrevExtent();
        }
        return shared_ptr<Cursor>(new ReverseCursor( ext->lastRecord ));
    }

    void printFreeList() {
        string s = cc().database()->name + ".$freelist";
        log() << "dump freelist " << s << '\n';
        NamespaceDetails *freeExtents = nsdetails(s.c_str());
        if( freeExtents == 0 ) {
            log() << "  freeExtents==0" << endl;
            return;
        }
        DiskLoc a = freeExtents->firstExtent;
        while( !a.isNull() ) {
            Extent *e = a.ext();
            log() << "  " << a.toString() << " len:" << e->length << " prev:" << e->xprev.toString() << '\n';
            a = e->xnext;
        }

        log() << "  end freelist" << endl;
    }

    /* drop a collection/namespace
       Removes the namespace's entry from <db>.system.namespaces, hands its extents over to the
       <db>.$freelist namespace (creating that namespace if needed) and removes the namespace
       from the namespace index.
       Raises 10086 if the namespace does not exist, 10087 when dropping system.profile while
       profiling is on, 12502 for any other system namespace, and the background-operation
       assertion if a background operation is registered for the namespace.
    */
    void dropNS(const string& nsToDrop) {
        NamespaceDetails* d = nsdetails(nsToDrop.c_str());
        uassert( 10086 ,  (string)"ns not found: " + nsToDrop , d );

        BackgroundOperation::assertNoBgOpInProgForNs(nsToDrop.c_str());

        NamespaceString s(nsToDrop);
        assert( s.db == cc().database()->name );
        if( s.isSystem() ) {
            if( s.coll == "system.profile" )
                uassert( 10087 ,  "turn off profiling before dropping system.profile collection", cc().database()->profile == 0 );
            else
                uasserted( 12502, "can't drop system ns" );
        }

        {
            // remove from the system catalog
            BSONObj cond = BSON( "name" << nsToDrop );   // { name: "colltodropname" }
            string system_namespaces = cc().database()->name + ".system.namespaces";
            /*int n = */ deleteObjects(system_namespaces.c_str(), cond, false, false, true);
            // no check of return code as this ns won't exist for some of the new storage engines
        }

        // free extents
        if( !d->firstExtent.isNull() ) {
            // note: this `s` shadows the NamespaceString above
            string s = cc().database()->name + ".$freelist";
            NamespaceDetails *freeExtents = nsdetails(s.c_str());
            if( freeExtents == 0 ) {
                string err;
                _userCreateNS(s.c_str(), BSONObj(), err, 0);
                freeExtents = nsdetails(s.c_str());
                massert( 10361 , "can't create .$freelist", freeExtents);
            }
            if( freeExtents->firstExtent.isNull() ) {
                // free list is empty: it simply takes over the dropped namespace's chain
                freeExtents->firstExtent.writing() = d->firstExtent;
                freeExtents->lastExtent.writing() = d->lastExtent;
            }
            else {
                // splice the dropped namespace's chain in FRONT of the existing free list
                DiskLoc a = freeExtents->firstExtent;
                assert( a.ext()->xprev.isNull() );
                getDur().writingDiskLoc( a.ext()->xprev ) = d->lastExtent;
                getDur().writingDiskLoc( d->lastExtent.ext()->xnext ) = a;
                getDur().writingDiskLoc( freeExtents->firstExtent ) = d->firstExtent;
                getDur().writingDiskLoc( d->firstExtent ).setInvalid();
                getDur().writingDiskLoc( d->lastExtent ).setInvalid();
            }
        }

        // remove from the catalog hashtable
        cc().database()->namespaceIndex.kill_ns(nsToDrop.c_str());
    }

    void dropCollection( const string &name, string &errmsg, BSONObjBuilder &result ) {
        log(1) << "dropCollection: " << name << endl;
        NamespaceDetails *d = nsdetails(name.c_str());
        if( d == 0 )
            return;

        BackgroundOperation::assertNoBgOpInProgForNs(name.c_str());

        if ( d->nIndexes != 0 ) {
            try {
                assert( dropIndexes(d, name.c_str(), "*", errmsg, result, true) );
            }
            catch( DBException& e ) {
                stringstream ss;
                ss << "drop: dropIndexes for collection failed - consider trying repair ";
                ss << " cause: " << e.what();
                uasserted(12503,ss.str());
            }
            assert( d->nIndexes == 0 );
        }
        log(1) << "\t dropIndexes done" << endl;
        result.append("ns", name.c_str());
        ClientCursor::invalidate(name.c_str());
        Top::global.collectionDropped( name );
        dropNS(name);
    }

    // Running count of keys for which _unindexRecord() has attempted an unindex.
    int nUnindexes = 0;

    /* Unindex all keys of this record from one index.
       id         - the index to remove the keys from
       obj        - the record's object, from which the keys are extracted
       dl         - the record's location
       logMissing - if true, print a message for every key that could not be unindexed
       An AssertionException thrown by the btree is reported and swallowed; processing then
       continues with the next key.
    */
    static void _unindexRecord(IndexDetails& id, BSONObj& obj, const DiskLoc& dl, bool logMissing = true) {
        BSONObjSetDefaultOrder keys;
        id.getKeysFromObject(obj, keys);

        for ( BSONObjSetDefaultOrder::iterator k = keys.begin(); k != keys.end(); ++k ) {
            BSONObj key = *k;
            if ( otherTraceLevel >= 5 ) {
                out() << "_unindexRecord() " << obj.toString();
                out() << "\n  unindex:" << key.toString() << endl;
            }
            nUnindexes++;

            bool removed = false;
            try {
                removed = id.head.btree()->unindex(id.head, id, key, dl);
            }
            catch (AssertionException& e) {
                problem() << "Assertion failure: _unindex failed " << id.indexNamespace() << endl;
                out() << "Assertion failure: _unindex failed: " << e.what() << '\n';
                out() << "  obj:" << obj.toString() << '\n';
                out() << "  key:" << key.toString() << '\n';
                out() << "  dl:" << dl.toString() << endl;
                sayDbContext();
            }

            if ( logMissing && !removed ) {
                out() << "unindex failed (key too big?) " << id.indexNamespace() << '\n';
            }
        }
    }

    /* Unindex all keys of this record from every index of the collection, including an index
       that is currently being built in the background.
       noWarn - if true, keys missing from the regular indexes are not reported
    */
    static void unindexRecord(NamespaceDetails *d, Record *todelete, const DiskLoc& dl, bool noWarn = false) {
        BSONObj obj(todelete);
        const int n = d->nIndexes;
        const bool logMissing = !noWarn;

        for ( int idxNo = 0; idxNo < n; ++idxNo )
            _unindexRecord(d->idx(idxNo), obj, dl, logMissing);

        if( !d->indexBuildInProgress )
            return;

        // background index (it sits at position n, just past the regular indexes):
        // always pass nowarn here, as this one may be missing for valid reasons as we are concurrently building it
        _unindexRecord(d->idx(n), obj, dl, false);
    }

    /* deletes a record, just the pdfile portion -- no index cleanup, no cursor cleanup, etc.
       caller must check if capped

       d        - details of the namespace the record belongs to
       ns       - namespace name; records of a ".system.indexes" namespace are zeroed instead of
                  being added to the deleted-record list
       todelete - the record to delete
       dl       - location of todelete
    */
    void DataFileMgr::_deleteRecord(NamespaceDetails *d, const char *ns, Record *todelete, const DiskLoc& dl) {
        /* remove ourself from the record next/prev chain */
        {
            // point the neighbours (if any) at each other
            if ( todelete->prevOfs != DiskLoc::NullOfs )
                getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs ) = todelete->nextOfs;
            if ( todelete->nextOfs != DiskLoc::NullOfs )
                getDur().writingInt( todelete->getNext(dl).rec()->prevOfs ) = todelete->prevOfs;
        }

        /* remove ourself from extent pointers */
        {
            Extent *e = getDur().writing( todelete->myExtent(dl) );
            // if this record was the extent's first / last record, move that pointer to its neighbour
            if ( e->firstRecord == dl ) {
                if ( todelete->nextOfs == DiskLoc::NullOfs )
                    e->firstRecord.Null();
                else
                    e->firstRecord.set(dl.a(), todelete->nextOfs);
            }
            if ( e->lastRecord == dl ) {
                if ( todelete->prevOfs == DiskLoc::NullOfs )
                    e->lastRecord.Null();
                else
                    e->lastRecord.set(dl.a(), todelete->prevOfs);
            }
        }

        /* add to the free list */
        {
            {
                // keep the namespace statistics in step
                NamespaceDetails::Stats *s = getDur().writing(&d->stats);
                s->datasize -= todelete->netLength();
                s->nrecords--;
            }

            if ( strstr(ns, ".system.indexes") ) {
                /* temp: if in system.indexes, don't reuse, and zero out: we want to be
                   careful until validated more, as IndexDetails has pointers
                   to this disk location.  so an incorrectly done remove would cause
                   a lot of problems.
                */
                memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders), 0, todelete->lengthWithHeaders);
            }
            else {
                DEV {
                    // zero the first 8 bytes of the data so that an invalid reuse is more noticeable
                    unsigned long long *p = (unsigned long long *) todelete->data;
                    *getDur().writing(p) = 0;
                    //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
                }
                d->addDeletedRec((DeletedRecord*)todelete, dl);
            }
        }
    }

    void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn) {
        dassert( todelete == dl.rec() );

        NamespaceDetails* d = nsdetails(ns);
        if ( d->capped && !cappedOK ) {
            out() << "failing remove on a capped ns " << ns << endl;
            uassert( 10089 ,  "can't remove from a capped collection" , 0 );
            return;
        }

        /* check if any cursors point to us.  if so, advance them. */
        ClientCursor::aboutToDelete(dl);

        unindexRecord(d, todelete, dl, noWarn);

        _deleteRecord(d, ns, todelete, dl);
        NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();
    }


    /** Replace the contents of an existing record with a new object.
     *
     *  If the new object fits in the record's current space it is written in place and the
     *  indexes are adjusted for any changed keys; otherwise the old record is deleted and the
     *  new object is inserted as a fresh record.
     *
     *  Note: if the object shrinks a lot, we don't free up space, we leave extra at end of the record.
     *
     *  @param ns        namespace of the collection
     *  @param d         namespace details for ns
     *  @param nsdt      transient namespace details; notified of the write on the in-place path
     *  @param toupdate  record being updated; must be the record at dl (dassert'ed)
     *  @param dl        disk location of toupdate
     *  @param _buf      raw bytes of the new object
     *  @param _len      size of the new object (only checked against _buf under DEV)
     *  @param debug     op debug info; short notes are appended to debug.str
     *  @param god       forwarded to insert() when the record has to move
     *  @return dl when updated in place, otherwise the location returned by insert()
     *
     *  Raises user assertion 13596 if the update would change _id, and 10003 if the object
     *  would have to grow inside a capped collection.  dupCheck() runs before any modification.
     */
    const DiskLoc DataFileMgr::updateRecord(
        const char *ns,
        NamespaceDetails *d,
        NamespaceDetailsTransient *nsdt,
        Record *toupdate, const DiskLoc& dl,
        const char *_buf, int _len, OpDebug& debug,  bool god) {
        StringBuilder& ss = debug.str;
        dassert( toupdate == dl.rec() );

        BSONObj objOld(toupdate);
        BSONObj objNew(_buf);
        DEV assert( objNew.objsize() == _len );
        DEV assert( objNew.objdata() == _buf );

        if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
            /* add back the old _id value if the update removes it.  Note this implementation is slow
               (copies entire object multiple times), but this shouldn't happen often, so going for simple
               code, not speed.
            */
            BSONObjBuilder b;
            BSONElement e;
            // note: the getObjectID() call that fills in e lives inside the assert expression
            assert( objOld.getObjectID(e) );
            b.append(e); // put _id first, for best performance
            b.appendElements(objNew);
            objNew = b.obj();
        }

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        vector<IndexChanges> changes;
        bool changedId = false;
        getIndexChanges(changes, *d, objNew, objOld, changedId);
        uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew , ! changedId );
        dupCheck(changes, *d, dl);

        if ( toupdate->netLength() < objNew.objsize() ) {
            // doesn't fit.  reallocate -----------------------------------------------------
            // NOTE(review): the "d &&" null test below comes after d has already been dereferenced above
            uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->capped));
            d->paddingTooSmall();
            if ( cc().database()->profile )
                ss << " moved ";
            // move = full delete (including index entries) followed by a regular insert
            deleteRecord(ns, toupdate, dl);
            return insert(ns, objNew.objdata(), objNew.objsize(), god);
        }

        nsdt->notifyOfWriteOp();
        d->paddingFits();

        /* have any index keys changed? */
        {
            unsigned keyUpdates = 0;
            // iterates nIndexesBeingBuilt() entries, i.e. not just d->nIndexes
            int z = d->nIndexesBeingBuilt();
            for ( int x = 0; x < z; x++ ) {
                IndexDetails& idx = d->idx(x);
                // drop keys the old object had and the new one does not
                for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) {
                    try {
                        idx.head.btree()->unindex(idx.head, idx, *changes[x].removed[i], dl);
                    }
                    catch (AssertionException&) {
                        // failure is recorded but does not abort the update
                        ss << " exception update unindex ";
                        problem() << " caught assertion update unindex " << idx.indexNamespace() << endl;
                    }
                }
                assert( !dl.isNull() );
                BSONObj idxKey = idx.info.obj().getObjectField("key");
                Ordering ordering = Ordering::make(idxKey);
                keyUpdates += changes[x].added.size();
                // add keys the new object has and the old one did not
                for ( unsigned i = 0; i < changes[x].added.size(); i++ ) {
                    try {
                        /* we did the dupCheck() above.  so we don't have to worry about it here. */
                        idx.head.btree()->bt_insert(
                            idx.head,
                            dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
                    }
                    catch (AssertionException& e) {
                        // failure is recorded but does not abort the update
                        ss << " exception update index ";
                        problem() << " caught assertion update index " << idx.indexNamespace() << " " << e << endl;
                    }
                }
            }
            if( keyUpdates && cc().database()->profile )
                ss << '\n' << keyUpdates << " key updates ";
        }

        //  update in place: only objsize() bytes are overwritten, any tail of the old record is left as is
        int sz = objNew.objsize();
        memcpy(getDur().writingPtr(toupdate->data, sz), objNew.objdata(), sz);
        return dl;
    }

    /** Choose the size of the next extent to allocate for a collection.
     *
     *  @param len            length of the record that triggered the allocation; must be below
     *                        Extent::maxSize() (asserted)
     *  @param lastExtentLen  size of the most recently allocated extent
     *  @return the larger of initialSize(len) and the grown previous size (x4 while the previous
     *          extent is under 4,000,000 bytes, x1.2 afterwards), capped at Extent::maxSize(),
     *          with the low 8 bits cleared.  Asserts that the result exceeds len.
     */
    int Extent::followupSize(int len, int lastExtentLen) {
        assert( len < Extent::maxSize() );
        int x = initialSize(len);

        // Grow in floating point and clamp *before* narrowing to int.  For a large
        // lastExtentLen, lastExtentLen * 1.2 exceeds INT_MAX, and converting such a double to
        // int is undefined behaviour, so we must not rely on spotting a wrapped value afterwards.
        double grown = lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2;

        int sz;
        if ( grown >= Extent::maxSize() ) {
            sz = Extent::maxSize();
        }
        else {
            int y = (int) grown;
            sz = y > x ? y : x;

            // sz < lastExtentLen should no longer happen, but is kept as a defensive check;
            // initialSize() alone may also exceed the maximum.
            if ( sz < lastExtentLen || sz > Extent::maxSize() ) {
                sz = Extent::maxSize();
            }
        }

        // clear the low 8 bits, i.e. round down to a multiple of 256
        sz = ((int)sz) & 0xffffff00;
        assert( sz > len );

        return sz;
    }

    /** Insert the keys of a new record into index number idxNo.
     *
     *  @param d            namespace details owning the index
     *  @param idxNo        position of the index in d
     *  @param obj          the document to extract keys from
     *  @param recordLoc    disk location of the document; must not be null (asserted per key)
     *  @param dupsAllowed  whether duplicate keys are acceptable in this index
     *
     *  An AssertionException from the btree insert is rethrown when dupsAllowed is false
     *  (presumably a duplicate key).  Otherwise it is logged and the remaining keys are still
     *  processed.  Code 10287 on the slot just past the completed indexes is ignored.
     */
    static inline void  _indexRecord(NamespaceDetails *d, int idxNo, BSONObj& obj, DiskLoc recordLoc, bool dupsAllowed) {
        IndexDetails& index = d->idx(idxNo);

        BSONObjSetDefaultOrder extracted;
        index.getKeysFromObject(obj, extracted);

        BSONObj pattern = index.keyPattern();
        Ordering keyOrdering = Ordering::make(pattern);

        int keysSeen = 0;
        BSONObjSetDefaultOrder::iterator key = extracted.begin();
        for ( ; key != extracted.end(); ++key ) {
            ++keysSeen;
            if ( keysSeen == 2 ) {
                // a second key for the same document makes the index multikey
                d->setIndexIsMultikey(idxNo);
            }

            assert( !recordLoc.isNull() );

            try {
                index.head.btree()->bt_insert(index.head, recordLoc, *key, keyOrdering, dupsAllowed, index);
            }
            catch (AssertionException& ex) {
                const bool alreadyThereDuringBgBuild = ex.getCode() == 10287 && idxNo == d->nIndexes;
                if ( alreadyThereDuringBgBuild ) {
                    DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
                }
                else if ( !dupsAllowed ) {
                    // dup key exception, presumably.
                    throw;
                }
                else {
                    problem() << " caught assertion _indexRecord " << index.indexNamespace() << endl;
                }
            }
        }
    }

    void testSorting() {
        BSONObjBuilder b;
        b.appendNull("");
        BSONObj x = b.obj();

        BSONObjExternalSorter sorter;

        sorter.add(x, DiskLoc(3,7));
        sorter.add(x, DiskLoc(4,7));
        sorter.add(x, DiskLoc(2,7));
        sorter.add(x, DiskLoc(1,7));
        sorter.add(x, DiskLoc(3,77));

        sorter.sort();

        auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
        while( i->more() ) {
            BSONObjExternalSorter::Data d = i->next();
            /*cout << d.second.toString() << endl;
            cout << d.first.objsize() << endl;
            cout<<"SORTER next:" << d.first.toString() << endl;*/
        }
    }

    /** Build an index in the foreground by external-sorting all keys and loading the btree
     *  bottom-up.
     *
     *  Phases (as reported through the current op's message):
     *    1/3  scan every record of ns, extract its keys and add them to an external sorter
     *    2/3  feed the sorted keys to a BtreeBuilder
     *    3/3  commit the builder
     *  Afterwards, records collected as duplicates are deleted.
     *
     *  @param ns     namespace being indexed
     *  @param d      namespace details for ns
     *  @param idx    the index being built; its head is reset to null before the build
     *  @param idxNo  position of idx in d, used to flag the index as multikey
     *  @return number of records scanned
     *
     *  throws DBException.  A key rejected by the builder is rethrown unless the index is
     *  unique, the exception is not an interrupt, and duplicates are being dropped
     *  (idx.dropDups() or a repair in progress); user assertion 10092 fires once 1,000,000
     *  duplicates have been collected.
     */
    unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
        CurOp * op = cc().curop();

        Timer t;

        tlog(1) << "fastBuildIndex " << ns << " idxNo:" << idxNo << ' ' << idx.info.obj().toString() << endl;

        bool dupsAllowed = !idx.unique();
        // duplicates are also dropped whenever a database repair is running
        bool dropDups = idx.dropDups() || inDBRepair;
        BSONObj order = idx.keyPattern();

        getDur().writingDiskLoc(idx.head).Null();

        if ( logLevel > 1 ) printMemInfo( "before index start" );

        /* get and sort all the keys ----- */
        unsigned long long n = 0;       // records scanned
        shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
        BSONObjExternalSorter sorter(order);
        sorter.hintNumObjects( d->stats.nrecords );
        unsigned long long nkeys = 0;   // keys handed to the sorter (can exceed n for multikey indexes)
        ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
        while ( c->ok() ) {
            BSONObj o = c->current();
            DiskLoc loc = c->currLoc();

            BSONObjSetDefaultOrder keys;
            idx.getKeysFromObject(o, keys);
            int k = 0;
            for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
                if( ++k == 2 ) {
                    // a second key for the same document makes the index multikey
                    d->setIndexIsMultikey(idxNo);
                }
                sorter.add(*i, loc);
                nkeys++;
            }

            c->advance();
            n++;
            pm.hit();
            if ( logLevel > 1 && n % 10000 == 0 ) {
                printMemInfo( "\t iterating objects" );
            }

        };
        pm.finished();

        if ( logLevel > 1 ) printMemInfo( "before final sort" );
        sorter.sort();
        if ( logLevel > 1 ) printMemInfo( "after final sort" );

        log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;

        // locations of records whose key was rejected as a duplicate; deleted after the build
        list<DiskLoc> dupsToDrop;

        /* build index --- */
        {
            BtreeBuilder btBuilder(dupsAllowed, idx);
            BSONObj keyLast;
            auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
            // note: the setMessage() call that starts phase 2 lives inside this assert expression
            assert( pm == op->setMessage( "index: (2/3) btree bottom up" , nkeys , 10 ) );
            while( i->more() ) {
                RARELY killCurrentOp.checkForInterrupt();
                BSONObjExternalSorter::Data d = i->next(); // shadows the NamespaceDetails parameter d

                try {
                    btBuilder.addKey(d.first, d.second);
                }
                catch( AssertionException& e ) {
                    if ( dupsAllowed ) {
                        // unknown exception??
                        throw;
                    }

                    if( e.interrupted() )
                        throw;

                    if ( ! dropDups )
                        throw;

                    /* we could queue these on disk, but normally there are very few dups, so instead we
                       keep in ram and have a limit.
                    */
                    dupsToDrop.push_back(d.second);
                    uassert( 10092 , "too may dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
                }
                pm.hit();
            }
            pm.finished();
            op->setMessage( "index: (3/3) btree-middle" );
            log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
            btBuilder.commit();
            // when nothing was dropped, the builder should hold every key we sorted
            wassert( btBuilder.getn() == nkeys || dropDups );
        }

        log(1) << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;

        // remove the duplicate records themselves (cappedOK=false, noWarn=true)
        for( list<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ )
            theDataFileMgr.deleteRecord( ns, i->rec(), *i, false, true );

        return n;
    }

    /** Background variant of an index build: walks the collection with a yielding cursor and
     *  inserts each record's keys into the new index one record at a time.
     *  The entry point is go(); everything else is private.
     */
    class BackgroundIndexBuildJob : public BackgroundOperation {

        /** Add the keys of every existing record of ns to index idxNo.
         *  Yields every 128 records.  Returns the number of records processed.
         *  Raises user assertion 12584 or 12585 if the cursor disappears mid-build; rethrows
         *  interrupts and, unless the index drops duplicates, any other AssertionException.
         */
        unsigned long long addExistingToIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
            bool dupsAllowed = !idx.unique();
            bool dropDups = idx.dropDups();

            ProgressMeter& progress = cc().curop()->setMessage( "bg index build" , d->stats.nrecords );

            unsigned long long n = 0;
            // note: this local shadows the cc() function for the rest of the method
            auto_ptr<ClientCursor> cc;
            {
                shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
                cc.reset( new ClientCursor(QueryOption_NoCursorTimeout, c, ns) );
            }
            // remembered so we can later ask whether the cursor is still registered
            CursorId id = cc->cursorid();

            while ( cc->ok() ) {
                BSONObj js = cc->current();
                try {
                    _indexRecord(d, idxNo, js, cc->currLoc(), dupsAllowed);
                    cc->advance();
                }
                catch( AssertionException& e ) {
                    if( e.interrupted() )
                        throw;

                    if ( dropDups ) {
                        // step past the offending record before deleting it
                        DiskLoc toDelete = cc->currLoc();
                        bool ok = cc->advance();
                        cc->updateLocation();
                        theDataFileMgr.deleteRecord( ns, toDelete.rec(), toDelete, false, true );
                        if( ClientCursor::find(id, false) == 0 ) {
                            // the cursor is no longer registered; give up ownership instead of
                            // letting auto_ptr delete it (presumably it has already been destroyed)
                            cc.release();
                            if( !ok ) {
                                /* we were already at the end. normal. */
                            }
                            else {
                                uasserted(12585, "cursor gone during bg index; dropDups");
                            }
                            break;
                        }
                    }
                    else {
                        log() << "background addExistingToIndex exception " << e.what() << endl;
                        throw;
                    }
                }
                // counted for dropped duplicates as well
                n++;
                progress.hit();

                if ( n % 128 == 0 && !cc->yield() ) {
                    // yield() reported failure: the cursor is gone, so do not delete it
                    cc.release();
                    uasserted(12584, "cursor gone during bg index");
                    break;
                }
            }
            progress.finished();
            return n;
        }

        /* we do set a flag in the namespace for quick checking, but this is our authoritative info -
           that way on a crash/restart, we don't think we are still building one. */
        // NOTE(review): entries are inserted in prep() but never erased in the code visible here
        set<NamespaceDetails*> bgJobsInProgress;

        /** Pre-flight checks; requires the write lock and a lock state of exactly 1 (uassert 13130). */
        void prep(const char *ns, NamespaceDetails *d) {
            assertInWriteLock();
            uassert( 13130 , "can't start bg index b/c in recursive lock (db.eval?)" , dbMutex.getState() == 1 );
            bgJobsInProgress.insert(d);
        }
        /** Called when the build ends, successfully or not. */
        void done(const char *ns, NamespaceDetails *d) {
            NamespaceDetailsTransient::get_w(ns).addedIndex(); // clear query optimizer cache
            assertInWriteLock();
        }

    public:
        BackgroundIndexBuildJob(const char *ns) : BackgroundOperation(ns) { }

        /** Run the build.
         *  @param ns     namespace being indexed
         *  @param d      namespace details for ns
         *  @param idx    index being built; receives a freshly added root bucket as its head
         *  @param idxNo  slot of the index; must equal d->nIndexes (asserted)
         *  @return number of records processed
         *  Any exception is rethrown after done() has been called, provided the database and
         *  namespace still resolve to d.
         */
        unsigned long long go(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
            unsigned long long n = 0;

            prep(ns.c_str(), d);
            assert( idxNo == d->nIndexes );
            try {
                idx.head = BtreeBucket::addBucket(idx);
                n = addExistingToIndex(ns.c_str(), d, idx, idxNo);
            }
            catch(...) {
                // only touch d if it still belongs to this namespace
                if( cc().database() && nsdetails(ns.c_str()) == d ) {
                    assert( idxNo == d->nIndexes );
                    done(ns.c_str(), d);
                }
                else {
                    log() << "ERROR: db gone during bg index?" << endl;
                }
                throw;
            }
            assert( idxNo == d->nIndexes );
            done(ns.c_str(), d);
            return n;
        }
    };

    /**
     * For the lifetime of this object, an index build is indicated on the specified
     * namespace and the newest index is marked as absent.  This simplifies
     * the cleanup required on recovery.
     *
     * The constructor sets indexBuildInProgress to 1 and decrements nIndexes; the destructor
     * reverses both.  The object is deliberately non-copyable: a copy would run the destructor
     * twice and increment nIndexes once too often.
     */
    class RecoverableIndexState {
    public:
        /** @param d namespace details whose newest index is about to be built */
        explicit RecoverableIndexState( NamespaceDetails *d ) : _d( d ) {
            indexBuildInProgress() = 1;
            nIndexes()--;
        }
        ~RecoverableIndexState() {
            // restore in the opposite order of the constructor
            DESTRUCTOR_GUARD (
                nIndexes()++;
                indexBuildInProgress() = 0;
            )
        }
    private:
        // not copyable: declared but intentionally left undefined
        RecoverableIndexState( const RecoverableIndexState& );
        RecoverableIndexState& operator=( const RecoverableIndexState& );

        // both accessors go through getDur().writingInt() rather than touching the fields directly
        int &nIndexes() { return getDur().writingInt( _d->nIndexes ); }
        int &indexBuildInProgress() { return getDur().writingInt( _d->indexBuildInProgress ); }
        NamespaceDetails *_d;
    };

    /** Build one index, choosing between the foreground and background strategies.
     *
     *  The background job is used only when `background` is set and no repair is running;
     *  otherwise fastBuildIndex() does the work.  Requires the write lock, and asserts that no
     *  background operation or index build is already in progress on the namespace.
     *
     *  throws DBException
     */
    static void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
        tlog() << "building new index on " << idx.keyPattern() << " for " << ns << ( background ? " background" : "" ) << endl;
        Timer timer;
        unsigned long long recordsIndexed = 0;

        if ( background ) {
            log(2) << "buildAnIndex: background=true\n";
        }

        // should have been checked earlier, better not be...
        assert( !BackgroundOperation::inProgForNs(ns.c_str()) );
        assert( d->indexBuildInProgress == 0 );
        assertInWriteLock();

        // From here until scope exit the namespace is flagged as having a build in progress.
        RecoverableIndexState recoverable( d );

        if ( !inDBRepair && background ) {
            BackgroundIndexBuildJob job(ns.c_str());
            recordsIndexed = job.go(ns, d, idx, idxNo);
        }
        else {
            recordsIndexed = fastBuildIndex(ns.c_str(), d, idx, idxNo);
            assert( !idx.head.isNull() );
        }

        tlog() << "done for " << recordsIndexed << " records " << timer.millis() / 1000.0 << "secs" << endl;
    }

    /** Add the keys of a new record to every index of the collection, including one that is
     *  currently being built.
     *
     *  If any index rejects the record with a DBException, the keys already added are removed
     *  again on a best-effort basis and the exception is rethrown.
     */
    static void indexRecord(NamespaceDetails *d, BSONObj obj, DiskLoc loc) {
        const int indexCount = d->nIndexesBeingBuilt();
        for ( int idxNo = 0; idxNo < indexCount; ++idxNo ) {
            try {
                const bool dupsAllowed = !d->idx(idxNo).unique();
                _indexRecord(d, idxNo, obj, loc, dupsAllowed);
            }
            catch( DBException& ) {
                /* Roll back what has been added so far.  The index that just failed is
                   included on purpose (<= rather than <): it may be multikey, in which case
                   some of its keys were inserted before the failure.
                */
                int undo = 0;
                while ( undo <= idxNo ) {
                    try {
                        _unindexRecord(d->idx(undo), obj, loc, false);
                    }
                    catch(...) {
                        log(3) << "unindex fails on rollback after unique failure\n";
                    }
                    ++undo;
                }
                throw;
            }
        }
    }

    extern BSONObj id_obj; // { _id : 1 }

    void ensureHaveIdIndex(const char *ns) {
        NamespaceDetails *d = nsdetails(ns);
        if ( d == 0 || (d->flags & NamespaceDetails::Flag_HaveIdIndex) )
            return;

        *getDur().writing(&d->flags) |= NamespaceDetails::Flag_HaveIdIndex;

        {
            NamespaceDetails::IndexIterator i = d->ii();
            while( i.more() ) {
                if( i.next().isIdIndex() )
                    return;
            }
        }

        string system_indexes = cc().database()->name + ".system.indexes";

        BSONObjBuilder b;
        b.append("name", "_id_");
        b.append("ns", ns);
        b.append("key", id_obj);
        BSONObj o = b.done();

        /* edge case: note the insert could fail if we have hit maxindexes already */
        theDataFileMgr.insert(system_indexes.c_str(), o.objdata(), o.objsize(), true);
    }

    // Pre-built "_id" element that insert() splices into documents arriving without one.
    // The struct is byte-packed so that type, field name and OID sit back to back; the
    // constructor asserts the resulting size is 17 bytes (1 + 4 + sizeof(OID)).
#pragma pack(1)
    struct IDToInsert_ {
        char type;      // element type tag, set to jstOID
        char _id[4];    // the field name "_id" including its terminating NUL
        OID oid;        // the value; re-initialised by insert() before each use
        IDToInsert_() {
            type = (char) jstOID;
            strcpy(_id, "_id");
            assert( sizeof(IDToInsert_) == 17 );
        }
    } idToInsert_;
    // BSONElement view over the raw bytes of the global idToInsert_ above
    struct IDToInsert : public BSONElement {
        IDToInsert() : BSONElement( ( char * )( &idToInsert_ ) ) {}
    } idToInsert;
#pragma pack()

    void DataFileMgr::insertAndLog( const char *ns, const BSONObj &o, bool god ) {
        BSONObj tmp = o;
        insertWithObjMod( ns, tmp, god );
        logOp( "i", ns, tmp );
    }

    /** Insert o into ns; on success o is rebound to the stored record so the caller sees any
     *  modification made during insertion.  On failure (null location) o is left untouched.
     *  @return location of the new record, or a null DiskLoc
     */
    DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
        const DiskLoc where = insert( ns, o.objdata(), o.objsize(), god );
        if ( where.isNull() )
            return where;

        o = BSONObj( where.rec() );
        return where;
    }

    /** Insert o into ns, discarding the resulting location. */
    void DataFileMgr::insertNoReturnVal(const char *ns,  BSONObj o, bool god) {
        const char *bytes = o.objdata();
        const int size = o.objsize();
        insert( ns, bytes, size, god );
    }

    bool prepareToBuildIndex(const BSONObj& io, bool god, string& sourceNS, NamespaceDetails *&sourceCollection, BSONObj& fixedIndexObject );

    // Verify that inserting obj would not violate any unique index of d; raises user
    // assertion 12582 on the first key that is already present.  Only completed indexes
    // (d->nIndexes) are consulted.
    //
    // We are now doing two btree scans for all unique indexes (one here, and one when we've
    // written the record to the collection).  This could be made more efficient by inserting
    // dummy data here, keeping pointers to the btree nodes holding the dummy data and then
    // updating the dummy data with the DiskLoc of the real record.
    void checkNoIndexConflicts( NamespaceDetails *d, const BSONObj &obj ) {
        for ( int idxNo = 0; idxNo < d->nIndexes; idxNo++ ) {
            IndexDetails& idx = d->idx(idxNo);
            if( !idx.unique() )
                continue;

            BSONObjSetDefaultOrder keys;
            idx.getKeysFromObject(obj, keys);
            for ( BSONObjSetDefaultOrder::iterator i=keys.begin(); i != keys.end(); i++ ) {
                // a null result from findSingle() means the key is not in the index yet
                uassert( 12582, "duplicate key insert for unique index of capped collection",
                         idx.head.btree()->findSingle(idx, idx.head, *i ).isNull() );
            }
        }
    }

    /* Insert a new record into namespace ns and add it to the collection's indexes.

       Outline:
         1. validate the namespace name (reserved "$" and "system" names)
         2. create the namespace and its first extent if it does not exist yet
         3. if the target is <db>.system.indexes and mayAddIndex is set, prepare an index build
         4. for non-god inserts, arrange for an _id to be prepended when the object has none
         5. allocate space (allocating further extents for non-capped collections) and copy
            the object in
         6. link the record into its extent and update the stats
         7. build the new index, if this insert describes one
         8. add the record to the existing indexes, rolling the record back on failure where
            that is possible

       ns           target namespace
       obuf         raw object bytes (may be NULL when god==true, see note below)
       len          size of obuf in bytes
       god          bypasses the system-namespace restriction, _id handling and write notification
       writeId      _id element to use when the object has none; if eoo, a fresh OID is generated
       mayAddIndex  whether an insert into system.indexes may trigger an index build

       Returns the location of the new record.  A null DiskLoc is returned when a non-god
       insert targets a disallowed system namespace, when prepareToBuildIndex() declines, or
       when no space could be allocated.

       note: if god==true, you may pass in obuf of NULL and then populate the returned DiskLoc
             after the call -- that will prevent a double buffer copy in some cases (btree.cpp).
    */
    DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) {
        bool wouldAddIndex = false;
        massert( 10093 , "cannot insert into reserved $ collection", god || isANormalNSName( ns ) );
        uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
        const char *sys = strstr(ns, "system.");
        if ( sys ) {
            // "system." at the very start means the database itself is named "system"
            uassert( 10095 , "attempt to insert in reserved database name 'system'", sys != ns);
            if ( strstr(ns, ".system.") ) {
                // later:check for dba-type permissions here if have that at some point separate
                if ( strstr(ns, ".system.indexes" ) )
                    wouldAddIndex = true;
                else if ( legalClientSystemNS( ns , true ) )
                    ;
                else if ( !god ) {
                    out() << "ERROR: attempt to insert in system namespace " << ns << endl;
                    return DiskLoc();
                }
            }
            else
                sys = 0;
        }

        bool addIndex = wouldAddIndex && mayAddIndex;

        NamespaceDetails *d = nsdetails(ns);
        if ( d == 0 ) {
            // first insert into this namespace: create it on the fly
            addNewNamespaceToCatalog(ns);
            /* todo: shouldn't be in the namespace catalog until after the allocations here work.
               also if this is an addIndex, those checks should happen before this!
            */
            // This may create first file in the database.
            cc().database()->allocExtent(ns, Extent::initialSize(len), false);
            d = nsdetails(ns);
            if ( !god )
                ensureIdIndexForNewNs(ns);
        }
        d->paddingFits();

        // set by prepareToBuildIndex() when this insert describes a new index
        NamespaceDetails *tableToIndex = 0;

        string tabletoidxns;
        BSONObj fixedIndexObject;
        if ( addIndex ) {
            assert( obuf );
            BSONObj io((const char *) obuf);
            if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) )
                return DiskLoc();

            if ( ! fixedIndexObject.isEmpty() ) {
                // store the adjusted index spec instead of what the caller passed in
                obuf = fixedIndexObject.objdata();
                len = fixedIndexObject.objsize();
            }

        }

        const BSONElement *newId = &writeId;
        // doubles as a flag: non-zero (the original length) when an _id has to be prepended
        int addID = 0;
        if( !god ) {
            /* Check if we have an _id field. If we don't, we'll add it.
               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
            */
            BSONObj io((const char *) obuf);
            BSONElement idField = io.getField( "_id" );
            uassert( 10099 ,  "_id cannot be an array", idField.type() != Array );
            if( idField.eoo() && !wouldAddIndex && strstr(ns, ".local.") == 0 ) {
                addID = len;
                if ( writeId.eoo() ) {
                    // Very likely we'll add this elt, so little harm in init'ing here.
                    idToInsert_.oid.init();
                    newId = &idToInsert;
                }
                len += newId->size();
            }

            BSONElementManipulator::lookForTimestamps( io );
        }

        DiskLoc extentLoc;
        int lenWHdr = len + Record::HeaderSize;
        // over-allocate by the collection's padding factor
        lenWHdr = (int) (lenWHdr * d->paddingFactor);
        if ( lenWHdr == 0 ) {
            // old datafiles, backward compatible here.
            assert( d->paddingFactor == 0 );
            *getDur().writing(&d->paddingFactor) = 1.0;
            lenWHdr = len + Record::HeaderSize;
        }

        // If the collection is capped, check if the new object will violate a unique index
        // constraint before allocating space.
        if ( d->nIndexes && d->capped && !god ) {
            checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
        }

        DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
        if ( loc.isNull() ) {
            // out of space
            if ( d->capped == 0 ) { // size capped doesn't grow
                log(1) << "allocating new extent for " << ns << " padding:" << d->paddingFactor << " lenWHdr: " << lenWHdr << endl;
                cc().database()->allocExtent(ns, Extent::followupSize(lenWHdr, d->lastExtentSize), false);
                loc = d->alloc(ns, lenWHdr, extentLoc);
                if ( loc.isNull() ) {
                    log() << "WARNING: alloc() failed after allocating new extent. lenWHdr: " << lenWHdr << " last extent size:" << d->lastExtentSize << "; trying again\n";
                    // keep adding extents (at most 10) while the last one is still too small
                    for ( int zzz=0; zzz<10 && lenWHdr > d->lastExtentSize; zzz++ ) {
                        log() << "try #" << zzz << endl;
                        // NOTE(review): passes len here, whereas the first attempt above passes lenWHdr -- confirm intentional
                        cc().database()->allocExtent(ns, Extent::followupSize(len, d->lastExtentSize), false);
                        loc = d->alloc(ns, lenWHdr, extentLoc);
                        if ( ! loc.isNull() )
                            break;
                    }
                }
            }
            if ( loc.isNull() ) {
                log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->capped << endl;
                // only a capped collection is expected to get here
                assert(d->capped);
                return DiskLoc();
            }
        }

        Record *r = loc.rec();
        {
            assert( r->lengthWithHeaders >= lenWHdr );
            r = (Record*) getDur().writingPtr(r, lenWHdr);
            if( addID ) {
                /* a little effort was made here to avoid a double copy when we add an ID */
                // layout written: [new total size][_id element][rest of the original object]
                ((int&)*r->data) = *((int*) obuf) + newId->size();
                memcpy(r->data+4, newId->rawdata(), newId->size());
                memcpy(r->data+4+newId->size(), ((char *)obuf)+4, addID-4);
            }
            else {
                // obuf may legitimately be NULL (see the note in the header comment)
                if( obuf )
                    memcpy(r->data, obuf, len);
            }
        }

        // link the record at the end of its extent's record chain
        {
            Extent *e = r->myExtent(loc);
            if ( e->lastRecord.isNull() ) {
                // extent was empty: this is both its first and last record
                Extent::FL *fl = getDur().writing(e->fl());
                fl->firstRecord = fl->lastRecord = loc;
                r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
            }
            else {
                Record *oldlast = e->lastRecord.rec();
                r->prevOfs = e->lastRecord.getOfs();
                r->nextOfs = DiskLoc::NullOfs;
                getDur().writingInt(oldlast->nextOfs) = loc.getOfs();
                getDur().writingDiskLoc(e->lastRecord) = loc;
            }
        }

        /* durability todo : this could be a bit annoying / slow to record constantly */
        {
            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
            s->datasize += r->netLength();
            s->nrecords++;
        }

        // we don't bother clearing those stats for the god tables - also god is true when adding a btree bucket
        if ( !god )
            NamespaceDetailsTransient::get_w( ns ).notifyOfWriteOp();

        if ( tableToIndex ) {
            // this insert was an index spec: build the index it describes
            uassert( 13143 , "can't create index on system.indexes" , tabletoidxns.find( ".system.indexes" ) == string::npos );

            BSONObj info = loc.obj();
            bool background = info["background"].trueValue();
            if( background && cc().isSyncThread() ) {
                /* don't do background indexing on slaves.  there are nuances.  this could be added later
                   but requires more code.
                   */
                log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
                background = false;
            }

            // slot the new index will occupy; captured before addIndex() bumps nIndexes
            int idxNo = tableToIndex->nIndexes;
            IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
            getDur().writingDiskLoc(idx.info) = loc;
            try {
                buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
            }
            catch( DBException& e ) {
                // save our error msg string as an exception or dropIndexes will overwrite our message
                LastError *le = lastError.get();
                int savecode = 0;
                string saveerrmsg;
                if ( le ) {
                    savecode = le->code;
                    saveerrmsg = le->msg;
                }
                else {
                    savecode = e.getCode();
                    saveerrmsg = e.what();
                }

                // roll back this index
                string name = idx.indexName();
                BSONObjBuilder b;
                string errmsg;
                bool ok = dropIndexes(tableToIndex, tabletoidxns.c_str(), name.c_str(), errmsg, b, true);
                if( !ok ) {
                    log() << "failed to drop index after a unique key error building it: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
                }

                // NOTE(review): this assert fires when le is null, although the code above handles that case
                assert( le && !saveerrmsg.empty() );
                raiseError(savecode,saveerrmsg.c_str());
                throw;
            }
        }

        /* add this record to our indexes */
        if ( d->nIndexes ) {
            try {
                BSONObj obj(r->data);
                indexRecord(d, obj, loc);
            }
            catch( AssertionException& e ) {
                // should be a dup key error on _id index
                if( tableToIndex || d->capped ) {
                    // no rollback on this path: the failure is reported and the record stays
                    massert( 12583, "unexpected index insertion failure on capped collection", !d->capped );
                    string s = e.toString();
                    s += " : on addIndex/capped - collection and its index will not match";
                    uassert_nothrow(s.c_str());
                    error() << s << endl;
                }
                else {
                    // normal case -- we can roll back
                    _deleteRecord(d, ns, r, loc);
                    throw;
                }
            }
        }

        //  out() << "   inserted at loc:" << hex << loc.getOfs() << " lenwhdr:" << hex << lenWHdr << dec << ' ' << ns << endl;
        return loc;
    }

    /* special version of insert for transaction logging -- streamlined a bit.
       assumes ns is capped and no indexes

       Allocates len bytes (plus the record header, no padding), links the new record at the
       end of its extent and updates the stats.  No payload is copied here: the record is
       returned for the caller to populate.

       d    namespace details for ns; must be non-null and match nsdetails(ns)
       ns   the namespace
       len  payload size in bytes
       Returns the new record, or 0 (after a failed assert) if allocation fails.
    */
    Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) {
        assert( d );
        RARELY assert( d == nsdetails(ns) );
        DEV assert( d == nsdetails(ns) );

        DiskLoc extentLoc;
        int lenWHdr = len + Record::HeaderSize;
        DiskLoc loc = d->alloc(ns, lenWHdr, extentLoc);
        if ( loc.isNull() ) {
            // allocation is not expected to fail here
            assert(false);
            return 0;
        }

        Record *r = loc.rec();
        assert( r->lengthWithHeaders >= lenWHdr );

        Extent *e = r->myExtent(loc);
        if ( e->lastRecord.isNull() ) {
            // extent was empty: this is both its first and last record
            Extent::FL *fl = getDur().writing( e->fl() );
            fl->firstRecord = fl->lastRecord = loc;

            Record::NP *np = getDur().writing(r->np());
            np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
        }
        else {
            // append after the extent's current last record
            Record *oldlast = e->lastRecord.rec();
            Record::NP *np = getDur().writing(r->np());
            np->prevOfs = e->lastRecord.getOfs();
            np->nextOfs = DiskLoc::NullOfs;
            getDur().writingInt( oldlast->nextOfs ) = loc.getOfs();
            e->lastRecord.writing() = loc;
        }

        /* todo: don't update for oplog?  seems wasteful. */
        {
            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
            s->datasize += r->netLength();
            s->nrecords++;
        }

        return r;
    }

} // namespace mongo

#include "clientcursor.h"

namespace mongo {

    void dropAllDatabasesExceptLocal() {
        writelock lk("");

        vector<string> n;
        getDatabaseNames(n);
        if( n.size() == 0 ) return;
        log() << "dropAllDatabasesExceptLocal " << n.size() << endl;
        for( vector<string>::iterator i = n.begin(); i != n.end(); i++ ) {
            if( *i != "local" ) {
                Client::Context ctx(*i);
                dropDatabase(*i);
            }
        }
    }

    /** Drop database `db`, which must be the current client's database
        (asserted below).  Fails via assertNoBgOpInProgForDb() if a background
        operation is running for it.  Syncs data and truncates the journal,
        closes the database, then deletes its data files.
    */
    void dropDatabase(string db) {
        log(1) << "dropDatabase " << db << endl;

        {
            Database *d = cc().database();
            assert( d );
            assert( d->name == db );

            BackgroundOperation::assertNoBgOpInProgForDb(d->name.c_str());
            getDur().syncDataAndTruncateJournal();

            // closeDatabase() deletes *d; the pointer leaves scope right after so it
            // cannot be used by mistake
            Database::closeDatabase( d->name.c_str(), d->path );
        }

        _deleteDataFiles( db.c_str() );
    }

    // Shorthand for boost::filesystem::path, used by the data-file helpers in this file.
    typedef boost::filesystem::path Path;

    void boostRenameWrapper( const Path &from, const Path &to ) {
        try {
            boost::filesystem::rename( from, to );
        }
        catch ( const boost::filesystem::filesystem_error & ) {
            // boost rename doesn't work across partitions
            boost::filesystem::copy_file( from, to);
            boost::filesystem::remove( from );
        }
    }

    // back up original database files to 'temp' dir
    void _renameForBackup( const char *database, const Path &reservedPath ) {
        Path newPath( reservedPath );
        if ( directoryperdb )
            newPath /= database;
        class Renamer : public FileOp {
        public:
            Renamer( const Path &newPath ) : newPath_( newPath ) {}
        private:
            const boost::filesystem::path &newPath_;
            virtual bool apply( const Path &p ) {
                if ( !boost::filesystem::exists( p ) )
                    return false;
                boostRenameWrapper( p, newPath_ / ( p.leaf() + ".bak" ) );
                return true;
            }
            virtual const char * op() const {
                return "renaming";
            }
        } renamer( newPath );
        _applyOpToDataFiles( database, renamer, true );
    }

    // move temp files to standard data dir
    void _replaceWithRecovered( const char *database, const char *reservedPathString ) {
        Path newPath( dbpath );
        if ( directoryperdb )
            newPath /= database;
        class Replacer : public FileOp {
        public:
            Replacer( const Path &newPath ) : newPath_( newPath ) {}
        private:
            const boost::filesystem::path &newPath_;
            virtual bool apply( const Path &p ) {
                if ( !boost::filesystem::exists( p ) )
                    return false;
                boostRenameWrapper( p, newPath_ / p.leaf() );
                return true;
            }
            virtual const char * op() const {
                return "renaming";
            }
        } replacer( newPath );
        _applyOpToDataFiles( database, replacer, true, reservedPathString );
    }

    // Generate a directory name for storing temp data files: the first path of the
    // form <repairpath>/<prefix>_repairDatabase_<N> (N = 0, 1, 2, ...) that does not
    // exist yet.  The directory itself is not created here.
    Path uniqueReservedPath( const char *prefix ) {
        const Path base( repairpath );
        Path candidate;
        for ( int suffix = 0; ; ++suffix ) {
            stringstream name;
            name << prefix << "_repairDatabase_" << suffix;
            candidate = base / name.str();

            bool taken = false;
            BOOST_CHECK_EXCEPTION( taken = boost::filesystem::exists( candidate ) );
            if ( !taken )
                break;
        }
        return candidate;
    }

    /** Total size in bytes of the data files of `database`, as visited by
        _applyOpToDataFiles().  Files that do not exist contribute nothing.
    */
    boost::intmax_t dbSize( const char *database ) {
        class ByteCounter : public FileOp {
        public:
            ByteCounter() : bytes_( 0 ) {}
            boost::intmax_t size() const {
                return bytes_;
            }
        private:
            boost::intmax_t bytes_;
            virtual const char *op() const {
                return "checking size";
            }
            virtual bool apply( const boost::filesystem::path &file ) {
                if ( boost::filesystem::exists( file ) ) {
                    bytes_ += boost::filesystem::file_size( file );
                    return true;
                }
                return false;
            }
        };

        ByteCounter counter;
        _applyOpToDataFiles( database, counter );
        return counter.size();
    }

#if !defined(_WIN32)
} // namespace mongo
#include <sys/statvfs.h>
namespace mongo {
#endif
    /** Free space, in bytes, available on the filesystem containing `path`.
        Non-Windows: f_bavail * f_frsize as reported by statvfs().
        Windows: always -1, meaning "unknown".
        A statvfs() failure trips the assert; should the assert macro be a no-op,
        -1 ("unknown") is returned instead of reading an unfilled struct.
    */
    boost::intmax_t freeSpace ( const string &path ) {
#if !defined(_WIN32)
        struct statvfs info;
        // Call statvfs() outside the assert so the call never depends on how the
        // assert macro is defined.
        const int rc = statvfs( path.c_str() , &info );
        assert( rc == 0 );
        if ( rc != 0 )
            return -1; // `info` was not filled in; report "unknown" like the Windows branch
        return boost::intmax_t( info.f_bavail ) * info.f_frsize;
#else
        return -1;
#endif
    }

    /** Repair a database by cloning it into a fresh set of files under repairpath
        and then swapping those files in for the originals.

        dbNameS                      - database name or namespace; reduced via nsToDatabase()
        errmsg                       - receives the failure description when false is returned
        preserveClonedFilesOnFailure - if true, the cloned files are left in place when the clone fails
        backupOriginalFiles          - if true, the original files are moved into the reserved
                                       directory (with a ".bak" suffix, see _renameForBackup)
                                       instead of being deleted

        Returns true on success; false if there is not enough free disk space or the
        clone fails.  The current client's database must already be dbName at dbpath
        (asserted), and no background operation may be running for it.
    */
    bool repairDatabase( string dbNameS , string &errmsg,
                         bool preserveClonedFilesOnFailure, bool backupOriginalFiles ) {
        // sets the global inDBRepair flag for the lifetime of this call
        doingRepair dr;
        dbNameS = nsToDatabase( dbNameS );
        const char * dbName = dbNameS.c_str();

        // address of this server; used below as the source of the clone
        stringstream ss;
        ss << "localhost:" << cmdLine.port;
        string localhost = ss.str();

        problem() << "repairDatabase " << dbName << endl;
        assert( cc().database()->name == dbName );
        assert( cc().database()->path == dbpath );

        BackgroundOperation::assertNoBgOpInProgForDb(dbName);

        getDur().syncDataAndTruncateJournal(); // Must be done before and after repair

        // Refuse to start if the repair location has less free space than the current
        // data files occupy.  freeSpace() returns -1 when the amount is unknown, in
        // which case the check is skipped.
        boost::intmax_t totalSize = dbSize( dbName );
        boost::intmax_t freeSize = freeSpace( repairpath );
        if ( freeSize > -1 && freeSize < totalSize ) {
            stringstream ss;
            ss << "Cannot repair database " << dbName << " having size: " << totalSize
               << " (bytes) because free disk space is: " << freeSize << " (bytes)";
            errmsg = ss.str();
            problem() << errmsg << endl;
            return false;
        }

        // Directory name prefix is "backup" when files may be kept afterwards,
        // "$tmp" when the directory is purely temporary.
        Path reservedPath =
            uniqueReservedPath( ( preserveClonedFilesOnFailure || backupOriginalFiles ) ?
                                "backup" : "$tmp" );
        BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( reservedPath ) );
        string reservedPathString = reservedPath.native_directory_string();

        bool res;
        {
            // clone to temp location, which effectively does repair
            Client::Context ctx( dbName, reservedPathString );
            assert( ctx.justCreated() );

            res = cloneFrom(localhost.c_str(), errmsg, dbName,
                            /*logForReplication=*/false, /*slaveok*/false, /*replauth*/false, /*snapshot*/false);
            // close the clone whether or not it succeeded
            Database::closeDatabase( dbName, reservedPathString.c_str() );
        }

        if ( !res ) {
            problem() << "clone failed for " << dbName << " with error: " << errmsg << endl;
            if ( !preserveClonedFilesOnFailure )
                BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );

            getDur().syncDataAndTruncateJournal(); // Must be done before and after repair

            return false;
        }

        MongoFile::flushAll(true);

        // switch back to the original database and close it before touching its files
        Client::Context ctx( dbName );
        Database::closeDatabase( dbName, dbpath );

        if ( backupOriginalFiles ) {
            _renameForBackup( dbName, reservedPath );
        }
        else {
            _deleteDataFiles( dbName );
            // NOTE(review): dbpath/<dbName> is created here regardless of directoryperdb -- confirm intent
            BOOST_CHECK_EXCEPTION( boost::filesystem::create_directory( Path( dbpath ) / dbName ) );
        }

        // move the cloned files into the standard data dir
        _replaceWithRecovered( dbName, reservedPathString.c_str() );

        // when backing up, the reserved directory now holds the .bak files and is kept
        if ( !backupOriginalFiles )
            BOOST_CHECK_EXCEPTION( boost::filesystem::remove_all( reservedPath ) );

        getDur().syncDataAndTruncateJournal(); // Must be done before and after repair

        return true;
    }

    void _applyOpToDataFiles( const char *database, FileOp &fo, bool afterAllocator, const string& path ) {
        if ( afterAllocator )
            FileAllocator::get()->waitUntilFinished();
        string c = database;
        c += '.';
        boost::filesystem::path p(path);
        if ( directoryperdb )
            p /= database;
        boost::filesystem::path q;
        q = p / (c+"ns");
        bool ok = false;
        BOOST_CHECK_EXCEPTION( ok = fo.apply( q ) );
        if ( ok )
            log(2) << fo.op() << " file " << q.string() << '\n';
        int i = 0;
        int extra = 10; // should not be necessary, this is defensive in case there are missing files
        while ( 1 ) {
            assert( i <= DiskLoc::MaxFiles );
            stringstream ss;
            ss << c << i;
            q = p / ss.str();
            BOOST_CHECK_EXCEPTION( ok = fo.apply(q) );
            if ( ok ) {
                if ( extra != 10 ) {
                    log(1) << fo.op() << " file " << q.string() << endl;
                    log() << "  _applyOpToDataFiles() warning: extra == " << extra << endl;
                }
            }
            else if ( --extra <= 0 )
                break;
            i++;
        }
    }

    /* Forwards to nsdetails(ns); as the name says, an entry point that is not inline. */
    NamespaceDetails* nsdetails_notinline(const char *ns) {
        NamespaceDetails *details = nsdetails(ns);
        return details;
    }

    bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
        log() << "DatabaseHolder::closeAll path:" << path << endl;
        dbMutex.assertWriteLocked();

        map<string,Database*>& m = _paths[path];
        _size -= m.size();

        set< string > dbs;
        for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
            wassert( i->second->path == path );
            dbs.insert( i->first );
        }

        currentClient.get()->getContext()->clear();

        BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
        int n = 0;
        int nNotClosed = 0;
        for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) {
            string name = *i;
            log(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl;
            Client::Context ctx( name , path );
            if( !force && BackgroundOperation::inProgForDb(name.c_str()) ) {
                log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl;
                nNotClosed++;
            }
            else {
                Database::closeDatabase( name.c_str() , path );
                bb.append( bb.numStr( n++ ) , name );
            }
        }
        bb.done();
        if( nNotClosed )
            result.append("nNotClosed", nNotClosed);
        else {
            ClientCursor::assertNoCursors();
        }

        return true;
    }

    /** Minimal structural check of a namespace string: it must contain a '.' and
        at least one character after the first '.'.
        Returns false for strings such as "db" or "db."; true for "db.coll".
    */
    bool isValidNS( const StringData& ns ) {
        // TODO: should check for invalid characters

        const char * dot = strchr( ns.data() , '.' );
        if ( ! dot )
            return false;

        // Test for the terminator explicitly.  A signed comparison (`> 0`) on a plain
        // char gives platform-dependent answers for bytes >= 0x80 (e.g. UTF-8 names),
        // because the signedness of char is implementation-defined.
        return dot[1] != '\0';
    }


} // namespace mongo