mongodb/db/dur.cpp

// @file dur.cpp durability in the storage engine (crash-safeness / journaling)

/**
*    Copyright (C) 2009 10gen Inc.
*
*    This program is free software: you can redistribute it and/or  modify
*    it under the terms of the GNU Affero General Public License, version 3,
*    as published by the Free Software Foundation.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU Affero General Public License for more details.
*
*    You should have received a copy of the GNU Affero General Public License
*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/* 
   phases

     PREPLOGBUFFER 
       we will build an output buffer ourselves and then use O_DIRECT
       we could be in read lock for this
       for very large objects write directly to redo log in situ?
     WRITETOJOURNAL
       we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity
         have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
         downgrading to (a perhaps upgradable) read lock would be a good start
     WRITETODATAFILES
       apply the writes back to the non-private MMF after they are for certain in redo log
     REMAPPRIVATEVIEW
       we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real 
         remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want 
         to be too frequent.  tracking time for this step would be wise.
       there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will 
         be required.  so doing these remaps more incrementally in the future might make sense - but have to be careful
         not to introduce bugs.
*/

#include "pch.h"

#if defined(_DURABLE)

#include "client.h"
#include "dur.h"
#include "dur_journal.h"
#include "../util/mongoutils/hash.h"
#include "../util/timer.h"
#include "../util/alignedbuilder.h"

namespace mongo { 

    /** placeholder: the body is intentionally empty for now.
        NOTE(review): the name suggests a hook run when the db write lock is released - no caller
        is visible in this file, confirm before relying on that.
    */
    void dbunlocking_write() {
        // pending ...
    }

    namespace dur { 

        //MongoMMF* pointerToMMF(void *p, size_t& ofs);

        /** a region of memory that a caller has declared it is about to write to.
            instances are queued by writingPtr() and copied into the journal buffer by PREPLOGBUFFER().
        */
        struct WriteIntent { 
            /** empty intent: null pointer, zero length.  both members are initialized so that a
                default constructed value never exposes an indeterminate len. */
            WriteIntent() : p(0), len(0) { }
            /** @param a start of the region  @param b length of the region in bytes */
            WriteIntent(void *a, unsigned b) : p(a), len(b) { }
            void *p; // where we will write
            unsigned len; // up to this len
        };

        /* try to remember things we have already marked for journalling.  false negatives are ok if infrequent - 
           we will just log them twice.
           */
        template<int Prime>
        class Already {
            enum { N = Prime }; // this should be small the idea is that it fits in the cpu cache easily
            WriteIntent nodes[N];
        public:
            Already() { clear(); }
            void clear() { memset(this, 0, sizeof(*this)); }

            /* see if we have Already recorded/indicated our write intent for this region of memory.
               @return true if already indicated.
            */
            bool checkAndSet(const WriteIntent& w) {
                unsigned x = mongoutils::hashPointer(w.p);
                WriteIntent& nd = nodes[x % N];
                if( nd.p != w.p || nd.len < w.len ) {
                    nd = w;
                    return false;
                }
                return true;
            }
        };

        /* 127 slot table consulted by writingPtr() to skip intents that were already queued;
           cleared by PREPLOGBUFFER() once the queued intents have been copied out. */
        static Already<127> alreadyNoted;

        /* our record of pending/uncommitted write intents.
           appended to by writingPtr(); read and then cleared by PREPLOGBUFFER(). */
        static vector<WriteIntent> writes;

        /** declare the intent to write to a region of memory so that it gets journaled by a later
            PREPLOGBUFFER() pass.
            @param x   start of the region about to be written
            @param len number of bytes that may be written; must fit in an unsigned
            @return the pointer to write through: x itself, or the result of
                    MongoMMF::switchToPrivateView(x) when the DEV-guarded statement runs
        */
        void* writingPtr(void *x, size_t len) { 
            //log() << "TEMP writing " << x << ' ' << len << endl;
            void *p = x;
            DEV p = MongoMMF::switchToPrivateView(x);

            // WriteIntent stores its length as unsigned.  refuse a length that does not fit rather
            // than silently queueing (and later journaling) a truncated region.
            const unsigned n = static_cast<unsigned>(len);
            assert( n == len );

            WriteIntent w(p, n);
            if( !alreadyNoted.checkAndSet(w) ) {
                // remember intent. we will journal it in a bit
                writes.push_back(w);
                // warn first, then fail hard, if the pending list grows without being drained
                wassert( writes.size() <  2000000 );
                assert(  writes.size() < 20000000 );
            }
            return p;
        }

        /** PREPLOGBUFFER phase: build the next journal section in bb from the pending write intents.
            caller handles locking.
            @param bb buffer to fill; it is reset first, unless there is nothing to do
            @return false, without touching bb, when no intents are pending.  true once bb holds a
                    complete section whose length is a multiple of 8192; the pending list and the
                    alreadyNoted table have then been cleared.
        */
        static bool PREPLOGBUFFER(AlignedBuilder& bb) { 
            if( writes.empty() )
                return false;

            bb.reset();

            // JSectHeader: marker bytes followed by 4 bytes reserved for the section length, which
            // is only known after everything else has been appended.
            // NOTE(review): sectionLen points into bb and is dereferenced after many appends -
            // confirm AlignedBuilder never relocates its storage when it grows.
            bb.appendStr("\nHH\n", false);
            unsigned *sectionLen = (unsigned *) bb.skip(4);

            string lastFilePath; // path named by the most recently emitted JDbContext

            {
                scoped_lock lk(privateViews._mutex());
                for( vector<WriteIntent>::iterator it = writes.begin(); it != writes.end(); ++it ) {
                    WriteIntent& w = *it;

                    // NOTE(review): ofs is filled in by _find but is not stored in the entry below.
                    size_t ofs;
                    MongoMMF *mmf = privateViews._find(w.p, ofs);
                    if( mmf == 0 ) {
                        journalingFailure("view pointer cannot be resolved");
                        continue;
                    }

                    // emit a context record (followed by the path) only when the file changes
                    if( lastFilePath != mmf->filePath() ) { 
                        lastFilePath = mmf->filePath();
                        JDbContext ctx;
                        bb.appendStruct(ctx);
                        bb.appendStr(lastFilePath);
                    }

                    // entry header, then the bytes currently at the intent's address
                    JEntry entry;
                    entry.len = w.len;
                    entry.fileNo = mmf->fileSuffixNo();
                    bb.appendStruct(entry);
                    bb.appendBuf(w.p, w.len);
                }
            }

            JSectFooter footer;
            footer.hash = 0;
            bb.appendStruct(footer);

            // fill to alignment: round the length up to the next 8192 boundary, store it in the
            // header slot reserved above, then skip over the padding bytes.
            const unsigned used = bb.len();
            const unsigned padded = (used + 8191) & 0xffffe000;
            dassert( padded >= used );
            *sectionLen = padded;
            bb.skip(padded - used);
            dassert( bb.len() % 8192 == 0 );

            writes.clear();
            alreadyNoted.clear();
            return true;
        }

        /** WRITETOJOURNAL phase: pass the prepared buffer to journal().
            @param bb buffer previously filled by PREPLOGBUFFER()
        */
        static void WRITETOJOURNAL(const AlignedBuilder& bb) { 
            journal(bb);
        }

        /** run one group commit pass: PREPLOGBUFFER, then WRITETOJOURNAL if there is anything to write.
            caller handles locking.
            @param bb reusable buffer the journal section is built in
        */
        static void _go(AlignedBuilder& bb) {
            // PREPLOGBUFFER returns false, without resetting bb, when no write intents are pending.
            // bb is reused from one pass to the next, so journaling it anyway would write the
            // previous section a second time.
            if( !PREPLOGBUFFER(bb) )
                return;

            // todo: add double buffering so we can be (not even read locked) during WRITETOJOURNAL
            WRITETOJOURNAL(bb);
        }

        /** take the db lock and run one group commit pass.
            a read lock is tried first (readlocktry with an argument of 1000 - presumably a timeout
            in ms, confirm against readlocktry); if that is not obtained we fall back to a write lock.
            @param bb reusable buffer handed on to _go()
        */
        static void go(AlignedBuilder& bb) {
            bool committed = false;
            {
                readlocktry rl("", 1000);
                if( rl.got() ) {
                    _go(bb);
                    committed = true;
                }
            }
            if( committed )
                return;

            // starvation on read locks could occur.  so if read lock acquisition is slow, try to get a 
            // write lock instead.  otherwise writes could use too much RAM.
            writelock wl;
            _go(bb);
        }

        static void durThread() { 
            Client::initThread("dur");
            const int HowOftenToGroupCommitMs = 100;
            AlignedBuilder bb(1024 * 1024 * 16); // reuse to avoid any heap fragmentation
            while( 1 ) { 
                try {
                    int millis = HowOftenToGroupCommitMs;
                    {
                        Timer t;
                        journalRotate(); // note we do this part outside of mongomutex
                        millis -= t.millis();
                        if( millis < 5 || millis > HowOftenToGroupCommitMs )
                            millis = 5;
                    }
                    sleepmillis(millis);
                    go(bb);
                }
                catch(std::exception& e) { 
                    log() << "exception in durThread " << e.what() << endl;
                }
            }
        }

        // thread body implemented outside this file
        void unlinkThread();

        /** make the journal directory, then launch the group commit thread (durThread) and the
            unlink thread.  the boost::thread handles go out of scope when this returns; neither
            thread is joined here.
        */
        void startup() {
            journalMakeDir();
            boost::thread committer(durThread);
            boost::thread unlinker(unlinkThread);
        }

    } // namespace dur

} // namespace mongo

#endif
comments and LF fixes 2010-10-05 02:09:41 +02:00			`// @file dur.cpp durability in the storage engine (crash-safeness / journaling)`

proper comments and put in missing pragma once 2010-11-04 03:17:15 +01:00			`/**`
			`* Copyright (C) 2009 10gen Inc.`
			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU Affero General Public License, version 3,`
			`* as published by the Free Software Foundation.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU Affero General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Affero General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

comments and LF fixes 2010-10-05 02:09:41 +02:00			`/*`
			`phases`

			`PREPLOGBUFFER`
			`we will build an output buffer ourself and then use O_DIRECT`
			`we could be in read lock for this`
dur 2010-11-13 19:04:48 +01:00			`for very large objects write directly to redo log in situ?`
dur work 2010-11-15 04:28:04 +01:00			`WRITETOJOURNAL`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity`
			`have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).`
			`downgrading to (a perhaps upgradable) read lock would be a good start`
dur work 2010-11-15 04:28:04 +01:00			`WRITETODATAFILES`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`apply the writes back to the non-private MMF after they are for certain in redo log`
			`REMAPPRIVATEVIEW`
			`we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real`
			`remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want`
			`to be too frequent. tracking time for this step would be wise.`
			`there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will`
			`be required. so doing these remaps more incrementally in the future might make sense - but have to be careful`
			`not to introduce bugs.`
			`*/`

			`#include "pch.h"`

dur 2010-11-14 00:32:41 +01:00			`#if defined(_DURABLE)`
comments and LF fixes 2010-10-05 02:09:41 +02:00
dur work 2010-11-15 04:28:04 +01:00			`#include "client.h"`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`#include "dur.h"`
dur 2010-11-13 23:42:41 +01:00			`#include "dur_journal.h"`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`#include "../util/mongoutils/hash.h"`
dur work 2010-11-15 04:28:04 +01:00			`#include "../util/timer.h"`
AlignedBuilder dur 2010-11-16 04:13:48 +01:00			`#include "../util/alignedbuilder.h"`
comments and LF fixes 2010-10-05 02:09:41 +02:00
			`namespace mongo {`

dur 2010-10-04 23:05:31 +02:00			`void dbunlocking_write() {`
			`// pending ...`
			`}`
comments and LF fixes 2010-10-05 02:09:41 +02:00
			`namespace dur {`

dur work 2010-11-15 04:28:04 +01:00			`//MongoMMF* pointerToMMF(void *p, size_t& ofs);`
dur 2010-11-14 00:32:41 +01:00
comments and LF fixes 2010-10-05 02:09:41 +02:00			`struct WriteIntent {`
			`WriteIntent() : p(0) { }`
			`WriteIntent(void *a, unsigned b) : p(a), len(b) { }`
dur 2010-11-13 19:04:48 +01:00			`void *p; // where we will write`
			`unsigned len; // up to this len`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`};`

			`/* try to remember things we have already marked for journalling. false negatives are ok if infrequent -`
			`we will just log them twice.`
			`*/`
			`template<int Prime>`
			`class Already {`
			`enum { N = Prime }; // this should be small the idea is that it fits in the cpu cache easily`
			`WriteIntent nodes[N];`
			`public:`
dur work 2010-11-15 04:28:04 +01:00			`Already() { clear(); }`
			`void clear() { memset(this, 0, sizeof(*this)); }`
dur 2010-11-13 19:04:48 +01:00
			`/* see if we have Already recorded/indicated our write intent for this region of memory.`
			`@return true if already indicated.`
			`*/`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`bool checkAndSet(const WriteIntent& w) {`
			`unsigned x = mongoutils::hashPointer(w.p);`
dur 2010-11-13 19:04:48 +01:00			`WriteIntent& nd = nodes[x % N];`
			`if( nd.p != w.p \|\| nd.len < w.len ) {`
			`nd = w;`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`return false;`
			`}`
dur 2010-11-13 19:04:48 +01:00			`return true;`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`}`
			`};`

			`static Already<127> alreadyNoted;`
dur 2010-11-13 19:04:48 +01:00
			`/* our record of pending/uncommitted write intents */`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`static vector<WriteIntent> writes;`

			`void* writingPtr(void *x, size_t len) {`
			`//log() << "TEMP writing " << x << ' ' << len << endl;`
dur work 2010-11-15 04:28:04 +01:00			`void *p = x;`
			`DEV p = MongoMMF::switchToPrivateView(x);`
			`WriteIntent w(p, len);`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`if( !alreadyNoted.checkAndSet(w) ) {`
dur work 2010-11-15 04:28:04 +01:00			`// remember intent. we will journal it in a bit`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`writes.push_back(w);`
			`wassert( writes.size() < 2000000 );`
			`assert( writes.size() < 20000000 );`
			`}`
dur work 2010-11-15 04:28:04 +01:00			`return p;`
comments and LF fixes 2010-10-05 02:09:41 +02:00			`}`

dur work 2010-11-15 04:28:04 +01:00			`/** caller handles locking */`
AlignedBuilder dur 2010-11-16 04:13:48 +01:00			`static bool PREPLOGBUFFER(AlignedBuilder& bb) {`
dur work 2010-11-15 04:28:04 +01:00			`if( writes.empty() )`
			`return false;`
dur 2010-11-14 00:32:41 +01:00
dur 2010-11-13 23:42:41 +01:00			`bb.reset();`

dur work 2010-11-15 04:28:04 +01:00			`unsigned *lenInBlockHeader;`
			`{`
			`// JSectHeader`
			`bb.appendStr("\nHH\n", false);`
			`lenInBlockHeader = (unsigned *) bb.skip(4);`
			`}`
dur 2010-11-13 23:42:41 +01:00
dur work 2010-11-15 04:28:04 +01:00			`string lastFilePath;`

			`{`
			`scoped_lock lk(privateViews._mutex());`
			`for( vector<WriteIntent>::iterator i = writes.begin(); i != writes.end(); i++ ) {`
			`size_t ofs;`
			`MongoMMF *mmf = privateViews._find(i->p, ofs);`
			`if( mmf == 0 ) {`
			`journalingFailure("view pointer cannot be resolved");`
			`}`
			`else {`
			`if( mmf->filePath() != lastFilePath ) {`
			`lastFilePath = mmf->filePath();`
			`JDbContext c;`
			`bb.appendStruct(c);`
			`bb.appendStr(lastFilePath);`
			`}`
			`JEntry e;`
			`e.len = i->len;`
			`e.fileNo = mmf->fileSuffixNo();`
			`bb.appendStruct(e);`
			`bb.appendBuf(i->p, i->len);`
			`}`
dur 2010-11-14 00:32:41 +01:00			`}`
dur 2010-11-13 23:42:41 +01:00			`}`

dur work 2010-11-15 04:28:04 +01:00			`{`
			`JSectFooter f;`
			`f.hash = 0;`
			`bb.appendStruct(f);`
			`}`

			`{`
			`unsigned L = (bb.len() + 8191) & 0xffffe000; // fill to alignment`
			`dassert( L >= (unsigned) bb.len() );`
			`*lenInBlockHeader = L;`
			`unsigned padding = L - bb.len();`
			`bb.skip(padding);`
			`dassert( bb.len() % 8192 == 0 );`
			`}`

			`writes.clear();`
			`alreadyNoted.clear();`
			`return true;`
			`}`

AlignedBuilder dur 2010-11-16 04:13:48 +01:00			`static void WRITETOJOURNAL(const AlignedBuilder& bb) {`
dur work 2010-11-15 04:28:04 +01:00			`journal(bb);`
			`}`

AlignedBuilder dur 2010-11-16 04:13:48 +01:00			`static void _go(AlignedBuilder& bb) {`
dur work 2010-11-15 04:28:04 +01:00			`PREPLOGBUFFER(bb);`

			`// todo: add double buffering so we can be (not even read locked) during WRITETOJOURNAL`
			`WRITETOJOURNAL(bb);`
dur 2010-11-13 23:42:41 +01:00			`}`

AlignedBuilder dur 2010-11-16 04:13:48 +01:00			`static void go(AlignedBuilder& bb) {`
dur 2010-11-13 23:42:41 +01:00			`{`
			`readlocktry lk("", 1000);`
			`if( lk.got() ) {`
dur work 2010-11-15 04:28:04 +01:00			`_go(bb);`
dur 2010-11-13 23:42:41 +01:00			`return;`
			`}`
			`}`
			`// starvation on read locks could occur. so if read lock acquisition is slow, try to get a`
			`// write lock instead. otherwise writes could use too much RAM.`
			`writelock lk;`
dur work 2010-11-15 04:28:04 +01:00			`_go(bb);`
dur 2010-11-13 23:42:41 +01:00			`}`

dur work 2010-11-15 04:28:04 +01:00			`static void durThread() {`
			`Client::initThread("dur");`
			`const int HowOftenToGroupCommitMs = 100;`
AlignedBuilder dur 2010-11-16 04:13:48 +01:00			`AlignedBuilder bb(1024 * 1024 * 16); // reuse to avoid any heap fragmentation`
dur 2010-11-13 23:42:41 +01:00			`while( 1 ) {`
			`try {`
dur work 2010-11-15 04:28:04 +01:00			`int millis = HowOftenToGroupCommitMs;`
			`{`
			`Timer t;`
			`journalRotate(); // note we do this part outside of mongomutex`
			`millis -= t.millis();`
			`if( millis < 5 \|\| millis > HowOftenToGroupCommitMs )`
			`millis = 5;`
			`}`
			`sleepmillis(millis);`
			`go(bb);`
dur 2010-11-13 23:42:41 +01:00			`}`
dur work 2010-11-15 04:28:04 +01:00			`catch(std::exception& e) {`
			`log() << "exception in durThread " << e.what() << endl;`
dur 2010-11-13 23:42:41 +01:00			`}`
			`}`
dur 2010-11-13 19:04:48 +01:00			`}`

dur 2010-11-15 22:03:56 +01:00			`void unlinkThread();`

dur 2010-11-13 23:42:41 +01:00			`void startup() {`
dur work 2010-11-15 04:28:04 +01:00			`journalMakeDir();`
dur 2010-11-13 23:42:41 +01:00			`boost::thread t(durThread);`
dur 2010-11-15 22:03:56 +01:00			`boost::thread t2(unlinkThread);`
dur 2010-11-13 23:42:41 +01:00			`}`
dur 2010-11-13 19:04:48 +01:00
comments and LF fixes 2010-10-05 02:09:41 +02:00			`} // namespace dur`

			`} // namespace mongo`

			`#endif`