0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 09:32:32 +01:00
mongodb/db/dur.cpp

234 lines
8.0 KiB
C++
Raw Normal View History

2010-10-05 02:09:41 +02:00
// @file dur.cpp durability in the storage engine (crash-safeness / journaling)
/**
* Copyright (C) 2009 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
2010-10-05 02:09:41 +02:00
/*
   phases

     PREPLOGBUFFER
       we will build an output buffer ourself and then use O_DIRECT
       we could be in read lock for this
       for very large objects write directly to redo log in situ?

     WRITETOJOURNAL
       we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity
         have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
         downgrading to (a perhaps upgradable) read lock would be a good start

     WRITETODATAFILES
       apply the writes back to the non-private MMF after they are for certain in redo log

     REMAPPRIVATEVIEW
       we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
         remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
         to be too frequent. tracking time for this step would be wise.
       there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
         be required. so doing these remaps more incrementally in the future might make sense - but have to be careful
         not to introduce bugs.
*/
#include "pch.h"
2010-11-14 00:32:41 +01:00
#if defined(_DURABLE)
2010-10-05 02:09:41 +02:00
2010-11-15 04:28:04 +01:00
#include "client.h"
2010-10-05 02:09:41 +02:00
#include "dur.h"
2010-11-13 23:42:41 +01:00
#include "dur_journal.h"
2010-10-05 02:09:41 +02:00
#include "../util/mongoutils/hash.h"
2010-11-15 04:28:04 +01:00
#include "../util/timer.h"
2010-11-16 04:13:48 +01:00
#include "../util/alignedbuilder.h"
2010-10-05 02:09:41 +02:00
namespace mongo {
2010-10-04 23:05:31 +02:00
/** hook presumably invoked as a database write lock is released (judging by
    the name — confirm at call sites). Durability work to perform here is
    still pending / unimplemented. */
void dbunlocking_write() {
// pending ...
}
2010-10-05 02:09:41 +02:00
namespace dur {
2010-11-15 04:28:04 +01:00
//MongoMMF* pointerToMMF(void *p, size_t& ofs);
2010-11-14 00:32:41 +01:00
2010-10-05 02:09:41 +02:00
/** a record of declared intent to write to a region of memory.
    p is the start of the region, len its size in bytes. */
struct WriteIntent {
    // fix: the original default constructor left len uninitialized; a
    // default-constructed intent now reads as the empty region at null.
    WriteIntent() : p(0), len(0) { }
    WriteIntent(void *a, unsigned b) : p(a), len(b) { }
    void *p;      // where we will write
    unsigned len; // up to this len
};
/* try to remember things we have already marked for journalling. false negatives are ok if infrequent -
we will just log them twice.
*/
template<int Prime>
class Already {
enum { N = Prime }; // this should be small the idea is that it fits in the cpu cache easily
WriteIntent nodes[N];
public:
2010-11-15 04:28:04 +01:00
Already() { clear(); }
void clear() { memset(this, 0, sizeof(*this)); }
2010-11-13 19:04:48 +01:00
/* see if we have Already recorded/indicated our write intent for this region of memory.
@return true if already indicated.
*/
2010-10-05 02:09:41 +02:00
bool checkAndSet(const WriteIntent& w) {
unsigned x = mongoutils::hashPointer(w.p);
2010-11-13 19:04:48 +01:00
WriteIntent& nd = nodes[x % N];
if( nd.p != w.p || nd.len < w.len ) {
nd = w;
2010-10-05 02:09:41 +02:00
return false;
}
2010-11-13 19:04:48 +01:00
return true;
2010-10-05 02:09:41 +02:00
}
};
static Already<127> alreadyNoted;
2010-11-13 19:04:48 +01:00
/* our record of pending/uncommitted write intents */
2010-10-05 02:09:41 +02:00
static vector<WriteIntent> writes;
void* writingPtr(void *x, size_t len) {
//log() << "TEMP writing " << x << ' ' << len << endl;
2010-11-15 04:28:04 +01:00
void *p = x;
DEV p = MongoMMF::switchToPrivateView(x);
WriteIntent w(p, len);
2010-10-05 02:09:41 +02:00
if( !alreadyNoted.checkAndSet(w) ) {
2010-11-15 04:28:04 +01:00
// remember intent. we will journal it in a bit
2010-10-05 02:09:41 +02:00
writes.push_back(w);
wassert( writes.size() < 2000000 );
assert( writes.size() < 20000000 );
}
2010-11-15 04:28:04 +01:00
return p;
2010-10-05 02:09:41 +02:00
}
2010-11-15 04:28:04 +01:00
/** caller handles locking */
2010-11-16 04:13:48 +01:00
static bool PREPLOGBUFFER(AlignedBuilder& bb) {
2010-11-15 04:28:04 +01:00
if( writes.empty() )
return false;
2010-11-14 00:32:41 +01:00
2010-11-13 23:42:41 +01:00
bb.reset();
2010-11-15 04:28:04 +01:00
unsigned *lenInBlockHeader;
{
// JSectHeader
bb.appendStr("\nHH\n", false);
lenInBlockHeader = (unsigned *) bb.skip(4);
}
2010-11-13 23:42:41 +01:00
2010-11-15 04:28:04 +01:00
string lastFilePath;
{
scoped_lock lk(privateViews._mutex());
for( vector<WriteIntent>::iterator i = writes.begin(); i != writes.end(); i++ ) {
size_t ofs;
MongoMMF *mmf = privateViews._find(i->p, ofs);
if( mmf == 0 ) {
journalingFailure("view pointer cannot be resolved");
}
else {
if( mmf->filePath() != lastFilePath ) {
lastFilePath = mmf->filePath();
JDbContext c;
bb.appendStruct(c);
bb.appendStr(lastFilePath);
}
JEntry e;
e.len = i->len;
e.fileNo = mmf->fileSuffixNo();
bb.appendStruct(e);
bb.appendBuf(i->p, i->len);
}
2010-11-14 00:32:41 +01:00
}
2010-11-13 23:42:41 +01:00
}
2010-11-15 04:28:04 +01:00
{
JSectFooter f;
f.hash = 0;
bb.appendStruct(f);
}
{
unsigned L = (bb.len() + 8191) & 0xffffe000; // fill to alignment
dassert( L >= (unsigned) bb.len() );
*lenInBlockHeader = L;
unsigned padding = L - bb.len();
bb.skip(padding);
dassert( bb.len() % 8192 == 0 );
}
writes.clear();
alreadyNoted.clear();
return true;
}
2010-11-16 04:13:48 +01:00
static void WRITETOJOURNAL(const AlignedBuilder& bb) {
2010-11-15 04:28:04 +01:00
journal(bb);
}
2010-11-16 04:13:48 +01:00
/** one group-commit pass: build the log buffer, then journal it.
    caller handles locking (see go()). */
static void _go(AlignedBuilder& bb) {
    // fix: PREPLOGBUFFER's return value was ignored. When there are no
    // pending intents it returns false WITHOUT resetting bb, so the old code
    // re-journaled the previous pass's stale contents. Skip journaling then.
    if( !PREPLOGBUFFER(bb) )
        return;
    // todo: add double buffering so we can be (not even read locked) during WRITETOJOURNAL
    WRITETOJOURNAL(bb);
}
2010-11-16 04:13:48 +01:00
/** group-commit entry point: acquire a db lock, then run a commit pass.
    prefers a (briefly tried) read lock and falls back to a write lock. */
static void go(AlignedBuilder& bb) {
    {
        readlocktry rlk("", 1000);
        if( rlk.got() ) {
            _go(bb);
            return;
        }
    }
    // starvation on read locks could occur. so if read lock acquisition is slow, try to get a
    // write lock instead. otherwise writes could use too much RAM.
    writelock wlk;
    _go(bb);
}
2010-11-15 04:28:04 +01:00
static void durThread() {
Client::initThread("dur");
const int HowOftenToGroupCommitMs = 100;
2010-11-16 04:13:48 +01:00
AlignedBuilder bb(1024 * 1024 * 16); // reuse to avoid any heap fragmentation
2010-11-13 23:42:41 +01:00
while( 1 ) {
try {
2010-11-15 04:28:04 +01:00
int millis = HowOftenToGroupCommitMs;
{
Timer t;
journalRotate(); // note we do this part outside of mongomutex
millis -= t.millis();
if( millis < 5 || millis > HowOftenToGroupCommitMs )
millis = 5;
}
sleepmillis(millis);
go(bb);
2010-11-13 23:42:41 +01:00
}
2010-11-15 04:28:04 +01:00
catch(std::exception& e) {
log() << "exception in durThread " << e.what() << endl;
2010-11-13 23:42:41 +01:00
}
}
2010-11-13 19:04:48 +01:00
}
2010-11-15 22:03:56 +01:00
void unlinkThread();
2010-11-13 23:42:41 +01:00
void startup() {
2010-11-15 04:28:04 +01:00
journalMakeDir();
2010-11-13 23:42:41 +01:00
boost::thread t(durThread);
2010-11-15 22:03:56 +01:00
boost::thread t2(unlinkThread);
2010-11-13 23:42:41 +01:00
}
2010-11-13 19:04:48 +01:00
2010-10-05 02:09:41 +02:00
} // namespace dur
} // namespace mongo
#endif