2010-10-05 02:09:41 +02:00
|
|
|
// @file dur.cpp durability in the storage engine (crash-safeness / journaling)
|
|
|
|
|
2010-11-04 03:17:15 +01:00
|
|
|
/**
|
|
|
|
* Copyright (C) 2009 10gen Inc.
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU Affero General Public License, version 3,
|
|
|
|
* as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2010-10-05 02:09:41 +02:00
|
|
|
/*
|
|
|
|
phases
|
|
|
|
|
|
|
|
PREPLOGBUFFER
|
|
|
|
we will build an output buffer ourselves and then use O_DIRECT
|
|
|
|
we could be in read lock for this
|
2010-11-13 19:04:48 +01:00
|
|
|
for very large objects write directly to redo log in situ?
|
2010-11-15 04:28:04 +01:00
|
|
|
WRITETOJOURNAL
|
2010-10-05 02:09:41 +02:00
|
|
|
we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity
|
|
|
|
have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
|
|
|
|
downgrading to (a perhaps upgradable) read lock would be a good start
|
2010-11-15 04:28:04 +01:00
|
|
|
WRITETODATAFILES
|
2010-10-05 02:09:41 +02:00
|
|
|
apply the writes back to the non-private MMF after they are for certain in redo log
|
|
|
|
REMAPPRIVATEVIEW
|
|
|
|
we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
|
|
|
|
remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
|
|
|
|
to be too frequent. tracking time for this step would be wise.
|
|
|
|
there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
|
|
|
|
be required. so doing these remaps more incrementally in the future might make sense - but have to be careful
|
|
|
|
not to introduce bugs.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "pch.h"
|
|
|
|
|
2010-11-14 00:32:41 +01:00
|
|
|
#if defined(_DURABLE)
|
2010-10-05 02:09:41 +02:00
|
|
|
|
2010-11-15 04:28:04 +01:00
|
|
|
#include "client.h"
|
2010-10-05 02:09:41 +02:00
|
|
|
#include "dur.h"
|
2010-11-13 23:42:41 +01:00
|
|
|
#include "dur_journal.h"
|
2010-10-05 02:09:41 +02:00
|
|
|
#include "../util/mongoutils/hash.h"
|
2010-11-15 04:28:04 +01:00
|
|
|
#include "../util/timer.h"
|
2010-11-16 04:13:48 +01:00
|
|
|
#include "../util/alignedbuilder.h"
|
2010-10-05 02:09:41 +02:00
|
|
|
|
|
|
|
namespace mongo {
|
|
|
|
|
|
|
|
namespace dur {
|
|
|
|
|
2010-11-15 04:28:04 +01:00
|
|
|
//MongoMMF* pointerToMMF(void *p, size_t& ofs);
|
2010-11-14 00:32:41 +01:00
|
|
|
|
2010-11-17 07:53:52 +01:00
|
|
|
/** a declared intention to write `len` bytes starting at `p`.
    plain value type: copyable, owns nothing.
*/
struct WriteIntent /* copyable */ {
    /** empty intent: both pointers null and a zero length, so no member is left indeterminate */
    WriteIntent() : w_ptr(0), p(0), len(0) { }

    /** @param a start of the region that will be written
        @param b number of bytes that may be written starting at a
    */
    WriteIntent(void *a, unsigned b) : w_ptr(0), p(a), len(b) { }

    void *w_ptr;  // p is mapped from private to equivalent location in the writable mmap
    void *p;      // intent to write at p
    unsigned len; // up to this len
};
|
|
|
|
|
|
|
|
/* try to remember things we have already marked for journalling. false negatives are ok if infrequent -
|
|
|
|
we will just log them twice.
|
|
|
|
*/
|
|
|
|
template<int Prime>
|
|
|
|
class Already {
|
|
|
|
enum { N = Prime }; // this should be small the idea is that it fits in the cpu cache easily
|
|
|
|
WriteIntent nodes[N];
|
|
|
|
public:
|
2010-11-15 04:28:04 +01:00
|
|
|
Already() { clear(); }
|
|
|
|
void clear() { memset(this, 0, sizeof(*this)); }
|
2010-11-13 19:04:48 +01:00
|
|
|
|
|
|
|
/* see if we have Already recorded/indicated our write intent for this region of memory.
|
|
|
|
@return true if already indicated.
|
|
|
|
*/
|
2010-10-05 02:09:41 +02:00
|
|
|
bool checkAndSet(const WriteIntent& w) {
|
|
|
|
unsigned x = mongoutils::hashPointer(w.p);
|
2010-11-13 19:04:48 +01:00
|
|
|
WriteIntent& nd = nodes[x % N];
|
|
|
|
if( nd.p != w.p || nd.len < w.len ) {
|
|
|
|
nd = w;
|
2010-10-05 02:09:41 +02:00
|
|
|
return false;
|
|
|
|
}
|
2010-11-13 19:04:48 +01:00
|
|
|
return true;
|
2010-10-05 02:09:41 +02:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2010-11-13 19:04:48 +01:00
|
|
|
/* our record of pending/uncommitted write intents */
|
2010-11-17 07:53:52 +01:00
|
|
|
struct Writes {
|
|
|
|
Already<127> _alreadyNoted;
|
|
|
|
vector<WriteIntent> _writes;
|
|
|
|
|
|
|
|
void clear() {
|
|
|
|
_alreadyNoted.clear();
|
|
|
|
_writes.clear();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
static Writes wi;
|
2010-10-05 02:09:41 +02:00
|
|
|
|
|
|
|
void* writingPtr(void *x, size_t len) {
|
|
|
|
//log() << "TEMP writing " << x << ' ' << len << endl;
|
2010-11-15 04:28:04 +01:00
|
|
|
void *p = x;
|
|
|
|
DEV p = MongoMMF::switchToPrivateView(x);
|
|
|
|
WriteIntent w(p, len);
|
2010-11-17 07:53:52 +01:00
|
|
|
if( !wi._alreadyNoted.checkAndSet(w) ) {
|
2010-11-15 04:28:04 +01:00
|
|
|
// remember intent. we will journal it in a bit
|
2010-11-17 07:53:52 +01:00
|
|
|
wi._writes.push_back(w);
|
|
|
|
wassert( wi._writes.size() < 2000000 );
|
|
|
|
assert( wi._writes.size() < 20000000 );
|
2010-10-05 02:09:41 +02:00
|
|
|
}
|
2010-11-15 04:28:04 +01:00
|
|
|
return p;
|
2010-10-05 02:09:41 +02:00
|
|
|
}
|
|
|
|
|
2010-11-15 04:28:04 +01:00
|
|
|
/** caller handles locking */
|
2010-11-16 04:13:48 +01:00
|
|
|
static bool PREPLOGBUFFER(AlignedBuilder& bb) {
|
2010-11-17 07:53:52 +01:00
|
|
|
if( wi._writes.empty() )
|
2010-11-15 04:28:04 +01:00
|
|
|
return false;
|
2010-11-14 00:32:41 +01:00
|
|
|
|
2010-11-13 23:42:41 +01:00
|
|
|
bb.reset();
|
|
|
|
|
2010-11-15 04:28:04 +01:00
|
|
|
unsigned *lenInBlockHeader;
|
|
|
|
{
|
|
|
|
// JSectHeader
|
|
|
|
bb.appendStr("\nHH\n", false);
|
|
|
|
lenInBlockHeader = (unsigned *) bb.skip(4);
|
|
|
|
}
|
2010-11-13 23:42:41 +01:00
|
|
|
|
2010-11-15 04:28:04 +01:00
|
|
|
string lastFilePath;
|
|
|
|
|
|
|
|
{
|
|
|
|
scoped_lock lk(privateViews._mutex());
|
2010-11-17 07:53:52 +01:00
|
|
|
for( vector<WriteIntent>::iterator i = wi._writes.begin(); i != wi._writes.end(); i++ ) {
|
2010-11-15 04:28:04 +01:00
|
|
|
size_t ofs;
|
|
|
|
MongoMMF *mmf = privateViews._find(i->p, ofs);
|
|
|
|
if( mmf == 0 ) {
|
|
|
|
journalingFailure("view pointer cannot be resolved");
|
|
|
|
}
|
|
|
|
else {
|
2010-11-17 07:53:52 +01:00
|
|
|
{
|
|
|
|
size_t ofs = ((char *)i->p) - ((char*)mmf->getView().p);
|
|
|
|
i->w_ptr = ((char*)mmf->view_write()) + ofs;
|
|
|
|
}
|
2010-11-15 04:28:04 +01:00
|
|
|
if( mmf->filePath() != lastFilePath ) {
|
|
|
|
lastFilePath = mmf->filePath();
|
|
|
|
JDbContext c;
|
|
|
|
bb.appendStruct(c);
|
|
|
|
bb.appendStr(lastFilePath);
|
|
|
|
}
|
|
|
|
JEntry e;
|
|
|
|
e.len = i->len;
|
|
|
|
e.fileNo = mmf->fileSuffixNo();
|
|
|
|
bb.appendStruct(e);
|
|
|
|
bb.appendBuf(i->p, i->len);
|
|
|
|
}
|
2010-11-14 00:32:41 +01:00
|
|
|
}
|
2010-11-13 23:42:41 +01:00
|
|
|
}
|
|
|
|
|
2010-11-15 04:28:04 +01:00
|
|
|
{
|
|
|
|
JSectFooter f;
|
|
|
|
f.hash = 0;
|
|
|
|
bb.appendStruct(f);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
unsigned L = (bb.len() + 8191) & 0xffffe000; // fill to alignment
|
|
|
|
dassert( L >= (unsigned) bb.len() );
|
|
|
|
*lenInBlockHeader = L;
|
|
|
|
unsigned padding = L - bb.len();
|
|
|
|
bb.skip(padding);
|
|
|
|
dassert( bb.len() % 8192 == 0 );
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2010-11-16 04:13:48 +01:00
|
|
|
static void WRITETOJOURNAL(const AlignedBuilder& bb) {
|
2010-11-15 04:28:04 +01:00
|
|
|
journal(bb);
|
|
|
|
}
|
|
|
|
|
2010-11-17 07:53:52 +01:00
|
|
|
/** apply the writes back to the non-private MMF after they are for certain in redo log
|
|
|
|
|
|
|
|
(1) todo we don't need to write back everything every group commit. we MUST write back
|
|
|
|
that which is going to be a remapped on its private view - but that might not be all
|
|
|
|
views.
|
|
|
|
|
|
|
|
(2) todo should we do this using N threads? would be quite easy
|
|
|
|
*/
|
|
|
|
static void WRITETODATAFILES() {
|
|
|
|
for( vector<WriteIntent>::iterator i = wi._writes.begin(); i != wi._writes.end(); i++ ) {
|
|
|
|
char *dst = (char *) (i->w_ptr);
|
|
|
|
memcpy(dst, i->p, i->len);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-16 04:13:48 +01:00
|
|
|
/** run one group commit: build the journal section, journal it, apply the writes to the
    data files, then forget the pending intents.
    caller handles locking
    @param bb buffer reused from one commit to the next
*/
static void _go(AlignedBuilder& bb) {
    // PREPLOGBUFFER returns false, without resetting bb, when there are no pending write
    // intents. stop here in that case: bb still holds whatever the previous commit put in
    // it, and carrying on would hand that old section to journal() a second time.
    if( !PREPLOGBUFFER(bb) )
        return;

    // todo: add double buffering so we can be (not even read locked) during WRITETOJOURNAL
    WRITETOJOURNAL(bb);

    // write the wi entries to the data files
    WRITETODATAFILES();

    wi.clear();

    //REMAPPRIVATEVIEW();
}
|
|
|
|
|
2010-11-16 04:13:48 +01:00
|
|
|
/** take a db lock and run one group commit under it.
    a read lock is tried first (readlocktry with a limit of 1000 - presumably milliseconds,
    confirm against readlocktry); if that is not obtained a write lock is taken instead.
*/
static void go(AlignedBuilder& bb) {
    bool committed = false;
    {
        readlocktry readLk("", 1000);
        if( readLk.got() ) {
            _go(bb);
            committed = true;
        }
    } // the read lock attempt is released here, before any write lock is requested
    if( committed )
        return;

    // starvation on read locks could occur.  so if read lock acquisition is slow, try to get a
    // write lock instead.  otherwise writes could use too much RAM.
    writelock writeLk;
    _go(bb);
}
|
|
|
|
|
2010-11-15 04:28:04 +01:00
|
|
|
static void durThread() {
|
|
|
|
Client::initThread("dur");
|
|
|
|
const int HowOftenToGroupCommitMs = 100;
|
2010-11-16 04:13:48 +01:00
|
|
|
AlignedBuilder bb(1024 * 1024 * 16); // reuse to avoid any heap fragmentation
|
2010-11-13 23:42:41 +01:00
|
|
|
while( 1 ) {
|
|
|
|
try {
|
2010-11-15 04:28:04 +01:00
|
|
|
int millis = HowOftenToGroupCommitMs;
|
|
|
|
{
|
|
|
|
Timer t;
|
|
|
|
journalRotate(); // note we do this part outside of mongomutex
|
|
|
|
millis -= t.millis();
|
|
|
|
if( millis < 5 || millis > HowOftenToGroupCommitMs )
|
|
|
|
millis = 5;
|
|
|
|
}
|
|
|
|
sleepmillis(millis);
|
|
|
|
go(bb);
|
2010-11-13 23:42:41 +01:00
|
|
|
}
|
2010-11-15 04:28:04 +01:00
|
|
|
catch(std::exception& e) {
|
|
|
|
log() << "exception in durThread " << e.what() << endl;
|
2010-11-13 23:42:41 +01:00
|
|
|
}
|
|
|
|
}
|
2010-11-13 19:04:48 +01:00
|
|
|
}
|
|
|
|
|
2010-11-15 22:03:56 +01:00
|
|
|
void unlinkThread();
|
|
|
|
|
2010-11-13 23:42:41 +01:00
|
|
|
void startup() {
|
2010-11-15 04:28:04 +01:00
|
|
|
journalMakeDir();
|
2010-11-13 23:42:41 +01:00
|
|
|
boost::thread t(durThread);
|
2010-11-15 22:03:56 +01:00
|
|
|
boost::thread t2(unlinkThread);
|
2010-11-13 23:42:41 +01:00
|
|
|
}
|
2010-11-13 19:04:48 +01:00
|
|
|
|
2010-10-05 02:09:41 +02:00
|
|
|
} // namespace dur
|
|
|
|
|
|
|
|
} // namespace mongo
|
|
|
|
|
|
|
|
#endif
|