2010-10-05 02:09:41 +02:00
|
|
|
// @file dur.cpp durability in the storage engine (crash-safeness / journaling)
|
|
|
|
|
2010-11-04 03:17:15 +01:00
|
|
|
/**
|
|
|
|
* Copyright (C) 2009 10gen Inc.
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU Affero General Public License, version 3,
|
|
|
|
* as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2010-10-05 02:09:41 +02:00
|
|
|
/*
   phases

     PREPLOGBUFFER
       we will build an output buffer ourself and then use O_DIRECT
       we could be in read lock for this
       for very large objects write directly to redo log in situ?
     WRITETOJOURNAL
       we could be unlocked (the main db lock that is...) for this, with sufficient care, but there is some complexity
         have to handle falling behind which would use too much ram (going back into a read lock would suffice to stop that).
       for now we are in read lock which is not ideal.
     WRITETODATAFILES
       apply the writes back to the non-private MMF after they are for certain in redo log
     REMAPPRIVATEVIEW
       we could in a write lock quickly flip readers back to the main view, then stay in read lock and do our real
         remapping. with many files (e.g., 1000), remapping could be time consuming (several ms), so we don't want
         to be too frequent. tracking time for this step would be wise.
       there could be a slow down immediately after remapping as fresh copy-on-writes for commonly written pages will
         be required. so doing these remaps more incrementally in the future might make sense - but have to be careful
         not to introduce bugs.
*/
|
|
|
|
|
|
|
|
#include "pch.h"
|
2010-11-28 16:13:01 +01:00
|
|
|
#include "cmdline.h"
|
2010-11-15 04:28:04 +01:00
|
|
|
#include "client.h"
|
2010-10-05 02:09:41 +02:00
|
|
|
#include "dur.h"
|
2010-11-13 23:42:41 +01:00
|
|
|
#include "dur_journal.h"
|
2010-11-20 21:29:49 +01:00
|
|
|
#include "dur_commitjob.h"
|
2010-10-05 02:09:41 +02:00
|
|
|
#include "../util/mongoutils/hash.h"
|
2010-11-27 00:18:24 +01:00
|
|
|
#include "../util/mongoutils/str.h"
|
2010-11-15 04:28:04 +01:00
|
|
|
#include "../util/timer.h"
|
2010-10-05 02:09:41 +02:00
|
|
|
|
2010-11-27 00:18:24 +01:00
|
|
|
using namespace mongoutils;
|
|
|
|
|
2010-10-05 02:09:41 +02:00
|
|
|
namespace mongo {
|
|
|
|
|
2010-12-13 06:40:13 +01:00
|
|
|
namespace dur {
|
2010-10-05 02:09:41 +02:00
|
|
|
|
2010-12-12 22:05:21 +01:00
|
|
|
#if defined(_DEBUG)
// Debug-build toggles for the (slow) consistency checks below. Both default
// to off even in _DEBUG; flip to true locally when chasing journaling bugs.
const bool DebugValidateMapsMatch = false;
const bool DebugCheckLastDeclaredWrite = false;
#else
// Release builds: checks are always disabled.
const bool DebugValidateMapsMatch = false;
const bool DebugCheckLastDeclaredWrite = false;
#endif
// Active implementation of the durability interface. Starts as the no-op
// NonDurableImpl; enableDurability() swaps in DurableImpl when built durable.
DurableInterface* DurableInterface::_impl = new NonDurableImpl();
#if !defined(_DURABLE)
// Durability compiled out: enableDurability is a no-op and NonDurableImpl
// stays in place. called by startup/main
void enableDurability() {}
#else
// called by startup/main
void enableDurability() { // TODO: merge with startup() ?
    // must still be running with the default (non-durable) implementation
    assert(typeid(*DurableInterface::_impl) == typeid(NonDurableImpl));
    // lets NonDurableImpl instance leak, but its tiny and only happens once
    DurableInterface::_impl = new DurableImpl();
}
// later in this file
static void groupCommit();

// the single in-progress batch of declared write intents / ops; the dur
// background thread group-commits it periodically
static CommitJob commitJob;
/** Force a group commit of the pending write batch right now.
    @return true (a commit was performed)
*/
bool DurableImpl::commitNow() {
    groupCommit();
    return true;
}
/** Block the calling thread until the next group commit completes.
    @return true (a commit occurred)
*/
bool DurableImpl::awaitCommit() {
    commitJob.awaitNextCommit();
    return true;
}
/** Declare that a file has been created
|
|
|
|
Normally writes are applied only after journalling, for safety. But here the file
|
|
|
|
is created first, and the journal will just replay the creation if the create didn't
|
|
|
|
happen because of crashing.
|
|
|
|
*/
|
2010-12-09 20:44:08 +01:00
|
|
|
void DurableImpl::createdFile(string filename, unsigned long long len) {
|
|
|
|
shared_ptr<DurOp> op( new FileCreatedOp(filename, len) );
|
|
|
|
commitJob.noteOp(op);
|
2010-11-27 00:18:24 +01:00
|
|
|
}
|
|
|
|
|
2010-12-13 01:47:10 +01:00
|
|
|
/** indicate that a database is about to be dropped. call before the actual drop. */
|
|
|
|
void DurableImpl::droppingDb(string db) {
|
|
|
|
shared_ptr<DurOp> op( new DropDbOp(db) );
|
|
|
|
commitJob.noteOp(op);
|
|
|
|
|
|
|
|
// must commit now, before files are actually unlinked:
|
2010-12-13 06:40:13 +01:00
|
|
|
groupCommit();
|
2010-12-13 01:47:10 +01:00
|
|
|
}
|
|
|
|
|
2010-11-22 01:36:40 +01:00
|
|
|
/** declare write intent. when already in the write view if testIntent is true. */
|
2010-12-09 20:44:08 +01:00
|
|
|
void DurableImpl::declareWriteIntent(void *p, unsigned len) {
|
|
|
|
WriteIntent w(p, len);
|
|
|
|
commitJob.note(w);
|
2010-11-22 01:36:40 +01:00
|
|
|
}
|
|
|
|
|
2010-12-09 20:44:08 +01:00
|
|
|
void* DurableImpl::writingPtr(void *x, unsigned len) {
|
2010-11-22 01:36:40 +01:00
|
|
|
void *p = x;
|
|
|
|
if( testIntent )
|
|
|
|
p = MongoMMF::switchToPrivateView(x);
|
|
|
|
declareWriteIntent(p, len);
|
|
|
|
return p;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** declare intent to write
|
|
|
|
@param ofs offset within buf at which we will write
|
|
|
|
@param len the length at ofs we will write
|
|
|
|
@return new buffer pointer. this is modified when testIntent is true.
|
|
|
|
*/
|
2010-12-09 20:44:08 +01:00
|
|
|
void* DurableImpl::writingAtOffset(void *buf, unsigned ofs, unsigned len) {
|
2010-11-22 01:36:40 +01:00
|
|
|
char *p = (char *) buf;
|
|
|
|
if( testIntent )
|
|
|
|
p = (char *) MongoMMF::switchToPrivateView(buf);
|
|
|
|
declareWriteIntent(p+ofs, len);
|
2010-11-15 04:28:04 +01:00
|
|
|
return p;
|
2010-10-05 02:09:41 +02:00
|
|
|
}
|
|
|
|
|
2010-11-27 23:30:51 +01:00
|
|
|
/** Used in _DEBUG builds to check that we didn't overwrite the last intent
    that was declared. called just before writelock release. we check a few
    bytes after the declared region to see if they changed.

    @see MongoMutex::_releasedWriteLock

    SLOW
*/
#if defined(_DEBUG)
void DurableImpl::debugCheckLastDeclaredWrite() {
    if( !DebugCheckLastDeclaredWrite )
        return;

    // in testIntent mode pointers were switched to the private view already;
    // this comparison scheme doesn't apply, so skip.
    if( testIntent )
        return;

    // invocation counter, logged if a mismatch is found below
    static int n;
    ++n;

    assert(debug && cmdLine.dur);
    vector<WriteIntent>& w = commitJob.writes();
    if( w.size() == 0 )
        return;
    // examine only the most recently declared intent
    const WriteIntent &i = w[w.size()-1];
    size_t ofs;
    MongoMMF *mmf = privateViews.find(i.p, ofs);
    if( mmf == 0 )
        return;
    size_t past = ofs + i.len;
    if( mmf->length() < past + 8 )
        return; // too close to end of view
    // compare the 8 bytes immediately past the declared region in the private
    // view (priv) vs the shared write view (writ); if the writer stayed in
    // bounds, those bytes should still agree.
    char *priv = (char *) mmf->getView();
    char *writ = (char *) mmf->view_write();
    unsigned long long *a = (unsigned long long *) (priv+past);
    unsigned long long *b = (unsigned long long *) (writ+past);
    if( *a != *b ) {
        // mismatch: check whether some earlier declared intent covers those
        // bytes, which would make the difference legitimate.
        for( unsigned z = 0; z < w.size() - 1; z++ ) {
            const WriteIntent& wi = w[z];
            char *r1 = (char*) wi.p;
            char *r2 = r1 + wi.len;
            if( r1 <= (((char*)a)+8) && r2 > (char*)a ) {
                //log() << "it's ok " << wi.p << ' ' << wi.len << endl;
                return;
            }
        }
        // no covering intent found: looks like an undeclared / out-of-bounds write
        log() << "dur data after write area " << i.p << " does not agree" << endl;
        log() << " was: " << ((void*)b) << " " << hexdump((char*)b, 8) << endl;
        log() << " now: " << ((void*)a) << " " << hexdump((char*)a, 8) << endl;
        log() << " n: " << n << endl;
        log() << endl;
    }
}
#endif
/** we will build an output buffer ourself and then use O_DIRECT
    we could be in read lock for this
    caller handles locking
*/
static void PREPLOGBUFFER() {
    assert( cmdLine.dur );
    AlignedBuilder& bb = commitJob._ab;
    bb.reset();

    unsigned lenOfs;
    // JSectHeader
    {
        bb.appendStr("\nHH\n", false);
        // reserve 4 bytes for the section length; patched once the final,
        // alignment-padded length is known (see bottom of this function)
        lenOfs = bb.skip(4);
    }

    // ops other than basic writes (file creates, db drops, ...)
    {
        for( vector< shared_ptr<DurOp> >::iterator i = commitJob.ops().begin(); i != commitJob.ops().end(); ++i ) {
            (*i)->serialize(bb);
        }
    }

    // write intents
    {
        scoped_lock lk(privateViews._mutex());
        string lastFilePath;
        for( vector<WriteIntent>::iterator i = commitJob.writes().begin(); i != commitJob.writes().end(); i++ ) {
            size_t ofs;
            MongoMMF *mmf = privateViews._find(i->p, ofs);
            if( mmf == 0 ) {
                string s = str::stream() << "view pointer cannot be resolved " << (size_t) i->p;
                journalingFailure(s.c_str()); // asserts
                return;
            }

            // mark the file so REMAPPRIVATEVIEW will pick it up
            if( !mmf->willNeedRemap() ) {
                mmf->willNeedRemap() = true; // usually it will already be dirty so don't bother writing then
            }
            //size_t ofs = ((char *)i->p) - ((char*)mmf->getView().p);
            // resolve the write-view destination now; WRITETODATAFILES uses w_ptr later
            i->w_ptr = ((char*)mmf->view_write()) + ofs;
            // emit a db-context marker each time the target file changes
            if( mmf->filePath() != lastFilePath ) {
                lastFilePath = mmf->filePath();
                JDbContext c;
                bb.appendStruct(c);
                bb.appendStr(lastFilePath);
            }
            // entry header (length, offset, file number) followed by the raw bytes
            JEntry e;
            e.len = i->len;
            assert( ofs <= 0x80000000 );
            e.ofs = (unsigned) ofs;
            e.fileNo = mmf->fileSuffixNo();
            bb.appendStruct(e);
            bb.appendBuf(i->p, i->len);
        }
    }

    {
        // section footer, computed over the section bytes written so far
        JSectFooter f(bb.buf(), bb.len());
        bb.appendStruct(f);
    }

    {
        // pad the section up to Alignment (required for O_DIRECT writes) and
        // patch the length field reserved at the top
        assert( 0xffffe000 == (~(Alignment-1)) );
        unsigned L = (bb.len() + Alignment-1) & (~(Alignment-1)); // fill to alignment
        dassert( L >= (unsigned) bb.len() );
        *((unsigned*)bb.atOfs(lenOfs)) = L;
        unsigned padding = L - bb.len();
        bb.skip(padding);
        dassert( bb.len() % Alignment == 0 );
    }

    return;
}
/** write the buffer we have built to the journal and fsync it.
    outside of lock as that could be slow.
    @param ab the section built by PREPLOGBUFFER
*/
static void WRITETOJOURNAL(AlignedBuilder& ab) {
    journal(ab); // see dur_journal.h
}
/** (SLOW) diagnostic to check that the private view and the non-private view are in sync.
|
|
|
|
*/
|
|
|
|
static void debugValidateMapsMatch() {
|
2010-12-06 23:22:56 +01:00
|
|
|
if( !DebugValidateMapsMatch )
|
2010-12-05 22:53:39 +01:00
|
|
|
return;
|
2010-12-01 17:18:11 +01:00
|
|
|
|
|
|
|
Timer t;
|
|
|
|
set<MongoFile*>& files = MongoFile::getAllFiles();
|
|
|
|
for( set<MongoFile*>::iterator i = files.begin(); i != files.end(); i++ ) {
|
|
|
|
MongoFile *mf = *i;
|
|
|
|
if( mf->isMongoMMF() ) {
|
|
|
|
MongoMMF *mmf = (MongoMMF*) mf;
|
|
|
|
const char *p = (const char *) mmf->getView();
|
|
|
|
const char *w = (const char *) mmf->view_write();
|
|
|
|
unsigned low = 0xffffffff;
|
|
|
|
unsigned high = 0;
|
|
|
|
for( unsigned i = 0; i < mmf->length(); i++ ) {
|
|
|
|
if( p[i] != w[i] ) {
|
2010-12-05 22:53:39 +01:00
|
|
|
log() << i << '\t' << (int) p[i] << '\t' << (int) w[i] << endl;
|
2010-12-01 17:18:11 +01:00
|
|
|
if( i < low ) low = i;
|
|
|
|
if( i > high ) high = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if( low != 0xffffffff ) {
|
|
|
|
std::stringstream ss;
|
|
|
|
ss << "dur error warning views mismatch " << mmf->filename() << ' ' << (hex) << low << ".." << high << " len:" << high-low+1;
|
|
|
|
log() << ss.str() << endl;
|
2010-12-05 22:53:39 +01:00
|
|
|
log() << "priv loc: " << (void*)(p+low) << endl;
|
|
|
|
vector<WriteIntent>& w = commitJob.writes();
|
2010-12-06 23:26:59 +01:00
|
|
|
(void)w; // mark as unused. Useful for inspection in debugger
|
|
|
|
|
2010-12-01 17:18:11 +01:00
|
|
|
breakpoint();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2010-12-06 23:22:56 +01:00
|
|
|
log() << "debugValidateMapsMatch " << t.millis() << "ms " << endl;
|
2010-12-05 22:53:39 +01:00
|
|
|
}
|
2010-11-15 04:28:04 +01:00
|
|
|
|
2010-11-17 07:53:52 +01:00
|
|
|
/** apply the writes back to the non-private MMF after they are for certain in redo log

    (1) todo we don't need to write back everything every group commit. we MUST write back
    that which is going to be remapped on its private view - but that might not be all
    views.

    (2) todo should we do this using N threads? would be quite easy
        see Hackenberg paper table 5 and 6. 2 threads might be a good balance.

    locking: in read lock when called
*/
static void WRITETODATAFILES() {
    /* we go backwards as what is at the end is most likely in the cpu cache. it won't be much, but we'll take it. */
    for( int i = commitJob.writes().size() - 1; i >= 0; i-- ) {
        const WriteIntent& intent = commitJob.writes()[i];
        // w_ptr was resolved to the shared write view in PREPLOGBUFFER
        char *dst = (char *) (intent.w_ptr);
        memcpy(dst, intent.p, intent.len);
    }

    debugValidateMapsMatch();
}
/** We need to remap the private views periodically. otherwise they would become very large.
|
|
|
|
Call within write lock.
|
2010-11-17 19:59:29 +01:00
|
|
|
*/
|
2010-11-27 21:25:08 +01:00
|
|
|
void REMAPPRIVATEVIEW() {
|
|
|
|
static unsigned startAt;
|
|
|
|
static unsigned long long lastRemap;
|
|
|
|
|
|
|
|
dbMutex.assertWriteLocked();
|
|
|
|
dbMutex._remapPrivateViewRequested = false;
|
2010-12-01 17:18:11 +01:00
|
|
|
assert( !commitJob.hasWritten() );
|
2010-11-27 21:25:08 +01:00
|
|
|
|
2010-12-01 19:16:37 +01:00
|
|
|
if( 0 ) {
|
|
|
|
log() << "TEMP remapprivateview disabled for testing - will eventually run oom in this mode if db bigger than ram" << endl;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-11-27 21:25:08 +01:00
|
|
|
// we want to remap all private views about every 2 seconds. there could be ~1000 views so
|
|
|
|
// we do a little each pass; beyond the remap time, more significantly, there will be copy on write
|
|
|
|
// faults after remapping, so doing a little bit at a time will avoid big load spikes on
|
|
|
|
// remapping.
|
|
|
|
unsigned long long now = curTimeMicros64();
|
|
|
|
double fraction = (now-lastRemap)/20000000.0;
|
|
|
|
|
|
|
|
set<MongoFile*>& files = MongoFile::getAllFiles();
|
|
|
|
unsigned sz = files.size();
|
|
|
|
if( sz == 0 )
|
|
|
|
return;
|
|
|
|
|
|
|
|
unsigned ntodo = (unsigned) (sz * fraction);
|
|
|
|
if( ntodo < 1 ) ntodo = 1;
|
|
|
|
if( ntodo > sz ) ntodo = sz;
|
|
|
|
|
|
|
|
const set<MongoFile*>::iterator b = files.begin();
|
|
|
|
const set<MongoFile*>::iterator e = files.end();
|
|
|
|
set<MongoFile*>::iterator i = b;
|
2010-12-08 06:27:04 +01:00
|
|
|
// skip to our starting position
|
2010-11-27 21:25:08 +01:00
|
|
|
for( unsigned x = 0; x < startAt; x++ ) {
|
|
|
|
i++;
|
|
|
|
if( i == e ) i = b;
|
|
|
|
}
|
2010-12-08 06:27:04 +01:00
|
|
|
startAt = (startAt + ntodo) % sz; // mark where to start next time
|
2010-11-27 21:25:08 +01:00
|
|
|
|
|
|
|
for( unsigned x = 0; x < ntodo; x++ ) {
|
|
|
|
dassert( i != e );
|
2010-12-08 06:27:04 +01:00
|
|
|
if( (*i)->isMongoMMF() ) {
|
|
|
|
MongoMMF *mmf = (MongoMMF*) *i;
|
|
|
|
assert(mmf);
|
|
|
|
if( mmf->willNeedRemap() ) {
|
|
|
|
mmf->willNeedRemap() = false;
|
|
|
|
mmf->remapThePrivateView();
|
|
|
|
}
|
|
|
|
i++;
|
|
|
|
if( i == e ) i = b;
|
2010-11-27 21:25:08 +01:00
|
|
|
}
|
2010-11-17 07:53:52 +01:00
|
|
|
}
|
2010-11-17 19:59:29 +01:00
|
|
|
}
|
2010-11-17 07:53:52 +01:00
|
|
|
|
2010-11-27 00:18:24 +01:00
|
|
|
/** locking in read lock when called
    @see MongoMMF::close()
*/
static void groupCommit() {
    dbMutex.assertAtLeastReadLocked();

    if( !commitJob.hasWritten() )
        return;

    // serialize the batch into commitJob._ab
    PREPLOGBUFFER();

    WRITETOJOURNAL(commitJob._ab);

    // data is now in the journal, which is sufficient for acknowledging getlasterror.
    // (ok to crash after that)
    // NOTE(review): "TEMP" suggests leftover debug logging - confirm whether
    // this line should be removed before release.
    log() << "TEMP NOTIFYING COMMITTED" << endl;
    commitJob.notifyCommitted();

    // write the noted write intent entries to the data files.
    // this has to come after writing to the journal, obviously...
    MongoFile::markAllWritable(); // for _DEBUG. normally we don't write in a read lock
    WRITETODATAFILES();
    if (!dbMutex.isWriteLocked())
        MongoFile::unmarkAllWritable();

    // batch fully applied; start a fresh one
    commitJob.reset();

    // REMAPPRIVATEVIEW
    //
    // remapping private views must occur after WRITETODATAFILES otherwise
    // we wouldn't see newly written data on reads.
    //
    DEV assert( !commitJob.hasWritten() );
    if( !dbMutex.isWriteLocked() ) {
        // this needs done in a write lock thus we do it on the next acquisition of that
        // instead of here (there is no rush if you aren't writing anyway -- but it must happen,
        // if it is done, before any uncommitted writes occur).
        //
        dbMutex._remapPrivateViewRequested = true;
    }
    else {
        // however, if we are already write locked, we must do it now -- up the call tree someone
        // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls
        // this method when a file (and its views) is about to go away.
        //
        REMAPPRIVATEVIEW();
    }
}
static void go() {
|
2010-12-01 17:18:11 +01:00
|
|
|
if( !commitJob.hasWritten() )
|
2010-11-20 21:29:49 +01:00
|
|
|
return;
|
2010-12-01 19:16:37 +01:00
|
|
|
|
2010-11-13 23:42:41 +01:00
|
|
|
{
|
|
|
|
readlocktry lk("", 1000);
|
|
|
|
if( lk.got() ) {
|
2010-12-13 01:47:10 +01:00
|
|
|
groupCommit();
|
2010-11-13 23:42:41 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2010-12-01 19:16:37 +01:00
|
|
|
|
2010-11-13 23:42:41 +01:00
|
|
|
// starvation on read locks could occur. so if read lock acquisition is slow, try to get a
|
|
|
|
// write lock instead. otherwise writes could use too much RAM.
|
|
|
|
writelock lk;
|
2010-12-13 01:47:10 +01:00
|
|
|
groupCommit();
|
2010-11-13 23:42:41 +01:00
|
|
|
}
|
|
|
|
|
2010-12-01 19:16:37 +01:00
|
|
|
/** called when a MongoMMF is closing -- we need to go ahead and group commit in that case before its
|
|
|
|
views disappear
|
|
|
|
*/
|
|
|
|
void closingFileNotification() {
|
|
|
|
if( dbMutex.atLeastReadLocked() ) {
|
2010-12-13 01:47:10 +01:00
|
|
|
groupCommit();
|
2010-12-01 19:16:37 +01:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
assert( inShutdown() );
|
|
|
|
if( commitJob.hasWritten() ) {
|
|
|
|
log() << "dur warning files are closing outside locks with writes pending" << endl;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-15 04:28:04 +01:00
|
|
|
/** background thread loop: rotates journal files and triggers a group
    commit attempt roughly every HowOftenToGroupCommitMs milliseconds. */
static void durThread() {
    Client::initThread("dur");
    const int HowOftenToGroupCommitMs = 100;
    while( 1 ) {
        try {
            int millis = HowOftenToGroupCommitMs;
            {
                Timer t;
                journalRotate(); // note we do this part outside of mongomutex
                // deduct the rotation time from the sleep so commits stay
                // roughly evenly spaced; never sleep less than 5ms
                millis -= t.millis();
                if( millis < 5 || millis > HowOftenToGroupCommitMs )
                    millis = 5;
            }
            sleepmillis(millis);
            go();
        }
        catch(std::exception& e) {
            // keep the durability thread alive; just report and retry
            log() << "exception in durThread " << e.what() << endl;
        }
    }
}
void unlinkThread();  // background file-unlink worker (defined elsewhere)
void recover();       // journal recovery at startup (defined elsewhere)

// free-function wrapper so callers can invoke the debug overwrite check
// unconditionally; compiles to a no-op in non-_DEBUG builds.
void _debugCheckLastDeclaredWrite() {
#if defined(_DEBUG)
    getDur().debugCheckLastDeclaredWrite();
#endif
}
void DurableImpl::startup() {
|
|
|
|
if( !cmdLine.dur )
|
|
|
|
return;
|
|
|
|
if( testIntent )
|
|
|
|
return;
|
|
|
|
recover();
|
|
|
|
journalMakeDir();
|
|
|
|
boost::thread t(durThread);
|
|
|
|
boost::thread t2(unlinkThread);
|
|
|
|
}
|
2010-12-09 20:44:08 +01:00
|
|
|
|
2010-10-05 02:09:41 +02:00
|
|
|
#endif
|
2010-12-09 20:44:08 +01:00
|
|
|
|
2010-12-13 08:23:50 +01:00
|
|
|
|
|
|
|
} // namespace dur
|
|
|
|
|
|
|
|
|
2010-12-09 20:44:08 +01:00
|
|
|
} // namespace mongo
|
|
|
|
|