0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 09:32:32 +01:00
mongodb/db/pdfile.cpp

545 lines
14 KiB
C++
Raw Normal View History

2007-10-20 01:35:48 +02:00
// pdfile.cpp
2007-10-30 10:50:14 +01:00
/*
todo:
_ manage deleted records. bucket?
_ use deleted on inserts!
_ quantize allocations
2007-10-30 16:35:17 +01:00
_ table scans must be sequential, not next/prev pointers
2007-10-31 02:16:35 +01:00
_ regex support
2007-10-30 10:50:14 +01:00
*/
2007-10-20 01:35:48 +02:00
#include "stdafx.h"
#include "pdfile.h"
#include "db.h"
#include "../util/mmap.h"
#include "../util/hashtab.h"
2007-11-09 03:42:50 +01:00
#include "objwrappers.h"
#include "btree.h"
2007-10-20 01:35:48 +02:00
/* The single DataFileMgr instance used by the free functions in this file
   (newNamespace() inserts through it, pdfileInit() initialises it). */
DataFileMgr theDataFileMgr;
/* just temporary */
2007-11-06 03:43:49 +01:00
//const int ExtentSize = 1 * 1024 * 1024;
2007-10-20 01:35:48 +02:00
2007-11-02 03:34:44 +01:00
/* Build a JSObj view over the object stored in record r.
   No bytes are copied: _objdata points at the record's data, and the object
   size is read from the int stored at the very start of that data.  The size
   must not exceed the record's net length. */
JSObj::JSObj(Record *r) {
    // NOTE(review): iFree = false presumably means "this object does not own
    // its buffer" -- confirm against the JSObj declaration.
    iFree = false;
    _objdata = r->data;
    const int *sizePrefix = (const int *) _objdata;
    _objsize = *sizePrefix;
    assert( _objsize <= r->netLength() );
}
2007-10-20 01:35:48 +02:00
/*---------------------------------------------------------------------*/
2007-10-30 10:50:14 +01:00
/* Size thresholds of the deleted-record buckets, doubling from 0x20 up to
   0x800000.  A length n belongs to the first bucket whose threshold is
   strictly greater than n. */
int bucketSizes[] = {
    0x20,     0x40,     0x80,     0x100,    0x200,
    0x400,    0x800,    0x1000,   0x2000,   0x4000,
    0x8000,   0x10000,  0x20000,  0x40000,  0x80000,
    0x100000, 0x200000, 0x400000, 0x800000
};
/* number of entries in bucketSizes */
const int Buckets = 19;
2007-10-30 16:35:17 +01:00
/* index of the last bucket (Buckets - 1); the free-list search gives up once it moves past it */
const int MaxBucket = 18;
2007-11-09 03:42:50 +01:00
/* capacity of the NamespaceDetails::indexes array */
const int MaxIndexes = 10;
/* One index on a namespace; instances are stored inside NamespaceDetails. */
class IndexDetails {
public:
/* location returned by BtreeBucket::addHead() when the index was added */
DiskLoc head;
/* location of the record that was inserted into system.indexes for this index */
DiskLoc info;
};
2007-10-30 10:50:14 +01:00
class NamespaceDetails {
public:
2007-11-06 03:43:49 +01:00
NamespaceDetails() {
datasize = nrecords = 0;
lastExtentSize = 0;
2007-11-09 03:42:50 +01:00
nIndexes = 0;
2007-11-06 03:43:49 +01:00
memset(reserved, 0, sizeof(reserved));
}
2007-10-30 10:50:14 +01:00
DiskLoc firstExtent;
DiskLoc deletedList[Buckets];
2007-11-06 03:43:49 +01:00
long long datasize;
long long nrecords;
int lastExtentSize;
2007-11-09 03:42:50 +01:00
int nIndexes;
IndexDetails indexes[MaxIndexes];
char reserved[256-16-4-4-8*MaxIndexes];
2007-10-30 10:50:14 +01:00
static int bucket(int n) {
for( int i = 0; i < Buckets; i++ )
if( bucketSizes[i] > n )
return i;
return Buckets-1;
}
2007-10-30 16:35:17 +01:00
void addDeletedRec(DeletedRecord *d, DiskLoc dloc) {
2007-10-30 10:50:14 +01:00
int b = bucket(d->lengthWithHeaders);
DiskLoc& list = deletedList[b];
DiskLoc oldHead = list;
list = dloc;
2007-10-30 16:35:17 +01:00
d->nextDeleted = oldHead;
2007-10-30 10:50:14 +01:00
}
2007-10-30 16:35:17 +01:00
DiskLoc alloc(int lenToAlloc, DiskLoc& extentLoc);
private:
DiskLoc _alloc(int len);
2007-10-30 10:50:14 +01:00
};
2007-10-30 16:35:17 +01:00
/* Allocate space for a record from this namespace's deleted lists.

   lenToAlloc - bytes wanted, headers included; rounded up to a multiple of 4.
   extentLoc  - out: set from the file of the returned location and the
                extentOfs stored in the chosen region.  Left untouched when
                nothing could be allocated.
   Returns the location of the region, or a null DiskLoc when _alloc() finds
   no fit.  When the chosen region is at least 24 bytes larger than needed,
   the surplus is split off and put back on the deleted lists. */
DiskLoc NamespaceDetails::alloc(int lenToAlloc, DiskLoc& extentLoc) {
    lenToAlloc = (lenToAlloc + 3) & 0xfffffffc;

    DiskLoc loc = _alloc(lenToAlloc);
    if( loc.isNull() )
        return loc;

    DeletedRecord *region = loc.drec();
    extentLoc.set(loc.a(), region->extentOfs);

    /* note we want to grab from the front so our next pointers on disk tend
       to go in a forward direction which is important for performance. */
    const int surplus = region->lengthWithHeaders - lenToAlloc;
    if( surplus >= 24 ) {
        /* keep the front part, hand the tail back as a new deleted record */
        region->lengthWithHeaders = lenToAlloc;

        DiskLoc tailLoc = loc;
        tailLoc.inc(lenToAlloc);

        DeletedRecord *tail = tailLoc.drec();
        tail->extentOfs = region->extentOfs;
        tail->lengthWithHeaders = surplus;
        tail->nextDeleted.Null();
        addDeletedRec(tail, tailLoc);
    }
    /* otherwise the caller simply gets the whole region */
    return loc;
}
/* Find a deleted record of at least len bytes and unlink it from its
   deleted list; the returned item is out of the deleted list upon return.

   The search starts in the bucket for len and moves on to larger buckets.
   Once a fit has been found, a few more entries are examined for a tighter
   fit.  If a chain gets long without the search finishing, the rest of that
   chain is abandoned in favour of the next bucket.

   Returns a null DiskLoc when every bucket has been exhausted without a fit
   (the caller then has to allocate a new extent). */
DiskLoc NamespaceDetails::_alloc(int len) {
    DiskLoc *prev;
    DiskLoc *bestprev = 0;
    DiskLoc bestmatch;
    int bestmatchlen = 0x7fffffff;
    int b = bucket(len);
    DiskLoc cur = deletedList[b]; prev = &deletedList[b];
    int extra = 5; // look for a better fit, a little.
    int chain = 0;
    while( 1 ) {
        if( cur.isNull() ) {
            // move to next bucket.  if we were doing "extra", just break
            if( bestmatchlen < 0x7fffffff )
                break;
            b++;
            if( b > MaxBucket ) {
                // out of space. alloc a new extent.
                return DiskLoc();
            }
            cur = deletedList[b]; prev = &deletedList[b];
            continue;
        }
        DeletedRecord *r = cur.drec();
        if( r->lengthWithHeaders >= len &&
            r->lengthWithHeaders < bestmatchlen ) {
            bestmatchlen = r->lengthWithHeaders;
            bestmatch = cur;
            bestprev = prev;
        }
        if( bestmatchlen < 0x7fffffff && --extra <= 0 )
            break;
        if( ++chain > 30 && b < MaxBucket ) {
            // too slow, force move to next bucket to grab a big chunk.
            // Only null out cur here: the null-cur branch at the top of the
            // loop is what advances b.  Incrementing b in both places skipped
            // a whole bucket, and from MaxBucket-1 reported "out of space"
            // without ever scanning the last bucket.
            chain = 0;
            cur.Null();
        }
        else {
            cur = r->nextDeleted; prev = &r->nextDeleted;
        }
    }

    /* unlink ourself from the deleted list */
    *bestprev = bestmatch.drec()->nextDeleted;
    return bestmatch;
}
2007-10-20 01:35:48 +02:00
class NamespaceIndex {
2007-11-02 03:34:44 +01:00
friend class NamespaceCursor;
2007-10-20 01:35:48 +02:00
public:
NamespaceIndex() { }
void init() {
const int LEN = 16 * 1024 * 1024;
2007-10-30 18:35:02 +01:00
void *p = f.map("/data/db/namespace.idx", LEN);
2007-11-03 02:30:40 +01:00
if( p == 0 ) {
cout << "couldn't open /data/db/namespace.idx" << endl;
exit(-3);
}
2007-10-30 10:50:14 +01:00
ht = new HashTable<Namespace,NamespaceDetails>(p, LEN, "namespace index");
2007-10-20 01:35:48 +02:00
}
void add(const char *ns, DiskLoc& loc) {
Namespace n(ns);
2007-10-30 10:50:14 +01:00
NamespaceDetails details;
details.firstExtent = loc;
ht->put(n, details);
2007-10-20 01:35:48 +02:00
}
2007-10-30 10:50:14 +01:00
NamespaceDetails* details(const char *ns) {
2007-10-20 01:35:48 +02:00
Namespace n(ns);
2007-10-30 10:50:14 +01:00
return ht->get(n);
}
bool find(const char *ns, DiskLoc& loc) {
NamespaceDetails *l = details(ns);
2007-10-20 01:35:48 +02:00
if( l ) {
2007-10-30 10:50:14 +01:00
loc = l->firstExtent;
2007-10-20 01:35:48 +02:00
return true;
}
return false;
}
private:
MemoryMappedFile f;
2007-10-30 10:50:14 +01:00
HashTable<Namespace,NamespaceDetails> *ht;
2007-10-20 01:35:48 +02:00
} namespaceIndex;
2007-11-02 03:34:44 +01:00
/* Cursor over the in-use slots of the namespace index hash table.  Each
   current() yields an object with a single "name" field holding the
   namespace name.  Record-level access (_current / currLoc) is not
   supported and asserts. */
class NamespaceCursor : public Cursor {
public:
    /* true while the cursor is positioned on an in-use slot */
    virtual bool ok() { return i >= 0; }
    virtual Record* _current() { assert(false); return 0; }
    virtual DiskLoc currLoc() { assert(false); return DiskLoc(); }

    /* Object describing the current namespace.  Only valid while ok(). */
    virtual JSObj current() {
        JSObjBuilder b;
        b.append("name", namespaceIndex.ht->nodes[i].k.buf);
        return b.done();
    }

    /* Move to the next in-use slot.  Returns false once the table is
       exhausted, after which ok() is false as well. */
    virtual bool advance() {
        // Already exhausted: i holds the large negative sentinel.  Stepping
        // on from it would index nodes[] far out of bounds.
        if( i < -1 )
            return false;
        for( i++; i < namespaceIndex.ht->n; i++ ) {
            if( namespaceIndex.ht->nodes[i].inUse() )
                return true;
        }
        i = -1000000;
        return false;
    }

    /* Starts positioned on the first in-use slot, if there is one. */
    NamespaceCursor() {
        i = -1;
        advance();
    }
private:
    int i;   // current slot in ht->nodes; negative when not on a slot
};
/* Create a cursor over all namespaces currently in the namespace index. */
auto_ptr<Cursor> makeNamespaceCursor() {
    auto_ptr<Cursor> cursor(new NamespaceCursor());
    return cursor;
}
2007-11-03 02:30:40 +01:00
void newNamespace(const char *ns) {
cout << "New namespace: " << ns << endl;
if( strcmp(ns, "system.namespaces") != 0 ) {
JSObjBuilder b;
b.append("name", ns);
JSObj j = b.done();
theDataFileMgr.insert("system.namespaces", j.objdata(), j.objsize(), true);
}
}
2007-10-20 01:35:48 +02:00
/*---------------------------------------------------------------------*/
void PhysicalDataFile::open(const char *filename, int length) {
header = (PDFHeader *) mmf.map(filename, length);
assert(header);
header->init(length);
}
2007-10-30 10:50:14 +01:00
/* prev - previous extent for this namespace. null=this is the first one. */
2007-11-06 03:43:49 +01:00
Extent* PhysicalDataFile::newExtent(const char *ns, int approxSize) {
int ExtentSize = approxSize <= header->unusedLength ? approxSize : header->unusedLength;
2007-10-30 16:35:17 +01:00
DiskLoc loc;
2007-11-06 03:43:49 +01:00
if( ExtentSize <= 0 ) {
2007-10-20 01:35:48 +02:00
cout << "ERROR: newExtent: no more room for extents. write more code" << endl;
assert(false);
exit(2);
}
int offset = header->unused.getOfs();
header->unused.setOfs( offset + ExtentSize );
header->unusedLength -= ExtentSize;
loc.setOfs(offset);
Extent *e = _getExtent(loc);
2007-10-30 16:35:17 +01:00
DiskLoc emptyLoc = e->init(ns, ExtentSize, offset);
DiskLoc oldExtentLoc;
if( namespaceIndex.find(ns, oldExtentLoc) ) {
Extent *old = oldExtentLoc.ext();
assert( old->xprev.isNull() );
old->xprev = loc;
e->xnext = oldExtentLoc;
namespaceIndex.details(ns)->firstExtent = loc;
2007-10-30 10:50:14 +01:00
}
2007-10-30 16:35:17 +01:00
else {
namespaceIndex.add(ns, loc);
}
2007-11-06 03:43:49 +01:00
NamespaceDetails *d = namespaceIndex.details(ns);
d->lastExtentSize = approxSize;
d->addDeletedRec(emptyLoc.drec(), emptyLoc);
2007-10-30 16:35:17 +01:00
2007-10-20 01:35:48 +02:00
return e;
}
/*---------------------------------------------------------------------*/
/* assumes already zeroed -- insufficient for block 'reuse' perhaps */
2007-10-30 16:35:17 +01:00
DiskLoc Extent::init(const char *nsname, int _length, int _offset) {
2007-10-20 01:35:48 +02:00
magic = 0x41424344;
myLoc.setOfs(_offset);
2007-10-30 10:50:14 +01:00
xnext.Null(); xprev.Null();
2007-10-20 01:35:48 +02:00
ns = nsname;
length = _length;
firstRecord.Null(); lastRecord.Null();
2007-10-30 16:35:17 +01:00
DiskLoc emptyLoc = myLoc;
emptyLoc.inc( (extentData-(char*)this) );
2007-10-20 01:35:48 +02:00
2007-10-30 16:35:17 +01:00
DeletedRecord *empty1 = (DeletedRecord *) extentData;
DeletedRecord *empty = (DeletedRecord *) getRecord(emptyLoc);
2007-10-20 01:35:48 +02:00
assert( empty == empty1 );
empty->lengthWithHeaders = _length - (extentData - (char *) this);
2007-10-30 16:35:17 +01:00
empty->extentOfs = myLoc.getOfs();
return emptyLoc;
2007-10-20 01:35:48 +02:00
}
2007-10-30 16:35:17 +01:00
/*
2007-10-20 01:35:48 +02:00
Record* Extent::newRecord(int len) {
if( firstEmptyRegion.isNull() )
return 0;
assert(len > 0);
int newRecSize = len + Record::HeaderSize;
DiskLoc newRecordLoc = firstEmptyRegion;
Record *r = getRecord(newRecordLoc);
int left = r->netLength() - len;
if( left < 0 ) {
2007-10-30 16:35:17 +01:00
//
2007-10-20 01:35:48 +02:00
firstEmptyRegion.Null();
return 0;
}
2007-10-30 10:50:14 +01:00
DiskLoc nextEmpty = r->next.getNextEmpty(firstEmptyRegion);
2007-10-20 01:35:48 +02:00
r->lengthWithHeaders = newRecSize;
2007-10-30 10:50:14 +01:00
r->next.markAsFirstOrLastInExtent(this); // we're now last in the extent
2007-10-20 01:35:48 +02:00
if( !lastRecord.isNull() ) {
2007-10-30 10:50:14 +01:00
assert(getRecord(lastRecord)->next.lastInExtent()); // it was the last one
getRecord(lastRecord)->next.set(newRecordLoc); // until now
r->prev.set(lastRecord);
2007-10-20 01:35:48 +02:00
}
2007-10-30 10:50:14 +01:00
else {
r->prev.markAsFirstOrLastInExtent(this); // we are the first in the extent
assert( firstRecord.isNull() );
2007-10-20 01:35:48 +02:00
firstRecord = newRecordLoc;
2007-10-30 10:50:14 +01:00
}
lastRecord = newRecordLoc;
2007-10-20 01:35:48 +02:00
if( left < Record::HeaderSize + 32 ) {
firstEmptyRegion.Null();
}
else {
firstEmptyRegion.inc(newRecSize);
Record *empty = getRecord(firstEmptyRegion);
2007-10-30 10:50:14 +01:00
empty->next.set(nextEmpty); // not for empty records, unless in-use records, next and prev can be null.
empty->prev.Null();
2007-10-20 01:35:48 +02:00
empty->lengthWithHeaders = left;
}
return r;
}
2007-10-30 16:35:17 +01:00
*/
2007-10-20 01:35:48 +02:00
/*---------------------------------------------------------------------*/
2007-11-02 03:34:44 +01:00
/* Cursor for scanning namespace ns.  It starts at the first record of the
   namespace's first extent; for an unknown namespace an informational
   message is printed and a cursor built on a null location is returned. */
auto_ptr<Cursor> DataFileMgr::findAll(const char *ns) {
    DiskLoc firstExtentLoc;
    if( namespaceIndex.find(ns, firstExtentLoc) ) {
        Extent *e = temp.getExtent(firstExtentLoc);
        return auto_ptr<Cursor>(new BasicCursor( e->firstRecord ));
    }

    cout << "info: findAll() namespace does not exist: " << ns << endl;
    return auto_ptr<Cursor>(new BasicCursor(DiskLoc()));
}
2007-11-05 01:17:42 +01:00
void aboutToDelete(const DiskLoc& dl);
2007-10-30 16:35:17 +01:00
void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl)
{
2007-11-05 01:17:42 +01:00
/* check if any cursors point to us. if so, advance them. */
aboutToDelete(dl);
2007-10-30 10:50:14 +01:00
/* remove ourself from the record next/prev chain */
2007-10-30 16:35:17 +01:00
{
if( todelete->prevOfs != DiskLoc::NullOfs )
todelete->getPrev(dl).rec()->nextOfs = todelete->nextOfs;
if( todelete->nextOfs != DiskLoc::NullOfs )
todelete->getNext(dl).rec()->prevOfs = todelete->prevOfs;
}
2007-10-30 10:50:14 +01:00
/* remove ourself from extent pointers */
2007-10-30 16:35:17 +01:00
{
Extent *e = todelete->myExtent(dl);
if( e->firstRecord == dl )
e->firstRecord.setOfs(todelete->nextOfs);
if( e->lastRecord == dl )
e->lastRecord.setOfs(todelete->prevOfs);
2007-10-30 10:50:14 +01:00
}
2007-11-05 01:17:42 +01:00
/* add to the free list */
2007-10-30 16:35:17 +01:00
{
NamespaceDetails* d = namespaceIndex.details(ns);
2007-11-06 03:43:49 +01:00
d->nrecords--;
d->datasize -= todelete->netLength();
2007-10-30 16:35:17 +01:00
d->addDeletedRec((DeletedRecord*)todelete, dl);
}
2007-10-30 10:50:14 +01:00
}
/** Replace the contents of record toupdate (located at dl) in namespace ns
    with the len bytes at buf.

    When the new contents fit they are copied over the old ones in place;
    otherwise the record is deleted and the contents are inserted afresh.

    Note: as written so far, if the object shrinks a lot, we don't free up space. */
void DataFileMgr::update(
    const char *ns,
    Record *toupdate, const DiskLoc& dl,
    const char *buf, int len)
{
    if( len <= toupdate->netLength() ) {
        memcpy(toupdate->data, buf, len);
        return;
    }

    // doesn't fit.
    cout << "temp: update: moving record to a larger location " << ns << endl;
    deleteRecord(ns, toupdate, dl);
    insert(ns, buf, len);
}
2007-11-06 03:43:49 +01:00
/* Size to use for the first extent of a namespace whose first record is
   len bytes long.

   The size is 64x the record length for records under 1000 bytes and 16x
   otherwise, capped at 1,000,000,000 and then rounded down to a multiple
   of 256.

   The multiplication is carried out in 64 bits.  Multiplying in int first
   overflowed for len >= 2^27 before the cap could be applied.

   Asserts that the result is larger than len; note that this does not hold
   for len < 4 (the product rounds down to 0) nor for len at or above the cap. */
int initialExtentSize(int len) {
    const int factor = len < 1000 ? 64 : 16;
    long long sz = static_cast<long long>(len) * factor;
    if( sz > 1000000000 )
        sz = 1000000000;
    int z = ((int)sz) & 0xffffff00;
    assert( z > len );
    return z;
}
/* Size to use for an additional extent of a namespace.

   len           - length of the record that did not fit
   lastExtentLen - size recorded for the namespace's previous extent

   The previous size is grown by 4x while it is under 4,000,000 and by 1.2x
   after that; the result is the larger of that and initialExtentSize(len),
   rounded down to a multiple of 256.  Asserts that it is larger than len.

   The grown size is clamped to the largest multiple of 256 that fits in an
   int before it is converted, because converting an out-of-range double to
   int is undefined. */
int followupExtentSize(int len, int lastExtentLen) {
    int x = initialExtentSize(len);

    double grown = lastExtentLen < 4000000 ? lastExtentLen * 4.0 : lastExtentLen * 1.2;
    const double largest = 0x7fffff00;
    if( grown > largest )
        grown = largest;
    int y = (int) grown;

    int sz = y > x ? y : x;
    sz = ((int)sz) & 0xffffff00;
    assert( sz > len );
    return sz;
}
2007-11-09 03:42:50 +01:00
/* Insert a new record holding the len bytes at buf into namespace ns.

   ns  - target namespace; it is created, with a first extent, if unknown
   buf - record contents
   len - number of bytes at buf
   god - permits inserting into system.* namespaces; system.indexes is
         always permitted

   Inserting into system.indexes additionally registers a new index: buf is
   read as an object with "name", "ns" and "key" fields, the object is
   validated, and once the record is stored the index is added to the
   details of the namespace named by its "ns" field.

   Returns the location of the new record, or a null DiskLoc when the
   insert is refused or no space can be found. */
DiskLoc DataFileMgr::insert(const char *ns, const void *buf, int len, bool god) {
    const bool systemNs = strncmp(ns, "system.", 7) == 0;
    const bool addIndex = systemNs && strcmp(ns, "system.indexes") == 0;
    if( systemNs && !addIndex && !god ) {
        cout << "ERROR: attempt to insert in system namespace " << ns << endl;
        return DiskLoc();
    }

    NamespaceDetails *d = namespaceIndex.details(ns);
    if( d == 0 ) {
        newNamespace(ns);
        temp.newExtent(ns, initialExtentSize(len));
        d = namespaceIndex.details(ns);
    }

    /* validate an index definition before anything is written */
    NamespaceDetails *indexesBaseDetails = 0;
    string indexFullNS;
    if( addIndex ) {
        JSObj io((const char *) buf);
        const char *name = io.getStringField("name");
        const char *idxns = io.getStringField("ns");
        JSObj key = io.getObjectField("key");

        const bool malformed =
            name == 0 || *name == 0 || idxns == 0 || key.isEmpty() || key.objsize() > 2048;
        if( malformed ) {
            cout << "ERROR: bad add index attempt name:" << (name?name:"") << " ns:" << (idxns?idxns:"") << endl;
            return DiskLoc();
        }

        indexesBaseDetails = namespaceIndex.details(idxns);
        if( indexesBaseDetails == 0 ) {
            cout << "ERROR: bad add index attempt, no such table(ns):" << idxns << endl;
            return DiskLoc();
        }
        if( indexesBaseDetails->nIndexes >= MaxIndexes ) {
            cout << "ERROR: bad add index attempt, too many indexes for:" << idxns << endl;
            return DiskLoc();
        }

        /* client.table.$index -- note this doesn't contain jsobjs, it contains BtreeBuckets. */
        indexFullNS = idxns;
        indexFullNS += ".$";
        indexFullNS += name;
    }

    /* find room, growing the namespace by one extent if necessary */
    DiskLoc extentLoc;
    const int lenWHdr = len + Record::HeaderSize;
    DiskLoc loc = d->alloc(lenWHdr, extentLoc);
    if( loc.isNull() ) {
        // out of space
        cout << "allocating new extent for " << ns << endl;
        temp.newExtent(ns, followupExtentSize(len, d->lastExtentSize));
        loc = d->alloc(lenWHdr, extentLoc);
        if( loc.isNull() ) {
            cout << "ERROR: out of space in datafile. write more code." << endl;
            assert(false);
            return DiskLoc();
        }
    }

    Record *r = loc.rec();
    assert( r->lengthWithHeaders >= lenWHdr );
    memcpy(r->data, buf, len);

    /* append the record to the end of its extent's record chain */
    Extent *e = r->myExtent(loc);
    if( e->lastRecord.isNull() ) {
        r->prevOfs = r->nextOfs = DiskLoc::NullOfs;
        e->firstRecord = loc;
    }
    else {
        Record *formerLast = e->lastRecord.rec();
        r->prevOfs = e->lastRecord.getOfs();
        r->nextOfs = DiskLoc::NullOfs;
        formerLast->nextOfs = loc.getOfs();
    }
    e->lastRecord = loc;

    d->nrecords++;
    d->datasize += r->netLength();

    if( indexesBaseDetails ) {
        IndexDetails& slot = indexesBaseDetails->indexes[indexesBaseDetails->nIndexes];
        slot.info = loc;
        slot.head = BtreeBucket::addHead(indexFullNS.c_str());
        indexesBaseDetails->nIndexes++;
        /* todo: index existing records here */
    }

    return loc;
}
void DataFileMgr::init() {
2007-10-30 18:35:02 +01:00
temp.open("/data/db/temp.dat", 64 * 1024 * 1024);
2007-10-20 01:35:48 +02:00
}
/* Start-up entry point for this file: maps the namespace index, then opens
   the data file through theDataFileMgr. */
void pdfileInit() {
namespaceIndex.init();
theDataFileMgr.init();
}