// reccache.h /* * Copyright (C) 2010 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ /* CachedBasicRecStore This is our store which implements a traditional page-cache type of storage (not memory mapped files). */ /* LOCK HIERARCHY dblock RecCache::rcmutex i.e. always lock dblock first if you lock both */ #pragma once #include "reci.h" #include "recstore.h" namespace mongo { class RecCache { struct Node { Node(void* _data) : data((char *) _data) { dirty = false; newer = 0; } ~Node() { free(data); data = 0; } char *data; DiskLoc loc; bool dirty; Node *older, *newer; // lru }; mongo::mutex rcmutex; // mainly to coordinate with the lazy writer thread unsigned recsize; map m; // the cache Node *newest, *oldest; unsigned nnodes; set dirtyl; vector stores; // DiskLoc::a() indicates the index into this vector map storesByNsKey; // nskey -> BasicRecStore* public: static unsigned MAXNODES; enum BaseValue { Base = 10000 }; private: BasicRecStore* _initStore(string fname); BasicRecStore* initStore(int n); string findStoreFilename(const char *_ns, bool& found); void initStoreByNs(const char *ns, const string& nskey); void closeStore(BasicRecStore *rs); static string directory(); static string mknskey(const char *ns) { return directory() + ns; } /* get the right file for a given diskloc */ BasicRecStore& store(DiskLoc& d) { int n = d.a() - Base; if( (int) stores.size() > n ) { BasicRecStore *rs = stores[n]; if( rs ) { assert( rs->fileNumber == n ); return *rs; } } return *initStore(n); } BasicRecStore& store(const char *ns) { string nskey = mknskey(ns); BasicRecStore *&rs = storesByNsKey[nskey]; if( rs ) return *rs; initStoreByNs(ns, nskey); return *rs; } void writeDirty( set::iterator i, bool rawLog = false ); void writeIfDirty(Node *n); void touch(Node* n) { if( n == newest ) return; if( n == oldest ) { oldest = oldest->newer; assert( oldest || nnodes == 1 ); } if( n->older ) n->older->newer = n->newer; if( n->newer ) n->newer->older = n->older; n->newer = 0; n->older = newest; newest->newer = n; newest = n; } Node* mkNode() { Node *n = new Node(calloc(recsize,1)); // calloc is TEMP for testing. change to malloc n->older = newest; if( newest ) newest->newer = n; else { assert( oldest == 0 ); oldest = n; } newest = n; nnodes++; return n; } fileofs fileOfs(DiskLoc d) { return ((fileofs) d.getOfs()) * recsize; } void dump(); void _ejectOld(); public: /* all public functions (except constructor) should use the mutex */ RecCache(unsigned recsz) : recsize(recsz) { nnodes = 0; newest = oldest = 0; } /* call this after doing some work, after you are sure you are done with modifications. we call it from dbunlocking(). */ void ejectOld() { if( nnodes > MAXNODES ) // just enough here to be inlineable for speed reasons. _ejectOld does the real work _ejectOld(); } /* bg writer thread invokes this */ void writeLazily(); /* Note that this may be called BEFORE the actual writing to the node takes place. We do flushing later on a dbunlocking() call, which happens after the writing. */ void dirty(DiskLoc d) { assert( d.a() >= Base ); scoped_lock lk(rcmutex); map::iterator i = m.find(d); if( i != m.end() ) { Node *n = i->second; if( !n->dirty ) { n->dirty = true; dirtyl.insert(n->loc); } } } char* get(DiskLoc d, unsigned len) { assert( d.a() >= Base ); assert( len == recsize ); scoped_lock lk(rcmutex); map::iterator i = m.find(d); if( i != m.end() ) { touch(i->second); return i->second->data; } Node *n = mkNode(); n->loc = d; store(d).get(fileOfs(d), n->data, recsize); // could throw exception m.insert( pair(d, n) ); return n->data; } void drop(const char *ns); DiskLoc insert(const char *ns, const void *obuf, int len, bool god) { scoped_lock lk(rcmutex); BasicRecStore& rs = store(ns); fileofs o = rs.insert((const char *) obuf, len); assert( o % recsize == 0 ); fileofs recnum = o / recsize; massert( 10377 , "RecCache file too large?", recnum <= 0x7fffffff ); Node *n = mkNode(); memcpy(n->data, obuf, len); DiskLoc d(rs.fileNumber + Base, (int) recnum); n->loc = d; m[d] = n; return d; } void closeFiles(string dbname, string path); // at termination: write dirty pages and close all files void closing(); }; extern RecCache theRecCache; class CachedBasicRecStore : public RecStoreInterface { public: VIRT char* get(DiskLoc d, unsigned len) { return theRecCache.get(d, len); } VIRT DiskLoc insert(const char *ns, const void *obuf, int len, bool god) { return theRecCache.insert(ns, obuf, len, god); } VIRT void modified(DiskLoc d) { theRecCache.dirty(d); } /* drop collection */ VIRT void drop(const char *ns) { theRecCache.drop(ns); } VIRT void rename(const char *fromNs, const char *toNs) { massert( 10378 , "rename not yet implemented for CachedBasicRecStore", false ); } /* close datafiles associated with the db specified. */ VIRT void closeFiles(string dbname, string path) { theRecCache.closeFiles(dbname, dbpath); } }; /* see concurrency.h - note on a lock reset from read->write we don't call dbunlocking_read, we just wait for the final dbunlocking_write call */ inline void dbunlocking_read() { /* Client *c = currentClient.get(); if ( c ) c->top.clientStop(); */ } inline void dbunlocking_write() { theRecCache.ejectOld(); dbunlocking_read(); } } /*namespace*/