mongodb/dbtests/perf/btreeperf.cpp

// btreeperf.cpp

/*    Copyright 2010 10gen Inc.
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

/**
 * Performance timing and space utilization testing for btree indexes.
 */

#include <iostream>

#include <boost/random/bernoulli_distribution.hpp>
#include <boost/random/geometric_distribution.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/random/uniform_int.hpp>

#include "client/dbclient.h"
#include "../../util/timer.h"

using namespace std;
using namespace mongo;
using namespace boost;

const char *ns = "test.btreeperf";
const char *db = "test";
const char *index_collection = "btreeperf.$_id_";

// This random number generator has a much larger period than the default
// generator and is half as fast as the default.  Given that we intend to
// generate large numbers of documents and will utilize more than one random
// sample per document, choosing this generator seems like a worthwhile tradeoff.
mt19937 randomNumberGenerator;

/**
 * An interface for generating documents to be inserted and document specs for
 * remove requests.
 */
class InsertAndRemoveStrategy {
public:
    virtual ~InsertAndRemoveStrategy() {}
    virtual BSONObj insertObj() = 0;
    virtual BSONObj removeObj() = 0;
protected:
    /**
     * Helper functions for converting a sample value to a sample object with
     * specified _id, to be inserted or removed.
     */

    template< class T >
    BSONObj insertObjWithVal( const T &val ) {
        BSONObjBuilder b;
        b.append( "_id", val );
        return b.obj();
    }
    template< class T >
    BSONObj removeObjWithVal( const T &val ) {
        BSONObjBuilder b;
        b.append( "_id", val );
        return b.obj();
    }
};

/**
 * Manages a set of elements of type T.  Supports inserting unique elements and
 * sampling a random element without replacement.
 *
 * TODO In the contexts where this class is currently used, duplicate keys are
 * either impossible or highly unlikely.  And an occasional duplicate value will
 * not much affect the procedure by wich a random element is chosen.  We could
 * stop checking for duplicates in push(), eliminate _set from the implementaiton,
 * and potentially improve performance and memory requirements somewhat.
 */
template< class T >
class SetSampler {
public:
    /** @param val Insert this value in the set if not already present. */
    void push( const T& val ) {
        if ( _set.insert( val ).second ) {
            _vector.push_back( val );
        }
    }
    /** @return a random element removed from the set */
    T pull() {
        if ( _vector.size() == 0 ) {
            return T();
        }
        uniform_int< size_t > sizeRange( 0, _vector.size() - 1 );
        variate_generator< mt19937&, uniform_int< size_t > > sizeGenerator( randomNumberGenerator, sizeRange );
        size_t toRemove = sizeGenerator();
        T val = _vector[ toRemove ];
        // Replace the random element with the last element, then remove the
        // last element.
        _vector[ toRemove ] = _vector.back();
        _vector.pop_back();
        _set.erase( val );
        return val;
    }
private:
    vector< T > _vector;
    set< T > _set;
};

/**
 * Tracks values that have been specified for insertion by the derived class's
 * implementation of insertVal() and selects uniformally from among values that
 * have been inserted but not yet removed for the next value to remove.
 *
 * The implementation is probabilistically sound, but may be resource intensive
 * and slow due to the use of a SetSampler.
 */
template< class T >
class InsertAndUniformRemoveStrategy : public InsertAndRemoveStrategy {
public:
    virtual BSONObj insertObj() {
        T val = insertVal();
        _sampler.push( val );
        return insertObjWithVal( val );
    }
    virtual BSONObj removeObj() { return removeObjWithVal( _sampler.pull() ); }
protected:
    /** @return value to insert. This is the only function a derived class need implement. */
    virtual T insertVal() = 0;
private:
    SetSampler< T > _sampler;
};

/**
 * The derived class supplies keys to be inserted and removed.  The key removal
 * strategy is similar to the strategy for selecting a random element described
 * in the MongoDB cookbook: the first key in the collection greater than or
 * equal to the supplied removal key is removed.  This allows selecting an
 * exising key for removal without the overhead required by a SetSampler.
 *
 * While this ranged selection strategy can work well for selecting a random
 * element, there are some theoretical and empirically observed shortcomings
 * when the strategy is applied to removing nodes for btree performance measurement:
 * 1 The likelihood that a given key is removed is proportional to the difference
 *   in value between it and the previous key.  Because key deletion increases
 *   the difference in value between adjacent keys, neighboring keys will be
 *   more likely to be deleted than they would be in a true uniform distribution.
 * 2 MongoDB 1.6 uses 'unused' nodes in the btree implementation.  With a ranged
 *   removal strategy, those nodes must be traversed to find a node available
 *   for removal.
 * 3 Ranged removal was observed to be biased against the balancing policy of
 *   MongoDB 1.7 in some cases, in terms of storage size.  This may be a
 *   consequence of point 1 above.
 * 4 Ranged removal was observed to be significantly biased against the btree
 *   implementation in MongoDB 1.6 in terms of performance.  This is likely a
 *   consequence of point 2 above.
 * 5 In some cases the biases described above were not evident in tests lasting
 *   several minutes, but were evident in tests lasting several hours.
 */
template< class T >
class InsertAndRangedRemoveStrategy : public InsertAndRemoveStrategy {
public:
    virtual BSONObj insertObj() { return insertObjWithVal( insertVal() ); }
    virtual BSONObj removeObj() { return rangedRemoveObjWithVal( removeVal() ); }
protected:
    /** Small likelihood that this removal spec will not match any document */
    template< class U >
    BSONObj rangedRemoveObjWithVal( const U &val ) {
        BSONObjBuilder b1;
        BSONObjBuilder b2( b1.subobjStart( "_id" ) );
        b2.append( "$gte", val );
        b2.done();
        return b1.obj();
    }
    virtual T insertVal() = 0;
    virtual T removeVal() = 0;
};

/**
 * Integer Keys
 * Uniform Inserts
 * Uniform Removes
 */
class UniformInsertRangedUniformRemoveInteger : public InsertAndRangedRemoveStrategy< long long > {
public:
    UniformInsertRangedUniformRemoveInteger() :
        _uniform_int( 0ULL, ~0ULL ),
        _nextLongLong( randomNumberGenerator, _uniform_int ) {
    }
    /** Small likelihood of duplicates */
    virtual long long insertVal() { return _nextLongLong(); }
    virtual long long removeVal() { return _nextLongLong(); }
private:
    uniform_int< unsigned long long > _uniform_int;
    variate_generator< mt19937&, uniform_int< unsigned long long > > _nextLongLong;
};

class UniformInsertUniformRemoveInteger : public InsertAndUniformRemoveStrategy< long long > {
public:
    virtual long long insertVal() { return _gen.insertVal(); }
private:
    UniformInsertRangedUniformRemoveInteger _gen;
};

/**
 * String Keys
 * Uniform Inserts
 * Uniform Removes
 */
class UniformInsertRangedUniformRemoveString : public InsertAndRangedRemoveStrategy< string > {
public:
    UniformInsertRangedUniformRemoveString() :
        _geometric_distribution( 0.9 ),
        _nextLength( randomNumberGenerator, _geometric_distribution ),
        _uniform_char( 'a', 'z' ),
        _nextChar( randomNumberGenerator, _uniform_char ) {
    }
    /** Small likelihood of duplicates */
    virtual string insertVal() { return nextString(); }
    virtual string removeVal() { return nextString(); }
private:
    string nextString() {
        // The longer the minimum string length, the lower the likelihood of duplicates
        int len = _nextLength() + 5;
        len = len > 100 ? 100 : len;
        string ret( len, 'x' );
        for( int i = 0; i < len; ++i ) {
            ret[ i ] = _nextChar();
        }
        return ret;
    }
    geometric_distribution<> _geometric_distribution;
    variate_generator< mt19937&, geometric_distribution<> > _nextLength;
    uniform_int< char > _uniform_char;
    variate_generator< mt19937&, uniform_int< char > > _nextChar;
};

class UniformInsertUniformRemoveString : public InsertAndUniformRemoveStrategy< string > {
public:
    virtual string insertVal() { return _gen.insertVal(); }
private:
    UniformInsertRangedUniformRemoveString _gen;
};

/**
 * OID Keys
 * Increasing Inserts
 * Uniform Removes
 */
class IncreasingInsertRangedUniformRemoveOID : public InsertAndRangedRemoveStrategy< OID > {
public:
    IncreasingInsertRangedUniformRemoveOID() :
        _max( -1 ) {
    }
    virtual OID insertVal() { return oidFromULL( ++_max ); }
    virtual OID removeVal() {
        uniform_int< unsigned long long > distribution( 0, _max > 0 ? _max : 0 );
        variate_generator< mt19937&, uniform_int< unsigned long long > > generator( randomNumberGenerator, distribution );
        return oidFromULL( generator() );
    }
private:
    static OID oidFromULL( unsigned long long val ) {
        val = __builtin_bswap64( val );
        OID oid;
        oid.clear();
        memcpy( (char*)&oid + 4, &val, 8 );
        return oid;
    }
    long long _max;
};

class IncreasingInsertUniformRemoveOID : public InsertAndUniformRemoveStrategy< OID > {
public:
    virtual OID insertVal() { return _gen.insertVal(); }
private:
    IncreasingInsertRangedUniformRemoveOID _gen;
};

/**
 * Integer Keys
 * Increasing Inserts
 * Increasing Removes (on remove, the lowest key is always removed)
 */
class IncreasingInsertIncreasingRemoveInteger : public InsertAndRemoveStrategy {
public:
    IncreasingInsertIncreasingRemoveInteger() :
        // Start with a large value so data type will be preserved if we round
        // trip through json.
        _min( 1LL << 32 ),
        _max( 1LL << 32 ) {
    }
    virtual BSONObj insertObj() { return insertObjWithVal( ++_max ); }
    virtual BSONObj removeObj() { return removeObjWithVal( _min < _max ? ++_min : _min ); }
private:
    long long _min;
    long long _max;
};

/** Generate a random boolean value. */
class BernoulliGenerator {
public:
    /**
     * @param excessFalsePercent This specifies the desired rate of false values
     * vs true values.  If we want false to be 5% more likely than true, we
     * specify 5 for this argument.
     */
    BernoulliGenerator( int excessFalsePercent ) :
        _bernoulli_distribution( 1.0 / ( 2.0 + excessFalsePercent / 100.0 ) ),
        _generator( randomNumberGenerator, _bernoulli_distribution ) {
    }
    bool operator()() { return _generator(); }
private:
    bernoulli_distribution<> _bernoulli_distribution;
    variate_generator< mt19937&, bernoulli_distribution<> > _generator;
};

/** Runs a strategy on a connection, with specified mix of inserts and removes. */
class InsertAndRemoveRunner {
public:
    InsertAndRemoveRunner( DBClientConnection &conn, InsertAndRemoveStrategy &strategy, int excessInsertPercent ) :
        _conn( conn ),
        _strategy( strategy ),
        _nextOpTypeRemove( excessInsertPercent ) {
    }
    void writeOne() {
        if ( _nextOpTypeRemove() ) {
            _conn.remove( ns, _strategy.removeObj(), true );
        }
        else {
            _conn.insert( ns, _strategy.insertObj() );
        }
    }
private:
    DBClientConnection &_conn;
    InsertAndRemoveStrategy &_strategy;
    BernoulliGenerator _nextOpTypeRemove;
};

/**
 * Writes a test script to cout based on a strategy and specified mix of inserts
 * and removes.  The script can be subsequently executed by InsertAndRemoveRunner.
 * Script generation is intended for strategies that are memory or cpu intensive
 * and might either divert resources from a mongod instance being analyzed on the
 * same machine or fail to generate requests as quickly as the mongod might
 * accept them.
 * The script contains one line per operation.  Each line begins
 * with a letter indicating the operation type, followed by a space.  Next
 * follows the json representation of a document for the specified operation
 * type.
 */
class InsertAndRemoveScriptGenerator {
public:
    InsertAndRemoveScriptGenerator( InsertAndRemoveStrategy &strategy, int excessInsertPercent ) :
        _strategy( strategy ),
        _nextOpTypeRemove( excessInsertPercent ) {
    }
    void writeOne() {
        if ( _nextOpTypeRemove() ) {
            cout << "r " << _strategy.removeObj().jsonString() << endl;
        }
        else {
            cout << "i " << _strategy.insertObj().jsonString() << endl;
        }
    }
private:
    InsertAndRemoveStrategy &_strategy;
    BernoulliGenerator _nextOpTypeRemove;
};

/**
 * Run a test script from cin that was generated by
 * InsertAndRemoveScriptGenerator.  Running the script is intended to be
 * lightweight in terms of memory and cpu usage, and fast.
 */
class InsertAndRemoveScriptRunner {
public:
    InsertAndRemoveScriptRunner( DBClientConnection &conn ) :
        _conn( conn ) {
    }
    void writeOne() {
        cin.getline( _buf, 1024 );
        BSONObj val = fromjson( _buf + 2 );
        if ( _buf[ 0 ] == 'r' ) {
            _conn.remove( ns, val, true );
        }
        else {
            _conn.insert( ns, val );
        }
    }
private:
    DBClientConnection &_conn;
    char _buf[ 1024 ];
};

int main( int argc, const char **argv ) {

    DBClientConnection conn;
    conn.connect( "127.0.0.1:27017" );
    conn.dropCollection( ns );

//    UniformInsertRangedUniformRemoveInteger strategy;
//    UniformInsertUniformRemoveInteger strategy;
//    UniformInsertRangedUniformRemoveString strategy;
//    UniformInsertUniformRemoveString strategy;
//    IncreasingInsertRangedUniformRemoveOID strategy;
//    IncreasingInsertUniformRemoveOID strategy;
//    IncreasingInsertIncreasingRemoveInteger strategy;
//    InsertAndRemoveScriptGenerator runner( strategy, 5 );
    InsertAndRemoveScriptRunner runner( conn );

    Timer t;
    BSONObj statsCmd = BSON( "collstats" << index_collection );

    // Print header, unless we are generating a script (in that case, comment this out).
    cout << "ops,milliseconds,docs,totalBucketSize" << endl;

    long long i = 0;
    long long n = 10000000000;
    while( i < n ) {
        runner.writeOne();
        // Print statistics, unless we are generating a script (in that case, comment this out).
        // The stats collection requests below provide regular read operations,
        // ensuring we are caught up with the progress being made by the mongod
        // under analysis.
        if ( ++i % 50000 == 0 ) {
            // The total number of documents present.
            long long docs = conn.count( ns );
            BSONObj result;
            conn.runCommand( db, statsCmd, result );
            // The total number of bytes used for all allocated 8K buckets of the
            // btree.
            long long totalBucketSize = result.getField( "count" ).numberLong() * 8192;
            cout << i << ',' << t.millis() << ',' << docs << ',' << totalBucketSize << endl;
        }
    }
}