/* queryoptimizer.cpp */ /** * Copyright (C) 2008 10gen Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License, version 3, * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ #include "pch.h" #include "db.h" #include "btree.h" #include "pdfile.h" #include "queryoptimizer.h" #include "cmdline.h" //#define DEBUGQO(x) cout << x << endl; #define DEBUGQO(x) namespace mongo { void checkTableScanAllowed( const char * ns ){ if ( ! cmdLine.notablescan ) return; if ( strstr( ns , ".system." ) || strstr( ns , "local." ) ) return; if ( ! nsdetails( ns ) ) return; uassert( 10111 , (string)"table scans not allowed:" + ns , ! cmdLine.notablescan ); } double elementDirection( const BSONElement &e ) { if ( e.isNumber() ) return e.number(); return 1; } QueryPlan::QueryPlan( NamespaceDetails *_d, int _idxNo, const FieldRangeSet &fbs, const BSONObj &order, const BSONObj &startKey, const BSONObj &endKey , string special ) : d(_d), idxNo(_idxNo), fbs_( fbs ), order_( order ), index_( 0 ), optimal_( false ), scanAndOrderRequired_( true ), exactKeyMatch_( false ), direction_( 0 ), endKeyInclusive_( endKey.isEmpty() ), unhelpful_( false ), _special( special ), _type(0){ if ( !fbs_.matchPossible() ) { unhelpful_ = true; scanAndOrderRequired_ = false; return; } if( idxNo >= 0 ) { index_ = &d->idx(idxNo); } else { // full table scan case if ( order_.isEmpty() || !strcmp( order_.firstElement().fieldName(), "$natural" ) ) scanAndOrderRequired_ = false; return; } if ( _special.size() ){ optimal_ = true; _type = index_->getSpec().getType(); massert( 13040 , (string)"no type for special: " + _special , _type ); scanAndOrderRequired_ = _type->scanAndOrderRequired( fbs.query() , order ); return; } BSONObj idxKey = index_->keyPattern(); BSONObjIterator o( order ); BSONObjIterator k( idxKey ); if ( !o.moreWithEOO() ) scanAndOrderRequired_ = false; while( o.moreWithEOO() ) { BSONElement oe = o.next(); if ( oe.eoo() ) { scanAndOrderRequired_ = false; break; } if ( !k.moreWithEOO() ) break; BSONElement ke; while( 1 ) { ke = k.next(); if ( ke.eoo() ) goto doneCheckOrder; if ( strcmp( oe.fieldName(), ke.fieldName() ) == 0 ) break; if ( !fbs.range( ke.fieldName() ).equality() ) goto doneCheckOrder; } int d = elementDirection( oe ) == elementDirection( ke ) ? 1 : -1; if ( direction_ == 0 ) direction_ = d; else if ( direction_ != d ) break; } doneCheckOrder: if ( scanAndOrderRequired_ ) direction_ = 0; BSONObjIterator i( idxKey ); int exactIndexedQueryCount = 0; int optimalIndexedQueryCount = 0; bool stillOptimalIndexedQueryCount = true; set< string > orderFieldsUnindexed; order.getFieldNames( orderFieldsUnindexed ); while( i.moreWithEOO() ) { BSONElement e = i.next(); if ( e.eoo() ) break; const FieldRange &fb = fbs.range( e.fieldName() ); if ( stillOptimalIndexedQueryCount ) { if ( fb.nontrivial() ) ++optimalIndexedQueryCount; if ( !fb.equality() ) stillOptimalIndexedQueryCount = false; } else { if ( fb.nontrivial() ) optimalIndexedQueryCount = -1; } if ( fb.equality() ) { BSONElement e = fb.max(); if ( !e.isNumber() && !e.mayEncapsulate() && e.type() != RegEx ) ++exactIndexedQueryCount; } orderFieldsUnindexed.erase( e.fieldName() ); } if ( !scanAndOrderRequired_ && ( optimalIndexedQueryCount == fbs.nNontrivialRanges() ) ) optimal_ = true; if ( exactIndexedQueryCount == fbs.nNontrivialRanges() && orderFieldsUnindexed.size() == 0 && exactIndexedQueryCount == index_->keyPattern().nFields() && exactIndexedQueryCount == fbs.query().nFields() ) { exactKeyMatch_ = true; } indexBounds_ = fbs.indexBounds( idxKey, direction_ ); if ( !startKey.isEmpty() || !endKey.isEmpty() ) { BSONObj newStart, newEnd; if ( !startKey.isEmpty() ) newStart = startKey; else newStart = indexBounds_[ 0 ].first; if ( !endKey.isEmpty() ) newEnd = endKey; else newEnd = indexBounds_[ indexBounds_.size() - 1 ].second; BoundList newBounds; newBounds.push_back( make_pair( newStart, newEnd ) ); indexBounds_ = newBounds; } if ( ( scanAndOrderRequired_ || order_.isEmpty() ) && !fbs.range( idxKey.firstElement().fieldName() ).nontrivial() ) unhelpful_ = true; } shared_ptr QueryPlan::newCursor( const DiskLoc &startLoc , int numWanted ) const { if ( _type ) return _type->newCursor( fbs_.query() , order_ , numWanted ); if ( !fbs_.matchPossible() ){ if ( fbs_.nNontrivialRanges() ) checkTableScanAllowed( fbs_.ns() ); return shared_ptr( new BasicCursor( DiskLoc() ) ); } if ( !index_ ){ if ( fbs_.nNontrivialRanges() ) checkTableScanAllowed( fbs_.ns() ); return findTableScan( fbs_.ns(), order_, startLoc ); } massert( 10363 , "newCursor() with start location not implemented for indexed plans", startLoc.isNull() ); if ( indexBounds_.size() < 2 ) { // we are sure to spec endKeyInclusive_ return shared_ptr( new BtreeCursor( d, idxNo, *index_, indexBounds_[ 0 ].first, indexBounds_[ 0 ].second, endKeyInclusive_, direction_ >= 0 ? 1 : -1 ) ); } else { return shared_ptr( new BtreeCursor( d, idxNo, *index_, indexBounds_, direction_ >= 0 ? 1 : -1 ) ); } } shared_ptr QueryPlan::newReverseCursor() const { if ( !fbs_.matchPossible() ) return shared_ptr( new BasicCursor( DiskLoc() ) ); if ( !index_ ) { int orderSpec = order_.getIntField( "$natural" ); if ( orderSpec == INT_MIN ) orderSpec = 1; return findTableScan( fbs_.ns(), BSON( "$natural" << -orderSpec ) ); } massert( 10364 , "newReverseCursor() not implemented for indexed plans", false ); return shared_ptr(); } BSONObj QueryPlan::indexKey() const { if ( !index_ ) return BSON( "$natural" << 1 ); return index_->keyPattern(); } void QueryPlan::registerSelf( long long nScanned ) const { if ( fbs_.matchPossible() ) { scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient::get_inlock( ns() ).registerIndexForPattern( fbs_.pattern( order_ ), indexKey(), nScanned ); } } QueryPlanSet::QueryPlanSet( const char *_ns, const BSONObj &query, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max ) : ns(_ns), query_( query.getOwned() ), fbs_( _ns, query ), mayRecordPlan_( true ), usingPrerecordedPlan_( false ), hint_( BSONObj() ), order_( order.getOwned() ), oldNScanned_( 0 ), honorRecordedPlan_( honorRecordedPlan ), min_( min.getOwned() ), max_( max.getOwned() ) { if ( hint && !hint->eoo() ) { hint_ = hint->wrap(); } init(); } void QueryPlanSet::addHint( IndexDetails &id ) { if ( !min_.isEmpty() || !max_.isEmpty() ) { string errmsg; BSONObj keyPattern = id.keyPattern(); // This reformats min_ and max_ to be used for index lookup. massert( 10365 , errmsg, indexDetailsForRange( fbs_.ns(), errmsg, min_, max_, keyPattern ) ); } NamespaceDetails *d = nsdetails(ns); plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(id), fbs_, order_, min_, max_ ) ) ); } void QueryPlanSet::init() { DEBUGQO( "QueryPlanSet::init " << ns << "\t" << query_ ); plans_.clear(); mayRecordPlan_ = true; usingPrerecordedPlan_ = false; const char *ns = fbs_.ns(); NamespaceDetails *d = nsdetails( ns ); if ( !d || !fbs_.matchPossible() ) { // Table scan plan, when no matches are possible plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) ); return; } BSONElement hint = hint_.firstElement(); if ( !hint.eoo() ) { mayRecordPlan_ = false; if( hint.type() == String ) { string hintstr = hint.valuestr(); NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { IndexDetails& ii = i.next(); if ( ii.indexName() == hintstr ) { addHint( ii ); return; } } } else if( hint.type() == Object ) { BSONObj hintobj = hint.embeddedObject(); uassert( 10112 , "bad hint", !hintobj.isEmpty() ); if ( !strcmp( hintobj.firstElement().fieldName(), "$natural" ) ) { massert( 10366 , "natural order cannot be specified with $min/$max", min_.isEmpty() && max_.isEmpty() ); // Table scan plan plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) ); return; } NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { IndexDetails& ii = i.next(); if( ii.keyPattern().woCompare(hintobj) == 0 ) { addHint( ii ); return; } } } uassert( 10113 , "bad hint", false ); } if ( !min_.isEmpty() || !max_.isEmpty() ) { string errmsg; BSONObj keyPattern; IndexDetails *idx = indexDetailsForRange( ns, errmsg, min_, max_, keyPattern ); massert( 10367 , errmsg, idx ); plans_.push_back( PlanPtr( new QueryPlan( d, d->idxNo(*idx), fbs_, order_, min_, max_ ) ) ); return; } if ( isSimpleIdQuery( query_ ) ){ int idx = d->findIdIndex(); if ( idx >= 0 ){ usingPrerecordedPlan_ = true; mayRecordPlan_ = false; plans_.push_back( PlanPtr( new QueryPlan( d , idx , fbs_ , order_ ) ) ); return; } } if ( query_.isEmpty() && order_.isEmpty() ){ plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) ); return; } DEBUGQO( "\t special : " << fbs_.getSpecial() ); if ( fbs_.getSpecial().size() ){ _special = fbs_.getSpecial(); NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { int j = i.pos(); IndexDetails& ii = i.next(); const IndexSpec& spec = ii.getSpec(); if ( spec.getTypeName() == _special && spec.suitability( query_ , order_ ) ){ usingPrerecordedPlan_ = true; mayRecordPlan_ = true; plans_.push_back( PlanPtr( new QueryPlan( d , j , fbs_ , order_ , BSONObj() , BSONObj() , _special ) ) ); return; } } uassert( 13038 , (string)"can't find special index: " + _special + " for: " + query_.toString() , 0 ); } if ( honorRecordedPlan_ ) { scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient& nsd = NamespaceDetailsTransient::get_inlock( ns ); BSONObj bestIndex = nsd.indexForPattern( fbs_.pattern( order_ ) ); if ( !bestIndex.isEmpty() ) { usingPrerecordedPlan_ = true; mayRecordPlan_ = false; oldNScanned_ = nsd.nScannedForPattern( fbs_.pattern( order_ ) ); if ( !strcmp( bestIndex.firstElement().fieldName(), "$natural" ) ) { // Table scan plan plans_.push_back( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ) ); return; } NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { int j = i.pos(); IndexDetails& ii = i.next(); if( ii.keyPattern().woCompare(bestIndex) == 0 ) { plans_.push_back( PlanPtr( new QueryPlan( d, j, fbs_, order_ ) ) ); return; } } massert( 10368 , "Unable to locate previously recorded index", false ); } } addOtherPlans( false ); } void QueryPlanSet::addOtherPlans( bool checkFirst ) { const char *ns = fbs_.ns(); NamespaceDetails *d = nsdetails( ns ); if ( !d ) return; // If table scan is optimal or natural order requested or tailable cursor requested if ( !fbs_.matchPossible() || ( fbs_.nNontrivialRanges() == 0 && order_.isEmpty() ) || ( !order_.isEmpty() && !strcmp( order_.firstElement().fieldName(), "$natural" ) ) ) { // Table scan plan addPlan( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ), checkFirst ); return; } bool normalQuery = hint_.isEmpty() && min_.isEmpty() && max_.isEmpty() && query_.getField( "$or" ).eoo(); PlanSet plans; for( int i = 0; i < d->nIndexes; ++i ) { IndexDetails& id = d->idx(i); const IndexSpec& spec = id.getSpec(); IndexSuitability suitability = HELPFUL; if ( normalQuery ){ suitability = spec.suitability( query_ , order_ ); if ( suitability == USELESS ) continue; } PlanPtr p( new QueryPlan( d, i, fbs_, order_ ) ); if ( p->optimal() ) { addPlan( p, checkFirst ); return; } else if ( !p->unhelpful() ) { plans.push_back( p ); } } for( PlanSet::iterator i = plans.begin(); i != plans.end(); ++i ) addPlan( *i, checkFirst ); // Table scan plan addPlan( PlanPtr( new QueryPlan( d, -1, fbs_, order_ ) ), checkFirst ); } shared_ptr< QueryOp > QueryPlanSet::runOp( QueryOp &op ) { if ( usingPrerecordedPlan_ ) { Runner r( *this, op ); shared_ptr< QueryOp > res = r.run(); // plans_.size() > 1 if addOtherPlans was called in Runner::run(). if ( res->complete() || plans_.size() > 1 ) return res; { scoped_lock lk(NamespaceDetailsTransient::_qcMutex); NamespaceDetailsTransient::get_inlock( fbs_.ns() ).registerIndexForPattern( fbs_.pattern( order_ ), BSONObj(), 0 ); } init(); } Runner r( *this, op ); return r.run(); } BSONObj QueryPlanSet::explain() const { vector< BSONObj > arr; for( PlanSet::const_iterator i = plans_.begin(); i != plans_.end(); ++i ) { shared_ptr c = (*i)->newCursor(); BSONObjBuilder explain; explain.append( "cursor", c->toString() ); explain.appendArray( "indexBounds", c->prettyIndexBounds() ); arr.push_back( explain.obj() ); } BSONObjBuilder b; b.append( "allPlans", arr ); return b.obj(); } QueryPlanSet::PlanPtr QueryPlanSet::getBestGuess() const { assert( plans_.size() ); return plans_[0]; } QueryPlanSet::Runner::Runner( QueryPlanSet &plans, QueryOp &op ) : op_( op ), plans_( plans ) { } shared_ptr< QueryOp > QueryPlanSet::Runner::run() { massert( 10369 , "no plans", plans_.plans_.size() > 0 ); if ( plans_.plans_.size() > 1 ) log(1) << " running multiple plans" << endl; vector< shared_ptr< QueryOp > > ops; for( PlanSet::iterator i = plans_.plans_.begin(); i != plans_.plans_.end(); ++i ) { shared_ptr< QueryOp > op( op_.clone() ); op->setQueryPlan( i->get() ); ops.push_back( op ); } for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) { initOp( **i ); if ( (*i)->complete() ) return *i; } long long nScanned = 0; long long nScannedBackup = 0; while( 1 ) { ++nScanned; unsigned errCount = 0; bool first = true; for( vector< shared_ptr< QueryOp > >::iterator i = ops.begin(); i != ops.end(); ++i ) { QueryOp &op = **i; nextOp( op ); if ( op.complete() ) { if ( first ) nScanned += nScannedBackup; if ( plans_.mayRecordPlan_ && op.mayRecordPlan() ) op.qp().registerSelf( nScanned ); return *i; } if ( op.error() ) ++errCount; first = false; } if ( errCount == ops.size() ) break; if ( plans_.usingPrerecordedPlan_ && nScanned > plans_.oldNScanned_ * 10 && plans_._special.empty() ) { plans_.addOtherPlans( true ); PlanSet::iterator i = plans_.plans_.begin(); ++i; for( ; i != plans_.plans_.end(); ++i ) { shared_ptr< QueryOp > op( op_.clone() ); op->setQueryPlan( i->get() ); ops.push_back( op ); initOp( *op ); if ( op->complete() ) return op; } plans_.mayRecordPlan_ = true; plans_.usingPrerecordedPlan_ = false; nScannedBackup = nScanned; nScanned = 0; } } return ops[ 0 ]; } void QueryPlanSet::Runner::initOp( QueryOp &op ) { try { op.init(); } catch ( const std::exception &e ) { op.setExceptionMessage( e.what() ); } catch ( ... ) { op.setExceptionMessage( "Caught unknown exception" ); } } void QueryPlanSet::Runner::nextOp( QueryOp &op ) { try { if ( !op.error() ) op.next(); } catch ( const std::exception &e ) { op.setExceptionMessage( e.what() ); } catch ( ... ) { op.setExceptionMessage( "Caught unknown exception" ); } } MultiPlanScanner::MultiPlanScanner( const char *ns, const BSONObj &query, const BSONObj &order, const BSONElement *hint, bool honorRecordedPlan, const BSONObj &min, const BSONObj &max ) : _ns( ns ), _or( !query.getField( "$or" ).eoo() ), _query( query.getOwned() ), _i() { // _fros( ns, query ) { // eventually implement (some of?) these if ( !order.isEmpty() || hint || !honorRecordedPlan || !min.isEmpty() || !max.isEmpty() ) { _or = false; } if ( !_or ) { _currentQps.reset( new QueryPlanSet( ns, query, order, hint, honorRecordedPlan, min, max ) ); _n = 1; // only one run } else { BSONElement e = query.getField( "$or" ); massert( 13268, "invalid $or spec", e.type() == Array && e.embeddedObject().nFields() > 0 ); _n = e.embeddedObject().nFields(); } } shared_ptr< QueryOp > MultiPlanScanner::runOpOnce( QueryOp &op ) { massert( 13271, "can't run more ops", mayRunMore() ); if ( !_or ) { ++_i; return _currentQps->runOp( op ); } _currentQps.reset( new QueryPlanSet( _ns, nextSimpleQuery(), BSONObj() ) ); return _currentQps->runOp( op ); } shared_ptr< QueryOp > MultiPlanScanner::runOp( QueryOp &op ) { shared_ptr< QueryOp > ret = runOpOnce( op ); while( mayRunMore() ) { ret = runOpOnce( *ret ); } return ret; } bool indexWorks( const BSONObj &idxPattern, const BSONObj &sampleKey, int direction, int firstSignificantField ) { BSONObjIterator p( idxPattern ); BSONObjIterator k( sampleKey ); int i = 0; while( 1 ) { BSONElement pe = p.next(); BSONElement ke = k.next(); if ( pe.eoo() && ke.eoo() ) return true; if ( pe.eoo() || ke.eoo() ) return false; if ( strcmp( pe.fieldName(), ke.fieldName() ) != 0 ) return false; if ( ( i == firstSignificantField ) && !( ( direction > 0 ) == ( pe.number() > 0 ) ) ) return false; ++i; } return false; } BSONObj extremeKeyForIndex( const BSONObj &idxPattern, int baseDirection ) { BSONObjIterator i( idxPattern ); BSONObjBuilder b; while( i.moreWithEOO() ) { BSONElement e = i.next(); if ( e.eoo() ) break; int idxDirection = e.number() >= 0 ? 1 : -1; int direction = idxDirection * baseDirection; switch( direction ) { case 1: b.appendMaxKey( e.fieldName() ); break; case -1: b.appendMinKey( e.fieldName() ); break; default: assert( false ); } } return b.obj(); } pair< int, int > keyAudit( const BSONObj &min, const BSONObj &max ) { int direction = 0; int firstSignificantField = 0; BSONObjIterator i( min ); BSONObjIterator a( max ); while( 1 ) { BSONElement ie = i.next(); BSONElement ae = a.next(); if ( ie.eoo() && ae.eoo() ) break; if ( ie.eoo() || ae.eoo() || strcmp( ie.fieldName(), ae.fieldName() ) != 0 ) { return make_pair( -1, -1 ); } int cmp = ie.woCompare( ae ); if ( cmp < 0 ) direction = 1; if ( cmp > 0 ) direction = -1; if ( direction != 0 ) break; ++firstSignificantField; } return make_pair( direction, firstSignificantField ); } pair< int, int > flexibleKeyAudit( const BSONObj &min, const BSONObj &max ) { if ( min.isEmpty() || max.isEmpty() ) { return make_pair( 1, -1 ); } else { return keyAudit( min, max ); } } // NOTE min, max, and keyPattern will be updated to be consistent with the selected index. IndexDetails *indexDetailsForRange( const char *ns, string &errmsg, BSONObj &min, BSONObj &max, BSONObj &keyPattern ) { if ( min.isEmpty() && max.isEmpty() ) { errmsg = "one of min or max must be specified"; return 0; } Client::Context ctx( ns ); IndexDetails *id = 0; NamespaceDetails *d = nsdetails( ns ); if ( !d ) { errmsg = "ns not found"; return 0; } pair< int, int > ret = flexibleKeyAudit( min, max ); if ( ret == make_pair( -1, -1 ) ) { errmsg = "min and max keys do not share pattern"; return 0; } if ( keyPattern.isEmpty() ) { NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { IndexDetails& ii = i.next(); if ( indexWorks( ii.keyPattern(), min.isEmpty() ? max : min, ret.first, ret.second ) ) { if ( ii.getSpec().getType() == 0 ){ id = ⅈ keyPattern = ii.keyPattern(); break; } } } } else { if ( !indexWorks( keyPattern, min.isEmpty() ? max : min, ret.first, ret.second ) ) { errmsg = "requested keyPattern does not match specified keys"; return 0; } NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { IndexDetails& ii = i.next(); if( ii.keyPattern().woCompare(keyPattern) == 0 ) { id = ⅈ break; } if ( keyPattern.nFields() == 1 && ii.keyPattern().nFields() == 1 && IndexDetails::isIdIndexPattern( keyPattern ) && ii.isIdIndex() ){ id = ⅈ break; } } } if ( min.isEmpty() ) { min = extremeKeyForIndex( keyPattern, -1 ); } else if ( max.isEmpty() ) { max = extremeKeyForIndex( keyPattern, 1 ); } if ( !id ) { errmsg = (string)"no index found for specified keyPattern: " + keyPattern.toString(); return 0; } min = min.extractFieldsUnDotted( keyPattern ); max = max.extractFieldsUnDotted( keyPattern ); return id; } } // namespace mongo