do not pick a split key if it is not different from the chunk's min key (on behalf of Matt Taylor)

2024-12-01 09:32:32 +01:00 · 2010-10-19 13:12:28 -04:00 · 2010-10-19 13:12:28 -04:00 · 1e450f7b00
commit 1e450f7b00
parent 4c4349c696
2 changed files with 58 additions and 6 deletions
--- a/jstests/splitvector.js
+++ b/jstests/splitvector.js
@ -120,4 +120,29 @@ res = db.runCommand( { splitVector: "test.jstests_splitvector" , keyPattern: {x:
 assert.eq( true , res.ok , "6a" );
 assert.eq( 19 , res.splitKeys.length , "6b" );

+
+// -------------------------
+// Case 7: enough occurrences of min key documents to pass the chunk limit
+// [1111111111111111,2,3)
+
+f.drop();
+f.ensureIndex( { x: 1 } );
+
+// Fill collection and get split vector for 1MB maxChunkSize
+numDocs = 2100;
+for( i=1; i<numDocs; i++ ){
+    f.save( { x: 1, y: filler } );
+}
+
+for( i=1; i<10; i++ ){
+    f.save( { x: 2, y: filler } );
+}
+db.getLastError();
+res = db.runCommand( { splitVector: "test.jstests_splitvector" , keyPattern: {x:1} , maxChunkSize: 1 , min: 1} );
+
+assert.eq( true , res.ok , "7a" );
+assert.eq( 2 , res.splitKeys[0].x, "7b");
+
+
 print("PASSED");
+
--- a/s/d_split.cpp
+++ b/s/d_split.cpp
@ -230,23 +230,38 @@ namespace mongo {
            Timer timer;
            long long currCount = 0;
            long long numChunks = 0;
-            vector<BSONObj> splitKeys;
-            BSONObj currKey;
            
            BtreeCursor * bc = new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 );
            shared_ptr<Cursor> c( bc );
            scoped_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
+            if ( ! cc->ok() ){
+                errmsg = "can't open a cursor for splitting";
+                return false;
+            }

+            // Use every 'keyCount'-th key as a split point. We add the initial key as a sentinel, to be removed
+            // at the end. If a key appears more times than entries allowed on a chunk, we issue a warning and 
+            // split on the following key.
+            vector<BSONObj> splitKeys;
+            set<BSONObj> tooFrequentKeys;
+            splitKeys.push_back( c->currKey() );
            while ( cc->ok() ){ 
                currCount++;
                if ( currCount > keyCount ){
-                    if ( currKey.isEmpty() || currKey.woCompare( c->currKey() ) ){ 
-                        currKey = c->currKey();
-                        splitKeys.push_back( bc->prettyKey( currKey ) );
+                    BSONObj currKey = c->currKey();
+
+                    // Do not use this split key if it is the same as the one used for the previous split point.
+                    if ( currKey.woCompare( splitKeys.back() ) == 0 ){
+                        tooFrequentKeys.insert( currKey );
+
+                    } else {
+                        splitKeys.push_back( currKey );
                        currCount = 0;
                        numChunks++;
-                        log(4) << "picked a split key: " << currKey << endl;
+
+                        log(4) << "picked a split key: " << bc->prettyKey( currKey ) << endl;
                    }
+
                }
                cc->advance();

@ -266,6 +281,18 @@ namespace mongo {
                }
            }

+            // Warn for keys that are more numerous than maxChunkSize allows.
+            for ( set<BSONObj>::const_iterator it = tooFrequentKeys.begin(); it != tooFrequentKeys.end(); ++it ){
+                log( LL_WARNING ) << "chunk is larger than " << maxChunkSize 
+                                  << " bytes because of key " << bc->prettyKey( *it ) << endl;
+            }
+
+            // Remove the sentinel at the beginning before returning and add field names.
+            splitKeys.erase( splitKeys.begin() );
+            for ( vector<BSONObj>::iterator it = splitKeys.begin(); it != splitKeys.end() ; ++it ){
+                *it = bc->prettyKey( *it );
+            }
+
            ostringstream os;
            os << "Finding the split vector for " <<  ns << " over "<< keyPattern 
               << " keyCount: " << keyCount << " numSplits: " << splitKeys.size();