mirror of https://github.com/mongodb/mongo.git
SERVER-1358 (Fix) chunk size accounting was using sum of all chunks on shards
parent ddbf7adfac
commit d3d11d6863
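The gist of SERVER-1358: the old storeCurrentLocs compared d->stats.datasize, the size of the entire collection on the donor shard, against maxChunkSize, so a small chunk could be refused merely because the shard held a lot of other data for the same collection. The new code instead estimates the size of the one chunk being moved, from the average record size times the number of records actually found in the chunk's key range. A minimal standalone sketch of the two checks; CollStats and both function names are illustrative stand-ins, not the server's types:

    #include <cstdio>

    // Illustrative stand-in for the per-collection stats the server keeps
    // (the real code reads d->stats on NamespaceDetails).
    struct CollStats {
        long long datasize;   // bytes of record data for the whole collection on this shard
        long long nrecords;   // record count for the whole collection on this shard
    };

    // Old check (the bug): compares the shard-wide collection size against the
    // limit, so any chunk on a shard with > maxChunkSize of data is refused,
    // no matter how small the chunk itself is.
    bool tooBigOld( const CollStats& s, long long maxChunkSize ) {
        return s.datasize > maxChunkSize;
    }

    // New approach: derive the average record size, compute how many records a
    // full-size chunk would hold (with 30% slack for size variance), and compare
    // that against the record count found while walking the chunk's own range.
    bool tooBigNew( const CollStats& s, long long maxChunkSize,
                    unsigned long long recsInChunkRange ) {
        if ( s.nrecords <= 0 )
            return false; // empty collection: nothing can be too big
        const long long avgRecSize = s.datasize / s.nrecords;
        unsigned long long maxRecsWhenFull = maxChunkSize / avgRecSize;
        maxRecsWhenFull = 130 * maxRecsWhenFull / 100; // slack
        return recsInChunkRange > maxRecsWhenFull;
    }

    int main() {
        CollStats s = { 500LL * 1024 * 1024, 512000 }; // 500 MB across the shard, 1 KB avg records
        const long long limit = 200LL * 1024 * 1024;   // 200 MB per move
        // old: refused (500 MB > 200 MB); new: allowed (100,000 records is well
        // under the ~266,240-record threshold for a 200 MB chunk of 1 KB records)
        std::printf( "old: %d  new (100k recs in range): %d\n",
                     (int)tooBigOld( s, limit ),
                     (int)tooBigNew( s, limit, 100000 ) );
        return 0;
    }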
@@ -34,12 +34,13 @@ primary = s.getServer( "test" ).getDB( "test" );
 secondary = s.getOther( primary ).getDB( "test" );
 
 // Make sure that we don't move that chunk if it goes past what we consider the maximum chunk size
 print("Checkpoint 1a")
 max = 200 * 1024 * 1024;
 moveChunkCmd = { movechunk : "test.foo" , find : { _id : 1 } , to : secondary.getMongo().name , maxChunkSizeBytes : max };
-assert.throws( function(){ s.adminCommand( movChunkCmd ) } );
+assert.throws( function() { s.adminCommand( moveChunkCmd ); } );
 
 // Move the chunk
 
 print("checkpoint 1b");
 before = s.config.chunks.find().toArray();
 s.adminCommand( { movechunk : "test.foo" , find : { _id : 1 } , to : secondary.getMongo().name } );
 after = s.config.chunks.find().toArray();
 
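Two things are worth noting in this test hunk. First, the old assertion referenced movChunkCmd, a name that was never defined, so the closure threw a ReferenceError and assert.throws passed without ever issuing the command; the corrected moveChunkCmd makes the assertion exercise movechunk for real. Second, the limit the test passes is 200 * 1024 * 1024 = 209,715,200 bytes. A toy calculation of when a move with that limit is refused under the new accounting; the 1 KB average record size is an assumed figure for illustration, not from the test:

    #include <cstdio>

    int main() {
        const long long maxChunkSizeBytes = 200LL * 1024 * 1024;              // 209,715,200
        const long long avgRecSize = 1024;                                    // assumed average
        unsigned long long maxRecsWhenFull = maxChunkSizeBytes / avgRecSize;  // 204,800
        maxRecsWhenFull = 130 * maxRecsWhenFull / 100;                        // 266,240 with slack
        std::printf( "move refused once the range holds > %llu records\n",
                     maxRecsWhenFull );
        return 0;
    }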
@@ -369,7 +369,11 @@ namespace mongo {
         }
 
         /**
-         * @return ok
+         * Get the disklocs that belong to the chunk migrated and sort them in _cloneLocs (to avoid seeking disk later)
+         *
+         * @param maxChunkSize number of bytes beyond which a chunk's base data (no indices) is considered too large to move
+         * @param errmsg filled with textual description of error if this call returns false
+         * @return false if approximate chunk size is too big to move or true otherwise
          */
         bool storeCurrentLocs( long long maxChunkSize , string& errmsg ){
             readlock l( _ns );
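The new doc comment pins down the usual go/no-go contract: true means the chunk's disklocs were staged, false means the move should be refused with the reason in errmsg. A minimal sketch of the implied call-site shape; the stub below is hypothetical, not the server's actual caller:

    #include <iostream>
    #include <string>

    // Hypothetical stand-in with the documented signature and contract:
    // the failure reason travels out through errmsg.
    bool storeCurrentLocsStub( long long maxChunkSize, std::string& errmsg ) {
        errmsg = "can't move chunk: too big";
        return false;
    }

    int main() {
        std::string errmsg;
        if ( ! storeCurrentLocsStub( 200LL * 1024 * 1024, errmsg ) ) {
            std::cout << "migration refused: " << errmsg << std::endl;
            return 1;
        }
        // on true: the chunk's disklocs are staged for cloning
        return 0;
    }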
@@ -380,15 +384,8 @@
                 return false;
             }
 
-            // if the chunk is larger than allowed, don't even bother trying
-            long long dataSize = d->stats.datasize;
-            if ( dataSize > maxChunkSize ) {
-                errmsg = str::stream() << "can't move chunk size at " << dataSize << " because size limit is " << maxChunkSize;
-                return false;
-            }
-
             BSONObj keyPattern;
-            // the copies are needed because the indexDetailsForRange destrorys the input
+            // the copies are needed because the indexDetailsForRange destroys the input
             BSONObj min = _min.copy();
             BSONObj max = _max.copy();
             IndexDetails *idx = indexDetailsForRange( _ns.c_str() , errmsg , min , max , keyPattern );
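The surviving comment explains the min/max copies: indexDetailsForRange mutates its arguments, so passing _min and _max directly would corrupt the migration's own bounds. A generic, self-contained illustration of the defensive-copy pattern; normalizeRange is a made-up stand-in for that mutating behavior:

    #include <cassert>
    #include <string>

    // Stand-in for a callee that overwrites its in-out arguments,
    // the way the diff's comment says indexDetailsForRange does.
    void normalizeRange( std::string& min, std::string& max ) {
        min = "normalized:" + min;
        max = "normalized:" + max;
    }

    int main() {
        const std::string chunkMin = "a";  // the migration's own bounds
        const std::string chunkMax = "m";

        std::string min = chunkMin;        // defensive copies, as in the diff
        std::string max = chunkMax;
        normalizeRange( min, max );

        assert( chunkMin == "a" && chunkMax == "m" );  // originals intact
        return 0;
    }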
@@ -396,22 +393,56 @@
                 errmsg = "can't find index in storeCurrentLocs";
                 return false;
             }
 
 
             scoped_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout ,
                                                            shared_ptr<Cursor>( new BtreeCursor( d , d->idxNo(*idx) , *idx , min , max , false , 1 ) ) ,
                                                            _ns ) );
 
+            // use the average object size to estimate how many objects a full chunk would carry
+            // do that while traversing the chunk's range using the sharding index, below
+            // there's a fair amount of slack before we determine a chunk is too large because object sizes will vary
+            unsigned long long maxRecsWhenFull;
+            long long avgRecSize;
+            const long long totalRecs = d->stats.nrecords;
+            if ( totalRecs > 0 ){
+                avgRecSize = d->stats.datasize / totalRecs;
+                maxRecsWhenFull = maxChunkSize / avgRecSize;
+                maxRecsWhenFull = 130 * maxRecsWhenFull / 100; // slack
+            } else {
+                avgRecSize = 0;
+                maxRecsWhenFull = numeric_limits<long long>::max();
+            }
+
+            // do a full traversal of the chunk and don't stop even if we think it is a large chunk
+            // we want the number of records to better report, in that case
+            bool isLargeChunk = false;
+            unsigned long long recCount = 0;
             while ( cc->ok() ){
                 DiskLoc dl = cc->currLoc();
-                _cloneLocs.insert( dl );
+                if ( ! isLargeChunk ) {
+                    _cloneLocs.insert( dl );
+                }
                 cc->advance();
 
-                if ( ! cc->yieldSometimes() )
-                    break;
+                // we can afford to yield here because any change to the base data that we might miss is already being
+                // queued and will be migrated in the 'transferMods' stage
+                if ( ! cc->yieldSometimes() ) {
+                    break;
+                }
 
+                if ( ++recCount > maxRecsWhenFull ) {
+                    isLargeChunk = true;
+                }
             }
 
+            if ( isLargeChunk ) {
+                errmsg = str::stream() << "can't move chunk of size (approx) " << recCount * avgRecSize
+                                       << " because maximum size allowed to move is " << maxChunkSize;
+                log( LL_WARNING ) << errmsg << endl;
+                return false;
+            }
+
-            log() << "\t moveChunk number of documents: " << _cloneLocs.size() << endl;
+            log() << "moveChunk number of documents: " << _cloneLocs.size() << endl;
             return true;
         }
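The shape of the new loop is worth noting: it never stops early. Once the record count passes maxRecsWhenFull it merely stops staging disklocs (bounding memory) while continuing to count, so the error message can report an accurate approximate size; and it is safe to yield mid-scan because concurrent writes are queued and replayed in the 'transferMods' stage. A standalone model of that traversal, with RecordId and a plain vector standing in for DiskLoc and the btree cursor:

    #include <cstdio>
    #include <set>
    #include <vector>

    int main() {
        typedef int RecordId;
        std::vector<RecordId> range( 300000 );          // pretend chunk range
        for ( size_t i = 0; i < range.size(); ++i ) range[i] = (RecordId)i;

        const unsigned long long maxRecsWhenFull = 266240; // from the size estimate
        std::set<RecordId> cloneLocs;
        bool isLargeChunk = false;
        unsigned long long recCount = 0;

        for ( size_t i = 0; i < range.size(); ++i ) {
            if ( ! isLargeChunk )
                cloneLocs.insert( range[i] );           // stop staging once too big
            if ( ++recCount > maxRecsWhenFull )
                isLargeChunk = true;                    // but keep counting for the report
        }

        if ( isLargeChunk )
            std::printf( "refuse: ~%llu records > %llu allowed\n",
                         recCount, maxRecsWhenFull );
        return 0;
    }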