mirror of
https://github.com/mongodb/mongo.git
synced 2024-12-01 09:32:32 +01:00
SERVER-51601 Make OperationCPUTimer initializer more resilient
This commit is contained in:
parent
48fcb9bd10
commit
4739fb2322
@ -27,6 +27,8 @@
|
||||
* it in the license file.
|
||||
*/
|
||||
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
|
||||
|
||||
#include <boost/optional.hpp>
|
||||
#include <fmt/format.h>
|
||||
|
||||
@ -39,6 +41,7 @@
|
||||
#include "mongo/base/error_codes.h"
|
||||
#include "mongo/db/client.h"
|
||||
#include "mongo/db/operation_context.h"
|
||||
#include "mongo/logv2/log.h"
|
||||
#include "mongo/stdx/thread.h"
|
||||
#include "mongo/util/assert_util.h"
|
||||
#include "mongo/util/errno_util.h"
|
||||
@ -46,11 +49,22 @@
|
||||
|
||||
namespace mongo {
|
||||
|
||||
using namespace fmt::literals;
|
||||
|
||||
#if defined(__linux__)
|
||||
|
||||
namespace {
|
||||
|
||||
using namespace fmt::literals;
|
||||
// Reads the thread timer, and throws with `InternalError` if that fails.
|
||||
Nanoseconds getThreadCPUTime() {
|
||||
struct timespec t;
|
||||
if (auto ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t); ret != 0) {
|
||||
int ec = errno;
|
||||
internalAssert(Status(ErrorCodes::InternalError,
|
||||
"Unable to get time: {}"_format(errnoWithDescription(ec))));
|
||||
}
|
||||
return Seconds(t.tv_sec) + Nanoseconds(t.tv_nsec);
|
||||
}
|
||||
|
||||
MONGO_FAIL_POINT_DEFINE(hangCPUTimerAfterOnThreadAttach);
|
||||
MONGO_FAIL_POINT_DEFINE(hangCPUTimerAfterOnThreadDetach);
|
||||
@ -132,14 +146,13 @@ void PosixTimer::onThreadDetach() {
|
||||
hangCPUTimerAfterOnThreadDetach.pauseWhileSet();
|
||||
}
|
||||
|
||||
Nanoseconds PosixTimer::_getThreadTime() const {
|
||||
struct timespec t;
|
||||
if (auto ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t); ret != 0) {
|
||||
int ec = errno;
|
||||
internalAssert(Status(ErrorCodes::InternalError,
|
||||
"Unable to get time: {}"_format(errnoWithDescription(ec))));
|
||||
}
|
||||
return Seconds(t.tv_sec) + Nanoseconds(t.tv_nsec);
|
||||
Nanoseconds PosixTimer::_getThreadTime() const try {
|
||||
return getThreadCPUTime();
|
||||
} catch (const ExceptionFor<ErrorCodes::InternalError>& ex) {
|
||||
// Abort the process as the timer cannot account for the elapsed time. This path is only
|
||||
// reachable if the platform supports CPU time measurement at startup, but returns an error
|
||||
// for a subsequent attempt to get thread-specific CPU consumption.
|
||||
LOGV2_FATAL(4744601, "Failed to read the CPU time for the current thread", "error"_attr = ex);
|
||||
}
|
||||
|
||||
static auto getCPUTimer = OperationContext::declareDecoration<PosixTimer>();
|
||||
@ -154,7 +167,17 @@ OperationCPUTimer* OperationCPUTimer::get(OperationContext* opCtx) {
|
||||
// of SMP systems.
|
||||
static bool isTimeSupported = [] {
|
||||
clockid_t cid;
|
||||
return clock_getcpuclockid(0, &cid) == 0;
|
||||
if (clock_getcpuclockid(0, &cid) != 0)
|
||||
return false;
|
||||
|
||||
try {
|
||||
getThreadCPUTime();
|
||||
} catch (const ExceptionFor<ErrorCodes::InternalError>&) {
|
||||
// Unable to collect the CPU time for the current thread.
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}();
|
||||
|
||||
if (!isTimeSupported)
|
||||
|
@ -27,9 +27,14 @@
|
||||
* it in the license file.
|
||||
*/
|
||||
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kTest
|
||||
|
||||
#include "mongo/db/operation_cpu_timer.h"
|
||||
#include "mongo/db/service_context_test_fixture.h"
|
||||
#include "mongo/logv2/log.h"
|
||||
#include "mongo/platform/atomic_word.h"
|
||||
#include "mongo/stdx/condition_variable.h"
|
||||
#include "mongo/stdx/mutex.h"
|
||||
#include "mongo/stdx/thread.h"
|
||||
#include "mongo/unittest/barrier.h"
|
||||
#include "mongo/unittest/death_test.h"
|
||||
@ -159,6 +164,53 @@ DEATH_TEST_F(OperationCPUTimerTest, GetElapsedForPausedTimer, "Not attached to c
|
||||
timer->getElapsed();
|
||||
}
|
||||
|
||||
TEST_F(OperationCPUTimerTest, TimerPausesOnBlockingSleep) {
|
||||
// This test checks if the time measured by `OperationCPUTimer` does not include the period of
|
||||
// time the operation was blocked (e.g., waiting on a condition variable). The idea is to have
|
||||
// the operation block for `kSomeDelay`, ensure the elapsed time observed by the timer is always
|
||||
// less than `kSomeDelay`, and repeat the test `kRepeats` times. To account for the sporadic
|
||||
// wake ups, the test does not fail unless the number of failures exceeds `kMaxFailures`. This
|
||||
// is just a best-effort, and the number of failures is not guaranteed to not exceed the upper
|
||||
// bound (i.e., `kMaxFailures`).
|
||||
const auto kSomeDelay = Milliseconds(1);
|
||||
const auto kRepeats = 1000;
|
||||
const auto kMaxFailureRate = 0.1;
|
||||
const auto kMaxFailures = kMaxFailureRate * kRepeats;
|
||||
|
||||
auto timer = getTimer();
|
||||
|
||||
auto checkTimer = [&] {
|
||||
auto elapsed = timer->getElapsed();
|
||||
if (elapsed < kSomeDelay)
|
||||
return true;
|
||||
LOGV2_WARNING(5160101,
|
||||
"Elapsed operation time exceeded the upper bound",
|
||||
"elapsed"_attr = elapsed,
|
||||
"delay"_attr = kSomeDelay);
|
||||
return false;
|
||||
};
|
||||
|
||||
auto failures = 0;
|
||||
for (auto i = 0; i < kRepeats; i++) {
|
||||
timer->start();
|
||||
sleepFor(kSomeDelay);
|
||||
if (!checkTimer())
|
||||
failures++;
|
||||
timer->stop();
|
||||
|
||||
stdx::condition_variable cv;
|
||||
auto mutex = MONGO_MAKE_LATCH("TimerPausesOnBlockingSleep");
|
||||
timer->start();
|
||||
stdx::unique_lock lk(mutex);
|
||||
cv.wait_for(lk, kSomeDelay.toSystemDuration(), [] { return false; });
|
||||
if (!checkTimer())
|
||||
failures++;
|
||||
timer->stop();
|
||||
}
|
||||
|
||||
ASSERT_LTE(failures, kMaxFailures);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
TEST_F(OperationCPUTimerTest, TimerNotSetIfNotSupported) {
|
||||
|
Loading…
Reference in New Issue
Block a user