0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 09:32:32 +01:00

SERVER-51601 Make OperationCPUTimer initializer more resilient

This commit is contained in:
Amirsaman Memaripour 2020-10-14 19:35:32 +00:00 committed by Evergreen Agent
parent 48fcb9bd10
commit 4739fb2322
2 changed files with 85 additions and 10 deletions

View File

@ -27,6 +27,8 @@
* it in the license file.
*/
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault
#include <boost/optional.hpp>
#include <fmt/format.h>
@ -39,6 +41,7 @@
#include "mongo/base/error_codes.h"
#include "mongo/db/client.h"
#include "mongo/db/operation_context.h"
#include "mongo/logv2/log.h"
#include "mongo/stdx/thread.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/errno_util.h"
@ -46,11 +49,22 @@
namespace mongo {
using namespace fmt::literals;
#if defined(__linux__)
namespace {
using namespace fmt::literals;
// Reads the thread timer, and throws with `InternalError` if that fails.
Nanoseconds getThreadCPUTime() {
struct timespec t;
if (auto ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t); ret != 0) {
int ec = errno;
internalAssert(Status(ErrorCodes::InternalError,
"Unable to get time: {}"_format(errnoWithDescription(ec))));
}
return Seconds(t.tv_sec) + Nanoseconds(t.tv_nsec);
}
MONGO_FAIL_POINT_DEFINE(hangCPUTimerAfterOnThreadAttach);
MONGO_FAIL_POINT_DEFINE(hangCPUTimerAfterOnThreadDetach);
@ -132,14 +146,13 @@ void PosixTimer::onThreadDetach() {
hangCPUTimerAfterOnThreadDetach.pauseWhileSet();
}
Nanoseconds PosixTimer::_getThreadTime() const {
struct timespec t;
if (auto ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t); ret != 0) {
int ec = errno;
internalAssert(Status(ErrorCodes::InternalError,
"Unable to get time: {}"_format(errnoWithDescription(ec))));
}
return Seconds(t.tv_sec) + Nanoseconds(t.tv_nsec);
Nanoseconds PosixTimer::_getThreadTime() const try {
return getThreadCPUTime();
} catch (const ExceptionFor<ErrorCodes::InternalError>& ex) {
// Abort the process as the timer cannot account for the elapsed time. This path is only
// reachable if the platform supports CPU time measurement at startup, but returns an error
// for a subsequent attempt to get thread-specific CPU consumption.
LOGV2_FATAL(4744601, "Failed to read the CPU time for the current thread", "error"_attr = ex);
}
static auto getCPUTimer = OperationContext::declareDecoration<PosixTimer>();
@ -154,7 +167,17 @@ OperationCPUTimer* OperationCPUTimer::get(OperationContext* opCtx) {
// of SMP systems.
static bool isTimeSupported = [] {
clockid_t cid;
return clock_getcpuclockid(0, &cid) == 0;
if (clock_getcpuclockid(0, &cid) != 0)
return false;
try {
getThreadCPUTime();
} catch (const ExceptionFor<ErrorCodes::InternalError>&) {
// Unable to collect the CPU time for the current thread.
return false;
}
return true;
}();
if (!isTimeSupported)

View File

@ -27,9 +27,14 @@
* it in the license file.
*/
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kTest
#include "mongo/db/operation_cpu_timer.h"
#include "mongo/db/service_context_test_fixture.h"
#include "mongo/logv2/log.h"
#include "mongo/platform/atomic_word.h"
#include "mongo/stdx/condition_variable.h"
#include "mongo/stdx/mutex.h"
#include "mongo/stdx/thread.h"
#include "mongo/unittest/barrier.h"
#include "mongo/unittest/death_test.h"
@ -159,6 +164,53 @@ DEATH_TEST_F(OperationCPUTimerTest, GetElapsedForPausedTimer, "Not attached to c
timer->getElapsed();
}
TEST_F(OperationCPUTimerTest, TimerPausesOnBlockingSleep) {
// This test checks if the time measured by `OperationCPUTimer` does not include the period of
// time the operation was blocked (e.g., waiting on a condition variable). The idea is to have
// the operation block for `kSomeDelay`, ensure the elapsed time observed by the timer is always
// less than `kSomeDelay`, and repeat the test `kRepeats` times. To account for the sporadic
// wake ups, the test does not fail unless the number of failures exceeds `kMaxFailures`. This
// is just a best-effort, and the number of failures is not guaranteed to not exceed the upper
// bound (i.e., `kMaxFailures`).
const auto kSomeDelay = Milliseconds(1);
const auto kRepeats = 1000;
const auto kMaxFailureRate = 0.1;
const auto kMaxFailures = kMaxFailureRate * kRepeats;
auto timer = getTimer();
auto checkTimer = [&] {
auto elapsed = timer->getElapsed();
if (elapsed < kSomeDelay)
return true;
LOGV2_WARNING(5160101,
"Elapsed operation time exceeded the upper bound",
"elapsed"_attr = elapsed,
"delay"_attr = kSomeDelay);
return false;
};
auto failures = 0;
for (auto i = 0; i < kRepeats; i++) {
timer->start();
sleepFor(kSomeDelay);
if (!checkTimer())
failures++;
timer->stop();
stdx::condition_variable cv;
auto mutex = MONGO_MAKE_LATCH("TimerPausesOnBlockingSleep");
timer->start();
stdx::unique_lock lk(mutex);
cv.wait_for(lk, kSomeDelay.toSystemDuration(), [] { return false; });
if (!checkTimer())
failures++;
timer->stop();
}
ASSERT_LTE(failures, kMaxFailures);
}
#else
TEST_F(OperationCPUTimerTest, TimerNotSetIfNotSupported) {