From 69d7e1b7963683cde2c22db6cc86fce61ff966df Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Fri, 15 Nov 2024 17:43:29 +0100 Subject: [PATCH 1/3] Fix duration measurement issues in the client Even with `CLOCK_MONOTONIC`, we are still seeing Bazel servers fail to start up occasionally due to start time being larger than end time. Make this non-fatal by truncating to 0 and emitting a warning with start and end time to facilitate further investigation. Also flip the conditions for command and extraction wait time, which previously were only included if *not* known. --- src/main/cpp/archive_utils.cc | 3 +-- src/main/cpp/archive_utils.h | 19 +++---------------- src/main/cpp/blaze.cc | 18 ++++++++---------- src/main/cpp/blaze_util.h | 24 ++++++++++++++++++++++++ src/main/cpp/blaze_util_posix.cc | 5 +++-- src/main/cpp/blaze_util_windows.cc | 2 +- 6 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/main/cpp/archive_utils.cc b/src/main/cpp/archive_utils.cc index f999e916ae77e1..367817ff79ea0c 100644 --- a/src/main/cpp/archive_utils.cc +++ b/src/main/cpp/archive_utils.cc @@ -126,8 +126,7 @@ ExtractionDurationMillis ExtractData(const string &self_path, BlessFiles(tmp_install); uint64_t et = GetMillisecondsMonotonic(); - const ExtractionDurationMillis extract_data_duration( - et - st, /*archived_extracted=*/true); + const ExtractionDurationMillis extract_data_duration(st, et, /*archived_extracted=*/true); // Now rename the completed installation to its final name. int attempts = 0; diff --git a/src/main/cpp/archive_utils.h b/src/main/cpp/archive_utils.h index 18856ab67dbddd..53c6ef7a804c46 100644 --- a/src/main/cpp/archive_utils.h +++ b/src/main/cpp/archive_utils.h @@ -18,6 +18,7 @@ #include #include +#include "src/main/cpp/blaze_util.h" #include "src/main/cpp/startup_options.h" #include "src/main/cpp/util/logging.h" @@ -29,26 +30,12 @@ void DetermineArchiveContents(const std::string &archive_path, std::vector *files, std::string *install_md5); -struct DurationMillis { - public: - const uint64_t millis; - - DurationMillis() : millis(kUnknownDuration) {} - DurationMillis(const uint64_t ms) : millis(ms) {} - - bool IsUnknown() const { return millis == kUnknownDuration; } - - private: - // Value representing that a timing event never occurred or is unknown. - static constexpr uint64_t kUnknownDuration = 0; -}; - // DurationMillis that tracks if an archive was extracted. struct ExtractionDurationMillis : DurationMillis { const bool archive_extracted; ExtractionDurationMillis() : DurationMillis(), archive_extracted(false) {} - ExtractionDurationMillis(const uint64_t ms, const bool archive_extracted) - : DurationMillis(ms), archive_extracted(archive_extracted) {} + ExtractionDurationMillis(const uint64_t start, const uint64_t end, const bool archive_extracted) + : DurationMillis(start, end), archive_extracted(archive_extracted) {} }; // The reason for a blaze server restart. diff --git a/src/main/cpp/blaze.cc b/src/main/cpp/blaze.cc index b25909d274e8dd..58984e3c65401e 100644 --- a/src/main/cpp/blaze.cc +++ b/src/main/cpp/blaze.cc @@ -197,8 +197,8 @@ class BlazeServer final { explicit BlazeServer(const StartupOptions &startup_options); // Acquire a lock for the output base this server is running in. - // Returns the number of milliseconds spent waiting for the lock. - uint64_t AcquireLock(); + // Returns the time spent waiting for the lock. + DurationMillis AcquireLock(); // Whether there is an active connection to a server. bool Connected() const { return client_.get(); } @@ -275,7 +275,7 @@ static BlazeServer *blaze_server; // _exit(2) (attributed with ATTRIBUTE_NORETURN) meaning we have to delete the // objects before those. -uint64_t BlazeServer::AcquireLock() { +DurationMillis BlazeServer::AcquireLock() { if (output_base_lock_.has_value()) { BAZEL_DIE(blaze_exit_code::INTERNAL_ERROR) << "AcquireLock() called but the lock is already held."; @@ -288,7 +288,7 @@ uint64_t BlazeServer::AcquireLock() { output_base_lock_ = blaze::AcquireLock( "output base", output_base_.GetRelative("lock"), LockMode::kExclusive, batch_, block_for_lock_, &wait_time); - return wait_time; + return DurationMillis(wait_time); } void BlazeServer::ReleaseLock() { @@ -557,14 +557,14 @@ static void AddLoggingArgs(const LoggingInfo &logging_info, // The time in ms a command had to wait on a busy Blaze server process. // This is part of startup_time. - if (command_wait_duration_ms.IsUnknown()) { + if (!command_wait_duration_ms.IsUnknown()) { args->push_back("--command_wait_time=" + blaze_util::ToString(command_wait_duration_ms.millis)); } // The time in ms spent on extracting the new blaze version. // This is part of startup_time. - if (extract_data_duration.IsUnknown()) { + if (!extract_data_duration.IsUnknown()) { args->push_back("--extract_data_time=" + blaze_util::ToString(extract_data_duration.millis)); } @@ -657,8 +657,7 @@ static void RunBatchMode( server->KillRunningServer(); } - const DurationMillis client_startup_duration(GetMillisecondsMonotonic() - - logging_info->start_time_ms); + const DurationMillis client_startup_duration(logging_info->start_time_ms, GetMillisecondsMonotonic()); BAZEL_LOG(INFO) << "Starting " << startup_options.product_name << " in batch mode."; @@ -1093,8 +1092,7 @@ static ATTRIBUTE_NORETURN void RunClientServerMode( << server->ProcessInfo().server_pid_ << ")."; // Wall clock time since process startup. - const DurationMillis client_startup_duration = - (GetMillisecondsMonotonic() - logging_info->start_time_ms); + const DurationMillis client_startup_duration(logging_info->start_time_ms, GetMillisecondsMonotonic()); SignalHandler::Get().Install(startup_options.product_name, startup_options.output_base, diff --git a/src/main/cpp/blaze_util.h b/src/main/cpp/blaze_util.h index 78738b0d0c4a73..4d72d9c6f44c19 100644 --- a/src/main/cpp/blaze_util.h +++ b/src/main/cpp/blaze_util.h @@ -131,6 +131,30 @@ class WithEnvVars { ~WithEnvVars(); }; +struct DurationMillis { + public: + const uint64_t millis; + + DurationMillis() : millis(kUnknownDuration) {} + DurationMillis(const uint64_t millis) : millis(millis) {} + DurationMillis(const uint64_t start, const uint64_t end) : millis(ComputeDuration(start, end)) {} + + bool IsUnknown() const { return millis == kUnknownDuration; } + + private: + static uint64_t ComputeDuration(const uint64_t start, const uint64_t end) { + if (end < start) { + BAZEL_LOG(WARNING) << "Invalid duration: start=" << start << ", end=" << end; + return kUnknownDuration; + } else { + return end - start; + } + } + + // Value representing that a timing event never occurred or is unknown. + static constexpr uint64_t kUnknownDuration = 0; +}; + } // namespace blaze #endif // BAZEL_SRC_MAIN_CPP_BLAZE_UTIL_H_ diff --git a/src/main/cpp/blaze_util_posix.cc b/src/main/cpp/blaze_util_posix.cc index 74759b5e5350fd..03dc7940844b01 100644 --- a/src/main/cpp/blaze_util_posix.cc +++ b/src/main/cpp/blaze_util_posix.cc @@ -712,7 +712,8 @@ LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, // avoid unnecessary noise in the logs. In this metric, we are only // interested in knowing how long it took for other commands to complete, not // how fast acquiring a lock is. - const uint64_t elapsed_time = !multiple_attempts ? 0 : end_time - start_time; + DurationMillis *elapsed_time = + !multiple_attempts ? new DurationMillis() : new DurationMillis(start_time, end_time); // If taking an exclusive lock, identify ourselves in the lockfile. // The contents are printed for human consumption when another client @@ -731,7 +732,7 @@ LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, } } - *wait_time = elapsed_time; + *wait_time = elapsed_time->millis; return static_cast(fd); } diff --git a/src/main/cpp/blaze_util_windows.cc b/src/main/cpp/blaze_util_windows.cc index 94e8f3f2956d32..21525a1d43dcba 100644 --- a/src/main/cpp/blaze_util_windows.cc +++ b/src/main/cpp/blaze_util_windows.cc @@ -1164,7 +1164,7 @@ LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, // a concurrent process can read and display it. On Windows we can't do so // because locks are mandatory, thus we cannot read the file concurrently. - *wait_time = GetMillisecondsMonotonic() - start_time; + *wait_time = DurationMillis(start_time, GetMillisecondsMonotonic()).millis; return reinterpret_cast(handle); } From 6e158bccb337f30875592871188e12aa407e4ece Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Sun, 17 Nov 2024 22:15:16 +0100 Subject: [PATCH 2/3] WIP --- src/main/cpp/archive_utils.cc | 6 +++--- src/main/cpp/archive_utils.h | 10 +--------- src/main/cpp/blaze.cc | 19 ++++++++----------- src/main/cpp/blaze_util_platform.h | 11 +++++------ src/main/cpp/blaze_util_posix.cc | 15 ++++++++------- src/main/cpp/blaze_util_windows.cc | 11 ++++++----- src/test/cpp/blaze_archive_test.cc | 11 +++++------ 7 files changed, 36 insertions(+), 47 deletions(-) diff --git a/src/main/cpp/archive_utils.cc b/src/main/cpp/archive_utils.cc index 367817ff79ea0c..824b604f952b05 100644 --- a/src/main/cpp/archive_utils.cc +++ b/src/main/cpp/archive_utils.cc @@ -110,7 +110,7 @@ struct PartialZipExtractor : public devtools_ijar::ZipExtractorProcessor { // it is in place. Concurrency during extraction is handled by // extracting in a tmp dir and then renaming it into place where it // becomes visible atomically at the new path. -ExtractionDurationMillis ExtractData(const string &self_path, +DurationMillis ExtractData(const string &self_path, const vector &archive_contents, const string &expected_install_md5, const StartupOptions &startup_options, @@ -126,7 +126,7 @@ ExtractionDurationMillis ExtractData(const string &self_path, BlessFiles(tmp_install); uint64_t et = GetMillisecondsMonotonic(); - const ExtractionDurationMillis extract_data_duration(st, et, /*archived_extracted=*/true); + const DurationMillis extract_data_duration(st, et); // Now rename the completed installation to its final name. int attempts = 0; @@ -196,7 +196,7 @@ ExtractionDurationMillis ExtractData(const string &self_path, << expected_install_md5 << "). Remove it or specify a different --install_base."; } - return ExtractionDurationMillis(); + return DurationMillis(); } } diff --git a/src/main/cpp/archive_utils.h b/src/main/cpp/archive_utils.h index 53c6ef7a804c46..69d5a9b04987b2 100644 --- a/src/main/cpp/archive_utils.h +++ b/src/main/cpp/archive_utils.h @@ -30,14 +30,6 @@ void DetermineArchiveContents(const std::string &archive_path, std::vector *files, std::string *install_md5); -// DurationMillis that tracks if an archive was extracted. -struct ExtractionDurationMillis : DurationMillis { - const bool archive_extracted; - ExtractionDurationMillis() : DurationMillis(), archive_extracted(false) {} - ExtractionDurationMillis(const uint64_t start, const uint64_t end, const bool archive_extracted) - : DurationMillis(start, end), archive_extracted(archive_extracted) {} -}; - // The reason for a blaze server restart. // Keep in sync with logging.proto. enum RestartReason { @@ -81,7 +73,7 @@ struct LoggingInfo { // BlessFiles. If the install base, the location the archive is unpacked, // already exists, extraction is skipped. Kills the client if an error is // encountered. -ExtractionDurationMillis ExtractData( +DurationMillis ExtractData( const std::string &self_path, const std::vector &archive_contents, const std::string &expected_install_md5, diff --git a/src/main/cpp/blaze.cc b/src/main/cpp/blaze.cc index 58984e3c65401e..6648748c58817d 100644 --- a/src/main/cpp/blaze.cc +++ b/src/main/cpp/blaze.cc @@ -284,8 +284,8 @@ DurationMillis BlazeServer::AcquireLock() { // commands may not run against the same output base. Note that this lock will // be released by ReleaseLock() once the server is running, as it can handle // concurrent clients on its own. - uint64_t wait_time; - output_base_lock_ = blaze::AcquireLock( + DurationMillis wait_time; + output_base_lock_, wait_time = blaze::AcquireLock( "output base", output_base_.GetRelative("lock"), LockMode::kExclusive, batch_, block_for_lock_, &wait_time); return DurationMillis(wait_time); @@ -557,17 +557,14 @@ static void AddLoggingArgs(const LoggingInfo &logging_info, // The time in ms a command had to wait on a busy Blaze server process. // This is part of startup_time. - if (!command_wait_duration_ms.IsUnknown()) { - args->push_back("--command_wait_time=" + - blaze_util::ToString(command_wait_duration_ms.millis)); - } + args->push_back("--command_wait_time=" + + blaze_util::ToString(command_wait_duration_ms.millis)); // The time in ms spent on extracting the new blaze version. // This is part of startup_time. - if (!extract_data_duration.IsUnknown()) { - args->push_back("--extract_data_time=" + - blaze_util::ToString(extract_data_duration.millis)); - } + args->push_back("--extract_data_time=" + + blaze_util::ToString(extract_data_duration.millis)); + if (logging_info.restart_reason != NO_RESTART) { args->push_back(string("--restart_reason=") + ReasonString(logging_info.restart_reason)); @@ -1401,7 +1398,7 @@ static void RunLauncher(const string &self_path, WarnFilesystemType(startup_options.output_base); - const ExtractionDurationMillis extract_data_duration = ExtractData( + const DurationMillis extract_data_duration = ExtractData( self_path, archive_contents, install_md5, startup_options, logging_info); blaze_server->Connect(); diff --git a/src/main/cpp/blaze_util_platform.h b/src/main/cpp/blaze_util_platform.h index 17b8bc972321d8..c5cd7491b07d6e 100644 --- a/src/main/cpp/blaze_util_platform.h +++ b/src/main/cpp/blaze_util_platform.h @@ -208,12 +208,11 @@ enum class LockMode { // Acquires a `mode` lock on `path`, busy-waiting until it becomes available if // `block` is true, and releasing it on exec if `batch_mode` is false. // Crashes if the lock cannot be acquired. Returns a handle that can be -// subsequently passed to ReleaseLock. Sets `wait_time` to the number of -// milliseconds spent waiting for the lock. The `name` argument is used to -// distinguish it from other locks in human-readable error messages. -LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, - LockMode mode, bool batch_mode, bool block, - uint64_t* wait_time); +// subsequently passed to ReleaseLock as well as the time spent waiting for the +// lock. The `name` argument is used to distinguish it from other locks in +// human-readable error messages. +std::pair AcquireLock( + const std::string &name, const blaze_util::Path &path, LockMode mode, bool batch_mode, bool block); // Releases a lock previously obtained from AcquireLock. void ReleaseLock(LockHandle lock_handle); diff --git a/src/main/cpp/blaze_util_posix.cc b/src/main/cpp/blaze_util_posix.cc index 03dc7940844b01..d4dde3b6a5bee3 100644 --- a/src/main/cpp/blaze_util_posix.cc +++ b/src/main/cpp/blaze_util_posix.cc @@ -632,9 +632,11 @@ static int setlk(int fd, struct flock* lock) { return -1; } -LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, - LockMode mode, bool batch_mode, bool block, - uint64_t* wait_time) { +std::pair AcquireLock(const std::string &name, + const blaze_util::Path &path, + LockMode mode, + bool batch_mode, + bool block) { int flags = O_CREAT; switch (mode) { case LockMode::kShared: @@ -712,8 +714,8 @@ LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, // avoid unnecessary noise in the logs. In this metric, we are only // interested in knowing how long it took for other commands to complete, not // how fast acquiring a lock is. - DurationMillis *elapsed_time = - !multiple_attempts ? new DurationMillis() : new DurationMillis(start_time, end_time); + DurationMillis elapsed_time = + !multiple_attempts ? DurationMillis() : DurationMillis(start_time, end_time); // If taking an exclusive lock, identify ourselves in the lockfile. // The contents are printed for human consumption when another client @@ -732,8 +734,7 @@ LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, } } - *wait_time = elapsed_time->millis; - return static_cast(fd); + return std::make_pair(static_cast(fd), elapsed_time); } void ReleaseLock(LockHandle lock_handle) { diff --git a/src/main/cpp/blaze_util_windows.cc b/src/main/cpp/blaze_util_windows.cc index 21525a1d43dcba..f70183513379ad 100644 --- a/src/main/cpp/blaze_util_windows.cc +++ b/src/main/cpp/blaze_util_windows.cc @@ -1087,9 +1087,11 @@ uint64_t WindowsClock::GetMilliseconds() const { return GetMillisecondsAsLargeInt(kFrequency).QuadPart; } -LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, - LockMode mode, bool batch_mode, bool block, - uint64_t* wait_time) { +std::pair AcquireLock(const std::string &name, + const blaze_util::Path &path, + LockMode mode, + bool batch_mode, + bool block) { DWORD desired_access = GENERIC_READ; if (mode == LockMode::kExclusive) { desired_access |= GENERIC_WRITE; @@ -1164,8 +1166,7 @@ LockHandle AcquireLock(const std::string& name, const blaze_util::Path& path, // a concurrent process can read and display it. On Windows we can't do so // because locks are mandatory, thus we cannot read the file concurrently. - *wait_time = DurationMillis(start_time, GetMillisecondsMonotonic()).millis; - return reinterpret_cast(handle); + return std::make_pair(reinterpret_cast(handle), DurationMillis(start_time, GetMillisecondsMonotonic()); } void ReleaseLock(LockHandle lock_handle) { diff --git a/src/test/cpp/blaze_archive_test.cc b/src/test/cpp/blaze_archive_test.cc index b7a58dbe3530ed..1519a08c09f00d 100644 --- a/src/test/cpp/blaze_archive_test.cc +++ b/src/test/cpp/blaze_archive_test.cc @@ -121,7 +121,7 @@ TEST_F(BlazeArchiveTest, TestZipExtractionAndFarOutMTimes) { set_startup_options(startup_options, blaze_path, output_dir); LoggingInfo logging_info(blaze_path, blaze::GetMillisecondsMonotonic()); - ExtractionDurationMillis extraction_time = + DurationMillis extraction_time = ExtractData(blaze_path, archive_contents, expected_install_md5, startup_options, &logging_info); @@ -155,15 +155,14 @@ TEST_F(BlazeArchiveTest, TestNoDataExtractionIfInstallBaseExists) { set_startup_options(startup_options, blaze_path, output_dir); LoggingInfo logging_info(blaze_path, blaze::GetMillisecondsMonotonic()); - ExtractionDurationMillis extraction_time_one = + DurationMillis extraction_time_one = ExtractData(blaze_path, archive_contents, expected_install_md5, startup_options, &logging_info); - ASSERT_TRUE(extraction_time_one.archive_extracted); + ASSERT_NEQ(extraction_time_one, DurationMillis(0)); - ExtractionDurationMillis extraction_time_two = + DurationMillis extraction_time_two = ExtractData(blaze_path, archive_contents, expected_install_md5, startup_options, &logging_info); - - ASSERT_FALSE(extraction_time_two.archive_extracted); + ASSERT_EQ(extraction_time_two, DurationMillis(0)); } } // namespace blaze From 0bf85327b7b1f831463039bfd05546c5572d9914 Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Mon, 18 Nov 2024 10:32:09 +0100 Subject: [PATCH 3/3] Address comments --- src/main/cpp/blaze.cc | 8 ++++---- src/main/cpp/blaze_util.h | 10 ++-------- src/main/cpp/blaze_util_windows.cc | 2 +- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/main/cpp/blaze.cc b/src/main/cpp/blaze.cc index 6648748c58817d..c2c20f631cc775 100644 --- a/src/main/cpp/blaze.cc +++ b/src/main/cpp/blaze.cc @@ -284,11 +284,11 @@ DurationMillis BlazeServer::AcquireLock() { // commands may not run against the same output base. Note that this lock will // be released by ReleaseLock() once the server is running, as it can handle // concurrent clients on its own. - DurationMillis wait_time; - output_base_lock_, wait_time = blaze::AcquireLock( + auto lock_and_time = blaze::AcquireLock( "output base", output_base_.GetRelative("lock"), LockMode::kExclusive, - batch_, block_for_lock_, &wait_time); - return DurationMillis(wait_time); + batch_, block_for_lock_); + output_base_lock_ = lock_and_time.first; + return lock_and_time.second; } void BlazeServer::ReleaseLock() { diff --git a/src/main/cpp/blaze_util.h b/src/main/cpp/blaze_util.h index 4d72d9c6f44c19..bc35053ddf8ba5 100644 --- a/src/main/cpp/blaze_util.h +++ b/src/main/cpp/blaze_util.h @@ -135,24 +135,18 @@ struct DurationMillis { public: const uint64_t millis; - DurationMillis() : millis(kUnknownDuration) {} - DurationMillis(const uint64_t millis) : millis(millis) {} + DurationMillis() : millis(0) {} DurationMillis(const uint64_t start, const uint64_t end) : millis(ComputeDuration(start, end)) {} - bool IsUnknown() const { return millis == kUnknownDuration; } - private: static uint64_t ComputeDuration(const uint64_t start, const uint64_t end) { if (end < start) { BAZEL_LOG(WARNING) << "Invalid duration: start=" << start << ", end=" << end; - return kUnknownDuration; + return 0; } else { return end - start; } } - - // Value representing that a timing event never occurred or is unknown. - static constexpr uint64_t kUnknownDuration = 0; }; } // namespace blaze diff --git a/src/main/cpp/blaze_util_windows.cc b/src/main/cpp/blaze_util_windows.cc index f70183513379ad..c656f68f6dd3fa 100644 --- a/src/main/cpp/blaze_util_windows.cc +++ b/src/main/cpp/blaze_util_windows.cc @@ -1166,7 +1166,7 @@ std::pair AcquireLock(const std::string &name, // a concurrent process can read and display it. On Windows we can't do so // because locks are mandatory, thus we cannot read the file concurrently. - return std::make_pair(reinterpret_cast(handle), DurationMillis(start_time, GetMillisecondsMonotonic()); + return std::make_pair(reinterpret_cast(handle), DurationMillis(start_time, GetMillisecondsMonotonic())); } void ReleaseLock(LockHandle lock_handle) {