-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[rabit] Improved connection handling. #9531
Merged
trivialfis
merged 11 commits into
dmlc:master
from
trivialfis:rabit-tracker-connect-timeout
Aug 30, 2023
Merged
Changes from 8 commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
b5bbd8f
[rabit] Improved connection handling.
trivialfis ba5f2b4
retry.
trivialfis 06deb52
Cleanup.
trivialfis 4e2cd42
Handle error in poll.
trivialfis eaa2fed
nolint.
trivialfis d3d4aed
Fix.
trivialfis bb614db
error message.
trivialfis d1e372b
rtd
trivialfis 0ae2819
Windows.
trivialfis 72ffc65
no need for exception.
trivialfis c3f6213
Fix disabled IPv6.
trivialfis File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,4 +32,3 @@ formats: | |
python: | ||
install: | ||
- requirements: doc/requirements.txt | ||
system_packages: true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
/** | ||
* Copyright 2023, XGBoost Contributors | ||
*/ | ||
#pragma once | ||
|
||
#include <memory>  // for unique_ptr
#include <sstream>  // for stringstream
#include <string>  // for string
#include <system_error>  // for error_code
#include <utility>  // for move
|
||
namespace xgboost::collective { | ||
namespace detail { | ||
/**
 * @brief A single entry in a chain of errors.
 *
 * Each entry stores a message, an optional system error code, and an owning pointer to
 * the previous entry in the chain (null when there is none). The type is move-only.
 */
struct ResultImpl {
  std::string message;
  std::error_code errc{};  // optional for system error.

  std::unique_ptr<ResultImpl> prev{nullptr};  // previous entry in the chain, may be null.

  ResultImpl() = delete;  // must initialize.
  ResultImpl(ResultImpl const& that) = delete;
  ResultImpl(ResultImpl&& that) = default;
  ResultImpl& operator=(ResultImpl const& that) = delete;
  ResultImpl& operator=(ResultImpl&& that) = default;

  explicit ResultImpl(std::string msg) : message{std::move(msg)} {}
  // std::error_code is cheap to copy, so it is copied rather than moved.
  explicit ResultImpl(std::string msg, std::error_code errc)
      : message{std::move(msg)}, errc{errc} {}
  explicit ResultImpl(std::string msg, std::unique_ptr<ResultImpl> prev)
      : message{std::move(msg)}, prev{std::move(prev)} {}
  explicit ResultImpl(std::string msg, std::error_code errc, std::unique_ptr<ResultImpl> prev)
      : message{std::move(msg)}, errc{errc}, prev{std::move(prev)} {}

  /**
   * @brief Compare two error chains entry by entry.
   *
   * @return true when both chains have the same length and every pair of entries has
   *         an equal message and an equal error code.
   */
  [[nodiscard]] bool operator==(ResultImpl const& that) const {
    ResultImpl const* lhs = this;
    ResultImpl const* rhs = &that;
    // Walk both chains in lock step instead of recursing through `prev`.
    while (lhs != nullptr && rhs != nullptr) {
      if (lhs->message != rhs->message || lhs->errc != rhs->errc) {
        return false;
      }
      lhs = lhs->prev.get();
      rhs = rhs->prev.get();
    }
    // Equal only if both chains ended at the same time.
    return lhs == nullptr && rhs == nullptr;
  }

  /**
   * @brief Render the whole chain as text, one "\n- " prefixed line per entry, starting
   *        with this entry and following `prev`.
   *
   * The message of a non-default error code is appended to its entry. Does not modify
   * the chain, hence `const`.
   */
  [[nodiscard]] std::string Report() const {
    std::stringstream ss;
    ss << "\n- " << this->message;
    if (this->errc != std::error_code{}) {
      ss << " system error:" << this->errc.message();
    }

    for (ResultImpl const* node = prev.get(); node != nullptr; node = node->prev.get()) {
      ss << "\n- " << node->message;
      if (node->errc != std::error_code{}) {
        ss << " " << node->errc.message();
      }
    }

    return ss.str();
  }
};
} // namespace detail | ||
|
||
/** | ||
* @brief An error type that's easier to handle than throwing dmlc exception. We can | ||
* record and propagate the system error code. | ||
*/ | ||
struct Result { | ||
private: | ||
std::unique_ptr<detail::ResultImpl> impl_{nullptr}; | ||
|
||
public: | ||
Result() = default; | ||
explicit Result(std::string msg) : impl_{std::make_unique<detail::ResultImpl>(std::move(msg))} {} | ||
explicit Result(std::string msg, std::error_code errc) | ||
: impl_{std::make_unique<detail::ResultImpl>(std::move(msg), std::move(errc))} {} | ||
Result(std::string msg, Result&& prev) | ||
: impl_{std::make_unique<detail::ResultImpl>(std::move(msg), std::move(prev.impl_))} {} | ||
Result(std::string msg, std::error_code errc, Result&& prev) | ||
: impl_{std::make_unique<detail::ResultImpl>(std::move(msg), std::move(errc), | ||
std::move(prev.impl_))} {} | ||
|
||
Result(Result const& that) = delete; | ||
Result& operator=(Result const& that) = delete; | ||
Result(Result&& that) = default; | ||
Result& operator=(Result&& that) = default; | ||
|
||
[[nodiscard]] bool OK() const { return !impl_; } | ||
[[nodiscard]] std::string Report() const { return OK() ? "" : impl_->Report(); } | ||
[[nodiscard]] auto Code() const { return OK() ? std::error_code{} : impl_->errc; } | ||
[[nodiscard]] bool operator==(Result const& that) const { | ||
if (OK() && that.OK()) { | ||
return true; | ||
} | ||
if ((OK() && !that.OK()) || (!OK() && that.OK())) { | ||
return false; | ||
} | ||
return *impl_ == *that.impl_; | ||
} | ||
}; | ||
|
||
/** | ||
* @brief Return success. | ||
*/ | ||
[[nodiscard]] inline auto Success() { return Result{}; } | ||
/** | ||
* @brief Return failure. | ||
*/ | ||
[[nodiscard]] inline auto Fail(std::string msg) { return Result{std::move(msg)}; } | ||
/** | ||
* @brief Return failure with `errno`. | ||
*/ | ||
[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc) { | ||
return Result{std::move(msg), std::move(errc)}; | ||
} | ||
/** | ||
* @brief Return failure with a previous error. | ||
*/ | ||
[[nodiscard]] inline auto Fail(std::string msg, Result&& prev) { | ||
return Result{std::move(msg), std::forward<Result>(prev)}; | ||
} | ||
/** | ||
* @brief Return failure with a previous error and a new `errno`. | ||
*/ | ||
[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, Result&& prev) { | ||
return Result{std::move(msg), std::move(errc), std::forward<Result>(prev)}; | ||
} | ||
} // namespace xgboost::collective |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At some point in the future, we will need to propagate errors to Python and other language bindings so that error handling can be delegated to higher-level frameworks like Dask. For now, a functional style of error handling is easier to work with than exceptions.