-
Notifications
You must be signed in to change notification settings - Fork 21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[#704] Add script to automatically fix node corruption #766
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# SPDX-FileCopyrightText: 2023 Oxhead Alpha | ||
# SPDX-License-Identifier: LicenseRef-MIT-OA | ||
|
||
import os | ||
import urllib | ||
import json | ||
import shutil | ||
|
||
from tezos_baking.util import extract_relevant_snapshot, fetch_snapshot, get_proc_output | ||
from tezos_baking.tezos_setup_wizard import default_providers | ||
|
||
|
||
def check_node_corruption():
    """Report whether the node's systemd journal mentions a corrupted store.

    The network name is read from the ``NETWORK`` environment variable and
    selects the ``tezos-node-<network>.service`` unit whose journal is read.

    Returns:
        bool: ``True`` when ``Inconsistent_store`` occurs in the journal
        output; ``False`` otherwise, or when ``NETWORK`` is unset or empty
        (a notice is printed in that case).
    """
    network_name = os.environ.get("NETWORK")
    if network_name:
        journal = get_proc_output(
            f"journalctl -u tezos-node-{network_name}.service"
        )
        # NOTE(review): the journal query has no time filter, so an old
        # occurrence of the marker presumably keeps matching after a
        # successful recovery -- confirm this is intended.
        return b"Inconsistent_store" in journal.stdout

    print(
        """
Could not find network name in environment.
Can't check node for corruption
"""
    )
    return False
|
||
|
||
def _abort_restoration(message):
    """Print *message* and stop the script with exit code 1."""
    print(message)
    raise SystemExit(1)


def _find_snapshot_meta(config):
    """Return metadata of a snapshot matching *config*, or ``None``.

    Providers from ``default_providers`` are tried in order. A provider that
    cannot be reached, or that serves JSON of an unexpected shape, is skipped
    instead of aborting the whole lookup.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    for json_url in default_providers.values():
        try:
            with urllib.request.urlopen(json_url) as response:
                snapshot_array = json.load(response)["data"]
            # Highest block first, so the freshest matching snapshot wins.
            newest_first = sorted(
                snapshot_array,
                key=lambda item: item["block_height"],
                reverse=True,
            )
        except (OSError, ValueError, KeyError, TypeError) as e:
            print(f"Could not use snapshot provider {json_url}: {e!r}")
            continue

        # Assumes extract_relevant_snapshot yields a falsy value (None) when
        # nothing matches -- TODO confirm against its implementation.
        snapshot_meta = extract_relevant_snapshot(newest_first, config)
        if snapshot_meta:
            # TODO(review): consider comparing all providers and preferring
            # the freshest snapshot instead of the first suitable provider.
            return snapshot_meta
    return None


def restore_from_corruption():
    """Replace the corrupted node data directory with a fresh snapshot.

    Reads ``TEZOS_NODE_DIR`` and ``NETWORK`` from the environment, takes the
    history mode from ``config.json`` in the data directory, looks up a
    matching snapshot, deletes the data directory, downloads the snapshot and
    imports it with ``octez-node snapshot import``.

    Nothing is deleted until a suitable snapshot is known to exist, so an
    unreachable provider does not leave the node without any data.

    Raises:
        SystemExit: with exit code 1 whenever automatic restoration cannot
            be completed, so the calling service or script can detect that
            manual restoration is required.
    """
    node_data_directory = os.environ.get("TEZOS_NODE_DIR")
    network = os.environ.get("NETWORK")
    if not node_data_directory or not network:
        _abort_restoration(
            "Could not find TEZOS_NODE_DIR or NETWORK in environment. "
            "Manual restoration is required"
        )

    try:
        with open(os.path.join(node_data_directory, "config.json")) as f:
            history_mode = json.load(f)["shell"]["history_mode"]
    except (OSError, ValueError, KeyError, TypeError) as e:
        _abort_restoration(
            f"Could not read history mode from node config: {e!r}. "
            "Manual restoration is required"
        )

    config = {"network": network, "history_mode": history_mode}
    snapshot_meta = _find_snapshot_meta(config)
    if not snapshot_meta:
        _abort_restoration(
            "Could not find a suitable snapshot. Manual restoration is required"
        )

    # NOTE(review): this also removes config.json, which lives inside the
    # data directory -- confirm it is regenerated before the node restarts.
    try:
        shutil.rmtree(node_data_directory)
    except OSError as e:
        print(f"Failed to remove {node_data_directory}: {e}")
        _abort_restoration(
            "Could not delete node data dir. Manual restoration is required"
        )

    snapshot_path = fetch_snapshot(snapshot_meta["url"])
    try:
        reinstallation_result = get_proc_output(
            f"octez-node snapshot import {snapshot_path}"
        )
    finally:
        # The downloaded file is not needed whether or not the import worked.
        os.remove(snapshot_path)

    if reinstallation_result.returncode:
        _abort_restoration(
            "Recovery from corruption failed. Manual restoration is required"
        )
    print("Recovery from corruption was successfull")
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suppose that the script should fail with a non-zero exit code at this point (and in other places where the script fails to automatically restore node storage) |
||
|
||
|
||
def main():
    """Check the node for corruption and restore it when allowed.

    Restoration runs only when the journal check reports corruption, the
    ``tezos_baking`` package is importable, and the
    ``RESTORE_FROM_CORRUPTION`` environment variable is set to an enabling
    value.

    Returns:
        int: ``0`` when the node is healthy or restoration was carried out,
        ``1`` when the node is corrupted but could not be restored
        automatically.
    """
    import importlib.util

    if not check_node_corruption():
        print(
            """
Node is not corrupted.
"""
        )
        return 0

    # The restoration helpers come from the `tezos_baking` Python package, so
    # check that the package itself is importable rather than searching PATH
    # for an executable.
    if importlib.util.find_spec("tezos_baking") is None:
        print(
            """
Node has been corrupted.
It order to restore it, you need `octez-baking` to be installed
"""
        )
        return 1

    # An unset variable or an explicit "off" value disables restoration;
    # any other value enables it, as before.
    flag = os.environ.get("RESTORE_FROM_CORRUPTION", "").strip().lower()
    if flag in ("", "0", "false", "no", "off"):
        print(
            """
Node has been corrupted.
Automatic restoration is disabled.
Manual restoration is required.
"""
        )
        return 1

    restore_from_corruption()
    return 0


if __name__ == "__main__":
    # Propagate main()'s result as the process exit code.
    raise SystemExit(main())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd suggest modifying `extract_relevant_snapshot` to accept `network` and `history_mode` parameters separately rather than a `config` dict, since `config` is an attribute of the wizard object, but we don't have it there.