Skip to content

Commit

Permalink
[#704] Add corruption recovery script
Browse files Browse the repository at this point in the history
Problem: Currently the only way to recover from node corruption
is to manually delete the node data directory
and import a snapshot anew.

Solution: Add a script that does all of this automatically.
  • Loading branch information
PruStephan committed Dec 9, 2023
1 parent a05c86d commit a78f852
Show file tree
Hide file tree
Showing 5 changed files with 335 additions and 236 deletions.
92 changes: 92 additions & 0 deletions baking/src/tezos_baking/restore-from-corruption.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# SPDX-FileCopyrightText: 2023 Oxhead Alpha
# SPDX-License-Identifier: LicenseRef-MIT-OA

import os
import urllib
import json
import shutil

from tezos_baking.util import *
from tezos_baking.tezos_setup_wizard import default_providers


def check_node_corruption():
    """Report whether the node's systemd journal shows store corruption.

    The network name is read from the ``NETWORK`` environment variable and
    used to build the unit name ``tezos-node-<network>.service``. The unit's
    journal is then searched for the ``Inconsistent_store`` marker.

    Returns:
        bool: ``True`` if the marker occurs in the journal output, ``False``
        if it does not, or if ``NETWORK`` is unset or empty (a message is
        printed in that case).
    """
    network = os.environ.get("NETWORK")
    if not network:
        print(
            "\n"
            "Could not find network name in environment.\n"
            "Can't check node for corruption\n"
        )
        return False

    logs = get_proc_output(f"journalctl -u tezos-node-{network}.service")
    output = logs.stdout
    # Captured process output is bytes unless the process was run in text
    # mode, and `str in bytes` raises TypeError, so normalise to text first.
    if isinstance(output, bytes):
        output = output.decode("utf-8", errors="replace")
    return "Inconsistent_store" in (output or "")

def _read_history_mode(node_data_directory):
    """Return the history mode stored in the node's ``config.json``."""
    with open(os.path.join(node_data_directory, "config.json")) as f:
        node_config = json.load(f)
    if "history_mode" in node_config:
        return node_config["history_mode"]
    # NOTE(review): octez-node appears to keep this setting under the
    # "shell" section of its configuration file -- confirm.
    return node_config["shell"]["history_mode"]


def _load_snapshot_array():
    """Return the snapshot list of the first provider that answers, or None."""
    # `import urllib` alone does not make the `urllib.request` submodule
    # available, so import it explicitly where it is used.
    import urllib.request

    # `default_providers` is a mapping; iterating it directly would yield its
    # keys. NOTE(review): assumes the values are the providers' JSON URLs.
    for json_url in default_providers.values():
        try:
            with urllib.request.urlopen(json_url) as url:
                snapshot_array = json.load(url)["data"]
        except (OSError, ValueError, KeyError):
            # Unreachable provider or unexpected payload: try the next one.
            continue
        if snapshot_array is not None:
            return snapshot_array
    return None


def restore_from_corruption():
    """Wipe the corrupted node data directory and re-import a snapshot.

    Reads ``TEZOS_NODE_DIR`` and ``NETWORK`` from the environment (a missing
    variable raises ``KeyError``). The steps are ordered so that nothing is
    deleted until everything needed for the recovery is known:

    1. read the history mode from ``config.json`` in the data directory;
    2. pick a matching snapshot from the first provider that answers;
    3. delete the data directory;
    4. download the snapshot, import it with ``octez-node snapshot import``
       and remove the downloaded file.

    The outcome is reported on stdout; nothing is returned.
    """
    node_data_directory = os.environ["TEZOS_NODE_DIR"]
    failure_message = (
        "Recovery from corruption failed. Manual restoration is required"
    )

    # The configuration lives inside the directory that is about to be
    # removed, so it has to be read before the deletion.
    try:
        history_mode = _read_history_mode(node_data_directory)
    except (OSError, ValueError, KeyError, TypeError):
        print("Could not read node config. Manual restoration is required")
        return

    config = {
        "network": os.environ["NETWORK"],
        "history_mode": history_mode,
    }

    snapshot_array = _load_snapshot_array()
    if not snapshot_array:
        print(failure_message)
        return
    # Newest snapshots first.
    snapshot_array.sort(reverse=True, key=lambda x: x["block_height"])

    snapshot_meta = extract_relevant_snapshot(snapshot_array, config)
    if snapshot_meta is None:
        print(failure_message)
        return

    try:
        shutil.rmtree(node_data_directory)
    except OSError:
        print("Could not delete node data dir. Manual restoration is required")
        # Importing over a half-deleted directory cannot work; stop here.
        return

    # `fetch_snapshot` takes a URL (and optional checksum), not the metadata
    # record. NOTE(review): assumes the record exposes "url" / "sha256".
    snapshot_path = fetch_snapshot(
        snapshot_meta["url"], snapshot_meta.get("sha256")
    )

    reinstallation_result = get_proc_output(
        f"octez-node snapshot import {snapshot_path}"
    )
    # The downloaded file is not needed once the import has been attempted.
    get_proc_output(f"rm -rf {snapshot_path}")

    if not reinstallation_result.returncode:
        print("Recovery from corruption was successful")
    else:
        print(failure_message)


def main():
    """Check the node for corruption and restore it when allowed.

    Nothing happens when no corruption is detected. Otherwise restoration
    runs only if ``which octez-baking`` succeeds and the
    ``RESTORE_FROM_CORRUPTION`` environment variable enables it; in the other
    cases an explanatory message is printed instead.
    """
    if not check_node_corruption():
        return

    # `which` exits with status 0 only when the executable is found.
    is_baking_installed = not get_proc_output("which octez-baking").returncode
    if not is_baking_installed:
        print(
            "\n"
            "Node has been corrupted.\n"
            "In order to restore it, you need `octez-baking` to be installed\n"
        )
        return

    # An unset variable means "disabled" instead of a KeyError, and common
    # negative spellings are honoured instead of counting as enabled merely
    # because the string is non-empty.
    flag = os.environ.get("RESTORE_FROM_CORRUPTION", "").strip().lower()
    should_restore = flag not in ("", "0", "false", "no", "off")
    if not should_restore:
        print(
            "\n"
            "Node has been corrupted.\n"
            "Automatic restoration is disabled.\n"
            "Manual restoration is required.\n"
        )
        return

    restore_from_corruption()


if __name__ == "__main__":
    main()
230 changes: 1 addition & 229 deletions baking/src/tezos_baking/tezos_setup_wizard.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@

recommended_provider = list(default_providers.keys())[0]

TMP_SNAPSHOT_LOCATION = "/tmp/octez_node.snapshot.d/"


# Wizard CLI utility
Expand All @@ -78,73 +77,6 @@
"""


def fetch_snapshot(url, sha256=None):
    """Download the snapshot at *url* with ``wget`` and return its path.

    The file is stored as ``octez_node.snapshot`` inside
    ``TMP_SNAPSHOT_LOCATION``. The expected checksum is kept next to it in
    ``octez_node.snapshot.sha256`` so that an interrupted download can be
    resumed: when *sha256* equals the checksum recorded for the existing
    partial file, ``wget --continue`` is used; otherwise the record is
    rewritten (or removed when *sha256* is not given) and the download
    starts from scratch.

    Args:
        url: Location of the snapshot.
        sha256: Expected SHA256 of the complete snapshot, if known.

    Returns:
        Path of the downloaded snapshot file.

    Raises:
        urllib.error.URLError: ``wget`` exited with status 4 or higher.
        subprocess.CalledProcessError: ``wget`` failed with a lower status.
    """
    import urllib.error
    from subprocess import CalledProcessError

    logging.info("Fetching snapshot")

    dirname = TMP_SNAPSHOT_LOCATION
    filename = os.path.join(dirname, "octez_node.snapshot")
    metadata_file = os.path.join(dirname, "octez_node.snapshot.sha256")

    # updates or removes the 'metadata_file' containing the snapshot's SHA256
    def dump_metadata():
        if sha256:
            with open(metadata_file, "w+") as f:
                f.write(sha256)
            return
        try:
            os.remove(metadata_file)
        except FileNotFoundError:
            pass

    # reads `metadata_file` if any or returns None
    def read_metadata():
        if not os.path.exists(metadata_file):
            return None
        with open(metadata_file, "r") as f:
            return f.read()

    def download(args=""):
        try:
            proc_call(f"wget {args} --show-progress -O {filename} {url}")
        except CalledProcessError as e:
            # see here https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
            if e.returncode >= 4:
                # URLError requires a `reason`; raising the bare class would
                # fail with TypeError instead of the intended exception.
                raise urllib.error.URLError(
                    f"wget exited with status {e.returncode}"
                ) from e
            raise

    print_and_log(f"Downloading the snapshot from {url}")

    # expected for the (possibly) existing chunk
    expected_sha256 = read_metadata()

    os.makedirs(dirname, exist_ok=True)
    if sha256 and expected_sha256 and expected_sha256 == sha256:
        logging.info("Continuing download")
        # The checksum we want matches the one recorded for the partially
        # downloaded file, so it is the same snapshot and `--continue` is
        # safe to use.
        download(args="--continue")
    else:
        # In every other case record the new checksum (so that a later call
        # can resume this download) and start from scratch.
        dump_metadata()
        download()

    print()
    return filename


class Sha256Mismatch(Exception):
"Raised when the actual and expected sha256 don't match."

Expand Down Expand Up @@ -183,19 +115,6 @@ def is_full_snapshot(snapshot_file, import_mode):
return False


def get_node_version(version_output=None):
    """Return the Octez node version as ``(major, minor, rc)``.

    Args:
        version_output: Text to parse. When omitted, the output of
            ``octez-node --version`` is used.

    Returns:
        A tuple of two ints and the release-candidate number, which is
        ``None`` when the version carries no ``~rcN`` suffix (including
        ``+dev`` builds).

    Raises:
        ValueError: The text does not contain a recognisable version.
    """
    if version_output is None:
        version_output = get_proc_output("octez-node --version").stdout.decode(
            "ascii"
        )
    # The dot between major and minor is escaped so that it only matches a
    # literal dot, and the rc number accepts any digit so that e.g. rc10
    # parses as well.
    match = re.search(
        r"[a-z0-9]+ \(.*\) \(([0-9]+)\.([0-9]+)(?:(?:~rc([0-9]+))|(?:\+dev))?\)",
        version_output,
    )
    if match is None:
        raise ValueError(
            f"Could not parse octez-node version from {version_output!r}"
        )
    major_version, minor_version, rc_version = match.groups()
    return (
        int(major_version),
        int(minor_version),
        int(rc_version) if rc_version is not None else None,
    )


def is_non_protocol_testnet(network):
    """Return True when *network* is ``"mainnet"`` or ``"ghostnet"``."""
    return network in ("mainnet", "ghostnet")

Expand All @@ -211,9 +130,6 @@ def network_name_or_teztnets_url(network):
return f"https://teztnets.xyz/{network}"


# Baseline that a provider's `snapshot_version` is compared against:
# `extract_relevant_snapshot` accepts a snapshot when
# `compatible_snapshot_version - snapshot_version <= 2`.
compatible_snapshot_version = 7


# Steps

network_query = Step(
Expand Down Expand Up @@ -457,150 +373,6 @@ def check_blockchain_data(self):
return False
return True

def extract_relevant_snapshot(self, snapshot_array):
    """Return the metadata of the most suitable snapshot, or ``None``.

    Only artifacts of type ``tezos-snapshot`` for the configured network and
    history mode are considered (a ``full`` snapshot is also accepted when
    the configured mode is ``archive``). Candidates are examined in the order
    of *snapshot_array* and the first hit of the first successful rule wins:

    1. the snapshot was made with exactly the Octez version in use;
    2. otherwise, a snapshot with a compatible ``snapshot_version`` made by
       an older release of the same major version: either the same minor
       version with a lower release-candidate number, or a lower minor
       version that is not a release candidate;
    3. otherwise, any snapshot with a compatible ``snapshot_version``.

    Rules 2 and 3 never give a release-candidate snapshot to a node that is
    itself not a release candidate.

    Args:
        snapshot_array: Snapshot metadata records as supplied by a provider.

    Returns:
        The selected metadata record, or ``None`` when nothing qualifies.
    """
    node_version = get_node_version()
    node_major, node_minor, node_rc = node_version

    def artifact_node_version(artifact):
        version = artifact["tezos_version"]["version"]
        # Providers are inconsistent about this field: it may be a plain
        # string, or a mapping that carries the release-candidate number.
        additional_info = version["additional_info"]
        rc = (
            additional_info.get("rc")
            if isinstance(additional_info, dict)
            else None
        )
        return (version["major"], version["minor"], rc)

    def matches_request(artifact):
        return (
            artifact["artifact_type"] == "tezos-snapshot"
            and artifact["chain_name"] == self.config["network"]
            and (
                artifact["history_mode"] == self.config["history_mode"]
                or (
                    self.config["history_mode"] == "archive"
                    and artifact["history_mode"] == "full"
                )
            )
        )

    # Every predicate takes (major, minor, rc, snapshot_version) of a
    # candidate artifact.

    def is_exact_version(major, minor, rc, snapshot_version):
        return node_version == (major, minor, rc)

    def is_importable(major, minor, rc, snapshot_version):
        # A stable node must not be given a release-candidate snapshot.
        if node_rc is None and rc is not None:
            return False
        # It could happen that `snapshot_version` is not supplied by the
        # provider (e.g. marigold snapshots don't supply it).
        # NOTE(review): this also holds when `snapshot_version` is greater
        # than `compatible_snapshot_version` -- confirm that is intended.
        return (
            bool(snapshot_version)
            and compatible_snapshot_version - snapshot_version <= 2
        )

    def is_older_same_major(major, minor, rc, snapshot_version):
        if major != node_major:
            return False
        if minor == node_minor and rc and node_rc and node_rc > rc:
            return True
        return node_minor > minor and rc is None

    def is_older_importable(major, minor, rc, snapshot_version):
        return is_importable(
            major, minor, rc, snapshot_version
        ) and is_older_same_major(major, minor, rc, snapshot_version)

    def find_snapshot(pred):
        for artifact in snapshot_array:
            if matches_request(artifact) and pred(
                *artifact_node_version(artifact),
                artifact.get("snapshot_version", None),
            ):
                return artifact
        return None

    # Rules in decreasing order of preference; later rules are only
    # evaluated when the earlier ones found nothing.
    for pred in (is_exact_version, is_older_importable, is_importable):
        snapshot = find_snapshot(pred)
        if snapshot is not None:
            return snapshot
    return None

# Check the provider url and collect the most recent snapshot
# that is suited for the chosen history mode and network
Expand All @@ -612,7 +384,7 @@ def get_snapshot_metadata(self, name, json_url):
snapshot_array = json.load(url)["data"]
snapshot_array.sort(reverse=True, key=lambda x: x["block_height"])

snapshot_metadata = self.extract_relevant_snapshot(snapshot_array)
snapshot_metadata = extract_relevant_snapshot(snapshot_array, self.config)

if snapshot_metadata is None:
print_and_log(
Expand Down
Loading

0 comments on commit a78f852

Please sign in to comment.