From a78f8521042164878f6781caa3f456d2d66cd2b1 Mon Sep 17 00:00:00 2001
From: PruStephan
Date: Sat, 9 Dec 2023 21:27:58 +0100
Subject: [PATCH] [#704] Add corruption recovery script

Problem: Currently the only way to fix the problem with node corruption
is to manually delete the directory and load snapshot anew.

Solution: Add script to do all of this automatically
---
 .../tezos_baking/restore-from-corruption.py   | 124 +++++++++
 baking/src/tezos_baking/tezos_setup_wizard.py | 230 +----------------
 baking/src/tezos_baking/util.py               | 237 +++++++++++++++++-
 baking/src/tezos_baking/wizard_structure.py   |   5 -
 docker/package/packages.py                    |   7 +
 5 files changed, 367 insertions(+), 236 deletions(-)
 create mode 100644 baking/src/tezos_baking/restore-from-corruption.py

diff --git a/baking/src/tezos_baking/restore-from-corruption.py b/baking/src/tezos_baking/restore-from-corruption.py
new file mode 100644
index 000000000..7c84d0328
--- /dev/null
+++ b/baking/src/tezos_baking/restore-from-corruption.py
@@ -0,0 +1,124 @@
+# SPDX-FileCopyrightText: 2023 Oxhead Alpha
+# SPDX-License-Identifier: LicenseRef-MIT-OA
+
+"""Detect Tezos node store corruption and restore the node from a snapshot."""
+
+import os
+import urllib.request
+import json
+import shutil
+
+from tezos_baking.util import *
+from tezos_baking.tezos_setup_wizard import default_providers
+
+
+def check_node_corruption():
+    """Return True if the node's journal mentions `Inconsistent_store`.
+
+    The network name is read from the `NETWORK` environment variable; when it
+    is not set the check cannot be done and False is returned.
+    """
+    network = os.environ.get("NETWORK", None)
+    if not network:
+        print("""
+        Could not find network name in environment.
+        Can't check node for corruption
+        """)
+        return False
+    logs = get_proc_output(f"journalctl -u tezos-node-{network}.service")
+    # `stdout` is bytes (`get_node_version` has to decode it), and testing
+    # `str in bytes` raises TypeError, so search for a bytes literal.
+    return b"Inconsistent_store" in logs.stdout
+
+
+def restore_from_corruption():
+    """Replace the node data directory with a freshly imported snapshot.
+
+    Reads `TEZOS_NODE_DIR` and `NETWORK` from the environment. The outcome is
+    reported on stdout; nothing is returned.
+    """
+    node_data_directory = os.environ["TEZOS_NODE_DIR"]
+
+    # config.json lives inside the data directory, so the history mode has to
+    # be read before the directory is removed.
+    with open(os.path.join(node_data_directory, "config.json")) as f:
+        history_mode = json.load(f)["history_mode"]
+
+    config = {
+        "network": os.environ["NETWORK"],
+        "history_mode": history_mode,
+    }
+
+    # `default_providers` is a dict keyed by provider name; its values are
+    # presumably the metadata URLs - confirm against tezos_setup_wizard.
+    snapshot_meta = None
+    for json_url in default_providers.values():
+        try:
+            with urllib.request.urlopen(json_url) as url:
+                snapshot_array = json.load(url)["data"]
+        except (OSError, ValueError, KeyError):
+            # Provider unreachable or its response is malformed: try the next.
+            continue
+        snapshot_array.sort(reverse=True, key=lambda x: x["block_height"])
+        snapshot_meta = extract_relevant_snapshot(snapshot_array, config)
+        if snapshot_meta is not None:
+            break
+
+    if snapshot_meta is None:
+        print("Could not find a suitable snapshot. Manual restoration is required")
+        return
+
+    # `fetch_snapshot` takes a URL and an optional checksum, not the metadata
+    # entry itself.
+    # NOTE(review): assumes the entry has "url" and "sha256" keys - confirm.
+    snapshot_path = fetch_snapshot(snapshot_meta["url"], snapshot_meta.get("sha256"))
+
+    # Only wipe the old data once the replacement snapshot is on disk.
+    # NOTE(review): this also deletes config.json - confirm it gets recreated.
+    try:
+        shutil.rmtree(node_data_directory)
+    except OSError:
+        print("Could not delete node data dir. Manual restoration is required")
+        return
+
+    reinstallation_result = get_proc_output(
+        f"octez-node snapshot import {snapshot_path}"
+    )
+
+    # Best-effort cleanup of the downloaded snapshot file.
+    try:
+        os.remove(snapshot_path)
+    except FileNotFoundError:
+        pass
+
+    if not reinstallation_result.returncode:
+        print("Recovery from corruption was successful")
+    else:
+        print("Recovery from corruption failed. Manual restoration is required")
+
+
+def main():
+    """Restore a corrupted node when automatic restoration is enabled."""
+    if not check_node_corruption():
+        return
+    is_baking_installed = not get_proc_output("which octez-baking").returncode
+    if not is_baking_installed:
+        print("""
+        Node has been corrupted.
+        In order to restore it, you need `octez-baking` to be installed
+        """)
+        return
+    # Unset, empty and explicitly negative values all disable restoration.
+    restore_flag = os.environ.get("RESTORE_FROM_CORRUPTION", "").strip().lower()
+    if restore_flag in ("", "0", "false", "no", "off"):
+        print("""
+        Node has been corrupted.
+        Automatic restoration is disabled.
+        Manual restoration is required.
+        """)
+        return
+    restore_from_corruption()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/baking/src/tezos_baking/tezos_setup_wizard.py b/baking/src/tezos_baking/tezos_setup_wizard.py
index 5482721af..1a8f8c0e2 100644
--- a/baking/src/tezos_baking/tezos_setup_wizard.py
+++ b/baking/src/tezos_baking/tezos_setup_wizard.py
@@ -56,7 +56,6 @@
 
 recommended_provider = list(default_providers.keys())[0]
 
-TMP_SNAPSHOT_LOCATION = "/tmp/octez_node.snapshot.d/"
 
 
 # Wizard CLI utility
@@ -78,73 +77,6 @@
 """
 
 
-def fetch_snapshot(url, sha256=None):
-
-    logging.info("Fetching snapshot")
-
-    dirname = TMP_SNAPSHOT_LOCATION
-    filename = os.path.join(dirname, "octez_node.snapshot")
-    metadata_file = os.path.join(dirname, "octez_node.snapshot.sha256")
-
-    # updates or removes the 'metadata_file' containing the snapshot's SHA256
-    def dump_metadata(metadata_file=metadata_file, sha256=sha256):
-        if sha256:
-            with open(metadata_file, "w+") as f:
-                f.write(sha256)
-        else:
-            try:
-                os.remove(metadata_file)
-            except FileNotFoundError:
-                pass
-
-    # reads `metadata_file` if any or returns None
-    def read_metadata(metadata_file=metadata_file):
-        if os.path.exists(metadata_file):
-            with open(metadata_file, "r") as f:
-                sha256 = f.read()
-            return sha256
-        else:
-            return None
-
-    def download(filename=filename, url=url, args=""):
-        from subprocess import CalledProcessError
-
-        try:
-            proc_call(f"wget {args} --show-progress -O {filename} {url}")
-        except CalledProcessError as e:
-            # see here https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
-            if e.returncode >= 4:
-                raise urllib.error.URLError
-            else:
-                raise e
-
-    print_and_log(f"Downloading the snapshot from {url}")
-
-    # expected for the (possibly) existing chunk
-    expected_sha256 = read_metadata()
-
-    os.makedirs(dirname, exist_ok=True)
-    if sha256 and expected_sha256 and expected_sha256 == sha256:
-        logging.info("Continuing download")
-        # that case means that the expected sha256 of snapshot
-        # we want to download is the same as the expected
-        # sha256 of the existing octez_node.snapshot file
-        # when it will be fully downloaded
-        # so that we can safely use `--continue` option here
-        download(args="--continue")
-    else:
-        # all other cases we just dump new metadata
-        # (so that we can resume download if we can ensure
-        # that existing octez_node.snapshot chunk belongs
-        # to the snapshot we want to download)
-        # and start download from scratch
-        dump_metadata()
-        download()
-
-    print()
-    return filename
-
-
 class Sha256Mismatch(Exception):
     "Raised when the actual and expected sha256 don't match."
 
@@ -183,19 +115,6 @@ def is_full_snapshot(snapshot_file, import_mode):
     return False
 
 
-def get_node_version():
-    version = get_proc_output("octez-node --version").stdout.decode("ascii")
-    major_version, minor_version, rc_version = re.search(
-        r"[a-z0-9]+ \(.*\) \(([0-9]+).([0-9]+)(?:(?:~rc([1-9]+))|(?:\+dev))?\)",
-        version,
-    ).groups()
-    return (
-        int(major_version),
-        int(minor_version),
-        (int(rc_version) if rc_version is not None else None),
-    )
-
-
 def is_non_protocol_testnet(network):
     return network == "mainnet" or network == "ghostnet"
 
@@ -211,9 +130,6 @@ def network_name_or_teztnets_url(network):
         return f"https://teztnets.xyz/{network}"
 
 
-compatible_snapshot_version = 7
-
-
 # Steps
 
 network_query = Step(
@@ -457,150 +373,6 @@ def check_blockchain_data(self):
             return False
         return True
 
-    # Returns relevant snapshot's metadata
-    # It filters out provided snapshots by `network` and `history_mode`
-    # provided by the user and then follows this steps:
-    # * tries to find the snapshot of exact same Octez version, that is used by the user.
-    # * if there is none, try to find the snapshot with the same major version, but less minor version
-    #   and with the `snapshot_version` compatible with the user's Octez version.
-    # * If there is none, try to find the snapshot with any Octez version, but compatible `snapshot_version`.
-    def extract_relevant_snapshot(self, snapshot_array):
-        from functools import reduce
-
-        def find_snapshot(pred):
-            return next(
-                filter(
-                    lambda artifact: artifact["artifact_type"] == "tezos-snapshot"
-                    and artifact["chain_name"] == self.config["network"]
-                    and (
-                        artifact["history_mode"] == self.config["history_mode"]
-                        or (
-                            self.config["history_mode"] == "archive"
-                            and artifact["history_mode"] == "full"
-                        )
-                    )
-                    and pred(
-                        *(
-                            get_artifact_node_version(artifact)
-                            + (artifact.get("snapshot_version", None),)
-                        )
-                    ),
-                    iter(snapshot_array),
-                ),
-                None,
-            )
-
-        def get_artifact_node_version(artifact):
-            version = artifact["tezos_version"]["version"]
-            # there seem to be some inconsistency with that field in different providers
-            # so the only thing we check is if it's a string
-            additional_info = version["additional_info"]
-            return (
-                version["major"],
-                version["minor"],
-                None if type(additional_info) == str else additional_info["rc"],
-            )
-
-        def compose_pred(*preds):
-            return reduce(
-                lambda acc, x: lambda major, minor, rc, snapshot_version: acc(
-                    major, minor, rc, snapshot_version
-                )
-                and x(major, minor, rc, snapshot_version),
-                preds,
-            )
-
-        def sum_pred(*preds):
-            return reduce(
-                lambda acc, x: lambda major, minor, rc, snapshot_version: acc(
-                    major, minor, rc, snapshot_version
-                )
-                or x(major, minor, rc, snapshot_version),
-                preds,
-            )
-
-        node_version = get_node_version()
-        major_version, minor_version, rc_version = node_version
-
-        exact_version_pred = (
-            lambda major, minor, rc, snapshot_version: node_version
-            == (
-                major,
-                minor,
-                rc,
-            )
-        )
-
-        exact_major_version_pred = (
-            lambda major, minor, rc, snapshot_version: major_version == major
-        )
-
-        exact_minor_version_pred = (
-            lambda major, minor, rc, snapshot_version: minor_version == minor
-        )
-
-        less_minor_version_pred = (
-            lambda major, minor, rc, snapshot_version: minor_version > minor
-        )
-
-        exact_rc_version_pred = (
-            lambda major, minor, rc, snapshot_version: rc_version == rc
-        )
-
-        less_rc_version_pred = (
-            lambda major, minor, rc, snapshot_version: rc
-            and rc_version
-            and rc_version > rc
-        )
-
-        non_rc_version_pred = lambda major, minor, rc, snapshot_version: rc is None
-
-        compatible_version_pred = (
-            # it could happen that `snapshot_version` field is not supplied by provider
-            # e.g. marigold snapshots don't supply it
-            lambda major, minor, rc, snapshot_version: snapshot_version
-            and compatible_snapshot_version - snapshot_version <= 2
-        )
-
-        non_rc_on_stable_pred = lambda major, minor, rc, snapshot_version: (
-            rc_version is None and rc is None
-        ) or (rc_version is not None)
-
-        preds = [
-            exact_version_pred,
-            compose_pred(
-                non_rc_on_stable_pred,
-                compatible_version_pred,
-                sum_pred(
-                    compose_pred(
-                        exact_major_version_pred,
-                        exact_minor_version_pred,
-                        less_rc_version_pred,
-                    ),
-                    compose_pred(
-                        exact_major_version_pred,
-                        less_minor_version_pred,
-                        non_rc_version_pred,
-                    ),
-                ),
-            ),
-            compose_pred(
-                non_rc_on_stable_pred,
-                compatible_version_pred,
-            ),
-        ]
-
-        return next(
-            (
-                snapshot
-                for snapshot in map(
-                    lambda pred: find_snapshot(pred),
-                    preds,
-                )
-                if snapshot is not None
-            ),
-            None,
-        )
 
     # Check the provider url and collect the most recent snapshot
     # that is suited for the chosen history mode and network
@@ -612,7 +384,7 @@ def get_snapshot_metadata(self, name, json_url):
                 snapshot_array = json.load(url)["data"]
             snapshot_array.sort(reverse=True, key=lambda x: x["block_height"])
 
-            snapshot_metadata = self.extract_relevant_snapshot(snapshot_array)
+            snapshot_metadata = extract_relevant_snapshot(snapshot_array, self.config)
 
             if snapshot_metadata is None:
                 print_and_log(
diff --git a/baking/src/tezos_baking/util.py b/baking/src/tezos_baking/util.py
index b76620864..331f529f6 100644
--- a/baking/src/tezos_baking/util.py
+++ b/baking/src/tezos_baking/util.py
@@ -8,10 +8,10 @@
 import sys, subprocess, shlex
 import re
 import urllib.request
-import json
 import os
+import logging
 
-# Regexes
+# Regexes and constants
 
 secret_key_regex = b"(encrypted|unencrypted):(?:\w{54}|\w{88})"
 address_regex = b"tz[123]\w{33}"
@@ -22,6 +22,9 @@
 ledger_regex = b"ledger:\/\/[\w\-]+\/[\w\-]+\/[\w']+\/[\w']+"
 derivation_path_regex = b"(?:bip25519|ed25519|secp256k1|P-256)\/[0-9]+h\/[0-9]+h"
 
+compatible_snapshot_version = 7
+TMP_SNAPSHOT_LOCATION = "/tmp/octez_node.snapshot.d/"
+
 
 # Utilities
 
@@ -56,6 +59,11 @@ def find_systemd_unit_env(show_systemd_output):
     return ""
 
 
+def print_and_log(message, log=logging.info, colorcode=None):
+    print(color(message, colorcode) if colorcode else message)
+    log(message)
+
+
 # Returns all the environment variables of a systemd service unit
 # Note: definitions directly in the unit (not in environment files) take precedence
 def get_systemd_service_env(service_name):
@@ -148,3 +156,228 @@ def url_is_reachable(url):
         return True
     except (urllib.error.URLError, ValueError):
         return False
+
+
+def fetch_snapshot(url, sha256=None):
+
+    logging.info("Fetching snapshot")
+
+    dirname = TMP_SNAPSHOT_LOCATION
+    filename = os.path.join(dirname, "octez_node.snapshot")
+    metadata_file = os.path.join(dirname, "octez_node.snapshot.sha256")
+
+    # updates or removes the 'metadata_file' containing the snapshot's SHA256
+    def dump_metadata(metadata_file=metadata_file, sha256=sha256):
+        if sha256:
+            with open(metadata_file, "w+") as f:
+                f.write(sha256)
+        else:
+            try:
+                os.remove(metadata_file)
+            except FileNotFoundError:
+                pass
+
+    # reads `metadata_file` if any or returns None
+    def read_metadata(metadata_file=metadata_file):
+        if os.path.exists(metadata_file):
+            with open(metadata_file, "r") as f:
+                sha256 = f.read()
+            return sha256
+        else:
+            return None
+
+    def download(filename=filename, url=url, args=""):
+        from subprocess import CalledProcessError
+
+        try:
+            proc_call(f"wget {args} --show-progress -O {filename} {url}")
+        except CalledProcessError as e:
+            # see here https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
+            if e.returncode >= 4:
+                raise urllib.error.URLError(f"wget exited with code {e.returncode}") from e
+            else:
+                raise e
+
+    print_and_log(f"Downloading the snapshot from {url}")
+
+    # expected for the (possibly) existing chunk
+    expected_sha256 = read_metadata()
+
+    os.makedirs(dirname, exist_ok=True)
+    if sha256 and expected_sha256 and expected_sha256 == sha256:
+        logging.info("Continuing download")
+        # that case means that the expected sha256 of snapshot
+        # we want to download is the same as the expected
+        # sha256 of the existing octez_node.snapshot file
+        # when it will be fully downloaded
+        # so that we can safely use `--continue` option here
+        download(args="--continue")
+    else:
+        # all other cases we just dump new metadata
+        # (so that we can resume download if we can ensure
+        # that existing octez_node.snapshot chunk belongs
+        # to the snapshot we want to download)
+        # and start download from scratch
+        dump_metadata()
+        download()
+
+    print()
+    return filename
+
+
+def get_node_version():
+    version = get_proc_output("octez-node --version").stdout.decode("ascii")
+    major_version, minor_version, rc_version = re.search(
+        r"[a-z0-9]+ \(.*\) \(([0-9]+).([0-9]+)(?:(?:~rc([1-9]+))|(?:\+dev))?\)",
+        version,
+    ).groups()
+    return (
+        int(major_version),
+        int(minor_version),
+        (int(rc_version) if rc_version is not None else None),
+    )
+
+# Returns relevant snapshot's metadata
+# It filters out provided snapshots by `network` and `history_mode`
+# provided by the user and then follows this steps:
+# * tries to find the snapshot of exact same Octez version, that is used by the user.
+# * if there is none, try to find the snapshot with the same major version, but less minor version
+#   and with the `snapshot_version` compatible with the user's Octez version.
+# * If there is none, try to find the snapshot with any Octez version, but compatible `snapshot_version`.
+def extract_relevant_snapshot(snapshot_array, config):
+    from functools import reduce
+
+    def find_snapshot(pred):
+        return next(
+            filter(
+                lambda artifact: artifact["artifact_type"] == "tezos-snapshot"
+                and artifact["chain_name"] == config["network"]
+                and (
+                    artifact["history_mode"] == config["history_mode"]
+                    or (
+                        config["history_mode"] == "archive"
+                        and artifact["history_mode"] == "full"
+                    )
+                )
+                and pred(
+                    *(
+                        get_artifact_node_version(artifact)
+                        + (artifact.get("snapshot_version", None),)
+                    )
+                ),
+                iter(snapshot_array),
+            ),
+            None,
+        )
+
+    def get_artifact_node_version(artifact):
+        version = artifact["tezos_version"]["version"]
+        # there seem to be some inconsistency with that field in different providers
+        # so the only thing we check is if it's a string
+        additional_info = version["additional_info"]
+        return (
+            version["major"],
+            version["minor"],
+            None if type(additional_info) == str else additional_info["rc"],
+        )
+
+    def compose_pred(*preds):
+        return reduce(
+            lambda acc, x: lambda major, minor, rc, snapshot_version: acc(
+                major, minor, rc, snapshot_version
+            )
+            and x(major, minor, rc, snapshot_version),
+            preds,
+        )
+
+    def sum_pred(*preds):
+        return reduce(
+            lambda acc, x: lambda major, minor, rc, snapshot_version: acc(
+                major, minor, rc, snapshot_version
+            )
+            or x(major, minor, rc, snapshot_version),
+            preds,
+        )
+
+    node_version = get_node_version()
+    major_version, minor_version, rc_version = node_version
+
+    exact_version_pred = (
+        lambda major, minor, rc, snapshot_version: node_version
+        == (
+            major,
+            minor,
+            rc,
+        )
+    )
+
+    exact_major_version_pred = (
+        lambda major, minor, rc, snapshot_version: major_version == major
+    )
+
+    exact_minor_version_pred = (
+        lambda major, minor, rc, snapshot_version: minor_version == minor
+    )
+
+    less_minor_version_pred = (
+        lambda major, minor, rc, snapshot_version: minor_version > minor
+    )
+
+    exact_rc_version_pred = (
+        lambda major, minor, rc, snapshot_version: rc_version == rc
+    )
+
+    less_rc_version_pred = (
+        lambda major, minor, rc, snapshot_version: rc
+        and rc_version
+        and rc_version > rc
+    )
+
+    non_rc_version_pred = lambda major, minor, rc, snapshot_version: rc is None
+
+    compatible_version_pred = (
+        # it could happen that `snapshot_version` field is not supplied by provider
+        # e.g. marigold snapshots don't supply it
+        lambda major, minor, rc, snapshot_version: snapshot_version
+        and compatible_snapshot_version - snapshot_version <= 2
+    )
+
+    non_rc_on_stable_pred = lambda major, minor, rc, snapshot_version: (
+        rc_version is None and rc is None
+    ) or (rc_version is not None)
+
+    preds = [
+        exact_version_pred,
+        compose_pred(
+            non_rc_on_stable_pred,
+            compatible_version_pred,
+            sum_pred(
+                compose_pred(
+                    exact_major_version_pred,
+                    exact_minor_version_pred,
+                    less_rc_version_pred,
+                ),
+                compose_pred(
+                    exact_major_version_pred,
+                    less_minor_version_pred,
+                    non_rc_version_pred,
+                ),
+            ),
+        ),
+        compose_pred(
+            non_rc_on_stable_pred,
+            compatible_version_pred,
+        ),
+    ]
+
+    return next(
+        (
+            snapshot
+            for snapshot in map(
+                lambda pred: find_snapshot(pred),
+                preds,
+            )
+            if snapshot is not None
+        ),
+        None,
+    )
diff --git a/baking/src/tezos_baking/wizard_structure.py b/baking/src/tezos_baking/wizard_structure.py
index bdb953deb..65e8f6e4d 100644
--- a/baking/src/tezos_baking/wizard_structure.py
+++ b/baking/src/tezos_baking/wizard_structure.py
@@ -107,11 +107,6 @@ def setup_logger(log_file):
     )
 
 
-def print_and_log(message, log=logging.info, colorcode=None):
-    print(color(message, colorcode) if colorcode else message)
-    log(message)
-
-
 def log_exception(exception, logfile):
     import traceback
     from datetime import datetime
diff --git a/docker/package/packages.py b/docker/package/packages.py
index 74fdc3919..9a48505a3 100644
--- a/docker/package/packages.py
+++ b/docker/package/packages.py
@@ -262,6 +262,13 @@ def mk_node_unit(
             transform=lambda x, network=network: x.replace("{network}", network),
         )
     )
+    node_additional_scripts.append(
+        AdditionalScript(
+            name="restore-from-corruption",
+            local_file_name="restore-from-corruption"
+        )
+    )
+
     node_postinst_steps += f"""
 mkdir -p /var/lib/tezos/node-{network}
 [ ! -f /var/lib/tezos/node-{network}/config.json ] && octez-node config init --data-dir /var/lib/tezos/node-{network} --network {network_config}