Merge pull request #39 from eclipsevortex/release/2.2.3
Release/2.2.3
eclipsevortex authored May 7, 2024
2 parents b6fb3d4 + 22ea24a commit dfe4ca1
Showing 46 changed files with 3,131 additions and 183 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,16 @@
# Changelog

## 2.2.3 / 2024-05-06

## What's Changed
* Release/2.2.2 by @eclipsevortex in https://github.com/eclipsevortex/SubVortex/pull/36
* Add unit tests for resync miners by @eclipsevortex in https://github.com/eclipsevortex/SubVortex/pull/38
* implement auto upgrade by @eclipsevortex in https://github.com/eclipsevortex/SubVortex/pull/40
* isolate wandb by @eclipsevortex in https://github.com/eclipsevortex/SubVortex/pull/41


**Full Changelog**: https://github.com/eclipsevortex/SubVortex/compare/v2.2.2...v2.2.3

## 2.2.2 / 2024-04-25

**Full Changelog**: https://github.com/eclipsevortex/SubVortex/compare/v2.2.1...v2.2.2
8 changes: 6 additions & 2 deletions README.md
@@ -345,7 +345,8 @@ pm2 start neurons/miner.py \
--subtensor.network local \
--wallet.name YOUR_WALLET_NAME \
--wallet.hotkey YOUR_HOTKEY_NAME \
--logging.debug
--logging.debug \
--auto-update
```

> IMPORTANT: Do not run more than one miner per machine. Running multiple miners will result in the loss of incentive and emissions on all miners.
@@ -367,13 +368,16 @@ pm2 start neurons/validator.py \
--netuid <SUBNET_UID> \
--wallet.name YOUR_WALLET_NAME \
--wallet.hotkey YOUR_HOTKEY_NAME \
--logging.debug
--logging.debug \
--auto-update
```

> NOTE: if you run a validator on testnet, do not forget to add the argument `--subtensor.network test` or `--subtensor.chain_endpoint ws://<LOCAL_SUBTENSOR_IP>:9944` (the local subtensor has to target the testnet network)
> NOTE: to access the wandb UI and get statistics about the miners, click this [link](https://wandb.ai/eclipsevortext/subvortex-team) and choose the validator run you want.
> NOTE: by default, the dumps created by the auto-update process are stored in `/etc/redis`. If you want to change the location, use `--database.redis_dump_path`.
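
For example, to keep those dumps somewhere else, append the flag to the validator start command shown above (the path below is only an illustration):

```
pm2 start neurons/validator.py \
  --netuid <SUBNET_UID> \
  --wallet.name YOUR_WALLET_NAME \
  --wallet.hotkey YOUR_HOTKEY_NAME \
  --logging.debug \
  --auto-update \
  --database.redis_dump_path /var/lib/redis/dumps
```
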
## Releases

- [Release-2.2.0](./scripts/release/release-2.2.0/RELEASE-2.2.0.md)
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
2.2.2
2.2.3
3 changes: 3 additions & 0 deletions neurons/miner.py
@@ -86,6 +86,9 @@ def __init__(self):
bt.logging(config=self.config, logging_dir=self.config.miner.full_path)
bt.logging.info(f"{self.config}")

# Show miner version
bt.logging.debug(f"miner version {THIS_VERSION}")

# Init device.
bt.logging.debug("loading device")
self.device = torch.device(self.config.miner.device)
36 changes: 28 additions & 8 deletions neurons/validator.py
@@ -25,10 +25,12 @@
from typing import List
from traceback import print_exception

from subnet import __version__ as THIS_VERSION

from subnet.monitor.monitor import Monitor

from subnet.shared.checks import check_registration
from subnet.shared.utils import get_redis_password
from subnet.shared.utils import get_redis_password, should_upgrade
from subnet.shared.subtensor import get_current_block
from subnet.shared.weights import should_set_weights
from subnet.shared.mock import MockMetagraph, MockDendrite, MockSubtensor
@@ -37,13 +39,14 @@
from subnet.validator.localisation import get_country, get_localisation
from subnet.validator.forward import forward
from subnet.validator.models import Miner
from subnet.validator.version import VersionControl
from subnet.validator.miner import get_all_miners
from subnet.validator.state import (
resync_metagraph_and_miners,
load_state,
save_state,
init_wandb,
reinit_wandb,
finish_wandb,
should_reinit_wandb,
)
from subnet.validator.weights import (
@@ -88,6 +91,9 @@ def __init__(self, config=None):
self.check_config(self.config)
bt.logging(config=self.config, logging_dir=self.config.neuron.full_path)

# Show validator version
bt.logging.debug(f"validator version {THIS_VERSION}")

# Init device.
bt.logging.debug("loading device")
self.device = torch.device(self.config.neuron.device)
@@ -181,10 +187,15 @@ def __init__(self, config=None):
self.last_registered_block = 0
self.rebalance_queue = []
self.miners: List[Miner] = []
self.last_upgrade_check = 0

async def run(self):
bt.logging.info("run()")

# Init version control
dump_path = self.config.database.redis_dump_path
self.version_control = VersionControl(self.database, dump_path)

# Init miners
self.miners = await get_all_miners(self)
bt.logging.debug(f"Miners loaded {len(self.miners)}")
@@ -198,6 +209,17 @@ async def run(self):

try:
while 1:
# Check for an upgrade (at most every 10 minutes) and run it if needed
if should_upgrade(self.config.auto_update, self.last_upgrade_check):
bt.logging.debug("Checking upgrade")
must_restart = await self.version_control.upgrade()
if must_restart:
finish_wandb()
self.version_control.restart()
return

self.last_upgrade_check = time.time()

start_epoch = time.time()

await resync_metagraph_and_miners(self)
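
`should_upgrade` is imported from `subnet.shared.utils` and is not part of this diff; a minimal sketch of what it could look like, assuming the 10-minute interval the comment above mentions, is:

```python
import time

# Assumed interval; the real value lives in subnet.shared.utils, not in this diff.
UPGRADE_CHECK_INTERVAL = 600  # seconds (10 minutes)


def should_upgrade(auto_update: bool, last_upgrade_check: float) -> bool:
    """Return True when auto-update is enabled and the last check is older than the interval."""
    return auto_update and (time.time() - last_upgrade_check) >= UPGRADE_CHECK_INTERVAL
```
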
Expand Down Expand Up @@ -259,22 +281,20 @@ async def run_forward():
# Rollover wandb to a new run.
if should_reinit_wandb(self):
bt.logging.info("Reinitializing wandb")
reinit_wandb(self)
finish_wandb()
init_wandb(self)

self.prev_step_block = get_current_block(self.subtensor)
if self.config.neuron.verbose:
bt.logging.debug(f"block at end of step: {self.prev_step_block}")
bt.logging.debug(f"Step took {time.time() - start_epoch} seconds")

self.step += 1

except Exception as err:
bt.logging.error("Error in training loop", str(err))
bt.logging.debug(print_exception(type(err), err, err.__traceback__))

if self.wandb is not None:
self.wandb.finish()
assert self.wandb.run is None
bt.logging.debug("Finishing wandb run")
finish_wandb()

# After all we have to ensure subtensor connection is closed properly
finally:
68 changes: 68 additions & 0 deletions scripts/redis/README.md
@@ -12,6 +12,12 @@ This document explains how to install and uninstall a redis.
- [Uninstallation](#uninstallation)
- [As process](#uninstallation-as-process)
- [As docker container](#uninstallation-as-container)
- [Migration](#migration)
- [Rollout](#migration-rollout)
- [Rollback](#migration-rollback)
- [Dump](#dump)
- [Creation](#dump-creation)
- [Restoration](#dump-restoration)

---

@@ -252,3 +258,65 @@ You should have something similar (or at least a list that does not contain `subv
```
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
```

# Migration

## Rollout <a id="migration-rollout"></a>

To roll out any Redis migration manually, you can use the Python script `redis_migration.py`.

For example, if you want to roll out version 2.2.1, you can run the following from the `SubVortex` directory

```
python3 ./scripts/redis/utils/redis_migration.py --run-type rollout --version 2.2.1
```

> IMPORTANT <br />
> If you have to roll out multiple versions, execute them one by one, from your current version up to the targeted one.
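
For instance, assuming a migration script exists for each intermediate version, upgrading stored data from 2.0.0 to 2.2.1 would be two consecutive rollouts:

```
python3 ./scripts/redis/utils/redis_migration.py --run-type rollout --version 2.2.0
python3 ./scripts/redis/utils/redis_migration.py --run-type rollout --version 2.2.1
```
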
## Rollback <a id="migration-rollback"></a>

To roll back any Redis migration manually, you can use the Python script `redis_migration.py`.

For example, if you want to roll back version 2.2.1, you can run the following from the `SubVortex` directory

```
python3 ./scripts/redis/utils/redis_migration.py --run-type rollback --version 2.2.1
```

> IMPORTANT <br />
> If you have to roll back multiple versions, execute them one by one, from your current version down to the targeted one.

# Dump

## Creation <a id="dump-creation"></a>

To create a Redis dump manually, you can use the Python script `redis_dump.py`.

For example, if you want to create the dump in the `SubVortex` directory, you can run

```
python3 ./scripts/redis/utils/redis_dump.py --run-type create --dump-path redis-dump-2.0.0.json
```

If you want to create the dump in another location and/or with a different name, use the `--dump-path` argument

```
python3 ./scripts/redis/utils/redis_dump.py --run-type create --dump-path /tmp/redis/redis-dump-2.0.0.json
```

## Restoration <a id="dump-restoration"></a>

To restore a Redis dump manually, you can use the Python script `redis_dump.py`.

For example, if you want to restore a dump located in the `SubVortex` directory, you can run

```
python3 ./scripts/redis/utils/redis_dump.py --run-type restore --dump-path redis-dump-2.0.0.json
```

If you want to restore a dump from another location, use the `--dump-path` argument

```
python3 ./scripts/redis/utils/redis_dump.py --run-type restore --dump-path /tmp/redis/redis-dump-2.0.0.json
```
43 changes: 43 additions & 0 deletions scripts/redis/migrations/migration-2.2.0.py
@@ -0,0 +1,43 @@
from redis import asyncio as aioredis

current = "2.0.0"


async def rollout(database: aioredis.Redis):
    async for key in database.scan_iter("stats:*"):
        metadata_dict = await database.hgetall(key)

        if b"subtensor_successes" not in metadata_dict:
            await database.hset(key, b"subtensor_successes", 0)
        if b"subtensor_attempts" not in metadata_dict:
            await database.hset(key, b"subtensor_attempts", 0)
        if b"metric_successes" not in metadata_dict:
            await database.hset(key, b"metric_successes", 0)
        if b"metric_attempts" not in metadata_dict:
            await database.hset(key, b"metric_attempts", 0)
        if b"total_successes" not in metadata_dict:
            await database.hset(key, b"total_successes", 0)
        if b"tier" not in metadata_dict:
            await database.hset(key, b"tier", "Bronze")

    await database.set("version", current)


async def rollback(database: aioredis.Redis):
    async for key in database.scan_iter("stats:*"):
        metadata_dict = await database.hgetall(key)

        if b"subtensor_successes" in metadata_dict:
            await database.hdel(key, b"subtensor_successes")
        if b"subtensor_attempts" in metadata_dict:
            await database.hdel(key, b"subtensor_attempts")
        if b"metric_successes" in metadata_dict:
            await database.hdel(key, b"metric_successes")
        if b"metric_attempts" in metadata_dict:
            await database.hdel(key, b"metric_attempts")
        if b"total_successes" in metadata_dict:
            await database.hdel(key, b"total_successes")
        if b"tier" in metadata_dict:
            await database.hdel(key, b"tier")

    # set() cannot store None; drop the version marker on rollback instead
    await database.delete("version")
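
A migration module like this one is normally driven by `scripts/redis/utils/redis_migration.py`. For a quick local test, the `rollout` coroutine above can also be called directly; the snippet below is only a sketch and the connection settings are illustrative:

```python
import asyncio

from redis import asyncio as aioredis


async def main():
    # Illustrative connection settings; adjust host/port/db to your setup.
    database = aioredis.StrictRedis(host="localhost", port=6379, db=1)
    await rollout(database)


asyncio.run(main())
```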
98 changes: 98 additions & 0 deletions scripts/redis/utils/redis_dump.py
@@ -0,0 +1,98 @@
import asyncio
import argparse
import bittensor as bt
from redis import asyncio as aioredis

from subnet.shared.utils import get_redis_password
from subnet.validator.database import create_dump, restore_dump


async def create(args):
    try:
        bt.logging.info(
            f"Loading database from {args.database_host}:{args.database_port}"
        )
        redis_password = get_redis_password(args.redis_password)
        database = aioredis.StrictRedis(
            host=args.database_host,
            port=args.database_port,
            db=args.database_index,
            password=redis_password,
        )

        bt.logging.info("Create dump starting")

        await create_dump(args.dump_path, database)

        bt.logging.success("Create dump successful")
    except Exception as e:
        bt.logging.error(f"Error during dump creation: {e}")


async def restore(args):
    try:
        bt.logging.info(
            f"Loading database from {args.database_host}:{args.database_port}"
        )
        redis_password = get_redis_password(args.redis_password)
        database = aioredis.StrictRedis(
            host=args.database_host,
            port=args.database_port,
            db=args.database_index,
            password=redis_password,
        )

        bt.logging.info("Restore dump starting")

        await restore_dump(args.dump_path, database)

        bt.logging.success("Restore dump successful")

    except Exception as e:
        bt.logging.error(f"Error during dump restoration: {e}")


async def main(args):
    if args.run_type == "create":
        await create(args)
    else:
        await restore(args)


if __name__ == "__main__":
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--run-type",
            type=str,
            default="create",
            help="Type of operation to execute. Possible values are create or restore",
        )
        parser.add_argument(
            "--dump-path",
            type=str,
            default="/tmp/redis",
            help="Dump file (with path) to create or restore",
        )
        parser.add_argument(
            "--redis_password",
            type=str,
            default=None,
            help="password for the redis database",
        )
        parser.add_argument(
            "--redis_conf_path",
            type=str,
            default="/etc/redis/redis.conf",
            help="path to the redis configuration file",
        )
        parser.add_argument("--database_host", type=str, default="localhost")
        parser.add_argument("--database_port", type=int, default=6379)
        parser.add_argument("--database_index", type=int, default=1)
        args = parser.parse_args()

        asyncio.run(main(args))
    except KeyboardInterrupt:
        print("KeyboardInterrupt")
    except ValueError as e:
        print(f"ValueError: {e}")