feat: add merging json file tool
OmaymaMahjoub committed Jan 19, 2024
1 parent 166d071 commit d37a021
Showing 2 changed files with 95 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -142,6 +142,11 @@ In order to use the tools we suggest effectively, raw data json files are required
Here `run_1` to `run_n` correspond to the independent runs in a given experiment, and `step_1` to `step_k` correspond to the logged steps in a given environment. We do not require an independent run to be named `run` explicitly; users may also name a run using, as a string, the value of the seed that was used. `step_count` corresponds to the number of steps taken by agents in the environment when logging occurs. The values logged for each relevant metric at a given logging step should be a list containing either a single element, for a metric such as a win rate that is computed over multiple episodes, or as many elements as there are evaluation episodes at the logging step. The final logging step for a given run should contain the `absolute_metrics` values for each metric in the experiment, with these lists containing either 1 element or 10 times as many elements as evaluation episodes at each logging step. For an explanation of the `absolute metric` please see [paragraph 1 on page 9 here](https://arxiv.org/pdf/2209.10485.pdf).
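To make the nesting concrete, the sketch below writes a minimal raw data file for a single run with one logging step. All environment, task, algorithm and metric names here are hypothetical placeholders; only the nesting pattern matters.

```python
import json

# Hypothetical names throughout; only the structure is prescribed.
raw_data = {
    "env_name": {
        "task_name": {
            "algorithm_name": {
                "run_1": {
                    "step_1": {
                        "step_count": 10000,
                        "return": [21.3, 19.8, 20.5],  # one value per evaluation episode
                        "win_rate": [0.66],  # single value computed over all episodes
                    }
                    # ... up to step_k; the final logging step also carries
                    # the "absolute_metrics" values described above.
                }
            }
        }
    }
}

with open("raw_data.json", "w") as f:
    json.dump(raw_data, f, indent=4)
```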

#### Data Tooling
[**JSON Files Merging Script**](marl_eval/utils/merge_json_files.py): We offer a function called `concatenate_files` that reads json files from a specified directory, concatenates their contents into a single structured dictionary and ensures the uniqueness of seed keys within the data. It handles nested json structures and saves the concatenated result to a new json file. It is designed primarily for managing and aggregating json data from multiple files in experimental setups.
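For example, assuming a directory `./raw_data` containing the json files to merge (the directory and output path below are illustrative), the tool could be used as follows:

```python
from marl_eval.utils.merge_json_files import concatenate_files

# Merge all json files found in ./raw_data and write the result to
# ./concatenated_data.json (the ".json" suffix is appended automatically).
merged_data = concatenate_files(
    directory="./raw_data", json_path="./concatenated_data"
)
```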

> 🚧 **Important note on data structure** 🚧
>
> Due to the underlying statistical aggregation relying on `numpy` array operations, it is required that all data contain the same number of data points. This implies that, for a given environment, all experiment trials should be run using the same algorithms, on the same tasks, for the same number of independent runs and for the same number of evaluation steps. The code will check that these conditions are met and will not be able to progress otherwise. Should this happen, the `check_data` method of the [`DiagnoseData`](marl_eval/utils/diagnose_data_errors.py) class will be able to tell a user exactly what is causing the issues in their raw experiment data.
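Should the check fail, the diagnosis could be run along the following lines, using the `merged_data` dictionary from the example above. This is a sketch only: we assume here that `DiagnoseData` is constructed directly with the raw data dictionary, which should be verified against [`diagnose_data_errors.py`](marl_eval/utils/diagnose_data_errors.py) before use.

```python
from marl_eval.utils.diagnose_data_errors import DiagnoseData

# Assumed constructor argument; check the class definition for the
# actual signature before relying on this.
diagnostics = DiagnoseData(raw_data=merged_data)
diagnostics.check_data()  # reports exactly where the raw data is inconsistent
```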
90 changes: 90 additions & 0 deletions marl_eval/utils/merge_json_files.py
@@ -0,0 +1,90 @@
# python3
# Copyright 2022 InstaDeep Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from collections import defaultdict
from typing import Dict, Tuple


def _read_json_files(directory: str) -> list:
    """Reads all JSON files in a directory and returns a list of JSON objects."""
    json_data = []

    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            file_path = os.path.join(directory, filename)
            with open(file_path) as file:
                json_data.append(json.load(file))
    return json_data


def _get_seed_number(seed_str: str) -> Tuple[str, int]:
    """Get the seed number from the seed string.

    For example, "42" -> ("", 42) and "seed_42" -> ("seed", 42).
    """
    if seed_str.isnumeric():
        return "", int(seed_str)
    else:
        try:
            seed_string, seed_number = seed_str.split("_")
            return seed_string, int(seed_number)
        except ValueError:
            raise ValueError(
                f"Seed number {seed_str} is not in the correct format. It "
                "should be an integer or a string with the format 'seed_number'."
            )


def _check_seed(concatenated_data: Dict, algo_data: Dict, seed_number: str) -> str:
    """Return a seed key that does not clash with concatenated_data or algo_data.

    For example, if "seed_0" is already present, "seed_1" is tried next.
    """
    if seed_number in concatenated_data or seed_number in algo_data:
        seed_string, seed_n = _get_seed_number(seed_number)
        seed_number = (
            f"{seed_string}_{seed_n+1}" if seed_string != "" else str(seed_n + 1)
        )
        return _check_seed(concatenated_data, algo_data, seed_number)
    else:
        return seed_number


def concatenate_files(directory: str, json_path: str = "./concatenation") -> Dict:
    """Concatenate all json files in a directory and save the result in a json file."""
    # Read all json files in the directory
    json_data = _read_json_files(directory)

    # Use defaultdict for automatic handling of missing keys
    concatenated_data: Dict = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    )
    for data in json_data:
        for env_name, scenarios in data.items():
            for scenario_name, algorithms in scenarios.items():
                for algo_name, seeds in algorithms.items():
                    # Touch the nested defaultdict so the algorithm key
                    # exists even when it holds no runs.
                    concatenated_data[env_name][scenario_name][algo_name]
                    for seed_number, algo_data in seeds.items():
                        # Get a seed key that does not clash with existing runs
                        seed_n = _check_seed(
                            concatenated_data[env_name][scenario_name][algo_name],
                            algo_data,
                            seed_number,
                        )
                        concatenated_data[env_name][scenario_name][algo_name][
                            seed_n
                        ] = algo_data

    # Save the concatenated data in a json file
    with open(f"{json_path}.json", "w") as f:
        json.dump(concatenated_data, f, indent=4)

    return concatenated_data
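As an illustration of the seed de-duplication above (a sketch only, calling the private helper directly), a run logged under a key that is already present in the merged data is re-keyed as follows:

```python
from marl_eval.utils.merge_json_files import _check_seed

# Hypothetical minimal run data for illustration.
existing_runs = {"seed_0": {"step_1": {"step_count": 10000}}}
new_run = {"step_1": {"step_count": 10000}}

# "seed_0" already exists in the merged data, so the run is re-keyed
# to "seed_1" before being inserted.
assert _check_seed(existing_runs, new_run, "seed_0") == "seed_1"
```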
