Automatically update dataset file size and number based on JSON files (#142)

* Refactoring: Add file size and number

* Update file size and number

* Use consistent thousands separator

* Fix another separator
mjaehn authored Jul 4, 2024
1 parent 990dd96 commit 5ac4d21
Showing 3 changed files with 116 additions and 59 deletions.
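
The "consistent thousands separator" mentioned in the commit message is produced in the script (third file below) with Python's format mini-language. A minimal sketch, for reference only: `"{:,}"` always emits commas, so a period-separated style such as `40.000` would need an explicit substitution.

```python
# Sketch of the thousands-separator formatting used by the updater script.
count = 5_500_000
print("{:,}".format(count))                    # 5,500,000  (what "{:,}" yields)
print("{:,}".format(count).replace(",", "."))  # 5.500.000  (period-separated variant)
```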
24 changes: 12 additions & 12 deletions docs/datasets/climate_model_data.md
@@ -21,7 +21,7 @@

- Location: IAC
- Size: 6 TB
- Number of files: 40000
- Number of files: 40.000
- Access: direct / rsync
- Status: frozen (2016-12)
- Resolution: native
@@ -30,7 +30,7 @@

- Location: IAC
- Size: 130 TB
- Number of files: 700000
- Number of files: 700.000
- Access: direct / rsync
- Status: monthly updated
- Resolution: native
@@ -47,7 +47,7 @@
```

- Size: 520 TB
- Number of Files: 5500000
- Number of Files: 5.500.000
- Access: direct / rsync
- Status: weekly updated
- Resolution: native
@@ -58,13 +58,13 @@
The datasets are checked, standardized, and regridded to common grid.

!!! note
Next Generation (ng) archives were maintained by Retos group (Jan Sedlacek/Lukas Brunner). Ruth Lorenz / C2SM took over cmip6-ng in 2022. Contact: [[email protected]](mailto:[email protected]). Documentation: [https://doi.org/10.5281/zenodo.373412 :material-open-in-new:](https://doi.org/10.5281/zenodo.3734128){:target="_blank"}.
Next Generation (ng) archives were maintained by Reto's group (Jan Sedlacek/Lukas Brunner). Ruth Lorenz / C2SM took over cmip6-ng in 2022. Contact: [[email protected]](mailto:[email protected]). Documentation: [https://doi.org/10.5281/zenodo.373412 :material-open-in-new:](https://doi.org/10.5281/zenodo.3734128){:target="_blank"}.

### CMIP3-ng

- Location: IAC
- Size: 0.5 TB
- Number of files: 5000
- Number of files: 5.000
- Access: direct / rsync
- Status: frozen (2019-03)
- Variables: n/a
@@ -74,7 +74,7 @@

- Location: IAC
- Size: 33 TB
- Number of files: 100000
- Number of files: 100.000
- Access: direct / rsync
- Status: frozen (2019-03)
- Variables: n/a
@@ -92,7 +92,7 @@
```

- Size: 175 TB
- Number of files: 500000
- Number of files: 500.000
- Access: direct / rsync
- Status: frozen (2019-03)
- Variables:
@@ -176,7 +176,7 @@
```

- Size: 370 TB
- Number of files: 800000
- Number of files: 800.000
- Access: direct / rsync
- Status: monthly updated
- Resolution: 0.44° and 0.11°
@@ -185,7 +185,7 @@

- Location: IAC, Euler
- Size: 23 TB
- Number of files: 100000
- Number of files: 100.000
- Access: direct
- Status: monthly updated
- Resolution: 0.11°
@@ -199,7 +199,7 @@

- Location: IAC, CSCS
- Size: 11 TB
- Number of files: 1800
- Number of files: 1.800
- Access: direct
- Status: frozen (2019-04)
- Variables:
@@ -258,7 +258,7 @@
```

- Size: 46 TB
- Number of files: 3500
- Number of files: 3.500
- Access: direct
- Status: ongoing
- Variables:
@@ -277,7 +277,7 @@
```

- Size: 30 TB
- Number of files: 22'949
- Number of files: 22.949
- Access: direct / rsync
- Status: ongoing
- Variables:
12 changes: 9 additions & 3 deletions docs/datasets/obs_reanalysis_data.md
@@ -157,6 +157,7 @@
```

- Size: 6 TB
- Number of files:
- Access: direct
- Status: updated
- Time period: 1985-2020
@@ -197,8 +198,8 @@
/net/atmos/data/cerra-land/processed/v1/
```

- Location: IAC: `/net/atmos/data/cerra-land/processed/v1/`
- Size: 1.8 TB
- Number of files:
- Access: direct
- Status: updated
- Time period: 1985-2020
@@ -228,11 +229,14 @@
/net/co2/c2sm-data/ch202X/Obs_Data/EOBS/0.1deg_reg_v26.0e/processed/
```

- Size: 50 GB
- Size: 141 GB
- Number of files: 12
- Access: direct
- Status: static
- Time period: 1971-2020
- Variables: `pr`, `tas`, `tasmax`, `tasmin`
- Variables:
- v23.1e: `CDD`, `TXx`, `pr`, `tas`, `tasmax`, `tasmin`
- v26.0e: `pr`, `tas`, `tasmax`, `tasmin`
- Temporal Resolution: daily
- Spatial Resolution: 0.11°
- More information: E-OBS (v23.1e and v26.0e) data with higher resolution MCH data over Switzerland (prepared for CH2025)
@@ -249,6 +253,7 @@
```

- Size: 500 GB
- Number of files:
- Access: direct
- Status: updated
- Time period: v1: 1940-present, v2: 1980-present
@@ -289,6 +294,7 @@
```

- Size: 2.2 TB
- Number of files:
- Access: direct
- Status: updated
- Time period: 1950-present
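
The blank `- Number of files:` lines added above are placeholders: `replace_entry` in the script below only rewrites an entry line that already exists under a dataset heading and otherwise reports "No changes applied", so the placeholder gives the updater a line to fill. A minimal sketch of that matching step, assuming a simplified section (the real function also tracks headings and indentation):

```python
# Hypothetical reduction of the entry-matching step in replace_entry().
lines = [
    "### CERRA-Land",      # assumed heading, for illustration
    "- Size: 1.8 TB",
    "- Number of files:",  # placeholder added by this commit
    "- Access: direct",
]
entry = "- Number of files:"
idx = next(i for i, line in enumerate(lines) if line.strip().startswith(entry))
lines[idx] = entry + " " + "{:,}".format(1234)  # made-up count
print("\n".join(lines))
```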
139 changes: 95 additions & 44 deletions scripts/update_datasets.py
@@ -2,6 +2,11 @@
import json
import yaml

# Globally define the list entry strings for the data to be replaced
ENTRY_VARIABLES = "- Variables:"
ENTRY_SIZE = "- Size:"
ENTRY_NUMBER = "- Number of files:"

def read_yaml(file_path):
with open(file_path, 'r') as file:
return yaml.safe_load(file)
@@ -18,7 +23,7 @@ def write_markdown(file_path, content):
with open(file_path, 'a') as file:
file.write(content)

def generate_markdown_default(json_data, dataset):
def generate_markdown_default(json_data, dataset, entry):
"""
Generates markdown content for climate datasets.
@@ -32,21 +37,33 @@ def generate_markdown_default(json_data, dataset):
Returns:
- str: A markdown-formatted string listing variables and their details.
"""
markdown_content = "- Variables: \n"
if entry == ENTRY_VARIABLES:
markdown_content = f"{entry} \n"
else:
markdown_content = f"{entry} "

first_iteration = True
for variable, resolutions in json_data['data'].items():
if dataset == 'cmip6-ng':
# Logic for CMIP6-ng datasets
markdown_content += generate_variable_info_cmip6ng(variable, resolutions, first_iteration)
else:
# Original logic for datasets with an additional scenario level
markdown_content += generate_variable_info_default(variable, resolutions, first_iteration)
first_iteration = False
if entry == ENTRY_VARIABLES:
first_iteration = True
for variable, resolutions in json_data['data'].items():
if dataset == 'cmip6-ng':
# Logic for CMIP6-ng datasets
markdown_content += generate_variable_info_cmip6ng(variable, resolutions, first_iteration)
else:
# Original logic for datasets with an additional scenario level
markdown_content += generate_variable_info_default(variable, resolutions, first_iteration)
first_iteration = False
elif entry == ENTRY_SIZE:
markdown_content += json_data['total_size']
markdown_content += f" :material-information-outline:{{ title=\"last updated: {json_data['last_updated']}\" }}"
elif entry == ENTRY_NUMBER:
markdown_content += "{:,}".format(json_data['file_count'])
markdown_content += f" :material-information-outline:{{ title=\"last updated: {json_data['last_updated']}\" }}"
else:
raise ValueError(f"Invalid entry type: {entry}")

return markdown_content
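
# Example (illustration only, not part of this commit): with a hypothetical
# payload {"file_count": 40000, "total_size": "6 TB", "last_updated": "2024-07-04"},
# generate_markdown_default(payload, "cmip5", ENTRY_NUMBER) returns
# '- Number of files: 40,000 :material-information-outline:{ title="last updated: 2024-07-04" }'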

def generate_markdown_cordex(json_data):
def generate_markdown_cordex(json_data, entry):
"""
Generates markdown content for CORDEX(-ReKLiEs) datasets.
@@ -60,23 +77,33 @@ def generate_markdown_cordex(json_data):
Returns:
- str: A markdown-formatted string listing variables along with their scenarios and temporal resolutions.
"""
markdown_content = "- Variables: \n"

variable_details = {}
for scenario, temporal_resolutions in json_data['data'].items():
for temporal_resolution, variables in temporal_resolutions.items():
for variable, details in variables.items():
if variable not in variable_details:
variable_details[variable] = {}
if scenario not in variable_details[variable]:
variable_details[variable][scenario] = set()
# Add resolution to the set of resolutions for this scenario
variable_details[variable][scenario].add(temporal_resolution)

first_iteration = True
for variable, scenarios_resolutions in variable_details.items():
markdown_content += generate_variable_info_cordex(variable, scenarios_resolutions, first_iteration)
first_iteration = False
if entry == ENTRY_VARIABLES:
markdown_content = f"{entry} \n"
variable_details = {}
for scenario, temporal_resolutions in json_data['data'].items():
for temporal_resolution, variables in temporal_resolutions.items():
for variable, details in variables.items():
if variable not in variable_details:
variable_details[variable] = {}
if scenario not in variable_details[variable]:
variable_details[variable][scenario] = set()
# Add resolution to the set of resolutions for this scenario
variable_details[variable][scenario].add(temporal_resolution)

first_iteration = True
for variable, scenarios_resolutions in variable_details.items():
markdown_content += generate_variable_info_cordex(variable, scenarios_resolutions, first_iteration)
first_iteration = False
elif entry == ENTRY_SIZE:
markdown_content = f"{entry} "
markdown_content += json_data['total_size']
markdown_content += f" :material-information-outline:{{ title=\"last updated: {json_data['last_updated']}\" }}"
elif entry == ENTRY_NUMBER:
markdown_content = f"{entry} "
markdown_content += "{:,}".format(json_data['file_count'])
markdown_content += f" :material-information-outline:{{ title=\"last updated: {json_data['last_updated']}\" }}"
else:
raise ValueError(f"Invalid entry type: {entry}")

return markdown_content
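
# Note (illustration only, not part of this commit): per the loop above, the
# CORDEX 'data' block nests scenario -> temporal resolution -> variable, e.g.
# {"data": {"rcp85": {"day": {"tas": {}, "pr": {}}}}} with made-up keys; the
# ENTRY_SIZE and ENTRY_NUMBER branches ignore 'data' and use only 'total_size',
# 'file_count', and 'last_updated' from the merged meta_<dataset>.json.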

@@ -174,26 +201,26 @@ def generate_variable_info_default(variable, resolutions, first_iteration):
variable_info += "\" }"
return variable_info

def replace_variables_section(file_path, heading_dataset, new_variables_content):
def replace_entry(file_path, heading_dataset, new_variables_content, entry):
"""
Replaces the variables section for a specified dataset within a markdown file.
Replaces a specified list entry for a specified dataset within a markdown file.
This function searches for a specific dataset heading within a markdown file and replaces
the content of the variables section that follows this heading with new content. It handles
the content of the specified entry that follows this heading with new content. It handles
skipping unrelated sections and ensures that the replacement is done at the correct indentation level.
Parameters:
- file_path (str): The path to the markdown file to be modified.
- heading_dataset (str): The heading of the dataset section where variables need to be replaced.
This should match the markdown heading format, e.g., "### DatasetName".
- new_variables_content (str): The new content to replace the existing variables section with.
- new_variables_content (str): The new content to replace the existing list entry with.
This content should be a string formatted according to markdown syntax.
Note:
- The function assumes that the variables section starts with a line "- Variables:" at the same
- The function assumes that the variables section starts with a line `entry` at the same
indentation level as the dataset heading.
- It also assumes that a new section starts with a heading at the same indentation level as the
"- Variables:" line or with a higher-level heading (e.g., "##").
`entry` line or with a higher-level heading (e.g., "##").
"""
with open(file_path, 'r') as file:
content = file.readlines()
@@ -211,7 +238,7 @@ def replace_variables_section(file_path, heading_dataset, new_variables_content)
# Check for an exact match with the heading, considering markdown syntax
if not skip_section and line.strip() == f"### {heading_dataset}":
heading_found = True
if heading_found and line.strip().startswith("- Variables:"):
if heading_found and line.strip().startswith(entry):
start_index = i
break

@@ -233,6 +260,11 @@ def replace_variables_section(file_path, heading_dataset, new_variables_content)
with open(file_path, 'w') as file:
file.writelines(content)

print(f"Updated '{entry}' entry in {heading_dataset} section in file {file_path}")
return True
else:
return False

def check_heading_exists(file_path, heading_dataset):
"""
Checks if a specified heading exists within a markdown file.
@@ -266,25 +298,44 @@ def main():

# Process data for each dataset and append markdown file
for dataset in datasets.keys():
# Load the JSON data for the dataset from S3 bucket and merge with metadata
dataset_json = download_json(f'https://zephyr-c2sm.s3.eu-central-1.amazonaws.com/file_tree_{dataset}_noindent.json')
dataset_json_meta = download_json(f'https://zephyr-c2sm.s3.eu-central-1.amazonaws.com/meta_{dataset}.json')
dataset_json = {**dataset_json, **dataset_json_meta}
heading_dataset = datasets[dataset]

print(heading_dataset)

# Generate the markdown content for the dataset
if dataset.startswith('cordex'):
dataset_markdown = generate_markdown_cordex(dataset_json)
markdown_variables = generate_markdown_cordex(dataset_json, ENTRY_VARIABLES)
markdown_size = generate_markdown_cordex(dataset_json, ENTRY_SIZE)
markdown_number = generate_markdown_cordex(dataset_json, ENTRY_NUMBER)
else:
dataset_markdown = generate_markdown_default(dataset_json, dataset)
markdown_variables = generate_markdown_default(dataset_json, dataset, ENTRY_VARIABLES)
markdown_size = generate_markdown_default(dataset_json, dataset, ENTRY_SIZE)
markdown_number = generate_markdown_default(dataset_json, dataset, ENTRY_NUMBER)

# Initialize flags to track if the variables, size, and number of files entries were updated
updated_variables = False
updated_size = False
updated_number = False

# Attempt to update each markdown file until the correct one is found and updated
updated = False
for markdown_file_path in markdown_files:
if check_heading_exists(markdown_file_path, heading_dataset):
replace_variables_section(markdown_file_path, heading_dataset, dataset_markdown)
print(f"Updated dataset section in: {markdown_file_path}")
updated = True
updated_variables = replace_entry(markdown_file_path, heading_dataset, markdown_variables, ENTRY_VARIABLES)
updated_size = replace_entry(markdown_file_path, heading_dataset, markdown_size, ENTRY_SIZE)
updated_number = replace_entry(markdown_file_path, heading_dataset, markdown_number, ENTRY_NUMBER)
break # Stop searching once the correct file is updated

if not updated:
print(f"Could not find heading in any markdown files for dataset: {dataset}")
# Print a message if no updates were made for the dataset
if not updated_variables:
print(f"Could not find '{ENTRY_VARIABLES}' list entry for heading {heading_dataset} in any markdown files. No changes applied.")
if not updated_size:
print(f"Could not find '{ENTRY_SIZE}' list entry for heading {heading_dataset} in any markdown files. No changes applied.")
if not updated_number:
print(f"Could not find '{ENTRY_NUMBER}' list entry for heading {heading_dataset} in any markdown files. No changes applied.")

if __name__ == "__main__":
main()
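
For a sense of the end-to-end flow, a hedged dry-run sketch for a single dataset, assuming the S3 JSON layout referenced in `main()`; it prints the regenerated entries instead of editing the markdown files (`fetch_json` stands in for the script's `download_json` helper, which is outside this diff):

```python
import json
from urllib.request import urlopen

BUCKET = "https://zephyr-c2sm.s3.eu-central-1.amazonaws.com"

def fetch_json(url):
    # Stand-in for the script's download_json() helper (not shown in this diff).
    with urlopen(url) as resp:
        return json.load(resp)

tree = fetch_json(f"{BUCKET}/file_tree_cmip6-ng_noindent.json")
meta = fetch_json(f"{BUCKET}/meta_cmip6-ng.json")
data = {**tree, **meta}  # main() merges the tree and metadata the same way

info = f' :material-information-outline:{{ title="last updated: {data["last_updated"]}" }}'
print("- Size: " + data["total_size"] + info)
print("- Number of files: " + "{:,}".format(data["file_count"]) + info)
```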
