datacommonsorg · swethammkumari · Sep 16, 2024 · Sep 16, 2024 · Sep 16, 2024 · Sep 18, 2024
diff --git a/scripts/world_bank/datasets/README.md b/scripts/world_bank/datasets/README.md
@@ -0,0 +1,66 @@
+# World Bank Datasets
+
+- source: https://data.worldbank.org
+
+- how to download data: Data is downloaded automatically by the Python script (`datasets.py`).
+
+- type of place: Country.
+
+- statvars: All types
+
+- years: 1960 to 2050
+
+- copyright year: 2024
+
+### How to run:
+Processes WB datasets.
+
+Update September 2024:
+To run all processing steps in sequence, do not pass the `--mode` flag.
+Run: python3 datasets.py
+
+Alternatively, to investigate an issue in an individual step, run that step on its own using one of the modes below:
+
+Supports the following tasks:
+
+============================
+
+fetch_datasets: Fetches WB dataset lists and resources and writes them to 'output/wb-datasets.csv'
+
+Run: python3 datasets.py --mode=fetch_datasets
+
+============================
+
+download_datasets: Downloads datasets listed in 'output/wb-datasets.csv' to the 'output/downloads' folder.
+
+Run: python3 datasets.py --mode=download_datasets
+
+============================
+
+write_wb_codes: Extracts World Bank indicator codes (and related information) from files downloaded in the 'output/downloads' folder to 'output/wb-codes.csv'.
+
+It only operates on files that are named '*_CSV.zip'.
+
+Run: python3 datasets.py --mode=write_wb_codes
+
+============================
+
+load_stat_vars: Loads stat vars from a mapping file specified via the `stat_vars_file` flag.
+
+Use this for debugging to ensure that the mappings load correctly and fix any errors logged by this operation.
+
+Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/statvars.csv
+
+See `sample-svs.csv` for a sample mappings file.
+
+============================
+
+write_observations: Extracts observations from files downloaded in the 'output/downloads' folder and saves them to CSVs in the 'output/observations' folder.
+
+The stat vars file to be used for mappings should be specified using the `stat_vars_file` flag.
+
+It only operates on files that are named '*_CSV.zip'.
+
+Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/statvars.csv
+
+
diff --git a/scripts/world_bank/datasets/datasets.py b/scripts/world_bank/datasets/datasets.py
@@ -13,6 +13,12 @@
 # limitations under the License.
 """Processes WB datasets.
 
+Update September 2024:
+To run all processing steps in sequence, do not pass the `--mode` flag.
+Run: python3 datasets.py
+
+Alternatively, to investigate an issue in an individual step, run that step on its own using one of the modes below:
+
 Supports the following tasks:
 
 ============================
@@ -41,7 +47,7 @@
 
 Use this for debugging to ensure that the mappings load correctly and fix any errors logged by this operation.
 
-Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/sv_mappings.csv
+Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/statvars.csv
 
 See `sample-svs.csv` for a sample mappings file.
 
@@ -53,7 +59,7 @@
 
 It only operates on files that are named '*_CSV.zip'.
 
-Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/sv_mappings.csv
+Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/statvars.csv
 """
 
 import requests
@@ -66,6 +72,7 @@
 import re
 import urllib3
 from urllib3.util.ssl_ import create_urllib3_context
+from urllib3.exceptions import HTTPError
 from absl import flags
 import zipfile
 import codecs
@@ -84,7 +91,7 @@ class Mode:
 
 
 flags.DEFINE_string(
- 'mode', Mode.WRITE_OBSERVATIONS,
+ 'mode', None,
  f"Specify one of the following modes: {Mode.FETCH_DATASETS}, {Mode.DOWNLOAD_DATASETS}, {Mode.WRITE_WB_CODES}, {Mode.LOAD_STAT_VARS}, {Mode.WRITE_OBSERVATIONS}"
 )
 
@@ -131,7 +138,7 @@ class Mode:
 
 def download_datasets():
  '''Downloads dataset files. This is a very expensive operation so run it with care. It assumes that the datasets CSV is already available.'''
-
+ logging.info('start download_datasets')
  with open(DATASETS_CSV_FILE_PATH, 'r') as f:
  csv_rows = list(csv.DictReader(f))
  download_urls = []
@@ -158,10 +165,13 @@ def download(url):
  # response = requests.get(url)
  # Using urllib3 for downloading content to avoid SSL issue.
  # See: https://github.com/urllib3/urllib3/issues/2653#issuecomment-1165418616
- with urllib3.PoolManager(ssl_context=ctx) as http:
- response = http.request("GET", url)
- with open(file_path, 'wb') as f:
- f.write(response.data)
+ with urllib3.PoolManager(ssl_context=ctx,timeout=90) as http:
+ try:
+ response = http.request("GET", url)
+ with open(file_path, 'wb') as f:
+ f.write(response.data)
+ except HTTPError as e:
+ print(f"HTTP error encountered: {e}")
  except Exception as e:
  logging.error("Error downloading %s", url, exc_info=e)
 
@@ -277,11 +287,15 @@ def load_json(url, params, response_file):
  return json.load(f)
 
  logging.info("Fetching url %s, params %s", url, params)
- response = requests.get(url, params=params).json()
- with open(response_file, 'w') as f:
- logging.info('Writing response to file %s', response_file)
- json.dump(response, f, indent=2)
- return response
+ try:
+ response = requests.get(url, params=params).json()
+ with open(response_file, 'w') as f:
+ logging.info('Writing response to file %s', response_file)
+ json.dump(response, f, indent=2)
+ return response
+ except Exception as e:
+ print(f"Http error {e}")
+ return None
 
 
 def load_json_file(json_file):
@@ -571,19 +585,27 @@ def get_data_and_series_file_names(zip):
 
 
 def main(_):
-    match FLAGS.mode:
-        case Mode.FETCH_DATASETS:
-            download_datasets()
-        case Mode.DOWNLOAD_DATASETS:
-            fetch_and_write_datasets_csv()
-        case Mode.WRITE_WB_CODES:
-            write_wb_codes()
-        case Mode.LOAD_STAT_VARS:
-            load_stat_vars(FLAGS.stat_vars_file)
-        case Mode.WRITE_OBSERVATIONS:
-            write_all_observations(FLAGS.stat_vars_file)
-        case _:
-            logging.error('No mode specified.')
+    logging.info(FLAGS.mode)  # Entry point: runs every step when --mode is unset, else only the selected step.
+    if not FLAGS.mode:  # No mode given: run the whole pipeline, in dependency order.
+        fetch_and_write_datasets_csv()
+        download_datasets()
+        write_wb_codes()
+        load_stat_vars(FLAGS.stat_vars_file)
+        write_all_observations(FLAGS.stat_vars_file)
+    else:
+        match FLAGS.mode:
+            case Mode.FETCH_DATASETS:  # fetch_datasets writes the datasets CSV (was wired to the download step).
+                fetch_and_write_datasets_csv()
+            case Mode.DOWNLOAD_DATASETS:  # download_datasets downloads the files listed in that CSV.
+                download_datasets()
+            case Mode.WRITE_WB_CODES:
+                write_wb_codes()
+            case Mode.LOAD_STAT_VARS:
+                load_stat_vars(FLAGS.stat_vars_file)
+            case Mode.WRITE_OBSERVATIONS:
+                write_all_observations(FLAGS.stat_vars_file)
+            case _:  # A mode was passed but matches none of the known modes.
+                logging.error('Unknown mode: %s', FLAGS.mode)
 
 
 if __name__ == '__main__':

diff --git a/scripts/world_bank/datasets/manifest.json b/scripts/world_bank/datasets/manifest.json
@@ -0,0 +1,94 @@
+{
+ "import_specifications": [
+ {
+ "import_name": "WorldBankDatasets",
+ "curator_emails": ["[email protected]"],
+ "provenance_url": "https://data.worldbank.org",
+ "provenance_description": "World Bank databases are essential tools for supporting critical management decisions and providing key statistical information for Bank operational activities.",
+ "scripts": ["datasets.py"],
+ "import_inputs": [
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/ASPIRE_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/EdStats_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/FINDEX_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/GFDD_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/GPFI_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/HCI_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/IDA_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/Jobs_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/MDG_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/PovStats_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/SDG_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/SE4ALL_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/Subnational-Population_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/Subnational-Poverty_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/WGI_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/BBSC_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/DB_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/Economic_Fitness_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/HEFPI_CSV_obs.csv"
+ },
+ {
+ "template_mcf": "wb.tmcf",
+ "cleaned_csv": "output/observations/WWBI_CSV_obs.csv"
+ }
+ ],
+ "cron_schedule": "5 3 15 * *"
+ }
+ ]
+}