diff --git a/scripts/world_bank/datasets/README.md b/scripts/world_bank/datasets/README.md
new file mode 100644
index 0000000000..bbf284cce2
--- /dev/null
+++ b/scripts/world_bank/datasets/README.md
@@ -0,0 +1,66 @@
+# World Bank Datasets
+
+- source: https://data.worldbank.org
+
+- how to download data: Data is downloaded automatically by the Python script (datasets.py).
+
+- type of place: Country.
+
+- statvars: All types
+
+- years: 1960 to 2050
+
+- copyright year: 2024
+
+### How to run:
+"""Processes WB datasets.
+
+Update September 2024:
+To run all processing steps in sequence, do not pass the --mode flag.
+Run: python3 datasets.py
+
+If you need to debug an individual step, run the modes below one at a time.
+
+Supports the following tasks:
+
+============================
+
+fetch_datasets: Fetches WB dataset lists and resources and writes them to 'output/wb-datasets.csv'.
+
+Run: python3 datasets.py --mode=fetch_datasets
+
+============================
+
+download_datasets: Downloads the datasets listed in 'output/wb-datasets.csv' to the 'output/downloads' folder.
+
+Run: python3 datasets.py --mode=download_datasets
+
+============================
+
+write_wb_codes: Extracts World Bank indicator codes (and related information) from files downloaded in the 'output/downloads' folder to 'output/wb-codes.csv'.
+
+It only operates on files that are named '*_CSV.zip'.
+
+Run: python3 datasets.py --mode=write_wb_codes
+
+============================
+
+load_stat_vars: Loads stat vars from a mapping file specified via the `stat_vars_file` flag.
+
+Use this for debugging to ensure that the mappings load correctly, and fix any errors logged by this operation.
+
+Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/statvars.csv
+
+See `sample-svs.csv` for a sample mappings file.
+
+============================
+
+write_observations: Extracts observations from files downloaded in the 'output/downloads' folder and saves them to CSVs in the 'output/observations' folder.
+
+The stat vars file to be used for mappings should be specified using the `stat_vars_file` flag.
+
+It only operates on files that are named '*_CSV.zip'.
+
+Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/statvars.csv
+"""
+
diff --git a/scripts/world_bank/datasets/datasets.py b/scripts/world_bank/datasets/datasets.py
index fe9d33563d..5a2078b08f 100644
--- a/scripts/world_bank/datasets/datasets.py
+++ b/scripts/world_bank/datasets/datasets.py
@@ -13,6 +13,12 @@
 # limitations under the License.
 """Processes WB datasets.
 
+Update September 2024:
+To run all processing steps in sequence, do not pass the --mode flag.
+Run: python3 datasets.py
+
+If you need to debug an individual step, run the modes below one at a time.
+
 Supports the following tasks:
 
 ============================
@@ -41,7 +47,7 @@
 Use this for debugging to ensure that the mappings load correctly and fix any
 errors logged by this operation.
 
-Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/sv_mappings.csv
+Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/statvars.csv
 
 See `sample-svs.csv` for a sample mappings file.
 
@@ -53,7 +59,7 @@
 
 It only operates on files that are named '*_CSV.zip'.
 
-Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/sv_mappings.csv
+Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/statvars.csv
 """
 
 import requests
@@ -66,6 +72,7 @@
 import re
 import urllib3
 from urllib3.util.ssl_ import create_urllib3_context
+from urllib3.exceptions import HTTPError
 from absl import flags
 import zipfile
 import codecs
@@ -84,7 +91,7 @@ class Mode:
 
 flags.DEFINE_string(
-    'mode', Mode.WRITE_OBSERVATIONS,
+    'mode', None,
     f"Specify one of the following modes: {Mode.FETCH_DATASETS}, {Mode.DOWNLOAD_DATASETS}, {Mode.WRITE_WB_CODES}, {Mode.LOAD_STAT_VARS}, {Mode.WRITE_OBSERVATIONS}"
 )
@@ -131,7 +138,7 @@ class Mode:
 def download_datasets():
     '''Downloads dataset files. This is a very expensive operation so run it
     with care. It assumes that the datasets CSV is already available.'''
-
+    logging.info('start download_datasets')
     with open(DATASETS_CSV_FILE_PATH, 'r') as f:
         csv_rows = list(csv.DictReader(f))
     download_urls = []
@@ -158,10 +165,13 @@ def download(url):
         # response = requests.get(url)
         # Using urllib3 for downloading content to avoid SSL issue.
         # See: https://github.com/urllib3/urllib3/issues/2653#issuecomment-1165418616
-        with urllib3.PoolManager(ssl_context=ctx) as http:
-            response = http.request("GET", url)
-            with open(file_path, 'wb') as f:
-                f.write(response.data)
+        with urllib3.PoolManager(ssl_context=ctx, timeout=90) as http:
+            try:
+                response = http.request("GET", url)
+                with open(file_path, 'wb') as f:
+                    f.write(response.data)
+            except HTTPError as e:
+                logging.error("HTTP error downloading %s: %s", url, e)
     except Exception as e:
         logging.error("Error downloading %s", url, exc_info=e)
@@ -277,11 +287,15 @@ def load_json(url, params, response_file):
             return json.load(f)
 
     logging.info("Fetching url %s, params %s", url, params)
-    response = requests.get(url, params=params).json()
-    with open(response_file, 'w') as f:
-        logging.info('Writing response to file %s', response_file)
-        json.dump(response, f, indent=2)
-    return response
+    try:
+        response = requests.get(url, params=params).json()
+        with open(response_file, 'w') as f:
+            logging.info('Writing response to file %s', response_file)
+            json.dump(response, f, indent=2)
+        return response
+    except Exception as e:
+        logging.error("Error fetching %s: %s", url, e)
+        return None
 
 
 def load_json_file(json_file):
@@ -571,19 +585,27 @@ def get_data_and_series_file_names(zip):
 def main(_):
-    match FLAGS.mode:
-        case Mode.FETCH_DATASETS:
-            download_datasets()
-        case Mode.DOWNLOAD_DATASETS:
-            fetch_and_write_datasets_csv()
-        case Mode.WRITE_WB_CODES:
-            write_wb_codes()
-        case Mode.LOAD_STAT_VARS:
-            load_stat_vars(FLAGS.stat_vars_file)
-        case Mode.WRITE_OBSERVATIONS:
-            write_all_observations(FLAGS.stat_vars_file)
-        case _:
-            logging.error('No mode specified.')
+    logging.info('Running with mode: %s', FLAGS.mode)
+    if not FLAGS.mode:
+        fetch_and_write_datasets_csv()
+        download_datasets()
+        write_wb_codes()
+        load_stat_vars(FLAGS.stat_vars_file)
+        write_all_observations(FLAGS.stat_vars_file)
+    else:
+        match FLAGS.mode:
+            case Mode.FETCH_DATASETS:
+                download_datasets()
+            case Mode.DOWNLOAD_DATASETS:
+                fetch_and_write_datasets_csv()
+            case Mode.WRITE_WB_CODES:
+                write_wb_codes()
+            case Mode.LOAD_STAT_VARS:
+                load_stat_vars(FLAGS.stat_vars_file)
+            case Mode.WRITE_OBSERVATIONS:
+                write_all_observations(FLAGS.stat_vars_file)
+            case _:
+                logging.error('Invalid mode specified: %s', FLAGS.mode)
 
 
 if __name__ == '__main__':
diff --git a/scripts/world_bank/datasets/manifest.json b/scripts/world_bank/datasets/manifest.json
new file mode 100644
index 0000000000..56c3244665
--- /dev/null
+++ b/scripts/world_bank/datasets/manifest.json
@@ -0,0 +1,94 @@
+{
+  "import_specifications": [
+    {
+      "import_name": "WorldBankDatasets",
+      "curator_emails": ["swethakumari@google.com"],
+      "provenance_url": "https://data.worldbank.org",
+      "provenance_description": "World Bank databases are essential tools for supporting critical management decisions and providing key statistical information for Bank operational activities.",
+      "scripts": ["datasets.py"],
+      "import_inputs": [
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/ASPIRE_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/EdStats_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/FINDEX_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/GFDD_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/GPFI_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/HCI_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/IDA_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Jobs_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/MDG_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/PovStats_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/SDG_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/SE4ALL_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Subnational-Population_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Subnational-Poverty_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/WGI_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/BBSC_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/DB_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Economic_Fitness_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/HEFPI_CSV_obs.csv"
+        },
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/WWBI_CSV_obs.csv"
+        }
+      ],
+      "cron_schedule": "5 3 15 * *"
+    }
+  ]
+}
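
Note: the download() change above uses an SSL context (`ctx`) that is built outside the hunk shown. Below is a minimal, self-contained sketch of the same urllib3 download pattern for reference, not part of the change itself. The `fetch_to_file` helper name and the `ctx.options |= 0x4` legacy-renegotiation workaround are assumptions based on the urllib3 issue referenced in the code comment.

import urllib3
from urllib3.exceptions import HTTPError
from urllib3.util.ssl_ import create_urllib3_context


def fetch_to_file(url: str, file_path: str) -> None:
    # Assumed context setup: allow legacy TLS renegotiation (0x4), per
    # https://github.com/urllib3/urllib3/issues/2653#issuecomment-1165418616.
    ctx = create_urllib3_context()
    ctx.options |= 0x4
    # 90-second timeout and HTTPError handling mirror the change in download().
    with urllib3.PoolManager(ssl_context=ctx, timeout=90) as http:
        try:
            response = http.request("GET", url)
            with open(file_path, "wb") as f:
                f.write(response.data)
        except HTTPError as e:
            print(f"HTTP error encountered: {e}")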