datacommonsorg · shapateriya · Oct 17, 2024 · Oct 17, 2024 · beets · Oct 17, 2024
diff --git a/...t/regional_statistics_by_nuts/birth_death_migration/EurostatNUTS3_BirthDeathMigration.csv b/...t/regional_statistics_by_nuts/birth_death_migration/EurostatNUTS3_BirthDeathMigration.csv
diff --git a/.../regional_statistics_by_nuts/birth_death_migration/EurostatNUTS3_BirthDeathMigration.tmcf b/.../regional_statistics_by_nuts/birth_death_migration/EurostatNUTS3_BirthDeathMigration.tmcf
@@ -5,45 +5,52 @@ variableMeasured: dcs:Count_Death
 observationAbout: C:EurostatNUTS3_BirthDeathMigration->geo
 observationDate: C:EurostatNUTS3_BirthDeathMigration->time
 value: C:EurostatNUTS3_BirthDeathMigration->Count_Death
+measurementMethod: dcs:EurostatRegionalStatistics
 
 Node: E:EurostatNUTS3_BirthDeathMigration->E1
 typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Count_BirthEvent_AsAFractionOf_Count_Person
+variableMeasured: dcs:Count_BirthEvent_AsAFractionOfCount_Person
 observationAbout: C:EurostatNUTS3_BirthDeathMigration->geo
 observationDate: C:EurostatNUTS3_BirthDeathMigration->time
-value: C:EurostatNUTS3_BirthDeathMigration->Count_BirthEvent_AsAFractionOf_Count_Person
+value: C:EurostatNUTS3_BirthDeathMigration->Count_BirthEvent_AsAFractionOfCount_Person
+measurementMethod: dcs:EurostatRegionalStatistics
 
 Node: E:EurostatNUTS3_BirthDeathMigration->E2
 typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Count_Death_AsAFractionOf_Count_Person
+variableMeasured: dcs:Count_Death_AsAFractionOfCount_Person
 observationAbout: C:EurostatNUTS3_BirthDeathMigration->geo
 observationDate: C:EurostatNUTS3_BirthDeathMigration->time
-value: C:EurostatNUTS3_BirthDeathMigration->Count_Death_AsAFractionOf_Count_Person
+value: C:EurostatNUTS3_BirthDeathMigration->Count_Death_AsAFractionOfCount_Person
+measurementMethod: dcs:EurostatRegionalStatistics
 
 Node: E:EurostatNUTS3_BirthDeathMigration->E3
 typeOf: dcs:StatVarObservation
 variableMeasured: dcs:IncrementalCount_Person
 observationAbout: C:EurostatNUTS3_BirthDeathMigration->geo
 observationDate: C:EurostatNUTS3_BirthDeathMigration->time
 value: C:EurostatNUTS3_BirthDeathMigration->IncrementalCount_Person
+measurementMethod: dcs:EurostatRegionalStatistics
 
 Node: E:EurostatNUTS3_BirthDeathMigration->E4
 typeOf: dcs:StatVarObservation
 variableMeasured: dcs:GrowthRate_Count_Person
 observationAbout: C:EurostatNUTS3_BirthDeathMigration->geo
 observationDate: C:EurostatNUTS3_BirthDeathMigration->time
 value: C:EurostatNUTS3_BirthDeathMigration->GrowthRate_Count_Person
+measurementMethod: dcs:EurostatRegionalStatistics
 
 Node: E:EurostatNUTS3_BirthDeathMigration->E5
 typeOf: dcs:StatVarObservation
 variableMeasured: dcs:Count_Person
 observationAbout: C:EurostatNUTS3_BirthDeathMigration->geo
 observationDate: C:EurostatNUTS3_BirthDeathMigration->time
 value: C:EurostatNUTS3_BirthDeathMigration->Count_Person
+measurementMethod: dcs:EurostatRegionalStatistics
 
 Node: E:EurostatNUTS3_BirthDeathMigration->E6
 typeOf: dcs:StatVarObservation
 variableMeasured: dcs:Count_BirthEvent
 observationAbout: C:EurostatNUTS3_BirthDeathMigration->geo
 observationDate: C:EurostatNUTS3_BirthDeathMigration->time
 value: C:EurostatNUTS3_BirthDeathMigration->Count_BirthEvent
+measurementMethod: dcs:EurostatRegionalStatistics
diff --git a/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py b/scripts/eurostat/regional_statistics_by_nuts/birth_death_migration/import_data.py
@@ -13,13 +13,20 @@
 # limitations under the License.
 
 import pandas as pd
-
+from six.moves import urllib
+import sys
+sys.path.insert(1, '../../../../util')
+from alpha2_to_dcid import COUNTRY_MAP
+from nuts_codes_names import NUTS1_CODES_NAMES
 
 def download_data(download_link):
  """Downloads raw data from Eurostat website and stores it in instance
  data frame.
  """
- raw_df = pd.read_table(download_link)
+ urllib.request.urlretrieve(download_link, "demo_r_gind3.tsv.gz")
+ raw_df = pd.read_table("demo_r_gind3.tsv.gz")
+ raw_df = raw_df.rename(columns={'freq,indic_de,geo\\TIME_PERIOD': 'indic_de,geo\\time'})  # escape the backslash: '\T' is an invalid escape sequence
+ raw_df['indic_de,geo\\time'] = raw_df['indic_de,geo\\time'].str.slice(2)  # drop the first two characters of each key
  return raw_df
 
 
@@ -73,8 +80,7 @@ def preprocess_data(raw_df):
  # '\time' labels the other columns so it is confusing.
 
  # Append extra space for all cells in value column that do not come with a note, so that we can split them without error.
- preprocessed_df.value = preprocessed_df.value.str.replace(
- "([0-9:])$", lambda m: m.group(0) + ' ')
+ preprocessed_df.value = preprocessed_df.value.str.replace("([0-9:])$", lambda m: m.group(0) + ' ',regex=True)
 
  first_column_list = preprocessed_df.columns[0].rsplit(sep=",", maxsplit=1)
 
@@ -88,15 +94,12 @@ def preprocess_data(raw_df):
  assert geo == "geo", "Column header should end with 'geo'."
 
  if statistical_variable:
- split_df = preprocessed_df[preprocessed_df.columns[0]].str.rsplit(
- ",", n=1, expand=True)
+ split_df = preprocessed_df[preprocessed_df.columns[0]].str.rsplit(",", n=1, expand=True)
  preprocessed_df['statistical_variable'] = split_df[0]
  preprocessed_df['geo'] = split_df[1]
  preprocessed_df.drop(columns=[preprocessed_df.columns[0]], inplace=True)
 
- preprocessed_df = (preprocessed_df.set_index(["geo", "time"]).pivot(
- columns="statistical_variable")['value'].reset_index().rename_axis(
- None, axis=1))
+ preprocessed_df = (preprocessed_df.set_index(["geo", "time"]).pivot(columns="statistical_variable")['value'].reset_index().rename_axis(None, axis=1))
  # Fill missing 'geo' values with a colon.
  preprocessed_df.fillna(': ', inplace=True)
 
@@ -126,8 +129,15 @@ def clean_data(preprocessed_df, output_path):
 
  # replace colon with NaN.
  clean_df = clean_df.replace(':', '')
-
- clean_df['geo'] = 'dcid:nuts/' + clean_df['geo']
+ # for ind, geo in enumerate(clean_df['geo']):
+ # # Convert geo IDS to geo codes, e.g., "country/SHN" or "nuts/AT342".
+ # if any(char.isdigit() for char in geo) or ('nuts/' + geo in NUTS1_CODES_NAMES):
+ # clean_df['geo'][ind] = 'nuts/' + geo
+ # else:
+ # clean_df['geo'][ind] = COUNTRY_MAP.get(geo, '~' + geo + '~')
+ clean_df['geo'] = clean_df['geo'].apply(lambda geo: f'nuts/{geo}' if any(geo.isdigit() for geo in geo) or ('nuts/' + geo in NUTS1_CODES_NAMES) else COUNTRY_MAP.get(geo, f'{geo}'))
+
+
  # trim the space in the time column i.e. '2020 ' -> '2020'
  clean_df['time'] = clean_df['time'].astype('int32')
  original_names = [
@@ -136,8 +146,8 @@ def clean_data(preprocessed_df, output_path):
  ]
  new_names = [
  'geo', 'time', 'Count_Death',
- 'Count_BirthEvent_AsAFractionOf_Count_Person',
- 'Count_Death_AsAFractionOf_Count_Person', 'IncrementalCount_Person',
+ 'Count_BirthEvent_AsAFractionOfCount_Person',
+ 'Count_Death_AsAFractionOfCount_Person', 'IncrementalCount_Person',
  'GrowthRate_Count_Person', 'Count_Person', 'Count_BirthEvent'
  ]
  clean_df = clean_df[original_names]
@@ -151,7 +161,7 @@ def clean_data(preprocessed_df, output_path):
 
 
 if __name__ == '__main__':
- download_link = "https://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/demo_r_gind3.tsv.gz"
+ download_link = "https://ec.europa.eu/eurostat/api/dissemination/sdmx/2.1/data/demo_r_gind3/?format=TSV&compressed=true"
  output_path = 'EurostatNUTS3_BirthDeathMigration.csv'
  raw_df = download_data(download_link)
  preprocessed_df = preprocess_data(raw_df)