[DO NOT SUBMIT] Obsolete obs props implementation. #339

Open · wants to merge 1 commit into master
1 change: 1 addition & 0 deletions simple/requirements.txt
@@ -2,6 +2,7 @@ absl-py==1.4.0
certifi==2023.7.22
charset-normalizer==3.2.0
cloud-sql-python-connector==1.4.3
dataclasses-json==0.6.7
freezegun==1.2.2
google-cloud-storage==2.11.0
httpx==0.26.0
10 changes: 9 additions & 1 deletion simple/stats/config.py
@@ -20,9 +20,11 @@
from stats.data import EventType
from stats.data import ImportType
from stats.data import InputFileFormat
from stats.data import ObservationProperties
from stats.data import Provenance
from stats.data import Source
from stats.data import StatVar
from stats.data import to_observation_properties

_INPUT_FILES_FIELD = "inputFiles"
_IMPORT_TYPE_FIELD = "importType"
@@ -54,6 +56,7 @@
_ENTITIES_FIELD = "entities"
_GROUP_STAT_VARS_BY_PROPERTY = "groupStatVarsByProperty"
_GENERATE_TOPICS = "generateTopics"
_OBSERVATION_PROPERTIES = "observationProperties"


class Config:
@@ -181,6 +184,11 @@ def special_files(self) -> dict[str, str]:
def generate_topics(self) -> bool:
return self.data.get(_GENERATE_TOPICS) or False

def observation_properties(self,
input_file_name: str) -> ObservationProperties:
return to_observation_properties(
self._input_file(input_file_name).get(_OBSERVATION_PROPERTIES, {}))

def _input_file(self, input_file_name: str) -> dict:
# Exact match.
input_file_config = self._input_files_config.get(input_file_name, {})
@@ -204,7 +212,7 @@ def _input_file_name_match(self, input_file_name: str) -> str | None:
return None

def _input_file_pattern_to_regex(self, input_file_pattern: str) -> str:
"""
r"""
Transforms a string of the form "a*b.c" to the regex "a.*b\.c".
"""
return input_file_pattern.replace(".", r"\.").replace("*", ".*")
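Not part of the diff: a minimal sketch of how the new `observationProperties` block is meant to be consumed through `Config.observation_properties`. The file name and property values below are made up for illustration; the fallback-to-empty behavior mirrors the new `config_test` further down, and the import paths assume the usual `simple/stats` layout.

```python
from stats.config import Config
from stats.data import ObservationProperties

# Hypothetical per-file config; the observationProperties keys mirror the
# fields of the new ObservationProperties dataclass.
config = Config({
    "inputFiles": {
        "gdp.csv": {
            "observationProperties": {
                "unit": "USD",
                "observationPeriod": "P1Y",
            }
        }
    }
})

props = config.observation_properties("gdp.csv")
assert props == ObservationProperties(unit="USD", observation_period="P1Y")

# Files without an observationProperties block fall back to empty defaults,
# as asserted in the new config_test.
assert Config({}).observation_properties("other.csv") == ObservationProperties()
```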
4 changes: 4 additions & 0 deletions simple/stats/constants.py
@@ -72,6 +72,10 @@
COLUMN_VALUE = "value"
COLUMN_PROVENANCE = "provenance"
COLUMN_ENTITY = "entity"
COLUMN_UNIT = "unit"
COLUMN_SCALING_FACTOR = "scaling_factor"
COLUMN_MEASUREMENT_METHOD = "measurement_method"
COLUMN_OBSERVATION_PERIOD = "observation_period"

# Debug CSV columns and values
DEBUG_COLUMN_INPUT = "input"
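For orientation, a sketch (not from the diff) of how the four new constants extend the observations output schema. The column order matches the header of the expected test CSV near the end of this PR; the values of `COLUMN_VARIABLE` and `COLUMN_DATE` are not shown in this hunk and are assumed to be "variable" and "date".

```python
from stats import constants

# Column names lining up with the expected observations CSV header below:
# entity,variable,date,value,provenance,unit,scaling_factor,measurement_method,observation_period
OBSERVATION_COLUMNS = [
    constants.COLUMN_ENTITY,              # "entity"
    constants.COLUMN_VARIABLE,            # assumed "variable"
    constants.COLUMN_DATE,                # assumed "date"
    constants.COLUMN_VALUE,               # "value"
    constants.COLUMN_PROVENANCE,          # "provenance"
    constants.COLUMN_UNIT,                # new: "unit"
    constants.COLUMN_SCALING_FACTOR,      # new: "scaling_factor"
    constants.COLUMN_MEASUREMENT_METHOD,  # new: "measurement_method"
    constants.COLUMN_OBSERVATION_PERIOD,  # new: "observation_period"
]
```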
19 changes: 19 additions & 0 deletions simple/stats/data.py
@@ -19,6 +19,8 @@
from typing import Self
from urllib.parse import urlparse

from dataclasses_json import dataclass_json
from dataclasses_json import LetterCase
from stats import schema_constants as sc

_PREDICATE_TYPE_OF = "typeOf"
@@ -225,6 +227,23 @@ class Observation:
date: str
value: str
provenance: str
unit: str = ""
scaling_factor: str = ""
measurement_method: str = ""
observation_period: str = ""


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class ObservationProperties:
unit: str = ""
scaling_factor: str = ""
measurement_method: str = ""
observation_period: str = ""


def to_observation_properties(config_data: dict) -> ObservationProperties:
return ObservationProperties.from_dict(config_data)


@dataclass
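A quick illustration (not part of the diff) of what `dataclass_json(letter_case=LetterCase.CAMEL)` provides here: `from_dict` accepts the camelCase keys used in the JSON config and maps them onto the snake_case dataclass fields, while missing keys keep their empty-string defaults. The property values are illustrative only; the behavior matches the new `data_test` below.

```python
from stats.data import ObservationProperties, to_observation_properties

# camelCase config keys map onto snake_case dataclass fields.
props = to_observation_properties({
    "unit": "USD",                   # illustrative values only
    "measurementMethod": "Estimate",
})
assert props.unit == "USD"
assert props.measurement_method == "Estimate"

# Keys absent from the config keep their empty-string defaults.
assert props.scaling_factor == ""
assert props.observation_period == ""
```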
36 changes: 14 additions & 22 deletions simple/stats/observations_importer.py
@@ -26,12 +26,7 @@

from util import dc_client as dc

# Number of entity IDs that will be sampled to resolved their entity type, if one is not specified by the user.
# Note that the importer assumes that all entities in a given CSV are all of the same type.
_SAMPLE_ENTITY_RESOLUTION_SIZE = 5


# TODO: Add support for units.
class ObservationsImporter(Importer):
"""Imports a single observations input file.
"""
@@ -61,7 +56,6 @@ def do_import(self) -> None:
self._sanitize_values()
self._rename_columns()
self._resolve_entities()
self._add_provenance_column()
self._add_entity_nodes()
self._write_observations()
self.reporter.report_success()
@@ -115,31 +109,29 @@ def _write_observations(self) -> None:
# Convert all values to str first, otherwise it inserts ints as floats.
observations_df = self.df.astype(str)
observations_df = observations_df.melt(
id_vars=[
constants.COLUMN_DCID, constants.COLUMN_DATE,
constants.COLUMN_PROVENANCE
],
id_vars=[constants.COLUMN_DCID, constants.COLUMN_DATE],
var_name=constants.COLUMN_VARIABLE,
value_name=constants.COLUMN_VALUE,
)

# Reorder columns so they are in the same order as observations
observations_df = observations_df.reindex(columns=[
constants.COLUMN_DCID, constants.COLUMN_VARIABLE, constants.COLUMN_DATE,
constants.COLUMN_VALUE, constants.COLUMN_PROVENANCE
])
provenance = self.nodes.provenance(self.input_file_name).id
obs_props = self.config.observation_properties(self.input_file_name)

observations: list[Observation] = []
for row in observations_df.itertuples(index=False):
observation = Observation(*row)
for _, row in observations_df.iterrows():
observation = Observation(entity=row[constants.COLUMN_DCID],
variable=row[constants.COLUMN_VARIABLE],
date=row[constants.COLUMN_DATE],
value=row[constants.COLUMN_VALUE],
provenance=provenance,
unit=obs_props.unit,
scaling_factor=obs_props.scaling_factor,
measurement_method=obs_props.measurement_method,
observation_period=obs_props.observation_period)
if observation.value and observation.value != "<NA>":
observations.append(Observation(*row))
observations.append(observation)
self.db.insert_observations(observations, self.input_file_name)

def _add_provenance_column(self):
self.df[constants.COLUMN_PROVENANCE] = self.nodes.provenance(
self.input_file_name).id

def _add_entity_nodes(self) -> None:
# Convert entity dcids to dict.
# Using dict instead of set to maintain insertion order which keeps results consistent for tests.
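For context on the reshape in `_write_observations`, a standalone sketch with made-up values (column names written out literally rather than via `constants`): `melt` keeps the dcid and date columns as identifiers and stacks each variable column into (variable, value) rows, after which the per-file provenance and the configured observation properties are attached to every `Observation`.

```python
import pandas as pd

# Wide frame as it might look after entity resolution: one column per variable.
df = pd.DataFrame({
    "dcid": ["country/BRA", "country/JPN"],
    "date": ["2023", "2023"],
    "var1": [0.19, 0.21],
    "var2": [6, 56],
})

long_df = df.astype(str).melt(
    id_vars=["dcid", "date"],
    var_name="variable",
    value_name="value",
)
# long_df now has one row per (entity, variable) pair, roughly:
#   dcid=country/BRA, date=2023, variable=var1, value=0.19
#   dcid=country/JPN, date=2023, variable=var1, value=0.21
#   dcid=country/BRA, date=2023, variable=var2, value=6
#   dcid=country/JPN, date=2023, variable=var2, value=56
print(long_df)
```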
20 changes: 20 additions & 0 deletions simple/tests/stats/config_test.py
@@ -19,6 +19,7 @@
from stats.data import AggregationMethod
from stats.data import ImportType
from stats.data import InputFileFormat
from stats.data import ObservationProperties
from stats.data import Provenance
from stats.data import Source
from stats.data import StatVar
@@ -301,3 +302,22 @@ def test_entity_columns(self):

config = Config({"inputFiles": {"foo.csv": {}}})
self.assertListEqual(config.entity_columns("foo.csv"), [], "unspecified")

def test_observation_properties(self):
config = Config({})
self.assertEqual(config.observation_properties("foo.csv"),
ObservationProperties(), "empty")

config = Config({
"inputFiles": {
"foo.csv": {
"observationProperties": {
"unit": "foo",
"observationPeriod": "bar"
}
}
}
})
self.assertEqual(
config.observation_properties("foo.csv"),
ObservationProperties(unit="foo", observation_period="bar"))
8 changes: 8 additions & 0 deletions simple/tests/stats/data_test.py
@@ -16,9 +16,11 @@

from stats.data import Event
from stats.data import McfNode
from stats.data import ObservationProperties
from stats.data import Provenance
from stats.data import StatVar
from stats.data import StatVarGroup
from stats.data import to_observation_properties
from stats.data import Triple

SV_ID1 = "sv_id1"
@@ -193,3 +195,9 @@ def test_mcf_node(self):
memberOf: svg1""".strip()

self.assertEqual(node.to_mcf(), expected)

def test_observation_properties(self):
config = {"unit": "foo", "measurementMethod": "bar"}
obs_props = to_observation_properties(config)
self.assertEqual(
obs_props, ObservationProperties(unit="foo", measurement_method="bar"))
25 changes: 12 additions & 13 deletions simple/tests/stats/observations_importer_test.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import shutil
import tempfile
@@ -46,28 +47,26 @@ def _test_import(test: unittest.TestCase,
test.maxDiff = None

with tempfile.TemporaryDirectory() as temp_dir:
input_file = f"{test_name}.csv"
input_path = os.path.join(_INPUT_DIR, input_file)
input_dir = os.path.join(_INPUT_DIR, test_name)
expected_dir = os.path.join(_EXPECTED_DIR, test_name)

input_path = os.path.join(input_dir, "input.csv")
config_path = os.path.join(input_dir, "config.json")
db_path = os.path.join(temp_dir, f"{test_name}.db")

output_path = os.path.join(temp_dir, f"{test_name}.db.csv")
expected_path = os.path.join(_EXPECTED_DIR, f"{test_name}.db.csv")
output_path = os.path.join(temp_dir, "observations.db.csv")
expected_path = os.path.join(expected_dir, "observations.db.csv")

input_fh = LocalFileHandler(input_path)

with open(config_path) as config_file:
config = Config(json.load(config_file))
print("CONFIG", config.data)
db = create_db(create_sqlite_config(db_path))
debug_resolve_fh = LocalFileHandler(os.path.join(temp_dir, "debug.csv"))
report_fh = LocalFileHandler(os.path.join(temp_dir, "report.json"))
reporter = FileImportReporter(input_path, ImportReporter(report_fh))
nodes = Nodes(
Config({
"inputFiles": {
input_file: {
"entityType": entity_type,
"ignoreColumns": ignore_columns
}
}
}))
nodes = Nodes(Config(config))

dc_client.get_property_of_entities = MagicMock(return_value={})


This file was deleted.

@@ -0,0 +1,7 @@
entity,variable,date,value,provenance,unit,scaling_factor,measurement_method,observation_period
country/BRA,var1,2023,0.19,c/p/default,,,,
country/JPN,var1,2023,0.21,c/p/default,,,,
country/CHN,var1,2022,-123.456,c/p/default,,,,
country/BRA,var2,2023,6,c/p/default,,,,
country/JPN,var2,2023,56,c/p/default,,,,
country/USA,var2,2023,66,c/p/default,,,,
@@ -0,0 +1,7 @@
{
"inputFiles": {
"input.csv": {
"entityType": "Dummy"
}
}
}
@@ -0,0 +1,11 @@
{
"inputFiles": {
"input.csv": {
"entityType": "Dummy",
"observationProperties": {
"unit": "USD",
"observationPeriod": "P1Y"
}
}
}
}
@@ -0,0 +1,5 @@
countryAlpha3Code,year,var1,var2
BRA,2023,0.19,6
JPN,2023,0.21,56
USA,2023,,66
CHN,2022,-123.456,