Skip to content

Commit

Permalink
Merge branch 'epic/mypy-errors' into noise_config
Browse files (browse the repository at this point in the history)
  • Loading branch information
hussain-jafari committed Oct 24, 2024
2 parents a4bc313 + 32a522b commit 298a8e7
Show file tree
Hide file tree
Showing 20 changed files with 146 additions and 134 deletions.
8 changes: 5 additions & 3 deletions docs/nitpick-exceptions
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ py:class pandas._libs.tslibs.timedeltas.Timedelta
py:class pandas.core.frame.DataFrame
py:class pandas.core.series.Series
py:class pandas.core.generic.PandasObject
# TODO: remove when dropping support for Python 3.9
py:class pd.DataFrame

# pseudopeople
py:exc ConfigurationError
py:exc DataSourceError

# layered_config_tree
py:class NestedDict
py:class NestedDictValue
# misc
# TODO: remove when dropping support for Python 3.9
py:class Path
1 change: 0 additions & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
from pathlib import Path
from typing import Optional

from docutils import nodes
from docutils.nodes import Text
Expand Down
4 changes: 1 addition & 3 deletions src/pseudopeople/column_getters.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from typing import List

from pseudopeople.constants.noise_type_metadata import COPY_HOUSEHOLD_MEMBER_COLS


def copy_from_household_member_column_getter(column_name: str) -> List[str]:
def copy_from_household_member_column_getter(column_name: str) -> list[str]:
return [COPY_HOUSEHOLD_MEMBER_COLS[column_name]]
19 changes: 10 additions & 9 deletions src/pseudopeople/configuration/generator.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from __future__ import annotations

from collections.abc import Sequence
from pathlib import Path
from typing import Dict, Optional, Union
from typing import Any

import yaml
from layered_config_tree import LayeredConfigTree
from layered_config_tree.types import NestedDict

from pseudopeople.configuration import NO_NOISE, Keys
from pseudopeople.configuration.noise_configuration import NoiseConfiguration
Expand Down Expand Up @@ -84,8 +85,8 @@


def get_configuration(
overrides: Optional[Union[Path, str, NestedDict]] = None,
dataset_schema: Optional[DatasetSchema] = None,
overrides: Path | str | dict[str, Any] | None = None,
dataset_schema: DatasetSchema | None = None,
filters: Sequence[DataFilter] = (),
) -> NoiseConfiguration:
"""
Expand Down Expand Up @@ -185,8 +186,8 @@ def get_noise_type_dict(noise_type: NoiseType, is_no_noise: bool) -> dict[str, f

def add_overrides(
noising_configuration: LayeredConfigTree,
overrides: Dict,
dataset_schema: Optional[DatasetSchema] = None,
overrides: dict,
dataset_schema: DatasetSchema | None = None,
filters: Sequence[DataFilter] = (),
) -> None:
overrides = _format_overrides(noising_configuration, overrides)
Expand All @@ -199,7 +200,7 @@ def add_overrides(
validate_noise_level_proportions(noising_configuration, dataset_schema, filters)


def _format_overrides(default_config: LayeredConfigTree, user_dict: Dict) -> Dict:
def _format_overrides(default_config: LayeredConfigTree, user_dict: dict) -> dict:
"""Formats the user's configuration file as necessary, so it can properly
update noising configuration to be used
"""
Expand All @@ -208,8 +209,8 @@ def _format_overrides(default_config: LayeredConfigTree, user_dict: Dict) -> Dic


def _format_misreport_age_perturbations(
default_config: LayeredConfigTree, user_dict: Dict
) -> Dict:
default_config: LayeredConfigTree, user_dict: dict
) -> dict:
# Format any age perturbation lists as a dictionary with uniform probabilities
for dataset_schema in user_dict:
user_perturbations = (
Expand Down
5 changes: 3 additions & 2 deletions src/pseudopeople/configuration/interface.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from __future__ import annotations

from pathlib import Path
from typing import Optional, Union

from pseudopeople.configuration.generator import get_configuration


def get_config(overrides: Optional[Union[Path, str, dict]] = None) -> dict:
def get_config(overrides: Path | str | dict | None = None) -> dict:
"""
Function that returns the pseudopeople configuration containing all
default values. To get the default probability of nonresponse in the
Expand Down
21 changes: 11 additions & 10 deletions src/pseudopeople/configuration/noise_configuration.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from typing import Any, Optional, Union
from typing import Any

from layered_config_tree import LayeredConfigTree
from layered_config_tree.types import InputData
Expand Down Expand Up @@ -31,7 +32,7 @@ def get_value(
dataset: str,
noise_type: str,
parameter_name: str,
column_name: Optional[str] = None,
column_name: str | None = None,
) -> float | int | list | dict:
config = self._config
try:
Expand Down Expand Up @@ -76,32 +77,32 @@ def get_value(
f"The parameter {parameter_name} was not found for {noise_type} in the configuration. "
f"Available parameters are {list(parameter_tree.keys())}."
)
noise_value: Union[int, float, LayeredConfigTree] = parameter_tree[parameter_name]
converted_noise_value: Union[int, float, dict] = (
noise_value: int | float | LayeredConfigTree = parameter_tree[parameter_name]
converted_noise_value: int | float | dict = (
noise_value.to_dict()
if isinstance(noise_value, LayeredConfigTree)
else noise_value
)
return converted_noise_value

def get_row_probability(self, dataset: str, noise_type: str) -> Union[int, float]:
value: Union[int, float] = self.get_value(
def get_row_probability(self, dataset: str, noise_type: str) -> int | float:
value: int | float = self.get_value(
dataset, noise_type, parameter_name="row_probability"
)
return value

def get_cell_probability(
self, dataset: str, noise_type: str, column_name: str
) -> Union[int, float]:
value: Union[int, float] = self.get_value(
) -> int | float:
value: int | float = self.get_value(
dataset, noise_type, parameter_name="cell_probability", column_name=column_name
)
return value

def get_token_probability(
self, dataset: str, noise_type: str, column_name: str
) -> Union[int, float]:
value: Union[int, float] = self.get_value(
) -> int | float:
value: int | float = self.get_value(
dataset, noise_type, parameter_name="token_probability", column_name=column_name
)
return value
Expand All @@ -123,7 +124,7 @@ def has_parameter(
dataset: str,
noise_type: str,
parameter_name: str,
column_name: Optional[str] = None,
column_name: str | None = None,
) -> bool:
if column_name:
has_parameter = parameter_name in self.to_dict().get(dataset, {}).get(
Expand Down
34 changes: 18 additions & 16 deletions src/pseudopeople/configuration/validator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from collections.abc import Sequence
from typing import Any, Dict, List, Optional, Protocol
from typing import Any, Protocol

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -28,10 +30,10 @@ def validate_overrides(overrides: Any, default_config: LayeredConfigTree) -> Non
keys exist in the default configuration. Confirms that all user-provided
values are valid for their respective noise functions.
"""
if not isinstance(overrides, Dict):
if not isinstance(overrides, dict):
raise ConfigurationError("Invalid configuration type provided.") from None
for dataset_name, dataset_config in overrides.items():
if not isinstance(dataset_config, Dict):
if not isinstance(dataset_config, dict):
raise ConfigurationError(
f"'{dataset_name}' must be a Dict. "
f"Provided {dataset_config} of type {type(dataset_config)}."
Expand All @@ -51,14 +53,14 @@ def validate_overrides(overrides: Any, default_config: LayeredConfigTree) -> Non
]

row_noise_config = dataset_config.get(Keys.ROW_NOISE, {})
if not isinstance(row_noise_config, Dict):
if not isinstance(row_noise_config, dict):
raise ConfigurationError(
f"'{Keys.ROW_NOISE}' of '{dataset_name}' must be a Dict. "
f"Provided {row_noise_config} of type {type(row_noise_config)}."
)

for noise_type, noise_type_config in row_noise_config.items():
if not isinstance(noise_type_config, Dict):
if not isinstance(noise_type_config, dict):
raise ConfigurationError(
f"Row noise type '{noise_type}' of dataset '{dataset_name}' must be a Dict. "
f"Provided {noise_type_config} of type {type(noise_type_config)}."
Expand All @@ -75,14 +77,14 @@ def validate_overrides(overrides: Any, default_config: LayeredConfigTree) -> Non
)

column_noise_config = dataset_config.get(Keys.COLUMN_NOISE, {})
if not isinstance(column_noise_config, Dict):
if not isinstance(column_noise_config, dict):
raise ConfigurationError(
f"'{Keys.COLUMN_NOISE}' of '{dataset_name}' must be a Dict. "
f"Provided {column_noise_config} of type {type(column_noise_config)}."
)

for column, column_config in column_noise_config.items():
if not isinstance(column_config, Dict):
if not isinstance(column_config, dict):
raise ConfigurationError(
f"Column '{column}' of dataset '{dataset_name}' must be a Dict. "
f"Provided {column_config} of type {type(column_config)}."
Expand All @@ -92,7 +94,7 @@ def validate_overrides(overrides: Any, default_config: LayeredConfigTree) -> Non
default_column_noise_config, column, "column", dataset_name
)
for noise_type, noise_type_config in column_config.items():
if not isinstance(noise_type_config, Dict):
if not isinstance(noise_type_config, dict):
raise ConfigurationError(
f"Noise type '{noise_type}' of column '{column}' in dataset '{dataset_name}' must be a Dict. "
f"Provided {noise_type_config} of type {type(noise_type_config)}."
Expand Down Expand Up @@ -121,8 +123,8 @@ def _validate_noise_type_config(
default_noise_type_config: LayeredConfigTree,
dataset_name: str,
noise_type: str,
parameter_config_validator_map: Dict[str, ParameterConfigValidator],
column: Optional[str] = None,
parameter_config_validator_map: dict[str, ParameterConfigValidator],
column: str | None = None,
) -> None:
"""
Validates that all parameters are allowed for this noise function.
Expand Down Expand Up @@ -152,9 +154,9 @@ def _get_default_config_node(
default_config: LayeredConfigTree,
key: str,
key_type: str,
dataset_name: Optional[str] = None,
column: Optional[str] = None,
noise_type: Optional[str] = None,
dataset_name: str | None = None,
column: str | None = None,
noise_type: str | None = None,
) -> LayeredConfigTree:
"""
Validate that the node the user is trying to add exists in the default
Expand Down Expand Up @@ -184,7 +186,7 @@ def _validate_possible_age_differences(
Validates the user-provided values for the age-miswriting permutations
parameter
"""
if not isinstance(noise_type_config, (Dict, List)):
if not isinstance(noise_type_config, (dict, list)):
raise ConfigurationError(
base_error_message + f"'{parameter}' must be a Dict or List. "
f"Provided {noise_type_config} of type {type(noise_type_config)}."
Expand All @@ -202,7 +204,7 @@ def _validate_possible_age_differences(
)
if key == 0:
raise ConfigurationError(base_error_message + f"'{parameter}' cannot include 0.")
if isinstance(noise_type_config, Dict):
if isinstance(noise_type_config, dict):
for value in noise_type_config.values():
if not isinstance(value, (float, int)):
raise ConfigurationError(
Expand All @@ -227,7 +229,7 @@ def _validate_zipcode_digit_probabilities(
noise_type_config: Any, parameter: str, base_error_message: str, *_: Any
) -> None:
"""Validates the user-provided values for the zipcode digit noising probabilities"""
if not isinstance(noise_type_config, List):
if not isinstance(noise_type_config, list):
raise ConfigurationError(
base_error_message + f"'{parameter}' must be a List. "
f"Provided {noise_type_config} of type {type(noise_type_config)}."
Expand Down
10 changes: 4 additions & 6 deletions src/pseudopeople/constants/data_values.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
from typing import Dict, List

import pandas as pd

from pseudopeople.constants.metadata import DatasetNames

# Targeted omission constants for do_not_respond
DO_NOT_RESPOND_BASE_PROBABILITY = 0.0024

DO_NOT_RESPOND_ADDITIVE_PROBABILITY_BY_RACE: Dict[str, float] = {
DO_NOT_RESPOND_ADDITIVE_PROBABILITY_BY_RACE: dict[str, float] = {
"AIAN": 0.0067,
"Asian": -0.0286,
"Black": 0.0306,
Expand All @@ -17,7 +15,7 @@
"White": -0.0188,
}

DO_NOT_RESPOND_AGE_INTERVALS: List[pd.Interval] = [
DO_NOT_RESPOND_AGE_INTERVALS: list[pd.Interval] = [
# Intervals should include their lower bound
pd.Interval(0, 5, closed="left"),
pd.Interval(5, 10, closed="left"),
Expand All @@ -27,7 +25,7 @@
pd.Interval(50, 125, closed="left"),
]

DO_NOT_RESPOND_ADDITIVE_PROBABILITY_BY_SEX_AGE: Dict[str, pd.Series] = {
DO_NOT_RESPOND_ADDITIVE_PROBABILITY_BY_SEX_AGE: dict[str, pd.Series] = {
"Female": pd.Series(
[0.0255, -0.0014, -0.0003, 0.0074, -0.0034, -0.0287],
index=DO_NOT_RESPOND_AGE_INTERVALS,
Expand All @@ -38,7 +36,7 @@
),
}

DEFAULT_DO_NOT_RESPOND_ROW_PROBABILITY: Dict[str, float] = {
DEFAULT_DO_NOT_RESPOND_ROW_PROBABILITY: dict[str, float] = {
DatasetNames.ACS: 0.0145, # 1.45%
DatasetNames.CPS: 0.2905, # 29.05%
DatasetNames.CENSUS: 0.0145, # 1.45%
Expand Down
6 changes: 4 additions & 2 deletions src/pseudopeople/dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from collections.abc import Sequence
from typing import Any, List, Optional
from typing import Any

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -48,7 +50,7 @@ def is_empty(self, column_name: str) -> Any:
is_empty: bool = self.missingness[column_name].all()
return is_empty

def get_non_empty_index(self, required_columns: Optional[List[str]] = None) -> pd.Index:
def get_non_empty_index(self, required_columns: list[str] | None = None) -> pd.Index:
"""Returns the non-empty data."""

if required_columns is None:
Expand Down
12 changes: 6 additions & 6 deletions src/pseudopeople/entity_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
from typing import TYPE_CHECKING, Any, Callable

import pandas as pd
from layered_config_tree import LayeredConfigTree
Expand Down Expand Up @@ -37,8 +37,8 @@ def default_noise_level_getter(
class NoiseType(ABC):
name: str
noise_function: Callable = _noise_function_not_implemented
probability: Optional[float] = 0.0
additional_parameters: Optional[Dict[str, Any]] = None
probability: float | None = 0.0
additional_parameters: dict[str, Any] | None = None

def __post_init__(self) -> None:
if self.noise_function == _noise_function_not_implemented:
Expand Down Expand Up @@ -71,7 +71,7 @@ class RowNoiseType(NoiseType):
[Dataset, NoiseConfiguration, pd.Index], None
] = _noise_function_not_implemented
get_noise_level: Callable[
[NoiseConfiguration, Dataset, str], Union[float, pd.Series]
[NoiseConfiguration, Dataset, str], float | pd.Series
] = default_noise_level_getter

@property
Expand Down Expand Up @@ -104,9 +104,9 @@ class ColumnNoiseType(NoiseType):
noise_function: Callable[
[Dataset, NoiseConfiguration, pd.Index, str], None
] = _noise_function_not_implemented
probability: Optional[float] = 0.01
probability: float | None = 0.01
noise_level_scaling_function: Callable[[pd.DataFrame, str], float] = lambda x, y: 1.0
additional_column_getter: Callable[[str], List[str]] = lambda column_name: []
additional_column_getter: Callable[[str], list[str]] = lambda column_name: []
output_dtype_getter: Callable[[pd_dtype], pd_dtype] = lambda dtype: dtype

@property
Expand Down
Loading

0 comments on commit 298a8e7

Please sign in to comment.