From e2884c293e8e3678063dd46eef41e8fc679d56e0 Mon Sep 17 00:00:00 2001 From: Alexander Nikitin <1243786+AlexanderVNikitin@users.noreply.github.com> Date: Tue, 31 Oct 2023 10:03:54 +0200 Subject: [PATCH] add covid-19 dataset --- tests/test_utils.py | 11 ++ tsgm/utils/__init__.py | 1 + tsgm/utils/covid19_data_utils.py | 186 +++++++++++++++++++++++++++++++ tsgm/utils/datasets.py | 45 +++++++- 4 files changed, 239 insertions(+), 4 deletions(-) create mode 100644 tsgm/utils/covid19_data_utils.py diff --git a/tests/test_utils.py b/tests/test_utils.py index d8f605e..0255a16 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -283,3 +283,14 @@ def test_download(mocker, caplog): assert "Cannot download dataset" in str(excinfo.value) finally: os.remove(resource_path) + + +def test_get_covid_19(): + X, graph, states = tsgm.utils.get_covid_19() + assert len(states) == 51 and "new york" in states and "california" in states + assert len(graph[0]) == len(states) # nodes + assert len(graph[1]) == 220 # edges + assert X.shape[0] == len(states) + assert len(X.shape) == 3 + assert X.shape[2] == 4 + assert X.shape[1] >= 150 \ No newline at end of file diff --git a/tsgm/utils/__init__.py b/tsgm/utils/__init__.py index ac7a141..c87f8b1 100644 --- a/tsgm/utils/__init__.py +++ b/tsgm/utils/__init__.py @@ -1,6 +1,7 @@ from tsgm.utils.file_utils import * # noqa from tsgm.utils.data_processing import * # noqa from tsgm.utils.visualization import * # noqa +from tsgm.utils.covid19_data_utils import * # noqa from tsgm.utils.datasets import * # noqa from tsgm.utils.utils import * # noqa from tsgm.utils.mmd import * # noqa diff --git a/tsgm/utils/covid19_data_utils.py b/tsgm/utils/covid19_data_utils.py new file mode 100644 index 0000000..31c4e9c --- /dev/null +++ b/tsgm/utils/covid19_data_utils.py @@ -0,0 +1,186 @@ +""" +Utils for COVID-19 graph time series dataset: +The dataset is based on data from The New York Times, based on reports from state and local health agencies [1]. + +And was adapted to graph case in [2]. +[1] The New York Times. (2021). Coronavirus (Covid-19) Data in the United States. Retrieved [Insert Date Here], from https://github.com/nytimes/covid-19-data. +[2] + +The code is an adapted version from: +https://github.com/AlexanderVNikitin/covid19-on-graphs +""" + +import pandas as pd + + +STATE_ADJACENCIES = { + "washington": ["oregon", "idaho"], + "oregon": ["washington", "idaho", "nevada", "california"], + "california": ["oregon", "nevada", "arizona"], + "idaho": ["washington", "montana", "wyoming", "utah", "nevada", "oregon"], + "montana": ["north dakota", "south dakota", "wyoming", "idaho"], + "north dakota": ["minnesota", "south dakota", "montana"], + "south dakota": ["north dakota", "minnesota", "iowa", "nebraska", "wyoming", "montana"], + "minnesota": ["wisconsin", "iowa", "south dakota", "north dakota"], + "michigan": ["indiana", "ohio", "wisconsin"], + "ohio": ["michigan", "pennsylvania", "west virginia", "kentucky", "indiana"], + "pennsylvania": ["new york", "new jersey", "delaware", "maryland", "west virginia", "ohio"], + "new york": ["vermont", "massachusetts", "rhode island", "new jersey", "pennsylvania", "connecticut"], + "vermont": ["new hampshire", "massachusetts", "new york"], + "new hampshire": ["maine", "massachusetts", "vermont"], + "maine": ["new hampshire"], + "wyoming": ["montana", "south dakota", "nebraska", "colorado", "utah", "idaho"], + "nebraska": ["south dakota", "iowa", "missouri", "kansas", "colorado", "wyoming"], + "iowa": ["minnesota", "wisconsin", "illinois", "missouri", "nebraska", "south dakota"], + "wisconsin": ["minnesota", "iowa", "illinois", "michigan"], + "illinois": ["wisconsin", "indiana", "kentucky", "missouri", "iowa"], + "indiana": ["michigan", "ohio", "kentucky", "illinois"], + "west virginia": ["ohio", "pennsylvania", "maryland", "virginia", "kentucky"], + "maryland": ["delaware", "pennsylvania", "west virginia", "virginia", "district of columbia"], + "delaware": ["maryland", "pennsylvania", "new jersey"], + "new jersey": ["delaware", "pennsylvania", "new york"], + "connecticut": ["new york", "massachusetts", "rhode island"], + "rhode island": ["connecticut", "massachusetts", "new york"], + "district of columbia": ["maryland", "virginia"], + "virginia": ["west virginia", "kentucky", "district of columbia", "maryland", "north carolina", "tennessee"], + "kentucky": ["indiana", "ohio", "west virginia", "virginia", "tennessee", "missouri", "illinois"], + "missouri": ["iowa", "illinois", "kentucky", "tennessee", "arkansas", "oklahoma", "kansas", "nebraska"], + "kansas": ["nebraska", "missouri", "oklahoma", "colorado"], + "colorado": ["wyoming", "nebraska", "kansas", "oklahoma", "new mexico", "utah", "arizona"], + "utah": ["idaho", "wyoming", "colorado", "new mexico", "arizona", "nevada"], + "nevada": ["oregon", "idaho", "utah", "arizona", "california"], + "arizona": ["california", "nevada", "utah", "colorado", "new mexico"], + "new mexico": ["arizona", "utah", "colorado", "oklahoma", "texas"], + "oklahoma": ["colorado", "kansas", "missouri", "arkansas", "texas", "new mexico"], + "texas": ["new mexico", "oklahoma", "arkansas", "louisiana"], + "arkansas": ["oklahoma", "missouri", "tennessee", "mississippi", "louisiana", "texas"], + "louisiana": ["texas", "arkansas", "mississippi"], + "mississippi": ["louisiana", "arkansas", "tennessee", "alabama"], + "tennessee": ["missouri", "kentucky", "virginia", "north carolina", "georgia", "alabama", "mississippi", "arkansas"], + "alabama": ["mississippi", "tennessee", "georgia", "florida"], + "georgia": ["tennessee", "north carolina", "south carolina", "florida", "alabama"], + "florida": ["alabama", "georgia"], + "south carolina": ["georgia", "north carolina"], + "north carolina": ["south carolina", "tennessee", "virginia", "georgia"], + "alaska": [], + "hawaii": [], + "massachusetts": ["new york", "vermont", "new hampshire", "rhode island", "connecticut"], +} + +LIST_OF_STATES = sorted(STATE_ADJACENCIES.keys()) + +# July 1 2019 +STATE_POPULATION = { + "california": 39_512_223, + "texas": 28_995_881, + "florida": 21_477_737, + "new york": 19_453_561, + "pennsylvania": 12_801_989, + "illinois": 12_671_821, + "ohio": 11_689_100, + "georgia": 10_617_423, + "north carolina": 10_488_084, + "michigan": 9_986_857, + "new jersey": 8_882_190, + "virginia": 8_535_519, + "washington": 7_614_893, + "arizona": 7_278_717, + "massachusetts": 6_949_503, + "tennessee": 6_833_174, + "indiana": 6_732_219, + "missouri": 6_137_428, + "maryland": 6_045_680, + "wisconsin": 5_822_434, + "colorado": 5_758_736, + "minnesota": 5_639_632, + "south carolina": 5_148_714, + "alabama": 4_903_185, + "louisiana": 4_648_794, + "kentucky": 4_467_673, + "oregon": 4_217_737, + "oklahoma": 3_956_971, + "connecticut": 3_565_287, + "utah": 3_205_958, + "iowa": 3_155_070, + "nevada": 3_080_156, + "arkansas": 3_017_825, + "mississippi": 2_976_149, + "kansas": 2_913_314, + "new mexico": 2_096_829, + "nebraska": 1_934_408, + "west virginia": 1_792_147, + "idaho": 1_787_065, + "hawaii": 1_415_872, + "new hampshire": 1_359_711, + "maine": 1_344_212, + "montana": 1_068_778, + "rhode island": 1_059_361, + "delaware": 973_764, + "south dakota": 884_659, + "north dakota": 762_062, + "alaska": 731_545, + "district of columbia": 705_749, + "vermont": 623_989, + "wyoming": 578_759, + "virgin islands": 104_914, + "puerto rico": 3_193_694, + "guam": 165_718, +} + + +def aggregate_by_weeks_max(df): + df['date'] = pd.to_datetime(df['date']) # + pd.to_timedelta(7, unit='d') + df = df.groupby(['state', pd.Grouper(key='date', freq='W-MON')])\ + .agg({"cases": max, "deaths": max})\ + .reset_index()\ + .sort_values('date') + return df + + +def get_adjacencies_graph(): + nodes, edges = [], [] + LIST_OF_STATES = sorted(STATE_ADJACENCIES.keys()) + + for state_name in LIST_OF_STATES: + nodes.append(state_name) + + for state, adj_states in STATE_ADJACENCIES.items(): + for adj_state in adj_states: + edges.append((state, adj_state)) + return nodes, edges + + +def covid_dataset(path): + covid_cases_df = pd.read_csv(path) + covid_cases_df["state"] = covid_cases_df["state"].str.lower() + covid_cases_df = aggregate_by_weeks_max(covid_cases_df) + graph = get_adjacencies_graph() + result = {} + for row in covid_cases_df.to_dict(orient="records"): + date = row["date"] + cases = row["cases"] + deaths = row["deaths"] + state = row["state"] + if date not in result: + result[date] = {} + if state in STATE_POPULATION: + result[date][state] = { + "deaths_normalized": deaths / STATE_POPULATION[state], + "cases_normalized": cases / STATE_POPULATION[state], + "deaths": deaths, + "cases": cases, + } + else: + print("[WARNING]: There is no data about population for: ", state) + + # fill missing values with zeros + for date in result.keys(): + for state in LIST_OF_STATES: + if state not in result[date]: + result[date][state] = { + "deaths": 0, + "cases": 0, + "deaths_normalized": 0, + "cases_normalized": 0, + } + return result, graph diff --git a/tsgm/utils/datasets.py b/tsgm/utils/datasets.py index 893b402..c507413 100644 --- a/tsgm/utils/datasets.py +++ b/tsgm/utils/datasets.py @@ -15,6 +15,7 @@ from tensorflow import keras +from tsgm.utils import covid19_data_utils from tsgm.utils import file_utils @@ -22,7 +23,7 @@ logger.setLevel(logging.DEBUG) -def gen_sine_dataset(N, T, D, max_value=10): +def gen_sine_dataset(N: int, T: int, D: int, max_value: int = 10) -> np.ndarray: result = [] for i in range(N): result.append([]) @@ -35,7 +36,7 @@ def gen_sine_dataset(N, T, D, max_value=10): return np.transpose(np.array(result), [0, 2, 1]) -def gen_sine_const_switch_dataset(N, T, D, max_value=10, const=0, frequency_switch=0.1): +def gen_sine_const_switch_dataset(N: int, T: int, D: int, max_value: int = 10, const: int = 0, frequency_switch: float = 0.1) -> tuple: result_X, result_y = [], [] cur_y = 0 scales = np.random.random(D) * max_value @@ -185,7 +186,7 @@ def get_mauna_loa() -> tuple: return X, y -def split_dataset_into_objects(X, y, step=10): +def split_dataset_into_objects(X, y, step=10) -> tuple: assert X.shape[0] == y.shape[0] Xs, ys = [], [] @@ -293,7 +294,7 @@ def get_physionet2012() -> tuple: return train_X, train_y, test_X, test_y, val_X, val_y -def download_physionet2012(): +def download_physionet2012() -> None: """ Downloads the Physionet 2012 dataset files from the Physionet website and extracts them in local folder 'physionet2012' @@ -359,3 +360,39 @@ def _get_physionet_y_dataframe(file_path: str) -> pd.DataFrame: y.index.name = 'recordid' y.reset_index(inplace=True) return y + + +def get_covid_19() -> tuple: + """ + Loads Covid-19 dataset with additional graph information + The dataset is based on data from The New York Times, based on reports from state and local health agencies [1]. + + And was adapted to graph case in [2]. + [1] The New York Times. (2021). Coronavirus (Covid-19) Data in the United States. Retrieved [Insert Date Here], from https://github.com/nytimes/covid-19-data. + [2] Alexander V. Nikitin, St John, Arno Solin, Samuel Kaski Proceedings of The 25th International Conference on Artificial Intelligence and Statistics, PMLR 151:10640-10660, 2022. + + Returns: + ------- + tuple + First element is time series data (n_nodes x n_timestamps x n_features). Each timestamp consists of + the number of deaths, cases, deaths normalized by the population, and cases normalized by the population. + The second element is the graph tuple (nodes, edges). + The third element is the order of states. + """ + base_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv" + destination_folder = "covid19" + file_utils.download(base_url, destination_folder) + result, graph = covid19_data_utils.covid_dataset( + os.path.join(destination_folder, "us-states.csv") + ) + + processed_dataset = [] + for timestamp in result.keys(): + processed_dataset.append([]) + for state in covid19_data_utils.LIST_OF_STATES: + cur_data = result[timestamp][state] + processed_dataset[-1].append( + [cur_data["deaths"], cur_data["cases"], + cur_data["deaths_normalized"], cur_data["cases_normalized"]] + ) + return np.transpose(np.array(processed_dataset), (1, 0, 2)), graph, covid19_data_utils.LIST_OF_STATES