diff --git a/pulkka/column_maps.py b/pulkka/column_maps.py
index 2b518a0..3870ba2 100644
--- a/pulkka/column_maps.py
+++ b/pulkka/column_maps.py
@@ -28,6 +28,7 @@
 TYOPAIKKA_COL = "Työpaikka"
 VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
 VUOSITULOT_COL = "Vuositulot"
+ID_COL = "Vastaustunniste"
 
 COLUMN_MAP_2023 = {
     "Timestamp": "Timestamp",
@@ -172,7 +173,6 @@
     "jänis",
     "kyllä, kiitos",
     "leppäkerttu",
-    "taisteluhelikopteri",
     "tihkutympönen",
     "yes",
 }
@@ -182,7 +182,7 @@
     "non-binary, afab",
 }
 
-TIMESTAMPS_TO_DROP = {
-    # See "SUBMITTED TWICE, SORRY!!" in English data:
-    "2023-09-08 13:24:46.740",
+IDS_TO_DROP = {
+    "6cab61607da9c2b6",  # silly "taisteluhelikopteri" answer
+    "aefdb9e69b1621d5",  # See "SUBMITTED TWICE, SORRY!!" in English data
 }
diff --git a/pulkka/data_ingest.py b/pulkka/data_ingest.py
index 2fbfcfa..0429471 100644
--- a/pulkka/data_ingest.py
+++ b/pulkka/data_ingest.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import hashlib
 import re
 import warnings
 
@@ -33,11 +34,13 @@
     OTHER_GENDER_VALUES,
     TYOKOKEMUS_COL,
     ROOLI_NORM_COL,
-    TIMESTAMPS_TO_DROP,
+    ID_COL,
+    IDS_TO_DROP,
 )
 
 
-def map_sukupuoli(value: str) -> str | None:
+def map_sukupuoli(r: pd.Series) -> str | None:
+    value = r[SUKUPUOLI_COL]
     if not isinstance(value, str):
         return value
 
@@ -67,7 +70,7 @@
     if value in OTHER_GENDER_VALUES:
         return "muu"
 
-    raise NotImplementedError(f"Unknown sukupuoli: {value}")
+    raise NotImplementedError(f"Unknown sukupuoli: {value} (row ID {r[ID_COL]})")
 
 
 def map_vuositulot(r):
@@ -91,6 +94,11 @@
     return val
 
 
+def hash_row(r: pd.Series) -> str:
+    source_data = f"{r[LANG_COL]}.{int(r.Timestamp.timestamp() * 1000)}"
+    return hashlib.sha256(source_data.encode()).hexdigest()[:16]
+
+
 def read_initial_dfs() -> pd.DataFrame:
     df_fi: pd.DataFrame = pd.read_excel(
         DATA_DIR / "results-fi.xlsx",
@@ -106,6 +114,10 @@
     df = pd.concat([df_fi, df_en], ignore_index=True)
     df = df[df["Timestamp"].notna()]  # Remove rows with no timestamp
     df[LANG_COL] = df[LANG_COL].astype("category")
+    # Give each row a unique hash ID
+    df[ID_COL] = df.apply(hash_row, axis=1)
+    # Ensure the truncated SHA is unique
+    assert len(df[ID_COL].unique()) == len(df)
     return df
 
 
@@ -137,13 +149,10 @@
     for col, val_map in VALUE_MAP_2023_EN_TO_FI.items():
         df[col] = df[col].map(val_map).fillna(df[col]).astype("category")
 
-    # Drop bogus data
-    df = df.drop(df[df[SUKUPUOLI_COL] == "taisteluhelikopteri"].index)
+    # Drop known bogus data
+    df = df.drop(df[df[ID_COL].isin(IDS_TO_DROP)].index)
 
-    # Drop rows by timestamps known to be duplicate
-    df = df.drop(df[df["Timestamp"].isin(TIMESTAMPS_TO_DROP)].index)
-
-    df[SUKUPUOLI_COL] = df[SUKUPUOLI_COL].apply(map_sukupuoli).astype("category")
+    df[SUKUPUOLI_COL] = df.apply(map_sukupuoli, axis=1).astype("category")
     df[IKA_COL] = df[IKA_COL].astype("category")
 
     # Assume that people entering 37.5 (hours) as their työaika means 100%
@@ -180,11 +189,19 @@
     df[TYOKOKEMUS_COL] = df[TYOKOKEMUS_COL].round()
 
     # Fix known bogus data
-    df.loc[
-        (df[KKPALKKA_COL] == 4900) & (df[VUOSITULOT_COL] == 620000),
-        VUOSITULOT_COL,
-    ] = 62000
-
+    df = apply_fixups(
+        df,
+        [
+            (
+                {ID_COL: "a01216a11026d749", VUOSITULOT_COL: 620000},
+                {VUOSITULOT_COL: 62000},
+            ),
+            (
+                {ID_COL: "79a200f529f6919b", VUOSITULOT_COL: 1500},
+                {VUOSITULOT_COL: 150_000},
+            ),
+        ],
+    )
     # Fill in Vuositulot as 12.5 * Kk-tulot if empty
     df[VUOSITULOT_COL] = df.apply(map_vuositulot, axis=1)
 
@@ -252,3 +269,16 @@ def main():
 
 if __name__ == "__main__":
     main()
+
+
+def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFrame:
+    for match_cond, replace_cond in fixups:
+        match_keys, match_values = zip(*match_cond.items())
+        ix = df[list(match_keys)].eq(list(match_values)).all(axis=1)
+        if not ix.any():
+            raise ValueError(
+                f"Fixup match condition {match_cond} did not match any rows",
+            )
+        replace_keys, replace_values = zip(*replace_cond.items())
+        df.loc[ix, list(replace_keys)] = replace_values
+    return df
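Aside on the new ID scheme: `hash_row` derives each `Vastaustunniste` from the response language and the millisecond-precision timestamp, so IDs stay stable across re-runs as long as the source rows do not change. A minimal sketch of the derivation, assuming `LANG_COL` names a language column holding values like `"fi"` (the column name `"Kieli"` below is illustrative):

```python
import hashlib

import pandas as pd

# Illustrative row; "Kieli" stands in for whatever column LANG_COL refers to.
row = pd.Series({"Kieli": "fi", "Timestamp": pd.Timestamp("2023-09-08 13:24:46.740")})

# Same derivation as hash_row above: language + millisecond epoch, truncated SHA-256.
source_data = f"{row['Kieli']}.{int(row.Timestamp.timestamp() * 1000)}"
row_id = hashlib.sha256(source_data.encode()).hexdigest()[:16]
print(row_id)  # 16 hex chars = 64 bits of ID space
```

Two responses in the same language within the same millisecond would collide; the `assert` after `df.apply(hash_row, axis=1)` catches exactly that case.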
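And a toy demonstration of the `apply_fixups` semantics: each `(match, replace)` pair is ANDed across its match columns and must hit at least one row, so a fixup that stops matching (for example after the upstream data is corrected) fails loudly instead of silently doing nothing. Matching on the old `Vuositulot` value also keeps a fixup from being applied twice:

```python
import pandas as pd

# apply_fixups as defined at the bottom of data_ingest.py in this diff
from pulkka.data_ingest import apply_fixups

df = pd.DataFrame(
    {
        "Vastaustunniste": ["a01216a11026d749", "79a200f529f6919b"],
        "Vuositulot": [620000, 1500],
    }
)

df = apply_fixups(
    df,
    [
        # A row is patched only if all of its match columns are equal.
        ({"Vastaustunniste": "a01216a11026d749", "Vuositulot": 620000}, {"Vuositulot": 62000}),
        ({"Vastaustunniste": "79a200f529f6919b", "Vuositulot": 1500}, {"Vuositulot": 150_000}),
    ],
)
print(df["Vuositulot"].tolist())  # [62000, 150000]

# Running the same fixups again would raise ValueError: 620000 and 1500 are gone.
```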
diff --git a/pulkka/massage_outputs.py b/pulkka/massage_outputs.py
index ad32c47..7ae452d 100644
--- a/pulkka/massage_outputs.py
+++ b/pulkka/massage_outputs.py
@@ -26,6 +26,18 @@ def write_massaged_files(env, df):
             body_class="table-body",
         ),
     )
+    with open(OUT_DIR / "data-vertical.html", "w") as f:
+        with io.StringIO() as s:
+            for _, row in df.iterrows():
+                row.dropna().to_frame().to_html(s, header=False, na_rep="", border=0)
+                s.write("\n")
+            table_html = s.getvalue()
+        f.write(
+            env.get_template("_table.html").render(
+                table_html=table_html,
+                body_class="table-body",
+            ),
+        )
     df.to_csv(OUT_DIR / "data.csv", index=False)
     df.to_excel(OUT_DIR / "data.xlsx", index=False)
     df.to_json(
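A minimal sketch of what the per-row "vertical" rendering above produces, on a made-up two-row frame: each response becomes its own two-column table, and `dropna()` hides unanswered questions so sparse rows stay compact:

```python
import io

import pandas as pd

df = pd.DataFrame({"Rooli": ["dev", "ops"], "Vuositulot": [62000, None]})

with io.StringIO() as s:
    for _, row in df.iterrows():
        # One <table> per response, field names in the first column.
        row.dropna().to_frame().to_html(s, header=False, na_rep="", border=0)
        s.write("\n")
    table_html = s.getvalue()

print(table_html)  # two small tables; the second has no Vuositulot row
```

The field names render as `<th>` cells, which is what the new `body.table-body tr th { text-align: left; }` rule in style.css further down is for.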

diff --git a/template/index.html b/template/index.html
index a75d7b7..a428c41 100644
--- a/template/index.html
+++ b/template/index.html
@@ -48,6 +48,7 @@
       <a href="data.html">Data</a>
+      <a href="data-vertical.html">Data (vertical)</a>

diff --git a/template/style.css b/template/style.css
index 9db54d8..c6dfd36 100644
--- a/template/style.css
+++ b/template/style.css
@@ -27,6 +27,7 @@ body.table-body {
 
 body.table-body table {
   border-collapse: collapse;
+  margin-bottom: 1em;
 }
 
 body.table-body td,
@@ -35,6 +36,10 @@ body.table-body th {
   border: 1px solid #999;
 }
 
+body.table-body tr th {
+  text-align: left;
+}
+
 h1,
 h2,
 h3 {