Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ids, vertical HTML, data fixups #17

Merged
merged 3 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pulkka/column_maps.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
TYOPAIKKA_COL = "Työpaikka"
VUOSILASKUTUS_ALV0_COL = "Vuosilaskutus (ALV 0%, euroina)"
VUOSITULOT_COL = "Vuositulot"
ID_COL = "Vastaustunniste"

COLUMN_MAP_2023 = {
"Timestamp": "Timestamp",
Expand Down Expand Up @@ -172,7 +173,6 @@
"jänis",
"kyllä, kiitos",
"leppäkerttu",
"taisteluhelikopteri",
"tihkutympönen",
"yes",
}
Expand All @@ -182,7 +182,7 @@
"non-binary, afab",
}

TIMESTAMPS_TO_DROP = {
# See "SUBMITTED TWICE, SORRY!!" in English data:
"2023-09-08 13:24:46.740",
IDS_TO_DROP = {
"6cab61607da9c2b6", # hupsu taisteluhelikopteri
"aefdb9e69b1621d5", # See "SUBMITTED TWICE, SORRY!!" in English data
}
58 changes: 44 additions & 14 deletions pulkka/data_ingest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import hashlib
import re
import warnings

Expand Down Expand Up @@ -33,11 +34,13 @@
OTHER_GENDER_VALUES,
TYOKOKEMUS_COL,
ROOLI_NORM_COL,
TIMESTAMPS_TO_DROP,
ID_COL,
IDS_TO_DROP,
)


def map_sukupuoli(value: str) -> str | None:
def map_sukupuoli(r: pd.Series) -> str | None:
value = r[SUKUPUOLI_COL]
if not isinstance(value, str):
return value

Expand Down Expand Up @@ -67,7 +70,7 @@ def map_sukupuoli(value: str) -> str | None:
if value in OTHER_GENDER_VALUES:
return "muu"

raise NotImplementedError(f"Unknown sukupuoli: {value}")
raise NotImplementedError(f"Unknown sukupuoli: {value} (row ID {r[ID_COL]})")


def map_vuositulot(r):
Expand All @@ -91,6 +94,11 @@ def ucfirst(val):
return val


def hash_row(r: pd.Series) -> str:
    """Derive a stable, truncated SHA-256 identifier for a response row.

    The digest input is the row's language column joined with its submission
    timestamp in epoch milliseconds, so the ID is deterministic across runs
    as long as the source data is unchanged.

    :param r: A single row of the raw survey DataFrame; must carry the
        language column and a ``Timestamp`` attribute.
    :return: The first 16 hex characters of the SHA-256 digest.
    """
    epoch_ms = int(r.Timestamp.timestamp() * 1000)
    payload = f"{r[LANG_COL]}.{epoch_ms}".encode()
    return hashlib.sha256(payload).hexdigest()[:16]


def read_initial_dfs() -> pd.DataFrame:
df_fi: pd.DataFrame = pd.read_excel(
DATA_DIR / "results-fi.xlsx",
Expand All @@ -106,6 +114,10 @@ def read_initial_dfs() -> pd.DataFrame:
df = pd.concat([df_fi, df_en], ignore_index=True)
df = df[df["Timestamp"].notna()] # Remove rows with no timestamp
df[LANG_COL] = df[LANG_COL].astype("category")
# Give each row a unique hash ID
df[ID_COL] = df.apply(hash_row, axis=1)
# Ensure truncated sha is unique
assert len(df[ID_COL].unique()) == len(df)
return df


Expand Down Expand Up @@ -137,13 +149,10 @@ def read_data() -> pd.DataFrame:
for col, val_map in VALUE_MAP_2023_EN_TO_FI.items():
df[col] = df[col].map(val_map).fillna(df[col]).astype("category")

# Drop bogus data
df = df.drop(df[df[SUKUPUOLI_COL] == "taisteluhelikopteri"].index)
# Drop known bogus data
df = df.drop(df[df[ID_COL].isin(IDS_TO_DROP)].index)

# Drop rows by timestamps known to be duplicate
df = df.drop(df[df["Timestamp"].isin(TIMESTAMPS_TO_DROP)].index)

df[SUKUPUOLI_COL] = df[SUKUPUOLI_COL].apply(map_sukupuoli).astype("category")
df[SUKUPUOLI_COL] = df.apply(map_sukupuoli, axis=1).astype("category")
df[IKA_COL] = df[IKA_COL].astype("category")

# Assume that people entering 37.5 (hours) as their työaika means 100%
Expand Down Expand Up @@ -180,11 +189,19 @@ def read_data() -> pd.DataFrame:
df[TYOKOKEMUS_COL] = df[TYOKOKEMUS_COL].round()

# Fix known bogus data
df.loc[
(df[KKPALKKA_COL] == 4900) & (df[VUOSITULOT_COL] == 620000),
VUOSITULOT_COL,
] = 62000

df = apply_fixups(
df,
[
(
{ID_COL: "a01216a11026d749", VUOSITULOT_COL: 620000},
{VUOSITULOT_COL: 62000},
),
(
{ID_COL: "79a200f529f6919b", VUOSITULOT_COL: 1500},
{VUOSITULOT_COL: 150_000},
),
],
)
# Fill in Vuositulot as 12.5 * Kk-tulot if empty
df[VUOSITULOT_COL] = df.apply(map_vuositulot, axis=1)

Expand Down Expand Up @@ -252,3 +269,16 @@ def main():

if __name__ == "__main__":
main()


def apply_fixups(df: pd.DataFrame, fixups: list[tuple[dict, dict]]) -> pd.DataFrame:
    """Apply targeted value corrections to known-bad rows.

    Each fixup is a ``(match, replace)`` pair of dicts: rows where every
    ``match`` column equals its expected value get the ``replace`` columns
    overwritten with the new values. Raising on a zero-row match guards
    against fixups silently rotting when the underlying data changes.

    :param df: The DataFrame to patch (modified in place).
    :param fixups: List of ``(match_condition, replacement)`` dict pairs.
    :raises ValueError: If a match condition selects no rows at all.
    :return: The same DataFrame, for call chaining.
    """
    for where, updates in fixups:
        # Build a boolean mask that is True only where *all* match
        # columns carry their expected values.
        mask = None
        for col, expected in where.items():
            col_hits = df[col].eq(expected)
            mask = col_hits if mask is None else (mask & col_hits)
        if mask is None or not mask.any():
            raise ValueError(
                f"Fixup match condition {where} did not match any rows",
            )
        for col, new_value in updates.items():
            df.loc[mask, col] = new_value
    return df
12 changes: 12 additions & 0 deletions pulkka/massage_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,18 @@ def write_massaged_files(env, df):
body_class="table-body",
),
)
with open(OUT_DIR / "data-vertical.html", "w") as f:
with io.StringIO() as s:
for _, row in df.iterrows():
row.dropna().to_frame().to_html(s, header=False, na_rep="", border=0)
s.write("\n")
table_html = s.getvalue()
f.write(
env.get_template("_table.html").render(
table_html=table_html,
body_class="table-body",
),
)
df.to_csv(OUT_DIR / "data.csv", index=False)
df.to_excel(OUT_DIR / "data.xlsx", index=False)
df.to_json(
Expand Down
1 change: 1 addition & 0 deletions template/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ <h2>Data</h2>
<ul>
<li><a href="data.csv">Lähdedata (CSV)</a></li>
<li><a href="data.html">Lähdedata (HTML)</a></li>
<li><a href="data-vertical.html">Vastaukset eriteltyinä (HTML)</a></li>
<li><a href="data.json">Lähdedata (JSON)</a></li>
<li><a href="data.xlsx">Lähdedata (XLSX)</a></li>
</ul>
Expand Down
5 changes: 5 additions & 0 deletions template/style.css
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ body.table-body {

body.table-body table {
border-collapse: collapse;
margin-bottom: 1em;
}

body.table-body td,
Expand All @@ -35,6 +36,10 @@ body.table-body th {
border: 1px solid #999;
}

body.table-body tr th {
text-align: left;
}

h1,
h2,
h3 {
Expand Down