From 39b7c7241947b40ff08642eca95c4345a4b37228 Mon Sep 17 00:00:00 2001 From: john-hawkins Date: Thu, 31 Dec 2020 22:08:45 +1100 Subject: [PATCH] Fixing bug with large integers and changing unique vals output --- dfsummarizer/dfsummarizer.py | 2 +- dfsummarizer/funcs.py | 21 ++++++++++++++------- markdown_test.md | 24 ++++++++++++------------ 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/dfsummarizer/dfsummarizer.py b/dfsummarizer/dfsummarizer.py index b7cc315..a79ca51 100644 --- a/dfsummarizer/dfsummarizer.py +++ b/dfsummarizer/dfsummarizer.py @@ -2,7 +2,7 @@ """dfsummarizer.dfsummarizer: provides entry point main().""" -__version__ = "0.1.3" +__version__ = "0.1.4" import numpy as np import pandas as pd diff --git a/dfsummarizer/funcs.py b/dfsummarizer/funcs.py index 5bc5de7..baaaca1 100644 --- a/dfsummarizer/funcs.py +++ b/dfsummarizer/funcs.py @@ -24,7 +24,7 @@ def analyse_df(df): colnames = df.columns records = len(df) df = coerce_dates(df) - rez = pd.DataFrame(columns=('Name', 'Type', 'Unique', 'Nulls', 'Min', 'Mean', 'Max')) + rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max')) for name in colnames: nacount = len(df[df[name].isna()]) @@ -61,6 +61,7 @@ def analyse_df(df): values_to_add = { 'Name':name, 'Type':valtype, + 'Unique Vals':unicount, 'Unique':unipercent, 'Nulls':napercent, 'Min':themin, @@ -93,13 +94,14 @@ def analyse_df_in_chunks(path_to_file): ######################################################################################## def generate_final_summary(temp, total_chunks): - rez = pd.DataFrame(columns=('Name', 'Type', 'Unique', 'Nulls', 'Min', 'Mean', 'Max')) + rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max')) for name in temp.keys(): col = temp[name] total = col['nulls'] + col['nonnulls'] unicount = col['uniques'].estimate() if unicount > total: uniprop = 1.0 + unicount = total else: uniprop = unicount / total unipercent = round(100 * uniprop, 1) @@ -111,6 +113,7 @@ def generate_final_summary(temp, total_chunks): values_to_add = { 'Name':name, 'Type': col['type'], + 'Unique Vals':unicount, 'Unique':unipercent, 'Nulls':napercent, 'Min': col['min'], @@ -321,12 +324,12 @@ def print_latex(summary): print(" \\caption{Data Summary Table}") print(" \\label{tab:table1}") print(" \\begin{tabular}{l|l|r|r|r|r} ") - print(" \\textbf{Name} & \\textbf{Type} & \\textbf{Unique \%} & \\textbf{Nulls \%} & \\textbf{Min} & \\textbf{Mean} & \\textbf{Max}\\\\") + print(" \\textbf{Name} & \\textbf{Type} & \\textbf{Unique Vals \%} & \\textbf{Nulls \%} & \\textbf{Min} & \\textbf{Mean} & \\textbf{Max}\\\\") print(" \\hline") for i in range(len(summary)): print(" ", summary.loc[i,"Name"], "&", summary.loc[i,"Type"], - "&", summary.loc[i,"Unique"], "%" + "&", summary.loc[i,"Unique Vals"], "%" "&", summary.loc[i,"Nulls"], "%" "&", summary.loc[i,"Min"], "&", summary.loc[i,"Mean"], @@ -386,6 +389,9 @@ def get_padded_number(n): return get_spaces(3 - after_decimal(n) + adjus) + str(n)+ " " if (abs(n)<10000000): return get_spaces(2 - after_decimal(n) + adjus) + str(n)+ " " + else: + number = "{:.2e}".format(n) + return get_spaces(2 + adjus) + number + " " else: return str(n) + " " @@ -406,14 +412,15 @@ def print_markdown(s): name_spacer = 6 print("| Name ", get_spaces(name_spacer-6), - "| Type | Unique | Nulls | Min | Mean | Max |", sep="") + "| Type | Unique Vals | Nulls | Min | Mean | Max |", sep="") print("| ---- ", get_spaces(name_spacer-6), - "| ------ | ------- | ------- | --- | ---- | --- |", sep="") + "| ------ | ----------- | ------- | --- | ---- | --- |", sep="") for i in range(len(s)): print("| ", s.loc[i,"Name"], get_spaces(name_spacer - len(s.loc[i,"Name"]) - 1 ), "| ", s.loc[i,"Type"], get_type_spacer(s.loc[i,"Type"]), - "| ", get_percent_spacer(s.loc[i,"Unique"]), s.loc[i,"Unique"],"% ", + #"| ", get_percent_spacer(s.loc[i,"Unique"]), s.loc[i,"Unique"],"% ", + "| ", get_padded_number(s.loc[i,"Unique Vals"]), "| ", get_percent_spacer(s.loc[i,"Nulls"]), s.loc[i,"Nulls"],"% ", "| ", get_padded_number(s.loc[i,"Min"]), "| ", get_padded_number(s.loc[i,"Mean"]), diff --git a/markdown_test.md b/markdown_test.md index 53af0b2..7f0a5c2 100644 --- a/markdown_test.md +++ b/markdown_test.md @@ -1,12 +1,12 @@ -| Name | Type | Unique | Nulls | Min | Mean | Max | -| ---- | ------ | ------- | ------- | --- | ---- | --- | -| id | Char | 100.0% | 0.0% | 4 | 4.0 | 4 | -| opening | Date | 100.0% | 0.0% | 2019-01-01 | 2019-04-18 | 2019-07-12 | -| first | Bool | 33.3% | 16.7% | 0.0 | 0.4 | 1.0 | -| last | Bool | 33.3% | 50.0% | 0 | 0.333 | 1 | -| state | Char | 50.0% | 16.7% | 3.0 | 3.0 | 3.0 | -| balance | Float | 83.3% | 0.0% | 200.0 | 1093.55 | 4230.9 | -| duration | Float | 50.0% | 33.3% | 12.0 | 21.0 | 24.0 | -| years | Int | 50.0% | 0.0% | 2 | 3.0 | 4 | -| flag | Float | 33.3% | 66.7% | 1.0 | 1.0 | 1.0 | -| comments | Char | 100.0% | 0.0% | 9 | 21.167 | 35 | +| Name | Type | Unique Vals | Nulls | Min | Mean | Max | +| ---- | ------ | ----------- | ------- | --- | ---- | --- | +| id | Char | 6 | 0.0% | 4 | 4.0 | 4 | +| opening | Date | 6 | 0.0% | 2019-01-01 | 2019-04-18 | 2019-07-12 | +| first | Bool | 2 | 16.7% | 0.0 | 0.4 | 1.0 | +| last | Bool | 2 | 50.0% | 0 | 0.333 | 1 | +| state | Char | 3 | 16.7% | 3.0 | 3.0 | 3.0 | +| balance | Float | 5 | 0.0% | 200.0 | 1093.55 | 4230.9 | +| duration | Float | 3 | 33.3% | 12.0 | 21.0 | 24.0 | +| years | Int | 3 | 0.0% | 2 | 3.0 | 4 | +| flag | Float | 2 | 66.7% | 1.0 | 1.0 | 1.0 | +| comments | Char | 6 | 0.0% | 9 | 21.167 | 35 |