Skip to content

Commit

Permalink
Fixing bug with large integers and changing unique vals output
Browse files: browse the repository at this point in the history
  • Loading branch information
john-hawkins committed Dec 31, 2020
1 parent a7290b2 commit 39b7c72
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 20 deletions.
2 changes: 1 addition & 1 deletion dfsummarizer/dfsummarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

"""dfsummarizer.dfsummarizer: provides entry point main()."""

__version__ = "0.1.3"
__version__ = "0.1.4"

import numpy as np
import pandas as pd
Expand Down
21 changes: 14 additions & 7 deletions dfsummarizer/funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def analyse_df(df):
colnames = df.columns
records = len(df)
df = coerce_dates(df)
rez = pd.DataFrame(columns=('Name', 'Type', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))

for name in colnames:
nacount = len(df[df[name].isna()])
Expand Down Expand Up @@ -61,6 +61,7 @@ def analyse_df(df):
values_to_add = {
'Name':name,
'Type':valtype,
'Unique Vals':unicount,
'Unique':unipercent,
'Nulls':napercent,
'Min':themin,
Expand Down Expand Up @@ -93,13 +94,14 @@ def analyse_df_in_chunks(path_to_file):

########################################################################################
def generate_final_summary(temp, total_chunks):
rez = pd.DataFrame(columns=('Name', 'Type', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
for name in temp.keys():
col = temp[name]
total = col['nulls'] + col['nonnulls']
unicount = col['uniques'].estimate()
if unicount > total:
uniprop = 1.0
unicount = total
else:
uniprop = unicount / total
unipercent = round(100 * uniprop, 1)
Expand All @@ -111,6 +113,7 @@ def generate_final_summary(temp, total_chunks):
values_to_add = {
'Name':name,
'Type': col['type'],
'Unique Vals':unicount,
'Unique':unipercent,
'Nulls':napercent,
'Min': col['min'],
Expand Down Expand Up @@ -321,12 +324,12 @@ def print_latex(summary):
print(" \\caption{Data Summary Table}")
print(" \\label{tab:table1}")
print(" \\begin{tabular}{l|l|r|r|r|r} ")
print(" \\textbf{Name} & \\textbf{Type} & \\textbf{Unique \%} & \\textbf{Nulls \%} & \\textbf{Min} & \\textbf{Mean} & \\textbf{Max}\\\\")
print(" \\textbf{Name} & \\textbf{Type} & \\textbf{Unique Vals \%} & \\textbf{Nulls \%} & \\textbf{Min} & \\textbf{Mean} & \\textbf{Max}\\\\")
print(" \\hline")
for i in range(len(summary)):
print(" ", summary.loc[i,"Name"],
"&", summary.loc[i,"Type"],
"&", summary.loc[i,"Unique"], "%"
"&", summary.loc[i,"Unique Vals"], "%"
"&", summary.loc[i,"Nulls"], "%"
"&", summary.loc[i,"Min"],
"&", summary.loc[i,"Mean"],
Expand Down Expand Up @@ -386,6 +389,9 @@ def get_padded_number(n):
return get_spaces(3 - after_decimal(n) + adjus) + str(n)+ " "
if (abs(n)<10000000):
return get_spaces(2 - after_decimal(n) + adjus) + str(n)+ " "
else:
number = "{:.2e}".format(n)
return get_spaces(2 + adjus) + number + " "
else:
return str(n) + " "

Expand All @@ -406,14 +412,15 @@ def print_markdown(s):
name_spacer = 6

print("| Name ", get_spaces(name_spacer-6),
"| Type | Unique | Nulls | Min | Mean | Max |", sep="")
"| Type | Unique Vals | Nulls | Min | Mean | Max |", sep="")
print("| ---- ", get_spaces(name_spacer-6),
"| ------ | ------- | ------- | --- | ---- | --- |", sep="")
"| ------ | ----------- | ------- | --- | ---- | --- |", sep="")
for i in range(len(s)):
print("| ", s.loc[i,"Name"],
get_spaces(name_spacer - len(s.loc[i,"Name"]) - 1 ),
"| ", s.loc[i,"Type"], get_type_spacer(s.loc[i,"Type"]),
"| ", get_percent_spacer(s.loc[i,"Unique"]), s.loc[i,"Unique"],"% ",
#"| ", get_percent_spacer(s.loc[i,"Unique"]), s.loc[i,"Unique"],"% ",
"| ", get_padded_number(s.loc[i,"Unique Vals"]),
"| ", get_percent_spacer(s.loc[i,"Nulls"]), s.loc[i,"Nulls"],"% ",
"| ", get_padded_number(s.loc[i,"Min"]),
"| ", get_padded_number(s.loc[i,"Mean"]),
Expand Down
24 changes: 12 additions & 12 deletions markdown_test.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
| Name | Type | Unique | Nulls | Min | Mean | Max |
| ---- | ------ | ------- | ------- | --- | ---- | --- |
| id | Char | 100.0% | 0.0% | 4 | 4.0 | 4 |
| opening | Date | 100.0% | 0.0% | 2019-01-01 | 2019-04-18 | 2019-07-12 |
| first | Bool | 33.3% | 16.7% | 0.0 | 0.4 | 1.0 |
| last | Bool | 33.3% | 50.0% | 0 | 0.333 | 1 |
| state | Char | 50.0% | 16.7% | 3.0 | 3.0 | 3.0 |
| balance | Float | 83.3% | 0.0% | 200.0 | 1093.55 | 4230.9 |
| duration | Float | 50.0% | 33.3% | 12.0 | 21.0 | 24.0 |
| years | Int | 50.0% | 0.0% | 2 | 3.0 | 4 |
| flag | Float | 33.3% | 66.7% | 1.0 | 1.0 | 1.0 |
| comments | Char | 100.0% | 0.0% | 9 | 21.167 | 35 |
| Name | Type | Unique Vals | Nulls | Min | Mean | Max |
| ---- | ------ | ----------- | ------- | --- | ---- | --- |
| id | Char | 6 | 0.0% | 4 | 4.0 | 4 |
| opening | Date | 6 | 0.0% | 2019-01-01 | 2019-04-18 | 2019-07-12 |
| first | Bool | 2 | 16.7% | 0.0 | 0.4 | 1.0 |
| last | Bool | 2 | 50.0% | 0 | 0.333 | 1 |
| state | Char | 3 | 16.7% | 3.0 | 3.0 | 3.0 |
| balance | Float | 5 | 0.0% | 200.0 | 1093.55 | 4230.9 |
| duration | Float | 3 | 33.3% | 12.0 | 21.0 | 24.0 |
| years | Int | 3 | 0.0% | 2 | 3.0 | 4 |
| flag | Float | 2 | 66.7% | 1.0 | 1.0 | 1.0 |
| comments | Char | 6 | 0.0% | 9 | 21.167 | 35 |

0 comments on commit 39b7c72

Please sign in to comment.