From 39b7c7241947b40ff08642eca95c4345a4b37228 Mon Sep 17 00:00:00 2001
From: john-hawkins <hawkins.john.c@gmail.com>
Date: Thu, 31 Dec 2020 22:08:45 +1100
Subject: [PATCH] Fixing bug with large integers and changing unique vals
 output

---
 dfsummarizer/dfsummarizer.py |  2 +-
 dfsummarizer/funcs.py        | 21 ++++++++++++++-------
 markdown_test.md             | 24 ++++++++++++------------
 3 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/dfsummarizer/dfsummarizer.py b/dfsummarizer/dfsummarizer.py
index b7cc315..a79ca51 100644
--- a/dfsummarizer/dfsummarizer.py
+++ b/dfsummarizer/dfsummarizer.py
@@ -2,7 +2,7 @@
  
 """dfsummarizer.dfsummarizer: provides entry point main()."""
  
-__version__ = "0.1.3"
+__version__ = "0.1.4"
 
 import numpy as np
 import pandas as pd
diff --git a/dfsummarizer/funcs.py b/dfsummarizer/funcs.py
index 5bc5de7..baaaca1 100644
--- a/dfsummarizer/funcs.py
+++ b/dfsummarizer/funcs.py
@@ -24,7 +24,7 @@ def analyse_df(df):
     colnames = df.columns
     records = len(df)
     df = coerce_dates(df)
-    rez = pd.DataFrame(columns=('Name', 'Type', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
+    rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
 
     for name in colnames:
         nacount = len(df[df[name].isna()])
@@ -61,6 +61,7 @@ def analyse_df(df):
         values_to_add = {
             'Name':name, 
             'Type':valtype,
+            'Unique Vals':unicount, 
             'Unique':unipercent, 
             'Nulls':napercent, 
             'Min':themin, 
@@ -93,13 +94,14 @@ def analyse_df_in_chunks(path_to_file):
 
 ########################################################################################
 def generate_final_summary(temp, total_chunks):
-    rez = pd.DataFrame(columns=('Name', 'Type', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
+    rez = pd.DataFrame(columns=('Name', 'Type', 'Unique Vals', 'Unique', 'Nulls', 'Min', 'Mean', 'Max'))
     for name in temp.keys():
         col = temp[name]
         total = col['nulls'] + col['nonnulls']
         unicount = col['uniques'].estimate()
         if unicount > total:
             uniprop = 1.0
+            unicount = total
         else:
             uniprop = unicount / total
         unipercent = round(100 * uniprop, 1)
@@ -111,6 +113,7 @@ def generate_final_summary(temp, total_chunks):
         values_to_add = {
             'Name':name, 
             'Type': col['type'],
+            'Unique Vals':unicount,
             'Unique':unipercent,
             'Nulls':napercent,
             'Min': col['min'],
@@ -321,12 +324,12 @@ def print_latex(summary):
     print("   \\caption{Data Summary Table}")
     print("   \\label{tab:table1}")
     print("   \\begin{tabular}{l|l|r|r|r|r} ")
-    print("    \\textbf{Name} & \\textbf{Type} & \\textbf{Unique \%} & \\textbf{Nulls \%} & \\textbf{Min} & \\textbf{Mean} & \\textbf{Max}\\\\")
+    print("    \\textbf{Name} & \\textbf{Type} & \\textbf{Unique Vals} & \\textbf{Nulls \%} & \\textbf{Min} & \\textbf{Mean} & \\textbf{Max}\\\\")
     print("      \\hline")
     for i in range(len(summary)):
         print("      ", summary.loc[i,"Name"], 
               "&", summary.loc[i,"Type"], 
-              "&", summary.loc[i,"Unique"], "%" 
+              "&", summary.loc[i,"Unique Vals"], 
               "&", summary.loc[i,"Nulls"], "%" 
               "&", summary.loc[i,"Min"], 
               "&", summary.loc[i,"Mean"], 
@@ -386,6 +389,9 @@ def get_padded_number(n):
             return get_spaces(3 - after_decimal(n) + adjus) + str(n)+ " "
         if (abs(n)<10000000):
             return get_spaces(2 - after_decimal(n) + adjus) + str(n)+ " "
+        else:
+            number = "{:.2e}".format(n)
+            return get_spaces(2 + adjus) + number + " "
     else:
         return str(n) + " "
 
@@ -406,14 +412,15 @@ def print_markdown(s):
         name_spacer = 6
 
     print("| Name ", get_spaces(name_spacer-6), 
-        "| Type   | Unique  | Nulls   |  Min       |  Mean      |  Max       |", sep="")
+        "| Type   | Unique Vals | Nulls   |  Min       |  Mean      |  Max       |", sep="")
     print("| ---- ", get_spaces(name_spacer-6), 
-        "| ------ | ------- | ------- |  ---       |  ----      |  ---       |", sep="")
+        "| ------ | ----------- | ------- |  ---       |  ----      |  ---       |", sep="")
     for i in range(len(s)):
         print("| ", s.loc[i,"Name"], 
             get_spaces(name_spacer - len(s.loc[i,"Name"]) - 1 ), 
             "| ", s.loc[i,"Type"], get_type_spacer(s.loc[i,"Type"]),
-            "| ", get_percent_spacer(s.loc[i,"Unique"]), s.loc[i,"Unique"],"% ", 
+            #"| ", get_percent_spacer(s.loc[i,"Unique"]), s.loc[i,"Unique"],"% ", 
+            "|  ", get_padded_number(s.loc[i,"Unique Vals"]), 
             "| ", get_percent_spacer(s.loc[i,"Nulls"]), s.loc[i,"Nulls"],"% ", 
             "| ", get_padded_number(s.loc[i,"Min"]), 
             "| ", get_padded_number(s.loc[i,"Mean"]),
diff --git a/markdown_test.md b/markdown_test.md
index 53af0b2..7f0a5c2 100644
--- a/markdown_test.md
+++ b/markdown_test.md
@@ -1,12 +1,12 @@
-| Name     | Type   | Unique  | Nulls   |  Min       |  Mean      |  Max       |
-| ----     | ------ | ------- | ------- |  ---       |  ----      |  ---       |
-| id       | Char   |  100.0% |    0.0% |          4 |        4.0 |          4 |
-| opening  | Date   |  100.0% |    0.0% | 2019-01-01 | 2019-04-18 | 2019-07-12 |
-| first    | Bool   |   33.3% |   16.7% |        0.0 |        0.4 |        1.0 |
-| last     | Bool   |   33.3% |   50.0% |          0 |      0.333 |          1 |
-| state    | Char   |   50.0% |   16.7% |        3.0 |        3.0 |        3.0 |
-| balance  | Float  |   83.3% |    0.0% |      200.0 |    1093.55 |     4230.9 |
-| duration | Float  |   50.0% |   33.3% |       12.0 |       21.0 |       24.0 |
-| years    | Int    |   50.0% |    0.0% |          2 |        3.0 |          4 |
-| flag     | Float  |   33.3% |   66.7% |        1.0 |        1.0 |        1.0 |
-| comments | Char   |  100.0% |    0.0% |          9 |     21.167 |         35 |
+| Name     | Type   | Unique Vals | Nulls   |  Min       |  Mean      |  Max       |
+| ----     | ------ | ----------- | ------- |  ---       |  ----      |  ---       |
+| id       | Char   |           6 |    0.0% |          4 |        4.0 |          4 |
+| opening  | Date   |           6 |    0.0% | 2019-01-01 | 2019-04-18 | 2019-07-12 |
+| first    | Bool   |           2 |   16.7% |        0.0 |        0.4 |        1.0 |
+| last     | Bool   |           2 |   50.0% |          0 |      0.333 |          1 |
+| state    | Char   |           3 |   16.7% |        3.0 |        3.0 |        3.0 |
+| balance  | Float  |           5 |    0.0% |      200.0 |    1093.55 |     4230.9 |
+| duration | Float  |           3 |   33.3% |       12.0 |       21.0 |       24.0 |
+| years    | Int    |           3 |    0.0% |          2 |        3.0 |          4 |
+| flag     | Float  |           2 |   66.7% |        1.0 |        1.0 |        1.0 |
+| comments | Char   |           6 |    0.0% |          9 |     21.167 |         35 |