From 9677ff4ca394a298eabbcd66c007c5cf64c5abe0 Mon Sep 17 00:00:00 2001
From: John Hawkins <john.hawkins@john.hawkins.MacBook>
Date: Thu, 6 Aug 2020 10:59:36 +1000
Subject: [PATCH] Fix handling of columns natively inferred as Boolean

---
 README.md                    | 11 +++++++++--
 data/test.csv                | 14 +++++++-------
 dfsummarizer/dfsummarizer.py |  2 +-
 dfsummarizer/funcs.py        | 16 ++++++++++++----
 markdown_test.md             |  1 +
 setup.py                     |  5 +++++
 6 files changed, 35 insertions(+), 14 deletions(-)
diff --git a/README.md b/README.md
index 0c0bdbf..6c4e766 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ based on the implementation by [Javia Jinkal](https://github.com/javiajinkal/Fla
 This [review article by Phillip Gibbons](https://www.cs.cmu.edu/~gibbons/Phillip%20B.%20Gibbons_files/Distinct-Values-Estimation-over-Data-Streams-PBGibbons.pdf) gives a great overview of the alternatives.
 
 
-## Testing
+## Usage
 
 You can use this application multiple ways
 
@@ -55,12 +55,19 @@ Or simply install the package and use the command line application directly
 
 # Installation
 
-Installation from the source tree (or via pip from PyPI)::
+Installation from the source tree:
 
 ```
 python setup.py install
 ```
 
+Or install via pip from PyPI:
+
+```
+pip install dfsummarizer
+```
+
+
 Now, the ``dfsummarizer`` command is available::
 
 ```
diff --git a/data/test.csv b/data/test.csv
index 67652cb..bdaed71 100644
--- a/data/test.csv
+++ b/data/test.csv
@@ -1,7 +1,7 @@
-id,opening,first,state,balance,duration,years,flag,comments
-S001,2019-01-01,YES,NSW,230.40,24,2,,Simple transactions
-S002,2019-03-13,NO,QLD,4230.90,12,3,1,Temporary savings account
-S003,2019-06-09,YES,,900.00,24,4,,Combined savings account
-S004,2019-05-21,NO,VIC,500.00,24,4,,Holdings 
-S005,2019-07-12,NO,NSW,200.00,,2,1,Customer called to make a complaint
-S006,2019-03-25,,VIC,500.00,,3,,Unknown origin 
+id,opening,first,last,state,balance,duration,years,flag,comments
+S001,2019-01-01,YES,FALSE,NSW,230.40,24,2,,Simple transactions
+S002,2019-03-13,NO,,QLD,4230.90,12,3,1,Temporary savings account
+S003,2019-06-09,YES,FALSE,,900.00,24,4,,Combined savings account
+S004,2019-05-21,NO,,VIC,500.00,24,4,,Holdings 
+S005,2019-07-12,NO,TRUE,NSW,200.00,,2,1,Customer called to make a complaint
+S006,2019-03-25,,,VIC,500.00,,3,,Unknown origin 
diff --git a/dfsummarizer/dfsummarizer.py b/dfsummarizer/dfsummarizer.py
index 24416d4..9745dae 100644
--- a/dfsummarizer/dfsummarizer.py
+++ b/dfsummarizer/dfsummarizer.py
@@ -2,7 +2,7 @@
  
 """dfsummarizer.dfsummarizer: provides entry point main()."""
  
-__version__ = "0.1.1"
+__version__ = "0.1.2"
 
 import numpy as np
 import pandas as pd
diff --git a/dfsummarizer/funcs.py b/dfsummarizer/funcs.py
index 88ccaec..5bc5de7 100644
--- a/dfsummarizer/funcs.py
+++ b/dfsummarizer/funcs.py
@@ -243,10 +243,15 @@ def infer_type(thetype, unicount, uniques):
         valtype = "Date"
      if thetype == "<class 'pandas._libs.tslibs.timestamps.Timestamp'>" :
         valtype = "Date"
-     # Infer Booleans by 2 unique values and additional criteria
-     #print("Type: ", thetype)
-     if unicount == 2:
-        if (valtype == "Char") :
+     if thetype == "<class 'numpy.bool_'>":
+        valtype = "Bool"
+     if thetype == "<class 'bool'>":
+        valtype = "Bool"
+
+     # Additionally infer Booleans from Char columns with exactly 2 unique
+     # values that match a common pair of names (e.g. yes/no)
+     if (valtype == "Char") :
+         if unicount == 2:
             temp = [x.lower() for x in uniques if x is not None]
             temp.sort()
             if (temp == ['no', 'yes']):
@@ -289,8 +294,11 @@ def booleanize(x):
         return x
     elif x is None :
         return x
+    elif str(type(x)) == "<class 'bool'>": 
+        return x
     else :
         x = x.lower()
+
     if x == "yes" or x == "y" or x == "true" or x == "t" or x == 1:
         return 1
     else :
diff --git a/markdown_test.md b/markdown_test.md
index 2d1eac3..53af0b2 100644
--- a/markdown_test.md
+++ b/markdown_test.md
@@ -3,6 +3,7 @@
 | id       | Char   |  100.0% |    0.0% |          4 |        4.0 |          4 |
 | opening  | Date   |  100.0% |    0.0% | 2019-01-01 | 2019-04-18 | 2019-07-12 |
 | first    | Bool   |   33.3% |   16.7% |        0.0 |        0.4 |        1.0 |
+| last     | Bool   |   33.3% |   50.0% |          0 |      0.333 |          1 |
 | state    | Char   |   50.0% |   16.7% |        3.0 |        3.0 |        3.0 |
 | balance  | Float  |   83.3% |    0.0% |      200.0 |    1093.55 |     4230.9 |
 | duration | Float  |   50.0% |   33.3% |       12.0 |       21.0 |       24.0 |
diff --git a/setup.py b/setup.py
index a50397b..ec079a0 100644
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,11 @@
 with open("README.md", "rb") as f:
     long_descr = f.read().decode("utf-8")
 
+with open("markdown_test.md", "rb") as f:
+    example = f.read().decode("utf-8")
+
+long_descr = long_descr + "\n" + example
+
 setup(
     name = "dfsummarizer",
     packages = ["dfsummarizer"],