From 9677ff4ca394a298eabbcd66c007c5cf64c5abe0 Mon Sep 17 00:00:00 2001 From: John Hawkins Date: Thu, 6 Aug 2020 10:59:36 +1000 Subject: [PATCH] Fixing problem with Columns natively inferred to be Boolean --- README.md | 11 +++++++++-- data/test.csv | 14 +++++++------- dfsummarizer/dfsummarizer.py | 2 +- dfsummarizer/funcs.py | 16 ++++++++++++---- markdown_test.md | 1 + setup.py | 5 +++++ 6 files changed, 35 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 0c0bdbf..6c4e766 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ based on the implementation by [Javia Jinkal](https://github.com/javiajinkal/Fla This [review article by Phillip Gibbons](https://www.cs.cmu.edu/~gibbons/Phillip%20B.%20Gibbons_files/Distinct-Values-Estimation-over-Data-Streams-PBGibbons.pdf) gives a great overview of the alternatives. -## Testing +## Usage You can use this application multiple ways @@ -55,12 +55,19 @@ Or simply install the package and use the command line application directly # Installation -Installation from the source tree (or via pip from PyPI):: +Installation from the source tree: ``` python setup.py install ``` +(or via pip from PyPI): + +``` +pip install dfsummarizer +``` + + Now, the ``dfsummarizer`` command is available:: ``` diff --git a/data/test.csv b/data/test.csv index 67652cb..bdaed71 100644 --- a/data/test.csv +++ b/data/test.csv @@ -1,7 +1,7 @@ -id,opening,first,state,balance,duration,years,flag,comments -S001,2019-01-01,YES,NSW,230.40,24,2,,Simple transactions -S002,2019-03-13,NO,QLD,4230.90,12,3,1,Temporary savings account -S003,2019-06-09,YES,,900.00,24,4,,Combined savings account -S004,2019-05-21,NO,VIC,500.00,24,4,,Holdings -S005,2019-07-12,NO,NSW,200.00,,2,1,Customer called to make a complaint -S006,2019-03-25,,VIC,500.00,,3,,Unknown origin +id,opening,first,last,state,balance,duration,years,flag,comments +S001,2019-01-01,YES,FALSE,NSW,230.40,24,2,,Simple transactions +S002,2019-03-13,NO,,QLD,4230.90,12,3,1,Temporary savings account +S003,2019-06-09,YES,FALSE,,900.00,24,4,,Combined savings account +S004,2019-05-21,NO,,VIC,500.00,24,4,,Holdings +S005,2019-07-12,NO,TRUE,NSW,200.00,,2,1,Customer called to make a complaint +S006,2019-03-25,,,VIC,500.00,,3,,Unknown origin diff --git a/dfsummarizer/dfsummarizer.py b/dfsummarizer/dfsummarizer.py index 24416d4..9745dae 100644 --- a/dfsummarizer/dfsummarizer.py +++ b/dfsummarizer/dfsummarizer.py @@ -2,7 +2,7 @@ """dfsummarizer.dfsummarizer: provides entry point main().""" -__version__ = "0.1.1" +__version__ = "0.1.2" import numpy as np import pandas as pd diff --git a/dfsummarizer/funcs.py b/dfsummarizer/funcs.py index 88ccaec..5bc5de7 100644 --- a/dfsummarizer/funcs.py +++ b/dfsummarizer/funcs.py @@ -243,10 +243,15 @@ def infer_type(thetype, unicount, uniques): valtype = "Date" if thetype == "" : valtype = "Date" - # Infer Booleans by 2 unique values and additional criteria - #print("Type: ", thetype) - if unicount == 2: - if (valtype == "Char") : + if thetype == "": + valtype = "Bool" + if thetype == "": + valtype = "Bool" + + # Additional Inference of Booleans by strings with 2 unique values + # and common names as additional criteria + if (valtype == "Char") : + if unicount == 2: temp = [x.lower() for x in uniques if x is not None] temp.sort() if (temp == ['no', 'yes']): @@ -289,8 +294,11 @@ def booleanize(x): return x elif x is None : return x + elif str(type(x)) == "": + return x else : x = x.lower() + if x == "yes" or x == "y" or x == "true" or x == "t" or x == 1: return 1 else : diff --git a/markdown_test.md b/markdown_test.md index 2d1eac3..53af0b2 100644 --- a/markdown_test.md +++ b/markdown_test.md @@ -3,6 +3,7 @@ | id | Char | 100.0% | 0.0% | 4 | 4.0 | 4 | | opening | Date | 100.0% | 0.0% | 2019-01-01 | 2019-04-18 | 2019-07-12 | | first | Bool | 33.3% | 16.7% | 0.0 | 0.4 | 1.0 | +| last | Bool | 33.3% | 50.0% | 0 | 0.333 | 1 | | state | Char | 50.0% | 16.7% | 3.0 | 3.0 | 3.0 | | balance | Float | 83.3% | 0.0% | 200.0 | 1093.55 | 4230.9 | | duration | Float | 50.0% | 33.3% | 12.0 | 21.0 | 24.0 | diff --git a/setup.py b/setup.py index a50397b..ec079a0 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,11 @@ with open("README.md", "rb") as f: long_descr = f.read().decode("utf-8") +with open("markdown_test.md", "rb") as f: + example = f.read().decode("utf-8") + +long_descr = long_descr + "\n" + example + setup( name = "dfsummarizer", packages = ["dfsummarizer"],