Skip to content

Commit

Permalink
Update 0.1.6
Browse files Browse the repository at this point in the history
- ANAI Open Source  Build 6

- Updated Documentation
- df_loader can now take kwargs related to pandas
they shall be given while creating ANAI objects in the form of the df_kwargs argument
- Added option to show graphs while explaining ANAI Models
- Fit method will run automatically if ANAI is run through anai.run(); if Regression or Classification are used separately, fit shall be called
- Explain method now returns its result in the form of a dataframe
- Removed Unnecessary Import from Predictor
- Added more error handling
- Now Preprocessor can be called without target var
- Stats Summary will drop columns which have 100% missing values
- Added support for legacy data loading. While calling anai.load(), if the legacy arg is True, it will return a pandas-loaded dataframe instead of modin.pandas

Signed-off-by: Arsh <[email protected]>
  • Loading branch information
d4rk-lucif3r committed Sep 23, 2022
1 parent 5de8127 commit 8119d93
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 33 deletions.
10 changes: 7 additions & 3 deletions anai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,14 @@ def __task(df, target):
return False


def load(df_filepath, **df_kwargs):
def load(df_filepath, legacy =False, **df_kwargs):
"""Loads a dataframe from a filepath.
Args:
df_filepath (str): Filepath of the dataframe to be loaded.
legacy (bool, optional): If True, loads the dataframe using pandas.read_csv.
If False, loads the dataframe using modin.pandas.read_csv.
Defaults to False.
df_kwargs (dict): Keyword arguments to be passed to df_loader function.
Returns:
Expand All @@ -307,12 +310,13 @@ def load(df_filepath, **df_kwargs):

suppress = False
if type(df_filepath) is str:
df = __df_loader_single(df_filepath, suppress=False, **df_kwargs)
df = __df_loader_single(df_filepath, suppress=False, legacy = legacy, **df_kwargs)
elif type(df_filepath) is list:
print(Fore.YELLOW + "Loading Data [*]\n")
df = pd.concat(
[
__df_loader_single(df_filepath[i], suppress=True, **df_kwargs)
__df_loader_single(
df_filepath[i], suppress=True, legacy=legacy, **df_kwargs)
for i in range(len(df_filepath))
]
)
Expand Down
9 changes: 5 additions & 4 deletions anai/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import category_encoders as ce
import matplotlib.pyplot as plt
import modin
import modin.pandas as pd
import pandas as pd
import numpy as np
import seaborn as sns
from anai.preprocessing import *
Expand All @@ -28,7 +28,7 @@ class Preprocessor:
def __init__(
self,
dataset,
target: str,
target: str = None,
except_columns: list = [],
):
""" Initialize the Preprocessor class.
Expand Down Expand Up @@ -56,8 +56,9 @@ def __init__(

self.encoder = Encoder()
self.scaler = Scaler()
self.features = self.__dataset.drop(self.target, axis=1)
self.labels = self.__dataset[self.target]
if self.target:
self.features = self.__dataset.drop(self.target, axis=1)
self.labels = self.__dataset[self.target]

def prepare(self, features, labels, test_size, random_state, smote, k_neighbors):
"""
Expand Down
52 changes: 33 additions & 19 deletions anai/preprocessing/statistics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
import modin.pandas as pd
from scipy.stats import shapiro
from dateutil.parser import parse
from fuzzywuzzy import fuzz
Expand Down Expand Up @@ -31,41 +30,55 @@ def is_date(string, fuzzy=False):
return True
except ValueError:
return False
except OverflowError:
return False
except Exception as e:
return False


def dtype(df, col):
if df[col].dtype == "O":
if not is_date(df[col].iloc[0]):
return "Categorical"
elif is_date(df[col].iloc[0]):
return "Time Series"
elif df[col].dtype == "int64" or df[col].dtype == "float64":
return "Numeric"
else:
try:
if df[col].dtype == "O":
if not is_date(df[col].iloc[0]):
return "Categorical"
elif is_date(df[col].iloc[0]):
return "Time Series"
elif df[col].dtype == "int64" or df[col].dtype == "float64":
return "Numeric"
else:
return "Unknown"
except:
return "Unknown"


def dtype_ver(df, col):
if df[col].dtype == "O":
if not is_date(df[col].iloc[0]):
return "Categorical", ""
elif is_date(df[col].iloc[0]):
return "Categorical", "Time Series"
elif df[col].dtype == "int64" or df[col].dtype == "float64":
return "Numeric", ""
else:
try:
if df[col].dtype == "O":
if not is_date(df[col].iloc[0]):
return "Categorical", ""
elif is_date(df[col].iloc[0]):
return "Categorical", "Time Series"
elif df[col].dtype == "int64" or df[col].dtype == "float64":
return "Numeric", ""
else:
return "Unknown", ""
except Exception as e:
return "Unknown", ""


def shap(df, col):
return "{:0.2f}".format(
float(shapiro(df[col])[0]) if df[col].dtype != "O" else "NA"
)


def most_frequent_values(df, col):
return (
df[col].value_counts()[:1].index.tolist()[0] if df[col].dtype == "O" else "NA"
df[col].value_counts()[:1].index.tolist()[
0] if df[col].dtype == "O" else "NA"
)


def column_stats_summary(df, col):
if "identi" in col.lower():
return {
Expand Down Expand Up @@ -159,7 +172,8 @@ def column_stats_summary(df, col):

def data_stats_summary(df):
anom = AnomalyDetector()
df2 = df.fillna(df.mean())
df2 = df.dropna(axis=1, how='all')
df2 = df2.fillna(df2.mean())
X = []
for i in df2.columns:
if dtype_ver(df2, i)[0] == "Numeric":
Expand Down
12 changes: 6 additions & 6 deletions anai/utils/connectors/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ def __df_loader_single(
obj=None,
objfilepath=None,
suppress=False,
df_kwargs = {}
legacy=False,
**df_kwargs
):
kwargs = df_kwargs
df = None
flag = 0
if obj is None:
Expand Down Expand Up @@ -62,17 +62,17 @@ def __df_loader_single(
print(
Fore.RED + "Data Loading Failed [", "\u2717", "]\n"
) if not suppress else None
return df
return df._to_pandas() if legacy else df


def df_loader(df_filepath, obj=None, objfilepath=None, suppress=False, df_kwargs={}):
def df_loader(df_filepath, obj=None, objfilepath=None, suppress=False, df_kwargs={}, legacy=False):
if type(df_filepath) is str:
df = __df_loader_single(df_filepath, obj, objfilepath, suppress, **df_kwargs)
df = __df_loader_single(df_filepath, obj, objfilepath, suppress, legacy, **df_kwargs)
elif type(df_filepath) is list:
print(Fore.YELLOW + "Loading Data [*]\n")
df = pd.concat(
[
__df_loader_single(df_filepath[i], obj, objfilepath, True, **df_kwargs)
__df_loader_single(df_filepath[i], obj, objfilepath, True,legacy, **df_kwargs)
for i in range(len(df_filepath))
]
)
Expand Down
5 changes: 5 additions & 0 deletions docs/Features.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
df = anai.load("data/bodyPerformance.csv", df_kwargs={"header": None})
prep = Preprocessor(dataset=df, target="class", except_columns=['weight_kg'])

### Data Loading
Load data from a file
df = anai.load("data/bodyPerformance.csv", df_kwargs={"header": None}, legacy=True)
Returns a pandas dataframe (with legacy=False it returns a modin.pandas dataframe)

### Available Preprocessing Methods
#### Data Summary
Gives a summary of the data.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
],
include=["anai.*", "anai"],
),
version="0.1.6-alpha-1",
version="0.1.6",
license="Apache License 2.0",
description="Automated ML",
url="https://github.com/Revca-ANAI/ANAI",
Expand Down

0 comments on commit 8119d93

Please sign in to comment.