Skip to content

Commit

Permalink
Update 0.1.6
Browse files Browse the repository at this point in the history
- ANAI Open Source  Build 6

- Updated Documentation
- df_loader can now take kwargs related to pandas
they shall be given while creating ANAI objects in the form of the df_kwargs argument
- Added option to show graphs while explaining ANAI Models
- Fit method will run automatically if ANAI is run through anai.run(); if Regression or Classification are used separately, fit shall be called
- Explain method now returns its result in the form of a dataframe
- Removed Unnecessary Import from Predictor
- Added more error handling
- Now Preprocessor can be called without target var
- Stats Summary will drop columns which have 100% missing values
- Added support for legacy data loading. While calling anai.load(), if the legacy arg is True, it will return a pandas-loaded dataframe instead of modin.pandas

Signed-off-by: Arsh <[email protected]>
  • Loading branch information
d4rk-lucif3r committed Sep 23, 2022
1 parent 5de8127 commit 8119d93
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 33 deletions.
10 changes: 7 additions & 3 deletions anai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,14 @@ def __task(df, target):
return False


def load(df_filepath, **df_kwargs):
def load(df_filepath, legacy =False, **df_kwargs):
"""Loads a dataframe from a filepath.
Args:
df_filepath (str): Filepath of the dataframe to be loaded.
legacy (bool, optional): If True, loads the dataframe using pandas.read_csv.
If False, loads the dataframe using modin.pandas.read_csv.
Defaults to False.
df_kwargs (dict): Keyword arguments to be passed to df_loader function.
Returns:
Expand All @@ -307,12 +310,13 @@ def load(df_filepath, **df_kwargs):

suppress = False
if type(df_filepath) is str:
df = __df_loader_single(df_filepath, suppress=False, **df_kwargs)
df = __df_loader_single(df_filepath, suppress=False, legacy = legacy, **df_kwargs)
elif type(df_filepath) is list:
print(Fore.YELLOW + "Loading Data [*]\n")
df = pd.concat(
[
__df_loader_single(df_filepath[i], suppress=True, **df_kwargs)
__df_loader_single(
df_filepath[i], suppress=True, legacy=legacy, **df_kwargs)
for i in range(len(df_filepath))
]
)
Expand Down
9 changes: 5 additions & 4 deletions anai/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import category_encoders as ce
import matplotlib.pyplot as plt
import modin
import modin.pandas as pd
import pandas as pd
import numpy as np
import seaborn as sns
from anai.preprocessing import *
Expand All @@ -28,7 +28,7 @@ class Preprocessor:
def __init__(
self,
dataset,
target: str,
target: str = None,
except_columns: list = [],
):
""" Initialize the Preprocessor class.
Expand Down Expand Up @@ -56,8 +56,9 @@ def __init__(

self.encoder = Encoder()
self.scaler = Scaler()
self.features = self.__dataset.drop(self.target, axis=1)
self.labels = self.__dataset[self.target]
if self.target:
self.features = self.__dataset.drop(self.target, axis=1)
self.labels = self.__dataset[self.target]

def prepare(self, features, labels, test_size, random_state, smote, k_neighbors):
"""
Expand Down
52 changes: 33 additions & 19 deletions anai/preprocessing/statistics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import numpy as np
import modin.pandas as pd
from scipy.stats import shapiro
from dateutil.parser import parse
from fuzzywuzzy import fuzz
Expand Down Expand Up @@ -31,41 +30,55 @@ def is_date(string, fuzzy=False):
return True
except ValueError:
return False
except OverflowError:
return False
except Exception as e:
return False


def dtype(df, col):
if df[col].dtype == "O":
if not is_date(df[col].iloc[0]):
return "Categorical"
elif is_date(df[col].iloc[0]):
return "Time Series"
elif df[col].dtype == "int64" or df[col].dtype == "float64":
return "Numeric"
else:
try:
if df[col].dtype == "O":
if not is_date(df[col].iloc[0]):
return "Categorical"
elif is_date(df[col].iloc[0]):
return "Time Series"
elif df[col].dtype == "int64" or df[col].dtype == "float64":
return "Numeric"
else:
return "Unknown"
except:
return "Unknown"


def dtype_ver(df, col):
if df[col].dtype == "O":
if not is_date(df[col].iloc[0]):
return "Categorical", ""
elif is_date(df[col].iloc[0]):
return "Categorical", "Time Series"
elif df[col].dtype == "int64" or df[col].dtype == "float64":
return "Numeric", ""
else:
try:
if df[col].dtype == "O":
if not is_date(df[col].iloc[0]):
return "Categorical", ""
elif is_date(df[col].iloc[0]):
return "Categorical", "Time Series"
elif df[col].dtype == "int64" or df[col].dtype == "float64":
return "Numeric", ""
else:
return "Unknown", ""
except Exception as e:
return "Unknown", ""


def shap(df, col):
return "{:0.2f}".format(
float(shapiro(df[col])[0]) if df[col].dtype != "O" else "NA"
)


def most_frequent_values(df, col):
return (
df[col].value_counts()[:1].index.tolist()[0] if df[col].dtype == "O" else "NA"
df[col].value_counts()[:1].index.tolist()[
0] if df[col].dtype == "O" else "NA"
)


def column_stats_summary(df, col):
if "identi" in col.lower():
return {
Expand Down Expand Up @@ -159,7 +172,8 @@ def column_stats_summary(df, col):

def data_stats_summary(df):
anom = AnomalyDetector()
df2 = df.fillna(df.mean())
df2 = df.dropna(axis=1, how='all')
df2 = df2.fillna(df2.mean())
X = []
for i in df2.columns:
if dtype_ver(df2, i)[0] == "Numeric":
Expand Down
12 changes: 6 additions & 6 deletions anai/utils/connectors/data_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ def __df_loader_single(
obj=None,
objfilepath=None,
suppress=False,
df_kwargs = {}
legacy=False,
**df_kwargs
):
kwargs = df_kwargs
df = None
flag = 0
if obj is None:
Expand Down Expand Up @@ -62,17 +62,17 @@ def __df_loader_single(
print(
Fore.RED + "Data Loading Failed [", "\u2717", "]\n"
) if not suppress else None
return df
return df._to_pandas() if legacy else df


def df_loader(df_filepath, obj=None, objfilepath=None, suppress=False, df_kwargs={}):
def df_loader(df_filepath, obj=None, objfilepath=None, suppress=False, df_kwargs={}, legacy=False):
if type(df_filepath) is str:
df = __df_loader_single(df_filepath, obj, objfilepath, suppress, **df_kwargs)
df = __df_loader_single(df_filepath, obj, objfilepath, suppress, legacy, **df_kwargs)
elif type(df_filepath) is list:
print(Fore.YELLOW + "Loading Data [*]\n")
df = pd.concat(
[
__df_loader_single(df_filepath[i], obj, objfilepath, True, **df_kwargs)
__df_loader_single(df_filepath[i], obj, objfilepath, True,legacy, **df_kwargs)
for i in range(len(df_filepath))
]
)
Expand Down
5 changes: 5 additions & 0 deletions docs/Features.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
df = anai.load("data/bodyPerformance.csv", df_kwargs={"header": None})
prep = Preprocessor(dataset=df, target="class", except_columns=['weight_kg'])

### Data Loading
Load data from a file
df = anai.load("data/bodyPerformance.csv", df_kwargs={"header": None}, legacy=True)
Returns a pandas dataframe (with legacy=False it returns a modin.pandas dataframe)

### Available Preprocessing Methods
#### Data Summary
Gives a summary of the data.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
],
include=["anai.*", "anai"],
),
version="0.1.6-alpha-1",
version="0.1.6",
license="Apache License 2.0",
description="Automated ML",
url="https://github.com/Revca-ANAI/ANAI",
Expand Down

0 comments on commit 8119d93

Please sign in to comment.