develop #35

Open
wants to merge 1 commit into master
1 change: 1 addition & 0 deletions scorecardpy/__init__.py
@@ -6,6 +6,7 @@
# from .info_ent_indx_gini import (ig, ie)
from scorecardpy.var_filter import var_filter
from scorecardpy.woebin import (woebin, woebin_ply, woebin_plot, woebin_adj)
from scorecardpy.woebinnum import woebin_num
from scorecardpy.perf import (perf_eva, perf_psi)
from scorecardpy.scorecard import (scorecard, scorecard_ply)

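With this line added to __init__.py, woebin_num becomes available at the package top level, next to woebin and perf_eva. A minimal sketch of the resulting import path (assuming this branch is installed):

import scorecardpy as sc
# woebin_num is now exported alongside woebin, woebin_ply, perf_eva, etc.
print(sc.woebin_num)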
27 changes: 24 additions & 3 deletions scorecardpy/condition_fun.py
@@ -105,7 +105,7 @@ def check_print_step(print_step):

# x variable
def x_variable(dat, y, x):
x_all = set(list(dat)).difference(set([y]))
x_all = dat.columns.difference([y]).tolist() #set(list(dat)).difference(set([y]))

if x is None:
x = x_all
@@ -125,7 +125,7 @@ def x_variable(dat, y, x):


# check breaks_list
def check_breaks_list(breaks_list, xs):
def check_breaks_list(breaks_list):
if breaks_list is not None:
# is string
if isinstance(breaks_list, str):
@@ -135,7 +135,6 @@ def check_breaks_list(breaks_list, xs):
raise Exception("Incorrect inputs; breaks_list should be a dict.")
return breaks_list


# check special_values
def check_special_values(special_values, xs):
if special_values is not None:
@@ -152,3 +151,25 @@ def check_special_values(special_values, xs):
raise Exception("Incorrect inputs; special_values should be a list or dict.")
return special_values

# check monotonic_variables
def check_monotonic_variables(dat,y,monotonic_variables):
variables = dat.columns.difference([y]).tolist()
if monotonic_variables:
# accept a single variable name or a list of names
if not isinstance(monotonic_variables,str) and not isinstance(monotonic_variables,list):
warnings.warn("Incorrect inputs; monotonic_variables should be a list or str.")
monotonic_variables = []
elif isinstance(monotonic_variables,str):
monotonic_variables = [monotonic_variables]

# keep only the monotonic variables that exist in the input data
if not set(monotonic_variables).issubset(set(variables)):
warnings.warn("Incorrect inputs; the variables {} do not exist in the input data.".format(set(i for i in monotonic_variables if i not in variables)))
monotonic_variables = [i for i in monotonic_variables if i in variables]
return monotonic_variables

# check dataFrame
def check_dat(dat):
if not isinstance(dat,pd.DataFrame):
raise Exception("Incorrect inputs; dat should be a DataFrame.")
return dat
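For reference, a quick sketch of how the new checks behave; the DataFrame and column names below are invented for illustration only:

import pandas as pd
from scorecardpy.condition_fun import check_monotonic_variables, check_dat

dat = pd.DataFrame({'age': [25, 40, 60], 'income': [3, 5, 8], 'creditability': [0, 1, 0]})
check_dat(dat)  # passes and returns the DataFrame unchanged
# a single string is wrapped into a list; unknown names are dropped with a warning
check_monotonic_variables(dat=dat, y='creditability', monotonic_variables='age')           # ['age']
check_monotonic_variables(dat=dat, y='creditability', monotonic_variables=['age', 'foo'])  # ['age'], warns about 'foo'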
Empty file.
147 changes: 147 additions & 0 deletions scorecardpy/monotonous/merge.py
@@ -0,0 +1,147 @@
# -*- coding: utf-8 -*-

from .monotonic import (_FeatureMonotone,_BinBadRate,_BadRateMonotone,_AssignBin)
import numpy as np

def monotone_merge(df, target, col):
'''
Merge the bins of variable col in df whose bad-sample rate is not monotone,
so that the bad rate of the merged bins becomes monotone.
:return: the merged binning scheme, as a list of lists of bin labels
'''
def _merge_matrix(m, i,j,k):
'''
:param m: matrix whose rows need to be merged
:param i,j: merge row i and row j
:param k: delete row k
:return: the merged matrix
'''
m[i, :] = m[i, :] + m[j, :]
m = np.delete(m, k, axis=0)
return m

def _merge_adjacent_rows(i, bad_by_bin_current, bins_list_current, not_monotone_count_current):
'''
:param i: merge row i with the previous row and with the next row, and compare which merge is better;
the criterion is that the merge should reduce the degree of non-monotonicity.
:param bad_by_bin_current: binning matrix before merging, containing the sample count, bad count and bad rate of each bin
:param bins_list_current: binning scheme before merging
:param not_monotone_count_current: number of non-monotone elements before merging
:return: the merged binning matrix, binning scheme, number of non-monotone elements, and the uniformity measure balance
'''
i_prev = i - 1
i_next = i + 1
bins_list = bins_list_current.copy()
bad_by_bin = bad_by_bin_current.copy()
# merge plan a: merge bin i with the previous bin
bad_by_bin2a = _merge_matrix(bad_by_bin.copy(), i_prev, i, i)
bad_by_bin2a[i_prev, -1] = bad_by_bin2a[i_prev, -2] / bad_by_bin2a[i_prev, -3]
not_monotone_count2a = _FeatureMonotone(bad_by_bin2a[:, -1])['count_of_nonmonotone']
# merge plan b: merge bin i with the next bin
bad_by_bin2b = _merge_matrix(bad_by_bin.copy(), i, i_next, i_next)
bad_by_bin2b[i, -1] = bad_by_bin2b[i, -2] / bad_by_bin2b[i, -3]
not_monotone_count2b = _FeatureMonotone(bad_by_bin2b[:, -1])['count_of_nonmonotone']
# balance = ((bad_by_bin[:, 1] / N).T * (bad_by_bin[:, 1] / N))[0, 0]
# balance is the sum over bins of (bin_count / N)**2; smaller values mean more evenly sized bins
balance_a = ((bad_by_bin2a[:, 1] / N).T * (bad_by_bin2a[:, 1] / N))[0, 0]
balance_b = ((bad_by_bin2b[:, 1] / N).T * (bad_by_bin2b[:, 1] / N))[0, 0]
# return plan a in either of these cases: (1) plan a reduces non-monotonicity while plan b does not; (2) both plans reduce it, but plan a gives more evenly sized bins than plan b
if not_monotone_count2a < not_monotone_count_current and not_monotone_count2b >= not_monotone_count_current or \
not_monotone_count2a < not_monotone_count_current and not_monotone_count2b < not_monotone_count_current and balance_a < balance_b:
bins_list[i_prev] = bins_list[i_prev] + bins_list[i]
bins_list.remove(bins_list[i])
bad_by_bin = bad_by_bin2a
not_monotone_count = not_monotone_count2a
balance = balance_a
# similarly, return plan b in either of these cases: (1) plan b reduces non-monotonicity while plan a does not; (2) both plans reduce it, but plan b gives more evenly sized bins than plan a
elif not_monotone_count2a >= not_monotone_count_current and not_monotone_count2b < not_monotone_count_current or \
not_monotone_count2a < not_monotone_count_current and not_monotone_count2b < not_monotone_count_current and balance_a > balance_b:
bins_list[i] = bins_list[i] + bins_list[i_next]
bins_list.remove(bins_list[i_next])
bad_by_bin = bad_by_bin2b
not_monotone_count = not_monotone_count2b
balance = balance_b
# if neither plan a nor plan b reduces non-monotonicity, return the more uniform merge
else:
if balance_a < balance_b:
bins_list[i_prev] = bins_list[i_prev] + bins_list[i]
bins_list.remove(bins_list[i])
bad_by_bin = bad_by_bin2a
not_monotone_count = not_monotone_count2a
balance = balance_a
else:
bins_list[i] = bins_list[i] + bins_list[i_next]
bins_list.remove(bins_list[i_next])
bad_by_bin = bad_by_bin2b
not_monotone_count = not_monotone_count2b
balance = balance_b
return {'bins_list': bins_list, 'bad_by_bin': bad_by_bin, 'not_monotone_count': not_monotone_count,
'balance': balance}


N = df.shape[0]
[badrate_bin, bad_by_bin] = _BinBadRate(df, col, target)
bins = list(bad_by_bin[col])
bins_list = [[i] for i in bins]
badRate = sorted(badrate_bin.items(), key=lambda x: x[0])
badRate = [i[1] for i in badRate]
not_monotone_count, not_monotone_position = _FeatureMonotone(badRate)['count_of_nonmonotone'], _FeatureMonotone(badRate)['index_of_nonmonotone']
# iteratively search for the best merge; stop once the bad rate is monotone or only 2 bins remain
while (not_monotone_count > 0 and len(bins_list)>2):
# while there are non-monotone bins, try the best merge for each of them in every iteration
all_possible_merging = []
for i in not_monotone_position:
merge_adjacent_rows = _merge_adjacent_rows(i, np.mat(bad_by_bin), bins_list, not_monotone_count)
all_possible_merging.append(merge_adjacent_rows)
balance_list = [i['balance'] for i in all_possible_merging]
not_monotone_count_new = [i['not_monotone_count'] for i in all_possible_merging]
# if no candidate merge reduces the current non-monotonicity, pick the most uniform one
if min(not_monotone_count_new) >= not_monotone_count:
best_merging_position = balance_list.index(min(balance_list))
# if several candidate merges reduce the non-monotonicity, also pick the most uniform among them
else:
better_merging_index = [i for i in range(len(not_monotone_count_new)) if not_monotone_count_new[i] < not_monotone_count]
better_balance = [balance_list[i] for i in better_merging_index]
best_balance_index = better_balance.index(min(better_balance))
best_merging_position = better_merging_index[best_balance_index]
bins_list = all_possible_merging[best_merging_position]['bins_list']
bad_by_bin = all_possible_merging[best_merging_position]['bad_by_bin']
not_monotone_count = all_possible_merging[best_merging_position]['not_monotone_count']
not_monotone_position = _FeatureMonotone(bad_by_bin[:, 3])['index_of_nonmonotone']
return bins_list


def monotonous_bin(df,col,cutOffPoints,target,special_values):

if special_values is None:
special_values = []
# monotonicity check
var_cutoff = {}
col1 = col + '_Bin' # column holding the assigned bin labels
df[col1] = df[col].map(lambda x: _AssignBin(x, cutOffPoints=cutOffPoints,
special_attribute=special_values))
BRM = _BadRateMonotone(df, col1, target, special_attribute=special_values) # True if the bad rate is already monotone
if not BRM:
# build the merge scheme for the non-monotone bins
if special_values == []:
bin_merged = monotone_merge(df, target, col1)
removed_index = []
for bin in bin_merged:
if len(bin) > 1:
indices = [int(b.replace('Bin ', '')) for b in bin]
removed_index = removed_index + indices[0:-1]
removed_point = [cutOffPoints[k] for k in removed_index]
for p in removed_point:
cutOffPoints.remove(p)
var_cutoff[col] = cutOffPoints
else:
cutOffPoints2 = [i for i in cutOffPoints if i not in special_values]
temp = df.loc[~df[col].isin(special_values)]
bin_merged = monotone_merge(temp, target, col1)
removed_index = []
for bin in bin_merged:
if len(bin) > 1:
indices = [int(b.replace('Bin ', '')) for b in bin]
removed_index = removed_index + indices[0:-1]
removed_point = [cutOffPoints2[k] for k in removed_index]
for p in removed_point:
cutOffPoints2.remove(p)
cutOffPoints2 = cutOffPoints2 + special_values
var_cutoff[col] = cutOffPoints2 # cut-off points after the monotonicity adjustment
return var_cutoff
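A rough usage sketch for monotonous_bin, mirroring how woebinnum.py (below) drives it; the toy data and cut-off points are invented, and the DataFrame passed in gains an extra 'score_Bin' column as a side effect:

import numpy as np
import pandas as pd
from scorecardpy.monotonous.merge import monotonous_bin

np.random.seed(0)
df = pd.DataFrame({'score': np.random.randint(0, 100, 500),
                   'label': np.random.randint(0, 2, 500)})
# candidate cut-off points from an initial binning
cut_off_points = [20.0, 40.0, 60.0, 80.0]
var_cutoff = monotonous_bin(df=df, col='score', cutOffPoints=cut_off_points,
                            target='label', special_values=[])
# empty dict if the bad rate was already monotone,
# otherwise {'score': [surviving cut-off points]}
print(var_cutoff)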
80 changes: 80 additions & 0 deletions scorecardpy/monotonous/monotonic.py
@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-

import pandas as pd


def _AssignBin(x, cutOffPoints, special_attribute=None):

'''
:param x: a single value of the variable
:param cutOffPoints: list, the binning result of the variable expressed as cut-off points
:param special_attribute: list, special values that do not take part in the binning
:return: the label of the bin that x falls into, counted from 0 (special values get negative labels)
'''
if special_attribute is None:
special_attribute = []
cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
numBin = len(cutOffPoints2)
if x in special_attribute:
i = special_attribute.index(x)+1
return 'Bin {}'.format(0-i)
if x<=cutOffPoints2[0]:
return 'Bin 0'
elif x > cutOffPoints2[-1]:
return 'Bin {}'.format(numBin)
else:
for i in range(0,numBin):
if cutOffPoints2[i] < x <= cutOffPoints2[i+1]:
return 'Bin {}'.format(i+1)


def _FeatureMonotone(x):
'''
:param x: list of values, e.g. the bad rate of each bin
:return: how many elements of x break monotonicity (local extrema) and their positions
'''
monotone = [x[i]<x[i+1] and x[i] < x[i-1] or x[i]>x[i+1] and x[i] > x[i-1] for i in range(1,len(x)-1)]
index_of_nonmonotone = [i+1 for i in range(len(monotone)) if monotone[i]]
return {'count_of_nonmonotone':monotone.count(True), 'index_of_nonmonotone':index_of_nonmonotone}


def _BinBadRate(df, col, target, grantRateIndicator=0):
'''
:param df: dataset on which the bad rate is computed
:param col: feature for which the bad rate is computed
:param target: good/bad label
:param grantRateIndicator: 1 to also return the overall bad rate, 0 otherwise
:return: the bad rate of each bin (dict and DataFrame), plus the overall bad rate when grantRateIndicator==1
'''
total = df.groupby([col])[target].count()
total = pd.DataFrame({'total': total})
bad = df.groupby([col])[target].sum()
bad = pd.DataFrame({'bad': bad})
regroup = total.merge(bad, left_index=True, right_index=True, how='left')
regroup.reset_index(drop=False, inplace=True)
regroup['bad_rate'] = regroup.apply(lambda x: x.bad / x.total, axis=1)
dicts = dict(zip(regroup[col],regroup['bad_rate']))
if grantRateIndicator==0:
return (dicts, regroup)
N = sum(regroup['total'])
B = sum(regroup['bad'])
overallRate = B * 1.0 / N
return (dicts, regroup, overallRate)

def _BadRateMonotone(df, sortByVar, target,special_attribute = []):
'''
:param df: DataFrame containing the variable to check and the target variable
:param sortByVar: variable whose bad rate is checked for monotonicity
:param target: target variable, 0 for good and 1 for bad
:param special_attribute: special values excluded from the check
:return: whether the bad rate is monotone (True/False)
'''
df2 = df.loc[~df[sortByVar].isin(special_attribute)]
if len(set(df2[sortByVar])) <= 2:
return True
regroup = _BinBadRate(df2, sortByVar, target)[1]
combined = zip(regroup['total'],regroup['bad'])
badRate = [x[1]*1.0/x[0] for x in combined]
badRateNotMonotone = _FeatureMonotone(badRate)['count_of_nonmonotone']
if badRateNotMonotone > 0:
return False
else:
return True
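To make the helper semantics concrete, a small worked example (values chosen arbitrarily; assumes the new scorecardpy.monotonous package is importable):

from scorecardpy.monotonous.monotonic import _AssignBin, _FeatureMonotone

# bins are labelled from 0; values above the last cut-off fall into the last bin,
# and special values get negative bin labels
_AssignBin(2,  cutOffPoints=[3, 7], special_attribute=[])    # 'Bin 0'
_AssignBin(5,  cutOffPoints=[3, 7], special_attribute=[])    # 'Bin 1'
_AssignBin(9,  cutOffPoints=[3, 7], special_attribute=[])    # 'Bin 2'
_AssignBin(-1, cutOffPoints=[3, 7], special_attribute=[-1])  # 'Bin -1'

# 0.3 is a local maximum and 0.2 a local minimum, so two elements break monotonicity
_FeatureMonotone([0.1, 0.3, 0.2, 0.4])
# {'count_of_nonmonotone': 2, 'index_of_nonmonotone': [1, 2]}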
88 changes: 88 additions & 0 deletions scorecardpy/woebinnum.py
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Binning of continuous variables (with bad-rate monotonicity testing)
"""
import warnings
import copy
from .woebin import woebin
from .monotonous.merge import monotonous_bin
from .condition_fun import (check_y,check_monotonic_variables,check_dat,check_breaks_list)

def woebin_num(dt, y, x=None, breaks_list=None, special_values=None, monotonic_variables = None,
min_perc_fine_bin=0.02, min_perc_coarse_bin=0.05,stop_limit=0.1, max_num_bin=8,
positive="bad|1", no_cores=None, print_step=0, method="tree"):

"""
WOE binning for numerical features, with support for bad-rate monotonicity testing
------
`woebin_num` performs optimal binning for numerical variables only and can test and enforce
bad-rate monotonicity per variable, using either tree-like segmentation or chi-square merging.
`woebin_num` can also apply customized breakpoints if breaks_list or special_values is provided.

Params
------
dt: A data frame with both x (predictor/feature) and y (response/label) variables.
y: Name of y variable.
x: Name of x variables. Default is NULL. If x is NULL,
then all variables except y are counted as x variables.
breaks_list: List of break points, default is NULL.
If it is not NULL, variable binning will be based on the
provided breaks.
special_values: the values specified in special_values
will be in separate bins. Default is NULL.
monotonic_variables: list of variables on which bad-rate monotonicity is tested and enforced.
min_perc_fine_bin: The minimum percentage of initial binning
class number over total. Accepted range: 0.01-0.2; default
is 0.02, which means initial binning into 50 fine bins for
continuous variables.
min_perc_coarse_bin: The minimum percentage of final binning
class number over total. Accepted range: 0.01-0.2; default
is 0.05.
stop_limit: Stop binning segmentation when information value
gain ratio less than the stop_limit, or stop binning merge
when the minimum of chi-square less than 'qchisq(1-stoplimit, 1)'.
Accepted range: 0-0.5; default is 0.1.
max_num_bin: Integer. The maximum number of binning.
positive: Value of positive class, default "bad|1".
no_cores: Number of CPU cores for parallel computation.
Defaults NULL. If no_cores is NULL, the no_cores will
set as 1 if length of x variables less than 10, and will
set as the number of all CPU cores if the length of x variables
greater than or equal to 10.
print_step: A non-negative integer. Default is 0. If print_step>0,
print variable names by each print_step-th iteration.
If print_step=0 or no_cores>1, no message is printed.
method: Optimal binning method, it should be "tree" or "chimerge".
Default is "tree".

Returns
------
dictionary
Optimal or customized binning dataframe.
"""
dt = copy.deepcopy(dt)
dt = check_dat(dt)
dt = check_y(dat=dt,y=y,positive=positive)
breaks_list = check_breaks_list(breaks_list)
monotonic_variables = check_monotonic_variables(dat=dt,y=y,monotonic_variables=monotonic_variables)
if not breaks_list:
breaks_list = dict()

if monotonic_variables:
for col in monotonic_variables:
# print("Checking bad-rate monotonicity for {} ...".format(col))
try:
cutOffPoints = woebin(dt=dt[[col,y]],y=y,breaks_list=breaks_list,special_values=special_values,min_perc_fine_bin=min_perc_fine_bin,
min_perc_coarse_bin=min_perc_coarse_bin,stop_limit=stop_limit,max_num_bin=max_num_bin,positive=positive,
no_cores=no_cores,print_step=print_step,method=method)[col]["breaks"].tolist()
cutOffPoints = [float(i) for i in cutOffPoints if str(i) not in ['inf', '-inf']]
# cut-off points adjusted by the monotonicity merge
mono_cutOffPoints = monotonous_bin(df=dt[[col,y]],col=col,cutOffPoints=cutOffPoints,target=y,special_values=special_values)
breaks_list.update(mono_cutOffPoints)
except Exception:
warnings.warn("Monotonic adjustment failed for {} ({} unique values).".format(col,len(dt[col].unique())))

bins = woebin(dt=dt,y=y,x=x,breaks_list=breaks_list,special_values=special_values,min_perc_fine_bin=min_perc_fine_bin,
min_perc_coarse_bin=min_perc_coarse_bin,stop_limit=stop_limit,max_num_bin=max_num_bin,positive=positive,
no_cores=no_cores,print_step=print_step,method=method)
return bins
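A minimal end-to-end sketch of how woebin_num could be called, assuming this branch is installed and using the germancredit sample data shipped with scorecardpy (column names as in that dataset):

import scorecardpy as sc

dat = sc.germancredit()
bins = sc.woebin_num(dat, y='creditability',
                     monotonic_variables=['age.in.years', 'duration.in.month'])
# same structure as sc.woebin(): a dict of binning DataFrames keyed by variable name
bins['age.in.years']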