develop #35

Open
wants to merge 1 commit into master
1 change: 1 addition & 0 deletions scorecardpy/__init__.py
@@ -6,6 +6,7 @@
# from .info_ent_indx_gini import (ig, ie)
from scorecardpy.var_filter import var_filter
from scorecardpy.woebin import (woebin, woebin_ply, woebin_plot, woebin_adj)
from scorecardpy.woebinnum import woebin_num
from scorecardpy.perf import (perf_eva, perf_psi)
from scorecardpy.scorecard import (scorecard, scorecard_ply)

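With this line added to __init__.py, woebin_num becomes available at the package top level, next to woebin and perf_eva. A minimal sketch of the resulting import path (assuming this branch is installed):

import scorecardpy as sc
# woebin_num is now exported alongside woebin, woebin_ply, perf_eva, etc.
print(sc.woebin_num)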
27 changes: 24 additions & 3 deletions scorecardpy/condition_fun.py
@@ -105,7 +105,7 @@ def check_print_step(print_step):

# x variable
def x_variable(dat, y, x):
x_all = set(list(dat)).difference(set([y]))
x_all = dat.columns.difference([y]).tolist() #set(list(dat)).difference(set([y]))

if x is None:
x = x_all
@@ -125,7 +125,7 @@ def x_variable(dat, y, x):


# check breaks_list
def check_breaks_list(breaks_list, xs):
def check_breaks_list(breaks_list):
if breaks_list is not None:
# is string
if isinstance(breaks_list, str):
@@ -135,7 +135,6 @@ def check_breaks_list(breaks_list, xs):
raise Exception("Incorrect inputs; breaks_list should be a dict.")
return breaks_list


# check special_values
def check_special_values(special_values, xs):
if special_values is not None:
@@ -152,3 +151,25 @@ def check_special_values(special_values, xs):
raise Exception("Incorrect inputs; special_values should be a list or dict.")
return special_values

# check monotonic_variables
def check_monotonic_variables(dat,y,monotonic_variables):
variables = dat.columns.difference([y]).tolist()
if monotonic_variables:
# accept a single variable name or a list of names
if not isinstance(monotonic_variables,str) and not isinstance(monotonic_variables,list):
warnings.warn("Incorrect inputs; monotonic_variables should be a list or str.")
monotonic_variables = []
elif isinstance(monotonic_variables,str):
monotonic_variables = [monotonic_variables]

# keep only the monotonic variables that exist in the input data
if not set(monotonic_variables).issubset(set(variables)):
warnings.warn("Incorrect inputs; the variables {} do not exist in the input data.".format(set(i for i in monotonic_variables if i not in variables)))
monotonic_variables = [i for i in monotonic_variables if i in variables]
return monotonic_variables

# check dataFrame
def check_dat(dat):
if not isinstance(dat,pd.DataFrame):
raise Exception("Incorrect inputs; dat should be a DataFrame.")
return dat
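For reference, a quick sketch of how the new checks behave; the DataFrame and column names below are invented for illustration only:

import pandas as pd
from scorecardpy.condition_fun import check_monotonic_variables, check_dat

dat = pd.DataFrame({'age': [25, 40, 60], 'income': [3, 5, 8], 'creditability': [0, 1, 0]})
check_dat(dat)  # passes and returns the DataFrame unchanged
# a single string is wrapped into a list; unknown names are dropped with a warning
check_monotonic_variables(dat=dat, y='creditability', monotonic_variables='age')           # ['age']
check_monotonic_variables(dat=dat, y='creditability', monotonic_variables=['age', 'foo'])  # ['age'], warns about 'foo'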
Empty file.
147 changes: 147 additions & 0 deletions scorecardpy/monotonous/merge.py
@@ -0,0 +1,147 @@
# -*- coding: utf-8 -*-

from .monotonic import (_FeatureMonotone,_BinBadRate,_BadRateMonotone,_AssignBin)
import numpy as np

def monotone_merge(df, target, col):
'''
Merge the bins of variable col in df whose bad-sample rate is not monotone,
so that the bad rate of the merged bins becomes monotone.
:return: the merged binning scheme, as a list of lists of bin labels
'''
def _merge_matrix(m, i,j,k):
'''
:param m: matrix whose rows need to be merged
:param i,j: merge row i and row j
:param k: delete row k
:return: the merged matrix
'''
m[i, :] = m[i, :] + m[j, :]
m = np.delete(m, k, axis=0)
return m

def _merge_adjacent_rows(i, bad_by_bin_current, bins_list_current, not_monotone_count_current):
'''
:param i: merge row i with the previous row and with the next row, and compare which merge is better;
the criterion is that the merge should reduce the degree of non-monotonicity.
:param bad_by_bin_current: binning matrix before merging, containing the sample count, bad count and bad rate of each bin
:param bins_list_current: binning scheme before merging
:param not_monotone_count_current: number of non-monotone elements before merging
:return: the merged binning matrix, binning scheme, number of non-monotone elements, and the uniformity measure balance
'''
i_prev = i - 1
i_next = i + 1
bins_list = bins_list_current.copy()
bad_by_bin = bad_by_bin_current.copy()
# merge plan a: merge bin i with the previous bin
bad_by_bin2a = _merge_matrix(bad_by_bin.copy(), i_prev, i, i)
bad_by_bin2a[i_prev, -1] = bad_by_bin2a[i_prev, -2] / bad_by_bin2a[i_prev, -3]
not_monotone_count2a = _FeatureMonotone(bad_by_bin2a[:, -1])['count_of_nonmonotone']
# merge plan b: merge bin i with the next bin
bad_by_bin2b = _merge_matrix(bad_by_bin.copy(), i, i_next, i_next)
bad_by_bin2b[i, -1] = bad_by_bin2b[i, -2] / bad_by_bin2b[i, -3]
not_monotone_count2b = _FeatureMonotone(bad_by_bin2b[:, -1])['count_of_nonmonotone']
# balance = ((bad_by_bin[:, 1] / N).T * (bad_by_bin[:, 1] / N))[0, 0]
# balance is the sum over bins of (bin_count / N)**2; smaller values mean more evenly sized bins
balance_a = ((bad_by_bin2a[:, 1] / N).T * (bad_by_bin2a[:, 1] / N))[0, 0]
balance_b = ((bad_by_bin2b[:, 1] / N).T * (bad_by_bin2b[:, 1] / N))[0, 0]
# return plan a in either of these cases: (1) plan a reduces non-monotonicity while plan b does not; (2) both plans reduce it, but plan a gives more evenly sized bins than plan b
if not_monotone_count2a < not_monotone_count_current and not_monotone_count2b >= not_monotone_count_current or \
not_monotone_count2a < not_monotone_count_current and not_monotone_count2b < not_monotone_count_current and balance_a < balance_b:
bins_list[i_prev] = bins_list[i_prev] + bins_list[i]
bins_list.remove(bins_list[i])
bad_by_bin = bad_by_bin2a
not_monotone_count = not_monotone_count2a
balance = balance_a
# similarly, return plan b in either of these cases: (1) plan b reduces non-monotonicity while plan a does not; (2) both plans reduce it, but plan b gives more evenly sized bins than plan a
elif not_monotone_count2a >= not_monotone_count_current and not_monotone_count2b < not_monotone_count_current or \
not_monotone_count2a < not_monotone_count_current and not_monotone_count2b < not_monotone_count_current and balance_a > balance_b:
bins_list[i] = bins_list[i] + bins_list[i_next]
bins_list.remove(bins_list[i_next])
bad_by_bin = bad_by_bin2b
not_monotone_count = not_monotone_count2b
balance = balance_b
# if neither plan a nor plan b reduces non-monotonicity, return the more uniform merge
else:
if balance_a < balance_b:
bins_list[i_prev] = bins_list[i_prev] + bins_list[i]
bins_list.remove(bins_list[i])
bad_by_bin = bad_by_bin2a
not_monotone_count = not_monotone_count2a
balance = balance_a
else:
bins_list[i] = bins_list[i] + bins_list[i_next]
bins_list.remove(bins_list[i_next])
bad_by_bin = bad_by_bin2b
not_monotone_count = not_monotone_count2b
balance = balance_b
return {'bins_list': bins_list, 'bad_by_bin': bad_by_bin, 'not_monotone_count': not_monotone_count,
'balance': balance}


N = df.shape[0]
[badrate_bin, bad_by_bin] = _BinBadRate(df, col, target)
bins = list(bad_by_bin[col])
bins_list = [[i] for i in bins]
badRate = sorted(badrate_bin.items(), key=lambda x: x[0])
badRate = [i[1] for i in badRate]
not_monotone_count, not_monotone_position = _FeatureMonotone(badRate)['count_of_nonmonotone'], _FeatureMonotone(badRate)['index_of_nonmonotone']
# iteratively search for the best merge; stop once the bad rate is monotone or only 2 bins remain
while (not_monotone_count > 0 and len(bins_list)>2):
# while there are non-monotone bins, try the best merge for each of them in every iteration
all_possible_merging = []
for i in not_monotone_position:
merge_adjacent_rows = _merge_adjacent_rows(i, np.mat(bad_by_bin), bins_list, not_monotone_count)
all_possible_merging.append(merge_adjacent_rows)
balance_list = [i['balance'] for i in all_possible_merging]
not_monotone_count_new = [i['not_monotone_count'] for i in all_possible_merging]
# if no candidate merge reduces the current non-monotonicity, pick the most uniform one
if min(not_monotone_count_new) >= not_monotone_count:
best_merging_position = balance_list.index(min(balance_list))
# if several candidate merges reduce the non-monotonicity, also pick the most uniform among them
else:
better_merging_index = [i for i in range(len(not_monotone_count_new)) if not_monotone_count_new[i] < not_monotone_count]
better_balance = [balance_list[i] for i in better_merging_index]
best_balance_index = better_balance.index(min(better_balance))
best_merging_position = better_merging_index[best_balance_index]
bins_list = all_possible_merging[best_merging_position]['bins_list']
bad_by_bin = all_possible_merging[best_merging_position]['bad_by_bin']
not_monotone_count = all_possible_merging[best_merging_position]['not_monotone_count']
not_monotone_position = _FeatureMonotone(bad_by_bin[:, 3])['index_of_nonmonotone']
return bins_list


def monotonous_bin(df,col,cutOffPoints,target,special_values):

if special_values is None:
special_values = []
# monotonicity check
var_cutoff = {}
col1 = col + '_Bin' # column holding the assigned bin labels
df[col1] = df[col].map(lambda x: _AssignBin(x, cutOffPoints=cutOffPoints,
special_attribute=special_values))
BRM = _BadRateMonotone(df, col1, target, special_attribute=special_values) # True if the bad rate is already monotone
if not BRM:
# build the merge scheme for the non-monotone bins
if special_values == []:
bin_merged = monotone_merge(df, target, col1)
removed_index = []
for bin in bin_merged:
if len(bin) > 1:
indices = [int(b.replace('Bin ', '')) for b in bin]
removed_index = removed_index + indices[0:-1]
removed_point = [cutOffPoints[k] for k in removed_index]
for p in removed_point:
cutOffPoints.remove(p)
var_cutoff[col] = cutOffPoints
else:
cutOffPoints2 = [i for i in cutOffPoints if i not in special_values]
temp = df.loc[~df[col].isin(special_values)]
bin_merged = monotone_merge(temp, target, col1)
removed_index = []
for bin in bin_merged:
if len(bin) > 1:
indices = [int(b.replace('Bin ', '')) for b in bin]
removed_index = removed_index + indices[0:-1]
removed_point = [cutOffPoints2[k] for k in removed_index]
for p in removed_point:
cutOffPoints2.remove(p)
cutOffPoints2 = cutOffPoints2 + special_values
var_cutoff[col] = cutOffPoints2 # cut-off points after the monotonicity adjustment
return var_cutoff
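A rough usage sketch for monotonous_bin, mirroring how woebinnum.py (below) drives it; the toy data and cut-off points are invented, and the DataFrame passed in gains an extra 'score_Bin' column as a side effect:

import numpy as np
import pandas as pd
from scorecardpy.monotonous.merge import monotonous_bin

np.random.seed(0)
df = pd.DataFrame({'score': np.random.randint(0, 100, 500),
                   'label': np.random.randint(0, 2, 500)})
# candidate cut-off points from an initial binning
cut_off_points = [20.0, 40.0, 60.0, 80.0]
var_cutoff = monotonous_bin(df=df, col='score', cutOffPoints=cut_off_points,
                            target='label', special_values=[])
# empty dict if the bad rate was already monotone,
# otherwise {'score': [surviving cut-off points]}
print(var_cutoff)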
80 changes: 80 additions & 0 deletions scorecardpy/monotonous/monotonic.py
@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-

import pandas as pd


def _AssignBin(x, cutOffPoints, special_attribute=None):

'''
:param x: a single value of the variable
:param cutOffPoints: list, the binning result of the variable expressed as cut-off points
:param special_attribute: list, special values that do not take part in the binning
:return: the label of the bin that x falls into, counted from 0 (special values get negative labels)
'''
if special_attribute is None:
special_attribute = []
cutOffPoints2 = [i for i in cutOffPoints if i not in special_attribute]
numBin = len(cutOffPoints2)
if x in special_attribute:
i = special_attribute.index(x)+1
return 'Bin {}'.format(0-i)
if x<=cutOffPoints2[0]:
return 'Bin 0'
elif x > cutOffPoints2[-1]:
return 'Bin {}'.format(numBin)
else:
for i in range(0,numBin):
if cutOffPoints2[i] < x <= cutOffPoints2[i+1]:
return 'Bin {}'.format(i+1)


def _FeatureMonotone(x):
'''
:param x: list of values, e.g. the bad rate of each bin
:return: how many elements of x break monotonicity (local extrema) and their positions
'''
monotone = [x[i]<x[i+1] and x[i] < x[i-1] or x[i]>x[i+1] and x[i] > x[i-1] for i in range(1,len(x)-1)]
index_of_nonmonotone = [i+1 for i in range(len(monotone)) if monotone[i]]
return {'count_of_nonmonotone':monotone.count(True), 'index_of_nonmonotone':index_of_nonmonotone}


def _BinBadRate(df, col, target, grantRateIndicator=0):
'''
:param df: dataset on which the bad rate is computed
:param col: feature for which the bad rate is computed
:param target: good/bad label
:param grantRateIndicator: 1 to also return the overall bad rate, 0 otherwise
:return: the bad rate of each bin (dict and DataFrame), plus the overall bad rate when grantRateIndicator==1
'''
total = df.groupby([col])[target].count()
total = pd.DataFrame({'total': total})
bad = df.groupby([col])[target].sum()
bad = pd.DataFrame({'bad': bad})
regroup = total.merge(bad, left_index=True, right_index=True, how='left')
regroup.reset_index(drop=False, inplace=True)
regroup['bad_rate'] = regroup.apply(lambda x: x.bad / x.total, axis=1)
dicts = dict(zip(regroup[col],regroup['bad_rate']))
if grantRateIndicator==0:
return (dicts, regroup)
N = sum(regroup['total'])
B = sum(regroup['bad'])
overallRate = B * 1.0 / N
return (dicts, regroup, overallRate)

def _BadRateMonotone(df, sortByVar, target,special_attribute = []):
'''
:param df: DataFrame containing the variable to check and the target variable
:param sortByVar: variable whose bad rate is checked for monotonicity
:param target: target variable, 0 for good and 1 for bad
:param special_attribute: special values excluded from the check
:return: whether the bad rate is monotone (True/False)
'''
df2 = df.loc[~df[sortByVar].isin(special_attribute)]
if len(set(df2[sortByVar])) <= 2:
return True
regroup = _BinBadRate(df2, sortByVar, target)[1]
combined = zip(regroup['total'],regroup['bad'])
badRate = [x[1]*1.0/x[0] for x in combined]
badRateNotMonotone = _FeatureMonotone(badRate)['count_of_nonmonotone']
if badRateNotMonotone > 0:
return False
else:
return True
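To make the helper semantics concrete, a small worked example (values chosen arbitrarily; assumes the new scorecardpy.monotonous package is importable):

from scorecardpy.monotonous.monotonic import _AssignBin, _FeatureMonotone

# bins are labelled from 0; values above the last cut-off fall into the last bin,
# and special values get negative bin labels
_AssignBin(2,  cutOffPoints=[3, 7], special_attribute=[])    # 'Bin 0'
_AssignBin(5,  cutOffPoints=[3, 7], special_attribute=[])    # 'Bin 1'
_AssignBin(9,  cutOffPoints=[3, 7], special_attribute=[])    # 'Bin 2'
_AssignBin(-1, cutOffPoints=[3, 7], special_attribute=[-1])  # 'Bin -1'

# 0.3 is a local maximum and 0.2 a local minimum, so two elements break monotonicity
_FeatureMonotone([0.1, 0.3, 0.2, 0.4])
# {'count_of_nonmonotone': 2, 'index_of_nonmonotone': [1, 2]}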
88 changes: 88 additions & 0 deletions scorecardpy/woebinnum.py
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Binning of continuous variables (with bad-rate monotonicity testing)
"""
import warnings
import copy
from .woebin import woebin
from .monotonous.merge import monotonous_bin
from .condition_fun import (check_y,check_monotonic_variables,check_dat,check_breaks_list)

def woebin_num(dt, y, x=None, breaks_list=None, special_values=None, monotonic_variables = None,
min_perc_fine_bin=0.02, min_perc_coarse_bin=0.05,stop_limit=0.1, max_num_bin=8,
positive="bad|1", no_cores=None, print_step=0, method="tree"):

"""
WOE binning for numerical features, with support for bad-rate monotonicity testing
------
`woebin_num` performs optimal binning for numerical variables only and can test and enforce
bad-rate monotonicity per variable, using either tree-like segmentation or chi-square merging.
`woebin_num` can also apply customized breakpoints if breaks_list or special_values is provided.

Params
------
dt: A data frame with both x (predictor/feature) and y (response/label) variables.
y: Name of y variable.
x: Name of x variables. Default is NULL. If x is NULL,
then all variables except y are counted as x variables.
breaks_list: List of break points, default is NULL.
If it is not NULL, variable binning will be based on the
provided breaks.
special_values: the values specified in special_values
will be in separate bins. Default is NULL.
monotonic_variables: list of variables on which bad-rate monotonicity is tested and enforced.
min_perc_fine_bin: The minimum percentage of initial binning
class number over total. Accepted range: 0.01-0.2; default
is 0.02, which means initial binning into 50 fine bins for
continuous variables.
min_perc_coarse_bin: The minimum percentage of final binning
class number over total. Accepted range: 0.01-0.2; default
is 0.05.
stop_limit: Stop binning segmentation when information value
gain ratio less than the stop_limit, or stop binning merge
when the minimum of chi-square less than 'qchisq(1-stoplimit, 1)'.
Accepted range: 0-0.5; default is 0.1.
max_num_bin: Integer. The maximum number of binning.
positive: Value of positive class, default "bad|1".
no_cores: Number of CPU cores for parallel computation.
Defaults NULL. If no_cores is NULL, the no_cores will
set as 1 if length of x variables less than 10, and will
set as the number of all CPU cores if the length of x variables
greater than or equal to 10.
print_step: A non-negative integer. Default is 0. If print_step>0,
print variable names by each print_step-th iteration.
If print_step=0 or no_cores>1, no message is printed.
method: Optimal binning method, it should be "tree" or "chimerge".
Default is "tree".

Returns
------
dictionary
Optimal or customized binning dataframe.
"""
dt = copy.deepcopy(dt)
dt = check_dat(dt)
dt = check_y(dat=dt,y=y,positive=positive)
breaks_list = check_breaks_list(breaks_list)
monotonic_variables = check_monotonic_variables(dat=dt,y=y,monotonic_variables=monotonic_variables)
if not breaks_list:
breaks_list = dict()

if monotonic_variables:
for col in monotonic_variables:
# print("Checking bad-rate monotonicity for {} ...".format(col))
try:
cutOffPoints = woebin(dt=dt[[col,y]],y=y,breaks_list=breaks_list,special_values=special_values,min_perc_fine_bin=min_perc_fine_bin,
min_perc_coarse_bin=min_perc_coarse_bin,stop_limit=stop_limit,max_num_bin=max_num_bin,positive=positive,
no_cores=no_cores,print_step=print_step,method=method)[col]["breaks"].tolist()
cutOffPoints = [float(i) for i in cutOffPoints if str(i) not in ['inf', '-inf']]
# cut-off points adjusted by the monotonicity merge
mono_cutOffPoints = monotonous_bin(df=dt[[col,y]],col=col,cutOffPoints=cutOffPoints,target=y,special_values=special_values)
breaks_list.update(mono_cutOffPoints)
except Exception:
warnings.warn("Monotonic adjustment failed for {} ({} unique values).".format(col,len(dt[col].unique())))

bins = woebin(dt=dt,y=y,x=x,breaks_list=breaks_list,special_values=special_values,min_perc_fine_bin=min_perc_fine_bin,
min_perc_coarse_bin=min_perc_coarse_bin,stop_limit=stop_limit,max_num_bin=max_num_bin,positive=positive,
no_cores=no_cores,print_step=print_step,method=method)
return bins
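A minimal end-to-end sketch of how woebin_num could be called, assuming this branch is installed and using the germancredit sample data shipped with scorecardpy (column names as in that dataset):

import scorecardpy as sc

dat = sc.germancredit()
bins = sc.woebin_num(dat, y='creditability',
                     monotonic_variables=['age.in.years', 'duration.in.month'])
# same structure as sc.woebin(): a dict of binning DataFrames keyed by variable name
bins['age.in.years']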