特征工程 | 温书

数据集载入

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Read the GBK-encoded raw export into a DataFrame.
data_path = u"2019-08-01_金融数据描述_data1.csv"
df = pd.read_csv(data_path, encoding='gbk')

# Print per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4754 entries, 0 to 4753
Data columns (total 90 columns):
Unnamed: 0                                    4754 non-null int64
custid                                        4754 non-null int64
trade_no                                      4754 non-null object
bank_card_no                                  4754 non-null object
low_volume_percent                            4752 non-null float64
middle_volume_percent                         4752 non-null float64
take_amount_in_later_12_month_highest         4754 non-null int64
trans_amount_increase_rate_lately             4751 non-null float64
trans_activity_month                          4752 non-null float64
trans_activity_day                            4752 non-null float64
transd_mcc                                    4752 non-null float64
trans_days_interval_filter                    4746 non-null float64
trans_days_interval                           4752 non-null float64
regional_mobility                             4752 non-null float64
student_feature                               1756 non-null float64
repayment_capability                          4754 non-null int64
is_high_user                                  4754 non-null int64
number_of_trans_from_2011                     4752 non-null float64
first_transaction_time                        4752 non-null float64
historical_trans_amount                       4754 non-null int64
historical_trans_day                          4752 non-null float64
rank_trad_1_month                             4752 non-null float64
trans_amount_3_month                          4754 non-null int64
avg_consume_less_12_valid_month               4752 non-null float64
abs                                           4754 non-null int64
top_trans_count_last_1_month                  4752 non-null float64
avg_price_last_12_month                       4754 non-null int64
avg_price_top_last_12_valid_month             4650 non-null float64
reg_preference_for_trad                       4752 non-null object
trans_top_time_last_1_month                   4746 non-null float64
trans_top_time_last_6_month                   4746 non-null float64
consume_top_time_last_1_month                 4746 non-null float64
consume_top_time_last_6_month                 4746 non-null float64
cross_consume_count_last_1_month              4328 non-null float64
trans_fail_top_count_enum_last_1_month        4738 non-null float64
trans_fail_top_count_enum_last_6_month        4738 non-null float64
trans_fail_top_count_enum_last_12_month       4738 non-null float64
consume_mini_time_last_1_month                4728 non-null float64
max_cumulative_consume_later_1_month          4754 non-null int64
max_consume_count_later_6_month               4746 non-null float64
railway_consume_count_last_12_month           4742 non-null float64
pawns_auctions_trusts_consume_last_1_month    4754 non-null int64
pawns_auctions_trusts_consume_last_6_month    4754 non-null int64
jewelry_consume_count_last_6_month            4742 non-null float64
status                                        4754 non-null int64
source                                        4754 non-null object
first_transaction_day                         4752 non-null float64
trans_day_last_12_month                       4752 non-null float64
id_name                                       4478 non-null object
apply_score                                   4450 non-null float64
apply_credibility                             4450 non-null float64
query_org_count                               4450 non-null float64
query_finance_count                           4450 non-null float64
query_cash_count                              4450 non-null float64
query_sum_count                               4450 non-null float64
latest_query_time                             4450 non-null object
latest_one_month_apply                        4450 non-null float64
latest_three_month_apply                      4450 non-null float64
latest_six_month_apply                        4450 non-null float64
loans_score                                   4457 non-null float64
loans_credibility_behavior                    4457 non-null float64
loans_count                                   4457 non-null float64
loans_settle_count                            4457 non-null float64
loans_overdue_count                           4457 non-null float64
loans_org_count_behavior                      4457 non-null float64
consfin_org_count_behavior                    4457 non-null float64
loans_cash_count                              4457 non-null float64
latest_one_month_loan                         4457 non-null float64
latest_three_month_loan                       4457 non-null float64
latest_six_month_loan                         4457 non-null float64
history_suc_fee                               4457 non-null float64
history_fail_fee                              4457 non-null float64
latest_one_month_suc                          4457 non-null float64
latest_one_month_fail                         4457 non-null float64
loans_long_time                               4457 non-null float64
loans_latest_time                             4457 non-null object
loans_credit_limit                            4457 non-null float64
loans_credibility_limit                       4457 non-null float64
loans_org_count_current                       4457 non-null float64
loans_product_count                           4457 non-null float64
loans_max_limit                               4457 non-null float64
loans_avg_limit                               4457 non-null float64
consfin_credit_limit                          4457 non-null float64
consfin_credibility                           4457 non-null float64
consfin_org_count_current                     4457 non-null float64
consfin_product_count                         4457 non-null float64
consfin_max_limit                             4457 non-null float64
consfin_avg_limit                             4457 non-null float64
latest_query_day                              4450 non-null float64
loans_latest_day                              4457 non-null float64
dtypes: float64(70), int64(13), object(7)
memory usage: 3.3+ MB

特征预处理

删除无用

# Columns this walkthrough discards before modelling (section "remove useless
# columns"). Per the df.info() output, student_feature is mostly missing
# (1756 of 4754 non-null).
delete = [
    'Unnamed: 0', 'custid', 'trade_no', 'bank_card_no', 'id_name',
    'latest_query_time', 'source', 'loans_latest_time',
    'first_transaction_time', 'student_feature',
]
df = df.drop(delete, axis=1)

处理分类型特征

from sklearn.preprocessing import LabelEncoder

# Replace the categorical strings with integer codes. astype(str) converts
# missing values to the literal string 'nan', so they receive a code of their
# own instead of making the encoder fail on mixed types.
df['reg_preference_for_trad'] = LabelEncoder().fit_transform(
    df['reg_preference_for_trad'].astype(str)
)

使用众数填充

# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; sklearn.impute.SimpleImputer is its replacement.
from sklearn.impute import SimpleImputer

# Fill the missing values of every column with that column's most frequent value.
imp_mode = SimpleImputer(strategy='most_frequent')
for col in df.columns:
    # fit_transform expects 2-D input and returns an (n, 1) array; flatten it
    # before writing it back as a column.
    df[col] = imp_mode.fit_transform(df[[col]]).ravel()

特征衍生/升维

特征衍生是指用原始数据进行特征学习得到新的特征。衍生特征一般由两种原因引起：数据自身的变化，使数据中出现很多原来没有的特征；进行特征学习时，算法根据特征之间的某种关系，产生了衍生特征，有时衍生特征更能反映数据特征之间的关系。衍生特征也要求机器学习和深度学习算法拥有更强的学习能力，即增量学习、在线学习、迁移学习。

衍生特征相对于原始特征能够更好地反映特征与数据的关系，因此对于某些数据来说这是极为重要的一个步骤。观察本数据，发现可以从中衍生出以下几个特征，而 latest_one_month_suc、latest_one_month_fail、latest_six_month_loan 等则已经是处理好的特征：

查询内容占比

# Share of finance-related and cash-loan-related queries among all queries.
# NOTE(review): a row with query_sum_count == 0 would yield NaN/inf here —
# confirm no such rows exist before feeding these columns to a model.
df['query_finance_percent'] = df['query_finance_count'] / df['query_sum_count']
df['query_cash_percent'] = df['query_cash_count'] / df['query_sum_count']

每单平均交易量

# Average amount per transaction: total historical amount / number of transactions.
# NOTE(review): a zero number_of_trans_from_2011 would yield NaN/inf — confirm.
df['per_avg_amount'] = df['historical_trans_amount'] / df['number_of_trans_from_2011']

每天平均交易量

# Average amount per trading day: total historical amount / number of trading days.
# NOTE(review): a zero historical_trans_day would yield NaN/inf — confirm.
df['avg_amount_perday'] = df['historical_trans_amount'] / df['historical_trans_day']

特征筛选/降维

特征选择( Feature Selection )也称特征子集选择( Feature Subset Selection , FSS )，或属性选择( Attribute Selection )。是指从已有的M个特征(Feature)中选择N个特征使得系统的特定指标最优化，是从原始特征中选择出一些最有效特征以降低数据集维度的过程,是提高学习算法性能的一个重要手段,也是模式识别中关键的数据预处理步骤。对于一个学习算法来说,好的学习样本是训练模型的关键。

共线性分析

共线性问题指的是输入的自变量之间存在较高的线性相关度。共线性问题会导致回归模型的稳定性和准确性大大降低，另外，过多无关的维度计算也很浪费时间。

变量出现共线性的原因：

数据样本不够，导致共线性存在偶然性，这其实反映了缺少数据对于数据建模的影响，共线性仅仅是影响的一部分
多个变量都基于时间有共同或相反的演变趋势，例如春节期间的网络销售量和销售额都相对于正常时间有下降趋势。
多个变量存在一定的推移关系，但总体上变量间的趋势一致，只是发生的时间点不一致，例如广告费用和销售额之间，通常是品牌广告先进行大范围的曝光和信息推送，经过一定时间传播之后，才会在销售额上做出反映。
多变量之间存在线性的关系。例如y代表访客数，用x代表展示广告费用，那么二者的关系很可能是y=2*x + b

如何检验共线性：

容忍度（Tolerance）：容忍度是每个自变量作为因变量对其他自变量进行回归建模时得到的残差比例，大小用1减得到的决定系数来表示。容忍度值越小说明这个自变量与其他自变量间越可能存在共线性问题。
方差膨胀因子：VIF是容忍度的倒数，值越大则共线性问题越明显，通常以10作为判断边界。当VIF<10,不存在多重共线性；当10<=VIF<100,存在较强的多重共线性；当VIF>=100, 存在严重多重共线性。
特征值（Eigenvalue）：该方法实际上就是对自变量做主成分分析，如果多个维度的特征值等于0，则可能有比较严重的共线性。
相关系数：如果相关系数R>0.8时就可能存在较强相关性

如何处理共线性：

增大样本量：增大样本量可以消除由于数据量不足而出现的偶然的共线性现象，在可行的前提下这种方法是需要优先考虑的
岭回归法（Ridge Regression）：实际上是一种改良最小二乘估计法。通过放弃最小二乘法的无偏性，以损失部分信息、降低精度为代价来获得更实际和可靠性更强的回归系数。因此岭回归在存在较强共线性的回归应用中较为常用。
逐步回归法（Stepwise Regression）:每次引入一个自变量进行统计检验，然后逐步引入其他变量，同时对所有变量的回归系数进行检验，如果原来引入的变量由于后面变量的引入而变得不再显著，那么就将其剔除，逐步得到最优回归方程。
主成分回归（Principal Components Regression）:通过主成分分析，将原始参与建模的变量转换为少数几个主成分，每个主成分是原变量的线性组合，然后基于主成分做回归分析，这样也可以在不丢失重要数据特征的前提下避开共线性问题。
人工去除：结合人工经验，对自变量进行删减，但是对操作者的业务能力、经验有很高的要求。

1
2
3

# Pairwise correlations of all columns, flattened into a Series indexed by
# (feature_a, feature_b).
corr_matrix = df.corr().unstack()

# Keep strongly correlated pairs (|r| > 0.8) but drop |r| == 1 entries such as
# each feature's correlation with itself.
strong = corr_matrix.abs() > 0.8
not_perfect = corr_matrix.abs() != 1
corr_matrix[strong & not_perfect]

trans_activity_day                       historical_trans_day                       0.855777
historical_trans_amount                  per_avg_amount                             0.870396
historical_trans_day                     trans_activity_day                         0.855777
rank_trad_1_month                        top_trans_count_last_1_month               0.855890
top_trans_count_last_1_month             rank_trad_1_month                          0.855890
trans_top_time_last_1_month              consume_top_time_last_1_month              0.925978
trans_top_time_last_6_month              consume_top_time_last_6_month              0.921067
consume_top_time_last_1_month            trans_top_time_last_1_month                0.925978
consume_top_time_last_6_month            trans_top_time_last_6_month                0.921067
trans_fail_top_count_enum_last_6_month   trans_fail_top_count_enum_last_12_month    0.886180
trans_fail_top_count_enum_last_12_month  trans_fail_top_count_enum_last_6_month     0.886180
apply_score                              loans_score                                0.967695
query_org_count                          query_finance_count                        0.881597
                                         query_cash_count                           0.850818
                                         query_sum_count                            0.948918
                                         latest_three_month_apply                   0.825973
                                         latest_six_month_apply                     0.883612
query_finance_count                      query_org_count                            0.881597
                                         query_sum_count                            0.858778
                                         latest_six_month_apply                     0.817063
query_cash_count                         query_org_count                            0.850818
                                         query_sum_count                            0.804468
query_sum_count                          query_org_count                            0.948918
                                         query_finance_count                        0.858778
                                         query_cash_count                           0.804468
                                         latest_three_month_apply                   0.878368
                                         latest_six_month_apply                     0.942820
latest_one_month_apply                   latest_three_month_apply                   0.873954
                                         latest_six_month_apply                     0.810162
latest_three_month_apply                 query_org_count                            0.825973
                                                                                      ...   
latest_six_month_loan                    loans_settle_count                         0.847959
                                         loans_org_count_behavior                   0.840518
                                         loans_cash_count                           0.804668
                                         history_suc_fee                            0.805410
                                         loans_org_count_current                    0.804668
                                         loans_product_count                        0.808454
history_suc_fee                          loans_count                                0.911063
                                         loans_settle_count                         0.930297
                                         loans_org_count_behavior                   0.826103
                                         latest_six_month_loan                      0.805410
loans_org_count_current                  loans_count                                0.892128
                                         loans_settle_count                         0.874915
                                         loans_org_count_behavior                   0.944990
                                         latest_six_month_loan                      0.804668
                                         loans_product_count                        0.993636
loans_product_count                      loans_count                                0.891898
                                         loans_settle_count                         0.873971
                                         loans_org_count_behavior                   0.939593
                                         loans_cash_count                           0.993636
                                         latest_six_month_loan                      0.808454
                                         loans_org_count_current                    0.993636
consfin_credit_limit                     consfin_avg_limit                          0.910600
consfin_org_count_current                loans_org_count_behavior                   0.804413
                                         consfin_product_count                      0.988156
consfin_product_count                    consfin_org_count_behavior                 0.988156
                                         consfin_org_count_current                  0.988156
consfin_avg_limit                        consfin_credit_limit                       0.910600
per_avg_amount                           historical_trans_amount                    0.870396
                                         avg_amount_perday                          0.850940
avg_amount_perday                        per_avg_amount                             0.850940
Length: 100, dtype: float64

去掉方差较小的特征

方差阈值（VarianceThreshold）是特征选择的一个简单方法，去掉那些方差没有达到阈值的特征。默认情况下，只删除零方差的特征，即在所有样本上取值都相同的特征。假设我们有一个有布尔特征的数据集，然后我们想去掉那些超过80%的样本都是0（或者1）的特征。布尔特征是伯努利随机变量，方差为 p(1-p)，因此阈值可以取 0.8 * (1 - 0.8) = 0.16。

# Feature matrix (every column except the label) and the label vector.
# drop() already returns a new frame, so no df[:] copy is needed first.
X = df.drop("status", axis=1)
y = df["status"]

from sklearn.feature_selection import VarianceThreshold

# Drop features whose variance is below 0.8 * (1 - 0.8) = 0.16, i.e. the
# variance of a boolean feature that takes the same value in 80% of samples.
# NOTE(review): X also holds non-boolean columns on very different scales, so
# this one threshold is not comparable across them — confirm it is intended.
sp = VarianceThreshold(threshold=0.8 * 0.2)

# Fit once and reduce X to the retained columns (fit_transform already fits,
# so a separate .fit() beforehand is redundant).
X_result = sp.fit_transform(X, y)

# Boolean mask over X's columns: True = feature kept.
sp.get_support()

array([False, False,  True,  True, False, False,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True], dtype=bool)

from sklearn.feature_selection import VarianceThreshold

# Drop features whose variance is below 0.8 * (1 - 0.8) = 0.16, i.e. the
# variance of a boolean feature that takes the same value in 80% of samples.
# NOTE(review): X also holds non-boolean columns on very different scales, so
# this one threshold is not comparable across them — confirm it is intended.
sp = VarianceThreshold(threshold=0.8 * 0.2)

# Fit once and reduce X to the retained columns (fit_transform already fits,
# so a separate .fit() beforehand is redundant).
X_result = sp.fit_transform(X, y)

# Boolean mask over X's columns: True = feature kept.
sp.get_support()

array([False, False,  True,  True, False, False,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True], dtype=bool)

单变量特征选择

单变量的特征选择是通过基于单变量的统计测试来选择最好的特征，它可以当做是评估器的预处理步骤。

Scikit-learn 将特征选择的内容作为实现了 transform 方法的对象

SelectKBest移除那些除了评分最高的 K 个特征之外的所有特征
SelectPercentile移除除了用户指定的最高得分百分比之外的所有特征

这些对象将得分函数作为输入，返回单变量的得分和 p 值:

对于回归: f_regression , mutual_info_regression
对于分类: chi2 , f_classif , mutual_info_classif 可自行查看官网API文档。

from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif

# Score each feature against y with f_classif and keep the highest-scoring
# 90 percent of the features.
sp = SelectPercentile(score_func=f_classif, percentile=90)

# Fit the selector, then reduce X to the selected columns.
X_result = sp.fit(X, y).transform(X)

# Boolean mask over X's columns: True = feature kept.
sp.get_support()

# The same mask shown next to the column names.
pd.DataFrame({'columns': X.columns, 'filter': sp.get_support()})

	columns	filter
0	low_volume_percent	True
1	middle_volume_percent	True
2	take_amount_in_later_12_month_highest	False
3	trans_amount_increase_rate_lately	False
4	trans_activity_month	True
5	trans_activity_day	True
6	transd_mcc	False
7	trans_days_interval_filter	False
8	trans_days_interval	True
9	regional_mobility	True
10	repayment_capability	True
11	is_high_user	True
12	number_of_trans_from_2011	True
13	historical_trans_amount	True
14	historical_trans_day	True
15	rank_trad_1_month	True
16	trans_amount_3_month	True
17	avg_consume_less_12_valid_month	True
18	abs	True
19	top_trans_count_last_1_month	True
20	avg_price_last_12_month	True
21	avg_price_top_last_12_valid_month	True
22	reg_preference_for_trad	True
23	trans_top_time_last_1_month	True
24	trans_top_time_last_6_month	True
25	consume_top_time_last_1_month	True
26	consume_top_time_last_6_month	True
27	cross_consume_count_last_1_month	True
28	trans_fail_top_count_enum_last_1_month	True
29	trans_fail_top_count_enum_last_6_month	True
...	...	...
53	loans_overdue_count	True
54	loans_org_count_behavior	True
55	consfin_org_count_behavior	True
56	loans_cash_count	True
57	latest_one_month_loan	True
58	latest_three_month_loan	True
59	latest_six_month_loan	True
60	history_suc_fee	True
61	history_fail_fee	True
62	latest_one_month_suc	True
63	latest_one_month_fail	True
64	loans_long_time	True
65	loans_credit_limit	True
66	loans_credibility_limit	True
67	loans_org_count_current	True
68	loans_product_count	True
69	loans_max_limit	True
70	loans_avg_limit	False
71	consfin_credit_limit	True
72	consfin_credibility	True
73	consfin_org_count_current	True
74	consfin_product_count	True
75	consfin_max_limit	True
76	consfin_avg_limit	True
77	latest_query_day	True
78	loans_latest_day	True
79	query_finance_percent	True
80	query_cash_percent	True
81	per_avg_amount	True
82	avg_amount_perday	True

83 rows × 2 columns

基于 L1 的特征选取

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# L1-penalised linear SVM used as the selection model.
# NOTE(review): X is not scaled before fitting; L1 coefficients depend on
# feature scale — confirm this is acceptable for the selection.
lsvc = LinearSVC(penalty="l1", C=0.01, dual=False)
lsvc.fit(X, y)

# prefit=True: reuse the already-fitted model instead of refitting it.
sp = SelectFromModel(lsvc, prefit=True)
X_new = sp.transform(X)

# Boolean mask over X's columns: True = feature kept.
sp.get_support()

# The same mask shown next to the column names.
pd.DataFrame({'columns': X.columns, 'filter': sp.get_support()})

	columns	filter
0	low_volume_percent	False
1	middle_volume_percent	False
2	take_amount_in_later_12_month_highest	False
3	trans_amount_increase_rate_lately	True
4	trans_activity_month	False
5	trans_activity_day	False
6	transd_mcc	True
7	trans_days_interval_filter	True
8	trans_days_interval	True
9	regional_mobility	False
10	repayment_capability	False
11	is_high_user	False
12	number_of_trans_from_2011	True
13	historical_trans_amount	False
14	historical_trans_day	True
15	rank_trad_1_month	False
16	trans_amount_3_month	False
17	avg_consume_less_12_valid_month	False
18	abs	False
19	top_trans_count_last_1_month	True
20	avg_price_last_12_month	True
21	avg_price_top_last_12_valid_month	False
22	reg_preference_for_trad	False
23	trans_top_time_last_1_month	True
24	trans_top_time_last_6_month	True
25	consume_top_time_last_1_month	False
26	consume_top_time_last_6_month	True
27	cross_consume_count_last_1_month	True
28	trans_fail_top_count_enum_last_1_month	True
29	trans_fail_top_count_enum_last_6_month	True
...	...	...
53	loans_overdue_count	True
54	loans_org_count_behavior	False
55	consfin_org_count_behavior	False
56	loans_cash_count	False
57	latest_one_month_loan	False
58	latest_three_month_loan	False
59	latest_six_month_loan	True
60	history_suc_fee	True
61	history_fail_fee	True
62	latest_one_month_suc	True
63	latest_one_month_fail	True
64	loans_long_time	True
65	loans_credit_limit	True
66	loans_credibility_limit	True
67	loans_org_count_current	True
68	loans_product_count	False
69	loans_max_limit	False
70	loans_avg_limit	True
71	consfin_credit_limit	False
72	consfin_credibility	True
73	consfin_org_count_current	False
74	consfin_product_count	True
75	consfin_max_limit	False
76	consfin_avg_limit	False
77	latest_query_day	True
78	loans_latest_day	True
79	query_finance_percent	False
80	query_cash_percent	False
81	per_avg_amount	False
82	avg_amount_perday	True

83 rows × 2 columns

基于决策树的特征选取

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

# Despite its name, `forest` is a single decision tree split on entropy.
# NOTE(review): no random_state is set, so the fitted tree (and therefore the
# selected features) may differ between runs — confirm.
forest = DecisionTreeClassifier(criterion='entropy')
forest.fit(X, y)

# prefit=True: reuse the already-fitted tree instead of refitting it.
sp = SelectFromModel(forest, prefit=True)
X_new = sp.transform(X)

# Boolean mask over X's columns: True = feature kept.
sp.get_support()

# The same mask shown next to the column names.
pd.DataFrame({'columns': X.columns, 'filter': sp.get_support()})

	columns	filter
0	low_volume_percent	False
1	middle_volume_percent	True
2	take_amount_in_later_12_month_highest	True
3	trans_amount_increase_rate_lately	True
4	trans_activity_month	True
5	trans_activity_day	True
6	transd_mcc	True
7	trans_days_interval_filter	False
8	trans_days_interval	True
9	regional_mobility	False
10	repayment_capability	True
11	is_high_user	False
12	number_of_trans_from_2011	True
13	historical_trans_amount	False
14	historical_trans_day	True
15	rank_trad_1_month	False
16	trans_amount_3_month	True
17	avg_consume_less_12_valid_month	False
18	abs	True
19	top_trans_count_last_1_month	False
20	avg_price_last_12_month	False
21	avg_price_top_last_12_valid_month	False
22	reg_preference_for_trad	False
23	trans_top_time_last_1_month	False
24	trans_top_time_last_6_month	False
25	consume_top_time_last_1_month	False
26	consume_top_time_last_6_month	False
27	cross_consume_count_last_1_month	False
28	trans_fail_top_count_enum_last_1_month	True
29	trans_fail_top_count_enum_last_6_month	False
...	...	...
53	loans_overdue_count	False
54	loans_org_count_behavior	False
55	consfin_org_count_behavior	False
56	loans_cash_count	False
57	latest_one_month_loan	False
58	latest_three_month_loan	False
59	latest_six_month_loan	False
60	history_suc_fee	True
61	history_fail_fee	True
62	latest_one_month_suc	False
63	latest_one_month_fail	True
64	loans_long_time	True
65	loans_credit_limit	False
66	loans_credibility_limit	False
67	loans_org_count_current	False
68	loans_product_count	False
69	loans_max_limit	False
70	loans_avg_limit	False
71	consfin_credit_limit	False
72	consfin_credibility	False
73	consfin_org_count_current	False
74	consfin_product_count	False
75	consfin_max_limit	True
76	consfin_avg_limit	False
77	latest_query_day	True
78	loans_latest_day	False
79	query_finance_percent	True
80	query_cash_percent	True
81	per_avg_amount	True
82	avg_amount_perday	True

83 rows × 2 columns

# Rank the features by tree importance (0 = most important).
importance = forest.feature_importances_

# order[k] is the column index of the k-th most important feature ...
order = np.argsort(importance)[::-1]
# ... so invert that permutation to get imp_result[j] = rank of column j.
# Storing `order` itself in the table would attach ranks to the wrong rows.
imp_result = np.empty_like(order)
imp_result[order] = np.arange(len(order))

# NOTE: the table printed below this snippet was produced before this fix, so
# its 'importance' column holds argsort positions rather than per-feature ranks.
pd.DataFrame({'columns':X.columns,'filter':sp.get_support(),'importance':imp_result}).sort_values("importance",ascending=True)

	columns	filter	importance
63	latest_one_month_fail	True	0
30	trans_fail_top_count_enum_last_12_month	False	1
25	consume_top_time_last_1_month	False	2
29	trans_fail_top_count_enum_last_6_month	False	3
8	trans_days_interval	True	4
19	top_trans_count_last_1_month	False	5
18	abs	True	6
53	loans_overdue_count	False	7
21	avg_price_top_last_12_valid_month	False	8
55	consfin_org_count_behavior	False	9
9	regional_mobility	False	10
77	latest_query_day	True	11
24	trans_top_time_last_6_month	False	12
74	consfin_product_count	False	13
28	trans_fail_top_count_enum_last_1_month	True	14
37	jewelry_consume_count_last_6_month	False	15
6	transd_mcc	True	16
69	loans_max_limit	False	17
4	trans_activity_month	True	18
67	loans_org_count_current	False	19
32	max_cumulative_consume_later_1_month	True	20
75	consfin_max_limit	True	21
65	loans_credit_limit	False	22
44	query_cash_count	False	23
57	latest_one_month_loan	False	24
39	trans_day_last_12_month	True	25
33	max_consume_count_later_6_month	False	26
82	avg_amount_perday	True	27
0	low_volume_percent	False	28
59	latest_six_month_loan	False	29
...	...	...	...
51	loans_count	False	53
47	latest_three_month_apply	False	54
58	latest_three_month_loan	False	55
73	consfin_org_count_current	False	56
81	per_avg_amount	True	57
70	loans_avg_limit	False	58
56	loans_cash_count	False	59
14	historical_trans_day	True	60
1	middle_volume_percent	True	61
36	pawns_auctions_trusts_consume_last_6_month	True	62
16	trans_amount_3_month	True	63
3	trans_amount_increase_rate_lately	True	64
42	query_org_count	False	65
50	loans_credibility_behavior	False	66
62	latest_one_month_suc	False	67
41	apply_credibility	False	68
43	query_finance_count	False	69
48	latest_six_month_apply	True	70
34	railway_consume_count_last_12_month	False	71
45	query_sum_count	False	72
76	consfin_avg_limit	False	73
60	history_suc_fee	True	74
22	reg_preference_for_trad	False	75
66	loans_credibility_limit	False	76
27	cross_consume_count_last_1_month	False	77
38	first_transaction_day	False	78
12	number_of_trans_from_2011	True	79
13	historical_trans_amount	False	80
10	repayment_capability	True	81
5	trans_activity_day	True	82

83 rows × 3 columns

WOE

全称是“Weight of Evidence”，即证据权重。WOE是对原始自变量的一种编码形式。
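要对一个变量进行WOE编码，需要首先把这个变量进行分组处理（也叫离散化、分箱等等，说的都是一个意思）。分组后，对于第i组，WOE的计算公式如下：

$$WOE_i = \ln\left(\frac{py_i}{pn_i}\right) = \ln\left(\frac{\#y_i / \#y_T}{\#n_i / \#n_T}\right)$$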

其中，pyi是这个组中响应客户（风险模型中，对应的是违约客户，总之，指的是模型中预测变量取值为“是”或者说1的个体）占所有样本中所有响应客户的比例，pni是这个组中未响应客户占样本中所有未响应客户的比例，#yi是这个组中响应客户的数量，#ni是这个组中未响应客户的数量，#yT是样本中所有响应客户的数量，#nT是样本中所有未响应客户的数量。

从这个公式中我们可以体会到，WOE表示的实际上是“当前分组中响应客户占所有响应客户的比例”和“当前分组中没有响应的客户占所有没有响应的客户的比例”的差异。

对这个公式做一个简单变换，可以得到：

$$WOE_i = \ln\left(\frac{\#y_i / \#n_i}{\#y_T / \#n_T}\right)$$

变换以后我们可以看出，WOE也可以这么理解，他表示的是当前这个组中响应的客户和未响应客户的比值，和所有样本中这个比值的差异。这个差异是用这两个比值的比值，再取对数来表示的。WOE越大，这种差异越大，这个分组里的样本响应的可能性就越大，WOE越小，差异越小，这个分组里的样本响应的可能性就越小。

IV

IV的全称是Information Value，中文意思是信息价值，或者信息量。

我们在用逻辑回归、决策树等模型方法构建分类模型时，经常需要对自变量进行筛选。比如我们有200个候选自变量，通常情况下，不会直接把200个变量直接放到模型中去进行拟合训练，而是会用一些方法，从这200个自变量中挑选一些出来，放进模型，形成入模变量列表。那么我们怎么去挑选入模变量呢？

挑选入模变量过程是个比较复杂的过程，需要考虑的因素很多，比如：变量的预测能力，变量之间的相关性，变量的简单性（容易生成和使用），变量的强壮性（不容易被绕过），变量在业务上的可解释性（被挑战时可以解释的通）等等。但是，其中最主要和最直接的衡量标准是变量的预测能力。

“变量的预测能力”这个说法很笼统，很主观，非量化，在筛选变量的时候我们总不能说：“我觉得这个变量预测能力很强，所以他要进入模型”吧？我们需要一些具体的量化指标来衡量每个自变量的预测能力，并根据这些量化指标的大小，来确定哪些变量进入模型。IV就是这样一种指标，他可以用来衡量自变量的预测能力。类似的指标还有信息增益、基尼系数等等。

从直观逻辑上大体可以这样理解“用IV去衡量变量预测能力”这件事情：我们假设在一个分类问题中，目标变量的类别有两类：Y1，Y2。对于一个待预测的个体A，要判断A属于Y1还是Y2，我们是需要一定的信息的，假设这个信息总量是I，而这些所需要的信息，就蕴含在所有的自变量C1，C2，C3，……，Cn中，那么，对于其中的一个变量Ci来说，其蕴含的信息越多，那么它对于判断A属于Y1还是Y2的贡献就越大，Ci的信息价值就越大，Ci的IV就越大，它就越应该进入到入模变量列表中。

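对于第i组，其IV值定义为：

$$IV_i = (py_i - pn_i) \times WOE_i = (py_i - pn_i) \ln\left(\frac{py_i}{pn_i}\right)$$

整个变量的IV是各分组IV之和：

$$IV = \sum_i IV_i$$

当某个分组的 $py_i$ 或 $pn_i$ 为0时，$WOE_i$ 为负无穷或正无穷，$IV_i$ 也随之变为无穷大。IVi无论等于负无穷还是正无穷，都是没有意义的。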

使用IV其实有一个缺点，就是不能自动处理变量的分组中出现响应比例为0或100%的情况。那么，遇到响应比例为0或者100%的情况，我们应该怎么做呢？建议如下：

如果可能，直接把这个分组做成一个规则，作为模型的前置条件或补充条件
重新对变量进行离散化或分组，使每个分组的响应比例都不为0且不为100%，尤其是当一个分组个体数很小时（比如小于100个），强烈建议这样做，因为本身把一个分组个体数弄得很小就不是太合理
如果上面两种方法都无法使用，建议人工把该分组的响应数和非响应的数量进行一定的调整。如果响应数原本为0，可以人工调整响应数为1，如果非响应数原本为0，可以人工调整非响应数为1

# Rule-of-thumb mapping from an IV range to the feature's predictive power.
iv_bands = ["<0.03", "0.03-0.09", "0.1-0.29", "0.3-0.49", ">=0.5"]
power_labels = ["无预测能力", "低", "中", "高", "极高且可疑"]
pd.DataFrame(list(zip(iv_bands, power_labels)), columns = ["IV","预测能力"])

	IV	预测能力
0	<0.03	无预测能力
1	0.03-0.09	低
2	0.1-0.29	中
3	0.3-0.49	高
4	>=0.5	极高且可疑

def CalcIV(Xvar,Yvar):
    """Return the Information Value (IV) of one feature against a 0/1 target.

    Every distinct value of ``Xvar`` is treated as its own bin; no binning is
    done here, so continuous features should be discretised by the caller.
    For each bin the function accumulates
    ``(p0 - p1) * log(p0 / p1)``, where ``p0`` / ``p1`` are the bin's share of
    all ``Yvar == 0`` / ``Yvar == 1`` samples.

    Args:
        Xvar: 1-D feature values (pandas Series or array-like).
        Yvar: 1-D target values of the same length, matched to ``Xvar`` by
            position; only entries equal to 0 or 1 are counted.

    Returns:
        The summed IV, or ``1`` when the sum is >= 1.0. A bin that contains
        only one class makes the sum infinite, which is therefore also
        reported as ``1``.
    """
    x = np.asarray(Xvar).ravel()
    y = np.asarray(Yvar).ravel()
    is_0 = y == 0
    is_1 = y == 1
    N_0 = np.sum(is_0)
    N_1 = np.sum(is_1)

    # Compute the distinct values once and map every sample to its bin index,
    # instead of re-running np.unique on each loop iteration.
    uniq, codes = np.unique(x, return_inverse=True)
    codes = codes.ravel()
    N_0_group = np.bincount(codes[is_0], minlength=uniq.size).astype(float)
    N_1_group = np.bincount(codes[is_1], minlength=uniq.size).astype(float)

    # Empty cells produce division by zero / log(0); the resulting inf is
    # handled by the clip below, so silence the expected warnings here.
    with np.errstate(divide='ignore', invalid='ignore'):
        p_0 = N_0_group / N_0
        p_1 = N_1_group / N_1
        iv = np.sum((p_0 - p_1) * np.log(p_0 / p_1))
    if iv >= 1.0:  # cap extreme values (including inf) at 1
        iv = 1
    return iv

def caliv_batch(df,Yvar):
    """Compute the IV of every column of ``df`` against ``Yvar``.

    Returns a tuple ``(iv_df, ivlist)``: ``iv_df`` is a DataFrame with the
    columns ``'Var'`` (column name) and ``'Iv'`` (its IV), and ``ivlist`` is
    the plain list of IV values in column order.
    """
    names = list(df.columns)
    ivlist = [CalcIV(df[name], Yvar) for name in names]
    iv_df = pd.DataFrame({'Var': names, 'Iv': ivlist}, columns=['Var', 'Iv'])
    return iv_df, ivlist

# IV of every feature in X against the label y.
im_iv, ivl = caliv_batch(X,y)

# Summary statistics of the IV values.
im_iv['Iv'].describe()

count    83.000000
mean      0.930759
std       0.249840
min       0.005614
25%       1.000000
50%       1.000000
75%       1.000000
max       1.000000
Name: Iv, dtype: float64

# Keep only features whose IV lies inside [threshold, threshold2].
threshold = 0.02
threshold2 = 0.6

# Names of the features whose IV falls outside that band.
iv_values = im_iv['Iv']
out_of_band = (iv_values < threshold) | (iv_values > threshold2)
data_index = im_iv.loc[out_of_band, 'Var'].tolist()

# Drop them from the full frame (the label column 'status' stays in).
datafinal_IV = df.drop(data_index, axis=1)

# Columns that survive the IV filter.
datafinal_IV.columns

Index(['rank_trad_1_month', 'top_trans_count_last_1_month', 'status',
       'consfin_product_count'],
      dtype='object')

# Select the label by name: 'status' is not the last column of datafinal_IV
# (see the column listing above), so positional slicing with iloc[:, -1]
# would use a feature as the target and leak the label into the features.
x_train_IV = datafinal_IV.drop('status', axis=1)
y_train_IV = datafinal_IV['status']

参考：