https://tianchi.aliyun.com/competition/entrance/531830/information
赛题以预测用户贷款是否违约为任务,数据集报名后可见并可下载,该数据来自某信贷平台的贷款记录,总数据量超过120w,包含47列变量信息,其中15列为匿名变量。为了保证比赛的公平性,将会从中抽取80万条作为训练集,20万条作为测试集A,20万条作为测试集B,同时会对employmentTitle、purpose、postCode和title等信息进行脱敏。
# 导入标准库 import io, os, sys, types, time, datetime, math, random, requests, subprocess,io, tempfile, math # 导入第三方库 # 数据处理 import numpy as np import pandas as pd # 数据可视化 import matplotlib.pyplot as plt from tqdm import tqdm import missingno import seaborn as sns # from pandas.tools.plotting import scatter_matrix # No module named 'pandas.tools' from mpl_toolkits.mplot3d import Axes3D # plt.style.use('seaborn') # 改变图像风格 plt.rcParams['font.family'] = ['Arial Unicode MS', 'Microsoft Yahei', 'SimHei', 'sans-serif'] # 解决中文乱码 plt.rcParams['axes.unicode_minus'] = False # simhei黑体字 负号乱码 解决 # 特征选择和编码 from sklearn.feature_selection import RFE, RFECV from sklearn.svm import SVR from sklearn.decomposition import PCA from sklearn import preprocessing from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize # Imputer # from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute # 机器学习 import sklearn.ensemble as ske from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model from sklearn.svm import LinearSVC from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier from sklearn.tree import DecisionTreeClassifier import xgboost as xgb import lightgbm as lgb from catboost import CatBoostRegressor # 网格搜索、随机搜索 import scipy.stats as st from scipy.stats import randint as sp_randint from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import train_test_split # 模型度量(分类) from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss # 警告处理 import warnings warnings.filterwarnings('ignore') 
# 在Jupyter上画图 %matplotlib inline # 数据预处理 import numpy as np import scipy as sc import sklearn as sk import matplotlib.pyplot as plt # 绘图工具包 import seaborn as sns import pyecharts.options as opts from pyecharts.charts import Line, Grid
# Locations of the competition CSV files.
train_path = 'train.csv'
test_path = 'testA.csv'
dataset_path = './'
data_train_path = os.path.join(dataset_path, train_path)
data_test_path = os.path.join(dataset_path, test_path)
# Load the raw training set and test set A into DataFrames.
train = pd.read_csv(data_train_path)
test_a = pd.read_csv(data_test_path)
此部分为零基础入门金融风控的 Task3 特征工程部分,带你来了解各类特征工程以及分析方法,欢迎你们后续多多交流。
赛题:零基础入门数据挖掘 - 零基础入门金融风控之贷款违约
项目地址:https://github.com/datawhalechina/team-learning-data-mining/tree/master/FinancialRiskControl
比赛地址:https://tianchi.aliyun.com/competition/entrance/531830/introduction
数据EDA部分咱们已经对数据的大概和某些特征分布有了了解,数据预处理部分通常咱们要处理一些EDA阶段分析出来的问题,这里介绍了数据缺失值的填充,时间格式特征的转化处理,某些对象类别特征的处理。
首先咱们查找出数据中的对象特征和数值特征
# Partition columns into numeric features and object (string) features.
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
# Everything that is not numeric is treated as categorical, keeping the
# original column order of the frame.
category_fea = [col for col in train.columns if col not in numerical_fea]
# The target column must not be used as an input feature.
label = 'isDefault'
numerical_fea.remove(label)
把全部缺失值替换为指定的值0
# Replace every missing value with the constant 0.
# BUG FIX: the original referenced an undefined name `data_train` (and had
# a stray "dom" token from page scraping); the loaded frame is `train`.
train = train.fillna(0)
纵向用缺失值上面的值替换缺失值
# Fill each missing value with the value above it (forward fill down columns).
# BUG FIX: the original used Unicode curly quotes (‘ffill’), a Python syntax
# error, and the undefined name `data_train`.
train = train.fillna(axis=0, method='ffill')
纵向用缺失值下面的值替换缺失值,且设置最多只填充两个连续的缺失值
# Fill each missing value with the value below it (backward fill down
# columns), filling at most two consecutive NaNs.
# BUG FIX: the original used Unicode curly quotes (‘bfill’), a Python syntax
# error, and the undefined name `data_train`.
train = train.fillna(axis=0, method='bfill', limit=2)
#查看缺失值状况 train.isnull().sum()
id 0 loanAmnt 0 term 0 interestRate 0 installment 0 grade 0 subGrade 0 employmentTitle 1 employmentLength 46799 homeOwnership 0 annualIncome 0 verificationStatus 0 issueDate 0 isDefault 0 purpose 0 postCode 1 regionCode 0 dti 239 delinquency_2years 0 ficoRangeLow 0 ficoRangeHigh 0 openAcc 0 pubRec 0 pubRecBankruptcies 405 revolBal 0 revolUtil 531 totalAcc 0 initialListStatus 0 applicationType 0 earliesCreditLine 0 title 1 policyCode 0 n0 40270 n1 40270 n2 40270 n3 40270 n4 33239 n5 40270 n6 40270 n7 40270 n8 40271 n9 40270 n10 33239 n11 69752 n12 40270 n13 40270 n14 40270 dtype: int64
# Fill numeric features with the training-set median (train statistics are
# also applied to the test set to avoid leakage).
train[numerical_fea] = train[numerical_fea].fillna(train[numerical_fea].median())
test_a[numerical_fea] = test_a[numerical_fea].fillna(train[numerical_fea].median())
# Fill categorical features with the training-set mode.
# BUG FIX: DataFrame.mode() returns a DataFrame (possibly several rows);
# fillna() aligns it row-by-row, so only the first few rows would be filled.
# Use the first mode row (.iloc[0]) as a per-column Series instead.
train[category_fea] = train[category_fea].fillna(train[category_fea].mode().iloc[0])
test_a[category_fea] = test_a[category_fea].fillna(train[category_fea].mode().iloc[0])
#查看类别特征
category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
# Parse issueDate into a datetime and derive a numeric feature: the number
# of days elapsed since the earliest issue date in the data (2007-06-01).
for data in [train, test_a]:
    data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
# Distribution of employmentLength (NaN included), sorted by label.
train['employmentLength'].value_counts(dropna=False).sort_index()
1 year 52489 10+ years 262753 2 years 72358 3 years 64152 4 years 47985 5 years 50102 6 years 37254 7 years 35407 8 years 36192 9 years 30272 < 1 year 64237 NaN 46799 Name: employmentLength, dtype: int64
# 对earliesCreditLine进行预处理 train['earliesCreditLine'].sample(5)
575381 Sep-1998 596915 Dec-2010 742902 Apr-1997 164860 Jun-1998 512564 Nov-2006 Name: earliesCreditLine, dtype: object
# earliesCreditLine values end in a 4-digit year ('Sep-1998');
# keep only that year as an integer.
for data in [train, test_a]:
    data['earliesCreditLine'] = data['earliesCreditLine'].map(lambda s: int(s[-4:]))
# Candidate categorical features and their cardinalities.
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership',
                 'verificationStatus', 'purpose', 'postCode', 'regionCode',
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
# BUG FIX: the original printed data[f].nunique(), where `data` was whatever
# frame the previous loop left bound (test_a); report on the training set.
for f in cate_features:
    print(f, '类型数:', train[f].nunique())
grade 类型数: 7 subGrade 类型数: 35 employmentTitle 类型数: 79282 homeOwnership 类型数: 6 verificationStatus 类型数: 3 purpose 类型数: 14 postCode 类型数: 889 regionCode 类型数: 51 applicationType 类型数: 2 initialListStatus 类型数: 2 title 类型数: 12058 policyCode 类型数: 1
# Ordinal-encode grade: A..G -> 1..7 (loan grades are naturally ordered).
grade_order = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
for data in [train, test_a]:
    data['grade'] = data['grade'].map(grade_order)
# One-hot encode features with more than 2 categories that are pure
# categoricals and not high-dimensional/sparse.
# BUG FIX: `data = pd.get_dummies(data, ...)` inside `for data in [...]` only
# rebinds the loop variable — train/test_a were never modified. Assign the
# encoded frames back explicitly.
dummy_cols = ['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode']
train = pd.get_dummies(train, columns=dummy_cols, drop_first=True)
test_a = pd.get_dummies(test_a, columns=dummy_cols, drop_first=True)
def find_outliers_by_3segama(data, fea):
    """Label values of column `fea` by the 3-sigma rule.

    Adds a column `fea + '_outliers'` holding '异常值' (outlier) for values
    outside mean ± 3*std and '正常值' (normal) otherwise, then returns the
    (mutated) frame. The population standard deviation (ddof=0, matching
    np.std) is used.
    """
    center = data[fea].mean()
    spread = data[fea].std(ddof=0)
    lower = center - 3 * spread
    upper = center + 3 * spread
    data[fea + '_outliers'] = data[fea].apply(
        lambda v: '异常值' if (v > upper or v < lower) else '正常值')
    return data
# Work on a copy, flag 3-sigma outliers for every numeric feature, and show
# how defaults (isDefault) distribute across normal vs. outlier rows.
train = train.copy()
for fea in numerical_fea:
    train = find_outliers_by_3segama(train, fea)
    print(train[fea + '_outliers'].value_counts())
    print(train.groupby(fea + '_outliers')['isDefault'].sum())
    print('*' * 10)
正常值 800000 Name: id_outliers, dtype: int64 id_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: loanAmnt_outliers, dtype: int64 loanAmnt_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: term_outliers, dtype: int64 term_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 794259 异常值 5741 Name: interestRate_outliers, dtype: int64 interestRate_outliers 异常值 2916 正常值 156694 Name: isDefault, dtype: int64 ********** 正常值 792046 异常值 7954 Name: installment_outliers, dtype: int64 installment_outliers 异常值 2152 正常值 157458 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: employmentTitle_outliers, dtype: int64 employmentTitle_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 799701 异常值 299 Name: homeOwnership_outliers, dtype: int64 homeOwnership_outliers 异常值 62 正常值 159548 Name: isDefault, dtype: int64 ********** 正常值 793973 异常值 6027 Name: annualIncome_outliers, dtype: int64 annualIncome_outliers 异常值 756 正常值 158854 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: verificationStatus_outliers, dtype: int64 verificationStatus_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 783003 异常值 16997 Name: purpose_outliers, dtype: int64 purpose_outliers 异常值 3635 正常值 155975 Name: isDefault, dtype: int64 ********** 正常值 798931 异常值 1069 Name: postCode_outliers, dtype: int64 postCode_outliers 异常值 221 正常值 159389 Name: isDefault, dtype: int64 ********** 正常值 799994 异常值 6 Name: regionCode_outliers, dtype: int64 regionCode_outliers 异常值 1 正常值 159609 Name: isDefault, dtype: int64 ********** 正常值 798440 异常值 1560 Name: dti_outliers, dtype: int64 dti_outliers 异常值 466 正常值 159144 Name: isDefault, dtype: int64 ********** 正常值 778245 异常值 21755 Name: delinquency_2years_outliers, dtype: int64 delinquency_2years_outliers 异常值 5089 正常值 154521 Name: isDefault, dtype: int64 ********** 正常值 788261 异常值 11739 Name: ficoRangeLow_outliers, dtype: int64 ficoRangeLow_outliers 异常值 778 正常值 158832 Name: isDefault, 
dtype: int64 ********** 正常值 788261 异常值 11739 Name: ficoRangeHigh_outliers, dtype: int64 ficoRangeHigh_outliers 异常值 778 正常值 158832 Name: isDefault, dtype: int64 ********** 正常值 790889 异常值 9111 Name: openAcc_outliers, dtype: int64 openAcc_outliers 异常值 2195 正常值 157415 Name: isDefault, dtype: int64 ********** 正常值 792471 异常值 7529 Name: pubRec_outliers, dtype: int64 pubRec_outliers 异常值 1701 正常值 157909 Name: isDefault, dtype: int64 ********** 正常值 794120 异常值 5880 Name: pubRecBankruptcies_outliers, dtype: int64 pubRecBankruptcies_outliers 异常值 1423 正常值 158187 Name: isDefault, dtype: int64 ********** 正常值 790001 异常值 9999 Name: revolBal_outliers, dtype: int64 revolBal_outliers 异常值 1359 正常值 158251 Name: isDefault, dtype: int64 ********** 正常值 799948 异常值 52 Name: revolUtil_outliers, dtype: int64 revolUtil_outliers 异常值 23 正常值 159587 Name: isDefault, dtype: int64 ********** 正常值 791663 异常值 8337 Name: totalAcc_outliers, dtype: int64 totalAcc_outliers 异常值 1668 正常值 157942 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: initialListStatus_outliers, dtype: int64 initialListStatus_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 784586 异常值 15414 Name: applicationType_outliers, dtype: int64 applicationType_outliers 异常值 3875 正常值 155735 Name: isDefault, dtype: int64 ********** 正常值 775134 异常值 24866 Name: title_outliers, dtype: int64 title_outliers 异常值 3900 正常值 155710 Name: isDefault, dtype: int64 ********** 正常值 800000 Name: policyCode_outliers, dtype: int64 policyCode_outliers 正常值 159610 Name: isDefault, dtype: int64 ********** 正常值 782773 异常值 17227 Name: n0_outliers, dtype: int64 n0_outliers 异常值 3485 正常值 156125 Name: isDefault, dtype: int64 ********** 正常值 790500 异常值 9500 Name: n1_outliers, dtype: int64 n1_outliers 异常值 2491 正常值 157119 Name: isDefault, dtype: int64 ********** 正常值 789067 异常值 10933 Name: n2_outliers, dtype: int64 n2_outliers 异常值 3205 正常值 156405 Name: isDefault, dtype: int64 ********** 正常值 789067 异常值 10933 Name: n3_outliers, dtype: int64 n3_outliers 异常值 3205 
正常值 156405 Name: isDefault, dtype: int64 ********** 正常值 788660 异常值 11340 Name: n4_outliers, dtype: int64 n4_outliers 异常值 2476 正常值 157134 Name: isDefault, dtype: int64 ********** 正常值 790355 异常值 9645 Name: n5_outliers, dtype: int64 n5_outliers 异常值 1858 正常值 157752 Name: isDefault, dtype: int64 ********** 正常值 786006 异常值 13994 Name: n6_outliers, dtype: int64 n6_outliers 异常值 3182 正常值 156428 Name: isDefault, dtype: int64 ********** 正常值 788430 异常值 11570 Name: n7_outliers, dtype: int64 n7_outliers 异常值 2746 正常值 156864 Name: isDefault, dtype: int64 ********** 正常值 789625 异常值 10375 Name: n8_outliers, dtype: int64 n8_outliers 异常值 2131 正常值 157479 Name: isDefault, dtype: int64 ********** 正常值 786384 异常值 13616 Name: n9_outliers, dtype: int64 n9_outliers 异常值 3953 正常值 155657 Name: isDefault, dtype: int64 ********** 正常值 788979 异常值 11021 Name: n10_outliers, dtype: int64 n10_outliers 异常值 2639 正常值 156971 Name: isDefault, dtype: int64 ********** 正常值 799434 异常值 566 Name: n11_outliers, dtype: int64 n11_outliers 异常值 112 正常值 159498 Name: isDefault, dtype: int64 ********** 正常值 797585 异常值 2415 Name: n12_outliers, dtype: int64 n12_outliers 异常值 545 正常值 159065 Name: isDefault, dtype: int64 ********** 正常值 788907 异常值 11093 Name: n13_outliers, dtype: int64 n13_outliers 异常值 2482 正常值 157128 Name: isDefault, dtype: int64 ********** 正常值 788884 异常值 11116 Name: n14_outliers, dtype: int64 n14_outliers 异常值 3364 正常值 156246 Name: isDefault, dtype: int64 **********
# Drop rows flagged as outliers on any numeric feature.
# BUG FIX: the original mixed the undefined name `data_train` with `train`
# (`data_train = train[data_train[...]]`), which raises NameError; filter
# `train` consistently instead.
for fea in numerical_fea:
    train = train[train[fea + '_outliers'] == '正常值']
    train = train.reset_index(drop=True)
总结一句话:四分位数会将数据分为三个点和四个区间,IQR = Q3 -Q1,下触须=Q1 − 1.5x IQR,上触须=Q3 + 1.5x IQR;
特征分箱的目的:
数据分桶的对象:
分箱的缘由:
分箱的优势:
特别要注意一下分箱的基本原则:
当数值横跨多个数量级时,最好按照 10 的幂(或任何常数的幂)来进行分组:0~9、10~99、100~999、1000~9999,等等。固定宽度分箱非常容易计算,但如果计数值中有比较大的缺口,就会产生很多没有任何数据的空箱子。
# 经过除法映射到间隔均匀的分箱中,每一个分箱的取值范围都是loanAmnt/1000 data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000) ## 经过对数函数映射到指数宽度分箱 data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))
# Quantile binning: 10 equal-frequency buckets, labelled 0..9.
# NOTE(review): `data` is the leftover loop variable — see note above the
# fixed-width binning cell; confirm the intended frame.
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)
# Target-mean encode grade/subGrade: each category is mapped to the mean
# default rate observed for it in the training set; the same train-derived
# mapping is applied to the test set (no test-label leakage).
for col in ['grade', 'subGrade']:
    mapping = train.groupby(col)['isDefault'].mean().to_dict()
    train[col + '_target_mean'] = train[col].map(mapping)
    test_a[col + '_target_mean'] = test_a[col].map(mapping)
# Derived ratio features: grade relative to the per-group mean/std of grade,
# grouping by each anonymous feature n0..n14.
for df in [train, test_a]:
    for item in ['n' + str(i) for i in range(15)]:
        grouped_grade = df.groupby([item])['grade']
        df['grade_to_mean_' + item] = df['grade'] / grouped_grade.transform('mean')
        df['grade_to_std_' + item] = df['grade'] / grouped_grade.transform('std')
# Label-encode the high-cardinality categoricals (subGrade, postCode, title,
# employmentTitle). The encoder is fitted on train + test jointly so both
# splits share a single label space.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    encoder = LabelEncoder()
    combined = list(train[col].astype(str).values) + list(test_a[col].astype(str).values)
    encoder.fit(combined)
    train[col] = encoder.transform(list(train[col].astype(str).values))
    test_a[col] = encoder.transform(list(test_a[col].astype(str).values))
print('Label Encoding 完成')
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00, 1.77s/it] Label Encoding 完成
对特征作归一化,去除相关性高的特征
归一化目的是让训练过程更好更快的收敛,避免特征大吃小的问题
去除相关性是增长模型的可解释性,加快预测过程。
# Min-max normalisation of one feature column to the [0, 1] range.
# NOTE(review): `data` and `fea` are free names here — this is a template
# line meant to be run per feature/frame; confirm the intended bindings.
data[fea] = ((data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea])))
特征选择的方法:
# Peek at columns 10..19 of the training frame.
train.columns[10:20]
Index(['annualIncome', 'verificationStatus', 'issueDate', 'isDefault', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow'], dtype='object')
# Target vector used by the feature-selection examples below.
target_train = train['isDefault']
from sklearn.feature_selection import VarianceThreshold #其中参数threshold为方差的阈值 VarianceThreshold(threshold=3).fit_transform(train,target_train)
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
# Select the K best features ranked by per-feature Pearson correlation with
# the target. The score function receives the feature matrix X and target y
# and must return (scores, p-values), one entry per feature column.
# BUG FIX: the original never passed a score function, so SelectKBest
# silently used its default f_classif despite the comment describing
# Pearson correlation.
def _pearson_score(X, y):
    """Return (scores, p-values) arrays for each column of X vs. y."""
    pairs = [pearsonr(X[:, i], y) for i in range(X.shape[1])]
    scores, pvalues = zip(*pairs)
    return np.array(scores), np.array(pvalues)

SelectKBest(_pearson_score, k=5).fit_transform(train, target_train)
from sklearn.feature_selection import SelectKBest
from minepy import MINE
# MINE's API is stateful, so wrap it as a score function; the second tuple
# element is a fixed placeholder p-value of 0.5 (MIC has no p-value).
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)
# BUG FIX: the original used Python-2 idioms — the bare name `array`
# (undefined here) and `map` (a lazy iterator in Python 3) — which fail at
# runtime. Build the (scores, p-values) pair with a list comprehension.
SelectKBest(lambda X, Y: tuple(np.array([mic(x, Y) for x in X.T]).T), k=2).fit_transform(train, target_train)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination: repeatedly fit the base model (logistic
# regression) and drop the weakest features until n_features_to_select remain.
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=2)
rfe_selector.fit_transform(train, target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Feature selection via an L1-penalised logistic regression (sparse
# coefficients zero out weak features).
# FIX: modern scikit-learn defaults to the lbfgs solver, which rejects
# penalty="l1" — request a compatible solver explicitly.
SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(train, target_train)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
# Feature selection driven by GBDT feature importances.
gbdt_selector = SelectFromModel(GradientBoostingClassifier())
gbdt_selector.fit_transform(train, target_train)
本数据集中咱们删除非入模特征后,并对缺失值填充,而后用计算协方差的方式看一下特征间相关性,而后进行模型训练
# Remove columns that must not enter the model: the raw issueDate
# (replaced by issueDateDT) and the row id.
for data in [train, test_a]:
    data.drop(columns=['issueDate', 'id'], inplace=True)
# Forward-fill any remaining missing values down each column
# (each NaN takes the value directly above it).
train = train.ffill(axis=0)
# Correlation of every column with the target, collected into a tidy frame
# with one row per feature.
data_corr = train.corrwith(train.isDefault)
result = pd.DataFrame({'features': data_corr.index, 'corr': data_corr.values})
# Visualise pairwise correlations between the numeric features
# (skip the first numeric column, which is the id).
data_numeric = train[numerical_fea[1:]]
correlation = data_numeric.corr()
f, ax = plt.subplots(figsize=(7, 7))
# FIX: the title said "Correlation of Numeric Features with Price" —
# copy-pasted from a house-price tutorial; this dataset has no price.
plt.title('Correlation of Numeric Features', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
<matplotlib.axes._subplots.AxesSubplot at 0x19d77b665f8>
# Final modelling feature list: exclude id, the raw date, the target, and
# the helper *_outliers flag columns added during outlier tagging.
excluded = {'id', 'issueDate', 'isDefault'}
features = [col for col in train.columns
            if col not in excluded and '_outliers' not in col]
x_train = train[features]
x_test = test_a[features]
y_train = train['isDefault']
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train one of lgb / xgb / cat with 5-fold cross-validation.

    Parameters
    ----------
    clf : the lightgbm or xgboost module, or the CatBoostRegressor class.
    train_x, train_y : training features (DataFrame) and labels (Series).
    test_x : test features (DataFrame).
    clf_name : one of "lgb", "xgb", "cat" — selects the branch.

    Returns
    -------
    (oof_preds, test_preds) : out-of-fold predictions over train_x and the
    5-fold average of the test-set predictions.
    """
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # Renamed from `train`/`test` to avoid shadowing the global DataFrames.
    oof_preds = np.zeros(train_x.shape[0])
    test_preds = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs': 24,
                'silent': True,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 50000,
                              valid_sets=[train_matrix, valid_matrix],
                              verbose_eval=200, early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            # BUG FIX: Booster.predict requires a DMatrix; the original passed
            # the raw test DataFrame, which raises at predict time.
            test_matrix = clf.DMatrix(test_x)
            params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'gamma': 1,
                'min_child_weight': 1.5,
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.04,
                'tree_method': 'exact',
                'seed': 2020,
                'nthread': 36,
                "silent": True,
            }
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(params, train_matrix, num_boost_round=50000,
                              evals=watchlist, verbose_eval=200,
                              early_stopping_rounds=200)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

        if clf_name == "cat":
            params = {
                'learning_rate': 0.05,
                'depth': 5,
                'l2_leaf_reg': 10,
                'bootstrap_type': 'Bernoulli',
                'od_type': 'Iter',
                'od_wait': 50,
                'random_seed': 11,
                'allow_writing_files': False,
            }
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof_preds[valid_index] = val_pred
        # BUG FIX: the original wrote `test = test_pred / kf.n_splits`,
        # overwriting earlier folds so only the last fold (scaled by 1/5)
        # survived. Accumulate each fold's share instead.
        test_preds += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_preds, test_preds
def lgb_model(x_train, y_train, x_test):
    """5-fold CV LightGBM; returns (oof predictions, averaged test predictions)."""
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    """5-fold CV XGBoost; returns (oof predictions, averaged test predictions)."""
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    """5-fold CV CatBoost; returns (oof predictions, averaged test predictions)."""
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    # BUG FIX: the original had no return statement, so callers always got None.
    return cat_train, cat_test
# Train LightGBM over 5 folds; collect OOF and averaged test predictions.
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)
# Load the ground-truth labels for test set A (published after the contest).
testA_result = pd.read_csv('../testA_result.csv')
# AUC of the averaged LightGBM test predictions against the true labels
# (higher is better; this is the competition metric).
roc_auc_score(testA_result['isDefault'].values, lgb_test)
特征工程是机器学习以及深度学习中最为重要的一部分,在实际应用中每每也是所花费时间最多的一步,可能占到项目时间的70%以上。本文主要是经过一些经常使用的方法来作介绍,例如缺失值异常值的处理方法详细对任何数据集来讲都是适用的。特征工程在比赛和具体的应用针对性也有所不一样,在实际的金融风控评分卡制做过程当中,因为强调特征的可解释性,特征分箱尤为重要,选择对应模型也须要可解释性
提交结果为每一个测试样本是1的几率,也就是y为1的几率。评价方法为AUC评估模型效果(越大越好)。
分类经常使用使用的评估指标是:
本次是学习赛使用的评估指标是AUC
提交前请确保预测结果的格式与sample_submit.csv中的格式一致,以及提交文件后缀名为csv。