首先大体说一下ROC曲线上的几个特殊点:
点(0,1):即FPR=0, TPR=1,意味着FN=0且FP=0,将全部的样本都正确分类;
点(1,0):即FPR=1,TPR=0,最差分类器,避开了全部正确答案;
点(0,0):即FPR=TPR=0,FP=TP=0,分类器把每一个实例都预测为负类;
点(1,1):即FPR=TPR=1,分类器把每一个实例都预测为正类。
总之:ROC曲线越接近左上角,该分类器的性能越好,其泛化性能就越好。而且一般来说,如果ROC曲线是光滑的,那么基本可以判断没有太大的overfitting。
但是对于两个模型,我们如何判断哪一个模型的泛化性能更优呢?这里主要有以下两种方法:
如果模型A的ROC曲线完全包住了模型B的ROC曲线,那么我们就认为模型A要优于模型B;
如果两条曲线有交叉的话,我们就通过比较ROC曲线与X、Y轴所围成的面积来判断,面积越大,模型的性能就越优,这个面积我们称之为AUC(area under ROC curve)。
import pandas as pd
import numpy as np
import warnings
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Seaborn / matplotlib display settings.
#
# Everything is configured in ONE sns.set() call. sns.set() resets every
# option it is not given back to its default, so a trailing
# sns.set(font='SimHei') would silently replace the chosen style and context
# with the defaults again.
#   style   - one of: darkgrid (default), whitegrid, dark, white, ticks
#   context - one of: paper, notebook (default), talk, poster
#   font    - SimHei, so that Chinese labels render in seaborn figures
sns.set(style='whitegrid', context='talk', font='SimHei')

# Applied AFTER sns.set(), which rewrites the font-related rcParams.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Render the minus sign '-' correctly (it shows up as a box otherwise).
plt.rcParams['axes.unicode_minus'] = False
# Memory reduction
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes.

    * NumPy integer columns (signed or unsigned) become the smallest of
      int8/int16/int32/int64 whose range strictly contains the column's
      min and max; columns that fit none of them are left unchanged.
    * NumPy float columns become float16 or float32 when their min and max
      lie strictly inside that type's range, otherwise float64.
      NOTE: float16/float32 reduce precision, not just range.
    * Object / string columns become ``category``.
    * Every other dtype (bool, datetime, category, ...) is left untouched.

    The memory footprint before and after is printed in megabytes.

    Args:
        df: the pandas DataFrame to shrink; it is modified in place.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    bytes_per_mb = 1024 ** 2
    # memory_usage() reports bytes; convert so the "MB" label is truthful.
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        is_numpy_dtype = isinstance(col_type, np.dtype)

        if is_numpy_dtype and np.issubdtype(col_type, np.integer):
            if df[col].size == 0:
                # min()/max() of an empty column are NaN; nothing to decide.
                continue
            c_min, c_max = int(df[col].min()), int(df[col].max())
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif is_numpy_dtype and np.issubdtype(col_type, np.floating):
            c_min, c_max = df[col].min(), df[col].max()
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
        # Any other dtype is deliberately left as it is.

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the "Columns (...) have mixed types"
# DtypeWarning this file triggers otherwise.
data = pd.read_csv('data_for_model01.csv', low_memory=False)
data = reduce_mem_usage(data)
D:\Anaconda1\lib\site-packages\IPython\core\interactiveshell.py:3063: DtypeWarning: Columns (46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86) have mixed types.Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result) Memory usage of dataframe is 793236320.00 MB Memory usage after optimization is: 181245298.00 MB Decreased by 77.2%
# Preview the first rows of the loaded, memory-reduced frame.
data.head()
loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | annualIncome | ... | grade_to_std_n11 | grade_to_mean_n12 | grade_to_std_n12 | grade_to_mean_n13 | grade_to_std_n13 | grade_to_mean_n14 | grade_to_std_n14 | sample | n2.2 | n2.3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 35008.0 | 5 | 19.515625 | 918.0000 | 5 | 21 | 161280 | 2.0 | 2 | 110000.0 | ... | 4.011719 | 1.852539 | 4.011719 | 1.857422 | 4.003906 | 1.856445 | 3.992188 | train | NaN | NaN |
1 | 18000.0 | 5 | 18.484375 | 462.0000 | 4 | 16 | 89538 | 5.0 | 0 | 46000.0 | ... | 3.207031 | 1.482422 | 3.207031 | 1.486328 | 3.205078 | 1.485352 | 3.193359 | train | NaN | NaN |
2 | 12000.0 | 5 | 16.984375 | 298.2500 | 4 | 17 | 159367 | 8.0 | 0 | 74000.0 | ... | 3.207031 | 1.482422 | 3.207031 | 1.486328 | 3.205078 | 1.315430 | 3.146484 | train | NaN | NaN |
3 | 2050.0 | 3 | 7.691406 | 63.9375 | 1 | 3 | 59830 | 9.0 | 0 | 35000.0 | ... | 0.801758 | 0.370605 | 0.801758 | 0.371582 | 0.801270 | 0.344238 | 0.793457 | train | NaN | NaN |
4 | 11504.0 | 3 | 14.976562 | 398.5000 | 3 | 12 | 85242 | 1.0 | 1 | 30000.0 | ... | 2.406250 | 1.111328 | 2.406250 | 1.114258 | 2.402344 | 1.114258 | 2.394531 | train | NaN | NaN |
5 rows × 122 columns
from sklearn.model_selection import KFold

# Separate the combined frame back into its train and test parts using the
# 'sample' marker column; the label and the marker are not features.
is_train = data['sample'] == 'train'
is_test = data['sample'] == 'test'
non_feature_cols = ['isDefault', 'sample']

X_train = data.loc[is_train, :].drop(non_feature_cols, axis=1)
X_test = data.loc[is_test, :].drop(non_feature_cols, axis=1)
y_train = data.loc[is_train, 'isDefault']

# Shuffled 5-fold cross-validation with a fixed seed.
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Hold out 20% of the training data as a validation set.
# random_state is fixed so the split (and the AUC reported below) is
# reproducible from run to run.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=2020)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)

# Untuned baseline parameters.
# 'silent' is intentionally not listed: LightGBM ignores it inside `params`
# and emits a UserWarning; 'verbose': -1 already keeps the log quiet.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1e-3,
    'num_leaves': 31,
    'max_depth': -1,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'verbose': -1,
}

# Train on the split; stop once the validation AUC has not improved for
# 200 consecutive rounds.
model = lgb.train(params,
                  train_set=train_matrix,
                  valid_sets=valid_matrix,
                  num_boost_round=20000,
                  verbose_eval=1000,
                  early_stopping_rounds=200)
D:\Anaconda1\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key)) Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [330] valid_0's auc: 0.731887
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# Predict on the validation set with the best iteration found by early
# stopping, then compute the ROC curve and the area under it.
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('未调参前lightgbm单模型在验证集上的AUC:{}'.format(roc_auc))

# Draw the ROC curve.
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, 'b', label='Val AUC = %0.4f' % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(0, 1)
plt.ylim(0, 1)
# The title is set once; a second plt.title() call would simply overwrite it.
plt.title('ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.show()
未调参前lightgbm单模型在验证集上的AUC:0.7318871300593701
![Validation ROC](output_7_1.png)
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of the untuned LightGBM baseline.

# The parameters are identical for every fold, so they are built once.
# 'silent' is not listed: LightGBM ignores it inside `params` and warns.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'learning_rate': 0.1,
    'metric': 'auc',
    'min_child_weight': 1e-3,
    'num_leaves': 31,
    'max_depth': -1,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'feature_fraction': 1,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'seed': 2020,
    'nthread': 8,
    'verbose': -1,
}

cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    # kf.split() yields POSITIONAL indices, so both the frame and the label
    # series must be sliced with .iloc; y_train[train_index] would look the
    # numbers up as index labels instead.
    X_train_split, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_split, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params,
                      train_set=train_matrix,
                      num_boost_round=20000,
                      valid_sets=valid_matrix,
                      verbose_eval=1000,
                      early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)

    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_score_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))
************************************ 1 ************************************ D:\Anaconda1\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key)) Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [308] valid_0's auc: 0.729253 [0.729252686605049] ************************************ 2 ************************************ D:\Anaconda1\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key)) Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [337] valid_0's auc: 0.730723 [0.729252686605049, 0.7307233610934907] ************************************ 3 ************************************ D:\Anaconda1\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key)) Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [527] valid_0's auc: 0.732105 [0.729252686605049, 0.7307233610934907, 0.7321048628412448] ************************************ 4 ************************************ D:\Anaconda1\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. 
.format(key)) Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [381] valid_0's auc: 0.727511 [0.729252686605049, 0.7307233610934907, 0.7321048628412448, 0.7275111359476779] ************************************ 5 ************************************ D:\Anaconda1\lib\site-packages\lightgbm\basic.py:794: UserWarning: silent keyword has been found in `params` and will be ignored. Please use silent argument of the Dataset constructor to pass this parameter. .format(key)) Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [462] valid_0's auc: 0.732217 [0.729252686605049, 0.7307233610934907, 0.7321048628412448, 0.7275111359476779, 0.7322174754202134] lgb_scotrainre_list:[0.729252686605049, 0.7307233610934907, 0.7321048628412448, 0.7275111359476779, 0.7322174754202134] lgb_score_mean:0.7303619043815351 lgb_score_std:0.0017871174424543119
from sklearn.model_selection import GridSearchCV, StratifiedKFold


def get_best_cv_params(learning_rate=0.1, n_estimators=581, num_leaves=31, max_depth=-1,
                       bagging_fraction=1.0, feature_fraction=1.0, bagging_freq=0,
                       min_data_in_leaf=20, min_child_weight=0.001, min_split_gain=0,
                       reg_lambda=0, reg_alpha=0, param_grid=None):
    """Grid-search LightGBM hyper-parameters with stratified 5-fold CV.

    An ``LGBMClassifier`` is built from the keyword arguments; the entries of
    ``param_grid`` are then searched with ``GridSearchCV`` scored by ROC-AUC.
    The search is fitted on the module-level ``X_train`` / ``y_train``.
    The best parameters and best score are printed.

    Args:
        learning_rate ... reg_alpha: fixed LightGBM settings for the base
            estimator (overridden by whatever ``param_grid`` varies).
        param_grid: dict (or list of dicts) mapping parameter names to the
            candidate values to try. Required.

    Returns:
        The fitted ``GridSearchCV`` object, so callers can read
        ``best_params_``, ``best_score_`` or ``cv_results_``.

    Raises:
        TypeError: if ``param_grid`` is not provided.
    """
    if param_grid is None:
        # Fail early with a clear message instead of deep inside scikit-learn.
        raise TypeError('param_grid is required: pass a dict of parameter names to candidate values')

    # Stratified 5-fold cross-validation with a fixed seed.
    cv_fold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

    model_lgb = lgb.LGBMClassifier(learning_rate=learning_rate,
                                   n_estimators=n_estimators,
                                   num_leaves=num_leaves,
                                   max_depth=max_depth,
                                   bagging_fraction=bagging_fraction,
                                   feature_fraction=feature_fraction,
                                   bagging_freq=bagging_freq,
                                   min_data_in_leaf=min_data_in_leaf,
                                   min_child_weight=min_child_weight,
                                   min_split_gain=min_split_gain,
                                   reg_lambda=reg_lambda,
                                   reg_alpha=reg_alpha,
                                   n_jobs=8)
    grid_search = GridSearchCV(estimator=model_lgb,
                               cv=cv_fold,
                               param_grid=param_grid,
                               scoring='roc_auc')
    grid_search.fit(X_train, y_train)

    print('模型当前最优参数为:{}'.format(grid_search.best_params_))
    print('模型当前最优得分为:{}'.format(grid_search.best_score_))
    return grid_search
# Stratified 5-fold cross-validation
from sklearn.model_selection import KFold, StratifiedKFold

# Not consumed by lgb.cv() below, which builds its own stratified folds from
# nfold / stratified / shuffle / seed.
cv_fold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

final_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 29,
    'max_depth': 7,
    'min_data_in_leaf': 45,
    'min_child_weight': 0.001,
    'bagging_fraction': 0.9,
    'feature_fraction': 0.9,
    'bagging_freq': 40,
    'min_split_gain': 0,
    'reg_lambda': 0,
    'reg_alpha': 0,
    'nthread': 6,
}

# lgb.cv() needs a lightgbm Dataset; build it from the full training set.
# (`lgb_train` was referenced without ever being defined, which raised
# NameError.)
lgb_train = lgb.Dataset(X_train, label=y_train)

cv_result = lgb.cv(train_set=lgb_train,
                   early_stopping_rounds=20,
                   num_boost_round=5000,
                   nfold=5,
                   stratified=True,
                   shuffle=True,
                   params=final_params,
                   metrics='auc',
                   seed=0)

# One entry per boosting round that was kept, so the length is the number of
# iterations and the maximum is the best mean AUC across folds.
print('迭代次数{}'.format(len(cv_result['auc-mean'])))
print('交叉验证的AUC为{}'.format(max(cv_result['auc-mean'])))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-20-b7fa2feb7fa4> in <module> 19 } 20 ---> 21 cv_result = lgb.cv(train_set=lgb_train, 22 early_stopping_rounds=20, 23 num_boost_round=5000, NameError: name 'lgb_train' is not defined
pip install bayesian-optimization
Collecting bayesian-optimization Downloading bayesian-optimization-1.2.0.tar.gz (14 kB) Requirement already satisfied: numpy>=1.9.0 in d:\anaconda1\lib\site-packages (from bayesian-optimization) (1.18.1) Requirement already satisfied: scipy>=0.14.0 in d:\anaconda1\lib\site-packages (from bayesian-optimization) (1.4.1) Requirement already satisfied: scikit-learn>=0.18.0 in d:\anaconda1\lib\site-packages (from bayesian-optimization) (0.22.1) Requirement already satisfied: joblib>=0.11 in d:\anaconda1\lib\site-packages (from scikit-learn>=0.18.0->bayesian-optimization) (0.15.1) Building wheels for collected packages: bayesian-optimization Building wheel for bayesian-optimization (setup.py): started Building wheel for bayesian-optimization (setup.py): finished with status 'done' Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11689 sha256=92f6d72f1257c45277321db01836ffce0c63dac8f591db2b3db6e9c47e6d07c1 Stored in directory: c:\users\苗苗\appdata\local\pip\cache\wheels\fd\9b\71\f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b Successfully built bayesian-optimization Installing collected packages: bayesian-optimization Successfully installed bayesian-optimization-1.2.0 Note: you may need to restart the kernel to use updated packages.
from sklearn.model_selection import cross_val_score


def rf_cv_lgb(num_leaves, max_depth, bagging_fraction, feature_fraction, bagging_freq,
              min_data_in_leaf, min_child_weight, min_split_gain, reg_lambda, reg_alpha):
    """Objective function for Bayesian optimisation of LightGBM.

    Builds an ``LGBMClassifier`` from the proposed hyper-parameters and scores
    it with 5-fold cross-validated ROC-AUC on the module-level
    ``X_train_split`` / ``y_train_split``.

    The optimiser proposes floats for every parameter, so the integer-valued
    ones are truncated with ``int()`` and the two fractions are rounded to two
    decimals before use.

    Returns:
        The mean ROC-AUC over the 5 folds (higher is better).
    """
    model_lgb = lgb.LGBMClassifier(boosting_type='gbdt',
                                   # Was misspelled 'bjective', which handed
                                   # the estimator an unknown keyword instead
                                   # of setting the objective.
                                   objective='binary',
                                   metric='auc',
                                   learning_rate=0.1,
                                   n_estimators=5000,
                                   num_leaves=int(num_leaves),
                                   max_depth=int(max_depth),
                                   bagging_fraction=round(bagging_fraction, 2),
                                   feature_fraction=round(feature_fraction, 2),
                                   bagging_freq=int(bagging_freq),
                                   min_data_in_leaf=int(min_data_in_leaf),
                                   min_child_weight=min_child_weight,
                                   min_split_gain=min_split_gain,
                                   reg_lambda=reg_lambda,
                                   reg_alpha=reg_alpha,
                                   n_jobs=8)

    val = cross_val_score(model_lgb, X_train_split, y_train_split,
                          cv=5, scoring='roc_auc').mean()
    return val
from bayes_opt import BayesianOptimization

# Search bounds (low, high) for each hyper-parameter that rf_cv_lgb accepts.
search_space = {
    'num_leaves': (10, 200),
    'max_depth': (3, 20),
    'bagging_fraction': (0.5, 1.0),
    'feature_fraction': (0.5, 1.0),
    'bagging_freq': (0, 100),
    'min_data_in_leaf': (10, 100),
    'min_child_weight': (0, 10),
    'min_split_gain': (0.0, 1.0),
    'reg_alpha': (0.0, 10),
    'reg_lambda': (0.0, 10),
}

# Optimiser that maximises the cross-validated AUC returned by rf_cv_lgb.
bayes_lgb = BayesianOptimization(rf_cv_lgb, search_space)

# Run the optimisation.
bayes_lgb.maximize(n_iter=10)
| iter | target | baggin... | baggin... | featur... | max_depth | min_ch... | min_da... | min_sp... | num_le... | reg_alpha | reg_la... | ------------------------------------------------------------------------------------------------------------------------------------------------- | [0m 1 [0m | [0m 0.7171 [0m | [0m 0.5841 [0m | [0m 45.89 [0m | [0m 0.9789 [0m | [0m 15.1 [0m | [0m 4.607 [0m | [0m 48.88 [0m | [0m 0.4838 [0m | [0m 16.29 [0m | [0m 1.699 [0m | [0m 1.449 [0m | --------------------------------------------------------------------------- KeyError Traceback (most recent call last) D:\Anaconda1\lib\site-packages\bayes_opt\target_space.py in probe(self, params) 190 try: --> 191 target = self._cache[_hashable(x)] 192 except KeyError: KeyError: (0.9808468358238472, 95.31683577641724, 0.6846527338078261, 15.254621027977167, 6.084315056472179, 23.81958341293199, 0.6162173085058286, 42.95894924047164, 1.5295440304650598, 6.915985580798569) During handling of the above exception, another exception occurred: KeyboardInterrupt Traceback (most recent call last) <ipython-input-23-2c2786145eac> in <module> 18 19 """开始优化""" ---> 20 bayes_lgb.maximize(n_iter=10) D:\Anaconda1\lib\site-packages\bayes_opt\bayesian_optimization.py in maximize(self, init_points, n_iter, acq, kappa, kappa_decay, kappa_decay_delay, xi, **gp_params) 183 iteration += 1 184 --> 185 self.probe(x_probe, lazy=False) 186 187 if self._bounds_transformer: D:\Anaconda1\lib\site-packages\bayes_opt\bayesian_optimization.py in probe(self, params, lazy) 114 self._queue.add(params) 115 else: --> 116 self._space.probe(params) 117 self.dispatch(Events.OPTIMIZATION_STEP) 118 D:\Anaconda1\lib\site-packages\bayes_opt\target_space.py in probe(self, params) 192 except KeyError: 193 params = dict(zip(self._keys, x)) --> 194 target = self.target_func(**params) 195 self.register(x, target) 196 return target <ipython-input-22-f352aad073e3> in rf_cv_lgb(num_leaves, max_depth, bagging_fraction, feature_fraction, 
bagging_freq, min_data_in_leaf, min_child_weight, min_split_gain, reg_lambda, reg_alpha) 15 ) 16 ---> 17 val = cross_val_score(model_lgb, X_train_split, y_train_split, cv=5, scoring='roc_auc').mean() 18 19 return val D:\Anaconda1\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score) 388 fit_params=fit_params, 389 pre_dispatch=pre_dispatch, --> 390 error_score=error_score) 391 return cv_results['test_score'] 392 D:\Anaconda1\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score) 234 return_times=True, return_estimator=return_estimator, 235 error_score=error_score) --> 236 for train, test in cv.split(X, y, groups)) 237 238 zipped_scores = list(zip(*scores)) D:\Anaconda1\lib\site-packages\joblib\parallel.py in __call__(self, iterable) 1030 self._iterating = self._original_iterator is not None 1031 -> 1032 while self.dispatch_one_batch(iterator): 1033 pass 1034 D:\Anaconda1\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator) 845 return False 846 else: --> 847 self._dispatch(tasks) 848 return True 849 D:\Anaconda1\lib\site-packages\joblib\parallel.py in _dispatch(self, batch) 763 with self._lock: 764 job_idx = len(self._jobs) --> 765 job = self._backend.apply_async(batch, callback=cb) 766 # A job can complete so quickly than its callback is 767 # called before we get here, causing self._jobs to D:\Anaconda1\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback) 204 def apply_async(self, func, callback=None): 205 """Schedule a func to be run""" --> 206 result = ImmediateResult(func) 207 if callback: 208 callback(result) D:\Anaconda1\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch) 568 # Don't delay the application, to avoid keeping 
the input 569 # arguments in memory --> 570 self.results = batch() 571 572 def get(self): D:\Anaconda1\lib\site-packages\joblib\parallel.py in __call__(self) 251 with parallel_backend(self._backend, n_jobs=self._n_jobs): 252 return [func(*args, **kwargs) --> 253 for func, args, kwargs in self.items] 254 255 def __reduce__(self): D:\Anaconda1\lib\site-packages\joblib\parallel.py in <listcomp>(.0) 251 with parallel_backend(self._backend, n_jobs=self._n_jobs): 252 return [func(*args, **kwargs) --> 253 for func, args, kwargs in self.items] 254 255 def __reduce__(self): D:\Anaconda1\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score) 513 estimator.fit(X_train, **fit_params) 514 else: --> 515 estimator.fit(X_train, y_train, **fit_params) 516 517 except Exception as e: D:\Anaconda1\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks) 798 verbose=verbose, feature_name=feature_name, 799 categorical_feature=categorical_feature, --> 800 callbacks=callbacks) 801 return self 802 D:\Anaconda1\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, group, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_group, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks) 593 verbose_eval=verbose, feature_name=feature_name, 594 categorical_feature=categorical_feature, --> 595 callbacks=callbacks) 596 597 if evals_result: D:\Anaconda1\lib\site-packages\lightgbm\engine.py in train(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, 
early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks) 247 evaluation_result_list=None)) 248 --> 249 booster.update(fobj=fobj) 250 251 evaluation_result_list = [] D:\Anaconda1\lib\site-packages\lightgbm\basic.py in update(self, train_set, fobj) 1924 _safe_call(_LIB.LGBM_BoosterUpdateOneIter( 1925 self.handle, -> 1926 ctypes.byref(is_finished))) 1927 self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)] 1928 return is_finished.value == 1 KeyboardInterrupt:
# Show the best target value found so far and the parameters that produced it.
bayes_lgb.max
{'target': 0.7170845006643078, 'params': {'bagging_fraction': 0.5841220522935171, 'bagging_freq': 45.89371469870785, 'feature_fraction': 0.9788842825399383, 'max_depth': 15.098220845321368, 'min_child_weight': 4.606814369239687, 'min_data_in_leaf': 48.875222916404226, 'min_split_gain': 0.4837879568993534, 'num_leaves': 16.292948242912633, 'reg_alpha': 1.699317625022757, 'reg_lambda': 1.4494033099871717}}
# Tuned parameters with a smaller learning rate; lgb.cv() is used to find the
# number of boosting rounds that goes with them.
# 'silent' is not listed: LightGBM ignores it inside `params` and warns.
base_params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 14,
    'max_depth': 19,
    'min_data_in_leaf': 37,
    'min_child_weight': 1.6,
    'bagging_fraction': 0.98,
    'feature_fraction': 0.69,
    'bagging_freq': 96,
    'reg_lambda': 9,
    'reg_alpha': 7,
    'min_split_gain': 0.4,
    'nthread': 8,
    'seed': 2020,
    'verbose': -1,
}

# Cross-validate on the full training set, built explicitly here.
# Reusing the global `train_matrix` would pick up whichever split an earlier
# cell happened to leave behind.
full_train_matrix = lgb.Dataset(X_train, label=y_train)

cv_result_lgb = lgb.cv(
    train_set=full_train_matrix,
    early_stopping_rounds=1000,
    num_boost_round=20000,
    nfold=5,
    stratified=True,
    shuffle=True,
    params=base_params_lgb,
    metrics='auc',
    seed=0,
)

print('迭代次数{}'.format(len(cv_result_lgb['auc-mean'])))
print('最终模型的AUC为{}'.format(max(cv_result_lgb['auc-mean'])))
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# 5-fold cross-validation of LightGBM with the tuned parameters.

# The parameters are identical for every fold, so they are built once.
# 'silent' is not listed: LightGBM ignores it inside `params` and warns.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 14,
    'max_depth': 19,
    'min_data_in_leaf': 37,
    'min_child_weight': 1.6,
    'bagging_fraction': 0.98,
    'feature_fraction': 0.69,
    'bagging_freq': 96,
    'reg_lambda': 9,
    'reg_alpha': 7,
    'min_split_gain': 0.4,
    'nthread': 8,
    'seed': 2020,
}

cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    # kf.split() yields POSITIONAL indices, so both the frame and the label
    # series must be sliced with .iloc; y_train[train_index] would look the
    # numbers up as index labels instead.
    X_train_split, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_split, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params,
                      train_set=train_matrix,
                      num_boost_round=14269,
                      valid_sets=valid_matrix,
                      verbose_eval=1000,
                      early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)

    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_score_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))
from sklearn import metrics

# Tuned parameters for the final model.
# 'silent' is not listed: LightGBM ignores it inside `params` and warns.
base_params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 14,
    'max_depth': 19,
    'min_data_in_leaf': 37,
    'min_child_weight': 1.6,
    'bagging_fraction': 0.98,
    'feature_fraction': 0.69,
    'bagging_freq': 96,
    'reg_lambda': 9,
    'reg_alpha': 7,
    'min_split_gain': 0.4,
    'nthread': 8,
    'seed': 2020,
}

# Train the final model.
# NOTE(review): train_matrix / valid_matrix / X_val / y_val are whatever the
# most recently executed cell left behind - confirm this is the intended split.
final_model_lgb = lgb.train(base_params_lgb,
                            train_set=train_matrix,
                            valid_sets=valid_matrix,
                            num_boost_round=13000,
                            verbose_eval=1000,
                            early_stopping_rounds=200)

# Predict with the best iteration found by early stopping (made explicit, as
# in the baseline evaluation), then compute the ROC curve and its area.
val_pre_lgb = final_model_lgb.predict(X_val, num_iteration=final_model_lgb.best_iteration)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('调参后lightgbm单模型在验证集上的AUC:{}'.format(roc_auc))

# Draw the ROC curve.
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, 'b', label='Val AUC = %0.4f' % roc_auc)
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(0, 1)
plt.ylim(0, 1)
# The title is set once; a second plt.title() call would simply overwrite it.
plt.title('ROC')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='best')
plt.show()
import os
import pickle

# Persist the final model.
model_path = 'dataset/model_lgb_best.pkl'
# Create the target directory if needed so open() cannot fail on a missing
# folder.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# The context manager guarantees the file is flushed and closed; the bare
# open() passed straight to pickle.dump() left the handle open.
with open(model_path, 'wb') as model_file:
    pickle.dump(final_model_lgb, model_file)