- This project builds a predictive model for the Kaggle Homesite Quote Conversion competition, using xgboost's sklearn interface to model the data and predict whether a quote leads to a purchase.
```python
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()
```
|   | QuoteNumber | Original_Quote_Date | QuoteConversion_Flag | Field6 | Field7 | Field8 | Field9 | Field10 | Field11 | Field12 | ... | GeographicField59A | GeographicField59B | GeographicField60A | GeographicField60B | GeographicField61A | GeographicField61B | GeographicField62A | GeographicField62B | GeographicField63 | GeographicField64 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2013-08-16 | 0 | B | 23 | 0.9403 | 0.0006 | 965 | 1.0200 | N | ... | 9 | 9 | -1 | 8 | -1 | 18 | -1 | 10 | N | CA |
| 1 | 2 | 2014-04-22 | 0 | F | 7 | 1.0006 | 0.0040 | 548 | 1.2433 | N | ... | 10 | 10 | -1 | 11 | -1 | 17 | -1 | 20 | N | NJ |
| 2 | 4 | 2014-08-25 | 0 | F | 7 | 1.0006 | 0.0040 | 548 | 1.2433 | N | ... | 15 | 18 | -1 | 21 | -1 | 11 | -1 | 8 | N | NJ |
| 3 | 6 | 2013-04-15 | 0 | J | 10 | 0.9769 | 0.0004 | 1,165 | 1.2665 | N | ... | 6 | 5 | -1 | 10 | -1 | 9 | -1 | 21 | N | TX |
| 4 | 8 | 2014-01-25 | 0 | E | 23 | 0.9472 | 0.0006 | 1,487 | 1.3045 | N | ... | 18 | 22 | -1 | 10 | -1 | 11 | -1 | 12 | N | IL |

5 rows × 299 columns
```python
train = train.drop('QuoteNumber', axis=1)
test = test.drop('QuoteNumber', axis=1)
```
Converting the date format
```python
train['Date'] = pd.to_datetime(train['Original_Quote_Date'])
train = train.drop('Original_Quote_Date', axis=1)
test['Date'] = pd.to_datetime(test['Original_Quote_Date'])
test = test.drop('Original_Quote_Date', axis=1)

train['year'] = train['Date'].dt.year
train['month'] = train['Date'].dt.month
train['weekday'] = train['Date'].dt.weekday
train.head()
```
|   | QuoteConversion_Flag | Field6 | Field7 | Field8 | Field9 | Field10 | Field11 | Field12 | CoverageField1A | CoverageField1B | ... | GeographicField61A | GeographicField61B | GeographicField62A | GeographicField62B | GeographicField63 | GeographicField64 | Date | year | month | weekday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | B | 23 | 0.9403 | 0.0006 | 965 | 1.0200 | N | 17 | 23 | ... | -1 | 18 | -1 | 10 | N | CA | 2013-08-16 | 2013 | 8 | 4 |
| 1 | 0 | F | 7 | 1.0006 | 0.0040 | 548 | 1.2433 | N | 6 | 8 | ... | -1 | 17 | -1 | 20 | N | NJ | 2014-04-22 | 2014 | 4 | 1 |
| 2 | 0 | F | 7 | 1.0006 | 0.0040 | 548 | 1.2433 | N | 7 | 12 | ... | -1 | 11 | -1 | 8 | N | NJ | 2014-08-25 | 2014 | 8 | 0 |
| 3 | 0 | J | 10 | 0.9769 | 0.0004 | 1,165 | 1.2665 | N | 3 | 2 | ... | -1 | 9 | -1 | 21 | N | TX | 2013-04-15 | 2013 | 4 | 0 |
| 4 | 0 | E | 23 | 0.9472 | 0.0006 | 1,487 | 1.3045 | N | 8 | 13 | ... | -1 | 11 | -1 | 12 | N | IL | 2014-01-25 | 2014 | 1 | 5 |

5 rows × 301 columns
```python
test['year'] = test['Date'].dt.year
test['month'] = test['Date'].dt.month
test['weekday'] = test['Date'].dt.weekday

train = train.drop('Date', axis=1)
test = test.drop('Date', axis=1)
```
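Since the same three date features are derived for train and test, the steps above can also be factored into one helper. This is just a sketch; `add_date_features` is a name introduced here for illustration and does not appear in the original notebook.

```python
def add_date_features(df, date_col='Original_Quote_Date'):
    """Parse the quote date, derive year/month/weekday, drop the raw columns."""
    df = df.copy()
    df['Date'] = pd.to_datetime(df[date_col])
    df['year'] = df['Date'].dt.year
    df['month'] = df['Date'].dt.month
    df['weekday'] = df['Date'].dt.weekday
    return df.drop([date_col, 'Date'], axis=1)

# Equivalent to the cells above:
# train = add_date_features(train)
# test = add_date_features(test)
```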
Checking the data types
```python
train.dtypes
```

```
QuoteConversion_Flag      int64
Field6                   object
Field7                    int64
Field8                  float64
Field9                  float64
Field10                  object
Field11                 float64
Field12                  object
CoverageField1A           int64
CoverageField1B           int64
CoverageField2A           int64
CoverageField2B           int64
CoverageField3A           int64
CoverageField3B           int64
CoverageField4A           int64
CoverageField4B           int64
CoverageField5A           int64
CoverageField5B           int64
CoverageField6A           int64
CoverageField6B           int64
CoverageField8           object
CoverageField9           object
CoverageField11A          int64
CoverageField11B          int64
SalesField1A              int64
SalesField1B              int64
SalesField2A              int64
SalesField2B              int64
SalesField3               int64
SalesField4               int64
                         ...
GeographicField50B        int64
GeographicField51A        int64
GeographicField51B        int64
GeographicField52A        int64
GeographicField52B        int64
GeographicField53A        int64
GeographicField53B        int64
GeographicField54A        int64
GeographicField54B        int64
GeographicField55A        int64
GeographicField55B        int64
GeographicField56A        int64
GeographicField56B        int64
GeographicField57A        int64
GeographicField57B        int64
GeographicField58A        int64
GeographicField58B        int64
GeographicField59A        int64
GeographicField59B        int64
GeographicField60A        int64
GeographicField60B        int64
GeographicField61A        int64
GeographicField61B        int64
GeographicField62A        int64
GeographicField62B        int64
GeographicField63        object
GeographicField64        object
year                      int64
month                     int64
weekday                   int64
Length: 300, dtype: object
```
Viewing detailed DataFrame information
```python
train.info()
```

```
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260753 entries, 0 to 260752
Columns: 300 entries, QuoteConversion_Flag to weekday
dtypes: float64(6), int64(267), object(27)
memory usage: 596.8+ MB
```
Filling missing values
```python
train = train.fillna(-999)
test = test.fillna(-999)
```
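A quick sanity check (a minimal sketch, not in the original notebook) confirms the sentinel filled everything. Note that -999 is also passed as `missing` to `XGBClassifier` below, so xgboost still treats these cells as missing values rather than ordinary numbers.

```python
# Sanity check: the -999 sentinel should have removed every NaN.
assert train.isnull().sum().sum() == 0
assert test.isnull().sum().sum() == 0
```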
Converting the categorical (object) columns
```python
from sklearn import preprocessing

features = list(train.columns[1:])
for i in features:
    if train[i].dtype == 'object':
        # Fit the encoder on the union of train and test values, so labels
        # unseen in train but present in test are still covered.
        le = preprocessing.LabelEncoder()
        le.fit(list(train[i].values) + list(test[i].values))
        train[i] = le.transform(list(train[i].values))
        test[i] = le.transform(list(test[i].values))
```
Setting the model parameters
```python
# Brute-force scan over all parameters; a few tips:
#   - max_depth is usually 6, 7, or 8
#   - learning_rate is around 0.05, but small changes can make a big difference
#   - tuning min_child_weight, subsample and colsample_bytree helps fight overfitting
#   - n_estimators is the number of boosting rounds
#   - finally, ensembling xgboost over multiple seeds may reduce variance
xgb_model = xgb.XGBClassifier()
parameters = {'nthread': [4],  # with hyperthreading, xgboost may become slower
              'objective': ['binary:logistic'],
              'learning_rate': [0.05, 0.1],  # the so-called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5],  # number of trees; raise to 1000 for better results
              'missing': [-999],
              'seed': [1337]}

sfolder = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
clf = GridSearchCV(xgb_model, parameters, n_jobs=4,
                   cv=sfolder.split(train[features], train["QuoteConversion_Flag"]),
                   scoring='roc_auc', verbose=2, refit=True, return_train_score=True)
clf.fit(train[features], train["QuoteConversion_Flag"])
```
```
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  2.4min finished

GridSearchCV(cv=<generator object _BaseKFold.split at 0x0000000018459888>,
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'nthread': [4], 'objective': ['binary:logistic'], 'learning_rate': [0.05, 0.1], 'max_depth': [6], 'min_child_weight': [11], 'silent': [1], 'subsample': [0.8], 'colsample_bytree': [0.7], 'n_estimators': [5], 'missing': [-999], 'seed': [1337]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=2)
```
```python
clf.grid_scores_
```

```
c:\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_search.py:761: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20
  DeprecationWarning)

[mean: 0.94416, std: 0.00118, params: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 11, 'missing': -999, 'n_estimators': 5, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 1337, 'silent': 1, 'subsample': 0.8},
 mean: 0.94589, std: 0.00120, params: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 11, 'missing': -999, 'n_estimators': 5, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 1337, 'silent': 1, 'subsample': 0.8}]
```
```python
pd.DataFrame(clf.cv_results_['params'])
```
|   | colsample_bytree | learning_rate | max_depth | min_child_weight | missing | n_estimators | nthread | objective | seed | silent | subsample |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.7 | 0.05 | 6 | 11 | -999 | 5 | 4 | binary:logistic | 1337 | 1 | 0.8 |
| 1 | 0.7 | 0.10 | 6 | 11 | -999 | 5 | 4 | binary:logistic | 1337 | 1 | 0.8 |
```python
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
```
```
Raw AUC score: 0.9458947562485674
colsample_bytree: 0.7
learning_rate: 0.1
max_depth: 6
min_child_weight: 11
missing: -999
n_estimators: 5
nthread: 4
objective: 'binary:logistic'
seed: 1337
silent: 1
subsample: 0.8

c:\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_search.py:761: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20
  DeprecationWarning)
```
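As the warning says, `grid_scores_` was removed in scikit-learn 0.20. The same information is available through the stable API; a sketch, assuming the fitted `clf` from above:

```python
# Stable replacement for the deprecated grid_scores_ attribute.
print('Raw AUC score:', clf.best_score_)
for param_name, value in sorted(clf.best_params_.items()):
    print('%s: %r' % (param_name, value))

# Per-candidate mean/std CV scores live in cv_results_.
cv_summary = pd.DataFrame(clf.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
```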
```python
test_probs = clf.predict_proba(test[features])[:, 1]

sample = pd.read_csv('sample_submission.csv')
sample.QuoteConversion_Flag = test_probs
sample.to_csv("xgboost_best_parameter_submission.csv", index=False)

clf.best_estimator_.predict_proba(test[features])
```

```
array([[0.6988076 , 0.3011924 ],
       [0.6787684 , 0.3212316 ],
       [0.6797658 , 0.32023418],
       ...,
       [0.5018287 , 0.4981713 ],
       [0.6988076 , 0.3011924 ],
       [0.62464744, 0.37535256]], dtype=float32)
```
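The parameter comments above suggest ensembling xgboost over multiple seeds to reduce variance. A minimal sketch of that idea, reusing the best parameters found by the grid search (the seed list is arbitrary and chosen here purely for illustration):

```python
# Average the predicted probabilities of models trained with different seeds.
seed_predictions = []
for seed in [1337, 2674, 4011]:  # arbitrary seeds, for illustration only
    params = dict(clf.best_params_, seed=seed)
    model = xgb.XGBClassifier(**params)
    model.fit(train[features], train['QuoteConversion_Flag'])
    seed_predictions.append(model.predict_proba(test[features])[:, 1])
ensemble_probs = np.mean(seed_predictions, axis=0)
```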
The 0.5 cutoff used below can be set according to the needs of the actual project. Here the xgboost predictions are compared against those of a Keras neural network on the same test set.
```python
keras_result = pd.read_csv('keras_nn_test.csv')
result1 = [1 if i > 0.5 else 0 for i in keras_result['QuoteConversion_Flag']]
xgb_result = pd.read_csv('xgboost_best_parameter_submission.csv')
result2 = [1 if i > 0.5 else 0 for i in xgb_result['QuoteConversion_Flag']]

from sklearn import metrics
metrics.accuracy_score(result1, result2)
```

```
0.8566004740099864
```

```python
metrics.confusion_matrix(result1, result2)
```

```
array([[148836,  24862],
       [    66,     72]], dtype=int64)
```
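If labeled validation data is available, the cutoff can be chosen from the precision-recall trade-off instead of defaulting to 0.5. A sketch, where `y_val` and `val_probs` are hypothetical names assumed to come from a held-out split of the training data:

```python
from sklearn.metrics import precision_recall_curve

# y_val / val_probs: true labels and predicted probabilities on a held-out
# validation split (hypothetical, not defined in this notebook).
precision, recall, thresholds = precision_recall_curve(y_val, val_probs)
f1 = 2 * precision * recall / (precision + recall + 1e-12)
best_cutoff = thresholds[f1[:-1].argmax()]  # precision/recall are one longer than thresholds
```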
Conclusions
- The quote dates were preprocessed into year, month, and weekday features.
- The categorical columns were converted with LabelEncoder. On reflection this deserves reconsideration: one-hot encoding may be the better way to handle these categories (see the sketch after this list).
- Label encoding is useful in some situations, but those situations are limited. One more example: given [dog, cat, dog, mouse, cat], converting it to [1, 2, 1, 3, 2] produces the odd artifact that the average of dog and mouse is cat. This is why label encoding is not in widespread use.
- Applying the model to the test set gives a raw AUC of 0.94, but the corresponding accuracy is only 85%. In practice this is not a usable classifier: far too many actual 0s are predicted as 1, i.e. the false-positive rate is too high, so the real-world conversion rate would not be high either.
- Many tunable parameters were left untouched; if you are interested in parameter tuning, see the example in Meituan's text classification project.
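A minimal sketch of the one-hot alternative mentioned above, using `pd.get_dummies` on the raw frames (i.e. before the LabelEncoder step, while the columns still have object dtype). The `train_ohe` / `test_ohe` names are introduced here for illustration:

```python
# One-hot alternative to LabelEncoder. Concatenating train and test first
# keeps the resulting dummy columns aligned between the two frames.
combined = pd.concat([train[features], test[features]], axis=0)
object_cols = [c for c in features if combined[c].dtype == 'object']
combined = pd.get_dummies(combined, columns=object_cols)
train_ohe = combined.iloc[:len(train)]
test_ohe = combined.iloc[len(train):]
```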