scikit-learn机器学习(四)使用决策树作分类,并画出决策树,随机森林对比

数据来自 UCI 数据集 匹马印第安人糖尿病数据集数组

 

载入数据dom

# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib
matplotlib.rcParams['font.sans-serif']=[u'simHei']
matplotlib.rcParams['axes.unicode_minus']=False
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.datasets import load_breast_cancer

data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values[:,:]

y = data[:,8]
X = data[:,:8]
X_train,X_test,y_train,y_test = train_test_split(X,y)

 

创建决策树,网格搜索微调模型spa

# In[1] 网格搜索微调模型
pipeline = Pipeline([
        ('clf',DecisionTreeClassifier(criterion='entropy'))
        ])
parameters={
        'clf__max_depth':(3,5,10,15,20,25,30,35,40),
        'clf__min_samples_split':(2,3),
        'clf__min_samples_leaf':(1,2,3)
        }
#GridSearchCV 用于系统地遍历多种参数组合,经过交叉验证肯定最佳效果参数。
grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=-1,scoring='f1')
grid_search.fit(X_train,y_train)

# 获取搜索到的最优参数
best_parameters = grid_search.best_estimator_.get_params()
print("最好的F1值为:",grid_search.best_score_)
print('最好的参数为:')
for param_name in sorted(parameters.keys()):
    print('t%s: %r' % (param_name,best_parameters[param_name]))
    
# In[2] 输出预测结果并评价
predictions = grid_search.predict(X_test)
print(classification_report(y_test,predictions))

 

最好的F1值为: 0.5573515325670498
最好的参数为:
tclf__max_depth: 5
tclf__min_samples_leaf: 1
tclf__min_samples_split: 2

 

 

 

评价模型rest

# In[2] 输出预测结果并评价
predictions = grid_search.predict(X_test)
print(classification_report(y_test,predictions))

 

              precision    recall  f1-score   support

         0.0       0.74      0.89      0.81       124
         1.0       0.67      0.43      0.52        68

 

 

画出决策树code

# In[3]打印树
from sklearn import tree  
feature_name=data_set.columns.values.tolist()[:-1]   # 列名称
DT = tree.DecisionTreeClassifier(criterion='entropy',max_depth=5,min_samples_split=2,min_samples_leaf=5)
DT.fit(X_train,y_train)

'''
# 法一
import pydotplus
from sklearn.externals.six import StringIO
dot_data = StringIO()
tree.export_graphviz(DT,out_file = dot_data,feature_names=feature_name,
                     class_names=["有糖尿病","无病"],filled=True,rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("Tree.pdf")
print('Visible tree plot saved as pdf.')
'''

# 法二
import graphviz
#ID3为决策树分类器fit以后获得的模型,注意这里必须在fit后执行,在predict以后运行会报错
dot_data = tree.export_graphviz(DT, out_file=None,feature_names=feature_name,class_names=["有糖尿病","无病"]) # doctest: +SKIP
graph = graphviz.Source(dot_data) # doctest: +SKIP
#在同级目录下生成tree.pdf文件
graph.render("tree2") # doctest: +SKIP

 

 

随机森林blog

 

# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib
matplotlib.rcParams['font.sans-serif']=[u'simHei']
matplotlib.rcParams['axes.unicode_minus']=False
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import load_breast_cancer

data_set = pd.read_csv('pima-indians-diabetes.csv')
data = data_set.values[:,:]

y = data[:,8]
X = data[:,:8]
X_train,X_test,y_train,y_test = train_test_split(X,y)

RF = RandomForestClassifier(n_estimators=10,random_state=11)
RF.fit(X_train,y_train)
predictions = RF.predict(X_test)
print(classification_report(y_test,predictions))

 

              precision    recall  f1-score   support

         0.0       0.82      0.91      0.86       126
         1.0       0.78      0.61      0.68        66

   micro avg       0.81      0.81      0.81       192
   macro avg       0.80      0.76      0.77       192
weighted avg       0.80      0.81      0.80       192
相关文章
相关标签/搜索