# The Optimization of the Adaboost

### 1. 对于Adaboost error function的推导

再回到咱们上篇文章讲到的Adaboost算法,咱们要从Adaboost算法推导出GBDT。首先回顾一下上篇文章的Adaboost,主要思想就是把弱分类器集中起来获得一个强的分类器。首先第一次建造树的时候每个样本的权值都是同样的,以后的每一次训练只要有错误,那么这个错误就会被放大,而正确的权值就会被缩小,以后会获得每个模型的α,根据每个树的α把结果结合起来就获得需要的结果。
def loadDataSet(filename):
    """Load a CSV data set and recode label 0 as -1.

    :param filename: the file to open; passed straight to ``pd.read_csv``
    :return: the loaded DataFrame, with every 0 in the third column
        (position 2) replaced by -1
    """
    dataMat = pd.read_csv(filename)
    # One masked assignment instead of a Python loop of scalar ``iloc``
    # writes; the result is the same, row by row.
    zero_rows = (dataMat.iloc[:, 2] == 0).to_numpy()
    dataMat.iloc[zero_rows, 2] = -1
    return dataMat
def split_data(data_array, col, value):
    """Split the rows of ``data_array`` in two on one feature column.

    :param data_array: DataFrame to partition
    :param col: positional index of the feature column to compare
    :param value: threshold compared against that column
    :return: ``(array_1, array_2)`` -- the rows whose feature is
        ``>= value`` and the rows whose feature is ``< value``, in that order
    """
    feature = data_array.iloc[:, col]
    array_1 = data_array.loc[feature >= value, :]
    array_2 = data_array.loc[feature < value, :]
    return array_1, array_2
def getErr(data_array):
    """Return the variance of the last column times the number of rows.

    This is the total squared deviation of the target column from its mean,
    which the split search uses as the error of a node.

    :param data_array: DataFrame whose last column is the target
    :return: ``np.var`` of the last column multiplied by the row count
    """
    return np.var(data_array.iloc[:, -1]) * data_array.shape[0]
def regLeaf(data_array):
    """Return the value stored at a leaf: the mean of the last column.

    :param data_array: DataFrame whose last column is the target
    :return: ``np.mean`` of the last column
    """
    return np.mean(data_array.iloc[:, -1])
加载数据,分割数据,计算方差,计算叶子平均,其实就是计算拟合的类别了。
def get_best_split(data_array, ops=(1, 4)):
    """Find the feature/threshold pair with the lowest total split error.

    Every distinct value of every feature column (all columns but the last)
    is tried as a threshold; the error of a candidate is the sum of
    ``getErr`` over the two partitions returned by ``split_data``.

    :param data_array: DataFrame whose last column is the target
    :param ops: ``(tols, toln)`` -- ``tols`` is the minimum error reduction
        needed to accept a split, ``toln`` the minimum number of rows each
        side of a split must keep
    :return: ``(best_col, best_value)`` for the chosen split, or
        ``(None, regLeaf(data_array))`` when the node should be a leaf
    """
    tols = ops[0]
    toln = ops[1]

    # A single distinct target value cannot be improved by splitting.
    if len(set(data_array.iloc[:, -1])) == 1:
        return None, regLeaf(data_array)

    n_cols = data_array.shape[1]
    best_S = np.inf
    best_col = 0
    best_value = 0
    S = getErr(data_array)

    for col in range(n_cols - 1):
        for value in set(data_array.iloc[:, col]):
            array_1, array_2 = split_data(data_array, col, value)
            # Skip candidates that leave too few rows on either side.
            if (array_1.shape[0] < toln) or (array_2.shape[0] < toln):
                continue
            totalError = getErr(array_1) + getErr(array_2)
            if totalError < best_S:
                best_col = col
                best_value = value
                best_S = totalError

    # Not enough improvement. This also covers "no admissible candidate",
    # where best_S is still inf and the difference is -inf.
    if (S - best_S) < tols:
        return None, regLeaf(data_array)

    # Re-check the winning split's partition sizes before committing to it.
    array_1, array_2 = split_data(data_array, best_col, best_value)
    if (array_1.shape[0] < toln) or (array_2.shape[0] < toln):
        return None, regLeaf(data_array)
    return best_col, best_value
复制代码
def treeCast(tree, inData):
    """Walk ``tree`` for one sample and return the leaf value it reaches.

    :param tree: node with attributes ``results``, ``col``, ``value``,
        ``gb`` and ``lb``; a node whose ``results`` is not None is a leaf
    :param inData: one sample, indexed positionally with ``.iloc``
    :return: the ``results`` value of the leaf reached
    """
    if tree.results is not None:
        return tree.results
    # ``>=`` mirrors split_data, which puts rows with feature >= value in its
    # first partition. NOTE(review): assumes buildTree (not shown here) hangs
    # that first partition under ``gb`` -- confirm against buildTree.
    if inData.iloc[tree.col] >= tree.value:
        return treeCast(tree.gb, inData)
    else:
        return treeCast(tree.lb, inData)
def createForeCast(tree, testData):
    """Predict every row of ``testData`` with ``tree``.

    :param tree: tree node accepted by ``treeCast``
    :param testData: DataFrame of samples, one per row
    :return: ``np.matrix`` of shape ``(len(testData), 1)`` holding the
        ``treeCast`` result for each row
    """
    m = len(testData)
    # np.asmatrix builds the same np.matrix that np.mat did; the np.mat
    # alias was removed in NumPy 2.0.
    yHat = np.asmatrix(np.zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeCast(tree, testData.iloc[i])
    return yHat
建立分类。
def GBDT_model(data_array, num_iter, ops=(1, 4)):
    """Fit ``num_iter`` trees in sequence, each on the previous residual.

    The first tree is built on ``data_array`` itself. Every later tree is
    built on the features joined with the residual ``y - yHat``, and its
    predictions are added to the running ``yHat``.

    :param data_array: DataFrame whose last column is the target
    :param num_iter: number of trees to build
    :param ops: options forwarded unchanged to ``buildTree``
    :return: ``(list_trees, yHat)`` -- the trees in build order and the
        accumulated predictions on the training features as an ``(m, 1)``
        ``np.matrix``; ``yHat`` is None when ``num_iter`` is 0
    """
    x = data_array.iloc[:, 0:-1]
    # Column matrix of targets; np.asmatrix replaces np.mat, which was
    # removed in NumPy 2.0.
    y = np.asmatrix(data_array.iloc[:, -1]).T
    list_trees = []
    yHat = None
    for i in range(num_iter):
        print('the ', i, ' tree')
        if yHat is None:
            # First round: fit the original targets.
            train = data_array
        else:
            # Later rounds: fit what the current ensemble still gets wrong.
            r = y - yHat
            train = pd.DataFrame(np.hstack((x, r)))
        tree = buildTree(train, ops)
        list_trees.append(tree)
        stage = createForeCast(tree, x)
        yHat = stage if yHat is None else yHat + stage
    return list_trees, yHat
这里只是使用了回归问题的回归树:用 x 和残差 (y - s) 作拟合(s 为当前的累计预测值),再把拟合结果累加到预测值上便可。
接下来就是画图了:
def getwidth(tree):
    """Return the number of leaves under ``tree``.

    A node with neither a ``gb`` nor an ``lb`` child counts as one leaf.
    """
    if tree.gb is None and tree.lb is None:
        return 1
    return getwidth(tree.gb) + getwidth(tree.lb)
def getdepth(tree):
    """Return the number of edges on the longest path from ``tree`` to a leaf.

    A node with neither a ``gb`` nor an ``lb`` child has depth 0.
    """
    if tree.gb is None and tree.lb is None:
        return 0
    return max(getdepth(tree.gb), getdepth(tree.lb)) + 1
def drawtree(tree, jpeg='tree.jpg'):
    """Render ``tree`` to an image file.

    The canvas is 100 px wide per leaf (``getwidth``) and 100 px tall per
    level (``getdepth``) plus a 120 px margin, on a white background.

    :param tree: root node of the tree to draw
    :param jpeg: output path; the image is saved in JPEG format
    """
    w = getwidth(tree) * 100
    h = getdepth(tree) * 100 + 120

    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)

    # The root is centred horizontally, 20 px below the top edge.
    drawnode(draw, tree, w / 2, 20)
    img.save(jpeg, 'JPEG')
def drawnode(draw, tree, x, y):
    """Draw ``tree`` and its descendants, with the node itself at ``(x, y)``.

    :param draw: drawing object providing ``text`` and ``line``
    :param tree: node to draw; ``results`` is None for a decision node
    :param x: horizontal position of this node
    :param y: vertical position of this node
    """
    if tree.results is None:
        # Get the width of each branch
        w1 = getwidth(tree.lb) * 100
        w2 = getwidth(tree.gb) * 100
        # Determine the total space required by this node
        left = x - (w1 + w2) / 2
        right = x + (w1 + w2) / 2
        # Draw the condition string ("<column>:<threshold>")
        draw.text((x - 20, y - 10), str(tree.col) + ':' + str(tree.value), (0, 0, 0))
        # Draw links to the branches
        draw.line((x, y, left + w1 / 2, y + 100), fill=(255, 0, 0))
        draw.line((x, y, right - w2 / 2, y + 100), fill=(255, 0, 0))
        # Draw the branch nodes: lb on the left, gb on the right, one level down
        drawnode(draw, tree.lb, left + w1 / 2, y + 100)
        drawnode(draw, tree.gb, right - w2 / 2, y + 100)
    else:
        # Leaf: print its stored value
        txt = str(tree.results)
        draw.text((x - 20, y), txt, (0, 0, 0))
之后就是运行主函数了:
if __name__ == '__main__':
    # Load the data, build and draw a single tree, then fit 10 boosted trees.
    data = loadDataSet('../Data/LogiReg_data.txt')
    tree = buildTree(data)
    drawtree(tree, jpeg='treeview_cart.jpg')
    gbdt_results, y = GBDT_model(data, 10)
    print(y)
    # y is an (m, 1) matrix of accumulated scores; print a class per row from
    # the sign of the score. A score of exactly 0 prints nothing.
    for i in range(len(y)):
        score = y[i, 0]
        if score > 0:
            print('1')
        elif score < 0:
            print('0')
复制代码