1 A simple sklearn example
from sklearn import svm
X = [[2, 0], [1, 1], [2, 3]]
y = [0, 0, 1]
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
print(clf)
# get support vectors
print(clf.support_vectors_)
# get indices of support vectors
print(clf.support_)
# get number of support vectors for each class
print(clf.n_support_)
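A quick follow-up check (not in the original notes): the fitted model can classify new points with predict.
print(clf.predict([[2, 0]]))   # a point on the class-0 side; expected output: [0]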
2 Plotting the decision boundary with sklearn
print(__doc__)
import numpy as np
import pylab as pl
from sklearn import svm
# we create 40 separable points
np.random.seed(0)
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20
# fit the model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
# get the separating hyperplane
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]
# plot the parallels to the separating hyperplane that pass through the
# support vectors
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])
print "w: ", w
print "a: ", a
# print " xx: ", xx
# print " yy: ", yy
print "support_vectors_: ", clf.support_vectors_
print "clf.coef_: ", clf.coef_
# In scikit-learn coef_ attribute holds the vectors of the separating hyperplanes for linear models. It has shape (n_classes, n_features) if n_classes > 1 (multi-class one-vs-all) and (1, n_features) for binary classification.
#
# In this toy binary classification example, n_features == 2, hence w = coef_[0] is the vector orthogonal to the hyperplane (the hyperplane is fully defined by it + the intercept).
#
# To plot this hyperplane in the 2D case (any hyperplane of a 2D plane is a 1D line), we want to find a f as in y = f(x) = a.x + b. In this case a is the slope of the line and can be computed by a = -w[0] / w[1].
# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')
pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
s=80, facecolors='none')
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
pl.axis('tight')
pl.show()
5.2 Support Vector Machine (SVM) Algorithm (Part 2)
1. Properties of the SVM algorithm:
1.1 The complexity of a trained model is determined by the number of support vectors, not by the dimensionality of the data. For this reason SVMs are not very prone to overfitting.
1.2 An SVM model depends entirely on the support vectors: even if every non-support-vector point were removed from the training set and training were repeated, exactly the same model would be obtained.
1.3 If training yields a relatively small number of support vectors, the resulting model tends to generalize well.
2. The linearly inseparable case
2.1 The vectors corresponding to the data set cannot be separated by a hyperplane in the original space.
2.2 Two steps are used to deal with this:
2.2.1 Use a nonlinear mapping to transform the vectors of the original data set into a higher-dimensional space.
2.2.2 Find a linear separating hyperplane in that higher-dimensional space and proceed as in the linearly separable case.
2.3 How is the original data transformed into the higher-dimensional space by a nonlinear mapping?
2.3.1 Example (one possible mapping is sketched after this item):
a 3-dimensional input vector
is transformed into a 6-dimensional space Z;
the new decision hyperplane,
where W and Z are vectors, is linear.
After solving for W and b, substitute them back into the original equation.
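A commonly used mapping of this kind (an assumption on my part, not necessarily the exact one from the lecture) is:
Z = phi(X) = (z1, ..., z6) = (x1, x2, x3, x1^2, x1*x2, x1*x3)
d(Z) = w1*z1 + w2*z2 + ... + w6*z6 + b
so d(Z) is linear in Z even though it is a nonlinear (quadratic) surface in the original X.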
2.3.2 Questions to think about:
2.3.2.1: How do we choose a reasonable nonlinear transformation to map the data into the higher-dimensional space?
2.3.2.2: How do we deal with the very high computational cost of the inner products involved?
2.3.3 Use the kernel trick.
3. The kernel trick
3.1 Motivation
When the linear SVM is reformulated as an optimization problem, every quantity to be computed appears in the form of an inner product (dot product) phi(Xi) . phi(Xj), where phi is the nonlinear mapping that transforms the training vectors into the higher-dimensional space. Because computing these inner products is very expensive, a kernel function is used in place of the inner product of the nonlinear mappings.
3.2 The following kernel functions are equal to the inner product of the corresponding nonlinear mappings: K(Xi, Xj) = phi(Xi) . phi(Xj)
3.3 Commonly used kernel functions (their standard forms are sketched after this list):
polynomial kernel of degree h
Gaussian radial basis function (RBF) kernel
sigmoid function kernel
How do we choose which kernel to use?
Use prior knowledge: for example, the RBF kernel is commonly used for image classification, while text classification usually does not use RBF.
Try different kernels and decide based on the resulting accuracy.
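The kernel formulas themselves are not reproduced above; the standard textbook forms (an assumption about what the slides showed) are:
K(Xi, Xj) = (Xi . Xj + 1)^h                         (polynomial kernel of degree h)
K(Xi, Xj) = exp(-||Xi - Xj||^2 / (2*sigma^2))       (Gaussian RBF kernel)
K(Xi, Xj) = tanh(kappa * Xi . Xj - delta)           (sigmoid kernel)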
3.4 Kernel function example:
Suppose we define two vectors: x = (x1, x2, x3); y = (y1, y2, y3)
and define the mapping: f(x) = (x1x1, x1x2, x1x3, x2x1, x2x2, x2x3, x3x1, x3x2, x3x3)
with kernel K(x, y) = (<x, y>)^2
Let x = (1, 2, 3); y = (4, 5, 6). Then
f(x) = (1, 2, 3, 2, 4, 6, 3, 6, 9)
f(y) = (16, 20, 24, 20, 25, 30, 24, 30, 36)
<f(x), f(y)> = 16 + 40 + 72 + 40 + 100 + 180 + 72 + 180 + 324 = 1024
K(x, y) = (4 + 10 + 18)^2 = 32^2 = 1024
The results are the same, but the kernel computation is much easier.
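A quick numerical check of the same example (not in the original notes):
import numpy as np
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])
fx = np.outer(x, x).ravel()   # f(x): all pairwise products xi*xj
fy = np.outer(y, y).ravel()   # f(y)
print(np.dot(fx, fy))         # 1024, the inner product in the 9-dimensional feature space
print(np.dot(x, y) ** 2)      # 1024, the kernel K(x, y) = <x, y>^2 without the explicit mapping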
4. Extending SVM to multi-class classification
For each class, train a binary classifier of that class against all the others (one-vs-rest).
5.3 Support Vector Machine (SVM) Algorithm (Part 2): Application
Face recognition example using SVM:
from __future__ import print_function
from time import time
import logging
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_lfw_people
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import RandomizedPCA
from sklearn.svm import SVC
print(__doc__)
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
###############################################################################
# Download the data, if not already on disk and load it as numpy arrays
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape
# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]
# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]
print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
###############################################################################
# Split into a training set and a test set using a stratified k fold
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.25)
###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150
print("Extracting the top %d eigenfaces from %d faces"
% (n_components, X_train.shape[0]))
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))
eigenfaces = pca.components_.reshape((n_components, h, w))
print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))
###############################################################################
# Train a SVM classification model
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
###############################################################################
# Quantitative evaluation of the model quality on the test set
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))
print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))
###############################################################################
# Qualitative evaluation of the predictions using matplotlib
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())
# plot the result of the prediction on a portion of the test set
def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)
prediction_titles = [title(y_pred, y_test, target_names, i)
for i in range(y_pred.shape[0])]
plot_gallery(X_test, prediction_titles, h, w)
# plot the gallery of the most significative eigenfaces
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)
plt.show()
6.1 Neural Network Algorithm (Neural Networks), Part 1
1. Background:
1.1 Inspired by the neural networks of the human brain; many different versions have appeared over the years.
1.2 The most famous algorithm is backpropagation, from 1980.
2. Multilayer feed-forward neural networks
2.1 Backpropagation is used on multilayer feed-forward neural networks.
2.2 A multilayer feed-forward neural network consists of the following parts:
an input layer, hidden layers, and an output layer.
2.3 Each layer is made up of units.
2.4 The input layer is fed with the feature vectors of the training instances.
2.5 Values are passed to the next layer through the weights on the connections; the output of one layer is the input of the next.
2.6 The number of hidden layers can be arbitrary; there is one input layer and one output layer.
2.7 Each unit may also be called a neural node, after its biological origin.
2.8 The network above is called a 2-layer neural network (the input layer is not counted).
2.9 Within a layer, a weighted sum is computed and then transformed into the output by a nonlinear function.
2.10 In theory, a multilayer feed-forward network with enough hidden layers and a large enough training set can approximate any function.
3. Designing the network structure
3.1 Before a neural network can be trained, the number of layers and the number of units in each layer must be determined.
3.2 Feature vectors are usually normalized to values between 0 and 1 before being fed into the input layer (to speed up learning).
3.3 A discrete-valued feature can be encoded with one input unit for each value the feature can take.
For example: if feature A can take three values (a0, a1, a2), three input units can be used to represent A.
If A = a0, the unit representing a0 is set to 1 and the others to 0;
if A = a1, the unit representing a1 is set to 1 and the others to 0, and so on.
3.4 Neural networks can be used both for classification problems and for regression problems.
3.4.1 For classification with 2 classes, a single output unit is enough (0 and 1 represent the two classes).
If there are more than 2 classes, each class is represented by one output unit,
so the number of output units usually equals the number of classes.
3.4.2 There is no clear rule for deciding how many hidden layers are best.
3.4.2.1 Experiment and improve based on testing, the error, and the accuracy.
4. Cross-validation
K-fold cross validation (a minimal scikit-learn sketch follows).
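The notes only name the method; a minimal sketch of my own, assuming the newer sklearn.model_selection API (elsewhere these notes use the older sklearn.cross_validation module):
import numpy as np
from sklearn.model_selection import KFold
X = np.arange(10)
for train_idx, test_idx in KFold(n_splits=5).split(X):
    print(train_idx, test_idx)   # each of the 5 folds holds out 2 of the 10 samples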
5. The backpropagation algorithm
5.1 Processes the instances of the training set iteratively.
5.2 Compares the predicted value produced at the output layer of the network with the true target value.
5.3 Works backwards (output layer => hidden layers => input layer), updating the weight of each connection so as to minimize the error.
5.4 The algorithm in detail
Input: D, a data set; l, the learning rate; a multilayer feed-forward neural network.
Output: a trained neural network.
5.4.1 Initialize the weights and biases: random values between -1 and 1, or between -0.5 and 0.5; each unit has one bias.
5.4.2 For each training instance X, perform the following steps:
5.4.2.1: Propagate forward from the input layer.
5.4.2.2: Propagate the error backwards, computing
the error for the output layer,
the error for the hidden layers,
the weight update,
and the bias update (the standard formulas are sketched below).
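For sigmoid units, the standard update formulas (presumably the ones these slides use) are:
output unit j:   Err_j = O_j * (1 - O_j) * (T_j - O_j)
hidden unit j:   Err_j = O_j * (1 - O_j) * sum_k(Err_k * w_jk)
weight update:   w_ij = w_ij + (l) * Err_j * O_i
bias update:     theta_j = theta_j + (l) * Err_j
where O_j is the output of unit j, T_j the target value, l the learning rate, and w_jk the weight from unit j to unit k in the next layer.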
5.4.3 Termination conditions
5.4.3.1 The weight updates fall below some threshold.
5.4.3.2 The prediction error rate falls below some threshold.
5.4.3.3 A preset number of iterations has been reached.
6. A backpropagation example
One forward pass and one backward pass are worked through using the same output-layer error, hidden-layer error, weight-update, and bias-update formulas as in 5.4.2.2.
6.2 Neural Network Algorithm (Neural Networks) Applications, Part 1
1. Nonlinear transformation functions
A sigmoid ("S-curve") function is used as the activation function, in one of two forms:
1.1 the hyperbolic tangent (tanh)
1.2 the logistic function
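Their formulas and derivatives (which the code below relies on) are:
tanh(x) = (e^x - e^-x) / (e^x + e^-x),      tanh'(x) = 1 - tanh(x)^2
logistic(x) = 1 / (1 + e^-x),               logistic'(x) = logistic(x) * (1 - logistic(x))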
2. Implementing a simple neural network algorithm
import numpy as np

def tanh(x):
    return np.tanh(x)

def tanh_deriv(x):
    return 1.0 - np.tanh(x) * np.tanh(x)

def logistic(x):
    return 1 / (1 + np.exp(-x))

def logistic_derivative(x):
    return logistic(x) * (1 - logistic(x))

class NeuralNetwork:
    def __init__(self, layers, activation='tanh'):
        """
        :param layers: A list containing the number of units in each layer.
        Should be at least two values
        :param activation: The activation function to be used. Can be
        "logistic" or "tanh"
        """
        if activation == 'logistic':
            self.activation = logistic
            self.activation_deriv = logistic_derivative
        elif activation == 'tanh':
            self.activation = tanh
            self.activation_deriv = tanh_deriv
        self.weights = []
        for i in range(1, len(layers) - 1):
            self.weights.append((2 * np.random.random((layers[i - 1] + 1, layers[i] + 1)) - 1) * 0.25)
            self.weights.append((2 * np.random.random((layers[i] + 1, layers[i + 1])) - 1) * 0.25)

    def fit(self, X, y, learning_rate=0.2, epochs=10000):
        X = np.atleast_2d(X)
        temp = np.ones([X.shape[0], X.shape[1] + 1])
        temp[:, 0:-1] = X  # adding the bias unit to the input layer
        X = temp
        y = np.array(y)
        for k in range(epochs):
            i = np.random.randint(X.shape[0])
            a = [X[i]]
            for l in range(len(self.weights)):  # going forward through the network, layer by layer
                a.append(self.activation(np.dot(a[l], self.weights[l])))  # compute the node values for each layer (O_i) using the activation function
            error = y[i] - a[-1]  # compute the error at the top layer
            deltas = [error * self.activation_deriv(a[-1])]  # for the output layer: Err calculation (delta is the updated error)
            # starting backpropagation
            for l in range(len(a) - 2, 0, -1):  # we need to begin at the second to last layer
                # compute the updated error (i.e. deltas) for each node, going from the top layer back to the input layer
                deltas.append(deltas[-1].dot(self.weights[l].T) * self.activation_deriv(a[l]))
            deltas.reverse()
            for i in range(len(self.weights)):
                layer = np.atleast_2d(a[i])
                delta = np.atleast_2d(deltas[i])
                self.weights[i] += learning_rate * layer.T.dot(delta)

    def predict(self, x):
        x = np.array(x)
        temp = np.ones(x.shape[0] + 1)
        temp[0:-1] = x
        a = temp
        for l in range(0, len(self.weights)):
            a = self.activation(np.dot(a, self.weights[l]))
        return a
6.3 Neural Network Algorithm (Neural Networks) Applications, Part 2
1. Test on a simple nonlinear data set (XOR):
X1 X2 | Y
0  0  | 0
0  1  | 1
1  0  | 1
1  1  | 0
Code:
from NeuralNetwork import NeuralNetwork
import numpy as np
nn = NeuralNetwork([2,2,1], 'tanh')
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])
nn.fit(X, y)
for i in [[0, 0], [0, 1], [1, 0], [1, 1]]:
    print(i, nn.predict(i))
2. Handwritten digit recognition:
Each image is 8x8 pixels.
Digits to recognize: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
Code:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer
from NeuralNetwork import NeuralNetwork
from sklearn.cross_validation import train_test_split
digits = load_digits()
X = digits.data
y = digits.target
X -= X.min() # normalize the values to bring them into the range 0-1
X /= X.max()
nn = NeuralNetwork([64,100,10],'logistic')
X_train, X_test, y_train, y_test = train_test_split(X, y)
labels_train = LabelBinarizer().fit_transform(y_train)
labels_test = LabelBinarizer().fit_transform(y_test)
print "start fitting"
nn.fit(X_train,labels_train,epochs=3000)
predictions = []
for i in range(X_test.shape[0]):
o = nn.predict(X_test[i] )
predictions.append(np.argmax(o))
print confusion_matrix(y_test,predictions)
print classification_report(y_test,predictions)
7.1 Simple Linear Regression, Part 1
0. Preliminaries:
Why do we need summary statistics?
A statistic describes a characteristic of the data.
0.1 Measures of central tendency
0.1.1 Mean (average): x_bar = (x1 + x2 + ... + xn) / n
{6, 2, 9, 1, 2}
(6 + 2 + 9 + 1 + 2) / 5 = 20 / 5 = 4
0.1.2 Median: sort the values by size and take the one in the middle position.
0.1.2.1. Sort the data: 1, 2, 2, 6, 9
0.1.2.2. Take the value in the middle position: 2
When n is odd: take the value in the middle position directly.
When n is even: take the average of the two middle values.
0.1.3 Mode: the value that occurs most often in the data.
0.2 Measures of dispersion
0.2.1 Variance: s^2 = sum((x_i - x_bar)^2) / (n - 1)
{6, 2, 9, 1, 2}
(1) (6 - 4)^2 + (2 - 4)^2 + (9 - 4)^2 + (1 - 4)^2 + (2 - 4)^2
= 4 + 4 + 25 + 9 + 4
= 46
(2) n - 1 = 5 - 1 = 4
(3) 46 / 4 = 11.5
0.2.2 Standard deviation: s = sqrt(s^2)
s = sqrt(11.5) = 3.39
1. Introduction: in regression, the Y variable is a continuous numerical variable,
e.g. house price, number of people, rainfall.
In classification, the Y variable is a categorical variable,
e.g. color category, computer brand, creditworthy or not.
2. Simple linear regression
2.1 Many decision-making processes are based on the relationship between two or more variables.
2.2 Regression analysis is used to build an equation that models how two or more variables are related.
2.3 The variable being predicted is called the dependent variable (y), the output.
2.4 The variable used to make the prediction is called the independent variable (x), the input.
3. Introduction to simple linear regression
3.1 Simple linear regression involves one independent variable (x) and one dependent variable (y).
3.2 The relationship between the two variables is modeled by a straight line.
3.3 If there is more than one independent variable, the analysis is called multiple regression.
4. The simple linear regression model
4.1 The equation that describes how the dependent variable (y) is related to the independent variable (x) and an error term is called the regression model.
4.2 The simple linear regression model is: y = β0 + β1*x + ε
where β0 and β1 are parameters and ε is the error term.
5. The simple linear regression equation
E(y) = β0 + β1*x
The graph of this equation is a straight line, called the regression line, where:
β0 is the intercept of the regression line,
β1 is the slope of the regression line,
E(y) is the expected value (mean) of y for a given value of x.
6. Positive linear relationship.
7. Negative linear relationship.
8. No relationship.
9. The estimated simple linear regression equation
y_hat = b0 + b1*x
This equation is called the estimated regression line, where:
b0 is the intercept of the estimated regression line,
b1 is the slope of the estimated regression line,
y_hat is the estimated value of y when the independent variable x takes a given value.
10. The linear regression analysis workflow.
11. Assumptions about the error term ε
11.1 ε is a random variable with mean 0.
11.2 The variance of ε is the same for all values of the independent variable x.
11.3 The values of ε are independent.
11.4 ε follows a normal distribution.
7.2 Simple Linear Regression, Part 2
1. A simple linear regression example:
A car dealer's number of TV ads per week versus the number of cars sold; the data used below are x = (1, 3, 2, 1, 3) and y = (14, 24, 18, 17, 27), with means x_bar = 2 and y_bar = 20.
1.1 How do we find the best regression line for the simple linear regression model?
Minimize the sum of squared residuals.
1.1.2 Calculation: b1 = sum((x_i - x_bar)(y_i - y_bar)) / sum((x_i - x_bar)^2), b0 = y_bar - b1 * x_bar
numerator = (1-2)(14-20) + (3-2)(24-20) + (2-2)(18-20) + (1-2)(17-20) + (3-2)(27-20)
= 6 + 4 + 0 + 3 + 7
= 20
denominator = (1-2)^2 + (3-2)^2 + (2-2)^2 + (1-2)^2 + (3-2)^2
= 1 + 1 + 0 + 1 + 1
= 4
b1 = 20/4 = 5
b0 = 20 - 5*2 = 20 - 10 = 10
1.2 Prediction:
Suppose there are 6 TV ads in a given week; what is the predicted number of cars sold?
x_given = 6
y_hat = 5*6 + 10 = 40
1.3 Python implementation:
import numpy as np

def fitSLR(x, y):
    n = len(x)
    denominator = 0
    numerator = 0
    for i in range(0, n):
        numerator += (x[i] - np.mean(x)) * (y[i] - np.mean(y))
        denominator += (x[i] - np.mean(x)) ** 2
    b1 = numerator / float(denominator)
    b0 = np.mean(y) - b1 * np.mean(x)   # intercept: y_bar minus slope times x_bar
    return b0, b1

def predict(x, b0, b1):
    return b0 + x * b1

x = [1, 3, 2, 1, 3]
y = [14, 24, 18, 17, 27]
b0, b1 = fitSLR(x, y)
print("intercept:", b0, " slope:", b1)
x_test = 6
y_test = predict(x_test, b0, b1)
print("y_test:", y_test)
7.3 Multiple Regression
1. Difference from simple linear regression:
there are multiple independent variables (x).
2. The multiple regression model
y = β0 + β1*x1 + β2*x2 + ... + βp*xp + ε
where β0, β1, β2, ..., βp are parameters and ε is the error term.
3. The multiple regression equation
E(y) = β0 + β1*x1 + β2*x2 + ... + βp*xp
4. The estimated multiple regression equation:
y_hat = b0 + b1*x1 + b2*x2 + ... + bp*xp
A sample is used to compute b0, b1, b2, ..., bp, the point estimates of the parameters β0, β1, β2, ..., βp.
5. Estimation procedure (similar to simple linear regression).
6. Estimation method:
minimize the sum of squares;
the computation is similar to simple linear regression but involves linear algebra and matrix operations.
7. Example
A delivery company: X1 = miles traveled, X2 = number of deliveries, Y = total travel time.
Driving Assignment | X1 = Miles Traveled | X2 = Number of Deliveries | Y = Travel Time (Hours)
1  | 100 | 4 | 9.3
2  | 50  | 3 | 4.8
3  | 100 | 4 | 8.9
4  | 100 | 2 | 6.5
5  | 50  | 2 | 4.2
6  | 80  | 2 | 6.2
7  | 75  | 3 | 7.4
8  | 65  | 4 | 6.0
9  | 90  | 3 | 7.6
10 | 90  | 2 | 6.1
Time = b0+ b1*Miles + b2 * Deliveries
Time = -0.869 + 0.0611 Miles + 0.923 Deliveries
8. Interpreting the parameters
b1: on average, each additional mile traveled increases the travel time by 0.0611 hours.
b2: on average, each additional delivery increases the travel time by 0.923 hours.
9. Prediction
If a delivery assignment covers 102 miles with 6 deliveries, how many hours are expected?
Time = -0.869 + 0.0611*102 + 0.923*6
= 10.9 (hours)
10. What if one of the independent variables is categorical? (A dummy-coding sketch follows the table below.)
Miles | Deliveries | Car Type | Time
100 | 4 | 1 | 9.3
50  | 3 | 0 | 4.8
100 | 4 | 1 | 8.9
100 | 2 | 2 | 6.5
50  | 2 | 2 | 4.2
80  | 2 | 1 | 6.2
75  | 3 | 1 | 7.4
65  | 4 | 0 | 6.0
90  | 3 | 0 | 7.6
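The notes stop at the question; one common way to handle the categorical column is dummy (one-hot) coding, roughly as below. The variable names and the choice of car type 0 as the baseline are mine, not from the original notes.
import numpy as np
from sklearn import linear_model

# data copied from the table above
miles      = [100, 50, 100, 100, 50, 80, 75, 65, 90]
deliveries = [4, 3, 4, 2, 2, 2, 3, 4, 3]
car_type   = [1, 0, 1, 2, 2, 1, 1, 0, 0]
time       = [9.3, 4.8, 8.9, 6.5, 4.2, 6.2, 7.4, 6.0, 7.6]

# dummy-code the 3-level "car type" as two 0/1 columns, with type 0 as the baseline
is_type1 = [1 if t == 1 else 0 for t in car_type]
is_type2 = [1 if t == 2 else 0 for t in car_type]
X = np.column_stack([miles, deliveries, is_type1, is_type2])

regr = linear_model.LinearRegression().fit(X, time)
print(regr.intercept_, regr.coef_)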
11. Distribution of the error term
The error ε is a random variable with mean 0.
The variance of ε is the same for all values of the independent variables.
All values of ε are independent.
ε follows a normal distribution, and E(y) = β0 + β1*x1 + β2*x2 + ... + βp*xp gives the expected value of y.
7.4 Multiple Regression: Application
1. Example
A delivery company: X1 = miles traveled, X2 = number of deliveries, Y = total travel time.
Driving Assignment | X1 = Miles Traveled | X2 = Number of Deliveries | Y = Travel Time (Hours)
1  | 100 | 4 | 9.3
2  | 50  | 3 | 4.8
3  | 100 | 4 | 8.9
4  | 100 | 2 | 6.5
5  | 50  | 2 | 4.2
6  | 80  | 2 | 6.2
7  | 75  | 3 | 7.4
8  | 65  | 4 | 6.0
9  | 90  | 3 | 7.6
10 | 90  | 2 | 6.1
The goal is to find b0, b1, ..., bp in the estimated equation
y_hat = b0 + b1*x1 + b2*x2 + ... + bp*xp
2. Python code:
from numpy import genfromtxt
import numpy as np
from sklearn import datasets, linear_model

dataPath = r"D:\MaiziEdu\DeepLearningBasics_MachineLearning\Datasets\Delivery.csv"
deliveryData = genfromtxt(dataPath, delimiter=',')
print("data")
print(deliveryData)
X = deliveryData[:, :-1]
Y = deliveryData[:, -1]
print("X:")
print(X)
print("Y: ")
print(Y)
regr = linear_model.LinearRegression()
regr.fit(X, Y)
print("coefficients")
print(regr.coef_)
print("intercept: ")
print(regr.intercept_)
xPred = [[102, 6]]  # predict() expects a 2-D array: one row per sample
yPred = regr.predict(xPred)
print("predicted y: ")
print(yPred)
7.5 Nonlinear Regression: Logistic Regression
1. Probability:
1.1 Definition. Probability (P): a measure of how likely an event is to occur.
1.2 Range: 0 <= P <= 1
1.3 Ways to obtain it:
1.3.1 from personal belief
1.3.2 from historical data
1.3.3 from simulated data
1.4 Conditional probability: P(A|B) = P(AB) / P(B)
2. Logistic Regression
2.1 Example: classify by thresholding the prediction, e.g. predict the positive class when h(x) > 0.5 (or, with a different threshold, when h(x) > 0.2).
2.2 Basic model
The test data are X = (x0, x1, x2, ..., xn).
The parameters to learn are Θ = (θ0, θ1, θ2, ..., θn).
In vector form: z = Θ^T X = θ0*x0 + θ1*x1 + ... + θn*xn
To handle binary data, the sigmoid function is introduced to smooth the curve.
Prediction function, expressed as probabilities for
the positive case (y = 1) and
the negative case (y = 0).
(The standard forms of these formulas are sketched at the end of this section.)
2.3 The cost function
Linear regression: find θ0, θ1 that minimize the squared-error cost.
Logistic regression cost function:
goal: find θ0, θ1 that minimize the cost.
2.4 Solution: gradient descent
Update rule: each θ is adjusted in the direction of the negative gradient, scaled by the learning rate α.
All θ values are updated simultaneously.
Repeat the updates until convergence.
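The formulas referenced above are not reproduced in these notes; the standard textbook forms (presumably what the slides showed) are:
prediction:                h_θ(x) = g(Θ^T x) = 1 / (1 + e^(-Θ^T x))
probabilities:             P(y = 1 | x; θ) = h_θ(x),   P(y = 0 | x; θ) = 1 - h_θ(x)
linear-regression cost:    J(θ) = (1 / 2m) * sum_i (h_θ(x_i) - y_i)^2
logistic-regression cost:  J(θ) = -(1 / m) * sum_i [ y_i * log(h_θ(x_i)) + (1 - y_i) * log(1 - h_θ(x_i)) ]
gradient-descent update:   θ_j := θ_j - α * (1 / m) * sum_i (h_θ(x_i) - y_i) * x_ij   (for all j at once)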
7.6 Nonlinear Regression Application: Logistic Regression Application
Python implementation:
import numpy as np
import random
# m denotes the number of examples here, not the number of features
def gradientDescent(x, y, theta, alpha, m, numIterations):
    xTrans = x.transpose()
    for i in range(0, numIterations):
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        # avg cost per example (the 2 in 2*m doesn't really matter here.
        # But to be consistent with the gradient, I include it)
        cost = np.sum(loss ** 2) / (2 * m)
        print("Iteration %d | Cost: %f" % (i, cost))
        # avg gradient per example
        gradient = np.dot(xTrans, loss) / m
        # update
        theta = theta - alpha * gradient
    return theta

def genData(numPoints, bias, variance):
    x = np.zeros(shape=(numPoints, 2))
    y = np.zeros(shape=numPoints)
    # basically a straight line
    for i in range(0, numPoints):
        # bias feature
        x[i][0] = 1
        x[i][1] = i
        # our target variable
        y[i] = (i + bias) + random.uniform(0, 1) * variance
    return x, y
# gen 100 points with a bias of 25 and 10 variance as a bit of noise
x, y = genData(100, 25, 10)
m, n = np.shape(x)
numIterations= 100000
alpha = 0.0005
theta = np.ones(n)
theta = gradientDescent(x, y, theta, alpha, m, numIterations)
print(theta)
7.7 Correlation and the R-squared Value in Regression
1. The Pearson correlation coefficient:
1.1 A measure of the strength of the linear relationship between two variables.
1.2 Its value lies in [-1, 1]:
positive correlation: > 0; negative correlation: < 0; no correlation: = 0.
1.3 Formula: r = sum((x_i - x_bar)(y_i - y_bar)) / sqrt(sum((x_i - x_bar)^2) * sum((y_i - y_bar)^2))
2. A worked example:
X | Y
1 | 10
3 | 12
8 | 24
7 | 21
9 | 34
3. Other examples.
4. The R-squared value:
4.1 Definition: the coefficient of determination, the proportion of the total variation of the dependent variable that can be explained by the independent variables through the regression relationship.
4.2 Interpretation: an R-squared of 0.8 means the regression relationship explains 80% of the variation in the dependent variable. In other words, if we could hold the independent variables fixed, the variation in the dependent variable would be reduced by 80%.
4.3 Simple linear regression: R^2 = r * r
Multiple linear regression: R^2 is defined from the sums of squares (sketched below).
5. R-squared has its limitations: it grows as independent variables are added, and it depends on the sample size, so a corrected (adjusted) R-squared is used; the correction is sketched below.
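The standard forms of these quantities (an assumption about what the slides showed) are:
R^2 = SSR / SST = 1 - SSE / SST
(SST = total sum of squares, SSR = regression sum of squares, SSE = error sum of squares)
adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
where n is the sample size and p is the number of independent variables.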
7.8 Correlation and the R-squared Value in Regression: Application
Python implementation:
import numpy as np
import math

def computeCorrelation(X, Y):
    xBar = np.mean(X)
    yBar = np.mean(Y)
    SSR = 0
    varX = 0
    varY = 0
    for i in range(0, len(X)):
        diffXXBar = X[i] - xBar
        diffYYBar = Y[i] - yBar
        SSR += (diffXXBar * diffYYBar)
        varX += diffXXBar ** 2
        varY += diffYYBar ** 2
    SST = math.sqrt(varX * varY)
    return SSR / SST

testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]
print(computeCorrelation(testX, testY))
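A quick cross-check (not in the original notes): numpy's built-in corrcoef should agree with computeCorrelation, and squaring r gives the simple-linear-regression R^2 from section 7.7.
print(np.corrcoef(testX, testY)[0, 1])        # same value, roughly 0.94
print(computeCorrelation(testX, testY) ** 2)  # R^2 = r * r for simple linear regression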
8.1 Clustering: the K-means Algorithm
1. Category:
clustering belongs to unsupervised learning;
there are no class labels.
2. Example.
3. The K-means algorithm:
3.1 A classic clustering algorithm, one of the top ten classic data mining algorithms.
3.2 The algorithm takes a parameter k and partitions the n input data objects into k clusters so that similarity is high within a cluster and low between different clusters.
3.3 The idea of the algorithm:
cluster around k center points in the space, assigning each object to its nearest center; the cluster centers are updated iteratively until the best clustering is obtained.
3.4 Description of the algorithm:
(1) Choose initial centers for the c clusters appropriately;
(2) in the k-th iteration, compute the distance from every sample to each of the c centers and assign the sample to the class of the nearest center;
(3) update the center of each class, for example as the mean of its members;
(4) if, after updating with steps (2) and (3), all c cluster centers remain unchanged, the iteration ends; otherwise continue iterating.
3.5 Flow of the algorithm:
Input: k, data[n];
(1) choose k initial center points, e.g. c[0] = data[0], ..., c[k-1] = data[k-1];
(2) compare each of data[0], ..., data[n] with c[0], ..., c[k-1]; if it differs least from c[i], mark it as i;
(3) for all points marked i, recompute c[i] = (sum of all data[j] marked i) / (number of points marked i);
(4) repeat (2) and (3) until the change of every c[i] is smaller than a given threshold.
4. Example:
stop when the assignments no longer change.
Advantages: fast and simple.
Disadvantages: the final result depends on the choice of the initial points, it can easily fall into a local optimum, and the value of k must be known in advance. (A quick scikit-learn check follows.)
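Not part of the original notes: a quick check with scikit-learn's built-in KMeans on the same four points used in section 8.2, assuming scikit-learn is installed.
import numpy as np
from sklearn.cluster import KMeans

X = np.array([[1, 1], [2, 1], [4, 3], [5, 4]])
km = KMeans(n_clusters=2, n_init=10).fit(X)
print(km.labels_)            # cluster index assigned to each of the four points
print(km.cluster_centers_)   # the two centroids, e.g. around (1.5, 1) and (4.5, 3.5)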
8.2 Clustering: K-means Application
import numpy as np

# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(X, k, maxIt):
    numPoints, numDim = X.shape
    dataSet = np.zeros((numPoints, numDim + 1))
    dataSet[:, :-1] = X
    # Initialize centroids randomly
    centroids = dataSet[np.random.randint(numPoints, size=k), :]
    centroids = dataSet[0:2, :]  # fixed choice used here instead of the random pick above
    # Randomly assign labels to the initial centroids
    centroids[:, -1] = range(1, k + 1)
    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = None
    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations, maxIt):
        print("iteration: \n", iterations)
        print("dataSet: \n", dataSet)
        print("centroids: \n", centroids)
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = np.copy(centroids)
        iterations += 1
        # Assign labels to each datapoint based on centroids
        updateLabels(dataSet, centroids)
        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, k)
    # We can get the labels too by calling getLabels(dataSet, centroids)
    return dataSet

# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations, maxIt):
    if iterations > maxIt:
        return True
    return np.array_equal(oldCentroids, centroids)

# Function: Get Labels
# -------------
# Update a label for each piece of data in the dataset.
def updateLabels(dataSet, centroids):
    # For each element in the dataset, choose the closest centroid.
    # Make that centroid the element's label.
    numPoints, numDim = dataSet.shape
    for i in range(0, numPoints):
        dataSet[i, -1] = getLabelFromClosestCentroid(dataSet[i, :-1], centroids)

def getLabelFromClosestCentroid(dataSetRow, centroids):
    label = centroids[0, -1]
    minDist = np.linalg.norm(dataSetRow - centroids[0, :-1])
    for i in range(1, centroids.shape[0]):
        dist = np.linalg.norm(dataSetRow - centroids[i, :-1])
        if dist < minDist:
            minDist = dist
            label = centroids[i, -1]
    print("minDist:", minDist)
    return label

# Function: Get Centroids
# -------------
# Returns k centroids, each of dimension n.
def getCentroids(dataSet, k):
    # Each centroid is the geometric mean of the points that
    # have that centroid's label. Important: If a centroid is empty (no points have
    # that centroid's label) you should randomly re-initialize it.
    result = np.zeros((k, dataSet.shape[1]))
    for i in range(1, k + 1):
        oneCluster = dataSet[dataSet[:, -1] == i, :-1]
        result[i - 1, :-1] = np.mean(oneCluster, axis=0)
        result[i - 1, -1] = i
    return result

x1 = np.array([1, 1])
x2 = np.array([2, 1])
x3 = np.array([4, 3])
x4 = np.array([5, 4])
testX = np.vstack((x1, x2, x3, x4))
result = kmeans(testX, 2, 10)
print("final result:")
print(result)
8.3 Clustering: Hierarchical Clustering
Suppose there are N samples to be clustered. For hierarchical clustering, the steps are:
1. (Initialization) Treat each sample as its own cluster and compute the distance between every pair of clusters, i.e. the similarity between samples;
2. find the two closest clusters and merge them into one (so the total number of clusters decreases by one);
3. recompute the similarity between the newly created cluster and each of the old clusters;
4. repeat steps 2 and 3 until all sample points belong to one cluster, then stop.
The whole clustering process actually builds a tree. While building it, a threshold can be set in step 2: when the distance between the two closest clusters exceeds the threshold, the iteration can be stopped. The other key step is step 3: there are quite a few ways to judge the similarity between two clusters. Three of them are described here (a small scipy sketch follows this list):
Single linkage: also called nearest-neighbor; the distance between the two closest samples of the two clusters is taken as the distance between the two sets, so the smaller the distance between the closest pair, the greater the similarity between the two clusters. This easily produces a "chaining" effect: two clusters that are clearly far apart overall may be merged just because a few individual points are close, and after such a merge the chaining effect grows further, finally producing rather loose clusters.
Complete linkage: the exact opposite extreme of single linkage; the distance between the two farthest points of the two sets is taken as the distance between the sets. Its effect is also exactly the opposite: it is very restrictive, and even when two clusters are already very close, a few uncooperative points keep them from ever being merged, which is not a good approach either. The common problem of these two similarity definitions is that they only consider particular extreme data points and ignore the overall characteristics of the data within the clusters.
Average linkage: all pairwise distances between points of the two sets are averaged, which gives comparatively more appropriate results.
A variant of average linkage takes the median of the pairwise distances instead of the mean, which is better at removing the influence of individual outlying samples on the result.
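Not part of the original notes: a minimal sketch, assuming scipy is available, that runs the three linkage criteria described above on a toy data set (a from-scratch implementation follows in section 8.4).
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

X = np.array([[1, 1], [2, 1], [4, 3], [5, 4]])
for method in ("single", "complete", "average"):
    Z = linkage(X, method=method)                      # the merge tree, one row per merge
    labels = fcluster(Z, t=2, criterion="maxclust")    # cut the tree into 2 flat clusters
    print(method, labels)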
8.4 Clustering: Hierarchical Clustering Application
from numpy import *

"""
Code for hierarchical clustering, modified from
Programming Collective Intelligence by Toby Segaran
(O'Reilly Media 2007, page 33).
"""

class cluster_node:
    def __init__(self, vec, left=None, right=None, distance=0.0, id=None, count=1):
        self.left = left
        self.right = right
        self.vec = vec
        self.id = id
        self.distance = distance
        self.count = count  # only used for weighted average

def L2dist(v1, v2):
    return sqrt(sum((v1 - v2) ** 2))

def L1dist(v1, v2):
    return sum(abs(v1 - v2))

# def Chi2dist(v1,v2):
#     return sqrt(sum((v1-v2)**2))

def hcluster(features, distance=L2dist):
    # cluster the rows of the "features" matrix
    distances = {}
    currentclustid = -1
    # clusters are initially just the individual rows
    clust = [cluster_node(array(features[i]), id=i) for i in range(len(features))]
    while len(clust) > 1:
        lowestpair = (0, 1)
        closest = distance(clust[0].vec, clust[1].vec)
        # loop through every pair looking for the smallest distance
        for i in range(len(clust)):
            for j in range(i + 1, len(clust)):
                # distances is the cache of distance calculations
                if (clust[i].id, clust[j].id) not in distances:
                    distances[(clust[i].id, clust[j].id)] = distance(clust[i].vec, clust[j].vec)
                d = distances[(clust[i].id, clust[j].id)]
                if d < closest:
                    closest = d
                    lowestpair = (i, j)
        # calculate the average of the two clusters
        mergevec = [(clust[lowestpair[0]].vec[i] + clust[lowestpair[1]].vec[i]) / 2.0
                    for i in range(len(clust[0].vec))]
        # create the new cluster
        newcluster = cluster_node(array(mergevec), left=clust[lowestpair[0]],
                                  right=clust[lowestpair[1]],
                                  distance=closest, id=currentclustid)
        # cluster ids that weren't in the original set are negative
        currentclustid -= 1
        del clust[lowestpair[1]]
        del clust[lowestpair[0]]
        clust.append(newcluster)
    return clust[0]

def extract_clusters(clust, dist):
    # extract list of sub-tree clusters from hcluster tree with distance < dist
    if clust.distance < dist:
        # we have found a cluster subtree
        return [clust]
    else:
        # check the right and left branches
        cl = []
        cr = []
        if clust.left != None:
            cl = extract_clusters(clust.left, dist=dist)
        if clust.right != None:
            cr = extract_clusters(clust.right, dist=dist)
        return cl + cr

def get_cluster_elements(clust):
    # return ids for elements in a cluster sub-tree
    if clust.id >= 0:
        # positive id means that this is a leaf
        return [clust.id]
    else:
        # check the right and left branches
        cl = []
        cr = []
        if clust.left != None:
            cl = get_cluster_elements(clust.left)
        if clust.right != None:
            cr = get_cluster_elements(clust.right)
        return cl + cr

def printclust(clust, labels=None, n=0):
    # indent to make a hierarchy layout
    for i in range(n):
        print(' ', end='')
    if clust.id < 0:
        # negative id means that this is a branch
        print('-')
    else:
        # positive id means that this is an endpoint
        if labels == None:
            print(clust.id)
        else:
            print(labels[clust.id])
    # now print the right and left branches
    if clust.left != None:
        printclust(clust.left, labels=labels, n=n + 1)
    if clust.right != None:
        printclust(clust.right, labels=labels, n=n + 1)

def getheight(clust):
    # Is this an endpoint? Then the height is just 1
    if clust.left == None and clust.right == None:
        return 1
    # Otherwise the height is the sum of the heights of
    # each branch
    return getheight(clust.left) + getheight(clust.right)

def getdepth(clust):
    # The distance of an endpoint is 0.0
    if clust.left == None and clust.right == None:
        return 0
    # The distance of a branch is the greater of its two sides
    # plus its own distance
    return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance
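A hypothetical usage sketch (not from the original notes) for the functions above:
if __name__ == "__main__":
    data = [[1, 1], [2, 1], [4, 3], [5, 4]]
    tree = hcluster(data)                 # build the merge tree with the default L2 distance
    printclust(tree)                      # print the tree, one node per line, indented by depth
    print(get_cluster_elements(tree))     # ids of all leaves under the root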