Alias Method: discrete sampling with O(1) time complexity per sample
Python 3 alias sampling:
```python
import random
from typing import List

class Solution:
    def __init__(self, w: List[int]):
        N = len(w)
        sum_ = sum(w)
        prob = [p / sum_ for p in w]          # normalized probabilities
        alias = [0] * N
        alias_prob = [p * N for p in prob]    # scale so the average bucket height is 1
        small_q = []                          # buckets with height < 1
        large_q = []                          # buckets with height >= 1
        for i, p in enumerate(alias_prob):
            if p < 1:
                small_q.append(i)
            else:
                large_q.append(i)
        # pair each "small" bucket with a "large" bucket that tops it up to height 1
        while small_q and large_q:
            small = small_q.pop(0)
            large = large_q.pop(0)
            alias[small] = large
            alias_prob[large] -= (1 - alias_prob[small])
            if alias_prob[large] < 1:
                small_q.append(large)
            else:
                large_q.append(large)
        self.alias = alias
        self.N = N
        self.alias_prob = alias_prob

    def pickIndex(self) -> int:
        # O(1): pick a bucket uniformly, then keep it or jump to its alias
        ix = random.randint(0, self.N - 1)
        p = random.random()
        if p < self.alias_prob[ix]:
            return ix
        else:
            return self.alias[ix]
```
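A quick sanity check of the sampler (the weights `[1, 3, 6]` below are only an illustrative choice): the table is built once in `__init__`, every later `pickIndex()` call is O(1), and the empirical frequencies should approach 0.1, 0.3 and 0.6.

```python
from collections import Counter

sampler = Solution([1, 3, 6])
counts = Counter(sampler.pickIndex() for _ in range(100_000))
print({i: c / 100_000 for i, c in sorted(counts.items())})  # roughly {0: 0.1, 1: 0.3, 2: 0.6}
```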
Some other algorithms related to random numbers:
Linear congruential generator (LCG)
$RandSeed = (A \cdot RandSeed + B) \bmod M$
The period of an LCG is at most $M$, but in most cases it is shorter. For the LCG to reach the full period $M$, the following conditions (the Hull-Dobell theorem) must hold:

- $B$ and $M$ are coprime;
- every prime factor of $M$ divides $A - 1$;
- if $M$ is divisible by 4, then $A - 1$ is also divisible by 4.
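A minimal sketch of an LCG as a Python generator (the constants `a`, `b`, `m` below are the commonly cited Numerical Recipes values, chosen here only for illustration):

```python
def lcg(seed, a=1664525, b=1013904223, m=2 ** 32):
    """Yield an endless stream of pseudo-random integers in [0, m)."""
    x = seed
    while True:
        x = (a * x + b) % m
        yield x

gen = lcg(seed=42)
print([next(gen) / 2 ** 32 for _ in range(5)])  # pseudo-uniform floats in [0, 1)
```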
Inverse transform method
In general, if a probability distribution has CDF $y = F(x)$, then $y$ ranges over $(0, 1)$. Take the inverse function $G = F^{-1}$, feed it a random number uniform on $(0, 1)$, and the output is a random number following that distribution:

$y = G(x), \quad x \sim U(0, 1)$
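For example, the exponential distribution with rate $\lambda$ has CDF $F(x) = 1 - e^{-\lambda x}$, so $F^{-1}(u) = -\ln(1 - u)/\lambda$. A minimal sketch (the value $\lambda = 1.5$ is an arbitrary choice):

```python
import numpy as np

lam = 1.5
u = np.random.rand(100_000)   # uniform samples on [0, 1)
x = -np.log(1 - u) / lam      # inverse CDF of Exponential(lam)
print(x.mean())               # should be close to 1 / lam, i.e. about 0.667
```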
Central limit theorem
Generate a large number of batches; in each batch draw 12 random numbers uniform on $[0, 1]$ and sum them. The sum has variance $1/12 \times 12 = 1$ and mean $1/2 \times 12 = 6$, so subtract 6. By the central limit theorem each shifted sum is approximately standard normal, so with enough batches the overall sample follows $\mathcal{N}(0, 1)$.
```python
import numpy as np
import pylab as plt

n = 12      # uniforms per batch
N = 5000    # number of batches
x = np.zeros([N])
for j in range(N):
    a = np.random.rand(n)   # 12 samples from U(0, 1)
    u = sum(a)              # the sum has mean 6 and variance 1
    x[j] = u - n * 0.5      # shift the mean to 0
plt.hist(x)
plt.show()
```
Box Muller
```python
import numpy as np
import pylab as plt

N = 1000
x1 = np.random.rand(1, N)   # U(0, 1)
x2 = np.random.rand(1, N)   # U(0, 1)
# Box-Muller transform: two independent uniforms -> two independent standard normals
y1 = np.sqrt(-2 * np.log(x1)) * np.cos(2 * np.pi * x2)
y2 = np.sqrt(-2 * np.log(x1)) * np.sin(2 * np.pi * x2)
y = np.hstack([y1, y2])
plt.hist(y)
plt.show()
```
Inverse sampling and rejection sampling explained in detail

A classic rejection-style example is estimating $\pi$: sample points uniformly in the unit square and count the fraction that falls inside the quarter circle.
```python
from math import sqrt
from random import random

def sample_pi(max_cnt=100000):
    acc_cnt = 0
    for i in range(max_cnt):
        x = random()
        y = random()
        if sqrt(x ** 2 + y ** 2) < 1:   # the point falls inside the quarter circle
            acc_cnt += 1
    print("pi =", acc_cnt / max_cnt * 4)

sample_pi()
```
$\frac{1}{\pi}\arctan\left(\frac{z_0}{z_1}\right) + 0.5$

To generate a 2-D standard normal, simply take two independent standard normal variables $X$ and $Y$ and form the pair $(X, Y)$.

The 2-D standard normal is isotropic in Cartesian coordinates, so the angle between the direction of the point $(X, Y)$ and the x-axis (or any other fixed direction) is uniformly distributed.

This is easy to see from $\tan(\theta) = \frac{y}{x}$: $\arctan$ maps onto $(-\frac{\pi}{2}, \frac{\pi}{2})$, so dividing by $\pi$ and adding 0.5 gives a value uniform on $(0, 1)$.

In other words, this is Box-Muller used in reverse.
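A minimal numpy sketch of this reverse direction (two standard normals in, one uniform out):

```python
import numpy as np

z0 = np.random.randn(100_000)
z1 = np.random.randn(100_000)
u = np.arctan(z0 / z1) / np.pi + 0.5   # the angle is uniform, so u ~ U(0, 1)
print(u.min(), u.max(), u.mean())      # the mean should be close to 0.5
```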
AUC via the rank formula:

$AUC = \frac{\sum_{i \in \text{pos}} rank_i - \frac{M(M+1)}{2}}{M \cdot N}$

Sort the (label, sample) pairs by predicted prob. If the model ranks well, the sorted labels should be $[0, 0, 1, 1, 1]$, with $M = 3$ positives and $N = 2$ negatives.

Considering only the positive samples, the corresponding ranks `rankList` are $[3, 4, 5]$, which sum to 12. The subtracted term $\frac{M(M+1)}{2}$ works out to 6, the denominator $M \cdot N$ is 6, so the result is $(12 - 6)/6 = 1$, matching the definition.
```python
def calAUC(prob, labels):
    # sort (prob, label) pairs by predicted prob; ranks start at 1
    f = list(zip(prob, labels))
    rank = [values2 for values1, values2 in sorted(f, key=lambda x: x[0])]
    rankList = [i + 1 for i in range(len(rank)) if rank[i] == 1]   # ranks of the positives
    posNum = 0
    negNum = 0
    for i in range(len(labels)):
        if labels[i] == 1:
            posNum += 1
        else:
            negNum += 1
    auc = (sum(rankList) - (posNum * (posNum + 1)) / 2) / (posNum * negNum)
    print(auc)
    return auc
```

Reservoir sampling
Sample $K$ numbers uniformly without replacement from $N$ numbers in $O(N)$ time.

It is used when the size of the data is not known in advance, while still guaranteeing that every item is selected with equal probability.

Suppose the data stream has size $n$ and we need to sample $k$ items.

First build an array that can hold $k$ elements and fill it with the first $k$ elements of the stream.

Then, starting from the $(k+1)$-th element, decide with probability $k/n$ (where $n$ is the number of elements seen so far) whether that element replaces one in the array; every element already in the array is equally likely to be replaced. After traversing all elements, what remains in the array is the required sample.
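A minimal sketch of generic reservoir sampling over an arbitrary iterable (the function name `reservoir_sample` is my own):

```python
import random

def reservoir_sample(stream, k):
    """Uniformly sample k items from a stream of unknown length."""
    reservoir = []
    for i, item in enumerate(stream):
        if i < k:
            reservoir.append(item)    # fill the reservoir with the first k items
        else:
            j = random.randint(0, i)  # uniform on [0, i]
            if j < k:                 # happens with probability k / (i + 1)
                reservoir[j] = item
    return reservoir

print(reservoir_sample(range(100), 5))
```

The LeetCode-style linked-list version below uses the same idea with $K = 1$.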
```python
import random

class Solution:
    def __init__(self, head: ListNode):
        """
        @param head The linked list's head.
        Note that the head is guaranteed to be not null, so it contains at least one node.
        """
        self.K = 1
        self.head = head

    def getRandom(self) -> int:
        """
        Returns a random node's value, uniform over all nodes (reservoir sampling with K = 1).
        """
        res = [self.head.val] * self.K   # the reservoir holds K values
        p = self.head
        i = 0
        while p:
            ix = random.randint(0, i)    # uniform on [0, i]
            if ix < self.K:              # replace with probability K / (i + 1)
                res[ix] = p.val
            i += 1
            p = p.next
        return res[0]
```
This problem is also reservoir sampling.
The shuffle algorithm is essentially reservoir sampling too, just with the replacement step turned into a swap.
Full shuffle, and shuffling to draw m elements:
```python
from random import randint

def shuffle(nums):
    # Fisher-Yates style shuffle: after step i, the first i + 1 slots hold a uniform permutation
    n = len(nums)
    for i in range(n):
        ri = randint(0, i)
        nums[i], nums[ri] = nums[ri], nums[i]
    return nums

def shuffle_m(nums, m):
    # reservoir-style variant: the first m slots end up holding a uniform random m-subset
    n = len(nums)
    for i in range(n):
        ri = randint(0, i)
        if ri < m:
            nums[i], nums[ri] = nums[ri], nums[i]
    return nums[:m]

print(shuffle(list(range(10))))
print(shuffle_m(list(range(10)), 3))
```

1 Deriving the formulas by hand and implementing simple models
The main thing is to memorize the following pieces of code.
$L = -\frac{1}{N}\sum\left[y \cdot \log(\hat{y}) + (1 - y) \cdot \log(1 - \hat{y})\right]$
Essentially this is the $-p \log p$ form from information entropy, so it is not hard to memorize.
```python
loss = -(y @ np.log(y_hat) + (1 - y) @ np.log(1 - y_hat)) / n
```
The prediction is just the sigmoid, $\sigma(XW)$:
```python
return 1 / (1 + np.exp(-X @ self.w))
```
$\frac{\partial{L}}{\partial{W}} = (\hat{y} - y)X$
```python
dw = (y_hat - y) @ X
dw /= n
```
```python
import numpy as np

class LR:
    def __init__(self, lr=0.1, max_iter=1000, eps=1e-5):
        self.eps = eps
        self.max_iter = max_iter
        self.lr = lr

    def predict_proba(self, X):
        return 1 / (1 + np.exp(-X @ self.w))

    def fit(self, X: np.ndarray, y: np.ndarray):
        n, m = X.shape
        X = np.hstack([X, np.ones([n, 1])])   # append a bias column
        m += 1
        self.w = np.zeros([m])
        pre_loss = 0
        for i in range(self.max_iter):
            y_hat = self.predict_proba(X)
            # cross-entropy loss
            loss = -(y @ np.log(y_hat) + (1 - y) @ np.log(1 - y_hat)) / n
            if abs(loss - pre_loss) < self.eps:
                print('loss no longer decreasing')
                break
            pre_loss = loss
            print(loss)
            # gradient
            dw = (y_hat - y) @ X
            dw /= n
            # gradient descent step
            self.w -= dw * self.lr
        print(self.w)
```
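A quick check on synthetic data (the toy data below is made up for illustration; note that `fit` appends the bias column to X internally, so `predict_proba` expects that augmented X):

```python
np.random.seed(0)
X = np.random.randn(200, 2)
y = (X[:, 0] + X[:, 1] > 0).astype(float)   # linearly separable labels

model = LR(lr=0.5, max_iter=500)
model.fit(X, y)   # the printed loss should keep decreasing toward 0
```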
Code: https://github.com/auto-flow/numpy-machine-learning/
Symbol | Meaning |
---|---|
$Z$ | output of the linear transform |
$A$ | output of the activation layer |
$L$ | loss value |
$\mathcal{L}$ | loss function |
Forward pass:

$Z = XW + b$

$A = \sigma(Z)$

$L = \mathcal{L}(Y, A)$

Backward pass:

Gradient at the neurons:

$\frac{\partial{L}}{\partial{Z}} = \frac{\partial{L}}{\partial{A}} \frac{\partial{A}}{\partial{Z}}$

Gradient of the weights:

$\frac{\partial{L}}{\partial{W}} = \frac{\partial{L}}{\partial{A}} \frac{\partial{A}}{\partial{Z}} \frac{\partial{Z}}{\partial{W}}$
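A minimal finite-difference check of these chain-rule formulas, using one linear layer, a sigmoid and a squared loss (all names and shapes below are my own illustration, independent of the classes that follow):

```python
import numpy as np

np.random.seed(0)
X = np.random.randn(5, 3)   # batch of 5, 3 input features
W = np.random.randn(3, 2)   # weights of a 3 -> 2 linear layer
Y = np.random.randn(5, 2)   # targets

def loss(W):
    A = 1 / (1 + np.exp(-(X @ W)))      # A = sigma(Z), Z = XW
    return 0.5 * np.mean((A - Y) ** 2)

# analytic gradient: dL/dZ = dL/dA * dA/dZ, then dL/dW = X^T dL/dZ
A = 1 / (1 + np.exp(-(X @ W)))
dZ = (A - Y) / Y.size * A * (1 - A)
dW = X.T @ dZ

# numerical gradient for one weight entry; the two printed numbers should match
eps = 1e-6
E = np.zeros_like(W)
E[0, 0] = eps
print(dW[0, 0], (loss(W + E) - loss(W - E)) / (2 * eps))
```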
Once these concepts are clear, we can write a simple neural network with a PyTorch-style interface:
Forward and backward passes of the `Linear` layer:

The forward pass is simply $Z = XW + b$:
```python
class Linear(Module):
    def forward(self, x):
        self.input = x
        # Z = XW + b
        self.output = np.dot(self.input, self.weight) + self.bias
        return self.output
```
For the backward pass, first compute the gradients of $W$ and $b$, then the gradient on the neurons of the previous layer.
Gradient of $b$:

$\frac{\partial{L}}{\partial{b}} = \frac{\partial{L}}{\partial{Z}} \frac{\partial{Z}}{\partial{b}}$, where $\frac{\partial{Z}}{\partial{b}} = 1$, so the gradient of $b$ is simply the gradient on the output neurons $\frac{\partial{L}}{\partial{Z}}$, i.e. `grad` in the code.
Gradient of $W$:

$\frac{\partial{L}}{\partial{W}} = \frac{\partial{L}}{\partial{Z}} \frac{\partial{Z}}{\partial{W}}$, where $\frac{\partial{Z}}{\partial{W}} = X$.
One point to be careful about. For example, if the current layer is a $4 \times 9$ linear map, then $grad = [B, 9]$, $W = [4, 9]$, $b = [9]$.

For `bgrad`, simply take the mean over `axis=0`, giving $bgrad = [9]$.

For `wgrad`, compute $X \times grad = [B, 4] \times [B, 9] = [B, 4, 9]$ (a batched outer product), then take the mean over `axis=0` to get $wgrad = [4, 9]$.
Gradient for the previous layer: $\frac{\partial{L}}{\partial{X}} = \frac{\partial{L}}{\partial{Z}} \frac{\partial{Z}}{\partial{X}} = \frac{\partial{L}}{\partial{Z}} W$ (in code, `grad @ self.weight.T`), where $\frac{\partial{L}}{\partial{Z}}$ is the `grad` coming from the next layer.
```python
    # Linear.backward
    def backward(self, grad):
        # dL/db = dL/dZ, averaged over the batch
        self.bgrad = np.mean(grad, axis=0)
        # dL/dW = dL/dZ * dZ/dW = X * dL/dZ
        self.wgrad += np.mean(mul(self.input, grad), axis=0)
        # dL/dX = dL/dZ * W
        # i.e. use the weights W to linearly map grad back onto the previous layer's neurons
        grad = np.dot(grad, self.weight.T)
        return grad
```
Note how numpy computes $[B,4] \times [B,9] = [B,4,9]$:
```python
def mul(A: np.ndarray, B: np.ndarray):
    # A: 100x9  B: 100x1
    # mul(A, B): 100x9x1
    na = A.shape[1]
    nb = B.shape[1]
    A = np.repeat(A[:, :, np.newaxis], nb, 2)
    B = np.repeat(B[:, np.newaxis, :], na, 1)
    return A * B
```
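This batched outer product can also be written with `np.einsum`; a quick equivalence check (the shapes below are chosen arbitrarily):

```python
import numpy as np

A = np.random.randn(100, 9)
B = np.random.randn(100, 1)
assert np.allclose(mul(A, B), np.einsum('bi,bj->bij', A, B))
```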
ReLU forward pass: set the part of the input that is $\le 0$ to 0. This introduces sparsity; for inputs following a standard normal distribution, about half of the neurons become 0.

ReLU backward pass: remember which neurons had input $\le 0$ and set the gradient of those neurons to 0.
```python
class Relu(Module):
    def __init__(self):
        super(Relu, self).__init__()
        self.input = None
        self.output = None

    def forward(self, x):
        self.input = x
        x[self.input <= 0] *= 0    # zero out the non-positive part
        self.output = x
        return self.output

    def backward(self, grad):
        # the gradient passes through unchanged where input > 0 and is zeroed elsewhere
        grad[self.input <= 0] *= 0
        return grad
```
$\sigma(x) = \frac{1}{1 + \exp(-x)}$

$\sigma^{\prime}(x) = \sigma(x) \cdot (1 - \sigma(x))$
So `output` in the code is recording the value of $\sigma(x)$; the backward pass is essentially multiplying `grad` by $\sigma(x) \cdot (1 - \sigma(x))$.
```python
class Sigmoid(Module):
    def __init__(self):
        super(Sigmoid, self).__init__()
        self.input = None
        self.output = None

    def forward(self, x):
        self.input = x
        self.output = 1 / (1 + np.exp(-self.input))
        return self.output

    def backward(self, grad):
        grad *= self.output * (1 - self.output)
        return grad
```
$\tanh(x) = \frac{\exp(x) - \exp(-x)}{\exp(x) + \exp(-x)}$

Mnemonic: minus on top, plus on the bottom; $+x$ in the left exponential, $-x$ in the right.

$\tanh^{\prime}(x) = 1 - \tanh^2(x)$
```python
class Tanh(Module):
    def __init__(self):
        super(Tanh, self).__init__()
        self.input = None
        self.output = None

    def forward(self, x):
        self.input = x
        self.output = (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))
        return self.output

    def backward(self, grad):
        grad *= (1 - np.square(self.output))
        return grad
```
$L = \mathcal{L}(y, \hat{y}) = \frac{1}{2N}\sum(y - \hat{y})^2$

$\frac{\partial{L}}{\partial{\hat{y}}} = \hat{y} - y$

Note that $\hat{y}$ comes first, because of the minus sign.
```python
class MSE(object):
    def __init__(self):
        self.label = None
        self.pred = None
        self.grad = None
        self.loss = None

    def __call__(self, pred, label):
        return self.forward(pred, label)

    def forward(self, pred, label):
        self.pred, self.label = pred, label
        self.loss = np.mean(0.5 * np.square(self.pred - self.label))
        return self.loss

    def backward(self, grad=None):
        self.grad = (self.pred - self.label)
        return self.grad
```

2 Probability and statistics