Network architecture (6c-2s-12c-2s):
Initialization:
<p> \begin{align}\notag W \sim U\left(- \frac{\sqrt{6}}{\sqrt{n_j+n_{j+1}}} ,\ \frac{\sqrt{6}}{\sqrt{n_j+n_{j+1}}}\right) \end{align} </p>
This bound follows the Glorot (Xavier) argument: keeping the variance of the activations constant in the forward pass requires $ Var(W_i) = \frac{1}{n_i} $, keeping the variance of the gradients constant in the backward pass requires $ Var(W_i) = \frac{1}{n_{i+1}} $, and the compromise between the two is
<p> \begin{align}\notag Var(W_i) = \frac{2}{n_i + n_{i+1}} \end{align} </p>
Since a uniform distribution $ U(-a,a) $ has variance $ a^2/3 $, this gives $ a = \sqrt{\frac{6}{n_i + n_{i+1}}} $, i.e. exactly the interval above.
The biases $ b $ are all initialized to $ 0 $, and the weights $ W $ are set to $ random(-1,1)\sqrt{\frac{6}{fan_{in} + fan_{out}}} \sim U(- \frac{\sqrt{6}}{\sqrt{n_j+n_{j+1}}} , \frac{\sqrt{6}}{\sqrt{n_j+n_{j+1}}}) $, where $ n_j $ is the number of units in layer $ j $, $ fan_{in} = \text{number of input maps} \times \text{kernelsize}^2 $, and $ fan_{out} = \text{number of output maps} \times \text{kernelsize}^2 $.
for l = 1 : numel(net.layers)   % layer
    if strcmp(net.layers{l}.type, 's')
        mapsize = mapsize / net.layers{l}.scale;
        assert(all(floor(mapsize)==mapsize), ['Layer ' num2str(l) ' size must be integer. Actual: ' num2str(mapsize)]);
        for j = 1 : inputmaps
            net.layers{l}.b{j} = 0;
        end
    end
    if strcmp(net.layers{l}.type, 'c')
        mapsize = mapsize - net.layers{l}.kernelsize + 1;
        fan_out = net.layers{l}.outputmaps * net.layers{l}.kernelsize ^ 2;
        for j = 1 : net.layers{l}.outputmaps   % output map
            fan_in = inputmaps * net.layers{l}.kernelsize ^ 2;
            for i = 1 : inputmaps   % input map
                net.layers{l}.k{i}{j} = (rand(net.layers{l}.kernelsize) - 0.5) * 2 * sqrt(6 / (fan_in + fan_out));
            end
            net.layers{l}.b{j} = 0;
        end
        inputmaps = net.layers{l}.outputmaps;
    end
end
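For the first convolution layer of the 6c-2s-12c-2s net, the numbers work out as in the small sketch below. It assumes 5×5 kernels and a single-channel input, neither of which is dictated by the architecture string itself:

% Assumed: kernelsize = 5, grayscale input (1 input map), 6 output maps in the first 'c' layer.
kernelsize = 5;
fan_in  = 1 * kernelsize ^ 2;             % 25
fan_out = 6 * kernelsize ^ 2;             % 150
bound   = sqrt(6 / (fan_in + fan_out));   % ~0.185
k = (rand(kernelsize) - 0.5) * 2 * bound; % one 5x5 kernel drawn from U(-bound, bound)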
% 'onum' is the number of labels, that's why it is calculated using size(y, 1).
%   If you have 20 labels, the output of the network will be 20 neurons.
% 'fvnum' is the number of output neurons at the last layer, the layer just before the output layer.
% 'ffb' is the biases of the output neurons.
% 'ffW' is the weights between the last layer and the output neurons.
%   Note that the last layer is fully connected to the output layer, which is why the size of the weights is (onum * fvnum).
fvnum = prod(mapsize) * inputmaps;
onum = size(y, 1);
net.ffb = zeros(onum, 1);
net.ffW = (rand(onum, fvnum) - 0.5) * 2 * sqrt(6 / (onum + fvnum));
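To make the sizes concrete, here is a small worked example, assuming 28×28 MNIST-style inputs, 5×5 kernels, and 10 classes (all three are assumptions, not part of the architecture string):

% Assumed: 28x28 input images, 5x5 kernels, 10 output labels.
mapsize = [28 28];
mapsize = mapsize - 5 + 1;    % conv 6c:  24x24
mapsize = mapsize / 2;        % pool 2s:  12x12
mapsize = mapsize - 5 + 1;    % conv 12c:  8x8
mapsize = mapsize / 2;        % pool 2s:   4x4
fvnum = prod(mapsize) * 12;   % 4*4*12 = 192
onum  = 10;                   % net.ffW is then 10x192 and net.ffb is 10x1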
Forward propagation (convolution layer):
<p> \begin{align}\notag x_j^l = f(\sum_{i \in M_j} x_i^{l-1} * k_{ij}^l + b_j^l) \end{align} </p>
% !! below can probably be handled by insane matrix operations
for j = 1 : net.layers{l}.outputmaps   % for each output map
    % create temp output map
    z = zeros(size(net.layers{l - 1}.a{1}) - [net.layers{l}.kernelsize - 1 net.layers{l}.kernelsize - 1 0]);
    for i = 1 : inputmaps   % for each input map
        % convolve with corresponding kernel and add to temp output map
        z = z + convn(net.layers{l - 1}.a{i}, net.layers{l}.k{i}{j}, 'valid');
    end
    % add bias, pass through nonlinearity
    net.layers{l}.a{j} = sigm(z + net.layers{l}.b{j});
end
% set number of input maps to this layer's number of output maps
inputmaps = net.layers{l}.outputmaps;
Forward propagation (subsampling layer):
<p> \begin{align}\notag x_j^l = f(\beta_j^l down(x_j^{l-1}) + b_j^l) \end{align} </p>
% downsample
for j = 1 : inputmaps
    z = convn(net.layers{l - 1}.a{j}, ones(net.layers{l}.scale) / (net.layers{l}.scale ^ 2), 'valid');   % !! replace with variable
    net.layers{l}.a{j} = z(1 : net.layers{l}.scale : end, 1 : net.layers{l}.scale : end, :);
end
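A quick standalone check (not part of the toolbox) that the convn-plus-striding above really is mean pooling over non-overlapping scale×scale blocks:

% Compare convn with an averaging kernel + striding against explicit block means.
a = magic(4);
scale = 2;
z = convn(a, ones(scale) / (scale ^ 2), 'valid');
pooled = z(1 : scale : end, 1 : scale : end);
blockmeans = [mean(mean(a(1:2, 1:2))), mean(mean(a(1:2, 3:4))); ...
              mean(mean(a(3:4, 1:2))), mean(mean(a(3:4, 3:4)))];
isequal(pooled, blockmeans)   % 1 (true)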
Forward propagation (output layer):
% concatenate all end layer feature maps into vector
net.fv = [];
for j = 1 : numel(net.layers{n}.a)
    sa = size(net.layers{n}.a{j});
    net.fv = [net.fv; reshape(net.layers{n}.a{j}, sa(1) * sa(2), sa(3))];
end
% feedforward into output perceptrons
net.o = sigm(net.ffW * net.fv + repmat(net.ffb, 1, size(net.fv, 2)));
Derivative of the sigmoid function:
<p> \begin{align}\notag f(x) = \frac{1}{1+e^{-x}} ; f^\prime(x) = \frac{e^{-x}}{(1+e^{-x})^2} = f(x) \cdot [1 - f(x)] \end{align} </p>
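A short numerical sanity check of this identity (sigm below is just the usual logistic function, written inline for the check):

% Compare the analytic derivative f(x).*(1-f(x)) with a central finite difference.
sigm = @(x) 1 ./ (1 + exp(-x));
x = linspace(-5, 5, 11);
analytic = sigm(x) .* (1 - sigm(x));
h = 1e-6;
numeric = (sigm(x + h) - sigm(x - h)) / (2 * h);
max(abs(analytic - numeric))   % prints a value close to zero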
For the output layer (the final layer $ n $ of the network), compute the residual between the network's output and the target values:
<p> \begin{align}\notag \delta^n = -(y-a^n)\cdot f^\prime(z^n) \end{align} </p>
% error
net.e = net.o - y;
%% backprop deltas
net.od = net.e .* (net.o .* (1 - net.o));   % output delta
For the hidden layers $ l = n-1, n-2, n-3, \ldots, 2 $, compute the residual at each node:
<p> \begin{align}\notag \delta^l = ({(W^l)}^T \delta^{l+1}) \cdot f^\prime(z^l) \end{align} </p>
% concatenate all end layer feature maps into vector (net.fv, as built during forward propagation)
net.fv = [];
for j = 1 : numel(net.layers{n}.a)
    sa = size(net.layers{n}.a{j});
    net.fv = [net.fv; reshape(net.layers{n}.a{j}, sa(1) * sa(2), sa(3))];
end
net.fvd = (net.ffW' * net.od);       % feature vector delta
if strcmp(net.layers{n}.type, 'c')   % only conv layers have a sigm nonlinearity
    net.fvd = net.fvd .* (net.fv .* (1 - net.fv));
end
Backpropagation (delta of a subsampling layer, propagated back through the following convolution layer):
<p> \begin{align}\notag \delta_j^l = f^\prime(u_j^l)\circ conv2(\delta_j^{l+1},rot180(k_j^{l+1}),'full') \end{align} </p>
for i = 1 : numel(net.layers{l}.a)
    z = zeros(size(net.layers{l}.a{1}));
    for j = 1 : numel(net.layers{l + 1}.a)
        z = z + convn(net.layers{l + 1}.d{j}, rot180(net.layers{l + 1}.k{i}{j}), 'full');
    end
    net.layers{l}.d{i} = z;
end
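rot180 here is a small toolbox helper; for a 2-D kernel it amounts to a 180-degree rotation, which can be sketched as follows (the toolbox's own version also handles stacked 3-D arrays):

% rot180: rotate a kernel by 180 degrees (flip rows, then flip columns).
rot180_2d = @(X) flipud(fliplr(X));
k = [1 2; 3 4];
rot180_2d(k)   % returns [4 3; 2 1]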
Backpropagation (delta of a convolution layer, propagated back through the following subsampling layer):
<p> \begin{align}\notag \delta_j^l = \beta_j^{l+1}(f^\prime(u_j^l) \circ up(\delta_j^{l+1})) \end{align} </p>
for j = 1 : numel(net.layers{l}.a)
    net.layers{l}.d{j} = net.layers{l}.a{j} .* (1 - net.layers{l}.a{j}) .* ...
        (expand(net.layers{l + 1}.d{j}, [net.layers{l + 1}.scale net.layers{l + 1}.scale 1]) / net.layers{l + 1}.scale ^ 2);
end
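expand is another toolbox utility: it implements the up(·) operator by replicating each element of the next layer's delta map into a scale×scale block. For a single 2-D map the same effect can be obtained with kron:

% up(): replicate every element of d into a scale x scale block (nearest-neighbour upsampling).
d = [1 2; 3 4];
scale = 2;
up_d = kron(d, ones(scale));
% up_d = [1 1 2 2; 1 1 2 2; 3 3 4 4; 3 3 4 4]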
Finally, compute the partial derivatives that are actually needed:
<p> \begin{align}\notag \nabla_{W^l}J(W,b;x,y) = \delta^{l+1}(a^l)^T \end{align} </p>
<p> \begin{align}\notag \nabla_{b^l}J(W,b;x,y) = \delta^{l+1} \end{align} </p>
<p> \begin{align}\notag \nabla_{W^l}J(W,b) = [\frac{1}{m}\sum_{i=1}^m\nabla_{W^l}J(W,b;x,y)]+\lambda W^l \end{align} </p>
<p> \begin{align}\notag \nabla_{b^l}J(W,b) = \frac{1}{m}\sum_{i=1}^m\nabla_{b^l}J(W,b;x,y) \end{align} </p>
<p> \begin{align}\notag \frac{\partial E}{\partial k_{ij}^l} = rot180(conv2(x_i^{l-1},rot180(\delta_j^l),'valid')) \end{align} </p>
<p> \begin{align}\notag \frac{\partial E}{\partial b_j} = \sum_{u,v}(\delta_j^l)_{uv} \end{align} </p>
for l = 2 : n
    if strcmp(net.layers{l}.type, 'c')
        for j = 1 : numel(net.layers{l}.a)
            for i = 1 : numel(net.layers{l - 1}.a)
                net.layers{l}.dk{i}{j} = convn(flipall(net.layers{l - 1}.a{i}), net.layers{l}.d{j}, 'valid') / size(net.layers{l}.d{j}, 3);
            end
            net.layers{l}.db{j} = sum(net.layers{l}.d{j}(:)) / size(net.layers{l}.d{j}, 3);
        end
    end
end
net.dffW = net.od * (net.fv)' / size(net.od, 2);
net.dffb = mean(net.od, 2);
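The gradients computed above are then applied with a plain gradient-descent step. A minimal sketch of that update (the learning-rate field opts.alpha is an assumption about how the training options are named):

% Vanilla gradient-descent update using the gradients computed above.
for l = 2 : numel(net.layers)
    if strcmp(net.layers{l}.type, 'c')
        for j = 1 : numel(net.layers{l}.a)
            for i = 1 : numel(net.layers{l - 1}.a)
                net.layers{l}.k{i}{j} = net.layers{l}.k{i}{j} - opts.alpha * net.layers{l}.dk{i}{j};
            end
            net.layers{l}.b{j} = net.layers{l}.b{j} - opts.alpha * net.layers{l}.db{j};
        end
    end
end
net.ffW = net.ffW - opts.alpha * net.dffW;
net.ffb = net.ffb - opts.alpha * net.dffb;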