《TensorFlow实战Google深度学习框架(第2版)》 Chapter 5 Exercises

Preface():

{

    The best-practice sample program in Chapter 5 of this book uses exponential decay of the learning rate, regularization of the loss, moving averages of the trainable parameters, and model persistence. Since it pulls together knowledge points introduced earlier, it makes a good exercise. I wrote my own code to reproduce what it does and record it here (also, I have not updated this blog in a long time, so this gives me something to post).

}


Main Text():

{

    Project files:

    


    testC5_infer.py

# -*- coding: utf-8 -*-
'''
Inference module: defines the network structure, the regularized loss, and the
gradient-descent training op combined with the moving-average update.
'''
import tensorflow as tf
import math

dimension_of_input = 784
number_of_cells_in_hidden1 = 500
amount_of_output = 10
learning_rate_base = 0.8
learning_rate_decay = 0.99
regu_rate = 0.0001
moving_ave_decay = 0.99
# The function below builds the network structures: the original network, used for
# regularization and backpropagation, and the moving-average network, used to
# compute the accuracy. Inputs: the network's input node and the global step.
# Outputs: the output node of the original network, the output node of the
# moving-average network, and the moving-average update op.
def bulid_network_with_ave(i_l, global_step):
    W_1 = tf.Variable(tf.truncated_normal([dimension_of_input, number_of_cells_in_hidden1],
                                          stddev=0.1))
    tf.add_to_collection("weights", W_1)
    b_1 = tf.Variable(tf.zeros([number_of_cells_in_hidden1]))
    h_l1 = tf.nn.relu(tf.matmul(i_l, W_1) + b_1)

    W_2 = tf.Variable(tf.truncated_normal([number_of_cells_in_hidden1, amount_of_output],
                                    stddev=0.1))
    tf.add_to_collection("weights", W_2)
    b_2 = tf.Variable(tf.zeros([amount_of_output]))

    o_l = tf.matmul(h_l1, W_2) + b_2 
    variable_averages = tf.train.ExponentialMovingAverage(moving_ave_decay, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    h_l1_ave = tf.nn.relu(tf.matmul(i_l, variable_averages.average(W_1)) + variable_averages.average(b_1))
    with tf.variable_scope("o_l_ave"):  # put the averaged output in a name scope so eval can fetch it by name
        o_l_ave = tf.matmul(h_l1_ave, variable_averages.average(W_2)) + variable_averages.average(b_2)
    return o_l, o_l_ave, variables_averages_op

def get_loss_with_regu(o_l, o_, weights):  # defines the loss; returns the regularized loss
    regu = tf.contrib.layers.l2_regularizer(regu_rate)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = o_l,
                                                          labels = o_,
                                                          name='xentropy')
    loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
    for weight in weights:
        loss += regu(weight)
    return loss

# The function below defines the training step; returns the training op grouped with the moving-average update.
def gradient_descent_operation_with_moving_averag(learning_rate, loss, global_step, variables_averages_op): 
    train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step = global_step)
    return tf.group(train_step, variables_averages_op)
'''
1. The loss is computed with the original network's parameters, while the accuracy is
   computed with the moving-average network's parameters (illustrated by the sketch below).
'''
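    A minimal, self-contained TF1.x sketch of note 1: the raw variable is updated immediately by a training step, while its moving-average shadow lags behind. The variable v and the assignment standing in for an optimizer step are made up purely for illustration.

import tensorflow as tf

v = tf.Variable(0.0)
ema = tf.train.ExponentialMovingAverage(decay=0.99)
maintain_avg_op = ema.apply([v])   # creates and updates the shadow variable of v
step_op = tf.assign(v, 10.0)       # stands in for one gradient-descent update

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(step_op)              # raw value jumps to 10.0
    sess.run(maintain_avg_op)      # shadow: 0.99 * 0.0 + 0.01 * 10.0 ≈ 0.1
    print(sess.run([v, ema.average(v)]))   # -> [10.0, ~0.1]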


    testC5_train.py

# -*- coding: utf-8 -*-
'''
Training module: implements the training process and is also the program's entry module.
'''
import input_data
import tensorflow as tf
import math
import testC5_infer
import testC5_eval
import os
def testC5_train():
    mnist = input_data.read_data_sets("data/", one_hot=True)
    i_l = tf.placeholder("float", [None, testC5_infer.dimension_of_input], name="i_l")  # define placeholders and variables; i_l is named so the eval module can fetch it
    o_ = tf.placeholder("float", [None, testC5_infer.amount_of_output])
    global_step = tf.Variable(0, trainable=False)  

    o_l, o_l_ave, variables_averages_op = testC5_infer.bulid_network_with_ave(i_l, global_step)  # build the networks

    loss = testC5_infer.get_loss_with_regu(o_l, o_, tf.get_collection("weights"))  # define the loss

    learning_rate = tf.train.exponential_decay(testC5_infer.learning_rate_base,
                                               global_step, 100, testC5_infer.learning_rate_decay)  # exponentially decayed learning rate

    train_step = testC5_infer.gradient_descent_operation_with_moving_averag(learning_rate,
                                                                            loss, global_step, variables_averages_op)  # define the training op

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    for i in range(30000):
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(train_step, feed_dict={i_l: batch_xs, o_: batch_ys})
        if (i+1) % 1000 == 0:
            saver.save(sess, "D:/Backup/Documents/Visual Studio 2015/Projects/testC5-train/testC5-train.ckpt",
                       global_step=global_step)  # save the checkpoint: graph structure and variable values
            print("%d-%dth iteration is passed" % (i-998, i+1))

    sess.close()

from multiprocessing import Process

if __name__ == "__main__":  # the main process only spawns two child processes: one runs training, the other evaluation
    train = Process(target=testC5_train)
    train.daemon = True
    eval = Process(target=testC5_eval.testC5_eval)
    eval.daemon = True
    train.start()
    eval.start()
    train.join()  # without a join, the daemon children would be terminated as soon as the main process exits
'''
1. In paths it is better to use / than \, because \ acts as an escape character in
   ordinary string literals (e.g. \t becomes a tab character); see the small sketch below.
2. Visual Studio cannot set breakpoints inside child processes created by multiprocessing,
   so for now I debug with print. A similar issue I found: https://www.v2ex.com/t/377421
'''
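    A minimal sketch of note 1, using a made-up Windows directory purely for illustration; the three literals below all denote the same path.

import os

p1 = "D:\\Backup\\testC5-train"   # backslashes escaped explicitly
p2 = r"D:\Backup\testC5-train"    # raw string: backslashes are kept literally
p3 = "D:/Backup/testC5-train"     # forward slashes, also accepted by Windows APIs
print(p1 == p2)                   # True
print(os.path.normpath(p3))       # normalized to the platform's separator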

    testC5_eval.py

# -*- coding: utf-8 -*-
'''
Evaluation module: the evaluation process plus the code that loads the saved model.
'''
import input_data
import tensorflow as tf
import math
import testC5_infer
import time
def testC5_eval():
    detection_interval_in_sec = 0.5  # polling interval between checks
    mnist = input_data.read_data_sets("data/", one_hot=True)

    ckpt = tf.train.get_checkpoint_state("D:/Backup/Documents/Visual Studio 2015/Projects/testC5-train/")
    while ckpt is None:                               # wait until checkpoint data appears
        time.sleep(detection_interval_in_sec)
        ckpt = tf.train.get_checkpoint_state("D:/Backup/Documents/Visual Studio 2015/Projects/testC5-train/")

    time.sleep(detection_interval_in_sec)#①
    saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + ".meta")  # load the saved graph structure

    o_ = tf.placeholder("float", [None, testC5_infer.amount_of_output])  # definitions for evaluation
    correct_prediction = tf.equal(tf.argmax(tf.get_default_graph().get_tensor_by_name("o_l_ave/add:0"),1), tf.argmax(o_,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    sess = tf.Session()

    checkpoint_path = None
    while True:                                # poll at the detection interval; evaluate whenever a new checkpoint appears
        if checkpoint_path != ckpt.model_checkpoint_path:
            time.sleep(detection_interval_in_sec)#②
            checkpoint_path = ckpt.model_checkpoint_path
            saver.restore(sess,ckpt.model_checkpoint_path)
            print("accuracy of ↑ is %f \n" 
                  % (sess.run(accuracy, feed_dict={tf.get_default_graph().get_tensor_by_name("i_l:0"): mnist.test.images, 
                                                   o_: mnist.test.labels})))
        time.sleep(detection_interval_in_sec)
        ckpt = tf.train.get_checkpoint_state("D:/Backup/Documents/Visual Studio 2015/Projects/testC5-train/")

    sess.close()
'''
1. Without the delays at ① and ② in testC5_eval, file-read errors can occur and the print
   order also gets scrambled. So I suspect that saving proceeds as: update the checkpoint
   file -> write the current model files -> then do some more work that takes time.
   (A possible alternative to the fixed sleeps is sketched below.)
'''
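    Instead of sleeping for a fixed interval, one could wait until the newest checkpoint's files actually exist before restoring. A minimal sketch, assuming the standard TF1 checkpoint layout (<prefix>.meta plus <prefix>.index); the helper name is made up:

import os
import time
import tensorflow as tf

def wait_for_checkpoint(ckpt_dir, interval=0.5):
    # Poll until get_checkpoint_state reports a checkpoint whose files are on disk.
    while True:
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        if ckpt is not None:
            prefix = ckpt.model_checkpoint_path
            if os.path.exists(prefix + ".meta") and os.path.exists(prefix + ".index"):
                return ckpt
        time.sleep(interval)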


    input_data.py is Google's official module file. If you cannot access it directly, it is also available at: https://blog.csdn.net/taomiaotaomiao/article/details/78566775


    The main process of this program only defines and starts two child processes, train and eval. train trains the network and saves a checkpoint every 1000 iterations; eval runs an evaluation each time it detects that the saved model files have been updated.

    The last few results are as follows:

    

}


Conclusion():

{

    I am not sure why the accuracy falls slightly short; the book reports 0.9841. Can anyone spot the problem?

    I did not use the book's way of loading the model data; I simply restored all of the model's variables directly. The book uses variables_to_restore(); see: https://blog.csdn.net/sinat_29957455/article/details/78508793. A rough sketch of that approach follows.
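    A minimal sketch of the book's variables_to_restore() approach as I understand it, given purely for comparison rather than as a drop-in replacement for testC5_eval: rebuild the graph with the inference module, then let the Saver map each ordinary variable to its moving-average shadow in the checkpoint, so that after restore the ordinary variables already hold the averaged values.

import tensorflow as tf
import testC5_infer

i_l = tf.placeholder("float", [None, testC5_infer.dimension_of_input], name="i_l")
global_step = tf.Variable(0, trainable=False)
o_l, _, _ = testC5_infer.bulid_network_with_ave(i_l, global_step)

# Map "<var>/ExponentialMovingAverage" names in the checkpoint onto the ordinary variables.
variable_averages = tf.train.ExponentialMovingAverage(testC5_infer.moving_ave_decay)
saver = tf.train.Saver(variable_averages.variables_to_restore())

with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state("D:/Backup/Documents/Visual Studio 2015/Projects/testC5-train/")
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        # Evaluating o_l here uses the averaged weights, so no ".meta" import is needed.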

}