前言():
{
这本书第5章的最佳实践样例程序中,使用了学习率的指数衰减、损失的正则化、可训练参数的滑动平均和模型的持久化。此样例综合了以前提到的知识点,所以是一个不错的练习题目。我根据其实现的效果自己写了一些代码,并在此记录(而且好久都没有更新了,找点内容更新)。
网络
}
正文():
{
项目文件:
testC5_infer.py
# -*- coding: utf-8 -*-
"""Header module for the chapter-5 MNIST example.

Defines the network structure (raw and moving-average variants), the
L2-regularised loss, and the gradient-descent operation combined with the
moving-average update.
"""
import tensorflow as tf
import math

dimension_of_input = 784
number_of_cells_in_hidden1 = 500
amount_of_output = 10
learning_rate_base = 0.8
learning_rate_decay = 0.99
regu_rate = 0.0001
moving_ave_decay = 0.99


def bulid_network_with_ave(i_l, global_step):
    """Build the raw network and its moving-average counterpart.

    Both networks are one ReLU hidden layer followed by a linear output
    layer. The raw network is the one used for regularisation and
    back-propagation; the moving-average network is the one used for
    computing accuracy. Both weight matrices are added to the "weights"
    collection.

    Args:
        i_l: Input node of the network; it is multiplied by a
            [dimension_of_input, number_of_cells_in_hidden1] weight matrix.
        global_step: Global step, handed to ExponentialMovingAverage as its
            second argument.

    Returns:
        A tuple ``(o_l, o_l_ave, variables_averages_op)``: the output node of
        the raw network, the output node of the moving-average network, and
        the op that applies the moving average to all trainable variables.
    """
    # Hidden layer of the raw network.
    hidden_shape = [dimension_of_input, number_of_cells_in_hidden1]
    hidden_weights = tf.Variable(tf.truncated_normal(hidden_shape, stddev=0.1))
    tf.add_to_collection("weights", hidden_weights)
    hidden_biases = tf.Variable(tf.zeros([number_of_cells_in_hidden1]))
    hidden_out = tf.nn.relu(tf.matmul(i_l, hidden_weights) + hidden_biases)

    # Output layer of the raw network.
    output_shape = [number_of_cells_in_hidden1, amount_of_output]
    output_weights = tf.Variable(tf.truncated_normal(output_shape, stddev=0.1))
    tf.add_to_collection("weights", output_weights)
    output_biases = tf.Variable(tf.zeros([amount_of_output]))
    raw_logits = tf.matmul(hidden_out, output_weights) + output_biases

    # Shadow (moving-average) copies of every trainable variable.
    ema = tf.train.ExponentialMovingAverage(moving_ave_decay, global_step)
    ema_update_op = ema.apply(tf.trainable_variables())

    # Same forward pass, but reading the shadow variables.
    hidden_out_ave = tf.nn.relu(
        tf.matmul(i_l, ema.average(hidden_weights))
        + ema.average(hidden_biases)
    )
    # The output is created inside a named scope so it is easy to look up
    # by name later (the evaluation module fetches "o_l_ave/add:0").
    with tf.variable_scope("o_l_ave"):
        ave_logits = (
            tf.matmul(hidden_out_ave, ema.average(output_weights))
            + ema.average(output_biases)
        )

    return raw_logits, ave_logits, ema_update_op


def get_loss_with_regu(o_l, o_, weights):
    """Define the loss: mean softmax cross-entropy plus L2 penalties.

    Args:
        o_l: Logits (output node of the raw network).
        o_: Labels fed to the cross-entropy.
        weights: Iterable of weight tensors; an L2 penalty scaled by
            ``regu_rate`` is added for each of them.

    Returns:
        The regularised loss tensor.
    """
    per_example = tf.nn.softmax_cross_entropy_with_logits(
        logits=o_l, labels=o_, name='xentropy')
    total = tf.reduce_mean(per_example, name='xentropy_mean')

    penalty = tf.contrib.layers.l2_regularizer(regu_rate)
    for matrix in weights:
        total = total + penalty(matrix)
    return total


def gradient_descent_operation_with_moving_averag(learning_rate, loss, global_step, variables_averages_op):
    """Define the training step combined with the moving-average update.

    Args:
        learning_rate: Learning rate given to the gradient-descent optimizer.
        loss: Loss tensor to minimise.
        global_step: Step variable passed to ``minimize`` as ``global_step``.
        variables_averages_op: Moving-average update op to run alongside the
            optimizer step.

    Returns:
        A single op grouping the optimizer step and the moving-average op.
    """
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    descent_op = optimizer.minimize(loss, global_step=global_step)
    return tf.group(descent_op, variables_averages_op)


# Notes:
# 1. The loss is computed with the parameters of the raw network, whereas
#    accuracy is computed with the parameters of the moving-average network.
testC5_train.py
3d
# -*- coding: utf-8 -*-
"""Training module; also the entry point of the program.

Implements the training loop and, when run as a script, starts one child
process for training and one for evaluation.
"""
import math
import os
from multiprocessing import Process

import tensorflow as tf

import input_data
import testC5_eval
import testC5_infer

# Prefix handed to Saver.save(); the global step is appended to it.
CHECKPOINT_PREFIX = "D:/Backup/Documents/Visual Studio 2015/Projects/testC5-train/testC5-train.ckpt"
TRAINING_STEPS = 30000
BATCH_SIZE = 100
SAVE_INTERVAL = 1000
LEARNING_RATE_DECAY_STEPS = 100
# How long the parent waits for the evaluator after training has finished,
# so that the last checkpoint still has a chance to be evaluated. Heuristic.
FINAL_EVAL_GRACE_SEC = 5.0


def testC5_train():
    """Train the network and save a checkpoint every SAVE_INTERVAL steps.

    Reads MNIST from "data/", builds the network, loss, learning-rate
    schedule and training op from testC5_infer, then runs TRAINING_STEPS
    mini-batch steps, saving the model to CHECKPOINT_PREFIX periodically.
    """
    mnist = input_data.read_data_sets("data/", one_hot=True)

    # Placeholders and step counter. i_l is given an explicit name so that
    # the evaluation module can fetch it from the restored graph.
    i_l = tf.placeholder("float", [None, testC5_infer.dimension_of_input], name="i_l")
    o_ = tf.placeholder("float", [None, testC5_infer.amount_of_output])
    global_step = tf.Variable(0, trainable=False)

    # Network.
    o_l, o_l_ave, variables_averages_op = testC5_infer.bulid_network_with_ave(i_l, global_step)
    # Loss.
    loss = testC5_infer.get_loss_with_regu(o_l, o_, tf.get_collection("weights"))
    # Learning rate.
    # NOTE(review): with a decay period of 100 steps the rate has shrunk to
    # base * 0.99**300 (roughly 5% of the base) by step 30000 -- confirm
    # this schedule is intended.
    learning_rate = tf.train.exponential_decay(
        testC5_infer.learning_rate_base,
        global_step,
        LEARNING_RATE_DECAY_STEPS,
        testC5_infer.learning_rate_decay,
    )
    # Training op (gradient step + moving-average update).
    train_step = testC5_infer.gradient_descent_operation_with_moving_averag(
        learning_rate, loss, global_step, variables_averages_op)

    saver = tf.train.Saver()
    # The context manager closes the session even if a step raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(TRAINING_STEPS):
            batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE)
            sess.run(train_step, feed_dict={i_l: batch_xs, o_: batch_ys})
            if (i + 1) % SAVE_INTERVAL == 0:
                # Save the checkpoint (model structure and data).
                saver.save(sess, CHECKPOINT_PREFIX, global_step=global_step)
                print("%d-%dth iteration is passed" % (i-998, i+1))


if __name__ == "__main__":
    # The main process only starts two child processes: one runs the
    # training function, the other runs the evaluation function.
    train_process = Process(target=testC5_train)
    train_process.daemon = True
    eval_process = Process(target=testC5_eval.testC5_eval)
    eval_process.daemon = True
    train_process.start()
    eval_process.start()

    # Daemonic children are terminated as soon as the parent exits, so the
    # parent has to wait for training to finish. The evaluator loops
    # forever; give it a short grace period for the final checkpoint and
    # then let it be terminated together with the parent.
    train_process.join()
    eval_process.join(FINAL_EVAL_GRACE_SEC)

# Notes:
# 1. Prefer "/" over "\" in paths, because "\" is sometimes treated as an
#    escape character (for example "\t" is a tab).
# 2. Visual Studio cannot set breakpoints in child processes created by
#    multiprocessing, so print is used for debugging for now. A similar
#    question: https://www.v2ex.com/t/377421
testC5_eval.py
调试
# -*- coding: utf-8 -*-
"""Evaluation module.

Adds model loading on top of the evaluation procedure: it polls the
checkpoint directory and evaluates every newly saved model on the MNIST
test set.
"""
import input_data
import tensorflow as tf
import math
import testC5_infer
import time

# Directory polled for checkpoints.
CHECKPOINT_DIR = "D:/Backup/Documents/Visual Studio 2015/Projects/testC5-train/"


def testC5_eval():
    """Evaluate each new checkpoint on the MNIST test set, forever.

    Waits until a checkpoint state exists in CHECKPOINT_DIR, imports the
    saved graph, then loops: whenever the latest checkpoint path changes,
    restores it and prints the accuracy of the "o_l_ave/add:0" output on the
    test images. The loop never returns; the session is closed if the loop
    is left through an exception.
    """
    detection_interval_in_sec = 0.5  # polling interval

    mnist = input_data.read_data_sets("data/", one_hot=True)

    # Wait for the first model to be saved.
    ckpt = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
    while ckpt is None:
        time.sleep(detection_interval_in_sec)
        ckpt = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
    time.sleep(detection_interval_in_sec)  # delay (1), see notes below

    # Load the model structure.
    saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + ".meta")

    # Evaluation graph. Tensors are looked up once rather than per pass.
    graph = tf.get_default_graph()
    i_l = graph.get_tensor_by_name("i_l:0")
    o_l_ave = graph.get_tensor_by_name("o_l_ave/add:0")
    o_ = tf.placeholder("float", [None, testC5_infer.amount_of_output])
    correct_prediction = tf.equal(tf.argmax(o_l_ave, 1), tf.argmax(o_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    test_feed = {i_l: mnist.test.images, o_: mnist.test.labels}

    evaluated_path = None  # path of the checkpoint evaluated most recently
    with tf.Session() as sess:
        # Poll once per interval; evaluate when the model file has changed.
        while True:
            # ckpt may be None if the state could not be read this time;
            # in that case simply poll again.
            if ckpt is not None and ckpt.model_checkpoint_path != evaluated_path:
                time.sleep(detection_interval_in_sec)  # delay (2), see notes
                evaluated_path = ckpt.model_checkpoint_path
                saver.restore(sess, evaluated_path)
                print("accuracy of ↑ is %f \n" % sess.run(accuracy, feed_dict=test_feed))
            time.sleep(detection_interval_in_sec)
            ckpt = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

# Notes:
# 1. Without the delays at (1) and (2) in testC5_eval, file-read errors may
#    occur and the print order may get scrambled. The author's guess is that
#    persistence proceeds as: update the checkpoint file -> create the
#    current model files -> some further work that takes time.
input_data.py是谷歌官方的模块文件。翻不了墙的可以到:https://blog.csdn.net/taomiaotaomiao/article/details/78566775
此程序的主进程只负责定义和启动两个子进程:train和eval。train负责训练网络,每迭代训练1000次保存一次;eval负责在每次检测到保存的模型文件的更新时进行评估。
最后几次结果如下:
}
结语():
{
不知道为什么准确率差了一点,书上给的是0.9841。有没有哪位大神能看出问题?
书上的模型数据加载方式我没有用,我直接加载了模型的全部数据。书上用的是variables_to_restore(),详见:https://blog.csdn.net/sinat_29957455/article/details/78508793。
}