170503 TFRecord Data Processing (Work in Progress)


[Figure: shuffle=False iterates the data in file-list order, so every epoch produces the same sample order]


[Figure: shuffle=True iterates the data in shuffled order, so every epoch produces a different sample order]

Note: shuffle=True here shuffles only the order of the tfrecords files; the order in which samples are produced inside each file is unchanged (see the figure below). So if there is only one file containing all the data, the output order does not change.

[Figure: shuffle=True shuffles the file order only; the sample order within each file is fixed]

shuffle=True only shuffles the order of the data files; the order in which samples appear inside each file stays fixed. It follows that the more shards you write (larger num_shards), the more randomness the overall sample order can have. In the extreme, num_shards = sample_num makes every epoch's order different, while num_shards = 1 makes every epoch's order identical.
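This behavior is easy to reproduce outside TensorFlow. Below is a minimal pure-Python sketch (with made-up shard names, not the files used later): shuffling acts on the file list only, never on the records inside a file.

import random

shards = ['data.tfrecords-%.2d' % i for i in range(1, 4)]               # 3 shard files
records = {f: ['%s:rec%d' % (f, j) for j in range(3)] for f in shards}

for epoch in range(2):
    random.shuffle(shards)        # shuffle=True: file order changes per epoch
    for f in shards:
        for r in records[f]:      # ...but the within-file record order is fixed
            print(r)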

[Figure: batches produced by the batch generator mix samples from several data files]

The batch generator draws data almost uniformly from the different data files on each dequeue, since the shuffle queue buffers samples read sequentially from the files and dequeues from that buffer at random (see the sketch below).
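Below is a pure-Python sketch of the mechanism (hypothetical record names): records arrive in file order, but a buffer of min_after_dequeue items is kept and drained at random, which is roughly what the RandomShuffleQueue behind tf.train.shuffle_batch does.

import random

stream = ['f%d:r%d' % (f, r) for f in range(3) for r in range(5)]  # arrives in file order
buf, out, min_after_dequeue = [], [], 5
for item in stream:
    buf.append(item)
    if len(buf) > min_after_dequeue:
        out.append(buf.pop(random.randrange(len(buf))))
out.extend(buf)   # drain the buffer once the input is exhausted
print(out)        # records from different files are interleaved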

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
""" Created on Sun Apr 29 20:49:30 2018 @author: brucelau """

import tensorflow as tf 
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np
import time
from os import listdir
import os
import matplotlib.pyplot as plt
#%% TFRecord Test: Making TFRecord Data File

# Helper functions that wrap raw values into tf.train.Feature protos
def _int64_feature(value):
    # a single int wrapped as an Int64List
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def _bytes_feature(value):
    # a byte string wrapped as a BytesList
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

# Make toy data: 100 rows of 10 consecutive integers, plus 100 labels
data_x = np.arange(1,1001,dtype=np.int64).reshape((-1,10))  # force int64 so tf.decode_raw(...,tf.int64) matches on every platform
data_y = np.arange(100)

# Loop over 10 shard files, 10 examples per shard
if not os.path.exists('TestRecords'):   # make sure the output directory exists
    os.makedirs('TestRecords')
for name_idx in range(1,11):
    fn = 'TestRecords/data.tfrecords-%.2d' % name_idx
    writer = tf.python_io.TFRecordWriter(fn)
    for file_num in range(1,11):
        item = (name_idx-1)*10 + file_num - 1   # global sample index 0..99
        image_raw = data_x[item].tostring()     # serialize one row to raw bytes
        # print(item)
        example = tf.train.Example(features = tf.train.Features(feature={
                'row':_bytes_feature(image_raw),
                'label':_int64_feature(data_y[item])
                }))
        writer.write(example.SerializeToString())
    writer.close()
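#%% Optional sanity check (a sketch, assuming the shards above were written):
# tf.python_io.tf_record_iterator yields the raw serialized records of a
# TFRecord file, so each shard should report 10 records
for name_idx in range(1,11):
    fn = 'TestRecords/data.tfrecords-%.2d' % name_idx
    n = sum(1 for _ in tf.python_io.tf_record_iterator(fn))
    print('%s: %d records' % (fn, n))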
##%% TFRecord Test: Loading a TFRecord Data File (single file, kept for reference)
#import tensorflow as tf
#reader = tf.TFRecordReader()
##files = tf.train.match_filenames_once(['TestRecords/data.tfrecords-01'])
#filename_queue = tf.train.string_input_producer(['TestRecords/data.tfrecords-01'])
#_, serialized_example = reader.read(filename_queue)
#features = tf.parse_single_example(
#        serialized_example,
#        features={
#                'row':tf.FixedLenFeature([],tf.string),
#                'label':tf.FixedLenFeature([],tf.int64)})
#
## tf.decode_raw parses the byte string back into the original numeric array
#images = tf.decode_raw(features['row'],tf.int64)
#labels = tf.cast(features['label'],tf.int32)
#
#with tf.Session() as sess:
#    sess.run(tf.local_variables_initializer())
#    sess.run(tf.global_variables_initializer())
#    # Start the threads that feed the input queues
#    coord = tf.train.Coordinator()
#    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
#
#    # Each run reads one example from the TFRecord file; after all examples
#    # have been read, reading starts over from the beginning
#    for i in range(2):
#        image, label = sess.run([images, labels])
#    coord.request_stop()
#    coord.join(threads)
#print(image)
#print(label)
#%%
# match_filenames_once stores the matched file list in a local variable,
# so tf.local_variables_initializer() must be run before evaluating it
files = tf.train.match_filenames_once(['TestRecords/data.tfrecords-*'])

## Why can't the following list be passed to tf.train.string_input_producer()?
#import tensorflow as tf
#from os import listdir
#file = listdir('TestRecords/')
#file.sort()
#files = tf.get_variable(name='test7',initializer=file)
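# Note on the question above: tf.train.string_input_producer accepts a plain
# Python list of strings directly, so no variable wrapper is needed. A minimal
# sketch (assuming the shard files written above exist):
#
#   file_list = sorted('TestRecords/' + f for f in listdir('TestRecords/'))
#   queue = tf.train.string_input_producer(file_list, shuffle=False)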
#%% Loop generator: one sample at a time, in file-list order
filename_queue = tf.train.string_input_producer(files, shuffle=False, num_epochs=2)
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
        serialized_example,
        features={
                'row':tf.FixedLenFeature([],tf.string),
                'label':tf.FixedLenFeature([],tf.int64)})
decoded_data = tf.decode_raw(features['row'],tf.int64)  # decode according to the original dtype
labels = tf.cast(features['label'],tf.int64)
c = []    
with tf.Session() as sess:
    # Initialization
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    # See the loaded file names
    f1 = sess.run(files)
    f2 = [i.decode('ascii') for i in f1]
    f2.sort()
    print(f1)
    print(f2)
    # Make Coordinator
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess,coord=coord)
    for i in range(200):   # 100 samples x 2 epochs = 200 single reads
        # Fetch data and label in ONE sess.run call; running them separately
        # would dequeue two different examples and desynchronize data/labels
        de_data, la = sess.run([decoded_data,labels])
        print(la)
        c.append(la)
    coord.request_stop()
    coord.join(threads)
plt.plot(c)
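#%% For reference only: the same pipeline with the tf.data API (a sketch,
# assuming TF >= 1.4; not used elsewhere in this note)
#
#   def _parse(serialized):
#       feats = tf.parse_single_example(
#               serialized,
#               features={'row':tf.FixedLenFeature([],tf.string),
#                         'label':tf.FixedLenFeature([],tf.int64)})
#       return tf.decode_raw(feats['row'],tf.int64), feats['label']
#
#   dataset = tf.data.TFRecordDataset(tf.gfile.Glob('TestRecords/data.tfrecords-*'))
#   row, label = dataset.map(_parse).repeat(2).make_one_shot_iterator().get_next()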

#%% Batch generator: data drawn from several files simultaneously
filename_queue = tf.train.string_input_producer(files, shuffle=False, num_epochs=2)
# Note: shuffle=True would only shuffle the order of the tfrecords files; with a
# single file holding all the data the output order would not change
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
        serialized_example,
        features={
                'row':tf.FixedLenFeature([],tf.string),
                'label':tf.FixedLenFeature([],tf.int64)})
decoded_data = tf.decode_raw(features['row'],tf.int64)  # decode raw bytes back to int64
decoded_data = tf.reshape(decoded_data,[10])            # a static shape is required, otherwise tf.train.shuffle_batch raises an error
labels = tf.cast(features['label'],tf.int64)
batch_size = 10
# capacity must exceed min_after_dequeue; a common rule of thumb is
# min_after_dequeue + 3 * batch_size
capacity = 1000 + 3 * batch_size
example_batch, label_batch = tf.train.shuffle_batch([decoded_data, labels],
                                                    batch_size=batch_size,
                                                    capacity=capacity,
                                                    min_after_dequeue=30)
c = []
with tf.Session() as sess:
    # Initialization
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    # See the loaded file names
    f1 = sess.run(files)
    f2 = [i.decode('ascii') for i in f1]
    f2.sort()
    # print(f1)
    # print(f2)
    # Make Coordinator
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess,coord=coord)
    for i in range(20):   # 2 epochs x 100 samples = 20 batches of 10
        cur_example_batch, cur_label_batch = sess.run([example_batch, label_batch])
        print(cur_label_batch)
        c.append(cur_label_batch)
    coord.request_stop()
    coord.join(threads)

# Flatten and sort all collected labels; after 2 full epochs every label
# should appear exactly twice
C = np.array(c).reshape(-1)
C.sort()

plt.plot(C)
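#%% Optional check (a sketch): with 2 epochs each label 0..99 should appear
# exactly twice, so the sorted labels form the staircase 0,0,1,1,...,99,99
assert np.array_equal(C, np.repeat(np.arange(100), 2))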