本文介绍以下三个方面的知识:html
一、如何将pascal voc数据集转化为TensorFlow的tfrecord文件?python
二、如何使用lxml解析xml文件?git
三、如何使用opencv在图片上画出目标边框?github
【第一部分】将pascal voc数据集转化为TensorFlow的tfrecord文件session
pascal voc数据集下载地址为:app
http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html函数
其中的name与label数字标签的映射关系文件下载地址为:fetch
https://github.com/tensorflow/modelsui
位于:models-master\research\object_detection\data\pascal_label_map.pbtxtcode
整个解析过程分为三步:
1.将name与label的映射文件pascal_label_map.pbtxt解析为字典格式数据,即name---label格式
2.将xml文件使用lxml读取出来后,将其解析为字典格式的数据。
3.将原始图片数据与annotation数据转为tfrecord文件格式数据。
代码以下:
import tensorflow as tf from lxml import etree import os from PIL import Image #定义解析单个lxml文件 def parse_xml(xml_path,name_label_map): tree = etree.parse(xml_path) dict = {} for x in tree.xpath('//filename'): if len(x): print("error") else: dict["image_"+x.tag] = x.text for x in tree.xpath('//size'): for x1 in x.getchildren(): dict["image_"+x1.tag] = x1.text object_numbers = 0 #可能存在多个object节点,即多目标检测 for obj in tree.xpath('//object'): #获取当前object节点的子节点 for x in obj.getchildren(): #判断节点x是否有子节点 if len(x): if x.tag == 'bndbox': for bbox in x.getchildren(): dict['object_'+str(object_numbers)+'/bndbbox/'+bbox.tag] = bbox.text else: pass elif x.tag == 'name': #将name与id均保存到字典中 dict["object_"+str(object_numbers)+"/"+x.tag] = x.text dict["object_" + str(object_numbers) + "/id" ] = name_label_map[x.text] else: pass object_numbers += 1 dict['object_number'] = object_numbers return dict #将name与label的映射map文件解析为字典格式 # name<---->id def parse_map_file(map_file): name_label_map = {} with open(map_file) as f: id = 0 for line in f.readlines(): if len(line) > 1: if line.find('id') != -1: line = line.strip('\\n') line = line.strip(' ') colon = line.index(':') colon = colon + 1 line_id = line[colon:].strip(' ') id = int(line_id) elif line.find('name') != -1: line = line.strip('\\n').strip(' ') first = line.index("'") last = line.rindex("'") line_name = line[first+1:last] name_label_map[line_name]=id id = 0 else: pass else: #print("empty line") pass return name_label_map MAP_FILE = r"D:\models-master\research\object_detection\data\pascal_label_map.pbtxt" BASE_PATH= r"E:\VOCtrainval_11-May-2012\VOCdevkit\VOC2012\Annotations" BASE_JPEG_PATH = r"E:\VOCtrainval_11-May-2012\VOCdevkit\VOC2012\JPEGImages" name_label_map = parse_map_file(MAP_FILE) xml_file_list = os.listdir(BASE_PATH) train_list = [] test_list = [] j = 0 for i in range(len(xml_file_list)): if j % 6 == 0: test_list.append(xml_file_list[i]) else: train_list.append(xml_file_list[i]) j = j + 1 with tf.python_io.TFRecordWriter(path=r"E:\VOCtrainval_11-May-2012\train.tfrecords") as tf_writer: for i in range(len(train_list)): file_path = os.path.join(BASE_PATH,train_list[i]) if os.path.isfile(file_path): #解析xml为字典格式数据 xml_dict = parse_xml(file_path,name_label_map) image = Image.open(os.path.join(BASE_JPEG_PATH,xml_dict['image_filename'])) image_bytes = image.tobytes() features = {} features["image"] = tf.train.Feature(bytes_list=tf.train.BytesList(value = [image_bytes])) features['image_width'] = tf.train.Feature(int64_list=tf.train.Int64List(value = [int(xml_dict['image_width'])])) features['image_height'] = tf.train.Feature( int64_list=tf.train.Int64List(value=[int(xml_dict['image_height'])])) features['image_depth'] = tf.train.Feature( int64_list=tf.train.Int64List(value=[int(xml_dict['image_depth'])])) features['image/object_number'] = tf.train.Feature( int64_list=tf.train.Int64List(value=[int(xml_dict['object_number'])])) xmin = [] xmax = [] ymin = [] ymax = [] object_id = [] object_name = [] object_number = xml_dict['object_number'] for j in range(object_number): object_i = 'object_'+str(j) #print(xml_dict[object_i+'/name']) #print(type(xml_dict[object_i+'/name'])) object_name.append(bytes(xml_dict[object_i+'/name'],'utf-8')) object_id.append(int(xml_dict[object_i+'/id'])) xmin.append(float(xml_dict[object_i+'/bndbbox/xmin'])) xmax.append(float(xml_dict[object_i + '/bndbbox/xmax'])) ymin.append(float(xml_dict[object_i + '/bndbbox/ymin'])) ymax.append(float(xml_dict[object_i + '/bndbbox/ymax'])) #变长数据以list形式存储 features["image/object/names"] = tf.train.Feature(bytes_list=tf.train.BytesList(value=object_name)) features['image/object/id'] = tf.train.Feature(int64_list=tf.train.Int64List(value=object_id)) features['image/object/xmin'] = tf.train.Feature(float_list=tf.train.FloatList(value=xmin)) features['image/object/xmax'] = tf.train.Feature(float_list=tf.train.FloatList(value=xmax)) features['image/object/ymin'] = tf.train.Feature(float_list=tf.train.FloatList(value=ymin)) features['image/object/ymax'] = tf.train.Feature(float_list=tf.train.FloatList(value=ymax)) tf_features = tf.train.Features(feature=features) tf_example = tf.train.Example(features=tf_features) tf_serialized = tf_example.SerializeToString() tf_writer.write(tf_serialized)
【第二部分】读取目标检测tfrecord数据并使用opencv在图片上画出目标边框
整个过程分为以下两步:
1.编写tfrecord解析函数,即反序列化函数。
2.获取图片标注数据,并使用OpenCV绘制边框。
具体代码以下:
import tensorflow as tf import numpy as np import cv2 def parse_tf(example_proto): dics = {} #定长数据解析 dics['image'] = tf.FixedLenFeature(shape=[],dtype=tf.string) dics['image_width'] = tf.FixedLenFeature(shape=[], dtype=tf.int64) dics['image_height'] = tf.FixedLenFeature(shape=[], dtype=tf.int64) dics['image_depth'] = tf.FixedLenFeature(shape=[], dtype=tf.int64) dics['image/object_number']= tf.FixedLenFeature(shape=[], dtype=tf.int64) #列表数据解析 dics["image/object/names"] = tf.VarLenFeature(tf.string) dics['image/object/id'] = tf.VarLenFeature(tf.int64) dics['image/object/xmin'] = tf.VarLenFeature(tf.float32) dics['image/object/xmax'] = tf.VarLenFeature(tf.float32) dics['image/object/ymin'] = tf.VarLenFeature(tf.float32) dics['image/object/ymax'] = tf.VarLenFeature(tf.float32) parse_example = tf.parse_single_example(serialized=example_proto,features=dics) object_number = parse_example["image/object_number"] xmin = parse_example['image/object/xmin'] xmax = parse_example['image/object/xmax'] ymin = parse_example['image/object/ymin'] ymax = parse_example['image/object/ymax'] image = tf.decode_raw(parse_example['image'],out_type=tf.uint8) w = parse_example['image_width'] h = parse_example['image_height'] c = parse_example['image_depth'] return image,w,h,c,object_number,xmin,xmax,ymin,ymax dataset = tf.data.TFRecordDataset(r"E:\VOCtrainval_11-May-2012\train.tfrecords") dataset = dataset.map(parse_tf).batch(1).repeat(1) iterator = dataset.make_one_shot_iterator() next_element = iterator.get_next() with tf.Session() as session: image, w, h, c, object_number, xmin, xmax, ymin, ymax = session.run(fetches=next_element) image = np.reshape(image,newshape=[h[0],w[0],c[0]]) #使用OpenCV绘制表框 for i in range(object_number[0]): #左上角坐标与右下角坐标 cv2.rectangle(image,(xmin.values[i],ymin.values[i]),(xmax.values[i],ymax.values[i]),color=(0,255,0)) cv2.imshow("s",image) cv2.waitKey(0)
效果以下图:
PS:在解析tfrecord数据文件时,因为在解析函数中拿到的都是tensor,而不是数据自己,又因为session没法传递到解析函数中,因此许多预处理操做在解析函数中没法实施,须要在外面拿到数据后,在利用numpy等对数据进行预处理。