本文主要对ultralytics\yolov5在训练时的数据加载模块的dataset.py代码进行注释和解析。当然,dataset.py中还有其他时候(例如detect时)所用到的加载方法(例如LoadImages、LoadWebcam等),本文主要是对训练时用到的LoadImagesAndLabels类进行注释。
mosaic增强
在这里要说一下,mosaic数据增强就是将四张图片拼接在一起传入网络训练,具体可以查看YOLOV4-mosaic数据增强详解。(该文章是基于pytorch YOLOV4代码做的解析)
矩形训练
可以看到yolov5会对图片进行填充,填充为正方形从而传入网络进行训练,可以看到这里面有很多冗余的信息,会让网络产生很多无意义的候选框,矩形训练就是减少这些冗余信息,减少网络产生的无意义的框的数量,加快网络训练速度。yolov5网络的总步长为32,所以其实只要图片边长能够被32整除就可以了,不一定完全需要正方形图片传入网络,矩形训练就是将图片填充为最小的32的倍数边长,从而减少冗余信息。
git
值得一提的是,除了矩形训练,还有矩形推理,也就是在做检测的时候也这样填充,从而加快推理速度,减少推理时间。
import glob
import math
import os
import random
import shutil
import time
from pathlib import Path
from threading import Thread

import cv2
import numpy as np
import torch
from PIL import Image, ExifTags
from torch.utils.data import Dataset
from tqdm import tqdm

from utils.utils import xyxy2xywh, xywh2xyxy

help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data'
img_formats = ['.bmp', '.jpg', '.jpeg', '.png', '.tif', '.dng']
vid_formats = ['.mov', '.avi', '.mp4', '.mpg', '.mpeg', '.m4v', '.wmv', '.mkv']

# Get orientation exif tag: after this loop `orientation` holds the numeric
# EXIF tag id whose name is 'Orientation'.
for orientation in ExifTags.TAGS.keys():
    if ExifTags.TAGS[orientation] == 'Orientation':
        break


def exif_size(img):
    """Return the EXIF-corrected (width, height) of a PIL image.

    The size is swapped when the EXIF orientation tag is 6 or 8 (the image is
    stored rotated by a quarter turn). Any failure while reading EXIF data is
    ignored on purpose and the raw ``img.size`` is returned.
    """
    s = img.size  # (width, height)
    try:
        rotation = dict(img._getexif().items())[orientation]
        if rotation in (6, 8):  # 6: rotation 270, 8: rotation 90
            s = (s[1], s[0])
    except Exception:
        # Best effort: no EXIF block, no orientation tag, or unreadable EXIF.
        # (Not a bare `except:` so Ctrl-C / SystemExit still propagate.)
        pass

    return s


def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False):
    """Build a ``LoadImagesAndLabels`` dataset and a DataLoader wrapping it.

    Args:
        path: txt file listing image paths, or a directory containing images.
        imgsz: network input image size.
        batch_size: requested batch size (capped at the dataset length).
        stride: maximum total down-sampling stride of the network.
        opt: options object; only ``opt.single_cls`` is read here.
        hyp: hyper-parameter dict; the augmentation coefficients are used.
        augment: whether to apply data augmentation.
        cache: whether to cache the images in memory up front.
        pad: padding term used when computing rectangular batch shapes.
        rect: whether to use rectangular training.

    Returns:
        ``(dataloader, dataset)``.
    """
    dataset = LoadImagesAndLabels(path, imgsz, batch_size,
                                  augment=augment,  # augment images
                                  hyp=hyp,  # augmentation hyperparameters
                                  rect=rect,  # rectangular training
                                  cache_images=cache,
                                  single_cls=opt.single_cls,
                                  stride=int(stride),
                                  pad=pad)

    batch_size = min(batch_size, len(dataset))
    # os.cpu_count() may return None; fall back to 1 so min() cannot raise.
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 8])  # number of workers
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             num_workers=nw,
                                             pin_memory=True,
                                             collate_fn=LoadImagesAndLabels.collate_fn)
    return dataloader, dataset


class LoadImagesAndLabels(Dataset):  # for training/testing
    """Dataset yielding (image tensor, labels, image path, shapes) items.

    Supports mosaic augmentation, rectangular batches, optional in-memory
    image caching and caching of parsed labels in a ``*.npy`` file.
    """

    def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
                 cache_images=False, single_cls=False, stride=32, pad=0.0):
        try:
            f = []
            for p in path if isinstance(path, list) else [path]:
                # `path` is either a txt file listing image paths or an image directory.
                p = str(Path(p))  # os-agnostic
                # Parent directory plus the platform path separator.
                parent = str(Path(p).parent) + os.sep
                if os.path.isfile(p):  # file listing image paths
                    with open(p, 'r') as t:
                        lines = t.read().splitlines()
                    # local to global path
                    f += [x.replace('./', parent) if x.startswith('./') else x for x in lines]
                elif os.path.isdir(p):  # folder of images
                    f += glob.iglob(p + os.sep + '*.*')
                else:
                    raise Exception('%s does not exist' % p)
            path = p  # *.npy dir
            # Normalise separators and keep only files with a known image extension.
            self.img_files = [x.replace('/', os.sep) for x in f if os.path.splitext(x)[-1].lower() in img_formats]
        except Exception as e:
            raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url)) from e

        n = len(self.img_files)  # number of images
        assert n > 0, 'No images found in %s. See %s' % (path, help_url)
        # np.int was removed in NumPy 1.24; the builtin int is the documented replacement.
        bi = np.floor(np.arange(n) / batch_size).astype(int)  # batch index of every image
        nb = bi[-1] + 1  # number of batches per epoch

        self.n = n  # number of images
        self.batch = bi  # batch index of image
        self.img_size = img_size  # network input resolution
        self.augment = augment
        self.hyp = hyp
        self.image_weights = image_weights  # weighted image sampling
        self.rect = False if image_weights else rect  # rectangular training
        self.mosaic = self.augment and not self.rect  # load 4 images at a time into a mosaic
        self.mosaic_border = [-img_size // 2, -img_size // 2]
        self.stride = stride  # total down-sampling stride of the model

        # Define labels: same path with 'images' -> 'labels' and the extension -> '.txt'
        self.label_files = [x.replace('images', 'labels').replace(os.path.splitext(x)[-1], '.txt')
                            for x in self.img_files]

        # Read image shapes (wh), cached next to the dataset in a '.shapes' file
        sp = path.replace('.txt', '') + '.shapes'  # shapefile path
        try:
            with open(sp, 'r') as f:  # read existing shapefile
                s = [x.split() for x in f.read().splitlines()]
                assert len(s) == n, 'Shapefile out of sync'
        except Exception:
            # Missing or stale shapefile: read every image header and rewrite it.
            s = []
            for img_file in tqdm(self.img_files, desc='Reading image shapes'):
                with Image.open(img_file) as im:  # close the file handle promptly
                    s.append(exif_size(im))
            np.savetxt(sp, s, fmt='%g')  # overwrites existing (if any)

        self.shapes = np.array(s, dtype=np.float64)

        # Rectangular Training  https://github.com/ultralytics/yolov3/issues/232
        if self.rect:
            # Sort by aspect ratio
            s = self.shapes  # wh
            ar = s[:, 1] / s[:, 0]  # h/w
            irect = ar.argsort()  # indices sorting ar in ascending order
            self.img_files = [self.img_files[i] for i in irect]
            self.label_files = [self.label_files[i] for i in irect]
            self.shapes = s[irect]  # wh
            ar = ar[irect]

            # Set training image shapes, one [h, w] ratio pair per batch
            shapes = [[1, 1]] * nb
            for i in range(nb):
                ari = ar[bi == i]
                mini, maxi = ari.min(), ari.max()
                if maxi < 1:
                    # every image in the batch is wider than tall
                    shapes[i] = [maxi, 1]
                elif mini > 1:
                    # every image in the batch is taller than wide
                    shapes[i] = [1, 1 / mini]

            self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(int) * stride

        # Cache labels
        self.imgs = [None] * n
        self.labels = [np.zeros((0, 5), dtype=np.float32)] * n
        # Optional extras, all disabled by default: build a data subset, crop
        # boxes for a second-stage classifier; labels_loaded tracks the npy cache.
        create_datasubset, extract_bounding_boxes, labels_loaded = False, False, False
        nm, nf, ne, ns, nd = 0, 0, 0, 0, 0  # number missing, found, empty, datasubset, duplicate
        np_labels_path = str(Path(self.label_files[0]).parent) + '.npy'  # saved labels in *.npy file
        if os.path.isfile(np_labels_path):
            s = np_labels_path  # print string
            # NOTE(review): allow_pickle=True unpickles the file; only load label
            # caches that this code produced itself.
            x = np.load(np_labels_path, allow_pickle=True)
            if len(x) == n:
                self.labels = x
                labels_loaded = True
        else:
            s = path.replace('images', 'labels')

        pbar = tqdm(self.label_files)
        for i, file in enumerate(pbar):
            if labels_loaded:
                l = self.labels[i]
                # np.savetxt(file, l, '%g')  # save *.txt from *.npy file
            else:
                try:
                    # One row per object: class x y w h
                    with open(file, 'r') as f:
                        l = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
                except Exception:
                    nm += 1  # print('missing labels for image %s' % self.img_files[i])  # file missing
                    continue

            if l.shape[0]:
                assert l.shape[1] == 5, '> 5 label columns: %s' % file
                assert (l >= 0).all(), 'negative labels: %s' % file
                assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels: %s' % file
                if np.unique(l, axis=0).shape[0] < l.shape[0]:  # duplicate rows
                    nd += 1  # print('WARNING: duplicate rows in %s' % self.label_files[i])  # duplicate rows
                if single_cls:
                    l[:, 0] = 0  # force dataset into single-class mode
                self.labels[i] = l
                nf += 1  # file found

                # Create subdataset (a smaller dataset); disabled by default
                if create_datasubset and ns < 1E4:
                    if ns == 0:
                        create_folder(path='./datasubset')
                        os.makedirs('./datasubset/images')
                    exclude_classes = 43
                    if exclude_classes not in l[:, 0]:
                        ns += 1
                        # shutil.copy(src=self.img_files[i], dst='./datasubset/images/')  # copy image
                        with open('./datasubset/images.txt', 'a') as f:
                            f.write(self.img_files[i] + '\n')

                # Extract object detection boxes for a second stage classifier; disabled by default
                if extract_bounding_boxes:
                    p = Path(self.img_files[i])
                    img = cv2.imread(str(p))
                    h, w = img.shape[:2]
                    for j, x in enumerate(l):
                        f = '%s%sclassifier%s%g_%g_%s' % (p.parent.parent, os.sep, os.sep, x[0], j, p.name)
                        if not os.path.exists(Path(f).parent):
                            os.makedirs(Path(f).parent)  # make new output folder

                        b = x[1:] * [w, h, w, h]  # box, normalised -> pixels
                        b[2:] = b[2:].max()  # rectangle to square
                        b[2:] = b[2:] * 1.3 + 30  # pad
                        b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(int)

                        b[[0, 2]] = np.clip(b[[0, 2]], 0, w)  # clip boxes outside of image
                        b[[1, 3]] = np.clip(b[[1, 3]], 0, h)
                        assert cv2.imwrite(f, img[b[1]:b[3], b[0]:b[2]]), 'Failure extracting classifier boxes'
            else:
                ne += 1  # print('empty labels for image %s' % self.img_files[i])  # file empty
                # os.system("rm '%s' '%s'" % (self.img_files[i], self.label_files[i]))  # remove

            pbar.desc = 'Caching labels %s (%g found, %g missing, %g empty, %g duplicate, for %g images)' % (
                s, nf, nm, ne, nd, n)
        # NOTE(review): 20288 is a hard-coded image count exempted from the
        # "no labels" check; its origin is not visible here -- confirm.
        assert nf > 0 or n == 20288, 'No labels found in %s. See %s' % (os.path.dirname(file) + os.sep, help_url)
        if not labels_loaded and n > 1000:
            print('Saving labels to %s for faster future loading' % np_labels_path)
            # The per-image arrays have different row counts. Pack them into an
            # explicit 1-D object array; handing the ragged list to np.save
            # raises ValueError on current NumPy.
            packed = np.empty(n, dtype=object)
            for k, lab in enumerate(self.labels):
                packed[k] = lab
            np.save(np_labels_path, packed)  # save for next time

        # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM)
        if cache_images:  # if training
            gb = 0  # bytes of cached images so far (reported in GB)
            pbar = tqdm(range(len(self.img_files)), desc='Caching images')
            self.img_hw0, self.img_hw = [None] * n, [None] * n
            for i in pbar:  # max 10k images
                self.imgs[i], self.img_hw0[i], self.img_hw[i] = load_image(self, i)  # img, hw_original, hw_resized
                gb += self.imgs[i].nbytes
                pbar.desc = 'Caching images (%.1fGB)' % (gb / 1E9)

        # Detect corrupted images https://medium.com/joelthchao/programmatically-detect-corrupted-image-8c1b2006c3d3
        detect_corrupted_images = False
        if detect_corrupted_images:
            from skimage import io  # conda install -c conda-forge scikit-image
            for file in tqdm(self.img_files, desc='Detecting corrupted images'):
                try:
                    _ = io.imread(file)
                except Exception:
                    print('Corrupted image detected: %s' % file)

    def __len__(self):
        return len(self.img_files)

    # def __iter__(self):
    #     self.count = -1
    #     print('ran dataset iter')
    #     #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF)
    #     return self

    def __getitem__(self, index):
        """Return ``(image tensor, labels_out, image path, shapes)`` for one item.

        ``labels_out`` has one row per box with six columns; column 0 is left
        at zero here and filled with the image index by ``collate_fn``.
        ``shapes`` is ``None`` when mosaic loading is used.
        """
        if self.image_weights:
            # self.indices is not set in this class. According to the original
            # notes it is assigned by train.py when image weighting is on:
            #
            #   # Update image weights (optional)
            #   if dataset.image_weights:
            #       w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
            #       image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w)
            #       dataset.indices = random.choices(range(dataset.n), weights=image_weights,
            #                                        k=dataset.n)  # rand weighted idx
            index = self.indices[index]

        hyp = self.hyp
        if self.mosaic:
            # Load mosaic
            img, labels = load_mosaic(self, index)
            shapes = None

        else:
            # Load image, resized so its longer side equals img_size (no padding yet)
            img, (h0, w0), (h, w) = load_image(self, index)

            # Letterbox: per-batch shape for rectangular training, else a square
            shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  # final letterboxed shape
            img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment)
            shapes = (h0, w0), ((h / h0, w / w0), pad)  # for COCO mAP rescaling

            # Load labels
            labels = []
            x = self.labels[index]
            if x.size > 0:
                # Normalized xywh to pixel xyxy format, shifted by the letterbox padding
                labels = x.copy()
                labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0]  # pad width
                labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1]  # pad height
                labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0]
                labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1]

        if self.augment:
            # Augment imagespace (load_mosaic already applied random_affine)
            if not self.mosaic:
                img, labels = random_affine(img, labels,
                                            degrees=hyp['degrees'],
                                            translate=hyp['translate'],
                                            scale=hyp['scale'],
                                            shear=hyp['shear'])

            # Augment colorspace: random hue / saturation / value gains, in place
            augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])

            # Apply cutouts
            # if random.random() < 0.9:
            #     labels = cutout(img, labels)

        nL = len(labels)  # number of labels
        if nL:
            # convert xyxy to xywh
            labels[:, 1:5] = xyxy2xywh(labels[:, 1:5])

            # Normalize coordinates 0 - 1
            labels[:, [2, 4]] /= img.shape[0]  # height
            labels[:, [1, 3]] /= img.shape[1]  # width

        if self.augment:
            # random left-right flip
            lr_flip = True
            if lr_flip and random.random() < 0.5:
                img = np.fliplr(img)
                if nL:
                    labels[:, 1] = 1 - labels[:, 1]

            # random up-down flip
            ud_flip = False
            if ud_flip and random.random() < 0.5:
                img = np.flipud(img)
                if nL:
                    labels[:, 2] = 1 - labels[:, 2]

        # Column 0 is reserved for the image index within the batch (see collate_fn)
        labels_out = torch.zeros((nL, 6))
        if nL:
            labels_out[:, 1:] = torch.from_numpy(labels)

        # Convert
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to 3x416x416
        img = np.ascontiguousarray(img)

        return torch.from_numpy(img), labels_out, self.img_files[index], shapes

    @staticmethod
    def collate_fn(batch):
        """Collate dataset items into one batch for the DataLoader.

        Column 0 of every label row is set to the index of its image within
        the batch, so the concatenated label tensor records which rows belong
        to which image, e.g.::

            [[0, 6, 0.5, 0.5, 0.26, 0.35],
             [0, 6, 0.5, 0.5, 0.26, 0.35],
             [1, 6, 0.5, 0.5, 0.26, 0.35],
             [2, 6, 0.5, 0.5, 0.26, 0.35]]

        Here the first two rows belong to image 0, the third to image 1, etc.
        """
        img, label, path, shapes = zip(*batch)  # transposed
        for i, l in enumerate(label):
            l[:, 0] = i  # add target image index for build_targets()
        return torch.stack(img, 0), torch.cat(label, 0), path, shapes


def load_image(self, index):
    """Load one image from the dataset.

    Returns ``(img, (h0, w0), (h, w))``: the image, its original height/width
    and its height/width after resizing. Cached images are returned as stored.
    """
    img = self.imgs[index]
    if img is None:  # not cached
        path = self.img_files[index]
        img = cv2.imread(path)  # BGR
        assert img is not None, 'Image Not Found ' + path
        h0, w0 = img.shape[:2]  # orig hw
        r = self.img_size / max(h0, w0)  # resize image to img_size
        if r != 1:  # always resize down, only resize up if training with augmentation
            # INTER_AREA when shrinking without augmentation, INTER_LINEAR otherwise
            interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR
            img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp)
        return img, (h0, w0), img.shape[:2]  # img, hw_original, hw_resized
    else:
        return self.imgs[index], self.img_hw0[index], self.img_hw[index]  # img, hw_original, hw_resized


def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5):
    """Randomly scale hue, saturation and value of a BGR image, in place.

    Nothing is returned: the result is written back into ``img``.
    """
    # Three random factors in [1 - gain, 1 + gain]
    r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1  # random gains
    hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
    dtype = img.dtype  # uint8

    # Per-channel lookup tables; hue wraps at 180, sat/val are clipped to 0..255
    x = np.arange(0, 256, dtype=np.int16)
    lut_hue = ((x * r[0]) % 180).astype(dtype)
    lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
    lut_val = np.clip(x * r[2], 0, 255).astype(dtype)

    img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype)
    cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)  # no return needed

    # Histogram equalization
    # if random.random() < 0.2:
    #     for i in range(3):
    #         img[:, :, i] = cv2.equalizeHist(img[:, :, i])


def load_mosaic(self, index):
    """Load a 4-image mosaic around a random centre.

    The image at ``index`` plus three randomly chosen images are pasted into a
    ``2*img_size`` square canvas, their labels are shifted accordingly, and the
    result is passed through ``random_affine`` with ``self.mosaic_border``.

    Returns ``(img4, labels4)`` with labels as pixel ``[cls, x1, y1, x2, y2]``.
    """
    labels4 = []
    s = self.img_size
    # Random mosaic centre (note the unpacking order is y then x)
    yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border]  # mosaic center x, y
    indices = [index] + [random.randint(0, len(self.labels) - 1) for _ in range(3)]  # 3 additional image indices
    for i, index in enumerate(indices):
        # Load image
        img, _, (h, w) = load_image(self, index)

        # place img in img4
        if i == 0:  # top left
            img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)  # base image with 4 tiles
            x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc  # xmin, ymin, xmax, ymax (large image)
            x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h  # xmin, ymin, xmax, ymax (small image)
        elif i == 1:  # top right
            x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
            x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
        elif i == 2:  # bottom left
            x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
            # Right edge of the crop is the tile's own width. The previous
            # max(xc, w) only worked because slicing clamps to the image width.
            x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
        elif i == 3:  # bottom right
            x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
            x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

        img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]  # img4[ymin:ymax, xmin:xmax]
        # Offset of the tile inside the canvas, used to shift its labels
        padw = x1a - x1b
        padh = y1a - y1b

        # Labels
        x = self.labels[index]
        labels = x.copy()
        if x.size > 0:  # Normalized xywh to pixel xyxy format
            labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + padw
            labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + padh
            labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + padw
            labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + padh
        labels4.append(labels)

    # Concat/clip labels so every box lies inside the canvas
    if len(labels4):
        labels4 = np.concatenate(labels4, 0)
        # np.clip(labels4[:, 1:] - s / 2, 0, s, out=labels4[:, 1:])  # use with center crop
        np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:])  # use with random_affine

        # Replicate
        # img4, labels4 = replicate(img4, labels4)

    # Augment
    # img4 = img4[s // 2: int(s * 1.5), s // 2:int(s * 1.5)]  # center crop (WARNING, requires box pruning)
    # The canvas is [2*img_size, 2*img_size] here; random_affine with the
    # negative mosaic border warps it and outputs an img_size square.
    img4, labels4 = random_affine(img4, labels4,
                                  degrees=self.hyp['degrees'],
                                  translate=self.hyp['translate'],
                                  scale=self.hyp['scale'],
                                  shear=self.hyp['shear'],
                                  border=self.mosaic_border)  # border to remove

    return img4, labels4


def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
    """Resize ``img`` to fit ``new_shape`` keeping aspect ratio, then pad.

    See https://github.com/ultralytics/yolov3/issues/232

    Args:
        img: image array, shape ``(height, width, ...)``.
        new_shape: target ``(height, width)``, or a single int for a square.
        color: constant border colour.
        auto: pad only up to the minimum rectangle (padding taken modulo 64).
        scaleFill: stretch to ``new_shape`` with no padding (used when ``auto``
            is false).
        scaleup: allow enlarging the image; when false it is only shrunk.

    Returns:
        ``(img, ratio, (dw, dh))``: the padded image, the (width, height)
        scale ratios, and the per-side padding in x and y.
    """
    shape = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)

    # Compute padding
    ratio = r, r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (new_shape[1], new_shape[0])
        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    # The -0.1 / +0.1 nudges make an odd total padding split as n and n+1
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img, ratio, (dw, dh)


def random_affine(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, border=(0, 0)):
    """Apply a random rotation, scale, translation and shear to image and boxes.

    Analogous to
    torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10))
    https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4

    Args:
        img: image array, shape ``(h, w, c)``.
        targets: rows of ``[cls, x1, y1, x2, y2]`` in pixels.
        degrees, translate, scale, shear: ranges of the random parameters.
        border: ``(y, x)`` border added to the output size on each side; the
            mosaic loader passes negative values to shrink the output.

    Returns:
        ``(img, targets)`` with boxes that became too small, too thin or too
        clipped removed.
    """
    height = img.shape[0] + border[0] * 2  # shape(h,w,c)
    width = img.shape[1] + border[1] * 2

    # Rotation and Scale
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    # a += random.choice([-180, -90, 0, 90])  # add 90deg rotations to small rotations
    s = random.uniform(1 - scale, 1 + scale)
    # s = 2 ** random.uniform(-scale, scale)
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s)

    # Translation
    T = np.eye(3)
    T[0, 2] = random.uniform(-translate, translate) * img.shape[1] + border[1]  # x translation (pixels)
    T[1, 2] = random.uniform(-translate, translate) * img.shape[0] + border[0]  # y translation (pixels)

    # Shear
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)

    # Combined rotation matrix
    M = S @ T @ R  # ORDER IS IMPORTANT HERE!!
    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
        img = cv2.warpAffine(img, M[:2], dsize=(width, height), flags=cv2.INTER_LINEAR, borderValue=(114, 114, 114))

    # Transform label coordinates
    n = len(targets)
    if n:
        # warp the four corners of every box
        xy = np.ones((n * 4, 3))
        xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
        xy = (xy @ M.T)[:, :2].reshape(n, 8)

        # create new axis-aligned boxes enclosing the warped corners
        x = xy[:, [0, 2, 4, 6]]
        y = xy[:, [1, 3, 5, 7]]
        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T

        # # apply angle-based reduction of bounding boxes
        # radians = a * math.pi / 180
        # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
        # x = (xy[:, 2] + xy[:, 0]) / 2
        # y = (xy[:, 3] + xy[:, 1]) / 2
        # w = (xy[:, 2] - xy[:, 0]) * reduction
        # h = (xy[:, 3] - xy[:, 1]) * reduction
        # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T

        # reject warped points outside of image
        xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
        xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
        w = xy[:, 2] - xy[:, 0]
        h = xy[:, 3] - xy[:, 1]
        area = w * h
        area0 = (targets[:, 3] - targets[:, 1]) * (targets[:, 4] - targets[:, 2])
        ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))  # aspect ratio
        # keep boxes > 2 px per side, > 20% of the scaled original area, aspect ratio < 20
        i = (w > 2) & (h > 2) & (area / (area0 * s + 1e-16) > 0.2) & (ar < 20)

        targets = targets[i]
        targets[:, 1:5] = xy[i]

    return img, targets


def create_folder(path='./new_folder'):
    """Create an empty folder at ``path``, deleting any existing one first."""
    if os.path.exists(path):
        shutil.rmtree(path)  # delete output folder
    os.makedirs(path)  # make new output folder
以上是我根据ultralytics\yolov5的dataset.py代码,结合自己的理解所做的注释,如果有错,欢迎指正,谢谢。
现在yolov5还在改进,一些代码随时会更新,例如train.py等,后续我会更新yolov5其他代码的解析和注释。
缓存