Yolov3代码实现
作者:互联网
Yolov3
- voc数据集构建文件
- VOC数据集dataset构建文件
- VOC2CSV
- Yolov3配置文件
- 模型backbone构建
- yolo head预测文件
- conv层模块代码文件
- 残差模块
- 激活函数模块
- 损失函数模块
- 特征金字塔yolo
- yolov3网络代码构建
voc数据集构建文件
import sys
import xml.etree.ElementTree as ET
import config.yolov3_config_voc as cfg
import os
from tqdm import tqdm
sys.path.append("..")
def parse_voc_annotation(data_path, file_type, anno_path, use_difficult_bbox=False):
    """
    Convert Pascal VOC XML annotations into one text line per image.

    Every written line has the form
    ``image_path xmin,ymin,xmax,ymax,cls_id xmin,ymin,xmax,ymax,cls_id ...``.
    Lines are appended to ``anno_path``, so several dataset splits can be merged
    into the same file by calling this function repeatedly.

    :param data_path: root directory of one VOC dataset, e.g. "./data/VOC"
    :param file_type: name of the split file in ImageSets/Main (without ".txt"),
        e.g. 'trainval', 'train', 'val'
    :param anno_path: text file that the annotation lines are appended to
    :param use_difficult_bbox: keep boxes whose ``difficult`` flag is 1 when True
    :return: number of image ids processed
    :raises ValueError: if an object's class name is not in cfg.DATA["CLASSES"]
    """
    classes = cfg.DATA["CLASSES"]
    img_inds_file = os.path.join(data_path, 'ImageSets', 'Main', file_type + '.txt')
    with open(img_inds_file, 'r') as f:
        # Ignore blank lines (e.g. a trailing newline) instead of treating them as image ids.
        image_ids = [line.strip() for line in f if line.strip()]

    with open(anno_path, 'a') as f:
        for image_id in tqdm(image_ids):
            image_path = os.path.join(data_path, 'JPEGImages', image_id + '.jpg')
            label_path = os.path.join(data_path, 'Annotations', image_id + '.xml')
            root = ET.parse(label_path).getroot()

            fields = [image_path]
            for obj in root.findall('object'):
                # Tolerate annotations without a <difficult> tag by treating them as not difficult
                # (0 = easy to recognise, 1 = difficult).
                difficult = (obj.findtext("difficult") or "0").strip()
                if (not use_difficult_bbox) and int(difficult) == 1:
                    continue
                bbox = obj.find('bndbox')
                class_id = classes.index(obj.find("name").text.lower().strip())
                coords = [bbox.find(tag).text.strip() for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
                fields.append(','.join(coords + [str(class_id)]))
            f.write(' '.join(fields) + '\n')
    return len(image_ids)
if __name__ == "__main__":
    def _voc_root(archive_dir, year_dir):
        """Return the VOC<year> directory inside one extracted VOC archive."""
        return os.path.join(cfg.DATA_PATH, archive_dir, 'VOCdevkit', year_dir)

    def _fresh_annotation_file(file_name):
        """Return the annotation path under ../data, deleting a stale file first."""
        path = os.path.join('../data', file_name)
        if os.path.exists(path):
            os.remove(path)
        return path

    # Training set: VOC2007 trainval + VOC2012 trainval.
    train_data_path_2007 = _voc_root('VOCtrainval-2007', 'VOC2007')
    train_data_path_2012 = _voc_root('VOCtrainval-2012', 'VOC2012')
    train_annotation_path = _fresh_annotation_file('train_annotation.txt')

    # Validation set: VOC2007 test.
    test_data_path_2007 = _voc_root('VOCtest-2007', 'VOC2007')
    test_annotation_path = _fresh_annotation_file('test_annotation.txt')

    # Both training splits are appended to the same annotation file.
    len_train_2007 = parse_voc_annotation(train_data_path_2007, "trainval", train_annotation_path,
                                          use_difficult_bbox=False)
    len_train_2012 = parse_voc_annotation(train_data_path_2012, "trainval", train_annotation_path,
                                          use_difficult_bbox=False)
    len_train = len_train_2007 + len_train_2012
    len_test = parse_voc_annotation(test_data_path_2007, "test", test_annotation_path, use_difficult_bbox=False)
    print("The number of images for train and test are :train : {0} | test : {1}".format(len_train, len_test))
VOC数据集dataset构建文件
import os
import sys
import torch
from torch.utils.data import Dataset, DataLoader
import config.yolov3_config_voc as cfg
import cv2
import numpy as np
import random
# from . import data_augment as dataAug
# from . import tools
import utils.data_augment as dataAug
import utils.tools as tools
sys.path.append("..")
sys.path.append("../utils")
class VocDataset(Dataset):
    """
    Dataset over the ``<split>_annotation.txt`` files of the project.

    Each item is an augmented image mixed with a second random sample, together
    with one label array per detection scale and the ground-truth boxes that
    were assigned to each scale.
    """

    # Capacity of the per-scale ground-truth box buffer. Boxes beyond this count
    # wrap around and overwrite the earliest entries (see __creat_label).
    MAX_BBOXES_PER_SCALE = 150

    def __init__(self, anno_file_type, img_size=416):
        """
        :param anno_file_type: 'train' or 'test'; selects the annotation file to read
        :param img_size: side length the images are resized to
        """
        self.img_size = img_size  # original note: "For Multi-training"
        self.classes = cfg.DATA["CLASSES"]
        self.num_classes = len(self.classes)
        self.class_to_id = dict(zip(self.classes, range(self.num_classes)))
        self.__annotations = self.__load_annotations(anno_file_type)

    def __len__(self):
        return len(self.__annotations)

    def __getitem__(self, item):
        """Return ``(img, label_s, label_m, label_l, boxes_s, boxes_m, boxes_l)`` as float tensors."""
        img_org, bboxes_org = self.__parse_annotation(self.__annotations[item])
        img_org = img_org.transpose(2, 0, 1)  # HWC -> CHW

        # A second, randomly chosen sample is blended in by Mixup.
        item_mix = random.randint(0, len(self.__annotations) - 1)
        img_mix, bboxes_mix = self.__parse_annotation(self.__annotations[item_mix])
        img_mix = img_mix.transpose(2, 0, 1)

        img, bboxes = dataAug.Mixup()(img_org, bboxes_org, img_mix, bboxes_mix)
        del img_org, bboxes_org, img_mix, bboxes_mix

        label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes = self.__creat_label(bboxes)

        img = torch.from_numpy(img).float()
        label_sbbox = torch.from_numpy(label_sbbox).float()
        label_mbbox = torch.from_numpy(label_mbbox).float()
        label_lbbox = torch.from_numpy(label_lbbox).float()
        sbboxes = torch.from_numpy(sbboxes).float()
        mbboxes = torch.from_numpy(mbboxes).float()
        lbboxes = torch.from_numpy(lbboxes).float()
        return img, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes

    @staticmethod
    def _drop_blank_lines(lines):
        """Return ``lines`` without entries that are empty or whitespace-only."""
        return [line for line in lines if line.strip()]

    def __load_annotations(self, anno_type):
        """Read the non-blank lines of ``<PROJECT_PATH>/data/<anno_type>_annotation.txt``."""
        assert anno_type in ['train', 'test'], "You must choice one of the 'train' or 'test' for anno_type parameter"
        anno_path = os.path.join(cfg.PROJECT_PATH, 'data', anno_type + "_annotation.txt")
        with open(anno_path, 'r') as f:
            # readlines() keeps the trailing "\n", so a plain length check never
            # removes blank lines; compare the stripped text instead.
            annotations = self._drop_blank_lines(f.readlines())
        assert len(annotations) > 0, "No images found in {}".format(anno_path)
        return annotations

    def __parse_annotation(self, annotation):
        """
        Load one annotated image and apply the augmentation pipeline.

        :param annotation: image path followed by the boxes, e.g.
            "image_path xmin,ymin,xmax,ymax,class_ind xmin,ymin,xmax,ymax,class_ind ..."
        :return: the augmented, resized image and its boxes; each input box is
            [xmin, ymin, xmax, ymax, class_ind]
        """
        anno = annotation.strip().split(' ')

        img_path = anno[0]
        img = cv2.imread(img_path)  # H*W*C and C=BGR
        assert img is not None, 'File Not Found ' + img_path
        bboxes = np.array([list(map(float, box.split(','))) for box in anno[1:]])

        img, bboxes = dataAug.RandomHorizontalFilp()(np.copy(img), np.copy(bboxes))
        img, bboxes = dataAug.RandomCrop()(np.copy(img), np.copy(bboxes))
        img, bboxes = dataAug.RandomAffine()(np.copy(img), np.copy(bboxes))
        img, bboxes = dataAug.Resize((self.img_size, self.img_size), True)(np.copy(img), np.copy(bboxes))
        return img, bboxes

    def __creat_label(self, bboxes):
        """
        Assign every ground-truth box to anchors and build the training labels.

        For each box:
        1. Convert its corners ("xyxy") to centre/size ("xywh") and scale that
           by the stride of every detection layer.
        2. On each layer, compare the box with the layer's anchors placed at the
           centre of the box's grid cell. Every anchor whose IoU exceeds 0.3 is
           assigned the box.
        3. If no anchor on any layer exceeds 0.3, the single anchor with the
           highest IoU over all layers is assigned the box.

        Consequently one box may be assigned to several anchors, on the same or
        on different layers.

        :param bboxes: rows of [xmin, ymin, xmax, ymax, class_ind, mix_weight]
        :return: (label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes).
            The last axis of a label is [x, y, w, h, conf, mix, class scores...];
            each boxes array has shape (MAX_BBOXES_PER_SCALE, 4) holding xywh.
        """
        anchors = np.array(cfg.MODEL["ANCHORS"])
        strides = np.array(cfg.MODEL["STRIDES"])
        train_output_size = self.img_size / strides
        anchors_per_scale = cfg.MODEL["ANCHORS_PER_SCLAE"]
        num_scales = len(strides)
        max_boxes = self.MAX_BBOXES_PER_SCALE

        label = [np.zeros((int(train_output_size[i]),
                           int(train_output_size[i]),
                           anchors_per_scale,
                           6 + self.num_classes)) for i in range(num_scales)]
        for i in range(num_scales):
            label[i][..., 5] = 1.0  # default mix weight

        bboxes_xywh = [np.zeros((max_boxes, 4)) for _ in range(num_scales)]  # original note: Darknet the max_num is 30
        bbox_count = np.zeros((num_scales,))

        for bbox in bboxes:
            bbox_coor = bbox[:4]
            bbox_class_ind = int(bbox[4])
            bbox_mix = bbox[5]

            # One-hot class vector, then label smoothing.
            one_hot = np.zeros(self.num_classes, dtype=np.float32)
            one_hot[bbox_class_ind] = 1.0
            one_hot_smooth = dataAug.LabelSmooth()(one_hot, self.num_classes)

            # Convert "xyxy" to "xywh".
            bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5,
                                        bbox_coor[2:] - bbox_coor[:2]], axis=-1)

            # One scaled copy of the box per detection layer.
            bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / strides[:, np.newaxis]

            iou = []
            exist_positive = False
            for i in range(num_scales):
                anchors_xywh = np.zeros((anchors_per_scale, 4))
                anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5  # cell centre
                anchors_xywh[:, 2:4] = anchors[i]

                iou_scale = tools.iou_xywh_numpy(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
                iou.append(iou_scale)
                iou_mask = iou_scale > 0.3

                if np.any(iou_mask):
                    xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32)

                    # Known limitation: when several boxes map to the same anchor,
                    # the anchor ends up assigned to the last one.
                    label[i][yind, xind, iou_mask, 0:4] = bbox_xywh
                    label[i][yind, xind, iou_mask, 4:5] = 1.0
                    label[i][yind, xind, iou_mask, 5:6] = bbox_mix
                    label[i][yind, xind, iou_mask, 6:] = one_hot_smooth

                    # Known limitation: the buffer size is a fixed prior and costs memory.
                    bbox_ind = int(bbox_count[i] % max_boxes)
                    bboxes_xywh[i][bbox_ind, :4] = bbox_xywh
                    bbox_count[i] += 1

                    exist_positive = True

            if not exist_positive:
                # Fall back to the best anchor over all layers.
                best_anchor_ind = int(np.argmax(np.array(iou).reshape(-1), axis=-1))
                best_detect = best_anchor_ind // anchors_per_scale
                best_anchor = best_anchor_ind % anchors_per_scale

                xind, yind = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)

                label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
                label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
                label[best_detect][yind, xind, best_anchor, 5:6] = bbox_mix
                label[best_detect][yind, xind, best_anchor, 6:] = one_hot_smooth

                bbox_ind = int(bbox_count[best_detect] % max_boxes)
                bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh
                bbox_count[best_detect] += 1

        label_sbbox, label_mbbox, label_lbbox = label
        sbboxes, mbboxes, lbboxes = bboxes_xywh

        return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes
if __name__ == "__main__":
    # Smoke test: load one training batch, print the tensor shapes and plot its boxes.
    voc_dataset = VocDataset(anno_file_type="train", img_size=448)
    dataloader = DataLoader(voc_dataset, shuffle=True, batch_size=1, num_workers=0)

    # Width of one label vector: x, y, w, h, conf, mix plus one slot per class
    # (derived instead of the previous hard-coded 26, which only fits 20 classes).
    label_dim = 6 + voc_dataset.num_classes

    for i, (img, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes) in enumerate(dataloader):
        if i == 0:
            print(img.shape)
            print(label_sbbox.shape)
            print(label_mbbox.shape)
            print(label_lbbox.shape)
            print(sbboxes.shape)
            print(mbboxes.shape)
            print(lbboxes.shape)

            if img.shape[0] == 1:
                # Flatten the three scales into one list of label vectors.
                labels = np.concatenate([label_sbbox.reshape(-1, label_dim),
                                         label_mbbox.reshape(-1, label_dim),
                                         label_lbbox.reshape(-1, label_dim)], axis=0)
                # Keep only positions that carry an object (conf > 0).
                labels_mask = labels[..., 4] > 0
                labels = np.concatenate([labels[labels_mask][..., :4],
                                         np.argmax(labels[labels_mask][..., 6:], axis=-1).reshape(-1, 1)],
                                        axis=-1)

                print(labels.shape)
                tools.plot_box(labels, img, id=1)
VOC2CSV
import os
import random
import math
import argparse
from tqdm import tqdm
import xml.etree.ElementTree as ET
def parse_args():
    """
    Parse the command-line options of the VOC-to-CSV converter.

    Options:
        -i / --indir    dataset root containing Annotations/ and JPEGImages/
        -p / --percent  fraction of the samples held out for validation (default 0.2)
        -t / --train    output CSV path for the training split
        -v / --val      output CSV path for the validation split
        -c / --classes  output CSV path for the class-name-to-id table

    :return: the populated ``argparse.Namespace``
    """
    parser = argparse.ArgumentParser()
    # ``type`` must be a callable; the string "str" made argparse raise ValueError.
    parser.add_argument("-i", "--indir", type=str, default="")
    parser.add_argument("-p", "--percent", type=float, default=0.2)
    parser.add_argument("-t", "--train", type=str, default="")
    parser.add_argument("-v", "--val", type=str, default="")
    parser.add_argument("-c", "--classes", type=str, default="")
    return parser.parse_args()
def get_file_index(indir, postfix):
    """
    Recursively collect the files under ``indir`` whose name ends with ``postfix``.

    :param indir: directory to walk; a missing directory yields an empty list
    :param postfix: required file-name suffix, e.g. ".xml"
    :return: list of matching file paths (each joined with its directory)
    """
    print(indir)
    # Match the suffix only: a substring test would also pick up names such as "a.xml.bak".
    return [os.path.join(root, name)
            for root, _dirs, files in os.walk(indir)
            for name in files
            if name.endswith(postfix)]
def convert_annotation(csv, address_list, image_paths=None):
    """
    Write the bounding boxes of the given VOC XML files to a CSV file.

    One row is written per object: ``image_path,xmin,ymin,xmax,ymax,class_name``.

    :param csv: path of the CSV file to create (overwritten if it exists)
    :param address_list: paths of the annotation XML files to convert
    :param image_paths: optional mapping from annotation path to image path;
        when omitted, the module-level ``file_dict`` is used as before
    :return: list with the class name of every written object (duplicates kept)
    """
    path_of = file_dict if image_paths is None else image_paths
    cls_list = []
    with open(csv, "w") as f:
        for address in tqdm(address_list):
            with open(address, encoding="utf-8") as in_file:
                xml_text = in_file.read()
            # Parse the text that was read; the file object itself is already closed.
            root = ET.XML(xml_text)
            for obj in root.iter("object"):
                cls = obj.find("name").text
                cls_list.append(cls)
                xmlbox = obj.find("bndbox")
                # Coordinates may be stored as floats in the XML; truncate them to int.
                b = tuple(int(float(xmlbox.find(tag).text)) for tag in ("xmin", "ymin", "xmax", "ymax"))
                f.write(path_of[address])
                f.write("," + ",".join([str(a) for a in b]) + "," + cls)
                f.write("\n")
    return cls_list
if __name__ == "__main__":
    # Split a VOC-style dataset into train/validation CSV files plus a class table.
    args = parse_args()
    file_address = args.indir
    test_percent = args.percent
    train_csv = args.train
    test_csv = args.val
    class_csv = args.classes

    Annotarions = get_file_index(file_address + "/Annotations", ".xml")
    Annotarions.sort()
    JPEGfiles = get_file_index(file_address + "/JPEGImages", ".jpg")
    JPEGfiles.sort()
    # Annotations and images are paired by sorted position, so the counts must match.
    assert len(Annotarions) == len(JPEGfiles)
    # convert_annotation() looks this mapping up as a module-level global.
    file_dict = dict(zip(Annotarions, JPEGfiles))

    num = len(Annotarions)
    # math.ceil (not the non-existent math.cell) rounds the hold-out size up.
    test = random.sample(Annotarions, math.ceil(num * test_percent))
    # Keep the sorted order for the training split instead of an arbitrary set order.
    held_out = set(test)
    train = [address for address in Annotarions if address not in held_out]

    cls_lsit1 = convert_annotation(train_csv, train)
    cls_lsit2 = convert_annotation(test_csv, test)
    # Sort so that the class ids are identical from run to run.
    cls_unique = sorted(set(cls_lsit1 + cls_lsit2))

    with open(class_csv, "w") as f:
        for i, cls in enumerate(cls_unique):
            f.write(cls + "," + str(i) + "\n")
Yolov3配置文件
# Dataset root directory and project root directory.
DATA_PATH = "./data/VOC"
PROJECT_PATH = r"E:/CV/CV-图像检测/yolov3"

# Class names and their count.
_VOC_CLASSES = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]
DATA = {
    "CLASSES": _VOC_CLASSES,
    "NUM": len(_VOC_CLASSES),
}

# Model settings: anchors per detection scale, strides, anchors per grid cell.
_ANCHORS_SMALL = [(1.25, 1.625), (2.0, 3.75), (4.125, 2.875)]
_ANCHORS_MEDIUM = [(1.875, 3.8125), (3.875, 2.8125), (3.6875, 7.4375)]
_ANCHORS_LARGE = [(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]
MODEL = {
    "ANCHORS": [_ANCHORS_SMALL, _ANCHORS_MEDIUM, _ANCHORS_LARGE],
    "STRIDES": [8, 16, 32],
    "ANCHORS_PER_SCLAE": 3,
}

# Training settings.
TRAIN = {
    "TRAIN_IMG_SIZE": 448,
    "AUGMENT": True,
    "BATCH_SIZE": 4,
    "MULTI_SCALE_TRAIN": True,
    "IOU_THRESHOLD_LOSS": 0.5,
    "EPOCHS": 50,
    "NUMBER_WORKERS": 4,
    "MOMENTUM": 0.9,
    "WEIGHT_DECAY": 0.0005,
    "LR_INIT": 1e-4,
    "LR_END": 1e-6,
    "WARMUP_EPOCHS": 2,  # or None
}

# Test / evaluation settings.
TEST = {
    "TEST_IMG_SIZE": 448,
    "BATCH_SIZE": 4,
    "NUMBER_WORKERS": 2,
    "CONF_THRESH": 0.01,
    "NMS_THRESH": 0.5,
    "MULTI_SCALE_TEST": False,
    "FLIP_TEST": False,
}
模型backbone构建
import torch.nn as nn
from ..layers.conv_module import Convolutional
from ..layers.blocks_module import Residual_block
class Darknet53(nn.Module):
    """
    Darknet-53 feature extractor.

    A stem convolution is followed by five stages. Each stage starts with a
    stride-2 3x3 convolution that doubles the channel count and continues with
    1, 2, 8, 8 and 4 residual blocks respectively. The outputs of the last
    three stages (256, 512 and 1024 channels) are returned.

    The sub-modules are created in forward order; attribute names and creation
    order are kept stable because they define the parameter names and the
    module iteration order.
    """

    def __init__(self):
        super(Darknet53, self).__init__()

        def conv3x3(c_in, c_out, stride):
            # 3x3 convolution + batch norm + leaky ReLU.
            return Convolutional(filters_in=c_in, filters_out=c_out, kernel_size=3, stride=stride, pad=1,
                                 norm='bn', activate='leaky')

        def residual(channels):
            # Residual block whose bottleneck has half the channels.
            return Residual_block(filters_in=channels, filters_out=channels, filters_medium=channels // 2)

        # Stem
        self.__conv = conv3x3(3, 32, 1)

        # Stage 1: 64 channels, 1 residual block
        self.__conv_5_0 = conv3x3(32, 64, 2)
        self.__rb_5_0 = residual(64)

        # Stage 2: 128 channels, 2 residual blocks
        self.__conv_5_1 = conv3x3(64, 128, 2)
        self.__rb_5_1_0 = residual(128)
        self.__rb_5_1_1 = residual(128)

        # Stage 3: 256 channels, 8 residual blocks
        self.__conv_5_2 = conv3x3(128, 256, 2)
        self.__rb_5_2_0 = residual(256)
        self.__rb_5_2_1 = residual(256)
        self.__rb_5_2_2 = residual(256)
        self.__rb_5_2_3 = residual(256)
        self.__rb_5_2_4 = residual(256)
        self.__rb_5_2_5 = residual(256)
        self.__rb_5_2_6 = residual(256)
        self.__rb_5_2_7 = residual(256)

        # Stage 4: 512 channels, 8 residual blocks
        self.__conv_5_3 = conv3x3(256, 512, 2)
        self.__rb_5_3_0 = residual(512)
        self.__rb_5_3_1 = residual(512)
        self.__rb_5_3_2 = residual(512)
        self.__rb_5_3_3 = residual(512)
        self.__rb_5_3_4 = residual(512)
        self.__rb_5_3_5 = residual(512)
        self.__rb_5_3_6 = residual(512)
        self.__rb_5_3_7 = residual(512)

        # Stage 5: 1024 channels, 4 residual blocks
        self.__conv_5_4 = conv3x3(512, 1024, 2)
        self.__rb_5_4_0 = residual(1024)
        self.__rb_5_4_1 = residual(1024)
        self.__rb_5_4_2 = residual(1024)
        self.__rb_5_4_3 = residual(1024)

    def forward(self, x):
        """Return the stage-3, stage-4 and stage-5 feature maps (small, medium, large)."""
        x = self.__conv(x)

        x = self.__rb_5_0(self.__conv_5_0(x))

        x = self.__conv_5_1(x)
        for block in (self.__rb_5_1_0, self.__rb_5_1_1):
            x = block(x)

        x = self.__conv_5_2(x)
        for block in (self.__rb_5_2_0, self.__rb_5_2_1, self.__rb_5_2_2, self.__rb_5_2_3,
                      self.__rb_5_2_4, self.__rb_5_2_5, self.__rb_5_2_6, self.__rb_5_2_7):
            x = block(x)
        feat_small = x

        x = self.__conv_5_3(x)
        for block in (self.__rb_5_3_0, self.__rb_5_3_1, self.__rb_5_3_2, self.__rb_5_3_3,
                      self.__rb_5_3_4, self.__rb_5_3_5, self.__rb_5_3_6, self.__rb_5_3_7):
            x = block(x)
        feat_medium = x

        x = self.__conv_5_4(x)
        for block in (self.__rb_5_4_0, self.__rb_5_4_1, self.__rb_5_4_2, self.__rb_5_4_3):
            x = block(x)
        feat_large = x

        return feat_small, feat_medium, feat_large
yolo head预测文件
import torch.nn as nn
import torch
class Yolo_head(nn.Module):
    """
    Detection head of one scale: reshapes the raw convolution output and
    decodes it into boxes expressed in input-image pixels.
    """

    def __init__(self, nC, anchors, stride):
        """
        :param nC: number of classes
        :param anchors: tensor with one (w, h) row per anchor of this scale
        :param stride: down-sampling factor of this scale relative to the input image
        """
        super(Yolo_head, self).__init__()

        self.__anchors = anchors
        self.__nA = len(anchors)
        self.__nC = nC
        self.__stride = stride

    def forward(self, p):
        """
        :param p: raw output of shape [bs, nA * (5 + nC), grid, grid]
        :return: tuple ``(p, p_de)`` where ``p`` is the raw output rearranged to
            [bs, grid, grid, nA, 5 + nC] and ``p_de`` is its decoded counterpart
        """
        bs, nG = p.shape[0], p.shape[-1]
        p = p.view(bs, self.__nA, 5 + self.__nC, nG, nG).permute(0, 3, 4, 1, 2)

        p_de = self.__decode(p.clone())

        return (p, p_de)

    def __decode(self, p):
        """
        Decode raw offsets into ``[x, y, w, h, conf, class probs...]``.

        In training mode the result keeps the shape of ``p``; otherwise it is
        flattened to [-1, 5 + nC].
        """
        batch_size, output_size = p.shape[:2]

        device = p.device
        stride = self.__stride
        anchors = (1.0 * self.__anchors).to(device)

        conv_raw_dxdy = p[:, :, :, :, 0:2]
        conv_raw_dwdh = p[:, :, :, :, 2:4]
        conv_raw_conf = p[:, :, :, :, 4:5]
        conv_raw_prob = p[:, :, :, :, 5:]

        # grid_xy[row, col] = (col, row): the top-left corner of every grid cell.
        y = torch.arange(0, output_size).unsqueeze(1).repeat(1, output_size)
        x = torch.arange(0, output_size).unsqueeze(0).repeat(output_size, 1)
        grid_xy = torch.stack([x, y], dim=-1)
        # Repeat once per anchor of this head (previously hard-coded to 3).
        grid_xy = grid_xy.unsqueeze(0).unsqueeze(3).repeat(batch_size, 1, 1, self.__nA, 1).float().to(device)

        pred_xy = (torch.sigmoid(conv_raw_dxdy) + grid_xy) * stride
        pred_wh = (torch.exp(conv_raw_dwdh) * anchors) * stride
        pred_xywh = torch.cat([pred_xy, pred_wh], dim=-1)
        pred_conf = torch.sigmoid(conv_raw_conf)
        pred_prob = torch.sigmoid(conv_raw_prob)
        pred_bbox = torch.cat([pred_xywh, pred_conf, pred_prob], dim=-1)

        return pred_bbox.view(-1, 5 + self.__nC) if not self.training else pred_bbox
conv层模块代码文件
import torch
import torch.nn as nn
import torch.nn.functional as F
from .activate import *
# Lookup tables from the string identifiers accepted by Convolutional to layer classes.
norm_name = {"bn": nn.BatchNorm2d}
activate_name = {
    "relu": nn.ReLU,
    "leaky": nn.LeakyReLU,
    "mish": Mish}


class Convolutional(nn.Module):
    """
    Conv2d optionally followed by a normalisation layer and an activation.
    """

    def __init__(self, filters_in, filters_out, kernel_size, stride, pad, norm=None, activate=None):
        """
        :param filters_in: number of input channels
        :param filters_out: number of output channels
        :param kernel_size: convolution kernel size
        :param stride: convolution stride
        :param pad: convolution padding
        :param norm: key of ``norm_name`` or None for no normalisation
        :param activate: key of ``activate_name`` or None for no activation
        """
        super(Convolutional, self).__init__()

        self.norm = norm
        self.activate = activate

        # The convolution carries a bias only when no normalisation layer follows.
        self.__conv = nn.Conv2d(in_channels=filters_in, out_channels=filters_out, kernel_size=kernel_size,
                                stride=stride, padding=pad, bias=not norm)
        if norm:
            assert norm in norm_name.keys()
            if norm == "bn":
                self.__norm = norm_name[norm](num_features=filters_out)

        if activate:
            assert activate in activate_name.keys()
            if activate == "leaky":
                self.__activate = activate_name[activate](negative_slope=0.1, inplace=True)
            elif activate == "relu":
                self.__activate = activate_name[activate](inplace=True)
            else:
                # Remaining registered activations (e.g. "mish") take no constructor
                # arguments. Previously they passed the assert but were never
                # created, so forward() failed on the missing attribute.
                self.__activate = activate_name[activate]()

    def forward(self, x):
        """Apply convolution, then normalisation and activation when configured."""
        x = self.__conv(x)
        if self.norm:
            x = self.__norm(x)
        if self.activate:
            x = self.__activate(x)

        return x
残差模块
import torch.nn as nn
from ..layers.conv_module import Convolutional
class Residual_block(nn.Module):
    """
    Residual unit: a 1x1 convolution to ``filters_medium`` channels, a 3x3
    convolution to ``filters_out`` channels, and an identity shortcut that is
    added to the result.
    """

    def __init__(self, filters_in, filters_out, filters_medium):
        super(Residual_block, self).__init__()
        shared = dict(stride=1, norm="bn", activate="leaky")
        self.__conv1 = Convolutional(filters_in=filters_in, filters_out=filters_medium,
                                     kernel_size=1, pad=0, **shared)
        self.__conv2 = Convolutional(filters_in=filters_medium, filters_out=filters_out,
                                     kernel_size=3, pad=1, **shared)

    def forward(self, x):
        """Return ``x`` plus the output of the two stacked convolutions."""
        residual = self.__conv2(self.__conv1(x))
        return x + residual
激活函数模块
import torch
import torch.nn as nn
import torch.nn.functional as F
class Mish(nn.Module):
    """Mish activation: ``x * tanh(softplus(x))``."""

    def __init__(self):
        # ``super(Mish)`` without ``self`` never ran nn.Module.__init__, leaving the
        # module without its internal state, so it could not be called.
        super(Mish, self).__init__()

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        x = x * (torch.tanh(F.softplus(x)))
        return x
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        gate = torch.sigmoid(x)
        return x * gate
损失函数模块
import sys
sys.path.append("../utils")
import torch
import torch.nn as nn
from utils import tools
import config.yolov3_config_voc as cfg
class FocalLoss(nn.Module):
    """
    Focal variant of binary cross-entropy on logits.

    The element-wise BCE is weighted by ``alpha * |target - sigmoid(input)| ** gamma``
    and then reduced according to ``reduction``.
    """

    def __init__(self, gamma=2.0, alpha=1.0, reduction="mean"):
        """
        :param gamma: exponent of the modulating factor
        :param alpha: constant scale applied to the loss
        :param reduction: "none", "mean" or "sum"
        :raises ValueError: if ``reduction`` is not one of the supported values
        """
        super(FocalLoss, self).__init__()
        if reduction not in ("none", "mean", "sum"):
            raise ValueError("{} is not a valid value for reduction".format(reduction))

        self.__gamma = gamma
        self.__alpha = alpha
        self.__reduction = reduction
        # Always compute the element-wise BCE: the focal weight has to be applied
        # per element *before* any reduction. Reducing first produced a scalar
        # that could not be multiplied in place by the element-wise factor.
        self.__loss = nn.BCEWithLogitsLoss(reduction="none")

    def forward(self, input, target):
        """
        :param input: logits
        :param target: targets with the same shape as ``input``
        :return: element-wise loss for reduction "none", otherwise a scalar
        """
        loss = self.__loss(input=input, target=target)
        loss = loss * (self.__alpha * torch.pow(torch.abs(target - torch.sigmoid(input)), self.__gamma))

        if self.__reduction == "mean":
            return loss.mean()
        if self.__reduction == "sum":
            return loss.sum()
        return loss
class YoloV3Loss(nn.Module):
    """
    YOLOv3 training loss: GIoU box regression + focal confidence + BCE
    classification, summed over the three detection layers.
    """

    def __init__(self, anchors, strides, iou_threshold_loss=0.5):
        """
        :param anchors: accepted for interface compatibility; not used by this class
        :param strides: stride of each detection layer, indexed small/medium/large
        :param iou_threshold_loss: IoU below which an unassigned prediction counts as background
        """
        super(YoloV3Loss, self).__init__()
        self.__iou_threshold_loss = iou_threshold_loss
        self.__strides = strides

    def forward(self, p, p_d, label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes):
        """
        :param p: raw predictions of the three detection layers, [p0, p1, p2],
            e.g. p0 = [bs, grid, grid, anchors, tx+ty+tw+th+conf+cls_20]
        :param p_d: decoded predictions, expressed in image pixels,
            e.g. p_d0 = [bs, grid, grid, anchors, x+y+w+h+conf+cls_20]
        :param label_sbbox: label of the small-object layer, in image pixels,
            shape [bs, grid, grid, anchors, x+y+w+h+conf+mix+cls_20]
        :param label_mbbox: same as label_sbbox for the medium layer
        :param label_lbbox: same as label_sbbox for the large layer
        :param sbboxes: ground-truth boxes of the small layer, in image pixels,
            shape [bs, 150, x+y+w+h]
        :param mbboxes: same as sbboxes for the medium layer
        :param lbboxes: same as sbboxes for the large layer
        :return: (total loss, GIoU loss, confidence loss, class loss)
        """
        strides = self.__strides
        layer_inputs = ((label_sbbox, sbboxes), (label_mbbox, mbboxes), (label_lbbox, lbboxes))
        per_layer = [self.__cal_loss_per_layer(p[idx], p_d[idx], layer_label, layer_boxes, strides[idx])
                     for idx, (layer_label, layer_boxes) in enumerate(layer_inputs)]
        (loss_s, loss_s_giou, loss_s_conf, loss_s_cls), \
            (loss_m, loss_m_giou, loss_m_conf, loss_m_cls), \
            (loss_l, loss_l_giou, loss_l_conf, loss_l_cls) = per_layer

        loss = loss_l + loss_m + loss_s
        loss_giou = loss_s_giou + loss_m_giou + loss_l_giou
        loss_conf = loss_s_conf + loss_m_conf + loss_l_conf
        loss_cls = loss_s_cls + loss_m_cls + loss_l_cls

        return loss, loss_giou, loss_conf, loss_cls

    def __cal_loss_per_layer(self, p, p_d, label, bboxes, stride):
        """
        Compute the loss of one detection layer.

        (1) Box regression: GIoU loss (https://arxiv.org/abs/1902.09630),
            weighted by 2 - w*h/img_size**2 to balance the influence of small
            and large boxes.
        (2) Confidence: focal loss over foreground positions and over
            background positions. A position counts as background only when
            its best IoU with every ground-truth box is below the threshold.
        (3) Classification: per-class binary cross-entropy on foreground positions.

        :param p: raw predictions of this layer
        :param p_d: decoded predictions of this layer
        :param label: label array of this layer
        :param bboxes: ground-truth boxes assigned to this layer
        :param stride: scale of the feature map relative to the input image
        :return: (loss, loss_giou, loss_conf, loss_cls), each summed over all
            positions and divided by the batch size
        """
        bce = nn.BCEWithLogitsLoss(reduction="none")
        focal = FocalLoss(gamma=2, alpha=1.0, reduction="none")

        batch_size, grid = p.shape[:2]
        img_size = stride * grid

        # Split predictions and labels into their components.
        p_conf, p_cls = p[..., 4:5], p[..., 5:]
        p_d_xywh = p_d[..., :4]

        label_xywh = label[..., :4]
        label_obj_mask = label[..., 4:5]
        label_mix = label[..., 5:6]
        label_cls = label[..., 6:]

        # --- box regression ---
        giou = tools.GIOU_xywh_torch(p_d_xywh, label_xywh).unsqueeze(-1)
        box_area = label_xywh[..., 2:3] * label_xywh[..., 3:4]
        bbox_loss_scale = 2.0 - 1.0 * box_area / (img_size ** 2)
        loss_giou = label_obj_mask * bbox_loss_scale * (1.0 - giou) * label_mix

        # --- confidence ---
        iou = tools.iou_xywh_torch(p_d_xywh.unsqueeze(4), bboxes.unsqueeze(1).unsqueeze(1).unsqueeze(1))
        iou_max = iou.max(-1, keepdim=True)[0]
        is_background = (iou_max < self.__iou_threshold_loss).float()
        label_noobj_mask = (1.0 - label_obj_mask) * is_background

        conf_focal = focal(input=p_conf, target=label_obj_mask)
        loss_conf = (label_obj_mask * conf_focal + label_noobj_mask * conf_focal) * label_mix

        # --- classification ---
        loss_cls = label_obj_mask * bce(input=p_cls, target=label_cls) * label_mix

        # Sum over all positions and average over the batch.
        loss_giou = torch.sum(loss_giou) / batch_size
        loss_conf = torch.sum(loss_conf) / batch_size
        loss_cls = torch.sum(loss_cls) / batch_size
        loss = loss_giou + loss_conf + loss_cls

        return loss, loss_giou, loss_conf, loss_cls
特征金字塔yolo
import torch
import torch.nn as nn
import torch.nn.functional as F
from ..layers.conv_module import Convolutional
class Upsample(nn.Module):
    """Module wrapper around ``F.interpolate`` with a fixed scale factor and mode."""

    def __init__(self, scale_factor=1, mode='nearest'):
        super().__init__()
        self.scale_factor, self.mode = scale_factor, mode

    def forward(self, x):
        """Return ``x`` resized by ``scale_factor`` using interpolation ``mode``."""
        resized = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
        return resized
class Route(nn.Module):
    """Concatenate two feature maps along the channel dimension."""

    def __init__(self):
        super().__init__()

    def forward(self, x1, x2):
        """
        :param x1: previous output; placed second in the result
        :param x2: current output; placed first in the result
        :return: ``x2`` and ``x1`` joined along dim 1
        """
        return torch.cat([x2, x1], 1)
class FPN_YOLOV3(nn.Module):
    """
    Feature pyramid of YOLOv3 (differs from the original FPN and RetinaNet's FPN).

    The coarsest feature map is processed first. A reduced, 2x up-sampled copy
    of each branch is concatenated with the next finer backbone feature map.
    Every branch ends in a 1x1 convolution without normalisation or
    activation that produces the raw predictions.
    """

    def __init__(self, fileters_in, fileters_out):
        """
        :param fileters_in: input channels of the (large, medium, small) backbone features
        :param fileters_out: output channels of the (large, medium, small) prediction maps
        """
        super(FPN_YOLOV3, self).__init__()

        fi_0, fi_1, fi_2 = fileters_in
        fo_0, fo_1, fo_2 = fileters_out

        def cbl(c_in, c_out, kernel):
            # Convolution + batch norm + leaky ReLU; padding keeps the spatial size.
            return Convolutional(filters_in=c_in, filters_out=c_out, kernel_size=kernel, stride=1,
                                 pad=kernel // 2, norm="bn", activate="leaky")

        def five_convs(c_in, narrow):
            # Alternating 1x1 (narrow) and 3x3 (2 * narrow) convolutions, ending narrow.
            wide = narrow * 2
            return nn.Sequential(
                cbl(c_in, narrow, 1),
                cbl(narrow, wide, 3),
                cbl(wide, narrow, 1),
                cbl(narrow, wide, 3),
                cbl(wide, narrow, 1),
            )

        def predictor(c_in, c_out):
            # Plain 1x1 convolution: no normalisation, no activation.
            return Convolutional(filters_in=c_in, filters_out=c_out, kernel_size=1, stride=1, pad=0)

        # Large-object branch (author's note: input 14*14*1024).
        self.__conv_set_0 = five_convs(fi_0, 512)
        self.__conv0_0 = cbl(512, 1024, 3)
        self.__conv0_1 = predictor(1024, fo_0)

        # Reduce and up-sample before merging with the medium-scale feature map.
        self.__conv0 = cbl(512, 256, 1)
        self.__upsample0 = Upsample(scale_factor=2)
        self.__route0 = Route()

        # Medium-object branch.
        self.__conv_set_1 = five_convs(fi_1 + 256, 256)
        self.__conv1_0 = cbl(256, 512, 3)
        self.__conv1_1 = predictor(512, fo_1)

        # Reduce and up-sample before merging with the small-scale feature map.
        self.__conv1 = cbl(256, 128, 1)
        self.__upsample1 = Upsample(scale_factor=2)
        self.__route1 = Route()

        # Small-object branch.
        self.__conv_set_2 = five_convs(fi_2 + 128, 128)
        self.__conv2_0 = cbl(128, 256, 3)
        self.__conv2_1 = predictor(256, fo_2)

    def forward(self, x0, x1, x2):  # large, medium, small
        """
        :param x0: backbone feature map of the large-object scale
        :param x1: backbone feature map of the medium-object scale
        :param x2: backbone feature map of the small-object scale
        :return: raw prediction maps ordered (small, medium, large)
        """
        # large
        feat_l = self.__conv_set_0(x0)
        pred_l = self.__conv0_1(self.__conv0_0(feat_l))

        # medium
        up_l = self.__upsample0(self.__conv0(feat_l))
        feat_m = self.__conv_set_1(self.__route0(x1, up_l))
        pred_m = self.__conv1_1(self.__conv1_0(feat_m))

        # small
        up_m = self.__upsample1(self.__conv1(feat_m))
        feat_s = self.__conv_set_2(self.__route1(x2, up_m))
        pred_s = self.__conv2_1(self.__conv2_0(feat_s))

        return pred_s, pred_m, pred_l  # small, medium, large
yolov3网络代码构建
import sys
import torch.nn as nn
import torch
import config.yolov3_config_voc as cfg
import numpy as np
from model.backbones.darknet53 import Darknet53
from model.necks.yolo_fpn import FPN_YOLOV3
from model.head.yolo_head import Yolo_head
from model.layers.conv_module import Convolutional
from utils.tools import *
sys.path.append("..")
# AbsolutePath = os.path.abspath(__file__) #将相对路径转换成绝对路径
# SuperiorCatalogue = os.path.dirname(AbsolutePath) #相对路径的上级路径
# BaseDir = os.path.dirname(SuperiorCatalogue) #在“SuperiorCatalogue”的基础上在脱掉一层路径,得到我们想要的路径。
# sys.path.insert(0,BaseDir) #将我们取出来的路径加入
class Yolov3(nn.Module):
    """
    Complete YOLOv3 network: Darknet-53 backbone, YOLO feature pyramid and one
    detection head per scale.

    Note: the sub-modules must be defined in order in ``__init__`` because the
    Darknet weight file is read sequentially in module order.
    """

    def __init__(self, init_weights=True):
        """
        :param init_weights: initialise convolution and batch-norm parameters when True
        """
        super(Yolov3, self).__init__()

        self.__anchors = torch.FloatTensor(cfg.MODEL["ANCHORS"])
        self.__strides = torch.FloatTensor(cfg.MODEL["STRIDES"])
        self.__nC = cfg.DATA["NUM"]
        # Channels of every prediction map: anchors per cell * (classes + 5).
        self.__out_channel = cfg.MODEL["ANCHORS_PER_SCLAE"] * (self.__nC + 5)

        self.__backnone = Darknet53()
        self.__fpn = FPN_YOLOV3(fileters_in=[1024, 512, 256],
                                fileters_out=[self.__out_channel] * 3)

        # Heads for the small, medium and large scale, in this order.
        self.__head_s = Yolo_head(nC=self.__nC, anchors=self.__anchors[0], stride=self.__strides[0])
        self.__head_m = Yolo_head(nC=self.__nC, anchors=self.__anchors[1], stride=self.__strides[1])
        self.__head_l = Yolo_head(nC=self.__nC, anchors=self.__anchors[2], stride=self.__strides[2])

        if init_weights:
            self.__init_weights()

    def forward(self, x):
        """
        :param x: input image batch
        :return: ``(p, p_d)``. ``p`` is the tuple of raw head outputs ordered
            (small, medium, large). In training mode ``p_d`` is the tuple of
            decoded outputs; otherwise the decoded outputs concatenated along dim 0.
        """
        feat_s, feat_m, feat_l = self.__backnone(x)
        # The pyramid takes its inputs coarse-to-fine and returns them fine-to-coarse.
        pred_s, pred_m, pred_l = self.__fpn(feat_l, feat_m, feat_s)

        heads = (self.__head_s, self.__head_m, self.__head_l)
        out = [head(pred) for head, pred in zip(heads, (pred_s, pred_m, pred_l))]

        p, p_d = list(zip(*out))
        if self.training:
            return p, p_d  # small, medium, large
        return p, torch.cat(p_d, 0)

    def __init_weights(self):
        """
        Initialise Conv2d weights from N(0, 0.01) with zero bias, and BatchNorm2d
        with weight 1 and bias 0.
        """
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                torch.nn.init.constant_(m.weight.data, 1.0)
                torch.nn.init.constant_(m.bias.data, 0.0)
            elif isinstance(m, nn.Conv2d):
                torch.nn.init.normal_(m.weight.data, 0.0, 0.01)
                if m.bias is not None:
                    m.bias.data.zero_()
            else:
                continue
            print("initing {}".format(m))

    def load_darknet_weights(self, weight_file, cutoff=52):
        """
        Load Darknet weights into the first ``cutoff`` Convolutional modules.

        Adapted from https://github.com/ultralytics/yolov3/blob/master/models.py

        The file is read as five int32 values (skipped) followed by float32
        values. For every Convolutional module, in module order, the values
        are consumed as: batch-norm bias, weight, running mean and running
        variance (or the convolution bias when there is no batch norm),
        followed by the convolution weights.

        :param weight_file: path of the Darknet weight file
        :param cutoff: number of Convolutional modules to load before stopping
        """
        print("load darknet weights : ", weight_file)

        with open(weight_file, 'rb') as f:
            _ = np.fromfile(f, dtype=np.int32, count=5)
            weights = np.fromfile(f, dtype=np.float32)

        offset = 0

        def consume(target):
            # Copy the next target.numel() values of the weight stream into ``target``.
            nonlocal offset
            n = target.numel()
            values = torch.from_numpy(weights[offset:offset + n]).view_as(target)
            target.data.copy_(values)
            offset += n

        loaded = 0
        for m in self.modules():
            if not isinstance(m, Convolutional):
                continue
            # Only the backbone's convolutions are initialised from the file.
            if loaded == cutoff:
                break
            loaded += 1

            conv_layer = m._Convolutional__conv
            if m.norm == "bn":
                bn_layer = m._Convolutional__norm
                for tensor in (bn_layer.bias, bn_layer.weight, bn_layer.running_mean, bn_layer.running_var):
                    consume(tensor)
                print("loading weight {}".format(bn_layer))
            else:
                consume(conv_layer.bias)

            consume(conv_layer.weight)
            print("loading weight {}".format(conv_layer))
标签: loss, .__, Yolov3, 实现, 代码, label, filters, self, out
来源: https://blog.csdn.net/gg13213/article/details/122134570