fcos: debug记录

2021-03-12 17:58:39 作者：互联网

fcos：demo代码debug记录

源代码来源：
https://github.com/tianzhi0549/FCOS
使用默认配置开始debug fcos_demo.py
前面都是参数设置：

每个类别的阈值设置，对应coco数据集的80个类别

thresholds_for_classes

得到用来测试的图片列表

demo_im_names = os.listdir(args.images_dir)

建立模型

coco_demo = COCODemo(
    cfg,
    confidence_thresholds_for_classes=thresholds_for_classes,
    min_image_size=args.min_image_size
)

模型建立以后再看，先debug流程

得到结果

composite = coco_demo.run_on_opencv_image(img) 	# img为传入图像

重要的是网络怎么运行得到的结果

run_on_opencv_image

# 得到预测结果
predictions = self.compute_prediction(image)
# 挑选出合适的label
top_predictions = self.select_top_predictions(predictions)

result = image.copy()
if self.show_mask_heatmaps:
    return self.create_mask_montage(result, top_predictions)
# 画框
result = self.overlay_boxes(result, top_predictions)
if self.cfg.MODEL.MASK_ON:
    result = self.overlay_mask(result, top_predictions)
if self.cfg.MODEL.KEYPOINT_ON:
    result = self.overlay_keypoints(result, top_predictions)
# 画className和score
result = self.overlay_class_names(result, top_predictions)
# 返回结果
return result

compute_prediction

首先对传入的图像进行transforms转换，其定义如下：

 transform = T.Compose(
 [
 		 T.ToPILImage(),		#转换图像为PIL格式，因为transforms支持的格式为PIL
 		 T.Resize(self.min_image_size),	#和论文中一致，将短边resize到800
 		 T.ToTensor(),	# 转换到tensor格式
 		  to_bgr_transform,	#  to_bgr_transform = T.Lambda(lambda x: x * 255)每个像素都乘以255
 		  normalize_transform, # 像素标准化（Normalize），均值为cfg.INPUT.PIXEL_MEAN，标准差为cfg.INPUT.PIXEL_STD
 ]
)
# 调用：image = self.transforms(original_image)
# shape变化：
# 输入图像shape(427,640,3) ->（3，800，1199）

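将所有的图片都补充到同一个大小。由于输入的图片大小可能不一致，经过transforms后短边的长度相同，但长边的长度各不相同，而pytorch要求同一个batch内的图像大小相同，所以它以batch中最大的尺寸为基准，给尺寸较小的图像补充0到同样的大小。由于我们在demo阶段每次只输入一张图片，所以这一步的batch对齐相当于什么都没有做。注意在这里还会把尺寸填充到某个整数倍，防止下采样时出现小数的情况：从后文的shape可以看到，长边由1199被填充到了1216（1216能被32整除，而大于1199的最小的8的倍数是1200），也就是按32对齐，而不只是能被8整除。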

images = to_image_list(images)

传入模型得到输出

predictions = self.model(image_list)	
predictions = [o.to(self.cpu_device) for o in predictions]

# always single image is passed at a time
prediction = predictions[0]

# reshape prediction (a BoxList) into the original image size
# 得到原始图像大小
height, width = original_image.shape[:-1]
prediction = prediction.resize((width, height))

if prediction.has_field("mask"):
    # if we have masks, paste the masks in the right position
    # in the image, as defined by the bounding boxes
    masks = prediction.get_field("mask")
    # always single image is passed at a time
    masks = self.masker([masks], [prediction])[0]
    prediction.add_field("mask", masks)
return prediction

select_top_predictions

scores = predictions.get_field("scores")
labels = predictions.get_field("labels")
thresholds = self.confidence_thresholds_for_classes[(labels - 1).long()]
keep = torch.nonzero(scores > thresholds).squeeze(1)
predictions = predictions[keep]
scores = predictions.get_field("scores")
_, idx = scores.sort(0, descending=True)
#返回得分大于各自类别阈值的预测，并按得分降序排列
return predictions[idx]

model

得到backbone（主干特征提取网络的输出）默认是resnet

features = self.backbone(images.tensors)

features包含5个特征层对应的shape分别为：
[(1,256,100,152), (1,256,50,76), (1,256,25,38), (1,256,13,19), (1,256,7,10)]。最后将features传入rpn网络

# 这里的targets为None
proposals, proposal_losses = self.rpn(images, features, targets)  
if self.roi_heads:
    x, result, detector_losses = self.roi_heads(features, proposals, targets)
else:
    # 只使用rpn而没有使用roi-head
    x = features
    result = proposals
    detector_losses = {}

if self.training:
    losses = {}
    losses.update(detector_losses)
    losses.update(proposal_losses)
    return losses

return result

rpn

# 得到预测
box_cls, box_regression, centerness = self.head(features)
# 计算位置
locations = self.compute_locations(features)

if self.training:
    return self._forward_train(
        locations, box_cls, 
        box_regression, 
        centerness, targets
    )
else:
	# 返回值
    return self._forward_test(
        locations, box_cls, box_regression, 
        centerness, images.image_sizes
    )

head

head的输入是backbone输出的特征层features（调用方式为self.head(features)，并不传入图像）。进入for循环对每个特征图进行遍历：

 for l, feature in enumerate(x):
 
 	  # cls_tower = 3x3卷积（保持特征图尺寸和通道数不变）+ 组归一化（GroupNorm）+ relu
 	  #（分为32个组，由于特征图的通道数都是256，所以每组包含256//32 = 8个通道）
      cls_tower = self.cls_tower(feature)
      # box_tower 与cls_tower相同处理（分别）
      box_tower = self.bbox_tower(feature)
      
	  # cls_logits = 3x3卷积（保证维度，通道数压缩为num_classes, 数据集的类别个数）
	  # 也即在这一步对每个类别做出了预测
      logits.append(self.cls_logits(cls_tower))
      
      # 按照论文所说，中心预测分支可选择在回归分支和分类分支， 这里使用回归分支
	  # centerness = 3x3卷积（保证维度，输出通道为1）
      if self.centerness_on_reg:
          centerness.append(self.centerness(box_tower))
      else:
          centerness.append(self.centerness(cls_tower))
      # 给出位置预测
      # bbox_pred 3x3卷积（保证维度，输出通道为4）
      # scales引入可以学习的参数（默认为1.0），对self.bbox_pred(box_tower)的输出
      # 进行逐元素乘，也就是在这一步，得到了（l, r, t, b）预测
      bbox_pred = self.scales[l](self.bbox_pred(box_tower))
      if self.norm_reg_targets:
      	  # 剔除负值
          bbox_pred = F.relu(bbox_pred)
          if self.training:
              # 训练时不乘以步长，直接使用relu之后的预测值
              bbox_reg.append(bbox_pred)
          else:
              # 非训练（推理）时乘以步长，还原到输入图像的尺度
              # 这里的fpn_strides对应相对于原图的下采样率[8, 16, 32, 64, 128]
              # 8的得来：bbox_pred的shape(4, 100, 152)原图（3，800， 1216）
              # 800/100 == 1216/152 == 8（这里的长边在前面to_image_list中由1199填充到了1216，即按32对齐）
              bbox_reg.append(bbox_pred * self.fpn_strides[l])
      else:
          bbox_reg.append(torch.exp(bbox_pred))
  # 返回
  return logits, bbox_reg, centerness
  # 返回的三个值都是列表，每个特征层对应一个元素，单个元素的shape分别为
  # （N, 80, h, w）, (N, 4, h, w), (N, 1, h, w)，其中N为batchsize

这里和论文有点不相同：论文中通过exp(s_i·x)保证回归值为正，其中s_i是可以学习的缩放参数（每个特征层都不相同）。而这里的代码在norm_reg_targets开启时直接使用relu函数剔除负值，只有在norm_reg_targets关闭时才使用exp函数。

compute_locations

locations = []
# 对每个特征层进行遍历
for level, feature in enumerate(features):
	# 得到特征层的分辨率h, w（高和宽）
    h, w = feature.size()[-2:]
    # 特征层定位
    locations_per_level = self.compute_locations_per_level(
        h, w, self.fpn_strides[level],
        feature.device
    )
    locations.append(locations_per_level)
return locations
# locations是长度为5的列表（5的得来是由于有5个特征层），每个元素的shape为(h x w, 2)

compute_locations_per_level

# 以步长为间隔生成网格的x
shifts_x = torch.arange(
            0, w * stride, step=stride,
            dtype=torch.float32, device=device
        )
 # 以步长为间隔生成网格的y
shifts_y = torch.arange(
    0, h * stride, step=stride,
    dtype=torch.float32, device=device
)
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
shift_x = shift_x.reshape(-1)
shift_y = shift_y.reshape(-1)
# 为每一个网格生成中心点坐标（网格左上角坐标加上stride // 2）
locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
return locations
# shape 为（h x w，2），每一行是一个位置的(x, y)

_forward_test

# 计算出输入图像上的预测框
boxes = self.box_selector_test(
            locations, box_cls, box_regression, 
            centerness, image_sizes
        )
return boxes, {}

box_selector_test

sampled_boxes = []
for _, (l, o, b, c) in enumerate(zip(locations, box_cls, box_regression, centerness)):
    sampled_boxes.append(
    # 这里的image_sizes为resize之后、填充之前的图像大小（800， 1199），而不是最初输入图像的大小（427， 640）
        self.forward_for_single_feature_map(
            l, o, b, c, image_sizes
        )
    )

boxlists = list(zip(*sampled_boxes))
# 合并特征层的输出
boxlists = [cat_boxlist(boxlist) for boxlist in boxlists]
if not self.bbox_aug_enabled:
    boxlists = self.select_over_all_levels(boxlists)

return boxlists

forward_for_single_feature_map

# batchsize, channel, height, width
N, C, H, W = box_cls.shape

# 和locations调整为相同的格式
box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1)
box_cls = box_cls.reshape(N, -1, C).sigmoid()
# shape变换 (N, C, H, W)-> (N, HxW, C)
box_regression = box_regression.view(N, 4, H, W).permute(0, 2, 3, 1)
box_regression = box_regression.reshape(N, -1, 4)
centerness = centerness.view(N, 1, H, W).permute(0, 2, 3, 1)
centerness = centerness.reshape(N, -1).sigmoid()
# 同上

# 筛选出大于self.pre_nms_thresh（0.05）分值的类别
candidate_inds = box_cls > self.pre_nms_thresh
# 统计大于0.05分的（网格，类别）组合的个数，同一个网格有多个类别大于0.05时会被分别计入（因为是对bool值求和）
# shape = （batchsize,），每个元素表示对应图像在经过第一轮筛选（self.pre_nms_thresh）之后剩下的候选数
#（同一个网格中有多个类别通道留下时按多个计）
pre_nms_top_n = candidate_inds.view(N, -1).sum(1)

# 将候选个数多于self.pre_nms_top_n（1000）的截断为self.pre_nms_top_n，即一张图在当前特征层上最多保留1000个候选
# clamp（min, max）
pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)

# multiply the classification scores with centerness scores
# 得到预测得分，其中将none作为索引是为了增加维度
box_cls = box_cls * centerness[:, :, None]

results = []
# 对每个图像分别遍历
for i in range(N):
    per_box_cls = box_cls[i]    # 第i张图像，以下都是对第i张图像。预测类别得分
    per_candidate_inds = candidate_inds[i]  # 大于self.pre_nms_thresh分值的bool数组
    per_box_cls = per_box_cls[per_candidate_inds]   # bool索引得到类别分值
    # 得到分值不是0位置的索引
    per_candidate_nonzeros = per_candidate_inds.nonzero()
    per_box_loc = per_candidate_nonzeros[:, 0]  # 得到分值不是0位置的网格编号
    per_class = per_candidate_nonzeros[:, 1] + 1    # 得到每个网格所属的类别（由1开始）

    per_box_regression = box_regression[i]  # 得到第i张图片的bbox参数
    per_box_regression = per_box_regression[per_box_loc]    # 按照第一维的网格编号得到目标网格
    per_locations = locations[per_box_loc]	# 得到满足条件的网格对应到原图的位置，用整数索引（网格编号）实现
	# 满足条件即经过初筛选
    per_pre_nms_top_n = pre_nms_top_n[i]   # 得到第i张图像满足条件的网格个数
    
	# 如果出现一张图中预测了超过self.pre_nms_top_n数量的目标，那么就只取前per_pre_nms_top_n个
    if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
        per_box_cls, top_k_indices = \
            per_box_cls.topk(per_pre_nms_top_n, sorted=False)
        per_class = per_class[top_k_indices]
        per_box_regression = per_box_regression[top_k_indices]
        per_locations = per_locations[top_k_indices]

	# 得到原图中预测的真实位置
    detections = torch.stack([
        per_locations[:, 0] - per_box_regression[:, 0],
        per_locations[:, 1] - per_box_regression[:, 1],
        per_locations[:, 0] + per_box_regression[:, 2],
        per_locations[:, 1] + per_box_regression[:, 3],
    ], dim=1)

    h, w = image_sizes[i]
    # 将结果包装为类别，含有属性bbox=detections, size(原图), mode(怎样格式的数据.如‘xyxy’)
    boxlist = BoxList(detections, (int(w), int(h)), mode="xyxy")
    # 添加类别
    boxlist.add_field("labels", per_class)
    # 添加得分：取（分类得分 x centerness）的平方根，顺序与labels和检测框一一对应
    boxlist.add_field("scores", torch.sqrt(per_box_cls))
    # 限制框的大小，防止框超出了边界
    boxlist = boxlist.clip_to_image(remove_empty=False)
    # 移除较小的框
    boxlist = remove_small_boxes(boxlist, self.min_size)
    results.append(boxlist)

return results

select_over_all_levels

num_images = len(boxlists)
results = []
for i in range(num_images):
# 遍历所有的图片
    # 非极大值抑制，得到所有的检测框
    result = boxlist_ml_nms(boxlists[i], self.nms_thresh)
    # 目标数量
    number_of_detections = len(result)

    # Limit to max_per_image detections **over all classes**
    # 限制每张图片输出的目标数量，最多为self.fpn_post_nms_top_n（100）
    if number_of_detections > self.fpn_post_nms_top_n > 0:
        cls_scores = result.get_field("scores")
        image_thresh, _ = torch.kthvalue(
            cls_scores.cpu(),
            number_of_detections - self.fpn_post_nms_top_n + 1
        )
        keep = cls_scores >= image_thresh.item()
        keep = torch.nonzero(keep).squeeze(1)
        result = result[keep]
    results.append(result)
return results

大多数都是一些API的操作，函数顺序按照debug顺序写的。目录顺序只是参考。有一些细节是在C++中完成，没有给出。
在这里插入图片描述

标签：box,fcos,记录,self,top,per,debug,image,cls
来源： https://blog.csdn.net/qq_40246742/article/details/114692410