Compare commits

master..v0.2.0

No commits in common. "master" and "v0.2.0" have entirely different histories.

36 changed files with 268 additions and 4999 deletions

.env

@@ -1,3 +1,4 @@
UPLOAD_DIR=uploads
MOCK=false
MODEL=yolo_detect #segformer, yolo, yolo_detect
MODEL=yolo #segformer, yolo
PREPROCESS=sam3
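How these variables are consumed is not shown in this diff; the sketch below is a plausible minimal read in `app/main.py` (an assumption, not the actual implementation), assuming the values are exported into the process environment:

```python
import os

UPLOAD_DIR = os.environ.get("UPLOAD_DIR", "uploads")  # where uploaded images are stored
MOCK = os.environ.get("MOCK", "false") == "true"      # serve mock detections instead of real ones
MODEL = os.environ.get("MODEL", "yolo")               # segformer or yolo
PREPROCESS = os.environ.get("PREPROCESS", "sam3")     # preprocessing backend
```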


@@ -15,15 +15,15 @@ WORKDIR /code
# Copy and install the Python dependencies
COPY requirements.txt /code/requirements.txt
# Install the Python dependencies
# Install the Python dependencies (mirror source for faster downloads)
RUN pip install --no-cache-dir --upgrade -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple \
&& pip install --no-cache-dir --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu130
# Copy the application code (uncomment before the production release)
# COPY ./app /code/app
# Copy the application code
COPY ./app /code/app
# Remove core files to reduce image size (uncomment before the production release)
# RUN rm -rf /code/app/core
# Remove unneeded files to avoid wasting disk space
RUN rm -rf /code/app/core/*.onnx /code/app/core/*.data /code/app/core/*.pt
# Expose the port and start the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"]


@@ -1,18 +1,26 @@
### Wall Docker Image Usage Guide
#### How to build
---
```bash
docker build -t wall .
> Model files are persisted so that models can be updated later without recreating the container, and the configuration file is kept in one place
1. Import the docker image: `docker load -i wall.tar`
2. Enter the persistence directory, create a .env file, and add the following content
```env
UPLOAD_DIR=uploads
MOCK=false
MODEL=segformer #segformer, yolo; currently only the segformer model is bundled
```
#### How to launch
3. Extract the algorithm model directory into the core folder
> Code is persisted so that code and models can be updated later without recreating the container, and the configuration file is kept in one place
>
> For later updates, if only core files changed, just run git pull and restart the container (see the update sketch after the extraction command below)
>
> If requirements.txt changed, the image must be rebuilt
```bash
tar -xvf core.tar
```
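A minimal sketch of that update flow (assumptions: this directory is the cloned repository and the container was named `wall`):

```bash
git pull                   # fetch updated code / core files
git lfs pull               # fetch updated large files tracked by Git LFS
sudo docker restart wall   # restart the existing container without recreating it
```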
4. Run the docker image with the following command
```bash
sudo docker run -d \
@@ -20,11 +28,29 @@ sudo docker run -d \
--gpus all \
-p [local_port]:80 \
-v $(pwd)/uploads:/code/uploads \
-v $(pwd)/app:/code/app \
-v $(pwd)/core:/code/app/core \
-v $(pwd)/.env:/code/.env \
wall
```
> TIPS: some files are managed with Git LFS, so please install Git LFS first
>
> When cloning or pulling, it is recommended to clone the code first, stop the transfer, and then run git lfs pull, which shows the download progress of the large files
5. To update the model later, just overwrite the files inside core and edit the .env configuration file, then keep running
---
> If you do not want model-file persistence, there is no need to extract the algorithm files
1. Import the docker image: `docker load -i wall.tar`
2. Run the docker image with the following command
```bash
sudo docker run -d \
--name [docker_container_name] \
--gpus all \
-p [local_port]:80 \
-v $(pwd)/core:/code/app/core \
-v $(pwd)/.env:/code/.env \
wall
```
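Either way, once the container is running, a quick way to confirm the service came up (standard docker commands, not part of the original guide):

```bash
sudo docker ps --filter "name=[docker_container_name]"   # container should be listed as Up
sudo docker logs -f [docker_container_name]              # watch for the uvicorn startup message
```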


@@ -1,6 +1,5 @@
from app.core.segformer.detect import Detection as SegFormer, DetectionMock as SegFormerMock
from app.core.yolo.detect import YOLOSeg
from app.core.yolo_detect.detect import YOLODetect
class Model:
@@ -12,9 +11,6 @@ class Model:
elif MODEL == "yolo":
print("使用 YOLO 模型")
self.detection = YOLOSeg()
elif MODEL == "yolo_detect":
print("使用 YOLO_Detect 模型")
self.detection = YOLODetect()
def getModel(self):
return self.detection

app/core/preprocess.py Normal file

@@ -0,0 +1,12 @@
from app.core.sam3.preprocess import SAM3
class Preprocess:
def __init__(self):
from app.main import PREPROCESS
if PREPROCESS == "sam3":
print("使用 SAM3 进行预处理判断")
self.preprocess = SAM3()
def getPreprocess(self):
return self.preprocess
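A minimal usage sketch for this factory (a hypothetical caller, not part of the diff; assumes PREPROCESS=sam3 is set in app.main and the image paths exist):

```python
from app.core.preprocess import Preprocess

paths = ["uploads/a.jpg", "uploads/b.jpg"]    # hypothetical upload paths
pre = Preprocess().getPreprocess()            # resolves to the SAM3 wrapper
labels = pre.preprocess(paths)                # 1 = enough wall pixels, 0 = filtered out
wall_images = [p for p, keep in zip(paths, labels) if keep]
```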

app/core/sam3/preprocess.py Normal file

@@ -0,0 +1,172 @@
import os
import torch
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sam3.model_builder import build_sam3_image_model
from sam3.train.data.collator import collate_fn_api as collate
from sam3.model.utils.misc import copy_data_to_device
from sam3.train.data.sam3_image_dataset import (
Datapoint, Image as SAMImage, FindQueryLoaded, InferenceMetadata
)
from sam3.train.transforms.basic_for_api import ComposeAPI, RandomResizeAPI, ToTensorAPI, NormalizeAPI
from sam3.eval.postprocessors import PostProcessImage
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
# ===== Configuration =====
CKPT_PATH = os.path.join(os.getcwd(), "app/core/sam3", "sam3.pt")
DEVICE = "cuda:0"
BATCH_SIZE = 12 # batch size; should be settable from the frontend
NUM_WORKERS = 12 # number of image-loading workers; decide whether the frontend should expose this
CONF_TH = 0.5
RATIO_TH = 0.5 # threshold; larger values filter out more, but too large hurts close-up images
_GLOBAL_ID = 1
PROMPTS = [
"wall",
"building wall",
"building facade",
"building exterior wall",
"exterior building facade",
]
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}
# ============
class ImgPathList(Dataset):
def __init__(self, img_paths: list):
"""
初始化 ImgFolder传入一个图片路径的列表
Args:
img_paths (list): 一个包含图片路径的列表
"""
self.paths = img_paths # 使用传入的路径列表
def __len__(self):
return len(self.paths)
def __getitem__(self, i):
p = self.paths[i] # use the path from the list directly
img = Image.open(p).convert("RGB") # open the image and convert it to RGB
return p, img # return the image path and the image itself
class SAM3:
def __init__(self):
self.dev = torch.device(DEVICE)
self.postprocessor = PostProcessImage(
max_dets_per_img=-1,
iou_type="segm",
use_original_sizes_box=True,
use_original_sizes_mask=True,
convert_mask_to_rle=False,
detection_threshold=CONF_TH,
to_cpu=False,
)
self.transform = ComposeAPI(
transforms=[
RandomResizeAPI(sizes=1008, max_size=1008, square=True, consistent_transform=False),
ToTensorAPI(),
NormalizeAPI(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
]
)
self.model = build_sam3_image_model(
checkpoint_path=CKPT_PATH, load_from_HF=False, device=DEVICE
).to(DEVICE).eval()
def preprocess(self, image_path_list):
labels = []
loader = DataLoader(
ImgPathList(image_path_list),
batch_size=BATCH_SIZE,
shuffle=False,
num_workers=NUM_WORKERS,
pin_memory=True,
collate_fn=self.collate_fn,
)
with torch.inference_mode():
for names, images in loader:
datapoints = []
name2qids = {} # name -> [qid,...]
for name, img in zip(names, images):
dp = self.create_empty_datapoint()
self.set_image(dp, img)
qids = [self.add_text_prompt(dp, p) for p in PROMPTS]
name2qids[name] = qids
datapoints.append(self.transform(dp))
batch = collate(datapoints, dict_key="dummy")["dummy"]
batch = copy_data_to_device(batch, self.dev, non_blocking=True)
output = self.model(batch)
processed = self.postprocessor.process_results(output, batch.find_metadatas)
for name in names:
any_masks = []
for qid in name2qids[name]:
res = processed[qid]
m = res.get("masks", None) # expected shape: [N, H, W]
if m is None:
any_masks.append(torch.zeros(1, 1, device=self.dev, dtype=torch.bool).squeeze())
else:
if not torch.is_tensor(m):
m = torch.as_tensor(m, device=self.dev)
any_masks.append(m.any(0)) # [H,W]
wall_mask = torch.stack(any_masks, 0).any(0) # [H,W] bool
ratio = wall_mask.float().mean().item()
lab = 1 if ratio >= RATIO_TH else 0
labels.append(lab)
print(f"{name} | wall_ratio={ratio:.4f} -> {lab}") # 这行可以不要
return labels
@staticmethod
def add_text_prompt(datapoint, text_query):
global _GLOBAL_ID
assert len(datapoint.images) == 1, "please set the image first"
w, h = datapoint.images[0].size
datapoint.find_queries.append(
FindQueryLoaded(
query_text=text_query,
image_id=0,
object_ids_output=[],
is_exhaustive=True,
query_processing_order=0,
inference_metadata=InferenceMetadata(
coco_image_id=_GLOBAL_ID,
original_image_id=_GLOBAL_ID,
original_category_id=1,
original_size=[w, h],
object_id=0,
frame_index=0,
),
)
)
_GLOBAL_ID += 1
return _GLOBAL_ID - 1
@staticmethod
def create_empty_datapoint():
return Datapoint(find_queries=[], images=[])
@staticmethod
def set_image(datapoint, pil_image):
w, h = pil_image.size
datapoint.images = [SAMImage(data=pil_image, objects=[], size=[h, w])] # size uses [H, W]
@staticmethod
def collate_fn(batch):
names, imgs = zip(*batch)
return list(names), list(imgs)

app/core/sam3/sam3.pt (Stored with Git LFS) Normal file

Binary file not shown.


@@ -1,150 +0,0 @@
import io
import os
import cv2
import torch
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from PIL import Image, ImageDraw, ImageFont, ImageSequence
from app.core.yolo_detect.yolo import YOLO # assumes a YOLO model class is available
from app.core.yolo_detect.utils.utils import (cvtColor, get_classes, preprocess_input,
resize_image, show_config)
CLASS_NAMES = {
"wall_konggu": "Hollowing",
"wall_shenshui": "Water seepage",
"wall_kailie": "Cracking",
"wall_konggu_gap": "Gap in hollowing",
"wall": "Wall",
}
class YOLODetect(YOLO):
def __init__(self):
super().__init__()
self.classes = CLASS_NAMES
def detect(self, img_input, crop=False, count=False):
try:
image = Image.open(img_input)
if image.format == "MPO":
image = next(ImageSequence.Iterator(image))
jpeg_image_in_memory = io.BytesIO()
image.save(jpeg_image_in_memory, format="JPEG")
jpeg_image_in_memory.seek(0)
image = Image.open(jpeg_image_in_memory)
# if isinstance(img_input, str):
# image = cv2.imdecode(np.fromfile(img_input, dtype=np.uint8), cv2.IMREAD_COLOR)
# # image = Image.open(img_input)
# else:
# image = img_input
# ---------------------------------------------------#
# Compute the height and width of the input image
# ---------------------------------------------------#
image_shape = np.array(np.shape(image)[0:2])
# ---------------------------------------------------------#
# Convert the image to RGB here to prevent grayscale images from raising errors during prediction.
# This code only supports prediction on RGB images; all other image types are converted to RGB.
# ---------------------------------------------------------#
image = cvtColor(image)
# ---------------------------------------------------------#
# Add gray bars to the image for a distortion-free resize.
# Alternatively, a plain resize also works for recognition.
# ---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
# ---------------------------------------------------------#
# Add the batch_size dimension
# h, w, 3 => 3, h, w => 1, 3, h, w
# ---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
# ---------------------------------------------------------#
# Feed the image into the network for prediction!
# ---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
# ---------------------------------------------------------#
# Stack the predicted boxes, then apply non-maximum suppression
# ---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(outputs, self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres=self.confidence,
nms_thres=self.nms_iou)
if results[0] is None:
return image, []
top_label = np.array(results[0][:, 5], dtype='int32')
top_conf = results[0][:, 4]
top_boxes = results[0][:, :4]
mask = np.zeros((image.size[1], image.size[0], 3), dtype=np.uint8)
coords = []
# First, collect the wall regions
wall_boxes = []
for i, c in enumerate(top_label):
predicted_class = self.class_names[int(c)]
if predicted_class == "wall":
box = top_boxes[i]
top, left, bottom, right = box
top = max(0, np.floor(top).astype('int32'))
left = max(0, np.floor(left).astype('int32'))
bottom = min(image.size[1], np.floor(bottom).astype('int32'))
right = min(image.size[0], np.floor(right).astype('int32'))
wall_boxes.append((left, top, right, bottom))
# Then handle the special (non-wall) defect classes
for i, c in enumerate(top_label):
predicted_class = self.class_names[int(c)]
if predicted_class != "wall":
box = top_boxes[i]
score = top_conf[i]
top, left, bottom, right = box
top = max(0, np.floor(top).astype('int32'))
left = max(0, np.floor(left).astype('int32'))
bottom = min(image.size[1], np.floor(bottom).astype('int32'))
right = min(image.size[0], np.floor(right).astype('int32'))
# Compute the overlap area with each wall box
special_area = (right - left) * (bottom - top)
keep = False
for w_left, w_top, w_right, w_bottom in wall_boxes:
inter_left = max(left, w_left)
inter_top = max(top, w_top)
inter_right = min(right, w_right)
inter_bottom = min(bottom, w_bottom)
if inter_right > inter_left and inter_bottom > inter_top:
inter_area = (inter_right - inter_left) * (inter_bottom - inter_top)
if inter_area / special_area >= 0.6: # overlap ratio ≥ 60%
keep = True
break
if predicted_class == "wall_konggu":
# its area must not exceed 50% of the whole image area
if special_area / (image.size[0] * image.size[1]) > 0.5:
keep = False
if keep:
color = self.colors[int(c)]
mask[top:bottom, left:right] = color
coords.append(
(
self.classes.get(predicted_class),
float(score),
[(int(left), int(top)), (int(right), int(top)), (int(right), int(bottom)), (int(left), int(bottom))]
)
)
mask = cv2.cvtColor(mask, cv2.COLOR_RGB2BGR)
# print("coords:", coords)
return mask, coords
except Exception as e:
print(e)
if __name__ == "__main__":
model = YOLODetect()
image = "test.jpg"
mask, coords = model.detect(Image.open(image))
mask.save("mask.jpg", quality=95, subsampling=0)
print(coords)


@@ -1,138 +0,0 @@
import os
import xml.etree.ElementTree as ET
from PIL import Image
from tqdm import tqdm
from utils.utils import get_classes
from utils.utils_map import get_coco_map, get_map
from yolo import YOLO
if __name__ == "__main__":
'''
Unlike AP, Recall and Precision are not area-based quantities, so the network's Recall and Precision differ as the confidence threshold changes.
By default, the Recall and Precision computed by this script correspond to a confidence threshold of 0.5.
Limited by how mAP is computed, the network must produce nearly all of its candidate boxes when computing mAP, so that Recall and Precision can be evaluated at different thresholds.
The txt files under map_out/detection-results/ therefore usually contain more boxes than a direct predict run; the goal is to list every possible prediction box.
'''
#------------------------------------------------------------------------------------------------------------------#
# map_mode specifies what this script computes when run:
# map_mode 0: the full mAP pipeline, including obtaining prediction results, obtaining ground-truth boxes, and computing the VOC mAP.
# map_mode 1: obtain prediction results only.
# map_mode 2: obtain ground-truth boxes only.
# map_mode 3: compute the VOC mAP only.
# map_mode 4: use the COCO toolbox to compute this dataset's 0.50:0.95 mAP; requires prediction results, ground-truth boxes, and an installed pycocotools.
#-------------------------------------------------------------------------------------------------------------------#
map_mode = 0
#--------------------------------------------------------------------------------------#
# classes_path specifies the categories for which the VOC mAP is measured.
# It should normally match the classes_path used for training and prediction.
#--------------------------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'
#--------------------------------------------------------------------------------------#
# MINOVERLAP specifies the desired mAP0.x; look up what mAP0.x means if unfamiliar.
# For example, to compute mAP0.75, set MINOVERLAP = 0.75.
#
# A prediction box counts as a positive sample when its overlap with a ground-truth box exceeds MINOVERLAP; otherwise it is a negative sample.
# The larger MINOVERLAP is, the more accurate a prediction box must be to count as positive, and the lower the computed mAP.
#--------------------------------------------------------------------------------------#
MINOVERLAP = 0.5
#--------------------------------------------------------------------------------------#
# Limited by how mAP is computed, the network must produce nearly all of its candidate boxes when computing mAP,
# so confidence should be set as small as possible to obtain every possible prediction box.
#
# This value is generally left unchanged: computing mAP requires nearly all prediction boxes, so this confidence must not be changed casually.
# To obtain Recall and Precision at other thresholds, modify score_threhold below.
#--------------------------------------------------------------------------------------#
confidence = 0.001
#--------------------------------------------------------------------------------------#
# The non-maximum suppression IoU used at prediction time; larger means less strict NMS.
#
# This value is generally left unchanged.
#--------------------------------------------------------------------------------------#
nms_iou = 0.5
#---------------------------------------------------------------------------------------------------------------#
# Unlike AP, Recall and Precision are not area-based quantities, so their values differ at different thresholds.
#
# By default, the Recall and Precision computed here correspond to a threshold of 0.5 (defined here as score_threhold).
# Because computing mAP requires nearly all prediction boxes, the confidence defined above must not be changed casually.
# A dedicated score_threhold is defined to represent the threshold so that the matching Recall and Precision can be picked out when computing mAP.
#---------------------------------------------------------------------------------------------------------------#
score_threhold = 0.5
#-------------------------------------------------------#
# map_vis specifies whether to enable visualization for the VOC mAP computation
#-------------------------------------------------------#
map_vis = False
#-------------------------------------------------------#
# Points to the folder containing the VOC dataset.
# Defaults to the VOC dataset in the root directory.
#-------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit'
#-------------------------------------------------------#
# Output folder for the results, map_out by default
#-------------------------------------------------------#
map_out_path = 'map_out'
image_ids = open(os.path.join(VOCdevkit_path, "VOC2007/ImageSets/Main/test.txt")).read().strip().split()
if not os.path.exists(map_out_path):
os.makedirs(map_out_path)
if not os.path.exists(os.path.join(map_out_path, 'ground-truth')):
os.makedirs(os.path.join(map_out_path, 'ground-truth'))
if not os.path.exists(os.path.join(map_out_path, 'detection-results')):
os.makedirs(os.path.join(map_out_path, 'detection-results'))
if not os.path.exists(os.path.join(map_out_path, 'images-optional')):
os.makedirs(os.path.join(map_out_path, 'images-optional'))
class_names, _ = get_classes(classes_path)
if map_mode == 0 or map_mode == 1:
print("Load model.")
yolo = YOLO(confidence = confidence, nms_iou = nms_iou)
print("Load model done.")
print("Get predict result.")
for image_id in tqdm(image_ids):
image_path = os.path.join(VOCdevkit_path, "VOC2007/JPEGImages/"+image_id+".jpg")
image = Image.open(image_path)
if map_vis:
image.save(os.path.join(map_out_path, "images-optional/" + image_id + ".jpg"))
yolo.get_map_txt(image_id, image, class_names, map_out_path)
print("Get predict result done.")
if map_mode == 0 or map_mode == 2:
print("Get ground truth result.")
for image_id in tqdm(image_ids):
with open(os.path.join(map_out_path, "ground-truth/"+image_id+".txt"), "w") as new_f:
root = ET.parse(os.path.join(VOCdevkit_path, "VOC2007/Annotations/"+image_id+".xml")).getroot()
for obj in root.findall('object'):
difficult_flag = False
if obj.find('difficult')!=None:
difficult = obj.find('difficult').text
if int(difficult)==1:
difficult_flag = True
obj_name = obj.find('name').text
if obj_name not in class_names:
continue
bndbox = obj.find('bndbox')
left = bndbox.find('xmin').text
top = bndbox.find('ymin').text
right = bndbox.find('xmax').text
bottom = bndbox.find('ymax').text
if difficult_flag:
new_f.write("%s %s %s %s %s difficult\n" % (obj_name, left, top, right, bottom))
else:
new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
print("Get ground truth result done.")
if map_mode == 0 or map_mode == 3:
print("Get map.")
get_map(MINOVERLAP, True, score_threhold = score_threhold, path = map_out_path)
print("Get map done.")
if map_mode == 4:
print("Get map.")
get_coco_map(class_names = class_names, path = map_out_path)
print("Get map done.")

Binary file not shown.


@@ -1,5 +0,0 @@
wall
wall_shenshui
wall_konggu
wall_konggu_gap
wall_kailie


@@ -1 +0,0 @@
#


@@ -1,143 +0,0 @@
import torch
import torch.nn as nn
def autopad(k, p=None, d=1):
# kernel, padding, dilation
# Automatically pad the input feature layer, following the "same" principle
if d > 1:
# actual kernel-size
k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]
if p is None:
# auto-pad
p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
return p
class SiLU(nn.Module):
# SiLU activation function
@staticmethod
def forward(x):
return x * torch.sigmoid(x)
class Conv(nn.Module):
# Standard convolution + batch normalization + activation
default_act = SiLU()
def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
super().__init__()
self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
self.bn = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
def forward(self, x):
return self.act(self.bn(self.conv(x)))
def forward_fuse(self, x):
return self.act(self.conv(x))
class Bottleneck(nn.Module):
# Standard bottleneck, a residual structure
# c1 is the number of input channels, c2 the number of output channels
def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
super().__init__()
c_ = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, c_, k[0], 1)
self.cv2 = Conv(c_, c2, k[1], 1, g=g)
self.add = shortcut and c1 == c2
def forward(self, x):
return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
class C2f(nn.Module):
# CSPNet structure with a large residual connection
# c1 is the number of input channels, c2 the number of output channels
def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
super().__init__()
self.c = int(c2 * e)
self.cv1 = Conv(c1, 2 * self.c, 1, 1)
self.cv2 = Conv((2 + n) * self.c, c2, 1)
self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
def forward(self, x):
# Apply one convolution, then split the result into two parts, each with c channels
y = list(self.cv1(x).split((self.c, self.c), 1))
# Keep the output of every residual block and concatenate them all: dense residual connections
y.extend(m(y[-1]) for m in self.m)
return self.cv2(torch.cat(y, 1))
class SPPF(nn.Module):
# SPP structure: max pooling equivalent to 5, 9, 13 pooling kernels.
def __init__(self, c1, c2, k=5):
super().__init__()
c_ = c1 // 2
self.cv1 = Conv(c1, c_, 1, 1)
self.cv2 = Conv(c_ * 4, c2, 1, 1)
self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
def forward(self, x):
x = self.cv1(x)
y1 = self.m(x)
y2 = self.m(y1)
return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1))
class Backbone(nn.Module):
def __init__(self, base_channels, base_depth, deep_mul, phi, pretrained=False):
super().__init__()
#-----------------------------------------------#
# The input image is 3, 640, 640
#-----------------------------------------------#
# 3, 640, 640 => 32, 640, 640 => 64, 320, 320
self.stem = Conv(3, base_channels, 3, 2)
# 64, 320, 320 => 128, 160, 160 => 128, 160, 160
self.dark2 = nn.Sequential(
Conv(base_channels, base_channels * 2, 3, 2),
C2f(base_channels * 2, base_channels * 2, base_depth, True),
)
# 128, 160, 160 => 256, 80, 80 => 256, 80, 80
self.dark3 = nn.Sequential(
Conv(base_channels * 2, base_channels * 4, 3, 2),
C2f(base_channels * 4, base_channels * 4, base_depth * 2, True),
)
# 256, 80, 80 => 512, 40, 40 => 512, 40, 40
self.dark4 = nn.Sequential(
Conv(base_channels * 4, base_channels * 8, 3, 2),
C2f(base_channels * 8, base_channels * 8, base_depth * 2, True),
)
# 512, 40, 40 => 1024 * deep_mul, 20, 20 => 1024 * deep_mul, 20, 20
self.dark5 = nn.Sequential(
Conv(base_channels * 8, int(base_channels * 16 * deep_mul), 3, 2),
C2f(int(base_channels * 16 * deep_mul), int(base_channels * 16 * deep_mul), base_depth, True),
SPPF(int(base_channels * 16 * deep_mul), int(base_channels * 16 * deep_mul), k=5)
)
if pretrained:
url = {
"n" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_n_backbone_weights.pth',
"s" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_s_backbone_weights.pth',
"m" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_m_backbone_weights.pth',
"l" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_l_backbone_weights.pth',
"x" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_x_backbone_weights.pth',
}[phi]
checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", model_dir="./model_data")
self.load_state_dict(checkpoint, strict=False)
print("Load weights from " + url.split('/')[-1])
def forward(self, x):
x = self.stem(x)
x = self.dark2(x)
#-----------------------------------------------#
# dark3's output is 256, 80, 80: an effective feature layer
#-----------------------------------------------#
x = self.dark3(x)
feat1 = x
#-----------------------------------------------#
# dark4's output is 512, 40, 40: an effective feature layer
#-----------------------------------------------#
x = self.dark4(x)
feat2 = x
#-----------------------------------------------#
# dark5's output is 1024 * deep_mul, 20, 20: an effective feature layer
#-----------------------------------------------#
x = self.dark5(x)
feat3 = x
return feat1, feat2, feat3


@@ -1,176 +0,0 @@
import numpy as np
import torch
import torch.nn as nn
from app.core.yolo_detect.nets.backbone import Backbone, C2f, Conv
from app.core.yolo_detect.nets.yolo_training import weights_init
from app.core.yolo_detect.utils.utils_bbox import make_anchors
def fuse_conv_and_bn(conv, bn):
# Fuse Conv2d + BatchNorm2d to reduce computation
# Fuse Conv2d() and BatchNorm2d() layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/
fusedconv = nn.Conv2d(conv.in_channels,
conv.out_channels,
kernel_size=conv.kernel_size,
stride=conv.stride,
padding=conv.padding,
dilation=conv.dilation,
groups=conv.groups,
bias=True).requires_grad_(False).to(conv.weight.device)
# prepare the kernel
w_conv = conv.weight.clone().view(conv.out_channels, -1)
w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))
# prepare the bias
b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
return fusedconv
class DFL(nn.Module):
# DFL module
# Distribution Focal Loss (DFL) proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
def __init__(self, c1=16):
super().__init__()
self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
x = torch.arange(c1, dtype=torch.float)
self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
self.c1 = c1
def forward(self, x):
# bs, self.reg_max * 4, 8400
b, c, a = x.shape
# bs, 4, self.reg_max, 8400 => bs, self.reg_max, 4, 8400 => b, 4, 8400
# Softmax over the numbers 0..16 to get percentages, then take the weighted sum as the final value.
return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
# return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
#---------------------------------------------------#
# yolo_body
#---------------------------------------------------#
class YoloBody(nn.Module):
def __init__(self, input_shape, num_classes, phi, pretrained=False):
super(YoloBody, self).__init__()
depth_dict = {'n' : 0.33, 's' : 0.33, 'm' : 0.67, 'l' : 1.00, 'x' : 1.00,}
width_dict = {'n' : 0.25, 's' : 0.50, 'm' : 0.75, 'l' : 1.00, 'x' : 1.25,}
deep_width_dict = {'n' : 1.00, 's' : 1.00, 'm' : 0.75, 'l' : 0.50, 'x' : 0.50,}
dep_mul, wid_mul, deep_mul = depth_dict[phi], width_dict[phi], deep_width_dict[phi]
base_channels = int(wid_mul * 64) # 64
base_depth = max(round(dep_mul * 3), 1) # 3
#-----------------------------------------------#
# The input image is 3, 640, 640
#-----------------------------------------------#
#---------------------------------------------------#
# Build the backbone model
# to obtain three effective feature layers with shapes:
# 256, 80, 80
# 512, 40, 40
# 1024 * deep_mul, 20, 20
#---------------------------------------------------#
self.backbone = Backbone(base_channels, base_depth, deep_mul, phi, pretrained=pretrained)
#------------------------ enhanced feature extraction network ------------------------#
self.upsample = nn.Upsample(scale_factor=2, mode="nearest")
# 1024 * deep_mul + 512, 40, 40 => 512, 40, 40
self.conv3_for_upsample1 = C2f(int(base_channels * 16 * deep_mul) + base_channels * 8, base_channels * 8, base_depth, shortcut=False)
# 768, 80, 80 => 256, 80, 80
self.conv3_for_upsample2 = C2f(base_channels * 8 + base_channels * 4, base_channels * 4, base_depth, shortcut=False)
# 256, 80, 80 => 256, 40, 40
self.down_sample1 = Conv(base_channels * 4, base_channels * 4, 3, 2)
# 512 + 256, 40, 40 => 512, 40, 40
self.conv3_for_downsample1 = C2f(base_channels * 8 + base_channels * 4, base_channels * 8, base_depth, shortcut=False)
# 512, 40, 40 => 512, 20, 20
self.down_sample2 = Conv(base_channels * 8, base_channels * 8, 3, 2)
# 1024 * deep_mul + 512, 20, 20 => 1024 * deep_mul, 20, 20
self.conv3_for_downsample2 = C2f(int(base_channels * 16 * deep_mul) + base_channels * 8, int(base_channels * 16 * deep_mul), base_depth, shortcut=False)
#------------------------ enhanced feature extraction network ------------------------#
ch = [base_channels * 4, base_channels * 8, int(base_channels * 16 * deep_mul)]
self.shape = None
self.nl = len(ch)
# self.stride = torch.zeros(self.nl)
self.stride = torch.tensor([256 / x.shape[-2] for x in self.backbone.forward(torch.zeros(1, 3, 256, 256))]) # forward
self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
self.no = num_classes + self.reg_max * 4 # number of outputs per anchor
self.num_classes = num_classes
c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], num_classes) # channels
self.cv2 = nn.ModuleList(nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch)
self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, num_classes, 1)) for x in ch)
if not pretrained:
weights_init(self)
self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
def fuse(self):
print('Fusing layers... ')
for m in self.modules():
if type(m) is Conv and hasattr(m, 'bn'):
m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv
delattr(m, 'bn') # remove batchnorm
m.forward = m.forward_fuse # update forward
return self
def forward(self, x):
# backbone
feat1, feat2, feat3 = self.backbone.forward(x)
#------------------------ enhanced feature extraction network ------------------------#
# 1024 * deep_mul, 20, 20 => 1024 * deep_mul, 40, 40
P5_upsample = self.upsample(feat3)
# 1024 * deep_mul, 40, 40 cat 512, 40, 40 => 1024 * deep_mul + 512, 40, 40
P4 = torch.cat([P5_upsample, feat2], 1)
# 1024 * deep_mul + 512, 40, 40 => 512, 40, 40
P4 = self.conv3_for_upsample1(P4)
# 512, 40, 40 => 512, 80, 80
P4_upsample = self.upsample(P4)
# 512, 80, 80 cat 256, 80, 80 => 768, 80, 80
P3 = torch.cat([P4_upsample, feat1], 1)
# 768, 80, 80 => 256, 80, 80
P3 = self.conv3_for_upsample2(P3)
# 256, 80, 80 => 256, 40, 40
P3_downsample = self.down_sample1(P3)
# 512, 40, 40 cat 256, 40, 40 => 768, 40, 40
P4 = torch.cat([P3_downsample, P4], 1)
# 768, 40, 40 => 512, 40, 40
P4 = self.conv3_for_downsample1(P4)
# 512, 40, 40 => 512, 20, 20
P4_downsample = self.down_sample2(P4)
# 512, 20, 20 cat 1024 * deep_mul, 20, 20 => 1024 * deep_mul + 512, 20, 20
P5 = torch.cat([P4_downsample, feat3], 1)
# 1024 * deep_mul + 512, 20, 20 => 1024 * deep_mul, 20, 20
P5 = self.conv3_for_downsample2(P5)
#------------------------ enhanced feature extraction network ------------------------#
# P3 256, 80, 80
# P4 512, 40, 40
# P5 1024 * deep_mul, 20, 20
shape = P3.shape # BCHW
# P3 256, 80, 80 => num_classes + self.reg_max * 4, 80, 80
# P4 512, 40, 40 => num_classes + self.reg_max * 4, 40, 40
# P5 1024 * deep_mul, 20, 20 => num_classes + self.reg_max * 4, 20, 20
x = [P3, P4, P5]
for i in range(self.nl):
x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
if self.shape != shape:
self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
self.shape = shape
# num_classes + self.reg_max * 4 , 8400 => cls num_classes, 8400;
# box self.reg_max * 4, 8400
box, cls = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2).split((self.reg_max * 4, self.num_classes), 1)
# origin_cls = [xi.split((self.reg_max * 4, self.num_classes), 1)[1] for xi in x]
dbox = self.dfl(box)
return dbox, cls, x, self.anchors.to(dbox.device), self.strides.to(dbox.device)


@@ -1,592 +0,0 @@
import math
from copy import deepcopy
from functools import partial
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from app.core.yolo_detect.utils.utils_bbox import dist2bbox, make_anchors
def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9, roll_out=False):
"""select the positive anchor center in gt
Args:
xy_centers (Tensor): shape(h*w, 4)
gt_bboxes (Tensor): shape(b, n_boxes, 4)
Return:
(Tensor): shape(b, n_boxes, h*w)
"""
n_anchors = xy_centers.shape[0]
bs, n_boxes, _ = gt_bboxes.shape
# For each ground-truth box, compute the left-top/right-bottom distances to every anchor point, then take the min
# to ensure the ground-truth box is near the anchor point and encloses it
if roll_out:
bbox_deltas = torch.empty((bs, n_boxes, n_anchors), device=gt_bboxes.device)
for b in range(bs):
lt, rb = gt_bboxes[b].view(-1, 1, 4).chunk(2, 2) # left-top, right-bottom
bbox_deltas[b] = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]),
dim=2).view(n_boxes, n_anchors, -1).amin(2).gt_(eps)
return bbox_deltas
else:
# the ground-truth boxes' left-top and right-bottom corners
lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2)
# the ground-truth box's left-top/right-bottom distances to every anchor point
bbox_deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2).view(bs, n_boxes, n_anchors, -1)
# return (bbox_deltas.min(3)[0] > eps).to(gt_bboxes.dtype)
return bbox_deltas.amin(3).gt_(eps)
def select_highest_overlaps(mask_pos, overlaps, n_max_boxes):
"""if an anchor box is assigned to multiple gts,
the one with the highest iou will be selected.
Args:
mask_pos (Tensor): shape(b, n_max_boxes, h*w)
overlaps (Tensor): shape(b, n_max_boxes, h*w)
Return:
target_gt_idx (Tensor): shape(b, h*w)
fg_mask (Tensor): shape(b, h*w)
mask_pos (Tensor): shape(b, n_max_boxes, h*w)
"""
# b, n_max_boxes, 8400 -> b, 8400
fg_mask = mask_pos.sum(-2)
# If an anchor is assigned to predict multiple ground-truth boxes
if fg_mask.max() > 1:
# b, n_max_boxes, 8400
mask_multi_gts = (fg_mask.unsqueeze(1) > 1).repeat([1, n_max_boxes, 1])
# If an anchor is assigned to predict multiple ground-truth boxes, first find the ground-truth box this anchor overlaps most,
# then one-hot encode it
# b, 8400
max_overlaps_idx = overlaps.argmax(1)
# b, 8400, n_max_boxes
is_max_overlaps = F.one_hot(max_overlaps_idx, n_max_boxes)
# b, n_max_boxes, 8400
is_max_overlaps = is_max_overlaps.permute(0, 2, 1).to(overlaps.dtype)
# b, n_max_boxes, 8400
mask_pos = torch.where(mask_multi_gts, is_max_overlaps, mask_pos)
fg_mask = mask_pos.sum(-2)
# find which gt each anchor matches
target_gt_idx = mask_pos.argmax(-2) # (b, h*w)
return target_gt_idx, fg_mask, mask_pos
class TaskAlignedAssigner(nn.Module):
def __init__(self, topk=13, num_classes=80, alpha=1.0, beta=6.0, eps=1e-9, roll_out_thr=0):
super().__init__()
self.topk = topk
self.num_classes = num_classes
self.bg_idx = num_classes
self.alpha = alpha
self.beta = beta
self.eps = eps
# roll_out_thr is 64
self.roll_out_thr = roll_out_thr
@torch.no_grad()
def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
"""This code referenced to
https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py
Args:
pd_scores (Tensor) : shape(bs, num_total_anchors, num_classes)
pd_bboxes (Tensor) : shape(bs, num_total_anchors, 4)
anc_points (Tensor) : shape(num_total_anchors, 2)
gt_labels (Tensor) : shape(bs, n_max_boxes, 1)
gt_bboxes (Tensor) : shape(bs, n_max_boxes, 4)
mask_gt (Tensor) : shape(bs, n_max_boxes, 1)
Returns:
target_labels (Tensor) : shape(bs, num_total_anchors)
target_bboxes (Tensor) : shape(bs, num_total_anchors, 4)
target_scores (Tensor) : shape(bs, num_total_anchors, num_classes)
fg_mask (Tensor) : shape(bs, num_total_anchors)
"""
# get the batch size
self.bs = pd_scores.size(0)
# get the maximum number of ground-truth boxes
self.n_max_boxes = gt_bboxes.size(1)
# roll out if self.n_max_boxes is greater than self.roll_out_thr
self.roll_out = self.n_max_boxes > self.roll_out_thr if self.roll_out_thr else False
if self.n_max_boxes == 0:
device = gt_bboxes.device
return (torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), torch.zeros_like(pd_bboxes).to(device),
torch.zeros_like(pd_scores).to(device), torch.zeros_like(pd_scores[..., 0]).to(device),
torch.zeros_like(pd_scores[..., 0]).to(device))
# b, max_num_obj, 8400
# mask_pos anchor points that lie inside a ground-truth box, are among its top-k best-overlapping positives, and satisfy mask_gt
# align_metric the probability that an anchor point belongs to a ground-truth box's class, multiplied by the anchor's overlap with that box
# overlaps the overlap between every ground-truth box and every anchor point
mask_pos, align_metric, overlaps = self.get_pos_mask(pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt)
# target_gt_idx b, 8400 which gt each anchor matches
# fg_mask b, 8400 whether each anchor has a matching gt
# mask_pos b, max_num_obj, 8400 the one-hot form of target_gt_idx
target_gt_idx, fg_mask, mask_pos = select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes)
# assign the targets to their corresponding anchor points
# b, 8400
# b, 8400, 4
# b, 8400, 80
target_labels, target_bboxes, target_scores = self.get_targets(gt_labels, gt_bboxes, target_gt_idx, fg_mask)
# multiply by mask_pos to zero out anchor points that do not satisfy a ground-truth box
align_metric *= mask_pos
# the maximum score for each ground-truth box
# b, max_num_obj
pos_align_metrics = align_metric.amax(axis=-1, keepdim=True)
# the maximum overlap for each ground-truth box
# b, max_num_obj
pos_overlaps = (overlaps * mask_pos).amax(axis=-1, keepdim=True)
# multiply each ground-truth/anchor score by the maximum overlap, then divide by the maximum score
norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
# target_scores is used as the normalized label
target_scores = target_scores * norm_align_metric
return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx
def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt):
# pd_scores bs, num_total_anchors, num_classes
# pd_bboxes bs, num_total_anchors, 4
# gt_labels bs, n_max_boxes, 1
# gt_bboxes bs, n_max_boxes, 4
#
# align_metric is a computed cost: the probability that an anchor point belongs to a ground-truth box's class, multiplied by the anchor's overlap with that box
# overlaps is the overlap between an anchor point and the ground-truth box
# align_metric, overlaps bs, max_num_obj, 8400
align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes)
# A positive anchor point must simultaneously:
# 1. lie inside a ground-truth box
# 2. be among that ground-truth box's top-k best-overlapping positives
# 3. satisfy mask_gt
# get in_gts mask b, max_num_obj, 8400
# check whether each anchor point lies inside a ground-truth box
mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes, roll_out=self.roll_out)
# get topk_metric mask b, max_num_obj, 8400
# check whether each anchor point is among a ground-truth box's top k
mask_topk = self.select_topk_candidates(align_metric * mask_in_gts, topk_mask=mask_gt.repeat([1, 1, self.topk]).bool())
# merge all mask to a final mask, b, max_num_obj, h*w
# the ground-truth box exists (is not padding)
mask_pos = mask_topk * mask_in_gts * mask_gt
return mask_pos, align_metric, overlaps
def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes):
if self.roll_out:
align_metric = torch.empty((self.bs, self.n_max_boxes, pd_scores.shape[1]), device=pd_scores.device)
overlaps = torch.empty((self.bs, self.n_max_boxes, pd_scores.shape[1]), device=pd_scores.device)
ind_0 = torch.empty(self.n_max_boxes, dtype=torch.long)
for b in range(self.bs):
ind_0[:], ind_2 = b, gt_labels[b].squeeze(-1).long()
# get the scores belonging to this class
# bs, max_num_obj, 8400
bbox_scores = pd_scores[ind_0, :, ind_2]
# compute the CIoU between the ground-truth and predicted boxes
# bs, max_num_obj, 8400
overlaps[b] = bbox_iou(gt_bboxes[b].unsqueeze(1), pd_bboxes[b].unsqueeze(0), xywh=False, CIoU=True).squeeze(2).clamp(0)
align_metric[b] = bbox_scores.pow(self.alpha) * overlaps[b].pow(self.beta)
else:
# 2, b, max_num_obj
ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long)
# b, max_num_obj
# [0] holds which image the entry belongs to
ind[0] = torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes)
# [1] holds the ground-truth label
ind[1] = gt_labels.long().squeeze(-1)
# get the scores belonging to this class
# take the probability that an anchor point belongs to a given class
# b, max_num_obj, 8400
bbox_scores = pd_scores[ind[0], :, ind[1]]
# compute the CIoU between the ground-truth and predicted boxes
# bs, max_num_obj, 8400
overlaps = bbox_iou(gt_bboxes.unsqueeze(2), pd_bboxes.unsqueeze(1), xywh=False, CIoU=True).squeeze(3).clamp(0)
align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta)
return align_metric, overlaps
def select_topk_candidates(self, metrics, largest=True, topk_mask=None):
"""
Args:
metrics : (b, max_num_obj, h*w).
topk_mask : (b, max_num_obj, topk) or None
"""
# 8400
num_anchors = metrics.shape[-1]
# b, max_num_obj, topk
topk_metrics, topk_idxs = torch.topk(metrics, self.topk, dim=-1, largest=largest)
if topk_mask is None:
topk_mask = (topk_metrics.max(-1, keepdim=True) > self.eps).tile([1, 1, self.topk])
# b, max_num_obj, topk
topk_idxs[~topk_mask] = 0
# b, max_num_obj, topk, 8400 -> b, max_num_obj, 8400
# the is_in_topk obtained in this step is b, max_num_obj, 8400,
# representing the top-k anchor points for each ground-truth box
if self.roll_out:
is_in_topk = torch.empty(metrics.shape, dtype=torch.long, device=metrics.device)
for b in range(len(topk_idxs)):
is_in_topk[b] = F.one_hot(topk_idxs[b], num_anchors).sum(-2)
else:
is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(-2)
# check whether each anchor point is among a ground-truth box's top k
is_in_topk = torch.where(is_in_topk > 1, 0, is_in_topk)
return is_in_topk.to(metrics.dtype)
def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, fg_mask):
"""
Args:
gt_labels : (b, max_num_obj, 1)
gt_bboxes : (b, max_num_obj, 4)
target_gt_idx : (b, h*w)
fg_mask : (b, h*w)
"""
# used to index the ground-truth labels, (b, 1)
batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[..., None]
# b, h*w: indices of gt_labels / gt_bboxes after flattening
target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes
# b, h*w: read the labels after flattening
target_labels = gt_labels.long().flatten()[target_gt_idx]
# b, h*w, 4: read the boxes after flattening
target_bboxes = gt_bboxes.view(-1, 4)[target_gt_idx]
# assigned target scores
target_labels.clamp(0)
# one-hot encode into the form required for training.
target_scores = F.one_hot(target_labels, self.num_classes) # (b, h*w, 80)
fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes) # (b, h*w, 80)
target_scores = torch.where(fg_scores_mask > 0, target_scores, 0)
return target_labels, target_bboxes, target_scores
def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
# Returns Intersection over Union (IoU) of box1(1,4) to box2(n,4)
# Get the coordinates of bounding boxes
if xywh: # transform from xywh to xyxy
(x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
else: # x1, y1, x2, y2 = box1
b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
# Intersection area
inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * \
(b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp(0)
# Union Area
union = w1 * h1 + w2 * h2 - inter + eps
# IoU
iou = inter / union
if CIoU or DIoU or GIoU:
cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1) # convex (smallest enclosing box) width
ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1) # convex height
if CIoU or DIoU: # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
c2 = cw ** 2 + ch ** 2 + eps # convex diagonal squared
rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4 # center dist ** 2
if CIoU: # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
with torch.no_grad():
alpha = v / (v - iou + (1 + eps))
return iou - (rho2 / c2 + v * alpha) # CIoU
return iou - rho2 / c2 # DIoU
c_area = cw * ch + eps # convex area
return iou - (c_area - union) / c_area # GIoU https://arxiv.org/pdf/1902.09630.pdf
return iou # IoU
def bbox2dist(anchor_points, bbox, reg_max):
"""Transform bbox(xyxy) to dist(ltrb)."""
x1y1, x2y2 = torch.split(bbox, 2, -1)
return torch.cat((anchor_points - x1y1, x2y2 - anchor_points), -1).clamp(0, reg_max - 0.01) # dist (lt, rb)
class BboxLoss(nn.Module):
def __init__(self, reg_max=16, use_dfl=False):
super().__init__()
self.reg_max = reg_max
self.use_dfl = use_dfl
def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
# Compute the IoU loss
# weight is the confidence the label should carry in the loss: 0 is the minimum, 1 the maximum
weight = torch.masked_select(target_scores.sum(-1), fg_mask).unsqueeze(-1)
# compute the overlap between the predicted and ground-truth boxes
iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True)
# then take 1 - overlap, multiply by the confidence it should carry, sum, and average.
loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum
# Compute the DFL loss
if self.use_dfl:
target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max)
loss_dfl = self._df_loss(pred_dist[fg_mask].view(-1, self.reg_max + 1), target_ltrb[fg_mask]) * weight
loss_dfl = loss_dfl.sum() / target_scores_sum
else:
loss_dfl = torch.tensor(0.0).to(pred_dist.device)
return loss_iou, loss_dfl
@staticmethod
def _df_loss(pred_dist, target):
# Return sum of left and right DFL losses
# Distribution Focal Loss (DFL) proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
tl = target.long() # target left
tr = tl + 1 # target right
wl = tr - target # weight left
wr = 1 - wl # weight right
# A target generally does not fall exactly on an anchor point; it is some fractional xx.xx, which a single cross_entropy cannot fit directly with DFL.
# So treat it as distances to the two neighboring anchors of xx.xx: if the distance to the right/bottom anchor is small, wl is small and the left/top loss is small;
# if the distance to the left/top anchor is small, wr is small and the right/bottom loss is small.
return (F.cross_entropy(pred_dist, tl.view(-1), reduction="none").view(tl.shape) * wl +
F.cross_entropy(pred_dist, tr.view(-1), reduction="none").view(tl.shape) * wr).mean(-1, keepdim=True)
def xywh2xyxy(x):
"""
Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
top-left corner and (x2, y2) is the bottom-right corner.
Args:
x (np.ndarray) or (torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.
Returns:
y (np.ndarray) or (torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
"""
y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
y[..., 0] = x[..., 0] - x[..., 2] / 2 # top left x
y[..., 1] = x[..., 1] - x[..., 3] / 2 # top left y
y[..., 2] = x[..., 0] + x[..., 2] / 2 # bottom right x
y[..., 3] = x[..., 1] + x[..., 3] / 2 # bottom right y
return y
# Criterion class for computing training losses
class Loss:
def __init__(self, model):
self.bce = nn.BCEWithLogitsLoss(reduction='none')
self.stride = model.stride # model strides
self.nc = model.num_classes # number of classes
self.no = model.no
self.reg_max = model.reg_max
self.use_dfl = model.reg_max > 1
roll_out_thr = 64
self.assigner = TaskAlignedAssigner(topk=10,
num_classes=self.nc,
alpha=0.5,
beta=6.0,
roll_out_thr=roll_out_thr)
self.bbox_loss = BboxLoss(model.reg_max - 1, use_dfl=self.use_dfl)
self.proj = torch.arange(model.reg_max, dtype=torch.float)
def preprocess(self, targets, batch_size, scale_tensor):
if targets.shape[0] == 0:
out = torch.zeros(batch_size, 0, 5, device=targets.device)
else:
# get the image indices
i = targets[:, 0]
_, counts = i.unique(return_counts=True)
out = torch.zeros(batch_size, counts.max(), 5, device=targets.device)
# loop over the batch and fill in the values
for j in range(batch_size):
matches = i == j
n = matches.sum()
if n:
out[j, :n] = targets[matches, 1:]
# scale back to the original image size.
out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
return out
def bbox_decode(self, anchor_points, pred_dist):
if self.use_dfl:
# batch, anchors, channels
b, a, c = pred_dist.shape
# DFL decoding
pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.to(pred_dist.device).type(pred_dist.dtype))
# pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
# pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
# then decode to obtain the predicted boxes
return dist2bbox(pred_dist, anchor_points, xywh=False)
def __call__(self, preds, batch):
# get the device in use
device = preds[1].device
# losses for the three parts: box, cls, dfl
loss = torch.zeros(3, device=device)
# get the features and split them
feats = preds[2] if isinstance(preds, tuple) else preds
pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split((self.reg_max * 4, self.nc), 1)
# bs, num_classes + self.reg_max * 4 , 8400 => cls bs, num_classes, 8400;
# box bs, self.reg_max * 4, 8400
pred_scores = pred_scores.permute(0, 2, 1).contiguous()
pred_distri = pred_distri.permute(0, 2, 1).contiguous()
# get the batch size and dtype
dtype = pred_scores.dtype
batch_size = pred_scores.shape[0]
# get the input image size
imgsz = torch.tensor(feats[0].shape[2:], device=device, dtype=dtype) * self.stride[0]
# get the anchor points and the corresponding stride tensor
anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
# Pack the whole batch into one matrix:
# column 0: which image the box belongs to
# column 1: the class
# columns 2+: the box coordinates
targets = torch.cat((batch[:, 0].view(-1, 1), batch[:, 1].view(-1, 1), batch[:, 2:]), 1)
# Preliminary processing: pad the incoming gt to the maximum count and scale the box coordinates
# bs, max_boxes_num, 5
targets = self.preprocess(targets.to(device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
# bs, max_boxes_num, 5 => bs, max_boxes_num, 1 ; bs, max_boxes_num, 4
gt_labels, gt_bboxes = targets.split((1, 4), 2) # cls, xyxy
# determine which boxes hold real objects and which are padding
# bs, max_boxes_num
mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)
# pboxes
# decode the predictions to obtain the predicted boxes
# bs, 8400, 4
pred_bboxes = self.bbox_decode(anchor_points, pred_distri) # xyxy, (b, h*w, 4)
# assign the predicted boxes to the ground-truth boxes
# target_bboxes bs, 8400, 4
# target_scores bs, 8400, 80
# fg_mask bs, 8400
_, target_bboxes, target_scores, fg_mask, _ = self.assigner(
pred_scores.detach().sigmoid(), (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
anchor_points * stride_tensor, gt_labels, gt_bboxes, mask_gt
)
target_bboxes /= stride_tensor
target_scores_sum = max(target_scores.sum(), 1)
# compute the classification loss
# loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum # VFL way
loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum # BCE
# compute the bbox loss
if fg_mask.sum():
loss[0], loss[2] = self.bbox_loss(pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores,
target_scores_sum, fg_mask)
loss[0] *= 7.5 # box gain
loss[1] *= 0.5 # cls gain
loss[2] *= 1.5 # dfl gain
return loss.sum() # loss(box, cls, dfl) # * batch_size
def is_parallel(model):
# Returns True if model is of type DP or DDP
return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)
def de_parallel(model):
# De-parallelize a model: returns single-GPU model if model is of type DP or DDP
return model.module if is_parallel(model) else model
def copy_attr(a, b, include=(), exclude=()):
# Copy attributes from b to a, options to only include [...] and to exclude [...]
for k, v in b.__dict__.items():
if (len(include) and k not in include) or k.startswith('_') or k in exclude:
continue
else:
setattr(a, k, v)
class ModelEMA:
""" Updated Exponential Moving Average (EMA) from https://github.com/rwightman/pytorch-image-models
Keeps a moving average of everything in the model state_dict (parameters and buffers)
For EMA details see https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
"""
def __init__(self, model, decay=0.9999, tau=2000, updates=0):
# Create EMA
self.ema = deepcopy(de_parallel(model)).eval() # FP32 EMA
# if next(model.parameters()).device.type != 'cpu':
# self.ema.half() # FP16 EMA
self.updates = updates # number of EMA updates
self.decay = lambda x: decay * (1 - math.exp(-x / tau)) # decay exponential ramp (to help early epochs)
for p in self.ema.parameters():
p.requires_grad_(False)
def update(self, model):
# Update EMA parameters
with torch.no_grad():
self.updates += 1
d = self.decay(self.updates)
msd = de_parallel(model).state_dict() # model state_dict
for k, v in self.ema.state_dict().items():
if v.dtype.is_floating_point:
v *= d
v += (1 - d) * msd[k].detach()
def update_attr(self, model, include=(), exclude=('process_group', 'reducer')):
# Update EMA attributes
copy_attr(self.ema, model, include, exclude)
def weights_init(net, init_type='normal', init_gain = 0.02):
def init_func(m):
classname = m.__class__.__name__
if hasattr(m, 'weight') and classname.find('Conv') != -1:
if init_type == 'normal':
torch.nn.init.normal_(m.weight.data, 0.0, init_gain)
elif init_type == 'xavier':
torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain)
elif init_type == 'kaiming':
torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
elif init_type == 'orthogonal':
torch.nn.init.orthogonal_(m.weight.data, gain=init_gain)
else:
raise NotImplementedError('initialization method [%s] is not implemented' % init_type)
elif classname.find('BatchNorm2d') != -1:
torch.nn.init.normal_(m.weight.data, 1.0, 0.02)
torch.nn.init.constant_(m.bias.data, 0.0)
print('initialize network with %s type' % init_type)
net.apply(init_func)
def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio = 0.05, warmup_lr_ratio = 0.1, no_aug_iter_ratio = 0.05, step_num = 10):
def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters):
if iters <= warmup_total_iters:
# lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
lr = (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2
) + warmup_lr_start
elif iters >= total_iters - no_aug_iter:
lr = min_lr
else:
lr = min_lr + 0.5 * (lr - min_lr) * (
1.0
+ math.cos(
math.pi
* (iters - warmup_total_iters)
/ (total_iters - warmup_total_iters - no_aug_iter)
)
)
return lr
def step_lr(lr, decay_rate, step_size, iters):
if step_size < 1:
raise ValueError("step_size must above 1.")
n = iters // step_size
out_lr = lr * decay_rate ** n
return out_lr
if lr_decay_type == "cos":
warmup_total_iters = min(max(warmup_iters_ratio * total_iters, 1), 3)
warmup_lr_start = max(warmup_lr_ratio * lr, 1e-6)
no_aug_iter = min(max(no_aug_iter_ratio * total_iters, 1), 15)
func = partial(yolox_warm_cos_lr ,lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter)
else:
decay_rate = (min_lr / lr) ** (1 / (step_num - 1))
step_size = total_iters / step_num
func = partial(step_lr, lr, decay_rate, step_size)
return func
def set_optimizer_lr(optimizer, lr_scheduler_func, epoch):
lr = lr_scheduler_func(epoch)
for param_group in optimizer.param_groups:
param_group['lr'] = lr


@@ -1,186 +0,0 @@
#-----------------------------------------------------------------------#
# predict.py integrates single-image prediction, camera detection, FPS testing,
# and directory-sweep detection into one script; switch modes by setting `mode`.
#-----------------------------------------------------------------------#
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import time
import cv2
import numpy as np
from PIL import Image
from yolo import YOLO
if __name__ == "__main__":
yolo = YOLO()
#----------------------------------------------------------------------------------------------------------#
# mode specifies the test mode:
# 'predict' single-image prediction. To modify the prediction flow, e.g. saving images or cropping objects, read the detailed comments below first
# 'video' video detection; detect from a camera or a video file, see the notes below.
# 'fps' FPS testing, using img/street.jpg; see the notes below.
# 'dir_predict' sweep a folder, detect, and save. Reads the img folder and saves to img_out by default; see the notes below.
# 'heatmap' heatmap visualization of the prediction results; see the notes below.
# 'export_onnx' export the model to onnx; requires pytorch 1.7.1 or above.
#----------------------------------------------------------------------------------------------------------#
mode = "predict"
#-------------------------------------------------------------------------#
# crop whether to crop the detected objects after single-image prediction
# count whether to count the detected objects
# crop and count only take effect when mode='predict'
#-------------------------------------------------------------------------#
crop = False
count = False
#----------------------------------------------------------------------------------------------------------#
# video_path the path of the video; video_path=0 means detect from the camera.
# To detect a video file, set e.g. video_path = "xxx.mp4" to read xxx.mp4 from the root directory.
# video_save_path where to save the video; video_save_path="" means do not save.
# To save the video, set e.g. video_save_path = "yyy.mp4" to save yyy.mp4 in the root directory.
# video_fps the fps of the saved video
#
# video_path, video_save_path and video_fps only take effect when mode='video'
# When saving a video, exit with ctrl+c or run to the last frame to complete the save.
#----------------------------------------------------------------------------------------------------------#
video_path = 0
video_save_path = ""
video_fps = 25.0
#----------------------------------------------------------------------------------------------------------#
# test_interval how many times the image is detected when measuring fps; in theory, a larger test_interval gives a more accurate fps.
# fps_image_path the image used for the fps test
#
# test_interval and fps_image_path only take effect when mode='fps'
#----------------------------------------------------------------------------------------------------------#
test_interval = 100
fps_image_path = "img/street.jpg"
#-------------------------------------------------------------------------#
# dir_origin_path the folder of images to detect
# dir_save_path where to save the detected images
#
# dir_origin_path and dir_save_path only take effect when mode='dir_predict'
#-------------------------------------------------------------------------#
dir_origin_path = "img"
dir_save_path = "img_out"
#-------------------------------------------------------------------------#
# heatmap_save_path where to save the heatmap, under model_data by default
#
# heatmap_save_path only takes effect when mode='heatmap'
#-------------------------------------------------------------------------#
heatmap_save_path = "model_data/heatmap_vision.png"
#-------------------------------------------------------------------------#
# simplify whether to use Simplify onnx
# onnx_save_path where to save the onnx model
#-------------------------------------------------------------------------#
simplify = False
onnx_save_path = "model_data/models.onnx"
if mode == "predict":
'''
1. To save the detected image, use r_image.save("img.jpg"); modify this directly in predict.py.
2. To get the coordinates of the prediction boxes, go into yolo.detect_image and read top, left, bottom, right in the drawing section.
3. To crop the targets using the prediction boxes, go into yolo.detect_image and use the obtained top, left, bottom, right
values to slice the original image as a matrix.
4. To write extra text on the prediction image, such as the count of a specific detected class, go into yolo.detect_image and check predicted_class in the drawing section.
For example, if predicted_class == 'car': checks whether the current target is a car; keep a count and write it with draw.text.
'''
while True:
img = input('Input image filename:')
try:
image = Image.open(img)
except:
print('Open Error! Try again!')
continue
else:
r_image = yolo.detect_image(image, crop = crop, count=count)
elif mode == "video":
capture = cv2.VideoCapture(video_path)
if video_save_path!="":
fourcc = cv2.VideoWriter_fourcc(*'XVID')
size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)
ref, frame = capture.read()
if not ref:
raise ValueError("未能正确读取摄像头(视频),请注意是否正确安装摄像头(是否正确填写视频路径)。")
fps = 0.0
while(True):
t1 = time.time()
# read one frame
ref, frame = capture.read()
if not ref:
break
# convert the format: BGR to RGB
frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
# convert to a PIL Image
frame = Image.fromarray(np.uint8(frame))
# run detection
frame = np.array(yolo.detect_image(frame))
# RGB to BGR, to match OpenCV's display format
frame = cv2.cvtColor(frame,cv2.COLOR_RGB2BGR)
fps = ( fps + (1./(time.time()-t1)) ) / 2
print("fps= %.2f"%(fps))
frame = cv2.putText(frame, "fps= %.2f"%(fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow("video",frame)
c= cv2.waitKey(1) & 0xff
if video_save_path!="":
out.write(frame)
if c==27:
capture.release()
break
print("Video Detection Done!")
capture.release()
if video_save_path!="":
print("Save processed video to the path :" + video_save_path)
out.release()
cv2.destroyAllWindows()
elif mode == "fps":
img = Image.open(fps_image_path)
tact_time = yolo.get_FPS(img, test_interval)
print(str(tact_time) + ' seconds, ' + str(1/tact_time) + 'FPS, @batch_size 1')
elif mode == "dir_predict":
import os
from tqdm import tqdm
img_names = os.listdir(dir_origin_path)
for img_name in tqdm(img_names):
if img_name.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
image_path = os.path.join(dir_origin_path, img_name)
image = Image.open(image_path)
r_image,predicted_class_list = yolo.detect_image(image)
if not os.path.exists(dir_save_path):
os.makedirs(dir_save_path)
print("+++++++++++++++++++++")
print(predicted_class_list)
for tag in ["wall", "wall_shenshui", "wall_konggu", "wall_konggu_gap", "wall_kailie"]:
if tag in predicted_class_list:
r_image.save(os.path.join(dir_save_path, img_name.replace(".jpg", ".png")), quality=95, subsampling=0)
# if "wall_shenshui" in predicted_class_list:
# r_image.save(os.path.join(dir_save_path, img_name.replace(".jpg", ".png")), quality=95, subsampling=0)
elif mode == "heatmap":
while True:
img = input('Input image filename:')
try:
image = Image.open(img)
except Exception:
print('Open Error! Try again!')
continue
else:
yolo.detect_heatmap(image, heatmap_save_path)
elif mode == "export_onnx":
yolo.convert_to_onnx(simplify, onnx_save_path)
else:
raise AssertionError("Please specify the correct mode: 'predict', 'video', 'fps', 'heatmap', 'export_onnx', 'dir_predict'.")

View File

@ -1,32 +0,0 @@
#--------------------------------------------#
#   This script inspects the network structure
#   and its FLOPs / parameter counts.
#--------------------------------------------#
import torch
from thop import clever_format, profile
from nets.yolo import YoloBody
if __name__ == "__main__":
input_shape = [640, 640]
anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
num_classes = 80
phi = 's'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
m = YoloBody(input_shape, num_classes, phi, False).to(device)
for i in m.children():
print(i)
print('==============================')
dummy_input = torch.randn(1, 3, input_shape[0], input_shape[1]).to(device)
flops, params = profile(m.to(device), (dummy_input, ), verbose=False)
#--------------------------------------------------------#
#   flops * 2 because profile does not count a convolution
#   as two operations. Some papers count a conv as both a
#   multiply and an add (then multiply by 2); others count
#   only the multiplications and ignore the adds (then do
#   not). This code multiplies by 2, following YOLOX.
#--------------------------------------------------------#
#--------------------------------------------------------#
flops = flops * 2
flops, params = clever_format([flops, params], "%.3f")
print('Total GFLOPS: %s' % (flops))
print('Total params: %s' % (params))
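# Illustrative sanity check for the *2 convention above (not part of the original script):
# a single Conv2d(3, 16, kernel_size=3) on a 640x640 input performs roughly
# 3*3*3 * 16 * 640*640 ≈ 1.77e8 multiply-accumulates, i.e. ≈ 3.54e8 FLOPs once doubled.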

View File

@ -1,565 +0,0 @@
#-------------------------------------#
#   Train the model on the dataset
#-------------------------------------#
import datetime
import os
from functools import partial
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from nets.yolo import YoloBody
from nets.yolo_training import (Loss, ModelEMA, get_lr_scheduler,
set_optimizer_lr, weights_init)
from utils.callbacks import EvalCallback, LossHistory
from utils.dataloader import YoloDataset, yolo_dataset_collate
from utils.utils import (download_weights, get_classes, seed_everything,
show_config, worker_init_fn)
from utils.utils_fit import fit_one_epoch
'''
Things to check before training your own object-detection model:
1. Make sure your dataset is in the required VOC format: input images plus labels.
   Input images are .jpg files of any size; they are resized automatically before training.
   Grayscale images are converted to RGB automatically, no changes needed.
   If your images are not .jpg, batch-convert them to .jpg before training.
   Labels are .xml files describing the target objects, one label file per input image.
2. The loss value is used to judge convergence; what matters is the trend, i.e. the validation
   loss keeps decreasing. If the validation loss barely changes, the model has mostly converged.
   The absolute loss value means little by itself (it only depends on how the loss is computed
   and does not need to be near 0); to make the numbers look nicer you could divide by 10000
   inside the loss function.
   Training losses are saved under logs/loss_%Y_%m_%d_%H_%M_%S.
3. Trained weights are saved in the logs folder. Each epoch contains several steps, and each
   step performs one gradient-descent update.
   Weights are not saved after only a few steps; keep the Epoch/Step distinction in mind.
'''
if __name__ == "__main__":
#---------------------------------#
#   Cuda    whether to use CUDA
#           set to False if there is no GPU
#---------------------------------#
Cuda = True
#----------------------------------------------#
#   seed    fixes the random seed so that
#           independent runs are reproducible
#----------------------------------------------#
seed = 11
#---------------------------------------------------------------------#
#   distributed     whether to use single-machine multi-GPU distributed training.
#                   The terminal commands below are Ubuntu-only; CUDA_VISIBLE_DEVICES
#                   selects GPUs on Ubuntu. On Windows, DP mode is used by default
#                   with all GPUs; DDP is not supported.
#   DP mode:
#       set             distributed = False
#       in a terminal:  CUDA_VISIBLE_DEVICES=0,1 python train.py
#   DDP mode:
#       set             distributed = True
#       in a terminal:  CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train.py
#---------------------------------------------------------------------#
distributed = False
#---------------------------------------------------------------------#
#   sync_bn     whether to use SyncBatchNorm (DDP multi-GPU only)
#---------------------------------------------------------------------#
sync_bn = False
#---------------------------------------------------------------------#
#   fp16        whether to use mixed-precision training;
#               roughly halves memory usage, requires PyTorch >= 1.7.1
#---------------------------------------------------------------------#
fp16 = True
#---------------------------------------------------------------------#
#   classes_path    points to a txt under model_data, tied to your dataset;
#                   you must change classes_path to match your dataset before training
#---------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'
#----------------------------------------------------------------------------------------------------------------------------#
#   See the README for weight downloads (a network drive is available). The pretrained weights are dataset-agnostic,
#   because the learned features are generic.
#   The important part of the pretrained weights is the backbone, used for feature extraction.
#   Pretrained weights are needed in 99% of cases; without them the backbone weights are too random, feature
#   extraction is poor, and training results will be bad.
#
#   If training was interrupted, set model_path to a weights file under logs to reload the partially trained
#   weights, and adjust the freeze/unfreeze stage parameters below to keep the epoch count consistent.
#
#   When model_path = '', no whole-model weights are loaded.
#
#   Here the whole model's weights are used, so they are loaded in train.py.
#   To train from scratch, set model_path = '' and Freeze_Train = False below; training then starts from zero
#   with no backbone freezing.
#
#   In general, training from scratch gives poor results because the weights are too random and feature
#   extraction is ineffective, so it is very, very strongly discouraged. Two options if you must:
#   1. Rely on the strong Mosaic augmentation: with UnFreeze_Epoch large (300+), a large batch (16+), and lots
#      of data (tens of thousands of images+), set mosaic=True and train from randomly initialised weights;
#      results are still worse than with pretraining. (Large datasets like COCO can do this.)
#   2. First train a classifier on ImageNet to obtain backbone weights; the classifier backbone is shared with
#      this model, then train from those.
#----------------------------------------------------------------------------------------------------------------------------#
model_path = 'model_data/yolov8_l_backbone_weights.pth'
#------------------------------------------------------#
#   input_shape     input size; must be a multiple of 32
#------------------------------------------------------#
input_shape = [640, 640]
#------------------------------------------------------#
#   phi     which YOLOv8 variant to use
#           n : yolov8_n
#           s : yolov8_s
#           m : yolov8_m
#           l : yolov8_l
#           x : yolov8_x
#------------------------------------------------------#
phi = 'l'
#----------------------------------------------------------------------------------------------------------------------------#
#   pretrained  whether to use backbone-only pretrained weights, loaded when the model is built.
#               If model_path is set, the backbone weights need not be loaded separately and pretrained is meaningless.
#               If model_path is not set and pretrained = True, only the backbone is loaded before training.
#               If model_path is not set, pretrained = False and Freeze_Train = False, training starts from
#               scratch with no backbone freezing.
#----------------------------------------------------------------------------------------------------------------------------#
pretrained = True
#------------------------------------------------------------------#
#   mosaic              Mosaic data augmentation.
#   mosaic_prob         probability of applying mosaic on each step, default 50%.
#
#   mixup               whether to use mixup augmentation; only effective when mosaic=True.
#                       mixup is only applied to mosaic-augmented images.
#   mixup_prob          probability of applying mixup after mosaic, default 50%.
#                       The overall mixup probability is mosaic_prob * mixup_prob.
#
#   special_aug_ratio   following YOLOX: mosaic images are far from the real
#                       distribution of natural images, so mosaic is only enabled
#                       within the first special_aug_ratio of training. Default is
#                       the first 70% of epochs (70 out of 100).
#------------------------------------------------------------------#
mosaic = True
mosaic_prob = 0.5
mixup = False
mixup_prob = 0.5
special_aug_ratio = 0.7
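#------------------------------------------------------------------#
#   Worked example with the defaults above (illustrative): mosaic
#   fires on about 0.5 of steps and mixup on about 0.5 * 0.5 = 0.25
#   of steps, but only during the first 70% of epochs
#   (special_aug_ratio); after that, neither is applied.
#------------------------------------------------------------------#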
#------------------------------------------------------------------#
#   label_smoothing     label smoothing; usually below 0.01, e.g. 0.01 or 0.005.
#------------------------------------------------------------------#
label_smoothing = 0.005
#----------------------------------------------------------------------------------------------------------------------------#
#   Training has two phases: a frozen phase and an unfrozen phase. The frozen phase exists for users with
#   limited hardware, since it needs less GPU memory. On a very weak GPU you can set Freeze_Epoch equal to
#   UnFreeze_Epoch and Freeze_Train = True to do frozen training only.
#
#   Some suggested settings; adjust them to your own needs:
#   (1) Training from whole-model pretrained weights:
#       Adam:
#           Init_Epoch = 0, Freeze_Epoch = 50, UnFreeze_Epoch = 100, Freeze_Train = True, optimizer_type = 'adam', Init_lr = 1e-3, weight_decay = 0. (frozen)
#           Init_Epoch = 0, UnFreeze_Epoch = 100, Freeze_Train = False, optimizer_type = 'adam', Init_lr = 1e-3, weight_decay = 0. (not frozen)
#       SGD:
#           Init_Epoch = 0, Freeze_Epoch = 50, UnFreeze_Epoch = 300, Freeze_Train = True, optimizer_type = 'sgd', Init_lr = 1e-2, weight_decay = 5e-4. (frozen)
#           Init_Epoch = 0, UnFreeze_Epoch = 300, Freeze_Train = False, optimizer_type = 'sgd', Init_lr = 1e-2, weight_decay = 5e-4. (not frozen)
#       UnFreeze_Epoch can be tuned between 100 and 300.
#   (2) Training from scratch:
#       Init_Epoch = 0, UnFreeze_Epoch >= 300, Unfreeze_batch_size >= 16, Freeze_Train = False (no freezing).
#       UnFreeze_Epoch should be at least 300; optimizer_type = 'sgd', Init_lr = 1e-2, mosaic = True.
#   (3) Setting batch_size:
#       As large as your GPU allows. Running out of memory is unrelated to dataset size; if you get an OOM /
#       CUDA out of memory error, lower batch_size.
#       Because of the BatchNorm layers, batch_size must be at least 2, never 1.
#       Normally Freeze_batch_size should be 1-2x Unfreeze_batch_size; avoid a large gap, since it affects
#       the automatic learning-rate adjustment.
#----------------------------------------------------------------------------------------------------------------------------#
#------------------------------------------------------------------#
#   Frozen-phase training parameters
#   The backbone is frozen, so the feature-extraction network does not
#   change; memory usage is lower and only the rest of the network is tuned.
#   Init_Epoch          the epoch to start from; it may exceed Freeze_Epoch, e.g.
#                       Init_Epoch = 60, Freeze_Epoch = 50, UnFreeze_Epoch = 100
#                       skips the frozen phase, starts at epoch 60 and adjusts
#                       the learning rate accordingly (used when resuming).
#   Freeze_Epoch        how many epochs to train with the backbone frozen
#                       (ignored when Freeze_Train=False)
#   Freeze_batch_size   batch size during frozen training
#                       (ignored when Freeze_Train=False)
#------------------------------------------------------------------#
Init_Epoch = 0
Freeze_Epoch = 20
Freeze_batch_size = 64
#------------------------------------------------------------------#
#   Unfrozen-phase training parameters
#   The backbone is no longer frozen, so the feature-extraction network
#   changes; memory usage is higher and all parameters are updated.
#   UnFreeze_Epoch          total number of training epochs;
#                           SGD converges more slowly, so use a larger
#                           UnFreeze_Epoch; Adam can use a smaller one.
#   Unfreeze_batch_size     batch size after unfreezing
#------------------------------------------------------------------#
UnFreeze_Epoch = 100
Unfreeze_batch_size = 32
#------------------------------------------------------------------#
#   Freeze_Train    whether to do frozen training first;
#                   default: freeze the backbone first, then unfreeze.
#------------------------------------------------------------------#
Freeze_Train = True
#------------------------------------------------------------------#
#   Other training parameters: learning rate, optimizer, lr schedule
#------------------------------------------------------------------#
#------------------------------------------------------------------#
#   Init_lr     maximum learning rate
#   Min_lr      minimum learning rate, default 0.01 * Init_lr
#------------------------------------------------------------------#
Init_lr = 1e-2
Min_lr = Init_lr * 0.01
#------------------------------------------------------------------#
#   optimizer_type  optimizer to use: adam or sgd
#                   with Adam, set Init_lr=1e-3
#                   with SGD,  set Init_lr=1e-2
#   momentum        momentum parameter used inside the optimizer
#   weight_decay    weight decay; guards against overfitting.
#                   Adam interacts badly with weight decay, so set it
#                   to 0 when using adam.
#------------------------------------------------------------------#
optimizer_type = "sgd"
momentum = 0.937
weight_decay = 5e-4
#------------------------------------------------------------------#
#   lr_decay_type   learning-rate schedule: 'step' or 'cos'
#------------------------------------------------------------------#
lr_decay_type = "cos"
#------------------------------------------------------------------#
#   save_period     save the weights every save_period epochs
#------------------------------------------------------------------#
save_period = 10
#------------------------------------------------------------------#
#   save_dir        folder where weights and logs are saved
#------------------------------------------------------------------#
save_dir = 'logs'
#------------------------------------------------------------------#
#   eval_flag       whether to evaluate on the validation set during training;
#                   install pycocotools for a better evaluation experience.
#   eval_period     evaluate every eval_period epochs; frequent evaluation is
#                   not recommended since it is time-consuming and slows
#                   training down a lot.
#                   The mAP obtained here differs from get_map.py for two reasons:
#                   (1) here the mAP is computed on the validation set;
#                   (2) the evaluation settings here are conservative, to keep it fast.
#------------------------------------------------------------------#
eval_flag = True
eval_period = 5
#------------------------------------------------------------------#
#   num_workers     number of worker processes for data loading;
#                   speeds up data loading but uses more memory.
#                   Set to 2 or 0 on machines with little memory.
#------------------------------------------------------------------#
num_workers = 8
#------------------------------------------------------#
#   train_annotation_path   training image paths and labels
#   val_annotation_path     validation image paths and labels
#------------------------------------------------------#
train_annotation_path = '2007_train.txt'
val_annotation_path = '2007_val.txt'
seed_everything(seed)
#------------------------------------------------------#
#   Select the GPUs to use
#------------------------------------------------------#
ngpus_per_node = torch.cuda.device_count()
if distributed:
dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
rank = int(os.environ["RANK"])
device = torch.device("cuda", local_rank)
if local_rank == 0:
print(f"[{os.getpid()}] (rank = {rank}, local_rank = {local_rank}) training...")
print("Gpu Device Count : ", ngpus_per_node)
else:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
local_rank = 0
rank = 0
#------------------------------------------------------#
#   Get the classes
#------------------------------------------------------#
class_names, num_classes = get_classes(classes_path)
#----------------------------------------------------#
#   Download the pretrained weights
#----------------------------------------------------#
if pretrained:
if distributed:
if local_rank == 0:
download_weights(phi)
dist.barrier()
else:
download_weights(phi)
#------------------------------------------------------#
#   Build the YOLO model
#------------------------------------------------------#
model = YoloBody(input_shape, num_classes, phi, pretrained=pretrained)
if model_path != '':
#------------------------------------------------------#
#   See the README for weight downloads (Baidu netdisk)
#------------------------------------------------------#
if local_rank == 0:
print('Load weights {}.'.format(model_path))
#------------------------------------------------------#
#   Load the weights by matching keys between the
#   pretrained state dict and the model's state dict
#------------------------------------------------------#
model_dict = model.state_dict()
pretrained_dict = torch.load(model_path, map_location = device)
load_key, no_load_key, temp_dict = [], [], {}
for k, v in pretrained_dict.items():
if k in model_dict.keys() and np.shape(model_dict[k]) == np.shape(v):
temp_dict[k] = v
load_key.append(k)
else:
no_load_key.append(k)
model_dict.update(temp_dict)
model.load_state_dict(model_dict)
#------------------------------------------------------#
#   Show the keys that failed to match
#------------------------------------------------------#
if local_rank == 0:
print("\nSuccessful Load Key:", str(load_key)[:500], "……\nSuccessful Load Key Num:", len(load_key))
print("\nFail To Load Key:", str(no_load_key)[:500], "……\nFail To Load Key num:", len(no_load_key))
print("\n\033[1;33;44m温馨提示head部分没有载入是正常现象Backbone部分没有载入是错误的。\033[0m")
#----------------------#
#   Build the loss
#----------------------#
yolo_loss = Loss(model)
#----------------------#
#   Record the loss
#----------------------#
if local_rank == 0:
time_str = datetime.datetime.strftime(datetime.datetime.now(),'%Y_%m_%d_%H_%M_%S')
log_dir = os.path.join(save_dir, "loss_" + str(time_str))
loss_history = LossHistory(log_dir, model, input_shape=input_shape)
else:
loss_history = None
#------------------------------------------------------------------#
#   torch 1.2 does not support amp; use torch >= 1.7.1 for correct
#   fp16 training, which is why torch 1.2 reports
#   "could not be resolved" here
#------------------------------------------------------------------#
if fp16:
from torch.cuda.amp import GradScaler as GradScaler
scaler = GradScaler()
else:
scaler = None
model_train = model.train()
#----------------------------#
#   Multi-GPU SyncBatchNorm
#----------------------------#
if sync_bn and ngpus_per_node > 1 and distributed:
model_train = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model_train)
elif sync_bn:
print("Sync_bn is not support in one gpu or not distributed.")
if Cuda:
if distributed:
#----------------------------#
#   Multi-GPU parallel training
#----------------------------#
model_train = model_train.cuda(local_rank)
model_train = torch.nn.parallel.DistributedDataParallel(model_train, device_ids=[local_rank], find_unused_parameters=True)
else:
model_train = torch.nn.DataParallel(model)
cudnn.benchmark = True
model_train = model_train.cuda()
#----------------------------#
#   Weight EMA (smoothing)
#----------------------------#
ema = ModelEMA(model_train)
#---------------------------#
#   Read the dataset txt files
#---------------------------#
with open(train_annotation_path, encoding='utf-8') as f:
train_lines = f.readlines()
with open(val_annotation_path, encoding='utf-8') as f:
val_lines = f.readlines()
num_train = len(train_lines)
num_val = len(val_lines)
if local_rank == 0:
show_config(
classes_path = classes_path, model_path = model_path, input_shape = input_shape, \
Init_Epoch = Init_Epoch, Freeze_Epoch = Freeze_Epoch, UnFreeze_Epoch = UnFreeze_Epoch, Freeze_batch_size = Freeze_batch_size, Unfreeze_batch_size = Unfreeze_batch_size, Freeze_Train = Freeze_Train, \
Init_lr = Init_lr, Min_lr = Min_lr, optimizer_type = optimizer_type, momentum = momentum, lr_decay_type = lr_decay_type, \
save_period = save_period, save_dir = save_dir, num_workers = num_workers, num_train = num_train, num_val = num_val
)
#---------------------------------------------------------#
#   Total epochs:  number of passes over the whole dataset.
#   Total steps:   total number of gradient-descent updates.
#   Each epoch contains several steps, one update per step.
#   Only a minimum number of total steps is suggested here,
#   with no upper bound; only the unfrozen part is counted.
#----------------------------------------------------------#
wanted_step = 5e4 if optimizer_type == "sgd" else 1.5e4
total_step = num_train // Unfreeze_batch_size * UnFreeze_Epoch
if total_step <= wanted_step:
if num_train // Unfreeze_batch_size == 0:
raise ValueError('The dataset is too small to train on; please add more data.')
wanted_epoch = wanted_step // (num_train // Unfreeze_batch_size) + 1
print("\n\033[1;33;44m[Warning] 使用%s优化器时,建议将训练总步长设置到%d以上。\033[0m"%(optimizer_type, wanted_step))
print("\033[1;33;44m[Warning] 本次运行的总训练数据量为%dUnfreeze_batch_size为%d,共训练%d个Epoch计算出总训练步长为%d\033[0m"%(num_train, Unfreeze_batch_size, UnFreeze_Epoch, total_step))
print("\033[1;33;44m[Warning] 由于总训练步长为%d,小于建议总步长%d,建议设置总世代为%d\033[0m"%(total_step, wanted_step, wanted_epoch))
#------------------------------------------------------#
#   Backbone features are generic; freezing the backbone
#   speeds up training and protects the weights early on.
#   Init_Epoch      starting epoch
#   Freeze_Epoch    epochs of frozen training
#   UnFreeze_Epoch  total training epochs
#   If you hit OOM / insufficient GPU memory, lower Batch_size
#------------------------------------------------------#
if True:
UnFreeze_flag = False
#------------------------------------#
#   Freeze part of the model for training
#------------------------------------#
if Freeze_Train:
for param in model.backbone.parameters():
param.requires_grad = False
#-------------------------------------------------------------------#
#   Without frozen training, batch_size is simply Unfreeze_batch_size
#-------------------------------------------------------------------#
batch_size = Freeze_batch_size if Freeze_Train else Unfreeze_batch_size
#-------------------------------------------------------------------#
#   Adapt the learning rate to the current batch_size
#-------------------------------------------------------------------#
nbs = 64
lr_limit_max = 1e-3 if optimizer_type == 'adam' else 5e-2
lr_limit_min = 3e-4 if optimizer_type == 'adam' else 5e-4
Init_lr_fit = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
Min_lr_fit = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
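#-------------------------------------------------------------------#
#   Worked example of the scaling rule above (illustrative): with
#   optimizer_type = 'sgd', Init_lr = 1e-2 and nbs = 64,
#   batch_size = 64 gives Init_lr_fit = 1e-2, while batch_size = 32
#   gives 32/64 * 1e-2 = 5e-3; both lie inside [5e-4, 5e-2], so no
#   clipping is applied.
#-------------------------------------------------------------------#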
#---------------------------------------#
#   Pick the optimizer based on optimizer_type
#---------------------------------------#
pg0, pg1, pg2 = [], [], []
for k, v in model.named_modules():
if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter):
pg2.append(v.bias)
if isinstance(v, nn.BatchNorm2d) or "bn" in k:
pg0.append(v.weight)
elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter):
pg1.append(v.weight)
optimizer = {
'adam' : optim.Adam(pg0, Init_lr_fit, betas = (momentum, 0.999)),
'sgd' : optim.SGD(pg0, Init_lr_fit, momentum = momentum, nesterov=True)
}[optimizer_type]
optimizer.add_param_group({"params": pg1, "weight_decay": weight_decay})
optimizer.add_param_group({"params": pg2})
#---------------------------------------#
#   Get the learning-rate schedule
#---------------------------------------#
lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
#---------------------------------------#
#   Compute the number of steps per epoch
#---------------------------------------#
epoch_step = num_train // batch_size
epoch_step_val = num_val // batch_size
if epoch_step == 0 or epoch_step_val == 0:
raise ValueError("数据集过小,无法继续进行训练,请扩充数据集。")
if ema:
ema.updates = epoch_step * Init_Epoch
#---------------------------------------#
#   Build the dataset loaders
#---------------------------------------#
train_dataset = YoloDataset(train_lines, input_shape, num_classes, epoch_length=UnFreeze_Epoch, \
mosaic=mosaic, mixup=mixup, mosaic_prob=mosaic_prob, mixup_prob=mixup_prob, train=True, special_aug_ratio=special_aug_ratio)
val_dataset = YoloDataset(val_lines, input_shape, num_classes, epoch_length=UnFreeze_Epoch, \
mosaic=False, mixup=False, mosaic_prob=0, mixup_prob=0, train=False, special_aug_ratio=0)
if distributed:
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True,)
val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False,)
batch_size = batch_size // ngpus_per_node
shuffle = False
else:
train_sampler = None
val_sampler = None
shuffle = True
gen = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler,
worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
gen_val = DataLoader(val_dataset , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler,
worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
#----------------------#
#   Record the eval mAP curve
#----------------------#
if local_rank == 0:
eval_callback = EvalCallback(model, input_shape, class_names, num_classes, val_lines, log_dir, Cuda, \
eval_flag=eval_flag, period=eval_period)
else:
eval_callback = None
#---------------------------------------#
#   Start training
#---------------------------------------#
for epoch in range(Init_Epoch, UnFreeze_Epoch):
#---------------------------------------#
#   If part of the model was frozen,
#   unfreeze it and update the parameters
#---------------------------------------#
if epoch >= Freeze_Epoch and not UnFreeze_flag and Freeze_Train:
batch_size = Unfreeze_batch_size
#-------------------------------------------------------------------#
#   Adapt the learning rate to the current batch_size
#-------------------------------------------------------------------#
nbs = 64
lr_limit_max = 1e-3 if optimizer_type == 'adam' else 5e-2
lr_limit_min = 3e-4 if optimizer_type == 'adam' else 5e-4
Init_lr_fit = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max)
Min_lr_fit = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2)
#---------------------------------------#
#   Get the learning-rate schedule
#---------------------------------------#
lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch)
for param in model.backbone.parameters():
param.requires_grad = True
epoch_step = num_train // batch_size
epoch_step_val = num_val // batch_size
if epoch_step == 0 or epoch_step_val == 0:
raise ValueError("数据集过小,无法继续进行训练,请扩充数据集。")
if ema:
ema.updates = epoch_step * epoch
if distributed:
batch_size = batch_size // ngpus_per_node
gen = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler,
worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
gen_val = DataLoader(val_dataset , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True,
drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler,
worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed))
UnFreeze_flag = True
gen.dataset.epoch_now = epoch
gen_val.dataset.epoch_now = epoch
if distributed:
train_sampler.set_epoch(epoch)
set_optimizer_lr(optimizer, lr_scheduler_func, epoch)
fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, fp16, scaler, save_period, save_dir, local_rank)
if distributed:
dist.barrier()
if local_rank == 0:
loss_history.writer.close()

View File

@ -1 +0,0 @@
#

View File

@ -1,230 +0,0 @@
import datetime
import os
import torch
import matplotlib
matplotlib.use('Agg')
import scipy.signal
from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import shutil
import numpy as np
from PIL import Image
from tqdm import tqdm
from .utils import cvtColor, preprocess_input, resize_image
from .utils_bbox import DecodeBox
from .utils_map import get_coco_map, get_map
class LossHistory():
def __init__(self, log_dir, model, input_shape):
self.log_dir = log_dir
self.losses = []
self.val_loss = []
os.makedirs(self.log_dir)
self.writer = SummaryWriter(self.log_dir)
# try:
# dummy_input = torch.randn(2, 3, input_shape[0], input_shape[1])
# self.writer.add_graph(model, dummy_input)
# except:
# pass
def append_loss(self, epoch, loss, val_loss):
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.losses.append(loss)
self.val_loss.append(val_loss)
with open(os.path.join(self.log_dir, "epoch_loss.txt"), 'a') as f:
f.write(str(loss))
f.write("\n")
with open(os.path.join(self.log_dir, "epoch_val_loss.txt"), 'a') as f:
f.write(str(val_loss))
f.write("\n")
self.writer.add_scalar('loss', loss, epoch)
self.writer.add_scalar('val_loss', val_loss, epoch)
self.loss_plot()
def loss_plot(self):
iters = range(len(self.losses))
plt.figure()
plt.plot(iters, self.losses, 'red', linewidth = 2, label='train loss')
plt.plot(iters, self.val_loss, 'coral', linewidth = 2, label='val loss')
# try:
# if len(self.losses) < 25:
# num = 5
# else:
# num = 15
# plt.plot(iters, scipy.signal.savgol_filter(self.losses, num, 3), 'green', linestyle = '--', linewidth = 2, label='smooth train loss')
# plt.plot(iters, scipy.signal.savgol_filter(self.val_loss, num, 3), '#8B4513', linestyle = '--', linewidth = 2, label='smooth val loss')
# except:
# pass
plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc="upper right")
plt.savefig(os.path.join(self.log_dir, "epoch_loss.png"))
plt.cla()
plt.close("all")
class EvalCallback():
def __init__(self, net, input_shape, class_names, num_classes, val_lines, log_dir, cuda, \
map_out_path=".temp_map_out", max_boxes=100, confidence=0.05, nms_iou=0.5, letterbox_image=True, MINOVERLAP=0.5, eval_flag=True, period=1):
super(EvalCallback, self).__init__()
self.net = net
self.input_shape = input_shape
self.class_names = class_names
self.num_classes = num_classes
self.val_lines = val_lines
self.log_dir = log_dir
self.cuda = cuda
self.map_out_path = map_out_path
self.max_boxes = max_boxes
self.confidence = confidence
self.nms_iou = nms_iou
self.letterbox_image = letterbox_image
self.MINOVERLAP = MINOVERLAP
self.eval_flag = eval_flag
self.period = period
self.bbox_util = DecodeBox(self.num_classes, (self.input_shape[0], self.input_shape[1]))
self.maps = [0]
self.epoches = [0]
if self.eval_flag:
with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
f.write(str(0))
f.write("\n")
def get_map_txt(self, image_id, image, class_names, map_out_path):
f = open(os.path.join(map_out_path, "detection-results/"+image_id+".txt"), "w", encoding='utf-8')
image_shape = np.array(np.shape(image)[0:2])
#---------------------------------------------------------#
#   Convert the image to RGB here to avoid errors with
#   grayscale images during prediction.
#   Only RGB prediction is supported; all other image types
#   are converted to RGB.
#---------------------------------------------------------#
image = cvtColor(image)
#---------------------------------------------------------#
#   Add gray bars to the image for a distortion-free resize;
#   a plain resize would also work for detection.
#---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
#---------------------------------------------------------#
#   Add the batch_size dimension
#---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
#---------------------------------------------------------#
#   Feed the image into the network for prediction!
#---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
#---------------------------------------------------------#
#   Stack the predicted boxes, then run non-maximum suppression
#---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(outputs, self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
if results[0] is None:
return
top_label = np.array(results[0][:, 5], dtype = 'int32')
top_conf = results[0][:, 4]
top_boxes = results[0][:, :4]
top_100 = np.argsort(top_conf)[::-1][:self.max_boxes]
top_boxes = top_boxes[top_100]
top_conf = top_conf[top_100]
top_label = top_label[top_100]
for i, c in list(enumerate(top_label)):
predicted_class = self.class_names[int(c)]
box = top_boxes[i]
score = str(top_conf[i])
top, left, bottom, right = box
if predicted_class not in class_names:
continue
f.write("%s %s %s %s %s %s\n" % (predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)),str(int(bottom))))
f.close()
return
def on_epoch_end(self, epoch, model_eval):
if epoch % self.period == 0 and self.eval_flag:
self.net = model_eval
if not os.path.exists(self.map_out_path):
os.makedirs(self.map_out_path)
if not os.path.exists(os.path.join(self.map_out_path, "ground-truth")):
os.makedirs(os.path.join(self.map_out_path, "ground-truth"))
if not os.path.exists(os.path.join(self.map_out_path, "detection-results")):
os.makedirs(os.path.join(self.map_out_path, "detection-results"))
print("Get map.")
for annotation_line in tqdm(self.val_lines):
line = annotation_line.split()
image_id = os.path.basename(line[0]).split('.')[0]
#------------------------------#
#   Read the image and convert it to RGB
#------------------------------#
image = Image.open(line[0])
#------------------------------#
#   Parse the ground-truth boxes
#------------------------------#
gt_boxes = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
#------------------------------#
#   Write the prediction txt
#------------------------------#
self.get_map_txt(image_id, image, self.class_names, self.map_out_path)
#------------------------------#
#   Write the ground-truth txt
#------------------------------#
with open(os.path.join(self.map_out_path, "ground-truth/"+image_id+".txt"), "w") as new_f:
for box in gt_boxes:
left, top, right, bottom, obj = box
obj_name = self.class_names[obj]
new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))
print("Calculate Map.")
try:
temp_map = get_coco_map(class_names = self.class_names, path = self.map_out_path)[1]
except Exception:
temp_map = get_map(self.MINOVERLAP, False, path = self.map_out_path)
self.maps.append(temp_map)
self.epoches.append(epoch)
with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
f.write(str(temp_map))
f.write("\n")
plt.figure()
plt.plot(self.epoches, self.maps, 'red', linewidth = 2, label='train map')
plt.grid(True)
plt.xlabel('Epoch')
plt.ylabel('Map %s'%str(self.MINOVERLAP))
plt.title('A Map Curve')
plt.legend(loc="upper right")
plt.savefig(os.path.join(self.log_dir, "epoch_map.png"))
plt.cla()
plt.close("all")
print("Get map done.")
shutil.rmtree(self.map_out_path)

View File

@ -1,426 +0,0 @@
from random import sample, shuffle
import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data.dataset import Dataset
from utils.utils import cvtColor, preprocess_input
class YoloDataset(Dataset):
def __init__(self, annotation_lines, input_shape, num_classes, epoch_length, \
mosaic, mixup, mosaic_prob, mixup_prob, train, special_aug_ratio = 0.7):
super(YoloDataset, self).__init__()
self.annotation_lines = annotation_lines
self.input_shape = input_shape
self.num_classes = num_classes
self.epoch_length = epoch_length
self.mosaic = mosaic
self.mosaic_prob = mosaic_prob
self.mixup = mixup
self.mixup_prob = mixup_prob
self.train = train
self.special_aug_ratio = special_aug_ratio
self.epoch_now = -1
self.length = len(self.annotation_lines)
self.bbox_attrs = 5 + num_classes
def __len__(self):
return self.length
def __getitem__(self, index):
index = index % self.length
#---------------------------------------------------#
#   Apply random data augmentation during training;
#   no random augmentation during validation
#---------------------------------------------------#
if self.mosaic and self.rand() < self.mosaic_prob and self.epoch_now < self.epoch_length * self.special_aug_ratio:
lines = sample(self.annotation_lines, 3)
lines.append(self.annotation_lines[index])
shuffle(lines)
image, box = self.get_random_data_with_Mosaic(lines, self.input_shape)
if self.mixup and self.rand() < self.mixup_prob:
lines = sample(self.annotation_lines, 1)
image_2, box_2 = self.get_random_data(lines[0], self.input_shape, random = self.train)
image, box = self.get_random_data_with_MixUp(image, box, image_2, box_2)
else:
image, box = self.get_random_data(self.annotation_lines[index], self.input_shape, random = self.train)
image = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
box = np.array(box, dtype=np.float32)
#---------------------------------------------------#
#   Preprocess the ground-truth boxes
#---------------------------------------------------#
nL = len(box)
labels_out = np.zeros((nL, 6))
if nL:
#---------------------------------------------------#
#   Normalise the ground-truth boxes to 0-1
#---------------------------------------------------#
box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1]
box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
#---------------------------------------------------#
#   Indices 0, 1 are the box centre,
#   indices 2, 3 are the box width and height,
#   index 4 is the class
#---------------------------------------------------#
box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2
#---------------------------------------------------#
#   Reorder to the training format;
#   column 0 of labels_out is filled in during collate
#---------------------------------------------------#
labels_out[:, 1] = box[:, -1]
labels_out[:, 2:] = box[:, :4]
return image, labels_out
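#---------------------------------------------------#
#   Example of the returned labels (illustrative): one row per box,
#   [img_idx, class, cx, cy, w, h] with cx, cy, w, h normalised to
#   0-1; img_idx stays 0 here and is filled in by yolo_dataset_collate.
#---------------------------------------------------#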
def rand(self, a=0, b=1):
return np.random.rand()*(b-a) + a
def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
line = annotation_line.split()
#------------------------------#
#   Read the image and convert it to RGB
#------------------------------#
image = Image.open(line[0])
image = cvtColor(image)
#------------------------------#
#   Get the image size and the target size
#------------------------------#
iw, ih = image.size
h, w = input_shape
#------------------------------#
#   Parse the boxes
#------------------------------#
box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
if not random:
scale = min(w/iw, h/ih)
nw = int(iw*scale)
nh = int(ih*scale)
dx = (w-nw)//2
dy = (h-nh)//2
#---------------------------------#
#   Pad the unused area with gray bars
#---------------------------------#
image = image.resize((nw,nh), Image.BICUBIC)
new_image = Image.new('RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image, np.float32)
#---------------------------------#
#   Adjust the ground-truth boxes
#---------------------------------#
if len(box)>0:
np.random.shuffle(box)
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
return image_data, box
#------------------------------------------#
#   Resize the image and distort its aspect ratio
#------------------------------------------#
new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
scale = self.rand(.25, 2)
if new_ar < 1:
nh = int(scale*h)
nw = int(nh*new_ar)
else:
nw = int(scale*w)
nh = int(nw/new_ar)
image = image.resize((nw,nh), Image.BICUBIC)
#------------------------------------------#
#   Pad the unused area with gray bars
#------------------------------------------#
dx = int(self.rand(0, w-nw))
dy = int(self.rand(0, h-nh))
new_image = Image.new('RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image = new_image
#------------------------------------------#
#   Flip the image
#------------------------------------------#
flip = self.rand()<.5
if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
image_data = np.array(image, np.uint8)
#---------------------------------#
#   Colour-space augmentation:
#   compute the jitter parameters
#---------------------------------#
r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
#---------------------------------#
#   Convert the image to HSV
#---------------------------------#
hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
dtype = image_data.dtype
#---------------------------------#
#   Apply the transform
#---------------------------------#
x = np.arange(0, 256, dtype=r.dtype)
lut_hue = ((x * r[0]) % 180).astype(dtype)
lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)
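#---------------------------------#
#   Note on the LUTs above (illustrative): OpenCV stores 8-bit hue
#   in 0-179, so the hue LUT wraps with % 180, while saturation and
#   value are scaled and clipped to 0-255; e.g. r = [1.05, 0.9, 1.1]
#   rotates hue by 5%, lowers saturation by 10% and raises value by 10%.
#---------------------------------#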
#---------------------------------#
#   Adjust the ground-truth boxes
#---------------------------------#
if len(box)>0:
np.random.shuffle(box)
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
if flip: box[:, [0,2]] = w - box[:, [2,0]]
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)]
return image_data, box
def merge_bboxes(self, bboxes, cutx, cuty):
merge_bbox = []
for i in range(len(bboxes)):
for box in bboxes[i]:
tmp_box = []
x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
if i == 0:
if y1 > cuty or x1 > cutx:
continue
if y2 >= cuty and y1 <= cuty:
y2 = cuty
if x2 >= cutx and x1 <= cutx:
x2 = cutx
if i == 1:
if y2 < cuty or x1 > cutx:
continue
if y2 >= cuty and y1 <= cuty:
y1 = cuty
if x2 >= cutx and x1 <= cutx:
x2 = cutx
if i == 2:
if y2 < cuty or x2 < cutx:
continue
if y2 >= cuty and y1 <= cuty:
y1 = cuty
if x2 >= cutx and x1 <= cutx:
x1 = cutx
if i == 3:
if y1 > cuty or x2 < cutx:
continue
if y2 >= cuty and y1 <= cuty:
y2 = cuty
if x2 >= cutx and x1 <= cutx:
x1 = cutx
tmp_box.append(x1)
tmp_box.append(y1)
tmp_box.append(x2)
tmp_box.append(y2)
tmp_box.append(box[-1])
merge_bbox.append(tmp_box)
return merge_bbox
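#---------------------------------#
#   Note (illustrative): the indices i == 0..3 in merge_bboxes match the
#   paste positions used in get_random_data_with_Mosaic below:
#   top-left, bottom-left, bottom-right, top-right; each box is clipped
#   against the cut lines (cutx, cuty) of its quadrant.
#---------------------------------#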
def get_random_data_with_Mosaic(self, annotation_line, input_shape, jitter=0.3, hue=.1, sat=0.7, val=0.4):
h, w = input_shape
min_offset_x = self.rand(0.3, 0.7)
min_offset_y = self.rand(0.3, 0.7)
image_datas = []
box_datas = []
index = 0
for line in annotation_line:
#---------------------------------#
#   Split each annotation line
#---------------------------------#
line_content = line.split()
#---------------------------------#
#   Open the image
#---------------------------------#
image = Image.open(line_content[0])
image = cvtColor(image)
#---------------------------------#
#   Image size
#---------------------------------#
iw, ih = image.size
#---------------------------------#
#   Save the box positions
#---------------------------------#
box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]])
#---------------------------------#
#   Whether to flip the image
#---------------------------------#
flip = self.rand()<.5
if flip and len(box)>0:
image = image.transpose(Image.FLIP_LEFT_RIGHT)
box[:, [0,2]] = iw - box[:, [2,0]]
#------------------------------------------#
#   Resize the image and distort its aspect ratio
#------------------------------------------#
new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
scale = self.rand(.4, 1)
if new_ar < 1:
nh = int(scale*h)
nw = int(nh*new_ar)
else:
nw = int(scale*w)
nh = int(nw/new_ar)
image = image.resize((nw, nh), Image.BICUBIC)
#-----------------------------------------------#
#   Place the image into its quadrant, one of the
#   four positions of the mosaic
#-----------------------------------------------#
if index == 0:
dx = int(w*min_offset_x) - nw
dy = int(h*min_offset_y) - nh
elif index == 1:
dx = int(w*min_offset_x) - nw
dy = int(h*min_offset_y)
elif index == 2:
dx = int(w*min_offset_x)
dy = int(h*min_offset_y)
elif index == 3:
dx = int(w*min_offset_x)
dy = int(h*min_offset_y) - nh
new_image = Image.new('RGB', (w,h), (128,128,128))
new_image.paste(image, (dx, dy))
image_data = np.array(new_image)
index = index + 1
box_data = []
#---------------------------------#
#   Reprocess the boxes
#---------------------------------#
if len(box)>0:
np.random.shuffle(box)
box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
box[:, 0:2][box[:, 0:2]<0] = 0
box[:, 2][box[:, 2]>w] = w
box[:, 3][box[:, 3]>h] = h
box_w = box[:, 2] - box[:, 0]
box_h = box[:, 3] - box[:, 1]
box = box[np.logical_and(box_w>1, box_h>1)]
box_data = np.zeros((len(box),5))
box_data[:len(box)] = box
image_datas.append(image_data)
box_datas.append(box_data)
#---------------------------------#
#   Cut the four images and stitch them together
#---------------------------------#
cutx = int(w * min_offset_x)
cuty = int(h * min_offset_y)
new_image = np.zeros([h, w, 3])
new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :]
new_image[cuty:, :cutx, :] = image_datas[1][cuty:, :cutx, :]
new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :]
new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :]
new_image = np.array(new_image, np.uint8)
#---------------------------------#
#   Colour-space augmentation:
#   compute the jitter parameters
#---------------------------------#
r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
#---------------------------------#
#   Convert the image to HSV
#---------------------------------#
hue, sat, val = cv2.split(cv2.cvtColor(new_image, cv2.COLOR_RGB2HSV))
dtype = new_image.dtype
#---------------------------------#
#   Apply the transform
#---------------------------------#
x = np.arange(0, 256, dtype=r.dtype)
lut_hue = ((x * r[0]) % 180).astype(dtype)
lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
new_image = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
new_image = cv2.cvtColor(new_image, cv2.COLOR_HSV2RGB)
#---------------------------------#
#   Post-process the boxes
#---------------------------------#
new_boxes = self.merge_bboxes(box_datas, cutx, cuty)
return new_image, new_boxes
def get_random_data_with_MixUp(self, image_1, box_1, image_2, box_2):
new_image = np.array(image_1, np.float32) * 0.5 + np.array(image_2, np.float32) * 0.5
if len(box_1) == 0:
new_boxes = box_2
elif len(box_2) == 0:
new_boxes = box_1
else:
new_boxes = np.concatenate([box_1, box_2], axis=0)
return new_image, new_boxes
# collate_fn used by the DataLoader
def yolo_dataset_collate(batch):
images = []
bboxes = []
for i, (img, box) in enumerate(batch):
images.append(img)
box[:, 0] = i
bboxes.append(box)
images = torch.from_numpy(np.array(images)).type(torch.FloatTensor)
bboxes = torch.from_numpy(np.concatenate(bboxes, 0)).type(torch.FloatTensor)
return images, bboxes
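# Example of the collated batch (illustrative): for a batch of 4 images of size
# 640x640 with 10 boxes in total, images is a FloatTensor of shape [4, 3, 640, 640]
# and bboxes has shape [10, 6], where column 0 holds the image index within the batch.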
# # Alternative collate_fn for the DataLoader (padded-batch format, unused)
# def yolo_dataset_collate(batch):
# images = []
# n_max_boxes = 0
# bs = len(batch)
# for i, (img, box) in enumerate(batch):
# images.append(img)
# n_max_boxes = max(n_max_boxes, len(box))
# bboxes = torch.zeros((bs, n_max_boxes, 4))
# labels = torch.zeros((bs, n_max_boxes, 1))
# masks = torch.zeros((bs, n_max_boxes, 1))
# for i, (img, box) in enumerate(batch):
# _sub_length = len(box)
# bboxes[i, :_sub_length] = box[:, :4]
# labels[i, :_sub_length] = box[:, 4]
# masks[i, :_sub_length] = 1
# images = torch.from_numpy(np.array(images)).type(torch.FloatTensor)
# bboxes = torch.from_numpy(np.concatenate(bboxes, 0)).type(torch.FloatTensor)
# return images, bboxes, labels, masks

View File

@ -1,103 +0,0 @@
import random
import numpy as np
import torch
from PIL import Image
#---------------------------------------------------------#
#   Convert the image to RGB to avoid errors with grayscale
#   images during prediction.
#   Only RGB prediction is supported; all other image types
#   are converted to RGB.
#---------------------------------------------------------#
def cvtColor(image):
if len(np.shape(image)) == 3 and np.shape(image)[2] == 3:
return image
else:
image = image.convert('RGB')
return image
#---------------------------------------------------#
#   Resize the input image
#---------------------------------------------------#
def resize_image(image, size, letterbox_image):
iw, ih = image.size
w, h = size
if letterbox_image:
scale = min(w/iw, h/ih)
nw = int(iw*scale)
nh = int(ih*scale)
image = image.resize((nw,nh), Image.BICUBIC)
new_image = Image.new('RGB', size, (128,128,128))
new_image.paste(image, ((w-nw)//2, (h-nh)//2))
else:
new_image = image.resize((w, h), Image.BICUBIC)
return new_image
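#---------------------------------------------------#
#   Worked example (illustrative): a 1280x720 image letterboxed to
#   size = (640, 640) is scaled by min(640/1280, 640/720) = 0.5 to
#   640x360 and pasted at (0, 140), leaving gray bars top and bottom.
#---------------------------------------------------#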
#---------------------------------------------------#
#   Get the classes
#---------------------------------------------------#
def get_classes(classes_path):
with open(classes_path, encoding='utf-8') as f:
class_names = f.readlines()
class_names = [c.strip() for c in class_names]
return class_names, len(class_names)
#---------------------------------------------------#
#   Get the learning rate
#---------------------------------------------------#
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
#---------------------------------------------------#
#   Set the random seeds
#---------------------------------------------------#
def seed_everything(seed=11):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
#---------------------------------------------------#
#   Seed the DataLoader workers
#---------------------------------------------------#
def worker_init_fn(worker_id, rank, seed):
worker_seed = rank + seed
random.seed(worker_seed)
np.random.seed(worker_seed)
torch.manual_seed(worker_seed)
def preprocess_input(image):
image /= 255.0
return image
def show_config(**kwargs):
print('Configurations:')
print('-' * 70)
print('|%25s | %40s|' % ('keys', 'values'))
print('-' * 70)
for key, value in kwargs.items():
print('|%25s | %40s|' % (str(key), str(value)))
print('-' * 70)
def download_weights(phi, model_dir="./model_data"):
import os
from torch.hub import load_state_dict_from_url
download_urls = {
"n" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_n_backbone_weights.pth',
"s" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_s_backbone_weights.pth',
"m" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_m_backbone_weights.pth',
"l" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_l_backbone_weights.pth',
"x" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_x_backbone_weights.pth',
}
url = download_urls[phi]
if not os.path.exists(model_dir):
os.makedirs(model_dir)
load_state_dict_from_url(url, model_dir)

View File

@ -1,348 +0,0 @@
import numpy as np
import torch
from torchvision.ops import nms
import pkg_resources as pkg
def check_version(current: str = "0.0.0",
minimum: str = "0.0.0",
name: str = "version ",
pinned: bool = False) -> bool:
current, minimum = (pkg.parse_version(x) for x in (current, minimum))
result = (current == minimum) if pinned else (current >= minimum) # bool
return result
TORCH_1_10 = check_version(torch.__version__, '1.10.0')
def make_anchors(feats, strides, grid_cell_offset=0.5):
"""Generate anchors from features."""
anchor_points, stride_tensor = [], []
assert feats is not None
dtype, device = feats[0].dtype, feats[0].device
for i, stride in enumerate(strides):
_, _, h, w = feats[i].shape
sx = torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset # shift x
sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset # shift y
sy, sx = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
return torch.cat(anchor_points), torch.cat(stride_tensor)
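# Worked example (illustrative): for feature maps of spatial size 80x80, 40x40
# and 20x20 with strides (8, 16, 32), make_anchors returns
# 80*80 + 40*40 + 20*20 = 8400 anchor points, matching the 8400 columns
# decoded in DecodeBox below.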
def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
"""Transform distance(ltrb) to box(xywh or xyxy)."""
# left-top, right-bottom
lt, rb = torch.split(distance, 2, dim)
x1y1 = anchor_points - lt
x2y2 = anchor_points + rb
if xywh:
c_xy = (x1y1 + x2y2) / 2
wh = x2y2 - x1y1
return torch.cat((c_xy, wh), dim) # xywh bbox
return torch.cat((x1y1, x2y2), dim) # xyxy bbox
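# Worked example (illustrative): with anchor_points = (10, 10) and
# distance lt = (2, 3), rb = (4, 5), the xyxy box is (8, 7, 14, 15)
# and the xywh box is centre (11, 11) with size (6, 8).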
class DecodeBox():
def __init__(self, num_classes, input_shape):
super(DecodeBox, self).__init__()
self.num_classes = num_classes
self.bbox_attrs = 4 + num_classes
self.input_shape = input_shape
def decode_box(self, inputs):
# dbox batch_size, 4, 8400
# cls batch_size, 20, 8400
dbox, cls, origin_cls, anchors, strides = inputs
# Get centre / width-height coordinates
dbox = dist2bbox(dbox, anchors.unsqueeze(0), xywh=True, dim=1) * strides
y = torch.cat((dbox, cls.sigmoid()), 1).permute(0, 2, 1)
# Normalise to 0~1
y[:, :, :4] = y[:, :, :4] / torch.Tensor([self.input_shape[1], self.input_shape[0], self.input_shape[1], self.input_shape[0]]).to(y.device)
return y
def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
#-----------------------------------------------------------------#
#   Put the y axis first, which makes it convenient to multiply
#   the boxes by the image height and width
#-----------------------------------------------------------------#
box_yx = box_xy[..., ::-1]
box_hw = box_wh[..., ::-1]
input_shape = np.array(input_shape)
image_shape = np.array(image_shape)
if letterbox_image:
#-----------------------------------------------------------------#
#   offset is the shift of the valid image area relative to the
#   top-left corner of the image; new_shape is the scaled size
#-----------------------------------------------------------------#
new_shape = np.round(image_shape * np.min(input_shape/image_shape))
offset = (input_shape - new_shape)/2./input_shape
scale = input_shape/new_shape
box_yx = (box_yx - offset) * scale
box_hw *= scale
box_mins = box_yx - (box_hw / 2.)
box_maxes = box_yx + (box_hw / 2.)
boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
boxes *= np.concatenate([image_shape, image_shape], axis=-1)
return boxes
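#-----------------------------------------------------------------#
#   Worked example (illustrative): input_shape = (640, 640) and
#   image_shape = (480, 640) give new_shape = (480, 640),
#   offset = (0.125, 0) and scale = (4/3, 1), i.e. the letterbox
#   bars above and below the image are removed before mapping the
#   boxes back to the original resolution.
#-----------------------------------------------------------------#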
def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5, nms_thres=0.4):
#----------------------------------------------------------#
#   Convert the predictions to top-left / bottom-right format.
#   prediction  [batch_size, num_anchors, 85]
#----------------------------------------------------------#
box_corner = prediction.new(prediction.shape)
box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
prediction[:, :, :4] = box_corner[:, :, :4]
output = [None for _ in range(len(prediction))]
for i, image_pred in enumerate(prediction):
#----------------------------------------------------------#
#   Take the max over the class predictions.
#   class_conf  [num_anchors, 1]    class confidence
#   class_pred  [num_anchors, 1]    class index
#----------------------------------------------------------#
class_conf, class_pred = torch.max(image_pred[:, 4:4 + num_classes], 1, keepdim=True)
#----------------------------------------------------------#
#   First round of filtering by confidence threshold
#----------------------------------------------------------#
conf_mask = (class_conf[:, 0] >= conf_thres).squeeze()
#----------------------------------------------------------#
#   Filter the predictions by confidence
#----------------------------------------------------------#
image_pred = image_pred[conf_mask]
class_conf = class_conf[conf_mask]
class_pred = class_pred[conf_mask]
if not image_pred.size(0):
continue
#-------------------------------------------------------------------------#
#   detections  [num_anchors, 6]
#   the 6 columns are x1, y1, x2, y2, class_conf, class_pred
#-------------------------------------------------------------------------#
detections = torch.cat((image_pred[:, :4], class_conf.float(), class_pred.float()), 1)
#------------------------------------------#
#   All classes present in the predictions
#------------------------------------------#
unique_labels = detections[:, -1].cpu().unique()
if prediction.is_cuda:
unique_labels = unique_labels.cuda()
detections = detections.cuda()
for c in unique_labels:
#------------------------------------------#
#   All predictions for one class after score filtering
#------------------------------------------#
detections_class = detections[detections[:, -1] == c]
#------------------------------------------#
#   The built-in torchvision NMS is faster!
#   Keep the highest-scoring box of each class
#   within a given region
#------------------------------------------#
keep = nms(
detections_class[:, :4],
detections_class[:, 4],
nms_thres
)
max_detections = detections_class[keep]
# # Sort by object confidence
# _, conf_sort_index = torch.sort(detections_class[:, 4]*detections_class[:, 5], descending=True)
# detections_class = detections_class[conf_sort_index]
# # Perform non-maximum suppression
# max_detections = []
# while detections_class.size(0):
# # Take the highest-confidence box of this class, then walk down the list,
# # removing boxes whose overlap with it exceeds nms_thres
# max_detections.append(detections_class[0].unsqueeze(0))
# if len(detections_class) == 1:
# break
# ious = bbox_iou(max_detections[-1], detections_class[1:])
# detections_class = detections_class[1:][ious < nms_thres]
# # Stack
# max_detections = torch.cat(max_detections).data
# Add max detections to outputs
output[i] = max_detections if output[i] is None else torch.cat((output[i], max_detections))
if output[i] is not None:
output[i] = output[i].cpu().numpy()
box_xy, box_wh = (output[i][:, 0:2] + output[i][:, 2:4])/2, output[i][:, 2:4] - output[i][:, 0:2]
output[i][:, :4] = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
return output
if __name__ == "__main__":
import matplotlib.pyplot as plt
import numpy as np
#---------------------------------------------------#
#   Decode each feature level of the predictions
#   into real values
#---------------------------------------------------#
def get_anchors_and_decode(input, input_shape, anchors, anchors_mask, num_classes):
#-----------------------------------------------#
# input batch_size, 3 * (4 + 1 + num_classes), 20, 20
#-----------------------------------------------#
batch_size = input.size(0)
input_height = input.size(2)
input_width = input.size(3)
#-----------------------------------------------#
#   With a 640x640 input: input_shape = [640, 640],
#   input_height = 20, input_width = 20
#   640 / 20 = 32, so stride_h = stride_w = 32
#-----------------------------------------------#
stride_h = input_shape[0] / input_height
stride_w = input_shape[1] / input_width
#-------------------------------------------------#
#   scaled_anchors here are relative to the feature map:
#   anchor_width / stride_w, anchor_height / stride_h
#-------------------------------------------------#
scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in anchors[anchors_mask[2]]]
#-----------------------------------------------#
# batch_size, 3 * (4 + 1 + num_classes), 20, 20 =>
# batch_size, 3, 5 + num_classes, 20, 20 =>
# batch_size, 3, 20, 20, 4 + 1 + num_classes
#-----------------------------------------------#
prediction = input.view(batch_size, len(anchors_mask[2]),
num_classes + 5, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous()
#-----------------------------------------------#
#   Offsets for the anchor centres
#-----------------------------------------------#
x = torch.sigmoid(prediction[..., 0])
y = torch.sigmoid(prediction[..., 1])
#-----------------------------------------------#
#   Width/height adjustment parameters
#-----------------------------------------------#
w = torch.sigmoid(prediction[..., 2])
h = torch.sigmoid(prediction[..., 3])
#-----------------------------------------------#
#   Objectness confidence: is there an object, 0 - 1
#-----------------------------------------------#
conf = torch.sigmoid(prediction[..., 4])
#-----------------------------------------------#
#   Class confidence, 0 - 1
#-----------------------------------------------#
pred_cls = torch.sigmoid(prediction[..., 5:])
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
#----------------------------------------------------------#
#   Generate the grid; anchor centres sit at the grid's
#   top-left corners
#   batch_size,3,20,20
#   range(20):
#   [
#       [0, 1, 2, 3, ..., 19],
#       [0, 1, 2, 3, ..., 19],
#       ... (20 rows)
#       [0, 1, 2, 3, ..., 19]
#   ] * (batch_size * 3)
#   -> [batch_size, 3, 20, 20]
#
#   and its transpose, tiled likewise to
#   [batch_size, 3, 20, 20]
#----------------------------------------------------------#
grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat(
batch_size * len(anchors_mask[2]), 1, 1).view(x.shape).type(FloatTensor)
grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat(
batch_size * len(anchors_mask[2]), 1, 1).view(y.shape).type(FloatTensor)
#----------------------------------------------------------#
#   Generate the anchor widths and heights in grid format
#   batch_size, 3, 20 * 20 => batch_size, 3, 20, 20
#   batch_size, 3, 20 * 20 => batch_size, 3, 20, 20
#----------------------------------------------------------#
anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0]))
anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1]))
anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape)
anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape)
#----------------------------------------------------------#
#   Use the predictions to adjust the anchor boxes.
#   First shift the box center relative to its grid cell,
#   then scale the box width and height.
# x 0 ~ 1 => 0 ~ 2 => -0.5 ~ 1.5 + grid_x
# y 0 ~ 1 => 0 ~ 2 => -0.5 ~ 1.5 + grid_y
# w 0 ~ 1 => 0 ~ 2 => 0 ~ 4 * anchor_w
# h 0 ~ 1 => 0 ~ 2 => 0 ~ 4 * anchor_h
#----------------------------------------------------------#
pred_boxes = FloatTensor(prediction[..., :4].shape)
pred_boxes[..., 0] = x.data * 2. - 0.5 + grid_x
pred_boxes[..., 1] = y.data * 2. - 0.5 + grid_y
pred_boxes[..., 2] = (w.data * 2) ** 2 * anchor_w
pred_boxes[..., 3] = (h.data * 2) ** 2 * anchor_h
point_h = 5
point_w = 5
box_xy = pred_boxes[..., 0:2].cpu().numpy() * 32
box_wh = pred_boxes[..., 2:4].cpu().numpy() * 32
grid_x = grid_x.cpu().numpy() * 32
grid_y = grid_y.cpu().numpy() * 32
anchor_w = anchor_w.cpu().numpy() * 32
anchor_h = anchor_h.cpu().numpy() * 32
fig = plt.figure()
ax = fig.add_subplot(121)
from PIL import Image
img = Image.open("img/street.jpg").resize([640, 640])
plt.imshow(img, alpha=0.5)
plt.ylim(-30, 650)
plt.xlim(-30, 650)
plt.scatter(grid_x, grid_y)
plt.scatter(point_h * 32, point_w * 32, c='black')
plt.gca().invert_yaxis()
anchor_left = grid_x - anchor_w / 2
anchor_top = grid_y - anchor_h / 2
rect1 = plt.Rectangle([anchor_left[0, 0, point_h, point_w],anchor_top[0, 0, point_h, point_w]], \
anchor_w[0, 0, point_h, point_w],anchor_h[0, 0, point_h, point_w],color="r",fill=False)
rect2 = plt.Rectangle([anchor_left[0, 1, point_h, point_w],anchor_top[0, 1, point_h, point_w]], \
anchor_w[0, 1, point_h, point_w],anchor_h[0, 1, point_h, point_w],color="r",fill=False)
rect3 = plt.Rectangle([anchor_left[0, 2, point_h, point_w],anchor_top[0, 2, point_h, point_w]], \
anchor_w[0, 2, point_h, point_w],anchor_h[0, 2, point_h, point_w],color="r",fill=False)
ax.add_patch(rect1)
ax.add_patch(rect2)
ax.add_patch(rect3)
ax = fig.add_subplot(122)
plt.imshow(img, alpha=0.5)
plt.ylim(-30, 650)
plt.xlim(-30, 650)
plt.scatter(grid_x, grid_y)
plt.scatter(point_h * 32, point_w * 32, c='black')
plt.scatter(box_xy[0, :, point_h, point_w, 0], box_xy[0, :, point_h, point_w, 1], c='r')
plt.gca().invert_yaxis()
pre_left = box_xy[...,0] - box_wh[...,0] / 2
pre_top = box_xy[...,1] - box_wh[...,1] / 2
rect1 = plt.Rectangle([pre_left[0, 0, point_h, point_w], pre_top[0, 0, point_h, point_w]],\
box_wh[0, 0, point_h, point_w,0], box_wh[0, 0, point_h, point_w,1],color="r",fill=False)
rect2 = plt.Rectangle([pre_left[0, 1, point_h, point_w], pre_top[0, 1, point_h, point_w]],\
box_wh[0, 1, point_h, point_w,0], box_wh[0, 1, point_h, point_w,1],color="r",fill=False)
rect3 = plt.Rectangle([pre_left[0, 2, point_h, point_w], pre_top[0, 2, point_h, point_w]],\
box_wh[0, 2, point_h, point_w,0], box_wh[0, 2, point_h, point_w,1],color="r",fill=False)
ax.add_patch(rect1)
ax.add_patch(rect2)
ax.add_patch(rect3)
plt.show()
#
feat = torch.from_numpy(np.random.normal(0.2, 0.5, [4, 255, 20, 20])).float()
anchors = np.array([[116, 90], [156, 198], [373, 326], [30,61], [62,45], [59,119], [10,13], [16,30], [33,23]])
anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
get_anchors_and_decode(feat, [640, 640], anchors, anchors_mask, 80)
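# --- Added illustration (not part of the original file) ---
# A minimal by-hand decode of a single cell, assuming stride 32 and the
# anchor (116, 90); the tx..th values below are made-up sigmoid outputs.
tx, ty, tw, th = 0.6, 0.4, 0.7, 0.5                # sigmoid outputs in [0, 1]
cell_x, cell_y = 5, 5                              # grid-cell indices
anchor_w_f, anchor_h_f = 116 / 32, 90 / 32         # anchor scaled to the 20x20 feature map
cx = tx * 2. - 0.5 + cell_x                        # center x on the feature map
cy = ty * 2. - 0.5 + cell_y
bw = (tw * 2) ** 2 * anchor_w_f                    # width: 0 .. 4x the anchor
bh = (th * 2) ** 2 * anchor_h_f
print(cx * 32, cy * 32, bw * 32, bh * 32)          # mapped back to 640x640 pixel space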

View File

@ -1,126 +0,0 @@
import os
import torch
from tqdm import tqdm
from utils.utils import get_lr
def fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, fp16, scaler, save_period, save_dir, local_rank=0):
loss = 0
val_loss = 0
if local_rank == 0:
print('Start Train')
pbar = tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3)
model_train.train()
for iteration, batch in enumerate(gen):
if iteration >= epoch_step:
break
images, bboxes = batch
with torch.no_grad():
if cuda:
images = images.cuda(local_rank)
bboxes = bboxes.cuda(local_rank)
#----------------------#
#   Zero the gradients
#----------------------#
optimizer.zero_grad()
if not fp16:
#----------------------#
#   Forward pass
#----------------------#
# dbox, cls, origin_cls, anchors, strides
outputs = model_train(images)
loss_value = yolo_loss(outputs, bboxes)
#----------------------#
#   Backward pass
#----------------------#
loss_value.backward()
torch.nn.utils.clip_grad_norm_(model_train.parameters(), max_norm=10.0) # clip gradients
optimizer.step()
else:
from torch.cuda.amp import autocast
with autocast():
#----------------------#
#   Forward pass
#----------------------#
outputs = model_train(images)
loss_value = yolo_loss(outputs, bboxes)
#----------------------#
#   Backward pass
#----------------------#
scaler.scale(loss_value).backward()
scaler.unscale_(optimizer) # unscale gradients
torch.nn.utils.clip_grad_norm_(model_train.parameters(), max_norm=10.0) # clip gradients
scaler.step(optimizer)
scaler.update()
if ema:
ema.update(model_train)
loss += loss_value.item()
if local_rank == 0:
pbar.set_postfix(**{'loss' : loss / (iteration + 1),
'lr' : get_lr(optimizer)})
pbar.update(1)
if local_rank == 0:
pbar.close()
print('Finish Train')
print('Start Validation')
pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3)
if ema:
model_train_eval = ema.ema
else:
model_train_eval = model_train.eval()
for iteration, batch in enumerate(gen_val):
if iteration >= epoch_step_val:
break
images, bboxes = batch[0], batch[1]
with torch.no_grad():
if cuda:
images = images.cuda(local_rank)
bboxes = bboxes.cuda(local_rank)
#----------------------#
#   Zero the gradients
#----------------------#
optimizer.zero_grad()
#----------------------#
#   Forward pass
#----------------------#
outputs = model_train_eval(images)
loss_value = yolo_loss(outputs, bboxes)
val_loss += loss_value.item()
if local_rank == 0:
pbar.set_postfix(**{'val_loss': val_loss / (iteration + 1)})
pbar.update(1)
if local_rank == 0:
pbar.close()
print('Finish Validation')
loss_history.append_loss(epoch + 1, loss / epoch_step, val_loss / epoch_step_val)
eval_callback.on_epoch_end(epoch + 1, model_train_eval)
print('Epoch:'+ str(epoch + 1) + '/' + str(Epoch))
print('Total Loss: %.3f || Val Loss: %.3f ' % (loss / epoch_step, val_loss / epoch_step_val))
#-----------------------------------------------#
#   Save the weights
#-----------------------------------------------#
if ema:
save_state_dict = ema.ema.state_dict()
else:
save_state_dict = model.state_dict()
if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch:
torch.save(save_state_dict, os.path.join(save_dir, "ep%03d-loss%.3f-val_loss%.3f.pth" % (epoch + 1, loss / epoch_step, val_loss / epoch_step_val)))
if len(loss_history.val_loss) <= 1 or (val_loss / epoch_step_val) <= min(loss_history.val_loss):
print('Save best model to best_epoch_weights.pth')
torch.save(save_state_dict, os.path.join(save_dir, "best_epoch_weights.pth"))
torch.save(save_state_dict, os.path.join(save_dir, "last_epoch_weights.pth"))

View File

@ -1,923 +0,0 @@
import glob
import json
import math
import operator
import os
import shutil
import sys
try:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
except:
pass
import cv2
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import numpy as np
'''
0,0 ------> x (width)
 |
 |  (Left,Top)
 |      *_________
 |      |         |
 |      |         |
 y      |_________|
(height)          *
            (Right,Bottom)
'''
def log_average_miss_rate(precision, fp_cumsum, num_images):
"""
log-average miss rate:
Calculated by averaging miss rates at 9 evenly spaced FPPI points
between 10^-2 and 10^0 (1e-2 and 1e0), in log-space.
output:
lamr | log-average miss rate
mr | miss rate
fppi | false positives per image
references:
[1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the
State of the Art." Pattern Analysis and Machine Intelligence, IEEE
Transactions on 34.4 (2012): 743 - 761.
"""
if precision.size == 0:
lamr = 0
mr = 1
fppi = 0
return lamr, mr, fppi
fppi = fp_cumsum / float(num_images)
mr = (1 - precision)
fppi_tmp = np.insert(fppi, 0, -1.0)
mr_tmp = np.insert(mr, 0, 1.0)
ref = np.logspace(-2.0, 0.0, num = 9)
for i, ref_i in enumerate(ref):
j = np.where(fppi_tmp <= ref_i)[-1][-1]
ref[i] = mr_tmp[j]
lamr = math.exp(np.mean(np.log(np.maximum(1e-10, ref))))
return lamr, mr, fppi
"""
throw error and exit
"""
def error(msg):
print(msg)
sys.exit(0)
"""
check if the number is a float between 0.0 and 1.0
"""
def is_float_between_0_and_1(value):
try:
val = float(value)
if val > 0.0 and val < 1.0:
return True
else:
return False
except ValueError:
return False
"""
Calculate the AP given the recall and precision array
1st) We compute a version of the measured precision/recall curve with
precision monotonically decreasing
2nd) We compute the AP as the area under this curve by numerical integration.
"""
def voc_ap(rec, prec):
"""
--- Official matlab code VOC2012---
mrec=[0 ; rec ; 1];
mpre=[0 ; prec ; 0];
for i=numel(mpre)-1:-1:1
mpre(i)=max(mpre(i),mpre(i+1));
end
i=find(mrec(2:end)~=mrec(1:end-1))+1;
ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
"""
rec.insert(0, 0.0) # insert 0.0 at beginning of list
rec.append(1.0) # insert 1.0 at end of list
mrec = rec[:]
prec.insert(0, 0.0) # insert 0.0 at beginning of list
prec.append(0.0) # insert 0.0 at end of list
mpre = prec[:]
"""
This part makes the precision monotonically decreasing
(goes from the end to the beginning)
matlab: for i=numel(mpre)-1:-1:1
mpre(i)=max(mpre(i),mpre(i+1));
"""
for i in range(len(mpre)-2, -1, -1):
mpre[i] = max(mpre[i], mpre[i+1])
"""
This part creates a list of indexes where the recall changes
matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1;
"""
i_list = []
for i in range(1, len(mrec)):
if mrec[i] != mrec[i-1]:
i_list.append(i) # if it was matlab would be i + 1
"""
The Average Precision (AP) is the area under the curve
(numerical integration)
matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i));
"""
ap = 0.0
for i in i_list:
ap += ((mrec[i]-mrec[i-1])*mpre[i])
return ap, mrec, mpre
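# --- Added illustration (not part of the original file) ---
# A toy sanity check of voc_ap with made-up recall/precision values;
# copies are passed (rec[:], prec[:]) because voc_ap mutates its inputs.
if __name__ == "__main__":
    toy_rec = [0.1, 0.4, 0.4, 0.8]
    toy_prec = [1.0, 0.9, 0.6, 0.5]
    toy_ap, _, _ = voc_ap(toy_rec[:], toy_prec[:])
    print("toy AP = %.4f" % toy_ap)   # 0.5700 for these values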
"""
Convert the lines of a file to a list
"""
def file_lines_to_list(path):
# open txt file lines to a list
with open(path) as f:
content = f.readlines()
# remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
return content
"""
Draws text in image
"""
def draw_text_in_image(img, text, pos, color, line_width):
font = cv2.FONT_HERSHEY_PLAIN
fontScale = 1
lineType = 1
bottomLeftCornerOfText = pos
cv2.putText(img, text,
bottomLeftCornerOfText,
font,
fontScale,
color,
lineType)
text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0]
return img, (line_width + text_width)
"""
Plot - adjust axes
"""
def adjust_axes(r, t, fig, axes):
# get text width for re-scaling
bb = t.get_window_extent(renderer=r)
text_width_inches = bb.width / fig.dpi
# get axis width in inches
current_fig_width = fig.get_figwidth()
new_fig_width = current_fig_width + text_width_inches
proportion = new_fig_width / current_fig_width
# get axis limit
x_lim = axes.get_xlim()
axes.set_xlim([x_lim[0], x_lim[1]*proportion])
"""
Draw plot using Matplotlib
"""
def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar):
# sort the dictionary by decreasing value, into a list of tuples
sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1))
# unpacking the list of tuples into two lists
sorted_keys, sorted_values = zip(*sorted_dic_by_value)
#
if true_p_bar != "":
"""
Special case to draw in:
- green -> TP: True Positives (object detected and matches ground-truth)
- red -> FP: False Positives (object detected but does not match ground-truth)
- orange -> FN: False Negatives (object not detected but present in the ground-truth)
"""
fp_sorted = []
tp_sorted = []
for key in sorted_keys:
fp_sorted.append(dictionary[key] - true_p_bar[key])
tp_sorted.append(true_p_bar[key])
plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive')
plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted)
# add legend
plt.legend(loc='lower right')
"""
Write number on side of bar
"""
fig = plt.gcf() # gcf - get current figure
axes = plt.gca()
r = fig.canvas.get_renderer()
for i, val in enumerate(sorted_values):
fp_val = fp_sorted[i]
tp_val = tp_sorted[i]
fp_str_val = " " + str(fp_val)
tp_str_val = fp_str_val + " " + str(tp_val)
# trick to paint multicolor with offset:
# first paint everything and then repaint the first number
t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold')
plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold')
if i == (len(sorted_values)-1): # largest bar
adjust_axes(r, t, fig, axes)
else:
plt.barh(range(n_classes), sorted_values, color=plot_color)
"""
Write number on side of bar
"""
fig = plt.gcf() # gcf - get current figure
axes = plt.gca()
r = fig.canvas.get_renderer()
for i, val in enumerate(sorted_values):
str_val = " " + str(val) # add a space before
if val < 1.0:
str_val = " {0:.2f}".format(val)
t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold')
# re-set axes to show number inside the figure
if i == (len(sorted_values)-1): # largest bar
adjust_axes(r, t, fig, axes)
# set window title
fig.canvas.manager.set_window_title(window_title) # canvas.set_window_title was removed in matplotlib 3.6
# write classes in y axis
tick_font_size = 12
plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size)
"""
Re-scale height accordingly
"""
init_height = fig.get_figheight()
# compute the matrix height in points and inches
dpi = fig.dpi
height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing)
height_in = height_pt / dpi
# compute the required figure height
top_margin = 0.15 # in percentage of the figure height
bottom_margin = 0.05 # in percentage of the figure height
figure_height = height_in / (1 - top_margin - bottom_margin)
# set new height
if figure_height > init_height:
fig.set_figheight(figure_height)
# set plot title
plt.title(plot_title, fontsize=14)
# set axis titles
# plt.xlabel('classes')
plt.xlabel(x_label, fontsize='large')
# adjust size of window
fig.tight_layout()
# save the plot
fig.savefig(output_path)
# show image
if to_show:
plt.show()
# close the plot
plt.close()
def get_map(MINOVERLAP, draw_plot, score_threhold=0.5, path = './map_out'):
GT_PATH = os.path.join(path, 'ground-truth')
DR_PATH = os.path.join(path, 'detection-results')
IMG_PATH = os.path.join(path, 'images-optional')
TEMP_FILES_PATH = os.path.join(path, '.temp_files')
RESULTS_FILES_PATH = os.path.join(path, 'results')
show_animation = True
if os.path.exists(IMG_PATH):
for dirpath, dirnames, files in os.walk(IMG_PATH):
if not files:
show_animation = False
else:
show_animation = False
if not os.path.exists(TEMP_FILES_PATH):
os.makedirs(TEMP_FILES_PATH)
if os.path.exists(RESULTS_FILES_PATH):
shutil.rmtree(RESULTS_FILES_PATH)
else:
os.makedirs(RESULTS_FILES_PATH)
if draw_plot:
try:
matplotlib.use('TkAgg')
except:
pass
os.makedirs(os.path.join(RESULTS_FILES_PATH, "AP"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "F1"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "Recall"))
os.makedirs(os.path.join(RESULTS_FILES_PATH, "Precision"))
if show_animation:
os.makedirs(os.path.join(RESULTS_FILES_PATH, "images", "detections_one_by_one"))
ground_truth_files_list = glob.glob(GT_PATH + '/*.txt')
if len(ground_truth_files_list) == 0:
error("Error: No ground-truth files found!")
ground_truth_files_list.sort()
gt_counter_per_class = {}
counter_images_per_class = {}
for txt_file in ground_truth_files_list:
file_id = txt_file.split(".txt", 1)[0]
file_id = os.path.basename(os.path.normpath(file_id))
temp_path = os.path.join(DR_PATH, (file_id + ".txt"))
if not os.path.exists(temp_path):
error_msg = "Error. File not found: {}\n".format(temp_path)
error(error_msg)
lines_list = file_lines_to_list(txt_file)
bounding_boxes = []
is_difficult = False
already_seen_classes = []
for line in lines_list:
try:
if "difficult" in line:
class_name, left, top, right, bottom, _difficult = line.split()
is_difficult = True
else:
class_name, left, top, right, bottom = line.split()
except:
if "difficult" in line:
line_split = line.split()
_difficult = line_split[-1]
bottom = line_split[-2]
right = line_split[-3]
top = line_split[-4]
left = line_split[-5]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
is_difficult = True
else:
line_split = line.split()
bottom = line_split[-1]
right = line_split[-2]
top = line_split[-3]
left = line_split[-4]
class_name = ""
for name in line_split[:-4]:
class_name += name + " "
class_name = class_name[:-1]
bbox = left + " " + top + " " + right + " " + bottom
if is_difficult:
bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True})
is_difficult = False
else:
bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False})
if class_name in gt_counter_per_class:
gt_counter_per_class[class_name] += 1
else:
gt_counter_per_class[class_name] = 1
if class_name not in already_seen_classes:
if class_name in counter_images_per_class:
counter_images_per_class[class_name] += 1
else:
counter_images_per_class[class_name] = 1
already_seen_classes.append(class_name)
with open(TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json", 'w') as outfile:
json.dump(bounding_boxes, outfile)
gt_classes = list(gt_counter_per_class.keys())
gt_classes = sorted(gt_classes)
n_classes = len(gt_classes)
dr_files_list = glob.glob(DR_PATH + '/*.txt')
dr_files_list.sort()
for class_index, class_name in enumerate(gt_classes):
bounding_boxes = []
for txt_file in dr_files_list:
file_id = txt_file.split(".txt",1)[0]
file_id = os.path.basename(os.path.normpath(file_id))
temp_path = os.path.join(GT_PATH, (file_id + ".txt"))
if class_index == 0:
if not os.path.exists(temp_path):
error_msg = "Error. File not found: {}\n".format(temp_path)
error(error_msg)
lines = file_lines_to_list(txt_file)
for line in lines:
try:
tmp_class_name, confidence, left, top, right, bottom = line.split()
except:
line_split = line.split()
bottom = line_split[-1]
right = line_split[-2]
top = line_split[-3]
left = line_split[-4]
confidence = line_split[-5]
tmp_class_name = ""
for name in line_split[:-5]:
tmp_class_name += name + " "
tmp_class_name = tmp_class_name[:-1]
if tmp_class_name == class_name:
bbox = left + " " + top + " " + right + " " +bottom
bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox})
bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True)
with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile:
json.dump(bounding_boxes, outfile)
sum_AP = 0.0
ap_dictionary = {}
lamr_dictionary = {}
with open(RESULTS_FILES_PATH + "/results.txt", 'w') as results_file:
results_file.write("# AP and precision/recall per class\n")
count_true_positives = {}
for class_index, class_name in enumerate(gt_classes):
count_true_positives[class_name] = 0
dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json"
dr_data = json.load(open(dr_file))
nd = len(dr_data)
tp = [0] * nd
fp = [0] * nd
score = [0] * nd
score_threhold_idx = 0
for idx, detection in enumerate(dr_data):
file_id = detection["file_id"]
score[idx] = float(detection["confidence"])
if score[idx] >= score_threhold:
score_threhold_idx = idx
if show_animation:
ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*")
if len(ground_truth_img) == 0:
error("Error. Image not found with id: " + file_id)
elif len(ground_truth_img) > 1:
error("Error. Multiple image with id: " + file_id)
else:
img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0])
img_cumulative_path = RESULTS_FILES_PATH + "/images/" + ground_truth_img[0]
if os.path.isfile(img_cumulative_path):
img_cumulative = cv2.imread(img_cumulative_path)
else:
img_cumulative = img.copy()
bottom_border = 60
BLACK = [0, 0, 0]
img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
ground_truth_data = json.load(open(gt_file))
ovmax = -1
gt_match = -1
bb = [float(x) for x in detection["bbox"].split()]
for obj in ground_truth_data:
if obj["class_name"] == class_name:
bbgt = [ float(x) for x in obj["bbox"].split() ]
bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])]
iw = bi[2] - bi[0] + 1
ih = bi[3] - bi[1] + 1
if iw > 0 and ih > 0:
ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0]
+ 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
ov = iw * ih / ua
if ov > ovmax:
ovmax = ov
gt_match = obj
if show_animation:
status = "NO MATCH FOUND!"
min_overlap = MINOVERLAP
if ovmax >= min_overlap:
if "difficult" not in gt_match:
if not bool(gt_match["used"]):
tp[idx] = 1
gt_match["used"] = True
count_true_positives[class_name] += 1
with open(gt_file, 'w') as f:
f.write(json.dumps(ground_truth_data))
if show_animation:
status = "MATCH!"
else:
fp[idx] = 1
if show_animation:
status = "REPEATED MATCH!"
else:
fp[idx] = 1
if ovmax > 0:
status = "INSUFFICIENT OVERLAP"
"""
Draw image to show animation
"""
if show_animation:
height, width = img.shape[:2]
white = (255,255,255)
light_blue = (255,200,100)
green = (0,255,0)
light_red = (30,30,255)
margin = 10
# 1st line
v_pos = int(height - margin - (bottom_border / 2.0))
text = "Image: " + ground_truth_img[0] + " "
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " "
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width)
if ovmax != -1:
color = light_red
if status == "INSUFFICIENT OVERLAP":
text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100)
else:
text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100)
color = green
img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
# 2nd line
v_pos += int(bottom_border / 2.0)
rank_pos = str(idx+1)
text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100)
img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
color = light_red
if status == "MATCH!":
color = green
text = "Result: " + status + " "
img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
font = cv2.FONT_HERSHEY_SIMPLEX
if ovmax > 0:
bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ]
cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2)
cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2)
cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA)
bb = [int(i) for i in bb]
cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2)
cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2)
cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA)
cv2.imshow("Animation", img)
cv2.waitKey(20)
output_img_path = RESULTS_FILES_PATH + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg"
cv2.imwrite(output_img_path, img)
cv2.imwrite(img_cumulative_path, img_cumulative)
cumsum = 0
for idx, val in enumerate(fp):
fp[idx] += cumsum
cumsum += val
cumsum = 0
for idx, val in enumerate(tp):
tp[idx] += cumsum
cumsum += val
rec = tp[:]
for idx, val in enumerate(tp):
rec[idx] = float(tp[idx]) / np.maximum(gt_counter_per_class[class_name], 1)
prec = tp[:]
for idx, val in enumerate(tp):
prec[idx] = float(tp[idx]) / np.maximum((fp[idx] + tp[idx]), 1)
ap, mrec, mprec = voc_ap(rec[:], prec[:])
F1 = np.array(rec)*np.array(prec)*2 / np.where((np.array(prec)+np.array(rec))==0, 1, (np.array(prec)+np.array(rec)))
sum_AP += ap
text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100)
if len(prec)>0:
F1_text = "{0:.2f}".format(F1[score_threhold_idx]) + " = " + class_name + " F1 "
Recall_text = "{0:.2f}%".format(rec[score_threhold_idx]*100) + " = " + class_name + " Recall "
Precision_text = "{0:.2f}%".format(prec[score_threhold_idx]*100) + " = " + class_name + " Precision "
else:
F1_text = "0.00" + " = " + class_name + " F1 "
Recall_text = "0.00%" + " = " + class_name + " Recall "
Precision_text = "0.00%" + " = " + class_name + " Precision "
rounded_prec = [ '%.2f' % elem for elem in prec ]
rounded_rec = [ '%.2f' % elem for elem in rec ]
results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n")
if len(prec)>0:
print(text + "\t||\tscore_threhold=" + str(score_threhold) + " : " + "F1=" + "{0:.2f}".format(F1[score_threhold_idx])\
+ " ; Recall=" + "{0:.2f}%".format(rec[score_threhold_idx]*100) + " ; Precision=" + "{0:.2f}%".format(prec[score_threhold_idx]*100))
else:
print(text + "\t||\tscore_threhold=" + str(score_threhold) + " : " + "F1=0.00% ; Recall=0.00% ; Precision=0.00%")
ap_dictionary[class_name] = ap
n_images = counter_images_per_class[class_name]
lamr, mr, fppi = log_average_miss_rate(np.array(rec), np.array(fp), n_images)
lamr_dictionary[class_name] = lamr
if draw_plot:
plt.plot(rec, prec, '-o')
area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]]
area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]]
plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r')
fig = plt.gcf()
fig.canvas.manager.set_window_title('AP ' + class_name) # canvas.set_window_title was removed in matplotlib 3.6
plt.title('class: ' + text)
plt.xlabel('Recall')
plt.ylabel('Precision')
axes = plt.gca()
axes.set_xlim([0.0,1.0])
axes.set_ylim([0.0,1.05])
fig.savefig(RESULTS_FILES_PATH + "/AP/" + class_name + ".png")
plt.cla()
plt.plot(score, F1, "-", color='orangered')
plt.title('class: ' + F1_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('F1')
axes = plt.gca()
axes.set_xlim([0.0,1.0])
axes.set_ylim([0.0,1.05])
fig.savefig(RESULTS_FILES_PATH + "/F1/" + class_name + ".png")
plt.cla()
plt.plot(score, rec, "-H", color='gold')
plt.title('class: ' + Recall_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('Recall')
axes = plt.gca()
axes.set_xlim([0.0,1.0])
axes.set_ylim([0.0,1.05])
fig.savefig(RESULTS_FILES_PATH + "/Recall/" + class_name + ".png")
plt.cla()
plt.plot(score, prec, "-s", color='palevioletred')
plt.title('class: ' + Precision_text + "\nscore_threhold=" + str(score_threhold))
plt.xlabel('Score_Threhold')
plt.ylabel('Precision')
axes = plt.gca()
axes.set_xlim([0.0,1.0])
axes.set_ylim([0.0,1.05])
fig.savefig(RESULTS_FILES_PATH + "/Precision/" + class_name + ".png")
plt.cla()
if show_animation:
cv2.destroyAllWindows()
if n_classes == 0:
print("未检测到任何种类请检查标签信息与get_map.py中的classes_path是否修改。")
return 0
results_file.write("\n# mAP of all classes\n")
mAP = sum_AP / n_classes
text = "mAP = {0:.2f}%".format(mAP*100)
results_file.write(text + "\n")
print(text)
shutil.rmtree(TEMP_FILES_PATH)
"""
Count total of detection-results
"""
det_counter_per_class = {}
for txt_file in dr_files_list:
lines_list = file_lines_to_list(txt_file)
for line in lines_list:
class_name = line.split()[0]
if class_name in det_counter_per_class:
det_counter_per_class[class_name] += 1
else:
det_counter_per_class[class_name] = 1
dr_classes = list(det_counter_per_class.keys())
"""
Write number of ground-truth objects per class to results.txt
"""
with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file:
results_file.write("\n# Number of ground-truth objects per class\n")
for class_name in sorted(gt_counter_per_class):
results_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n")
"""
Finish counting true positives
"""
for class_name in dr_classes:
if class_name not in gt_classes:
count_true_positives[class_name] = 0
"""
Write number of detected objects per class to results.txt
"""
with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file:
results_file.write("\n# Number of detected objects per class\n")
for class_name in sorted(dr_classes):
n_det = det_counter_per_class[class_name]
text = class_name + ": " + str(n_det)
text += " (tp:" + str(count_true_positives[class_name]) + ""
text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n"
results_file.write(text)
"""
Plot the total number of occurrences of each class in the ground-truth
"""
if draw_plot:
window_title = "ground-truth-info"
plot_title = "ground-truth\n"
plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)"
x_label = "Number of objects per class"
output_path = RESULTS_FILES_PATH + "/ground-truth-info.png"
to_show = False
plot_color = 'forestgreen'
draw_plot_func(
gt_counter_per_class,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
'',
)
# """
# Plot the total number of occurrences of each class in the "detection-results" folder
# """
# if draw_plot:
# window_title = "detection-results-info"
# # Plot title
# plot_title = "detection-results\n"
# plot_title += "(" + str(len(dr_files_list)) + " files and "
# count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values()))
# plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)"
# # end Plot title
# x_label = "Number of objects per class"
# output_path = RESULTS_FILES_PATH + "/detection-results-info.png"
# to_show = False
# plot_color = 'forestgreen'
# true_p_bar = count_true_positives
# draw_plot_func(
# det_counter_per_class,
# len(det_counter_per_class),
# window_title,
# plot_title,
# x_label,
# output_path,
# to_show,
# plot_color,
# true_p_bar
# )
"""
Draw log-average miss rate plot (Show lamr of all classes in decreasing order)
"""
if draw_plot:
window_title = "lamr"
plot_title = "log-average miss rate"
x_label = "log-average miss rate"
output_path = RESULTS_FILES_PATH + "/lamr.png"
to_show = False
plot_color = 'royalblue'
draw_plot_func(
lamr_dictionary,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
""
)
"""
Draw mAP plot (Show AP's of all classes in decreasing order)
"""
if draw_plot:
window_title = "mAP"
plot_title = "mAP = {0:.2f}%".format(mAP*100)
x_label = "Average Precision"
output_path = RESULTS_FILES_PATH + "/mAP.png"
to_show = True
plot_color = 'royalblue'
draw_plot_func(
ap_dictionary,
n_classes,
window_title,
plot_title,
x_label,
output_path,
to_show,
plot_color,
""
)
return mAP
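# --- Added usage note (not part of the original file) ---
# Typical call, assuming ground-truth/ and detection-results/ txt files
# already exist under ./map_out (kept commented out to avoid side effects):
# mAP_50 = get_map(MINOVERLAP=0.5, draw_plot=False, score_threhold=0.5, path='./map_out')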
def preprocess_gt(gt_path, class_names):
image_ids = os.listdir(gt_path)
results = {}
images = []
bboxes = []
for i, image_id in enumerate(image_ids):
lines_list = file_lines_to_list(os.path.join(gt_path, image_id))
boxes_per_image = []
image = {}
image_id = os.path.splitext(image_id)[0]
image['file_name'] = image_id + '.jpg'
image['width'] = 1
image['height'] = 1
#-----------------------------------------------------------------#
#   Thanks to user 多学学英语吧 for the tip: using a string image id
#   fixes the 'Results do not correspond to current coco set' error
#-----------------------------------------------------------------#
image['id'] = str(image_id)
for line in lines_list:
difficult = 0
if "difficult" in line:
line_split = line.split()
left, top, right, bottom, _difficult = line_split[-5:]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
difficult = 1
else:
line_split = line.split()
left, top, right, bottom = line_split[-4:]
class_name = ""
for name in line_split[:-4]:
class_name += name + " "
class_name = class_name[:-1]
left, top, right, bottom = float(left), float(top), float(right), float(bottom)
if class_name not in class_names:
continue
cls_id = class_names.index(class_name) + 1
bbox = [left, top, right - left, bottom - top, difficult, str(image_id), cls_id, (right - left) * (bottom - top) - 10.0]
boxes_per_image.append(bbox)
images.append(image)
bboxes.extend(boxes_per_image)
results['images'] = images
categories = []
for i, cls in enumerate(class_names):
category = {}
category['supercategory'] = cls
category['name'] = cls
category['id'] = i + 1
categories.append(category)
results['categories'] = categories
annotations = []
for i, box in enumerate(bboxes):
annotation = {}
annotation['area'] = box[-1]
annotation['category_id'] = box[-2]
annotation['image_id'] = box[-3]
annotation['iscrowd'] = box[-4]
annotation['bbox'] = box[:4]
annotation['id'] = i
annotations.append(annotation)
results['annotations'] = annotations
return results
def preprocess_dr(dr_path, class_names):
image_ids = os.listdir(dr_path)
results = []
for image_id in image_ids:
lines_list = file_lines_to_list(os.path.join(dr_path, image_id))
image_id = os.path.splitext(image_id)[0]
for line in lines_list:
line_split = line.split()
confidence, left, top, right, bottom = line_split[-5:]
class_name = ""
for name in line_split[:-5]:
class_name += name + " "
class_name = class_name[:-1]
left, top, right, bottom = float(left), float(top), float(right), float(bottom)
result = {}
result["image_id"] = str(image_id)
if class_name not in class_names:
continue
result["category_id"] = class_names.index(class_name) + 1
result["bbox"] = [left, top, right - left, bottom - top]
result["score"] = float(confidence)
results.append(result)
return results
def get_coco_map(class_names, path):
GT_PATH = os.path.join(path, 'ground-truth')
DR_PATH = os.path.join(path, 'detection-results')
COCO_PATH = os.path.join(path, 'coco_eval')
if not os.path.exists(COCO_PATH):
os.makedirs(COCO_PATH)
GT_JSON_PATH = os.path.join(COCO_PATH, 'instances_gt.json')
DR_JSON_PATH = os.path.join(COCO_PATH, 'instances_dr.json')
with open(GT_JSON_PATH, "w") as f:
results_gt = preprocess_gt(GT_PATH, class_names)
json.dump(results_gt, f, indent=4)
with open(DR_JSON_PATH, "w") as f:
results_dr = preprocess_dr(DR_PATH, class_names)
json.dump(results_dr, f, indent=4)
if len(results_dr) == 0:
print("未检测到任何目标。")
return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
cocoGt = COCO(GT_JSON_PATH)
cocoDt = cocoGt.loadRes(DR_JSON_PATH)
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
return cocoEval.stats
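# --- Added usage note (not part of the original file) ---
# Hedged sketch: the class names below are placeholders; in this repo they
# would normally come from get_classes(classes_path). Requires ground-truth/
# and detection-results/ txt files under the given path.
# stats = get_coco_map(class_names=["class_a", "class_b"], path="./map_out")
# print("AP@[0.50:0.95] =", stats[0])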

View File

@ -1,117 +0,0 @@
#-------------------------------------------------------#
#   Processes the COCO dataset: generates the txt files used for training from the json annotation files
#-------------------------------------------------------#
import json
import os
from collections import defaultdict
#-------------------------------------------------------#
#   Paths to the COCO train/val image folders
#-------------------------------------------------------#
train_datasets_path = "coco_dataset/train2017"
val_datasets_path = "coco_dataset/val2017"
#-------------------------------------------------------#
#   Paths to the COCO train/val annotation files
#-------------------------------------------------------#
train_annotation_path = "coco_dataset/annotations/instances_train2017.json"
val_annotation_path = "coco_dataset/annotations/instances_val2017.json"
#-------------------------------------------------------#
#   Paths of the generated txt files
#-------------------------------------------------------#
train_output_path = "coco_train.txt"
val_output_path = "coco_val.txt"
if __name__ == "__main__":
name_box_id = defaultdict(list)
id_name = dict()
f = open(train_annotation_path, encoding='utf-8')
data = json.load(f)
annotations = data['annotations']
for ant in annotations:
id = ant['image_id']
name = os.path.join(train_datasets_path, '%012d.jpg' % id)
cat = ant['category_id']
if cat >= 1 and cat <= 11:
cat = cat - 1
elif cat >= 13 and cat <= 25:
cat = cat - 2
elif cat >= 27 and cat <= 28:
cat = cat - 3
elif cat >= 31 and cat <= 44:
cat = cat - 5
elif cat >= 46 and cat <= 65:
cat = cat - 6
elif cat == 67:
cat = cat - 7
elif cat == 70:
cat = cat - 9
elif cat >= 72 and cat <= 82:
cat = cat - 10
elif cat >= 84 and cat <= 90:
cat = cat - 11
name_box_id[name].append([ant['bbox'], cat])
f = open(train_output_path, 'w')
for key in name_box_id.keys():
f.write(key)
box_infos = name_box_id[key]
for info in box_infos:
x_min = int(info[0][0])
y_min = int(info[0][1])
x_max = x_min + int(info[0][2])
y_max = y_min + int(info[0][3])
box_info = " %d,%d,%d,%d,%d" % (
x_min, y_min, x_max, y_max, int(info[1]))
f.write(box_info)
f.write('\n')
f.close()
name_box_id = defaultdict(list)
id_name = dict()
f = open(val_annotation_path, encoding='utf-8')
data = json.load(f)
annotations = data['annotations']
for ant in annotations:
id = ant['image_id']
name = os.path.join(val_datasets_path, '%012d.jpg' % id)
cat = ant['category_id']
if cat >= 1 and cat <= 11:
cat = cat - 1
elif cat >= 13 and cat <= 25:
cat = cat - 2
elif cat >= 27 and cat <= 28:
cat = cat - 3
elif cat >= 31 and cat <= 44:
cat = cat - 5
elif cat >= 46 and cat <= 65:
cat = cat - 6
elif cat == 67:
cat = cat - 7
elif cat == 70:
cat = cat - 9
elif cat >= 72 and cat <= 82:
cat = cat - 10
elif cat >= 84 and cat <= 90:
cat = cat - 11
name_box_id[name].append([ant['bbox'], cat])
f = open(val_output_path, 'w')
for key in name_box_id.keys():
f.write(key)
box_infos = name_box_id[key]
for info in box_infos:
x_min = int(info[0][0])
y_min = int(info[0][1])
x_max = x_min + int(info[0][2])
y_max = y_min + int(info[0][3])
box_info = " %d,%d,%d,%d,%d" % (
x_min, y_min, x_max, y_max, int(info[1]))
f.write(box_info)
f.write('\n')
f.close()
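# --- Added refactor sketch (not part of the original script) ---
# The two identical if/elif chains above remap COCO's sparse category ids
# (1..90, with gaps) onto 80 contiguous class ids; they could be factored
# into a single helper such as this:
def coco91_to_80(cat):
    # (lo, hi, offset): ids in lo..hi shift down by offset; ids in the gaps
    # (12, 26, 29, 30, 45, 66, 68, 69, 71, 83) are unused by COCO.
    for lo, hi, off in [(1, 11, 1), (13, 25, 2), (27, 28, 3), (31, 44, 5),
                        (46, 65, 6), (67, 67, 7), (70, 70, 9), (72, 82, 10),
                        (84, 90, 11)]:
        if lo <= cat <= hi:
            return cat - off
    return None  # not a valid COCO category id

assert coco91_to_80(1) == 0 and coco91_to_80(67) == 60 and coco91_to_80(90) == 79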

View File

@ -1,113 +0,0 @@
import json
import os
import numpy as np
import torch
from PIL import Image
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tqdm import tqdm
from utils.utils import cvtColor, preprocess_input, resize_image
from yolo import YOLO
#---------------------------------------------------------------------------#
#   map_mode controls what this script computes when run:
#   0 = the whole mAP pipeline (obtain predictions, then compute mAP)
#   1 = only obtain the predictions
#   2 = only compute the mAP
#---------------------------------------------------------------------------#
map_mode = 0
#-------------------------------------------------------#
#   Paths to the validation-set annotations and images
#-------------------------------------------------------#
cocoGt_path = 'coco_dataset/annotations/instances_val2017.json'
dataset_img_path = 'coco_dataset/val2017'
#-------------------------------------------------------#
#   Output folder for the results; defaults to map_out
#-------------------------------------------------------#
temp_save_path = 'map_out/coco_eval'
class mAP_YOLO(YOLO):
#---------------------------------------------------#
#   Detect an image
#---------------------------------------------------#
def detect_image(self, image_id, image, results, clsid2catid):
#---------------------------------------------------#
#   Compute the height and width of the input image
#---------------------------------------------------#
image_shape = np.array(np.shape(image)[0:2])
#---------------------------------------------------------#
#   Convert the image to RGB here so grayscale images do not error out during prediction.
#   The code only supports predicting on RGB images; every other type is converted to RGB.
#---------------------------------------------------------#
image = cvtColor(image)
#---------------------------------------------------------#
#   Pad the image with gray bars for a distortion-free resize;
#   a plain resize can also be used for recognition.
#---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1],self.input_shape[0]), self.letterbox_image)
#---------------------------------------------------------#
#   Add the batch_size dimension
#---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
#---------------------------------------------------------#
#   Feed the image into the network for prediction!
#---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
#---------------------------------------------------------#
#   Stack the predicted boxes, then apply non-maximum suppression
#---------------------------------------------------------#
outputs = self.bbox_util.non_max_suppression(outputs, self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
if outputs[0] is None:
return outputs
top_label = np.array(outputs[0][:, 5], dtype = 'int32')
top_conf = outputs[0][:, 4]
top_boxes = outputs[0][:, :4]
for i, c in enumerate(top_label):
result = {}
top, left, bottom, right = top_boxes[i]
result["image_id"] = int(image_id)
result["category_id"] = clsid2catid[c]
result["bbox"] = [float(left),float(top),float(right-left),float(bottom-top)]
result["score"] = float(top_conf[i])
results.append(result)
return results
if __name__ == "__main__":
if not os.path.exists(temp_save_path):
os.makedirs(temp_save_path)
cocoGt = COCO(cocoGt_path)
ids = list(cocoGt.imgToAnns.keys())
clsid2catid = cocoGt.getCatIds()
if map_mode == 0 or map_mode == 1:
yolo = mAP_YOLO(confidence = 0.001, nms_iou = 0.65)
with open(os.path.join(temp_save_path, 'eval_results.json'),"w") as f:
results = []
for image_id in tqdm(ids):
image_path = os.path.join(dataset_img_path, cocoGt.loadImgs(image_id)[0]['file_name'])
image = Image.open(image_path)
results = yolo.detect_image(image_id, image, results, clsid2catid)
json.dump(results, f)
if map_mode == 0 or map_mode == 2:
cocoDt = cocoGt.loadRes(os.path.join(temp_save_path, 'eval_results.json'))
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
print("Get map done.")

View File

@ -1,153 +0,0 @@
import os
import random
import xml.etree.ElementTree as ET
import numpy as np
from utils.utils import get_classes
#--------------------------------------------------------------------------------------------------------------------------------#
#   annotation_mode controls what this script computes when run:
#   0 = the whole label pipeline: the txt files in VOCdevkit/VOC2007/ImageSets plus the 2007_train.txt and 2007_val.txt used for training
#   1 = only the txt files in VOCdevkit/VOC2007/ImageSets
#   2 = only the 2007_train.txt and 2007_val.txt used for training
#--------------------------------------------------------------------------------------------------------------------------------#
annotation_mode = 0
#-------------------------------------------------------------------#
#   Must be modified: the class information used to generate 2007_train.txt and 2007_val.txt.
#   Simply keep it identical to the classes_path used for training and prediction.
#   If the generated 2007_train.txt contains no object information,
#   it is because the classes were not set correctly.
#   Only takes effect when annotation_mode is 0 or 2.
#-------------------------------------------------------------------#
classes_path = 'model_data/voc_classes.txt'
#--------------------------------------------------------------------------------------------------------------------------------#
#   trainval_percent sets the (train + val) : test ratio; by default (train + val) : test = 9 : 1.
#   train_percent sets the train : val ratio inside (train + val); by default train : val = 9 : 1.
#   Only takes effect when annotation_mode is 0 or 1.
#--------------------------------------------------------------------------------------------------------------------------------#
trainval_percent = 0.9
train_percent = 0.9
#-------------------------------------------------------#
#   Points to the folder containing the VOC dataset;
#   defaults to the VOC dataset in the repository root
#-------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit'
VOCdevkit_sets = [('2007', 'train'), ('2007', 'val')]
classes, _ = get_classes(classes_path)
#-------------------------------------------------------#
#   Count the number of objects
#-------------------------------------------------------#
photo_nums = np.zeros(len(VOCdevkit_sets))
nums = np.zeros(len(classes))
def convert_annotation(year, image_id, list_file):
in_file = open(os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml'%(year, image_id)), encoding='utf-8')
tree=ET.parse(in_file)
root = tree.getroot()
for obj in root.iter('object'):
difficult = 0
if obj.find('difficult')!=None:
difficult = obj.find('difficult').text
cls = obj.find('name').text
if cls not in classes or int(difficult)==1:
continue
cls_id = classes.index(cls)
xmlbox = obj.find('bndbox')
b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)), int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text)))
list_file.write(" " + ",".join([str(a) for a in b]) + ',' + str(cls_id))
nums[classes.index(cls)] = nums[classes.index(cls)] + 1
if __name__ == "__main__":
random.seed(0)
if " " in os.path.abspath(VOCdevkit_path):
raise ValueError("数据集存放的文件夹路径与图片名称中不可以存在空格,否则会影响正常的模型训练,请注意修改。")
if annotation_mode == 0 or annotation_mode == 1:
print("Generate txt in ImageSets.")
xmlfilepath = os.path.join(VOCdevkit_path, 'VOC2007/Annotations')
saveBasePath = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets/Main')
temp_xml = os.listdir(xmlfilepath)
total_xml = []
for xml in temp_xml:
if xml.endswith(".xml"):
total_xml.append(xml)
num = len(total_xml)
list = range(num)
tv = int(num*trainval_percent)
tr = int(tv*train_percent)
trainval= random.sample(list,tv)
train = random.sample(trainval,tr)
print("train and val size",tv)
print("train size",tr)
ftrainval = open(os.path.join(saveBasePath,'trainval.txt'), 'w')
ftest = open(os.path.join(saveBasePath,'test.txt'), 'w')
ftrain = open(os.path.join(saveBasePath,'train.txt'), 'w')
fval = open(os.path.join(saveBasePath,'val.txt'), 'w')
for i in list:
name=total_xml[i][:-4]+'\n'
if i in trainval:
ftrainval.write(name)
if i in train:
ftrain.write(name)
else:
fval.write(name)
else:
ftest.write(name)
ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
print("Generate txt in ImageSets done.")
if annotation_mode == 0 or annotation_mode == 2:
print("Generate 2007_train.txt and 2007_val.txt for train.")
type_index = 0
for year, image_set in VOCdevkit_sets:
image_ids = open(os.path.join(VOCdevkit_path, 'VOC%s/ImageSets/Main/%s.txt'%(year, image_set)), encoding='utf-8').read().strip().split()
list_file = open('%s_%s.txt'%(year, image_set), 'w', encoding='utf-8')
for image_id in image_ids:
list_file.write('%s/VOC%s/JPEGImages/%s.jpg'%(os.path.abspath(VOCdevkit_path), year, image_id))
convert_annotation(year, image_id, list_file)
list_file.write('\n')
photo_nums[type_index] = len(image_ids)
type_index += 1
list_file.close()
print("Generate 2007_train.txt and 2007_val.txt for train done.")
def printTable(List1, List2):
for i in range(len(List1[0])):
print("|", end=' ')
for j in range(len(List1)):
print(List1[j][i].rjust(int(List2[j])), end=' ')
print("|", end=' ')
print()
str_nums = [str(int(x)) for x in nums]
tableData = [
classes, str_nums
]
colWidths = [0]*len(tableData)
len1 = 0
for i in range(len(tableData)):
for j in range(len(tableData[i])):
if len(tableData[i][j]) > colWidths[i]:
colWidths[i] = len(tableData[i][j])
printTable(tableData, colWidths)
if photo_nums[0] <= 500:
print("训练集数量小于500属于较小的数据量请注意设置较大的训练世代Epoch以满足足够的梯度下降次数Step")
if np.sum(nums) == 0:
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("在数据集中并未获得任何目标请注意修改classes_path对应自己的数据集并且保证标签名字正确否则训练将会没有任何效果")
print("(重要的事情说三遍)。")

View File

@ -1,424 +0,0 @@
import colorsys
import os
import time
import numpy as np
import torch
import torch.nn as nn
from PIL import ImageDraw, ImageFont
from app.core.yolo_detect.nets.yolo import YoloBody
from app.core.yolo_detect.utils.utils import (cvtColor, get_classes, preprocess_input,
resize_image, show_config)
from app.core.yolo_detect.utils.utils_bbox import DecodeBox
'''
Must-read notes for training on your own dataset!
'''
class YOLO(object):
_defaults = {
#--------------------------------------------------------------------------#
#   When predicting with your own trained model, be sure to modify model_path and classes_path!
#   model_path points to a weight file under the logs folder.
#
#   After training there are several weight files under logs; pick one with a low validation loss.
#   A low validation loss does not imply a high mAP, only that the weights generalize well on the validation set.
#   If a shape mismatch occurs, also check that model_path and classes_path match the ones used for training.
#--------------------------------------------------------------------------#
"model_path" : 'app/core/yolo_detect/model_data/best_epoch_weights.pth',
"classes_path" : 'app/core/yolo_detect/model_data/voc_classes.txt',
#---------------------------------------------------------------------#
#   Size of the input image; must be a multiple of 32.
#---------------------------------------------------------------------#
"input_shape" : [640, 640],
#------------------------------------------------------#
#   Which YOLOv8 variant is used:
#   n : yolov8_n
#   s : yolov8_s
#   m : yolov8_m
#   l : yolov8_l
#   x : yolov8_x
#------------------------------------------------------#
"phi" : 'l',
#---------------------------------------------------------------------#
#   Only predicted boxes whose score exceeds this confidence are kept
#---------------------------------------------------------------------#
"confidence" : 0.5,
#---------------------------------------------------------------------#
#   IoU threshold used for non-maximum suppression
#---------------------------------------------------------------------#
"nms_iou" : 0.3,
#---------------------------------------------------------------------#
#   Controls whether letterbox_image is used to resize the input without distortion.
#   Repeated tests found that disabling letterbox_image and resizing directly works better.
#---------------------------------------------------------------------#
"letterbox_image" : False,
#-------------------------------#
#   Whether to use CUDA;
#   set it to False if there is no GPU
#-------------------------------#
"cuda" : True,
}
@classmethod
def get_defaults(cls, n):
if n in cls._defaults:
return cls._defaults[n]
else:
return "Unrecognized attribute name '" + n + "'"
#---------------------------------------------------#
#   Initialize YOLO
#---------------------------------------------------#
def __init__(self, **kwargs):
self.__dict__.update(self._defaults)
for name, value in kwargs.items():
setattr(self, name, value)
self._defaults[name] = value
#---------------------------------------------------#
#   Get the class names and the number of classes
#---------------------------------------------------#
self.class_names, self.num_classes = get_classes(self.classes_path)
self.bbox_util = DecodeBox(self.num_classes, (self.input_shape[0], self.input_shape[1]))
#---------------------------------------------------#
#   Assign a different color to each class for drawing boxes
#---------------------------------------------------#
hsv_tuples = [(x / self.num_classes, 1., 1.) for x in range(self.num_classes)]
self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
self.colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), self.colors))
self.generate()
show_config(**self._defaults)
#---------------------------------------------------#
#   Build the model
#---------------------------------------------------#
def generate(self, onnx=False):
#---------------------------------------------------#
#   Build the YOLO model and load its weights
#---------------------------------------------------#
self.net = YoloBody(self.input_shape, self.num_classes, self.phi)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.net.load_state_dict(torch.load(self.model_path, map_location=device))
self.net = self.net.fuse().eval()
print('{} model, and classes loaded.'.format(self.model_path))
if not onnx:
if self.cuda:
self.net = nn.DataParallel(self.net)
self.net = self.net.cuda()
#---------------------------------------------------#
#   Detect an image
#---------------------------------------------------#
def detect_image(self, image, crop = False, count = False):
#---------------------------------------------------#
#   Compute the height and width of the input image
#---------------------------------------------------#
image_shape = np.array(np.shape(image)[0:2])
#---------------------------------------------------------#
#   Convert the image to RGB here so grayscale images do not error out during prediction.
#   The code only supports predicting on RGB images; every other type is converted to RGB.
#---------------------------------------------------------#
image = cvtColor(image)
#---------------------------------------------------------#
#   Pad the image with gray bars for a distortion-free resize;
#   a plain resize can also be used for recognition.
#---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
#---------------------------------------------------------#
#   Add the batch_size dimension
# h, w, 3 => 3, h, w => 1, 3, h, w
#---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
#---------------------------------------------------------#
#   Feed the image into the network for prediction!
#---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
#---------------------------------------------------------#
#   Stack the predicted boxes, then apply non-maximum suppression
#---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(outputs, self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
if results[0] is None:
return image,[]
top_label = np.array(results[0][:, 5], dtype = 'int32')
top_conf = results[0][:, 4]
top_boxes = results[0][:, :4]
#---------------------------------------------------------#
#   Set the font and the box outline thickness
#---------------------------------------------------------#
font = ImageFont.truetype(font='model_data/simhei.ttf', size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
thickness = int(max((image.size[0] + image.size[1]) // np.mean(self.input_shape), 1))
#---------------------------------------------------------#
#   Count detections per class
#---------------------------------------------------------#
if count:
print("top_label:", top_label)
classes_nums = np.zeros([self.num_classes])
for i in range(self.num_classes):
num = np.sum(top_label == i)
if num > 0:
print(self.class_names[i], " : ", num)
classes_nums[i] = num
print("classes_nums:", classes_nums)
#---------------------------------------------------------#
#   Optionally crop out the detected objects
#---------------------------------------------------------#
if crop:
for i, c in list(enumerate(top_boxes)):
top, left, bottom, right = top_boxes[i]
top = max(0, np.floor(top).astype('int32'))
left = max(0, np.floor(left).astype('int32'))
bottom = min(image.size[1], np.floor(bottom).astype('int32'))
right = min(image.size[0], np.floor(right).astype('int32'))
dir_save_path = "img_crop"
if not os.path.exists(dir_save_path):
os.makedirs(dir_save_path)
crop_image = image.crop([left, top, right, bottom])
crop_image.save(os.path.join(dir_save_path, "crop_" + str(i) + ".png"), quality=95, subsampling=0)
print("save crop_" + str(i) + ".png to " + dir_save_path)
#---------------------------------------------------------#
#   Draw the detections on the image
#---------------------------------------------------------#
predicted_class_list=[]
for i, c in list(enumerate(top_label)):
predicted_class = self.class_names[int(c)]
box = top_boxes[i]
score = top_conf[i]
top, left, bottom, right = box
top = max(0, np.floor(top).astype('int32'))
left = max(0, np.floor(left).astype('int32'))
bottom = min(image.size[1], np.floor(bottom).astype('int32'))
right = min(image.size[0], np.floor(right).astype('int32'))
label = '{} {:.2f}'.format(predicted_class, score)
predicted_class_list.append(predicted_class)
draw = ImageDraw.Draw(image)
# label_size = draw.textsize(label, font)   # old Pillow API, removed in Pillow 10
bbox = draw.textbbox((0, 0), label, font=font)
label_size = (bbox[2] - bbox[0], bbox[3] - bbox[1]) # text width and height
label = label.encode('utf-8')
print(label, top, left, bottom, right)
if top - label_size[1] >= 0:
text_origin = np.array([left, top - label_size[1]])
else:
text_origin = np.array([left, top + 1])
for i in range(thickness):
draw.rectangle([left + i, top + i, right - i, bottom - i], outline=self.colors[c])
draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=self.colors[c])
draw.text(text_origin, str(label,'UTF-8'), fill=(0, 0, 0), font=font)
del draw
return image,predicted_class_list
def get_FPS(self, image, test_interval):
image_shape = np.array(np.shape(image)[0:2])
#---------------------------------------------------------#
#   Convert the image to RGB here so grayscale images do not error out during prediction.
#   The code only supports predicting on RGB images; every other type is converted to RGB.
#---------------------------------------------------------#
image = cvtColor(image)
#---------------------------------------------------------#
#   Pad the image with gray bars for a distortion-free resize;
#   a plain resize can also be used for recognition.
#---------------------------------------------------------#
image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
#---------------------------------------------------------#
#   Add the batch_size dimension
#---------------------------------------------------------#
image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
with torch.no_grad():
images = torch.from_numpy(image_data)
if self.cuda:
images = images.cuda()
#---------------------------------------------------------#
#   Feed the image into the network for prediction!
#---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
#---------------------------------------------------------#
#   Stack the predicted boxes, then apply non-maximum suppression
#---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(outputs, self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
t1 = time.time()
for _ in range(test_interval):
with torch.no_grad():
#---------------------------------------------------------#
#   Feed the image into the network for prediction!
#---------------------------------------------------------#
outputs = self.net(images)
outputs = self.bbox_util.decode_box(outputs)
#---------------------------------------------------------#
#   Stack the predicted boxes, then apply non-maximum suppression
#---------------------------------------------------------#
results = self.bbox_util.non_max_suppression(outputs, self.num_classes, self.input_shape,
image_shape, self.letterbox_image, conf_thres = self.confidence, nms_thres = self.nms_iou)
t2 = time.time()
tact_time = (t2 - t1) / test_interval
return tact_time
    def detect_heatmap(self, image, heatmap_save_path):
        import cv2
        import matplotlib.pyplot as plt

        def sigmoid(x):
            y = 1.0 / (1.0 + np.exp(-x))
            return y

        #---------------------------------------------------------#
        #   Convert the image to RGB here so grayscale inputs do not
        #   fail at prediction time; only RGB images are supported.
        #---------------------------------------------------------#
        image = cvtColor(image)
        #---------------------------------------------------------#
        #   Pad the image with gray bars for a distortion-free resize.
        #   A plain resize would also work for detection.
        #---------------------------------------------------------#
        image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
        #---------------------------------------------------------#
        #   Add the batch dimension.
        #---------------------------------------------------------#
        image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)

        with torch.no_grad():
            images = torch.from_numpy(image_data)
            if self.cuda:
                images = images.cuda()
            #---------------------------------------------------------#
            #   Feed the image into the network for prediction.
            #---------------------------------------------------------#
            dbox, cls, x, anchors, strides = self.net(images)
            outputs = [xi.split((xi.size()[1] - self.num_classes, self.num_classes), 1)[1] for xi in x]

        plt.imshow(image, alpha=1)
        plt.axis('off')
        mask = np.zeros((image.size[1], image.size[0]))
        for sub_output in outputs:
            sub_output = sub_output.cpu().numpy()
            b, c, h, w = np.shape(sub_output)
            sub_output = np.transpose(np.reshape(sub_output, [b, -1, h, w]), [0, 2, 3, 1])[0]
            score = np.max(sigmoid(sub_output[..., :]), -1)
            score = cv2.resize(score, (image.size[0], image.size[1]))
            normed_score = (score * 255).astype('uint8')
            mask = np.maximum(mask, normed_score)

        plt.imshow(mask, alpha=0.5, interpolation='nearest', cmap="jet")
        plt.axis('off')
        plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
        plt.margins(0, 0)
        plt.savefig(heatmap_save_path, dpi=200, bbox_inches='tight', pad_inches=-0.1)
        print("Saved heatmap to " + heatmap_save_path)
        plt.show()
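The heatmap reduction above works per detection level: the class logits pass through a sigmoid, the maximum over classes gives one confidence map per level, each map is resized to the input size, and levels are merged with an element-wise max. A toy sketch of that reduction in plain numpy (shapes are illustrative, and `np.kron` stands in for `cv2.resize`):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Two fake detection levels with shape (batch, num_classes, H, W)
levels = [np.random.randn(1, 3, 8, 8), np.random.randn(1, 3, 16, 16)]

heat = np.zeros((32, 32))
for logits in levels:
    per_class = sigmoid(logits)[0]                     # (num_classes, H, W)
    score = per_class.max(axis=0)                      # max over classes -> (H, W)
    factor = 32 // score.shape[0]
    score = np.kron(score, np.ones((factor, factor)))  # nearest-neighbour-style upsample
    heat = np.maximum(heat, score * 255)               # merge levels by element-wise max
```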
    def convert_to_onnx(self, simplify, model_path):
        import onnx
        self.generate(onnx=True)

        im = torch.zeros(1, 3, *self.input_shape).to('cpu')  # dummy input, shape (1, 3, H, W) BCHW
        input_layer_names = ["images"]
        output_layer_names = ["output"]

        # Export the model
        print(f'Starting export with onnx {onnx.__version__}.')
        torch.onnx.export(self.net,
                          im,
                          f=model_path,
                          verbose=False,
                          opset_version=12,
                          training=torch.onnx.TrainingMode.EVAL,
                          do_constant_folding=True,
                          input_names=input_layer_names,
                          output_names=output_layer_names,
                          dynamic_axes=None)

        # Checks
        model_onnx = onnx.load(model_path)    # load the exported model
        onnx.checker.check_model(model_onnx)  # validate the graph

        # Simplify the graph with onnx-simplifier
        if simplify:
            import onnxsim
            print(f'Simplifying with onnx-simplifier {onnxsim.__version__}.')
            model_onnx, check = onnxsim.simplify(
                model_onnx,
                dynamic_input_shape=False,
                input_shapes=None)
            assert check, 'simplified ONNX model failed the validity check'
            onnx.save(model_onnx, model_path)

        print('ONNX model saved as {}'.format(model_path))
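After export, the graph can be sanity-checked with onnxruntime. A minimal sketch, assuming the model was exported with a 640x640 `input_shape` and saved as `model.onnx` (both are assumptions):

```python
import numpy as np
import onnxruntime as ort

# "images" matches input_layer_names used in the export above.
sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
dummy = np.zeros((1, 3, 640, 640), dtype=np.float32)  # assumed input_shape
outputs = sess.run(None, {"images": dummy})
print([o.shape for o in outputs])
```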
    def get_map_txt(self, image_id, image, class_names, map_out_path):
        f = open(os.path.join(map_out_path, "detection-results/" + image_id + ".txt"), "w", encoding='utf-8')
        image_shape = np.array(np.shape(image)[0:2])
        #---------------------------------------------------------#
        #   Convert the image to RGB here so grayscale inputs do not
        #   fail at prediction time; only RGB images are supported.
        #---------------------------------------------------------#
        image = cvtColor(image)
        #---------------------------------------------------------#
        #   Pad the image with gray bars for a distortion-free resize.
        #   A plain resize would also work for detection.
        #---------------------------------------------------------#
        image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
        #---------------------------------------------------------#
        #   Add the batch dimension.
        #---------------------------------------------------------#
        image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)

        with torch.no_grad():
            images = torch.from_numpy(image_data)
            if self.cuda:
                images = images.cuda()
            #---------------------------------------------------------#
            #   Feed the image into the network for prediction.
            #---------------------------------------------------------#
            outputs = self.net(images)
            outputs = self.bbox_util.decode_box(outputs)
            #---------------------------------------------------------#
            #   Stack the predicted boxes, then apply non-maximum suppression.
            #---------------------------------------------------------#
            results = self.bbox_util.non_max_suppression(outputs, self.num_classes, self.input_shape,
                        image_shape, self.letterbox_image, conf_thres=self.confidence, nms_thres=self.nms_iou)

        if results[0] is None:
            f.close()  # close the (still empty) results file before the early return
            return

        top_label = np.array(results[0][:, 5], dtype='int32')
        top_conf = results[0][:, 4]
        top_boxes = results[0][:, :4]

        for i, c in list(enumerate(top_label)):
            predicted_class = self.class_names[int(c)]
            box = top_boxes[i]
            score = str(top_conf[i])

            top, left, bottom, right = box
            if predicted_class not in class_names:
                continue
            f.write("%s %s %s %s %s %s\n" % (predicted_class, score[:6], str(int(left)), str(int(top)), str(int(right)), str(int(bottom))))

        f.close()
        return
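Each line written to `detection-results/<image_id>.txt` has the form `class_name score left top right bottom`, the layout consumed by the usual mAP evaluation scripts, which pair these files with same-named ground-truth txt files. Purely illustrative contents (class names and numbers are made up):

```
crack 0.9231 34 58 412 301
spall 0.4477 120 220 260 330
```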

View File

@@ -52,7 +52,7 @@ async def get_task_result(task_id: str, response: Response):
     # Build the mask info
     masks = [
-        MaskInfo(name=mask["name"], score=mask["score"], coords=mask["coords"])
+        MaskInfo(name=mask["name"], coords=mask["coords"])
         for mask in coords_data
     ]

View File

@@ -11,7 +11,6 @@ class ImageInfo(BaseModel):
 class MaskInfo(BaseModel):
     name: str
-    score: float
     coords: List[List[int]]
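After this change a mask entry carries only a class name and its polygon coordinates. A minimal sketch of the resulting model with an illustrative payload (values are made up):

```python
from typing import List
from pydantic import BaseModel

class MaskInfo(BaseModel):
    name: str                # detected class label
    coords: List[List[int]]  # vertex list as [x, y] pairs

mask = MaskInfo(name="crack", coords=[[12, 30], [48, 30], [48, 77]])
print(mask.model_dump())     # pydantic v2; use .dict() on pydantic v1
```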

View File

@@ -7,12 +7,14 @@ from queue import Queue
 from typing import Dict
 from app.core.model import Model
+from app.core.preprocess import Preprocess
 from app.services.model import TaskStatus, TaskStore


 class Worker:
     def __init__(self):
         self.detection = Model().getModel()
+        self.preprocess = Preprocess().getPreprocess()
         self.task_queue = Queue()
         self.task_store: Dict[str, TaskStore] = {}

@@ -38,17 +40,30 @@ class Worker:
         output_dir = os.path.join(UPLOAD_DIR, task_id, "outputs")
         os.makedirs(output_dir, exist_ok=True)

-        for idx, input_img_path in enumerate(task.images):
+        # Get a per-image label list: 0 means skip, 1 means run detection
+        image_labels = self.preprocess.preprocess(task.images)
+        for idx, (input_img_path, label) in enumerate(zip(task.images, image_labels)):
             print(f"Processing task {task_id}, image {input_img_path}...")
-            img_res, coords_res = self.detection.detect(input_img_path)
-            coords_res = [{"name": name, "score": score, "coords": coords} for name, score, coords in coords_res]
-            print(coords_res)
-            coords_json = json.dumps(coords_res, ensure_ascii=False)
-            out_img_path = os.path.join(str(output_dir), f"{idx}.jpg")
-            cv2.imwrite(out_img_path, img_res)
-            task.result.append(
-                {"input_img_path": input_img_path, "output_img_path": out_img_path, "coords": coords_json})
+            if label == 0:
+                # Label 0: skip model detection; output path and coords stay empty
+                task.result.append(
+                    {"input_img_path": input_img_path, "output_img_path": "", "coords": "[]"}
+                )
+            else:
+                # Run model detection
+                img_res, coords_res = self.detection.detect(input_img_path)
+                coords_res = [{"name": name, "coords": coords} for name, coords in coords_res]
+                coords_json = json.dumps(coords_res, ensure_ascii=False)
+                out_img_path = os.path.join(output_dir, f"{idx}.jpg")
+                cv2.imwrite(out_img_path, img_res)
+                task.result.append(
+                    {"input_img_path": input_img_path, "output_img_path": out_img_path, "coords": coords_json}
+                )
             task.progress = int((idx + 1) / len(task.images) * 100)
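The loop relies on a simple contract: `preprocess(list_of_paths)` returns one flag per image, index-aligned with the input list, where 0 means skip and 1 means run detection. A stand-in that satisfies this contract (the real implementation is the SAM3 preprocessor; this stub is purely illustrative):

```python
from typing import List

class AlwaysDetect:
    """Illustrative stand-in for the SAM3 preprocessor: flags every image for detection."""

    def preprocess(self, image_paths: List[str]) -> List[int]:
        # 1 = run detection, 0 = skip; a real implementation inspects each image.
        return [1 for _ in image_paths]

# Index alignment matters because the worker zips labels with task.images:
labels = AlwaysDetect().preprocess(["a.jpg", "b.jpg"])
assert labels == [1, 1]
```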