diff --git a/.env b/.env index 3ea3ff0..d4b72b8 100644 --- a/.env +++ b/.env @@ -1,4 +1,3 @@ UPLOAD_DIR=uploads MOCK=false -MODEL=yolo #segformer, yolo -PREPROCESS=sam3 \ No newline at end of file +MODEL=yolo_detect #segformer, yolo, yolo_detect \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 1207d96..eeb61bd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,7 +23,7 @@ RUN pip install --no-cache-dir --upgrade -r requirements.txt -i https://pypi.tun COPY ./app /code/app # 删除无用的文件,避免占用磁盘空间 -RUN rm -rf /code/app/core/*.onnx /code/app/core/*.data /code/app/core/*.pt +RUN rm -rf /code/app/core # 暴露端口并启动应用 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"] diff --git a/app/core/model.py b/app/core/model.py index 495de48..3c964d7 100644 --- a/app/core/model.py +++ b/app/core/model.py @@ -1,5 +1,6 @@ from app.core.segformer.detect import Detection as SegFormer, DetectionMock as SegFormerMock from app.core.yolo.detect import YOLOSeg +from app.core.yolo_detect.detect import YOLODetect class Model: @@ -11,6 +12,9 @@ class Model: elif MODEL == "yolo": print("使用 YOLO 模型") self.detection = YOLOSeg() + elif MODEL == "yolo_detect": + print("使用 YOLO_Detect 模型") + self.detection = YOLODetect() def getModel(self): return self.detection diff --git a/app/core/preprocess.py b/app/core/preprocess.py deleted file mode 100644 index 632d2ef..0000000 --- a/app/core/preprocess.py +++ /dev/null @@ -1,12 +0,0 @@ -from app.core.sam3.preprocess import SAM3 - - -class Preprocess: - def __init__(self): - from app.main import PREPROCESS - if PREPROCESS == "sam3": - print("使用 SAM3 进行预处理判断") - self.preprocess = SAM3() - - def getPreprocess(self): - return self.preprocess diff --git a/app/core/sam3/preprocess.py b/app/core/sam3/preprocess.py deleted file mode 100644 index 63e6c98..0000000 --- a/app/core/sam3/preprocess.py +++ /dev/null @@ -1,172 +0,0 @@ -import os -import torch -from pathlib import Path -from PIL import Image -from torch.utils.data import Dataset, 
DataLoader -from sam3.model_builder import build_sam3_image_model -from sam3.train.data.collator import collate_fn_api as collate -from sam3.model.utils.misc import copy_data_to_device -from sam3.train.data.sam3_image_dataset import ( - Datapoint, Image as SAMImage, FindQueryLoaded, InferenceMetadata -) -from sam3.train.transforms.basic_for_api import ComposeAPI, RandomResizeAPI, ToTensorAPI, NormalizeAPI -from sam3.eval.postprocessors import PostProcessImage - -os.environ["CUDA_VISIBLE_DEVICES"] = "0" -os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256" - -# ===== 配置 ===== -CKPT_PATH = os.path.join(os.getcwd(), "app/core/sam3", "sam3.pt") -DEVICE = "cuda:0" - -BATCH_SIZE = 12 # 批量大小,前端要设置 -NUM_WORKERS = 12 # 加载图片的线程数,看前端要不要设置 -CONF_TH = 0.5 -RATIO_TH = 0.5 # 阈值,越大的话过滤越多,但太大会影响近景图片 -_GLOBAL_ID = 1 - -PROMPTS = [ - "wall", - "building wall", - "building facade", - "building exterior wall", - "exterior building facade", -] -IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"} - - -# ============ - - -class ImgPathList(Dataset): - def __init__(self, img_paths: list): - """ - 初始化 ImgFolder,传入一个图片路径的列表 - - Args: - img_paths (list): 一个包含图片路径的列表 - """ - self.paths = img_paths # 使用传入的路径列表 - - def __len__(self): - return len(self.paths) - - def __getitem__(self, i): - p = self.paths[i] # 直接使用列表中的路径 - img = Image.open(p).convert("RGB") # 打开图片并转换为RGB模式 - return p, img # 返回图片的路径和图片本身 - - - -class SAM3: - def __init__(self): - self.dev = torch.device(DEVICE) - self.postprocessor = PostProcessImage( - max_dets_per_img=-1, - iou_type="segm", - use_original_sizes_box=True, - use_original_sizes_mask=True, - convert_mask_to_rle=False, - detection_threshold=CONF_TH, - to_cpu=False, - ) - self.transform = ComposeAPI( - transforms=[ - RandomResizeAPI(sizes=1008, max_size=1008, square=True, consistent_transform=False), - ToTensorAPI(), - NormalizeAPI(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), - ] - ) - self.model = 
build_sam3_image_model( - checkpoint_path=CKPT_PATH, load_from_HF=False, device=DEVICE - ).to(DEVICE).eval() - - def preprocess(self, image_path_list): - labels = [] - - loader = DataLoader( - ImgPathList(image_path_list), - batch_size=BATCH_SIZE, - shuffle=False, - num_workers=NUM_WORKERS, - pin_memory=True, - collate_fn=self.collate_fn, - ) - - with torch.inference_mode(): - for names, images in loader: - datapoints = [] - name2qids = {} # name -> [qid,...] - for name, img in zip(names, images): - dp = self.create_empty_datapoint() - self.set_image(dp, img) - - qids = [self.add_text_prompt(dp, p) for p in PROMPTS] - name2qids[name] = qids - - datapoints.append(self.transform(dp)) - - batch = collate(datapoints, dict_key="dummy")["dummy"] - batch = copy_data_to_device(batch, self.dev, non_blocking=True) - output = self.model(batch) - - processed = self.postprocessor.process_results(output, batch.find_metadatas) - - for name in names: - any_masks = [] - for qid in name2qids[name]: - res = processed[qid] - m = res.get("masks", None) # 期望: [N,H,W] - if m is None: - any_masks.append(torch.zeros(1, 1, device=self.dev, dtype=torch.bool).squeeze()) - else: - if not torch.is_tensor(m): - m = torch.as_tensor(m, device=self.dev) - any_masks.append(m.any(0)) # [H,W] - - wall_mask = torch.stack(any_masks, 0).any(0) # [H,W] bool - ratio = wall_mask.float().mean().item() - lab = 1 if ratio >= RATIO_TH else 0 - labels.append(lab) - print(f"{name} | wall_ratio={ratio:.4f} -> {lab}") # 这行可以不要 - - return labels - - @staticmethod - def add_text_prompt(datapoint, text_query): - global _GLOBAL_ID - assert len(datapoint.images) == 1, "please set the image first" - w, h = datapoint.images[0].size - datapoint.find_queries.append( - FindQueryLoaded( - query_text=text_query, - image_id=0, - object_ids_output=[], - is_exhaustive=True, - query_processing_order=0, - inference_metadata=InferenceMetadata( - coco_image_id=_GLOBAL_ID, - original_image_id=_GLOBAL_ID, - original_category_id=1, - 
original_size=[w, h],
-                    object_id=0,
-                    frame_index=0,
-                ),
-            )
-        )
-        _GLOBAL_ID += 1
-        return _GLOBAL_ID - 1
-
-    @staticmethod
-    def create_empty_datapoint():
-        return Datapoint(find_queries=[], images=[])
-
-    @staticmethod
-    def set_image(datapoint, pil_image):
-        w, h = pil_image.size
-        datapoint.images = [SAMImage(data=pil_image, objects=[], size=[h, w])] # size 用 [H,W]
-
-    @staticmethod
-    def collate_fn(batch):
-        names, imgs = zip(*batch)
-        return list(names), list(imgs)
diff --git a/app/core/sam3/sam3.pt b/app/core/sam3/sam3.pt
deleted file mode 100644
index 5b7c2ea..0000000
--- a/app/core/sam3/sam3.pt
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9999e2341ceef5e136daa386eecb55cb414446a00ac2b55eb2dfd2f7c3cf8c9e
-size 3450062241
diff --git a/app/core/sam3/__init__.py b/app/core/yolo_detect/__init__.py
similarity index 100%
rename from app/core/sam3/__init__.py
rename to app/core/yolo_detect/__init__.py
diff --git a/app/core/yolo_detect/detect.py b/app/core/yolo_detect/detect.py
new file mode 100644
index 0000000..2f65362
--- /dev/null
+++ b/app/core/yolo_detect/detect.py
@@ -0,0 +1,151 @@
+import io
+import logging
+import os
+import cv2
+import torch
+import numpy as np
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+from PIL import Image, ImageDraw, ImageFont, ImageSequence
+from app.core.yolo_detect.yolo import YOLO  # project YOLO wrapper that this class extends
+from app.core.yolo_detect.utils.utils import (cvtColor, get_classes, preprocess_input,
+                                              resize_image, show_config)
+
+logger = logging.getLogger(__name__)
+
+# Training label -> display name returned to callers in `coords`.
+CLASS_NAMES = {
+    "wall_konggu": "Hollowing",
+    "wall_shenshui": "Water seepage",
+    "wall_kailie": "Cracking",
+    "wall_konggu_gap": "Gap in hollowing",
+    "wall": "Wall",
+}
+
+# A defect box is kept only when at least this share of its area lies inside one wall box.
+MIN_WALL_OVERLAP = 0.6
+# "wall_konggu" boxes larger than this share of the whole image are dropped.
+MAX_KONGGU_IMAGE_RATIO = 0.5
+
+
+class YOLODetect(YOLO):
+    """Detect wall defects and keep only those that lie on a detected wall."""
+
+    def __init__(self):
+        super().__init__()
+        self.classes = CLASS_NAMES
+
+    @staticmethod
+    def _open_image(img_input):
+        """Return a PIL image for a path, a binary file object or a PIL image."""
+        image = img_input if isinstance(img_input, Image.Image) else Image.open(img_input)
+        if image.format == "MPO":
+            # Multi-picture file: keep only the first frame, re-encoded as a plain JPEG.
+            first_frame = next(ImageSequence.Iterator(image))
+            buffer = io.BytesIO()
+            first_frame.save(buffer, format="JPEG")
+            buffer.seek(0)
+            image = Image.open(buffer)
+        return image
+
+    @staticmethod
+    def _clip_box(box, width, height):
+        """Floor a (top, left, bottom, right) box, clip it to the image, return (left, top, right, bottom)."""
+        top, left, bottom, right = box
+        top = max(0, int(np.floor(top)))
+        left = max(0, int(np.floor(left)))
+        bottom = min(height, int(np.floor(bottom)))
+        right = min(width, int(np.floor(right)))
+        return left, top, right, bottom
+
+    @staticmethod
+    def _on_wall(box, wall_boxes):
+        """Return True when at least MIN_WALL_OVERLAP of `box` lies inside a single wall box."""
+        left, top, right, bottom = box
+        area = (right - left) * (bottom - top)
+        if area <= 0:
+            return False
+        for w_left, w_top, w_right, w_bottom in wall_boxes:
+            inter_w = min(right, w_right) - max(left, w_left)
+            inter_h = min(bottom, w_bottom) - max(top, w_top)
+            if inter_w > 0 and inter_h > 0 and inter_w * inter_h / area >= MIN_WALL_OVERLAP:
+                return True
+        return False
+
+    def detect(self, img_input, crop=False, count=False):
+        """Run detection on one image and paint the kept defect boxes onto a mask.
+
+        Args:
+            img_input: Image path, binary file object, or an already-open PIL image.
+            crop: Unused; kept so existing callers keep working.
+            count: Unused; kept so existing callers keep working.
+
+        Returns:
+            (mask, coords). `mask` is an (H, W, 3) uint8 BGR array, black except for the
+            kept defect boxes, which are filled with their class colour. `coords` is a list
+            of (display_name, [top-left, top-right, bottom-right, bottom-left]) tuples with
+            (x, y) integer corners. With no detections the mask is all black and coords is [].
+
+        Raises:
+            Exception: any error from image loading or inference is logged and re-raised
+                (previously it was printed and the method silently returned None).
+        """
+        try:
+            image = self._open_image(img_input)
+            # (height, width) of the source image, taken before the RGB conversion.
+            image_shape = np.array(np.shape(image)[0:2])
+            # The network only handles RGB input, so every other mode is converted.
+            image = cvtColor(image)
+            # Resize to the network input size; per the original note this letterboxes with gray bars or resizes directly.
+            image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
+            # h, w, 3 => 3, h, w => 1, 3, h, w
+            image_data = np.expand_dims(np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
+            with torch.no_grad():
+                images = torch.from_numpy(image_data)
+                if self.cuda:
+                    images = images.cuda()
+                outputs = self.net(images)
+                outputs = self.bbox_util.decode_box(outputs)
+                # Stack the predicted boxes and run non-maximum suppression.
+                results = self.bbox_util.non_max_suppression(outputs, self.num_classes, self.input_shape,
+                                                             image_shape, self.letterbox_image, conf_thres=self.confidence,
+                                                             nms_thres=self.nms_iou)
+
+            width, height = image.size
+            mask = np.zeros((height, width, 3), dtype=np.uint8)
+            if results[0] is None:
+                # Same type as the normal path (the original returned the PIL image here).
+                return mask, []
+
+            top_label = np.array(results[0][:, 5], dtype='int32')
+            top_boxes = results[0][:, :4]
+            detections = [
+                (self.class_names[int(c)], int(c), self._clip_box(box, width, height))
+                for c, box in zip(top_label, top_boxes)
+            ]
+            # Walls are only a reference region: they are never drawn or reported.
+            wall_boxes = [box for name, _, box in detections if name == "wall"]
+
+            coords = []
+            for name, class_id, (left, top, right, bottom) in detections:
+                if name == "wall" or not self._on_wall((left, top, right, bottom), wall_boxes):
+                    continue
+                area = (right - left) * (bottom - top)
+                if name == "wall_konggu" and area / (width * height) > MAX_KONGGU_IMAGE_RATIO:
+                    continue
+                mask[top:bottom, left:right] = self.colors[class_id]
+                coords.append((self.classes.get(name),
+                               [(left, top), (right, top), (right, bottom), (left, bottom)]))
+            # NOTE(review): self.colors is presumably RGB, as the original RGB2BGR conversion implies.
+            return cv2.cvtColor(mask, cv2.COLOR_RGB2BGR), coords
+        except Exception:
+            logger.exception("YOLODetect.detect failed for %r", img_input)
+            raise
+
+
+if __name__ == "__main__":
+    model = YOLODetect()
+    # detect() opens the image itself; the mask it returns is a BGR numpy array.
+    mask, coords = model.detect("test.jpg")
+    cv2.imwrite("mask.jpg", mask, [cv2.IMWRITE_JPEG_QUALITY, 95])
+    print(coords)
diff --git a/app/core/yolo_detect/get_map.py b/app/core/yolo_detect/get_map.py
new file mode 100644
index 0000000..ccda5eb
--- /dev/null
+++ b/app/core/yolo_detect/get_map.py
@@ -0,0 +1,138 @@
+import os
+import xml.etree.ElementTree as ET
+
+from PIL import Image
+from tqdm import tqdm
+
+from utils.utils import get_classes
+from utils.utils_map import get_coco_map, get_map
+from yolo import YOLO
+
+if __name__ == "__main__":
+    '''
+    Recall和Precision不像AP是一个面积的概念,因此在门限值(Confidence)不同时,网络的Recall和Precision值是不同的。
+    默认情况下,本代码计算的Recall和Precision代表的是当门限值(Confidence)为0.5时,所对应的Recall和Precision值。
+
+    受到mAP计算原理的限制,网络在计算mAP时需要获得近乎所有的预测框,这样才可以计算不同门限条件下的Recall和Precision值
+    因此,本代码获得的map_out/detection-results/里面的txt的框的数量一般会比直接predict多一些,目的是列出所有可能的预测框,
+    '''
+    #------------------------------------------------------------------------------------------------------------------#
+    # map_mode用于指定该文件运行时计算的内容
+    # map_mode为0代表整个map计算流程,包括获得预测结果、获得真实框、计算VOC_map。
+    # map_mode为1代表仅仅获得预测结果。
+    # map_mode为2代表仅仅获得真实框。
+    # map_mode为3代表仅仅计算VOC_map。
+    # map_mode为4代表利用COCO工具箱计算当前数据集的0.50:0.95map。需要获得预测结果、获得真实框后并安装pycocotools才行
+
#-------------------------------------------------------------------------------------------------------------------# + map_mode = 0 + #--------------------------------------------------------------------------------------# + # 此处的classes_path用于指定需要测量VOC_map的类别 + # 一般情况下与训练和预测所用的classes_path一致即可 + #--------------------------------------------------------------------------------------# + classes_path = 'model_data/voc_classes.txt' + #--------------------------------------------------------------------------------------# + # MINOVERLAP用于指定想要获得的mAP0.x,mAP0.x的意义是什么请同学们百度一下。 + # 比如计算mAP0.75,可以设定MINOVERLAP = 0.75。 + # + # 当某一预测框与真实框重合度大于MINOVERLAP时,该预测框被认为是正样本,否则为负样本。 + # 因此MINOVERLAP的值越大,预测框要预测的越准确才能被认为是正样本,此时算出来的mAP值越低, + #--------------------------------------------------------------------------------------# + MINOVERLAP = 0.5 + #--------------------------------------------------------------------------------------# + # 受到mAP计算原理的限制,网络在计算mAP时需要获得近乎所有的预测框,这样才可以计算mAP + # 因此,confidence的值应当设置的尽量小进而获得全部可能的预测框。 + # + # 该值一般不调整。因为计算mAP需要获得近乎所有的预测框,此处的confidence不能随便更改。 + # 想要获得不同门限值下的Recall和Precision值,请修改下方的score_threhold。 + #--------------------------------------------------------------------------------------# + confidence = 0.001 + #--------------------------------------------------------------------------------------# + # 预测时使用到的非极大抑制值的大小,越大表示非极大抑制越不严格。 + # + # 该值一般不调整。 + #--------------------------------------------------------------------------------------# + nms_iou = 0.5 + #---------------------------------------------------------------------------------------------------------------# + # Recall和Precision不像AP是一个面积的概念,因此在门限值不同时,网络的Recall和Precision值是不同的。 + # + # 默认情况下,本代码计算的Recall和Precision代表的是当门限值为0.5(此处定义为score_threhold)时所对应的Recall和Precision值。 + # 因为计算mAP需要获得近乎所有的预测框,上面定义的confidence不能随便更改。 + # 这里专门定义一个score_threhold用于代表门限值,进而在计算mAP时找到门限值对应的Recall和Precision值。 + 
#---------------------------------------------------------------------------------------------------------------# + score_threhold = 0.5 + #-------------------------------------------------------# + # map_vis用于指定是否开启VOC_map计算的可视化 + #-------------------------------------------------------# + map_vis = False + #-------------------------------------------------------# + # 指向VOC数据集所在的文件夹 + # 默认指向根目录下的VOC数据集 + #-------------------------------------------------------# + VOCdevkit_path = 'VOCdevkit' + #-------------------------------------------------------# + # 结果输出的文件夹,默认为map_out + #-------------------------------------------------------# + map_out_path = 'map_out' + + image_ids = open(os.path.join(VOCdevkit_path, "VOC2007/ImageSets/Main/test.txt")).read().strip().split() + + if not os.path.exists(map_out_path): + os.makedirs(map_out_path) + if not os.path.exists(os.path.join(map_out_path, 'ground-truth')): + os.makedirs(os.path.join(map_out_path, 'ground-truth')) + if not os.path.exists(os.path.join(map_out_path, 'detection-results')): + os.makedirs(os.path.join(map_out_path, 'detection-results')) + if not os.path.exists(os.path.join(map_out_path, 'images-optional')): + os.makedirs(os.path.join(map_out_path, 'images-optional')) + + class_names, _ = get_classes(classes_path) + + if map_mode == 0 or map_mode == 1: + print("Load model.") + yolo = YOLO(confidence = confidence, nms_iou = nms_iou) + print("Load model done.") + + print("Get predict result.") + for image_id in tqdm(image_ids): + image_path = os.path.join(VOCdevkit_path, "VOC2007/JPEGImages/"+image_id+".jpg") + image = Image.open(image_path) + if map_vis: + image.save(os.path.join(map_out_path, "images-optional/" + image_id + ".jpg")) + yolo.get_map_txt(image_id, image, class_names, map_out_path) + print("Get predict result done.") + + if map_mode == 0 or map_mode == 2: + print("Get ground truth result.") + for image_id in tqdm(image_ids): + with open(os.path.join(map_out_path, "ground-truth/"+image_id+".txt"), 
"w") as new_f: + root = ET.parse(os.path.join(VOCdevkit_path, "VOC2007/Annotations/"+image_id+".xml")).getroot() + for obj in root.findall('object'): + difficult_flag = False + if obj.find('difficult')!=None: + difficult = obj.find('difficult').text + if int(difficult)==1: + difficult_flag = True + obj_name = obj.find('name').text + if obj_name not in class_names: + continue + bndbox = obj.find('bndbox') + left = bndbox.find('xmin').text + top = bndbox.find('ymin').text + right = bndbox.find('xmax').text + bottom = bndbox.find('ymax').text + + if difficult_flag: + new_f.write("%s %s %s %s %s difficult\n" % (obj_name, left, top, right, bottom)) + else: + new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom)) + print("Get ground truth result done.") + + if map_mode == 0 or map_mode == 3: + print("Get map.") + get_map(MINOVERLAP, True, score_threhold = score_threhold, path = map_out_path) + print("Get map done.") + + if map_mode == 4: + print("Get map.") + get_coco_map(class_names = class_names, path = map_out_path) + print("Get map done.") diff --git a/app/core/yolo_detect/model_data/__init__.py b/app/core/yolo_detect/model_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/core/yolo_detect/model_data/best_epoch_weights.pth b/app/core/yolo_detect/model_data/best_epoch_weights.pth new file mode 100644 index 0000000..5ad9cf9 --- /dev/null +++ b/app/core/yolo_detect/model_data/best_epoch_weights.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fab1422f9b5500cd5941a0f6b4445d366937637b3d8615f931f2092aa1f76d5 +size 174941676 diff --git a/app/core/yolo_detect/model_data/simhei.ttf b/app/core/yolo_detect/model_data/simhei.ttf new file mode 100644 index 0000000..5bd4687 Binary files /dev/null and b/app/core/yolo_detect/model_data/simhei.ttf differ diff --git a/app/core/yolo_detect/model_data/voc_classes.txt b/app/core/yolo_detect/model_data/voc_classes.txt new file mode 100644 index 0000000..dc40fec --- 
/dev/null +++ b/app/core/yolo_detect/model_data/voc_classes.txt @@ -0,0 +1,5 @@ +wall +wall_shenshui +wall_konggu +wall_konggu_gap +wall_kailie \ No newline at end of file diff --git a/app/core/yolo_detect/nets/__init__.py b/app/core/yolo_detect/nets/__init__.py new file mode 100644 index 0000000..4287ca8 --- /dev/null +++ b/app/core/yolo_detect/nets/__init__.py @@ -0,0 +1 @@ +# \ No newline at end of file diff --git a/app/core/yolo_detect/nets/backbone.py b/app/core/yolo_detect/nets/backbone.py new file mode 100644 index 0000000..a44ce7f --- /dev/null +++ b/app/core/yolo_detect/nets/backbone.py @@ -0,0 +1,143 @@ +import torch +import torch.nn as nn + + +def autopad(k, p=None, d=1): + # kernel, padding, dilation + # 对输入的特征层进行自动padding,按照Same原则 + if d > 1: + # actual kernel-size + k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] + if p is None: + # auto-pad + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] + return p + +class SiLU(nn.Module): + # SiLU激活函数 + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + +class Conv(nn.Module): + # 标准卷积+标准化+激活函数 + default_act = SiLU() + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): + super().__init__() + self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) + self.bn = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03, affine=True, track_running_stats=True) + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def forward_fuse(self, x): + return self.act(self.conv(x)) + +class Bottleneck(nn.Module): + # 标准瓶颈结构,残差结构 + # c1为输入通道数,c2为输出通道数 + def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, k[0], 1) + self.cv2 = Conv(c_, c2, k[1], 1, g=g) + self.add = shortcut and c1 == c2 + + def forward(self, x): + return x + 
self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + +class C2f(nn.Module): + # CSPNet结构结构,大残差结构 + # c1为输入通道数,c2为输出通道数 + def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): + super().__init__() + self.c = int(c2 * e) + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv((2 + n) * self.c, c2, 1) + self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) + + def forward(self, x): + # 进行一个卷积,然后划分成两份,每个通道都为c + y = list(self.cv1(x).split((self.c, self.c), 1)) + # 每进行一次残差结构都保留,然后堆叠在一起,密集残差 + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) + +class SPPF(nn.Module): + # SPP结构,5、9、13最大池化核的最大池化。 + def __init__(self, c1, c2, k=5): + super().__init__() + c_ = c1 // 2 + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c_ * 4, c2, 1, 1) + self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) + + def forward(self, x): + x = self.cv1(x) + y1 = self.m(x) + y2 = self.m(y1) + return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1)) + +class Backbone(nn.Module): + def __init__(self, base_channels, base_depth, deep_mul, phi, pretrained=False): + super().__init__() + #-----------------------------------------------# + # 输入图片是3, 640, 640 + #-----------------------------------------------# + # 3, 640, 640 => 32, 640, 640 => 64, 320, 320 + self.stem = Conv(3, base_channels, 3, 2) + + # 64, 320, 320 => 128, 160, 160 => 128, 160, 160 + self.dark2 = nn.Sequential( + Conv(base_channels, base_channels * 2, 3, 2), + C2f(base_channels * 2, base_channels * 2, base_depth, True), + ) + # 128, 160, 160 => 256, 80, 80 => 256, 80, 80 + self.dark3 = nn.Sequential( + Conv(base_channels * 2, base_channels * 4, 3, 2), + C2f(base_channels * 4, base_channels * 4, base_depth * 2, True), + ) + # 256, 80, 80 => 512, 40, 40 => 512, 40, 40 + self.dark4 = nn.Sequential( + Conv(base_channels * 4, base_channels * 8, 3, 2), + C2f(base_channels * 8, base_channels * 8, base_depth * 2, True), + ) + # 512, 40, 40 => 
1024 * deep_mul, 20, 20 => 1024 * deep_mul, 20, 20 + self.dark5 = nn.Sequential( + Conv(base_channels * 8, int(base_channels * 16 * deep_mul), 3, 2), + C2f(int(base_channels * 16 * deep_mul), int(base_channels * 16 * deep_mul), base_depth, True), + SPPF(int(base_channels * 16 * deep_mul), int(base_channels * 16 * deep_mul), k=5) + ) + + if pretrained: + url = { + "n" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_n_backbone_weights.pth', + "s" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_s_backbone_weights.pth', + "m" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_m_backbone_weights.pth', + "l" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_l_backbone_weights.pth', + "x" : 'https://github.com/bubbliiiing/yolov8-pytorch/releases/download/v1.0/yolov8_x_backbone_weights.pth', + }[phi] + checkpoint = torch.hub.load_state_dict_from_url(url=url, map_location="cpu", model_dir="./model_data") + self.load_state_dict(checkpoint, strict=False) + print("Load weights from " + url.split('/')[-1]) + + def forward(self, x): + x = self.stem(x) + x = self.dark2(x) + #-----------------------------------------------# + # dark3的输出为256, 80, 80,是一个有效特征层 + #-----------------------------------------------# + x = self.dark3(x) + feat1 = x + #-----------------------------------------------# + # dark4的输出为512, 40, 40,是一个有效特征层 + #-----------------------------------------------# + x = self.dark4(x) + feat2 = x + #-----------------------------------------------# + # dark5的输出为1024 * deep_mul, 20, 20,是一个有效特征层 + #-----------------------------------------------# + x = self.dark5(x) + feat3 = x + return feat1, feat2, feat3 diff --git a/app/core/yolo_detect/nets/yolo.py b/app/core/yolo_detect/nets/yolo.py new file mode 100644 index 0000000..68a21bc --- /dev/null +++ b/app/core/yolo_detect/nets/yolo.py @@ -0,0 +1,176 @@ +import numpy as np +import torch +import torch.nn as nn 
+
+from app.core.yolo_detect.nets.backbone import Backbone, C2f, Conv
+from app.core.yolo_detect.nets.yolo_training import weights_init
+from app.core.yolo_detect.utils.utils_bbox import make_anchors
+
+def fuse_conv_and_bn(conv, bn):
+    # Fold a BatchNorm2d into the preceding Conv2d and return one equivalent, non-trainable Conv2d (less compute).
+    # Fuse Conv2d() and BatchNorm2d() layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/
+    fusedconv = nn.Conv2d(conv.in_channels,
+                          conv.out_channels,
+                          kernel_size=conv.kernel_size,
+                          stride=conv.stride,
+                          padding=conv.padding,
+                          dilation=conv.dilation,
+                          groups=conv.groups,
+                          bias=True).requires_grad_(False).to(conv.weight.device)
+
+    # Fused kernel: diag(bn.weight / sqrt(running_var + eps)) applied to the flattened conv weights
+    w_conv = conv.weight.clone().view(conv.out_channels, -1)
+    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
+    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))
+
+    # Fused bias: the conv bias (zeros when the conv has none) scaled by the same diagonal, plus the BN shift
+    b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
+    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
+    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
+
+    return fusedconv
+
+class DFL(nn.Module):
+    # DFL module: a frozen 1x1 conv whose weights are 0..c1-1, applied to a softmax over the c1 bins
+    # Distribution Focal Loss (DFL) proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
+    def __init__(self, c1=16):
+        super().__init__()
+        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
+        x = torch.arange(c1, dtype=torch.float)
+        self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
+        self.c1 = c1
+
+    def forward(self, x):
+        # bs, self.reg_max * 4, 8400
+        b, c, a = x.shape
+        # bs, 4, self.reg_max, 8400 => bs, self.reg_max, 4, 8400 => b, 4, 8400
+        # Softmax over the c1 bins, then the conv takes the weighted sum with 0..c1-1 to give one number per box side.
+        return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
+        # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
+
+#---------------------------------------------------#
+# yolo_body
+#---------------------------------------------------# +class YoloBody(nn.Module): + def __init__(self, input_shape, num_classes, phi, pretrained=False): + super(YoloBody, self).__init__() + depth_dict = {'n' : 0.33, 's' : 0.33, 'm' : 0.67, 'l' : 1.00, 'x' : 1.00,} + width_dict = {'n' : 0.25, 's' : 0.50, 'm' : 0.75, 'l' : 1.00, 'x' : 1.25,} + deep_width_dict = {'n' : 1.00, 's' : 1.00, 'm' : 0.75, 'l' : 0.50, 'x' : 0.50,} + dep_mul, wid_mul, deep_mul = depth_dict[phi], width_dict[phi], deep_width_dict[phi] + + base_channels = int(wid_mul * 64) # 64 + base_depth = max(round(dep_mul * 3), 1) # 3 + #-----------------------------------------------# + # 输入图片是3, 640, 640 + #-----------------------------------------------# + + #---------------------------------------------------# + # 生成主干模型 + # 获得三个有效特征层,他们的shape分别是: + # 256, 80, 80 + # 512, 40, 40 + # 1024 * deep_mul, 20, 20 + #---------------------------------------------------# + self.backbone = Backbone(base_channels, base_depth, deep_mul, phi, pretrained=pretrained) + + #------------------------加强特征提取网络------------------------# + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + # 1024 * deep_mul + 512, 40, 40 => 512, 40, 40 + self.conv3_for_upsample1 = C2f(int(base_channels * 16 * deep_mul) + base_channels * 8, base_channels * 8, base_depth, shortcut=False) + # 768, 80, 80 => 256, 80, 80 + self.conv3_for_upsample2 = C2f(base_channels * 8 + base_channels * 4, base_channels * 4, base_depth, shortcut=False) + + # 256, 80, 80 => 256, 40, 40 + self.down_sample1 = Conv(base_channels * 4, base_channels * 4, 3, 2) + # 512 + 256, 40, 40 => 512, 40, 40 + self.conv3_for_downsample1 = C2f(base_channels * 8 + base_channels * 4, base_channels * 8, base_depth, shortcut=False) + + # 512, 40, 40 => 512, 20, 20 + self.down_sample2 = Conv(base_channels * 8, base_channels * 8, 3, 2) + # 1024 * deep_mul + 512, 20, 20 => 1024 * deep_mul, 20, 20 + self.conv3_for_downsample2 = C2f(int(base_channels * 16 * deep_mul) + 
base_channels * 8, int(base_channels * 16 * deep_mul), base_depth, shortcut=False) + #------------------------加强特征提取网络------------------------# + + ch = [base_channels * 4, base_channels * 8, int(base_channels * 16 * deep_mul)] + self.shape = None + self.nl = len(ch) + # self.stride = torch.zeros(self.nl) + self.stride = torch.tensor([256 / x.shape[-2] for x in self.backbone.forward(torch.zeros(1, 3, 256, 256))]) # forward + self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x) + self.no = num_classes + self.reg_max * 4 # number of outputs per anchor + self.num_classes = num_classes + + c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], num_classes) # channels + self.cv2 = nn.ModuleList(nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch) + self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, num_classes, 1)) for x in ch) + if not pretrained: + weights_init(self) + self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity() + + + def fuse(self): + print('Fusing layers... 
') + for m in self.modules(): + if type(m) is Conv and hasattr(m, 'bn'): + m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv + delattr(m, 'bn') # remove batchnorm + m.forward = m.forward_fuse # update forward + return self + + def forward(self, x): + # backbone + feat1, feat2, feat3 = self.backbone.forward(x) + + #------------------------加强特征提取网络------------------------# + # 1024 * deep_mul, 20, 20 => 1024 * deep_mul, 40, 40 + P5_upsample = self.upsample(feat3) + # 1024 * deep_mul, 40, 40 cat 512, 40, 40 => 1024 * deep_mul + 512, 40, 40 + P4 = torch.cat([P5_upsample, feat2], 1) + # 1024 * deep_mul + 512, 40, 40 => 512, 40, 40 + P4 = self.conv3_for_upsample1(P4) + + # 512, 40, 40 => 512, 80, 80 + P4_upsample = self.upsample(P4) + # 512, 80, 80 cat 256, 80, 80 => 768, 80, 80 + P3 = torch.cat([P4_upsample, feat1], 1) + # 768, 80, 80 => 256, 80, 80 + P3 = self.conv3_for_upsample2(P3) + + # 256, 80, 80 => 256, 40, 40 + P3_downsample = self.down_sample1(P3) + # 512, 40, 40 cat 256, 40, 40 => 768, 40, 40 + P4 = torch.cat([P3_downsample, P4], 1) + # 768, 40, 40 => 512, 40, 40 + P4 = self.conv3_for_downsample1(P4) + + # 512, 40, 40 => 512, 20, 20 + P4_downsample = self.down_sample2(P4) + # 512, 20, 20 cat 1024 * deep_mul, 20, 20 => 1024 * deep_mul + 512, 20, 20 + P5 = torch.cat([P4_downsample, feat3], 1) + # 1024 * deep_mul + 512, 20, 20 => 1024 * deep_mul, 20, 20 + P5 = self.conv3_for_downsample2(P5) + #------------------------加强特征提取网络------------------------# + # P3 256, 80, 80 + # P4 512, 40, 40 + # P5 1024 * deep_mul, 20, 20 + shape = P3.shape # BCHW + + # P3 256, 80, 80 => num_classes + self.reg_max * 4, 80, 80 + # P4 512, 40, 40 => num_classes + self.reg_max * 4, 40, 40 + # P5 1024 * deep_mul, 20, 20 => num_classes + self.reg_max * 4, 20, 20 + x = [P3, P4, P5] + for i in range(self.nl): + x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1) + + if self.shape != shape: + self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 
0.5)) + self.shape = shape + + # num_classes + self.reg_max * 4 , 8400 => cls num_classes, 8400; + # box self.reg_max * 4, 8400 + box, cls = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2).split((self.reg_max * 4, self.num_classes), 1) + # origin_cls = [xi.split((self.reg_max * 4, self.num_classes), 1)[1] for xi in x] + dbox = self.dfl(box) + return dbox, cls, x, self.anchors.to(dbox.device), self.strides.to(dbox.device) \ No newline at end of file diff --git a/app/core/yolo_detect/nets/yolo_training.py b/app/core/yolo_detect/nets/yolo_training.py new file mode 100644 index 0000000..7d6a238 --- /dev/null +++ b/app/core/yolo_detect/nets/yolo_training.py @@ -0,0 +1,592 @@ +import math +from copy import deepcopy +from functools import partial + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from app.core.yolo_detect.utils.utils_bbox import dist2bbox, make_anchors + + +def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9, roll_out=False): + """select the positive anchor center in gt + + Args: + xy_centers (Tensor): shape(h*w, 4) + gt_bboxes (Tensor): shape(b, n_boxes, 4) + Return: + (Tensor): shape(b, n_boxes, h*w) + """ + n_anchors = xy_centers.shape[0] + bs, n_boxes, _ = gt_bboxes.shape + # 计算每个真实框距离每个anchors锚点的左上右下的距离,然后求min + # 保证真实框在锚点附近,包围锚点 + if roll_out: + bbox_deltas = torch.empty((bs, n_boxes, n_anchors), device=gt_bboxes.device) + for b in range(bs): + lt, rb = gt_bboxes[b].view(-1, 1, 4).chunk(2, 2) # left-top, right-bottom + bbox_deltas[b] = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), + dim=2).view(n_boxes, n_anchors, -1).amin(2).gt_(eps) + return bbox_deltas + else: + # 真实框的坐上右下left-top, right-bottom + lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2) + # 真实框距离每个anchors锚点的左上右下的距离 + bbox_deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2).view(bs, n_boxes, n_anchors, -1) + # return (bbox_deltas.min(3)[0] > eps).to(gt_bboxes.dtype) + return 
class TaskAlignedAssigner(nn.Module):
    """Task-aligned label assigner: matches anchors to ground-truth boxes.

    Every ground-truth box picks its ``topk`` best anchors according to the
    alignment metric ``score ** alpha * iou ** beta``; anchors claimed by
    several boxes are resolved by ``select_highest_overlaps``.

    Args:
        topk: number of candidate anchors kept per ground-truth box.
        num_classes: number of object classes (also used as background index).
        alpha: exponent applied to the classification score.
        beta: exponent applied to the IoU.
        eps: small constant guarding divisions and "is positive" comparisons.
        roll_out_thr: when non-zero and the number of padded ground-truth
            boxes exceeds it, per-image loops are used instead of fully
            batched tensor ops (the trainer in this file passes 64).
    """

    def __init__(self, topk=13, num_classes=80, alpha=1.0, beta=6.0, eps=1e-9, roll_out_thr=0):
        super().__init__()
        self.topk = topk
        self.num_classes = num_classes
        self.bg_idx = num_classes
        self.alpha = alpha
        self.beta = beta
        self.eps = eps
        self.roll_out_thr = roll_out_thr
        # Per-call state, overwritten by forward(). Initialised here so the
        # helper methods can also be used (and tested) on their own.
        self.bs = 0
        self.n_max_boxes = 0
        self.roll_out = False

    @torch.no_grad()
    def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
        """Assign a ground-truth target to every anchor.

        Reference implementation:
        https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py

        Args:
            pd_scores (Tensor): shape(bs, num_total_anchors, num_classes)
            pd_bboxes (Tensor): shape(bs, num_total_anchors, 4)
            anc_points (Tensor): shape(num_total_anchors, 2)
            gt_labels (Tensor): shape(bs, n_max_boxes, 1)
            gt_bboxes (Tensor): shape(bs, n_max_boxes, 4)
            mask_gt (Tensor): shape(bs, n_max_boxes, 1)
        Returns:
            target_labels (Tensor): shape(bs, num_total_anchors)
            target_bboxes (Tensor): shape(bs, num_total_anchors, 4)
            target_scores (Tensor): shape(bs, num_total_anchors, num_classes)
            fg_mask (Tensor): shape(bs, num_total_anchors)
            target_gt_idx (Tensor): shape(bs, num_total_anchors)
        """
        self.bs = pd_scores.size(0)
        self.n_max_boxes = gt_bboxes.size(1)
        # Switch to the per-image ("rolled out") code path only when a
        # threshold is configured and the padded box count exceeds it.
        self.roll_out = self.n_max_boxes > self.roll_out_thr if self.roll_out_thr else False

        if self.n_max_boxes == 0:
            # No ground truth in the whole batch: everything is background.
            # NOTE: fg_mask is returned as float zeros here, whereas the
            # regular path below returns a boolean mask.
            device = gt_bboxes.device
            return (torch.full_like(pd_scores[..., 0], self.bg_idx).to(device), torch.zeros_like(pd_bboxes).to(device),
                    torch.zeros_like(pd_scores).to(device), torch.zeros_like(pd_scores[..., 0]).to(device),
                    torch.zeros_like(pd_scores[..., 0]).to(device))

        # mask_pos     (b, max_num_obj, num_anchors): anchors that are inside
        #              the box, among its top-k and belong to a real box.
        # align_metric (b, max_num_obj, num_anchors): score**alpha * iou**beta.
        # overlaps     (b, max_num_obj, num_anchors): CIoU clamped at zero.
        mask_pos, align_metric, overlaps = self.get_pos_mask(pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt)

        # Resolve anchors claimed by more than one ground-truth box.
        target_gt_idx, fg_mask, mask_pos = select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes)

        # Gather label / box / one-hot score of the assigned box per anchor.
        target_labels, target_bboxes, target_scores = self.get_targets(gt_labels, gt_bboxes, target_gt_idx, fg_mask)

        # Rescale the metric so that, per ground-truth box, the best anchor
        # receives that box's best IoU as its soft classification target.
        align_metric *= mask_pos
        pos_align_metrics = align_metric.amax(axis=-1, keepdim=True)
        pos_overlaps = (overlaps * mask_pos).amax(axis=-1, keepdim=True)
        norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
        target_scores = target_scores * norm_align_metric

        return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx

    def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt):
        """Build the positive-anchor mask.

        An anchor is positive for a box when it (1) lies inside the box,
        (2) is among the box's top-k anchors by alignment metric and
        (3) the box is real rather than padding (``mask_gt``).

        Returns:
            mask_pos, align_metric, overlaps, each (b, max_num_obj, num_anchors).
        """
        align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes)
        mask_in_gts = select_candidates_in_gts(anc_points, gt_bboxes, roll_out=self.roll_out)
        mask_topk = self.select_topk_candidates(align_metric * mask_in_gts,
                                                topk_mask=mask_gt.repeat([1, 1, self.topk]).bool())
        mask_pos = mask_topk * mask_in_gts * mask_gt
        return mask_pos, align_metric, overlaps

    def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes):
        """Compute the alignment metric and the CIoU for every (box, anchor) pair.

        Returns:
            align_metric, overlaps, each (b, max_num_obj, num_anchors).
        """
        if self.roll_out:
            align_metric = torch.empty((self.bs, self.n_max_boxes, pd_scores.shape[1]), device=pd_scores.device)
            overlaps = torch.empty((self.bs, self.n_max_boxes, pd_scores.shape[1]), device=pd_scores.device)
            ind_0 = torch.empty(self.n_max_boxes, dtype=torch.long)
            for b in range(self.bs):
                ind_0[:], ind_2 = b, gt_labels[b].squeeze(-1).long()
                # Predicted score of each box's own class at every anchor.
                bbox_scores = pd_scores[ind_0, :, ind_2]
                overlaps[b] = bbox_iou(gt_bboxes[b].unsqueeze(1), pd_bboxes[b].unsqueeze(0), xywh=False, CIoU=True).squeeze(2).clamp(0)
                align_metric[b] = bbox_scores.pow(self.alpha) * overlaps[b].pow(self.beta)
        else:
            # ind[0]: image index, ind[1]: class index, both (b, max_num_obj).
            ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long)
            ind[0] = torch.arange(end=self.bs).view(-1, 1).repeat(1, self.n_max_boxes)
            ind[1] = gt_labels.long().squeeze(-1)
            # Predicted score of each box's own class at every anchor.
            bbox_scores = pd_scores[ind[0], :, ind[1]]
            overlaps = bbox_iou(gt_bboxes.unsqueeze(2), pd_bboxes.unsqueeze(1), xywh=False, CIoU=True).squeeze(3).clamp(0)
            align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta)
        return align_metric, overlaps

    def select_topk_candidates(self, metrics, largest=True, topk_mask=None):
        """Mark, for every ground-truth box, its ``topk`` best anchors.

        Args:
            metrics: (b, max_num_obj, num_anchors).
            largest: forwarded to ``torch.topk``.
            topk_mask: (b, max_num_obj, topk) boolean mask of real boxes, or
                None to treat a box as real when its best metric exceeds eps.
        Returns:
            (b, max_num_obj, num_anchors) tensor of 0/1 in ``metrics.dtype``.
        """
        num_anchors = metrics.shape[-1]
        topk_metrics, topk_idxs = torch.topk(metrics, self.topk, dim=-1, largest=largest)
        if topk_mask is None:
            # amax returns a tensor; Tensor.max(dim, keepdim) returns a
            # (values, indices) pair that cannot be compared with a float.
            topk_mask = (topk_metrics.amax(-1, keepdim=True) > self.eps).expand_as(topk_idxs)
        # Rows of padded boxes all point at anchor 0 ...
        topk_idxs[~topk_mask] = 0
        if self.roll_out:
            is_in_topk = torch.empty(metrics.shape, dtype=torch.long, device=metrics.device)
            for b in range(len(topk_idxs)):
                is_in_topk[b] = F.one_hot(topk_idxs[b], num_anchors).sum(-2)
        else:
            is_in_topk = F.one_hot(topk_idxs, num_anchors).sum(-2)
        # ... so anchor 0 is counted more than once there and is dropped here.
        is_in_topk = torch.where(is_in_topk > 1, 0, is_in_topk)
        return is_in_topk.to(metrics.dtype)

    def get_targets(self, gt_labels, gt_bboxes, target_gt_idx, fg_mask):
        """Gather the label, box and one-hot score assigned to each anchor.

        Args:
            gt_labels: (b, max_num_obj, 1)
            gt_bboxes: (b, max_num_obj, 4)
            target_gt_idx: (b, num_anchors) index of the assigned box.
            fg_mask: (b, num_anchors), positive where an anchor is foreground.
        Returns:
            target_labels (b, num_anchors), target_bboxes (b, num_anchors, 4),
            target_scores (b, num_anchors, num_classes).
        """
        # (b, 1) image offsets so the indices address the flattened tensors.
        batch_ind = torch.arange(end=self.bs, dtype=torch.int64, device=gt_labels.device)[..., None]
        target_gt_idx = target_gt_idx + batch_ind * self.n_max_boxes
        target_labels = gt_labels.long().flatten()[target_gt_idx]
        target_bboxes = gt_bboxes.view(-1, 4)[target_gt_idx]

        # Clamp in place: the out-of-place clamp() result was discarded, so
        # negative labels reached one_hot(), which rejects them.
        target_labels.clamp_(0)
        target_scores = F.one_hot(target_labels, self.num_classes)
        fg_scores_mask = fg_mask[:, :, None].repeat(1, 1, self.num_classes)
        # Background anchors get an all-zero score vector.
        target_scores = torch.where(fg_scores_mask > 0, target_scores, 0)

        return target_labels, target_bboxes, target_scores
+ if CIoU: # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47 + v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2) + with torch.no_grad(): + alpha = v / (v - iou + (1 + eps)) + return iou - (rho2 / c2 + v * alpha) # CIoU + return iou - rho2 / c2 # DIoU + c_area = cw * ch + eps # convex area + return iou - (c_area - union) / c_area # GIoU https://arxiv.org/pdf/1902.09630.pdf + return iou # IoU + +def bbox2dist(anchor_points, bbox, reg_max): + """Transform bbox(xyxy) to dist(ltrb).""" + x1y1, x2y2 = torch.split(bbox, 2, -1) + return torch.cat((anchor_points - x1y1, x2y2 - anchor_points), -1).clamp(0, reg_max - 0.01) # dist (lt, rb) + +class BboxLoss(nn.Module): + def __init__(self, reg_max=16, use_dfl=False): + super().__init__() + self.reg_max = reg_max + self.use_dfl = use_dfl + + def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask): + # 计算IOU损失 + # weight代表损失中标签应该有的置信度,0最小,1最大 + weight = torch.masked_select(target_scores.sum(-1), fg_mask).unsqueeze(-1) + # 计算预测框和真实框的重合程度 + iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True) + # 然后1-重合程度,乘上应该有的置信度,求和后求平均。 + loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum + + # 计算DFL损失 + if self.use_dfl: + target_ltrb = bbox2dist(anchor_points, target_bboxes, self.reg_max) + loss_dfl = self._df_loss(pred_dist[fg_mask].view(-1, self.reg_max + 1), target_ltrb[fg_mask]) * weight + loss_dfl = loss_dfl.sum() / target_scores_sum + else: + loss_dfl = torch.tensor(0.0).to(pred_dist.device) + + return loss_iou, loss_dfl + + @staticmethod + def _df_loss(pred_dist, target): + # Return sum of left and right DFL losses + # Distribution Focal Loss (DFL) proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391 + tl = target.long() # target left + tr = tl + 1 # target right + wl = tr - target # weight left + wr = 1 - wl # weight right + # 
def xywh2xyxy(x):
    """
    Convert boxes from (center x, center y, width, height) to (x1, y1, x2, y2),
    where (x1, y1) is the top-left corner and (x2, y2) the bottom-right corner.

    The input is never modified. Only the first four entries of the last axis
    are rewritten; any further entries are copied through unchanged.

    Args:
        x (np.ndarray | torch.Tensor | sequence): boxes in (x, y, w, h) format.
            Plain Python sequences are converted with ``np.asarray``.
    Returns:
        y (np.ndarray | torch.Tensor): boxes in (x1, y1, x2, y2) format, with
            the same dtype as the (converted) input; integer inputs therefore
            truncate half-pixel results.
    """
    if isinstance(x, torch.Tensor):
        y = x.clone()
    else:
        # Lists/tuples do not support ``x[..., i]`` indexing; normalise first.
        x = np.asarray(x)
        y = np.copy(x)
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    y[..., 0] = x[..., 0] - half_w  # top left x
    y[..., 1] = x[..., 1] - half_h  # top left y
    y[..., 2] = x[..., 0] + half_w  # bottom right x
    y[..., 3] = x[..., 1] + half_h  # bottom right y
    return y
for j in range(batch_size): + matches = i == j + n = matches.sum() + if n: + out[j, :n] = targets[matches, 1:] + # 缩放到原图大小。 + out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor)) + return out + + def bbox_decode(self, anchor_points, pred_dist): + if self.use_dfl: + # batch, anchors, channels + b, a, c = pred_dist.shape + # DFL的解码 + pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.to(pred_dist.device).type(pred_dist.dtype)) + # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2,3).softmax(3).matmul(self.proj.type(pred_dist.dtype)) + # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2) + # 然后解码获得预测框 + return dist2bbox(pred_dist, anchor_points, xywh=False) + + def __call__(self, preds, batch): + # 获得使用的device + device = preds[1].device + # box, cls, dfl三部分的损失 + loss = torch.zeros(3, device=device) + # 获得特征,并进行划分 + feats = preds[2] if isinstance(preds, tuple) else preds + pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split((self.reg_max * 4, self.nc), 1) + + # bs, num_classes + self.reg_max * 4 , 8400 => cls bs, num_classes, 8400; + # box bs, self.reg_max * 4, 8400 + pred_scores = pred_scores.permute(0, 2, 1).contiguous() + pred_distri = pred_distri.permute(0, 2, 1).contiguous() + + # 获得batch size与dtype + dtype = pred_scores.dtype + batch_size = pred_scores.shape[0] + # 获得输入图片大小 + imgsz = torch.tensor(feats[0].shape[2:], device=device, dtype=dtype) * self.stride[0] + # 获得anchors点和步长对应的tensor + anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5) + + # 把一个batch中的东西弄一个矩阵 + # 0为属于第几个图片 + # 1为种类 + # 2:为框的坐标 + targets = torch.cat((batch[:, 0].view(-1, 1), batch[:, 1].view(-1, 1), batch[:, 2:]), 1) + # 先进行初步的处理,对输入进来的gt进行padding,到最大数量,并把框的坐标进行缩放 + # bs, max_boxes_num, 5 + targets = self.preprocess(targets.to(device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]]) + # bs, max_boxes_num, 5 => bs, max_boxes_num, 
def is_parallel(model):
    """Return True if ``model`` is wrapped in DataParallel or DistributedDataParallel.

    ``isinstance`` is used so that subclasses of the two wrappers are
    recognised as well; an exact ``type(...)`` comparison misses them.
    """
    return isinstance(model, (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel))


def de_parallel(model):
    """Return the wrapped ``model.module`` for a DP/DDP model, else ``model`` itself."""
    return model.module if is_parallel(model) else model
+ for k, v in b.__dict__.items(): + if (len(include) and k not in include) or k.startswith('_') or k in exclude: + continue + else: + setattr(a, k, v) + +class ModelEMA: + """ Updated Exponential Moving Average (EMA) from https://github.com/rwightman/pytorch-image-models + Keeps a moving average of everything in the model state_dict (parameters and buffers) + For EMA details see https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + """ + + def __init__(self, model, decay=0.9999, tau=2000, updates=0): + # Create EMA + self.ema = deepcopy(de_parallel(model)).eval() # FP32 EMA + # if next(model.parameters()).device.type != 'cpu': + # self.ema.half() # FP16 EMA + self.updates = updates # number of EMA updates + self.decay = lambda x: decay * (1 - math.exp(-x / tau)) # decay exponential ramp (to help early epochs) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def update(self, model): + # Update EMA parameters + with torch.no_grad(): + self.updates += 1 + d = self.decay(self.updates) + + msd = de_parallel(model).state_dict() # model state_dict + for k, v in self.ema.state_dict().items(): + if v.dtype.is_floating_point: + v *= d + v += (1 - d) * msd[k].detach() + + def update_attr(self, model, include=(), exclude=('process_group', 'reducer')): + # Update EMA attributes + copy_attr(self.ema, model, include, exclude) + +def weights_init(net, init_type='normal', init_gain = 0.02): + def init_func(m): + classname = m.__class__.__name__ + if hasattr(m, 'weight') and classname.find('Conv') != -1: + if init_type == 'normal': + torch.nn.init.normal_(m.weight.data, 0.0, init_gain) + elif init_type == 'xavier': + torch.nn.init.xavier_normal_(m.weight.data, gain=init_gain) + elif init_type == 'kaiming': + torch.nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in') + elif init_type == 'orthogonal': + torch.nn.init.orthogonal_(m.weight.data, gain=init_gain) + else: + raise NotImplementedError('initialization method [%s] is not 
def get_lr_scheduler(lr_decay_type, lr, min_lr, total_iters, warmup_iters_ratio=0.05, warmup_lr_ratio=0.1,
                     no_aug_iter_ratio=0.05, step_num=10):
    """Build a function mapping an iteration/epoch index to a learning rate.

    Args:
        lr_decay_type: ``"cos"`` selects warm-up + cosine decay; any other
            value selects step decay.
        lr: peak (initial) learning rate.
        min_lr: final learning rate.
        total_iters: total number of iterations the schedule spans.
        warmup_iters_ratio: fraction of ``total_iters`` used for warm-up
            (the resulting count is clamped to the range [1, 3]).
        warmup_lr_ratio: warm-up starts at ``max(warmup_lr_ratio * lr, 1e-6)``.
        no_aug_iter_ratio: fraction of ``total_iters`` held at ``min_lr`` at
            the end (the resulting count is clamped to the range [1, 15]).
        step_num: number of plateaus for step decay.

    Returns:
        A callable ``f(iters) -> lr``.

    Raises:
        ValueError: if step decay is selected with ``step_num < 1``; the
            returned step function also raises it when the derived step size
            is below 1.
    """
    def yolox_warm_cos_lr(lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter, iters):
        if iters <= warmup_total_iters:
            # Quadratic ramp from warmup_lr_start up to lr.
            return (lr - warmup_lr_start) * pow(iters / float(warmup_total_iters), 2) + warmup_lr_start
        if iters >= total_iters - no_aug_iter:
            # Tail of the schedule: hold the minimum learning rate.
            return min_lr
        # Half-cosine from lr down to min_lr between warm-up and the tail.
        return min_lr + 0.5 * (lr - min_lr) * (
            1.0
            + math.cos(
                math.pi
                * (iters - warmup_total_iters)
                / (total_iters - warmup_total_iters - no_aug_iter)
            )
        )

    def step_lr(lr, decay_rate, step_size, iters):
        if step_size < 1:
            raise ValueError("step_size must above 1.")
        n = iters // step_size
        return lr * decay_rate ** n

    if lr_decay_type == "cos":
        warmup_total_iters = min(max(warmup_iters_ratio * total_iters, 1), 3)
        warmup_lr_start = max(warmup_lr_ratio * lr, 1e-6)
        no_aug_iter = min(max(no_aug_iter_ratio * total_iters, 1), 15)
        return partial(yolox_warm_cos_lr, lr, min_lr, total_iters, warmup_total_iters, warmup_lr_start, no_aug_iter)

    if step_num < 1:
        raise ValueError("step_num must be at least 1.")
    # With a single plateau there is nothing to decay between; the general
    # formula would divide by zero (1 / (step_num - 1)).
    decay_rate = (min_lr / lr) ** (1 / (step_num - 1)) if step_num > 1 else 1.0
    step_size = total_iters / step_num
    return partial(step_lr, lr, decay_rate, step_size)
#-----------------------------------------------------------------------#
#   predict.py bundles single-image prediction, camera/video detection,
#   FPS measurement, directory prediction, heat-map visualisation and
#   ONNX export into one script; the behaviour is selected with `mode`.
#-----------------------------------------------------------------------#
import os

# Must be set before the detector initialises CUDA. setdefault keeps a device
# selection made by the caller (e.g. `CUDA_VISIBLE_DEVICES=0 python predict.py`)
# instead of silently overriding it.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "7")
import time

import cv2
import numpy as np
from PIL import Image

from yolo import YOLO


def _image_only(result):
    """Return the image from a detect_image() result.

    The 'dir_predict' branch unpacks detect_image() as (image, class_list)
    while the other branches use the result as an image directly.
    NOTE(review): yolo.detect_image is not visible from this file - confirm
    its return type; this helper accepts both shapes.
    """
    return result[0] if isinstance(result, tuple) else result


if __name__ == "__main__":
    yolo = YOLO()
    #----------------------------------------------------------------------#
    #   mode selects what the script does:
    #   'predict'      interactive single-image prediction
    #   'video'        detection on a camera stream or a video file
    #   'fps'          FPS measurement on fps_image_path
    #   'dir_predict'  run on every image of a folder and save the results
    #   'heatmap'      heat-map visualisation of the prediction
    #   'export_onnx'  export the model to ONNX
    #----------------------------------------------------------------------#
    mode = "dir_predict"
    #----------------------------------------------------------------------#
    #   crop / count are forwarded to detect_image in 'predict' mode only.
    #----------------------------------------------------------------------#
    crop = False
    count = False
    #----------------------------------------------------------------------#
    #   video_path       0 selects the camera, otherwise a video file path
    #   video_save_path  "" disables saving, otherwise the output file path
    #   video_fps        frame rate of the saved video
    #   Only used in 'video' mode.
    #----------------------------------------------------------------------#
    video_path = 0
    video_save_path = ""
    video_fps = 25.0
    #----------------------------------------------------------------------#
    #   test_interval    number of detections averaged when measuring FPS
    #   fps_image_path   image used for the FPS measurement
    #   Only used in 'fps' mode.
    #----------------------------------------------------------------------#
    test_interval = 100
    fps_image_path = "img/street.jpg"
    #----------------------------------------------------------------------#
    #   dir_origin_path  folder with the images to process
    #   dir_save_path    folder receiving the annotated images
    #   Only used in 'dir_predict' mode.
    #----------------------------------------------------------------------#
    dir_origin_path = "img"
    dir_save_path = "img_out"
    #----------------------------------------------------------------------#
    #   heatmap_save_path  output path of the heat map ('heatmap' mode)
    #----------------------------------------------------------------------#
    heatmap_save_path = "model_data/heatmap_vision.png"
    #----------------------------------------------------------------------#
    #   simplify        forwarded to convert_to_onnx
    #   onnx_save_path  output path of the exported ONNX model
    #----------------------------------------------------------------------#
    simplify = False
    onnx_save_path = "model_data/models.onnx"

    if mode == "predict":
        # Interactive loop: ask for a file name, run detection, repeat.
        while True:
            img = input('Input image filename:')
            try:
                image = Image.open(img)
            except (OSError, ValueError):
                # Missing, unreadable or non-image file: ask again.
                print('Open Error! Try again!')
                continue
            else:
                r_image = yolo.detect_image(image, crop = crop, count=count)

    elif mode == "video":
        capture = cv2.VideoCapture(video_path)
        out = None
        try:
            if video_save_path!="":
                fourcc = cv2.VideoWriter_fourcc(*'XVID')
                size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)), int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
                out = cv2.VideoWriter(video_save_path, fourcc, video_fps, size)

            fps = 0.0
            frames_read = 0
            while True:
                t1 = time.time()
                ref, frame = capture.read()
                if not ref:
                    if frames_read == 0:
                        # Nothing could be read at all: camera/video problem.
                        raise ValueError("未能正确读取摄像头(视频),请注意是否正确安装摄像头(是否正确填写视频路径)。")
                    break
                # The first frame is processed too; it used to be read for
                # validation only and then thrown away.
                frames_read += 1
                # OpenCV delivers BGR, the detector works on RGB PIL images.
                frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(np.uint8(frame))
                frame = np.array(_image_only(yolo.detect_image(frame)))
                # Back to BGR for display / writing with OpenCV.
                frame = cv2.cvtColor(frame,cv2.COLOR_RGB2BGR)

                # Running average of the instantaneous frame rate.
                fps = ( fps + (1./(time.time()-t1)) ) / 2
                print("fps= %.2f"%(fps))
                frame = cv2.putText(frame, "fps= %.2f"%(fps), (0, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                cv2.imshow("video",frame)
                c= cv2.waitKey(1) & 0xff
                if out is not None:
                    out.write(frame)

                if c==27:
                    # ESC stops the detection.
                    break

            print("Video Detection Done!")
            if out is not None:
                print("Save processed video to the path :" + video_save_path)
        finally:
            # Release the devices even when reading or detection fails.
            capture.release()
            if out is not None:
                out.release()
            cv2.destroyAllWindows()

    elif mode == "fps":
        img = Image.open(fps_image_path)
        tact_time = yolo.get_FPS(img, test_interval)
        print(str(tact_time) + ' seconds, ' + str(1/tact_time) + 'FPS, @batch_size 1')

    elif mode == "dir_predict":
        from tqdm import tqdm

        image_suffixes = ('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')
        # An annotated image is stored when at least one of these classes was detected.
        wanted_tags = ("wall", "wall_shenshui", "wall_konggu", "wall_konggu_gap", "wall_kailie")

        os.makedirs(dir_save_path, exist_ok=True)
        for img_name in tqdm(os.listdir(dir_origin_path)):
            if not img_name.lower().endswith(image_suffixes):
                continue
            image_path = os.path.join(dir_origin_path, img_name)
            image = Image.open(image_path)
            r_image,predicted_class_list = yolo.detect_image(image)

            print("+++++++++++++++++++++")
            print(predicted_class_list)
            # Save once, even when several of the wanted classes are present.
            if any(tag in predicted_class_list for tag in wanted_tags):
                r_image.save(os.path.join(dir_save_path, img_name.replace(".jpg", ".png")), quality=95, subsampling=0)

    elif mode == "heatmap":
        while True:
            img = input('Input image filename:')
            try:
                image = Image.open(img)
            except (OSError, ValueError):
                print('Open Error! Try again!')
                continue
            else:
                yolo.detect_heatmap(image, heatmap_save_path)

    elif mode == "export_onnx":
        yolo.convert_to_onnx(simplify, onnx_save_path)

    else:
        raise AssertionError("Please specify the correct mode: 'predict', 'video', 'fps', 'heatmap', 'export_onnx', 'dir_predict'.")
有些论文只考虑乘法的运算次数,忽略加法。此时不乘2 + # 本代码选择乘2,参考YOLOX。 + #--------------------------------------------------------# + flops = flops * 2 + flops, params = clever_format([flops, params], "%.3f") + print('Total GFLOPS: %s' % (flops)) + print('Total params: %s' % (params)) diff --git a/app/core/yolo_detect/train.py b/app/core/yolo_detect/train.py new file mode 100644 index 0000000..0a18724 --- /dev/null +++ b/app/core/yolo_detect/train.py @@ -0,0 +1,565 @@ +#-------------------------------------# +# 对数据集进行训练 +#-------------------------------------# +import datetime +import os +from functools import partial + +import numpy as np +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader + +from nets.yolo import YoloBody +from nets.yolo_training import (Loss, ModelEMA, get_lr_scheduler, + set_optimizer_lr, weights_init) +from utils.callbacks import EvalCallback, LossHistory +from utils.dataloader import YoloDataset, yolo_dataset_collate +from utils.utils import (download_weights, get_classes, seed_everything, + show_config, worker_init_fn) +from utils.utils_fit import fit_one_epoch + +''' +训练自己的目标检测模型一定需要注意以下几点: +1、训练前仔细检查自己的格式是否满足要求,该库要求数据集格式为VOC格式,需要准备好的内容有输入图片和标签 + 输入图片为.jpg图片,无需固定大小,传入训练前会自动进行resize。 + 灰度图会自动转成RGB图片进行训练,无需自己修改。 + 输入图片如果后缀非jpg,需要自己批量转成jpg后再开始训练。 + + 标签为.xml格式,文件中会有需要检测的目标信息,标签文件和输入图片文件相对应。 + +2、损失值的大小用于判断是否收敛,比较重要的是有收敛的趋势,即验证集损失不断下降,如果验证集损失基本上不改变的话,模型基本上就收敛了。 + 损失值的具体大小并没有什么意义,大和小只在于损失的计算方式,并不是接近于0才好。如果想要让损失好看点,可以直接到对应的损失函数里面除上10000。 + 训练过程中的损失值会保存在logs文件夹下的loss_%Y_%m_%d_%H_%M_%S文件夹中 + +3、训练好的权值文件保存在logs文件夹中,每个训练世代(Epoch)包含若干训练步长(Step),每个训练步长(Step)进行一次梯度下降。 + 如果只是训练了几个Step是不会保存的,Epoch和Step的概念要捋清楚一下。 +''' +if __name__ == "__main__": + #---------------------------------# + # Cuda 是否使用Cuda + # 没有GPU可以设置成False + #---------------------------------# + Cuda = True + #----------------------------------------------# + # Seed 用于固定随机种子 + # 
使得每次独立训练都可以获得一样的结果 + #----------------------------------------------# + seed = 11 + #---------------------------------------------------------------------# + # distributed 用于指定是否使用单机多卡分布式运行 + # 终端指令仅支持Ubuntu。CUDA_VISIBLE_DEVICES用于在Ubuntu下指定显卡。 + # Windows系统下默认使用DP模式调用所有显卡,不支持DDP。 + # DP模式: + # 设置 distributed = False + # 在终端中输入 CUDA_VISIBLE_DEVICES=0,1 python train.py + # DDP模式: + # 设置 distributed = True + # 在终端中输入 CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 train.py + #---------------------------------------------------------------------# + distributed = False + #---------------------------------------------------------------------# + # sync_bn 是否使用sync_bn,DDP模式多卡可用 + #---------------------------------------------------------------------# + sync_bn = False + #---------------------------------------------------------------------# + # fp16 是否使用混合精度训练 + # 可减少约一半的显存、需要pytorch1.7.1以上 + #---------------------------------------------------------------------# + fp16 = True + #---------------------------------------------------------------------# + # classes_path 指向model_data下的txt,与自己训练的数据集相关 + # 训练前一定要修改classes_path,使其对应自己的数据集 + #---------------------------------------------------------------------# + classes_path = 'model_data/voc_classes.txt' + #----------------------------------------------------------------------------------------------------------------------------# + # 权值文件的下载请看README,可以通过网盘下载。模型的 预训练权重 对不同数据集是通用的,因为特征是通用的。 + # 模型的 预训练权重 比较重要的部分是 主干特征提取网络的权值部分,用于进行特征提取。 + # 预训练权重对于99%的情况都必须要用,不用的话主干部分的权值太过随机,特征提取效果不明显,网络训练的结果也不会好 + # + # 如果训练过程中存在中断训练的操作,可以将model_path设置成logs文件夹下的权值文件,将已经训练了一部分的权值再次载入。 + # 同时修改下方的 冻结阶段 或者 解冻阶段 的参数,来保证模型epoch的连续性。 + # + # 当model_path = ''的时候不加载整个模型的权值。 + # + # 此处使用的是整个模型的权重,因此是在train.py进行加载的。 + # 如果想要让模型从0开始训练,则设置model_path = '',下面的Freeze_Train = Fasle,此时从0开始训练,且没有冻结主干的过程。 + # + # 一般来讲,网络从0开始的训练效果会很差,因为权值太过随机,特征提取效果不明显,因此非常、非常、非常不建议大家从0开始训练! 
+ # 从0开始训练有两个方案: + # 1、得益于Mosaic数据增强方法强大的数据增强能力,将UnFreeze_Epoch设置的较大(300及以上)、batch较大(16及以上)、数据较多(万以上)的情况下, + # 可以设置mosaic=True,直接随机初始化参数开始训练,但得到的效果仍然不如有预训练的情况。(像COCO这样的大数据集可以这样做) + # 2、了解imagenet数据集,首先训练分类模型,获得网络的主干部分权值,分类模型的 主干部分 和该模型通用,基于此进行训练。 + #----------------------------------------------------------------------------------------------------------------------------# + model_path = 'model_data/yolov8_l_backbone_weights.pth' + #------------------------------------------------------# + # input_shape 输入的shape大小,一定要是32的倍数 + #------------------------------------------------------# + input_shape = [640, 640] + #------------------------------------------------------# + # phi 所使用到的yolov8的版本 + # n : 对应yolov8_n + # s : 对应yolov8_s + # m : 对应yolov8_m + # l : 对应yolov8_l + # x : 对应yolov8_x + #------------------------------------------------------# + phi = 'l' + #----------------------------------------------------------------------------------------------------------------------------# + # pretrained 是否使用主干网络的预训练权重,此处使用的是主干的权重,因此是在模型构建的时候进行加载的。 + # 如果设置了model_path,则主干的权值无需加载,pretrained的值无意义。 + # 如果不设置model_path,pretrained = True,此时仅加载主干开始训练。 + # 如果不设置model_path,pretrained = False,Freeze_Train = Fasle,此时从0开始训练,且没有冻结主干的过程。 + #----------------------------------------------------------------------------------------------------------------------------# + pretrained = True + #------------------------------------------------------------------# + # mosaic 马赛克数据增强。 + # mosaic_prob 每个step有多少概率使用mosaic数据增强,默认50%。 + # + # mixup 是否使用mixup数据增强,仅在mosaic=True时有效。 + # 只会对mosaic增强后的图片进行mixup的处理。 + # mixup_prob 有多少概率在mosaic后使用mixup数据增强,默认50%。 + # 总的mixup概率为mosaic_prob * mixup_prob。 + # + # special_aug_ratio 参考YoloX,由于Mosaic生成的训练图片,远远脱离自然图片的真实分布。 + # 当mosaic=True时,本代码会在special_aug_ratio范围内开启mosaic。 + # 默认为前70%个epoch,100个世代会开启70个世代。 + #------------------------------------------------------------------# + mosaic = True + mosaic_prob = 0.5 + mixup = False + mixup_prob = 0.5 + special_aug_ratio = 
0.7 + #------------------------------------------------------------------# + # label_smoothing 标签平滑。一般0.01以下。如0.01、0.005。 + #------------------------------------------------------------------# + label_smoothing = 0.005 + + #----------------------------------------------------------------------------------------------------------------------------# + # 训练分为两个阶段,分别是冻结阶段和解冻阶段。设置冻结阶段是为了满足机器性能不足的同学的训练需求。 + # 冻结训练需要的显存较小,显卡非常差的情况下,可设置Freeze_Epoch等于UnFreeze_Epoch,Freeze_Train = True,此时仅仅进行冻结训练。 + # + # 在此提供若干参数设置建议,各位训练者根据自己的需求进行灵活调整: + # (一)从整个模型的预训练权重开始训练: + # Adam: + # Init_Epoch = 0,Freeze_Epoch = 50,UnFreeze_Epoch = 100,Freeze_Train = True,optimizer_type = 'adam',Init_lr = 1e-3,weight_decay = 0。(冻结) + # Init_Epoch = 0,UnFreeze_Epoch = 100,Freeze_Train = False,optimizer_type = 'adam',Init_lr = 1e-3,weight_decay = 0。(不冻结) + # SGD: + # Init_Epoch = 0,Freeze_Epoch = 50,UnFreeze_Epoch = 300,Freeze_Train = True,optimizer_type = 'sgd',Init_lr = 1e-2,weight_decay = 5e-4。(冻结) + # Init_Epoch = 0,UnFreeze_Epoch = 300,Freeze_Train = False,optimizer_type = 'sgd',Init_lr = 1e-2,weight_decay = 5e-4。(不冻结) + # 其中:UnFreeze_Epoch可以在100-300之间调整。 + # (二)从0开始训练: + # Init_Epoch = 0,UnFreeze_Epoch >= 300,Unfreeze_batch_size >= 16,Freeze_Train = False(不冻结训练) + # 其中:UnFreeze_Epoch尽量不小于300。optimizer_type = 'sgd',Init_lr = 1e-2,mosaic = True。 + # (三)batch_size的设置: + # 在显卡能够接受的范围内,以大为好。显存不足与数据集大小无关,提示显存不足(OOM或者CUDA out of memory)请调小batch_size。 + # 受到BatchNorm层影响,batch_size最小为2,不能为1。 + # 正常情况下Freeze_batch_size建议为Unfreeze_batch_size的1-2倍。不建议设置的差距过大,因为关系到学习率的自动调整。 + #----------------------------------------------------------------------------------------------------------------------------# + #------------------------------------------------------------------# + # 冻结阶段训练参数 + # 此时模型的主干被冻结了,特征提取网络不发生改变 + # 占用的显存较小,仅对网络进行微调 + # Init_Epoch 模型当前开始的训练世代,其值可以大于Freeze_Epoch,如设置: + # Init_Epoch = 60、Freeze_Epoch = 50、UnFreeze_Epoch = 100 + # 会跳过冻结阶段,直接从60代开始,并调整对应的学习率。 + # (断点续练时使用) + # Freeze_Epoch 
模型冻结训练的Freeze_Epoch + # (当Freeze_Train=False时失效) + # Freeze_batch_size 模型冻结训练的batch_size + # (当Freeze_Train=False时失效) + #------------------------------------------------------------------# + Init_Epoch = 0 + Freeze_Epoch = 20 + Freeze_batch_size = 64 + #------------------------------------------------------------------# + # 解冻阶段训练参数 + # 此时模型的主干不被冻结了,特征提取网络会发生改变 + # 占用的显存较大,网络所有的参数都会发生改变 + # UnFreeze_Epoch 模型总共训练的epoch + # SGD需要更长的时间收敛,因此设置较大的UnFreeze_Epoch + # Adam可以使用相对较小的UnFreeze_Epoch + # Unfreeze_batch_size 模型在解冻后的batch_size + #------------------------------------------------------------------# + UnFreeze_Epoch = 100 + Unfreeze_batch_size = 32 + #------------------------------------------------------------------# + # Freeze_Train 是否进行冻结训练 + # 默认先冻结主干训练后解冻训练。 + #------------------------------------------------------------------# + Freeze_Train = True + + #------------------------------------------------------------------# + # 其它训练参数:学习率、优化器、学习率下降有关 + #------------------------------------------------------------------# + #------------------------------------------------------------------# + # Init_lr 模型的最大学习率 + # Min_lr 模型的最小学习率,默认为最大学习率的0.01 + #------------------------------------------------------------------# + Init_lr = 1e-2 + Min_lr = Init_lr * 0.01 + #------------------------------------------------------------------# + # optimizer_type 使用到的优化器种类,可选的有adam、sgd + # 当使用Adam优化器时建议设置 Init_lr=1e-3 + # 当使用SGD优化器时建议设置 Init_lr=1e-2 + # momentum 优化器内部使用到的momentum参数 + # weight_decay 权值衰减,可防止过拟合 + # adam会导致weight_decay错误,使用adam时建议设置为0。 + #------------------------------------------------------------------# + optimizer_type = "sgd" + momentum = 0.937 + weight_decay = 5e-4 + #------------------------------------------------------------------# + # lr_decay_type 使用到的学习率下降方式,可选的有step、cos + #------------------------------------------------------------------# + lr_decay_type = "cos" + #------------------------------------------------------------------# + # save_period 
多少个epoch保存一次权值 + #------------------------------------------------------------------# + save_period = 10 + #------------------------------------------------------------------# + # save_dir 权值与日志文件保存的文件夹 + #------------------------------------------------------------------# + save_dir = 'logs' + #------------------------------------------------------------------# + # eval_flag 是否在训练时进行评估,评估对象为验证集 + # 安装pycocotools库后,评估体验更佳。 + # eval_period 代表多少个epoch评估一次,不建议频繁的评估 + # 评估需要消耗较多的时间,频繁评估会导致训练非常慢 + # 此处获得的mAP会与get_map.py获得的会有所不同,原因有二: + # (一)此处获得的mAP为验证集的mAP。 + # (二)此处设置评估参数较为保守,目的是加快评估速度。 + #------------------------------------------------------------------# + eval_flag = True + eval_period = 5 + #------------------------------------------------------------------# + # num_workers 用于设置是否使用多线程读取数据 + # 开启后会加快数据读取速度,但是会占用更多内存 + # 内存较小的电脑可以设置为2或者0 + #------------------------------------------------------------------# + num_workers = 8 + + #------------------------------------------------------# + # train_annotation_path 训练图片路径和标签 + # val_annotation_path 验证图片路径和标签 + #------------------------------------------------------# + train_annotation_path = '2007_train.txt' + val_annotation_path = '2007_val.txt' + + seed_everything(seed) + #------------------------------------------------------# + # 设置用到的显卡 + #------------------------------------------------------# + ngpus_per_node = torch.cuda.device_count() + if distributed: + dist.init_process_group(backend="nccl") + local_rank = int(os.environ["LOCAL_RANK"]) + rank = int(os.environ["RANK"]) + device = torch.device("cuda", local_rank) + if local_rank == 0: + print(f"[{os.getpid()}] (rank = {rank}, local_rank = {local_rank}) training...") + print("Gpu Device Count : ", ngpus_per_node) + else: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + local_rank = 0 + rank = 0 + + #------------------------------------------------------# + # 获取classes和anchor + #------------------------------------------------------# + 
class_names, num_classes = get_classes(classes_path) + + #----------------------------------------------------# + # 下载预训练权重 + #----------------------------------------------------# + if pretrained: + if distributed: + if local_rank == 0: + download_weights(phi) + dist.barrier() + else: + download_weights(phi) + + #------------------------------------------------------# + # 创建yolo模型 + #------------------------------------------------------# + model = YoloBody(input_shape, num_classes, phi, pretrained=pretrained) + + if model_path != '': + #------------------------------------------------------# + # 权值文件请看README,百度网盘下载 + #------------------------------------------------------# + if local_rank == 0: + print('Load weights {}.'.format(model_path)) + + #------------------------------------------------------# + # 根据预训练权重的Key和模型的Key进行加载 + #------------------------------------------------------# + model_dict = model.state_dict() + pretrained_dict = torch.load(model_path, map_location = device) + load_key, no_load_key, temp_dict = [], [], {} + for k, v in pretrained_dict.items(): + if k in model_dict.keys() and np.shape(model_dict[k]) == np.shape(v): + temp_dict[k] = v + load_key.append(k) + else: + no_load_key.append(k) + model_dict.update(temp_dict) + model.load_state_dict(model_dict) + #------------------------------------------------------# + # 显示没有匹配上的Key + #------------------------------------------------------# + if local_rank == 0: + print("\nSuccessful Load Key:", str(load_key)[:500], "……\nSuccessful Load Key Num:", len(load_key)) + print("\nFail To Load Key:", str(no_load_key)[:500], "……\nFail To Load Key num:", len(no_load_key)) + print("\n\033[1;33;44m温馨提示,head部分没有载入是正常现象,Backbone部分没有载入是错误的。\033[0m") + + #----------------------# + # 获得损失函数 + #----------------------# + yolo_loss = Loss(model) + #----------------------# + # 记录Loss + #----------------------# + if local_rank == 0: + time_str = datetime.datetime.strftime(datetime.datetime.now(),'%Y_%m_%d_%H_%M_%S') + 
log_dir = os.path.join(save_dir, "loss_" + str(time_str)) + loss_history = LossHistory(log_dir, model, input_shape=input_shape) + else: + loss_history = None + + #------------------------------------------------------------------# + # torch 1.2不支持amp,建议使用torch 1.7.1及以上正确使用fp16 + # 因此torch1.2这里显示"could not be resolve" + #------------------------------------------------------------------# + if fp16: + from torch.cuda.amp import GradScaler as GradScaler + scaler = GradScaler() + else: + scaler = None + + model_train = model.train() + #----------------------------# + # 多卡同步Bn + #----------------------------# + if sync_bn and ngpus_per_node > 1 and distributed: + model_train = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model_train) + elif sync_bn: + print("Sync_bn is not support in one gpu or not distributed.") + + if Cuda: + if distributed: + #----------------------------# + # 多卡平行运行 + #----------------------------# + model_train = model_train.cuda(local_rank) + model_train = torch.nn.parallel.DistributedDataParallel(model_train, device_ids=[local_rank], find_unused_parameters=True) + else: + model_train = torch.nn.DataParallel(model) + cudnn.benchmark = True + model_train = model_train.cuda() + + #----------------------------# + # 权值平滑 + #----------------------------# + ema = ModelEMA(model_train) + + #---------------------------# + # 读取数据集对应的txt + #---------------------------# + with open(train_annotation_path, encoding='utf-8') as f: + train_lines = f.readlines() + with open(val_annotation_path, encoding='utf-8') as f: + val_lines = f.readlines() + num_train = len(train_lines) + num_val = len(val_lines) + + if local_rank == 0: + show_config( + classes_path = classes_path, model_path = model_path, input_shape = input_shape, \ + Init_Epoch = Init_Epoch, Freeze_Epoch = Freeze_Epoch, UnFreeze_Epoch = UnFreeze_Epoch, Freeze_batch_size = Freeze_batch_size, Unfreeze_batch_size = Unfreeze_batch_size, Freeze_Train = Freeze_Train, \ + Init_lr = Init_lr, Min_lr = Min_lr, 
optimizer_type = optimizer_type, momentum = momentum, lr_decay_type = lr_decay_type, \ + save_period = save_period, save_dir = save_dir, num_workers = num_workers, num_train = num_train, num_val = num_val + ) + #---------------------------------------------------------# + # 总训练世代指的是遍历全部数据的总次数 + # 总训练步长指的是梯度下降的总次数 + # 每个训练世代包含若干训练步长,每个训练步长进行一次梯度下降。 + # 此处仅建议最低训练世代,上不封顶,计算时只考虑了解冻部分 + #----------------------------------------------------------# + wanted_step = 5e4 if optimizer_type == "sgd" else 1.5e4 + total_step = num_train // Unfreeze_batch_size * UnFreeze_Epoch + if total_step <= wanted_step: + if num_train // Unfreeze_batch_size == 0: + raise ValueError('数据集过小,无法进行训练,请扩充数据集。') + wanted_epoch = wanted_step // (num_train // Unfreeze_batch_size) + 1 + print("\n\033[1;33;44m[Warning] 使用%s优化器时,建议将训练总步长设置到%d以上。\033[0m"%(optimizer_type, wanted_step)) + print("\033[1;33;44m[Warning] 本次运行的总训练数据量为%d,Unfreeze_batch_size为%d,共训练%d个Epoch,计算出总训练步长为%d。\033[0m"%(num_train, Unfreeze_batch_size, UnFreeze_Epoch, total_step)) + print("\033[1;33;44m[Warning] 由于总训练步长为%d,小于建议总步长%d,建议设置总世代为%d。\033[0m"%(total_step, wanted_step, wanted_epoch)) + + #------------------------------------------------------# + # 主干特征提取网络特征通用,冻结训练可以加快训练速度 + # 也可以在训练初期防止权值被破坏。 + # Init_Epoch为起始世代 + # Freeze_Epoch为冻结训练的世代 + # UnFreeze_Epoch总训练世代 + # 提示OOM或者显存不足请调小Batch_size + #------------------------------------------------------# + if True: + UnFreeze_flag = False + #------------------------------------# + # 冻结一定部分训练 + #------------------------------------# + if Freeze_Train: + for param in model.backbone.parameters(): + param.requires_grad = False + + #-------------------------------------------------------------------# + # 如果不冻结训练的话,直接设置batch_size为Unfreeze_batch_size + #-------------------------------------------------------------------# + batch_size = Freeze_batch_size if Freeze_Train else Unfreeze_batch_size + + #-------------------------------------------------------------------# + # 判断当前batch_size,自适应调整学习率 
+ #-------------------------------------------------------------------# + nbs = 64 + lr_limit_max = 1e-3 if optimizer_type == 'adam' else 5e-2 + lr_limit_min = 3e-4 if optimizer_type == 'adam' else 5e-4 + Init_lr_fit = min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max) + Min_lr_fit = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2) + + #---------------------------------------# + # 根据optimizer_type选择优化器 + #---------------------------------------# + pg0, pg1, pg2 = [], [], [] + for k, v in model.named_modules(): + if hasattr(v, "bias") and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) + if isinstance(v, nn.BatchNorm2d) or "bn" in k: + pg0.append(v.weight) + elif hasattr(v, "weight") and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) + optimizer = { + 'adam' : optim.Adam(pg0, Init_lr_fit, betas = (momentum, 0.999)), + 'sgd' : optim.SGD(pg0, Init_lr_fit, momentum = momentum, nesterov=True) + }[optimizer_type] + optimizer.add_param_group({"params": pg1, "weight_decay": weight_decay}) + optimizer.add_param_group({"params": pg2}) + + #---------------------------------------# + # 获得学习率下降的公式 + #---------------------------------------# + lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch) + + #---------------------------------------# + # 判断每一个世代的长度 + #---------------------------------------# + epoch_step = num_train // batch_size + epoch_step_val = num_val // batch_size + + if epoch_step == 0 or epoch_step_val == 0: + raise ValueError("数据集过小,无法继续进行训练,请扩充数据集。") + + if ema: + ema.updates = epoch_step * Init_Epoch + + #---------------------------------------# + # 构建数据集加载器。 + #---------------------------------------# + train_dataset = YoloDataset(train_lines, input_shape, num_classes, epoch_length=UnFreeze_Epoch, \ + mosaic=mosaic, mixup=mixup, mosaic_prob=mosaic_prob, mixup_prob=mixup_prob, train=True, special_aug_ratio=special_aug_ratio) + val_dataset = 
YoloDataset(val_lines, input_shape, num_classes, epoch_length=UnFreeze_Epoch, \ + mosaic=False, mixup=False, mosaic_prob=0, mixup_prob=0, train=False, special_aug_ratio=0) + + if distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=True,) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False,) + batch_size = batch_size // ngpus_per_node + shuffle = False + else: + train_sampler = None + val_sampler = None + shuffle = True + + gen = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, + drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler, + worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed)) + gen_val = DataLoader(val_dataset , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, + drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler, + worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed)) + + #----------------------# + # 记录eval的map曲线 + #----------------------# + if local_rank == 0: + eval_callback = EvalCallback(model, input_shape, class_names, num_classes, val_lines, log_dir, Cuda, \ + eval_flag=eval_flag, period=eval_period) + else: + eval_callback = None + + #---------------------------------------# + # 开始模型训练 + #---------------------------------------# + for epoch in range(Init_Epoch, UnFreeze_Epoch): + #---------------------------------------# + # 如果模型有冻结学习部分 + # 则解冻,并设置参数 + #---------------------------------------# + if epoch >= Freeze_Epoch and not UnFreeze_flag and Freeze_Train: + batch_size = Unfreeze_batch_size + + #-------------------------------------------------------------------# + # 判断当前batch_size,自适应调整学习率 + #-------------------------------------------------------------------# + nbs = 64 + lr_limit_max = 1e-3 if optimizer_type == 'adam' else 5e-2 + lr_limit_min = 3e-4 if optimizer_type == 'adam' else 5e-4 + Init_lr_fit 
= min(max(batch_size / nbs * Init_lr, lr_limit_min), lr_limit_max) + Min_lr_fit = min(max(batch_size / nbs * Min_lr, lr_limit_min * 1e-2), lr_limit_max * 1e-2) + #---------------------------------------# + # 获得学习率下降的公式 + #---------------------------------------# + lr_scheduler_func = get_lr_scheduler(lr_decay_type, Init_lr_fit, Min_lr_fit, UnFreeze_Epoch) + + for param in model.backbone.parameters(): + param.requires_grad = True + + epoch_step = num_train // batch_size + epoch_step_val = num_val // batch_size + + if epoch_step == 0 or epoch_step_val == 0: + raise ValueError("数据集过小,无法继续进行训练,请扩充数据集。") + + if ema: + ema.updates = epoch_step * epoch + + if distributed: + batch_size = batch_size // ngpus_per_node + + gen = DataLoader(train_dataset, shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, + drop_last=True, collate_fn=yolo_dataset_collate, sampler=train_sampler, + worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed)) + gen_val = DataLoader(val_dataset , shuffle = shuffle, batch_size = batch_size, num_workers = num_workers, pin_memory=True, + drop_last=True, collate_fn=yolo_dataset_collate, sampler=val_sampler, + worker_init_fn=partial(worker_init_fn, rank=rank, seed=seed)) + + UnFreeze_flag = True + + gen.dataset.epoch_now = epoch + gen_val.dataset.epoch_now = epoch + + if distributed: + train_sampler.set_epoch(epoch) + + set_optimizer_lr(optimizer, lr_scheduler_func, epoch) + + fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, UnFreeze_Epoch, Cuda, fp16, scaler, save_period, save_dir, local_rank) + + if distributed: + dist.barrier() + + if local_rank == 0: + loss_history.writer.close() diff --git a/app/core/yolo_detect/utils/__init__.py b/app/core/yolo_detect/utils/__init__.py new file mode 100644 index 0000000..4287ca8 --- /dev/null +++ b/app/core/yolo_detect/utils/__init__.py @@ -0,0 +1 @@ +# \ No newline at end of file diff 
"""Training callbacks: loss logging (LossHistory) and periodic mAP evaluation (EvalCallback)."""
import datetime
import os

import torch
import matplotlib
# Select the non-interactive backend before pyplot is imported so plots can
# be written to disk on machines without a display.
matplotlib.use('Agg')
import scipy.signal
from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter

import shutil
import numpy as np

from PIL import Image
from tqdm import tqdm
from .utils import cvtColor, preprocess_input, resize_image
from .utils_bbox import DecodeBox
from .utils_map import get_coco_map, get_map


class LossHistory():
    """Record per-epoch train/val loss to text files, TensorBoard and a PNG plot."""

    def __init__(self, log_dir, model, input_shape):
        """Create the log directory and open a TensorBoard writer on it.

        Args:
            log_dir: Directory that receives the text logs, the event file
                and the loss plot.
            model: Unused; kept so existing callers keep working.
            input_shape: Unused; kept so existing callers keep working.
        """
        self.log_dir = log_dir
        self.losses = []
        self.val_loss = []

        # exist_ok: re-using an existing log directory must not abort training.
        os.makedirs(self.log_dir, exist_ok=True)
        self.writer = SummaryWriter(self.log_dir)
        # NOTE: exporting the model graph to TensorBoard (writer.add_graph
        # with a dummy input) is intentionally disabled.

    def append_loss(self, epoch, loss, val_loss):
        """Store one epoch's losses, append them to the logs and redraw the plot.

        Args:
            epoch: Epoch index used as the TensorBoard step.
            loss: Training loss of the epoch.
            val_loss: Validation loss of the epoch.
        """
        # The directory may have been removed since construction.
        os.makedirs(self.log_dir, exist_ok=True)

        self.losses.append(loss)
        self.val_loss.append(val_loss)

        with open(os.path.join(self.log_dir, "epoch_loss.txt"), 'a') as f:
            f.write(str(loss))
            f.write("\n")
        with open(os.path.join(self.log_dir, "epoch_val_loss.txt"), 'a') as f:
            f.write(str(val_loss))
            f.write("\n")

        self.writer.add_scalar('loss', loss, epoch)
        self.writer.add_scalar('val_loss', val_loss, epoch)
        self.loss_plot()

    def loss_plot(self):
        """Render all recorded losses to ``epoch_loss.png`` inside ``log_dir``."""
        iters = range(len(self.losses))

        plt.figure()
        plt.plot(iters, self.losses, 'red', linewidth=2, label='train loss')
        plt.plot(iters, self.val_loss, 'coral', linewidth=2, label='val loss')
        # NOTE: Savitzky-Golay smoothed curves (scipy.signal.savgol_filter)
        # are intentionally disabled.

        plt.grid(True)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend(loc="upper right")

        plt.savefig(os.path.join(self.log_dir, "epoch_loss.png"))

        # Release the figure so figures do not pile up over a long run.
        plt.cla()
        plt.close("all")


class EvalCallback():
    """Evaluate mAP on the validation annotations every ``period`` epochs."""

    def __init__(self, net, input_shape, class_names, num_classes, val_lines, log_dir, cuda,
                 map_out_path=".temp_map_out", max_boxes=100, confidence=0.05, nms_iou=0.5,
                 letterbox_image=True, MINOVERLAP=0.5, eval_flag=True, period=1):
        """Store the evaluation settings and seed the mAP history with 0.

        Args:
            net: Model called on the preprocessed image tensor.
            input_shape: ``(height, width)`` of the network input.
            class_names: Sequence mapping class index to class name.
            num_classes: Number of classes, forwarded to ``DecodeBox``.
            val_lines: Annotation lines, ``"path x1,y1,x2,y2,cls ..."``.
            log_dir: Existing directory for ``epoch_map.txt`` / ``epoch_map.png``.
            cuda: Move input tensors to the GPU when true.
            map_out_path: Scratch directory, deleted after each evaluation.
            max_boxes: Keep at most this many highest-confidence boxes per image.
            confidence: Confidence threshold passed to NMS.
            nms_iou: IoU threshold passed to NMS.
            letterbox_image: Forwarded to ``resize_image`` and NMS.
            MINOVERLAP: IoU threshold for the fallback ``get_map`` evaluation.
            eval_flag: Disable evaluation entirely when false.
            period: Evaluate on epochs where ``epoch % period == 0``.
        """
        super().__init__()

        self.net = net
        self.input_shape = input_shape
        self.class_names = class_names
        self.num_classes = num_classes
        self.val_lines = val_lines
        self.log_dir = log_dir
        self.cuda = cuda
        self.map_out_path = map_out_path
        self.max_boxes = max_boxes
        self.confidence = confidence
        self.nms_iou = nms_iou
        self.letterbox_image = letterbox_image
        self.MINOVERLAP = MINOVERLAP
        self.eval_flag = eval_flag
        self.period = period

        self.bbox_util = DecodeBox(self.num_classes, (self.input_shape[0], self.input_shape[1]))

        # The curve starts at (epoch 0, mAP 0).
        self.maps = [0]
        self.epoches = [0]
        if self.eval_flag:
            with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
                f.write(str(0))
                f.write("\n")

    def get_map_txt(self, image_id, image, class_names, map_out_path):
        """Run detection on one image and write its detection-results file.

        The file is always created, and is left empty when nothing is
        detected. Each line is ``class score left top right bottom``.

        Args:
            image_id: Base name (no extension) of the output file.
            image: PIL image to run inference on.
            class_names: Only detections whose class is in here are written.
            map_out_path: Directory containing ``detection-results/``.
        """
        result_path = os.path.join(map_out_path, "detection-results/" + image_id + ".txt")
        # The context manager closes the file on every exit path, including
        # the early return below and any exception raised during inference.
        with open(result_path, "w", encoding='utf-8') as f:
            image_shape = np.array(np.shape(image)[0:2])
            # Convert to RGB so grayscale inputs do not fail during prediction.
            image = cvtColor(image)
            # Resize to the network input size; letterbox_image selects the
            # resize mode inside resize_image.
            image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
            # HWC -> CHW, then add the batch dimension.
            image_data = np.expand_dims(
                np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)

            with torch.no_grad():
                images = torch.from_numpy(image_data)
                if self.cuda:
                    images = images.cuda()
                # Forward pass, box decoding, then non-maximum suppression.
                outputs = self.net(images)
                outputs = self.bbox_util.decode_box(outputs)
                results = self.bbox_util.non_max_suppression(
                    outputs, self.num_classes, self.input_shape,
                    image_shape, self.letterbox_image,
                    conf_thres=self.confidence, nms_thres=self.nms_iou)

                if results[0] is None:
                    # No detections: leave the (empty) result file in place.
                    return

                top_label = np.array(results[0][:, 5], dtype='int32')
                top_conf = results[0][:, 4]
                top_boxes = results[0][:, :4]

            # Keep only the max_boxes highest-confidence detections.
            top_100 = np.argsort(top_conf)[::-1][:self.max_boxes]
            top_boxes = top_boxes[top_100]
            top_conf = top_conf[top_100]
            top_label = top_label[top_100]

            for i, c in enumerate(top_label):
                predicted_class = self.class_names[int(c)]
                box = top_boxes[i]
                score = str(top_conf[i])

                top, left, bottom, right = box
                if predicted_class not in class_names:
                    continue

                f.write("%s %s %s %s %s %s\n" % (
                    predicted_class, score[:6],
                    str(int(left)), str(int(top)), str(int(right)), str(int(bottom))))
        return

    def on_epoch_end(self, epoch, model_eval):
        """Evaluate ``model_eval`` on the validation set and log the mAP.

        Does nothing unless evaluation is enabled and
        ``epoch % period == 0``. Writes detection and ground-truth files to
        the scratch directory, computes mAP, appends it to
        ``epoch_map.txt``, redraws ``epoch_map.png`` and deletes the
        scratch directory.

        Args:
            epoch: Current epoch index.
            model_eval: Model to evaluate; replaces ``self.net``.
        """
        if epoch % self.period != 0 or not self.eval_flag:
            return

        self.net = model_eval
        # makedirs creates map_out_path itself as the parent directory.
        os.makedirs(os.path.join(self.map_out_path, "ground-truth"), exist_ok=True)
        os.makedirs(os.path.join(self.map_out_path, "detection-results"), exist_ok=True)
        print("Get map.")
        for annotation_line in tqdm(self.val_lines):
            line = annotation_line.split()
            image_id = os.path.basename(line[0]).split('.')[0]
            # Ground-truth boxes: one "left,top,right,bottom,class" per token.
            gt_boxes = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
            # Close the image file once its detections have been written.
            with Image.open(line[0]) as image:
                self.get_map_txt(image_id, image, self.class_names, self.map_out_path)

            # Write the ground-truth file for the same image.
            with open(os.path.join(self.map_out_path, "ground-truth/" + image_id + ".txt"), "w") as new_f:
                for box in gt_boxes:
                    left, top, right, bottom, obj = box
                    obj_name = self.class_names[obj]
                    new_f.write("%s %s %s %s %s\n" % (obj_name, left, top, right, bottom))

        print("Calculate Map.")
        try:
            temp_map = get_coco_map(class_names=self.class_names, path=self.map_out_path)[1]
        except Exception:
            # Fall back to the plain evaluation when the COCO-style one fails
            # (presumably when pycocotools is missing - TODO confirm).
            # Exception, not a bare except, so Ctrl-C still stops training.
            temp_map = get_map(self.MINOVERLAP, False, path=self.map_out_path)
        self.maps.append(temp_map)
        self.epoches.append(epoch)

        with open(os.path.join(self.log_dir, "epoch_map.txt"), 'a') as f:
            f.write(str(temp_map))
            f.write("\n")

        plt.figure()
        plt.plot(self.epoches, self.maps, 'red', linewidth=2, label='train map')

        plt.grid(True)
        plt.xlabel('Epoch')
        plt.ylabel('Map %s' % str(self.MINOVERLAP))
        plt.title('A Map Curve')
        plt.legend(loc="upper right")

        plt.savefig(os.path.join(self.log_dir, "epoch_map.png"))
        plt.cla()
        plt.close("all")

        print("Get map done.")
        shutil.rmtree(self.map_out_path)
class YoloDataset(Dataset):
    """Detection dataset that loads annotated images and augments them on the fly.

    Each annotation line is a whitespace-separated string: the first field is
    an image path, and every following field is a comma-separated list of
    integers describing one box. Columns 0-3 are used as x1, y1, x2, y2 and
    the last column as the class id.
    """

    def __init__(self, annotation_lines, input_shape, num_classes, epoch_length, \
                 mosaic, mixup, mosaic_prob, mixup_prob, train, special_aug_ratio = 0.7):
        """Store the dataset configuration.

        Args:
            annotation_lines: Sequence of annotation strings (see class docstring).
            input_shape: Target size, unpacked as (height, width).
            num_classes: Number of classes; only used to derive ``bbox_attrs``.
            epoch_length: Compared (scaled by ``special_aug_ratio``) against
                ``epoch_now`` to decide whether Mosaic is still allowed.
            mosaic: Enable Mosaic augmentation.
            mixup: Enable MixUp (only evaluated inside the Mosaic branch).
            mosaic_prob: Probability of taking the Mosaic branch per sample.
            mixup_prob: Probability of applying MixUp after Mosaic.
            train: Forwarded as the ``random`` flag of ``get_random_data``.
            special_aug_ratio: Fraction of ``epoch_length`` during which
                Mosaic may be applied.
        """
        super(YoloDataset, self).__init__()
        self.annotation_lines = annotation_lines
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.epoch_length = epoch_length
        self.mosaic = mosaic
        self.mosaic_prob = mosaic_prob
        self.mixup = mixup
        self.mixup_prob = mixup_prob
        self.train = train
        self.special_aug_ratio = special_aug_ratio

        # NOTE(review): epoch_now is never updated inside this class; presumably
        # the training loop assigns it each epoch -- confirm against the caller.
        self.epoch_now = -1
        self.length = len(self.annotation_lines)

        self.bbox_attrs = 5 + num_classes

    def __len__(self):
        """Return the number of annotation lines."""
        return self.length

    def __getitem__(self, index):
        """Load one sample and return ``(image, labels_out)``.

        ``image`` is a float32 array transposed to channel-first order after
        ``preprocess_input``. ``labels_out`` has shape (num_boxes, 6): column 0
        is left at zero here, column 1 is the class id and columns 2-5 are the
        normalized center-x, center-y, width and height.
        """
        index = index % self.length

        #---------------------------------------------------#
        #   Random augmentation is applied during training,
        #   not during validation.
        #   NOTE(review): the Mosaic branch does not consult
        #   self.train; callers presumably pass mosaic=False
        #   for validation -- confirm.
        #---------------------------------------------------#
        if self.mosaic and self.rand() < self.mosaic_prob and self.epoch_now < self.epoch_length * self.special_aug_ratio:
            # Three random extra lines plus the requested one form the mosaic.
            lines = sample(self.annotation_lines, 3)
            lines.append(self.annotation_lines[index])
            shuffle(lines)
            image, box = self.get_random_data_with_Mosaic(lines, self.input_shape)

            if self.mixup and self.rand() < self.mixup_prob:
                lines = sample(self.annotation_lines, 1)
                image_2, box_2 = self.get_random_data(lines[0], self.input_shape, random = self.train)
                image, box = self.get_random_data_with_MixUp(image, box, image_2, box_2)
        else:
            image, box = self.get_random_data(self.annotation_lines[index], self.input_shape, random = self.train)

        image = np.transpose(preprocess_input(np.array(image, dtype=np.float32)), (2, 0, 1))
        box = np.array(box, dtype=np.float32)

        #---------------------------------------------------#
        #   Pre-process the ground-truth boxes
        #---------------------------------------------------#
        nL = len(box)
        labels_out = np.zeros((nL, 6))
        if nL:
            #---------------------------------------------------#
            #   Normalize the ground-truth boxes to the 0-1 range
            #---------------------------------------------------#
            box[:, [0, 2]] = box[:, [0, 2]] / self.input_shape[1]
            box[:, [1, 3]] = box[:, [1, 3]] / self.input_shape[0]
            #---------------------------------------------------#
            #   After the two lines below:
            #   indices 0 and 1 hold the box center,
            #   indices 2 and 3 hold the box width and height,
            #   index 4 holds the box class.
            #---------------------------------------------------#
            box[:, 2:4] = box[:, 2:4] - box[:, 0:2]
            box[:, 0:2] = box[:, 0:2] + box[:, 2:4] / 2

            #---------------------------------------------------#
            #   Reorder into the training layout.
            #   Column 0 of labels_out is filled in by the
            #   collate function.
            #---------------------------------------------------#
            labels_out[:, 1] = box[:, -1]
            labels_out[:, 2:] = box[:, :4]

        return image, labels_out

    def rand(self, a=0, b=1):
        """Return a uniform random float in [a, b) drawn from ``np.random``."""
        return np.random.rand()*(b-a) + a

    def get_random_data(self, annotation_line, input_shape, jitter=.3, hue=.1, sat=0.7, val=0.4, random=True):
        """Load one annotated image and fit it to ``input_shape``.

        With ``random`` false the image is letterboxed (aspect-preserving
        resize, centered on a gray canvas) and returned as float32. With
        ``random`` true the image is randomly rescaled and distorted, placed
        at a random offset, optionally mirrored and color-jittered in HSV, and
        returned as uint8. Boxes are transformed the same way, clipped to the
        canvas, and boxes whose width or height is not greater than 1 pixel
        are dropped.

        Returns:
            Tuple ``(image_data, box)``.
        """
        line = annotation_line.split()
        #------------------------------#
        #   Read the image and convert it to RGB
        #------------------------------#
        image = Image.open(line[0])
        image = cvtColor(image)
        #------------------------------#
        #   Source image size and target size
        #------------------------------#
        iw, ih = image.size
        h, w = input_shape
        #------------------------------#
        #   Parse the boxes
        #------------------------------#
        box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])

        if not random:
            scale = min(w/iw, h/ih)
            nw = int(iw*scale)
            nh = int(ih*scale)
            dx = (w-nw)//2
            dy = (h-nh)//2

            #---------------------------------#
            #   Pad the unused area with gray bars
            #---------------------------------#
            image = image.resize((nw,nh), Image.BICUBIC)
            new_image = Image.new('RGB', (w,h), (128,128,128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image, np.float32)

            #---------------------------------#
            #   Adjust the ground-truth boxes
            #---------------------------------#
            if len(box)>0:
                np.random.shuffle(box)
                # box is an integer array, so the scaled values are truncated on assignment.
                box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
                box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
                box[:, 0:2][box[:, 0:2]<0] = 0
                box[:, 2][box[:, 2]>w] = w
                box[:, 3][box[:, 3]>h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
                box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box

            return image_data, box

        #------------------------------------------#
        #   Rescale the image and distort its aspect ratio
        #------------------------------------------#
        new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
        scale = self.rand(.25, 2)
        if new_ar < 1:
            nh = int(scale*h)
            nw = int(nh*new_ar)
        else:
            nw = int(scale*w)
            nh = int(nw/new_ar)
        image = image.resize((nw,nh), Image.BICUBIC)

        #------------------------------------------#
        #   Pad the unused area with gray bars
        #------------------------------------------#
        dx = int(self.rand(0, w-nw))
        dy = int(self.rand(0, h-nh))
        new_image = Image.new('RGB', (w,h), (128,128,128))
        new_image.paste(image, (dx, dy))
        image = new_image

        #------------------------------------------#
        #   Mirror the image horizontally
        #------------------------------------------#
        flip = self.rand()<.5
        if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)

        image_data = np.array(image, np.uint8)
        #---------------------------------#
        #   Color-space augmentation:
        #   compute the per-channel gains
        #---------------------------------#
        r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
        #---------------------------------#
        #   Convert the image to HSV
        #   (hue/sat/val are rebound from the jitter
        #   parameters to the image channels here)
        #---------------------------------#
        hue, sat, val = cv2.split(cv2.cvtColor(image_data, cv2.COLOR_RGB2HSV))
        dtype = image_data.dtype
        #---------------------------------#
        #   Apply the transform through lookup tables
        #---------------------------------#
        x = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)

        image_data = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        image_data = cv2.cvtColor(image_data, cv2.COLOR_HSV2RGB)

        #---------------------------------#
        #   Adjust the ground-truth boxes
        #---------------------------------#
        if len(box)>0:
            np.random.shuffle(box)
            box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
            box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
            if flip: box[:, [0,2]] = w - box[:, [2,0]]
            box[:, 0:2][box[:, 0:2]<0] = 0
            box[:, 2][box[:, 2]>w] = w
            box[:, 3][box[:, 3]>h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            box = box[np.logical_and(box_w>1, box_h>1)]

        return image_data, box

    def merge_bboxes(self, bboxes, cutx, cuty):
        """Clip each tile's boxes to that tile's mosaic quadrant and collect them.

        ``bboxes[i]`` holds the boxes of tile ``i``; tiles 0-3 occupy the
        top-left, bottom-left, bottom-right and top-right quadrants around the
        split point ``(cutx, cuty)``. Boxes lying entirely outside their
        quadrant are skipped; boxes crossing a split line are clipped to it.

        Returns:
            List of ``[x1, y1, x2, y2, last_column]`` lists.
        """
        merge_bbox = []
        for i in range(len(bboxes)):
            for box in bboxes[i]:
                tmp_box = []
                x1, y1, x2, y2 = box[0], box[1], box[2], box[3]

                # Tile 0: top-left quadrant.
                if i == 0:
                    if y1 > cuty or x1 > cutx:
                        continue
                    if y2 >= cuty and y1 <= cuty:
                        y2 = cuty
                    if x2 >= cutx and x1 <= cutx:
                        x2 = cutx

                # Tile 1: bottom-left quadrant.
                if i == 1:
                    if y2 < cuty or x1 > cutx:
                        continue
                    if y2 >= cuty and y1 <= cuty:
                        y1 = cuty
                    if x2 >= cutx and x1 <= cutx:
                        x2 = cutx

                # Tile 2: bottom-right quadrant.
                if i == 2:
                    if y2 < cuty or x2 < cutx:
                        continue
                    if y2 >= cuty and y1 <= cuty:
                        y1 = cuty
                    if x2 >= cutx and x1 <= cutx:
                        x1 = cutx

                # Tile 3: top-right quadrant.
                if i == 3:
                    if y1 > cuty or x2 < cutx:
                        continue
                    if y2 >= cuty and y1 <= cuty:
                        y2 = cuty
                    if x2 >= cutx and x1 <= cutx:
                        x1 = cutx
                tmp_box.append(x1)
                tmp_box.append(y1)
                tmp_box.append(x2)
                tmp_box.append(y2)
                tmp_box.append(box[-1])
                merge_bbox.append(tmp_box)
        return merge_bbox

    def get_random_data_with_Mosaic(self, annotation_line, input_shape, jitter=0.3, hue=.1, sat=0.7, val=0.4):
        """Build a Mosaic sample from the given annotation lines.

        ``annotation_line`` is iterated as a sequence of annotation strings
        (the caller passes four). Each image is randomly mirrored, rescaled
        and pasted next to a random split point; the four canvases are then
        stitched by quadrant, color-jittered in HSV, and the boxes merged with
        ``merge_bboxes``.

        Returns:
            Tuple ``(new_image, new_boxes)``: a uint8 RGB array and a list of
            ``[x1, y1, x2, y2, class]`` boxes.
        """
        h, w = input_shape
        min_offset_x = self.rand(0.3, 0.7)
        min_offset_y = self.rand(0.3, 0.7)

        image_datas = []
        box_datas = []
        index = 0
        for line in annotation_line:
            #---------------------------------#
            #   Split the annotation line
            #---------------------------------#
            line_content = line.split()
            #---------------------------------#
            #   Open the image
            #---------------------------------#
            image = Image.open(line_content[0])
            image = cvtColor(image)

            #---------------------------------#
            #   Image size
            #---------------------------------#
            iw, ih = image.size
            #---------------------------------#
            #   Parse the box positions
            #---------------------------------#
            box = np.array([np.array(list(map(int,box.split(',')))) for box in line_content[1:]])

            #---------------------------------#
            #   Decide whether to mirror the image
            #   (images without boxes are never mirrored)
            #---------------------------------#
            flip = self.rand()<.5
            if flip and len(box)>0:
                image = image.transpose(Image.FLIP_LEFT_RIGHT)
                box[:, [0,2]] = iw - box[:, [2,0]]

            #------------------------------------------#
            #   Rescale the image and distort its aspect ratio
            #------------------------------------------#
            new_ar = iw/ih * self.rand(1-jitter,1+jitter) / self.rand(1-jitter,1+jitter)
            scale = self.rand(.4, 1)
            if new_ar < 1:
                nh = int(scale*h)
                nw = int(nh*new_ar)
            else:
                nw = int(scale*w)
                nh = int(nw/new_ar)
            image = image.resize((nw, nh), Image.BICUBIC)

            #-----------------------------------------------#
            #   Place the image so that it touches the split
            #   point from its own quadrant: 0 top-left,
            #   1 bottom-left, 2 bottom-right, 3 top-right
            #-----------------------------------------------#
            if index == 0:
                dx = int(w*min_offset_x) - nw
                dy = int(h*min_offset_y) - nh
            elif index == 1:
                dx = int(w*min_offset_x) - nw
                dy = int(h*min_offset_y)
            elif index == 2:
                dx = int(w*min_offset_x)
                dy = int(h*min_offset_y)
            elif index == 3:
                dx = int(w*min_offset_x)
                dy = int(h*min_offset_y) - nh

            new_image = Image.new('RGB', (w,h), (128,128,128))
            new_image.paste(image, (dx, dy))
            image_data = np.array(new_image)

            index = index + 1
            box_data = []
            #---------------------------------#
            #   Re-map the boxes onto the canvas
            #---------------------------------#
            if len(box)>0:
                np.random.shuffle(box)
                box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
                box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
                box[:, 0:2][box[:, 0:2]<0] = 0
                box[:, 2][box[:, 2]>w] = w
                box[:, 3][box[:, 3]>h] = h
                box_w = box[:, 2] - box[:, 0]
                box_h = box[:, 3] - box[:, 1]
                box = box[np.logical_and(box_w>1, box_h>1)]
                # The assignment below requires each box to have exactly 5 columns.
                box_data = np.zeros((len(box),5))
                box_data[:len(box)] = box

            image_datas.append(image_data)
            box_datas.append(box_data)

        #---------------------------------#
        #   Cut the four canvases and stitch them together
        #---------------------------------#
        cutx = int(w * min_offset_x)
        cuty = int(h * min_offset_y)

        new_image = np.zeros([h, w, 3])
        new_image[:cuty, :cutx, :] = image_datas[0][:cuty, :cutx, :]
        new_image[cuty:, :cutx, :] = image_datas[1][cuty:, :cutx, :]
        new_image[cuty:, cutx:, :] = image_datas[2][cuty:, cutx:, :]
        new_image[:cuty, cutx:, :] = image_datas[3][:cuty, cutx:, :]

        new_image = np.array(new_image, np.uint8)
        #---------------------------------#
        #   Color-space augmentation:
        #   compute the per-channel gains
        #---------------------------------#
        r = np.random.uniform(-1, 1, 3) * [hue, sat, val] + 1
        #---------------------------------#
        #   Convert the image to HSV
        #---------------------------------#
        hue, sat, val = cv2.split(cv2.cvtColor(new_image, cv2.COLOR_RGB2HSV))
        dtype = new_image.dtype
        #---------------------------------#
        #   Apply the transform through lookup tables
        #---------------------------------#
        x = np.arange(0, 256, dtype=r.dtype)
        lut_hue = ((x * r[0]) % 180).astype(dtype)
        lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
        lut_val = np.clip(x * r[2], 0, 255).astype(dtype)

        new_image = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val)))
        new_image = cv2.cvtColor(new_image, cv2.COLOR_HSV2RGB)

        #---------------------------------#
        #   Clip and merge the boxes of the four tiles
        #---------------------------------#
        new_boxes = self.merge_bboxes(box_datas, cutx, cuty)

        return new_image, new_boxes

    def get_random_data_with_MixUp(self, image_1, box_1, image_2, box_2):
        """Blend two samples 50/50 and return the union of their boxes.

        Returns:
            Tuple ``(new_image, new_boxes)``; the image is float32. When one
            box set is empty the other is returned as-is.
        """
        new_image = np.array(image_1, np.float32) * 0.5 + np.array(image_2, np.float32) * 0.5
        if len(box_1) == 0:
            new_boxes = box_2
        elif len(box_2) == 0:
            new_boxes = box_1
        else:
            new_boxes = np.concatenate([box_1, box_2], axis=0)
        return new_image, new_boxes
def yolo_dataset_collate(batch):
    """Collate ``(image, labels)`` pairs into two float tensors.

    Column 0 of every label array is overwritten in place with the position
    of its sample inside the batch, so that rows can still be attributed to
    their image after all label arrays are concatenated.

    Returns:
        Tuple ``(images, bboxes)``: the stacked images and the row-wise
        concatenation of all label arrays, both as ``torch.FloatTensor``.
    """
    image_list, label_list = [], []
    for sample_idx, pair in enumerate(batch):
        picture, labels = pair
        # Tag every label row with the index of the image it belongs to.
        labels[:, 0] = sample_idx
        image_list.append(picture)
        label_list.append(labels)

    stacked_images = torch.from_numpy(np.array(image_list))
    stacked_labels = torch.from_numpy(np.concatenate(label_list, 0))
    return stacked_images.type(torch.FloatTensor), stacked_labels.type(torch.FloatTensor)
def preprocess_input(image):
    """Scale pixel values from the 0-255 range to the 0-1 range.

    Floating-point arrays are divided in place and the same object is
    returned, as before. Integer (or other non-float) NumPy arrays cannot be
    divided in place -- NumPy refuses to cast the float result back to an
    integer dtype -- so they are first copied to float32; the caller's array
    is left untouched in that case.

    Args:
        image: Image data, typically a NumPy array of pixel values.

    Returns:
        The scaled image.
    """
    if isinstance(image, np.ndarray) and not np.issubdtype(image.dtype, np.floating):
        # In-place true division on an integer array raises a casting error.
        image = image.astype(np.float32)
    image /= 255.0
    return image
def check_version(current: str = "0.0.0",
                  minimum: str = "0.0.0",
                  name: str = "version ",
                  pinned: bool = False) -> bool:
    """Compare two version strings.

    Args:
        current: Version to test.
        minimum: Version to compare against.
        name: Unused; kept for interface compatibility.
        pinned: When true, require ``current == minimum``; otherwise require
            ``current >= minimum``.

    Returns:
        The result of the comparison on the parsed versions.
    """
    # NOTE(review): `pkg` is the module-level alias for pkg_resources, which
    # setuptools has deprecated -- worth replacing at the import site.
    parsed_current = pkg.parse_version(current)
    parsed_minimum = pkg.parse_version(minimum)
    if pinned:
        return parsed_current == parsed_minimum
    return parsed_current >= parsed_minimum
class DecodeBox():
    """Decode raw detection-head outputs and post-process them with NMS."""

    def __init__(self, num_classes, input_shape):
        """Store the class count and the network input shape.

        ``input_shape`` is indexed as ``[0]`` = height and ``[1]`` = width in
        ``decode_box``.
        """
        super(DecodeBox, self).__init__()
        self.num_classes = num_classes
        self.bbox_attrs = 4 + num_classes
        self.input_shape = input_shape

    def decode_box(self, inputs):
        """Turn head outputs into normalized ``[cx, cy, w, h, class scores...]`` rows.

        Args:
            inputs: Tuple ``(dbox, cls, origin_cls, anchors, strides)``;
                ``origin_cls`` is unpacked but not used here.

        Returns:
            Tensor permuted to (batch, anchors, 4 + classes), with the four
            box columns divided by the input width/height.
        """
        # Author's example shapes:
        # dbox  batch_size, 4, 8400
        # cls   batch_size, 20, 8400
        dbox, cls, origin_cls, anchors, strides = inputs
        # Convert distances to center/width/height boxes and scale by the strides
        dbox = dist2bbox(dbox, anchors.unsqueeze(0), xywh=True, dim=1) * strides
        y = torch.cat((dbox, cls.sigmoid()), 1).permute(0, 2, 1)
        # Normalize the box columns to the 0~1 range
        y[:, :, :4] = y[:, :, :4] / torch.Tensor([self.input_shape[1], self.input_shape[0], self.input_shape[1], self.input_shape[0]]).to(y.device)
        return y

    def yolo_correct_boxes(self, box_xy, box_wh, input_shape, image_shape, letterbox_image):
        """Map normalized center/size boxes back onto the original image.

        When ``letterbox_image`` is truthy the gray-bar padding is removed
        first. The result is ordered ``[y_min, x_min, y_max, x_max]`` and
        multiplied by ``image_shape``.

        NOTE(review): ``image_shape`` and ``input_shape`` are presumably
        (height, width), matching the y-first ordering -- confirm at the caller.
        """
        #-----------------------------------------------------------------#
        #   The y axis is put first so the boxes can be multiplied
        #   directly by the image height and width
        #-----------------------------------------------------------------#
        box_yx = box_xy[..., ::-1]
        box_hw = box_wh[..., ::-1]
        input_shape = np.array(input_shape)
        image_shape = np.array(image_shape)

        if letterbox_image:
            #-----------------------------------------------------------------#
            #   offset is the shift of the valid image area relative to the
            #   top-left corner of the padded input;
            #   new_shape is the size of the image after scaling
            #-----------------------------------------------------------------#
            new_shape = np.round(image_shape * np.min(input_shape/image_shape))
            offset = (input_shape - new_shape)/2./input_shape
            scale = input_shape/new_shape

            box_yx = (box_yx - offset) * scale
            # In place: box_hw is a reversed slice of box_wh, so for NumPy
            # inputs this also rescales the caller's array.
            box_hw *= scale

        box_mins = box_yx - (box_hw / 2.)
        box_maxes = box_yx + (box_hw / 2.)
        boxes = np.concatenate([box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2]], axis=-1)
        boxes *= np.concatenate([image_shape, image_shape], axis=-1)
        return boxes

    def non_max_suppression(self, prediction, num_classes, input_shape, image_shape, letterbox_image, conf_thres=0.5, nms_thres=0.4):
        """Filter predictions by confidence, run per-class NMS, and rescale boxes.

        ``prediction`` is modified in place: its first four columns are
        converted from center/size to corner format.

        Returns:
            A list with one entry per batch element: ``None`` when nothing
            passes ``conf_thres``, otherwise a NumPy array whose rows are
            four box coordinates (as produced by ``yolo_correct_boxes``),
            the class confidence and the class index.
        """
        #----------------------------------------------------------#
        #   Convert the predictions to top-left / bottom-right corners.
        #   Author's note: prediction [batch_size, num_anchors, 85]
        #   (the code itself reads 4 box columns followed by
        #   num_classes score columns)
        #----------------------------------------------------------#
        box_corner = prediction.new(prediction.shape)
        box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
        box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
        box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
        box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
        prediction[:, :, :4] = box_corner[:, :, :4]

        output = [None for _ in range(len(prediction))]
        for i, image_pred in enumerate(prediction):
            #----------------------------------------------------------#
            #   Take the max over the class scores.
            #   class_conf  [num_anchors, 1]    class confidence
            #   class_pred  [num_anchors, 1]    class index
            #----------------------------------------------------------#
            class_conf, class_pred = torch.max(image_pred[:, 4:4 + num_classes], 1, keepdim=True)

            #----------------------------------------------------------#
            #   First filtering pass using the confidence threshold
            #----------------------------------------------------------#
            conf_mask = (class_conf[:, 0] >= conf_thres).squeeze()

            #----------------------------------------------------------#
            #   Keep only the predictions that passed the threshold
            #----------------------------------------------------------#
            image_pred = image_pred[conf_mask]
            class_conf = class_conf[conf_mask]
            class_pred = class_pred[conf_mask]
            if not image_pred.size(0):
                continue
            #-------------------------------------------------------------------------#
            #   detections  [num_anchors, 6]
            #   the 6 columns are: x1, y1, x2, y2, class_conf, class_pred
            #-------------------------------------------------------------------------#
            detections = torch.cat((image_pred[:, :4], class_conf.float(), class_pred.float()), 1)

            #------------------------------------------#
            #   Collect every class present in the detections
            #------------------------------------------#
            unique_labels = detections[:, -1].cpu().unique()

            if prediction.is_cuda:
                unique_labels = unique_labels.cuda()
                detections = detections.cuda()

            for c in unique_labels:
                #------------------------------------------#
                #   All remaining detections of this class
                #------------------------------------------#
                detections_class = detections[detections[:, -1] == c]
                #------------------------------------------#
                #   The library NMS is faster than the manual one below.
                #   Keep the highest-scoring box of this class in each region.
                #------------------------------------------#
                keep = nms(
                    detections_class[:, :4],
                    detections_class[:, 4],
                    nms_thres
                )
                max_detections = detections_class[keep]

                # # Sort by object confidence
                # _, conf_sort_index = torch.sort(detections_class[:, 4]*detections_class[:, 5], descending=True)
                # detections_class = detections_class[conf_sort_index]
                # # Run non-maximum suppression
                # max_detections = []
                # while detections_class.size(0):
                #     # Take the most confident box of this class, then drop every
                #     # remaining box whose overlap with it exceeds nms_thres
                #     max_detections.append(detections_class[0].unsqueeze(0))
                #     if len(detections_class) == 1:
                #         break
                #     ious = bbox_iou(max_detections[-1], detections_class[1:])
                #     detections_class = detections_class[1:][ious < nms_thres]
                # # Stack
                # max_detections = torch.cat(max_detections).data

                # Add max detections to outputs
                output[i] = max_detections if output[i] is None else torch.cat((output[i], max_detections))

            if output[i] is not None:
                output[i] = output[i].cpu().numpy()
                # Back to center/size form, which is what yolo_correct_boxes expects.
                box_xy, box_wh = (output[i][:, 0:2] + output[i][:, 2:4])/2, output[i][:, 2:4] - output[i][:, 0:2]
                output[i][:, :4] = self.yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape, letterbox_image)
        return output
np + + #---------------------------------------------------# + # 将预测值的每个特征层调成真实值 + #---------------------------------------------------# + def get_anchors_and_decode(input, input_shape, anchors, anchors_mask, num_classes): + #-----------------------------------------------# + # input batch_size, 3 * (4 + 1 + num_classes), 20, 20 + #-----------------------------------------------# + batch_size = input.size(0) + input_height = input.size(2) + input_width = input.size(3) + + #-----------------------------------------------# + # 输入为640x640时 input_shape = [640, 640] input_height = 20, input_width = 20 + # 640 / 20 = 32 + # stride_h = stride_w = 32 + #-----------------------------------------------# + stride_h = input_shape[0] / input_height + stride_w = input_shape[1] / input_width + #-------------------------------------------------# + # 此时获得的scaled_anchors大小是相对于特征层的 + # anchor_width, anchor_height / stride_h, stride_w + #-------------------------------------------------# + scaled_anchors = [(anchor_width / stride_w, anchor_height / stride_h) for anchor_width, anchor_height in anchors[anchors_mask[2]]] + + #-----------------------------------------------# + # batch_size, 3 * (4 + 1 + num_classes), 20, 20 => + # batch_size, 3, 5 + num_classes, 20, 20 => + # batch_size, 3, 20, 20, 4 + 1 + num_classes + #-----------------------------------------------# + prediction = input.view(batch_size, len(anchors_mask[2]), + num_classes + 5, input_height, input_width).permute(0, 1, 3, 4, 2).contiguous() + + #-----------------------------------------------# + # 先验框的中心位置的调整参数 + #-----------------------------------------------# + x = torch.sigmoid(prediction[..., 0]) + y = torch.sigmoid(prediction[..., 1]) + #-----------------------------------------------# + # 先验框的宽高调整参数 + #-----------------------------------------------# + w = torch.sigmoid(prediction[..., 2]) + h = torch.sigmoid(prediction[..., 3]) + #-----------------------------------------------# + # 获得置信度,是否有物体 0 - 1 + 
#-----------------------------------------------# + conf = torch.sigmoid(prediction[..., 4]) + #-----------------------------------------------# + # 种类置信度 0 - 1 + #-----------------------------------------------# + pred_cls = torch.sigmoid(prediction[..., 5:]) + + FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor + LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + + #----------------------------------------------------------# + # 生成网格,先验框中心,网格左上角 + # batch_size,3,20,20 + # range(20) + # [ + # [0, 1, 2, 3 ……, 19], + # [0, 1, 2, 3 ……, 19], + # …… (20次) + # [0, 1, 2, 3 ……, 19] + # ] * (batch_size * 3) + # [batch_size, 3, 20, 20] + # + # [ + # [0, 1, 2, 3 ……, 19], + # [0, 1, 2, 3 ……, 19], + # …… (20次) + # [0, 1, 2, 3 ……, 19] + # ].T * (batch_size * 3) + # [batch_size, 3, 20, 20] + #----------------------------------------------------------# + grid_x = torch.linspace(0, input_width - 1, input_width).repeat(input_height, 1).repeat( + batch_size * len(anchors_mask[2]), 1, 1).view(x.shape).type(FloatTensor) + grid_y = torch.linspace(0, input_height - 1, input_height).repeat(input_width, 1).t().repeat( + batch_size * len(anchors_mask[2]), 1, 1).view(y.shape).type(FloatTensor) + + #----------------------------------------------------------# + # 按照网格格式生成先验框的宽高 + # batch_size, 3, 20 * 20 => batch_size, 3, 20, 20 + # batch_size, 3, 20 * 20 => batch_size, 3, 20, 20 + #----------------------------------------------------------# + anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) + anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) + anchor_w = anchor_w.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(w.shape) + anchor_h = anchor_h.repeat(batch_size, 1).repeat(1, 1, input_height * input_width).view(h.shape) + + #----------------------------------------------------------# + # 利用预测结果对先验框进行调整 + # 首先调整先验框的中心,从先验框中心向右下角偏移 + # 再调整先验框的宽高。 + # x 0 ~ 1 => 0 ~ 2 => -0.5 ~ 1.5 + grid_x 
+ # y 0 ~ 1 => 0 ~ 2 => -0.5 ~ 1.5 + grid_y + # w 0 ~ 1 => 0 ~ 2 => 0 ~ 4 * anchor_w + # h 0 ~ 1 => 0 ~ 2 => 0 ~ 4 * anchor_h + #----------------------------------------------------------# + pred_boxes = FloatTensor(prediction[..., :4].shape) + pred_boxes[..., 0] = x.data * 2. - 0.5 + grid_x + pred_boxes[..., 1] = y.data * 2. - 0.5 + grid_y + pred_boxes[..., 2] = (w.data * 2) ** 2 * anchor_w + pred_boxes[..., 3] = (h.data * 2) ** 2 * anchor_h + + point_h = 5 + point_w = 5 + + box_xy = pred_boxes[..., 0:2].cpu().numpy() * 32 + box_wh = pred_boxes[..., 2:4].cpu().numpy() * 32 + grid_x = grid_x.cpu().numpy() * 32 + grid_y = grid_y.cpu().numpy() * 32 + anchor_w = anchor_w.cpu().numpy() * 32 + anchor_h = anchor_h.cpu().numpy() * 32 + + fig = plt.figure() + ax = fig.add_subplot(121) + from PIL import Image + img = Image.open("img/street.jpg").resize([640, 640]) + plt.imshow(img, alpha=0.5) + plt.ylim(-30, 650) + plt.xlim(-30, 650) + plt.scatter(grid_x, grid_y) + plt.scatter(point_h * 32, point_w * 32, c='black') + plt.gca().invert_yaxis() + + anchor_left = grid_x - anchor_w / 2 + anchor_top = grid_y - anchor_h / 2 + + rect1 = plt.Rectangle([anchor_left[0, 0, point_h, point_w],anchor_top[0, 0, point_h, point_w]], \ + anchor_w[0, 0, point_h, point_w],anchor_h[0, 0, point_h, point_w],color="r",fill=False) + rect2 = plt.Rectangle([anchor_left[0, 1, point_h, point_w],anchor_top[0, 1, point_h, point_w]], \ + anchor_w[0, 1, point_h, point_w],anchor_h[0, 1, point_h, point_w],color="r",fill=False) + rect3 = plt.Rectangle([anchor_left[0, 2, point_h, point_w],anchor_top[0, 2, point_h, point_w]], \ + anchor_w[0, 2, point_h, point_w],anchor_h[0, 2, point_h, point_w],color="r",fill=False) + + ax.add_patch(rect1) + ax.add_patch(rect2) + ax.add_patch(rect3) + + ax = fig.add_subplot(122) + plt.imshow(img, alpha=0.5) + plt.ylim(-30, 650) + plt.xlim(-30, 650) + plt.scatter(grid_x, grid_y) + plt.scatter(point_h * 32, point_w * 32, c='black') + plt.scatter(box_xy[0, :, point_h, point_w, 0], 
def fit_one_epoch(model_train, model, ema, yolo_loss, loss_history, eval_callback, optimizer, epoch, epoch_step, epoch_step_val, gen, gen_val, Epoch, cuda, fp16, scaler, save_period, save_dir, local_rank=0):
    """Run one training pass and one validation pass, then log and save weights.

    Args:
        model_train: Model used for the forward/backward passes.
        model: Model whose ``state_dict`` is saved when ``ema`` is falsy.
        ema: Optional object exposing ``update(model)`` and an ``ema`` model;
            when truthy, its ``ema`` model is used for validation and saving.
        yolo_loss: Callable ``(outputs, bboxes) -> loss``.
        loss_history: Object exposing ``append_loss`` and ``val_loss``.
        eval_callback: Object exposing ``on_epoch_end``.
        optimizer: Optimizer stepped once per training batch.
        epoch: Zero-based index of the current epoch.
        epoch_step: Number of training batches to consume.
        epoch_step_val: Number of validation batches to consume.
        gen: Iterable of training batches ``(images, bboxes)``.
        gen_val: Iterable of validation batches.
        Epoch: Total number of epochs (used for display and final save).
        cuda: Move batches to the GPU ``local_rank`` when truthy.
        fp16: Use autocast and ``scaler`` for mixed precision when truthy.
        scaler: Gradient scaler used on the ``fp16`` path.
        save_period: Save a numbered checkpoint every this many epochs.
        save_dir: Directory that receives the checkpoints.
        local_rank: Process rank; only rank 0 prints, logs and saves.

    Note:
        The averages divide by ``epoch_step`` and ``epoch_step_val``, so both
        must be non-zero on rank 0.
    """
    loss = 0
    val_loss = 0

    if local_rank == 0:
        print('Start Train')
        pbar = tqdm(total=epoch_step,desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3)
    model_train.train()
    for iteration, batch in enumerate(gen):
        if iteration >= epoch_step:
            break

        images, bboxes = batch
        with torch.no_grad():
            if cuda:
                images = images.cuda(local_rank)
                bboxes = bboxes.cuda(local_rank)
        #----------------------#
        #   Zero the gradients
        #----------------------#
        optimizer.zero_grad()
        if not fp16:
            #----------------------#
            #   Forward pass
            #----------------------#
            # dbox, cls, origin_cls, anchors, strides
            outputs = model_train(images)
            loss_value = yolo_loss(outputs, bboxes)
            #----------------------#
            #   Backward pass
            #----------------------#
            loss_value.backward()
            torch.nn.utils.clip_grad_norm_(model_train.parameters(), max_norm=10.0)  # clip gradients
            optimizer.step()
        else:
            from torch.cuda.amp import autocast
            with autocast():
                #----------------------#
                #   Forward pass
                #----------------------#
                outputs = model_train(images)
                loss_value = yolo_loss(outputs, bboxes)

            #----------------------#
            #   Backward pass
            #----------------------#
            scaler.scale(loss_value).backward()
            scaler.unscale_(optimizer)  # unscale gradients
            torch.nn.utils.clip_grad_norm_(model_train.parameters(), max_norm=10.0)  # clip gradients
            scaler.step(optimizer)
            scaler.update()
        if ema:
            ema.update(model_train)

        loss += loss_value.item()

        if local_rank == 0:
            pbar.set_postfix(**{'loss'  : loss / (iteration + 1),
                                'lr'    : get_lr(optimizer)})
            pbar.update(1)

    if local_rank == 0:
        pbar.close()
        print('Finish Train')
        print('Start Validation')
        pbar = tqdm(total=epoch_step_val, desc=f'Epoch {epoch + 1}/{Epoch}',postfix=dict,mininterval=0.3)

    # Validate with the EMA model when one is maintained.
    if ema:
        model_train_eval = ema.ema
    else:
        model_train_eval = model_train.eval()

    for iteration, batch in enumerate(gen_val):
        if iteration >= epoch_step_val:
            break
        images, bboxes = batch[0], batch[1]
        with torch.no_grad():
            if cuda:
                images = images.cuda(local_rank)
                bboxes = bboxes.cuda(local_rank)
            #----------------------#
            #   Zero the gradients
            #   (no backward pass follows in this loop)
            #----------------------#
            optimizer.zero_grad()
            #----------------------#
            #   Forward pass
            #----------------------#
            outputs = model_train_eval(images)
            loss_value = yolo_loss(outputs, bboxes)

        val_loss += loss_value.item()
        if local_rank == 0:
            pbar.set_postfix(**{'val_loss': val_loss / (iteration + 1)})
            pbar.update(1)

    if local_rank == 0:
        pbar.close()
        print('Finish Validation')
        loss_history.append_loss(epoch + 1, loss / epoch_step, val_loss / epoch_step_val)
        eval_callback.on_epoch_end(epoch + 1, model_train_eval)
        print('Epoch:'+ str(epoch + 1) + '/' + str(Epoch))
        print('Total Loss: %.3f || Val Loss: %.3f ' % (loss / epoch_step, val_loss / epoch_step_val))

        #-----------------------------------------------#
        #   Save the weights
        #-----------------------------------------------#
        if ema:
            save_state_dict = ema.ema.state_dict()
        else:
            save_state_dict = model.state_dict()

        if (epoch + 1) % save_period == 0 or epoch + 1 == Epoch:
            torch.save(save_state_dict, os.path.join(save_dir, "ep%03d-loss%.3f-val_loss%.3f.pth" % (epoch + 1, loss / epoch_step, val_loss / epoch_step_val)))

        # The current value was appended above, so "<= min" means this epoch is the best so far.
        if len(loss_history.val_loss) <= 1 or (val_loss / epoch_step_val) <= min(loss_history.val_loss):
            print('Save best model to best_epoch_weights.pth')
            torch.save(save_state_dict, os.path.join(save_dir, "best_epoch_weights.pth"))

        torch.save(save_state_dict, os.path.join(save_dir, "last_epoch_weights.pth"))
def is_float_between_0_and_1(value):
    """Return True when *value* converts to a float strictly between 0.0 and 1.0.

    Values that cannot be converted at all (``None``, lists, non-numeric
    strings, integers too large for a float) yield False instead of raising.
    Both bounds are exclusive, and NaN is rejected.
    """
    try:
        val = float(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers None/containers, which previously escaped as an exception.
        return False
    return 0.0 < val < 1.0
+""" +def voc_ap(rec, prec): + """ + --- Official matlab code VOC2012--- + mrec=[0 ; rec ; 1]; + mpre=[0 ; prec ; 0]; + for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); + end + i=find(mrec(2:end)~=mrec(1:end-1))+1; + ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); + """ + rec.insert(0, 0.0) # insert 0.0 at begining of list + rec.append(1.0) # insert 1.0 at end of list + mrec = rec[:] + prec.insert(0, 0.0) # insert 0.0 at begining of list + prec.append(0.0) # insert 0.0 at end of list + mpre = prec[:] + """ + This part makes the precision monotonically decreasing + (goes from the end to the beginning) + matlab: for i=numel(mpre)-1:-1:1 + mpre(i)=max(mpre(i),mpre(i+1)); + """ + for i in range(len(mpre)-2, -1, -1): + mpre[i] = max(mpre[i], mpre[i+1]) + """ + This part creates a list of indexes where the recall changes + matlab: i=find(mrec(2:end)~=mrec(1:end-1))+1; + """ + i_list = [] + for i in range(1, len(mrec)): + if mrec[i] != mrec[i-1]: + i_list.append(i) # if it was matlab would be i + 1 + """ + The Average Precision (AP) is the area under the curve + (numerical integration) + matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); + """ + ap = 0.0 + for i in i_list: + ap += ((mrec[i]-mrec[i-1])*mpre[i]) + return ap, mrec, mpre + + +""" + Convert the lines of a file to a list +""" +def file_lines_to_list(path): + # open txt file lines to a list + with open(path) as f: + content = f.readlines() + # remove whitespace characters like `\n` at the end of each line + content = [x.strip() for x in content] + return content + +""" + Draws text in image +""" +def draw_text_in_image(img, text, pos, color, line_width): + font = cv2.FONT_HERSHEY_PLAIN + fontScale = 1 + lineType = 1 + bottomLeftCornerOfText = pos + cv2.putText(img, text, + bottomLeftCornerOfText, + font, + fontScale, + color, + lineType) + text_width, _ = cv2.getTextSize(text, font, fontScale, lineType)[0] + return img, (line_width + text_width) + +""" + Plot - adjust axes +""" +def adjust_axes(r, t, fig, axes): + # 
get text width for re-scaling + bb = t.get_window_extent(renderer=r) + text_width_inches = bb.width / fig.dpi + # get axis width in inches + current_fig_width = fig.get_figwidth() + new_fig_width = current_fig_width + text_width_inches + propotion = new_fig_width / current_fig_width + # get axis limit + x_lim = axes.get_xlim() + axes.set_xlim([x_lim[0], x_lim[1]*propotion]) + +""" + Draw plot using Matplotlib +""" +def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar): + # sort the dictionary by decreasing value, into a list of tuples + sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1)) + # unpacking the list of tuples into two lists + sorted_keys, sorted_values = zip(*sorted_dic_by_value) + # + if true_p_bar != "": + """ + Special case to draw in: + - green -> TP: True Positives (object detected and matches ground-truth) + - red -> FP: False Positives (object detected but does not match ground-truth) + - orange -> FN: False Negatives (object not detected but present in the ground-truth) + """ + fp_sorted = [] + tp_sorted = [] + for key in sorted_keys: + fp_sorted.append(dictionary[key] - true_p_bar[key]) + tp_sorted.append(true_p_bar[key]) + plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive') + plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted) + # add legend + plt.legend(loc='lower right') + """ + Write number on side of bar + """ + fig = plt.gcf() # gcf - get current figure + axes = plt.gca() + r = fig.canvas.get_renderer() + for i, val in enumerate(sorted_values): + fp_val = fp_sorted[i] + tp_val = tp_sorted[i] + fp_str_val = " " + str(fp_val) + tp_str_val = fp_str_val + " " + str(tp_val) + # trick to paint multicolor with offset: + # first paint everything and then repaint the first number + t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', 
fontweight='bold') + plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold') + if i == (len(sorted_values)-1): # largest bar + adjust_axes(r, t, fig, axes) + else: + plt.barh(range(n_classes), sorted_values, color=plot_color) + """ + Write number on side of bar + """ + fig = plt.gcf() # gcf - get current figure + axes = plt.gca() + r = fig.canvas.get_renderer() + for i, val in enumerate(sorted_values): + str_val = " " + str(val) # add a space before + if val < 1.0: + str_val = " {0:.2f}".format(val) + t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold') + # re-set axes to show number inside the figure + if i == (len(sorted_values)-1): # largest bar + adjust_axes(r, t, fig, axes) + # set window title + fig.canvas.set_window_title(window_title) + # write classes in y axis + tick_font_size = 12 + plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size) + """ + Re-scale height accordingly + """ + init_height = fig.get_figheight() + # comput the matrix height in points and inches + dpi = fig.dpi + height_pt = n_classes * (tick_font_size * 1.4) # 1.4 (some spacing) + height_in = height_pt / dpi + # compute the required figure height + top_margin = 0.15 # in percentage of the figure height + bottom_margin = 0.05 # in percentage of the figure height + figure_height = height_in / (1 - top_margin - bottom_margin) + # set new height + if figure_height > init_height: + fig.set_figheight(figure_height) + + # set plot title + plt.title(plot_title, fontsize=14) + # set axis titles + # plt.xlabel('classes') + plt.xlabel(x_label, fontsize='large') + # adjust size of window + fig.tight_layout() + # save the plot + fig.savefig(output_path) + # show image + if to_show: + plt.show() + # close the plot + plt.close() + +def get_map(MINOVERLAP, draw_plot, score_threhold=0.5, path = './map_out'): + GT_PATH = os.path.join(path, 'ground-truth') + DR_PATH = os.path.join(path, 'detection-results') + IMG_PATH = os.path.join(path, 
'images-optional') + TEMP_FILES_PATH = os.path.join(path, '.temp_files') + RESULTS_FILES_PATH = os.path.join(path, 'results') + + show_animation = True + if os.path.exists(IMG_PATH): + for dirpath, dirnames, files in os.walk(IMG_PATH): + if not files: + show_animation = False + else: + show_animation = False + + if not os.path.exists(TEMP_FILES_PATH): + os.makedirs(TEMP_FILES_PATH) + + if os.path.exists(RESULTS_FILES_PATH): + shutil.rmtree(RESULTS_FILES_PATH) + else: + os.makedirs(RESULTS_FILES_PATH) + if draw_plot: + try: + matplotlib.use('TkAgg') + except: + pass + os.makedirs(os.path.join(RESULTS_FILES_PATH, "AP")) + os.makedirs(os.path.join(RESULTS_FILES_PATH, "F1")) + os.makedirs(os.path.join(RESULTS_FILES_PATH, "Recall")) + os.makedirs(os.path.join(RESULTS_FILES_PATH, "Precision")) + if show_animation: + os.makedirs(os.path.join(RESULTS_FILES_PATH, "images", "detections_one_by_one")) + + ground_truth_files_list = glob.glob(GT_PATH + '/*.txt') + if len(ground_truth_files_list) == 0: + error("Error: No ground-truth files found!") + ground_truth_files_list.sort() + gt_counter_per_class = {} + counter_images_per_class = {} + + for txt_file in ground_truth_files_list: + file_id = txt_file.split(".txt", 1)[0] + file_id = os.path.basename(os.path.normpath(file_id)) + temp_path = os.path.join(DR_PATH, (file_id + ".txt")) + if not os.path.exists(temp_path): + error_msg = "Error. 
File not found: {}\n".format(temp_path) + error(error_msg) + lines_list = file_lines_to_list(txt_file) + bounding_boxes = [] + is_difficult = False + already_seen_classes = [] + for line in lines_list: + try: + if "difficult" in line: + class_name, left, top, right, bottom, _difficult = line.split() + is_difficult = True + else: + class_name, left, top, right, bottom = line.split() + except: + if "difficult" in line: + line_split = line.split() + _difficult = line_split[-1] + bottom = line_split[-2] + right = line_split[-3] + top = line_split[-4] + left = line_split[-5] + class_name = "" + for name in line_split[:-5]: + class_name += name + " " + class_name = class_name[:-1] + is_difficult = True + else: + line_split = line.split() + bottom = line_split[-1] + right = line_split[-2] + top = line_split[-3] + left = line_split[-4] + class_name = "" + for name in line_split[:-4]: + class_name += name + " " + class_name = class_name[:-1] + + bbox = left + " " + top + " " + right + " " + bottom + if is_difficult: + bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False, "difficult":True}) + is_difficult = False + else: + bounding_boxes.append({"class_name":class_name, "bbox":bbox, "used":False}) + if class_name in gt_counter_per_class: + gt_counter_per_class[class_name] += 1 + else: + gt_counter_per_class[class_name] = 1 + + if class_name not in already_seen_classes: + if class_name in counter_images_per_class: + counter_images_per_class[class_name] += 1 + else: + counter_images_per_class[class_name] = 1 + already_seen_classes.append(class_name) + + with open(TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json", 'w') as outfile: + json.dump(bounding_boxes, outfile) + + gt_classes = list(gt_counter_per_class.keys()) + gt_classes = sorted(gt_classes) + n_classes = len(gt_classes) + + dr_files_list = glob.glob(DR_PATH + '/*.txt') + dr_files_list.sort() + for class_index, class_name in enumerate(gt_classes): + bounding_boxes = [] + for txt_file in 
dr_files_list: + file_id = txt_file.split(".txt",1)[0] + file_id = os.path.basename(os.path.normpath(file_id)) + temp_path = os.path.join(GT_PATH, (file_id + ".txt")) + if class_index == 0: + if not os.path.exists(temp_path): + error_msg = "Error. File not found: {}\n".format(temp_path) + error(error_msg) + lines = file_lines_to_list(txt_file) + for line in lines: + try: + tmp_class_name, confidence, left, top, right, bottom = line.split() + except: + line_split = line.split() + bottom = line_split[-1] + right = line_split[-2] + top = line_split[-3] + left = line_split[-4] + confidence = line_split[-5] + tmp_class_name = "" + for name in line_split[:-5]: + tmp_class_name += name + " " + tmp_class_name = tmp_class_name[:-1] + + if tmp_class_name == class_name: + bbox = left + " " + top + " " + right + " " +bottom + bounding_boxes.append({"confidence":confidence, "file_id":file_id, "bbox":bbox}) + + bounding_boxes.sort(key=lambda x:float(x['confidence']), reverse=True) + with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile: + json.dump(bounding_boxes, outfile) + + sum_AP = 0.0 + ap_dictionary = {} + lamr_dictionary = {} + with open(RESULTS_FILES_PATH + "/results.txt", 'w') as results_file: + results_file.write("# AP and precision/recall per class\n") + count_true_positives = {} + + for class_index, class_name in enumerate(gt_classes): + count_true_positives[class_name] = 0 + dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json" + dr_data = json.load(open(dr_file)) + + nd = len(dr_data) + tp = [0] * nd + fp = [0] * nd + score = [0] * nd + score_threhold_idx = 0 + for idx, detection in enumerate(dr_data): + file_id = detection["file_id"] + score[idx] = float(detection["confidence"]) + if score[idx] >= score_threhold: + score_threhold_idx = idx + + if show_animation: + ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*") + if len(ground_truth_img) == 0: + error("Error. 
Image not found with id: " + file_id) + elif len(ground_truth_img) > 1: + error("Error. Multiple image with id: " + file_id) + else: + img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0]) + img_cumulative_path = RESULTS_FILES_PATH + "/images/" + ground_truth_img[0] + if os.path.isfile(img_cumulative_path): + img_cumulative = cv2.imread(img_cumulative_path) + else: + img_cumulative = img.copy() + bottom_border = 60 + BLACK = [0, 0, 0] + img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK) + + gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json" + ground_truth_data = json.load(open(gt_file)) + ovmax = -1 + gt_match = -1 + bb = [float(x) for x in detection["bbox"].split()] + for obj in ground_truth_data: + if obj["class_name"] == class_name: + bbgt = [ float(x) for x in obj["bbox"].split() ] + bi = [max(bb[0],bbgt[0]), max(bb[1],bbgt[1]), min(bb[2],bbgt[2]), min(bb[3],bbgt[3])] + iw = bi[2] - bi[0] + 1 + ih = bi[3] - bi[1] + 1 + if iw > 0 and ih > 0: + ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0] + + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih + ov = iw * ih / ua + if ov > ovmax: + ovmax = ov + gt_match = obj + + if show_animation: + status = "NO MATCH FOUND!" + + min_overlap = MINOVERLAP + if ovmax >= min_overlap: + if "difficult" not in gt_match: + if not bool(gt_match["used"]): + tp[idx] = 1 + gt_match["used"] = True + count_true_positives[class_name] += 1 + with open(gt_file, 'w') as f: + f.write(json.dumps(ground_truth_data)) + if show_animation: + status = "MATCH!" + else: + fp[idx] = 1 + if show_animation: + status = "REPEATED MATCH!" 
+ else: + fp[idx] = 1 + if ovmax > 0: + status = "INSUFFICIENT OVERLAP" + + """ + Draw image to show animation + """ + if show_animation: + height, widht = img.shape[:2] + white = (255,255,255) + light_blue = (255,200,100) + green = (0,255,0) + light_red = (30,30,255) + margin = 10 + # 1nd line + v_pos = int(height - margin - (bottom_border / 2.0)) + text = "Image: " + ground_truth_img[0] + " " + img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) + text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " " + img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width) + if ovmax != -1: + color = light_red + if status == "INSUFFICIENT OVERLAP": + text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100) + else: + text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100) + color = green + img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) + # 2nd line + v_pos += int(bottom_border / 2.0) + rank_pos = str(idx+1) + text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100) + img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0) + color = light_red + if status == "MATCH!": + color = green + text = "Result: " + status + " " + img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width) + + font = cv2.FONT_HERSHEY_SIMPLEX + if ovmax > 0: + bbgt = [ int(round(float(x))) for x in gt_match["bbox"].split() ] + cv2.rectangle(img,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) + cv2.rectangle(img_cumulative,(bbgt[0],bbgt[1]),(bbgt[2],bbgt[3]),light_blue,2) + cv2.putText(img_cumulative, class_name, (bbgt[0],bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA) + bb = [int(i) for i in bb] + cv2.rectangle(img,(bb[0],bb[1]),(bb[2],bb[3]),color,2) + 
cv2.rectangle(img_cumulative,(bb[0],bb[1]),(bb[2],bb[3]),color,2) + cv2.putText(img_cumulative, class_name, (bb[0],bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA) + + cv2.imshow("Animation", img) + cv2.waitKey(20) + output_img_path = RESULTS_FILES_PATH + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg" + cv2.imwrite(output_img_path, img) + cv2.imwrite(img_cumulative_path, img_cumulative) + + cumsum = 0 + for idx, val in enumerate(fp): + fp[idx] += cumsum + cumsum += val + + cumsum = 0 + for idx, val in enumerate(tp): + tp[idx] += cumsum + cumsum += val + + rec = tp[:] + for idx, val in enumerate(tp): + rec[idx] = float(tp[idx]) / np.maximum(gt_counter_per_class[class_name], 1) + + prec = tp[:] + for idx, val in enumerate(tp): + prec[idx] = float(tp[idx]) / np.maximum((fp[idx] + tp[idx]), 1) + + ap, mrec, mprec = voc_ap(rec[:], prec[:]) + F1 = np.array(rec)*np.array(prec)*2 / np.where((np.array(prec)+np.array(rec))==0, 1, (np.array(prec)+np.array(rec))) + + sum_AP += ap + text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP " #class_name + " AP = {0:.2f}%".format(ap*100) + + if len(prec)>0: + F1_text = "{0:.2f}".format(F1[score_threhold_idx]) + " = " + class_name + " F1 " + Recall_text = "{0:.2f}%".format(rec[score_threhold_idx]*100) + " = " + class_name + " Recall " + Precision_text = "{0:.2f}%".format(prec[score_threhold_idx]*100) + " = " + class_name + " Precision " + else: + F1_text = "0.00" + " = " + class_name + " F1 " + Recall_text = "0.00%" + " = " + class_name + " Recall " + Precision_text = "0.00%" + " = " + class_name + " Precision " + + rounded_prec = [ '%.2f' % elem for elem in prec ] + rounded_rec = [ '%.2f' % elem for elem in rec ] + results_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n") + + if len(prec)>0: + print(text + "\t||\tscore_threhold=" + str(score_threhold) + " : " + "F1=" + "{0:.2f}".format(F1[score_threhold_idx])\ + + " ; Recall=" + 
"{0:.2f}%".format(rec[score_threhold_idx]*100) + " ; Precision=" + "{0:.2f}%".format(prec[score_threhold_idx]*100)) + else: + print(text + "\t||\tscore_threhold=" + str(score_threhold) + " : " + "F1=0.00% ; Recall=0.00% ; Precision=0.00%") + ap_dictionary[class_name] = ap + + n_images = counter_images_per_class[class_name] + lamr, mr, fppi = log_average_miss_rate(np.array(rec), np.array(fp), n_images) + lamr_dictionary[class_name] = lamr + + if draw_plot: + plt.plot(rec, prec, '-o') + area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]] + area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]] + plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r') + + fig = plt.gcf() + fig.canvas.set_window_title('AP ' + class_name) + + plt.title('class: ' + text) + plt.xlabel('Recall') + plt.ylabel('Precision') + axes = plt.gca() + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) + fig.savefig(RESULTS_FILES_PATH + "/AP/" + class_name + ".png") + plt.cla() + + plt.plot(score, F1, "-", color='orangered') + plt.title('class: ' + F1_text + "\nscore_threhold=" + str(score_threhold)) + plt.xlabel('Score_Threhold') + plt.ylabel('F1') + axes = plt.gca() + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) + fig.savefig(RESULTS_FILES_PATH + "/F1/" + class_name + ".png") + plt.cla() + + plt.plot(score, rec, "-H", color='gold') + plt.title('class: ' + Recall_text + "\nscore_threhold=" + str(score_threhold)) + plt.xlabel('Score_Threhold') + plt.ylabel('Recall') + axes = plt.gca() + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) + fig.savefig(RESULTS_FILES_PATH + "/Recall/" + class_name + ".png") + plt.cla() + + plt.plot(score, prec, "-s", color='palevioletred') + plt.title('class: ' + Precision_text + "\nscore_threhold=" + str(score_threhold)) + plt.xlabel('Score_Threhold') + plt.ylabel('Precision') + axes = plt.gca() + axes.set_xlim([0.0,1.0]) + axes.set_ylim([0.0,1.05]) + fig.savefig(RESULTS_FILES_PATH + "/Precision/" + class_name + ".png") + 
plt.cla() + + if show_animation: + cv2.destroyAllWindows() + if n_classes == 0: + print("未检测到任何种类,请检查标签信息与get_map.py中的classes_path是否修改。") + return 0 + results_file.write("\n# mAP of all classes\n") + mAP = sum_AP / n_classes + text = "mAP = {0:.2f}%".format(mAP*100) + results_file.write(text + "\n") + print(text) + + shutil.rmtree(TEMP_FILES_PATH) + + """ + Count total of detection-results + """ + det_counter_per_class = {} + for txt_file in dr_files_list: + lines_list = file_lines_to_list(txt_file) + for line in lines_list: + class_name = line.split()[0] + if class_name in det_counter_per_class: + det_counter_per_class[class_name] += 1 + else: + det_counter_per_class[class_name] = 1 + dr_classes = list(det_counter_per_class.keys()) + + """ + Write number of ground-truth objects per class to results.txt + """ + with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file: + results_file.write("\n# Number of ground-truth objects per class\n") + for class_name in sorted(gt_counter_per_class): + results_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n") + + """ + Finish counting true positives + """ + for class_name in dr_classes: + if class_name not in gt_classes: + count_true_positives[class_name] = 0 + + """ + Write number of detected objects per class to results.txt + """ + with open(RESULTS_FILES_PATH + "/results.txt", 'a') as results_file: + results_file.write("\n# Number of detected objects per class\n") + for class_name in sorted(dr_classes): + n_det = det_counter_per_class[class_name] + text = class_name + ": " + str(n_det) + text += " (tp:" + str(count_true_positives[class_name]) + "" + text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n" + results_file.write(text) + + """ + Plot the total number of occurences of each class in the ground-truth + """ + if draw_plot: + window_title = "ground-truth-info" + plot_title = "ground-truth\n" + plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + 
str(n_classes) + " classes)" + x_label = "Number of objects per class" + output_path = RESULTS_FILES_PATH + "/ground-truth-info.png" + to_show = False + plot_color = 'forestgreen' + draw_plot_func( + gt_counter_per_class, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + '', + ) + + # """ + # Plot the total number of occurences of each class in the "detection-results" folder + # """ + # if draw_plot: + # window_title = "detection-results-info" + # # Plot title + # plot_title = "detection-results\n" + # plot_title += "(" + str(len(dr_files_list)) + " files and " + # count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in list(det_counter_per_class.values())) + # plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)" + # # end Plot title + # x_label = "Number of objects per class" + # output_path = RESULTS_FILES_PATH + "/detection-results-info.png" + # to_show = False + # plot_color = 'forestgreen' + # true_p_bar = count_true_positives + # draw_plot_func( + # det_counter_per_class, + # len(det_counter_per_class), + # window_title, + # plot_title, + # x_label, + # output_path, + # to_show, + # plot_color, + # true_p_bar + # ) + + """ + Draw log-average miss rate plot (Show lamr of all classes in decreasing order) + """ + if draw_plot: + window_title = "lamr" + plot_title = "log-average miss rate" + x_label = "log-average miss rate" + output_path = RESULTS_FILES_PATH + "/lamr.png" + to_show = False + plot_color = 'royalblue' + draw_plot_func( + lamr_dictionary, + n_classes, + window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + "" + ) + + """ + Draw mAP plot (Show AP's of all classes in decreasing order) + """ + if draw_plot: + window_title = "mAP" + plot_title = "mAP = {0:.2f}%".format(mAP*100) + x_label = "Average Precision" + output_path = RESULTS_FILES_PATH + "/mAP.png" + to_show = True + plot_color = 'royalblue' + draw_plot_func( + ap_dictionary, + n_classes, + 
window_title, + plot_title, + x_label, + output_path, + to_show, + plot_color, + "" + ) + return mAP + +def preprocess_gt(gt_path, class_names): + image_ids = os.listdir(gt_path) + results = {} + + images = [] + bboxes = [] + for i, image_id in enumerate(image_ids): + lines_list = file_lines_to_list(os.path.join(gt_path, image_id)) + boxes_per_image = [] + image = {} + image_id = os.path.splitext(image_id)[0] + image['file_name'] = image_id + '.jpg' + image['width'] = 1 + image['height'] = 1 + #-----------------------------------------------------------------# + # 感谢 多学学英语吧 的提醒 + # 解决了'Results do not correspond to current coco set'问题 + #-----------------------------------------------------------------# + image['id'] = str(image_id) + + for line in lines_list: + difficult = 0 + if "difficult" in line: + line_split = line.split() + left, top, right, bottom, _difficult = line_split[-5:] + class_name = "" + for name in line_split[:-5]: + class_name += name + " " + class_name = class_name[:-1] + difficult = 1 + else: + line_split = line.split() + left, top, right, bottom = line_split[-4:] + class_name = "" + for name in line_split[:-4]: + class_name += name + " " + class_name = class_name[:-1] + + left, top, right, bottom = float(left), float(top), float(right), float(bottom) + if class_name not in class_names: + continue + cls_id = class_names.index(class_name) + 1 + bbox = [left, top, right - left, bottom - top, difficult, str(image_id), cls_id, (right - left) * (bottom - top) - 10.0] + boxes_per_image.append(bbox) + images.append(image) + bboxes.extend(boxes_per_image) + results['images'] = images + + categories = [] + for i, cls in enumerate(class_names): + category = {} + category['supercategory'] = cls + category['name'] = cls + category['id'] = i + 1 + categories.append(category) + results['categories'] = categories + + annotations = [] + for i, box in enumerate(bboxes): + annotation = {} + annotation['area'] = box[-1] + annotation['category_id'] = box[-2] + 
annotation['image_id'] = box[-3] + annotation['iscrowd'] = box[-4] + annotation['bbox'] = box[:4] + annotation['id'] = i + annotations.append(annotation) + results['annotations'] = annotations + return results + +def preprocess_dr(dr_path, class_names): + image_ids = os.listdir(dr_path) + results = [] + for image_id in image_ids: + lines_list = file_lines_to_list(os.path.join(dr_path, image_id)) + image_id = os.path.splitext(image_id)[0] + for line in lines_list: + line_split = line.split() + confidence, left, top, right, bottom = line_split[-5:] + class_name = "" + for name in line_split[:-5]: + class_name += name + " " + class_name = class_name[:-1] + left, top, right, bottom = float(left), float(top), float(right), float(bottom) + result = {} + result["image_id"] = str(image_id) + if class_name not in class_names: + continue + result["category_id"] = class_names.index(class_name) + 1 + result["bbox"] = [left, top, right - left, bottom - top] + result["score"] = float(confidence) + results.append(result) + return results + +def get_coco_map(class_names, path): + GT_PATH = os.path.join(path, 'ground-truth') + DR_PATH = os.path.join(path, 'detection-results') + COCO_PATH = os.path.join(path, 'coco_eval') + + if not os.path.exists(COCO_PATH): + os.makedirs(COCO_PATH) + + GT_JSON_PATH = os.path.join(COCO_PATH, 'instances_gt.json') + DR_JSON_PATH = os.path.join(COCO_PATH, 'instances_dr.json') + + with open(GT_JSON_PATH, "w") as f: + results_gt = preprocess_gt(GT_PATH, class_names) + json.dump(results_gt, f, indent=4) + + with open(DR_JSON_PATH, "w") as f: + results_dr = preprocess_dr(DR_PATH, class_names) + json.dump(results_dr, f, indent=4) + if len(results_dr) == 0: + print("未检测到任何目标。") + return [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + cocoGt = COCO(GT_JSON_PATH) + cocoDt = cocoGt.loadRes(DR_JSON_PATH) + cocoEval = COCOeval(cocoGt, cocoDt, 'bbox') + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + + return cocoEval.stats \ No newline at end of 
file diff --git a/app/core/yolo_detect/utils_coco/__init__.py b/app/core/yolo_detect/utils_coco/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/core/yolo_detect/utils_coco/coco_annotation.py b/app/core/yolo_detect/utils_coco/coco_annotation.py new file mode 100644 index 0000000..50f022f --- /dev/null +++ b/app/core/yolo_detect/utils_coco/coco_annotation.py @@ -0,0 +1,117 @@ +#-------------------------------------------------------# +# 用于处理COCO数据集,根据json文件生成txt文件用于训练 +#-------------------------------------------------------# +import json +import os +from collections import defaultdict + +#-------------------------------------------------------# +# 指向了COCO训练集与验证集图片的路径 +#-------------------------------------------------------# +train_datasets_path = "coco_dataset/train2017" +val_datasets_path = "coco_dataset/val2017" + +#-------------------------------------------------------# +# 指向了COCO训练集与验证集标签的路径 +#-------------------------------------------------------# +train_annotation_path = "coco_dataset/annotations/instances_train2017.json" +val_annotation_path = "coco_dataset/annotations/instances_val2017.json" + +#-------------------------------------------------------# +# 生成的txt文件路径 +#-------------------------------------------------------# +train_output_path = "coco_train.txt" +val_output_path = "coco_val.txt" + +if __name__ == "__main__": + name_box_id = defaultdict(list) + id_name = dict() + f = open(train_annotation_path, encoding='utf-8') + data = json.load(f) + + annotations = data['annotations'] + for ant in annotations: + id = ant['image_id'] + name = os.path.join(train_datasets_path, '%012d.jpg' % id) + cat = ant['category_id'] + if cat >= 1 and cat <= 11: + cat = cat - 1 + elif cat >= 13 and cat <= 25: + cat = cat - 2 + elif cat >= 27 and cat <= 28: + cat = cat - 3 + elif cat >= 31 and cat <= 44: + cat = cat - 5 + elif cat >= 46 and cat <= 65: + cat = cat - 6 + elif cat == 67: + cat = cat - 7 + elif cat == 70: + cat = cat - 9 + elif cat 
>= 72 and cat <= 82: + cat = cat - 10 + elif cat >= 84 and cat <= 90: + cat = cat - 11 + name_box_id[name].append([ant['bbox'], cat]) + + f = open(train_output_path, 'w') + for key in name_box_id.keys(): + f.write(key) + box_infos = name_box_id[key] + for info in box_infos: + x_min = int(info[0][0]) + y_min = int(info[0][1]) + x_max = x_min + int(info[0][2]) + y_max = y_min + int(info[0][3]) + + box_info = " %d,%d,%d,%d,%d" % ( + x_min, y_min, x_max, y_max, int(info[1])) + f.write(box_info) + f.write('\n') + f.close() + + name_box_id = defaultdict(list) + id_name = dict() + f = open(val_annotation_path, encoding='utf-8') + data = json.load(f) + + annotations = data['annotations'] + for ant in annotations: + id = ant['image_id'] + name = os.path.join(val_datasets_path, '%012d.jpg' % id) + cat = ant['category_id'] + if cat >= 1 and cat <= 11: + cat = cat - 1 + elif cat >= 13 and cat <= 25: + cat = cat - 2 + elif cat >= 27 and cat <= 28: + cat = cat - 3 + elif cat >= 31 and cat <= 44: + cat = cat - 5 + elif cat >= 46 and cat <= 65: + cat = cat - 6 + elif cat == 67: + cat = cat - 7 + elif cat == 70: + cat = cat - 9 + elif cat >= 72 and cat <= 82: + cat = cat - 10 + elif cat >= 84 and cat <= 90: + cat = cat - 11 + name_box_id[name].append([ant['bbox'], cat]) + + f = open(val_output_path, 'w') + for key in name_box_id.keys(): + f.write(key) + box_infos = name_box_id[key] + for info in box_infos: + x_min = int(info[0][0]) + y_min = int(info[0][1]) + x_max = x_min + int(info[0][2]) + y_max = y_min + int(info[0][3]) + + box_info = " %d,%d,%d,%d,%d" % ( + x_min, y_min, x_max, y_max, int(info[1])) + f.write(box_info) + f.write('\n') + f.close() diff --git a/app/core/yolo_detect/utils_coco/get_map_coco.py b/app/core/yolo_detect/utils_coco/get_map_coco.py new file mode 100644 index 0000000..0392192 --- /dev/null +++ b/app/core/yolo_detect/utils_coco/get_map_coco.py @@ -0,0 +1,113 @@ +import json +import os + +import numpy as np +import torch +from PIL import Image +from 
class mAP_YOLO(YOLO):
    """YOLO variant that records detections as COCO-style result entries."""

    def detect_image(self, image_id, image, results, clsid2catid):
        """Run detection on one image and append its boxes to *results*.

        Args:
            image_id: COCO image id; stored as ``int(image_id)`` in each entry.
            image: Input image (converted to RGB before inference).
            results: List accumulating result dicts across images; it is
                extended in place and also returned.
            clsid2catid: Sequence mapping the model's class index to the COCO
                category id.

        Returns:
            The same *results* list. When nothing is detected the list is
            returned unchanged, so the caller can keep accumulating into it.
        """
        # Height and width of the original image, needed to map boxes back.
        image_shape = np.array(np.shape(image)[0:2])
        # Force RGB so grayscale inputs do not break the network.
        image = cvtColor(image)
        # Resize to the network input size (optionally letterboxed).
        image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
        # h, w, 3 => 3, h, w => 1, 3, h, w
        image_data = np.expand_dims(
            np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)

        with torch.no_grad():
            images = torch.from_numpy(image_data)
            if self.cuda:
                images = images.cuda()
            outputs = self.net(images)
            outputs = self.bbox_util.decode_box(outputs)
            # Stack the predictions and apply non-maximum suppression.
            outputs = self.bbox_util.non_max_suppression(
                outputs, self.num_classes, self.input_shape,
                image_shape, self.letterbox_image,
                conf_thres=self.confidence, nms_thres=self.nms_iou)

        # No detections for this image: hand back the accumulator untouched.
        # (Returning `outputs` here would replace the caller's list with
        # [None] and discard every detection collected so far.)
        if outputs[0] is None:
            return results

        top_label = np.array(outputs[0][:, 5], dtype='int32')
        top_conf = outputs[0][:, 4]
        top_boxes = outputs[0][:, :4]

        for i, c in enumerate(top_label):
            top, left, bottom, right = top_boxes[i]
            results.append({
                "image_id": int(image_id),
                "category_id": clsid2catid[c],
                # Stored as [x, y, width, height].
                "bbox": [float(left), float(top), float(right - left), float(bottom - top)],
                "score": float(top_conf[i]),
            })
        return results
cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + print("Get map done.") diff --git a/app/core/yolo_detect/voc_annotation.py b/app/core/yolo_detect/voc_annotation.py new file mode 100644 index 0000000..bccbb84 --- /dev/null +++ b/app/core/yolo_detect/voc_annotation.py @@ -0,0 +1,153 @@ +import os +import random +import xml.etree.ElementTree as ET + +import numpy as np + +from utils.utils import get_classes + +#--------------------------------------------------------------------------------------------------------------------------------# +# annotation_mode用于指定该文件运行时计算的内容 +# annotation_mode为0代表整个标签处理过程,包括获得VOCdevkit/VOC2007/ImageSets里面的txt以及训练用的2007_train.txt、2007_val.txt +# annotation_mode为1代表获得VOCdevkit/VOC2007/ImageSets里面的txt +# annotation_mode为2代表获得训练用的2007_train.txt、2007_val.txt +#--------------------------------------------------------------------------------------------------------------------------------# +annotation_mode = 0 +#-------------------------------------------------------------------# +# 必须要修改,用于生成2007_train.txt、2007_val.txt的目标信息 +# 与训练和预测所用的classes_path一致即可 +# 如果生成的2007_train.txt里面没有目标信息 +# 那么就是因为classes没有设定正确 +# 仅在annotation_mode为0和2的时候有效 +#-------------------------------------------------------------------# +classes_path = 'model_data/voc_classes.txt' +#--------------------------------------------------------------------------------------------------------------------------------# +# trainval_percent用于指定(训练集+验证集)与测试集的比例,默认情况下 (训练集+验证集):测试集 = 9:1 +# train_percent用于指定(训练集+验证集)中训练集与验证集的比例,默认情况下 训练集:验证集 = 9:1 +# 仅在annotation_mode为0和1的时候有效 +#--------------------------------------------------------------------------------------------------------------------------------# +trainval_percent = 0.9 +train_percent = 0.9 +#-------------------------------------------------------# +# 指向VOC数据集所在的文件夹 +# 默认指向根目录下的VOC数据集 +#-------------------------------------------------------# +VOCdevkit_path = 'VOCdevkit' + +VOCdevkit_sets = [('2007', 
def convert_annotation(year, image_id, list_file):
    """Append the usable boxes of one VOC annotation to *list_file*.

    Reads ``VOC<year>/Annotations/<image_id>.xml`` under ``VOCdevkit_path`` and,
    for every object whose class is listed in ``classes`` and that is not
    flagged difficult, writes `` xmin,ymin,xmax,ymax,class_id`` to *list_file*
    and increments the per-class counter in ``nums``.

    Args:
        year: Dataset year used to build the ``VOC<year>`` directory name.
        image_id: Annotation file name without the ``.xml`` extension.
        list_file: Writable text stream receiving the box descriptions.
    """
    xml_path = os.path.join(VOCdevkit_path, 'VOC%s/Annotations/%s.xml' % (year, image_id))
    # Close the annotation file as soon as it is parsed; the original left one
    # handle open per image.
    with open(xml_path, encoding='utf-8') as in_file:
        root = ET.parse(in_file).getroot()

    for obj in root.iter('object'):
        difficult_node = obj.find('difficult')
        # A missing <difficult> tag counts as "not difficult".
        difficult = difficult_node.text if difficult_node is not None else 0
        cls = obj.find('name').text
        if cls not in classes or int(difficult) == 1:
            continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        # Coordinates may be written as floats; truncate them to integers.
        b = tuple(
            int(float(xmlbox.find(tag).text))
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        )
        list_file.write(" " + ",".join(str(a) for a in b) + ',' + str(cls_id))

        nums[cls_id] = nums[cls_id] + 1
fval = open(os.path.join(saveBasePath,'val.txt'), 'w') + + for i in list: + name=total_xml[i][:-4]+'\n' + if i in trainval: + ftrainval.write(name) + if i in train: + ftrain.write(name) + else: + fval.write(name) + else: + ftest.write(name) + + ftrainval.close() + ftrain.close() + fval.close() + ftest.close() + print("Generate txt in ImageSets done.") + + if annotation_mode == 0 or annotation_mode == 2: + print("Generate 2007_train.txt and 2007_val.txt for train.") + type_index = 0 + for year, image_set in VOCdevkit_sets: + image_ids = open(os.path.join(VOCdevkit_path, 'VOC%s/ImageSets/Main/%s.txt'%(year, image_set)), encoding='utf-8').read().strip().split() + list_file = open('%s_%s.txt'%(year, image_set), 'w', encoding='utf-8') + for image_id in image_ids: + list_file.write('%s/VOC%s/JPEGImages/%s.jpg'%(os.path.abspath(VOCdevkit_path), year, image_id)) + + convert_annotation(year, image_id, list_file) + list_file.write('\n') + photo_nums[type_index] = len(image_ids) + type_index += 1 + list_file.close() + print("Generate 2007_train.txt and 2007_val.txt for train done.") + + def printTable(List1, List2): + for i in range(len(List1[0])): + print("|", end=' ') + for j in range(len(List1)): + print(List1[j][i].rjust(int(List2[j])), end=' ') + print("|", end=' ') + print() + + str_nums = [str(int(x)) for x in nums] + tableData = [ + classes, str_nums + ] + colWidths = [0]*len(tableData) + len1 = 0 + for i in range(len(tableData)): + for j in range(len(tableData[i])): + if len(tableData[i][j]) > colWidths[i]: + colWidths[i] = len(tableData[i][j]) + printTable(tableData, colWidths) + + if photo_nums[0] <= 500: + print("训练集数量小于500,属于较小的数据量,请注意设置较大的训练世代(Epoch)以满足足够的梯度下降次数(Step)。") + + if np.sum(nums) == 0: + print("在数据集中并未获得任何目标,请注意修改classes_path对应自己的数据集,并且保证标签名字正确,否则训练将会没有任何效果!") + print("在数据集中并未获得任何目标,请注意修改classes_path对应自己的数据集,并且保证标签名字正确,否则训练将会没有任何效果!") + print("在数据集中并未获得任何目标,请注意修改classes_path对应自己的数据集,并且保证标签名字正确,否则训练将会没有任何效果!") + print("(重要的事情说三遍)。") diff --git 
class YOLO(object):
    """YOLOv8 detector wrapper: loads weights, runs inference, renders results.

    When predicting with your own trained model, ``model_path`` and
    ``classes_path`` must be changed to match it. Any key of ``_defaults`` can
    be overridden per instance through keyword arguments to the constructor.
    """

    _defaults = {
        # Weights file and class-name list. After training, several weight
        # files exist in the logs folder; pick one with a low validation loss
        # (a low validation loss does not guarantee a high mAP). On a shape
        # mismatch, check the model_path / classes_path used during training.
        "model_path"        : 'app/core/yolo_detect/model_data/best_epoch_weights.pth',
        "classes_path"      : 'app/core/yolo_detect/model_data/voc_classes.txt',
        # Input image size; must be a multiple of 32.
        "input_shape"       : [640, 640],
        # YOLOv8 variant: n, s, m, l or x.
        "phi"               : 'l',
        # Only boxes scoring above this confidence are kept.
        "confidence"        : 0.5,
        # IoU threshold used by non-maximum suppression.
        "nms_iou"           : 0.3,
        # Whether to letterbox (distortion-free resize) the input. A plain
        # resize was found to work better in repeated tests.
        "letterbox_image"   : False,
        # Whether to use CUDA; set to False when no GPU is available.
        "cuda"              : True,
    }

    @classmethod
    def get_defaults(cls, n):
        """Return the class-level default for *n*, or an error string if unknown."""
        if n in cls._defaults:
            return cls._defaults[n]
        else:
            return "Unrecognized attribute name '" + n + "'"

    @classmethod
    def _merged_config(cls, overrides):
        """Return a new dict of the class defaults updated with *overrides*."""
        config = dict(cls._defaults)
        config.update(overrides)
        return config

    def __init__(self, **kwargs):
        """Build the detector.

        Args:
            **kwargs: Per-instance overrides for any key of ``_defaults``.
        """
        # Keep the effective configuration on the instance. The original wrote
        # the overrides into the class-level dict, so they leaked into every
        # instance (and subclass) created afterwards.
        self._defaults = self._merged_config(kwargs)
        self.__dict__.update(self._defaults)

        # Class names and their count.
        self.class_names, self.num_classes = get_classes(self.classes_path)
        self.bbox_util = DecodeBox(self.num_classes, (self.input_shape[0], self.input_shape[1]))

        # One distinct drawing colour per class, evenly spaced in hue.
        hsv_tuples = [(x / self.num_classes, 1., 1.) for x in range(self.num_classes)]
        rgb_colors = [colorsys.hsv_to_rgb(*hsv) for hsv in hsv_tuples]
        self.colors = [(int(r * 255), int(g * 255), int(b * 255)) for r, g, b in rgb_colors]
        self.generate()

        show_config(**self._defaults)

    def generate(self, onnx=False):
        """Create the network and load its weights.

        Args:
            onnx: When True the network is left as built (no DataParallel
                wrapping, no move to GPU) so it can be exported.
        """
        self.net = YoloBody(self.input_shape, self.num_classes, self.phi)

        cuda_available = torch.cuda.is_available()
        device = torch.device('cuda' if cuda_available else 'cpu')
        self.net.load_state_dict(torch.load(self.model_path, map_location=device))
        self.net = self.net.fuse().eval()
        print('{} model, and classes loaded.'.format(self.model_path))

        # The weights are already loaded for whichever device exists; make the
        # inference path agree instead of failing on .cuda() on a CPU host.
        if self.cuda and not cuda_available:
            print('CUDA requested but not available, falling back to CPU.')
            self.cuda = False

        if not onnx:
            if self.cuda:
                self.net = nn.DataParallel(self.net)
                self.net = self.net.cuda()

    def _to_tensor(self, image):
        """Return ``(rgb_image, batch)`` where *batch* is the network input.

        The image is converted to RGB (so grayscale inputs work), resized to
        ``input_shape`` (letterboxed if configured), normalised and laid out as
        h, w, 3 => 3, h, w => 1, 3, h, w.
        """
        image = cvtColor(image)
        image_data = resize_image(image, (self.input_shape[1], self.input_shape[0]), self.letterbox_image)
        image_data = np.expand_dims(
            np.transpose(preprocess_input(np.array(image_data, dtype='float32')), (2, 0, 1)), 0)
        images = torch.from_numpy(image_data)
        if self.cuda:
            images = images.cuda()
        return image, images

    def _predict(self, images, image_shape):
        """Run the network, decode the boxes and apply non-maximum suppression."""
        with torch.no_grad():
            outputs = self.net(images)
            outputs = self.bbox_util.decode_box(outputs)
            return self.bbox_util.non_max_suppression(
                outputs, self.num_classes, self.input_shape,
                image_shape, self.letterbox_image,
                conf_thres=self.confidence, nms_thres=self.nms_iou)

    @staticmethod
    def _clip_box(box, size):
        """Floor a (top, left, bottom, right) box and clip it to *size* (w, h)."""
        top, left, bottom, right = box
        top = max(0, np.floor(top).astype('int32'))
        left = max(0, np.floor(left).astype('int32'))
        bottom = min(size[1], np.floor(bottom).astype('int32'))
        right = min(size[0], np.floor(right).astype('int32'))
        return top, left, bottom, right

    def _load_font(self, size):
        """Load the label font at *size*.

        Tries the original working-directory-relative location first, then the
        directory holding ``classes_path`` (the other model assets were moved
        there). If neither file exists the original path is passed through
        unchanged so the behaviour and error are the same as before.
        """
        candidates = (
            'model_data/simhei.ttf',
            os.path.join(os.path.dirname(self.classes_path), 'simhei.ttf'),
        )
        for path in candidates:
            if os.path.exists(path):
                return ImageFont.truetype(font=path, size=size)
        return ImageFont.truetype(font=candidates[0], size=size)

    def detect_image(self, image, crop=False, count=False):
        """Detect objects and draw them on the image.

        Args:
            image: Input PIL image.
            crop: When True, save every detected region under ``img_crop/``.
            count: When True, print the number of detections per class.

        Returns:
            ``(image, predicted_class_list)``: the RGB image with boxes and
            labels drawn on it, and the class name of each detection. The list
            is empty when nothing is detected.
        """
        # Height and width of the original image.
        image_shape = np.array(np.shape(image)[0:2])
        image, images = self._to_tensor(image)
        results = self._predict(images, image_shape)

        if results[0] is None:
            return image, []

        top_label = np.array(results[0][:, 5], dtype='int32')
        top_conf = results[0][:, 4]
        top_boxes = results[0][:, :4]

        # Font size and border thickness scale with the image size.
        font = self._load_font(int(np.floor(3e-2 * image.size[1] + 0.5)))
        thickness = int(max((image.size[0] + image.size[1]) // np.mean(self.input_shape), 1))

        # Per-class counting.
        if count:
            print("top_label:", top_label)
            classes_nums = np.zeros([self.num_classes])
            for i in range(self.num_classes):
                num = np.sum(top_label == i)
                if num > 0:
                    print(self.class_names[i], " : ", num)
                classes_nums[i] = num
            print("classes_nums:", classes_nums)

        # Optional cropping of every detected region.
        if crop:
            dir_save_path = "img_crop"
            for i, box in enumerate(top_boxes):
                top, left, bottom, right = self._clip_box(box, image.size)
                os.makedirs(dir_save_path, exist_ok=True)
                crop_image = image.crop([left, top, right, bottom])
                crop_image.save(os.path.join(dir_save_path, "crop_" + str(i) + ".png"), quality=95, subsampling=0)
                print("save crop_" + str(i) + ".png to " + dir_save_path)

        # Draw boxes and labels.
        predicted_class_list = []
        draw = ImageDraw.Draw(image)
        for i, c in enumerate(top_label):
            predicted_class = self.class_names[int(c)]
            score = top_conf[i]
            top, left, bottom, right = self._clip_box(top_boxes[i], image.size)

            label = '{} {:.2f}'.format(predicted_class, score)
            predicted_class_list.append(predicted_class)

            # textbbox replaces the removed ImageDraw.textsize API.
            bbox = draw.textbbox((0, 0), label, font=font)
            label_size = (bbox[2] - bbox[0], bbox[3] - bbox[1])

            print(label.encode('utf-8'), top, left, bottom, right)

            # Put the label above the box when there is room, otherwise inside.
            if top - label_size[1] >= 0:
                text_origin = np.array([left, top - label_size[1]])
            else:
                text_origin = np.array([left, top + 1])

            color = self.colors[c]
            for offset in range(thickness):
                draw.rectangle([left + offset, top + offset, right - offset, bottom - offset], outline=color)
            draw.rectangle([tuple(text_origin), tuple(text_origin + label_size)], fill=color)
            draw.text(text_origin, label, fill=(0, 0, 0), font=font)
        del draw

        return image, predicted_class_list

    def get_FPS(self, image, test_interval):
        """Return the mean seconds per inference over *test_interval* runs.

        One untimed warm-up inference is performed first.
        """
        image_shape = np.array(np.shape(image)[0:2])
        image, images = self._to_tensor(image)

        # Warm-up run, excluded from the measurement.
        self._predict(images, image_shape)

        # perf_counter is monotonic and high resolution, unlike time.time().
        t1 = time.perf_counter()
        for _ in range(test_interval):
            self._predict(images, image_shape)
        t2 = time.perf_counter()

        tact_time = (t2 - t1) / test_interval
        return tact_time

    def detect_heatmap(self, image, heatmap_save_path):
        """Overlay the per-pixel maximum class score on the image and save it.

        Args:
            image: Input PIL image.
            heatmap_save_path: Destination path for the rendered figure.
        """
        import cv2
        import matplotlib.pyplot as plt

        def sigmoid(x):
            return 1.0 / (1.0 + np.exp(-x))

        image, images = self._to_tensor(image)

        with torch.no_grad():
            dbox, cls, x, anchors, strides = self.net(images)
            # Keep only the trailing num_classes channels of each output.
            outputs = [xi.split((xi.size()[1] - self.num_classes, self.num_classes), 1)[1] for xi in x]

        plt.imshow(image, alpha=1)
        plt.axis('off')
        mask = np.zeros((image.size[1], image.size[0]))
        for sub_output in outputs:
            sub_output = sub_output.cpu().numpy()
            b, c, h, w = np.shape(sub_output)
            sub_output = np.transpose(np.reshape(sub_output, [b, -1, h, w]), [0, 2, 3, 1])[0]
            score = np.max(sigmoid(sub_output[..., :]), -1)
            score = cv2.resize(score, (image.size[0], image.size[1]))
            normed_score = (score * 255).astype('uint8')
            # Merge the outputs by keeping the strongest response per pixel.
            mask = np.maximum(mask, normed_score)

        plt.imshow(mask, alpha=0.5, interpolation='nearest', cmap="jet")

        plt.axis('off')
        plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
        plt.margins(0, 0)
        plt.savefig(heatmap_save_path, dpi=200, bbox_inches='tight', pad_inches=-0.1)
        print("Save to the " + heatmap_save_path)
        plt.show()

    def convert_to_onnx(self, simplify, model_path):
        """Export the network to ONNX at *model_path*.

        Args:
            simplify: When True, run onnx-simplifier on the exported model.
            model_path: Output path of the ONNX file.
        """
        import onnx
        self.generate(onnx=True)

        # Dummy input of shape (1, 3, *input_shape), BCHW.
        im = torch.zeros(1, 3, *self.input_shape).to('cpu')
        input_layer_names = ["images"]
        output_layer_names = ["output"]

        # Export the model
        print(f'Starting export with onnx {onnx.__version__}.')
        torch.onnx.export(self.net,
                          im,
                          f=model_path,
                          verbose=False,
                          opset_version=12,
                          training=torch.onnx.TrainingMode.EVAL,
                          do_constant_folding=True,
                          input_names=input_layer_names,
                          output_names=output_layer_names,
                          dynamic_axes=None)

        # Checks
        model_onnx = onnx.load(model_path)  # load onnx model
        onnx.checker.check_model(model_onnx)  # check onnx model

        # Simplify onnx
        if simplify:
            import onnxsim
            print(f'Simplifying with onnx-simplifier {onnxsim.__version__}.')
            model_onnx, check = onnxsim.simplify(
                model_onnx,
                dynamic_input_shape=False,
                input_shapes=None)
            assert check, 'assert check failed'
            onnx.save(model_onnx, model_path)

        print('Onnx model save as {}'.format(model_path))

    def get_map_txt(self, image_id, image, class_names, map_out_path):
        """Write the detections of one image to ``detection-results/<image_id>.txt``.

        The file is always created (and left empty when nothing is detected).
        Each line is ``class score left top right bottom``; detections whose
        class is not in *class_names* are skipped.
        """
        result_path = os.path.join(map_out_path, "detection-results/" + image_id + ".txt")
        # The context manager also closes the file on the early return below,
        # which previously leaked the handle.
        with open(result_path, "w", encoding='utf-8') as f:
            image_shape = np.array(np.shape(image)[0:2])
            image, images = self._to_tensor(image)
            results = self._predict(images, image_shape)

            if results[0] is None:
                return

            top_label = np.array(results[0][:, 5], dtype='int32')
            top_conf = results[0][:, 4]
            top_boxes = results[0][:, :4]

            for i, c in enumerate(top_label):
                predicted_class = self.class_names[int(c)]
                score = str(top_conf[i])

                top, left, bottom, right = top_boxes[i]
                if predicted_class not in class_names:
                    continue

                f.write("%s %s %s %s %s %s\n" % (
                    predicted_class, score[:6],
                    str(int(left)), str(int(top)), str(int(right)), str(int(bottom))))
        return
- task.result.append( - {"input_img_path": input_img_path, "output_img_path": "", "coords": "[]"} - ) - else: - # 进行模型检测 - img_res, coords_res = self.detection.detect(input_img_path) - - coords_res = [{"name": name, "coords": coords} for name, coords in coords_res] - coords_json = json.dumps(coords_res, ensure_ascii=False) - - out_img_path = os.path.join(output_dir, f"{idx}.jpg") - cv2.imwrite(out_img_path, img_res) - - task.result.append( - {"input_img_path": input_img_path, "output_img_path": out_img_path, "coords": coords_json} - ) + img_res, coords_res = self.detection.detect(input_img_path) + coords_res = [{"name": name, "coords": coords} for name, coords in coords_res] + coords_json = json.dumps(coords_res, ensure_ascii=False) + out_img_path = os.path.join(str(output_dir), f"{idx}.jpg") + cv2.imwrite(out_img_path, img_res) + task.result.append( + {"input_img_path": input_img_path, "output_img_path": out_img_path, "coords": coords_json}) task.progress = int((idx + 1) / len(task.images) * 100)