[Fix] Fix multisports dataset detection (#2584)

open-mmlab · Sep 6, 2023 · 8ff889a · 8ff889a
1 parent 4fee8c2
commit 8ff889a
Show file tree

Hide file tree

Showing 32 changed files with 387 additions and 444 deletions.
diff --git a/...etection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py b/...etection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava21-rgb.py
@@ -46,6 +46,7 @@
  shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2304,
  num_classes=81,
  multilabel=True,
@@ -88,9 +89,6 @@
 proposal_file_val = f'{anno_root}/ava_dense_proposals_val.FAIR.recall_93.9.pkl'
 
 file_client_args = dict(io_backend='disk')
-file_client_args = dict(
- io_backend='petrel',
- path_mapping=dict({'data/ava': 's254:s3://openmmlab/datasets/action/ava'}))
 train_pipeline = [
  dict(type='SampleAVAFrames', clip_len=32, frame_interval=2),
  dict(type='RawFrameDecode', **file_client_args),

diff --git a/...etection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py b/...etection/acrn/slowfast-acrn_kinetics400-pretrained-r50_8xb8-8x8x1-cosine-10e_ava22-rgb.py
@@ -46,6 +46,7 @@
  shared_head=dict(type='ACRNHead', in_channels=4608, out_channels=2304),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2304,
  num_classes=81,
  multilabel=True,

diff --git a/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb-infer_r50_ava21-rgb.py
@@ -34,6 +34,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/...gs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py b/...gs/detection/lfb/slowonly-lfb-nl_kinetics400-pretrained-r50_8xb12-4x16x1-20e_ava21-rgb.py
@@ -37,6 +37,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2560,
  num_classes=81,
  multilabel=True,

diff --git a/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py b/configs/detection/lfb/slowonly-lfb_ava-pretrained-r50_infer-4x16x1_ava21-rgb.py
@@ -34,6 +34,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
@@ -44,6 +44,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2304,
  num_classes=81,
  multilabel=True,

diff --git a/...detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py b/...detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb6-8x8x1-cosine-10e_ava22-rgb.py
@@ -45,6 +45,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2304,
  num_classes=81,
  multilabel=True,

diff --git a/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py b/configs/detection/slowfast/slowfast_kinetics400-pretrained-r50_8xb8-8x8x1-20e_ava21-rgb.py
@@ -45,6 +45,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2304,
  num_classes=81,
  multilabel=True,

diff --git a/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-4x16x1-10e_ava-kinetics-rgb.py
@@ -29,6 +29,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py b/configs/detection/slowonly/slowonly_k400-pre-r50_8xb8-8x8x1-10e_ava-kinetics-rgb.py
@@ -29,6 +29,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r101_8xb16-8x8x1-20e_ava21-rgb.py
@@ -28,6 +28,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/...s/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py b/...s/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-4x16x1-20e_ava21-rgb.py
@@ -36,6 +36,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/...gs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py b/...gs/detection/slowonly/slowonly_kinetics400-pretrained-r50-nl_8xb16-8x8x1-20e_ava21-rgb.py
@@ -36,6 +36,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
@@ -29,6 +29,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/...detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py b/...detection/slowonly/slowonly_kinetics400-pretrained-r50_8xb16-4x16x1-8e_multisports-rgb.py
@@ -30,6 +30,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=False,
  in_channels=2048,
  num_classes=num_classes,
  multilabel=False,

diff --git a/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py b/configs/detection/slowonly/slowonly_kinetics700-pretrained-r50_8xb16-4x16x1-20e_ava21-rgb.py
@@ -29,6 +29,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=2048,
  num_classes=81,
  multilabel=True,

diff --git a/...tection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py b/...tection/videomae/vit-base-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py
@@ -31,6 +31,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=768,
  num_classes=81,
  multilabel=True,

diff --git a/...ection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py b/...ection/videomae/vit-large-p16_videomae-k400-pre_8xb8-16x4x1-20e-adamw_ava-kinetics-rgb.py
@@ -32,6 +32,7 @@
  with_temporal_pool=True),
  bbox_head=dict(
  type='BBoxHeadAVA',
+ background_class=True,
  in_channels=1024,
  num_classes=81,
  multilabel=True,

diff --git a/mmaction/datasets/ava_dataset.py b/mmaction/datasets/ava_dataset.py
@@ -203,7 +203,6 @@ def parse_img_record(self, img_records: List[dict]) -> tuple:
 
  labels.append(label)
  entity_ids.append(img_record['entity_id'])
-
  bboxes = np.stack(bboxes)
  labels = np.stack(labels)
  entity_ids = np.stack(entity_ids)

diff --git a/mmaction/evaluation/functional/multisports_utils.py b/mmaction/evaluation/functional/multisports_utils.py
@@ -7,6 +7,7 @@
 from collections import defaultdict
 
 import numpy as np
+from mmengine.logging import MMLogger
 from rich.progress import track
 
 
@@ -314,7 +315,7 @@ def tubescore(tt):
 
 
 def frameAP(GT, alldets, thr, print_info=True):
-
+ logger = MMLogger.get_current_instance()
  vlist = GT['test_videos'][0]
 
  results = {}
@@ -326,7 +327,7 @@ def frameAP(GT, alldets, thr, print_info=True):
  'basketball save', 'basketball jump ball'
  ]:
  if print_info:
- print('do not evaluate {}'.format(label))
+ logger.info('do not evaluate {}'.format(label))
  continue
  # det format: <video_index><frame_number><label_index><score><x1><y1><x2><y2> # noqa: E501
  detections = alldets[alldets[:, 2] == ilabel, :]
@@ -355,7 +356,7 @@ def frameAP(GT, alldets, thr, print_info=True):
  gt_num = sum([g.shape[0] for g in gt.values()])
  if gt_num == 0:
  if print_info:
- print('no such label', ilabel, label)
+ logger.info('no such label %s %s', ilabel, label)
  continue
  fp = 0 # false positives
  tp = 0 # true positives
@@ -395,15 +396,15 @@ def frameAP(GT, alldets, thr, print_info=True):
  class_result[label] = pr_to_ap_voc(results[label]) * 100
  frameap_result = np.mean(ap)
  if print_info:
- print('frameAP_{}\n'.format(thr))
+ logger.info('frameAP_{}\n'.format(thr))
  for label in class_result:
- print('{:20s} {:8.2f}'.format(label, class_result[label]))
- print('{:20s} {:8.2f}'.format('mAP', frameap_result))
+ logger.info('{:20s} {:8.2f}'.format(label, class_result[label]))
+ logger.info('{:20s} {:8.2f}'.format('mAP', frameap_result))
  return frameap_result
 
 
 def videoAP(GT, alldets, thr, print_info=True):
-
+ logger = MMLogger.get_current_instance()
  vlist = GT['test_videos'][0]
 
  res = {}
@@ -414,7 +415,7 @@ def videoAP(GT, alldets, thr, print_info=True):
  'basketball save', 'basketball jump ball'
  ]:
  if print_info:
- print('do not evaluate{}'.format(GT['labels'][ilabel]))
+ logger.info('do not evaluate {}'.format(GT['labels'][ilabel]))
  continue
  detections = alldets[ilabel]
  # load ground-truth
@@ -438,7 +439,7 @@ def videoAP(GT, alldets, thr, print_info=True):
  tp = 0 # true positives
  if gt_num == 0:
  if print_info:
- print('no such label', ilabel, GT['labels'][ilabel])
+ logger.info('no such label %s %s', ilabel, GT['labels'][ilabel])
  continue
  is_gt_box_detected = {}
  for i, j in enumerate(
@@ -471,10 +472,10 @@ def videoAP(GT, alldets, thr, print_info=True):
  for label in res:
  class_result[label] = pr_to_ap_voc(res[label]) * 100
  if print_info:
- print('VideoAP_{}\n'.format(thr))
+ logger.info('VideoAP_{}\n'.format(thr))
  for label in class_result:
- print('{:20s} {:8.2f}'.format(label, class_result[label]))
- print('{:20s} {:8.2f}'.format('mAP', videoap_result))
+ logger.info('{:20s} {:8.2f}'.format(label, class_result[label]))
+ logger.info('{:20s} {:8.2f}'.format('mAP', videoap_result))
  return videoap_result
 
 

diff --git a/mmaction/models/backbones/vit_mae.py b/mmaction/models/backbones/vit_mae.py
@@ -12,12 +12,6 @@
 from mmaction.registry import MODELS
 from mmaction.utils import ConfigType, OptConfigType
 
-try:
- from mmdet.registry import MODELS as MMDET_MODELS
- mmdet_imported = True
-except (ImportError, ModuleNotFoundError):
- mmdet_imported = False
-
 
 class Attention(BaseModule):
  """Multi-head Self-attention.
@@ -387,7 +381,3 @@ def forward(self, x: Tensor) -> Tensor:
  return self.fc_norm(x.mean(1))
 
  return x[:, 0]
-
-
-if mmdet_imported:
- MMDET_MODELS.register_module()(VisionTransformer)
diff --git a/mmaction/models/roi_heads/__init__.py b/mmaction/models/roi_heads/__init__.py
@@ -1,10 +1,23 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from .bbox_heads import BBoxHeadAVA
-from .roi_extractors import SingleRoIExtractor3D
-from .roi_head import AVARoIHead
-from .shared_heads import ACRNHead, FBOHead, LFBInferHead
-
-__all__ = [
- 'AVARoIHead', 'BBoxHeadAVA', 'SingleRoIExtractor3D', 'ACRNHead', 'FBOHead',
- 'LFBInferHead'
-]
+try:
+ from mmdet.registry import MODELS as MMDET_MODELS
+
+ from .bbox_heads import BBoxHeadAVA
+ from .roi_extractors import SingleRoIExtractor3D
+ from .roi_head import AVARoIHead
+ from .shared_heads import ACRNHead, FBOHead, LFBInferHead
+
+ for module in [
+ AVARoIHead, BBoxHeadAVA, SingleRoIExtractor3D, ACRNHead, FBOHead,
+ LFBInferHead
+ ]:
+
+ MMDET_MODELS.register_module()(module)
+
+ __all__ = [
+ 'AVARoIHead', 'BBoxHeadAVA', 'SingleRoIExtractor3D', 'ACRNHead',
+ 'FBOHead', 'LFBInferHead'
+ ]
+
+except (ImportError, ModuleNotFoundError):
+ pass
diff --git a/mmaction/models/roi_heads/bbox_heads/bbox_head.py b/mmaction/models/roi_heads/bbox_heads/bbox_head.py
@@ -5,25 +5,17 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from mmdet.models.task_modules.samplers import SamplingResult
 from mmengine.config import ConfigDict
 from mmengine.structures import InstanceData
-from torch import Tensor
-
-from mmaction.structures.bbox import bbox_target
-from mmaction.utils import InstanceList
-
-try:
- from mmdet.models.task_modules.samplers import SamplingResult
- from mmdet.registry import MODELS as MMDET_MODELS
- mmdet_imported = True
-except (ImportError, ModuleNotFoundError):
- from mmaction.utils import SamplingResult
- mmdet_imported = False
-
 # Resolve cross-entropy function to support multi-target in Torch < 1.10
 # This is a very basic 'hack', with minimal functionality to support the
 # procedure under prior torch versions
 from packaging import version as pv
+from torch import Tensor
+
+from mmaction.structures.bbox import bbox_target
+from mmaction.utils import InstanceList
 
 if pv.parse(torch.__version__) < pv.parse('1.10'):
 
@@ -44,6 +36,8 @@ class BBoxHeadAVA(nn.Module):
  """Simplest RoI head, with only one fc layer for classification.
 
  Args:
+ background_class (bool): Whether to set class 0 as the background
+ class and ignore it when calculating the loss.
  temporal_pool_type (str): The temporal pool type. Choices are ``avg``
  or ``max``. Defaults to ``avg``.
  spatial_pool_type (str): The spatial pool type. Choices are ``avg`` or
@@ -70,6 +64,7 @@ class BBoxHeadAVA(nn.Module):
 
  def __init__(
  self,
+ background_class: bool,
  temporal_pool_type: str = 'avg',
  spatial_pool_type: str = 'max',
  in_channels: int = 2048,
@@ -98,6 +93,8 @@ def __init__(
  self.focal_gamma = focal_gamma
  self.focal_alpha = focal_alpha
 
+ self.background_class = background_class
+
  if topk is None:
  self.topk = ()
  elif isinstance(topk, int):
@@ -251,9 +248,11 @@ def loss_and_target(self, cls_score: Tensor, rois: Tensor,
  losses = dict()
  # Only use the cls_score
  if cls_score is not None:
- labels = labels[:, 1:] # Get valid labels (ignore first one)
+ if self.background_class:
+ labels = labels[:, 1:] # Get valid labels (ignore first one)
+ cls_score = cls_score[:, 1:]
  pos_inds = torch.sum(labels, dim=-1) > 0
- cls_score = cls_score[pos_inds, 1:]
+ cls_score = cls_score[pos_inds]
  labels = labels[pos_inds]
 
  # Compute First Recall/Precisions
@@ -268,7 +267,7 @@ def loss_and_target(self, cls_score: Tensor, rois: Tensor,
 
  # If Single-label, need to ensure that target labels sum to 1: ie
  # that they are valid probabilities.
- if not self.multilabel:
+ if not self.multilabel and self.background_class:
  labels = labels / labels.sum(dim=1, keepdim=True)
 
  # Select Loss function based on single/multi-label
@@ -414,7 +413,3 @@ def _bbox_crop_undo(bboxes, crop_quadruple):
  results.scores = scores
 
  return results
-
-
-if mmdet_imported:
- MMDET_MODELS.register_module()(BBoxHeadAVA)