---
# Configuration for the `BevFusion` model: dataset/augmentation settings,
# the model's sub-module hyper-parameters, and the optimisation schedule.
# NOTE(review): nesting was reconstructed from key semantics; confirm it
# matches what the consuming config loader expects.

# Ten object categories; DENSE_HEAD.NUM_CLASSES below equals this count.
CLASS_NAMES: [
  'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
  'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone',
]

DATA_CONFIG:
  # Settings in this section are layered on top of the referenced base file.
  _BASE_CONFIG_: cfgs/dataset_configs/nuscenes_dataset.yaml

  # Presumably [x_min, y_min, z_min, x_max, y_max, z_max] -- TODO confirm.
  POINT_CLOUD_RANGE: [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]

  CAMERA_CONFIG:
    USE_CAMERA: True
    IMAGE:
      # Same pair as MODEL.VTRANSFORM.IMAGE_SIZE.
      FINAL_DIM: [256, 704]
      RESIZE_LIM_TRAIN: [0.38, 0.55]
      RESIZE_LIM_TEST: [0.48, 0.48]

  DATA_AUGMENTOR:
    DISABLE_AUG_LIST: ['placeholder']
    AUG_CONFIG_LIST:
      - NAME: random_world_flip
        ALONG_AXIS_LIST: ['x', 'y']

      # Bounds are numerically -pi/4 and +pi/4.
      - NAME: random_world_rotation
        WORLD_ROT_ANGLE: [-0.78539816, 0.78539816]

      - NAME: random_world_scaling
        WORLD_SCALE_RANGE: [0.9, 1.1]

      - NAME: random_world_translation
        NOISE_TRANSLATE_STD: [0.5, 0.5, 0.5]

      - NAME: imgaug
        ROT_LIM: [-5.4, 5.4]
        RAND_FLIP: True

  DATA_PROCESSOR:
    - NAME: mask_points_and_boxes_outside_range
      REMOVE_OUTSIDE_BOXES: True

    - NAME: shuffle_points
      SHUFFLE_ENABLED:
        train: True
        test: True

    - NAME: transform_points_to_voxels
      VOXEL_SIZE: [0.075, 0.075, 0.2]
      MAX_POINTS_PER_VOXEL: 10
      MAX_NUMBER_OF_VOXELS:
        train: 120000
        test: 160000

    - NAME: image_calibrate

    - NAME: image_normalize
      mean: [0.485, 0.456, 0.406]
      std: [0.229, 0.224, 0.225]

MODEL:
  NAME: BevFusion

  VFE:
    NAME: MeanVFE

  BACKBONE_3D:
    NAME: VoxelResBackBone8x
    USE_BIAS: False

  MAP_TO_BEV:
    NAME: HeightCompression
    NUM_BEV_FEATURES: 256

  IMAGE_BACKBONE:
    NAME: SwinTransformer
    EMBED_DIMS: 96
    DEPTHS: [2, 2, 6, 2]
    NUM_HEADS: [3, 6, 12, 24]
    WINDOW_SIZE: 7
    MLP_RATIO: 4
    # Written as 0.0 (not `0.`) so every YAML loader reads a float.
    DROP_RATE: 0.0
    ATTN_DROP_RATE: 0.0
    DROP_PATH_RATE: 0.2
    PATCH_NORM: True
    OUT_INDICES: [1, 2, 3]
    WITH_CP: False
    CONVERT_WEIGHTS: True
    INIT_CFG:
      type: Pretrained
      # Relative path; where it is resolved from is up to the loader.
      checkpoint: swint-nuimages-pretrained.pth

  NECK:
    NAME: GeneralizedLSSFPN
    # Equals EMBED_DIMS (96) times 2, 4 and 8.
    IN_CHANNELS: [192, 384, 768]
    OUT_CHANNELS: 256
    START_LEVEL: 0
    END_LEVEL: -1
    NUM_OUTS: 3

  VTRANSFORM:
    NAME: DepthLSSTransform
    IMAGE_SIZE: [256, 704]
    IN_CHANNEL: 256
    OUT_CHANNEL: 80
    # Equals IMAGE_SIZE divided by 8.
    FEATURE_SIZE: [32, 88]
    # Presumably [min, max, step] per axis -- TODO confirm. The X/Y
    # limits coincide with DATA_CONFIG.POINT_CLOUD_RANGE (+/-54.0).
    XBOUND: [-54.0, 54.0, 0.3]
    YBOUND: [-54.0, 54.0, 0.3]
    ZBOUND: [-10.0, 10.0, 20.0]
    DBOUND: [1.0, 60.0, 0.5]
    DOWNSAMPLE: 2

  FUSER:
    NAME: ConvFuser
    # 336 = 80 (VTRANSFORM.OUT_CHANNEL) + 256 (MAP_TO_BEV.NUM_BEV_FEATURES);
    # presumably the two feature maps are concatenated -- TODO confirm.
    IN_CHANNEL: 336
    OUT_CHANNEL: 256

  BACKBONE_2D:
    NAME: BaseBEVBackbone
    LAYER_NUMS: [5, 5]
    LAYER_STRIDES: [1, 2]
    NUM_FILTERS: [128, 256]
    UPSAMPLE_STRIDES: [1, 2]
    NUM_UPSAMPLE_FILTERS: [256, 256]
    USE_CONV_FOR_NO_STRIDE: True

  DENSE_HEAD:
    CLASS_AGNOSTIC: False
    NAME: TransFusionHead

    USE_BIAS_BEFORE_NORM: False

    NUM_PROPOSALS: 200
    HIDDEN_CHANNEL: 128
    # Matches the number of entries in CLASS_NAMES.
    NUM_CLASSES: 10
    NUM_HEADS: 8
    NMS_KERNEL_SIZE: 3
    FFN_CHANNEL: 256
    DROPOUT: 0.1
    BN_MOMENTUM: 0.1
    ACTIVATION: relu

    NUM_HM_CONV: 2
    SEPARATE_HEAD_CFG:
      HEAD_ORDER: ['center', 'height', 'dim', 'rot', 'vel']
      # One entry per name in HEAD_ORDER; out_channels sum to 10, the
      # length of LOSS_WEIGHTS.code_weights below.
      HEAD_DICT:
        center: {out_channels: 2, num_conv: 2}
        height: {out_channels: 1, num_conv: 2}
        dim: {out_channels: 3, num_conv: 2}
        rot: {out_channels: 2, num_conv: 2}
        vel: {out_channels: 2, num_conv: 2}

    TARGET_ASSIGNER_CONFIG:
      FEATURE_MAP_STRIDE: 8
      DATASET: nuScenes
      GAUSSIAN_OVERLAP: 0.1
      MIN_RADIUS: 2
      HUNGARIAN_ASSIGNER:
        cls_cost: {gamma: 2.0, alpha: 0.25, weight: 0.15}
        reg_cost: {weight: 0.25}
        iou_cost: {weight: 0.25}

    LOSS_CONFIG:
      LOSS_WEIGHTS:
        cls_weight: 1.0
        bbox_weight: 0.25
        hm_weight: 1.0
        code_weights: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]
      LOSS_CLS:
        use_sigmoid: True
        gamma: 2.0
        alpha: 0.25

    # Head-level post-processing; distinct from MODEL.POST_PROCESSING below.
    POST_PROCESSING:
      SCORE_THRESH: 0.0
      POST_CENTER_RANGE: [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]

  # Model-level post-processing (separate key from DENSE_HEAD.POST_PROCESSING).
  POST_PROCESSING:
    RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
    SCORE_THRESH: 0.1
    OUTPUT_RAW_SCORE: False
    EVAL_METRIC: kitti

OPTIMIZATION:
  BATCH_SIZE_PER_GPU: 3
  NUM_EPOCHS: 6

  OPTIMIZER: adam_cosineanneal
  LR: 0.0001
  WEIGHT_DECAY: 0.01
  MOMENTUM: 0.9
  BETAS: [0.9, 0.999]

  MOMS: [0.9, 0.8052631]
  PCT_START: 0.4
  WARMUP_ITER: 500

  # NOTE(review): both steps exceed NUM_EPOCHS (6); presumably unused by
  # the selected optimizer/scheduler -- confirm before relying on them.
  DECAY_STEP_LIST: [35, 45]
  LR_WARMUP: False
  WARMUP_EPOCH: 1

  GRAD_NORM_CLIP: 35

  LOSS_SCALE_FP16: 32