# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------

# NOTE(review): the block structure below was restored from a
# whitespace-collapsed copy of this file. Nesting follows key order and the
# pattern shared by sibling sections; verify against the consumer's schema.

# Define Test/Trainer/Saving
PIPELINE: XDecoderPipeline
TRAINER: xdecoder
SAVE_DIR: '../../data/output/test'
base_path: "./"

# Resume logic
RESUME: false
WEIGHT: false
RESUME_FROM: ''
EVAL_AT_START: false

# Logging and Debug
WANDB: false
LOG_EVERY: 100
FIND_UNUSED_PARAMETERS: false

# Speed up training
FP16: false
# Quoted on purpose: the value is a string, not an integer.
PORT: '36873'

# misc
LOADER:
  JOINT: false
  KEY_DATASET: 'coco'

##################
# Task settings
##################
VERBOSE: true
MODEL:
  NAME: seem_model_v1
  HEAD: xdecoder_head
  MASK_ON: false
  KEYPOINT_ON: false
  LOAD_PROPOSALS: false
  DIM_PROJ: 512
  TEXT:
    ARCH: vlpencoder
    NAME: transformer
    TOKENIZER: clip
    CONTEXT_LENGTH: 77  # 77
    WIDTH: 512
    HEADS: 8
    LAYERS: 12  # 6
    # NOTE(review): key kept verbatim. It looks like a misspelling of
    # AUTOREGRESSIVE, but the consumer presumably reads this exact name;
    # confirm before renaming.
    AUTOGRESSIVE: true
  BACKBONE:
    NAME: focal
    PRETRAINED: ''
    LOAD_PRETRAINED: false
    FOCAL:
      PRETRAIN_IMG_SIZE: 224
      PATCH_SIZE: 4
      EMBED_DIM: 192
      DEPTHS: [2, 2, 18, 2]
      FOCAL_LEVELS: [4, 4, 4, 4]
      FOCAL_WINDOWS: [3, 3, 3, 3]
      DROP_PATH_RATE: 0.3
      MLP_RATIO: 4.0
      DROP_RATE: 0.0
      PATCH_NORM: true
      USE_CONV_EMBED: true
      SCALING_MODULATOR: true
      USE_CHECKPOINT: false
      USE_POSTLN: true
      USE_POSTLN_IN_MODULATION: false
      USE_LAYERSCALE: true
      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
      OUT_INDICES: [0, 1, 2, 3]
  ENCODER:
    NAME: transformer_encoder_fpn
    IGNORE_VALUE: 255
    NUM_CLASSES: 133
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 512
    MASK_DIM: 512
    NORM: "GN"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6
  DECODER:
    NAME: seem_v1
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    MASK:
      ENABLED: true
    DETECTION: false
    SPATIAL:
      ENABLED: true
      MAX_ITER: 1
    GROUNDING:
      ENABLED: true
      MAX_LEN: 5
      TEXT_WEIGHT: 2.0
      CLASS_WEIGHT: 0.5
    RETRIEVAL:
      ENABLED: false
    LVIS:
      ENABLED: true
      THRES: 0.7
    OPENIMAGE:
      ENABLED: false
      NEGATIVE_SAMPLES: 5
      # Nested here rather than beside DECODER.GROUNDING: a second GROUNDING
      # key at the DECODER level would be a duplicate key.
      GROUNDING:
        ENABLED: false
        MAX_LEN: 5
    CAPTION:
      ENABLED: false
      PHRASE_PROB: 0.5
      SIM_THRES: 0.95
    DEEP_SUPERVISION: true
    NO_OBJECT_WEIGHT: 0.1
    GCLASS_WEIGHT: 0.4
    GMASK_WEIGHT: 1.0
    GDICE_WEIGHT: 1.0
    SCLASS_WEIGHT: 0.4
    SMASK_WEIGHT: 1.0
    SDICE_WEIGHT: 1.0
    OCLASS_WEIGHT: 0.4
    OMASK_WEIGHT: 1.0
    ODICE_WEIGHT: 1.0
    CLASS_WEIGHT: 2.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BBOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    CAPTION_WEIGHT: 2.0
    COST_SPATIAL:
      CLASS_WEIGHT: 5.0
      MASK_WEIGHT: 2.0
      DICE_WEIGHT: 2.0
    HIDDEN_DIM: 512
    NUM_OBJECT_QUERIES: 101
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    MAX_SPATIAL_LEN: [512, 512, 512, 512]
    # ENC_LAYERS: 0
    PRE_NORM: false
    ENFORCE_INPUT_PROJ: false
    SIZE_DIVISIBILITY: 32
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
    TOP_GROUNDING_LAYERS: 10
    TOP_CAPTION_LAYERS: 10
    TOP_SPATIAL_LAYERS: 10
    TOP_OPENIMAGE_LAYERS: 10
    TEST:
      SEMANTIC_ON: true
      INSTANCE_ON: true
      PANOPTIC_ON: true
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.8
      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false

# Spatial sampler
STROKE_SAMPLER:
  MAX_CANDIDATE: 1
  CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25]  # for training only
  CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"]
  DILATION: 3
  CIRCLE:
    NUM_STROKES: 5
    STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small']
    STROKE_PROB: [0.33, 0.33, 0.33]
  SCRIBBLE:
    NUM_STROKES: 5
    STROKE_PRESET: ['rand_curve', 'rand_curve_small']
    STROKE_PROB: [0.5, 0.5]
  POINT:
    NUM_POINTS: 20
  POLYGON:
    MAX_POINTS: 9
  EVAL:
    MODE: 'best'  # best/random/best_random
    NEGATIVE: false
    MAX_ITER: 20
    IOU_ITER: 1
    GROUNDING: false

# Multi-modal Architecture, order matters
ATTENTION_ARCH:
  VARIABLE:
    queries: ['object', 'grounding', 'spatial']
    tokens: ['grounding', 'spatial']
    memories: ['spatial']
  SELF_ATTENTION:
    queries:
      object: ['queries_object']
      grounding: ['queries_grounding', 'tokens_grounding']
      spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial']
    tokens:
      grounding: ['queries_grounding', 'tokens_grounding']
      spatial: ['tokens_spatial']
    memories:
      spatial: ['memories_spatial']
  CROSS_ATTENTION:
    queries:
      object: true
      grounding: true
      spatial: true
    memories:
      spatial: true
    tokens:
      grounding: false
      spatial: false
  MASKING: ['tokens_spatial', 'tokens_grounding']
  DUPLICATION:
    queries:
      grounding: 'queries_object'
      spatial: 'queries_object'
  SPATIAL_MEMORIES: 32
  QUERY_NUMBER: 3

DATASETS:
  TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis"]
  # TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",]
  # to evaluate instance and semantic performance as well
  TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"]
  # Alternative interactive-evaluation sets. As the names below show, they
  # combine a dataset from [pascalvoc, openimage600, ade600, davis, cocomini]
  # with a prompt type from [Point, Scribble, Polygon, Circle, Box]:
  # TEST: ["pascalvoc_val_Point"]
  # TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"]
  # TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"]
  # TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"]
  CLASS_CONCAT: false
  SIZE_DIVISIBILITY: 32
  PROPOSAL_FILES_TRAIN: []

INPUT:
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]

TRAIN:
  ASPECT_RATIO_GROUPING: true
  BATCH_SIZE_TOTAL: 4
  BATCH_SIZE_PER_GPU: 4
  SHUFFLE: true

TEST:
  DETECTIONS_PER_IMAGE: 100
  NAME: coco_eval
  IOU_TYPE: ['bbox', 'segm']
  USE_MULTISCALE: false
  BATCH_SIZE_TOTAL: 8
  MODEL_FILE: ''
  AUG:
    ENABLED: false

DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: false
  NUM_WORKERS: 8
  LOAD_PROPOSALS: false
  SAMPLER_TRAIN: "TrainingSampler"
  ASPECT_RATIO_GROUPING: true

COCO:
  INPUT:
    MIN_SIZE_TRAIN: 800
    MAX_SIZE_TRAIN: 1333
    MIN_SIZE_TRAIN_SAMPLING: 'choice'
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
    IMAGE_SIZE: 1024
    MIN_SCALE: 0.1
    MAX_SCALE: 2.0
    DATASET_MAPPER_NAME: "coco_interactive"
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: false
    SIZE_DIVISIBILITY: 32
    RANDOM_FLIP: "horizontal"
    MASK_FORMAT: "polygon"
    FORMAT: "RGB"
    CROP:
      ENABLED: true
  DATASET:
    DATASET: 'coco'

# Validation dataset
ADE20K:
  INPUT:
    MIN_SIZE_TRAIN: 640
    MIN_SIZE_TRAIN_SAMPLING: "choice"
    MIN_SIZE_TEST: 640
    MAX_SIZE_TRAIN: 2560
    MAX_SIZE_TEST: 2560
    MASK_FORMAT: "polygon"
    CROP:
      ENABLED: true
      TYPE: "absolute"
      # NOTE(review): YAML has no tuple syntax, so this plain scalar loads as
      # the string "(640, 640)". Left unchanged; confirm what the consumer
      # expects before switching to a sequence such as [640, 640].
      SIZE: (640, 640)
      SINGLE_CATEGORY_MAX_AREA: 1.0
    COLOR_AUG_SSD: true
    SIZE_DIVISIBILITY: 640  # used in dataset mapper
    DATASET_MAPPER_NAME: "mask_former_panoptic"
    FORMAT: "RGB"
  DATASET:
    DATASET: 'ade'

SBD:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 0
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 1

VOC:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 0
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 8

DAVIS:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 0
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 8

VOS:
  INPUT:
    MIN_SIZE_TEST: 800
    MAX_SIZE_TEST: 1333
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 0
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 1

REF:
  INPUT:
    PIXEL_MEAN: [123.675, 116.280, 103.530]
    PIXEL_STD: [58.395, 57.120, 57.375]
    MIN_SIZE_TEST: 512
    MAX_SIZE_TEST: 1024
    FORMAT: "RGB"
    SPATIAL: false
  DATALOADER:
    FILTER_EMPTY_ANNOTATIONS: false
    NUM_WORKERS: 4
    LOAD_PROPOSALS: false
    SAMPLER_TRAIN: "TrainingSampler"
    ASPECT_RATIO_GROUPING: false
  TEST:
    BATCH_SIZE_TOTAL: 8

# Detectron2 training config for optimizer and lr scheduler
SOLVER:
  BASE_LR: 0.0001
  STEPS: [0.88889, 0.96296]
  # NOTE(review): MAX_ITER (1) is smaller than WARMUP_ITERS (10); presumably
  # a smoke-test setting, in line with SAVE_DIR pointing at a "test" output
  # directory. Confirm before using this file for a real training run.
  MAX_ITER: 1
  GAMMA: 0.1
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 10
  WARMUP_METHOD: "linear"
  WEIGHT_DECAY: 0.05
  OPTIMIZER: "ADAMW"
  LR_SCHEDULER_NAME: "WarmupMultiStepLR"
  LR_MULTIPLIER:
    backbone: 0.1
    lang_encoder: 0.1
  FIX_PARAM:
    backbone: true
    lang_encoder: true
    pixel_decoder: true
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_EMBED: 0.0
  CLIP_GRADIENTS:
    ENABLED: true
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 5.0  # 0.01
    NORM_TYPE: 2.0
  MAX_NUM_EPOCHS: 50