# Mode identifier strings for image inference. The exact text is runtime
# data (NOTE(review): presumably compared or displayed by callers — confirm),
# so it must not be reworded.
IMAGE_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + image masks"
IMAGE_CAPTION_GROUNDING_MASKS_MODE = "caption + grounding + image masks"

# Every image mode defined above, kept in declaration order.
IMAGE_INFERENCE_MODES = [
    IMAGE_OPEN_VOCABULARY_DETECTION_MODE,
    IMAGE_CAPTION_GROUNDING_MASKS_MODE,
]

# Mode identifier string for video inference. The exact text is runtime
# data (NOTE(review): presumably compared or displayed by callers — confirm),
# so it must not be reworded.
VIDEO_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + video masks"

# Every video mode defined above; currently a single entry.
VIDEO_INFERENCE_MODES = [VIDEO_OPEN_VOCABULARY_DETECTION_MODE]