# Experiment configuration defining one task, `mscoco_caption`
# (TASK_TYPE 'image_caption'), plus engine, model, data-loading, optimizer,
# LR-schedule and inference settings.
#
# NOTE(review): this file was recovered from a table-style dump in which every
# line was wrapped in `| | ... |` and all indentation was lost. The nesting
# below was rebuilt from key grouping and blank-line stanza breaks -- confirm
# it against the consuming config schema before relying on it.

# Parent config file; presumably merged before the overrides below -- confirm
# how the loader resolves this relative path.
_BASE_: "base_model_bert_l12_h768.yaml"

# Targets that tasks refer to by NAME through their TARGET_SET list.
SHARED_TARGETS:
  - NAME: 'Vocab_Word'
    SHARED_TARGETS_CFG:
      FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
      DISTRIBUTED: True

TASKS:
  - NAME: mscoco_caption
    DATASETS:
      TRAIN: 'ImageTextPairDataset'
      TEST: 'ImageTextPairDataset'
      TASK_TYPE: 'image_caption'
      DATASET_NAME: 'MSCOCO'
      # Must match a NAME declared under SHARED_TARGETS.
      TARGET_SET: ['Vocab_Word']
    DATALOADER:
      TRAIN_BATCH_SIZE: 200
      TEST_BATCH_SIZE: 2
      NUM_WORKERS: 4
      FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
      ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
      S3_PATH: 's3://coco/'
      SEQ_PER_SAMPLE: 1
      CACHE_MODE: True
      CIRCULAR_CACHE_MODE: False
      ZIP_MODE: False
      CACHE_ORIGIN_IMAGE: False
      RANDOM_CAPTION: False
      AS_NUMPY_AS_POSSIBLE: False
      SAMPLING_WEIGHT: 0.5
      TRANSFORM: 'clip_transforms'
      RANDOM_MASK: True
    MODEL:
      MAX_SEQ_LEN: 30
      EVAL_MAX_SEQ_LEN: 21
      TEMP_NAME: logit_scale_caption
    LOSSES:
      NAMES: ['CrossEntropy', 'Accuracy']
      LOSS_WEIGHT: 0.5
      REDUCTION: 'mean'
    DECODE_STRATEGY:
      NAME: 'CaptionBeamSearcherV3'
      BEAM_SIZE: 2
    INFERENCE:
      NAME: 'COCOEvaler'
      VOCAB: 'CLIP'
      ID_KEY: 'image_id'
      VALUE: 'caption'
      VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
      TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
      GENERATION_MODE: True

ENGINE:
  NAME: 'UnifiedTrainer'

MODEL:
  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
  ENCODER: 'UnifiedBertEncoder'

  IN_TUNING: True
  SHARE_LAYERNORM: True
  BERT:
    NORMALIZE_DECISION: "BERTPre"
    DROP_PATH_PROB: 0.1
    DROP_PATH_PROB_FIXED: True

    # NOTE(review): placed under BERT because it directly follows the BERT
    # entries in the dump; it may instead belong one level up under MODEL.
    UNIFY_QKV: True

  MODEL_EMA: False
  MODEL_EMA_DECAY: 0.9999

  MAEParamsInit: True
  POSEMBEDFIX: True

  IMG_INPUT_SIZE: 224
  PATCH_SIZE: 16

  LAYER_SCALE: True
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-3` as a string, not a float.
  LAYER_SCALE_INIT: 1.0e-3

DATALOADER:
  USE_WEIGHTED_SAMPLER: True
  UNIFIED_DATASET: True
  NUM_WORKERS: 16

  PADDING_TO_MAX: False

SOLVER:
  NAME: 'Adam'
  TORCH_OPTIMIZER: True
  # Key spelling kept exactly as found; it has to match the name the
  # consuming schema declares.
  PARAMS_SEPERATE: True
  MAX_ITER: 450000
  CHECKPOINT_PERIOD: 50000
  # NOTE(review): EVAL_PERIOD (500000) is larger than MAX_ITER (450000). If
  # evaluation is triggered every EVAL_PERIOD iterations it never fires
  # during this run -- confirm that is intended.
  EVAL_PERIOD: 500000
  BASE_LR: 0.001
  BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY: 0.05
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_BIAS: 0.0
  WEIGHT_DECAY_EMBEDDING: 0.0
  MOMENTUM: 0.9
  DAMPENING: 0.0
  NESTEROV: 0.0
  BETAS: [0.9, 0.95]
  # Explicit mantissa dot for the same reason as LAYER_SCALE_INIT: a bare
  # `1e-6` is a string under YAML 1.1 resolvers.
  EPS: 1.0e-6
  GRAD_CLIP: 0.1
  GRAD_CLIP_TYPE: 'norm'
  ACCUM_ITER: 0
  AMP_FP16: True
  APEX_FP16: False

  WRITE_PERIOD: 50
  # Key spelling kept exactly as found (see PARAMS_SEPERATE).
  MIN_LOSS_SCLE: 2048.0

  LOSS_SCALE_WINDOW: 200

LR_SCHEDULER:
  NAME: 'WarmupCosine'
  WARMUP: 20000
  MIN_LR: 0.000001

# Top-level inference settings; separate from the per-task INFERENCE mapping
# under TASKS.
INFERENCE:
  VOCAB: 'CLIP'
  ITER_BASED: True

find_unused_parameters: true