{ "run_id": "1120_BEHAVIOR_challenge_QwenDual_taskall", "run_root_dir": "./results/Checkpoints", "seed": 42, "trackers": [ "jsonl", "wandb" ], "wandb_entity": "zwanggk", "wandb_project": "Behavior_17", "is_debug": false, "framework": { "name": "QwenDual", "qwenvl": { "base_vlm": "./playground/Pretrained_models/Qwen3-VL-4B-Instruct", "attn_implementation": "flash_attention_2", "vl_hidden_dim": 2048 }, "dino": { "dino_backbone": "dinov2_vits14" }, "action_model": { "action_model_type": "DiT-B", "action_hidden_dim": 1024, "hidden_size": 1024, "add_pos_embed": true, "max_seq_len": 2048, "action_dim": 23, "state_dim": 44, "future_action_window_size": 49, "action_horizon": 50, "past_action_window_size": 0, "repeated_diffusion_steps": 8, "noise_beta_alpha": 1.5, "noise_beta_beta": 1.0, "noise_s": 0.999, "num_timestep_buckets": 1000, "num_inference_timesteps": 4, "num_target_vision_tokens": 32, "diffusion_model_cfg": { "cross_attention_dim": 2048, "dropout": 0.2, "final_dropout": true, "interleave_self_attention": true, "norm_type": "ada_norm", "num_layers": 16, "output_dim": 1024, "positional_embeddings": null } } }, "datasets": { "vlm_data": { "dataset_py": "vlm_datasets", "dataformat": "llava_json", "dataset_use": "asv2_conversation_en,asv2_detailed_description_en,asv2_region_captioning_en,coco_internvl_longcap_en,coco_karpathy_train_567_en,coco_negative_gpt4o_en,coco_poetry_zh,coco_rem_en_zh,cocorem_exist_yorn_en,cocotextv2_en,cocotextv2_gpt4o_en,okvqa_en,refcoco_grounding_aug_en,refcoco_grounding_en,tallyqa_coco_en,toloka_grounding_aug_en,vqav2_en,vsr_en", "eval_dataset": "aokvqa_cauldron_llava_format", "data_flatten": false, "base_interval": 2, "max_pixels": 12845056, "min_pixels": 3136, "model_max_length": 2048, "model_type": "qwen2.5vl", "per_device_batch_size": 4 }, "vla_data": { "dataset_py": "lerobot_datasets", "data_root_dir": "playground/Datasets", "data_mix": "BEHAVIOR_challenge", "include_state": true, "per_device_batch_size": 32, 
"load_all_data_for_training": true, "CoT_prompt": "Your task is {instruction}. To identify the key objects for your task, locate their bounding boxes in [x1,y1,x2,y2] format.", "CoT_answer": "bbox", "default_image_resolution": [ 3, 224, 224 ], "task_id": "all" } }, "trainer": { "epochs": 100, "max_train_steps": 100000, "num_warmup_steps": 5000, "save_interval": 10000, "eval_interval": 100, "learning_rate": { "base": 4e-05, "qwen_vl_interface": 1e-05, "action_model": 0.0001 }, "lr_scheduler_type": "cosine_with_min_lr", "scheduler_specific_kwargs": { "min_lr": 1e-06 }, "freeze_modules": true, "loss_scale": { "vla": 1.0, "vlm": 0.1 }, "repeated_diffusion_steps": 4, "max_grad_norm": 1.0, "warmup_ratio": 0.1, "weight_decay": 0.0, "logging_frequency": 100, "gradient_clipping": 1.0, "gradient_accumulation_steps": 1, "optimizer": { "name": "AdamW", "betas": [ 0.9, 0.95 ], "eps": 1e-08, "weight_decay": 1e-08 }, "is_resume": false, "resume_epoch": null, "resume_step": null, "enable_gradient_checkpointing": true, "enable_mixed_precision_training": true }, "output_dir": "./results/Checkpoints/1120_BEHAVIOR_challenge_QwenDual_taskall" }