{ "best_model_checkpoint": "/kaggle/working/xoron-final", "best_metric": 5.891898287038009, "epoch": 4, "epochs_completed": 4, "global_step": 72, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [], "logging_steps": 50, "max_steps": 72, "num_train_epochs": 4, "total_flos": 0, "train_batch_size": 1, "effective_batch_size": 16, "learning_rate": 0.0001, "max_grad_norm": 1.0, "trainable_components": [ "llm", "cross_attention", "modality_markers" ], "frozen_components": [ "vision", "video", "audio", "speech", "image_generation", "video_generation" ], "trial_name": null, "trial_params": null }