Any-to-Any
Transformers
Safetensors
English
xoron
multimodal
Mixture of Experts
text-to-image
image editing
image to video
text-to-video
video editing
text-to-speech
speech-to-text
speech-to-speech
image-to-text
video-to-text
agentic
tool-use
flow-matching
3d-rope
titok
vidtok
dual-stream-attention
zero-shot-voice-cloning
bigvgan
snake-activation
multi-receptive-field-fusion
custom_code
```json
{
  "components": [
    "llm",
    "vision_encoder",
    "video_encoder",
    "audio_encoder",
    "audio_decoder",
    "projector",
    "audio_projector",
    "cross_attention",
    "generator",
    "video_generator",
    "waveform_decoder",
    "modality_markers"
  ],
  "save_format": "components"
}
```