Spaces:
Running
Running
| <!doctype html> | |
| <!-- Root element: the doctype above selects standards mode; lang declares the page language for assistive technology and search engines --> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="utf-8"> | |
| <meta name="description" | |
| content="AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems. A high-fidelity benchmark and environment for in-vehicle GUIs with 185 parameterized tasks and reproducible checks."> | |
| <meta name="keywords" content="Automotive-ENV, multimodal agents, vehicle GUI, benchmark, ASURADA"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1"> | |
| <title>AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems</title> | |
| <!-- (Optional) Google Analytics - remove if not needed. Loads gtag.js asynchronously for ID G-PYVRSFMDRL; the inline script below queues the initial commands. --> | |
| <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script> | |
| <script> | |
| window.dataLayer = window.dataLayer || []; /* reuse an existing dataLayer array or start an empty one */ | |
| function gtag(){ dataLayer.push(arguments); } /* queue each call's arguments object on dataLayer */ | |
| gtag('js', new Date()); /* first queued command: 'js' with the current Date */ | |
| gtag('config', 'G-PYVRSFMDRL'); /* second queued command: 'config' with the same ID used in the gtag.js URL */ | |
| </script> | |
| <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet"> | |
| <link rel="stylesheet" href="./static/css/bulma.min.css"> | |
| <link rel="stylesheet" href="./static/css/bulma-carousel.min.css"> | |
| <link rel="stylesheet" href="./static/css/bulma-slider.min.css"> | |
| <link rel="stylesheet" href="./static/css/fontawesome.all.min.css"> | |
| <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"> | |
| <link rel="stylesheet" href="./static/css/index.css"> | |
| <link rel="icon" href="./static/images/favicon.svg"> | |
| <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script> | |
| <script defer src="./static/js/fontawesome.all.min.js"></script> | |
| <script src="./static/js/bulma-carousel.min.js"></script> | |
| <script src="./static/js/bulma-slider.min.js"></script> | |
| <script src="./static/js/index.js"></script> | |
| <style> | |
| /* Base palette: white page, near-black text, muted subtitles, light footer */ | |
| body { | |
| background: #ffffff; | |
| color: #111; | |
| } | |
| .hero { | |
| background: #fff; | |
| } | |
| .publication-title { | |
| letter-spacing: -0.02em; | |
| } | |
| .publication-links .button { | |
| margin: 0 6px 8px; | |
| } | |
| .subtitle { | |
| color: #444; | |
| } | |
| .footer { | |
| background: #fafafa; | |
| } | |
| /* Section headings (title is-3) are centered, matching the hero title */ | |
| .section .title.is-3 { | |
| text-align: center; | |
| } | |
| /* Stretch Bulma's four-fifths column to the full container so text and figures line up with the teaser video */ | |
| .container.is-max-desktop .columns .column.is-four-fifths { | |
| flex: 0 0 100%; | |
| max-width: 100%; | |
| } | |
| /* Media fills its container width and keeps its aspect ratio */ | |
| .hero.teaser video#teaser { | |
| width: 100%; | |
| height: auto; | |
| display: block; | |
| } | |
| /* Figure images additionally get a thin border and rounded corners */ | |
| .system-figure img, | |
| #task-statistics img, | |
| #results-analysis img { | |
| width: 100%; | |
| height: auto; | |
| display: block; | |
| border: 1px solid #eee; | |
| border-radius: 6px; | |
| } | |
| /* Small gap between the overview figures and the description paragraph */ | |
| .system-overview-desc { | |
| margin-top: 12px; | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <!-- Hero header: paper title, author list with affiliation superscripts, and the Paper / Code link buttons --> | |
| <section class="hero"> | |
| <div class="hero-body"> | |
| <div class="container is-max-desktop"> | |
| <div class="columns is-centered"> | |
| <div class="column has-text-centered"> | |
| <h1 class="title is-1 publication-title">AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems</h1> | |
| <div class="is-size-5 publication-authors"> <!-- Superscript numbers refer to the affiliations listed in the next block. NOTE(review): the "*" marker has no legend on the page (presumably equal contribution; confirm) --> | |
| <span class="author-block"><strong>Junfeng Yan</strong><sup>*1</sup>,</span> | |
| <span class="author-block"><strong>Biao Wu</strong><sup>*1</sup>,</span> | |
| <span class="author-block"><strong>Meng Fang</strong><sup>2</sup>,</span> | |
| <span class="author-block"><strong>Ling Chen</strong><sup>1</sup></span> | |
| </div> | |
| <div class="is-size-6 publication-authors" style="margin-top:6px;"> <!-- Affiliations keyed by the superscript numbers above --> | |
| <span class="author-block"><sup>1</sup>Australian Artificial Intelligence Institute, Sydney, Australia</span><br> | |
| <span class="author-block"><sup>2</sup>University of Liverpool, Liverpool, United Kingdom</span> | |
| </div> | |
| <div class="column has-text-centered" style="margin-top:16px;"> <!-- NOTE(review): this .column sits directly inside another .column with no .columns wrapper in between --> | |
| <div class="publication-links"> | |
| <span class="link-block"> | |
| <a href="https://arxiv.org/abs/2509.21143" | |
| class="external-link button is-normal is-rounded is-dark" target="_blank" rel="noopener"> | |
| <span class="icon"><i class="ai ai-arxiv"></i></span> | |
| <span>Paper</span> | |
| </a> | |
| </span> | |
| <span class="link-block"> <!-- The Code button uses the same URL as the footer's "Source for this page" link --> | |
| <a href="https://github.com/automotive-env/AutmotiveEnv.github.io" | |
| class="external-link button is-normal is-rounded is-dark" target="_blank" rel="noopener"> | |
| <span class="icon"><i class="fab fa-github"></i></span> | |
| <span>Code (coming soon)</span> | |
| </a> | |
| </span> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Teaser video (local mp4). Put your video at ./static/videos/demo.mp4. It autoplays muted and looped; `controls` gives viewers a way to pause it. Sizing comes from the `.hero.teaser video#teaser` CSS rule, so no height attribute is set (a percentage is not a valid value for it). --> | |
| <section class="hero teaser"> | |
| <div class="container is-max-desktop"> | |
| <div class="hero-body"> | |
| <video id="teaser" autoplay muted loop playsinline controls> | |
| <source src="./static/videos/demo.mp4" type="video/mp4"> | |
| </video> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- System overview: two stacked images + description --> | |
| <section class="section" id="system-overview"> | |
| <div class="container is-max-desktop"> | |
| <div class="columns is-centered"> | |
| <div class="column is-four-fifths"> | |
| <h2 class="title is-3 has-text-centered">System Overview</h2> | |
| <figure class="system-figure has-text-centered" style="margin-bottom:16px;"> | |
| <img src="./static/images/demo_task.jpg" alt="Automotive-ENV task overview"> | |
| <figcaption class="subtitle is-6" style="margin-top:8px;"> | |
| Task instruction: Open the front windshield defroster, open the rear windshield defroster. | |
| </figcaption> | |
| </figure> | |
| <figure class="system-figure has-text-centered"> | |
| <img src="./static/images/demo_arch.jpg" alt="Automotive-ENV system architecture overview"> | |
| </figure> | |
| <div class="content has-text-justified system-overview-desc"> | |
| <p> | |
| Automotive OS-based environment where the agent observes the accessibility tree, screen, and GPS; | |
| optionally consults GPS-contextualized web knowledge; and acts through screen taps and API calls. | |
| Task success is determined by low-level programmatic checks of system signals. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Abstract --> | |
| <section class="section"> | |
| <div class="container is-max-desktop"> | |
| <div class="columns is-centered has-text-centered"> | |
| <div class="column is-four-fifths"> | |
| <h2 class="title is-3">Abstract</h2> | |
| <div class="content has-text-justified"> | |
| <p> | |
| Multimodal agents have demonstrated strong performance in general GUI interactions, but their | |
| application in automotive systems has been largely unexplored. In-vehicle GUIs present distinct | |
| challenges: drivers’ limited attention, strict safety requirements, and complex location-based | |
| interaction patterns. To address these challenges, we introduce <strong>Automotive-ENV</strong>, | |
| the first high-fidelity benchmark and interaction environment tailored for vehicle GUIs. | |
| </p> | |
| <p> | |
| This platform defines <strong>185 parameterized tasks</strong> spanning explicit control, | |
| implicit intent understanding, and safety-aware tasks, and provides structured multimodal | |
| observations with precise programmatic checks for reproducible evaluation. Building on this | |
| benchmark, we propose <strong>ASURADA</strong>, a geo-aware multimodal agent that integrates | |
| GPS-informed context to dynamically adjust actions based on location, environmental conditions, | |
| and regional driving norms. | |
| </p> | |
| <p> | |
| Experiments show that geo-aware information significantly improves success on safety-aware tasks, | |
| highlighting the importance of location-based context in automotive environments. We will release | |
| Automotive-ENV, complete with all tasks and benchmarking tools, to further the development of | |
| safe and adaptive in-vehicle agents. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Task Statistics and Comparison --> | |
| <section class="section" id="task-statistics"> | |
| <div class="container is-max-desktop"> | |
| <div class="columns is-centered"> | |
| <div class="column is-four-fifths"> | |
| <h2 class="title is-3 has-text-centered">Task Statistics and Comparison</h2> | |
| <div class="content has-text-justified"> | |
| <p> | |
| <strong>Automotive-ENV</strong> contains <em>185 parameterized tasks</em> spanning multiple dimensions: | |
| modalities (screen, accessibility tree, GPS), intent types (explicit control, implicit intent, | |
| safety-aware), and UI primitives (tap, long-press, slider, toggle, text). We report distributions | |
| across these dimensions and across task categories (Maps, HVAC, Road, Phenomenon, Media, Apps, System, Comms). | |
| </p> | |
| </div> | |
| <figure class="system-figure has-text-centered" style="margin-top:12px;"> | |
| <img src="./static/images/static.jpg" alt="Task distributions across dimensions and categories"> | |
| <figcaption class="subtitle is-6" style="margin-top:8px;"> | |
| Task distributions across different dimensions. (a) Distribution of tasks by task dimensions. | |
| (b) Distribution of tasks across task categories (Maps, HVAC, Road, Phenomenon, Media, Apps, System, Comms). | |
| </figcaption> | |
| </figure> | |
| <figure class="system-figure has-text-centered" style="margin-top:18px;"> | |
| <img src="./static/images/task_and_check.jpg" alt="Representative instructions and validation methods"> | |
| <figcaption class="subtitle is-6" style="margin-top:8px;"> | |
| Representative user instructions for in-vehicle tasks, categorized by task type, with corresponding validation methods. | |
| </figcaption> | |
| </figure> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- Results and Analysis (+ Discussion & Conclusion appended at the end) --> | |
| <section class="section" id="results-analysis"> | |
| <div class="container is-max-desktop"> | |
| <div class="columns is-centered"> | |
| <div class="column is-four-fifths"> | |
| <h2 class="title is-3 has-text-centered">Results and Analysis</h2> | |
| <div class="content has-text-justified"> | |
| <p> | |
| We evaluate multiple agent configurations on <strong>Automotive-ENV</strong>, reporting success | |
| rates across General tasks (Explicit Control, Implicit Intent) and Safety-Aware tasks | |
| (Driving Alignment, Environment Alerts). We also analyze the effect of GPS-aware context | |
| on inference token usage and task-wise performance across hotspot categories. | |
| </p> | |
| </div> | |
| <figure class="system-figure has-text-centered" style="margin-top:12px;"> | |
| <img src="./static/images/results.jpg" alt="Success rates of different agent configurations across task groups"> | |
| <figcaption class="subtitle is-6" style="margin-top:8px;"> | |
| Success rates (SR %) of different agent configurations on Automotive-ENV. Results are | |
| reported across General tasks (Explicit Control, Implicit Intent) and Safety-Aware tasks | |
| (Driving Alignment, Environment Alerts). | |
| </figcaption> | |
| </figure> | |
| <figure class="system-figure has-text-centered" style="margin-top:18px;"> <!-- NOTE(review): the src below repeats task_and_check.jpg, already used for the "Representative instructions" figure in the Task Statistics section, while this alt text and caption describe a token-length comparison. Confirm the intended image file. --> | |
| <img src="./static/images/task_and_check.jpg" alt="Token length distributions and task-wise performance with vs. without GPS"> | |
| <figcaption class="subtitle is-6" style="margin-top:8px;"> | |
| Comparison of inference tokens with and without GPS information. Left: distribution of | |
| token lengths. Right: task-wise performance across hotspot categories. | |
| </figcaption> | |
| </figure> | |
| <!-- Discussion --> | |
| <div class="content has-text-justified" style="margin-top:28px;"> | |
| <h3 class="title is-4 has-text-centered">Discussion</h3> | |
| <p> | |
| GPS signals are indispensable for providing geographic context in automotive agents, yet they | |
| are prone to disruptions in real-world environments such as tunnels, underground parking, or dense | |
| urban canyons. These interruptions can cause temporary localization failures, directly undermining | |
| navigation and geo-dependent decision-making. To address this limitation, large language models | |
| (LLMs) can act as virtual sensors by leveraging their built-in knowledge of road networks together | |
| with the last available GPS coordinates and timestamps. During short signal outages, the agent can | |
| simulate intermediate positions and continue offering navigation or context-aware recommendations. | |
| Once connectivity is restored, the simulated trajectory can be aligned with actual positioning | |
| data. This capability highlights the potential of LLMs to complement imperfect sensor signals and | |
| enhance robustness in safety-critical automotive applications. | |
| </p> | |
| </div> | |
| <!-- Conclusion --> | |
| <div class="content has-text-justified" style="margin-top:18px;"> | |
| <h3 class="title is-4 has-text-centered">Conclusion</h3> | |
| <p> | |
| In this work, we present <strong>Automotive-ENV</strong>, the first large-scale benchmark explicitly designed for | |
| evaluating multimodal agents in realistic automotive GUI environments. Unlike desktop or mobile | |
| benchmarks, Automotive-ENV provides structured, reproducible, and geographically parameterized | |
| tasks that capture the complexity of in-vehicle interaction under real-world constraints. Building on | |
| this foundation, we propose <strong>ASURADA</strong>, a geo-adaptive agent capable of integrating GPS location | |
| and contextual signals to deliver safe and personalized actions. Our experiments show that geo-context | |
| integration not only improves task accuracy, especially in safety-critical settings, but also | |
| reduces reasoning overhead by enabling proactive, context-driven planning. Together, Automotive-ENV | |
| and ASURADA establish a foundation for the next generation of in-vehicle assistants that are | |
| multimodal, safety-aware, and culturally adaptive, advancing the reliable deployment of autonomous | |
| agents in high-stakes driving environments. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- BibTeX --> | |
| <section class="section" id="BibTeX"> | |
| <div class="container is-max-desktop content"> | |
| <h2 class="title is-3 has-text-centered">BibTeX</h2> | |
| <pre><code>@misc{yan2025automotiveenvbenchmarkingmultimodalagents, | |
| title={Automotive-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems}, | |
| author={Junfeng Yan and Biao Wu and Meng Fang and Ling Chen}, | |
| year={2025}, | |
| eprint={2509.21143}, | |
| archivePrefix={arXiv}, | |
| primaryClass={cs.RO}, | |
| url={https://arxiv.org/abs/2509.21143} | |
| }</code></pre> | |
| </div> | |
| </section> | |
| <footer class="footer"> | |
| <div class="container"> | |
| <div class="content has-text-centered"> | |
| <p> | |
| This website adapts the open-source <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies/OS-World</a> page framework. | |
| Template code © original authors, used under | |
| <a href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank" rel="noopener">CC BY-SA 4.0</a>. | |
| </p> | |
| <p> | |
| Site content (text, figures, video) © 2025 automotive-env. | |
| Source for this page: <a href="https://github.com/automotive-env/AutmotiveEnv.github.io" target="_blank" rel="noopener">GitHub</a>. | |
| </p> | |
| <p> | |
| <!-- Icon-only links: aria-label supplies the accessible name (title alone is not reliably announced); the icons themselves are decorative --> | |
| <a class="icon-link" href="https://arxiv.org/abs/2509.21143" target="_blank" rel="noopener" title="arXiv" aria-label="Paper on arXiv"> | |
| <i class="ai ai-arxiv" aria-hidden="true"></i> | |
| </a> | |
| <a class="icon-link" href="https://github.com/automotive-env/AutmotiveEnv.github.io" target="_blank" rel="noopener" title="GitHub" aria-label="Page source on GitHub"> | |
| <i class="fab fa-github" aria-hidden="true"></i> | |
| </a> | |
| <a class="icon-link" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank" rel="noopener" title="CC BY-SA 4.0" aria-label="CC BY-SA 4.0 license"> | |
| <i class="fab fa-creative-commons" aria-hidden="true"></i> | |
| </a> | |
| </p> | |
| <p>© 2025 automotive-env — Hosted on GitHub Pages.</p> | |
| </div> | |
| </div> | |
| </footer> | |
| </body> | |
| </html> |