Spaces:

madmax3366
/

AutoEnv_demo

Running

App Files Files Community

AutoEnv_demo / index.html

madmax3366

Update index.html

ffafe20 verified about 1 month ago

raw

history blame contribute delete

16.9 kB

	<!DOCTYPE html>
	<!-- lang declares the language of the page text for screen readers and translation tools. -->
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<meta name="description"
	content="AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems. A high-fidelity benchmark and environment for in-vehicle GUIs with 185 parameterized tasks and reproducible checks.">
	<meta name="keywords" content="Automotive-ENV, multimodal agents, vehicle GUI, benchmark, ASURADA">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<title>AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems</title>

	<!-- (Optional) Google Analytics via gtag.js - remove both script tags below if not needed.
	     NOTE(review): confirm G-PYVRSFMDRL is this project's own measurement ID and not one inherited from the page template. -->
	<script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
	<script>
	// Reuse the queue if it already exists; otherwise start with an empty array.
	window.dataLayer = window.dataLayer || [];
	// Pushes its arguments object onto the dataLayer queue.
	function gtag(){ dataLayer.push(arguments); }
	gtag('js', new Date());
	gtag('config', 'G-PYVRSFMDRL');
	</script>

	<!-- Google Fonts: the three families are joined with "|", written percent-encoded as %7C so the URL stays valid. -->
	<link href="https://fonts.googleapis.com/css?family=Google+Sans%7CNoto+Sans%7CCastoro" rel="stylesheet">
	<!-- Local Bulma, Bulma plugin, and Font Awesome styles, then Academicons from jsDelivr, then page-specific rules. -->
	<link rel="stylesheet" href="./static/css/bulma.min.css">
	<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
	<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
	<link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
	<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
	<link rel="stylesheet" href="./static/css/index.css">
	<link rel="icon" href="./static/images/favicon.svg">

	<!-- Scripts: jQuery comes from the Google CDN; the rest are local files under ./static/js.
	     Only the Font Awesome script is deferred; the others load and run in the order listed.
	     NOTE(review): index.js is listed last, presumably because it relies on jQuery and the Bulma plugins above - confirm before reordering. -->
	<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
	<script defer src="./static/js/fontawesome.all.min.js"></script>
	<script src="./static/js/bulma-carousel.min.js"></script>
	<script src="./static/js/bulma-slider.min.js"></script>
	<script src="./static/js/index.js"></script>

	<style>
	/* ---- Base look: white page, near-black text, muted subtitles ---- */
	body {
	  background: #ffffff;
	  color: #111;
	}
	.hero {
	  background: #fff;
	}
	.publication-title {
	  letter-spacing: -0.02em;
	}
	.publication-links .button {
	  margin: 0 6px 8px;
	}
	.subtitle {
	  color: #444;
	}
	.footer {
	  background: #fafafa;
	}

	/* ---- Section headings (is-3 titles) are centered ---- */
	.section .title.is-3 {
	  text-align: center;
	}

	/* ---- Four-fifths columns are stretched to the full container width,
	        so section content lines up with the teaser video (overrides Bulma's 4/5 column) ---- */
	.container.is-max-desktop .columns .column.is-four-fifths {
	  flex: 0 0 100%;
	  max-width: 100%;
	}

	/* ---- Media: fill the available width and keep the aspect ratio ---- */
	.hero.teaser video#teaser {
	  width: 100%;
	  height: auto;
	  display: block;
	}
	/* Figures additionally get a light border with rounded corners */
	.system-figure img,
	#task-statistics img,
	#results-analysis img {
	  width: 100%;
	  height: auto;
	  display: block;
	  border: 1px solid #eee;
	  border-radius: 6px;
	}

	/* ---- Small gap above the system overview description ---- */
	.system-overview-desc {
	  margin-top: 12px;
	}
	</style>
	</head>
	<body>

	<!-- Hero: title, authors, links -->
	<section class="hero">
	<div class="hero-body">
	<div class="container is-max-desktop">
	<div class="columns is-centered">
	<div class="column has-text-centered">

	<h1 class="title is-1 publication-title">AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems</h1>

	<div class="is-size-5 publication-authors">
	<span class="author-block"><strong>Junfeng Yan</strong><sup>*1</sup>,</span>
	<span class="author-block"><strong>Biao Wu</strong><sup>*1</sup>,</span>
	<span class="author-block"><strong>Meng Fang</strong><sup>2</sup>,</span>
	<span class="author-block"><strong>Ling Chen</strong><sup>1</sup></span>
	</div>

	<div class="is-size-6 publication-authors" style="margin-top:6px;">
	<span class="author-block"><sup>1</sup>Australian Artificial Intelligence Institute, Sydney, Australia</span><br>
	<span class="author-block"><sup>2</sup>University of Liverpool, Liverpool, United Kingdom</span>
	</div>

	<div class="column has-text-centered" style="margin-top:16px;">
	<div class="publication-links">
	<!-- Each button has visible text, so the icon glyphs are decorative and hidden from assistive technology. -->
	<span class="link-block">
	<a href="https://arxiv.org/abs/2509.21143"
	class="external-link button is-normal is-rounded is-dark" target="_blank" rel="noopener">
	<span class="icon"><i class="ai ai-arxiv" aria-hidden="true"></i></span>
	<span>Paper</span>
	</a>
	</span>
	<span class="link-block">
	<a href="https://github.com/automotive-env/AutmotiveEnv.github.io"
	class="external-link button is-normal is-rounded is-dark" target="_blank" rel="noopener">
	<span class="icon"><i class="fab fa-github" aria-hidden="true"></i></span>
	<span>Code (coming soon)</span>
	</a>
	</span>
	</div>
	</div>

	</div>
	</div>
	</div>
	</div>
	</section>

	<!-- Teaser video (local mp4). Put your video at ./static/videos/demo.mp4.
	     Sizing comes from the ".hero.teaser video#teaser" CSS rule in the page head; the height attribute
	     only accepts a plain integer (not a percentage), so it is not set here. -->
	<section class="hero teaser">
	<div class="container is-max-desktop">
	<div class="hero-body">
	<video id="teaser" autoplay muted loop playsinline>
	<source src="./static/videos/demo.mp4" type="video/mp4">
	</video>
	</div>
	</div>
	</section>

	<!-- System overview: two stacked images + description -->
	<section class="section" id="system-overview">
	<div class="container is-max-desktop">
	<div class="columns is-centered">
	<div class="column is-four-fifths">
	<h2 class="title is-3 has-text-centered">System Overview</h2>

	<figure class="system-figure has-text-centered" style="margin-bottom:16px;">
	<img src="./static/images/demo_task.jpg" alt="Automotive-ENV task overview">
	<figcaption class="subtitle is-6" style="margin-top:8px;">
	Task instruction: Open the front windshield defroster, open the rear windshield defroster.
	</figcaption>
	</figure>

	<figure class="system-figure has-text-centered">
	<img src="./static/images/demo_arch.jpg" alt="Automotive-ENV system architecture overview">
	</figure>

	<div class="content has-text-justified system-overview-desc">
	<p>
	Automotive OS-based environment where the agent observes the accessibility tree, screen, and GPS;
	optionally consults GPS-contextualized web knowledge; and acts through screen taps and API calls.
	Task success is determined by low-level programmatic checks of system signals.
	</p>
	</div>
	</div>
	</div>
	</div>
	</section>

	<!-- Abstract -->
	<section class="section">
	<div class="container is-max-desktop">
	<div class="columns is-centered has-text-centered">
	<div class="column is-four-fifths">
	<h2 class="title is-3">Abstract</h2>
	<div class="content has-text-justified">
	<p>
	Multimodal agents have demonstrated strong performance in general GUI interactions, but their
	application in automotive systems has been largely unexplored. In-vehicle GUIs present distinct
	challenges: drivers’ limited attention, strict safety requirements, and complex location-based
	interaction patterns. To address these challenges, we introduce <strong>Automotive-ENV</strong>,
	the first high-fidelity benchmark and interaction environment tailored for vehicle GUIs.
	</p>
	<p>
	This platform defines <strong>185 parameterized tasks</strong> spanning explicit control,
	implicit intent understanding, and safety-aware tasks, and provides structured multimodal
	observations with precise programmatic checks for reproducible evaluation. Building on this
	benchmark, we propose <strong>ASURADA</strong>, a geo-aware multimodal agent that integrates
	GPS-informed context to dynamically adjust actions based on location, environmental conditions,
	and regional driving norms.
	</p>
	<p>
	Experiments show that geo-aware information significantly improves success on safety-aware tasks,
	highlighting the importance of location-based context in automotive environments. We will release
	Automotive-ENV, complete with all tasks and benchmarking tools, to further the development of
	safe and adaptive in-vehicle agents.
	</p>
	</div>
	</div>
	</div>
	</div>
	</section>

	<!-- Task Statistics and Comparison -->
	<section class="section" id="task-statistics">
	<div class="container is-max-desktop">
	<div class="columns is-centered">
	<div class="column is-four-fifths">
	<h2 class="title is-3 has-text-centered">Task Statistics and Comparison</h2>

	<div class="content has-text-justified">
	<p>
	<strong>Automotive-ENV</strong> contains <em>185 parameterized tasks</em> spanning multiple dimensions:
	modalities (screen, accessibility tree, GPS), intent types (explicit control, implicit intent,
	safety-aware), and UI primitives (tap, long-press, slider, toggle, text). We report distributions
	across these dimensions and across task categories (Maps, HVAC, Road, Phenomenon, Media, Apps, System, Comms).
	</p>
	</div>

	<figure class="system-figure has-text-centered" style="margin-top:12px;">
	<img src="./static/images/static.jpg" alt="Task distributions across dimensions and categories">
	<figcaption class="subtitle is-6" style="margin-top:8px;">
	Task distributions across different dimensions. (a) Distribution of tasks by task dimensions.
	(b) Distribution of tasks across task categories (Maps, HVAC, Road, Phenomenon, Media, Apps, System, Comms).
	</figcaption>
	</figure>

	<figure class="system-figure has-text-centered" style="margin-top:18px;">
	<img src="./static/images/task_and_check.jpg" alt="Representative instructions and validation methods">
	<figcaption class="subtitle is-6" style="margin-top:8px;">
	Representative user instructions for in-vehicle tasks, categorized by task type, with corresponding validation methods.
	</figcaption>
	</figure>
	</div>
	</div>
	</div>
	</section>

	<!-- Results and Analysis (+ Discussion & Conclusion appended at the end) -->
	<section class="section" id="results-analysis">
	<div class="container is-max-desktop">
	<div class="columns is-centered">
	<div class="column is-four-fifths">
	<h2 class="title is-3 has-text-centered">Results and Analysis</h2>

	<div class="content has-text-justified">
	<p>
	We evaluate multiple agent configurations on <strong>Automotive-ENV</strong>, reporting success
	rates across General tasks (Explicit Control, Implicit Intent) and Safety-Aware tasks
	(Driving Alignment, Environment Alerts). We also analyze the effect of GPS-aware context
	on inference token usage and task-wise performance across hotspot categories.
	</p>
	</div>

	<figure class="system-figure has-text-centered" style="margin-top:12px;">
	<img src="./static/images/results.jpg" alt="Success rates of different agent configurations across task groups">
	<figcaption class="subtitle is-6" style="margin-top:8px;">
	Success rates (SR %) of different agent configurations on Automotive-ENV. Results are
	reported across General tasks (Explicit Control, Implicit Intent) and Safety-Aware tasks
	(Driving Alignment, Environment Alerts).
	</figcaption>
	</figure>

	<figure class="system-figure has-text-centered" style="margin-top:18px;">
	<!-- NOTE(review): this src is the same file used by the "Representative instructions" figure in the
	     task-statistics section, yet the alt text and caption here describe a token-length / GPS comparison.
	     Confirm which image file belongs in this figure. -->
	<img src="./static/images/task_and_check.jpg" alt="Token length distributions and task-wise performance with vs. without GPS">
	<figcaption class="subtitle is-6" style="margin-top:8px;">
	Comparison of inference tokens with and without GPS information. Left: distribution of
	token lengths. Right: task-wise performance across hotspot categories.
	</figcaption>
	</figure>

	<!-- Discussion -->
	<div class="content has-text-justified" style="margin-top:28px;">
	<h3 class="title is-4 has-text-centered">Discussion</h3>
	<p>
	GPS signals are indispensable for providing geographic context in automotive agents, yet they
	are prone to disruptions in real-world environments such as tunnels, underground parking, or dense
	urban canyons. These interruptions can cause temporary localization failures, directly undermining
	navigation and geo-dependent decision-making. To address this limitation, large language models
	(LLMs) can act as virtual sensors by leveraging their built-in knowledge of road networks together
	with the last available GPS coordinates and timestamps. During short signal outages, the agent can
	simulate intermediate positions and continue offering navigation or context-aware recommendations.
	Once connectivity is restored, the simulated trajectory can be aligned with actual positioning
	data. This capability highlights the potential of LLMs to complement imperfect sensor signals and
	enhance robustness in safety-critical automotive applications.
	</p>
	</div>

	<!-- Conclusion -->
	<div class="content has-text-justified" style="margin-top:18px;">
	<h3 class="title is-4 has-text-centered">Conclusion</h3>
	<p>
	In this work, we present <strong>Automotive-ENV</strong>, the first large-scale benchmark explicitly designed for
	evaluating multimodal agents in realistic automotive GUI environments. Unlike desktop or mobile
	benchmarks, Automotive-ENV provides structured, reproducible, and geographically parameterized
	tasks that capture the complexity of in-vehicle interaction under real-world constraints. Building on
	this foundation, we propose <strong>ASURADA</strong>, a geo-adaptive agent capable of integrating GPS location
	and contextual signals to deliver safe and personalized actions. Our experiments show that geo-context
	integration not only improves task accuracy, especially in safety-critical settings, but also
	reduces reasoning overhead by enabling proactive, context-driven planning. Together, Automotive-ENV
	and ASURADA establish a foundation for the next generation of in-vehicle assistants that are
	multimodal, safety-aware, and culturally adaptive, advancing the reliable deployment of autonomous
	agents in high-stakes driving environments.
	</p>
	</div>

	</div>
	</div>
	</div>
	</section>

	<!-- BibTeX -->
	<section class="section" id="BibTeX">
	<div class="container is-max-desktop content">
	<h2 class="title is-3 has-text-centered">BibTeX</h2>
	<pre><code>@misc{yan2025automotiveenvbenchmarkingmultimodalagents,
	title={Automotive-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems},
	author={Junfeng Yan and Biao Wu and Meng Fang and Ling Chen},
	year={2025},
	eprint={2509.21143},
	archivePrefix={arXiv},
	primaryClass={cs.RO},
	url={https://arxiv.org/abs/2509.21143}
	}</code></pre>
	</div>
	</section>

	<footer class="footer">
	<div class="container">
	<div class="content has-text-centered">
	<p>
	This website adapts the open-source <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies/OS-World</a> page framework.
	Template code © original authors, used under
	<a href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank" rel="noopener">CC BY-SA 4.0</a>.
	</p>
	<p>
	Site content (text, figures, video) © 2025 automotive-env.
	Source for this page: <a href="https://github.com/automotive-env/AutmotiveEnv.github.io" target="_blank" rel="noopener">GitHub</a>.
	</p>
	<p>
	<!-- Icon-only links: aria-label supplies the accessible name; the icon glyphs are hidden from assistive technology. -->
	<a class="icon-link" href="https://arxiv.org/abs/2509.21143" target="_blank" rel="noopener" title="arXiv" aria-label="Paper on arXiv">
	<i class="ai ai-arxiv" aria-hidden="true"></i>
	</a>
	<a class="icon-link" href="https://github.com/automotive-env/AutmotiveEnv.github.io" target="_blank" rel="noopener" title="GitHub" aria-label="Source on GitHub">
	<i class="fab fa-github" aria-hidden="true"></i>
	</a>
	<a class="icon-link" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank" rel="noopener" title="CC BY-SA 4.0" aria-label="CC BY-SA 4.0 license">
	<i class="fab fa-creative-commons" aria-hidden="true"></i>
	</a>
	</p>
	<p>© 2025 automotive-env — Hosted on GitHub Pages.</p>
	</div>
	</div>
	</footer>

	</body>
	</html>