File size: 16,860 Bytes
7b60214
2a9cc3f
7b60214
2a9cc3f
 
 
 
 
7b60214
9b4a15e
41bbfbc
2a9cc3f
 
 
 
 
 
 
 
 
41bbfbc
 
 
 
 
 
 
 
 
 
 
 
 
9b4a15e
7b60214
124022a
2a9cc3f
 
 
 
 
 
124022a
ffafe20
124022a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652f2dc
7b60214
 
 
9b4a15e
41bbfbc
2a9cc3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652f2dc
2a9cc3f
 
652f2dc
2a9cc3f
 
 
 
 
 
7b60214
9b4a15e
2a9cc3f
 
 
124022a
96f344f
 
 
 
 
 
 
 
 
 
124022a
a6245b3
2a9cc3f
 
 
 
74fb05d
 
124022a
74fb05d
edbe0aa
74fb05d
 
 
 
124022a
74fb05d
 
a6245b3
 
 
 
 
 
 
7b60214
9b4a15e
2a9cc3f
 
 
41bbfbc
2a9cc3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b4a15e
2a9cc3f
 
 
 
 
b431b3d
 
181cf2b
 
 
124022a
181cf2b
 
 
 
 
 
 
 
 
 
 
b431b3d
181cf2b
 
 
 
 
 
b431b3d
 
 
 
 
 
181cf2b
 
 
 
 
ffafe20
8ca7ca7
 
 
 
124022a
8ca7ca7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffafe20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ca7ca7
 
 
 
 
41bbfbc
2a9cc3f
 
124022a
5b9d522
 
 
 
 
 
 
 
9b4a15e
2a9cc3f
 
 
 
 
 
617dccf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a9cc3f
 
 
7b60214
 
8697998
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Document metadata. The root element declares lang="en" so screen readers, translation
       tools and search engines can identify the language of the page text. -->
  <meta charset="utf-8">
  <meta name="description"
        content="AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems. A high-fidelity benchmark and environment for in-vehicle GUIs with 185 parameterized tasks and reproducible checks.">
  <meta name="keywords" content="Automotive-ENV, multimodal agents, vehicle GUI, benchmark, ASURADA">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems</title>

  <!-- (Optional) Google Analytics - remove if not needed -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    window.dataLayer = window.dataLayer || [];
    function gtag(){ dataLayer.push(arguments); }
    gtag('js', new Date());
    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <!-- Stylesheets: web fonts, Bulma and its plugins, icon fonts, then the site's own rules -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.svg">

  <!-- Scripts are deferred so they do not block HTML parsing. Deferred scripts still run in
       document order, so jQuery is loaded before the scripts listed after it
       (presumably index.js relies on it; that file is not visible here). -->
  <script defer src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script defer src="./static/js/bulma-carousel.min.js"></script>
  <script defer src="./static/js/bulma-slider.min.js"></script>
  <script defer src="./static/js/index.js"></script>

  <style>
    /* Paper-like tuning */
    body { background: #ffffff; color: #111; }
    .hero { background: #fff; }
    .publication-title { letter-spacing: -0.02em; }
    .publication-links .button { margin: 0 6px 8px; }
    .subtitle { color: #444; }
    .footer { background: #fafafa; }

    /* Center all section titles like the hero title */
    .section .title.is-3 { text-align: center; }

    /* Match content width to teaser video width (full container), overriding Bulma's 4/5 column */
    .container.is-max-desktop .columns .column.is-four-fifths {
      flex: 0 0 100%;
      max-width: 100%;
    }

    /* Consistent media scaling */
    .hero.teaser video#teaser { width: 100%; height: auto; display: block; }
    .system-figure img,
    #task-statistics img,
    #results-analysis img { width: 100%; height: auto; display: block; border: 1px solid #eee; border-radius: 6px; }

    /* Gentle spacing under the overview description */
    .system-overview-desc { margin-top: 12px; }
  </style>
</head>
<body>

<!-- Hero: title, authors, affiliations, and the paper/code link buttons.
     The icon glyphs inside the buttons are decorative (each button also carries a text
     label), so they are hidden from assistive technology with aria-hidden. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">

          <h1 class="title is-1 publication-title">AUTOMOTIVE-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems</h1>

          <!-- Authors; superscripts map to the affiliations listed below -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><strong>Junfeng Yan</strong><sup>*1</sup>,</span>
            <span class="author-block"><strong>Biao Wu</strong><sup>*1</sup>,</span>
            <span class="author-block"><strong>Meng Fang</strong><sup>2</sup>,</span>
            <span class="author-block"><strong>Ling Chen</strong><sup>1</sup></span>
          </div>

          <!-- Affiliations -->
          <div class="is-size-6 publication-authors" style="margin-top:6px;">
            <span class="author-block"><sup>1</sup>Australian Artificial Intelligence Institute, Sydney, Australia</span><br>
            <span class="author-block"><sup>2</sup>University of Liverpool, Liverpool, United Kingdom</span>
          </div>

          <!-- External links (open in a new tab) -->
          <div class="column has-text-centered" style="margin-top:16px;">
            <div class="publication-links">
              <span class="link-block">
                <a href="https://arxiv.org/abs/2509.21143"
                   class="external-link button is-normal is-rounded is-dark" target="_blank" rel="noopener">
                  <span class="icon"><i class="ai ai-arxiv" aria-hidden="true"></i></span>
                  <span>Paper</span>
                </a>
              </span>
              <span class="link-block">
                <a href="https://github.com/automotive-env/AutmotiveEnv.github.io"
                   class="external-link button is-normal is-rounded is-dark" target="_blank" rel="noopener">
                  <span class="icon"><i class="fab fa-github" aria-hidden="true"></i></span>
                  <span>Code (coming soon)</span>
                </a>
              </span>
            </div>
          </div>

        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser video (local mp4). Put your video at ./static/videos/demo.mp4 -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- `controls` gives viewers a way to pause the auto-playing loop. Sizing is left to
           the stylesheet rule for video#teaser (width: 100%; height: auto): the height
           attribute only accepts an integer number of pixels, not a percentage. -->
      <video id="teaser" autoplay muted loop playsinline controls>
        <source src="./static/videos/demo.mp4" type="video/mp4">
        <!-- Fallback content, shown only when the browser cannot play the video -->
        Your browser does not support embedded video.
        <a href="./static/videos/demo.mp4">Download the demo video (MP4)</a>.
      </video>
    </div>
  </div>
</section>

<!-- System overview: two stacked images + description.
     Both figures come after the hero and the teaser video in document order, so they are
     lazy-loaded and decoded asynchronously instead of competing with the initial load. -->
<section class="section" id="system-overview">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3 has-text-centered">System Overview</h2>

        <!-- Figure 1: example task with its instruction as the caption -->
        <figure class="system-figure has-text-centered" style="margin-bottom:16px;">
          <img src="./static/images/demo_task.jpg" alt="Automotive-ENV task overview"
               loading="lazy" decoding="async">
          <figcaption class="subtitle is-6" style="margin-top:8px;">
            Task instruction: Open the front windshield defroster, open the rear windshield defroster.
          </figcaption>
        </figure>

        <!-- Figure 2: architecture diagram (no caption; described by the paragraph below) -->
        <figure class="system-figure has-text-centered">
          <img src="./static/images/demo_arch.jpg" alt="Automotive-ENV system architecture overview"
               loading="lazy" decoding="async">
        </figure>

        <div class="content has-text-justified system-overview-desc">
          <p>
            Automotive OS-based environment where the agent observes the accessibility tree, screen, and GPS;
            optionally consults GPS-contextualized web knowledge; and acts through tap screens and API calls.
            Task success is determined by low-level programmatic checks of system signals.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Abstract: three paragraphs (motivation, contributions, findings).
     Source text uses one sentence or clause per line; rendering is unaffected because
     whitespace between words collapses. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Multimodal agents have demonstrated strong performance in general GUI interactions,
            but their application in automotive systems has been largely unexplored.
            In-vehicle GUIs present distinct challenges:
            drivers’ limited attention, strict safety requirements,
            and complex location-based interaction patterns.
            To address these challenges, we introduce <strong>Automotive-ENV</strong>,
            the first high-fidelity benchmark and interaction environment
            tailored for vehicle GUIs.
          </p>
          <p>
            This platform defines <strong>185 parameterized tasks</strong>
            spanning explicit control, implicit intent understanding, and safety-aware tasks,
            and provides structured multimodal observations
            with precise programmatic checks for reproducible evaluation.
            Building on this benchmark, we propose <strong>ASURADA</strong>,
            a geo-aware multimodal agent that integrates GPS-informed context
            to dynamically adjust actions based on location, environmental conditions,
            and regional driving norms.
          </p>
          <p>
            Experiments show that geo-aware information
            significantly improves success on safety-aware tasks,
            highlighting the importance of location-based context in automotive environments.
            We will release Automotive-ENV,
            complete with all tasks and benchmarking tools,
            to further the development of safe and adaptive in-vehicle agents.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Task Statistics and Comparison: summary paragraph followed by two figures.
     The figures are far below the fold, so they are lazy-loaded and decoded asynchronously. -->
<section class="section" id="task-statistics">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3 has-text-centered">Task Statistics and Comparison</h2>

        <div class="content has-text-justified">
          <p>
            <strong>Automotive-ENV</strong> contains <em>185 parameterized tasks</em> spanning multiple dimensions:
            modalities (screen, accessibility tree, GPS), intent types (explicit control, implicit intent,
            safety-aware), and UI primitives (tap, long-press, slider, toggle, text). We report distributions
            across these dimensions and across task categories (Maps, HVAC, Road, Phenomenon, Media, Apps, System, Comms).
          </p>
        </div>

        <!-- Figure: task distribution charts -->
        <figure class="system-figure has-text-centered" style="margin-top:12px;">
          <img src="./static/images/static.jpg" alt="Task distributions across dimensions and categories"
               loading="lazy" decoding="async">
          <figcaption class="subtitle is-6" style="margin-top:8px;">
            Task distributions across different dimensions. (a) Distribution of tasks by task dimensions.
            (b) Distribution of tasks across task categories (Maps, HVAC, Road, Phenomenon, Media, Apps, System, Comms).
          </figcaption>
        </figure>

        <!-- Figure: example instructions paired with their validation methods -->
        <figure class="system-figure has-text-centered" style="margin-top:18px;">
          <img src="./static/images/task_and_check.jpg" alt="Representative instructions and validation methods"
               loading="lazy" decoding="async">
          <figcaption class="subtitle is-6" style="margin-top:8px;">
            Representative user instructions for in-vehicle tasks, categorized by task type, with corresponding validation methods.
          </figcaption>
        </figure>
      </div>
    </div>
  </div>
</section>

<!-- Results and Analysis (+ Discussion & Conclusion appended at the end).
     The figures are far below the fold, so they are lazy-loaded and decoded asynchronously. -->
<section class="section" id="results-analysis">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3 has-text-centered">Results and Analysis</h2>

        <div class="content has-text-justified">
          <p>
            We evaluate multiple agent configurations on <strong>Automotive-ENV</strong>, reporting success
            rates across General tasks (Explicit Control, Implicit Intent) and Safety-Aware tasks
            (Driving Alignment, Environment Alerts). We also analyze the effect of GPS-aware context
            on inference token usage and task-wise performance across hotspot categories.
          </p>
        </div>

        <!-- Figure: success-rate table -->
        <figure class="system-figure has-text-centered" style="margin-top:12px;">
          <img src="./static/images/results.jpg" alt="Success rates of different agent configurations across task groups"
               loading="lazy" decoding="async">
          <figcaption class="subtitle is-6" style="margin-top:8px;">
            Success rates (SR %) of different agent configurations on Automotive-ENV. Results are
            reported across General tasks (Explicit Control, Implicit Intent) and Safety-Aware tasks
            (Driving Alignment, Environment Alerts).
          </figcaption>
        </figure>

        <!-- NOTE(review): this figure points at ./static/images/task_and_check.jpg, the same
             file used for the "Representative instructions" figure in #task-statistics, while
             its alt text and caption describe a token-length/GPS comparison. The src looks like
             a copy-paste leftover — confirm the intended image file and update it. -->
        <figure class="system-figure has-text-centered" style="margin-top:18px;">
          <img src="./static/images/task_and_check.jpg" alt="Token length distributions and task-wise performance with vs. without GPS"
               loading="lazy" decoding="async">
          <figcaption class="subtitle is-6" style="margin-top:8px;">
            Comparison of inference tokens with and without GPS information. Left: distribution of
            token lengths. Right: task-wise performance across hotspot categories.
          </figcaption>
        </figure>

        <!-- Discussion -->
        <div class="content has-text-justified" style="margin-top:28px;">
          <h3 class="title is-4 has-text-centered">Discussion</h3>
          <p>
            GPS signals are indispensable for providing geographic context in automotive agents, yet they
            are prone to disruptions in real-world environments such as tunnels, underground parking, or dense
            urban canyons. These interruptions can cause temporary localization failures, directly undermining
            navigation and geo-dependent decision-making. To address this limitation, large language models
            (LLMs) can act as virtual sensors by leveraging their built-in knowledge of road networks together
            with the last available GPS coordinates and timestamps. During short signal outages, the agent can
            simulate intermediate positions and continue offering navigation or context-aware recommendations.
            Once connectivity is restored, the simulated trajectory can be aligned with actual positioning
            data. This capability highlights the potential of LLMs to complement imperfect sensor signals and
            enhance robustness in safety-critical automotive applications.
          </p>
        </div>

        <!-- Conclusion -->
        <div class="content has-text-justified" style="margin-top:18px;">
          <h3 class="title is-4 has-text-centered">Conclusion</h3>
          <p>
            In this work, we present <strong>Automotive-ENV</strong>, the first large-scale benchmark explicitly designed for
            evaluating multimodal agents in realistic automotive GUI environments. Unlike desktop or mobile
            benchmarks, Automotive-ENV provides structured, reproducible, and geographically parameterized
            tasks that capture the complexity of in-vehicle interaction under real-world constraints. Building on
            this foundation, we propose <strong>ASURADA</strong>, a geo-adaptive agent capable of integrating GPS location
            and contextual signals to deliver safe and personalized actions. Our experiments show that geo-context
            integration not only improves task accuracy, especially in safety-critical settings, but also
            reduces reasoning overhead by enabling proactive, context-driven planning. Together, Automotive-ENV
            and ASURADA establish a foundation for the next generation of in-vehicle assistants that are
            multimodal, safety-aware, and culturally adaptive, advancing the reliable deployment of autonomous
            agents in high-stakes driving environments.
          </p>
        </div>

      </div>
    </div>
  </div>
</section>

<!-- BibTeX: citation entry for the arXiv paper linked from the hero section -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title is-3 has-text-centered">BibTeX</h2>
    <!-- Whitespace inside <pre> is rendered verbatim: keep the entry flush against the opening
         tag and keep its two-space field indentation so the copied citation stays clean. -->
    <pre><code>@misc{yan2025automotiveenvbenchmarkingmultimodalagents,
  title={Automotive-ENV: Benchmarking Multimodal Agents in Vehicle Interface Systems},
  author={Junfeng Yan and Biao Wu and Meng Fang and Ling Chen},
  year={2025},
  eprint={2509.21143},
  archivePrefix={arXiv},
  primaryClass={cs.RO},
  url={https://arxiv.org/abs/2509.21143}
}</code></pre>
  </div>
</section>

<!-- Footer: template attribution, content copyright, and icon links.
     The icon-only links carry an aria-label because a title attribute alone is not a
     reliable accessible name; the icon glyphs themselves are hidden from assistive technology. -->
<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">
      <p>
        This website adapts the open-source <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies/OS-World</a> page framework.
        Template code © original authors, used under
        <a href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank" rel="noopener">CC BY-SA 4.0</a>.
      </p>
      <p>
        Site content (text, figures, video) © 2025 automotive-env.
        Source for this page: <a href="https://github.com/automotive-env/AutmotiveEnv.github.io" target="_blank" rel="noopener">GitHub</a>.
      </p>
      <p>
        <a class="icon-link" href="https://arxiv.org/abs/2509.21143" target="_blank" rel="noopener" title="arXiv" aria-label="arXiv">
          <i class="ai ai-arxiv" aria-hidden="true"></i>
        </a>
        <a class="icon-link" href="https://github.com/automotive-env/AutmotiveEnv.github.io" target="_blank" rel="noopener" title="GitHub" aria-label="GitHub">
          <i class="fab fa-github" aria-hidden="true"></i>
        </a>
        <a class="icon-link" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank" rel="noopener" title="CC BY-SA 4.0" aria-label="CC BY-SA 4.0">
          <i class="fab fa-creative-commons" aria-hidden="true"></i>
        </a>
      </p>
      <p>© 2025 automotive-env — Hosted on GitHub Pages.</p>
    </div>
  </div>
</footer>

</body>
</html>