\documentclass[11pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[margin=1in]{geometry}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{enumitem}
\usepackage{xcolor}
\usepackage{newtxtext}% modern replacement for the obsolete `times' package (per l2tabu)
\usepackage{natbib}
\usepackage{url}
\usepackage{multirow}
\usepackage{tabularx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{hyperref}% hyperref loads last (after natbib/caption) per its documentation
\hypersetup{colorlinks=true,linkcolor=blue!60!black,citecolor=blue!60!black,urlcolor=blue!60!black}
\title{\Large\bfseries ``Let Me Show You'': Understanding How Users Communicate Spatial Intent Through Gesture-Augmented Voice Interfaces}
\author{
\textbf{Hannah Park}\textsuperscript{1}\quad
\textbf{Diego Ramirez}\textsuperscript{2}\quad
\textbf{Amara Okafor}\textsuperscript{1}\quad
\textbf{Jakob Nielsen}\textsuperscript{3}\\[4pt]
\textsuperscript{1}Human-Computer Interaction Institute, Carnegie Mellon University\\
\textsuperscript{2}Department of Informatics, University of California, Irvine\\
\textsuperscript{3}Department of Computer Science, University of Copenhagen\\[2pt]
{\small\texttt{\{hpark,aokafor\}@cs.cmu.edu, [email protected], [email protected]}}
}
\date{}
\begin{document}
\maketitle
\begin{abstract}
Voice assistants increasingly mediate our interactions with smart environments, yet communicating spatial intent---``put it over there'' or ``move this closer''---through voice alone is notoriously difficult. We present a mixed-methods study ($N=48$) investigating how users naturally combine gesture and voice to communicate spatial intent in smart home environments. Through a Wizard-of-Oz study with a gesture-sensing voice assistant, we identify five distinct \emph{spatial communication strategies} that users adopt, ranging from purely verbal descriptions to predominantly gestural commands. We find that users dynamically switch strategies based on referent complexity, spatial precision requirements, and perceived system capabilities. Based on our findings, we develop \textsc{GestureVoice}, a multimodal interaction framework that fuses gesture and speech signals to resolve spatial references. A controlled evaluation ($N=24$) shows that \textsc{GestureVoice} reduces spatial reference errors by 64\% and task completion time by 38\% compared to voice-only interaction. We contribute design implications for multimodal spatial interfaces and a taxonomy of gesture-voice coordination patterns.
\end{abstract}
\section{Introduction}
Voice-controlled smart home devices have become ubiquitous, with over 300 million smart speakers deployed worldwide as of 2025. These devices excel at discrete commands (``turn on the lights'') but struggle with spatial tasks that humans communicate effortlessly through gesture and body language. Asking a voice assistant to ``dim the light in the corner'' or ``move the thermostat setting warmer'' requires precise spatial or parametric language that users find unnatural and error-prone~\citep{porcheron2018voice}.
Human-to-human spatial communication is inherently multimodal. When giving directions, people naturally point, gesture, and use body orientation alongside speech \citep{kendon2004gesture}. The spatial deictic ``there'' is nearly meaningless without an accompanying gesture. Yet current voice interfaces force users into a purely linguistic channel, creating a mismatch between natural communication strategies and system capabilities.
Advances in depth sensing, computer vision, and pose estimation now make it feasible to capture user gestures alongside speech in real-time. However, designing effective gesture-augmented voice interfaces requires understanding how users \emph{actually} coordinate gesture and speech for spatial communication. Prior work has studied gesture-speech coordination in human communication \citep{mcneill1992hand} and in specific interaction contexts like maps \citep{bolt1980put}, but a comprehensive understanding of gesture-voice spatial strategies in smart home contexts is lacking.
This paper addresses two research questions:
\begin{itemize}[nosep,leftmargin=*]
\item[\textbf{RQ1.}] What strategies do users employ when communicating spatial intent through combined gesture and voice in smart home environments?
\item[\textbf{RQ2.}] How can a multimodal system effectively fuse gesture and speech signals to improve spatial interaction accuracy?
\end{itemize}
\subsection{Contributions}
\begin{enumerate}[nosep,leftmargin=*]
\item A taxonomy of five spatial communication strategies derived from observational study of 48 participants.
\item The \textsc{GestureVoice} framework for fusing gesture and speech for spatial reference resolution.
\item Empirical evidence that multimodal spatial interaction reduces errors by 64\% over voice-only baselines.
\item Design implications for gesture-augmented voice interfaces.
\end{enumerate}
\section{Related Work}
\subsection{Spatial Interaction with Voice Assistants}
Research on voice assistant usability has highlighted spatial communication as a key challenge~\citep{luger2016like,porcheron2018voice}. Users frequently abandon spatial tasks or resort to overly verbose descriptions when limited to voice~\citep{myers2018patterns}. \citet{williams2019smart} grounded spatial references through mixed-reality deictic gestures for robot communication, but their approach targets co-located robots rather than ambient smart home devices.
\subsection{Gesture-Speech Integration}
The seminal work of \citet{mcneill1992hand} established that speech and gesture form an integrated communication system. \citet{bolt1980put} demonstrated ``Put-That-There,'' an early multimodal interface combining pointing and speech. \citet{kim2021gesture} later enabled freehand 3D interaction anywhere using a wrist-worn gloveless sensor. Our work extends this line by systematically studying gesture-speech coordination strategies for spatial tasks in smart home contexts.
\subsection{Multimodal Fusion}
Multimodal fusion techniques range from early fusion (concatenating raw signals) to late fusion (combining independent predictions) \citep{baltrusaitis2019multimodal}. Attention-based fusion mechanisms \citep{tsai2019multimodal} can learn dynamic weighting between modalities. We adopt a cross-modal attention approach that leverages the temporal alignment between gesture and speech events.
\section{Study 1: Observational Study}
\subsection{Method}
We conducted a Wizard-of-Oz study in a simulated smart home living room equipped with 4 depth cameras (Intel RealSense D455) and 6 smart devices (2 lights, 1 thermostat, 1 speaker, 1 display, 1 robotic arm).
\paragraph{Participants.} 48 participants (26 female, 20 male, 2 non-binary; ages 19--61, $M=31.4$, $SD=10.2$) were recruited from a university participant pool. All had experience with voice assistants; 35\% used them daily.
\paragraph{Tasks.} Participants completed 18 spatial tasks across three categories:
\begin{itemize}[nosep,leftmargin=*]
\item \textbf{Referential} (6 tasks): Identify a target device or location (``adjust the light near the window'').
\item \textbf{Directional} (6 tasks): Specify movement or change direction (``make the display brighter on the left side'').
\item \textbf{Configurational} (6 tasks): Arrange multiple elements spatially (``set the two corner lights to match'').
\end{itemize}
\paragraph{Procedure.} Participants were told the system could see their gestures and hear their voice. A hidden wizard interpreted participant intent and controlled devices accordingly. Sessions were video-recorded and transcribed.
\paragraph{Analysis.} Two researchers independently coded 864 interaction episodes (48 participants $\times$ 18 tasks) using thematic analysis \citep{braun2006using}, achieving Cohen's $\kappa = 0.84$ after reconciliation.
\subsection{Findings: Spatial Communication Strategies}
We identified five distinct strategies:
\begin{table}[t]
\centering
\caption{Spatial communication strategies and their frequency across task types.}
\label{tab:strategies}
\small
\begin{tabular}{@{}lccc@{}}
\toprule
\textbf{Strategy} & \textbf{Ref.} & \textbf{Dir.} & \textbf{Config.} \\
\midrule
S1: Voice-primary & 31\% & 42\% & 18\% \\
S2: Deictic gesture + voice & 38\% & 21\% & 27\% \\
S3: Iconic gesture + voice & 8\% & 24\% & 33\% \\
S4: Gesture-primary & 15\% & 7\% & 12\% \\
S5: Sequential multimodal & 8\% & 6\% & 10\% \\
\bottomrule
\end{tabular}
\end{table}
\paragraph{S1: Voice-Primary.} Users relied primarily on verbal descriptions with minimal gesture. Most common for directional tasks where spatial language was sufficient (``make it warmer'').
\paragraph{S2: Deictic Gesture + Voice.} Users pointed at targets while providing verbal commands. Dominated referential tasks. Typical pattern: point at device, then speak command (``that one, turn it up'').
\paragraph{S3: Iconic Gesture + Voice.} Users performed representational gestures (e.g., hand rotation for dimming, spreading hands apart for ``bigger''). Most common in configurational tasks requiring continuous parameter adjustment.
\paragraph{S4: Gesture-Primary.} Users performed gestures with minimal speech (e.g., pointing and swiping). Emerged when participants perceived the system understood gesture well.
\paragraph{S5: Sequential Multimodal.} Users alternated between gesture and speech in a turn-taking pattern, using one modality to clarify or refine the other. Often appeared after initial miscommunication.
\subsection{Key Findings}
\begin{enumerate}[nosep,leftmargin=*]
\item \textbf{Strategy switching is dynamic}: 73\% of participants used at least 3 different strategies across tasks.
\item \textbf{Complexity drives gesture}: Gesture use increased with spatial complexity ($r = 0.71$, $p < .001$).
\item \textbf{Temporal coordination matters}: In S2 and S3, gesture onset preceded speech onset by 340\,ms on average ($SD = 180$\,ms), consistent with psycholinguistic findings.
\item \textbf{Repair strategies are multimodal}: When initial commands failed, 82\% of participants switched to a different strategy.
\end{enumerate}
\section{The GestureVoice Framework}
Based on Study 1 findings, we designed \textsc{GestureVoice}, a real-time multimodal framework.
\subsection{Architecture}
The system has three modules:
\paragraph{Gesture Perception.} MediaPipe Holistic extracts hand landmarks, body pose, and pointing direction at 30 FPS. We classify gesture types (deictic, iconic, beat) using a lightweight temporal CNN trained on 2,400 labeled gesture clips from our study.
\paragraph{Speech Processing.} An ASR module (Whisper-large) transcribes speech in real-time. A semantic parser extracts spatial language features: spatial prepositions, demonstratives, directional terms, and device references.
\paragraph{Cross-Modal Fusion.} A cross-attention transformer aligns gesture and speech features using temporal correspondence. The gesture modality attends to relevant speech tokens and vice versa:
\begin{equation}
\mathbf{h}_{\text{fused}} = \text{CrossAttn}(\mathbf{h}_{\text{gesture}}, \mathbf{h}_{\text{speech}}) + \text{CrossAttn}(\mathbf{h}_{\text{speech}}, \mathbf{h}_{\text{gesture}})
\end{equation}
The fused representation is decoded into a structured spatial command: (target device, action, parameters).
\section{Study 2: Controlled Evaluation}
\subsection{Method}
We conducted a within-subjects experiment ($N = 24$) comparing three conditions: (1) voice-only, (2) gesture-only, and (3) \textsc{GestureVoice} (multimodal). Participants completed 12 spatial tasks (4 per condition, counterbalanced).
\subsection{Results}
\begin{table}[t]
\centering
\caption{Study 2 results (means with 95\% CI).}
\label{tab:eval}
\small
\begin{tabular}{@{}lccc@{}}
\toprule
\textbf{Metric} & \textbf{Voice} & \textbf{Gesture} & \textbf{GestureVoice} \\
\midrule
Error rate (\%) & 42.3 & 28.7 & \textbf{15.1} \\
Task time (s) & 18.2 & 14.7 & \textbf{11.3} \\
SUS score & 61.2 & 58.4 & \textbf{79.8} \\
NASA-TLX & 48.3 & 44.1 & \textbf{31.6} \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:eval} shows that \textsc{GestureVoice} significantly outperformed both unimodal conditions. Repeated-measures ANOVA revealed significant main effects of condition on error rate ($F(2,46) = 18.7$, $p < .001$, $\eta_p^2 = .45$), task time ($F(2,46) = 12.3$, $p < .001$, $\eta_p^2 = .35$), and usability ($F(2,46) = 22.1$, $p < .001$, $\eta_p^2 = .49$).
\section{Discussion}
\subsection{Design Implications}
Our findings suggest several implications for gesture-augmented voice interfaces:
\begin{enumerate}[nosep,leftmargin=*]
\item \textbf{Support strategy diversity}: Systems should accommodate all five communication strategies rather than prescribing a single interaction pattern.
\item \textbf{Leverage temporal coordination}: The consistent gesture-before-speech pattern provides a natural attention cue for the system.
\item \textbf{Enable graceful modality fallback}: When one modality is ambiguous, the system should weight the other more heavily rather than failing entirely.
\item \textbf{Provide multimodal feedback}: Users need confirmation that both gesture and speech were received.
\end{enumerate}
\subsection{Limitations}
Our study was conducted in a controlled lab environment. Real homes have varying lighting, occlusions, and background noise. The Wizard-of-Oz study may have created expectations of perfect gesture understanding. Longitudinal deployment studies are needed to assess how strategies evolve over time.
\section{Conclusion}
We presented a comprehensive investigation of gesture-voice spatial communication, contributing a taxonomy of five communication strategies, the \textsc{GestureVoice} multimodal framework, and empirical evidence of its effectiveness. As smart environments become more pervasive, supporting natural multimodal spatial communication will be essential for intuitive and efficient interaction. Our work provides both the theoretical foundations and practical tools to advance this goal.
\section*{Acknowledgments}
This work was supported by NSF grant IIS-2432187 and the CMU Human-Computer Interaction Institute. We thank all study participants and the anonymous reviewers for their valuable feedback.
{\small
\bibliographystyle{plainnat}
\begin{thebibliography}{15}
\bibitem[Baltrusaitis et~al.(2019)]{baltrusaitis2019multimodal}
T.~Baltrusaitis, C.~Ahuja, and L.-P. Morency.
\newblock Multimodal machine learning: A survey and taxonomy.
\newblock \emph{IEEE Trans.\ PAMI}, 41(2):423--443, 2019.
\bibitem[Bolt(1980)]{bolt1980put}
R.~Bolt.
\newblock ``{Put-That-There}'': Voice and gesture at the graphics interface.
\newblock \emph{Computer Graphics}, 14(3):262--270, 1980.
\bibitem[Braun and Clarke(2006)]{braun2006using}
V.~Braun and V.~Clarke.
\newblock Using thematic analysis in psychology.
\newblock \emph{Qualitative Research in Psychology}, 3(2):77--101, 2006.
\bibitem[Kendon(2004)]{kendon2004gesture}
A.~Kendon.
\newblock \emph{Gesture: Visible Action as Utterance}.
\newblock Cambridge University Press, 2004.
\bibitem[Kim et~al.(2012)]{kim2021gesture}
D.~Kim, O.~Hilliges, S.~Izadi, A.~Butler, J.~Chen, I.~Oikonomidis, and P.~Olivier.
\newblock Digits: Freehand 3{D} interactions anywhere using a wrist-worn gloveless sensor.
\newblock In \emph{Proc.\ UIST}, 2012.
\bibitem[Luger and Sellen(2016)]{luger2016like}
E.~Luger and A.~Sellen.
\newblock ``{Like} having a really bad {PA}'': The gulf between user expectation and experience of conversational agents.
\newblock In \emph{Proc.\ CHI}, 2016.
\bibitem[McNeill(1992)]{mcneill1992hand}
D.~McNeill.
\newblock \emph{Hand and Mind: What Gestures Reveal about Thought}.
\newblock University of Chicago Press, 1992.
\bibitem[Myers et~al.(2018)]{myers2018patterns}
C.~Myers, A.~Furqan, J.~Nebolsky, K.~Caro, and J.~Zhu.
\newblock Patterns for how users overcome obstacles in voice user interfaces.
\newblock In \emph{Proc.\ CHI}, 2018.
\bibitem[Porcheron et~al.(2018)]{porcheron2018voice}
M.~Porcheron, J.~Fischer, S.~Reeves, and S.~Sharples.
\newblock Voice interfaces in everyday life.
\newblock In \emph{Proc.\ CHI}, 2018.
\bibitem[Tsai et~al.(2019)]{tsai2019multimodal}
Y.-H.~H. Tsai, S.~Bai, P.~P. Liang, J.~Z. Kolter, L.-P. Morency, and R.~Salakhutdinov.
\newblock Multimodal transformer for unaligned multimodal language sequences.
\newblock In \emph{Proc.\ ACL}, 2019.
\bibitem[Williams et~al.(2019)]{williams2019smart}
T.~Williams, M.~Bussing, S.~Cabrol, E.~Boez, and P.~Hu.
\newblock Mixed reality deictic gesture for multi-modal robot communication.
\newblock In \emph{Proc.\ HRI}, 2019.
\end{thebibliography}
}
\end{document}

% Web-viewer artifact (not part of the document), commented out:
% PDF Preview
% Create an account to compile and preview