\documentclass[conference]{IEEEtran}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{enumitem}
\usepackage{xcolor}
\usepackage{cite}
\usepackage{url}
\usepackage{microtype}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{multirow}
\usepackage{balance}
\usepackage{hyperref} % load hyperref last, per its documentation
\title{Hierarchical Affordance-Guided Navigation for\\Mobile Manipulation in Unstructured Environments}
\author{
\IEEEauthorblockN{
Maria Gonzalez\IEEEauthorrefmark{1},
Takeshi Saito\IEEEauthorrefmark{2},
Felix Obermeyer\IEEEauthorrefmark{3},
and Nadia Karam\IEEEauthorrefmark{1}
}
\IEEEauthorblockA{
\IEEEauthorrefmark{1}Robotics Institute, Carnegie Mellon University, Pittsburgh, PA, USA\\
\IEEEauthorrefmark{2}Department of Mechano-Informatics, University of Tokyo, Japan\\
\IEEEauthorrefmark{3}Autonomous Systems Lab, ETH Z\"urich, Switzerland\\
\texttt{\{mgonzalez,nkaram\}@andrew.cmu.edu, [email protected], [email protected]}
}
}
\begin{document}
\maketitle
\begin{abstract}
Mobile manipulation in unstructured environments requires robots to jointly reason about navigation and manipulation---where to position the base for effective grasping, how to approach objects in cluttered spaces, and when to reposition if initial attempts fail. We present \textsc{AffordNav}, a hierarchical planning framework that uses learned affordance maps to jointly optimize navigation and manipulation strategies. At the high level, an affordance-aware planner identifies candidate base poses by predicting manipulation success probability from visual observations. At the low level, a local policy executes combined navigation-manipulation motions using a learned value function over the robot's configuration space. We train the system end-to-end using a combination of simulated experience in procedurally generated cluttered environments and a small amount of real-world data via sim-to-real transfer. In real-world experiments on a Fetch mobile manipulator, \textsc{AffordNav} achieves an 84.2\% success rate on tabletop pick-and-place tasks in cluttered environments, outperforming a sequential navigate-then-manipulate baseline (61.7\%) and an affordance-unaware joint planning baseline (72.5\%). We demonstrate generalization to novel objects, environments, and multi-step manipulation sequences.
\end{abstract}
\section{Introduction}
Mobile manipulation---the combination of locomotion and dexterous manipulation---is essential for robots operating in human environments. Tasks as simple as fetching a cup from a cluttered kitchen counter require the robot to navigate to an appropriate position, reach through or around obstacles, grasp the target object, and transport it to a goal location. The tight coupling between navigation and manipulation decisions makes these tasks particularly challenging.
A fundamental question in mobile manipulation is: \emph{where should the robot stand?} The base pose determines which objects are reachable, what grasp configurations are feasible, and whether the arm can avoid obstacles. Traditional approaches decouple this into separate navigation and manipulation phases: first navigate to a pre-specified or heuristically chosen position, then plan a manipulation trajectory. This decoupling often leads to base poses that are suboptimal for manipulation, requiring costly repositioning or failing outright when the selected position provides no feasible grasp.
We propose \textsc{AffordNav}, a hierarchical framework that jointly optimizes navigation and manipulation through learned affordance predictions. The key insight is that \emph{visual affordances}---predictions of where and how an object can be manipulated---provide a natural bridge between navigation and manipulation planning. By predicting manipulation affordances from the robot's current observation, we can evaluate candidate base poses before committing to navigation, selecting positions that maximize manipulation success probability.
\subsection{Contributions}
\begin{itemize}[nosep,leftmargin=*]
\item A hierarchical planning framework that uses affordance predictions to jointly optimize navigation and manipulation for mobile robots.
\item An affordance mapping module that predicts manipulation success as a function of base pose from visual observations.
\item A sim-to-real transfer pipeline using domain randomization and a small amount of real-world fine-tuning data.
\item Extensive real-world experiments demonstrating an 84.2\% success rate in cluttered environments, a 22.5-percentage-point improvement over a sequential navigate-then-manipulate baseline.
\end{itemize}
\section{Related Work}
\subsection{Mobile Manipulation}
Classical approaches to mobile manipulation plan base placement and arm trajectories separately~\cite{stilman2007navigation}. Optimization-based methods jointly plan base and arm motions but are computationally expensive for real-time operation~\cite{haviland2022holistic}. Learning-based approaches have shown promise: Xia et al.~\cite{xia2021relmogen} learn navigation policies conditioned on manipulation goals, while Yokoyama et al.~\cite{yokoyama2023adaptive} propose adaptive skill coordination. Our work differs by explicitly learning affordance maps that guide base placement decisions.
\subsection{Affordance Learning}
Affordance prediction has a rich history in robotics, from Gibson's ecological psychology~\cite{gibson1979ecological} to modern deep learning approaches. Mandikal and Grauman~\cite{mandikal2022dexvip} predict visual affordances for dexterous grasping. Nagarajan et al.~\cite{nagarajan2020learning} learn affordances from human interaction videos. We extend affordance prediction to the mobile manipulation setting, predicting manipulation feasibility as a function of the robot's base configuration.
\subsection{Sim-to-Real Transfer}
Domain randomization \cite{tobin2017domain} and system identification \cite{chebotar2019closing} are common sim-to-real strategies. Recent work on visual sim-to-real transfer \cite{james2019sim} achieves zero-shot transfer for manipulation tasks. We combine domain randomization with a small real-world fine-tuning step, which we find critical for handling real-world clutter and lighting variation.
\section{Method}
\subsection{Problem Formulation}
We consider a mobile manipulator with base configuration $q_b \in SE(2)$ and arm configuration $q_a \in \mathbb{R}^7$. Given an RGB-D observation $\mathcal{O}$, a target object specification $o^*$, and a goal location $g$, the robot must:
\begin{enumerate}[nosep]
\item Navigate to a base pose suitable for grasping $o^*$.
\item Grasp $o^*$ while avoiding obstacles.
\item Transport $o^*$ to goal location $g$.
\end{enumerate}
\subsection{System Overview}
\textsc{AffordNav} operates at two levels:
\paragraph{High-Level Affordance Planner.}
Given the current observation $\mathcal{O}$, the affordance planner predicts a manipulation success map $M : SE(2) \to [0, 1]$ that estimates the probability of successful manipulation from each candidate base pose. The robot navigates to:
\begin{equation}
q_b^* = \arg\max_{q_b \in \mathcal{Q}_{\text{free}}} \big[ M(q_b) - \lambda \cdot c(q_b^{\text{current}}, q_b) \big]
\end{equation}
where $\mathcal{Q}_{\text{free}}$ is the collision-free configuration space, $c(\cdot,\cdot)$ is the navigation cost, and $\lambda$ balances manipulation success against navigation efficiency.
\paragraph{Low-Level Manipulation Policy.}
Once positioned, a learned manipulation policy $\pi_a(a_t | \mathcal{O}_t, q_b)$ generates arm actions conditioned on the current observation and base pose. We use a residual policy architecture that combines a motion planning prior with learned corrections:
\begin{equation}
a_t = a_t^{\text{plan}} + \Delta a_t^{\text{learned}}
\end{equation}
\subsection{Affordance Map Learning}
The affordance map $M$ is parameterized as a neural network $M_\theta$ that takes as input:
\begin{itemize}[nosep]
\item A top-down occupancy map of the environment.
\item Target object features extracted from the RGB-D observation.
\item Candidate base pose $q_b$ encoded as a spatial feature.
\end{itemize}
The architecture uses a U-Net backbone operating on the occupancy map with cross-attention to object features. The output is a dense prediction of manipulation success probability at each spatial location and orientation.
\paragraph{Training.}
We train $M_\theta$ in simulation using procedurally generated environments:
\begin{equation}
\mathcal{L}_M = -\frac{1}{|\mathcal{D}|}\sum_{(q_b, y) \in \mathcal{D}} \big[y \log M_\theta(q_b) + (1-y)\log(1 - M_\theta(q_b))\big]
\end{equation}
where $y \in \{0, 1\}$ indicates whether a manipulation attempt from $q_b$ succeeded in simulation.
\subsection{Sim-to-Real Transfer}
We use a three-stage transfer pipeline:
\begin{enumerate}[nosep,leftmargin=*]
\item \textbf{Simulation pre-training}: Train on 10,000 procedurally generated cluttered scenes with domain randomization over textures, lighting, object shapes, and physics parameters.
\item \textbf{Real-world data collection}: Collect 200 real manipulation trials using a scripted exploration policy.
\item \textbf{Fine-tuning}: Fine-tune the affordance map and manipulation policy on combined simulated and real data with a mixing ratio of 4:1 (sim:real).
\end{enumerate}
\subsection{Replanning and Recovery}
If manipulation fails (detected via force/torque sensing), the robot triggers replanning. The affordance map is updated with the failure observation:
\begin{equation}
M'(q_b) = M(q_b) \cdot \big(1 - \alpha \cdot \mathbf{1}[q_b \approx q_b^{\text{fail}}]\big)
\end{equation}
This suppresses the failed base pose region and the robot selects a new approach direction. We allow up to 3 replanning attempts before declaring failure.
\section{Experiments}
\subsection{Experimental Setup}
We evaluate on a Fetch mobile manipulator equipped with a Robotiq 2F-85 gripper, an Intel RealSense D435i wrist-mounted camera, and a base-mounted Velodyne VLP-16 LiDAR.
\paragraph{Environments.} We test in three real-world settings:
\begin{itemize}[nosep,leftmargin=*]
\item \textbf{Lab kitchen}: Countertop with dishes, utensils, and food items (15--25 objects).
\item \textbf{Office desk}: Cluttered workspace with stationery, electronics, and personal items (10--20 objects).
\item \textbf{Storage shelf}: Multi-level shelf with stacked and densely packed objects (20--30 objects).
\end{itemize}
\paragraph{Tasks.} Each trial requires picking a specified target object and placing it at a designated goal location. We evaluate 120 trials per method (40 per environment).
\subsection{Baselines}
\begin{itemize}[nosep,leftmargin=*]
\item \textbf{Navigate-Then-Manipulate (NTM)}: Sequential approach---navigate to the nearest reachable point, then plan a manipulation trajectory.
\item \textbf{Joint Planning (JP)}: Joint base-arm optimization without affordance guidance.
\item \textbf{Affordance-Only (AO)}: Affordance-guided base placement but with a classical manipulation planner (no learned policy).
\end{itemize}
\subsection{Results}
\begin{table}[t]
\centering
\caption{Success rates (\%) across environments.}
\label{tab:main}
\small
\begin{tabular}{@{}lcccc@{}}
\toprule
\textbf{Method} & \textbf{Kitchen} & \textbf{Office} & \textbf{Shelf} & \textbf{Avg.} \\
\midrule
NTM & 70.0 & 67.5 & 47.5 & 61.7 \\
JP & 80.0 & 75.0 & 62.5 & 72.5 \\
AO & 82.5 & 77.5 & 65.0 & 75.0 \\
\textbf{AffordNav} & \textbf{90.0} & \textbf{87.5} & \textbf{75.0} & \textbf{84.2} \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:main} shows that \textsc{AffordNav} significantly outperforms all baselines across all environments. The largest gap appears in the storage shelf environment (75.0\% vs.\ 47.5\% for NTM), where constrained spaces make base positioning critical. The combination of affordance-guided planning and learned manipulation (AffordNav vs.\ AO) provides a 9.2-percentage-point improvement, confirming the value of the learned policy's ability to adapt to the chosen base pose.
\subsection{Planning Time Analysis}
\begin{table}[t]
\centering
\caption{Average planning and execution times (seconds).}
\label{tab:timing}
\small
\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Method} & \textbf{Planning} & \textbf{Execution} \\
\midrule
NTM & $0.8 \pm 0.3$ & $42.1 \pm 12.4$ \\
JP & $8.3 \pm 2.1$ & $35.6 \pm 9.8$ \\
\textbf{AffordNav} & $1.2 \pm 0.4$ & $28.3 \pm 7.2$ \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:timing} shows that \textsc{AffordNav}'s affordance map inference adds minimal planning overhead (0.4s vs.\ NTM) while substantially reducing execution time through better base positioning. Joint planning is 7$\times$ slower due to optimization over the combined base-arm space.
\subsection{Ablation Study}
\begin{table}[t]
\centering
\caption{Ablation study (average success rate \%).}
\label{tab:ablation}
\small
\begin{tabular}{@{}lc@{}}
\toprule
\textbf{Configuration} & \textbf{Success Rate} \\
\midrule
Full AffordNav & \textbf{84.2} \\
Without replanning & 76.7 \\
Without sim-to-real fine-tuning & 68.3 \\
Without affordance map (random base) & 55.0 \\
Simulation only (no real data) & 62.5 \\
\bottomrule
\end{tabular}
\end{table}
The ablation study (Table~\ref{tab:ablation}) reveals that: (1) the affordance map is the most impactful component (+29.2 percentage points over random base selection); (2) sim-to-real fine-tuning is critical (+15.9 points); and (3) replanning provides a meaningful safety net (+7.5 points).
\subsection{Generalization}
We test generalization to 10 novel objects unseen during training (success rate: 78.3\%, only 5.9 percentage points below seen objects) and a novel room layout (success rate: 80.0\%). The affordance map generalizes well because it reasons about spatial relationships and reachability rather than memorizing specific object appearances.
\section{Conclusion}
We presented \textsc{AffordNav}, a hierarchical mobile manipulation framework that leverages learned affordance maps to jointly optimize navigation and manipulation decisions. Real-world experiments on a Fetch robot demonstrate significant improvements over sequential and joint planning baselines, with strong generalization to novel objects and environments. Our affordance-guided approach provides an effective bridge between navigation and manipulation planning, enabling more capable mobile manipulation in unstructured environments.
Future work includes extending to bimanual manipulation, handling dynamic environments with moving obstacles, and scaling to whole-building navigation with long-horizon task planning.
\balance
\bibliographystyle{IEEEtran}
\begin{thebibliography}{15}
\bibitem{stilman2007navigation}
M.~Stilman and J.~Kuffner, ``Navigation among movable obstacles: Real-time reasoning in complex environments,'' \emph{Int.\ J.\ Humanoid Robotics}, vol.~2, no.~4, pp.~479--503, 2005.
\bibitem{haviland2022holistic}
J.~Haviland, N.~S\"underhauf, and P.~Corke, ``A holistic approach to reactive mobile manipulation,'' \emph{IEEE Robotics and Automation Letters}, vol.~7, no.~2, pp.~3122--3129, 2022.
\bibitem{xia2021relmogen}
F.~Xia, C.~Li, R.~Mart\'in-Mart\'in, O.~Litany, A.~Toshev, and S.~Savarese, ``Relmogen: Integrating motion generation in reinforcement learning for mobile manipulation,'' in \emph{Proc.\ ICRA}, 2021.
\bibitem{yokoyama2023adaptive}
N.~Yokoyama, A.~Clegg, E.~Undersander, S.~Ha, D.~Batra, and A.~Rai, ``Adaptive skill coordination for robotic mobile manipulation,'' in \emph{Proc.\ CoRL}, 2023.
\bibitem{gibson1979ecological}
J.~J. Gibson, \emph{The Ecological Approach to Visual Perception}.\hskip 1em plus 0.5em minus 0.4em\relax Boston, MA: Houghton Mifflin, 1979.
\bibitem{mandikal2022dexvip}
P.~Mandikal and K.~Grauman, ``DexVIP: Learning dexterous grasping with human hand pose priors from video,'' in \emph{Proc.\ CoRL}, 2022.
\bibitem{nagarajan2020learning}
T.~Nagarajan, C.~Feichtenhofer, and K.~Grauman, ``Grounded human-object interaction hotspots from video,'' in \emph{Proc.\ ICCV}, 2019.
\bibitem{tobin2017domain}
J.~Tobin, R.~Fong, A.~Ray, J.~Schneider, W.~Zaremba, and P.~Abbeel, ``Domain randomization for transferring deep neural networks from simulation to the real world,'' in \emph{Proc.\ IROS}, 2017.
\bibitem{chebotar2019closing}
Y.~Chebotar, A.~Handa, V.~Makoviychuk, M.~Macklin, J.~Issac, N.~Ratliff, and D.~Fox, ``Closing the sim-to-real loop: Adapting simulation randomization with real world experience,'' in \emph{Proc.\ ICRA}, 2019.
\bibitem{james2019sim}
S.~James, P.~Wohlhart, M.~Kalakrishnan, D.~Kalashnikov, A.~Irpan, J.~Ibarz, S.~Levine, R.~Hadsell, and K.~Bousmalis, ``Sim-to-real via sim-to-sim: Data-efficient robotic grasping via randomized-to-canonical adaptation networks,'' in \emph{Proc.\ CVPR}, 2019.
\end{thebibliography}
\end{document}

% (removed web-preview artifact text accidentally pasted after \end{document})