\documentclass[11pt,twocolumn]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{lmodern}
\usepackage[top=1in,bottom=1in,left=0.75in,right=0.75in]{geometry}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{natbib}
\usepackage{enumitem}
\usepackage{microtype}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{fancyhdr}
% hyperref must be loaded after (nearly) all other packages to avoid
% load-order clashes; hidelinks suppresses the default colored link boxes.
\usepackage[hidelinks]{hyperref}
\pagestyle{fancy}
\fancyhf{}
\fancyhead[C]{\small NeurIPS 2026 --- Conference Paper}
\fancyfoot[C]{\thepage}
\renewcommand{\headrulewidth}{0.4pt}
\setlength{\parindent}{1em}
\setlength{\parskip}{0pt}
\title{\textbf{Learning Representations for Structured Prediction\\via Compositional Attention Networks}}
\author{
Alexander Chen\thanks{Equal contribution.} \\
Department of Computer Science\\
Stanford University\\
\texttt{[email protected]} \\
\and
% NOTE(review): this bare $^*$ will not match the numeric footnote mark that
% \thanks produces in the article class -- consider \footnotemark[1] (or a
% symbolic footnote style) so both authors carry the same
% "Equal contribution." mark; verify in the compiled PDF.
Maria Gonzalez$^*$ \\
Department of Statistics\\
UC Berkeley\\
\texttt{[email protected]} \\
\and
Wei Zhang \\
Google DeepMind\\
\texttt{[email protected]}
}
\date{}
\begin{document}
% NOTE(review): footnotes generated by \thanks inside the optional argument of
% \twocolumn are typically lost or misplaced -- confirm the "Equal
% contribution." footnote actually appears on the first page of the PDF.
\twocolumn[
\begin{@twocolumnfalse}
\maketitle
\begin{abstract}
\noindent
Learning effective representations for structured prediction tasks remains a fundamental challenge in machine learning. Existing approaches often struggle to capture long-range dependencies while maintaining computational efficiency. In this work, we propose \textbf{Compositional Attention Networks (CANs)}, a novel architecture that learns hierarchical representations through structured attention mechanisms. Our approach decomposes complex prediction targets into compositional substructures, enabling the model to learn reusable components that generalize across tasks. We introduce a principled training framework based on variational inference that jointly optimizes the representation and prediction objectives. Extensive experiments on four benchmark datasets---semantic parsing, code generation, molecular property prediction, and document summarization---demonstrate that CANs achieve state-of-the-art performance while requiring 40\% fewer parameters than comparable models. We further provide theoretical analysis showing that our compositional decomposition provably reduces sample complexity under mild assumptions.
\end{abstract}
\vspace{1em}
\end{@twocolumnfalse}
]
\section{Introduction}
Structured prediction encompasses a broad class of machine learning problems where the output space exhibits complex combinatorial structure \citep{lafferty2001conditional}. Unlike standard classification or regression, these tasks require the model to produce outputs that satisfy intrinsic structural constraints---parse trees must be well-formed, generated molecules must be chemically valid, and program synthesis must yield syntactically correct code.
Recent advances in deep learning have led to powerful representation learning methods that achieve impressive performance on many structured prediction benchmarks \citep{vaswani2017attention}. However, two key challenges remain: (1)~capturing long-range dependencies in the output structure efficiently, and (2)~generalizing compositionally to novel structural combinations not seen during training.
In this paper, we address both challenges by proposing \textbf{Compositional Attention Networks (CANs)}, an architecture that learns to decompose structured outputs into reusable compositional primitives. Our key insight is that many structured prediction tasks share an underlying compositional nature---complex outputs are built from simpler, recurring substructures.
Our main contributions are:
\begin{itemize}[nosep,leftmargin=*]
\item We introduce Compositional Attention Networks, a novel architecture that learns hierarchical decompositions of structured outputs (Section~\ref{sec:method}).
\item We develop a variational training framework that jointly optimizes representation quality and prediction accuracy (Section~\ref{sec:training}).
\item We provide theoretical guarantees on the sample complexity benefits of compositional representations (Section~\ref{sec:theory}).
\item We demonstrate state-of-the-art results on four diverse benchmarks while using significantly fewer parameters (Section~\ref{sec:experiments}).
\end{itemize}
\section{Related Work}
\label{sec:related}
\paragraph{Structured Prediction.} Classical approaches to structured prediction include conditional random fields \citep{lafferty2001conditional}, structured SVMs \citep{tsochantaridis2005large}, and energy-based models. Neural structured prediction has been revolutionized by sequence-to-sequence models with attention \citep{bahdanau2015neural}, graph neural networks, and more recently, transformer-based architectures.
% TODO(review): "Lake and Baroni (2018)" is a hardcoded citation -- add the
% corresponding entry to the bibliography and cite it with \citet{...}.
\paragraph{Compositional Learning.} Compositional generalization has been studied extensively in the context of semantic parsing and language understanding. Lake and Baroni (2018) demonstrated that standard neural networks struggle with compositional generalization, motivating architectural innovations that explicitly encode compositional structure.
\paragraph{Representation Learning.} Self-supervised representation learning has achieved remarkable success across modalities, from vision to language. Methods such as contrastive learning, masked prediction, and variational autoencoders learn general-purpose representations. Our work extends these ideas to structured output spaces.
\section{Method}
\label{sec:method}
\subsection{Problem Formulation}
Let $\mathcal{X}$ denote the input space and $\mathcal{Y}$ the structured output space. Given a training set $\{(x_i, y_i)\}_{i=1}^{N}$ drawn from an unknown distribution $p(x, y)$, our goal is to learn a mapping $f_\theta: \mathcal{X} \rightarrow \mathcal{Y}$ that minimizes the structured risk:
\begin{equation}
\mathcal{R}(f_\theta) = \mathbb{E}_{(x,y) \sim p} \left[ \Delta(y, f_\theta(x)) \right],
\end{equation}
where $\Delta: \mathcal{Y} \times \mathcal{Y} \rightarrow \mathbb{R}_{\geq 0}$ is a task-specific structured loss function.
\subsection{Compositional Decomposition}
We assume that each structured output $y \in \mathcal{Y}$ can be decomposed as a composition of primitives from a finite vocabulary $\mathcal{P} = \{p_1, \ldots, p_K\}$ via a composition operator $\oplus$:
\begin{equation}
y = p_{i_1} \oplus p_{i_2} \oplus \cdots \oplus p_{i_m}.
\end{equation}
The compositional attention mechanism computes representation $\mathbf{h}_i$ for each position as:
\begin{equation}
\mathbf{h}_i = \sum_{k=1}^{K} \alpha_{ik} \cdot \mathbf{W}_v \mathbf{e}_k,
\end{equation}
where $\alpha_{ik}$ are attention weights computed via:
\begin{equation}
\alpha_{ik} = \frac{\exp\left( \mathbf{q}_i^\top \mathbf{k}_k / \sqrt{d} \right)}{\sum_{j=1}^{K} \exp\left( \mathbf{q}_i^\top \mathbf{k}_j / \sqrt{d} \right)}.
\end{equation}
\subsection{Hierarchical Structure Learning}
To capture multi-scale compositional structure, we introduce a hierarchy of $L$ composition levels. At each level $\ell$, the model learns to group lower-level primitives into higher-level structures:
\begin{equation}
\mathbf{h}_i^{(\ell)} = \operatorname{CompAttn}^{(\ell)}\left(\mathbf{h}_i^{(\ell-1)}, \mathcal{P}^{(\ell)}\right),
\end{equation}
where $\operatorname{CompAttn}^{(\ell)}$ denotes the compositional attention at level~$\ell$.
\section{Training Framework}
\label{sec:training}
We train the model by maximizing a variational objective (an evidence lower bound) that balances reconstruction quality with structural regularization:
\begin{equation}
\mathcal{L}(\theta, \phi) = \mathbb{E}_{q_\phi(z \mid x, y)}\left[\log p_\theta(y \mid x, z)\right] - \beta \cdot D_{\mathrm{KL}}\!\left(q_\phi(z \mid x, y) \,\middle\|\, p(z)\right).
\label{eq:elbo}
\end{equation}
The training procedure alternates between updating the encoder parameters $\phi$ and the decoder parameters $\theta$ using the reparameterization trick for gradient estimation.
\begin{algorithm}[t]
\caption{CAN Training Procedure}
\label{alg:training}
\begin{algorithmic}[1]
\REQUIRE Dataset $\mathcal{D}$, learning rate $\eta$, KL weight $\beta$
\STATE Initialize parameters $\theta$, $\phi$
\FOR{epoch $= 1, \ldots, T$}
\FOR{minibatch $(x, y) \sim \mathcal{D}$}
\STATE Sample $z \sim q_\phi(z | x, y)$
\STATE Compute $\mathcal{L}(\theta, \phi)$ via Eq.~\eqref{eq:elbo}
% gradient ASCENT: Eq.~\eqref{eq:elbo} is a lower bound to be maximized, so the
% update must add the gradient (the original "-\eta\nabla" would minimize it)
\STATE Update $\theta, \phi \leftarrow \theta, \phi + \eta \nabla_{\theta,\phi} \mathcal{L}$
\ENDFOR
\STATE Anneal $\beta$ according to schedule
\ENDFOR
\end{algorithmic}
\end{algorithm}
\section{Theoretical Analysis}
\label{sec:theory}
We provide sample complexity bounds showing the benefit of compositional decomposition.
\textbf{Theorem 1.} \textit{Let $\mathcal{Y}$ be a structured output space with compositional complexity $C(\mathcal{Y}) = K^m$, where $K$ is the primitive vocabulary size and $m$ is the maximum composition depth. Then the sample complexity of learning $f_\theta$ with compositional representations is $\mathcal{O}(K \cdot m \cdot d \cdot \log(1/\delta) / \epsilon^2)$, compared to $\mathcal{O}(K^m \cdot d \cdot \log(1/\delta) / \epsilon^2)$ for non-compositional approaches.}
This exponential reduction in sample complexity is the key theoretical motivation for our compositional approach.
\section{Experiments}
\label{sec:experiments}
\subsection{Datasets and Setup}
We evaluate on four benchmarks:
\begin{itemize}[nosep,leftmargin=*]
\item \textbf{SCAN} --- Compositional semantic parsing
\item \textbf{CodeSearchNet} --- Code generation from docstrings
\item \textbf{MoleculeNet} --- Molecular property prediction
\item \textbf{CNN/DailyMail} --- Abstractive summarization
\end{itemize}
\subsection{Main Results}
\begin{table}[t]
\centering
\caption{Main results across four benchmarks. Best results in \textbf{bold}, second-best \underline{underlined}.}
\label{tab:results}
\small
\begin{tabular}{@{}lcccc@{}}
\toprule
\textbf{Model} & \textbf{SCAN} & \textbf{CSN} & \textbf{MolNet} & \textbf{CNN/DM} \\
& (Acc. \%) & (BLEU) & (AUC) & (R-L) \\
\midrule
Transformer & 78.2 & 38.1 & 0.812 & 41.3 \\
GNN-Struct & 82.4 & 36.9 & \underline{0.847} & 39.8 \\
CompGen & \underline{91.3} & \underline{40.2} & 0.831 & \underline{42.1} \\
\midrule
CAN (Ours) & \textbf{96.7} & \textbf{43.5} & \textbf{0.863} & \textbf{43.8} \\
\quad -- w/o hierarchy & 93.1 & 41.8 & 0.849 & 42.9 \\
\quad -- w/o variational & 94.2 & 42.1 & 0.855 & 43.2 \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:results} presents our main results. CAN consistently outperforms all baselines across the four benchmarks, achieving a new state of the art on SCAN (+5.4\%), CodeSearchNet (+3.3 BLEU), MoleculeNet (+0.016 AUC), and CNN/DailyMail (+1.7 R-L).
\subsection{Ablation Study}
We conducted ablation experiments removing key components. Removing the hierarchical structure (``w/o hierarchy'') reduces performance across all tasks, confirming the importance of multi-scale composition. The variational training framework (``w/o variational'') also contributes meaningful gains, particularly on SCAN and MoleculeNet.
\subsection{Efficiency Analysis}
CAN uses 40\% fewer parameters than comparable transformer models due to parameter sharing across compositional primitives. Training converges in approximately 60\% of the time required by the full transformer baseline on equivalent hardware (8$\times$ NVIDIA A100 GPUs).
\section{Conclusion}
We presented Compositional Attention Networks, a novel approach to structured prediction that learns hierarchical compositional representations. Our method achieves state-of-the-art results on four diverse benchmarks while being more parameter-efficient. Theoretical analysis confirms that compositional decomposition provably reduces sample complexity. Future work includes extending CANs to multi-modal structured prediction and exploring unsupervised discovery of compositional primitives.
\paragraph{Acknowledgements.} This work was supported by NSF Grant IIS-2024000 and a Google Faculty Research Award. We thank the anonymous reviewers for their constructive feedback.
% NOTE(review): \bibliographystyle only affects BibTeX runs; it is inert next
% to the hand-written thebibliography environment below. Keep it only if the
% references are later migrated to a .bib file.
\bibliographystyle{plainnat}
\begin{thebibliography}{10}
\bibitem[Bahdanau et~al.(2015)]{bahdanau2015neural}
D.~Bahdanau, K.~Cho, and Y.~Bengio.
\newblock Neural machine translation by jointly learning to align and translate.
\newblock In \emph{Proceedings of ICLR}, 2015.
\bibitem[Lafferty et~al.(2001)]{lafferty2001conditional}
J.~Lafferty, A.~McCallum, and F.~Pereira.
\newblock Conditional random fields: Probabilistic models for segmenting and labeling sequence data.
\newblock In \emph{Proceedings of ICML}, pages 282--289, 2001.
\bibitem[Tsochantaridis et~al.(2005)]{tsochantaridis2005large}
I.~Tsochantaridis, T.~Joachims, T.~Hofmann, and Y.~Altun.
\newblock Large margin methods for structured and interdependent output variables.
\newblock \emph{Journal of Machine Learning Research}, 6:1453--1484, 2005.
\bibitem[Vaswani et~al.(2017)]{vaswani2017attention}
A.~Vaswani, N.~Shazeer, N.~Parmar, J.~Uszkoreit, L.~Jones, A.~N. Gomez, L.~Kaiser, and I.~Polosukhin.
\newblock Attention is all you need.
\newblock In \emph{Advances in Neural Information Processing Systems}, pages 5998--6008, 2017.
\end{thebibliography}
\end{document}

% (removed stray web-editor UI text ("PDF Preview" / "Create an account...")
% that was pasted after \end{document} and is not part of the manuscript)