% AAAI conference paper using the official aaai style. Two-column with numbered citations, figures, tables, the mandatory AAAI copyright block via \pdfinfo, and required AAAI formatting constraints (secnumdepth=0, times+helvet fonts, frenchspacing).
% aaai/main.tex
%File: aaai-template.tex
\documentclass[letterpaper]{article} % AAAI requires letterpaper
\usepackage{aaai26} % AAAI 2026 style file - swap for the current year
% DO NOT CHANGE THESE
% (fixed block of packages and settings for the AAAI style; graphicx is loaded once only)
\usepackage{times}
\usepackage{helvet}
\usepackage{courier}
\usepackage[hyphens]{url}
\usepackage{graphicx}
\urlstyle{rm}
\def\UrlFont{\rm}
\usepackage{natbib}
\usepackage{caption}
\frenchspacing
\setlength{\pdfpagewidth}{8.5in}
\setlength{\pdfpageheight}{11in}
% Additional packages used by this paper.
% NOTE: hyperref is deliberately not loaded; it is on the AAAI list of
% forbidden packages, and \url is already provided by the url package above.
\usepackage{amsmath,amssymb,amsthm} % math environments, symbols, \text, \eqref
\usepackage{booktabs}               % \toprule / \midrule / \bottomrule in tables
\usepackage{algorithm}              % floating algorithm environment
\usepackage[noend]{algpseudocode}   % algorithmic pseudocode without "end" lines
% PDF INFO IS REQUIRED FOR AAAI SUBMISSIONS
% The /Title and /Author entries mirror the \title and \author given below.
\pdfinfo{
/Title (Structured Reasoning with Constrained Decoding for Knowledge-Intensive Tasks)
/Author (First Last, Jane Doe, John Smith)
/TemplateVersion (AAAI.2026.v1)
}
% Disable section numbers (AAAI convention)
\setcounter{secnumdepth}{0}
% Paper title; the \\ forces a line break inside the typeset title.
\title{Structured Reasoning with Constrained Decoding\\
for Knowledge-Intensive Tasks}
% Author list; each superscript number refers to the affiliation with the
% same number in \affiliations below.
\author{
First Last\textsuperscript{\rm 1},
Jane Doe\textsuperscript{\rm 2},
John Smith\textsuperscript{\rm 1}
}
% Numbered affiliations followed by the contact e-mail addresses.
% NOTE(review): \affiliations is not a standard LaTeX command; presumably it
% is provided by the aaai style file loaded above -- confirm.
\affiliations{
\textsuperscript{\rm 1}University of Example, City, Country\\
\textsuperscript{\rm 2}Example Research Labs, City, Country\\
[email protected], [email protected], [email protected]
}
\begin{document}
\maketitle
\begin{abstract}
We propose a constrained decoding method for structured reasoning in
knowledge-intensive tasks. The approach augments an autoregressive
language model with a lightweight verifier that prunes incoherent
partial completions during beam search. The verifier is trained on
aligned (question, evidence, answer) triples and adds less than 5\%
decoding overhead. On three open-domain QA benchmarks---HotpotQA,
StrategyQA, and 2WikiMQA---our method improves exact-match accuracy
by about 4 points over the strongest baseline (and by 6.8 to 9.6 points
over greedy decoding) without any fine-tuning of the base model.
\end{abstract}
\section{Introduction}
Large language models (LLMs) are fluent at surface-level reasoning but
fail systematically on tasks requiring multi-hop structured inference.
Existing mitigations such as chain-of-thought prompting~\cite{wei2022cot}
and self-consistency~\cite{wang2023selfconsistency} improve reliability
but do not directly enforce factual coherence. We close this gap with
a decoding-time verifier that scores the \emph{coherence} of partial
hypotheses against retrieved evidence.
\paragraph{Contributions.}
\begin{itemize}
\item A lightweight verifier that operates on partial LLM generations
and prunes incoherent completions during beam search.
\item A model-agnostic decoding procedure that adds less than 5\%
compute overhead on modern GPUs.
\item Empirical gains of 3.9--4.3 EM points over the strongest baseline
(6.8--9.6 over greedy decoding) on three multi-hop QA benchmarks, with
full ablations over verifier size and beam width.
\end{itemize}
\section{Related Work}
\paragraph{Reasoning with LLMs.}
Chain-of-thought~\cite{wei2022cot}, self-consistency~\cite{wang2023selfconsistency},
and least-to-most prompting have become standard inference-time augmentations.
\paragraph{Constrained decoding.}
Constrained beam search has been explored for grammatical~\cite{post2018fast}
and factual~\cite{lu2022neurologic} objectives. Our verifier can be viewed
as a learned, continuous generalization of these hard constraints.
\paragraph{Retrieval-augmented generation.}
RAG~\cite{lewis2020rag} and FiD inject retrieved passages into the LLM's
context. Our method is complementary: the verifier uses the same retrieved
evidence to score hypotheses.
\section{Method}
Our decoder maintains a beam $\mathcal{B}_t$ of partial hypotheses at
each step $t$. After each token, the verifier scores consistency against
the retrieved knowledge. Low-scoring partial hypotheses are pruned
aggressively.
\subsection{Verifier Architecture}
The verifier is a 125M-parameter encoder transformer trained on 1.2M
aligned QA--evidence triples. Inputs are the question, the current partial
hypothesis, and up to five retrieved passages.
\subsection{Constrained Beam Search}
The combined score for a partial hypothesis $h_t$ is
\begin{equation}
s(h_t) = \alpha \log P_{\text{LM}}(h_t) + (1-\alpha) \log P_{\text{V}}(h_t \mid q, e),
\label{eq:score}
\end{equation}
where $\alpha \in [0,1]$ trades off fluency and verified coherence.
Algorithm~\ref{alg:decode} summarizes the resulting decoding procedure.
% Verifier-guided beam search.
% Each beam entry is a pair (h, \ell): a partial hypothesis and its cumulative
% LM log-probability \ell = \log P_LM(h). The verifier term is evaluated on the
% current prefix at every step instead of being accumulated across steps, so
% candidates are ranked by exactly the combined score s(h) of Eq. (eq:score).
\begin{algorithm}[t]
\caption{Verifier-Guided Beam Search}
\label{alg:decode}
\begin{algorithmic}[1]
\Require Question $q$, evidence $e$, beam size $K$, weight $\alpha$
\State $\mathcal{B}_0 \gets \{(\emptyset, 0)\}$
\For{$t = 1, \ldots, T_{\max}$}
% Expand every beam entry by every vocabulary token; only the LM log-probability is accumulated.
\State $\mathcal{C} \gets \{(h \mathbin{\|} w,\; \ell + \log P_{\text{LM}}(w \mid h)) : (h, \ell) \in \mathcal{B}_{t-1},\, w \in \mathcal{V}\}$
% Combined score: alpha-weighted LM term plus (1-alpha)-weighted verifier term.
\State $s(h') \gets \alpha\,\ell' + (1-\alpha)\log P_{\text{V}}(h' \mid q, e)$ for each $(h', \ell') \in \mathcal{C}$
\State $\mathcal{B}_t \gets \text{Top-}K$ entries of $\mathcal{C}$ ranked by $s(h')$
\EndFor
\State \Return $\arg\max_{(h, \ell) \in \mathcal{B}_{T_{\max}}} s(h)$
\end{algorithmic}
\end{algorithm}
\section{Experiments}
\subsection{Setup}
We use Llama-3-70B as the base LM and a DeBERTa-v3-base verifier.
Retrieval uses Contriever over the 2024 Wikipedia snapshot.
\subsection{Main Results}
% Main results: exact-match (EM) accuracy per method and benchmark.
% The best value in each column is set in bold.
\begin{table}[t]
\centering
\small
\begin{tabular}{lccc}
\toprule
Method & HotpotQA & StrategyQA & 2WikiMQA \\
\midrule
Greedy & 42.1 & 67.4 & 38.9 \\
Self-consistency & 46.3 & 69.8 & 42.7 \\
RAG & 47.8 & 70.1 & 43.5 \\
\textbf{Ours} & \textbf{51.7} & \textbf{74.2} & \textbf{47.8} \\
\bottomrule
\end{tabular}
% Gains over RAG, the strongest baseline in every column: 3.9, 4.1 and 4.3 points.
\caption{Exact-match accuracy across three open-domain QA benchmarks.
Our method improves by 3.9--4.3 points over the strongest baseline (RAG).}
\label{tab:main}
\end{table}
\subsection{Analysis}
Ablations show the verifier contributes most on questions requiring
three or more reasoning hops. Removing it collapses performance to the
greedy baseline.
\section{Conclusion}
Decoding-time verification is a practical, model-agnostic way to improve
structured reasoning in LLMs. Future work will explore training the
verifier end-to-end with the decoder.
\section*{Acknowledgments}
We thank the anonymous reviewers and colleagues at Example Research
Labs. This work was supported by NSF Grant IIS-XXXXXXX.
\bibliography{refs}
\bibliographystyle{aaai26}
\end{document}

PDF Preview
Create an account to compile and preview