\documentclass[runningheads]{llncs}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{amsmath,amssymb}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{algorithm2e}
\usepackage{listings}
\usepackage{xcolor}
\renewcommand\UrlFont{\color{blue}\rmfamily}
\begin{document}
\title{Paper Title: A Clear and Concise Heading}
\titlerunning{Short Running Title}
\author{First Author\inst{1}\orcidID{0000-0000-0000-0000} \and
Second Author\inst{1,2}\orcidID{1111-2222-3333-4444} \and
Third Author\inst{2}}
\authorrunning{F. Author et al.}
\institute{Springer Heidelberg, Tiergartenstr.\ 17, 69121 Heidelberg, Germany\\
\email{lncs@springer.com}\\
\url{http://www.springer.com/gp/computer-science/lncs}
\and
ABC Institute, Ruprecht-Karls-University Heidelberg, Heidelberg, Germany\\
\email{\{second,third\}@example.com}}
\maketitle
\begin{abstract}
Recent advances in neural architecture search have
demonstrated that automated design can match or exceed hand-crafted
networks on image classification benchmarks. However, the computational
cost of existing search algorithms remains prohibitive for
resource-constrained environments. In this work, we propose a lightweight
progressive search strategy that reduces GPU hours by an order of
magnitude while maintaining competitive accuracy on CIFAR-10 and
ImageNet. Our method leverages early-stopping predictors trained on
partial learning curves to prune unpromising candidates, combined with
a weight-sharing super-network that amortizes training cost across the
search space. Experiments show that the discovered architectures achieve
97.4\% accuracy on CIFAR-10 and 76.8\% top-1 accuracy on ImageNet with
only 12 GPU hours of search time.
\keywords{Neural architecture search \and Early-stopping predictor \and Weight sharing.}
\end{abstract}
\section{Introduction}
The design of deep neural network architectures has traditionally relied on
human expertise, requiring extensive trial and error to balance accuracy,
latency, and parameter count~\cite{ref_lncs1}. Neural architecture search
(NAS) aims to automate this process by formulating architecture design as an
optimization problem over a discrete search space~\cite{ref_lncs2}.
Despite impressive results, early NAS methods such as
reinforcement-learning-based controllers required thousands of GPU hours per
search, limiting their
applicability to well-funded research labs~\cite{ref_lncs3}. Weight-sharing
approaches like DARTS reduced this cost significantly but introduced new
challenges, including performance collapse and sensitivity to hyperparameters.
In this paper, we make the following contributions:
\begin{enumerate}
\item We introduce a progressive search strategy that expands the candidate
pool incrementally, guided by early-stopping predictors.
\item We propose a hybrid super-network training scheme that mitigates the
weight-coupling problem inherent in one-shot methods.
\item We provide a theoretical analysis bounding the approximation error of
the early-stopping predictor.
\item We validate the approach on CIFAR-10 and ImageNet, achieving
state-of-the-art trade-offs between search cost and final accuracy.
\end{enumerate}
\section{Related Work}
\subsection{Reinforcement-Learning-Based NAS}
Zoph and Le~\cite{ref_lncs2} pioneered the use of a recurrent controller
trained with REINFORCE to generate architecture descriptions. Subsequent work
introduced parameter sharing across child models and hierarchical search
spaces to reduce computation, though wall-clock times remained on the order of
hundreds of GPU hours~\cite{ref_lncs3}.
\subsection{Differentiable Architecture Search}
Liu et al.~\cite{ref_lncs4} introduced DARTS, which relaxes the discrete search space into a
continuous one and optimizes architecture parameters via gradient descent.
While dramatically faster, DARTS is prone to degenerate solutions dominated by
skip connections~\cite{ref_lncs4}. Several stabilization techniques---including
progressive shrinking, perturbation-based regularization, and Hessian-aware
pruning---have been proposed to address this issue.
\subsection{Predictor-Based Methods}
An alternative line of work trains surrogate models to predict architecture
performance from cheap-to-compute features such as trainable parameter count,
FLOPs, or partial learning curves~\cite{ref_lncs5}. Our approach falls into
this category but differs by integrating predictors directly into a progressive
search loop.
\section{Method}
\begin{definition}[Search Space]
Let $\mathcal{A}$ denote the set of all architectures expressible as directed
acyclic graphs with nodes drawn from a predefined operation set
$\mathcal{O}=\{3\times3\text{ conv}, 5\times5\text{ conv}, \text{skip},
\text{pool}, \text{zero}\}$. Each architecture $\alpha\in\mathcal{A}$ maps an
input tensor to an output tensor via a composition of $L$ cells.
\end{definition}
\begin{theorem}[Predictor Error Bound]
Let $\hat{f}$ be the early-stopping predictor trained on $m$
architecture--accuracy pairs sampled from $\mathcal{A}$. Under Lipschitz continuity of the
true accuracy function $f$, the expected prediction error satisfies
\begin{equation}
\mathbb{E}\bigl[|f(\alpha)-\hat{f}(\alpha)|\bigr]
\leq C\,m^{-1/(d+2)},
\end{equation}
where $d$ is the intrinsic dimensionality of the search space and $C$ is a
constant depending on the Lipschitz coefficient.
\end{theorem}
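The core optimization follows a bilevel formulation:
\begin{equation}
\alpha^* = \operatorname*{arg\,min}_{\alpha\in\mathcal{A}}
\mathcal{L}_{\mathrm{val}}\bigl(w^*(\alpha),\alpha\bigr)
\quad\text{s.t.}\quad
w^*(\alpha) = \operatorname*{arg\,min}_{w}
\mathcal{L}_{\mathrm{train}}(w,\alpha),
\end{equation}
where $w$ denotes the network weights and $\mathcal{L}_{\mathrm{train}}$ and
$\mathcal{L}_{\mathrm{val}}$ denote the training and validation losses,
respectively.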
\begin{algorithm}[t]
\SetAlgoLined
\KwIn{Search space $\mathcal{A}$, budget $T$, predictor $\hat{f}$}
\KwOut{Best architecture $\alpha^*$}
$\mathcal{P}_0 \leftarrow$ sample $k$ random architectures from $\mathcal{A}$\;
\For{$t = 1$ \KwTo $T$}{
Train each $\alpha\in\mathcal{P}_{t-1}$ for $e_t$ epochs\;
Fit predictor $\hat{f}$ on partial curves\;
$\mathcal{P}_t \leftarrow$ top-$k$ architectures by $\hat{f}$\;
Expand $\mathcal{P}_t$ with $k'$ mutated candidates\;
}
$\alpha^* \leftarrow \arg\max_{\alpha\in\mathcal{P}_T} \hat{f}(\alpha)$\;
\Return $\alpha^*$\;
\caption{Progressive architecture search with early-stopping predictor.}\label{alg:search}
\end{algorithm}
\begin{figure}[t]
\centering
\fbox{\parbox{0.85\linewidth}{\centering\vspace{4cm}
Figure Placeholder\\[4pt]
Replace with a diagram showing the progressive search loop:
candidate pool, partial training, predictor filtering, and expansion.
\vspace{4cm}}}
\caption{Overview of the progressive search procedure. At each round, a
predictor prunes low-performing candidates and the pool is replenished
with mutations of top architectures.}\label{fig:overview}
\end{figure}
\section{Experiments}
\subsection{Experimental Setup}
All experiments were conducted on a single NVIDIA A100 GPU. CIFAR-10 models
were trained for 600 epochs using SGD with cosine annealing
(initial learning rate 0.025, weight decay $3\times10^{-4}$). ImageNet models
followed the standard 250-epoch schedule with label smoothing and mixup
augmentation. The search phase used $T=5$ progressive rounds with
$k=64$ candidates per round.
\begin{table}[t]
\caption{Main results.}\label{tab:lncs}
\centering
\begin{tabular}{lcc}
\toprule
Model & Accuracy & F1 \\
\midrule
Baseline & 82.1 & 0.78 \\
Proposed & \textbf{89.4} & \textbf{0.86} \\
\bottomrule
\end{tabular}
\end{table}
\begin{table}[t]
\caption{Comparison of search cost and accuracy on CIFAR-10.}\label{tab:comparison}
\centering
\begin{tabular}{lccc}
\toprule
Method & GPU Hours & Params (M) & Accuracy (\%) \\
\midrule
NASNet-A~\cite{ref_lncs2} & 2000 & 3.3 & 97.35 \\
DARTS~\cite{ref_lncs4} & 1.5 & 3.4 & 97.24 \\
ENAS~\cite{ref_lncs3} & 12 & 4.6 & 97.11 \\
Proposed & 12 & 3.1 & \textbf{97.42} \\
\bottomrule
\end{tabular}
\end{table}
\section{Discussion}
The experimental results in Table~\ref{tab:comparison} confirm that combining progressive search with
early-stopping predictors achieves a favorable cost--accuracy trade-off. The
discovered architecture uses fewer parameters than NASNet-A while matching its
accuracy, and it outperforms DARTS without suffering from skip-connection
collapse.
An interesting observation is that the predictor becomes increasingly accurate
as the search progresses and more training data accumulates. By round $t=4$,
the Kendall $\tau$ correlation between predicted and true rankings exceeds 0.9,
suggesting that the majority of the search budget can be saved by relying on
predictions rather than full training.
\section{Conclusion}
We presented a progressive neural architecture search method that leverages
early-stopping predictors to dramatically reduce search cost. Experiments on
CIFAR-10 and ImageNet demonstrate competitive accuracy at a fraction of the
computational budget required by prior methods. Future work will explore
transferring discovered cells to downstream tasks such as object detection
and semantic segmentation.
\begin{thebibliography}{8}
\bibitem{ref_lncs1}
Author, F., Author, S.: Title of a proceedings paper. In: Editor,
F., Editor, S. (eds.) CONFERENCE 2016, LNCS, vol. 9999, pp. 1--13.
Springer, Heidelberg (2016).
\bibitem{ref_lncs2}
Zoph, B., Le, Q.V.: Neural architecture search with reinforcement
learning. In: ICLR (2017).
\bibitem{ref_lncs3}
Pham, H., Guan, M., Zoph, B., Le, Q.V., Dean, J.: Efficient neural
architecture search via parameter sharing. In: ICML, pp. 4095--4104 (2018).
\bibitem{ref_lncs4}
Liu, H., Simonyan, K., Yang, Y.: DARTS: Differentiable architecture
search. In: ICLR (2019).
\bibitem{ref_lncs5}
Wen, W., Liu, H., Chen, Y., Li, H., Bender, G., Kindermans, P.-J.:
Neural predictor for neural architecture search. In: ECCV,
pp. 660--676 (2020).
\end{thebibliography}
\end{document}