% International Conference on Machine Learning paper using the official icml2024 style. Anonymous/accepted toggles, Impact Statement section (required since 2022), theorem environments, algorithm blocks, full experimental sections, and appendix.
% icml/main.tex
\documentclass{article}

% --- General-purpose packages ------------------------------------------------
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % \toprule / \midrule / \bottomrule used in the tables

% hyperref is loaded BEFORE the ICML style, following the official ICML
% template ordering.
% NOTE(review): icml2024.sty is expected to configure link colours only when
% hyperref is already loaded at that point -- confirm against the style file.
% If hyperref breaks the build, remove it and pass [nohyperref] to icml2024.
\usepackage{hyperref}

% Separate hyperref anchor counter for algorithm floats (as in the official
% template) so hyperref and the algorithm package cooperate.
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% ICML 2024 style file. For anonymous submission use \usepackage{icml2024};
% for camera-ready use \usepackage[accepted]{icml2024}.
\usepackage[accepted]{icml2024}

% --- Math and pseudocode -----------------------------------------------------
\usepackage{amsmath,amssymb,amsthm,mathtools}
\usepackage{algorithm}
\usepackage{algorithmic}

% --- Theorem environments ----------------------------------------------------
% All environments share the `theorem' counter, so numbering is global
% (Assumption 1, Assumption 2, Theorem 3, ...).
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
% Environments declared after this line use amsthm's `definition' style.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}

% Short form of the title for the running page header.
\icmltitlerunning{Efficient Adaptive Optimization for Non-Convex Problems}
\begin{document}
\twocolumn[
\icmltitle{Efficient Adaptive Optimization for\\Large-Scale Non-Convex Problems}
% It is OKAY to include author information, even for blind submissions: the
% style file removes it automatically unless the [accepted] option is passed
% to icml2024. Make sure \icmlauthor, \icmlaffiliation, and the
% \icmlcorrespondingauthor line are all filled in.
\icmlsetsymbol{equal}{*}
\begin{icmlauthorlist}
\icmlauthor{First Last}{uoe,lab}
\icmlauthor{Jane Doe}{uoe}
\icmlauthor{John Smith}{lab}
\end{icmlauthorlist}
\icmlaffiliation{uoe}{Department of Computer Science, University of Example, City, Country}
\icmlaffiliation{lab}{Example Research, City, Country}
\icmlcorrespondingauthor{First Last}{first.last@example.com}
\icmlkeywords{Machine Learning, Optimization, Non-convex, Adam, SGD}
\vskip 0.3in
]
% This must be here for camera-ready papers.
\printAffiliationsAndNotice{}
% For equal contributions: \printAffiliationsAndNotice{\icmlEqualContribution}
\begin{abstract}
Adaptive optimizers such as Adam are the workhorse of modern deep learning
but suffer from known pathologies on non-convex objectives. We propose a
new adaptive optimizer, \textsc{Adamix}, that smoothly interpolates between
Adam and SGD while preserving convergence guarantees under non-convex
smoothness. Our theoretical analysis shows a tight
$\mathcal{O}(1/\sqrt{T})$ bound in expectation, matching the best known
rate for adaptive methods. Empirically, \textsc{Adamix} matches or exceeds
Adam on ImageNet, CIFAR-100, and WikiText-103 while using 30\% less
memory and 18\% less wall-clock time. Code and checkpoints are available
at \url{https://example.com/adamix}.
\end{abstract}
\section{Introduction}
\label{sec:intro}
Adaptive optimizers~\citep{kingma2015adam,loshchilov2019decoupled} dominate
the training of deep networks, yet their behavior on non-convex objectives
is incompletely understood. Prior analyses~\citep{reddi2019amsgrad,
zhang2020adaptive} establish convergence only under restrictive assumptions
or exhibit weaker-than-desired rates. We revisit the trade-off between
adaptivity and generalization through a unified framework.
\paragraph{Contributions.} Our contributions are threefold:
\begin{enumerate}
\item We propose \textsc{Adamix}, a single-parameter family of adaptive
optimizers indexed by $\gamma \in [0,1]$ that interpolates between SGD
with momentum ($\gamma=0$) and Adam ($\gamma=1$).
\item We prove an $\mathcal{O}(1/\sqrt{T})$ convergence rate for all
$\gamma$ in the family under standard assumptions, yielding the first
unified analysis across the adaptivity spectrum.
\item Empirically, we show that intermediate $\gamma \approx 0.5$ delivers
the accuracy of Adam at the memory footprint of SGD across vision and
language benchmarks.
\end{enumerate}
\section{Preliminaries}
\label{sec:prelim}
We consider the stochastic non-convex optimization problem
\begin{equation}
\min_{\theta \in \mathbb{R}^d} F(\theta) := \mathbb{E}_{\xi \sim \mathcal{D}}[f(\theta; \xi)],
\label{eq:problem}
\end{equation}
where $f(\cdot; \xi)$ is $L$-smooth and $F$ is bounded below.
\begin{assumption}[Smoothness]
\label{asm:smooth}
$F$ is $L$-smooth: $\|\nabla F(\theta) - \nabla F(\theta')\| \le L \|\theta - \theta'\|$.
\end{assumption}
\begin{assumption}[Bounded variance]
\label{asm:variance}
$\mathbb{E}\|\nabla f(\theta; \xi) - \nabla F(\theta)\|^2 \le \sigma^2$.
\end{assumption}
\section{Method}
\label{sec:method}
Let $g_t = \nabla f(\theta_t; \xi_t)$. The \textsc{Adamix} update is
\begin{align}
m_t &= \beta_1 m_{t-1} + (1-\beta_1) g_t, \label{eq:mom}\\
v_t &= \beta_2 v_{t-1} + (1-\beta_2) g_t^{\odot 2}, \label{eq:var}\\
\theta_{t+1} &= \theta_t - \eta \frac{m_t}{(\sqrt{v_t} + \epsilon)^{\gamma}}, \label{eq:update}
\end{align}
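where $\eta > 0$ is the step size, $\beta_1, \beta_2 \in [0,1)$ are the
decay rates of the two moment estimates, $\epsilon > 0$ is a small constant,
and $\gamma \in [0,1]$ controls the degree of adaptivity. The square,
square root, power, and division are applied elementwise, and we initialize
$m_0 = v_0 = 0$.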
% Pseudocode for the Adamix update. The indexing matches
% Eqs.~\eqref{eq:mom}--\eqref{eq:update}: the stochastic gradient g_t is
% evaluated at the current iterate \theta_t and the step produces \theta_{t+1}.
\begin{algorithm}[t]
\caption{\textsc{Adamix}}
\label{alg:adamix}
\begin{algorithmic}[1]
\REQUIRE Initial $\theta_1$, step size $\eta > 0$, $\beta_1,\beta_2 \in [0,1)$, $\gamma \in [0,1]$, $\epsilon > 0$
\STATE $m_0 \leftarrow 0$, $v_0 \leftarrow 0$
\FOR{$t = 1, 2, \ldots, T$}
\STATE Sample $\xi_t$, compute $g_t = \nabla f(\theta_t; \xi_t)$
\STATE $m_t \leftarrow \beta_1 m_{t-1} + (1-\beta_1) g_t$
\STATE $v_t \leftarrow \beta_2 v_{t-1} + (1-\beta_2) g_t^{\odot 2}$
\STATE $\theta_{t+1} \leftarrow \theta_t - \eta\, m_t / (\sqrt{v_t} + \epsilon)^{\gamma}$
\ENDFOR
\end{algorithmic}
\end{algorithm}
\section{Convergence Analysis}
\label{sec:theory}
\begin{theorem}[Non-convex convergence of \textsc{Adamix}]
\label{thm:main}
Under Assumptions~\ref{asm:smooth}--\ref{asm:variance}, for any $\gamma \in [0,1]$
and appropriate step size $\eta = \mathcal{O}(1/\sqrt{T})$,
\begin{equation}
\min_{1 \le t \le T} \mathbb{E}\|\nabla F(\theta_t)\|^2 = \mathcal{O}(1/\sqrt{T}).
\end{equation}
\end{theorem}
\begin{proof}[Proof sketch]
The proof decomposes the update into a signed and an unsigned component and
bounds each via standard smoothness and variance arguments. The full
derivation appears in Appendix~\ref{app:proofs}.
\end{proof}
\section{Experiments}
\label{sec:exp}
We evaluate on three benchmarks: ImageNet (ResNet-50), CIFAR-100
(ResNet-18), and WikiText-103 (Transformer).
\begin{table}[t]
\caption{Top-1 accuracy, memory usage, and training time on ImageNet (ResNet-50, 90 epochs). The best value in each column is in bold.}
\label{tab:imagenet}
\centering
\small
\begin{tabular}{lccc}
\toprule
Optimizer & Acc.\ (\%) & Memory (GB) & Time (h) \\
\midrule
SGD+momentum & 76.1 & 14.8 & 42 \\
Adam & 75.3 & 19.2 & 38 \\
AdamW & 76.3 & 19.2 & 38 \\
\textsc{Adamix} ($\gamma=0$) & 76.0 & \textbf{13.4} & 41 \\
\textsc{Adamix} ($\gamma=0.5$) & \textbf{76.8} & \textbf{13.4} & \textbf{31} \\
\textsc{Adamix} ($\gamma=1$) & 75.4 & 19.2 & 38 \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Ablations}
We ablate the interpolation parameter $\gamma$ across
$\{0, 0.25, 0.5, 0.75, 1\}$. Intermediate values dominate the extremes
on both final accuracy and time-to-threshold.
\section{Related Work}
\label{sec:related}
Adam~\citep{kingma2015adam}, AdamW~\citep{loshchilov2019decoupled},
AMSGrad~\citep{reddi2019amsgrad}, Adafactor, and Lion span the
adaptive-optimizer design space.
\textsc{Adamix} unifies this space along a single interpretable axis.
\section{Conclusion}
\label{sec:conclusion}
We presented a single-parameter family that bridges SGD and Adam with
both theoretical and practical improvements. Future work includes
schedule-aware automatic tuning of $\gamma$ and analysis under
distributional shift.
\section*{Impact Statement}
This paper presents work whose goal is to advance the field of Machine
Learning. There are many potential societal consequences of our work,
most of which we feel need not be specifically highlighted here. We do
note that improved optimizer efficiency reduces training energy cost
but may also lower the barrier to training very large models.
\section*{Acknowledgements}
We thank the anonymous reviewers and colleagues at Example Research.
\bibliography{example}
\bibliographystyle{icml2024}
\appendix
\section{Full Proofs}
\label{app:proofs}
\subsection{Proof of Theorem~\ref{thm:main}}
By smoothness (Assumption~\ref{asm:smooth}),
\begin{align*}
F(\theta_{t+1}) &\le F(\theta_t) + \langle \nabla F(\theta_t), \theta_{t+1} - \theta_t \rangle \\
&\quad + \frac{L}{2}\|\theta_{t+1}-\theta_t\|^2.
\end{align*}
Substituting the update rule~\eqref{eq:update} and taking expectations
over $\xi_t$ gives the descent lemma. Summing from $t=1$ to $T$,
dividing by $T$, and using that the minimum over $t$ is at most the
average yields the stated rate for $\eta = \mathcal{O}(1/\sqrt{T})$. The
technical care is in bounding the $\gamma$-dependent preconditioner;
see~\citet{reddi2019amsgrad} for the $\gamma=1$ case.
\section{Additional Experiments}
\label{app:exp}
Full hyperparameter grids and learning curves for all benchmarks.
\end{document}

PDF Preview
Create an account to compile and preview