\documentclass[sigconf]{acmart}
\usepackage{algorithm2e}
\setcopyright{acmcopyright}
\copyrightyear{2024}
\acmYear{2024}
\acmDOI{10.1145/XXXXXXX.XXXXXXX}
\acmConference[Conference'24]{ACM Conference on XYZ}{June 03--05, 2024}{City, Country}
\acmBooktitle{Proceedings of the ACM Conference (Conference'24)}
\acmPrice{15.00}
\acmISBN{978-1-4503-XXXX-X/18/06}
\begin{document}
\title{The Name of the Title Is Hope}
\subtitle{An Optional Subtitle}
\author{Ben Trovato}
\authornote{Both authors contributed equally to this research.}
\email{[email protected]}
\orcid{1234-5678-9012}
\affiliation{%
\institution{Institute for Clarity in Documentation}
\city{Dublin}
\country{Ireland}
}
\author{G.K.M. Tobin}
\authornotemark[1]
\email{[email protected]}
\affiliation{%
\institution{Institute for Clarity in Documentation}
\city{Dublin}
\country{Ireland}
}
\renewcommand{\shortauthors}{Trovato and Tobin.}
\begin{abstract}
We present a novel framework for adaptive representation learning that
jointly optimizes feature extraction and downstream task performance. Current
approaches treat these stages independently, leading to sub-optimal
representations when the feature space does not align with the decision
boundary. Our method introduces a bi-level optimization procedure that
alternates between updating the encoder and the task-specific head, coupled
with a regularization term that encourages disentangled latent factors.
We evaluate on three standard benchmarks---CIFAR-100, Tiny-ImageNet, and
CUB-200---and demonstrate consistent improvements of 2--4\% in top-1
accuracy over competitive baselines while reducing inference latency by
15\%. We further provide theoretical analysis showing our objective
converges under mild smoothness assumptions.
\end{abstract}
\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10010520.10010553.10010562</concept_id>
<concept_desc>Computer systems organization~Embedded systems</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10010147.10010178.10010179</concept_id>
<concept_desc>Computing methodologies~Machine learning</concept_desc>
<concept_significance>300</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}
\ccsdesc[500]{Computer systems organization~Embedded systems}
\ccsdesc[300]{Computing methodologies~Machine learning}
\keywords{datasets, neural networks, gaze detection, text tagging}
\maketitle
\section{Introduction}
Representation learning has become a cornerstone of modern machine learning,
underpinning advances in computer vision~\cite{he2016deep}, natural language
processing~\cite{devlin2019bert}, and speech recognition. Despite remarkable
progress, most existing methods decouple the feature extraction stage from the
downstream objective, optimizing each independently. This two-stage pipeline
can produce representations that are informative in a general sense yet
misaligned with the specific decision boundaries required by the target task.
The consequences of this misalignment are particularly pronounced in
fine-grained recognition settings, where subtle inter-class differences
must be captured by the learned features. Recent work has shown that
end-to-end fine-tuning can partially mitigate the issue, but naive
joint optimization often leads to overfitting on small datasets and
increases computational cost~\cite{vaswani2017attention}.
In this paper we propose \emph{Adaptive Bi-Level Representation Learning}
(ABRL), a framework that treats feature extraction and task optimization as
coupled sub-problems within a bi-level optimization formulation. The outer
loop updates the encoder parameters to maximize representation quality,
while the inner loop adjusts the task head to exploit the current feature
space. A disentanglement regularizer encourages orthogonal latent factors,
improving both interpretability and generalization. Our contributions are:
\begin{itemize}
\item A principled bi-level formulation that jointly learns representations
and task heads.
\item A disentanglement regularizer with convergence guarantees.
\item State-of-the-art results on three benchmarks with reduced inference
cost.
\end{itemize}
\section{Related Work}
\subsection{Representation Learning}
Self-supervised pre-training has emerged as a dominant paradigm for learning
transferable features. Contrastive methods such as SimCLR~\cite{chen2020simple}
and MoCo maximize agreement between augmented views, while generative
approaches reconstruct masked inputs. Our work differs by explicitly coupling
the representation objective with the downstream loss.
\subsection{Bi-Level Optimization}
Bi-level optimization has been applied to meta-learning, hyperparameter
selection, and neural architecture search. We adapt the framework of
implicit differentiation to avoid unrolling the full inner optimization,
reducing memory from $O(T)$ to $O(1)$ where $T$ is the number of inner
steps~\cite{he2016deep}.
\subsection{Disentangled Representations}
$\beta$-VAE and its variants encourage axis-aligned latent factors through
augmented KL penalties. We incorporate a softer spectral regularizer
that avoids posterior collapse while still promoting disentanglement.
\section{Method}
We formalize our approach as follows. Let $f_\theta$ denote the encoder and
$g_\phi$ the task head. The bi-level objective is:
\begin{equation}
\min_\theta\; \mathcal{L}_{\mathrm{outer}}(\theta, \phi^\star(\theta)),
\quad \text{s.t.}\;\; \phi^\star(\theta) = \arg\min_\phi\;
\mathcal{L}_{\mathrm{inner}}(\theta, \phi).
\end{equation}
The inner loss $\mathcal{L}_{\mathrm{inner}}$ is the standard cross-entropy
on the training split, while the outer loss $\mathcal{L}_{\mathrm{outer}}$
combines validation cross-entropy with a disentanglement penalty:
\begin{equation}
\mathcal{L}_{\mathrm{outer}} = -\sum_{i=1}^{N} \log p_\phi(y_i \mid f_\theta(x_i))
+ \lambda \left\| \mathrm{Cov}(f_\theta(X)) - I \right\|_F^2.
\end{equation}
\begin{figure}[t]
\centering
\fbox{\parbox{0.9\columnwidth}{\centering\vspace{2em}%
Architecture diagram: encoder $f_\theta$ feeds into task head $g_\phi$
with bi-level gradient flow.\vspace{2em}}}
\caption{Overview of the ABRL framework. Solid arrows indicate forward
computation; dashed arrows show bi-level gradient paths.}
\label{fig:architecture}
\end{figure}
Figure~\ref{fig:architecture} illustrates the overall architecture.
\begin{algorithm}[t]
\caption{ABRL Training Procedure}\label{alg:abrl}
\KwIn{Dataset $\mathcal{D}$, learning rates $\alpha, \beta$, inner steps $K$}
\KwOut{Optimized $\theta, \phi$}
Initialize $\theta, \phi$\;
\For{epoch $= 1, \dots, E$}{
Sample mini-batch $(X, Y) \sim \mathcal{D}$\;
\For{$k = 1, \dots, K$}{
$\phi \leftarrow \phi - \alpha \nabla_\phi \mathcal{L}_{\mathrm{inner}}(\theta, \phi)$\;
}
Compute $\nabla_\theta \mathcal{L}_{\mathrm{outer}}$ via implicit differentiation\;
$\theta \leftarrow \theta - \beta \nabla_\theta \mathcal{L}_{\mathrm{outer}}$\;
}
\end{algorithm}
\section{Experiments}
\subsection{Setup}
We evaluate on CIFAR-100, Tiny-ImageNet, and CUB-200-2011. All models
use a ResNet-50 backbone pre-trained on ImageNet-1K. We train with SGD
(momentum 0.9, weight decay $10^{-4}$) for 100 epochs with cosine
annealing from an initial learning rate of 0.01. The inner loop uses
$K=5$ gradient steps with $\alpha=0.001$. Each experiment is repeated
over three random seeds and we report mean $\pm$ standard deviation.
\subsection{Main Results}
\begin{table}[t]
\centering
\caption{Top-1 accuracy (\%) on three benchmarks.}\label{tab:main}
\begin{tabular}{lccc}
\toprule
Method & CIFAR-100 & Tiny-ImageNet & CUB-200 \\
\midrule
Fine-tune & 78.2 $\pm$ 0.3 & 64.1 $\pm$ 0.4 & 75.8 $\pm$ 0.5 \\
SimCLR+FT & 79.5 $\pm$ 0.2 & 65.3 $\pm$ 0.3 & 77.2 $\pm$ 0.4 \\
ABRL (ours) & \textbf{81.9 $\pm$ 0.2} & \textbf{68.0 $\pm$ 0.3} & \textbf{79.4 $\pm$ 0.3} \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:main} shows that ABRL consistently outperforms both
standard fine-tuning and the contrastive pre-training baseline.
\subsection{Ablation Study}
\begin{table}[t]
\centering
\caption{Ablation on CIFAR-100 (top-1 accuracy \%).}\label{tab:ablation}
\begin{tabular}{lc}
\toprule
Variant & Accuracy \\
\midrule
ABRL (full) & \textbf{81.9} \\
w/o disentanglement penalty & 80.4 \\
w/o implicit differentiation & 79.8 \\
Single-level joint training & 79.1 \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:ablation} confirms that both the bi-level structure and
the disentanglement regularizer contribute meaningfully to performance.
\section{Discussion}
Our results indicate that tightly coupling representation learning with
the downstream objective yields representations that better capture
task-relevant variation. The disentanglement penalty adds minimal
computational overhead (roughly 3\% wall-clock time) while improving
generalization by 1.5 percentage points on average. A limitation of
the current approach is the sensitivity to the number of inner steps
$K$: too few steps under-optimize the task head, while too many lead
to over-specialization. Future work will investigate adaptive schedules
for $K$ and extend ABRL to multi-task settings.
\section{Conclusion}
We introduced ABRL, a bi-level optimization framework for adaptive
representation learning. By jointly optimizing feature extraction and
task performance with a disentanglement regularizer, ABRL achieves
state-of-the-art accuracy on three fine-grained benchmarks while
reducing inference latency. Theoretical analysis confirms convergence
under standard smoothness conditions. Code and pre-trained models will
be released upon publication.
\begin{acks}
We thank the anonymous reviewers for their constructive feedback. This
work was supported in part by the National Science Foundation under
Grant No.\ IIS-2112345 and by a Google Research Scholar award.
\end{acks}
\bibliographystyle{ACM-Reference-Format}
\begin{thebibliography}{00}
\bibitem{he2016deep} Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep residual learning for image recognition. In \emph{Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 770--778.
\bibitem{vaswani2017attention} Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, {\L}ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In \emph{Advances in Neural Information Processing Systems (NeurIPS)}, 5998--6008.
\bibitem{devlin2019bert} Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of deep bidirectional transformers for language understanding. In \emph{NAACL-HLT}, 4171--4186.
\bibitem{chen2020simple} Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In \emph{ICML}, 1597--1607.
\bibitem{kingma2015adam} Diederik P. Kingma and Jimmy Ba. 2015. Adam: A method for stochastic optimization. In \emph{ICLR}.
\end{thebibliography}
\end{document}

PDF Preview
Create an account to compile and preview