\documentclass[11pt,twocolumn]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[margin=0.75in]{geometry}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{enumitem}
\usepackage{xcolor}
\usepackage{newtxtext}% Times text font; replaces the obsolete `times' package
\usepackage{amsthm}% provides the theorem/proof environments used in the Method section
\usepackage{natbib}
\usepackage{url}
\usepackage{microtype}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{caption}
\usepackage{multirow}
\usepackage{hyperref}% hyperref loads last
\newtheorem{theorem}{Theorem}
\setlength{\columnsep}{0.25in}
\title{\Large\bfseries Disentangled Geometry-Aware Representations via\\Equivariant Contrastive Learning}
\author{
\textbf{Marcus Chen}\textsuperscript{1}\quad
\textbf{Elena Voronova}\textsuperscript{2}\quad
\textbf{Kwame Asante}\textsuperscript{1}\quad
\textbf{Isabelle Dubois}\textsuperscript{3}\\[4pt]
\textsuperscript{1}MILA, Universit\'e de Montr\'eal\quad
\textsuperscript{2}DeepMind\quad
\textsuperscript{3}INRIA Paris\\[2pt]
{\small\texttt{\{marcus.chen,kwame.asante\}@mila.quebec, [email protected], [email protected]}}
}
\date{}
\begin{document}
\maketitle
\begin{abstract}
Learning representations that capture the underlying structure of data while being robust to nuisance variations is a central goal of representation learning. We introduce \textsc{EquiCL}, an equivariant contrastive learning framework that learns disentangled representations by explicitly encoding geometric transformations as structured operations in the latent space. Unlike standard contrastive methods that learn invariances by discarding transformation information, \textsc{EquiCL} preserves transformation structure through equivariant mappings, enabling separate control over content and pose. Our theoretical analysis shows that equivariant representations provably disentangle content from geometric factors under mild assumptions. Empirically, \textsc{EquiCL} achieves state-of-the-art disentanglement scores on dSprites and 3DShapes while maintaining competitive downstream classification accuracy on ImageNet-1K (79.4\% linear probe). We demonstrate practical benefits in few-shot learning, data augmentation, and controllable image generation.
\end{abstract}
\section{Introduction}
Self-supervised contrastive learning has emerged as a powerful paradigm for visual representation learning \citep{chen2020simclr,he2020moco}. The standard approach creates positive pairs through data augmentation and learns representations that are \emph{invariant} to these augmentations. While effective for downstream classification, this invariance comes at a cost: information about the applied transformations is deliberately discarded.
However, many downstream tasks require sensitivity to geometric properties. Pose estimation, spatial reasoning, 3D understanding, and robotic manipulation all benefit from representations that \emph{encode} rather than discard geometric information. This motivates a shift from invariant to \emph{equivariant} representations, where transformations of the input produce structured, predictable changes in the representation.
We propose \textsc{EquiCL}, a framework that learns equivariant contrastive representations by decomposing the latent space into a content subspace (invariant to geometric transformations) and a pose subspace (equivariant to transformations). Our approach is grounded in group theory: given a transformation group $G$ acting on images, we learn an encoder $f$ such that:
\begin{equation}
f(g \cdot x) = \rho(g) \cdot f(x), \quad \forall g \in G,
\end{equation}
where $\rho$ is a learned representation of the group.
\paragraph{Contributions.}
\begin{itemize}[nosep,leftmargin=*]
\item We formalize equivariant contrastive learning and prove that it achieves disentanglement of content and geometric factors.
\item We propose a practical architecture with group-structured latent spaces and equivariance-preserving losses.
\item We demonstrate strong empirical performance on disentanglement benchmarks while maintaining competitive downstream accuracy.
\item We show benefits in few-shot learning and controllable generation.
\end{itemize}
\section{Related Work}
\paragraph{Contrastive Learning.}
SimCLR \citep{chen2020simclr} and MoCo \citep{he2020moco} learn invariant representations through contrastive objectives. BYOL \citep{grill2020byol} and VICReg \citep{bardes2022vicreg} avoid negative pairs but maintain the invariance paradigm. Our work modifies the contrastive objective to preserve rather than discard augmentation information.
\paragraph{Equivariant Networks.}
Equivariant neural networks \citep{cohen2016group,weiler2019general} build transformation symmetries directly into network architecture. While these works enforce equivariance through architectural constraints, \textsc{EquiCL} learns equivariance through the training objective, offering greater flexibility.
\paragraph{Disentangled Representations.}
$\beta$-VAE \citep{higgins2017beta} and FactorVAE \citep{kim2018disentangling} learn disentangled representations through modified variational objectives. Recent work by \citet{locatello2019challenging} showed that unsupervised disentanglement requires inductive biases. Our equivariance requirement provides a principled inductive bias grounded in transformation structure.
\section{Method}
\subsection{Preliminaries}
Let $\mathcal{X}$ denote the image space and $G$ a group of transformations acting on $\mathcal{X}$. In practice, $G$ includes rotations, translations, scaling, and color transformations. An encoder $f\colon \mathcal{X} \to \mathcal{Z}$ is \emph{equivariant} with respect to $G$ if there exists a group homomorphism $\rho\colon G \to \mathrm{GL}(\mathcal{Z})$ such that $f(g \cdot x) = \rho(g) \cdot f(x)$ for all $x \in \mathcal{X}$, $g \in G$.
\subsection{Decomposed Latent Space}
We decompose the latent space as $\mathcal{Z} = \mathcal{Z}_c \times \mathcal{Z}_p$ where $\mathcal{Z}_c$ is the \emph{content} subspace and $\mathcal{Z}_p$ is the \emph{pose} subspace. The group $G$ acts trivially on $\mathcal{Z}_c$ (invariance) and non-trivially on $\mathcal{Z}_p$ (equivariance):
\begin{align}
\rho(g) \cdot (z_c, z_p) &= (z_c, \rho_p(g) \cdot z_p)
\end{align}
The encoder has two heads: $f_c\colon \mathcal{X} \to \mathcal{Z}_c$ for content and $f_p\colon \mathcal{X} \to \mathcal{Z}_p$ for pose.
\subsection{Equivariant Contrastive Loss}
Given an image $x$ and transformations $g_1, g_2 \in G$, we form augmented views $x_1 = g_1 \cdot x$ and $x_2 = g_2 \cdot x$. The loss has three components:
\paragraph{Content Invariance.} Content representations of the same image under different transformations should be similar:
\begin{equation}
\mathcal{L}_{\text{inv}} = -\log \frac{\exp(\operatorname{sim}(f_c(x_1), f_c(x_2))/\tau)}{\sum_{j}\exp(\operatorname{sim}(f_c(x_1), f_c(x_j))/\tau)}.
\end{equation}
\paragraph{Pose Equivariance.} The relative transformation $g_{12} = g_2 \circ g_1^{-1}$ should be predictable from pose representations:
\begin{equation}
\mathcal{L}_{\text{equiv}} = \bigl\|f_p(x_2) - \rho_p(g_{12}) \cdot f_p(x_1)\bigr\|^2.
\end{equation}
\paragraph{Disentanglement.} Content and pose should be statistically independent:
\begin{equation}
\mathcal{L}_{\text{disent}} = \bigl|\operatorname{HSIC}(f_c(X), f_p(X))\bigr|,
\end{equation}
where HSIC is the Hilbert--Schmidt Independence Criterion.
The total objective is:
\begin{equation}
\mathcal{L} = \mathcal{L}_{\text{inv}} + \lambda_1 \mathcal{L}_{\text{equiv}} + \lambda_2 \mathcal{L}_{\text{disent}}.
\end{equation}
\subsection{Theoretical Analysis}
\begin{theorem}[Disentanglement Guarantee]
\label{thm:main}
Under the data generative model $x = h(c, p)$ where $c$ (content) and $p$ (pose) are independent, and $G$ acts only on the pose factor, a minimizer of $\mathcal{L}$ satisfies: (i) $f_c$ depends only on $c$, and (ii) $f_p$ depends only on $p$, up to group-equivariant isomorphism.
\end{theorem}
\begin{proof}[Proof sketch]
The invariance loss forces $f_c$ to be constant along $G$-orbits. The equivariance loss constrains $f_p$ to transform predictably under $G$. The HSIC term prevents information leakage between subspaces. The combination uniquely identifies the content-pose decomposition under the assumed generative model. Full proof in Appendix A.
\end{proof}
\section{Experiments}
\subsection{Disentanglement Evaluation}
\begin{table}[t]
\centering
\caption{Disentanglement metrics on dSprites and 3DShapes.}
\label{tab:disent}
\small
\begin{tabular}{@{}lcccc@{}}
\toprule
& \multicolumn{2}{c}{\textbf{dSprites}} & \multicolumn{2}{c}{\textbf{3DShapes}} \\
\cmidrule(lr){2-3}\cmidrule(lr){4-5}
\textbf{Method} & DCI$\uparrow$ & MIG$\uparrow$ & DCI$\uparrow$ & MIG$\uparrow$ \\
\midrule
$\beta$-VAE & 0.63 & 0.18 & 0.71 & 0.31 \\
FactorVAE & 0.72 & 0.24 & 0.76 & 0.38 \\
SimCLR & 0.41 & 0.08 & 0.52 & 0.15 \\
AugSelf & 0.68 & 0.21 & 0.74 & 0.34 \\
SEN & 0.75 & 0.29 & 0.79 & 0.41 \\
\midrule
\textbf{EquiCL} & \textbf{0.84} & \textbf{0.38} & \textbf{0.88} & \textbf{0.52} \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:disent} shows that \textsc{EquiCL} substantially outperforms both VAE-based and contrastive disentanglement methods. The DCI disentanglement score improves by 12\% on dSprites and 11\% on 3DShapes over the best baseline.
\subsection{Downstream Classification}
On ImageNet-1K linear probing, \textsc{EquiCL} achieves 79.4\% top-1 accuracy using a ResNet-50 encoder (300 epochs pretraining), compared to 76.5\% for SimCLR and 78.2\% for BYOL under identical settings. The equivariant representations retain more information than invariant ones, benefiting classification.
\subsection{Few-Shot Learning}
\begin{table}[t]
\centering
\caption{Few-shot classification on \emph{mini}-ImageNet.}
\label{tab:fewshot}
\small
\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Method} & \textbf{5-way 1-shot} & \textbf{5-way 5-shot} \\
\midrule
SimCLR + ProtoNet & 61.3 & 78.2 \\
BYOL + ProtoNet & 63.1 & 79.8 \\
\textbf{EquiCL + ProtoNet} & \textbf{67.8} & \textbf{83.1} \\
\bottomrule
\end{tabular}
\end{table}
Table~\ref{tab:fewshot} demonstrates substantial gains in the few-shot setting, where disentangled representations allow the model to generalize from fewer examples by separating content from pose variation.
\section{Conclusion}
We presented \textsc{EquiCL}, a contrastive learning framework that learns disentangled representations by replacing invariance with equivariance. By decomposing latent spaces into content and pose subspaces with group-structured transformations, our approach provably achieves disentanglement while maintaining strong downstream performance. The practical benefits extend to few-shot learning, data-efficient transfer, and controllable generation.
\section*{Reproducibility Statement}
All experiments use publicly available datasets. Complete hyperparameter configurations, architectural details, and training procedures are provided in Appendix B. Code and pretrained models will be released upon publication at \url{https://github.com/equicl/equicl}.
{\small
\bibliographystyle{plainnat}
\begin{thebibliography}{15}
\bibitem[Bardes et~al.(2022)]{bardes2022vicreg}
A.~Bardes, J.~Ponce, and Y.~LeCun.
\newblock {VICReg}: Variance-invariance-covariance regularization for self-supervised learning.
\newblock In \emph{Proc.\ ICLR}, 2022.
\bibitem[Chen et~al.(2020)]{chen2020simclr}
T.~Chen, S.~Kornblith, M.~Norouzi, and G.~Hinton.
\newblock A simple framework for contrastive learning of visual representations.
\newblock In \emph{Proc.\ ICML}, 2020.
\bibitem[Cohen and Welling(2016)]{cohen2016group}
T.~Cohen and M.~Welling.
\newblock Group equivariant convolutional networks.
\newblock In \emph{Proc.\ ICML}, 2016.
\bibitem[Grill et~al.(2020)]{grill2020byol}
J.-B. Grill, F.~Strub, F.~Altch\'e, C.~Tallec, P.~Richemond, E.~Buchatskaya, C.~Doersch, B.~Avila~Pires, Z.~Guo, M.~Gheshlaghi~Azar, et~al.
\newblock Bootstrap your own latent.
\newblock In \emph{Proc.\ NeurIPS}, 2020.
\bibitem[He et~al.(2020)]{he2020moco}
K.~He, H.~Fan, Y.~Wu, S.~Xie, and R.~Girshick.
\newblock Momentum contrast for unsupervised visual representation learning.
\newblock In \emph{Proc.\ CVPR}, 2020.
\bibitem[Higgins et~al.(2017)]{higgins2017beta}
I.~Higgins, L.~Matthey, A.~Pal, C.~Burgess, X.~Glorot, M.~Botvinick, S.~Mohamed, and A.~Lerchner.
\newblock $\beta$-{VAE}: Learning basic visual concepts with a constrained variational framework.
\newblock In \emph{Proc.\ ICLR}, 2017.
\bibitem[Kim and Mnih(2018)]{kim2018disentangling}
H.~Kim and A.~Mnih.
\newblock Disentangling by factorising.
\newblock In \emph{Proc.\ ICML}, 2018.
\bibitem[Locatello et~al.(2019)]{locatello2019challenging}
F.~Locatello, S.~Bauer, M.~Lucic, G.~R\"atsch, S.~Gelly, B.~Sch\"olkopf, and O.~Bachem.
\newblock Challenging common assumptions in the unsupervised learning of disentangled representations.
\newblock In \emph{Proc.\ ICML}, 2019.
\bibitem[Weiler and Cesa(2019)]{weiler2019general}
M.~Weiler and G.~Cesa.
\newblock General {E}(2)-equivariant steerable {CNN}s.
\newblock In \emph{Proc.\ NeurIPS}, 2019.
\end{thebibliography}
}
\end{document}
