\documentclass[wcp]{jmlr}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{natbib}
\usepackage{algorithm}
\usepackage{algorithmic}
\jmlrheading{25}{2024}{1-25}{4/24}{9/24}{}{First Author and Second Author}
\ShortHeadings{Short Title}{Author and Author}
\firstpageno{1}
\title{A Paper for the Journal of Machine Learning Research}
\author{\name First A. Author \email [email protected] \\
\addr Department of X \\
University of Y
\AND
\name Second B. Author \email [email protected] \\
\addr Department of X \\
University of Y}
\editor{Editor Name}
\begin{document}
\maketitle
\begin{abstract}
We consider the problem of learning sparse, interpretable representations from high-dimensional data under distributional shift. Existing approaches based on empirical risk minimization suffer from poor generalization when the training and test distributions differ, a setting that arises frequently in clinical, financial, and scientific applications. We propose Distributionally Robust Sparse Learning (DRSL), a framework that minimizes the worst-case expected loss over an ambiguity set of distributions defined by a Wasserstein ball around the empirical measure. We prove that the resulting estimator achieves minimax-optimal rates of convergence under standard regularity conditions and recovers the true support with high probability when the signal-to-noise ratio exceeds a sharp threshold. Experiments on synthetic data, gene expression datasets, and financial return prediction demonstrate that DRSL consistently outperforms $\ell_1$-regularized baselines and distributionally robust methods that ignore sparsity structure, with improvements of 8--15\% in out-of-distribution accuracy.
\end{abstract}
\begin{keywords}
distributionally robust optimization, sparse learning, Wasserstein distance, high-dimensional statistics, support recovery
\end{keywords}
\section{Introduction}
Modern machine learning systems are increasingly deployed in environments where the data distribution at test time differs from the training distribution~\citep{vapnik1998}. This phenomenon, known as distributional shift, arises in applications ranging from clinical risk prediction~\citep{shimodaira2000} to autonomous driving and financial forecasting~\citep{ben-david2010}. Standard empirical risk minimization (ERM) provides no guarantees under such shifts.
Distributionally robust optimization (DRO) addresses this limitation by optimizing worst-case performance over a set of plausible distributions~\citep{duchi2021}. However, existing DRO formulations typically do not exploit structure in the parameter space, leading to overly conservative estimators in high-dimensional regimes.
In this paper, we bridge these two lines of work by proposing Distributionally Robust Sparse Learning (DRSL), which combines Wasserstein DRO with $\ell_1$-regularization. Our contributions are:
\begin{itemize}
\item A tractable convex formulation for Wasserstein DRO with sparsity constraints that admits an efficient ADMM solver.
\item Finite-sample excess risk bounds showing minimax-optimal rates of $O(\sqrt{s \log p / n})$ where $s$ is the sparsity level, $p$ the ambient dimension, and $n$ the sample size.
\item A sharp threshold condition for exact support recovery under distributional uncertainty.
\item Comprehensive experiments demonstrating consistent improvements over baselines on three real-world tasks.
\end{itemize}
\section{Preliminaries}
We introduce notation used throughout the paper.
\begin{table}[t]
\centering
\caption{Summary of notation.}
\label{tab:notation}
\begin{tabular}{ll}
\toprule
Symbol & Meaning \\
\midrule
$\mathcal{X} \subseteq \mathbb{R}^p$ & Feature space \\
$\mathcal{Y} \subseteq \mathbb{R}$ & Response space \\
$P_0$ & True data-generating distribution \\
$\hat{P}_n$ & Empirical distribution over $n$ samples \\
$\mathcal{B}_\varepsilon(\hat{P}_n)$ & Wasserstein ball of radius $\varepsilon$ around $\hat{P}_n$ \\
$\ell(\cdot, \cdot)$ & Loss function \\
$\Omega(\theta) = \|\theta\|_1$ & Sparsity-inducing regularizer \\
$s = \|\theta^\star\|_0$ & True sparsity level \\
\bottomrule
\end{tabular}
\end{table}
Let $\{(x_i, y_i)\}_{i=1}^n$ be drawn i.i.d.\ from an unknown distribution $P_0$ on $\mathcal{X} \times \mathcal{Y}$. The Wasserstein distance of order $q$ between two distributions $P$ and $Q$ is defined as
\begin{equation}
W_q(P, Q) = \left(\inf_{\gamma \in \Gamma(P,Q)} \int \|z - z'\|^q \, d\gamma(z, z')\right)^{1/q},
\end{equation}
where $\Gamma(P,Q)$ denotes the set of couplings with marginals $P$ and $Q$.
\section{Method}
Our objective combines distributional robustness with sparsity:
\begin{equation}\label{eq:drsl}
\hat{\theta} = \arg\min_\theta \sup_{P \in \mathcal{B}_\varepsilon(\hat{P}_n)} \mathbb{E}_P[\ell(y, f_\theta(x))] + \lambda\|\theta\|_1.
\end{equation}
\begin{lemma}[Strong duality]\label{lem:duality}
Under Assumption~A1 (Lipschitz continuity of $\ell$), the inner supremum in~\eqref{eq:drsl} admits the dual representation
\begin{equation}
\sup_{P \in \mathcal{B}_\varepsilon(\hat{P}_n)} \mathbb{E}_P[\ell(y, f_\theta(x))] = \inf_{\eta \geq 0}\left\{\eta\varepsilon + \frac{1}{n}\sum_{i=1}^n \sup_{(x',y')} \left[\ell(y', f_\theta(x')) - \eta\,c((x_i,y_i),(x',y'))\right]\right\},
\end{equation}
where $c$ is the ground cost and $\eta$ is the dual variable.
\end{lemma}
\begin{proof}
The result follows from Kantorovich duality applied to the Wasserstein ball constraint. By the minimax theorem, the order of the infimum over $\eta$ and the minimization over $\theta$ can be exchanged under the convexity-concavity conditions guaranteed by A1. The pointwise supremum reduces to a robust surrogate loss evaluated at each sample, yielding a tractable empirical objective.
\end{proof}
\begin{theorem}[Consistency and support recovery]
Under assumptions A1--A3, the DRSL estimator $\hat{\theta}$ satisfies:
\begin{enumerate}
\item $\|\hat{\theta} - \theta^\star\|_2 = O_P(\sqrt{s\log p / n})$.
\item If $\min_{j \in S} |\theta^\star_j| > C\sqrt{\log p / n}$ for a universal constant $C$, then $\text{supp}(\hat{\theta}) = \text{supp}(\theta^\star)$ with probability at least $1 - 2/p$.
\end{enumerate}
\end{theorem}
\begin{algorithm}[t]
\caption{ADMM Solver for DRSL}
\label{alg:admm}
\begin{algorithmic}[1]
\REQUIRE Data $\{(x_i, y_i)\}_{i=1}^n$, radius $\varepsilon$, penalty $\lambda$, step size $\rho$
\STATE Initialize $\theta^{(0)}, z^{(0)}, u^{(0)} \leftarrow 0$
\FOR{$k = 0, 1, 2, \ldots$ until convergence}
\STATE $\theta^{(k+1)} \leftarrow \arg\min_\theta \; L_\varepsilon(\theta) + \frac{\rho}{2}\|\theta - z^{(k)} + u^{(k)}\|_2^2$
\STATE $z^{(k+1)} \leftarrow \mathcal{S}_{\lambda/\rho}(\theta^{(k+1)} + u^{(k)})$ \hfill $\triangleright$ Soft-thresholding
\STATE $u^{(k+1)} \leftarrow u^{(k)} + \theta^{(k+1)} - z^{(k+1)}$
\ENDFOR
\RETURN $\theta^{(k+1)}$
\end{algorithmic}
\end{algorithm}
\subsection{Computational cost}
Each ADMM iteration requires solving a smooth optimization subproblem over $\theta$ (step~3), which costs $O(np)$ for linear models and can be accelerated with stochastic gradients for large $n$. The soft-thresholding step is $O(p)$. In practice, convergence is reached in 50--200 iterations for the datasets considered in our experiments.
\section{Experiments}
\subsection{Datasets}
We evaluate DRSL on three tasks: (1)~synthetic sparse regression with controlled distributional shift, (2)~gene expression classification using the TCGA pan-cancer dataset ($p = 20{,}531$ genes, $n = 5{,}000$ samples), and (3)~financial return prediction using monthly S\&P~500 constituent data.
\subsection{Experimental setup}
All methods are tuned via 5-fold cross-validation on the source domain. For DRO methods, the Wasserstein radius $\varepsilon$ is selected from $\{0.01, 0.05, 0.1, 0.5, 1.0\}$. The sparsity parameter $\lambda$ is chosen from a logarithmic grid of 20 values. We report mean and standard deviation over 10 random train/test splits.
\subsection{Main results}
\begin{figure}[t]
\centering
\fbox{\parbox[c][5cm][c]{0.7\linewidth}{\centering Out-of-distribution accuracy vs.\ Wasserstein radius $\varepsilon$ for DRSL, Lasso, and DRO baselines on the TCGA dataset}}
\caption{Performance comparison across varying levels of distributional shift. DRSL maintains higher accuracy across all shift magnitudes.}
\label{fig:main}
\end{figure}
\begin{table}[t]
\centering
\caption{Out-of-distribution test accuracy (\%) across three benchmarks. Best in bold.}
\label{tab:results}
\begin{tabular}{lccc}
\toprule
Method & Synthetic & TCGA & S\&P 500 \\
\midrule
ERM (Lasso) & $72.3 \pm 2.1$ & $68.7 \pm 3.4$ & $54.2 \pm 1.8$ \\
DRO (Wasserstein) & $76.8 \pm 1.9$ & $71.2 \pm 2.8$ & $56.1 \pm 2.0$ \\
Group DRO & $75.1 \pm 2.3$ & $70.5 \pm 3.1$ & $55.8 \pm 1.7$ \\
DRSL (Ours) & $\mathbf{83.6 \pm 1.4}$ & $\mathbf{79.1 \pm 2.2}$ & $\mathbf{62.4 \pm 1.5}$ \\
\bottomrule
\end{tabular}
\end{table}
\subsection{Ablation study}
To isolate the contribution of each component, we evaluate variants of DRSL: (a) without sparsity ($\lambda=0$), reducing to standard Wasserstein DRO; (b) without robustness ($\varepsilon=0$), reducing to Lasso; and (c) with $\ell_2$ regularization instead of $\ell_1$. Removing sparsity degrades TCGA accuracy by 7.9\%, while removing robustness degrades it by 10.4\%, confirming that both components are essential.
\subsection{Convergence analysis}
We empirically verify the convergence behavior of the ADMM solver. On the TCGA dataset, the primal residual $\|r^{(k)}\|_2$ decreases below $10^{-6}$ within 120 iterations, and the objective value stabilizes after approximately 80 iterations. Wall-clock time for the full optimization is under 45 seconds on a single NVIDIA A100 GPU using a PyTorch implementation.
\section{Discussion}
Our theoretical and empirical results demonstrate that exploiting sparsity structure within distributionally robust optimization leads to substantially improved generalization under shift. The minimax-optimal convergence rate of $O(\sqrt{s \log p / n})$ matches the rate of the oracle Lasso that knows the true distribution, indicating that robustness comes at no statistical cost in the high-dimensional regime. A limitation of DRSL is the requirement to specify the Wasserstein radius $\varepsilon$, for which principled calibration methods remain an open problem.
\section{Conclusion}
We introduced DRSL, a framework for distributionally robust sparse learning that achieves minimax-optimal estimation rates and exact support recovery under distributional uncertainty. Future work includes extensions to non-convex models and online settings.
\acks{We thank the anonymous reviewers for their constructive feedback. This work was supported in part by NSF Grant DMS-2023505.}
\bibliographystyle{jmlr}
\begin{thebibliography}{5}
\bibitem[Vapnik(1998)]{vapnik1998}
Vapnik, V. (1998). \emph{Statistical Learning Theory}. Wiley.
\bibitem[Shimodaira(2000)]{shimodaira2000}
Shimodaira, H. (2000). Improving predictive inference under covariate shift by reweighting the log-likelihood function. \emph{Journal of Statistical Planning and Inference}, 90(2):227--244.
\bibitem[Ben-David et~al.(2010)]{ben-david2010}
Ben-David, S., Blitzer, J., Crammer, K., Kulesza, A., Pereira, F., \& Vaughan, J.W. (2010). A theory of learning from different domains. \emph{Machine Learning}, 79(1):151--175.
\bibitem[Duchi \& Namkoong(2021)]{duchi2021}
Duchi, J.C. \& Namkoong, H. (2021). Learning models with uniform performance via distributionally robust optimization. \emph{Annals of Statistics}, 49(3):1378--1406.
\bibitem[Wainwright(2019)]{wainwright2019}
Wainwright, M.J. (2019). \emph{High-Dimensional Statistics: A Non-Asymptotic Viewpoint}. Cambridge University Press.
\end{thebibliography}
\end{document}

PDF Preview
Create an account to compile and preview