\documentclass[conference,letterpaper]{IEEEtran}
\IEEEoverridecommandlockouts
\usepackage{cite}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{hyperref}
\begin{document}
\title{Memory-Side Compression for Transformer Training:\\
A Hardware-Software Co-Design}
\author{\IEEEauthorblockN{First Last, Jane Doe, John Smith}
\IEEEauthorblockA{\textit{University of Example}\\
\{you, jane, john\}@example.com}}
\maketitle
\begin{abstract}
Transformer training at scale is memory-bandwidth-bound. Memory-Side
Compression (MSC) compresses activation tensors in DRAM using a
lightweight encoding tailored to GELU and softmax outputs. MSC
delivers 1.63$\times$ the effective bandwidth and a 26\% end-to-end
training speedup on a 70B-parameter model.
\end{abstract}
\begin{IEEEkeywords}
memory, compression, transformer training, HBM
\end{IEEEkeywords}
\section{Introduction}
As compute scales, memory bandwidth becomes the dominant bottleneck.
\section{Background}
This section reviews HBM subsystems, activation checkpointing, and
tensor compression.
\section{Design}
MSC encodes activations in DRAM via adaptive clustering. The decoder is
implemented in HBM's near-memory logic. The small tails of GELU outputs
allow 3$\times$ compression with 0.03~dB post-training accuracy loss.
\section{Evaluation}
\begin{table}[t]
\caption{End-to-end training throughput (steps/s) on 256 GPUs.}
\label{tab:throughput}
\centering\small
\begin{tabular}{lcc}
\toprule
Model & Baseline & \textbf{MSC} \\
\midrule
GPT-70B & 0.42 & \textbf{0.53} \\
LLaMA-65B & 0.39 & \textbf{0.50} \\
Mixtral-45B & 0.61 & \textbf{0.77} \\
\bottomrule
\end{tabular}
\end{table}
\section{Conclusion}
Simple memory-side compression delivers meaningful end-to-end training
speedups with negligible accuracy impact.
\bibliographystyle{IEEEtran}
\bibliography{refs}
\end{document}