\documentclass[conference,letterpaper]{IEEEtran}
\IEEEoverridecommandlockouts
\usepackage{cite}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{hyperref}
\begin{document}
\title{SpinFlux: A Reconfigurable Tensor Core for\\
Sparse and Dense Deep Learning Workloads}
\author{\IEEEauthorblockN{First Last, Jane Doe, John Smith}
\IEEEauthorblockA{\textit{University of Example}\\
\{you, jane, john\}@example.com}}
\maketitle
\begin{abstract}
Modern deep learning (DL) workloads exhibit variable sparsity across layers.
Fixed-function tensor cores under-utilize arithmetic units on sparse
workloads. SpinFlux dynamically switches between dense MAC, 2:4 sparse,
and block-sparse modes within a single fused operation, achieving up to
2.3$\times$ the throughput per mm$^2$ of a dense tensor-core baseline.
\end{abstract}
\begin{IEEEkeywords}
tensor core, sparsity, hardware accelerator
\end{IEEEkeywords}
\section{Introduction}
Sparsity in deep networks is ubiquitous; fixed accelerators leave
performance on the table.
\section{Background}
This section reviews dense MAC arrays, 2:4 sparsity support, and
block-sparse compute.
\section{SpinFlux Design}
A new operand network routes different sparsity patterns through shared
MAC units. Reconfiguration happens at operation granularity and takes
less than one cycle.
\section{Evaluation}
\begin{table}[t]
\centering\small
\begin{tabular}{lccc}
\toprule
Workload & Dense TC & 2:4 TC & \textbf{SpinFlux} \\
\midrule
BERT-large & 1.00 & 1.42 & \textbf{1.85} \\
ResNet-50 & 1.00 & 1.28 & \textbf{1.64} \\
GPT-3 (block-sparse) & 1.00 & 1.16 & \textbf{2.31} \\
\bottomrule
\end{tabular}
\caption{Throughput per mm$^2$, normalized to the dense tensor-core
(Dense TC) baseline.}
\label{tab:throughput}
\end{table}
\section{Conclusion}
A modestly reconfigurable tensor core recovers substantial performance
across the diverse sparsity mix of modern workloads.
\bibliographystyle{IEEEtran}
\bibliography{refs}
\end{document}

PDF Preview
Create an account to compile and preview