\documentclass[conference]{IEEEtran}
\IEEEoverridecommandlockouts
\usepackage{cite}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{hyperref}
\begin{document}
\title{How Developers Debug Machine Learning Pipelines:\\
An Empirical Study of 1{,}000 Production Issues}
\author{\IEEEauthorblockN{First Last}
\IEEEauthorblockA{\textit{University of Example} \\ first.last@example.edu}
\and
\IEEEauthorblockN{Jane Doe}
\IEEEauthorblockA{\textit{Example Research Labs} \\ jane.doe@example.com}
\and
\IEEEauthorblockN{John Smith}
\IEEEauthorblockA{\textit{University of Example} \\ john.smith@example.edu}}
\maketitle
\begin{abstract}
Debugging ML pipelines is harder than debugging traditional software:
failures can be silent, data-dependent, and temporally distant from
their causes. We present an empirical study of 1{,}000 ML-pipeline incidents
from three large organizations, classifying them by root cause,
detection, and resolution time. Data-quality issues dominate (47\%) and
existing tooling addresses them poorly.
\end{abstract}
\begin{IEEEkeywords}
machine learning, debugging, empirical software engineering
\end{IEEEkeywords}
\section{Introduction}
ML pipelines underpin consequential products. Their failures are poorly
understood compared to classical software failures.
\section{Research Questions}
\textbf{RQ1.} What are the dominant root causes of ML-pipeline incidents?\\
\textbf{RQ2.} How are these incidents detected, and how quickly?\\
\textbf{RQ3.} Which tools and practices correlate with faster resolution?
\section{Study Design}
We collected 1{,}000 post-mortem reports spanning 2022--2025 from three
organizations. Two researchers coded each report using an iteratively
refined taxonomy.
\section{Results}
\begin{table}[t]
\caption{Distribution of 1{,}000 ML-pipeline incidents by root cause,
with mean time to resolution (MTTR) in hours.}
\label{tab:root-causes}
\centering\small
\begin{tabular}{lcc}
\toprule
Root cause & Share & MTTR (h) \\
\midrule
Data quality & 47\% & 14.2 \\
Training/serving skew & 18\% & 22.6 \\
Config/infra & 15\% & 4.8 \\
Model degradation & 12\% & 31.1 \\
Other & 8\% & 9.3 \\
\bottomrule
\end{tabular}
\end{table}
\section{Discussion}
Data-quality issues are often invisible until they have produced weeks
of poor model output. Better monitoring of \emph{inputs} would shorten
the mean time to resolution (MTTR) considerably.
\section{Threats to Validity}
\emph{Construct:} our root-cause taxonomy may not generalize to other
organizations. \emph{Internal:} coder disagreement was resolved by
discussion, which may introduce consensus bias. \emph{External:} our
sample draws from three organizations; generalization is limited.
\section{Conclusion}
ML-pipeline debugging requires tooling and practices beyond those
adapted from classical software engineering.
\bibliographystyle{IEEEtran}
\bibliography{refs}
\end{document}

% (Stray web-editor preview text removed; it was not part of the paper.)