\documentclass[11pt,a4paper]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[margin=1in]{geometry}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{cite}
\usepackage{xcolor}
\hypersetup{colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{example}[theorem]{Example}
\title{The Title of Your Paper: A Subtitle for Clarification}
\author{First Author\thanks{Corresponding author: \texttt{first.author@example.com}} \\ Department of X \\ University of Y
\and Second Author \\ Department of X \\ University of Y}
\date{\today}
\begin{document}
\maketitle
\begin{abstract}
This is the abstract of your paper. In 150 to 250 words, state the problem,
the method, the key findings, and the implications. A good abstract is
self-contained: a reader should be able to understand the contribution of the
paper without looking at anything else. Use past tense for what you did and
present tense for conclusions.
\end{abstract}
\section{Introduction}\label{sec:intro}
Motivate the problem in this section. Explain why it matters, what the prior
state of the art is, and what is missing. End the introduction with a clear
statement of your contributions as a bulleted list:
\begin{itemize}
\item A novel method for X that improves on Y.
\item An empirical study across three benchmarks.
\item Open-source code and datasets at \url{https://example.com}.
\end{itemize}
References look like~\cite{einstein1905,turing1936}.
We summarize the key notation used throughout the paper in Table~\ref{tab:notation}.
\begin{table}[t]
\centering
\begin{tabular}{ll}
\toprule
Symbol & Meaning \\
\midrule
$\mathcal{X}$ & Input space \\
$\mathcal{Y}$ & Output space \\
$f_\theta$ & Model parameterized by $\theta$ \\
$\ell(\cdot,\cdot)$ & Loss function \\
$n$ & Number of training samples \\
$p$ & Dimensionality of feature space \\
$\lambda$ & Regularization parameter \\
\bottomrule
\end{tabular}
\caption{Summary of notation used in this paper.}
\label{tab:notation}
\end{table}
\section{Related Work}\label{sec:related}
Position the paper relative to prior work. Group related papers thematically
and discuss each cluster in a paragraph.
\section{Method}\label{sec:method}
Describe your approach. Equations render inline like $E = mc^2$, and as
displayed equations like
\begin{equation}\label{eq:schrodinger}
i\hbar\frac{\partial}{\partial t}\Psi(\mathbf{r},t) = \hat{H}\,\Psi(\mathbf{r},t).
\end{equation}
For multi-line alignment:
\begin{align}
f(x) &= ax^2 + bx + c, \\
f'(x) &= 2ax + b.
\end{align}
\begin{definition}[Something]
A formal definition of the object of interest.
\end{definition}
\begin{theorem}[Main result]
If assumption A holds, then conclusion B follows.
\end{theorem}
\begin{proof}
Sketch: first note that\ldots, then by applying Lemma~X\ldots, and the conclusion follows.
\end{proof}
\subsection{Complexity Analysis}\label{sec:complexity}
Let $T(n)$ denote the running time of the proposed algorithm on an input of
size $n$. The dominant cost arises from the matrix inversion step, which
requires $O(p^3)$ operations in the general case. By exploiting the sparse
structure of the Hessian (at most $s$ non-zero entries per row), we reduce
this to $O(s^2 p)$.
\begin{proposition}[Time complexity]
Under the sparsity assumption $\|\theta^\star\|_0 \leq s$, the total
computational cost of the proposed algorithm is $O(n s^2 p + n \log n)$,
where the $n \log n$ term arises from the sorting step in the initialization
phase.
\end{proposition}
The space complexity is $O(np + s^2)$, dominated by storage of the data
matrix and the sparse Hessian approximation.
\section{Experiments}\label{sec:exp}
\begin{figure}[t]
\centering
% \includegraphics[width=0.7\linewidth]{figure.png}
\fbox{\parbox[c][5cm][c]{0.7\linewidth}{\centering placeholder figure}}
\caption{A descriptive caption. Place captions below figures.}
\label{fig:main}
\end{figure}
\begin{figure}[t]
\centering
\fbox{\parbox[c][4cm][c]{0.45\linewidth}{\centering Convergence curve: training loss vs.\ epochs}}
\hfill
\fbox{\parbox[c][4cm][c]{0.45\linewidth}{\centering Convergence curve: validation accuracy vs.\ epochs}}
\caption{Training dynamics of the proposed method on Benchmark~B. Left: training loss decreases monotonically, confirming convergence of the optimizer. Right: validation accuracy plateaus after approximately 60 epochs.}
\label{fig:convergence}
\end{figure}
\begin{table}[t]
\centering
\caption{Results on the benchmark. Place captions above tables.}
\label{tab:main}
\begin{tabular}{lrrr}
\toprule
Method & Accuracy & F1 & Latency (ms) \\
\midrule
Baseline & 82.1 & 0.78 & 120 \\
Ours & \textbf{89.4} & \textbf{0.86} & \textbf{95} \\
\bottomrule
\end{tabular}
\end{table}
\section{Discussion}\label{sec:discussion}
Interpret the results. What do the numbers in Table~\ref{tab:main} mean in
context? Are there failure cases? Limitations?
\section{Conclusion}\label{sec:conclusion}
Summarize the contribution, restate the most important numbers, and suggest
future directions.
\section*{Acknowledgments}
We thank the reviewers for their feedback. This work was supported by grant
XYZ.
\begin{thebibliography}{9}
\bibitem{einstein1905} A. Einstein, ``Zur Elektrodynamik bewegter K\"orper,'' \emph{Annalen der Physik}, vol.~17, pp.~891--921, 1905.
\bibitem{turing1936} A. M. Turing, ``On Computable Numbers, with an Application to the Entscheidungsproblem,'' \emph{Proc.\ London Math.\ Soc.}, vol.~42, pp.~230--265, 1936.
\bibitem{cortes1995} C. Cortes and V. Vapnik, ``Support-vector networks,'' \emph{Machine Learning}, vol.~20, no.~3, pp.~273--297, 1995.
\bibitem{lecun1998} Y. LeCun, L. Bottou, Y. Bengio, and P. Haffner, ``Gradient-based learning applied to document recognition,'' \emph{Proceedings of the IEEE}, vol.~86, no.~11, pp.~2278--2324, 1998.
\bibitem{vaswani2017} A. Vaswani et al., ``Attention is all you need,'' in \emph{Advances in Neural Information Processing Systems}, vol.~30, 2017.
\end{thebibliography}
\appendix
\section{Supplementary Proof}\label{app:proof}
\begin{lemma}[Auxiliary bound]\label{lem:aux}
Let $X_1, \ldots, X_n$ be i.i.d.\ sub-Gaussian random variables with parameter
$\sigma$. Then for any $t > 0$,
\begin{equation}
P\!\left(\left|\frac{1}{n}\sum_{i=1}^n X_i - \mathbb{E}[X_1]\right| > t\right) \leq 2\exp\!\left(-\frac{nt^2}{2\sigma^2}\right).
\end{equation}
\end{lemma}
\begin{proof}
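Write $\mu = \mathbb{E}[X_1]$. By the moment-generating function characterization of sub-Gaussian random
variables, for any $s > 0$ we have $\mathbb{E}[e^{s(X_i - \mu)}] \leq e^{s^2\sigma^2/2}$.
Applying Markov's inequality to $e^{s \sum_i (X_i - \mu)}$, using independence to
factor the moment-generating function, and optimizing over $s$ (the optimum is
$s = t/\sigma^2$) yields the upper-tail bound $\exp(-nt^2/(2\sigma^2))$. The same
argument applied to $-X_i$ bounds the lower tail, and the two-sided inequality
follows by a union bound over the upper and lower tails.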
\end{proof}
\begin{corollary}
Under the conditions of Lemma~\ref{lem:aux}, for any $\delta \in (0,1)$, with
probability at least $1-\delta$ the sample mean deviates from the population
mean by at most $\sigma\sqrt{2\log(2/\delta)/n}$, which is
$O(\sigma\sqrt{\log(1/\delta)/n})$ for $\delta \leq 1/2$.
\end{corollary}
This concentration inequality is used in the proof of Theorem~3.2 (the main
result in Section~\ref{sec:method}) to control the deviation of the empirical
risk from the population risk uniformly over the parameter space.
\end{document}