\documentclass{tufte-handout}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amssymb}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage[hidelinks]{hyperref}
\title{Lecture Notes \\ \emph{Introduction to Information Theory}}
\author[First Last]{First Last}
\date{\today}
\begin{document}
\maketitle
\begin{abstract}
\noindent These notes introduce the basic quantities of information theory:
entropy, mutual information, and channel capacity. Proofs are sketched;
references are provided in the margins.
\end{abstract}
\section{Entropy}
The entropy of a discrete random variable $X$ with pmf $p$ is
\begin{equation}
H(X) = -\sum_{x} p(x) \log p(x).\sidenote{When $\log$ is base 2, $H$ is in \emph{bits}.}
\end{equation}
\newthought{Intuitively,} entropy measures uncertainty. A fair coin has $H = 1$ bit;
a biased coin has $H < 1$.
\begin{marginfigure}
\centering
\begin{tikzpicture}
\draw[->] (0,0) -- (3.2,0) node[right]{$p$};
\draw[->] (0,0) -- (0,1.4) node[above]{$H(p)$};
\draw[thick,domain=0.01:0.99,smooth,variable=\p] plot ({3*\p}, {-\p*log2(\p) - (1-\p)*log2(1-\p)});
\end{tikzpicture}
\caption{Binary entropy function.}
\end{marginfigure}
\section{Mutual Information}
The mutual information $I(X;Y)$ measures shared uncertainty:
\begin{equation}
I(X;Y) = H(X) - H(X \mid Y) = H(Y) - H(Y \mid X).
\end{equation}
\section{Channel Capacity}\marginnote{Shannon, 1948.}
For a discrete memoryless channel with transition kernel $p(y\mid x)$,
\begin{equation}
C = \max_{p(x)} I(X;Y).
\end{equation}
Shannon's channel-coding theorem shows $C$ is the largest rate at which
information can be transmitted with vanishing error.
\section{Examples}
\begin{table}[h]
\centering
\begin{tabular}{lc}
\toprule
Channel & Capacity \\
\midrule
Noiseless binary & $1$ bit \\
Binary symmetric $p$ & $1 - H(p)$ \\
Erasure $p$ & $1 - p$ \\
\bottomrule
\end{tabular}
\caption{Capacities of common channels.}
\end{table}
\end{document}

PDF Preview
Create an account to compile and preview