\documentclass{article}
\usepackage[margin=1.2in]{geometry}
\usepackage{graphicx}
\usepackage{amsmath,amssymb,amsthm,bm}
\usepackage{latexsym,color,minipage-marginpar,caption,multirow,verbatim}
\usepackage{enumerate}
\usepackage{times}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\EE}{\mathbb{E}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\Var}{\textnormal{Var}}
\newcommand{\simiid}{\overset{\text{i.i.d.}}{\sim}}
\newcommand{\td}{\,\textrm{d}}
\newcommand{\red}{\color{red}}
\newcommand{\blue}{\color{blue}}
\definecolor{darkblue}{rgb}{0.2, 0.2, 0.5}
\newcommand{\sol}{~\\\color{darkblue}{\bf Solution:~\\}}
\begin{document}
\title{Stats 210A, Fall 2023\\
Homework 1\\
{\large {\bf Due date}: Wednesday, Sep. 6}}
\date{}
\maketitle
\vspace{-5em}
You may disregard measure-theoretic niceties about conditioning on measure-zero sets, almost-sure equality vs. actual equality, ``all functions'' vs. ``all measurable functions,'' etc. (unless the problem is explicitly asking about such issues).
\begin{description}
\item[1. Bias-Variance Tradeoff]\hfill\\
Consider a generic estimation setting where we observe $X \sim P_\theta$, for a model $\cP = \{P_\theta:\; \theta \in \Theta\}$ with $\Theta \subseteq \RR^d$, and we want to estimate $\theta$ using some estimator $\delta(X) \in \RR^d$. The {\em bias} of $\delta$ (under sampling from $P_\theta$) is defined as
\[
\text{Bias}_\theta(\delta(X)) = \EE_\theta[\delta(X)] - \theta.
\]
For $d=1$, it is well-known that the mean squared error $\text{MSE}(\theta; \delta)$ can be decomposed as the sum of the squared bias of $\delta$ and its variance:
\begin{equation}\label{eq:biasvar}
\text{MSE}(\theta; \delta) = \text{Bias}_\theta(\delta)^2 + \Var_\theta(\delta).
\end{equation}
\begin{enumerate}[(a)]
\item Derive the correct generalization of \eqref{eq:biasvar} for general $d \geq 1$, where the MSE is defined as
\[
\text{MSE}(\theta; \delta) = \EE_\theta \|\delta(X) - \theta\|_2^2.
\]
It might help to start with $d=1$.
\item Suppose that we are estimating the false positive rate of a new diagnostic test for some disease, using a sample of $n$ specimens taken from a population known not to have the disease we are testing for. If $X$ is the number of false positives and $\theta \in (0,1)$ is the false positive rate, assume $X \sim \text{Binom}(n, \theta)$. The ``obvious'' estimator is $\delta_0(X) = X/n$.
However, biological samples are expensive to obtain and the new test is a slightly modified version of an old test whose false positive rate is known to be $\theta_0 \in (0,1)$, so we might want to ``shrink'' the estimator toward $\theta_0$ as follows:
\[
\delta_{\gamma}(X) = \gamma \theta_0 + (1-\gamma) \frac{X}{n}, \quad \text{ for } \gamma \in [0,1],
\]
where taking $\gamma = 0$ reduces to the ``obvious'' estimator $\delta_0(X) = X/n$.
Find the MSE of $\delta_\gamma(X)$ as an explicit expression in $\theta_0, \theta, n$, and $\gamma$.
\item Find the parameter $\gamma^*$ for which the MSE is minimized, as an expression in $n, \theta$, and $\theta_0$. What happens to $\gamma^*$ if we send $\theta \to \theta_0$ holding $\theta_0$ and $n$ fixed? What if we send $n\to\infty$ holding $\theta$ and $\theta_0$ fixed instead? Explain why these limits make sense.
\item In our calculation above, $\gamma^*$ is never exactly zero. That is, a smidgeon of shrinkage always beats no shrinkage. Does this prove that $\delta_0$ is inadmissible? Prove or disprove the claim that $\delta_0$ is dominated by some $\delta_\gamma$ with $\gamma > 0$.
{\bf Moral:} Shading our estimate toward some ``hunch'' value can be an effective technique to improve an estimator's performance. This is a central idea in statistics and machine learning that goes by many names: regularization, shrinkage, and inductive bias, to name a few. The optimal amount of bias in an estimator depends on the sample size, and the accuracy of our hunch, but is rarely zero. This may give us pause about insisting that estimators should be unbiased, a theme to which we will return later.
\end{enumerate}
\item[2. Convexity of $A(\eta)$ and $\Xi_1$]\hfill\\
Let $\cP=\{p_\eta:\; \eta \in \Xi_1\}$ denote an $s$-parameter exponential family in canonical form
\[
p_\eta(x) = e^{\eta'T(x) - A(\eta)}h(x), \qquad A(\eta) = \log\int_{\cX} e^{\eta'T(x)}h(x)\td \mu(x),
\]
where $\Xi_1=\{\eta:\; A(\eta) < \infty\}$ is the natural parameter space.
Recall H\"{o}lder's inequality: if $q_1,q_2\geq 1$ with $q_1^{-1} + q_2^{-1} = 1$, and $f_1$ and $f_2$ are ($\mu$-measurable) functions from $\cX$ to $\RR$, then
\[
\|f_1f_2\|_{L^1(\mu)} \leq \|f_1\|_{L^{q_1}(\mu)}\|f_2\|_{L^{q_2}(\mu)}, \quad \text{ where } \|f\|_{L^{q}(\mu)} = \left(\int_{\cX} |f(x)|^q\td \mu(x)\right)^{1/q}.
\]
({\bf Note} that $q_1=q_2=2$ reduces to Cauchy--Schwarz).
\begin{enumerate}[(a)]
\item Show that $A(\eta):\;\RR^s \to (-\infty,\infty]$ is a convex function: that is, for {\em any} $\eta_1,\eta_2\in \RR^s$ (not just in $\Xi_1$) and any $c\in [0,1]$, we have
\begin{equation}\label{eq:ineq}
A(c\eta_1 + (1-c)\eta_2) \leq c A(\eta_1) + (1-c) A(\eta_2)
\end{equation}
({\bf Hint}: try $q_1=c^{-1}$, $f_1(x)^{1/c}=e^{\eta_1'T(x)}h(x)$.)
\item Conclude that $\Xi_1\subseteq \RR^s$ is convex.
{\bf Moral:} The natural parameter space for any exponential family (meaning the set of all parameters $\eta$ that give normalizable densities) is a convex subset of $\RR^s$.
\end{enumerate}
\item[3. Expectation of an increasing function]\hfill\\
\begin{enumerate}[(a)]
\item Assume $X\sim P$ is a real-valued random variable. Show that if $f(x)$ and $g(x)$ are non-decreasing functions of $x$, then
\[
\text{Cov}(f(X),g(X)) \geq 0
\]
({\bf Hint}: derive the identity $\EE\left[(f(X_1)-f(X_2))(g(X_1)-g(X_2))\right] = 2\text{Cov}(f(X_1),g(X_1))$, where $X_1,X_2\simiid P$).
\item Let $p_\eta(x)$ be a one-parameter canonical exponential family with non-decreasing sufficient statistic $T(x)$, where $x\in\cX\subseteq \RR$:
\[
p_\eta(x) = e^{\eta T(x) - A(\eta)}h(x).
\]
Let $\psi(x)$ be any non-decreasing bounded function. Show that, for $\eta\in\Xi_1^{\text{o}}$, $\frac{d}{d \eta}\EE_{\eta}[\psi(X)] \geq 0$.
({\bf Hint}: find an expression for $\frac{d}{d \eta} \EE_{\eta}[\psi(X)]$ by using methods akin to the ones we used in class to derive the differential identities. You may appeal to Keener Theorem 2.4 to justify differentiating under the integral sign.)
\item Conclude that $X$ is stochastically increasing in $\eta$; that is, show $\PP_\eta(X \leq c)$ is non-increasing in $\eta$, for every $c \in \RR$.
\end{enumerate}
{\bf Moral:} This exercise confirms something that we should intuitively expect to be true: that increasing the natural parameter $\eta$, which ``tilts'' the distribution toward larger values of $T(X)$, will also shift the distribution of $X$ to the right if $T$ is an increasing function. It also illustrates the usefulness of differential identities for understanding exponential families' structure.
\item[4. Exponential families maximize entropy]\hfill\\
The entropy (with respect to $\mu$) of a random variable $X$ with density $p$, is defined by
\[
h(p) = \EE_p(-\log p(X)) = -\int_{\{x:\,p(x) > 0\}} \log(p(x)) p(x)\td \mu(x).
\]
Here, as always in this course, $\log$ denotes the natural logarithm, but $h$ is also commonly defined in terms of the log with base 2. Entropy arises naturally in information theory as a minimal expected code length (for the base-2 log), or in statistical mechanics as a measure of the disorder in a physical system.
Let $T:\; \cX\to\RR^s$ denote a generic function, and let $\alpha$ be some vector in the interior of the convex hull of $T(\cX) = \{T(x):\; x\in \cX\}$. Consider the problem of maximizing $h(p)$ over all probability densities subject to the constraint that $\EE_p[T(X)]=\alpha$. That is, we want to solve
\begin{align*}
\text{maximize} \quad &-\int_{\{x:\,p(x) > 0\}} \log(p(x)) p(x)\td \mu(x) \\[5pt]
\text{s.t.} \quad &p(x) \geq 0, \;\; \int_\cX p(x) \td \mu(x) = 1, \;\text{and}\; \int_\cX p(x)T(x)\td \mu(x)=\alpha \in \RR^s.
\end{align*}
\begin{enumerate}[(a)]
\item If $\cX$ is a finite set with $\mu(\{x\})>0$ for all $x\in\cX$, show that the optimal $p^*$ is a member of the $s$-parameter exponential family
\[p_{\eta}(x)=e^{\eta'T(x)-A(\eta)},\]
with parameter $\eta^*\in \RR^s$ chosen so that $p_{\eta^*}$
satisfies the constraints.
({\bf Hint}: use Lagrange multipliers).
\item Blithely\footnote{Meaning naively, without any concern that anything new might go wrong in a continuous space} applying the result of (a) to $\cX=\RR$, find the distribution that maximizes entropy with respect to the Lebesgue measure, subject to the constraint that $\EE(X) = \mu, \text{Var}(X) = \sigma^{2}$.
\item Assume that we need to place $n$ balls into $d$ bins. The number of ways to place the balls resulting in $k_i$ total balls in bin $i$, for $i=1,\ldots,d$, is given by the combinatorial expression $\frac{n!}{k_1! k_2! \cdots k_d!}$.
Now consider the empirical distribution of the balls. Its probability mass function is $p(i) = k_i/n$ with respect to the counting measure on $\{1,\ldots,d\}$. Let $N_p$ denote the number of configurations with empirical distribution $p$, and show that
\[
\log(N_p) = n h(p) + O(\log n),
\]
where $h(p)$ is the entropy with respect to the counting measure on $\{1,\ldots,d\}$.
In other words, there are many more high-entropy configurations than low-entropy configurations. This suggests the intuition that, if we consider a physical system at a ``macro level'' (such as the distribution of gas particles in a container) then we should expect it to drift toward high-entropy configurations.
{\bf Hint:} It may be helpful to recall Stirling's approximation:
\[
\log(n!) = n\log n - n + O(\log n)
\]
\end{enumerate}
{\bf Moral:} This exercise illustrates additional reasons why exponential family distributions are natural objects of study in statistics.
\item[5. Gamma family]\hfill\\
The gamma family is a two-parameter family of distributions on $\RR_+ = [0,\infty)$, with density
\[
p_{k,\theta}(x) = \frac{x^{k-1}e^{-x/\theta}}{\Gamma(k)\theta^k}
\]
with respect to the Lebesgue measure on $\RR_+$. $k>0$ and $\theta>0$ are respectively called the shape and scale parameters, and $\Gamma(k)$ is the gamma function, defined as
\[
\Gamma(k) = \int_0^\infty x^{k-1}e^{-x}\td x.
\]
The gamma distribution generalizes the exponential distribution
\[
\text{Exp}(\theta) = \theta^{-1}e^{-x/\theta} = \text{Gamma}(1,\theta)
\]
and the chi-squared distribution
\[
\chi_d^2 = \frac{x^{d/2-1}e^{-x/2}}{\Gamma(d/2)2^{d/2}} = \text{Gamma}(d/2,2).
\]
\begin{enumerate}[(a)]
\item Show that the Gamma is a 2-parameter exponential family by putting it into its canonical form. Find the natural parameter, sufficient statistic, carrier density, and log-partition function ({\bf Note}: there are multiple valid ways of doing this).
\item Find the mean and variance of $X \sim \text{Gamma}(k,\theta)$.
\item Find the moment generating function of $X\sim \text{Gamma}(k,\theta)$:
\[
M_X(u) = \EE_{k,\theta}[e^{uX}],
\]
and use it to find the distribution of $X_+ = \sum_{i=1}^n X_i$ where $X_1,\ldots,X_n$ are mutually independent with $X_i \sim \text{Gamma}(k_i, \theta)$.
You may use without proof the following uniqueness result about MGFs:
If $Y$ and $Z$ are two random variables whose MGFs coincide in a neighborhood of 0 ($\exists \delta>0$ for which $M_Y(u) =M_Z(u) < \infty$ for all $u\in[-\delta,\delta]$), then $Y$ and $Z$ have the same distribution.
\end{enumerate}
\end{description}
\end{document}