\documentclass[a4paper]{article}
\usepackage[margin=1.2in]{geometry}
\usepackage{graphicx}
\usepackage{amsmath,amssymb,amsthm,bm}
\usepackage{latexsym,color,minipage-marginpar,caption,multirow,verbatim}
\usepackage{enumerate}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\EE}{\mathbb{E}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\ep}{\varepsilon}
\newcommand{\td}{\,\textrm{d}}
\newcommand{\simiid}{\overset{\textrm{i.i.d.}}{\sim}}
\newcommand{\simind}{\overset{\textrm{ind.}}{\sim}}
\begin{document}
\title{Stats 210A, Fall 2023\\
Homework 6\\
{\large \textbf{Due date}: Wednesday, Oct.~11}}
\date{}
\maketitle
\vspace{-5em}
You may disregard measure-theoretic niceties about conditioning on measure-zero sets, almost-sure equality vs. actual equality, ``all functions'' vs. ``all measurable functions,'' etc. (unless the problem is explicitly asking about such issues).
If you need to write code to answer a question, show your code. If you need to include a plot, make sure the plot is readable, with appropriate axis labels and a legend if necessary. Points will be deducted for very hard-to-read code or plots.
\begin{description}
\item[1. Effective degrees of freedom]\hfill\\
We can write a standard Gaussian sequence model in the form
\[
Y_i = \mu_i + \ep_i, \quad \ep_i \simiid N(0,\sigma^2), \quad i = 1,\ldots,n
\]
with $\mu\in\RR^n$ and $\sigma^2 > 0$ possibly unknown. If we estimate $\mu$ by some estimator $\hat\mu(Y)$, we can compute the residual sum of squares (RSS):
\[
\text{RSS}(\hat\mu,Y) = \|\hat\mu(Y)-Y\|^2 = \sum_{i=1}^n (\hat\mu_i(Y) - Y_i)^2.
\]
If we were to observe the same signal with independent noise $Y^* = \mu + \ep^*$, the expected prediction error (EPE) is defined as
\[
\text{EPE}(\mu,\hat\mu) = \EE_\mu\left[\| \hat\mu(Y) - Y^*\|^2\right] = \EE_\mu\left[\|\hat\mu(Y)-\mu\|^2\right] + n\sigma^2.
\]
Because $\hat\mu$ is typically chosen to make RSS small for the observed data $Y$ (i.e., to fit $Y$ well), the RSS is usually an optimistic estimator of the EPE, especially if $\hat\mu$ tends to overfit. To quantify how much $\hat\mu$ overfits, we can define the {\em effective degrees of freedom} (or simply the {\em degrees of freedom}) of $\hat\mu$ as
\[
\text{DF}(\mu,\hat\mu) = \frac{1}{2\sigma^2}\left(\text{EPE}(\mu,\hat\mu) - \EE_\mu\left[\text{RSS}(\hat\mu,Y)\right]\right),
\]
which uses optimism as a proxy for overfitting.
For the following questions assume we also have a predictor matrix $X\in \RR^{n\times d}$, which is simply a matrix of fixed real numbers. Suppose that $d\leq n$ and $X$ has full column rank.
\begin{enumerate}[(a)]
\item Show that if $\hat\mu$ is differentiable with $\EE_\mu\|D\hat\mu(Y)\|_F < \infty$ then
\[
\sum_{i=1}^n \frac{\partial \hat\mu_i(Y)}{\partial Y_i}
\]
is an unbiased estimator of the DF. (Recall $D\hat\mu(Y)$ is the Jacobian matrix from class).
\item Suppose $\hat\mu = X\hat\beta$, where $\hat\beta$ is the ordinary least squares estimator (i.e., chosen to minimize the RSS). Show that the DF is $d$. (This confirms that DF generalizes the intuitive notion of degrees of freedom as ``the number of free variables'').
\item Suppose $\hat\mu = X\hat\beta$, where $\hat\beta$ minimizes the penalized least squares criterion:
\[
\hat\beta = \arg\min_\beta \|Y - X\beta\|_2^2 + \rho \|\beta\|_2^2,
\]
for some $\rho \geq 0$. Show that the DF is $\sum_{j=1}^d \frac{\lambda_j}{\rho+\lambda_j}$, where $\lambda_1 \geq \cdots \geq \lambda_d > 0$ are the eigenvalues of $X'X$ (counted with multiplicity) ({\bf Hint:} use the singular value decomposition of $X$).
\end{enumerate}
\item[2. Soft thresholding]\hfill\\
Consider the {\em soft thresholding operator} with parameter $\lambda \geq 0$, defined as
\[
\eta_\lambda(x) =
\begin{cases}
x - \lambda & x > \lambda\\
0 & |x| \leq \lambda\\
x + \lambda & x < -\lambda
\end{cases}
\]
Note that, although we didn't prove it in class, Stein's lemma applies for continuous functions $h(x)$ which are differentiable except on a measure zero set; you can apply it here without worrying.
Assume $X \sim N_d(\theta, I_d)$ for $\theta \in \RR^d$, which we will estimate via $\delta_\lambda(X) = (\eta_\lambda(X_1), \ldots, \eta_\lambda(X_{d}))$. Soft thresholding is sometimes used when we expect {\em sparsity}: a small number of relatively large $\theta_i$ values. $\lambda$ here is called a {\em tuning parameter} since it determines what version of the estimator we use, but doesn't have an obvious statistical interpretation.
\begin{enumerate}[(a)]
\item Show that $\left|\{i:\; |X_i| > \lambda\}\right|$ is an unbiased estimator of the degrees of freedom of $\delta_\lambda$ (so, in a sense, the DF is the expected number of ``free variables'').
\item Show that
\[
d + \sum_i \min(X_i^2, \lambda^2) - 2 \left|\{i:\; |X_i| \leq \lambda\}\right|
\]
is an unbiased estimator for the MSE of $\delta_\lambda$.
\item Show that the risk-minimizing value $\lambda^*$ solves
\[
\lambda \sum_i \PP_{\theta_i}(|X_i| > \lambda) = \sum_i \left[\phi(\lambda - \theta_i) + \phi(\lambda + \theta_i)\right],
\]
where $\phi(z) = \frac{e^{-z^2/2}}{\sqrt{2\pi}}$ is the standard normal density.
\item Consider a problem with $\theta_1 = \cdots = \theta_{20} = 10$ and $\theta_{21} = \cdots = \theta_{500} = 0$. Compute $\lambda^*$ numerically. Then simulate a vector $X$ from the model and use it to automatically tune the value of $\lambda$ by minimizing SURE. Call the automatically tuned value $\hat\lambda(X)$ and report both $\lambda^*$ and $\hat\lambda(X)$. Finally plot the true MSE of $\delta_\lambda$ along with its SURE estimate against $\lambda$ for a reasonable range of $\lambda$ values. Add a horizontal line for the risk of the UMVU estimator.
\item Compute and report the squared error loss $\|\delta(X) - \theta\|^2$ for the following four estimators:
\begin{enumerate}[(i)]
\item the UMVU estimator $\delta_0(X) = X$,
\item the optimally tuned soft-thresholding estimator $\delta_{\lambda^*}(X)$,
\item the automatically tuned soft-thresholding estimator $\delta_{\hat{\lambda}(X)}(X)$, and
\item the James-Stein estimator.
\end{enumerate}
You do not need to compute the MSE. Intuitively, what do you think accounts for the good performance of soft-thresholding in this example?
\end{enumerate}
\item[3. Mean estimation]\hfill\\
\begin{enumerate}[(a)]
\item Suppose $X_1, \ldots, X_n \simiid N_d(\theta, I_d)$ and consider estimating $\theta\in\RR^d$. Show that $\overline X = \frac{1}{n}\sum_i X_i$ is the minimax estimator of $\theta$ under squared error loss.
{\bf Hint:} Find a least favorable sequence of priors.
\item Suppose $X_1, \ldots, X_n \simiid P$ where $P$ is any distribution over the real numbers such that $\text{Var}_P(X) \leq 1$. Show that $\overline X = \frac{1}{n}\sum_i X_i$ is minimax for estimating $\theta(P) = \EE_P X$ under the squared error loss.
{\bf Hint:} Try to relate this problem to the Gaussian problem with $d=1$.
\item Assume $X \sim N(\theta, 1)$ with the constraint that $|\theta| \leq 1$. Show that the minimax estimator for squared error loss is
\[
\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}.
\]
Plot its risk function.
{\bf Hint:} Plot the risk function first. For this problem if you need to show that a function is maximized or minimized somewhere, you may do it numerically or by inspecting a graph if it is obvious enough.
\end{enumerate}
\item[4. James-Stein estimator with regression-based shrinkage]\hfill\\
Consider estimating $\theta \in \RR^d$ in the model $Y \sim N_d(\theta, I_d)$. In the standard James-Stein estimator, we shrink all the estimates toward zero, but it might make more sense to shrink them towards the average value $\overline{Y}$, or towards some other value based on observed side information.
\begin{enumerate}[(a)]
\item Consider the estimator
\[
\delta_i^{(1)}(Y) = \overline{Y} + \left(1 - \frac{d-3}{\|Y- \overline{Y}1_d\|^2}\right) \left(Y_i - \overline{Y}\right)
\]
Show that $\delta^{(1)}(Y)$ strictly dominates the estimator $\delta^{(0)}(Y)=Y$ for $d\geq 4$; that is,
\[
\text{MSE}(\theta; \delta^{(1)}) < \text{MSE}(\theta; \delta^{(0)}), \quad \text{for all } \theta \in \RR^d.
\]
Calculate the MSE of $\delta^{(1)}$ if $\theta_1=\theta_2=\cdots=\theta_d$. How would it compare to the MSE for the usual James-Stein estimator?
{\bf Hint:} Change the basis using an appropriate orthogonal rotation and think about how the estimator operates on different subspaces.
{\bf Hint:} Recall that if $Z \sim N_d(\mu, \Sigma)$ and $A\in \RR^{k\times d}$ is a fixed matrix then ${AZ \sim N_k(A\mu, A\Sigma A')}$.
\item Now suppose instead that we have side information about each $\theta_i$, represented by fixed covariate vectors $x_1,\ldots,x_d \in \RR^k$. Assume the design matrix $X \in \RR^{d\times k}$ whose $i$th row is $x_i'$ has full column rank. Suppose that we expect $\theta \approx X\beta$ for some $\beta\in \RR^k$, but unlike the usual linear regression setup, we will not assume $\theta = X\beta$ with perfect equality.
Find an estimator $\delta^{(2)}$, analogous to the one in part (a), that dominates $\delta^{(0)}$ whenever $d - k \geq 3$:
\[
\text{MSE}(\theta; \delta^{(2)}) < \text{MSE}(\theta; \delta^{(0)}), \quad \text{for all } \theta \in \RR^d,
\]
and for which $\text{MSE}(X\beta; \delta^{(2)}) = k + 2$, for any $\beta\in \RR^k$.
{\bf Hint:} Think of this setting as a generalization of part (a), which can be considered a special case with $k=1$ and all $x_i = 1$. What is the right orthogonal rotation?
{\bf Note:} Don't assume there is an additional intercept term for the regression; this could always be incorporated into the $X$ matrix by taking $x_{i,1} = 1$ for all $i=1,\ldots,d$.
\end{enumerate}
\item[5. Upper-bounding $\theta$]\hfill\\
\begin{enumerate}[(a)]
\item Let $X \sim N(\theta, 1)$ for $\theta\in\RR$, and consider the loss function
\[
L(\theta, d) = 1\{d < \theta\};
\]
that is, we observe $X$ and try to come up with an upper bound $\delta(x) \in \RR$ for $\theta$. Show that the minimax risk is 0 (note you may not be able to find a minimax estimator).
\item Now, consider a problem with the same loss function but without observing any data. Show the minimax risk (considering both randomized and non-randomized estimators) is 1, but the Bayes risk $r_\Lambda = 0$ for any prior $\Lambda$ (note there may be no estimator $\delta_\Lambda$ that attains the minimum Bayes risk).
({\bf Note:} This problem exhibits a ``duality gap'' where the lower bounds we can get by trying different priors will always fall short of the minimax risk.)
\item {\bf Optional} (not graded, no extra points): Now consider the same loss function, but now $X \sim N(\theta, \sigma^2)$ and $\sigma^2$ is unknown too. Find the minimax risk.
{\bf Hint:} consider estimators of the form $\delta(X) = c|X|$.
\end{enumerate}
\end{description}
\end{document}