\documentclass{article}
\usepackage[margin=1.2in]{geometry}
\usepackage{graphicx}
\usepackage{amsmath,amssymb,amsthm,bm}
\usepackage{latexsym,color,minipage-marginpar,caption,multirow,verbatim}
\usepackage{enumerate}
\usepackage{times}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\EE}{\mathbb{E}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\ep}{\varepsilon}
\newcommand{\widebar}{\overline}
\newcommand{\simiid}{\overset{\textrm{i.i.d.}}{\sim}}
\newcommand{\simind}{\overset{\textrm{ind.}}{\sim}}
\newcommand{\td}{\,\textrm{d}}
\newcommand{\red}{\color{red}}
\definecolor{darkblue}{rgb}{0.2, 0.2, 0.5}
\newcommand{\sol}{~\\\color{darkblue}{\bf Solution:~\\}}
\begin{document}
\title{Stats 210A, Fall 2023\\
Homework 4\\
{\large {\bf Due date}: Wednesday, Sep. 27}}
\date{}
\maketitle
\vspace{-5em}
You may disregard measure-theoretic niceties about conditioning on measure-zero sets, almost-sure equality vs. actual equality, ``all functions'' vs. ``all measurable functions,'' etc. (unless the problem is explicitly asking about such issues).
\begin{description}
\item[1. Bayesian law of large numbers]\hfill\\
\begin{enumerate}[(a)]
\item Let $p(x)$ and $q(x)$ denote two strictly positive probability densities with respect to a common dominating measure $\mu$. The {\em Kullback--Leibler divergence} between $p$ and $q$ is defined as
\[
D(p \| q) = \int_{\cX} p(x) \log \frac{p(x)}{q(x)} \td\mu(x).
\]
Show that $D(p \| q) \geq 0$, with equality only in the case that $p(X) = q(X)$ almost surely.
{\bf Hint:} recall that $\log(1+x) \leq x$ for all $x>-1$.
\item Consider a dominated likelihood model $\cP = \{p_{\theta}(x):\; \theta\in \Theta\}$, where the parameter space $\Theta$ is a finite set, and the densities are strictly positive on $\cX$. Let $\lambda$ denote a prior density w.r.t. the counting measure on $\Theta$, and consider the Bayes posterior after observing a sample $X_1,\ldots,X_n \simiid p_{\theta_0}(x)$ for some {\em fixed} value $\theta_0$ (that is, we are doing a {\em frequentist} analysis of the {\em Bayesian} posterior distribution). Assume that all the densities are distinct; that is, $p_{\theta_1}(X) = p_{\theta_2}(X)$ almost surely if and only if $\theta_1=\theta_2$.
If the prior $\lambda$ puts positive mass on all values in $\Theta$, show that as $n\to\infty$, the posterior density eventually concentrates nearly all its mass on the true value $\theta_0$. That is,
\[
\PP_{\theta_0}\left[\lambda(\theta_0 \mid X_1,\ldots,X_n) \geq 1-\ep\right] \to 1, \quad \text{for all } \ep > 0.
\]
({\bf Hint:} use the law of large numbers).
\end{enumerate}
{\bf Moral:} At least for a finite parameter space, the Bayes estimator always converges to the right answer as long as we put positive mass on the right answer. This result can be generalized with more effort to continuous parameter spaces under some regularity conditions on the likelihood function, similar to the types of conditions we will use to guarantee the MLE is consistent.
The requirement that the prior density should be nonzero everywhere is sometimes called Cromwell's Rule, after Oliver Cromwell's famous plea to the Church of Scotland: ``I beseech you, in the bowels of Christ, think it possible that you may be mistaken.''
\item[2. Fisher information for location and scale families]\hfill\\
Consider a scale family
\[p_\theta(x) = \frac{1}{\theta}p_0\left(\frac{x}{\theta}\right),\quad \theta > 0,\]
where $p_0$ is some fixed probability density function with respect to the Lebesgue measure.
\begin{enumerate}[(a)]
\item Show that the Fisher information of a single observation $X$ is given by
\[J(\theta) = \frac{1}{\theta^{2}}\int_{-\infty}^\infty\left[\frac{u p_0'(u)}{p_0(u)} + 1\right]^{2}p_0(u)\td u.\]
Try to explain in your own words why it makes sense that the Fisher information should be proportional to $\theta^{-2}$ (the verbal explanation will be graded leniently).
\item If we instead parameterize the model using $\zeta = \log\theta$, show that the Fisher information $J(\zeta)$ of a single observation $X$ does not depend on $\zeta$. Explain in your own words why this makes sense.
\end{enumerate}
\item[3. Ridge regression]\hfill\\
Consider the {\em Gaussian linear model} where
\[
y_i = x_i' \beta + \ep_i, \quad \text{ with } \ep_i \simiid N(0,\sigma^2) \;\text{ for } i = 1, \ldots, n,
\]
where $\beta \in \RR^d$ is unknown, and the covariate vectors $x_i \in \RR^d$ are fixed and known. Assume the error variance $\sigma^2>0$ is also known. We observe the response vector $y \in \RR^n$.
\begin{enumerate}[(a)]
\item Assume that $d \leq n$, and the design matrix $\mathbf{X}$ (the $n \times d$ matrix whose $i$th row is $x_i'$) has full column rank. Show that the OLS estimator $\hat\beta = (\mathbf{X}'\mathbf{X})^{-1}\mathbf{X}'y$ is the UMVU estimator of $\beta$.
{\bf Note:} Remember that the design matrix $\mathbf{X}$ is not data in the same sense $y$ is; it is more like a known parameter.
\item Now consider Bayesian estimation with the prior $\beta \sim N(\mu, \tau^2 I_d)$. Under this prior, find the posterior distribution of $\beta$. Does it matter whether $d > n$, or whether $\mathbf{X}$ has full column rank?
\item Suppose that $\mathbf{X}\gamma = 0$ for some nonzero $\gamma \in \RR^d$. Show that no unbiased estimator exists for $g(\beta) = \beta'\gamma$. What is the posterior distribution for $g(\beta)$?
\end{enumerate}
\item[4. Other loss functions]\hfill\\
Assume for each problem below that there exists an estimator with finite Bayes risk.
\begin{enumerate}[(a)]
\item Consider a Bayesian model with a discrete parameter $\theta$. What is the Bayes estimator for the loss $L(\theta, d) = 1\{\theta \neq d\}$?
\item Next consider a Bayesian model with a single real parameter $\theta$, and assume that the posterior distribution of $\theta$ given $X=x$ is absolutely continuous (with respect to the Lebesgue measure) for all $x$. What is the Bayes estimator for the {\em absolute error loss} $L(\theta, d) = |\theta-d|$?
\item Under the same assumptions as part (b), what loss function $L_\gamma(\theta, d)$ would give the posterior $\gamma$ quantile as its Bayes estimator; that is, the estimator $\delta_\gamma(X)$ has $\PP(\theta < \delta_\gamma(X) \mid X) = \gamma$?
\end{enumerate}
\item[5. Exponential-exponential model]\hfill\\
Consider a Bayesian model with prior distribution $\lambda(\theta) = e^{-\theta}1\{\theta>0\}$ for $\theta$ (the standard exponential distribution) and whose likelihood is the exponential location family:
\[
p_{\theta}(x) = e^{\theta-x} 1\{x > \theta\},
\]
where we observe a sample $X_1,\ldots,X_n \simiid p_\theta(x)$ given $\theta$.
\begin{enumerate}[(a)]
\item Calculate the posterior distribution for $\theta$ for $n>1$.
\item For $n=1$, calculate the posterior distribution and the Bayes estimator under squared error loss.
\item Still for $n=1$, calculate the MSE for the Bayes estimator and the UMVU estimator as a function of $\theta$. Plot the risk function for $\theta \in [0,5]$. For what values of $\theta$ does the Bayes estimator perform better?
\item Still for $n=1$, calculate the Bayes risk for the Bayes estimator, and for the UMVU estimator $X_{1}-1$, using squared error loss.
\end{enumerate}
{\bf Moral:} The Bayes estimator tends to have better risk in places where the prior is large, sometimes at the cost of performing very poorly where the prior puts very little mass.
\end{description}
\end{document}