\documentclass{article}

\usepackage[margin=1.2in]{geometry}
\usepackage{graphicx}
\usepackage{amsmath,amssymb,amsthm,bm}
\usepackage{latexsym,color,minipage-marginpar,caption,multirow,verbatim}
\usepackage{enumerate,booktabs}
\usepackage{times}

\newcommand{\RR}{\mathbb{R}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\EE}{\mathbb{E}}

\newcommand{\cP}{\mathcal{P}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cX}{\mathcal{X}}

\newcommand{\ep}{\varepsilon}
\newcommand{\td}{\,\textrm{d}}

\newcommand{\simiid}{\overset{\textrm{i.i.d.}}{\sim}}
\newcommand{\simind}{\overset{\textrm{ind.}}{\sim}}
\newcommand{\toProb}{\overset{p}{\to}}
\newcommand{\toPtheta}{\overset{{P_\theta}}{\to}}
\newcommand{\Var}{\text{Var}}


\newcommand{\red}{\color{red}}
\definecolor{darkblue}{rgb}{0.2, 0.2, 0.5}
\newcommand{\sol}{~\\\color{darkblue}{\bf Solution:~\\}}


\begin{document}

\title{Stats 210A, Fall 2023\\
  Homework 11\\
  {\large {\bf Due date}: Wednesday, Nov. 15}}
\date{}
\maketitle
\vspace{-5em}

\begin{description}

\item[1. Some Maximum Likelihood Estimators]\hfill\\
  
Find the MLE for each model below, show that it is consistent, and find its asymptotic distribution. You may assume our Taylor expansions from class are valid without checking conditions.

\begin{enumerate}[(a)]


\item Binomial: $X_1,\ldots,X_n \simiid \text{Binom}(m,\theta)$. Find the MLE for $\theta$ and for the natural parameter $\eta = \log\frac{\theta}{1-\theta}$.


\item Gaussian: $X_1,\ldots,X_n \simiid N(\theta,\sigma^2)$. Find (i) the MLE for $\theta$ if $\sigma^2$ is known, (ii) the MLE for $\sigma^2$ if $\theta$ is known, and (iii) the MLE for $(\theta,\sigma^2)$ if neither is known.


\item Laplace: $X_1,\ldots,X_n$ are i.i.d.\ with density $\frac{1}{2}e^{-|x-\theta|}$ on $\RR$. Assume $n$ is odd.

  For this problem, the log-likelihood is non-differentiable at one point, but we can still use our formula for the asymptotic distribution of the MLE from class, with the Fisher information defined by $J_1(\theta) = \Var_\theta[\dot{\ell}_1(\theta;X_i)]$. You may use this fact without proof.
  
\item {\bf Optional} (not graded, no extra points): For the Laplace model, plot a few realizations of the log-likelihood for $n = 5000$ with $\theta_0=0$, and plot over it the quadratic approximation given by
  \[
  \ell_n(\theta) - \ell_n(\theta_0) \approx \dot{\ell}_n(\theta_0) (\theta - \theta_0) - \frac{1}{2} nJ_1(\theta_0) (\theta - \theta_0)^2.
  \]
  Is the quadratic approximation pretty good in the neighborhood $\theta_0 \pm 3\sigma$, where $\sigma^2$ is the approximate variance of $\hat\theta_n$? Intuitively, what do you think might account for this when the second derivative doesn't exist?


\end{enumerate}


\item[2. Estimating the inverse of a mean]\hfill\\
Suppose that $X_1,\ldots,X_n\simiid N(\theta,1)$, and that we are interested in
estimating the quantity $1/\theta$.  In order to do so, we use the
estimator $\delta(X)=1/\overline{X}_n$ where $\overline{X}_n = \frac{1}{n}
\sum_{i=1}^n X_i$ is the sample mean. Assume $\theta \neq 0$.
\begin{enumerate}[(a)]
\item Show that $\delta$ is asymptotically normal, and find its asymptotic distribution.


\item Show that the expectation $\EE|1/\overline{X}_n| = \infty$ for every $n$.  Why does this not contradict the result of part (a)?


\item Simulate to find the distribution of $1/\overline{X}_n$ for $n = 10, 100, 10^4$ and $\theta = 0.1, 1, 10$. For each setting of the parameters, plot a histogram of the estimator and overlay its Gaussian approximation. When the Gaussian approximation is not good, what is going wrong? Is the sample size a reliable indicator of whether we should trust an asymptotic approximation?

{\bf Hint:} If you are using R, the functions \texttt{hist} (with argument \texttt{freq = FALSE} to get a density histogram), \texttt{curve}, and \texttt{dnorm} will come in handy. Also, I recommend manually setting the \texttt{breaks} and \texttt{xlim} arguments in \texttt{hist} to stop enormous values from making your histogram uninformative: $\mu \pm 4\sigma$ is a reasonable range of values to plot, where $\mu$ and $\sigma^2$ are the mean and variance of the Gaussian approximation.


\end{enumerate}


\item[3. Limiting distribution of $U$-statistics]\hfill\\
Suppose $X_{1}, \ldots, X_{n} \simiid P$ in some sample space $\cX$. $U_{n} = U_{n}(X_{1}, \ldots, X_{n})$ is called a rank-2 $U$-statistic if
\[U_{n} = \frac{1}{n(n - 1)}\sum_{i=1}^{n}\sum_{j\neq i}h(X_{i}, X_{j})\]
where $h$ is a symmetric function, i.e. $h(x_{1}, x_{2}) = h(x_{2}, x_{1})$ for any $x_{1}, x_{2}\in\cX$.

In this problem, we denote $\theta = \EE h(X_{1}, X_{2})$ and assume that $\EE h(X_{1}, X_{2})^{2} < \infty$. Note that $U_n$ is the nonparametric UMVU estimator of $\theta$.

Perhaps surprisingly, we can derive the asymptotic distribution of $U_n$ in a relatively small number of steps using a technique called {\em H\'{a}jek projection} where we approximate it by an additive function of the independent $X_i$ variables. We walk through the proof below.

\begin{enumerate}[(a)]
\item Define $g(x) = \EE h(x, X_2) - \theta = \int h(x,u)\td P(u) - \theta$. Show that, for all $i$,
\[
\EE g(X_i) = 0, \quad \text{ and } \;\;\Var(g(X_i)) < \infty.
\]
({\bf Note:} $g$ is a specific function from $\cX$ to $\RR$. It is not a rule for naively substituting symbols into expressions. In particular, note that $g(X_i)$, a random variable, is not the same as the deterministic expression $\EE h(X_i, X_2)-\theta$.)


\item Define $\widehat{U}_{n} = \theta + \frac{2}{n}\sum_{i=1}^{n}g(X_{i})$. Show that $\EE[(U_n-\widehat{U}_n)f(X_i)]=0$ for any $i$ and any measurable function $f(X_i)$ with $\EE[f(X_i)^2] < \infty$.

({\bf Hint:} Condition on $X_i$)


\item Show that ${\sqrt{n}(U_{n} - \widehat{U}_{n})\toProb 0}$ as $n\to\infty$. ({\bf Hint:} Show that $U_n$ and $\widehat{U}_n$ have the same asymptotic variance, and then apply part (b).)


\item Conclude that $\sqrt{n}(U_{n} - \theta)\Rightarrow N(0, 4\zeta_{1})$, where $\zeta_{1} = \Var(g(X_{1}))$.


\item Assume that $\cX = \RR$ with $\EE X_i^4 <\infty$. Express the sample variance $S_{n}^{2} = \frac{1}{n-1}\sum_{i=1}^{n}(X_{i} - \overline{X}_n)^{2}$ as a rank-2 $U$-statistic and use the above results to derive its asymptotic distribution.

\end{enumerate}

({\bf Note:} A similar result holds in general for rank-$r$ $U$-statistics if we set $\widehat{U}_n= \theta + \frac{r}{n}\sum_i g(X_i)$ where ${g(x) = \EE[h(x,X_2,\ldots,X_r)]-\theta}$.)

{\bf Moral:} If $P^n$ is the distribution of $(X_1,\ldots,X_n)$ then it is easy to check that the set of all square-integrable random variables of the form $f(X_1,\ldots,X_n)$ (where $f:\; \cX^n \to \RR$ is measurable) forms a vector space over $\RR$, which we call $L^2(P^n)$, where we can define an inner product as 
\[
\langle f(X), g(X) \rangle_{L^2} = \EE[ f(X)g(X)] \leq \sqrt{\EE[f(X)^2] \EE[g(X)^2]} < \infty.
\]

Moreover, the subset of those random variables that can be written as $\sum_i f_i(X_i)$, where each $f_i$ is measurable, forms a subspace. Part (b) establishes that the simpler random variable $\widehat{U}_n$ is the {\em projection} of $U_n$ onto this subspace, and part (c) establishes that $U_n$ is asymptotically very close to its projection.


\item[4. Probabilistic big-O notation]\hfill\\

Let $X_1,X_2,\ldots$ denote a sequence of random vectors (with $\|X_n\| <\infty$ almost surely for each $n$). We say the sequence is {\em bounded in probability} (or sometimes {\em tight}) if for every $\ep>0$ there exists a constant $M_\ep > 0$ for which
\[
\PP(\|X_n\| > M_\ep) < \ep, \quad \forall n.
\]

Informally, there is ``no mass escaping to infinity'' as $n$ grows. Like regular big-O notation, the symbols $O_p$ and $o_p$ defined below can help to make rigorous asymptotic proofs look clean and intuitive.

For a fixed sequence $a_n$, we say $X_n = o_p(a_n)$ if $X_n/a_n \toProb 0$ as $n\to \infty$, and $X_n = O_p(a_n)$ if the sequence $(X_n/a_n)_{n\geq 1}$ is bounded in probability.

Prove the following facts for $X_n, Y_n \in \RR^d$:

\begin{enumerate}[(a)]
%%%
\item If $X_n \Rightarrow X$ for some random vector $X$, then $X_n = O_p(1)$.


\item If $X_n = o_p(a_n)$ then $X_n = O_p(a_n)$.


\item If $a_n/b_n \to 0$ and $X_n = O_p(a_n)$, then $X_n = o_p(b_n)$.


\item If $X_n = O_p(a_n)$ and $Y_n = O_p(b_n)$ then $X_n + Y_n = O_p(\max\{a_n,b_n\})$.


\item If $X_n = O_p(a_n)$ and $Y_n = o_p(b_n)$, then $X_n'Y_n = o_p(a_n b_n)$. If $X_n = O_p(a_n)$ and $Y_n = O_p(b_n)$, then $X_n'Y_n = O_p(a_n b_n)$.
  

\item If $X_n = O_p(1)$ and $g:\; \RR^d \to \RR^k$ is continuous then $g(X_n) = O_p(1)$. 


\item For $d=1$, if $X_n = O_p(a_n)$ with $a_n \to 0$ and $g:\; \RR \to \RR$ is continuously differentiable with $g(0) = \dot{g}(0) = 0$, then $g(X_n) = o_p(a_n)$. Show further that if $g$ is twice continuously differentiable then $g(X_n) = O_p(a_n^2)$. ({\bf Hint:} Use the mean value theorem and apply a previous part of this problem.)


\item For $d=1$, if $\Var(X_n) = a_n^2 < \infty$ (with $a_n \geq 0$) and $\EE X_n = b_n$, then $X_n = O_p(a_n + |b_n|)$. ({\bf Hint:} Use Chebyshev's inequality.)


\item {\bf Optional} (not graded, no extra points): If $\text{Var}(X_n) = a_n^2 < \infty$, is it impossible to have $X_n = o_p(a_n)$? Prove or give a counterexample.

\end{enumerate}

\end{description}

\end{document}