\documentclass{article} \input{macros}

\usepackage{floatflt}

\begin{document}

\lecture{10}{Conditional Expectation}{Charless C.
Fowlkes}{fowlkes@cs.berkeley.edu}

\section{Definition of Conditional Expectation}
Recall the ``undergraduate'' definition of conditional
probability associated with Bayes' Rule \[ \P(A | B) \equiv
\frac{\P(A,B)}{\P(B)} \] For a discrete random variable $X$
we have \[ \P(A) = \sum_x \P(A, X=x) = \sum_x
\P(A|X=x)\P(X=x) \] and the resulting formula for
conditional expectation \begin{eqnarray*} \E(Y|X=x) &=&
\int_\Omega Y(\omega) \P(d\omega | X=x) \\ &=& \frac{\int_{X=x}
Y(\omega) \P(d\omega)}{\P(X=x)} \\ &=& \frac{\E(Y
\1_{(X=x)})}{\P(X=x)} \end{eqnarray*} We would like to
extend this to handle more general situations where
densities don't exist or we want to condition on very
``complicated'' sets.

%In constructing a more general definition, we will take as
%guidance the following formula for discrete variables \[
%\E(Y g(X)) = \sum_x \E(Y|X=x) g(x) P(X=x) \] which makes
%good sense for bounded functions $g$.

\begin{definition} \label{def:condexp}  Given a random 
variable $Y$ with $\E|Y| < \infty$ on the space
$(\Omega,\F,\P)$ and some sub-$\sigma$-field $\G \subset \F$
we will define the {\bf conditional expectation} as the
almost surely unique random variable $\E(Y | \G)$ which
satisfies the following two conditions 
\begin{itemize}
\item[1.]{$\E(Y|\G)$ is $\G$-measurable}
\item[2.]{$\E(YZ) = \E(\E(Y|\G) Z )$ for all $Z$ which are
bounded and $\G$-measurable} 
\end{itemize}
\end{definition}

For $\G = \sigma(X)$ when $X$ is a discrete variable, the
space $\Omega$ is simply partitioned into disjoint sets
$\Omega = \sqcup G_n$.  Our definition for the discrete case
gives \begin{eqnarray*} \E(Y | \sigma(X)) &=& \E(Y|X)\\ &=&
\sum_n \frac{\E(Y \1_{X=x_n})}{\P(X=x_n)} \1_{X=x_n}\\ &=&
\sum_n \frac{\E(Y \1_{G_n})}{\P(G_n)} \1_{G_n}\\
\end{eqnarray*} which is clearly $\G$-measurable.

\begin{exercise} Show that the discrete formula satisfies
condition 2 of Definition~\ref{def:condexp}.  ({\bf Hint:}
show that the condition is satisfied for random variables of
the form $Z = \1_G$ with $G \in {\cal C}$, where ${\cal C}$ is a
collection closed under intersection and $\G = \sigma({\cal C})$, then
invoke Dynkin's $\pi$-$\lambda$ theorem.) \end{exercise}


\section{Conditional Expectation is Well Defined}

\begin{proposition} 
$\E(Y|\G)$ is unique up to almost sure equivalence.
\end{proposition} 

\begin{proofsketch} 
Suppose that both random variables $\hat Y$ and $\hat {\hat Y}$
satisfy our conditions for being the conditional expectation
$\E(Y|\G)$.  Let $W = {\hat Y} - {\hat {\hat Y}}$.  Then $W$
is $\G$-measurable and $\E(WZ) = 0$ for all $Z$ which are
$\G$-measurable and bounded.  If we let $Z =
\1_{W>\epsilon}$ (which is bounded and $\G$-measurable) then \[
\epsilon \P(W>\epsilon) \leq \E(W \1_{W>\epsilon}) = 0 \] for
all $\epsilon > 0$.  A similar argument applied to $\P(W <
-\epsilon)$ allows us to conclude that $\P(|W|>\epsilon) = 0$
holds for all $\epsilon > 0$ and hence $W = 0$ almost surely,
making $\E(Y|\G)$ almost surely unique.
\end{proofsketch}

\begin{proposition} 
$\E(Y|\G)$ exists.
\end{proposition}

We've shown that $\E(Y|\G)$ exists in the discrete case by
writing out an explicit formula so that ``$\E(Y|X)$
integrates like $Y$ over $\G$-measurable sets.'' We give
three different approaches for attacking the general case.


\subsection{``Hands On'' Proof}
The first is a hands on approach by extending the discrete
case via limits.  We will make use of

\begin{lemma} {\bf Williams' Tower Property} Suppose $\G
\subset \H \subset \F$ are nested $\sigma$-fields and $\E(
\cdot | \G)$ and $\E(\cdot | \H)$ are both well defined then
$\E(\E(Y|\H)|\G) = \E(Y|\G) = \E(\E(Y|\G)|\H)$. \end{lemma}

A special case is when $\G = \{\emptyset,\Omega\}$ then
$\E(Y|\G) = \E Y$ is a constant so it's easy to see
$\E(\E(Y|\H)|\G) = \E(\E(Y|\H)) = \E(Y)$ and
$\E(\E(Y|\G)|\H) = \E(\E(Y)|\H) = \E(Y)$.

\begin{proofsketch} {\bf Existence via Limits}
 For a disjoint partition $\sqcup G_i =
\Omega$ and $\G = \sigma(\{G_i\})$ define \[ \E(Y|\G) =
\sum_i \frac{\E(Y\1_{G_i})}{\P(G_i)}\1_{G_i} \] where we deal
appropriately with the niggling possibility of $\P(G_i) = 0$
by either throwing out the offending sets or defining
$\frac{0}{0} = 0$. 

We now consider an arbitrary but countably generated
$\sigma$-field $\G$.  This situation is not too restrictive,
for example the $\sigma$-field associated with an
$\R$-valued random variable $X$ is generated by the
countable collection $\{ B_i = (X \leq r_i) : r_i \in \Q\}$.
If we set $\G_n = \sigma(B_1,B_2,\ldots,B_n)$ then $\G_n$ is
increasing to the limit $\G_1 \subset \G_2 \subset \ldots
\subset \G = \sigma(\cup \G_n)$.  For a given $n$ the random
variable $Y_n = \E(Y|\G_n)$ exists by our explicit
definition above since we can decompose the generating set
into a disjoint partition of the space.

Now we show that $Y_n$ converges in some appropriate manner
to a $Y_\infty$  which will then function as a version of
$E(Y|\G)$.  We will assume that $\E|Y|^2 < \infty$

Write $Y_n = \E(Y|\G_n) = Y_1 + (Y_2 - Y_1) + (Y_3 - Y_2) +
\ldots + (Y_n - Y_{n-1})$.  The terms in this summation are
orthogonal in $\L^2$ so we can compute the second moment as \[
s_n^2 = \E(Y_n^2) = \E (Y_1^2) + \E((Y_2-Y_1)^2) + \ldots + \E
((Y_n-Y_{n-1})^2) \] where the cross terms are zero.  Let
$s^2 = \E(Y^2) = \E((Y_n + (Y-Y_n))^2) = \E(Y_n^2) +
\E((Y-Y_n)^2) < \infty$.  Then $s_n^2
\uparrow s_\infty^2 \leq s^2 < \infty$.  For $n>m$ we know
again by orthogonality that $\E((Y_n - Y_m)^2) = s_n^2 -
s_m^2 \to 0$ as $m \to \infty$ since $s_n^2$ is just a
bounded increasing real sequence.  This means that the sequence $Y_n$
is Cauchy in $\L^2$ and invoking the completeness of $\L^2$ we
conclude that $Y_n \to Y_\infty$ in $\L^2$.

All that remains is to check that $Y_\infty$ is a
conditional expectation.  It satisfies requirement (1) since
as a limit of $\G$-measurable variables it is $\G$-measurable.  
To check (2) we need to show that $\E(Y Z) = \E(Y_\infty Z)$ for
all $Z$ which are bounded and $\G$-measurable.  As usual, it
suffices to check for a much smaller set $\{\1_{A} : A
\in {\cal A}\}$ where ${\cal A}$ is an intersection closed
collection and $\sigma({\cal A}) = \G$.  Take this
collection to be ${\cal A} = \cup_m \G_m$.  For $A \in \G_m$, \[ \E(Y \1_A) =
\E(Y_m \1_A) = \E(Y_n \1_A) \] holds by the tower property for
any $n > m$.  Noting that $\E(Y_n Z) \to \E(Y_\infty Z)$ is
true for all $Z \in \L^2$ by the continuity of inner product
this sequence must go to the desired limit which gives $\E(Y
\1_A) = \E(Y_\infty \1_A)$.
\end{proofsketch} 

\begin{exercise}
Remove the countably generated constraint on $\G$.  ({\bf
Hint:} Be a bit more clever $\ldots$ for $Y \in \L^2$ look at 
$\E(Y|\G)$ for $\G \subset \F$ with $\G$ finite.  Then as
above $\sup_\G \E(\E(Y|\G)^2) \leq \E Y^2$ so we can choose
$\G_n$ with $\E(\E(Y|\G_n)^2)$ increasing to this supremum.
The $\G_n$ may not be nested but argue that ${\cal C}_n =
\sigma(\G_1 \cup \G_2 \cup \ldots \cup \G_n)$ are and let
$\hat Y = \lim_n \E(Y|{\cal C}_n)$.)
\end{exercise}

\begin{exercise} Remove the $\L^2$ constraint on $Y$.  ({\bf
Hint:} Consider $Y \geq 0$  and show convergence of
$\E(Y \wedge n\ |\ \G)$ then turn crank on the standard machinery)
\end{exercise}

\subsection{Measure Theory Proof}

Here we pull out some power tools from measure theory.

\begin{theorem} {\bf Lebesgue-Radon-Nikodym}
\cite{rudin87}(p.121) If $\mu$ and $\lambda$ are
non-negative $\sigma$-finite measures on a $\sigma$-field $\G$
and $\mu(G) = 0 \implies \lambda(G) = 0$ for all $G \in \G$
(written $\lambda \ll \mu$, pronounced ``$\lambda$ is absolutely
continuous with respect to $\mu$'') then there exists
a non-negative $\G$-measurable function ${\hat Y}$ such that
\[ \lambda (G) = \int_G {\hat Y} d\mu \] for all $G \in
{\cal G}$.
\end{theorem}


\begin{proofsketch} {\bf Existence via Lebesgue-Radon-Nikodym}
Assume $Y \geq 0$ and define the measure \[ Q(C)
= \int_C Y dP = \E (Y \1_C), \qquad C \in \G, \] which is non-negative and
finite because $\E |Y| < \infty$, and $Q$ is absolutely
continuous with respect to $P$ restricted to $\G$.  LRN implies the
existence of ${\hat Y}$ which satisfies our requirements to
be a version of the conditional expectation ${\hat Y} = \E
(Y | \G)$.  For general $Y$ we can employ $\E(Y^+ | \G) -
\E(Y^- | \G)$.
\end{proofsketch}

\subsection{Functional Analysis Proof}

This gives a nice geometric picture for the case when
$Y \in \L^2$

\begin{lemma}\label{lemma:hilbunique} Every nonempty, closed, convex set $E$ in a
Hilbert space $H$ contains a unique element of smallest norm.
\end{lemma}

\begin{lemma} {\bf Existence of Projections in Hilbert
Space} Given a closed subspace $K$ of a Hilbert space $H$ 
and element $x \in H$, there exists a decomposition 
$x = y + z$ where $y \in K$ and $z \in K^\perp$ (the
orthogonal complement).
\end{lemma}

The idea for the existence of projections is to let $z$ be the element of
smallest norm in $x+K$ and $y = x-z$.  See \cite{rudin87}(p.79) for a full
discussion of Lemma~\ref{lemma:hilbunique}.

\begin{proofsketch} {\bf Existence via Hilbert Space
Projection} \label{proof:hilbert} Suppose $Y \in \L^2(\F)$
and $X \in \L^2(\G)$.  Requirement (2) demands that for all
$X$ \[ \E( (Y - \E(Y|\G)) X ) = 0 \] which has the geometric
interpretation of requiring $Y - \E(Y|\G)$ to be orthogonal
to the subspace $\L^2(\G)$.  Requirement (1) says that
$\E(Y|\G) \in \L^2(\G)$ so $\E(Y|\G)$ is just the orthogonal
projection of $Y$ onto the closed subspace $\L^2(\G)$.  The
lemma above shows that such a projection is well defined.
\end{proofsketch}


\section{Properties of Conditional Expectation}

It's helpful to think of $\E(\cdot|\G)$ as an operator on
random variables that transforms $\F$-measurable variables
into $\G$-measurable ones.

We isolate some useful properties of conditional expectation
which the reader will no doubt want to prove before believing 

\begin{itemize}
\item{
$\E(\cdot|\G)$ is positive: 
\[
Y \geq 0 \implies \E(Y|\G) \geq 0
\]
}

\item{ $\E(\cdot|\G)$ is linear: 
\[
\E(aX + bY | \G) = a\E(X|\G) + b\E(Y|\G)
\]
}

\item{
$\E(\cdot|\G)$ is a projection: 
\[
\E(\E(X|\G)|\G) = \E(X|\G)
\]
}

\item{ More generally, the ``tower property''. If $\H \subset \G$ 
then
\[
\E(\E(X|\G)|\H) = \E(\E(X|\H)|\G) = \E(X|\H)
\]
}

\item{
$\E(\cdot|\G)$ commutes with multiplication by
$\G$-measurable variables: 
\[
\E(XY|\G) = \E(X|\G)Y \mbox{ for } \E|XY| < \infty \mbox{ and } Y \in \G
\]
}

\item{
$\E(\cdot|\G)$ respects monotone convergence:
\[
0 \leq X_n \uparrow X \implies \E(X_n|\G) \uparrow \E(X|\G)
\]
}

\item{
If $\phi$ is convex and $\E|\phi(X)| < \infty$ then
a conditional form of Jensen's inequality holds: 
\[
\phi(\E(X|\G)) \leq \E(\phi(X)|\G)
\]
}

\item{
$\E(\cdot|\G)$ is a continuous contraction of $\L^p$
for $p\geq 1$: 
\[
\| \E(X|\G) \|_p \leq \|X\|_p
\]
and
\[
X_n \lpcv X \mbox{ implies } \E(X_n|\G) \lpcv \E(X|\G)
\]
}

\item{Repeated Conditioning.  For $\G_0 \subset \G_1 \subset
\ldots$, $\G_\infty = \sigma(\cup \G_i)$, and $X \in \L^p$ with
$p \geq 1$ then
\[
\E(X|\G_n) \ascv \E(X|\G_\infty)
\]
\[
\E(X|\G_n) \lpcv \E(X|\G_\infty)
\]
}

\end{itemize}

\section{Regular Conditional Distributions}

\begin{definition} Given random variable 
$X : (\Omega,\F) \rightarrow (S,{\cal S})$ and sub-$\sigma$-field $\G \subset \F$
we define the {\bf Markov kernel} $Q(\omega,A): \Omega \times {\cal S} \rightarrow
[0,1]$ as a (carefully chosen) version of the conditional
probability $\P(X \in A | \G)$ which has the properties
\begin{itemize}
\item[1.]{$\omega \mapsto Q(\omega,A)$ is a ($\G$-measurable) version 
of $\P(X \in A | \G)$ for fixed choice of $A$}
\item[2.]{$A \mapsto Q(\omega,A)$ is a probability measure on $(S,{\cal S})$}
\end{itemize}
When $S = \Omega$ and $X$ is the identity map we call $Q$ a {\bf regular
conditional probability}
\end{definition}

For $G \in \G$ we have that
\[
\P(X \in A, G) = \E(\P(X \in A | \G) \1_G ) = \int_G
Q(\omega,A) P(d\omega) \] and in the case when $\G =
\sigma(Y)$ the kernel takes the form \[ Q(\omega,A) = {\hat
Q}(Y(\omega),A) \] for some ${\hat Q} : \R \times \borel(\R)
\rightarrow [0,1]$ which we write as $P(X \in A | Y = y)$ and which
gives the slick formula \[ \P(X \in A, Y \in B) = \int_B P(X
\in A | Y = y) P(Y \in dy) \] reminiscent of Bayes' rule for
discrete variables.

Regular conditional probabilities do not always exist.  However,
if we are dealing with a random variable whose range is a ``nice''
space (one for which there exists a measurable 1-1 map to $\R$
whose inverse is also measurable) the following sketch shows we
are ok. (\cite{durrett95}(p.230) gives full details)


\begin{proofsketch} {\bf Existence of ``Regular'' Conditional
Probabilities}  First construct $\P(X \in A | \G)$ for Borel
sets so that it behaves as a probability with respect to $A$
almost surely.  Use intervals $\{(-\infty,q]:q \in \Q\}$.
We can then choose $\P(X \leq q | \G)$ for $q \in \Q$ to be
increasing and take on values of $0$ and $1$ at $-\infty$
and $\infty$ respectively.  Uniquely extend this increasing
function defined on $\Q$ to all of $\R$ in a right continuous 
manner by setting \[ \P(X \leq r | \G) = \lim_{q \downarrow r} 
\P(X \leq q | \G) \] for almost every $\omega$.
\end{proofsketch}

\begin{corollary}
For every joint distribution $(X,Y)$ where
$Y$'s range is a nice space, say $(X,Y) \in \R^2$ then \[
P(X \in dx,Y\in dy) = Q(x,dy)P(X \in dx) \] for some Markov
kernel $Q$.  
\end{corollary}

It is important to note that even when both $Q_Y$
and $Q_X$ exist so that \[ P(X \in dx,Y\in dy) = Q_X(y,dx)P(Y
\in dy) = Q_Y(x,dy)P(X \in dx) \] there is no general way to
go from $Q_X$ and $P(Y \in dy)$ to $Q_Y$ unless we restrict 
ourselves to the case where $X$ and $Y$ have well defined
densities.

\section{A Word About $\E(Y|X=x)$}

Suppose that $\P(X \in [a,b]) > 0$ then using the naive
definition of conditional expectations we have
\[ 
\E(Y|X \in [a,b]) = \frac{\E (Y \1_{ (X \in [a,b])}) }{\P(X \in [a,b])}
\]
and we hope that this will give meaning to $\E(Y|X=x)$ in
the context
\[
\E(Y|X \in [a,b]) = \int_a^b \frac{\E(Y|X=x)}{\P(X\in [a,b])} \P(X \in dx) 
\] 
Using our
new definition of conditional expectation we have 
\[
\frac{\E(\E(Y|X) \1_{(X \in [a,b])})}{\P(X \in [a,b])} =
\frac{\E(Y \1_{(X \in [a,b])})}{\P(X \in [a,b])} 
\]
which gives us 
\[ 
\E(Y\1_{(X \in [a,b])}) = \int_a^b \E(Y|X = x) P(X \in dx) 
\]
This is enough to define conditional
expectations since the class of intervals $[a,b]$ is rich
enough to extend the formula to each Borel set $B$ so that
\[
\E(Y \1_{(X \in B)}) = \int_B \E(Y|X = x) P(X \in dx)
\]

However, it is important not to attribute too much meaning
to the notation $\E(Y|X=x)$ since it is usually the case
that $\P(X = x) = 0$ and so different versions of the
conditional expectation may not agree.

\begin{floatingfigure}[r]{2.2in}
\begin{center}
\psfig{figure=borel.eps,height=2.1in,width=1.75in}
\end{center}
\end{floatingfigure}

This is highlighted by the following simple version of
Borel's paradox:

Let $(X,Y)$ be uniformly chosen on the half disc so that $X
= R \cos(\Theta)$ and $Y = R \sin(\Theta)$ with $0 < R \leq
1$ and $\Theta \in [0,\pi]$.  We should certainly believe
the set equivalence \[ \{X = 0\} \iff \{\Theta =
\frac{\pi}{2}\} \]

Now $P(Y > \frac{1}{2} | X = 0) = \frac{1}{2}$ has real
meaning as there is a version of $\P(Y > \frac{1}{2} | X =
x)$ which is continuous in $x$ and its value at $0$ is
$\frac{1}{2}$.  On the other hand, there is a unique version
of $P(Y > \frac{1}{2} | \Theta = \theta)$ whose value at
$\theta = \frac{\pi}{2}$ is $\frac{3}{4}$.  Slicing up a 
space in different ways can clearly give us surprisingly
incommensurate\footnote{ From Webster's Revised Unabridged
Dictionary (1913): Commensurate $\backslash$ ke-'men(ts)-ret
$\backslash$, a.  1.  Having a common measure; commensurable;
reducible to a common measure; as, commensurate quantities.
2. Equal in measure or extent; proportionate.} null sets!

\bibliographystyle{plain}
\bibliography{/saruman/accounts/fac/pitman/search/bm3,/saruman/accounts/fac/pitman/search/general,/saruman/accounts/fac/pitman/search/bm4,/saruman/accounts/fac/pitman/search/bessel,/saruman/accounts/fac/pitman/search/sizebias,/saruman/accounts/fac/pitman/search/pitman,/saruman/accounts/fac/pitman/search/comb,/saruman/accounts/fac/pitman/search/species,/saruman/accounts/fac/aldous/trees/trees,/saruman/accounts/fac/aldous/trees/rwgbook,/saruman/accounts/fac/aldous/trees/misc,/saruman/accounts/fac/aldous/trees/me,/saruman/accounts/fac/aldous/trees/coag}
\end{document}


