recent-pubs.bib

@comment{{This file has been generated by bib2bib 1.98}}
@comment{{Command line: /usr/bin/bib2bib -ob recent-pubs.bib -c 'year>=2006 and $type != "unpublished" and not note : "[pP]atent"' ../mypubs.bib}}
@article{bjm-ccrb-05,
  title = {Convexity, classification, and risk bounds},
  author = {Peter L.\ Bartlett and
  Michael I.\ Jordan and Jon D.\ McAuliffe},
  note = {(Was Department of Statistics, U.C.\ Berkeley Technical Report
	number 638, 2003)},
  year = {2006},
  journal = {Journal of the American Statistical Association},
  volume = {101},
  number = {473},
  pages = {138--156},
  ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.ps.gz},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.pdf},
  abstract = {Many of the classification algorithms developed in the
  machine learning literature, including the support
  vector machine and boosting, can be viewed as minimum
  contrast methods that minimize a convex surrogate of the
  0-1 loss function.  The convexity makes these algorithms
  computationally efficient.  The use of a surrogate, however,
  has statistical consequences that must be balanced against
  the computational virtues of convexity.  To study these
  issues, we provide a general quantitative relationship between
  the risk as assessed using the 0-1 loss and the risk as assessed
  using any nonnegative surrogate loss function. We show that this
  relationship gives nontrivial upper bounds on excess risk
  under the weakest possible condition on the loss
  function: that it satisfy a pointwise form of Fisher
  consistency for classification.  The relationship is
  based on a simple variational transformation of the loss
  function that is easy to compute in many applications.
  We also present a refined version of this result in the
  case of low noise.  Finally, we present applications of
  our results to the estimation of convergence rates in the
  general setting of function classes that are scaled
  convex hulls of a finite-dimensional base class,
  with a variety of commonly used loss functions.}
}
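@comment{{A hedged sketch of the main bound in bjm-ccrb-05, with notation
assumed from the abstract above rather than quoted from the paper: for a
surrogate loss $\phi$ satisfying the pointwise Fisher-consistency
condition, the comparison result takes the form
  $\psi( R(f) - R^* ) \le R_\phi(f) - R_\phi^*$,
where $R$ is the 0-1 risk, $R_\phi$ the $\phi$-risk, $R^*$ and
$R_\phi^*$ their minimal values, and $\psi$ is the variational transform
of $\phi$; this records only the shape of the inequality.}}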
@article{bm-em-05,
  author = {Peter L.~Bartlett and Shahar Mendelson},
  title = {Empirical minimization},
  journal = {Probability Theory and Related Fields},
  year = {2006},
  volume = 135,
  number = 3,
  pages = {311--334},
  ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.ps.gz},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.pdf},
  abstract = {We investigate the behavior of the empirical
minimization algorithm using various methods. We first analyze it
by comparing the empirical, random, structure and the original one
on the class, either in an additive sense, via the uniform law of
large numbers, or in a multiplicative sense, using isomorphic
coordinate projections. We then show that a direct analysis of the
empirical minimization algorithm yields a significantly better
bound, and that the estimates we obtain are essentially sharp. The
method of proof we use is based on Talagrand's concentration
inequality for empirical processes.}
}
@techreport{bw-crouhl-06,
  author = {Peter L. Bartlett and Marten H. Wegkamp},
  title = {Classification with a Reject Option using a Hinge
	  Loss},
  year = {2006},
  institution = {U.C.\ Berkeley},
  ps = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.ps.gz},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.pdf},
  abstract = {We consider the problem of binary classification where the
classifier can, for
a particular cost, choose not to classify an observation. Just as in
the
conventional classification problem, minimization of the sample
average of
the cost is a difficult optimization problem. As an alternative, we
propose
the optimization of a certain convex loss function f, analogous
to the
hinge loss used in support vector machines (SVMs). Its convexity
ensures that
the sample average of this surrogate loss can be efficiently
minimized. We
study its statistical properties. We show that minimizing the expected
surrogate loss---the f-risk---also minimizes the risk.  We also
study
the rate at which the f-risk approaches its minimum value. We
show that
fast rates are possible when the conditional probability Pr(Y=1|X)
is
unlikely to be close to certain critical values.}
}
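@comment{{Hedged background for bw-crouhl-06, not a verbatim statement
from the paper: with rejection cost $d \in (0, 1/2)$ and
$\eta(x) = \Pr(Y=1 \mid X=x)$, the classical (Chow) optimal rule
classifies by $\mathrm{sign}(2\eta(x)-1)$ and rejects exactly when
  $\min( \eta(x), 1-\eta(x) ) > d$;
the paper's surrogate is a piecewise-linear hinge-type loss chosen so
that minimizing its risk recovers this rule.}}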
@techreport{bt-aic-06a,
  author = {Peter L.~Bartlett and Mikhail Traskin},
  title = {AdaBoost is Consistent},
  institution = {U.~C.~Berkeley},
  year = {2006}
}
@article{bjm-c-06,
  author = {Peter L.\ Bartlett and
  Michael I.\ Jordan and Jon D.\ McAuliffe},
  title = {Comment},
  journal = {Statistical Science},
  year = {2006},
  volume = {21},
  number = {3},
  pages = {341--346}
}
@inproceedings{bt-aolmc-06,
  author = {Peter L.~Bartlett and Mikhail Traskin},
  title = {AdaBoost and other Large Margin Classifiers:
         Convexity in Pattern Classification},
  booktitle = {Proceedings of the 5th Workshop on Defence Applications
of Signal Processing},
  year = {2006}
}
@article{bm-d-06,
  author = {Peter L.\ Bartlett and Shahar Mendelson},
  title = {Discussion of ``2004 {IMS} {M}edallion {L}ecture:
{L}ocal {R}ademacher complexities and oracle inequalities in risk
minimization'' by {V}.\ {K}oltchinskii},
  journal = {The Annals of Statistics},
  year = {2006},
  volume = {34},
  number = {6},
  pages = {2657--2663}
}
@techreport{bmp-osbeeem-05,
  author = {Peter L. Bartlett and Shahar Mendelson and Petra
		Philips},
  title = {Optimal sample-based estimates of the
		expectation of the empirical minimizer},
  year = {2007},
  institution = {U.C.\ Berkeley},
  ps = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.ps.gz},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.pdf},
  abstract = {We study sample-based estimates of the expectation of the
	function produced by the empirical minimization algorithm.
	We investigate the extent to which one can estimate
	the rate of convergence of the empirical minimizer in a
	data dependent manner. We establish three main results.
	First, we provide an algorithm that upper bounds the
	expectation of the empirical minimizer in a completely
	data-dependent manner.  This bound is based on a structural
	result in
	http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf,
        which relates expectations to
	sample averages.  Second, we show that these structural
	upper bounds can be loose. In particular, we demonstrate a
	class for which the expectation of the empirical minimizer
	decreases as $O(1/n)$ for sample size $n$, although the
	upper bound based on structural properties is $\Omega(1)$.
	Third, we show that this looseness of the bound is inevitable:
	we present an example that shows that a sharp bound cannot
	be universally recovered from empirical data.}
}
@techreport{b-freeoims-06,
  author = {Peter L. Bartlett},
  title = {Fast rates for estimation error and oracle
	inequalities for model selection},
  year = {2007},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-06.pdf},
  institution = {Department of Statistics, U.C.\ Berkeley},
  number = {729},
  abstract = {We consider complexity penalization methods for model
selection.  These methods aim to choose a model to
optimally trade off estimation and approximation errors
by minimizing the sum of an empirical risk term and a
complexity penalty. It is well known that if we use a
bound on the maximal deviation between empirical and
true risks as a complexity penalty,
then the risk of our choice is no more
than the approximation error plus twice the complexity
penalty. There are many cases, however, where complexity
penalties like this give loose upper bounds on the
estimation error. In particular, if we choose a function
from a suitably simple convex function class with a
strictly convex loss function, then the estimation error
(the difference between the risk of the empirical risk
minimizer and the minimal risk in the class) approaches zero at a
faster rate than the maximal deviation between empirical
and true risks. In this note, we address the question of
whether it is possible to design a complexity penalized
model selection method for these situations. We show that,
provided the sequence of models is ordered by inclusion, in
these cases we can use tight upper bounds on estimation
error as a complexity penalty. Surprisingly, this is the
case even in situations when the difference between the
empirical risk and true risk (and indeed the error of
any estimate of the approximation error) decreases
much more slowly than the
complexity penalty.  We give an oracle inequality
showing that the resulting model selection method chooses
a function with risk no more than the approximation error
plus a constant times the complexity penalty.}
}
@inproceedings{bt-scpskd-06,
  author = {Peter L.~Bartlett and Ambuj Tewari},
  title = {Sample complexity of policy search with known dynamics},
  booktitle = {Advances in Neural Information Processing Systems 19},
  year = 2007,
  editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {97--104},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-scpskd-06.pdf}
}
@inproceedings{rbr-soimbtmerb-06,
  author = {Benjamin I. P. Rubinstein and Peter L.~Bartlett and J. Hyam
Rubinstein},
  title = {Shifting, one-inclusion mistake bounds and tight multiclass
expected risk bounds},
  booktitle = {Advances in Neural Information Processing Systems 19},
  editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {1193--1200},
  year = {2007},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/rbr-soimbtmerb-06.pdf}
}
@inproceedings{rb-rcckc-06,
  author = {David Rosenberg and Peter L.~Bartlett},
  title = {The {R}ademacher Complexity of Co-Regularized Kernel Classes},
  abstract = {In the multi-view approach to semi-supervised learning, we choose
  one predictor from each of multiple hypothesis classes, and we
  `co-regularize' our choices by penalizing disagreement among the
  predictors on the unlabeled data.  In this paper we examine the
  co-regularization method used in the recently proposed
  co-regularized least squares (CoRLS) algorithm.
  In this method we have two hypothesis
  classes, each a reproducing kernel Hilbert space (RKHS), and we
  co-regularize by penalizing the average squared difference in
  predictions on the unlabeled data.  We get our final predictor by
  taking the pointwise average of the predictors from each view.  We
  call the set of predictors that can result from this procedure the
  co-regularized hypothesis class.  The main result of this paper is a
  tight bound on the Rademacher complexity of the co-regularized
  hypothesis class in terms of the kernel matrices of each RKHS. We
  find that the co-regularization reduces the Rademacher complexity of
  the hypothesis class by an amount depending on how different the two
  views are, measured by a data dependent metric. We then use standard
  techniques to bound the gap between training error and test error
  for the CoRLS algorithm.  Experimentally, we find that the amount of
  reduction in complexity introduced by co-regularization correlates
  with the amount of improvement that co-regularization gives in the
  CoRLS algorithm.},
  booktitle = {Proceedings of the Eleventh International Conference on 
	Artificial Intelligence and Statistics},
  editor = {Marina Meila and Xiaotong Shen},
  seriestitle = {JMLR Workshop and Conference Proceedings},
  volume = {2},
  pages = {396--403},
  year = 2007,
  pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v2/rosenberg07a/rosenberg07a.pdf}
}
@inproceedings{bt-aic-06,
  author = {Peter L.~Bartlett and Mikhail Traskin},
  title = {AdaBoost is Consistent},
  booktitle = {Advances in Neural Information Processing Systems 19},
  editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {105--112},
  year = {2007},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-aic-06.pdf},
  abstract = {The risk, or probability of error, of
the classifier produced by the AdaBoost algorithm is
investigated. In particular, we consider the stopping
strategy to be used in AdaBoost to achieve universal
consistency. We show that provided AdaBoost is stopped
after n^(1-a) iterations---for sample size n and
a>0---the sequence of risks of the classifiers it
produces approaches the Bayes risk.}
}
@inproceedings{tb-bpmdparc-07,
  author = {Ambuj Tewari and Peter L.~Bartlett},
  title = {Bounded parameter {M}arkov decision processes with average
	reward criterion},
  booktitle = {Proceedings of the Conference on Learning Theory},
  year = {2007},
  pages = {263--277}
}
@inproceedings{abr-mlea-07,
  author = {Jacob Abernethy and Peter L.~Bartlett and Alexander Rakhlin},
  title = {Multitask learning with expert advice},
  booktitle = {Proceedings of the Conference on Learning Theory},
  year = {2007},
  pages = {484--498},
  abstract = {We consider the problem of prediction with expert advice
	in the setting where a forecaster is presented with several
	online prediction tasks. Instead of competing against the best
	expert separately on each task, we assume the tasks are
	related, and thus we expect that a few experts will perform
	well on the entire set of tasks. That is, our forecaster would
	like, on each task, to compete against the best expert
	chosen from a small set of experts. While we describe
	the `ideal' algorithm and its performance bound, we show
	that the computation required for this algorithm is as hard as
	computation of a matrix permanent. We present an efficient
	algorithm based on mixing priors, and prove a bound that is
	nearly as good for the sequential task presentation case. We
	also consider a harder case where the task may change
	arbitrarily from round to round, and we develop an efficient
	randomized algorithm based on Markov chain Monte Carlo
	techniques.}
}
@inproceedings{arb--07,
  author = {Alexander Rakhlin and Jacob Abernethy and
	Peter L.~Bartlett},
  title = {Online discovery of similarity mappings},
  booktitle = {Proceedings of the 24th International Conference on Machine
	Learning (ICML-2007)},
  year = {2007},
  pages = {767--774},
  abstract = {We consider the problem of choosing,
	sequentially, a map which assigns elements of a set A
	to a few elements of a set B.  On each round, the
	algorithm suffers some cost associated with the chosen
	assignment, and the goal is to minimize the cumulative loss
	of these choices relative to the best map on the entire
	sequence. Even though the offline problem of finding the best
	map is provably hard, we show that there is an equivalent
	online approximation algorithm, Randomized Map Prediction
	(RMP), that is efficient and performs nearly as well. While
	drawing upon results from the `Online Prediction with
	Expert Advice' setting, we show how RMP can be utilized as
	an online approach to several standard batch problems. We
	apply RMP to online clustering as well as online feature
	selection and, surprisingly, RMP often outperforms the
	standard batch algorithms on these problems.}
}
@article{bt-aic-07,
  author = {Peter L.~Bartlett and Mikhail Traskin},
  title = {AdaBoost is Consistent},
  journal = {Journal of Machine Learning Research},
  volume = {8},
  pages = {2347--2368},
  year = {2007},
  url = {http://jmlr.csail.mit.edu/papers/volume8/bartlett07b/bartlett07b.pdf},
  pdf = {http://jmlr.csail.mit.edu/papers/volume8/bartlett07b/bartlett07b.pdf},
  abstract = {The risk, or probability of error, of
the classifier produced by the AdaBoost algorithm is
investigated. In particular, we consider the stopping
strategy to be used in AdaBoost to achieve universal
consistency. We show that provided AdaBoost is stopped
after n^(1-a) iterations---for sample size n and
0<a<1---the sequence of risks of the classifiers it
produces approaches the Bayes risk.}
}
@techreport{rbr-soimbtmerb-07,
  author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
	J.~Hyam Rubinstein},
  title = {Shifting: one-inclusion mistake bounds and sample
	compression},
  institution = {EECS Department, University of California, Berkeley},
  pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf},
  year = {2007},
  abstract = {We present new expected risk bounds for binary and
	multiclass prediction, and resolve several recent conjectures
	on sample compressibility due to Kuzmin and Warmuth. By
	exploiting the combinatorial structure of concept class F,
        Haussler et al. achieved a VC(F)/n bound for the natural one-inclusion
	prediction strategy. The key step in their proof is a d=VC(F)
	bound on the graph density of a subgraph of the
	hypercube---the one-inclusion graph. The first main result of
	this report is a density bound of
	n choose(n-1,<=d-1)/choose(n,<=d) < d, which positively resolves
	a conjecture of Kuzmin and Warmuth relating to their unlabeled
	Peeling compression scheme and also leads to an improved
	one-inclusion mistake bound. The proof uses a new form of
	VC-invariant shifting and a group-theoretic symmetrization.
	Our second main result is an algebraic topological property of
	maximum classes of VC-dimension d as being d-contractible
	simplicial complexes, extending the well-known
	characterization that d=1 maximum classes are trees. We
	negatively resolve a minimum degree conjecture of Kuzmin and
	Warmuth---the second part to a conjectured proof of
	correctness for Peeling---that every class has one-inclusion
	minimum degree at most its VC-dimension. Our final main result
	is a k-class analogue of the d/n mistake bound, replacing the
	VC-dimension by the Pollard pseudo-dimension and the
	one-inclusion strategy by its natural hypergraph
	generalization. This result improves on known PAC-based
	expected risk bounds by a factor of O(log n) and is shown to
	be optimal up to a O(log k) factor. The combinatorial
	technique of shifting takes a central role in understanding
	the one-inclusion (hyper)graph and is a running theme
	throughout.}
}
@techreport{abrt-mlbocg-07,
  author = {Jacob Abernethy and Peter L.~Bartlett and Alexander
	Rakhlin and Ambuj Tewari},
  title = {Minimax Lower Bounds for Online Convex Games},
  institution = {UC Berkeley},
  year = {2007},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrt-mlbocg-07.pdf},
  abstract = {A number of learning problems can be cast as an Online
	Convex Game: on each round, a learner makes a prediction $x$
	from a convex set, the environment plays a loss function $f$,
	and the learner's long-term goal is to minimize regret. Algorithms
	have been proposed by Zinkevich, when $f$ is assumed to be
	convex, and Hazan et al., when $f$ is assumed to be strongly
	convex, that have provably low regret. We consider these two
	settings and analyze such games from a minimax perspective,
	proving lower bounds in each case.  These results prove that the
	existing algorithms are essentially optimal.}
}
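@comment{{Hedged notation for abrt-mlbocg-07, assumed rather than
quoted: for a learner playing $x_t$ in a convex set $K$ against loss
functions $f_t$, the regret is
  $\mathrm{Regret}_T = \sum_{t=1}^T f_t(x_t) - \inf_{x \in K} \sum_{t=1}^T f_t(x)$;
the matching upper bounds are $O(\sqrt{T})$ for convex losses
(Zinkevich) and $O(\log T)$ for strongly convex losses (Hazan et al.),
so the lower bounds in the abstract are of order $\sqrt{T}$ and
$\log T$ respectively.}}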
@techreport{abr-mlea-07a,
  author = {Jacob Duncan Abernethy and Peter L.~Bartlett
  and Alexander Rakhlin},
  title = {Multitask Learning with Expert Advice},
  institution = {EECS Department, University of California, Berkeley},
  year = {2007},
  url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-20.html},
  number = {UCB/EECS-2007-20}
}
@techreport{cgkcb-egacrfmmmn-07,
  author = {Michael Collins and Amir Globerson and Terry Koo and Xavier
Carreras and Peter L.~Bartlett},
  title = {Exponentiated gradient algorithms for conditional random
fields and max-margin {M}arkov networks},
  institution = {U.C.\ Berkeley},
  year = {2007},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/cgkcb-egacrfmmmn-07.pdf},
  abstract = {Log-linear and maximum-margin models are two
commonly used methods in supervised machine learning, and
are frequently used in structured prediction problems.
Efficient learning of parameters in these models is
therefore an important problem, and becomes a key factor
when learning from very large data sets.  This paper
describes exponentiated gradient (EG) algorithms for
training such models, where EG updates are applied to
the convex dual of either the log-linear or max-margin
objective function; the dual in both the log-linear
and max-margin cases corresponds to minimizing a convex
function with simplex constraints.  We study both batch
and online variants of the algorithm, and provide rates
of convergence for both cases.  In the max-margin case,
$O({1\over\epsilon})$ EG updates are required to reach a
given accuracy $\epsilon$ in the dual; in contrast, for
log-linear models only $O(\log({ 1\over\epsilon}))$ updates
are required. For both the max-margin and log-linear cases,
our bounds suggest that the online algorithm requires
a factor of $n$ less computation to reach a desired
accuracy, where $n$ is the number of training examples. Our
experiments confirm that the online algorithms are much
faster than the batch algorithms in practice.  We describe
how the EG updates factor in a convenient way for
structured prediction problems, allowing the algorithms
to be efficiently applied to problems such as sequence
learning or natural language parsing.  We perform extensive
evaluation of the algorithms, comparing them to L-BFGS
and stochastic gradient descent for log-linear models,
and to SVM-Struct for max-margin models. The algorithms are
applied to multi-class problems as well as a more complex
large-scale parsing task. In all these settings, the EG
algorithms presented here outperform the other methods.}
}
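@comment{{A hedged sketch for cgkcb-egacrfmmmn-07 of the generic update
family the abstract refers to, not the paper's exact factored variant:
an exponentiated gradient step on a simplex-constrained dual objective
$L$ is multiplicative,
  $w_{t+1,i} = w_{t,i} \exp( -\eta [\nabla L(w_t)]_i ) / Z_t$,
where $\eta$ is a step size and $Z_t$ normalizes $w_{t+1}$ back onto
the simplex.}}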
@article{tewari07consistency,
  author = {Ambuj Tewari and Peter L.~Bartlett},
  title = {On the Consistency of Multiclass Classification Methods},
  journal = {Journal of Machine Learning Research},
  year = {2007},
  volume = {8},
  month = {May},
  pages = {1007--1025},
  note = {(Invited paper)},
  url = {http://jmlr.csail.mit.edu/papers/v8/tewari07a.html}
}
@article{bartlett07sparseness,
  author = {Peter L. Bartlett and Ambuj Tewari},
  title = {Sparseness vs Estimating Conditional Probabilities: Some
Asymptotic Results},
  journal = {Journal of Machine Learning Research},
  year = {2007},
  volume = {8},
  month = {April},
  pages = {775--790},
  url = {http://jmlr.csail.mit.edu/papers/v8/bartlett07a.html}
}
@article{b-freeoims-07,
  author = {Peter L. Bartlett},
  title = {Fast rates for estimation error and oracle
	inequalities for model selection},
  year = {2008},
  month = apr,
  journal = {Econometric Theory},
  volume = {24},
  number = {2},
  pages = {545--552},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-07.pdf},
  note = {(Was Department of Statistics,
U.C.\ Berkeley Technical Report number 729, 2007)},
  doi = {http://dx.doi.org/10.1017/S0266466608080225},
  abstract = {We consider complexity penalization methods for model
selection.  These methods aim to choose a model to
optimally trade off estimation and approximation errors
by minimizing the sum of an empirical risk term and a
complexity penalty. It is well known that if we use a
bound on the maximal deviation between empirical and
true risks as a complexity penalty,
then the risk of our choice is no more
than the approximation error plus twice the complexity
penalty. There are many cases, however, where complexity
penalties like this give loose upper bounds on the
estimation error. In particular, if we choose a function
from a suitably simple convex function class with a
strictly convex loss function, then the estimation error
(the difference between the risk of the empirical risk
minimizer and the minimal risk in the class) approaches zero at a
faster rate than the maximal deviation between empirical
and true risks. In this note, we address the question of
whether it is possible to design a complexity penalized
model selection method for these situations. We show that,
provided the sequence of models is ordered by inclusion, in
these cases we can use tight upper bounds on estimation
error as a complexity penalty. Surprisingly, this is the
case even in situations when the difference between the
empirical risk and true risk (and indeed the error of
any estimate of the approximation error) decreases
much more slowly than the
complexity penalty.  We give an oracle inequality
showing that the resulting model selection method chooses
a function with risk no more than the approximation error
plus a constant times the complexity penalty.}
}
@article{bw-crouhl-08,
  author = {Peter L. Bartlett and Marten H. Wegkamp},
  title = {Classification with a Reject Option using a Hinge
	  Loss},
  journal = {Journal of Machine Learning Research},
  year = {2008},
  month = aug,
  volume = {9},
  pages = {1823--1840},
  pdf = {http://www.jmlr.org/papers/volume9/bartlett08a/bartlett08a.pdf},
  abstract = {We consider the problem of binary classification where the
classifier can, for
a particular cost, choose not to classify an observation. Just as in
the
conventional classification problem, minimization of the sample
average of
the cost is a difficult optimization problem. As an alternative, we
propose
the optimization of a certain convex loss function f, analogous
to the
hinge loss used in support vector machines (SVMs). Its convexity
ensures that
the sample average of this surrogate loss can be efficiently
minimized. We
study its statistical properties. We show that minimizing the expected
surrogate loss---the f-risk---also minimizes the risk.  We also
study
the rate at which the f-risk approaches its minimum value. We
show that
fast rates are possible when the conditional probability Pr(Y=1|X)
is
unlikely to be close to certain critical values.}
}
@inproceedings{bhr-aogd-07,
  author = {Peter L.~Bartlett and Elad Hazan and Alexander Rakhlin},
  title = {Adaptive online gradient descent},
  booktitle = {Advances in Neural Information Processing
	   Systems 20},
  editor = {John Platt and Daphne Koller and Yoram Singer and Sam Roweis},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  year = {2008},
  month = sep,
  pages = {65--72},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bhr-aogd-07.pdf},
  abstract = {We study the rates of growth of the regret in online
	convex optimization. First, we show that a simple extension of
	the algorithm of Hazan et al. eliminates the need for a priori
	knowledge of the lower bound on the second derivatives of the
	observed functions. We then provide an algorithm, Adaptive
	Online Gradient Descent, which interpolates between the
	results of Zinkevich for linear functions and of Hazan et al.
	for strongly convex functions, achieving intermediate rates
	between $\sqrt{T}$ and $\log T$. Furthermore, we show strong
	optimality of the algorithm. Finally, we provide an extension
	of our results to general norms.}
}
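@comment{{Hedged background for bhr-aogd-07: projected online gradient
descent plays
  $x_{t+1} = \Pi_K( x_t - \eta_t \nabla f_t(x_t) )$,
and the step sizes govern the regret: roughly
$\eta_t \propto 1/\sqrt{t}$ gives the $O(\sqrt{T})$ rate for convex
losses, while $\eta_t \propto 1/(\lambda t)$ gives the $O(\log T)$ rate
under $\lambda$-strong convexity. The paper's algorithm tunes $\eta_t$
to the observed curvature to interpolate between these rates; the
display is standard background, not the paper's statement.}}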
@inproceedings{tb-olplrim-07,
  author = {Ambuj Tewari and Peter L.~Bartlett},
  title = {Optimistic linear programming gives logarithmic regret for
	irreducible {MDPs}},
  booktitle = {Advances in Neural Information Processing
	   Systems 20},
  editor = {John Platt and Daphne Koller and Yoram Singer and Sam Roweis},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  year = {2008},
  month = sep,
  pages = {1505--1512},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/tb-olplrim-07.pdf},
  abstract = {We present an algorithm called Optimistic Linear
	Programming (OLP) for learning to optimize average reward in
	an irreducible but otherwise unknown Markov decision process
	(MDP). OLP uses its experience so far to estimate the MDP. It
	chooses actions by optimistically maximizing estimated future
	rewards over a set of next-state transition probabilities that
	are close to the estimates: a computation that corresponds to
	solving linear programs. We show that the total expected
	reward obtained by OLP up to time $T$ is within $C(P)\log T$
	of the reward obtained by the optimal policy, where $C(P)$ is
	an explicit, MDP-dependent constant. OLP is closely related to
	an algorithm proposed by Burnetas and Katehakis with four key
	differences: OLP is simpler, it does not require knowledge of
	the supports of transition probabilities and the proof of the
	regret bound is simpler, but our regret bound is a constant
	factor larger than the regret of their algorithm. OLP is also
	similar in flavor to an algorithm recently proposed by Auer
	and Ortner. But OLP is simpler and its regret bound has a
	better dependence on the size of the MDP.}
}
@article{cgkcb-egacrfmmmn-08,
  author = {Michael Collins and Amir Globerson and Terry Koo and Xavier
Carreras and Peter L.~Bartlett},
  title = {Exponentiated gradient algorithms for conditional random
fields and max-margin {M}arkov networks},
  journal = {Journal of Machine Learning Research},
  year = {2008},
  month = aug,
  volume = {9},
  pages = {1775--1822},
  pdf = {http://www.jmlr.org/papers/volume9/collins08a/collins08a.pdf},
  abstract = {Log-linear and maximum-margin models are two
commonly used methods in supervised machine learning, and
are frequently used in structured prediction problems.
Efficient learning of parameters in these models is
therefore an important problem, and becomes a key factor
when learning from very large data sets.  This paper
describes exponentiated gradient (EG) algorithms for
training such models, where EG updates are applied to
the convex dual of either the log-linear or max-margin
objective function; the dual in both the log-linear
and max-margin cases corresponds to minimizing a convex
function with simplex constraints.  We study both batch
and online variants of the algorithm, and provide rates
of convergence for both cases.  In the max-margin case,
$O({1\over\epsilon})$ EG updates are required to reach a
given accuracy $\epsilon$ in the dual; in contrast, for
log-linear models only $O(\log({ 1\over\epsilon}))$ updates
are required. For both the max-margin and log-linear cases,
our bounds suggest that the online algorithm requires
a factor of $n$ less computation to reach a desired
accuracy, where $n$ is the number of training examples. Our
experiments confirm that the online algorithms are much
faster than the batch algorithms in practice.  We describe
how the EG updates factor in a convenient way for
structured prediction problems, allowing the algorithms
to be efficiently applied to problems such as sequence
learning or natural language parsing.  We perform extensive
evaluation of the algorithms, comparing them to L-BFGS
and stochastic gradient descent for log-linear models,
and to SVM-Struct for max-margin models. The algorithms are
applied to multi-class problems as well as a more complex
large-scale parsing task. In all these settings, the EG
algorithms presented here outperform the other methods.}
}
@inproceedings{bdhkrt-hprbbolo-08,
  title = {High-Probability Regret Bounds for Bandit Online Linear
Optimization},
  author = {Peter L.~Bartlett and Varsha Dani and Thomas Hayes and
Sham Kakade and Alexander Rakhlin and Ambuj Tewari},
  booktitle = {Proceedings of the 21st Annual Conference on Learning
Theory (COLT 2008)},
  year = {2008},
  month = dec,
  pages = {335--342},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bdhkrt-hprbbolo-08.pdf}
}
@inproceedings{abrt-osmlbocg-08,
  title = {Optimal strategies and minimax lower bounds for online convex
games},
  author = {Jacob Abernethy and Peter L. Bartlett and
Alexander Rakhlin and Ambuj Tewari},
  booktitle = {Proceedings of the 21st Annual Conference on Learning
Theory (COLT 2008)},
  year = {2008},
  month = dec,
  pages = {415--423},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrt-osmlbocg-08.pdf}
}
@article{lbw-c-08,
  title = {Correction to {\em The Importance of Convexity in Learning
    with Squared Loss}},
  author = {Wee Sun Lee and Peter L.~Bartlett and
    Robert C.~Williamson},
  journal = {IEEE Transactions on Information Theory},
  year = {2008},
  month = sep,
  pages = {4395},
  volume = {54},
  number = {9},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/lbw-c-08.pdf}
}
@inproceedings{nabh-amlfdms-08,
  title = {Application of Machine Learning in Fault Diagnostics of
Mechanical Systems},
  author = {Massieh Najafi and David M. Auslander and
  Peter L. Bartlett and Philip Haves},
  booktitle = {Proceedings of the World Congress on Engineering and
Computer Science 2008:
International Conference on Modeling, Simulation and Control 2008},
  pages = {957--962},
  notetoomit = {Winner of Best Student Paper Award of the Conference},
  year = {2008},
  month = oct,
  pdf = {http://www.iaeng.org/publication/WCECS2008/WCECS2008_pp957-962.pdf}
}
@inproceedings{nabh-fdst-08,
  title = {Fault Diagnostics and Supervised Testing: How Fault
  Diagnostic tools can be Proactive?},
  author = {Massieh Najafi and David M. Auslander and Peter L. Bartlett
  and Philip Haves},
  year = {2008},
  month = sep,
  booktitle = {Proceedings of Intelligent Systems and Control (ISC
  2008)},
  pages = {633-034},
  editor = {K. Grigoriadis}
}
@inproceedings{nabh-ocdpdsna-08,
  title = {Overcoming the Complexity of Diagnostic Problems due to
  Sensor Network Architecture},
  author = {Massieh Najafi and David M. Auslander and Peter L. Bartlett
  and Philip Haves},
  year = {2008},
  month = sep,
  booktitle = {Proceedings of Intelligent Systems and Control (ISC
  2008)},
  pages = {633-071},
  editor = {K. Grigoriadis}
}
@techreport{arb-mrtoml-08,
  title = {Matrix regularization techniques for online multitask
learning},
  author = {Alekh Agarwal and Alexander Rakhlin and Peter Bartlett},
  institution = {EECS Department,
University of California, Berkeley},
  number = {UCB/EECS-2008-138},
  year = {2008},
  pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-138.pdf},
  abstract = {In this paper we examine the problem of prediction with
expert advice in a setup where the learner
is presented with a sequence of examples coming from different tasks.
In order for the learner to be able
to benefit from performing multiple tasks simultaneously, we make
assumptions of task relatedness by
constraining the comparator to use a lesser number of best experts
than the number of tasks. We show
how this corresponds naturally to learning under spectral or
structural matrix constraints, and propose
regularization techniques to enforce the constraints. The
regularization techniques proposed here are
interesting in their own right and multitask learning is just one
application for the ideas. A theoretical
analysis of one such regularizer is performed, and a regret bound that
shows benefits of this setup is reported.}
}
@inproceedings{bbcjnrst-opsl-08,
  title = {Open problems in the security of learning},
  author = {Marco Barreno and Peter L.~Bartlett and F.~J.~Chi
and Anthony D.~Joseph and Blaine Nelson
and Benjamin I.~P.~Rubinstein and U.~Saini and J.~Doug Tygar},
  booktitle = {Proceedings of the 1st ACM Workshop on AISec
(AISec2008)},
  pages = {19--26},
  year = {2008},
  doi = {http://dx.doi.org/10.1145/1456377.1456382},
  month = oct
}
@article{rbr-soimbtmerb-08,
  author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
	J.~Hyam Rubinstein},
  title = {Shifting: one-inclusion mistake bounds and sample
	compression},
  journal = {Journal of Computer and System Sciences},
  volume = {75},
  number = {1},
  pages = {37--59},
  note = {(Was University of California, Berkeley, EECS Department
Technical Report EECS-2007-86)},
  pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf},
  month = {January},
  year = {2009}
}
@inproceedings{bt-rarbarlwcm-09,
  title = {{REGAL}: A Regularization based Algorithm for
Reinforcement Learning in Weakly Communicating {MDPs}},
  author = {Peter L.~Bartlett and Ambuj Tewari},
  booktitle = {Proceedings of the 25th Conference on
Uncertainty in Artificial Intelligence (UAI2009)},
  year = {2009},
  month = jun,
  pages = {35--42},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-rarbarllwcm-09.pdf}
}
@inproceedings{aabr-svormd-09,
  title = {A stochastic view of optimal regret through minimax duality},
  author = {Jacob Abernethy and Alekh Agarwal and
Peter L.~Bartlett and Alexander Rakhlin},
  booktitle = {Proceedings of the 22nd Annual Conference on 
Learning Theory -- COLT 2009},
  year = {2009},
  month = jun,
  pages = {257--266},
  abstract = {We study the regret of optimal strategies for online
convex
optimization games.  Using von Neumann's minimax theorem, we
show that the optimal regret in this adversarial setting is
closely related to the behavior of the empirical
minimization algorithm in a stochastic process setting:
it is equal to the maximum, over joint distributions of the
adversary's action sequence, of the difference between a sum
of minimal expected losses and the minimal empirical loss.
We show that the optimal regret has a natural geometric
interpretation, since it can be viewed as the gap in
Jensen's inequality for a concave functional---the minimizer
over the player's actions of expected loss---defined on a
set of probability distributions.  We use this expression to
obtain upper and lower bounds on the regret of an optimal
strategy for a variety of online learning problems. Our
method provides upper bounds without the need to construct a
learning algorithm; the lower bounds provide explicit
optimal strategies for the adversary.},
  pdf = {http://www.cs.mcgill.ca/~colt2009/papers/026.pdf},
  funding = {nsf seqs 07, nsf online 08}
}
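@comment{{A hedged rendering of the identity described in the
aabr-svormd-09 abstract, with notation assumed: the minimax regret of
the $T$-round game equals
  $\sup_P \mathbb{E}\left[ \sum_{t=1}^T \inf_x \mathbb{E}[ \ell(x, z_t) \mid z_1, \dots, z_{t-1} ] - \inf_x \sum_{t=1}^T \ell(x, z_t) \right]$,
the supremum running over joint distributions $P$ of the adversary's
sequence $z_1, \dots, z_T$.}}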
@techreport{aabr-svormd-09a,
  title = {A stochastic view of optimal regret through minimax duality},
  author = {Jacob Abernethy and Alekh Agarwal and
Peter L.~Bartlett and Alexander Rakhlin},
  institution = {arxiv.org},
  number = {0903.5328},
  year = {2009},
  url = {http://arxiv.org/abs/0903.5328},
  abstract = {We study the regret of optimal strategies for online
convex optimization games. Using von Neumann's minimax theorem, we
show that the optimal regret in this adversarial setting is closely
related to the behavior of the empirical minimization algorithm in a
stochastic process setting: it is equal to the maximum, over joint
distributions of the adversary's action sequence, of the difference
between a sum of minimal expected losses and the minimal empirical
loss. We show that the optimal regret has a natural geometric
interpretation, since it can be viewed as the gap in Jensen's
inequality for a concave functional---the minimizer over the player's
actions of expected loss---defined on a set of probability
distributions. We use this expression to obtain upper and lower bounds
on the regret of an optimal strategy for a variety of online learning
problems. Our method provides upper bounds without the need to
construct a learning algorithm; the lower bounds provide explicit
optimal strategies for the adversary.}
}
@article{rsbn-sslfbmvpcr-09,
  title = {Multiview point cloud kernels for semisupervised learning},
  volume = {26},
  number = {5},
  month = {September},
  year = {2009},
  pages = {145--150},
  author = {David S.~Rosenberg and Vikas Sindhwani and
Peter L.~Bartlett and Partha Niyogi},
  doi = {http://dx.doi.org/10.1109/MSP.2009.933383},
  journal = {IEEE Signal Processing Magazine}
}
@inproceedings{abrw-itlbocco-09,
  title = {Information-theoretic lower bounds on the oracle complexity
of convex optimization},
  author = {Alekh Agarwal and Peter L.~Bartlett
and Pradeep Ravikumar and Martin Wainwright},
  booktitle = {Advances in
Neural Information Processing Systems 22},
  editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C. K. I.
Williams and A. Culotta},
  year = {2009},
  month = jun,
  pages = {1--9},
  pdf = {http://research.microsoft.com/en-us/um/people/alekha/1005_paper.pdf},
  abstract = {Despite the large amount of literature on upper bounds on
complexity
  of convex analysis, surprisingly little is known about the
  fundamental hardness of these problems. The extensive use of convex
  optimization in machine learning and statistics makes such an
  understanding very critical to understand fundamental computational
  limits of learning and estimation. In this paper, we study the
  complexity of stochastic convex optimization in an oracle model of
  computation. We improve upon known results and obtain tight minimax
  complexity estimates for some function classes. We also discuss
  implications of these results to learning and estimation.},
  funding = {nsf seqs 07}
}
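@comment{{Hedged background on the oracle model named in the
abrw-itlbocco-09 abstract: a stochastic first-order oracle, queried at
$x$, returns noisy values $(\hat{f}(x), \hat{g}(x))$ with
$\mathbb{E}[\hat{f}(x)] = f(x)$ and
$\mathbb{E}[\hat{g}(x)] \in \partial f(x)$, and the oracle complexity
of a function class is the minimax number of queries needed to reach
optimization error $\epsilon$. This is the standard setup, stated only
to fix notation.}}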
@techreport{bbmrss-lbars-09,
  title = {A learning-based approach to reactive security},
  author = {A.~Barth and Benjamin I.~P.~Rubinstein
and M.~Sundararajan and J.~C.~Mitchell
and Dawn Song and Peter L.~Bartlett},
  institution = {arxiv.org},
  number = {0912.1155},
  url = {http://arxiv.org/abs/0912.1155},
  abstract = {Despite the conventional wisdom that proactive security is
superior to reactive security, we show that reactive security can be
competitive with proactive security as long as the reactive defender
learns from past attacks instead of myopically overreacting to the
last attack. Our game-theoretic model follows common practice in the
security literature by making worst-case assumptions about the
attacker: we grant the attacker complete knowledge of the defender's
strategy and do not require the attacker to act rationally. In this
model, we bound the competitive ratio between a reactive defense
algorithm (which is inspired by online learning theory) and the best
fixed proactive defense. Additionally, we show that, unlike proactive
defenses, this reactive strategy is robust to a lack of information
about the attacker's incentives and knowledge.},
  year = {2009}
}
@techreport{krb-uvmkl-10a,
  title = {A Unifying View of Multiple Kernel Learning},
  author = {Marius Kloft and Ulrich R\"uckert and Peter L.~Bartlett},
  year = {2010},
  institution = {arxiv.org},
  number = {1005.0437},
  url = {http://arxiv.org/abs/1005.0437},
  abstract = {Recent research on multiple kernel learning has led to a
number of approaches for combining kernels in regularized risk
minimization. The proposed approaches include different formulations
of objectives and varying regularization strategies. In this paper we
present a unifying general optimization criterion for multiple kernel
learning and show how existing formulations are subsumed as special
cases. We also derive the criterion's dual representation, which is
suitable for general smooth optimization algorithms. Finally, we
evaluate multiple kernel learning in this framework analytically using
a Rademacher complexity bound on the generalization error and
empirically in a set of experiments.},
  funding = {nsf online 08}
}
@techreport{abh-banrle-10,
  title = {Blackwell Approachability and No-Regret Learning are Equivalent},
  author = {Jacob Abernethy and Peter L.~Bartlett and
Elad Hazan},
  institution = {arxiv.org},
  number = {1011.1936},
  url = {http://arxiv.org/abs/1011.1936},
  year = {2010},
  abstract = {We consider the celebrated Blackwell Approachability
Theorem for two-player games with vector payoffs. We show that
Blackwell's result is equivalent, via efficient reductions, to the
existence of 'no-regret' algorithms for Online Linear Optimization.
Indeed, we show that any algorithm for one such problem can be
efficiently converted into an algorithm for the other. We provide a
useful application of this reduction: the first efficient algorithm
for calibrated forecasting.},
  funding = {nsf online 08}
}
@techreport{rbht-llfs-09tr,
  title = {Learning in a large function space: Privacy preserving
mechanisms for {SVM} learning},
  author = {Benjamin I.~P.~Rubinstein and Peter~L.~Bartlett
and Ling Huang and Nina Taft},
  institution = {arxiv.org},
  number = {0911.5708},
  url = {http://arxiv.org/abs/0911.5708},
  year = {2009},
  abstract = {Several recent studies in privacy-preserving learning have
considered the trade-off between utility or risk and the level of
differential privacy guaranteed by mechanisms for statistical query
processing. In this paper we study this trade-off in private Support
Vector Machine (SVM) learning. We present two efficient mechanisms,
one for the case of finite-dimensional feature mappings and one for
potentially infinite-dimensional feature mappings with
translation-invariant kernels. For the case of translation-invariant
kernels, the proposed mechanism minimizes regularized empirical risk
in a random Reproducing Kernel Hilbert Space whose kernel uniformly
approximates the desired kernel with high probability. This technique,
borrowed from large-scale learning, allows the mechanism to respond
with a finite encoding of the classifier, even when the function class
is of infinite VC dimension. Differential privacy is established using
a proof technique from algorithmic stability. Utility--the mechanism's
response function is pointwise epsilon-close to non-private SVM with
probability 1-delta--is proven by appealing to the smoothness of
regularized empirical risk minimization with respect to small
perturbations to the feature mapping. We conclude with a lower bound
on the optimal differential privacy of the SVM. This negative result
states that for any delta, no mechanism can be simultaneously
(epsilon,delta)-useful and beta-differentially private for small
epsilon and small beta.},
  funding = {nsf seqs 07}
}
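@comment{{For rbht-llfs-09tr, the standard definition behind the
abstract's privacy guarantee, included as hedged background: a
randomized mechanism $M$ is $\beta$-differentially private if, for all
databases $D, D'$ differing in one entry and all measurable response
sets $S$,
  $\Pr[ M(D) \in S ] \le e^{\beta} \Pr[ M(D') \in S ]$.
The abstract's (epsilon,delta)-usefulness is the separate requirement
that the released classifier be pointwise epsilon-close to the
non-private SVM with probability 1-delta.}}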
@techreport{abd-oicams-12,
  title = {Oracle inequalities for computationally adaptive model
selection},
  author = {Alekh Agarwal and Peter L.~Bartlett and
John Duchi},
  institution = {arxiv.org},
  number = {1208.0129},
  url = {http://arxiv.org/abs/1208.0129},
  year = {2012},
  abstract = {We analyze general model selection procedures using
penalized empirical loss minimization under computational constraints.
While classical model selection approaches do not consider
computational aspects of performing model selection, we argue that any
practical model selection procedure must not only trade off estimation
and approximation error, but also the computational effort required to
compute empirical minimizers for different function classes. We
provide a framework for analyzing such problems, and we give
algorithms for model selection under a computational budget. These
algorithms satisfy oracle inequalities that show that the risk of the
selected model is not much worse than if we had devoted all of our
computational budget to the optimal function class.},
  notetoomit = {Submitted to Annals of Statistics},
  funding = {nsf large 11}
}
@inproceedings{bbmrss-lbars-10,
  title = {A learning-based approach to reactive security},
  author = {A.~Barth and Benjamin I.~P.~Rubinstein
and M.~Sundararajan and J.~C.~Mitchell
and Dawn Song and Peter L.~Bartlett},
  booktitle = {Proceedings of Financial Cryptography and Data Security (FC10)},
  year = {2010},
  month = jan,
  pages = {192--206},
  doi = {http://dx.doi.org/10.1007/978-3-642-14577-3_16},
  funding = {nsf seqs 07}
}
@inproceedings{abd-oasdpp-10,
  title = {Optimal Allocation Strategies for the Dark Pool Problem},
  author = {Alekh Agarwal and Peter L.~Bartlett and Max Dama},
  booktitle = {Proceedings of The Thirteenth
International Conference on Artificial Intelligence and Statistics
(AISTATS)},
  editor = {Y.~W.~Teh and M.~Titterington},
  year = {2010},
  month = may,
  seriestitle = {JMLR: W\&CP},
  volume = {9},
  pages = {9--16},
  pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v9/agarwal10a/agarwal10a.pdf},
  abstract = {We study the problem of allocating stocks to dark pools. We
  propose and analyze an optimal approach for allocations, if
  continuous-valued allocations are allowed. We also propose a
  modification for the case when only integer-valued allocations are
  possible. We extend the previous work on this
  problem to adversarial scenarios, while
  also improving on those results in the iid setup. The resulting
  algorithms are efficient, and perform well in simulations
  under stochastic and adversarial inputs.}
}
@article{bmp-osbeeem-08,
  author = {Peter L. Bartlett and Shahar Mendelson and Petra
		Philips},
  title = {On the optimality of sample-based estimates of the
		expectation of the empirical minimizer},
  journal = {ESAIM: Probability and Statistics},
  volume = {14},
  pages = {315--337},
  year = {2010},
  month = jan,
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-08.pdf},
  abstract = {We study sample-based estimates of the expectation of the
	function produced by the empirical minimization algorithm.
	We investigate the extent to which one can estimate
	the rate of convergence of the empirical minimizer in a
	data dependent manner. We establish three main results.
	First, we provide an algorithm that upper bounds the
	expectation of the empirical minimizer in a completely
	data-dependent manner.  This bound is based on a structural
	result in
	http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf,
        which relates expectations to
	sample averages.  Second, we show that these structural
	upper bounds can be loose. In particular, we demonstrate a
	class for which the expectation of the empirical minimizer
	decreases as $O(1/n)$ for sample size $n$, although the
	upper bound based on structural properties is $\Omega(1)$.
	Third, we show that this looseness of the bound is inevitable:
	we present an example that shows that a sharp bound cannot
	be universally recovered from empirical data.}
}
@article{b-laue-10,
  title = {Learning to act in uncertain environments},
  volume = {53},
  number = {5},
  year = {2010},
  month = may,
  pages = {98},
  author = {Peter L.~Bartlett},
  journal = {Communications of the ACM},
  doi = {http://dx.doi.org/10.1145/1735223.1735246},
  note = {(Invited one-page comment)},
  funding = {nsf seqs 07}
}
@article{rbr-csoimbsc-10,
  title = {Corrigendum to `Shifting: One-inclusion mistake bounds and
sample compression' {[J. Comput. System Sci. 75 (1) (2009) 37--59]}},
  volume = {76},
  number = {3--4},
  year = {2010},
  month = may,
  pages = {278--280},
  author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
J.~Hyam Rubinstein},
  journal = {Journal of Computer and System Sciences},
  doi = {http://dx.doi.org/10.1016/j.jcss.2010.03.001}
}
@inproceedings{abbs-ramts-10,
  title = {A Regularization Approach to Metrical Task Systems},
  author = {Jacob Abernethy and Peter L.~Bartlett and Niv Buchbinder
and Isabelle Stanton},
  year = {2010},
  month = oct,
  booktitle = {Algorithmic Learning Theory, 21st International
Conference, ALT 2010},
  editor = {Marcus Hutter and Frank Stephan and Vladimir Vovk and Thomas
Zeugmann},
  pages = {270--284},
  doi = {http://dx.doi.org/10.1007/978-3-642-16108-7_23},
  funding = {nsf online 08}
}
@inproceedings{b-oopae-10,
  title = {Optimal Online Prediction in Adversarial
Environments},
  author = {Peter L.~Bartlett},
  year = {2010},
  month = oct,
  booktitle = {Algorithmic Learning Theory, 21st International
Conference, ALT 2010},
  editor = {Marcus Hutter and Frank Stephan and Vladimir Vovk and Thomas
Zeugmann},
  pages = {34},
  doi = {http://dx.doi.org/10.1007/978-3-642-16184-1_26},
  note = {(Plenary talk abstract)},
  funding = {nsf online 08, nsf seqs 07}
}
@inproceedings{krb-uvmkl-10,
  title = {A Unifying View of Multiple Kernel Learning},
  author = {Marius Kloft and Ulrich R\"uckert and Peter L.~Bartlett},
  year = {2010},
  month = sep,
  booktitle = {Machine Learning and Knowledge Discovery in Databases,
European Conference, ECML PKDD},
  editor = {Jos\'e L.~Balc\'azar and Francesco Bonchi and Aristides
Gionis and Mich\`ele Sebag},
  note = {Part II, LNAI 6322},
  pages = {66--81},
  funding = {nsf online 08},
  doi = {http://dx.doi.org/10.1007/978-3-642-15883-4_5}
}
@inproceedings{kb-iol-10,
  title = {Implicit online learning},
  author = {Brian Kulis and Peter L.~Bartlett},
  booktitle = {Proceedings of the 27th International Conference on
Machine Learning (ICML-10)},
  pdf = {http://www.cse.ohio-state.edu/~kulis/pubs/icml_implicit.pdf},
  editor = {Johannes F\"urnkranz and Thorsten Joachims},
  year = {2010},
  month = jun,
  pages = {575--582},
  funding = {nsf online 08, nsf seqs 07}
}
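@comment{{A hedged sketch of the update form behind kb-iol-10, assumed
rather than quoted: instead of linearizing the current loss $\ell_t$,
an implicit update solves
  $x_{t+1} = \arg\min_x \, D(x, x_t) + \eta \, \ell_t(x)$,
where $D$ is a Bregman divergence, so the loss enters the update
exactly rather than through its gradient.}}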
@article{ab-mamssl-08,
  title = {Margin-adaptive model selection in statistical learning},
  author = {Sylvain Arlot and Peter L.~Bartlett},
  journal = {Bernoulli},
  volume = {17},
  number = {2},
  pages = {687--713},
  year = {2011},
  month = may,
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/ab-mamssl-08.pdf},
  abstract = {}
}
@inproceedings{rab-lmf-11,
  title = {Learning with Missing Features},
  author = {Afshin Rostamizadeh and Alekh Agarwal and Peter L.~Bartlett},
  booktitle = {Proceedings of the Conference on
Uncertainty in Artificial Intelligence (UAI2011)},
  editor = {Avi Pfeffer and Fabio G.~Cozman},
  pdf = {http://uai.sis.pitt.edu/papers/11/p635-rostamizadeh.pdf},
  year = {2011},
  month = jul,
  pages = {635--642},
  funding = {nsf online 08},
  abstract = {}
}
@inproceedings{abh-banrle-11,
  title = {Blackwell Approachability and No-Regret Learning are Equivalent},
  author = {Jacob Abernethy and Peter L.~Bartlett and
Elad Hazan},
  booktitle = {Proceedings of the Conference on
Learning Theory (COLT2011)},
  editor = {Sham Kakade and Ulrike von Luxburg},
  year = {2011},
  month = jul,
  seriestitle = {JMLR: W\&CP},
  volume = {19},
  pdf = {http://colt2011.sztaki.hu/colt2011_submission_94.pdf},
  pages = {27--46},
  funding = {nsf online 08},
  abstract = {}
}
@inproceedings{adbl-oicbms-11,
  title = {Oracle inequalities for computationally budgeted model
selection},
  author = {Alekh Agarwal and John Duchi and Peter L.~Bartlett and
Clement Levrard},
  booktitle = {Proceedings of the Conference on
Learning Theory (COLT2011)},
  editor = {Sham Kakade and Ulrike von Luxburg},
  year = {2011},
  month = jul,
  seriestitle = {JMLR: W\&CP},
  volume = {19},
  pages = {69--86},
  abstract = {We analyze general model selection procedures using
penalized empirical loss
  minimization under computational constraints. While classical model
  selection approaches do not consider computational aspects of
performing
  model selection, we argue that any practical model selection
procedure must
  not only trade off estimation and approximation error, but also the
effects
  of computational effort required to compute empirical minimizers for
  different function classes. We provide a framework for analyzing
such
  problems, and we give algorithms for model selection under a
computational budget.},
  pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v19/agarwal11a/agarwal11a.pdf},
  funding = {nsf large scale, nsf online 08}
}
@book{stzbpw-anips24-11,
  editor = {John Shawe-Taylor and Richard Zemel and Peter L.~Bartlett and
Fernando Pereira and Kilian Weinberger},
  title = {Advances in Neural Information
Processing Systems 24. Proceedings of the 2011 Conference},
  url = {http://books.nips.cc/nips24.html},
  publisher = {NIPS Foundation},
  year = {2011},
  month = dec
}
@article{abrw-itlbocsco-12,
  title = {Information-theoretic lower bounds on the oracle
complexity of stochastic convex optimization},
  year = {2012},
  month = may,
  author = {Alekh Agarwal and Peter Bartlett and Pradeep Ravikumar and Martin
Wainwright},
  journal = {IEEE Transactions on Information Theory},
  abstract = {Relative to the large literature on upper bounds on
complexity of
  convex optimization, lesser attention has been paid to the
  fundamental hardness of these problems.  Given the extensive use of
  convex optimization in machine learning and statistics, gaining an
  understanding of these complexity-theoretic issues is important. In
  this paper, we study the complexity of stochastic convex
  optimization in an oracle model of computation. We improve upon
  known results and obtain tight minimax complexity estimates for
  various function classes.},
  volume = {58},
  number = {5},
  pages = {3235--3249},
  doi = {http://dx.doi.org/10.1109/TIT.2011.2182178},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrw-itlbocsco-12.pdf}
}
@article{dbw-rsso-12,
  title = {Randomized smoothing for stochastic optimization},
  author = {John Duchi and Peter L.~Bartlett and Martin J.~Wainwright},
  year = {2012},
  month = jun,
  journal = {SIAM Journal on Optimization},
  volume = {22},
  number = {2},
  pages = {674--701},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/dbw-rsso-12.pdf},
  abstract = {We analyze convergence rates of stochastic optimization
algorithms for nonsmooth
convex optimization problems. By combining randomized smoothing
techniques with accelerated
gradient methods, we obtain convergence rates of stochastic
optimization procedures, both in expectation and with high probability,
that have optimal dependence on the variance of the gradient
estimates. To the best of our knowledge, these are the first
variance-based rates for nonsmooth
optimization. We give several applications of our results to
statistical estimation problems and provide experimental results
that demonstrate the effectiveness of the proposed algorithms. We also
describe how a combination of our algorithm with recent work on
decentralized optimization yields
a distributed stochastic optimization algorithm that is
order-optimal.}
}
@inproceedings{hb-ecosnmlbpjp-12,
  title = {Exchangeability Characterizes Optimality of Sequential
Normalized Maximum Likelihood and {Bayesian} Prediction with {Jeffreys}
Prior},
  author = {Fares Hedayati and Peter L.~Bartlett},
  pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v22/hedayati12/hedayati12.pdf},
  editor = {M.~Girolami and N.~Lawrence},
  year = {2012},
  month = apr,
  seriestitle = {JMLR Workshop and Conference Proceedings},
  booktitle = {Proceedings of the Fifteenth International Conference on Artificial Intelligence and Statistics 
(AISTATS)},
  volume = {22},
  pages = {504--510},
  abstract = {We study online prediction of individual sequences under
logarithmic
loss with parametric experts. The optimal strategy, normalized maximum
likelihood (NML), is computationally demanding and requires the length
of
the game to be known. We consider two simpler strategies:
sequential normalized maximum likelihood (SNML),
which computes the NML forecasts at each round as if it were the
last round, and Bayesian prediction.
Under appropriate conditions, both are known to achieve near-optimal
regret.  In this paper, we investigate when these strategies are
optimal.
We show that SNML is optimal iff the joint distribution on
sequences defined by SNML is exchangeable.
In the case of exponential families, this is equivalent to the
optimality of any Bayesian prediction strategy,
and the optimal prior is Jeffreys prior.}
}
@inproceedings{hb-ojpodeanmle-12,
  title = {The Optimality of {J}effreys Prior for Online Density
Estimation and the Asymptotic Normality of Maximum
Likelihood Estimators},
  author = {Fares Hedayati and Peter Bartlett},
  pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v23/hedayati12/hedayati12.pdf},
  seriestitle = {JMLR Workshop and Conference Proceedings},
  booktitle = {Proceedings of the Conference on
Learning Theory (COLT2012)},
  year = {2012},
  month = jun,
  volume = {23},
  pages = {7.1--7.13},
  funding = {nsf large scale},
  abstract = {We study online learning under logarithmic loss with
regular parametric models. We show
that a Bayesian strategy predicts optimally only if it uses Jeffreys
prior. This result was
known for canonical exponential families; we extend it to parametric
models for which the
maximum likelihood estimator is asymptotically normal. The optimal
prediction strategy,
normalized maximum likelihood, depends on the number $n$ of rounds of
the game, in
general. However, when a Bayesian strategy is optimal, normalized
maximum likelihood
becomes independent of $n$. Our proof uses this to exploit the
asymptotics of normalized
maximum likelihood. The asymptotic normality of the maximum likelihood
estimator is
responsible for the necessity of Jeffreys prior.}
}
@article{rbht-llfs-09,
  title = {Learning in a large function space: Privacy preserving
mechanisms for {SVM} learning},
  author = {Benjamin I.~P.~Rubinstein and Peter~L.~Bartlett
and Ling Huang and Nina Taft},
  journal = {Journal of Privacy and Confidentiality},
  volume = 4,
  number = 1,
  pages = {65--100},
  year = {2012},
  month = aug,
  url = {http://arxiv.org/abs/0911.5708},
  funding = {nsf seqs 07},
  abstract = {}
}
@article{nabs-amlfdahu-12,
  title = {Application of Machine Learning in the Fault Diagnostics of
  Air Handling Units},
  author = {Massieh Najafi and David M. Auslander and Peter L. Bartlett
  and Philip Haves and Michael D.~Sohn},
  journal = {Applied Energy},
  doi = {http://dx.doi.org/10.1016/j.apenergy.2012.02.049},
  year = {2012},
  month = aug,
  volume = 96,
  pages = {347--358}
}
@article{bmn-lrlrpoi-09,
  title = {$\ell_1$-regularized linear regression: Persistence and
oracle inequalities},
  author = {Peter L.~Bartlett and Shahar Mendelson
and Joseph Neeman},
  journal = {Probability Theory and Related Fields},
  year = {2012},
  month = oct,
  volume = {154},
  number = {1--2},
  pages = {193--224},
  abstract = {  We study the predictive performance of
$\ell_1$-regularized linear
  regression in a model-free setting, including the case where the
  number of covariates is substantially larger than the sample
  size. We introduce a new analysis method that avoids the boundedness
  problems that typically arise in model-free empirical minimization.
  Our technique provides an answer to a conjecture of Greenshtein and
  Ritov~\cite{GR} regarding the ``persistence'' rate for linear
  regression and allows us to prove an oracle inequality for the error
  of the regularized minimizer.
  It also demonstrates that empirical risk minimization gives optimal
  rates (up to log factors) of convex aggregation of a set of
  estimators of a regression function.},
  funding = {nsf seqs 07},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmn-lrlrpoi-09.pdf},
  doi = {http://dx.doi.org/10.1007/s00440-011-0367-2}
}
@article{brsmsb-lbars-12,
  title = {A learning-based approach to reactive security},
  author = {A.~Barth and Benjamin I.~P.~Rubinstein
and M.~Sundararajan and J.~C.~Mitchell
and Dawn Song and Peter L.~Bartlett},
  year = {2012},
  month = jul,
  journal = {IEEE Transactions on Dependable and Secure Computing},
  url = {http://doi.ieeecomputersociety.org/10.1109/TDSC.2011.42},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/brsmsb-lbars-12.pdf},
  volume = {9},
  number = {4},
  pages = {482--493},
  funding = {nsf seqs 07},
  abstract = {Despite the conventional wisdom that proactive security is
superior to reactive security, we show that reactive security can be
competitive with proactive security as long as the reactive defender
learns from past attacks instead of myopically overreacting to the
last attack.  Our game-theoretic model follows common practice in the
security literature by making worst-case assumptions about the
attacker: we grant the attacker complete knowledge of the defender's
strategy and do not require the attacker to act rationally. In this
model, we bound the competitive ratio between a reactive defense
algorithm (which is inspired by online learning theory) and the best
fixed proactive defense.  Additionally, we show that, unlike proactive
defenses, this reactive strategy is robust to a lack of information
about the attacker's incentives and knowledge.}
}
@inproceedings{dbw-rspsocdc-12,
  author = {John C. Duchi and Peter L.~Bartlett and Martin J.~Wainwright},
  title = {Randomized Smoothing for (Parallel) Stochastic Optimization},
  booktitle = {Proceedings of the 51st IEEE Annual Conference on
Decision and Control (CDC)},
  series = {IEEE Conference on Decision and Control},
  year = {2012},
  month = dec,
  pages = {5442--5444},
  note = {51st IEEE Annual Conference on Decision and Control (CDC),
HI, December 10--13, 2012},
  abstract = {By combining randomized smoothing techniques with
accelerated gradient methods, we obtain convergence rates for
stochastic optimization procedures, both in expectation and with high
probability, that have optimal dependence on the variance of the
gradient estimates. To the best of our knowledge, these are the first
variance-based rates for non-smooth optimization. A combination of
our techniques with recent work on decentralized optimization yields
order-optimal parallel stochastic optimization algorithms. We give
applications of our results to several statistical machine learning
problems, providing experimental results (in the full version of the
paper) demonstrating the effectiveness of our algorithms.},
  publisher = {IEEE},
  issn = {0191-2216},
  isbn = {978-1-4673-2066-5}
}
@incollection{tb-lt-12,
  author = {Ambuj Tewari and Peter L.~Bartlett},
  title = {Learning Theory},
  booktitle = {Signal Processing Theory and Machine Learning},
  series = {Academic Press Library in Signal Processing},
  volume = {1},
  editor = {Paulo S.R. Diniz and Johan A.K. Suykens and
Rama Chellappa and Sergios Theodoridis},
  publisher = {Elsevier},
  year = {2014},
  pages = {775--816}
}
@book{bpbbw-anips25-12,
  editor = {Peter L.~Bartlett and Fernando Pereira and Chris J.~C.~Burges and L\'eon Bottou and Kilian Q.~Weinberger},
  title = {Advances in Neural Information
Processing Systems 25. Proceedings of the 2012 Conference},
  url = {http://books.nips.cc/nips25.html},
  publisher = {NIPS Foundation},
  year = {2012},
  month = dec
}
@inproceedings{bghhk-hiopllief-13,
  author = {Peter L.~Bartlett and Peter Gr\"unwald and Peter Harremo\"es and
Fares Hedayati and Wojciech Kot{\l}owski},
  title = {Horizon-Independent Optimal Prediction with Log-Loss in Exponential
Families},
  seriestitle = {JMLR Workshop and Conference Proceedings},
  booktitle = {Proceedings of the Conference on Learning Theory (COLT2013)},
  year = {2013},
  volume = {30},
  pages = {639--661},
  pdf = {http://www.jmlr.org/proceedings/papers/v30/Bartlett13.pdf},
  funding = {nsf large scale},
  abstract = {We study online learning under logarithmic loss with
regular parametric models. Hedayati and Bartlett (2012) showed that a
Bayesian prediction strategy with Jeffreys prior and sequential
normalized maximum likelihood (SNML) coincide and are optimal if and
only if the latter is exchangeable, which occurs if and only if the
optimal strategy can be calculated without knowing the time horizon in
advance. They posed the question of which families have exchangeable
SNML strategies. We answer this question for one-dimensional
exponential families: SNML is exchangeable only for three classes of
natural exponential family distributions, namely the Gaussian, the
gamma, and the Tweedie exponential family of order 3/2.}
}
@inproceedings{scb-opambla-13,
  author = {Yevgeny Seldin and Koby Crammer and Peter L.~Bartlett},
  title = {Open Problem: Adversarial Multiarmed Bandits with Limited
Advice},
  seriestitle = {JMLR Workshop and Conference Proceedings},
  booktitle = {Proceedings of the Conference on Learning Theory (COLT2013)},
  year = {2013},
  volume = {30},
  pages = {1067--1072},
  pdf = {http://www.jmlr.org/proceedings/papers/v30/Seldin13.pdf},
  funding = {nsf large scale}
}
@inproceedings{abfw-hhoaa-13,
  author = {Jacob Abernethy and Peter L.~Bartlett and Rafael Frongillo
and Andre Wibisono},
  title = {How to Hedge an Option Against an Adversary: {B}lack-{S}choles
Pricing is Minimax Optimal},
  year = {2013},
  abstract = {We consider a popular problem in finance, option pricing,
through the lens of an online learning game between Nature and an
Investor. In the Black-Scholes option pricing model from 1973, the
Investor can continuously hedge the risk of an option by trading the
underlying asset, assuming that the asset's price fluctuates according
to Geometric Brownian Motion (GBM). We consider a worst-case model, in
which Nature chooses a sequence of price fluctuations under a
cumulative quadratic volatility constraint, and the Investor can make
a sequence of hedging decisions. Our main result is to show that the
value of our proposed game, which is the ``regret'' of the hedging
strategy, converges to the Black-Scholes option price. We use
significantly weaker assumptions than previous work---for instance, we
allow large jumps in the asset price---and show that the Black-Scholes
hedging strategy is near-optimal for the Investor even in this
non-stochastic framework.},
  booktitle = {Advances in Neural Information Processing Systems 26},
  pages = {2346--2354},
  url = {http://papers.nips.cc/paper/4912-how-to-hedge-an-option-against-an-adversary-black-scholes-pricing-is-minimax-optimal},
  pdf = {http://papers.nips.cc/paper/4912-how-to-hedge-an-option-against-an-adversary-black-scholes-pricing-is-minimax-optimal.pdf},
  funding = {nsf large scale, arc laureate}
}
@inproceedings{abkss-olmdpactpd-13,
  author = {Yasin Abbasi-Yadkori and Peter L.~Bartlett and
Varun Kanade and Yevgeny Seldin and Csaba Szepesvari},
  title = {Online Learning in {M}arkov Decision Processes with
Adversarially Chosen Transition Probability Distributions},
  year = {2013},
  abstract = {We study the problem of online learning in Markov Decision
Processes (MDPs) when both the transition distributions and loss functions
are chosen by an adversary. We present an algorithm that, under a
mixing assumption, achieves $O(\sqrt{T\log|\Pi|}+\log|\Pi|)$ regret
with respect to a comparison set of policies $\Pi$. The regret
is independent of the size of the state and action spaces. When
expectations over sample paths can be computed efficiently and
the comparison set $\Pi$ has polynomial size,
this algorithm is efficient.
We also consider the episodic adversarial online shortest path
problem. Here, in each episode an adversary may choose a weighted
directed acyclic graph with an identified start and finish node. The
goal of the learning algorithm is to choose a path that minimizes
the loss while traversing from the start to finish node. At the end
of each episode the loss function (given by weights on the edges)
is revealed to the learning algorithm. The goal is to minimize regret
with respect to a fixed policy for selecting paths. This problem is
a special case of the online MDP problem. For randomly chosen graphs
and adversarial losses, this problem can be efficiently solved. We
show that it also can be efficiently solved for adversarial
graphs and randomly chosen losses. When both graphs and losses
are adversarially chosen, we present an efficient algorithm whose
regret scales linearly with the number of distinct graphs.
Finally, we show that designing efficient algorithms for the
adversarial online shortest path problem (and hence for the
adversarial MDP problem) is as hard as learning parity with noise, a
notoriously difficult problem that has been used to design efficient
cryptographic schemes.},
  booktitle = {Advances in Neural Information Processing Systems 26},
  pages = {2508--2516},
  url = {http://papers.nips.cc/paper/4975-online-learning-in-markov-decision-processes-with-adversarially-chosen-transition-probability-distributions},
  pdf = {http://papers.nips.cc/paper/4975-online-learning-in-markov-decision-processes-with-adversarially-chosen-transition-probability-distributions.pdf},
  funding = {nsf large scale, arc laureate}
}
@inproceedings{sbca-plambpo-14,
  author = {Yevgeny Seldin and Peter L.~Bartlett and Koby Crammer and
Yasin Abbasi-Yadkori},
  title = {Prediction with Limited Advice and Multiarmed Bandits with
Paid Observations},
  year = {2014},
  abstract = {We study two basic questions in online learning. The first
question is what happens between full-information and limited-feedback
games and the second question is the cost of information acquisition
in online learning. The questions are addressed by defining two
variations of standard online learning games. In the first variation,
\emph{prediction with limited advice}, we consider a game of
prediction with expert advice, where on each round of the game we
query the advice of a subset of $M$ out of $N$ experts. We present an
algorithm that achieves $O(\sqrt{(N/M)T\ln N})$ regret on $T$ rounds
of this game. The second variation, the \emph{multiarmed bandit with
paid observations}, is a variant of the adversarial $N$-armed bandit
game, where on round $t$ of the game, we can observe the reward of any
number of arms, but each observation has a cost $c$. We present an
algorithm that achieves $O((c N \ln N)^{1/3} T^{2/3})$ regret on $T$
rounds of this game. We present lower bounds that show that, apart
from the logarithmic factors, these regret bounds cannot be improved.},
  booktitle = {Proceedings of the 31st International Conference on
Machine Learning (ICML-14)},
  pages = {280--287},
  url = {http://jmlr.org/proceedings/papers/v32/seldin14.html},
  pdf = {http://jmlr.org/proceedings/papers/v32/seldin14.pdf},
  funding = {nsf large scale, arc laureate}
}
@inproceedings{abk-tat-14,
  author = {Yasin Abbasi-Yadkori and Peter L.~Bartlett and Varun Kanade},
  title = {Tracking Adversarial Targets},
  year = {2014},
  abstract = {We study linear quadratic problems with adversarial
tracking targets. We propose a Follow The Leader algorithm and show
that, under a stability condition, its regret grows as the logarithm
of the number of rounds of the game. We also study a problem with
adversarially chosen transition dynamics, for which an
exponentially-weighted average algorithm is proposed and analyzed.},
  booktitle = {Proceedings of the 31st International Conference on
Machine Learning (ICML-14)},
  pages = {369--377},
  url = {http://jmlr.org/proceedings/papers/v32/abbasi-yadkori14.html},
  pdf = {http://jmlr.org/proceedings/papers/v32/abbasi-yadkori14.pdf},
  funding = {nsf large scale, arc laureate}
}
@techreport{rrb-bevcmc-14,
  author = {J. Hyam Rubinstein and Benjamin Rubinstein and Peter Bartlett},
  title = {Bounding Embeddings of {VC} Classes into Maximum Classes},
  institution = {arXiv.org},
  year = {2014},
  number = {1401.7388},
  url = {http://arxiv.org/abs/1401.7388},
  abstract = {One of the earliest conjectures in computational learning
theory---the Sample Compression Conjecture---asserts that concept
classes (or set systems) admit compression schemes of size polynomial
in their VC dimension. To date, this statement is known to be true for
maximum classes---those that meet Sauer's Lemma, which bounds class
cardinality in terms of VC dimension, with equality. The most
promising approach to positively resolving the conjecture is by
embedding general VC classes into maximum classes without super-linear
increase to their VC dimensions, as such embeddings extend the known
compression schemes to all VC classes. We show that maximum classes
can be characterized by a local-connectivity property of the graph
obtained by viewing the class as a cubical complex. This geometric
characterization of maximum VC classes is applied to prove a negative
embedding result which demonstrates VC-$d$ classes that cannot be
embedded in any maximum class of VC dimension lower than $2d$. On the
other hand, we give a general recursive procedure for embedding VC-$d$
classes into VC-$(d+k)$ maximum classes for smallest $k$.}
}
@incollection{rrb-bevcmc-14a,
  author = {J. Hyam Rubinstein and Benjamin Rubinstein and Peter Bartlett},
  title = {Bounding Embeddings of {VC} Classes into Maximum Classes},
  year = {2014},
  booktitle = {Festschrift of Alexey Chervonenkis},
  editor = {A. Gammerman and V. Vovk},
  publisher = {Springer},
  address = {Berlin},
  abstract = {One of the earliest conjectures in computational learning
theory---the Sample Compression Conjecture---asserts that concept
classes (or set systems) admit compression schemes of size polynomial
in their VC dimension. To date, this statement is known to be true for
maximum classes---those that meet Sauer's Lemma, which bounds class
cardinality in terms of VC dimension, with equality. The most
promising approach to positively resolving the conjecture is by
embedding general VC classes into maximum classes without super-linear
increase to their VC dimensions, as such embeddings extend the known
compression schemes to all VC classes. We show that maximum classes
can be characterized by a local-connectivity property of the graph
obtained by viewing the class as a cubical complex. This geometric
characterization of maximum VC classes is applied to prove a negative
embedding result which demonstrates VC-$d$ classes that cannot be
embedded in any maximum class of VC dimension lower than $2d$. On the
other hand, we give a general recursive procedure for embedding VC-$d$
classes into VC-$(d+k)$ maximum classes for smallest $k$.},
  url = {http://arxiv.org/abs/1401.7388}
}
@techreport{abm-lplsmdp-14,
  author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Alan Malek},
  title = {Linear programming for large-scale {M}arkov decision problems},
  year = {2014},
  abstract = {We consider the problem of controlling a Markov decision
process (MDP) with a large state space, so as to minimize average
cost. Since it is intractable to compete with the optimal policy for
large scale problems, we pursue the more modest goal of competing with
a low-dimensional family of policies. We use the dual linear
programming formulation of the MDP average cost problem, in which the
variable is a stationary distribution over state-action pairs, and we
consider a neighborhood of a low-dimensional subset of the set of
stationary distributions (defined in terms of state-action features)
as the comparison class. We propose two techniques, one based on
stochastic convex optimization, and one based on constraint sampling.
In both cases, we give bounds that show that the performance of our
algorithms approaches the best achievable by any policy in the
comparison class. Most importantly, these results depend on the size
of the comparison class, but not on the size of the state space.
Preliminary experiments show the effectiveness of the proposed
algorithms in a queuing application.},
  institution = {arXiv.org},
  number = {1402.6763},
  url = {http://arxiv.org/abs/1402.6763}
}
@inproceedings{abm-lplsmdp-14a,
  author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Alan Malek},
  title = {Linear programming for large-scale {M}arkov decision problems},
  year = {2014},
  booktitle = {Proceedings of the 31st International Conference on
Machine Learning (ICML-14)},
  pages = {496--504},
  funding = {nsf large scale, arc laureate},
  abstract = {We consider the problem of controlling a Markov decision
process (MDP) with a large state space, so as to minimize average
cost. Since it is intractable to compete with the optimal policy for
large scale problems, we pursue the more modest goal of competing with
a low-dimensional family of policies. We use the dual linear
programming formulation of the MDP average cost problem, in which the
variable is a stationary distribution over state-action pairs, and we
consider a neighborhood of a low-dimensional subset of the set of
stationary distributions (defined in terms of state-action features)
as the comparison class. We propose two techniques, one based on
stochastic convex optimization, and one based on constraint sampling.
In both cases, we give bounds that show that the performance of our
algorithms approaches the best achievable by any policy in the
comparison class. Most importantly, these results depend on the size
of the comparison class, but not on the size of the state space.
Preliminary experiments show the effectiveness of the proposed
algorithms in a queuing application.},
  url = {http://jmlr.org/proceedings/papers/v32/malek14.html},
  pdf = {http://jmlr.org/proceedings/papers/v32/malek14.pdf}
}
@inproceedings{kmb-emsslg-14,
  title = {Efficient Minimax Strategies for Square Loss Games},
  author = {Wouter M.~Koolen and Alan Malek and Peter L.~Bartlett},
  booktitle = {Advances in Neural Information Processing Systems 27},
  editor = {Z. Ghahramani and M. Welling and C. Cortes and N.D. Lawrence
and K.Q. Weinberger},
  pages = {3230--3238},
  year = {2014},
  publisher = {Curran Associates, Inc.},
  url = {http://papers.nips.cc/paper/5243-efficient-minimax-strategies-for-square-loss-games.pdf},
  abstract = {We consider online prediction problems where the loss
between the prediction and the outcome is measured by the squared
Euclidean distance and its generalization, the squared Mahalanobis
distance. We derive the minimax solutions for the case where the
prediction and action spaces are the simplex (this setup is sometimes
called the Brier game) and the $\ell_2$ ball (this setup is related to
Gaussian density estimation). We show that in both cases the value of
each sub-game is a quadratic function of a simple statistic of the
state, with coefficients that can be efficiently computed using an
explicit recurrence relation. The resulting deterministic minimax
strategy and randomized maximin strategy are linear functions of the
statistic.}
}
@inproceedings{kthbjt-lmcpm-14,
  title = {Large-Margin Convex Polytope Machine},
  author = {Alex Kantchelian and Michael C.~Tschantz and Ling Huang
and Peter L.~Bartlett and Anthony D.~Joseph and J.~Doug Tygar},
  booktitle = {Advances in Neural Information Processing Systems 27},
  editor = {Z. Ghahramani and M. Welling and C. Cortes and N.D. Lawrence
and K.Q. Weinberger},
  pages = {3248--3256},
  year = {2014},
  publisher = {Curran Associates, Inc.},
  url = {http://papers.nips.cc/paper/5511-large-margin-convex-polytope-machine.pdf},
  abstract = {We present the Convex Polytope Machine (CPM), a novel
non-linear learning algorithm for large-scale binary classification
tasks. The CPM finds a large margin convex polytope separator which
encloses one class. We develop a stochastic gradient descent based
algorithm that is amenable to massive datasets, and augment it with a
heuristic procedure to avoid sub-optimal local minima. Our
experimental evaluations of the CPM on large-scale datasets from
distinct domains (MNIST handwritten digit recognition, text topic, and
web security) demonstrate that the CPM trains models faster, sometimes
by several orders of magnitude, than similar state-of-the-art approaches
and kernel-SVM methods while achieving comparable or better
classification performance. Our empirical results suggest that, unlike
prior similar approaches, we do not need to control the number of
sub-classifiers (sides of the polytope) to avoid overfitting.}
}
@inproceedings{abcm-lsmdpklcc-15,
  author = {Yasin Abbasi-Yadkori and Peter L.~Bartlett and Xi Chen and Alan Malek},
  title = {Large-Scale {M}arkov Decision Problems with {KL} Control Cost},
  year = {2015},
  month = jun,
  booktitle = {Proceedings of the 32nd International Conference on
Machine Learning (ICML-15)},
  volume = {37},
  pages = {1053--1062},
  url = {http://jmlr.org/proceedings/papers/v37/abbasi-yadkori15.html},
  pdf = {http://jmlr.org/proceedings/papers/v37/abbasi-yadkori15.pdf},
  funding = {nsf large scale, arc laureate, adobe},
  abstract = {}
}
@inproceedings{bkmtw-mfdlr-15,
  author = {Peter L.~Bartlett and Wouter Koolen and Alan
Malek and Eiji Takimoto and Manfred Warmuth},
  title = {Minimax Fixed-Design Linear Regression},
  seriestitle = {JMLR Workshop and Conference Proceedings},
  booktitle = {Proceedings of the Conference on Learning Theory (COLT2015)},
  year = {2015},
  volume = {40},
  pages = {226--239},
  month = jun,
  url = {http://jmlr.org/proceedings/papers/v40/Bartlett15.pdf},
  pdf = {http://jmlr.org/proceedings/papers/v40/Bartlett15.pdf},
  abstract = {We consider a linear regression game in which the covariates are
known in advance: at each round, the learner predicts a real value,
the adversary reveals a label, and the learner incurs a squared
error loss. The aim is to minimize the regret with respect to linear
predictions.  For a variety of constraints on the adversary's
labels, we show that the minimax optimal strategy is linear,
with a parameter choice that is reminiscent of ordinary least squares
(and as easy to compute).  The predictions depend on all covariates,
past and future, with a particular weighting assigned to future
covariates corresponding to the role that they play in the minimax
regret.  We study two families of label sequences: box constraints
(under a covariate compatibility condition), and a weighted 2-norm
constraint that emerges naturally from the analysis.  The strategy
is adaptive in the sense that it requires no knowledge of the
constraint set.  We obtain an explicit expression for the minimax
regret for these games.  For the case of uniform box constraints,
we show that, with worst case covariate sequences, the regret is
$O(d\log T)$, with no dependence on the scaling of the covariates.},
  funding = {nsf large scale, arc laureate, arc coe, simons it spring2015}
}
@inproceedings{aykmb-mtsp-15,
  author = {Wouter Koolen and Alan Malek and Peter L. Bartlett and
    Yasin Abbasi-Yadkori},
  title = {Minimax time series prediction},
  year = {2015},
  booktitle = {Advances in Neural Information Processing Systems 28},
  editor = {C. Cortes and N.D. Lawrence and D.D. Lee and M. Sugiyama and
R. Garnett},
  pages = {2548--2556},
  publisher = {Curran Associates, Inc.},
  url = {http://papers.nips.cc/paper/5730-minimax-time-series-prediction.pdf},
  abstract = {We consider an adversarial formulation of
the problem of predicting a time series with square loss.
The aim is to predict an arbitrary sequence of vectors almost as well
as the best smooth comparator sequence in retrospect.
Our approach allows natural measures of smoothness, such as the
squared norm of increments. More generally, we can consider a linear
time series model and penalize the comparator sequence through
the energy of the implied driving noise terms.
We derive the minimax strategy for all problems of this type, and
we show that it can be implemented efficiently.
The optimal predictions are linear in the previous observations.
We obtain an explicit expression for the regret in terms of
the parameters defining the problem.
For typical, simple definitions of smoothness, the computation of the
optimal predictions involves only sparse matrices.
In the case of norm-constrained data, where the smoothness is defined
in terms of the squared norm of the comparator's increments, we show
that the regret grows as $T/\sqrt\lambda$, where $T$ is the length
of the game and $\lambda$ specifies the smoothness of the comparator.},
  funding = {nsf large scale, arc laureate, arc coe, simons it spring2015}
}
@inproceedings{kbb-amdcdt-15,
  author = {Walid Krichene and Alexandre Bayen and Peter L. Bartlett},
  title = {Accelerating Mirror Descent in Continuous and Discrete time},
  booktitle = {Advances in Neural Information Processing Systems 28},
  year = {2015},
  editor = {C. Cortes and N.D. Lawrence and D.D. Lee and M. Sugiyama and
R. Garnett},
  pages = {2827--2835},
  publisher = {Curran Associates, Inc.},
  url = {http://papers.nips.cc/paper/5843-accelerated-mirror-descent-in-continuous-and-discrete-time.pdf},
  abstract = {We study accelerated mirror descent dynamics in continuous
and discrete time. Combining the original continuous-time motivation of
mirror descent with a recent ODE interpretation of Nesterov's
accelerated method, we propose a family of continuous-time descent
dynamics for convex functions with Lipschitz gradients, such that the
solution trajectories are guaranteed to converge to the optimum at a
$\mathcal{O}(1/t^2)$ rate. We then show that a large family of first-order
accelerated methods can be obtained as a discretization of the ODE,
and these methods converge at a $\mathcal{O}(1/k^2)$ rate. This connection
between accelerated mirror descent and the ODE provides an intuitive
approach to the design and analysis of accelerated first-order
algorithms.},
  funding = {nsf large scale, arc laureate, arc coe, simons asgt fall2014}
}
@techreport{aybw-lramdp-15,
  author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Stephen
    Wright},
  title = {A {L}agrangian Relaxation Approach to {M}arkov
    Decision Problems},
  institution = {UC Berkeley EECS},
  year = {2015},
  abstract = {We study Markov decision problems (MDPs) over a restricted
policy class, and show that a Lagrangian relaxation approach finds
near-optimal policies in this class efficiently.
In particular, the computational complexity depends on the number
of features used to define policies, and not on the size of the
state space. The statistical complexity also scales well: our method
requires only low-dimensional second order statistics.
Most value-function-based methods for MDPs return a policy that is
greedy with respect to the value function estimate. We discuss
drawbacks of this approach, and propose a new policy class defined for
some parameter vector $w$ by $\pi_w(a|x) = (1 - Q_w(x,a) +
\mathbb{E}_{\nu(\cdot|x)} Q_w)\,\nu(a|x)$,
where $Q_w$ is the state-action value function,
$\nu$ is a baseline policy, and the mean of $Q_w$ under $\nu(\cdot|x)$
acts as a normalizer. Similar to the
greedy and Gibbs policies, the proposed policy assigns larger
probabilities to actions with smaller value-function estimates.  We
demonstrate the effectiveness of our Lagrangian relaxation approach,
applied to this policy class, on a queueing problem and an energy
storage application.},
  funding = {nsf large scale, arc laureate, adobe}
}
@techreport{hb-ecosnmlbp-15,
  author = {Fares Hedayati and Peter L. Bartlett},
  title = {Exchangeability Characterizes Optimality of Sequential
Normalized Maximum Likelihood and {B}ayesian Prediction},
  year = {2015},
  abstract = {We study online learning under logarithmic loss with
regular parametric models. In this setting, each strategy corresponds
to a joint distribution on sequences. The minimax optimal strategy is the
normalized maximum likelihood (NML) strategy. We show 
that the sequential normalized maximum likelihood (SNML) strategy
predicts minimax optimally (i.e. as NML) if and only if the
joint distribution on sequences defined by SNML is exchangeable. This
property also characterizes the optimality of a Bayesian
prediction strategy. In that case, the optimal prior distribution is
Jeffreys prior for a broad class of parametric models for which
the maximum likelihood estimator is asymptotically normal. The optimal
prediction strategy, normalized maximum likelihood,
depends on the number $n$ of rounds of the game, in general. However,
when a Bayesian strategy is optimal, normalized maximum
likelihood becomes independent of $n$. Our proof uses this to exploit
the asymptotics of normalized maximum likelihood. The
asymptotic normality of the maximum likelihood estimator is
responsible for the necessity of Jeffreys prior.},
  institution = {UC Berkeley EECS},
  //note = {Submitted to IEEE Transactions on Information Theory},
  url = {http://www.stat.berkeley.edu/~bartlett/papers/hb-ecosnmlbp-15.pdf},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/hb-ecosnmlbp-15.pdf},
  funding = {nsf large scale, arc laureate}
}
@inproceedings{aybw-frpia-16,
  author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Stephen
    Wright},
  title = {A Fast and Reliable Policy Improvement Algorithm},
  booktitle = {Proceedings of AISTATS 2016},
  pages = {1338--1346},
  year = {2016},
  url = {http://jmlr.org/proceedings/papers/v51/abbasi-yadkori16.html},
  pdf = {http://jmlr.org/proceedings/papers/v51/abbasi-yadkori16.pdf},
  funding = {arc laureate, arc coe, adobe}
}
@inproceedings{glgob-ilccpeb-16,
  author = {Victor Gabillon and Alessandro Lazaric and Mohammad
Ghavamzadeh and Ronald Ortner and Peter L.~Bartlett},
  title = {Improved Learning Complexity in Combinatorial Pure
Exploration Bandits},
  booktitle = {Proceedings of AISTATS 2016},
  pages = {1004--1012},
  year = {2016},
  url = {http://jmlr.org/proceedings/papers/v51/gabillon16.html},
  pdf = {http://jmlr.org/proceedings/papers/v51/gabillon16.pdf},
  funding = {arc laureate, arc coe}
}
@inproceedings{kbb-aaadd-16,
  author = {Walid Krichene and Alexandre Bayen and Peter L. Bartlett},
  title = {Adaptive Averaging in Accelerated Descent Dynamics},
  booktitle = {Advances in Neural Information Processing Systems 29},
  year = {2016},
  funding = {nsf deep, arc laureate, arc coe}
}
@techreport{b-op-16,
  author = {Peter L. Bartlett},
  title = {Online prediction},
  year = {2015},
  abstract = {We review game-theoretic models of prediction, in which
the process generating the data is modelled as an adversary with whom
the prediction method competes. We present a formulation that
encompasses a wide variety of decision problems, and focus on the
relationship between prediction in this game-theoretic setting and
prediction in the more standard probabilistic setting. In particular,
we present a view of standard prediction strategies as Bayesian
decision methods, and we show how the regret of optimal strategies
depends on complexity measures that are closely related to those that
appear in probabilistic settings.},
  institution = {UC Berkeley EECS},
  //note = {Submitted to Annales de l'Institut Henri Poincar\'{e}},
  funding = {nsf large scale, arc laureate}
}
@techreport{aymbg-hnr-16,
  author = {Yasin Abbasi-Yadkori and Alan Malek and Peter L.~Bartlett
and Victor Gabillon},
  title = {Hit-and-Run for Sampling and Planning in Non-Convex Spaces},
  year = {2016},
  abstract = {We propose the Hit-and-Run algorithm for planning and sampling
problems in
non-convex spaces. For sampling, we give the first analysis of the
Hit-and-Run algorithm in non-convex spaces and show that it mixes fast as
long as certain smoothness conditions are satisfied. In particular,
our analysis reveals an intriguing connection between fast mixing and
the existence of smooth measure-preserving mappings from a convex
space to the non-convex space. For planning, we show advantages of
Hit-and-Run compared to state-of-the-art planning methods such as
Rapidly-Exploring Random Trees.},
  institution = {UC Berkeley EECS},
  //note = {Submitted to AIStats2016},
  funding = {nsf large scale, arc laureate, nsf deep learning}
}

This file was generated by bibtex2html 1.98.
