@comment{{This file has been generated by bib2bib 1.91}}
@comment{{Command line: bib2bib -ob recent-pubs.bib -c 'year>=2006 and not note : "[pP]atent"' /accounts/fac/bartlett/share/lib/bibtex/mypubs.bib}}
@article{bjm-ccrb-05,
title = {Convexity, classification, and risk bounds},
author = {Peter L.\ Bartlett and
Michael I.\ Jordan and Jon D.\ McAuliffe},
note = {(Was Department of Statistics, U.C.\ Berkeley Technical Report
number 638, 2003)},
year = {2006},
journal = {Journal of the American Statistical Association},
volume = {101},
number = {473},
pages = {138-156},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.pdf},
abstract = {Many of the classification algorithms developed in the
machine learning literature, including the support
vector machine and boosting, can be viewed as minimum
contrast methods that minimize a convex surrogate of the
0-1 loss function. The convexity makes these algorithms
computationally efficient. The use of a surrogate, however,
has statistical consequences that must be balanced against
the computational virtues of convexity. To study these
issues, we provide a general quantitative relationship between
the risk as assessed using the 0-1 loss and the risk as assessed
using any nonnegative surrogate loss function. We show that this
relationship gives nontrivial upper bounds on excess risk
under the weakest possible condition on the loss
function: that it satisfy a pointwise form of Fisher
consistency for classification. The relationship is
based on a simple variational transformation of the loss
function that is easy to compute in many applications.
We also present a refined version of this result in the
case of low noise. Finally, we present applications of
our results to the estimation of convergence rates in the
general setting of function classes that are scaled
convex hulls of a finite-dimensional base class,
with a variety of commonly used loss functions.}
}
@article{bm-em-05,
author = {Peter L.~Bartlett and Shahar Mendelson},
title = {Empirical minimization},
journal = {Probability Theory and Related Fields},
year = {2006},
volume = 135,
number = 3,
pages = {311--334},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.pdf},
abstract = {We investigate the behavior of the empirical
minimization algorithm using various methods. We first analyze it
by comparing the empirical, random, structure and the original one
on the class, either in an additive sense, via the uniform law of
large numbers, or in a multiplicative sense, using isomorphic
coordinate projections. We then show that a direct analysis of the
empirical minimization algorithm yields a significantly better
bound, and that the estimates we obtain are essentially sharp. The
method of proof we use is based on Talagrand's concentration
inequality for empirical processes.}
}
@techreport{bmp-osbeeem-05,
author = {Peter L. Bartlett and Shahar Mendelson and Petra
Philips},
title = {Optimal sample-based estimates of the
expectation of the empirical minimizer},
year = {2007},
institution = {U.C.\ Berkeley},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.pdf},
abstract = {We study sample-based estimates of the expectation of the
function produced by the empirical minimization algorithm.
We investigate the extent to which one can estimate
the rate of convergence of the empirical minimizer in a
data dependent manner. We establish three main results.
First, we provide an algorithm that upper bounds the
expectation of the empirical minimizer in a completely
data-dependent manner. This bound is based on a structural
result in
http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf,
which relates expectations to
sample averages. Second, we show that these structural
upper bounds can be loose. In particular, we demonstrate a
class for which the expectation of the empirical minimizer
decreases as $O(1/n)$ for sample size $n$, although the
upper bound based on structural properties is $\Omega(1)$.
Third, we show that this looseness of the bound is inevitable:
we present an example that shows that a sharp bound cannot
be universally recovered from empirical data.}
}
@article{bmp-osbeeem-08,
author = {Peter L. Bartlett and Shahar Mendelson and Petra
Philips},
title = {Optimal sample-based estimates of the
expectation of the empirical minimizer},
journal = {ESAIM: Probability and Statistics},
year = {2008},
note = {(Accepted)},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.pdf},
abstract = {We study sample-based estimates of the expectation of the
function produced by the empirical minimization algorithm.
We investigate the extent to which one can estimate
the rate of convergence of the empirical minimizer in a
data dependent manner. We establish three main results.
First, we provide an algorithm that upper bounds the
expectation of the empirical minimizer in a completely
data-dependent manner. This bound is based on a structural
result in
http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf,
which relates expectations to
sample averages. Second, we show that these structural
upper bounds can be loose. In particular, we demonstrate a
class for which the expectation of the empirical minimizer
decreases as $O(1/n)$ for sample size $n$, although the
upper bound based on structural properties is $\Omega(1)$.
Third, we show that this looseness of the bound is inevitable:
we present an example that shows that a sharp bound cannot
be universally recovered from empirical data.}
}
@techreport{b-freeoims-06,
author = {Peter L. Bartlett},
title = {Fast rates for estimation error and oracle
inequalities for model selection},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-06.pdf},
institution = {Department of Statistics, U.C.\ Berkeley},
number = {729},
abstract = {We consider complexity penalization methods for model
selection. These methods aim to choose a model to
optimally trade off estimation and approximation errors
by minimizing the sum of an empirical risk term and a
complexity penalty. It is well known that if we use a
bound on the maximal deviation between empirical and
true risks as a complexity penalty,
then the risk of our choice is no more
than the approximation error plus twice the complexity
penalty. There are many cases, however, where complexity
penalties like this give loose upper bounds on the
estimation error. In particular, if we choose a function
from a suitably simple convex function class with a
strictly convex loss function, then the estimation error
(the difference between the risk of the empirical risk
minimizer and the minimal risk in the class) approaches zero at a
faster rate than the maximal deviation between empirical
and true risks. In this note, we address the question of
whether it is possible to design a complexity penalized
model selection method for these situations. We show that,
provided the sequence of models is ordered by inclusion, in
these cases we can use tight upper bounds on estimation
error as a complexity penalty. Surprisingly, this is the
case even in situations when the difference between the
empirical risk and true risk (and indeed the error of
any estimate of the approximation error) decreases
much more slowly than the
complexity penalty. We give an oracle inequality
showing that the resulting model selection method chooses
a function with risk no more than the approximation error
plus a constant times the complexity penalty.}
}
@article{b-freeoims-07,
author = {Peter L. Bartlett},
title = {Fast rates for estimation error and oracle
inequalities for model selection},
year = {2008},
journal = {Econometric Theory},
volume = {24},
number = {2},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-07.pdf},
note = {(To appear. Was Department of Statistics,
U.C.\ Berkeley Technical Report number 729, 2007)},
abstract = {We consider complexity penalization methods for model
selection. These methods aim to choose a model to
optimally trade off estimation and approximation errors
by minimizing the sum of an empirical risk term and a
complexity penalty. It is well known that if we use a
bound on the maximal deviation between empirical and
true risks as a complexity penalty,
then the risk of our choice is no more
than the approximation error plus twice the complexity
penalty. There are many cases, however, where complexity
penalties like this give loose upper bounds on the
estimation error. In particular, if we choose a function
from a suitably simple convex function class with a
strictly convex loss function, then the estimation error
(the difference between the risk of the empirical risk
minimizer and the minimal risk in the class) approaches zero at a
faster rate than the maximal deviation between empirical
and true risks. In this note, we address the question of
whether it is possible to design a complexity penalized
model selection method for these situations. We show that,
provided the sequence of models is ordered by inclusion, in
these cases we can use tight upper bounds on estimation
error as a complexity penalty. Surprisingly, this is the
case even in situations when the difference between the
empirical risk and true risk (and indeed the error of
any estimate of the approximation error) decreases
much more slowly than the
complexity penalty. We give an oracle inequality
showing that the resulting model selection method chooses
a function with risk no more than the approximation error
plus a constant times the complexity penalty.}
}
@techreport{bw-crouhl-06,
author = {Peter L. Bartlett and Marten H. Wegkamp},
title = {Classification with a Reject Option using a Hinge
Loss},
year = {2006},
institution = {U.C.\ Berkeley},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.pdf},
abstract = {We consider the problem of binary classification where the
classifier can, for
a particular cost, choose not to classify an observation. Just as in
the
conventional classification problem, minimization of the sample
average of
the cost is a difficult optimization problem. As an alternative, we
propose
the optimization of a certain convex loss function f, analogous
to the
hinge loss used in support vector machines (SVMs). Its convexity
ensures that
the sample average of this surrogate loss can be efficiently
minimized. We
study its statistical properties. We show that minimizing the expected
surrogate loss---the f-risk---also minimizes the risk. We also
study
the rate at which the f-risk approaches its minimum value. We
show that
fast rates are possible when the conditional probability Pr(Y=1|X)
is
unlikely to be close to certain critical values.}
}
@article{bw-crouhl-08,
author = {Peter L. Bartlett and Marten H. Wegkamp},
title = {Classification with a Reject Option using a Hinge
Loss},
journal = {Journal of Machine Learning Research},
year = {2008},
note = {(To appear.)},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.pdf},
abstract = {We consider the problem of binary classification where the
classifier can, for
a particular cost, choose not to classify an observation. Just as in
the
conventional classification problem, minimization of the sample
average of
the cost is a difficult optimization problem. As an alternative, we
propose
the optimization of a certain convex loss function f, analogous
to the
hinge loss used in support vector machines (SVMs). Its convexity
ensures that
the sample average of this surrogate loss can be efficiently
minimized. We
study its statistical properties. We show that minimizing the expected
surrogate loss---the f-risk---also minimizes the risk. We also
study
the rate at which the f-risk approaches its minimum value. We
show that
fast rates are possible when the conditional probability Pr(Y=1|X)
is
unlikely to be close to certain critical values.}
}
@inproceedings{bt-scpskd-06,
author = {Peter L.~Bartlett and Ambuj Tewari},
title = {Sample complexity of policy search with known dynamics},
booktitle = {Advances in Neural Information Processing Systems 19},
year = 2007,
editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
publisher = {MIT Press},
address = {Cambridge, MA},
pages = {97--104},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-scpskd-06.pdf}
}
@inproceedings{rbr-soimbtmerb-06,
author = {Benjamin I. P. Rubinstein and Peter L.~Bartlett and J. Hyam
Rubinstein},
title = {Shifting, one-inclusion mistake bounds and tight multiclass
expected risk bounds},
booktitle = {Advances in Neural Information Processing Systems 19},
note = {To appear},
editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
publisher = {MIT Press},
address = {Cambridge, MA},
pages = {1193--1200},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/rbr-soimbtmerb-06.pdf}
}
@inproceedings{rb-rcckc-06,
author = {David Rosenberg and Peter L.~Bartlett},
title = {The {R}ademacher Complexity of Co-Regularized Kernel Classes},
abstract = {In the multi-view approach to semi-supervised learning, we choose
one predictor from each of multiple hypothesis classes, and we
`co-regularize' our choices by penalizing disagreement among the
predictors on the unlabeled data. In this paper we examine the
co-regularization method used in the recently proposed
co-regularized least squares (CoRLS) algorithm.
In this method we have two hypothesis
classes, each a reproducing kernel Hilbert space (RKHS), and we
co-regularize by penalizing the average squared difference in
predictions on the unlabeled data. We get our final predictor by
taking the pointwise average of the predictors from each view. We
call the set of predictors that can result from this procedure the
co-regularized hypothesis class. The main result of this paper is a
tight bound on the Rademacher complexity of the co-regularized
hypothesis class in terms of the kernel matrices of each RKHS. We
find that the co-regularization reduces the Rademacher complexity of
the hypothesis class by an amount depending on how different the two
views are, measured by a data dependent metric. We then use standard
techniques to bound the gap between training error and test error
for the CoRLS algorithm. Experimentally, we find that the amount of
reduction in complexity introduced by co-regularization correlates
with the amount of improvement that co-regularization gives in the
CoRLS algorithm},
booktitle = {Proceedings of the Eleventh International Conference on
Artificial Intelligence and Statistics},
editor = {Marina Meila and Xiaotong Shen},
note = {To appear},
year = 2007,
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/rb-rcckc-06.pdf}
}
@inproceedings{bt-aic-06,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost is Consistent},
booktitle = {Advances in Neural Information Processing Systems 19},
editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
publisher = {MIT Press},
address = {Cambridge, MA},
pages = {105--112},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-aic-06.pdf},
abstract = {The risk, or probability of error, of
the classifier produced by the AdaBoost algorithm is
investigated. In particular, we consider the stopping
strategy to be used in AdaBoost to achieve universal
consistency. We show that provided AdaBoost is stopped
after n^(1-a)$ iterations---for sample size n and
a>0---the sequence of risks of the classifiers it
produces approaches the Bayes risk.}
}
@techreport{bt-aic-06a,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost is Consistent},
institution = {U.\ C.\ Berkeley},
year = {2006}
}
@article{bjm-c-06,
author = {Peter L.\ Bartlett and
Michael I.\ Jordan and Jon D.\ McAuliffe},
title = {Comment},
journal = {Statistical Science},
year = {2006},
volume = {21},
number = {3},
pages = {341--346}
}
@inproceedings{bt-aolmc-06,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost and other Large Margin Classifiers:
Convexity in Pattern Classification},
booktitle = {Proceedings of the 5th Workshop on Defence Applications
of Signal Processing},
year = {2006},
note = {To appear}
}
@article{bm-d-06,
author = {Peter L.\ Bartlett and Shahar Mendelson},
title = {Discussion of ``2004 {IMS} {M}edallion {L}ecture:
{L}ocal {R}ademacher complexities and oracle inequalities in risk
minimization'' by {V}.\ {K}oltchinskii},
journal = {The Annals of Statistics},
year = {2006},
volume = {34},
number = {6},
pages = {2657--2663}
}
@inproceedings{tb-bpmdparc-07,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {Bounded parameter {M}arkov decision processes with average
reward criterion},
booktitle = {Proceedings of the Conference on Learning Theory},
year = {2007},
pages = {263--277}
}
@inproceedings{abr-mlea-07,
author = {Jacob Abernethy and Peter L.~Bartlett and Alexander Rakhlin},
title = {Multitask learning with expert advice},
booktitle = {Proceedings of the Conference on Learning Theory},
year = {2007},
pages = {484-498},
abstract = {We consider the problem of prediction with expert advice
in the setting where a forecaster is presented with several
online prediction tasks. Instead of competing against the best
expert separately on each task, we assume the tasks are
related, and thus we expect that a few experts will perform
well on the entire set of tasks. That is, our forecaster would
like, on each task, to compete against the best expert
chosen from a small set of experts. While we describe
the `ideal' algorithm and its performance bound, we show
that the computation required for this algorithm is as hard as
computation of a matrix permanent. We present an efficient
algorithm based on mixing priors, and prove a bound that is
nearly as good for the sequential task presentation case. We
also consider a harder case where the task may change
arbitrarily from round to round, and we develop an efficient
randomized algorithm based on Markov chain Monte Carlo
techniques.}
}
@inproceedings{arb--07,
author = {Alexander Rakhlin and Jacob Abernethy and
Peter L.~Bartlett},
title = {Online discovery of similarity mappings},
booktitle = {Proceedings of the 24th International Conference on Machine
Learning (ICML-2007)},
year = {2007},
note = {To appear},
pages = {},
abstract = {We consider the problem of choosing,
sequentially, a map which assigns elements of a set A
to a few elements of a set B. On each round, the
algorithm suffers some cost associated with the chosen
assignment, and the goal is to minimize the cumulative loss
of these choices relative to the best map on the entire
sequence. Even though the offline problem of finding the best
map is provably hard, we show that there is an equivalent
online approximation algorithm, Randomized Map Prediction
(RMP), that is efficient and performs nearly as well. While
drawing upon results from the `Online Prediction with
Expert Advice' setting, we show how RMP can be utilized as
an online approach to several standard batch problems. We
apply RMP to online clustering as well as online feature
selection and, surprisingly, RMP often outperforms the
standard batch algorithms on these problems.}
}
@article{bt-aic-07,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost is Consistent},
journal = {Journal of Machine Learning Research},
volume = {8},
pages = {2347--2368},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-aic-07.pdf},
abstract = {The risk, or probability of error, of
the classifier produced by the AdaBoost algorithm is
investigated. In particular, we consider the stopping
strategy to be used in AdaBoost to achieve universal
consistency. We show that provided AdaBoost is stopped
after n^(1-a) iterations---for sample size n and
0
@techreport{rbr-soimbtmerb-07,
author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
J.~Hyam Rubinstein},
title = {Shifting: one-inclusion mistake bounds and sample
compression},
institution = {EECS Department, University of California, Berkeley},
pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf},
year = {2007},
abstract = {We present new expected risk bounds for binary and
multiclass prediction, and resolve several recent conjectures
on sample compressibility due to Kuzmin and Warmuth. By
exploiting the combinatorial structure of concept class F,
Haussler et al. achieved a VC(F)/n bound for the natural one-inclusion
prediction strategy. The key step in their proof is a d=VC(F)
bound on the graph density of a subgraph of the
hypercube - one-inclusion graph. The first main result of
this report is a density bound of
n choose(n-1,<=d-1)/choose(n,<=d) < d, which positively resolves
a conjecture of Kuzmin and Warmuth relating to their unlabeled
Peeling compression scheme and also leads to an improved
one-inclusion mistake bound. The proof uses a new form of
VC-invariant shifting and a group-theoretic symmetrization.
Our second main result is an algebraic topological property of
maximum classes of VC-dimension d as being d-contractible
simplicial complexes, extending the well-known
characterization that d=1 maximum classes are trees. We
negatively resolve a minimum degree conjecture of Kuzmin and
Warmuth - the second part to a conjectured proof of
correctness for Peeling - that every class has one-inclusion
minimum degree at most its VC-dimension. Our final main result
is a k-class analogue of the d/n mistake bound, replacing the
VC-dimension by the Pollard pseudo-dimension and the
one-inclusion strategy by its natural hypergraph
generalization. This result improves on known PAC-based
expected risk bounds by a factor of O(log n) and is shown to
be optimal up to a O(log k) factor. The combinatorial
technique of shifting takes a central role in understanding
the one-inclusion (hyper)graph and is a running theme
throughout.}
}
@article{rbr-soimbtmerb-08,
author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
J.~Hyam Rubinstein},
title = {Shifting: one-inclusion mistake bounds and sample
compression},
journal = {Journal of Computer and System Sciences},
note = {(To appear. Was University of California, Berkeley, EECS Department
Technical Report EECS-2007-86)},
pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf},
year = {2008}
}
@unpublished{rb-bbspngp-07,
author = {David Rosenberg and Peter L.~Bartlett},
title = {On bounds for {B}ayesian sequence prediction with
non-{G}aussian priors},
note = {Technical Report},
year = {2007},
abstract = {We present worst-case log-loss regret bounds for the
Bayesian model averaging algorithm in the regression setting.
Our work generalizes earlier work that gives bounds of a
similar form that only hold for Gaussian priors. Our bounds
hold for arbitrary priors, though the regret term now includes
a term involving the modulus of continuity of the prior, as
well as the value of the prior at the point of comparison.}
}
@unpublished{abrt-mlbocg-07,
author = {Jacob Abernethy and Peter L.~Bartlett and Alexander
Rakhlin and Ambuj Tewari},
title = {Minimax Lower Bounds for Online Convex Games},
note = {Technical Report},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrt-mlbocg-07.pdf},
abstract = {A number of learning problems can be cast as an Online
Convex Game: on each round, a learner makes a prediction $x$
from a convex set, the environment plays a loss function $f$,
and the learner's long-term goal is to minimize regret. Algorithms
have been proposed by Zinkevich, when $f$ is assumed to be
convex, and Hazan et al, when $f$ is assumed to be strongly
convex, that have provably low regret. We consider these two
settings and analyze such games from a minimax perspective,
proving lower bounds in each case. These results prove that the
existing algorithms are essentially optimal.}
}
@inproceedings{bhr-aogd-07,
author = {Peter L.~Bartlett and Elad Hazan and Alexander Rakhlin},
title = {Adaptive online gradient descent},
booktitle = {Advances in Neural Information Processing
Systems 20},
editor = {Daphne Koller and Yoram Singer and John Platt},
publisher = {MIT Press},
address = {Cambridge, MA},
year = {2008},
note = {To appear},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bhr-aogd-07.pdf},
abstract = {We study the rates of growth of the regret in online
convex optimization. First, we show that a simple extension of
the algorithm of Hazan et al eliminates the need for a priori
knowledge of the lower bound on the second derivatives of the
observed functions. We then provide an algorithm, Adaptive
Online Gradient Descent, which interpolates between the
results of Zinkevich for linear functions and of Hazan et al
for strongly convex functions, achieving intermediate rates
between $\sqrt{T}$ and $\log T$. Furthermore, we show strong
optimality of the algorithm. Finally, we provide an extension
of our results to general norms.}
}
@inproceedings{tb-olplrim-07,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {Optimistic linear programming gives logarithmic regret for
irreducible MDPs},
booktitle = {Advances in Neural Information Processing
Systems 20},
editor = {Daphne Koller and Yoram Singer and John Platt},
publisher = {MIT Press},
address = {Cambridge, MA},
year = {2008},
note = {To appear},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/tb-olplrim-07.pdf},
abstract = {We present an algorithm called Optimistic Linear
Programming (OLP) for learning to optimize average reward in
an irreducible but otherwise unknown Markov decision process
(MDP). OLP uses its experience so far to estimate the MDP. It
chooses actions by optimistically maximizing estimated future
rewards over a set of next-state transition probabilities that
are close to the estimates: a computation that corresponds to
solving linear programs. We show that the total expected
reward obtained by OLP up to time $T$ is within $C(P)\log T$
of the reward obtained by the optimal policy, where $C(P)$ is
an explicit, MDP-dependent constant. OLP is closely related to
an algorithm proposed by Burnetas and Katehakis with four key
differences: OLP is simpler, it does not require knowledge of
the supports of transition probabilities and the proof of the
regret bound is simpler, but our regret bound is a constant
factor larger than the regret of their algorithm. OLP is also
similar in flavor to an algorithm recently proposed by Auer
and Ortner. But OLP is simpler and its regret bound has a
better dependence on the size of the MDP.}
}
@techreport{abr-mlea-07a,
author = {Jacob Duncan Abernethy and Peter L.~Bartlett
and Alexander Rakhlin},
title = {Multitask Learning with Expert Advice},
institution = {EECS Department, University of California, Berkeley},
year = {2007},
url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-20.html},
number = {UCB/EECS-2007-20}
}
@techreport{cgkcb-egacrfmmmn-07,
author = {Michael Collins and Amir Globerson and Terry Koo and Xavier
Carreras and Peter L.~Bartlett},
title = {Exponentiated gradient algorithms for conditional random
fields and max-margin {M}arkov networks},
institution = {U.C.\ Berkeley},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/cgkcb-egacrfmmmn-07.pdf},
abstract = {Log-linear and maximum-margin models are two
commonly used methods in supervised machine learning, and
are frequently used in structured prediction problems.
Efficient learning of parameters in these models is
therefore an important problem, and becomes a key factor
when learning from very large data sets. This paper
describes exponentiated gradient (EG) algorithms for
training such models, where EG updates are applied to
the convex dual of either the log-linear or max-margin
objective function; the dual in both the log-linear
and max-margin cases corresponds to minimizing a convex
function with simplex constraints. We study both batch
and online variants of the algorithm, and provide rates
of convergence for both cases. In the max-margin case,
$O({1\over\epsilon})$ EG updates are required to reach a
given accuracy $\epsilon$ in the dual; in contrast, for
log-linear models only $O(\log({ 1\over\epsilon}))$ updates
are required. For both the max-margin and log-linear cases,
our bounds suggest that the online algorithm requires
a factor of $n$ less computation to reach a desired
accuracy, where $n$ is the number of training examples. Our
experiments confirm that the online algorithms are much
faster than the batch algorithms in practice. We describe
how the EG updates factor in a convenient way for
structured prediction problems, allowing the algorithms
to be efficiently applied to problems such as sequence
learning or natural language parsing. We perform extensive
evaluation of the algorithms, comparing them to to L-BFGS
and stochastic gradient descent for log-linear models,
and to SVM-Struct for max-margin models. The algorithms are
applied to multi-class problems as well as a more complex
large-scale parsing task. In all these settings, the EG
algorithms presented here outperform the other methods.}
}
@article{cgkcb-egacrfmmmn-08,
author = {Michael Collins and Amir Globerson and Terry Koo and Xavier
Carreras and Peter L.~Bartlett},
title = {Exponentiated gradient algorithms for conditional random
fields and max-margin {M}arkov networks},
journal = {Journal of Machine Learning Research},
year = {2008},
note = {(Accepted)},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/cgkcb-egacrfmmmn-07.pdf},
abstract = {Log-linear and maximum-margin models are two
commonly used methods in supervised machine learning, and
are frequently used in structured prediction problems.
Efficient learning of parameters in these models is
therefore an important problem, and becomes a key factor
when learning from very large data sets. This paper
describes exponentiated gradient (EG) algorithms for
training such models, where EG updates are applied to
the convex dual of either the log-linear or max-margin
objective function; the dual in both the log-linear
and max-margin cases corresponds to minimizing a convex
function with simplex constraints. We study both batch
and online variants of the algorithm, and provide rates
of convergence for both cases. In the max-margin case,
$O({1\over\epsilon})$ EG updates are required to reach a
given accuracy $\epsilon$ in the dual; in contrast, for
log-linear models only $O(\log({ 1\over\epsilon}))$ updates
are required. For both the max-margin and log-linear cases,
our bounds suggest that the online algorithm requires
a factor of $n$ less computation to reach a desired
accuracy, where $n$ is the number of training examples. Our
experiments confirm that the online algorithms are much
faster than the batch algorithms in practice. We describe
how the EG updates factor in a convenient way for
structured prediction problems, allowing the algorithms
to be efficiently applied to problems such as sequence
learning or natural language parsing. We perform extensive
evaluation of the algorithms, comparing them to to L-BFGS
and stochastic gradient descent for log-linear models,
and to SVM-Struct for max-margin models. The algorithms are
applied to multi-class problems as well as a more complex
large-scale parsing task. In all these settings, the EG
algorithms presented here outperform the other methods.}
}
@inproceedings{bdhkrt-hprbbolo-08,
title = {High-Probability Regret Bounds for Bandit Online Linear
Optimization},
author = {Peter L.~Bartlett and Varsha Dani and Thomas Hayes and
Sham Kakade and Alexander Rakhlin and Ambuj Tewari},
booktitle = {Proceedings of the 21st Annual Conference on Learning
Theory (COLT 2008)},
year = {2008},
note = {(To appear)},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bdhkrt-hprbbolo-08.pdf}
}
@inproceedings{abrt-osmlbocg-08,
title = {Optimal strategies and minimax lower bounds for online convex
games},
author = {Jacob Abernethy and Peter L. Bartlett and
Alexander Rakhlin and Ambuj Tewari},
booktitle = {Proceedings of the 21st Annual Conference on Learning
Theory (COLT 2008)},
year = {2008},
note = {(To appear)},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrt-osmlbocg-08.pdf}
}
@article{tewari07consistency,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {On the Consistency of Multiclass Classification Methods},
journal = {Journal of Machine Learning Research},
year = {2007},
volume = {8},
month = {May},
pages = {1007--1025},
note = {(Invited paper)},
url = {http://jmlr.csail.mit.edu/papers/v8/tewari07a.html}
}
@article{bartlett07sparseness,
author = {Peter L. Bartlett and Ambuj Tewari},
title = {Sparseness vs Estimating Conditional Probabilities: Some
Asymptotic Results},
journal = {Journal of Machine Learning Research},
year = {2007},
volume = {8},
month = {April},
pages = {775--790},
url = {http://jmlr.csail.mit.edu/papers/v8/bartlett07a.html}
}
@article{lbw-c-08,
title = {Correction to {\em The Importance of Convexity in Learning
with Squared Loss}},
author = {Wee Sun Lee and Peter L.~Bartlett and
Robert C.~Williamson},
journal = {IEEE Transactions on Information Theory},
year = {2008},
note = {(To appear)},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/lbw-c-08.pdf}
}
@unpublished{ab-mamssl-08,
title = {Margin-adaptive model selection in statistical learning},
author = {Sylvain Arlot and Peter L.~Bartlett},
note = {Submitted},
year = {2008},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/ab-mamssl-08.pdf}
}
This file was generated by
bibtex2html 1.91.
Books
Journal papers, book chapters
All Publications