@comment{{This file has been generated by bib2bib 1.99}}
@comment{{Command line: bib2bib --remove funding --remove notetoomit -ob articles.bib -c '$type = "ARTICLE" or $type = "INCOLLECTION"' ../mypubs.bib}}
@article{lbd-evbsbwie-92, author = {D. R. Lovell and P. L. Bartlett and T. Downs}, title = {Error and variance bounds on sigmoidal neurons with weight and input errors}, journal = {Electronics Letters}, volume = {28}, number = {8}, pages = {760--762}, year = {1992} }
@article{bd-urwtmlnhlu-92, author = {P. L. Bartlett and T. Downs}, title = {Using random weights to train multi-layer networks of hard-limiting units}, journal = {IEEE Transactions on Neural Networks}, volume = {3}, number = {2}, pages = {202--210}, year = {1992} }
@article{b-vcdbttln-93, author = {P. L. Bartlett}, title = {{V}apnik-{C}hervonenkis dimension bounds for two- and three-layer networks}, journal = {Neural Computation}, volume = {5}, number = {3}, pages = {371--373}, year = {1993} }
@incollection{b-clt-94, author = {P. L. Bartlett}, title = {Computational Learning Theory}, booktitle = {Encyclopedia of Computer Science and Technology}, volume = {31}, pages = {83--99}, editor = {A. Kent and J. G. Williams}, publisher = {Marcel Dekker}, address = {New York}, year = {1994} }
@article{lbw-lbvspfc-95, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {Lower bounds on the {VC}-dimension of smoothly parametrized function classes}, journal = {Neural Computation}, volume = {7}, pages = {990--1002}, year = {1995}, note = {(See also correction, Neural Computation, 9: 765--769, 1997)} }
@article{lbw-ealnnbf-96, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {Efficient agnostic learning of neural networks with bounded fan-in}, journal = {IEEE Transactions on Information Theory}, volume = {42}, number = {6}, pages = {2118--2132}, year = {1996} }
@article{abist-vgai-96, author = {M. Anthony and P. L. Bartlett and Y. Ishai and J. Shawe-Taylor}, title = {Valid generalisation from approximate interpolation}, journal = {Combinatorics, Probability, and Computing}, volume = {5}, pages = {191--214}, year = {1996} }
@article{blw-fslrvf-96, author = {P. L. Bartlett and P. M. Long and R. C. Williamson}, title = {Fat-shattering and the learnability of real-valued functions}, journal = {Journal of Computer and System Sciences}, note = {(special issue on COLT'94)}, volume = {52}, number = {3}, pages = {434--452}, year = {1996} }
@article{bw-vcdptlnndi-96, author = {P. L. Bartlett and R. C. Williamson}, title = {The {V}apnik-{C}hervonenkis dimension and pseudodimension of two-layer neural networks with discrete inputs}, journal = {Neural Computation}, volume = {8}, pages = {653--656}, year = {1996} }
@article{bkp-cnrvfc-97, author = {P. L. Bartlett and S. R. Kulkarni and S. E. Posner}, title = {Covering numbers for real-valued function classes}, journal = {IEEE Transactions on Information Theory}, volume = {43}, number = {5}, pages = {1721--1724}, year = {1997} }
@article{b-brnnpr-97, author = {P. L. Bartlett}, title = {Book Review: `{N}eural networks for pattern recognition,' {C}hristopher {M}.~{B}ishop}, journal = {Statistics in Medicine}, volume = {16}, number = {20}, pages = {2385--2386}, year = {1997} }
@article{lbw-clbvspfc-97, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {Correction to `Lower bounds on the {VC}-dimension of smoothly parametrized function classes'}, journal = {Neural Computation}, volume = {9}, pages = {765--769}, year = {1997} }
@article{sfbl-bmneevm-98, author = {R. E. Schapire and Y. Freund and P. L. Bartlett and W. S. Lee}, title = {Boosting the margin: a new explanation for the effectiveness of voting methods}, journal = {Annals of Statistics}, volume = {26}, number = {5}, pages = {1651--1686}, year = {1998} }
@article{bmm-alvdbppn-98, author = {P. L. Bartlett and V. Maiorov and R. Meir}, title = {Almost linear {VC} dimension bounds for piecewise polynomial networks}, journal = {Neural Computation}, volume = {10}, number = {8}, pages = {2159--2173}, year = {1998} }
@article{lbw-iclsl-98, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {The importance of convexity in learning with squared loss}, journal = {IEEE Transactions on Information Theory}, volume = {44}, number = {5}, pages = {1974--1980}, year = {1998} }
@article{stbwa-srmddh-98, author = {J. Shawe-Taylor and P. L. Bartlett and R. C. Williamson and M. Anthony}, title = {Structural Risk Minimization over Data-Dependent Hierarchies}, journal = {IEEE Transactions on Information Theory}, volume = {44}, number = {5}, pages = {1926--1940}, year = {1998} }
@article{bll-mdreqd-98, author = {P. L. Bartlett and T. Linder and G. Lugosi}, title = {The minimax distortion redundancy in empirical quantizer design}, journal = {IEEE Transactions on Information Theory}, volume = {44}, number = {5}, pages = {1802--1813}, year = {1998} }
@article{bk-cmcsnd-98, author = {P. L. Bartlett and S. Kulkarni}, title = {The complexity of model classes, and smoothing noisy data}, journal = {Systems and Control Letters}, volume = {34}, number = {3}, pages = {133--140}, year = {1998} }
@article{bv-isilt-98, author = {P. L. Bartlett and M. Vidyasagar}, title = {Introduction to the special issue on learning theory}, journal = {Systems and Control Letters}, volume = {34}, pages = {113--114}, year = {1998} }
@article{kbb-ocpcle-98, author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett}, title = {Optimal controller properties from closed-loop experiments}, journal = {Automatica}, volume = {34}, number = {1}, pages = {83--91}, year = {1998} }
@article{bl-plussd-98, author = {P. L. Bartlett and P. M. Long}, title = {Prediction, learning, uniform convergence, and scale-sensitive dimensions}, journal = {Journal of Computer and System Sciences}, volume = {56}, number = {2}, pages = {174--190}, year = {1998}, note = {(special issue on COLT'95)} }
@article{b-scpcnn-98, author = {P. L. Bartlett}, title = {The sample complexity of pattern classification with neural networks: the size of the weights is more important than the size of the network}, journal = {IEEE Transactions on Information Theory}, volume = {44}, number = {2}, pages = {525--536}, year = {1998} }
@article{bl-iudsam-99, author = {P. L. Bartlett and G. Lugosi}, title = {An inequality for uniform deviations of sample averages from their means}, journal = {Statistics and Probability Letters}, volume = {44}, number = {1}, pages = {55--62}, year = {1999} }
@incollection{b-ennl-99, author = {P. L. Bartlett}, title = {Efficient neural network learning}, booktitle = {Open Problems in Mathematical Systems Theory and Control}, editor = {V. D. Blondel and E. D. Sontag and M. Vidyasagar and J. C. Willems}, pages = {35--38}, publisher = {Springer Verlag}, address = {London}, year = {1999} }
@incollection{bst-gpsvmopc-99, author = {P. L. Bartlett and J. Shawe-Taylor}, title = {Generalization performance of support vector machines and other pattern classifiers}, booktitle = {Advances in Kernel Methods -- Support Vector Learning}, editor = {B. Sch{\"o}lkopf and C. J. C. Burges and A. J. Smola}, pages = {43--54}, publisher = {MIT Press}, address = {Cambridge, MA}, year = {1999} }
@article{ab-fli-00, author = {M. Anthony and P. L. Bartlett}, title = {Function learning from interpolation}, journal = {Combinatorics, Probability, and Computing}, volume = {9}, pages = {213--225}, year = {2000} }
@incollection{mbbf-fgtch-00, author = {L. Mason and J. Baxter and P. L. Bartlett and M. Frean}, title = {Functional Gradient Techniques for Combining Hypotheses}, booktitle = {Advances in Large Margin Classifiers}, editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans}, pages = {221--246}, publisher = {MIT Press}, address = {Cambridge, MA}, year = {2000} }
@incollection{sbss-ilmc-00, author = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans}, title = {Introduction to Large Margin Classifiers}, booktitle = {Advances in Large Margin Classifiers}, editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans}, pages = {1--29}, publisher = {MIT Press}, address = {Cambridge, MA}, year = {2000} }
@article{bbdk-lccesc-00, author = {P. L. Bartlett and S. Ben-David and S. R. Kulkarni}, title = {Learning changing concepts by exploiting the structure of change}, journal = {Machine Learning}, volume = {41}, number = {2}, pages = {153--174}, year = {2000} }
@article{ppb-pace-00, author = {S. Parameswaran and M. F. Parkinson and P. L. Bartlett}, title = {Profiling in the {ASP} codesign environment}, journal = {Journal of Systems Architecture}, volume = {46}, number = {14}, pages = {1263--1274}, year = {2000} }
@article{sswb-nsva-00, author = {B. Sch{\"o}lkopf and A. Smola and R. C. Williamson and P. L. Bartlett}, title = {New support vector algorithms}, journal = {Neural Computation}, volume = {12}, number = {5}, pages = {1207--1245}, year = {2000} }
@article{kbb-ditsa-00, author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett}, title = {Direct iterative tuning via spectral analysis}, journal = {Automatica}, volume = {36}, number = {9}, pages = {1301--1307}, year = {2000} }
@article{mbb-igeom-00, author = {L. Mason and P. L. Bartlett and J. Baxter}, title = {Improved generalization through explicit optimization of margins}, journal = {Machine Learning}, volume = {38}, number = {3}, pages = {243--255}, year = {2000} }
@article{bb-ihgbps-01, author = {J. Baxter and P. L. Bartlett}, title = {Infinite-Horizon Policy-Gradient Estimation}, journal = {Journal of Artificial Intelligence Research}, volume = {15}, pages = {319--350}, year = {2001}, url = {http://www.cs.washington.edu/research/jair/abstracts/baxter01a.html} }
@article{bbw-ihgbps2gaae-01, author = {J. Baxter and P. L. Bartlett and L. Weaver}, title = {Experiments with Infinite-Horizon, Policy-Gradient Estimation}, journal = {Journal of Artificial Intelligence Research}, volume = {15}, pages = {351--381}, year = {2001}, url = {http://www.cs.washington.edu/research/jair/abstracts/baxter01b.html} }
@article{gbstw-cnsvm-01, author = {Y. Guo and P. L. Bartlett and J. Shawe-Taylor and R. C. Williamson}, title = {Covering numbers for support vector machines}, journal = {IEEE Transactions on Information Theory}, volume = {48}, number = {1}, pages = {239--250}, year = {2002} }
@article{bm-rgcrbsr-02, author = {P. L. Bartlett and S. Mendelson}, title = {Rademacher and {G}aussian complexities: Risk bounds and structural results}, journal = {Journal of Machine Learning Research}, volume = {3}, pages = {463--482}, pdf = {http://www.jmlr.org/papers/volume3/bartlett02a/bartlett02a.pdf}, year = {2002} }
@article{bbl-msee-02, author = {P. L. Bartlett and S. Boucheron and G. Lugosi}, title = {Model selection and error estimation}, journal = {Machine Learning}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bbl-msee.ps.gz}, volume = {48}, pages = {85--113}, year = {2002} }
@article{bb-eabgbrl-02, author = {P. L. Bartlett and J. Baxter}, title = {Estimation and approximation bounds for gradient-based reinforcement learning}, journal = {Journal of Computer and System Sciences}, volume = {64}, number = {1}, pages = {133--150}, year = {2002} }
@article{bbd-hrnnap-02, author = {P. L. Bartlett and S. Ben-David}, title = {Hardness results for neural network approximation problems}, journal = {Theoretical Computer Science}, note = {(special issue on Eurocolt'99)}, year = {2002}, volume = {284}, number = {1}, pages = {53--66}, url = {http://dx.doi.org/10.1016/S0304-3975(01)00057-3} }
@article{bfh-erwl-02, author = {P. L. Bartlett and P. Fischer and K.-U. H{\"o}ffgen}, title = {Exploiting random walks for learning}, journal = {Information and Computation}, year = {2002}, volume = {176}, number = {2}, pages = {121--135}, url = {http://dx.doi.org/10.1006/inco.2002.3083} }
@article{mbg-gecc-02, author = {L. Mason and P. L. Bartlett and M. Golea}, title = {Generalization error of combined classifiers}, journal = {Journal of Computer and System Sciences}, year = {2002}, volume = {65}, number = {2}, pages = {415--438}, url = {http://dx.doi.org/10.1006/jcss.2002.1854} }
@incollection{bm-vcdnn-03, author = {Peter L. Bartlett and Wolfgang Maass}, title = {{V}apnik-{C}hervonenkis dimension of neural nets}, editor = {Michael A.~Arbib}, booktitle = {The Handbook of Brain Theory and Neural Networks}, publisher = {MIT Press}, address = {Cambridge, MA}, note = {Second Edition}, pages = {1188--1192}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-vcdnn-03.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-vcdnn-03.pdf}, year = {2003} }
@incollection{b-irltvfm-03, author = {Peter L. Bartlett}, title = {An introduction to reinforcement learning theory: value function methods}, booktitle = {Advanced Lectures on Machine Learning}, series = {Lecture Notes in Computer Science}, volume = {2600}, editor = {Shahar Mendelson and Alexander J. Smola}, isbn = {3-540-00529-3}, pages = {184--202}, publisher = {Springer}, html = {http://link.springer.de/link/service/series/0558/bibs/2600/26000184.htm}, year = {2003} }
@article{lcbegj-lkmsdp-04, author = {G. Lanckriet and N. Cristianini and P. L. Bartlett and L. El Ghaoui and M. Jordan}, title = {Learning the kernel matrix with semi-definite programming}, journal = {Journal of Machine Learning Research}, pdf = {http://www.jmlr.org/papers/volume5/lanckriet04a/lanckriet04a.pdf}, ps = {http://www.jmlr.org/papers/volume5/lanckriet04a/lanckriet04a.ps.gz}, year = {2004}, volume = {5}, pages = {27--72} }
@article{gbb-vrtgerl-04, author = {E. Greensmith and P. L. Bartlett and J. Baxter}, title = {Variance reduction techniques for gradient estimates in reinforcement learning}, journal = {Journal of Machine Learning Research}, volume = {5}, year = {2004}, pages = {1471--1530}, url = {http://jmlr.csail.mit.edu/papers/volume5/greensmith04a/greensmith04a.pdf} }
@article{bjm-d-03, title = {Discussion of boosting papers}, author = {Peter L.\ Bartlett and Michael I.\ Jordan and Jon D.\ McAuliffe}, journal = {The Annals of Statistics}, year = {2004}, volume = {32}, number = {1}, pages = {85--91}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-d-03.ps.Z}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-d-03.pdf} }
@article{bbm-lrc-05, title = {Local {R}ademacher complexities}, author = {Peter L.~Bartlett and Olivier Bousquet and Shahar Mendelson}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bbm-lrc-05.ps}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bbm-lrc-05.pdf}, journal = {Annals of Statistics}, volume = {33}, number = {4}, pages = {1497--1537}, year = {2005}, abstract = {We propose new bounds on the error of learning algorithms in terms of a data-dependent notion of complexity. The estimates we establish give optimal rates and are based on a local and empirical version of Rademacher averages, in the sense that the Rademacher averages are computed from the data, on a subset of functions with small empirical error. We present some applications to prediction with bounded loss, and to regression with a convex loss function and a convex function class.} }
@article{bjm-ccrb-05, title = {Convexity, classification, and risk bounds}, author = {Peter L.\ Bartlett and Michael I.\ Jordan and Jon D.\ McAuliffe}, note = {(Was Department of Statistics, U.C.\ Berkeley Technical Report number 638, 2003)}, year = {2006}, journal = {Journal of the American Statistical Association}, volume = {101}, number = {473}, pages = {138--156}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.pdf}, abstract = {Many of the classification algorithms developed in the machine learning literature, including the support vector machine and boosting, can be viewed as minimum contrast methods that minimize a convex surrogate of the 0-1 loss function. The convexity makes these algorithms computationally efficient. The use of a surrogate, however, has statistical consequences that must be balanced against the computational virtues of convexity. To study these issues, we provide a general quantitative relationship between the risk as assessed using the 0-1 loss and the risk as assessed using any nonnegative surrogate loss function. We show that this relationship gives nontrivial upper bounds on excess risk under the weakest possible condition on the loss function: that it satisfy a pointwise form of Fisher consistency for classification. The relationship is based on a simple variational transformation of the loss function that is easy to compute in many applications. We also present a refined version of this result in the case of low noise. Finally, we present applications of our results to the estimation of convergence rates in the general setting of function classes that are scaled convex hulls of a finite-dimensional base class, with a variety of commonly used loss functions.} }
@article{bm-em-05, author = {Peter L.~Bartlett and Shahar Mendelson}, title = {Empirical minimization}, journal = {Probability Theory and Related Fields}, year = {2006}, volume = 135, number = 3, pages = {311--334}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.pdf}, abstract = {We investigate the behavior of the empirical minimization algorithm using various methods. We first analyze it by comparing the empirical, random, structure and the original one on the class, either in an additive sense, via the uniform law of large numbers, or in a multiplicative sense, using isomorphic coordinate projections. We then show that a direct analysis of the empirical minimization algorithm yields a significantly better bound, and that the estimates we obtain are essentially sharp. The method of proof we use is based on Talagrand's concentration inequality for empirical processes.} }
@article{bjm-c-06, author = {Peter L.\ Bartlett and Michael I.\ Jordan and Jon D.\ McAuliffe}, title = {Comment}, journal = {Statistical Science}, year = {2006}, volume = {21}, number = {3}, pages = {341--346} }
@article{bm-d-06, author = {Peter L.\ Bartlett and Shahar Mendelson}, title = {Discussion of ``2004 {IMS} {M}edallion {L}ecture: {L}ocal {R}ademacher complexities and oracle inequalities in risk minimization'' by {V}.\ {K}oltchinskii}, journal = {The Annals of Statistics}, year = {2006}, volume = {34}, number = {6}, pages = {2657--2663} }
@article{bt-aic-07, author = {Peter L.~Bartlett and Mikhail Traskin}, title = {AdaBoost is Consistent}, journal = {Journal of Machine Learning Research}, volume = {8}, pages = {2347--2368}, year = {2007}, url = {http://jmlr.csail.mit.edu/papers/volume8/bartlett07b/bartlett07b.pdf}, pdf = {http://jmlr.csail.mit.edu/papers/volume8/bartlett07b/bartlett07b.pdf}, abstract = {The risk, or probability of error, of the classifier produced by the AdaBoost algorithm is investigated. In particular, we consider the stopping strategy to be used in AdaBoost to achieve universal consistency. We show that provided AdaBoost is stopped after n^(1-a) iterations---for sample size n and 0<a<1---the sequence of risks of the classifiers it produces approaches the Bayes risk.} }
@article{tewari07consistency, author = {Ambuj Tewari and Peter L.~Bartlett}, title = {On the Consistency of Multiclass Classification Methods}, journal = {Journal of Machine Learning Research}, year = {2007}, volume = {8}, month = {May}, pages = {1007--1025}, note = {(Invited paper)}, url = {http://jmlr.csail.mit.edu/papers/v8/tewari07a.html} }
@article{bartlett07sparseness, author = {Peter L. Bartlett and Ambuj Tewari}, title = {Sparseness vs Estimating Conditional Probabilities: Some Asymptotic Results}, journal = {Journal of Machine Learning Research}, year = {2007}, volume = {8}, month = {April}, pages = {775--790}, url = {http://jmlr.csail.mit.edu/papers/v8/bartlett07a.html} }
@article{b-freeoims-07, author = {Peter L. Bartlett}, title = {Fast rates for estimation error and oracle inequalities for model selection}, year = {2008}, month = apr, journal = {Econometric Theory}, volume = {24}, number = {2}, pages = {545--552}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-07.pdf}, note = {(Was Department of Statistics, U.C.\ Berkeley Technical Report number 729, 2007)}, doi = {http://dx.doi.org/10.1017/S0266466608080225}, abstract = {We consider complexity penalization methods for model selection. These methods aim to choose a model to optimally trade off estimation and approximation errors by minimizing the sum of an empirical risk term and a complexity penalty. It is well known that if we use a bound on the maximal deviation between empirical and true risks as a complexity penalty, then the risk of our choice is no more than the approximation error plus twice the complexity penalty. There are many cases, however, where complexity penalties like this give loose upper bounds on the estimation error. In particular, if we choose a function from a suitably simple convex function class with a strictly convex loss function, then the estimation error (the difference between the risk of the empirical risk minimizer and the minimal risk in the class) approaches zero at a faster rate than the maximal deviation between empirical and true risks. In this note, we address the question of whether it is possible to design a complexity penalized model selection method for these situations. We show that, provided the sequence of models is ordered by inclusion, in these cases we can use tight upper bounds on estimation error as a complexity penalty. Surprisingly, this is the case even in situations when the difference between the empirical risk and true risk (and indeed the error of any estimate of the approximation error) decreases much more slowly than the complexity penalty. We give an oracle inequality showing that the resulting model selection method chooses a function with risk no more than the approximation error plus a constant times the complexity penalty.} }
@article{bw-crouhl-08, author = {Peter L. Bartlett and Marten H. Wegkamp}, title = {Classification with a Reject Option using a Hinge Loss}, journal = {Journal of Machine Learning Research}, year = {2008}, month = aug, volume = {9}, pages = {1823--1840}, pdf = {http://www.jmlr.org/papers/volume9/bartlett08a/bartlett08a.pdf}, abstract = {We consider the problem of binary classification where the classifier can, for a particular cost, choose not to classify an observation. Just as in the conventional classification problem, minimization of the sample average of the cost is a difficult optimization problem. As an alternative, we propose the optimization of a certain convex loss function f, analogous to the hinge loss used in support vector machines (SVMs). Its convexity ensures that the sample average of this surrogate loss can be efficiently minimized. We study its statistical properties. We show that minimizing the expected surrogate loss---the f-risk---also minimizes the risk. We also study the rate at which the f-risk approaches its minimum value. We show that fast rates are possible when the conditional probability Pr(Y=1|X) is unlikely to be close to certain critical values.} }
@article{cgkcb-egacrfmmmn-08, author = {Michael Collins and Amir Globerson and Terry Koo and Xavier Carreras and Peter L.~Bartlett}, title = {Exponentiated gradient algorithms for conditional random fields and max-margin {M}arkov networks}, journal = {Journal of Machine Learning Research}, year = {2008}, month = aug, volume = {9}, pages = {1775--1822}, pdf = {http://www.jmlr.org/papers/volume9/collins08a/collins08a.pdf}, abstract = {Log-linear and maximum-margin models are two commonly used methods in supervised machine learning, and are frequently used in structured prediction problems. Efficient learning of parameters in these models is therefore an important problem, and becomes a key factor when learning from very large data sets. This paper describes exponentiated gradient (EG) algorithms for training such models, where EG updates are applied to the convex dual of either the log-linear or max-margin objective function; the dual in both the log-linear and max-margin cases corresponds to minimizing a convex function with simplex constraints. We study both batch and online variants of the algorithm, and provide rates of convergence for both cases. In the max-margin case, $O({1\over\epsilon})$ EG updates are required to reach a given accuracy $\epsilon$ in the dual; in contrast, for log-linear models only $O(\log({1\over\epsilon}))$ updates are required. For both the max-margin and log-linear cases, our bounds suggest that the online algorithm requires a factor of $n$ less computation to reach a desired accuracy, where $n$ is the number of training examples. Our experiments confirm that the online algorithms are much faster than the batch algorithms in practice. We describe how the EG updates factor in a convenient way for structured prediction problems, allowing the algorithms to be efficiently applied to problems such as sequence learning or natural language parsing. We perform extensive evaluation of the algorithms, comparing them to L-BFGS and stochastic gradient descent for log-linear models, and to SVM-Struct for max-margin models. The algorithms are applied to multi-class problems as well as a more complex large-scale parsing task. In all these settings, the EG algorithms presented here outperform the other methods.} }
@article{lbw-c-08, title = {Correction to {\em The Importance of Convexity in Learning with Squared Loss}}, author = {Wee Sun Lee and Peter L.~Bartlett and Robert C.~Williamson}, journal = {IEEE Transactions on Information Theory}, year = {2008}, month = sep, pages = {4395}, volume = {54}, number = {9}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/lbw-c-08.pdf} }
@article{rbr-soimbtmerb-08, author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and J.~Hyam Rubinstein}, title = {Shifting: one-inclusion mistake bounds and sample compression}, journal = {Journal of Computer and System Sciences}, volume = {75}, number = {1}, pages = {37--59}, note = {(Was University of California, Berkeley, EECS Department Technical Report EECS-2007-86)}, pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf}, month = {January}, year = {2009} }
@article{rsbn-sslfbmvpcr-09, title = {Multiview point cloud kernels for semisupervised learning}, volume = {26}, number = {5}, month = {September}, year = {2009}, pages = {145--150}, author = {David S.~Rosenberg and Vikas Sindhwani and Peter L.~Bartlett and Partha Niyogi}, doi = {http://dx.doi.org/10.1109/MSP.2009.933383}, journal = {IEEE Signal Processing Magazine} }
@article{bmp-osbeeem-08, author = {Peter L. Bartlett and Shahar Mendelson and Petra Philips}, title = {On the optimality of sample-based estimates of the expectation of the empirical minimizer}, journal = {ESAIM: Probability and Statistics}, volume = {14}, pages = {315--337}, year = {2010}, month = jan, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-08.pdf}, abstract = {We study sample-based estimates of the expectation of the function produced by the empirical minimization algorithm. We investigate the extent to which one can estimate the rate of convergence of the empirical minimizer in a data dependent manner. We establish three main results. First, we provide an algorithm that upper bounds the expectation of the empirical minimizer in a completely data-dependent manner. This bound is based on a structural result in http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf, which relates expectations to sample averages. Second, we show that these structural upper bounds can be loose. In particular, we demonstrate a class for which the expectation of the empirical minimizer decreases as $O(1/n)$ for sample size $n$, although the upper bound based on structural properties is $\Omega(1)$. Third, we show that this looseness of the bound is inevitable: we present an example that shows that a sharp bound cannot be universally recovered from empirical data.} }
@article{b-laue-10, title = {Learning to act in uncertain environments}, volume = {53}, number = {5}, year = {2010}, month = may, pages = {98}, author = {Peter L.~Bartlett}, journal = {Communications of the ACM}, doi = {http://dx.doi.org/10.1145/1735223.1735246}, note = {(Invited one-page comment)} }
@article{rbr-csoimbsc-10, title = {Corrigendum to `Shifting: One-inclusion mistake bounds and sample compression' {[J. Comput. System Sci. 75 (1) (2009) 37--59]}}, volume = {76}, number = {3--4}, year = {2010}, month = may, pages = {278--280}, author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and J.~Hyam Rubinstein}, journal = {Journal of Computer and System Sciences}, doi = {http://dx.doi.org/10.1016/j.jcss.2010.03.001} }
@article{ab-mamssl-08, title = {Margin-adaptive model selection in statistical learning}, author = {Sylvain Arlot and Peter L.~Bartlett}, journal = {Bernoulli}, volume = {17}, number = {2}, pages = {687--713}, year = {2011}, month = may, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/ab-mamssl-08.pdf} }
@article{abrw-itlbocsco-12, title = {Information-theoretic lower bounds on the oracle complexity of stochastic convex optimization}, year = {2012}, month = may, author = {Alekh Agarwal and Peter Bartlett and Pradeep Ravikumar and Martin Wainwright}, journal = {IEEE Transactions on Information Theory}, abstract = {Relative to the large literature on upper bounds on complexity of convex optimization, lesser attention has been paid to the fundamental hardness of these problems. Given the extensive use of convex optimization in machine learning and statistics, gaining an understanding of these complexity-theoretic issues is important. In this paper, we study the complexity of stochastic convex optimization in an oracle model of computation. We improve upon known results and obtain tight minimax complexity estimates for various function classes.}, volume = {58}, number = {5}, pages = {3235--3249}, doi = {http://dx.doi.org/10.1109/TIT.2011.2182178}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrw-itlbocsco-12.pdf} }
@article{dbw-rsso-12, title = {Randomized smoothing for stochastic optimization}, author = {John Duchi and Peter L.~Bartlett and Martin J.~Wainwright}, year = {2012}, month = jun, journal = {SIAM Journal on Optimization}, volume = {22}, number = {2}, pages = {674--701}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/dbw-rsso-12.pdf}, abstract = {We analyze convergence rates of stochastic optimization algorithms for nonsmooth convex optimization problems. By combining randomized smoothing techniques with accelerated gradient methods, we obtain convergence rates of stochastic optimization procedures, both in expectation and with high probability, that have optimal dependence on the variance of the gradient estimates. To the best of our knowledge, these are the first variance-based rates for nonsmooth optimization. We give several applications of our results to statistical estimation problems and provide experimental results that demonstrate the effectiveness of the proposed algorithms. We also describe how a combination of our algorithm with recent work on decentralized optimization yields a distributed stochastic optimization algorithm that is order-optimal.} }
@article{rbht-llfs-09, title = {Learning in a large function space: Privacy preserving mechanisms for {SVM} learning}, author = {Benjamin I.~P.~Rubinstein and Peter~L.~Bartlett and Ling Huang and Nina Taft}, journal = {Journal of Privacy and Confidentiality}, volume = {4}, number = {1}, pages = {65--100}, year = {2012}, month = aug, url = {http://arxiv.org/abs/0911.5708} }
@article{nabs-amlfdahu-12, title = {Application of Machine Learning in the Fault Diagnostics of Air Handling Units}, author = {Massieh Najafi and David M. Auslander and Peter L. Bartlett and Philip Haves and Michael D.~Sohn}, journal = {Applied Energy}, doi = {http://dx.doi.org/10.1016/j.apenergy.2012.02.049}, year = {2012}, month = aug, volume = {96}, pages = {347--358} }
@article{bmn-lrlrpoi-09, title = {$\ell_1$-regularized linear regression: Persistence and oracle inequalities}, author = {Peter L.~Bartlett and Shahar Mendelson and Joseph Neeman}, journal = {Probability Theory and Related Fields}, year = {2012}, month = oct, volume = {154}, number = {1--2}, pages = {193--224}, abstract = {We study the predictive performance of $\ell_1$-regularized linear regression in a model-free setting, including the case where the number of covariates is substantially larger than the sample size. We introduce a new analysis method that avoids the boundedness problems that typically arise in model-free empirical minimization. Our technique provides an answer to a conjecture of Greenshtein and Ritov~\cite{GR} regarding the ``persistence'' rate for linear regression and allows us to prove an oracle inequality for the error of the regularized minimizer. It also demonstrates that empirical risk minimization gives optimal rates (up to log factors) of convex aggregation of a set of estimators of a regression function.}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmn-lrlrpoi-09.pdf}, doi = {http://dx.doi.org/10.1007/s00440-011-0367-2} }
@article{brsmsb-lbars-12, title = {A learning-based approach to reactive security}, author = {A.~Barth and Benjamin I.~P.~Rubinstein and M.~Sundararajan and J.~C.~Mitchell and Dawn Song and Peter L.~Bartlett}, year = {2012}, month = jul, journal = {IEEE Transactions on Dependable and Secure Computing}, url = {http://doi.ieeecomputersociety.org/10.1109/TDSC.2011.42}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/brsmsb-lbars-12.pdf}, volume = {9}, number = {4}, pages = {482--493}, abstract = {Despite the conventional wisdom that proactive security is superior to reactive security, we show that reactive security can be competitive with proactive security as long as the reactive defender learns from past attacks instead of myopically overreacting to the last attack. Our game-theoretic model follows common practice in the security literature by making worst-case assumptions about the attacker: we grant the attacker complete knowledge of the defender's strategy and do not require the attacker to act rationally. In this model, we bound the competitive ratio between a reactive defense algorithm (which is inspired by online learning theory) and the best fixed proactive defense. Additionally, we show that, unlike proactive defenses, this reactive strategy is robust to a lack of information about the attacker's incentives and knowledge.} }
@incollection{tb-lt-12, author = {Ambuj Tewari and Peter L.~Bartlett}, title = {Learning Theory}, booktitle = {Signal Processing Theory and Machine Learning}, series = {Academic Press Library in Signal Processing}, volume = {1}, editor = {Paulo S.R. Diniz and Johan A.K. Suykens and Rama Chellappa and Sergios Theodoridis}, publisher = {Elsevier}, year = {2014}, pages = {775--816} }
@incollection{rrb-bevcmc-14a, author = {J. Hyam Rubinstein and Benjamin Rubinstein and Peter Bartlett}, title = {Bounding Embeddings of {VC} Classes into Maximum Classes}, year = {2014}, booktitle = {Festschrift of Alexey Chervonenkis}, editor = {A. Gammerman and V. Vovk}, publisher = {Springer}, address = {Berlin}, abstract = {One of the earliest conjectures in computational learning theory---the Sample Compression Conjecture---asserts that concept classes (or set systems) admit compression schemes of size polynomial in their VC dimension. To date this statement is known to be true for maximum classes---those that meet Sauer's Lemma, which bounds class cardinality in terms of VC dimension, with equality. The most promising approach to positively resolving the conjecture is by embedding general VC classes into maximum classes without super-linear increase to their VC dimensions, as such embeddings extend the known compression schemes to all VC classes. We show that maximum classes can be characterized by a local-connectivity property of the graph obtained by viewing the class as a cubical complex. This geometric characterization of maximum VC classes is applied to prove a negative embedding result which demonstrates VC-$d$ classes that cannot be embedded in any maximum class of VC dimension lower than $2d$. On the other hand, we give a general recursive procedure for embedding VC-$d$ classes into VC-$(d+k)$ maximum classes for smallest $k$.}, url = {http://arxiv.org/abs/1401.7388} }
@article{hb-ecosnmlbp-17, author = {Fares Hedayati and Peter L. Bartlett}, title = {Exchangeability Characterizes Optimality of Sequential Normalized Maximum Likelihood and {B}ayesian Prediction}, journal = {IEEE Transactions on Information Theory}, month = {October}, volume = {63}, number = {10}, pages = {6767--6773}, year = {2017}, abstract = {We study online learning under logarithmic loss with regular parametric models. In this setting, each strategy corresponds to a joint distribution on sequences. The minimax optimal strategy is the normalized maximum likelihood (NML) strategy. We show that the sequential normalized maximum likelihood (SNML) strategy predicts minimax optimally (i.e. as NML) if and only if the joint distribution on sequences defined by SNML is exchangeable. This property also characterizes the optimality of a Bayesian prediction strategy. In that case, the optimal prior distribution is Jeffreys prior for a broad class of parametric models for which the maximum likelihood estimator is asymptotically normal. The optimal prediction strategy, normalized maximum likelihood, depends on the number n of rounds of the game, in general. However, when a Bayesian strategy is optimal, normalized maximum likelihood becomes independent of n. Our proof uses this to exploit the asymptotics of normalized maximum likelihood. The asymptotic normality of the maximum likelihood estimator is responsible for the necessity of Jeffreys prior.}, doi = {10.1109/TIT.2017.2735799}, url = {http://www.stat.berkeley.edu/~bartlett/papers/hb-ecosnmlbp-17.pdf}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/hb-ecosnmlbp-17.pdf} }
@article{bhlm-ntvcdpdbfplnn-19, author = {Peter L. Bartlett and Nick Harvey and Christopher Liaw and Abbas Mehrabian}, title = {Nearly-tight {VC}-dimension and Pseudodimension Bounds for Piecewise Linear Neural Networks}, journal = {Journal of Machine Learning Research}, year = {2019}, volume = {20}, number = {63}, pages = {1--17}, url = {http://jmlr.org/papers/v20/17-612.html} }
@article{bhl-gdiielpdltdrn-19, author = {Peter L.~Bartlett and David P.~Helmbold and Philip M.~Long}, title = {Gradient descent with identity initialization efficiently learns positive definite linear transformations by deep residual networks}, year = {2019}, journal = {Neural Computation}, volume = {31}, pages = {477--502} }
@article{mfwb-esanscp-22, author = {Wenlong Mou and Nicolas Flammarion and Martin J.~Wainwright and Peter L.~Bartlett}, title = {An Efficient Sampling Algorithm for Non-smooth Composite Potentials}, journal = {Journal of Machine Learning Research}, year = {2022}, volume = {23}, number = {233}, pages = {1--50}, url = {http://jmlr.org/papers/v23/20-527.html} }
@article{tb-borr-23, author = {Alexander Tsigler and Peter L. Bartlett}, title = {Benign overfitting in ridge regression}, journal = {Journal of Machine Learning Research}, year = {2023}, volume = {24}, number = {123}, pages = {1--76}, url = {http://jmlr.org/papers/v24/22-1398.html}, note = {(See also arXiv:2009.14286)} }
@article{mpbkbw-dfmfpogflqs-20, title = {Derivative-Free Methods for Policy Optimization: Guarantees for Linear Quadratic Systems}, author = {Dhruv Malik and Ashwin Pananjady and Kush Bhatia and Koulik Khamaru and Peter L.~Bartlett and Martin J.~Wainwright}, journal = {Journal of Machine Learning Research}, year = {2020}, volume = {21}, number = {21}, pages = {1--51}, url = {http://jmlr.org/papers/v21/19-198.html} }
@article{bllt-bolr-20, author = {Peter L.~Bartlett and Philip M.~Long and G\'{a}bor Lugosi and Alexander Tsigler}, title = {Benign Overfitting in Linear Regression}, journal = {Proceedings of the National Academy of Sciences}, note = {(arXiv:1906.11300)}, year = {2020}, volume = {117}, number = {48}, pages = {30063--30070}, doi = {10.1073/pnas.1907378117}, publisher = {National Academy of Sciences}, abstract = {The phenomenon of benign overfitting is one of the key mysteries uncovered by deep learning methodology: deep neural networks seem to predict well, even with a perfect fit to noisy training data. Motivated by this phenomenon, we consider when a perfect fit to training data in linear regression is compatible with accurate prediction. We give a characterization of linear regression problems for which the minimum norm interpolating prediction rule has near-optimal prediction accuracy. The characterization is in terms of two notions of the effective rank of the data covariance. It shows that overparameterization is essential for benign overfitting in this setting: the number of directions in parameter space that are unimportant for prediction must significantly exceed the sample size. By studying examples of data covariance properties that this characterization shows are required for benign overfitting, we find an important role for finite-dimensional data: the accuracy of the minimum norm interpolating prediction rule approaches the best possible accuracy for a much narrower range of properties of the data distribution when the data lie in an infinite-dimensional space vs. when the data lie in a finite-dimensional space with dimension that grows faster than the sample size.}, issn = {0027-8424}, url = {https://www.pnas.org/content/early/2020/04/22/1907378117}, eprint = {https://www.pnas.org/content/early/2020/04/22/1907378117.full.pdf} }
@article{bmr-dlasp-21, author = {Peter L. Bartlett and Andrea Montanari and Alexander Rakhlin}, title = {Deep learning: a statistical viewpoint}, journal = {Acta Numerica}, year = {2021}, volume = {30}, doi = {10.1017/S0962492921000027}, publisher = {Cambridge University Press}, pages = {87--201}, url = {https://arxiv.org/abs/2103.09177}, abstract = {The remarkable practical success of deep learning has revealed some major surprises from a theoretical perspective. In particular, simple gradient methods easily find near-optimal solutions to non-convex optimization problems, and despite giving a near-perfect fit to training data without any explicit effort to control model complexity, these methods exhibit excellent predictive accuracy. We conjecture that specific principles underlie these phenomena: that overparametrization allows gradient methods to find interpolating solutions, that these methods implicitly impose regularization, and that overparametrization leads to benign overfitting, that is, accurate predictions despite overfitting training data. We survey recent theoretical progress that provides examples illustrating these principles in simpler settings. We first review classical uniform convergence results and why they fall short of explaining aspects of the behavior of deep learning methods. We give examples of implicit regularization in simple settings, where gradient methods lead to minimal norm functions that perfectly fit the training data. Then we review prediction methods that exhibit benign overfitting, focusing on regression problems with quadratic loss. For these methods, we can decompose the prediction rule into a simple component that is useful for prediction and a spiky component that is useful for overfitting but, in a favorable setting, does not harm prediction accuracy. We focus specifically on the linear regime for neural networks, where the network can be approximated by a linear model. In this regime, we demonstrate the success of gradient flow, and we consider benign overfitting with two-layer networks, giving an exact asymptotic analysis that precisely demonstrates the impact of overparametrization. We conclude by highlighting the key challenges that arise in extending these insights to realistic deep learning settings.} }
@article{bl-fmdgblni-21, author = {Peter L. Bartlett and Philip M. Long}, title = {Failures of Model-dependent Generalization Bounds for Least-norm Interpolation}, journal = {Journal of Machine Learning Research}, year = {2021}, volume = {22}, number = {204}, pages = {1--15}, url = {http://jmlr.org/papers/v22/20-1164.html}, note = {arXiv:2010.08479} }
@article{mmbjw-holdyama-21, author = {Wenlong Mou and Yi-An Ma and Martin J. Wainwright and Peter L. Bartlett and Michael I. Jordan}, title = {High-Order {L}angevin Diffusion Yields an Accelerated {MCMC} Algorithm}, journal = {Journal of Machine Learning Research}, year = {2021}, volume = {22}, number = {42}, pages = {1--41}, url = {http://jmlr.org/papers/v22/20-576.html} }
@article{mccfbj-itanafgbm-21, author = {Yi-An Ma and Niladri S.~Chatterji and Xiang Cheng and Nicolas Flammarion and Peter L.~Bartlett and Michael I.~Jordan}, title = {Is There an Analog of {N}esterov Acceleration for Gradient-Based {MCMC}?}, year = {2021}, journal = {Bernoulli}, volume = {27}, number = {3}, pages = {1942--1992}, doi = {http://dx.doi.org/10.3150/20-BEJ1297}, abstract = {We formulate gradient-based Markov chain Monte Carlo (MCMC) sampling as optimization on the space of probability measures, with Kullback--Leibler (KL) divergence as the objective functional. We show that an underdamped form of the Langevin algorithm performs accelerated gradient descent in this metric. To characterize the convergence of the algorithm, we construct a Lyapunov functional and exploit hypocoercivity of the underdamped Langevin algorithm. As an application, we show that accelerated rates can be obtained for a class of nonconvex functions with the Langevin algorithm.} }
@article{clb-wdgdllfitl-21, author = {Niladri S.~Chatterji and Philip M.~Long and Peter L.~Bartlett}, title = {When does gradient descent with logistic loss find interpolating two-layer networks?}, year = {2021}, journal = {Journal of Machine Learning Research}, url = {https://arxiv.org/abs/2012.02409}, abstract = {We study the training of finite-width two-layer smoothed ReLU networks for binary classification using the logistic loss. We show that gradient descent drives the training loss to zero if the initial loss is small enough. When the data satisfies certain cluster and separation conditions and the network is wide enough, we show that one step of gradient descent reduces the loss sufficiently that the first result applies.}, volume = {22}, number = {159}, pages = {1--48} }
@article{cbl-olbsgsa-21, author = {Niladri S.~Chatterji and Peter L.~Bartlett and Philip M.~Long}, title = {Oracle lower bounds for stochastic gradient sampling algorithms}, year = {2022}, journal = {Bernoulli}, volume = {28}, number = {2}, pages = {1074--1092}, note = {arXiv:2002.00291}, url = {https://arxiv.org/abs/2002.00291}, abstract = {We consider the problem of sampling from a strongly log-concave density in $R^d$, and prove an information theoretic lower bound on the number of stochastic gradient queries of the log density needed. Several popular sampling algorithms (including many Markov chain Monte Carlo methods) operate by using stochastic gradients of the log density to generate a sample; our results establish an information theoretic limit for all these algorithms. We show that for every algorithm, there exists a well-conditioned strongly log-concave target density for which the distribution of points generated by the algorithm would be at least $\epsilon$ away from the target in total variation distance if the number of gradient queries is less than $\Omega(\sigma^2 d/\epsilon^2)$, where $\sigma^2 d$ is the variance of the stochastic gradient. Our lower bound follows by combining the ideas of Le Cam deficiency routinely used in the comparison of statistical experiments along with standard information theoretic tools used in lower bounding Bayes risk functions. To the best of our knowledge our results provide the first nontrivial dimension-dependent lower bound for this problem.} }
@article{mfwb-ibdldnorwc-21, author = {Mou, Wenlong and Flammarion, Nicolas and Wainwright, Martin J. and Bartlett, Peter L.}, title = {Improved Bounds for Discretization of {L}angevin Diffusions: Near-Optimal Rates without Convexity}, year = {2022}, journal = {Bernoulli}, volume = {28}, number = {3}, pages = {1577--1601}, doi = {http://dx.doi.org/10.3150/21-BEJ1343}, url = {https://arxiv.org/abs/1907.11331}, abstract = {We present an improved analysis of the Euler-Maruyama discretization of the Langevin diffusion. Our analysis does not require global contractivity, and yields polynomial dependence on the time horizon. Compared to existing approaches, we make an additional smoothness assumption, and improve the existing rate from $O(\eta)$ to $O(\eta^2)$ in terms of the KL divergence. This result matches the correct order for numerical SDEs, without suffering from exponential time dependence. When applied to algorithms for sampling and learning, this result simultaneously improves all those methods based on Dalalyan's approach.} }
@article{clb-ibibbotlln-22, author = {Niladri S.~Chatterji and Philip M.~Long and Peter L.~Bartlett}, title = {The Interplay Between Implicit Bias and Benign Overfitting in Two-Layer Linear Networks}, journal = {Journal of Machine Learning Research}, year = {2022}, volume = {23}, number = {263}, pages = {1--48}, url = {http://jmlr.org/papers/v23/21-1011.html} }
@article{fcb-rfa-22a, author = {Spencer Frei and Niladri S.~Chatterji and Peter L.~Bartlett}, title = {Random Feature Amplification: Feature Learning and Generalization in Neural Networks}, abstract = {In this work, we provide a characterization of the feature-learning process in two-layer ReLU networks trained by gradient descent on the logistic loss following random initialization. We consider data with binary labels that are generated by an XOR-like function of the input features. We permit a constant fraction of the training labels to be corrupted by an adversary. We show that, although linear classifiers are no better than random guessing for the distribution we consider, two-layer ReLU networks trained by gradient descent achieve generalization error close to the label noise rate. We develop a novel proof technique that shows that at initialization, the vast majority of neurons function as random features that are only weakly correlated with useful features, and the gradient descent dynamics `amplify' these weak, random features to strong, useful features.}, year = {2023}, journal = {Journal of Machine Learning Research}, volume = {24}, number = {303}, pages = {1--49}, url = {http://jmlr.org/papers/v24/22-1132.html} }
@article{pkbk-scleope-23, author = {Juan C.~Perdomo and Akshay Krishnamurthy and Peter L.~Bartlett and Sham Kakade}, title = {A Complete Characterization of Linear Estimators for Offline Policy Evaluation}, abstract = {Offline policy evaluation is a fundamental statistical problem in reinforcement learning that involves estimating the value function of some decision-making policy given data collected by a potentially different policy. In order to tackle problems with complex, high-dimensional observations, there has been significant interest from theoreticians and practitioners alike in understanding the possibility of function approximation in reinforcement learning. Despite significant study, a sharp characterization of when we might expect offline policy evaluation to be tractable, even in the simplest setting of linear function approximation, has so far remained elusive, with a surprising number of strong negative results recently appearing in the literature. In this work, we identify simple control-theoretic and linear-algebraic conditions that are necessary and sufficient for classical methods, in particular Fitted Q-iteration (FQI) and least squares temporal difference learning (LSTD), to succeed at offline policy evaluation. Using this characterization, we establish a precise hierarchy of regimes under which these estimators succeed. We prove that LSTD works under strictly weaker conditions than FQI. Furthermore, we establish that if a problem is not solvable via LSTD, then it cannot be solved by a broad class of linear estimators, even in the limit of infinite data. Taken together, our results provide a complete picture of the behavior of linear estimators for offline policy evaluation, unify previously disparate analyses of canonical algorithms, and provide significantly sharper notions of the underlying statistical complexity of offline policy evaluation.}, year = {2023}, journal = {Journal of Machine Learning Research}, volume = {24}, number = {284}, pages = {1--50}, url = {http://jmlr.org/papers/v24/22-0341.html} }
@article{blb-dsam-23, title = {The Dynamics of Sharpness-Aware Minimization: Bouncing Across Ravines and Drifting Towards Wide Minima}, author = {Peter L.~Bartlett and Philip M.~Long and Olivier Bousquet}, abstract = {We consider Sharpness-Aware Minimization (SAM), a gradient-based optimization method for deep networks that has exhibited performance improvements on image and language prediction problems. We show that when SAM is applied with a convex quadratic objective, for most random initializations it converges to a cycle that oscillates between either side of the minimum in the direction with the largest curvature, and we provide bounds on the rate of convergence. In the non-quadratic case, we show that such oscillations effectively perform gradient descent, with a smaller step-size, on the spectral norm of the Hessian. In such cases, SAM's update may be regarded as a third derivative---the derivative of the Hessian in the leading eigenvector direction---that encourages drift toward wider minima.}, year = {2023}, journal = {Journal of Machine Learning Research}, volume = {24}, number = {316}, pages = {1--36}, url = {http://jmlr.org/papers/v24/23-043.html} }
@article{bl-cplucssd-24, title = {Corrigendum to `Prediction, Learning, Uniform Convergence, and Scale-Sensitive Dimensions'}, journal = {Journal of Computer and System Sciences}, author = {Peter L.~Bartlett and Philip M.~Long}, volume = {140}, pages = {103465}, year = {2024} }
@article{mhwbj-dpppcrp-24, author = {Wenlong Mou and Nhat Ho and Martin Wainwright and Peter L.~Bartlett and Michael Jordan}, title = {A Diffusion Process Perspective on Posterior Contraction Rates for Parameters}, journal = {SIAM Journal on Mathematics of Data Science}, year = {2024}, note = {(To appear)} }
@article{zfb-ttllmic-24, title = {Trained Transformers Learn Linear Models In-Context}, author = {Ruiqi Zhang and Spencer Frei and Peter L.~Bartlett}, abstract = {Attention-based neural networks such as transformers have demonstrated a remarkable ability to exhibit in-context learning (ICL): Given a short prompt sequence of tokens from an unseen task, they can formulate relevant per-token and next-token predictions without any parameter updates. By embedding a sequence of labeled training data and unlabeled test data as a prompt, this allows for transformers to behave like supervised learning algorithms. Indeed, recent work has shown that when training transformer architectures over random instances of linear regression problems, these models' predictions mimic those of ordinary least squares. Towards understanding the mechanisms underlying this phenomenon, we investigate the dynamics of ICL in transformers with a single linear self-attention layer trained by gradient flow on linear regression tasks. We show that despite non-convexity, gradient flow with a suitable random initialization finds a global minimum of the objective function. At this global minimum, when given a test prompt of labeled examples from a new prediction task, the transformer achieves prediction error competitive with the best linear predictor over the test prompt distribution. We additionally characterize the robustness of the trained transformer to a variety of distribution shifts and show that although a number of shifts are tolerated, shifts in the covariate distribution of the prompts are not. Motivated by this, we consider a generalized ICL setting where the covariate distributions can vary across prompts. We show that although gradient flow succeeds at finding a global minimum in this setting, the trained transformer is still brittle under mild covariate shifts.}, year = {2024}, journal = {Journal of Machine Learning Research}, volume = {25}, number = {49}, pages = {1--55}, url = {http://jmlr.org/papers/v25/23-1042.html} }
@article{bmdbj-brnv-23, title = {Bayesian Robustness: A Nonasymptotic Viewpoint}, author = {Kush Bhatia and Yi-An Ma and Anca D. Dragan and Peter L. Bartlett and Michael I. Jordan}, year = {2023}, volume = {119}, number = {546}, pages = {1112--1123}, journal = {Journal of the American Statistical Association}, doi = {http://doi.org/10.1080/01621459.2023.2174121} }
@comment{{This file was generated by bibtex2html 1.99.}}