# pubs-93.bib

@comment{{This file has been generated by bib2bib 1.98}}

@comment{{Command line: /usr/bin/bib2bib --remove funding --remove notetoomit -ob pubs-93.bib -c 'year>=1993 and $type != "unpublished" and not note : "[pP]atent" ' ../mypubs.bib}}  @article{b-vcdbttln-93, author = {P. L. Bartlett}, title = {{V}apnik-{C}hervonenkis dimension bounds for two- and three-layer networks}, journal = {Neural Computation}, volume = {5}, number = {3}, pages = {371--373}, year = {1993} }  @inproceedings{lb-evbmlnn-93, author = {D. R. Lovell and P. L. Bartlett}, title = {Error and variance bounds in multi-layer neural networks}, booktitle = {Proceedings of the Fourth Australian Conference on Neural Networks}, pages = {161--164}, year = {1993} }  @inproceedings{b-ssnlmln-93, author = {P. L. Bartlett}, title = {The sample size necessary for learning in multi-layer networks}, booktitle = {Proceedings of the Fourth Australian Conference on Neural Networks}, pages = {14--17}, year = {1993} }  @inproceedings{b-lbvcdmltn-93, author = {P. L. Bartlett}, title = {Lower bounds on the {V}apnik-{C}hervonenkis dimension of multi-layer threshold networks}, booktitle = {Proceedings of the Sixth Annual ACM Conference on Computational Learning Theory}, pages = {144--150}, publisher = {ACM Press}, city = {New York}, year = {1993} }  @incollection{b-clt-94, author = {P. L. Bartlett}, title = {Computational Learning Theory}, booktitle = {Encyclopedia of Computer Science and Technology}, volume = {31}, pages = {83--99}, editor = {A. Kent and J. G. Williams}, publisher = {Marcel Dekker}, city = {New York}, year = {1994} }  @inproceedings{bfh-erwl-94, author = {P. L. Bartlett and P. Fischer and K.-U. H{\"o}ffgen}, title = {Exploiting random walks for learning}, booktitle = {Proceedings of the Seventh Annual ACM Conference on Computational Learning Theory}, pages = {318--327}, publisher = {ACM Press}, city = {New York}, year = {1994} }  @inproceedings{blw-fslrvf-94, author = {P. L. Bartlett and P. M. Long and R. C. 
Williamson}, title = {Fat-shattering and the learnability of real-valued functions}, booktitle = {Proceedings of the Seventh Annual ACM Conference on Computational Learning Theory}, pages = {299--310}, publisher = {ACM Press}, city = {New York}, year = {1994} }  @inproceedings{lbw-lbvdspfc-94, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {Lower bounds on the {VC}-dimension of smoothly parametrized function classes}, booktitle = {Proceedings of the Seventh Annual ACM Conference on Computational Learning Theory}, pages = {362--367}, publisher = {ACM Press}, city = {New York}, year = {1994} }  @inproceedings{b-lqrvf-94, author = {P. L. Bartlett}, title = {Learning quantized real-valued functions}, booktitle = {Proceedings of Computing: the Australian Theory Seminar}, pages = {24--35}, publisher = {University of Technology Sydney}, year = {1994} }  @inproceedings{lbw-vcdnnrpr-94, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {The {V}apnik-{C}hervonenkis dimension of neural networks with restricted parameter ranges}, booktitle = {Proceedings of the Fifth Australian Conference on Neural Networks}, pages = {198--201}, year = {1994} }  @article{lbw-lbvspfc-95, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {Lower bounds on the {VC}-dimension of smoothly parametrized function classes}, journal = {Neural Computation}, volume = {7}, pages = {990--1002}, year = {1995}, note = {(See also correction, Neural Computation, 9: 765--769, 1997)} }  @inproceedings{bd-ecgdacrnn-95, author = {P. L. Bartlett and S. Dasgupta}, title = {Exponential convergence of a gradient descent algorithm for a class of recurrent neural networks}, booktitle = {Proceedings of the 38th Midwest Symposium on Circuits and Systems}, year = {1995} }  @inproceedings{bl-mtssdl-95, author = {P. L. Bartlett and P. M. 
Long}, title = {More theorems about scale sensitive dimensions and learning}, booktitle = {Proceedings of the Eighth Annual ACM Conference on Computational Learning Theory}, pages = {392--401}, publisher = {ACM Press}, city = {New York}, year = {1995} }  @inproceedings{lbw-eallcbf-95, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {On efficient agnostic learning of linear combinations of basis functions}, booktitle = {Proceedings of the Eighth Annual ACM Conference on Computational Learning Theory}, pages = {369--376}, publisher = {ACM Press}, city = {New York}, year = {1995} }  @inproceedings{ab-aie-95, author = {M. Anthony and P. L. Bartlett}, title = {Function learning from interpolation}, booktitle = {Computational Learning Theory: Second European Conference, EUROCOLT 95, Barcelona Spain, March 1995, Proceedings}, pages = {211--221}, year = {1995} }  @inproceedings{bw-scnnldi-95, author = {P. L. Bartlett and R. C. Williamson}, title = {The sample complexity of neural network learning with discrete inputs}, booktitle = {Proceedings of the Sixth Australian Conference on Neural Networks}, pages = {189--192}, year = {1995} }  @inproceedings{lbw-ealnnbf-95, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {Efficient agnostic learning of neural networks with bounded fan-in}, booktitle = {Proceedings of the Sixth Australian Conference on Neural Networks}, pages = {201--204}, year = {1995} }  @book{bbw-psacnn-96, editor = {Peter L. Bartlett and Anthony Burkitt and Robert C. Williamson}, title = {Proceedings of the Seventh Australian Conference on Neural Networks}, publisher = {Australian National University}, city = {Canberra}, year = {1996} }  @article{lbw-ealnnbf-96, author = {W. S. Lee and P. L. Bartlett and R. C. 
Williamson}, title = {Efficient agnostic learning of neural networks with bounded fan-in}, journal = {IEEE Transactions on Information Theory}, volume = {42}, number = {6}, pages = {2118--2132}, year = {1996} }  @article{abist-vgai-96, author = {M. Anthony and P. L. Bartlett and Y. Ishai and J. Shawe-Taylor}, title = {Valid generalisation from approximate interpolation}, journal = {Combinatorics, Probability, and Computing}, volume = {5}, pages = {191--214}, year = {1996} }  @article{blw-fslrvf-96, author = {P. L. Bartlett and P. M. Long and R. C. Williamson}, title = {Fat-shattering and the learnability of real-valued functions}, journal = {Journal of Computer and System Sciences}, note = {(special issue on COLT94)}, volume = {52}, number = {3}, pages = {434--452}, year = {1996} }  @article{bw-vcdptlnndi-96, author = {P. L. Bartlett and R. C. Williamson}, title = {The {V}apnik-{C}hervonenkis dimension and pseudodimension of two-layer neural networks with discrete inputs}, journal = {Neural Computation}, volume = {8}, pages = {653--656}, year = {1996} }  @inproceedings{ksbw-elcmvf-96, author = {A. Kowalczyk and J. Szymanski and P. L. Bartlett and R. C. Williamson}, title = {Examples of learning curves from a modified {VC}-formalism}, booktitle = {Advances in Neural Information Processing Systems 8}, pages = {344--350}, year = {1996} }  @inproceedings{bk-cmcsnd-96, author = {P. L. Bartlett and S. R. Kulkarni}, title = {The complexity of model classes, and smoothing noisy data (Invited)}, booktitle = {Proceedings of the 35th IEEE Conference on Decision and Control}, pages = {TM09-4, 2312--2317}, publisher = {IEEE}, year = {1996} }  @inproceedings{kbb-sbtlc-96, author = {L. Kammer and R. R. Bitmead and P. L. 
Bartlett}, title = {Signal-based testing of {LQ}-optimality of controllers}, booktitle = {Proceedings of the 35th IEEE Conference on Decision and Control}, pages = {FA17-2, 3620--3623}, publisher = {IEEE}, year = {1996} }  @inproceedings{bbdk-lccesc-96, author = {P. L. Bartlett and S. Ben-David and S. R. Kulkarni}, title = {Learning changing concepts by exploiting the structure of change}, booktitle = {Proceedings of the Ninth Annual Conference on Computational Learning Theory}, pages = {131--139}, publisher = {ACM Press}, city = {New York}, year = {1996} }  @inproceedings{stbwa-fsrm-96, author = {J. Shawe-Taylor and P. L. Bartlett and R. C. Williamson and M. Anthony}, title = {A framework for structural risk minimization}, booktitle = {Proceedings of the Ninth Annual Conference on Computational Learning Theory}, pages = {68--76}, publisher = {ACM Press}, city = {New York}, year = {1996} }  @inproceedings{lbw-iclsl-96, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {The importance of convexity in learning with squared loss}, booktitle = {Proceedings of the Ninth Annual Conference on Computational Learning Theory}, pages = {140--146}, publisher = {ACM Press}, city = {New York}, year = {1996} }  @inproceedings{kbb-atiad-96, author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett}, title = {Adaptive tracking identification: the art of defalsification}, booktitle = {Proceedings of the 1996 IFAC World Congress}, year = {1996} }  @article{bkp-cnrvfc-97, author = {P. L. Bartlett and S. R. Kulkarni and S. E. Posner}, title = {Covering numbers for real-valued function classes}, journal = {IEEE Transactions on Information Theory}, volume = {43}, number = {5}, pages = {1721--1724}, year = {1997} }  @article{b-brnnpr-97, author = {P. L. 
Bartlett}, title = {Book Review: `{N}eural networks for pattern recognition,' {C}hristopher {M}.~{B}ishop}, journal = {Statistics in Medicine}, volume = {16}, number = {20}, pages = {2385--2386}, year = {1997} }  @article{lbw-clbvspfc-97, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {Correction to `Lower bounds on the {VC}-dimension of smoothly parametrized function classes'}, journal = {Neural Computation}, volume = {9}, pages = {765--769}, year = {1997} }  @inproceedings{b-fvgswmisn-97, author = {P. L. Bartlett}, title = {For valid generalization, the size of the weights is more important than the size of the network}, booktitle = {Advances in Neural Information Processing Systems 9}, pages = {134--140}, year = {1997} }  @inproceedings{sfbl-bmneevm-97, author = {R. E. Schapire and Y. Freund and P. L. Bartlett and W. S. Lee}, title = {Boosting the margin: A new explanation for the effectiveness of voting methods}, booktitle = {Machine Learning: Proceedings of the Fourteenth International Conference}, pages = {322--330}, year = {1997} }  @inproceedings{bll-mdreqd-97, author = {P. L. Bartlett and T. Linder and G. Lugosi}, title = {The minimax distortion redundancy in empirical quantizer design (abstract)}, booktitle = {Proceedings of the 1997 IEEE International Symposium on Information Theory}, pages = {511}, year = {1997} }  @inproceedings{bll-mlbeqd-97, author = {P. L. Bartlett and T. Linder and G. Lugosi}, title = {A minimax lower bound for empirical quantizer design}, booktitle = {Proceedings of the Third European Conference on Computational Learning Theory (EuroCOLT'97)}, pages = {220--222}, editor = {S. Ben-David}, publisher = {Springer}, year = {1997} }  @inproceedings{bb-rrcncnsann-97, author = {J. Baxter and P. L. 
Bartlett}, title = {A result relating convex $n$-widths to covering numbers with some applications to neural networks}, booktitle = {Proceedings of the Third European Conference on Computational Learning Theory (EuroCOLT'97)}, pages = {251--259}, editor = {S. Ben-David}, publisher = {Springer}, year = {1997} }  @inproceedings{b-nnl-97, author = {P. L. Bartlett}, title = {Neural network learning. (Abstract of invited talk.)}, booktitle = {CONTROL 97 Conference Proceedings, Institution of Engineers Australia}, pages = {543}, year = {1997} }  @inproceedings{lb-gswes-97, author = {G. Loy and P. L. Bartlett}, title = {Generalization and the size of the weights: an experimental study}, booktitle = {Proceedings of the Eighth Australian Conference on Neural Networks}, pages = {60--64}, year = {1997} }  @book{bm-peacclt-98, editor = {Peter L. Bartlett and Yishay Mansour}, title = {Proceedings of the Eleventh Annual Conference on Computational Learning Theory}, publisher = {ACM Press}, city = {New York}, year = {1998} }  @article{sfbl-bmneevm-98, author = {R. E. Schapire and Y. Freund and P. L. Bartlett and W. S. Lee}, title = {Boosting the margin: a new explanation for the effectiveness of voting methods}, journal = {Annals of Statistics}, volume = {26}, number = {5}, pages = {1651--1686}, year = {1998} }  @article{bmm-alvdbppn-98, author = {P. L. Bartlett and V. Maiorov and R. Meir}, title = {Almost linear {VC} dimension bounds for piecewise polynomial networks}, journal = {Neural Computation}, volume = {10}, number = {8}, pages = {2159--2173}, year = {1998} }  @article{lbw-iclsl-98, author = {W. S. Lee and P. L. Bartlett and R. C. Williamson}, title = {The importance of convexity in learning with squared loss}, journal = {IEEE Transactions on Information Theory}, volume = {44}, number = {5}, pages = {1974--1980}, year = {1998} }  @article{stbwa-srmddh-98, author = {J. Shawe-Taylor and P. L. Bartlett and R. C. Williamson and M. 
Anthony}, title = {Structural Risk Minimization over Data-Dependent Hierarchies}, journal = {IEEE Transactions on Information Theory}, volume = {44}, number = {5}, pages = {1926--1940}, year = {1998} }  @article{bll-mdreqd-98, author = {P. L. Bartlett and T. Linder and G. Lugosi}, title = {The minimax distortion redundancy in empirical quantizer design}, journal = {IEEE Transactions on Information Theory}, volume = {44}, number = {5}, pages = {1802--1813}, year = {1998} }  @article{bk-cmcsnd-98, author = {P. L. Bartlett and S. Kulkarni}, title = {The complexity of model classes, and smoothing noisy data}, journal = {Systems and Control Letters}, volume = {34}, number = {3}, pages = {133--140}, year = {1998} }  @article{bv-isilt-98, author = {P. L. Bartlett and M. Vidyasagar}, title = {Introduction to the special issue on learning theory}, journal = {Systems and Control Letters}, volume = {34}, pages = {113--114}, year = {1998} }  @article{kbb-ocpcle-98, author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett}, title = {Optimal controller properties from closed-loop experiments}, journal = {Automatica}, volume = {34}, number = {1}, pages = {83--91}, year = {1998} }  @article{bl-plussd-98, author = {P. L. Bartlett and P. M. Long}, title = {Prediction, learning, uniform convergence, and scale-sensitive dimensions}, journal = {Journal of Computer and System Sciences}, volume = {56}, number = {2}, pages = {174--190}, year = {1998}, note = {(special issue on COLT95)} }  @article{b-scpcnn-98, author = {P. L. Bartlett}, title = {The sample complexity of pattern classification with neural networks: the size of the weights is more important than the size of the network}, journal = {IEEE Transactions on Information Theory}, volume = {44}, number = {2}, pages = {525--536}, year = {1998} }  @inproceedings{bb-cdmfs1nnc-98, author = {J. Baxter and P. L. 
Bartlett}, title = {The canonical distortion measure in feature space and $1$-{NN} classification}, booktitle = {Advances in Neural Information Processing Systems 10}, pages = {245--251}, year = {1998} }  @inproceedings{gbl-gdtd-98, author = {M. Golea and P. L. Bartlett and W. S. Lee}, title = {Generalization in decision trees and {DNF}: Does size matter?}, booktitle = {Advances in Neural Information Processing Systems 10}, pages = {259--265}, year = {1998} }  @inproceedings{kbb-ditsa-98, author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett}, title = {Direct iterative tuning via spectral analysis}, booktitle = {Proceedings of the IEEE Conference on Decision and Control}, volume = {3}, pages = {2874--2879}, year = {1998} }  @inproceedings{sbsw-svraac-98, author = {B. Sch{\"o}lkopf and P. L. Bartlett and A. Smola and R. Williamson}, title = {Support vector regression with automatic accuracy control}, editor = { L. Niklasson and M. Boden and T. Ziemke}, booktitle = {Perspectives in Neural Computing: Proceedings of the 8th International Conference on Artificial Neural Networks (ICANN'98)}, pages = {111--116}, publisher = {Springer-Verlag}, city = {Berlin}, year = {1998} }  @inproceedings{mbg-gtncdtcmp-98, author = {L. Mason and P. L. Bartlett and M. Golea}, title = {Generalization in threshold networks, combined decision trees and combined mask perceptrons}, editor = {T. Downs and M. Frean and M. Gallagher}, booktitle = {Proceedings of the Ninth Australian Conference on Neural Networks (ACNN'98)}, publisher = {University of Queensland}, pages = {84--88}, year = {1998} }  @book{ab-nnltf-99, author = {Martin Anthony and Peter L. Bartlett}, title = {Neural Network Learning: Theoretical Foundations}, publisher = {Cambridge University Press}, year = {1999}, url = {http://www.stat.berkeley.edu/~bartlett/nnl/index.html}, isbn = {0 521 57353 X}, note = {404pp. ISBN 978-0-521-57353-5. Reprinted 2001, 2002. 
Paperback edition 2009; ISBN 978-0-521-11862-0.} }  @article{bl-iudsam-99, author = {P. L. Bartlett and G. Lugosi}, title = {An inequality for uniform deviations of sample averages from their means}, journal = {Statistics and Probability Letters}, volume = {44}, number = {1}, pages = {55--62}, year = {1999} }  @incollection{b-ennl-99, author = {P. L. Bartlett}, title = {Efficient neural network learning}, booktitle = {Open Problems in Mathematical Systems Theory and Control}, editor = {V. D. Blondel and E. D. Sontag and M. Vidyasagar and J. C. Willems}, pages = {35--38}, publisher = {Springer Verlag}, city = {London}, year = {1999} }  @incollection{bst-gpsvmopc-99, author = {P. L. Bartlett and J. Shawe-Taylor}, title = {Generalization performance of support vector machines and other pattern classifiers}, booktitle = {Advances in Kernel Methods -- Support Vector Learning}, editor = {B. Sch{\"o}lkopf and C. J. C. Burges and A. J. Smola}, pages = {43--54}, publisher = {MIT Press}, city = {Cambridge, MA}, year = {1999} }  @inproceedings{bmm-alvdbppn-99, author = {P. L. Bartlett and V. Maiorov and R. Meir}, title = {Almost linear {VC} dimension bounds for piecewise polynomial networks}, booktitle = {Advances in Neural Information Processing Systems 11}, pages = {190--196}, year = {1999} }  @inproceedings{mbb-domigcc-99, author = {L. Mason and P. L. Bartlett and J. Baxter}, title = {Direct optimization of margins improves generalization in combined classifiers}, booktitle = {Advances in Neural Information Processing Systems 11}, pages = {288--294}, year = {1999} }  @inproceedings{sbsw-stnsvra-99, author = {B. Sch{\"o}lkopf and P. L. Bartlett and A. Smola and R. Williamson}, title = {Shrinking the tube: a new support vector regression algorithm}, booktitle = {Advances in Neural Information Processing Systems 11}, pages = {330--336}, year = {1999} }  @inproceedings{kbz-sfossgmbtmrpe-99, author = {T. Koshizen and P. L. Bartlett and A. 
Zelinsky}, title = {Sensor fusion of odometry and sonar sensors by the {G}aussian mixture {B}ayes' technique in mobile robot position estimation}, booktitle = {Proceedings of the 1999 IEEE International Conference on Systems, Man and Cybernetics}, volume = {4}, pages = {742--747}, year = {1999} }  @inproceedings{gbstw-cnsvm-99, author = {Y. Guo and P. L. Bartlett and J. Shawe-Taylor and R. C. Williamson}, title = {Covering Numbers for Support Vector Machines}, booktitle = {Proceedings of the Twelfth Annual Conference on Computational Learning Theory}, pages = {267--277}, year = {1999} }  @inproceedings{bbd-hrnnap-99, author = {P. L. Bartlett and S. Ben-David}, title = {Hardness results for neural network approximation problems}, booktitle = {Proceedings of the Fourth European Conference on Computational Learning Theory}, pages = {50--62}, year = {1999} }  @inproceedings{mbb-ebvcmcf-99, author = {L. Mason and P. L. Bartlett and J. Baxter}, title = {Error bounds for voting classifiers using margin cost functions (Invited abstract)}, booktitle = {Proceedings of the IEEE Information Theory Workshop on Detection, Estimation, Classification and Imaging}, pages = {36}, year = {1999} }  @inproceedings{bb-vmds-99, author = {P. L. Bartlett and J. Baxter}, title = {Voting methods for data segmentation}, booktitle = {Proceedings of the Advanced Investment Technology Conference}, pages = {35--40}, publisher = {Bond University}, year = {1999} }  @book{sbss-almc-00, editor = {Alexander J. Smola and Peter L. Bartlett and Bernard Sch{\"o}lkopf and Dale Schuurmans}, title = {Advances in Large Margin Classifiers}, publisher = {MIT Press}, city = {Cambridge}, country = {USA}, year = {2000}, isbn = {ISBN 0262 19448 1} }  @article{ab-fli-00, author = {M. Anthony and P. L. Bartlett}, title = {Function learning from interpolation}, journal = {Combinatorics, Probability, and Computing}, volume = {9}, pages = {213--225}, year = {2000} }  @incollection{mbbf-fgtch-00, author = {L. 
Mason and J. Baxter and P. L. Bartlett and M. Frean}, title = {Functional Gradient Techniques for Combining Hypotheses}, booktitle = {Advances in Large Margin Classifiers}, editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans}, pages = {221--246}, publisher = {MIT Press}, city = {Cambridge, MA}, year = {2000} }  @incollection{sbss-ilmc-00, author = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans}, title = {Introduction to Large Margin Classifiers}, booktitle = {Advances in Large Margin Classifiers}, editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and D. Schuurmans}, pages = {1--29}, publisher = {MIT Press}, city = {Cambridge, MA}, year = {2000} }  @article{bbdk-lccesc-00, author = {P. L. Bartlett and S. Ben-David and S. R. Kulkarni}, title = {Learning changing concepts by exploiting the structure of change}, journal = {Machine Learning}, volume = {41}, number = {2}, pages = {153--174}, year = {2000} }  @article{ppb-pace-00, author = {S. Parameswaran and M. F. Parkinson and P. L. Bartlett}, title = {Profiling in the {ASP} codesign environment}, journal = {Journal of Systems Architecture}, volume = {46}, number = {14}, pages = {1263--1274}, year = {2000} }  @article{sswb-nsva-00, author = {B. Sch{\"o}lkopf and A. Smola and R. C. Williamson and P. L. Bartlett}, title = {New support vector algorithms}, journal = {Neural Computation}, volume = {12}, number = {5}, pages = {1207--1245}, year = {2000} }  @article{kbb-ditsa-00, author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett}, title = {Direct iterative tuning via spectral analysis}, journal = {Automatica}, volume = {36}, number = {9}, pages = {1301--1307}, year = {2000} }  @article{mbb-igeom-00, author = {L. Mason and P. L. Bartlett and J. 
Baxter}, title = {Improved generalization through explicit optimization of margins}, journal = {Machine Learning}, volume = {38}, number = {3}, pages = {243--255}, year = {2000} }  @inproceedings{bb-socpomdp-00, author = {P. L. Bartlett and J. Baxter}, title = {Stochastic optimization of controlled partially observable {M}arkov decision processes}, booktitle = {Proceedings of the IEEE Conference on Decision and Control}, volume = {1}, pages = {124--129}, year = {2000} }  @inproceedings{bb-dgbrl-00, author = {J. Baxter and P. L. Bartlett}, title = {Direct gradient-based reinforcement learning (Invited)}, booktitle = {Proceedings of the International Symposium on Circuits and Systems}, pages = {III-271--274}, year = {2000} }  @inproceedings{mbbf-bagd-00, author = {L. Mason and J. Baxter and P. L. Bartlett and M. Frean}, title = {Boosting algorithms as gradient descent}, booktitle = {Advances in Neural Information Processing Systems 12}, pages = {512--518}, year = {2000} }  @inproceedings{bb-gaoaepgpa-00, author = {J. Baxter and P. L. Bartlett}, title = {{GPOMDP}: An On-line Algorithm for Estimating Performance Gradients in {POMDP}'s, with Applications}, booktitle = {Proceedings of the 2000 International Conference on Machine Learning}, pages = {41--48}, year = {2000} }  @inproceedings{bbl-msee-00, author = {P. L. Bartlett and S. Boucheron and G. Lugosi}, title = {Model selection and error estimation}, booktitle = {Proceedings of the Thirteenth Annual Conference on Computational Learning Theory}, pages = {286--297}, year = {2000} }  @inproceedings{bb-eabgbrl-00, author = {P. L. Bartlett and J. Baxter}, title = {Estimation and approximation bounds for gradient-based reinforcement learning}, booktitle = {Proceedings of the Thirteenth Annual Conference on Computational Learning Theory}, pages = {133--141}, year = {2000} }  @article{bb-ihgbps-01, author = {J. Baxter and P. L. 
Bartlett}, title = {Infinite-Horizon Policy-Gradient Estimation}, journal = {Journal of Artificial Intelligence Research}, volume = {15}, pages = {319--350}, year = {2001}, url = {http://www.cs.washington.edu/research/jair/abstracts/baxter01a.html} }  @article{bbw-ihgbps2gaae-01, author = {J. Baxter and P. L. Bartlett and L. Weaver}, title = {Experiments with Infinite-Horizon, Policy-Gradient Estimation}, journal = {Journal of Artificial Intelligence Research}, volume = {15}, pages = {351--381}, year = {2001}, url = {http://www.cs.washington.edu/research/jair/abstracts/baxter01b.html} }  @inproceedings{bbbcefgswgedadvcwssw-asvmcpsmsdpcss-01, author = {A. Ben-Hur and T. Barnes and P. L. Bartlett and O. Chapelle and A. Elisseeff and H. Fristche and I. Guyon and B. Sch{\"o}lkopf and J. Weston and E. Fung and C. Enderwick and E. A. Dalmasso and B.-L. Adam and J. W. Davis and A. Vlahou and L. Cazares and M. Ward and P. F. Schellhammer and J. Semmes and G. L. Wright}, title = {Application of support vector machines to the classification of ProteinChip System mass spectral data of prostate cancer serum samples (abstract)}, booktitle = {Second Annual National Cancer Institute Early Detection Research Network Scientific Workshop}, year = {2001} }  @inproceedings{bm-rgcrbsr-01, author = {P. L. Bartlett and S. Mendelson}, title = {Rademacher and {G}aussian complexities: Risk bounds and structural results}, booktitle = {Proceedings of the Fourteenth Annual Conference on Computational Learning Theory and Fifth European Conference on Computational Learning Theory}, pages = {224--240}, year = {2001} }  @inproceedings{sb-sggpr-01, author = {A. J. Smola and P. L. Bartlett}, title = {Sparse greedy {G}aussian process regression}, booktitle = {Advances in Neural Information Processing Systems 13}, pages = {619--625}, year = {2001} }  @article{gbstw-cnsvm-01, author = {Y. Guo and P. L. Bartlett and J. Shawe-Taylor and R. C. 
Williamson}, title = {Covering numbers for support vector machines}, journal = {IEEE Transactions on Information Theory}, volume = {48}, number = {1}, pages = {239--250}, year = {2002} }  @article{bm-rgcrbsr-02, author = {P. L. Bartlett and S. Mendelson}, title = {Rademacher and {G}aussian complexities: Risk bounds and structural results}, journal = {Journal of Machine Learning Research}, volume = {3}, pages = {463--482}, pdf = {http://www.jmlr.org/papers/volume3/bartlett02a/bartlett02a.pdf}, year = {2002} }  @article{bbl-msee-02, author = {P. L. Bartlett and S. Boucheron and G. Lugosi}, title = {Model selection and error estimation}, journal = {Machine Learning}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bbl-msee.ps.gz}, volume = {48}, pages = {85--113}, year = {2002} }  @article{bb-eabgbrl-02, author = {P. L. Bartlett and J. Baxter}, title = {Estimation and approximation bounds for gradient-based reinforcement learning}, journal = {Journal of Computer and System Sciences}, volume = {64}, number = {1}, pages = {133--150}, year = {2002} }  @article{bbd-hrnnap-02, author = {P. L. Bartlett and S. Ben-David}, title = {Hardness results for neural network approximation problems}, journal = {Theoretical Computer Science}, note = {(special issue on Eurocolt'99)}, year = {2002}, volume = {284}, number = {1}, pages = {53--66}, url = {http://dx.doi.org/10.1016/S0304-3975(01)00057-3} }  @article{bfh-erwl-02, author = {P. L. Bartlett and P. Fischer and K.-U. H{\"o}ffgen}, title = {Exploiting random walks for learning}, journal = {Information and Computation}, year = {2002}, volume = {176}, number = {2}, pages = {121--135}, url = {http://dx.doi.org/10.1006/inco.2002.3083} }  @article{mbg-gecc-02, author = {L. Mason and P. L. Bartlett and M. 
Golea}, title = {Generalization error of combined classifiers}, journal = {Journal of Computer and System Sciences}, year = {2002}, volume = {65}, number = {2}, pages = {415--438}, url = {http://dx.doi.org/10.1006/jcss.2002.1854} }  @inproceedings{bbm-lrc-02, author = {P. L. Bartlett and O. Bousquet and S. Mendelson}, title = {Localized {R}ademacher complexity}, booktitle = {Proceedings of the Conference on Computational Learning Theory}, pages = {44--58}, year = {2002} }  @inproceedings{lcbegj-lkmsdp-02, author = {G. Lanckriet and N. Cristianini and P. L. Bartlett and L. El Ghaoui and M. Jordan}, title = {Learning the kernel matrix with semi-definite programming}, booktitle = {Proceedings of the International Conference on Machine Learning}, pages = {323--330}, year = {2002} }  @inproceedings{gbb-vrtgerl-02, author = {E. Greensmith and P. L. Bartlett and J. Baxter}, title = {Variance reduction techniques for gradient estimates in reinforcement learning}, booktitle = {Advances in Neural Information Processing Systems 14}, year = {2002}, pages = {1507--1514}, ps = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/CN02.ps.gz} }  @inproceedings{b-paccc-03, author = {Peter L.\ Bartlett}, title = {Prediction algorithms: complexity, concentration and convexity}, booktitle = {Proceedings of the 13th IFAC Symposium on System Identification}, year = {2003}, pages = {1507--1517}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/b-paccc-03.ps.Z}, abstract = {In this paper, we review two families of algorithms used to estimate large-scale statistical models for prediction problems, kernel methods and boosting algorithms. We focus on the computational and statistical properties of prediction algorithms of this kind. Convexity plays an important role for these algorithms, since they exploit the computational advantages of convex optimization procedures. 
However, in addition to its computational advantages, the use of convexity in these methods also confers some attractive statistical properties. We present some recent results that show the advantages of convexity for estimation rates, the rates at which the prediction accuracies approach their optimal values. In addition, we present results that quantify the cost of using a convex loss function in place of the real loss function of interest.} }  @techreport{bjm-ccrb-03, title = {Convexity, classification, and risk bounds}, author = {Peter L.\ Bartlett and Michael I.\ Jordan and Jon D.\ McAuliffe}, institution = {Department of Statistics, U.C.\ Berkeley}, number = {638}, year = {2003}, pdf = {http://www.stat.berkeley.edu/tech-reports/638.pdf}, ps = {http://www.stat.berkeley.edu/tech-reports/638.ps.Z}, abstract = {Many of the classification algorithms developed in the machine learning literature, including the support vector machine and boosting, can be viewed as minimum contrast methods that minimize a convex surrogate of the 0-1 loss function. The convexity makes these algorithms computationally efficient. The use of a surrogate, however, has statistical consequences that must be balanced against the computational virtues of convexity. To study these issues, we provide a general quantitative relationship between the risk as assessed using the 0-1 loss and the risk as assessed using any nonnegative surrogate loss function. We show that this relationship gives nontrivial upper bounds on excess risk under the weakest possible condition on the loss function: that it satisfy a pointwise form of Fisher consistency for classification. The relationship is based on a simple variational transformation of the loss function that is easy to compute in many applications. We also present a refined version of this result in the case of low noise. 
Finally, we present applications of our results to the estimation of convergence rates in the general setting of function classes that are scaled convex hulls of a finite-dimensional base class, with a variety of commonly used loss functions.} }  @incollection{bm-vcdnn-03, author = {Peter L. Bartlett and Wolfgang Maass}, title = {{V}apnik-{C}hervonenkis dimension of neural nets}, editor = {Michael A.~Arbib}, booktitle = {The Handbook of Brain Theory and Neural Networks}, publisher = {MIT Press}, city = {Cambridge, MA}, note = {Second Edition}, pages = {1188--1192}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-vcdnn-03.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-vcdnn-03.pdf}, year = {2003} }  @incollection{b-irltvfm-03, author = {Peter L. Bartlett}, title = {An introduction to reinforcement learning theory: value function methods}, booktitle = {Advanced Lectures on Machine Learning}, seriestitle = {Lecture Notes in Computer Science}, volume = {2600}, editor = {Shahar Mendelson and Alexander J. Smola}, isbn = {3-540-00529-3}, pages = {184--202}, publisher = {Springer}, html = {http://link.springer.de/link/service/series/0558/bibs/2600/26000184.htm}, year = {2003} }  @article{lcbegj-lkmsdp-04, author = {G. Lanckriet and N. Cristianini and P. L. Bartlett and L. El Ghaoui and M. Jordan}, title = {Learning the kernel matrix with semi-definite programming}, journal = {Journal of Machine Learning Research}, pdf = {http://www.jmlr.org/papers/volume5/lanckriet04a/lanckriet04a.pdf}, ps = {http://www.jmlr.org/papers/volume5/lanckriet04a/lanckriet04a.ps.gz}, year = {2004}, volume = {5}, pages = {27--72} }  @article{gbb-vrtgerl-04, author = {E. Greensmith and P. L. Bartlett and J. 
Baxter}, title = {Variance reduction techniques for gradient estimates in reinforcement learning}, journal = {Journal of Machine Learning Research}, volume = {5}, year = {2004}, pages = {1471--1530}, url = {http://jmlr.csail.mit.edu/papers/volume5/greensmith04a/greensmith04a.pdf} }  @article{bjm-d-03, title = {Discussion of boosting papers}, author = {Peter L.\ Bartlett and Michael I.\ Jordan and Jon D.\ McAuliffe}, journal = {The Annals of Statistics}, year = {2004}, volume = {32}, number = {1}, pages = {85--91}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-d-03.ps.Z}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-d-03.pdf} }  @inproceedings{bjm-lmcclln-03, author = {Peter L.\ Bartlett and Michael I.\ Jordan and Jon D.\ McAuliffe}, title = {Large margin classifiers: convex loss, low noise, and convergence rates}, booktitle = {Advances in Neural Information Processing Systems, 16}, year = {2004}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-lmcclln-03.ps.gz}, abstract = { Many classification algorithms, including the support vector machine, boosting and logistic regression, can be viewed as minimum contrast methods that minimize a convex surrogate of the 0-1 loss function. We characterize the statistical consequences of using such a surrogate by providing a general quantitative relationship between the risk as assessed using the 0-1 loss and the risk as assessed using any nonnegative surrogate loss function. We show that this relationship gives nontrivial bounds under the weakest possible condition on the loss function---that it satisfy a pointwise form of Fisher consistency for classification. The relationship is based on a variational transformation of the loss function that is easy to compute in many applications. We also present a refined version of this result in the case of low noise. 
Finally, we present applications of our results to the estimation of convergence rates in the general setting of function classes that are scaled hulls of a finite-dimensional base class.} }  @inproceedings{bt-secp-04, author = {Peter L.~Bartlett and Ambuj Tewari}, title = {Sparseness vs estimating conditional probabilities: Some asymptotic results}, booktitle = {Proceedings of the 17th Annual Conference on Learning Theory}, volume = {3120}, seriestitle = {Lecture Notes in Computer Science}, pages = {564--578}, publisher = {Springer}, year = {2004}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bt-secp-04.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-secp-04.pdf}, abstract = {One of the nice properties of kernel classifiers such as SVMs is that they often produce sparse solutions. However, the decision functions of these classifiers cannot always be used to estimate the conditional probability of the class label. We investigate the relationship between these two properties and show that these are intimately related: sparseness does not occur when the conditional probabilities can be unambiguously estimated. We consider a family of convex loss functions and derive sharp asymptotic bounds for the number of support vectors. This enables us to characterize the exact trade-off between sparseness and the ability to estimate conditional probabilities for these loss functions.} }  @inproceedings{bmp-lcerm-04, author = {Peter L. 
Bartlett and Shahar Mendelson and Petra Philips}, title = {Local Complexities for Empirical Risk Minimization}, booktitle = {Proceedings of the 17th Annual Conference on Computational Learning Theory (COLT2004)}, volume = {3120}, seriestitle = {Lecture Notes in Computer Science}, pages = {270--284}, publisher = {Springer}, year = {2004}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-lcerm-04.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-lcerm-04.pdf}, abstract = {We present sharp bounds on the risk of the empirical minimization algorithm under mild assumptions on the class. We introduce the notion of isomorphic coordinate projections and show that this leads to a sharper error bound than the best previously known. The quantity which governs this bound on the empirical minimizer is the fixed point of the function$\xxi(r)  = \E \sup \left\{|\E f -
\E_n f|: f\in F,\, \E f = r \right\}$. We prove that this is the best estimate one can obtain using `structural results', and that it is possible to estimate the error rate from data. We then prove that the bound on the empirical minimization algorithm can be improved further by a direct analysis, and that the correct error rate is the maximizer of $\oxi(r) -r$, where $\oxi(r)  = \E \sup \left\{\E f - \E_n f: f\in F,\, \E f = r
\right\}$.} }  @inproceedings{tb-ocmcm-05, author = {Ambuj Tewari and Peter L.~Bartlett}, title = {On the consistency of multiclass classification methods}, booktitle = {Proceedings of the 18th Annual Conference on Learning Theory}, volume = {3559}, seriestitle = {Lecture Notes in Computer Science}, pages = {143--157}, publisher = {Springer}, year = {2005}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/tb-ocmcm-05.pdf} }  @inproceedings{bcmt-lmmsc-04, author = {Peter L. {Bartlett} and Michael {Collins} and Ben {Taskar} and David {McAllester}}, title = {Exponentiated Gradient Algorithms for Large-margin Structured Classification}, booktitle = {Advances in Neural Information Processing Systems 17}, editor = {Lawrence K. Saul and Yair Weiss and {L\'{e}on} Bottou}, publisher = {MIT Press}, address = {Cambridge, MA}, pages = {113-120}, year = {2005}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bcmt-lmmsc-04.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bcmt-lmmsc-04.pdf}, abstract = {We consider the problem of {\em structured classification}, where the task is to predict a label$y$from an input$x$, and$y$has meaningful internal structure. Our framework includes supervised training of both Markov random fields and weighted context-free grammars as special cases. We describe an algorithm that solves the large-margin optimization problem defined by Taskar et al, using an exponential-family (Gibbs distribution) representation of structured objects. The algorithm is efficient -- even in cases where the number of labels$y$is exponential in size -- provided that certain expectations under Gibbs distributions can becalculated efficiently. The optimization method we use for structured labels relies on a more general result, specifically the application of exponentiated gradient (EG) updates to quadratic programs (QPs). We describe a new method for solving QPs based on these techniques, and give bounds on its rate of convergence. 
In addition to their application to the structured-labels task, the EG updates lead to simple algorithms for optimizing conventional'' binary or multiclass SVM problems. Finally, we give a new generalization bound for structured classification, using PAC-Bayesian methods for the analysis of large margin classifiers.} }  @inproceedings{jrsb-mletldp-04, author = {Rafael Jim\'enez-Rodriguez and Nicholas Sitar and Peter L.~Bartlett}, title = {Maximum likelihood estimation of trace length distribution parameters using the {EM} algorithm}, booktitle = {Prediction, Analysis and Design in Geomechanical Applications: Proceedings of the Eleventh International Conference on Computer Methods and Advances in Geomechanics (IACMAG-2005)}, editor = {G. Barla and M. Barla}, volume = {1}, pages = {619--626}, address = {Bologna}, publisher = {P\atron Editore}, optnote = {Torino, Italy, 19-24 June 2005}, year = 2005 }  @article{bbm-lrc-05, title = {Local {R}ademacher complexities}, author = {Peter L.~Bartlett and Olivier Bousquet and Shahar Mendelson}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bbm-lrc-05.ps}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bbm-lrc-05.pdf}, journal = {Annals of Statistics}, volume = {33}, number = {4}, pages = {1497--1537}, year = {2005}, abstract = {We propose new bounds on the error of learning algorithms in terms of a data-dependent notion of complexity. The estimates we establish give optimal rates and are based on a local and empirical version of Rademacher averages, in the sense that the Rademacher averages are computed from the data, on a subset of functions with small empirical error. 
We present some applications to prediction with bounded loss, and to regression with a convex loss function and a convex function class.} }  @article{bjm-ccrb-05, title = {Convexity, classification, and risk bounds}, author = {Peter L.\ Bartlett and Michael I.\ Jordan and Jon D.\ McAuliffe}, note = {(Was Department of Statistics, U.C.\ Berkeley Technical Report number 638, 2003)}, year = {2006}, journal = {Journal of the American Statistical Association}, volume = {101}, number = {473}, pages = {138-156}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.pdf}, abstract = {Many of the classification algorithms developed in the machine learning literature, including the support vector machine and boosting, can be viewed as minimum contrast methods that minimize a convex surrogate of the 0-1 loss function. The convexity makes these algorithms computationally efficient. The use of a surrogate, however, has statistical consequences that must be balanced against the computational virtues of convexity. To study these issues, we provide a general quantitative relationship between the risk as assessed using the 0-1 loss and the risk as assessed using any nonnegative surrogate loss function. We show that this relationship gives nontrivial upper bounds on excess risk under the weakest possible condition on the loss function: that it satisfy a pointwise form of Fisher consistency for classification. The relationship is based on a simple variational transformation of the loss function that is easy to compute in many applications. We also present a refined version of this result in the case of low noise. 
Finally, we present applications of our results to the estimation of convergence rates in the general setting of function classes that are scaled convex hulls of a finite-dimensional base class, with a variety of commonly used loss functions.} }  @article{bm-em-05, author = {Peter L.~Bartlett and Shahar Mendelson}, title = {Empirical minimization}, journal = {Probability Theory and Related Fields}, year = {2006}, volume = 135, number = 3, pages = {311--334}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.pdf}, abstract = {We investigate the behavior of the empirical minimization algorithm using various methods. We first analyze it by comparing the empirical, random, structure and the original one on the class, either in an additive sense, via the uniform law of large numbers, or in a multiplicative sense, using isomorphic coordinate projections. We then show that a direct analysis of the empirical minimization algorithm yields a significantly better bound, and that the estimates we obtain are essentially sharp. The method of proof we use is based on Talagrand's concentration inequality for empirical processes.} }  @techreport{bw-crouhl-06, author = {Peter L. Bartlett and Marten H. Wegkamp}, title = {Classification with a Reject Option using a Hinge Loss}, year = {2006}, institution = {U.C.\ Berkeley}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.pdf}, abstract = {We consider the problem of binary classification where the classifier can, for a particular cost, choose not to classify an observation. Just as in the conventional classification problem, minimization of the sample average of the cost is a difficult optimization problem. As an alternative, we propose the optimization of a certain convex loss function f, analogous to the hinge loss used in support vector machines (SVMs). 
Its convexity ensures that the sample average of this surrogate loss can be efficiently minimized. We study its statistical properties. We show that minimizing the expected surrogate loss---the f-risk---also minimizes the risk. We also study the rate at which the f-risk approaches its minimum value. We show that fast rates are possible when the conditional probability Pr(Y=1|X) is unlikely to be close to certain critical values.} }  @techreport{bt-aic-06a, author = {Peter L.~Bartlett and Mikhail Traskin}, title = {AdaBoost is Consistent}, institution = {U.~C.~Berkeley}, year = {2006} }  @article{bjm-c-06, author = {Peter L.\ Bartlett and Michael I.\ Jordan and Jon D.\ McAuliffe}, title = {Comment}, journal = {Statistical Science}, year = {2006}, volume = {21}, number = {3}, pages = {341--346} }  @inproceedings{bt-aolmc-06, author = {Peter L.~Bartlett and Mikhail Traskin}, title = {AdaBoost and other Large Margin Classifiers: Convexity in Pattern Classification}, booktitle = {Proceedings of the 5th Workshop on Defence Applications of Signal Processing}, year = {2006} }  @article{bm-d-06, author = {Peter L.\ Bartlett and Shahar Mendelson}, title = {Discussion of 2004 {IMS} {M}edallion {L}ecture: {L}ocal {R}ademacher complexities and oracle inequalities in risk minimization'' by {V}.\ {K}oltchinskii}, journal = {The Annals of Statistics}, year = {2006}, volume = {34}, number = {6}, pages = {2657--2663} }  @techreport{bmp-osbeeem-05, author = {Peter L. Bartlett and Shahar Mendelson and Petra Philips}, title = {Optimal sample-based estimates of the expectation of the empirical minimizer}, year = {2007}, institution = {U.C.\ Berkeley}, ps = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.ps.gz}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.pdf}, abstract = {We study sample-based estimates of the expectation of the function produced by the empirical minimization algorithm. 
We investigate the extent to which one can estimate the rate of convergence of the empirical minimizer in a data dependent manner. We establish three main results. First, we provide an algorithm that upper bounds the expectation of the empirical minimizer in a completely data-dependent manner. This bound is based on a structural result in http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf, which relates expectations to sample averages. Second, we show that these structural upper bounds can be loose. In particular, we demonstrate a class for which the expectation of the empirical minimizer decreases as$O(1/n)$for sample size$n$, although the upper bound based on structural properties is$\Omega(1)$. Third, we show that this looseness of the bound is inevitable: we present an example that shows that a sharp bound cannot be universally recovered from empirical data.} }  @techreport{b-freeoims-06, author = {Peter L. Bartlett}, title = {Fast rates for estimation error and oracle inequalities for model selection}, year = {2007}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-06.pdf}, institution = {Department of Statistics, U.C.\ Berkeley}, number = {729}, abstract = {We consider complexity penalization methods for model selection. These methods aim to choose a model to optimally trade off estimation and approximation errors by minimizing the sum of an empirical risk term and a complexity penalty. It is well known that if we use a bound on the maximal deviation between empirical and true risks as a complexity penalty, then the risk of our choice is no more than the approximation error plus twice the complexity penalty. There are many cases, however, where complexity penalties like this give loose upper bounds on the estimation error. 
In particular, if we choose a function from a suitably simple convex function class with a strictly convex loss function, then the estimation error (the difference between the risk of the empirical risk minimizer and the minimal risk in the class) approaches zero at a faster rate than the maximal deviation between empirical and true risks. In this note, we address the question of whether it is possible to design a complexity penalized model selection method for these situations. We show that, provided the sequence of models is ordered by inclusion, in these cases we can use tight upper bounds on estimation error as a complexity penalty. Surprisingly, this is the case even in situations when the difference between the empirical risk and true risk (and indeed the error of any estimate of the approximation error) decreases much more slowly than the complexity penalty. We give an oracle inequality showing that the resulting model selection method chooses a function with risk no more than the approximation error plus a constant times the complexity penalty.} }  @inproceedings{bt-scpskd-06, author = {Peter L.~Bartlett and Ambuj Tewari}, title = {Sample complexity of policy search with known dynamics}, booktitle = {Advances in Neural Information Processing Systems 19}, year = 2007, editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman}, publisher = {MIT Press}, address = {Cambridge, MA}, pages = {97--104}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-scpskd-06.pdf} }  @inproceedings{rbr-soimbtmerb-06, author = {Benjamin I. P. Rubinstein and Peter L.~Bartlett and J. Hyam Rubinstein}, title = {Shifting, one-inclusion mistake bounds and tight multiclass expected risk bounds}, booktitle = {Advances in Neural Information Processing Systems 19}, editor = {B. Sch\"{o}lkopf and J. Platt and T. 
Hoffman}, publisher = {MIT Press}, address = {Cambridge, MA}, pages = {1193--1200}, year = {2007}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/rbr-soimbtmerb-06.pdf} }  @inproceedings{rb-rcckc-06, author = {David Rosenberg and Peter L.~Bartlett}, title = {The {R}ademacher Complexity of Co-Regularized Kernel Classes}, abstract = {In the multi-view approach to semi-supervised learning, we choose one predictor from each of multiple hypothesis classes, and we co-regularize' our choices by penalizing disagreement among the predictors on the unlabeled data. In this paper we examine the co-regularization method used in the recently proposed co-regularized least squares (CoRLS) algorithm. In this method we have two hypothesis classes, each a reproducing kernel Hilbert space (RKHS), and we co-regularize by penalizing the average squared difference in predictions on the unlabeled data. We get our final predictor by taking the pointwise average of the predictors from each view. We call the set of predictors that can result from this procedure the co-regularized hypothesis class. The main result of this paper is a tight bound on the Rademacher complexity of the co-regularized hypothesis class in terms of the kernel matrices of each RKHS. We find that the co-regularization reduces the Rademacher complexity of the hypothesis class by an amount depending on how different the two views are, measured by a data dependent metric. We then use standard techniques to bound the gap between training error and test error for the CoRLS algorithm. 
Experimentally, we find that the amount of reduction in complexity introduced by co-regularization correlates with the amount of improvement that co-regularization gives in the CoRLS algorithm}, booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics}, editor = {Marina Meila and Xiaotong Shen}, seriestitle = {JMLR Workshop and Conference Proceedings}, volume = {2}, pages = {396--403}, year = 2007, pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v2/rosenberg07a/rosenberg07a.pdf} }  @inproceedings{bt-aic-06, author = {Peter L.~Bartlett and Mikhail Traskin}, title = {AdaBoost is Consistent}, booktitle = {Advances in Neural Information Processing Systems 19}, editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman}, publisher = {MIT Press}, address = {Cambridge, MA}, pages = {105--112}, year = {2007}, pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-aic-06.pdf}, abstract = {The risk, or probability of error, of the classifier produced by the AdaBoost algorithm is investigated. In particular, we consider the stopping strategy to be used in AdaBoost to achieve universal consistency. We show that provided AdaBoost is stopped after n^(1-a)$ iterations---for sample size n and
a>0---the sequence of risks of the classifiers it
produces approaches the Bayes risk.}
}

@inproceedings{tb-bpmdparc-07,
  author    = {Ambuj Tewari and Peter L.~Bartlett},
  title     = {Bounded parameter {M}arkov decision processes with average reward criterion},
  booktitle = {Proceedings of the Conference on Learning Theory},
  pages     = {263--277},
  year      = {2007},
}

@inproceedings{abr-mlea-07,
author = {Jacob Abernethy and Peter L.~Bartlett and Alexander Rakhlin},
title = {Multitask Learning with Expert Advice},
booktitle = {Proceedings of the Conference on Learning Theory},
year = {2007},
pages = {484--498},
abstract = {We consider the problem of prediction with expert advice
in the setting where a forecaster is presented with several
related tasks, and thus we expect that a few experts will perform
well on the entire set of tasks. That is, our forecaster would
like, on each task, to compete against the best expert
chosen from a small set of experts. While we describe
the `ideal' algorithm and its performance bound, we show
that the computation required for this algorithm is as hard as
computation of a matrix permanent. We present an efficient
algorithm based on mixing priors, and prove a bound that is
nearly as good for the sequential task presentation case. We
also consider a harder case where the task may change
arbitrarily from round to round, and we develop an efficient
randomized algorithm based on Markov chain Monte Carlo
techniques.}
}

@inproceedings{arb--07,
author = {Alexander Rakhlin and Jacob Abernethy and
Peter L.~Bartlett},
title = {Online discovery of similarity mappings},
booktitle = {Proceedings of the 24th International Conference on Machine
Learning (ICML-2007)},
year = {2007},
pages = {767--774},
abstract = {We consider the problem of choosing,
sequentially, a map which assigns elements of a set A
to a few elements of a set B.  On each round, the
algorithm suffers some cost associated with the chosen
assignment, and the goal is to minimize the cumulative loss
of these choices relative to the best map on the entire
sequence. Even though the offline problem of finding the best
map is provably hard, we show that there is an equivalent
online approximation algorithm, Randomized Map Prediction
(RMP), that is efficient and performs nearly as well. While
drawing upon results from the `Online Prediction with
Expert Advice' setting, we show how RMP can be utilized as
an online approach to several standard batch problems. We
apply RMP to online clustering as well as online feature
selection and, surprisingly, RMP often outperforms the
standard batch algorithms on these problems.}
}

@article{bt-aic-07,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost is Consistent},
journal = {Journal of Machine Learning Research},
volume = {8},
pages = {2347--2368},
year = {2007},
url = {http://jmlr.csail.mit.edu/papers/volume8/bartlett07b/bartlett07b.pdf},
pdf = {http://jmlr.csail.mit.edu/papers/volume8/bartlett07b/bartlett07b.pdf},
abstract = {The risk, or probability of error, of
the classifier produced by the AdaBoost algorithm is
investigated. In particular, we consider the stopping
strategy to be used in AdaBoost to achieve universal
consistency. We show that provided AdaBoost is stopped
after n^(1-a) iterations---for sample size n and
0 < a < 1---the sequence of risks of the classifiers it
produces approaches the Bayes risk.}
}

@techreport{rbr-soimbtmerb-07,
author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
J.~Hyam Rubinstein},
title = {Shifting: one-inclusion mistake bounds and sample
compression},
institution = {EECS Department, University of California, Berkeley},
number = {UCB/EECS-2007-86},
pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf},
year = {2007},
abstract = {We present new expected risk bounds for binary and
multiclass prediction, and resolve several recent conjectures
on sample compressibility due to Kuzmin and Warmuth. By
exploiting the combinatorial structure of concept class F,
Haussler et al. achieved a VC(F)/n bound for the natural one-inclusion
prediction strategy. The key step in their proof is a d=VC(F)
bound on the graph density of a subgraph of the
hypercube - one-inclusion graph. The first main result of
this report is a density bound of
n choose(n-1,<=d-1)/choose(n,<=d) < d, which positively resolves
a conjecture of Kuzmin and Warmuth relating to their unlabeled
Peeling compression scheme and also leads to an improved
one-inclusion mistake bound. The proof uses a new form of
VC-invariant shifting and a group-theoretic symmetrization.
Our second main result is an algebraic topological property of
maximum classes of VC-dimension d as being d-contractible
simplicial complexes, extending the well-known
characterization that d=1 maximum classes are trees. We
negatively resolve a minimum degree conjecture of Kuzmin and
Warmuth - the second part to a conjectured proof of
correctness for Peeling - that every class has one-inclusion
minimum degree at most its VC-dimension. Our final main result
is a k-class analogue of the d/n mistake bound, replacing the
VC-dimension by the Pollard pseudo-dimension and the
one-inclusion strategy by its natural hypergraph
generalization. This result improves on known PAC-based
expected risk bounds by a factor of O(log n) and is shown to
be optimal up to a O(log k) factor. The combinatorial
technique of shifting takes a central role in understanding
the one-inclusion (hyper)graph and is a running theme
throughout.}
}

@techreport{rb-bbspngp-07,
author = {David Rosenberg and Peter L.~Bartlett},
title = {On bounds for {B}ayesian sequence prediction with
non-{G}aussian priors},
institution = {U.C.\ Berkeley},
year = {2007},
abstract = {We present worst-case log-loss regret bounds for the
Bayesian model averaging algorithm in the regression setting.
Our work generalizes earlier work that gives bounds of a
similar form that only hold for Gaussian priors. Our bounds
hold for arbitrary priors, though the regret term now includes
a term involving the modulus of continuity of the prior, as
well as the value of the prior at the point of comparison.}
}

@techreport{abrt-mlbocg-07,
author = {Jacob Abernethy and Peter L.~Bartlett and Alexander
Rakhlin and Ambuj Tewari},
title = {Minimax Lower Bounds for Online Convex Games},
institution = {U.C.\ Berkeley},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrt-mlbocg-07.pdf},
abstract = {A number of learning problems can be cast as an Online
Convex Game: on each round, a learner makes a prediction $x$
from a convex set, the environment plays a loss function $f$,
and the learner's long-term goal is to minimize regret. Algorithms
have been proposed by Zinkevich, when $f$ is assumed to be
convex, and Hazan et al, when $f$ is assumed to be strongly
convex, that have provably low regret. We consider these two
settings and analyze such games from a minimax perspective,
proving lower bounds in each case.  These results prove that the
existing algorithms are essentially optimal.}
}

@techreport{abr-mlea-07a,
author = {Jacob Duncan Abernethy and Peter L.~Bartlett
and Alexander Rakhlin},
title = {Multitask Learning with Expert Advice},
institution = {EECS Department, University of California, Berkeley},
year = {2007},
url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-20.html},
number = {UCB/EECS-2007-20}
}

@techreport{cgkcb-egacrfmmmn-07,
author = {Michael Collins and Amir Globerson and Terry Koo and Xavier
Carreras and Peter L.~Bartlett},
title = {Exponentiated gradient algorithms for conditional random
fields and max-margin {M}arkov networks},
institution = {U.C.\ Berkeley},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/cgkcb-egacrfmmmn-07.pdf},
abstract = {Log-linear and maximum-margin models are two
commonly used methods in supervised machine learning, and
are frequently used in structured prediction problems.
Efficient learning of parameters in these models is
therefore an important problem, and becomes a key factor
when learning from very large data sets.  This paper
describes exponentiated gradient (EG) algorithms for
training such models, where EG updates are applied to
the convex dual of either the log-linear or max-margin
objective function; the dual in both the log-linear
and max-margin cases corresponds to minimizing a convex
function with simplex constraints.  We study both batch
and online variants of the algorithm, and provide rates
of convergence for both cases.  In the max-margin case,
$O({1\over\epsilon})$ EG updates are required to reach a
given accuracy $\epsilon$ in the dual; in contrast, for
log-linear models only $O(\log({ 1\over\epsilon}))$ updates
are required. For both the max-margin and log-linear cases,
our bounds suggest that the online algorithm requires
a factor of $n$ less computation to reach a desired
accuracy, where $n$ is the number of training examples. Our
experiments confirm that the online algorithms are much
faster than the batch algorithms in practice.  We describe
how the EG updates factor in a convenient way for
structured prediction problems, allowing the algorithms
to be efficiently applied to problems such as sequence
learning or natural language parsing.  We perform extensive
evaluation of the algorithms, comparing them to L-BFGS
and stochastic gradient descent for log-linear models,
and to SVM-Struct for max-margin models. The algorithms are
applied to multi-class problems as well as a more complex
large-scale parsing task. In all these settings, the EG
algorithms presented here outperform the other methods.}
}

@article{tewari07consistency,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {On the Consistency of Multiclass Classification Methods},
journal = {Journal of Machine Learning Research},
year = {2007},
volume = {8},
month = may,
pages = {1007--1025},
note = {(Invited paper)},
url = {http://jmlr.csail.mit.edu/papers/v8/tewari07a.html}
}

@article{bartlett07sparseness,
author = {Peter L. Bartlett and Ambuj Tewari},
title = {Sparseness vs Estimating Conditional Probabilities: Some
Asymptotic Results},
journal = {Journal of Machine Learning Research},
year = {2007},
volume = {8},
month = apr,
pages = {775--790},
url = {http://jmlr.csail.mit.edu/papers/v8/bartlett07a.html}
}

@article{b-freeoims-07,
author = {Peter L. Bartlett},
title = {Fast rates for estimation error and oracle
inequalities for model selection},
year = {2008},
month = apr,
journal = {Econometric Theory},
volume = {24},
number = {2},
pages = {545--552},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-07.pdf},
note = {(Was Department of Statistics,
U.C.\ Berkeley Technical Report number 729, 2007)},
doi = {10.1017/S0266466608080225},
abstract = {We consider complexity penalization methods for model
selection.  These methods aim to choose a model to
optimally trade off estimation and approximation errors
by minimizing the sum of an empirical risk term and a
complexity penalty. It is well known that if we use a
bound on the maximal deviation between empirical and
true risks as a complexity penalty,
then the risk of our choice is no more
than the approximation error plus twice the complexity
penalty. There are many cases, however, where complexity
penalties like this give loose upper bounds on the
estimation error. In particular, if we choose a function
from a suitably simple convex function class with a
strictly convex loss function, then the estimation error
(the difference between the risk of the empirical risk
minimizer and the minimal risk in the class) approaches zero at a
faster rate than the maximal deviation between empirical
and true risks. In this note, we address the question of
whether it is possible to design a complexity penalized
model selection method for these situations. We show that,
provided the sequence of models is ordered by inclusion, in
these cases we can use tight upper bounds on estimation
error as a complexity penalty. Surprisingly, this is the
case even in situations when the difference between the
empirical risk and true risk (and indeed the error of
any estimate of the approximation error) decreases
much more slowly than the
complexity penalty.  We give an oracle inequality
showing that the resulting model selection method chooses
a function with risk no more than the approximation error
plus a constant times the complexity penalty.}
}

@article{bw-crouhl-08,
author = {Peter L. Bartlett and Marten H. Wegkamp},
title = {Classification with a Reject Option using a Hinge
Loss},
journal = {Journal of Machine Learning Research},
year = {2008},
month = aug,
volume = {9},
pages = {1823--1840},
pdf = {http://www.jmlr.org/papers/volume9/bartlett08a/bartlett08a.pdf},
abstract = {We consider the problem of binary classification where the
classifier can, for
a particular cost, choose not to classify an observation. Just as in
the
conventional classification problem, minimization of the sample
average of
the cost is a difficult optimization problem. As an alternative, we
propose
the optimization of a certain convex loss function f, analogous
to the
hinge loss used in support vector machines (SVMs). Its convexity
ensures that
the sample average of this surrogate loss can be efficiently
minimized. We
study its statistical properties. We show that minimizing the expected
surrogate loss---the f-risk---also minimizes the risk.  We also
study
the rate at which the f-risk approaches its minimum value. We
show that
fast rates are possible when the conditional probability Pr(Y=1|X)
is
unlikely to be close to certain critical values.}
}

@inproceedings{bhr-aogd-07,
author = {Peter L.~Bartlett and Elad Hazan and Alexander Rakhlin},
title = {Adaptive Online Gradient Descent},
booktitle = {Advances in Neural Information Processing
Systems 20},
editor = {John Platt and Daphne Koller and Yoram Singer and Sam Roweis},
publisher = {MIT Press},
year = {2008},
month = sep,
pages = {65--72},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bhr-aogd-07.pdf},
abstract = {We study the rates of growth of the regret in online
convex optimization. First, we show that a simple extension of
the algorithm of Hazan et al eliminates the need for a priori
knowledge of the lower bound on the second derivatives of the
observed functions. We then provide an algorithm, Adaptive
Online Gradient Descent, which interpolates between the
results of Zinkevich for linear functions and of Hazan et al
for strongly convex functions, achieving intermediate rates
between $\sqrt{T}$ and $\log T$. Furthermore, we show strong
optimality of the algorithm. Finally, we provide an extension
of our results to general norms.}
}

@inproceedings{tb-olplrim-07,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {Optimistic linear programming gives logarithmic regret for
irreducible {MDPs}},
booktitle = {Advances in Neural Information Processing
Systems 20},
editor = {John Platt and Daphne Koller and Yoram Singer and Sam Roweis},
publisher = {MIT Press},
year = {2008},
month = sep,
pages = {1505--1512},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/tb-olplrim-07.pdf},
abstract = {We present an algorithm called Optimistic Linear
Programming (OLP) for learning to optimize average reward in
an irreducible but otherwise unknown Markov decision process
(MDP). OLP uses its experience so far to estimate the MDP. It
chooses actions by optimistically maximizing estimated future
rewards over a set of next-state transition probabilities that
are close to the estimates: a computation that corresponds to
solving linear programs. We show that the total expected
reward obtained by OLP up to time $T$ is within $C(P)\log T$
of the reward obtained by the optimal policy, where $C(P)$ is
an explicit, MDP-dependent constant. OLP is closely related to
an algorithm proposed by Burnetas and Katehakis with four key
differences: OLP is simpler, it does not require knowledge of
the supports of transition probabilities and the proof of the
regret bound is simpler, but our regret bound is a constant
factor larger than the regret of their algorithm. OLP is also
similar in flavor to an algorithm recently proposed by Auer
and Ortner. But OLP is simpler and its regret bound has a
better dependence on the size of the MDP.}
}

@article{cgkcb-egacrfmmmn-08,
author = {Michael Collins and Amir Globerson and Terry Koo and Xavier
Carreras and Peter L.~Bartlett},
title = {Exponentiated gradient algorithms for conditional random
fields and max-margin {M}arkov networks},
journal = {Journal of Machine Learning Research},
year = {2008},
month = aug,
volume = {9},
pages = {1775--1822},
pdf = {http://www.jmlr.org/papers/volume9/collins08a/collins08a.pdf},
abstract = {Log-linear and maximum-margin models are two
commonly used methods in supervised machine learning, and
are frequently used in structured prediction problems.
Efficient learning of parameters in these models is
therefore an important problem, and becomes a key factor
when learning from very large data sets.  This paper
describes exponentiated gradient (EG) algorithms for
training such models, where EG updates are applied to
the convex dual of either the log-linear or max-margin
objective function; the dual in both the log-linear
and max-margin cases corresponds to minimizing a convex
function with simplex constraints.  We study both batch
and online variants of the algorithm, and provide rates
of convergence for both cases.  In the max-margin case,
$O({1\over\epsilon})$ EG updates are required to reach a
given accuracy $\epsilon$ in the dual; in contrast, for
log-linear models only $O(\log({ 1\over\epsilon}))$ updates
are required. For both the max-margin and log-linear cases,
our bounds suggest that the online algorithm requires
a factor of $n$ less computation to reach a desired
accuracy, where $n$ is the number of training examples. Our
experiments confirm that the online algorithms are much
faster than the batch algorithms in practice.  We describe
how the EG updates factor in a convenient way for
structured prediction problems, allowing the algorithms
to be efficiently applied to problems such as sequence
learning or natural language parsing.  We perform extensive
evaluation of the algorithms, comparing them to to L-BFGS
and stochastic gradient descent for log-linear models,
and to SVM-Struct for max-margin models. The algorithms are
applied to multi-class problems as well as a more complex
large-scale parsing task. In all these settings, the EG
algorithms presented here outperform the other methods.}
}

@inproceedings{bdhkrt-hprbbolo-08,
title = {High-Probability Regret Bounds for Bandit Online Linear
Optimization},
author = {Peter L.~Bartlett and Varsha Dani and Thomas Hayes and
Sham Kakade and Alexander Rakhlin and Ambuj Tewari},
booktitle = {Proceedings of the 21st Annual Conference on Learning
Theory (COLT 2008)},
year = {2008},
month = dec,
pages = {335--342},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bdhkrt-hprbbolo-08.pdf}
}

@inproceedings{abrt-osmlbocg-08,
title = {Optimal strategies and minimax lower bounds for online convex
games},
author = {Jacob Abernethy and Peter L. Bartlett and
Alexander Rakhlin and Ambuj Tewari},
booktitle = {Proceedings of the 21st Annual Conference on Learning
Theory (COLT 2008)},
year = {2008},
month = dec,
pages = {415--423},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrt-osmlbocg-08.pdf}
}

@article{lbw-c-08,
title = {Correction to {\em The Importance of Convexity in Learning
with Squared Loss}},
author = {Wee Sun Lee and Peter L.~Bartlett and
Robert C.~Williamson},
journal = {IEEE Transactions on Information Theory},
year = {2008},
month = sep,
pages = {4395},
volume = {54},
number = {9},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/lbw-c-08.pdf}
}

@inproceedings{nabh-amlfdms-08,
title = {Application of Machine Learning in Fault Diagnostics of
Mechanical Systems},
author = {Massieh Najafi and David M. Auslander and
Peter L. Bartlett and Philip Haves},
booktitle = {Proceedings of the World Congress on Engineering and
Computer Science 2008:
International Conference on Modeling, Simulation and Control 2008},
pages = {957--962},
year = {2008},
month = oct,
pdf = {http://www.iaeng.org/publication/WCECS2008/WCECS2008_pp957-962.pdf}
}

@inproceedings{nabh-fdst-08,
title = {Fault Diagnostics and Supervised Testing: How Fault
Diagnostic tools can be Proactive?},
author = {Massieh Najafi and David M. Auslander and Peter L. Bartlett
and Philip Haves},
year = {2008},
month = sep,
booktitle = {Proceedings of Intelligent Systems and Control (ISC
2008)},
pages = {633-034},
}

@inproceedings{nabh-ocdpdsna-08,
title = {Overcoming the Complexity of Diagnostic Problems due to
Sensor Network Architecture},
author = {Massieh Najafi and David M. Auslander and Peter L. Bartlett
and Philip Haves},
year = {2008},
month = sep,
booktitle = {Proceedings of Intelligent Systems and Control (ISC
2008)},
pages = {633-071},
}

@techreport{arb-mrtoml-08,
title = {Matrix regularization techniques for online multitask
learning},
author = {Alekh Agarwal and Alexander Rakhlin and Peter Bartlett},
institution = {EECS Department,
University of California, Berkeley},
number = {UCB/EECS-2008-138},
year = {2008},
pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-138.pdf},
abstract = {In this paper we examine the problem of prediction with
expert advice in a setup where the learner
is presented with a sequence of examples coming from different tasks.
In order for the learner to be able
to benefit from performing multiple tasks simultaneously, we make
constraining the comparator to use a lesser number of best experts
than the number of tasks. We show
how this corresponds naturally to learning under spectral or
structural matrix constraints, and propose
regularization techniques to enforce the constraints. The
regularization techniques proposed here are
interesting in their own right and multitask learning is just one
application for the ideas. A theoretical
analysis of one such regularizer is performed, and a regret bound that
shows benefits of this setup is reported.}
}

@inproceedings{bbcjnrst-opsl-08,
title = {Open problems in the security of learning},
author = {Marco Barreno and Peter L.~Bartlett and F.~J.~Chi
and Anthony D.~Joseph and Blaine Nelson
and Benjamin I.~P.~Rubinstein and U.~Saini and J.~Doug Tygar},
booktitle = {Proceedings of the 1st ACM Workshop on AISec
(AISec2008)},
pages = {19--26},
year = {2008},
doi = {http://dx.doi.org/10.1145/1456377.1456382},
month = oct
}

@article{rbr-soimbtmerb-08,
author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
J.~Hyam Rubinstein},
title = {Shifting: one-inclusion mistake bounds and sample
compression},
journal = {Journal of Computer and System Sciences},
volume = {75},
number = {1},
pages = {37--59},
note = {(Was University of California, Berkeley, EECS Department
Technical Report EECS-2007-86)},
pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf},
month = jan,
year = {2009}
}

@inproceedings{bt-rarbarlwcm-09,
title = {{REGAL}: A Regularization based Algorithm for
Reinforcement Learning in Weakly Communicating {MDPs}},
author = {Peter L.~Bartlett and Ambuj Tewari},
booktitle = {Proceedings of the 25th Conference on
Uncertainty in Artificial Intelligence (UAI2009)},
year = {2009},
month = jun,
pages = {35--42},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-rarbarllwcm-09.pdf}
}

@inproceedings{aabr-svormd-09,
title = {A stochastic view of optimal regret through minimax duality},
author = {Jacob Abernethy and Alekh Agarwal and
Peter L.~Bartlett and Alexander Rakhlin},
booktitle = {Proceedings of the 22nd Annual Conference on
Learning Theory -- COLT 2009},
year = {2009},
month = jun,
pages = {257--266},
abstract = {We study the regret of optimal strategies for online
convex
optimization games.  Using von Neumann's minimax theorem, we
show that the optimal regret in this adversarial setting is
closely related to the behavior of the empirical
minimization algorithm in a stochastic process setting:
it is equal to the maximum, over joint distributions of the
adversary's action sequence, of the difference between a sum
of minimal expected losses and the minimal empirical loss.
We show that the optimal regret has a natural geometric
interpretation, since it can be viewed as the gap in
Jensen's inequality for a concave functional---the minimizer
over the player's actions of expected loss---defined on a
set of probability distributions.  We use this expression to
obtain upper and lower bounds on the regret of an optimal
strategy for a variety of online learning problems. Our
method provides upper bounds without the need to construct a
learning algorithm; the lower bounds provide explicit
optimal strategies for the adversary.},
pdf = {http://www.cs.mcgill.ca/~colt2009/papers/026.pdf}
}

@techreport{aabr-svormd-09a,
title = {A stochastic view of optimal regret through minimax duality},
author = {Jacob Abernethy and Alekh Agarwal and
Peter L.~Bartlett and Alexander Rakhlin},
institution = {arxiv.org},
number = {0903.5328},
year = {2009},
url = {http://arxiv.org/abs/0903.5328},
abstract = {We study the regret of optimal strategies for online
convex optimization games. Using von Neumann's minimax theorem, we
show that the optimal regret in this adversarial setting is closely
related to the behavior of the empirical minimization algorithm in a
stochastic process setting: it is equal to the maximum, over joint
distributions of the adversary's action sequence, of the difference
between a sum of minimal expected losses and the minimal empirical
loss. We show that the optimal regret has a natural geometric
interpretation, since it can be viewed as the gap in Jensen's
inequality for a concave functional--the minimizer over the player's
actions of expected loss--defined on a set of probability
distributions. We use this expression to obtain upper and lower bounds
on the regret of an optimal strategy for a variety of online learning
problems. Our method provides upper bounds without the need to
construct a learning algorithm; the lower bounds provide explicit
optimal strategies for the adversary.}
}

@article{rsbn-sslfbmvpcr-09,
title = {Multiview point cloud kernels for semisupervised learning},
volume = {26},
number = {5},
month = sep,
year = {2009},
pages = {145--150},
author = {David S.~Rosenberg and Vikas Sindhwani and
Peter L.~Bartlett and Partha Niyogi},
doi = {http://dx.doi.org/10.1109/MSP.2009.933383},
journal = {IEEE Signal Processing Magazine}
}

@inproceedings{abrw-itlbocco-09,
title = {Information-theoretic lower bounds on the oracle complexity
of convex optimization},
author = {Alekh Agarwal and Peter L.~Bartlett
and Pradeep Ravikumar and Martin Wainwright},
booktitle = {Advances in Neural Information Processing Systems 22},
editor = {Y. Bengio and D. Schuurmans and J. Lafferty and C. K. I.
Williams and A. Culotta},
year = {2009},
month = jun,
pages = {1--9},
pdf = {http://research.microsoft.com/en-us/um/people/alekha/1005_paper.pdf},
abstract = {Despite the large amount of literature on upper bounds on
complexity
of convex analysis, surprisingly little is known about the
fundamental hardness of these problems. The extensive use of convex
optimization in machine learning and statistics makes such an
understanding very critical to understand fundamental computational
limits of learning and estimation. In this paper, we study the
complexity of stochastic convex optimization in an oracle model of
computation. We improve upon known results and obtain tight minimax
complexity estimates for some function classes. We also discuss
implications of these results to learning and estimation.}
}

@techreport{brsmsb-lbars-09,
title = {A learning-based approach to reactive security},
author = {A.~Barth and Benjamin I.~P.~Rubinstein
and M.~Sundararajan and J.~C.~Mitchell
and Dawn Song and Peter L.~Bartlett},
institution = {arxiv.org},
number = {0912.1155},
url = {http://arxiv.org/abs/0912.1155},
abstract = {Despite the conventional wisdom that proactive security is
superior to reactive security, we show that reactive security can be
competitive with proactive security as long as the reactive defender
learns from past attacks instead of myopically overreacting to the
last attack. Our game-theoretic model follows common practice in the
security literature by making worst-case assumptions about the
attacker: we grant the attacker complete knowledge of the defender's
strategy and do not require the attacker to act rationally. In this
model, we bound the competitive ratio between a reactive defense
algorithm (which is inspired by online learning theory) and the best
fixed proactive defense. Additionally, we show that, unlike proactive
defenses, this reactive strategy is robust to a lack of information
about the attacker's incentives and knowledge.},
year = {2009}
}

@techreport{krb-uvmkl-10a,
title = {A Unifying View of Multiple Kernel Learning},
author = {Marius Kloft and Ulrich R\"uckert and Peter L.~Bartlett},
year = {2010},
institution = {arxiv.org},
number = {1005.0437},
url = {http://arxiv.org/abs/1005.0437},
abstract = {Recent research on multiple kernel learning has lead to a
number of approaches for combining kernels in regularized risk
minimization. The proposed approaches include different formulations
of objectives and varying regularization strategies. In this paper we
present a unifying general optimization criterion for multiple kernel
learning and show how existing formulations are subsumed as special
cases. We also derive the criterion's dual representation, which is
suitable for general smooth optimization algorithms. Finally, we
evaluate multiple kernel learning in this framework analytically using
a Rademacher complexity bound on the generalization error and
empirically in a set of experiments.}
}

@techreport{abh-banrle-10,
title = {Blackwell Approachability and No-Regret Learning are Equivalent},
author = {Jacob Abernethy and Peter L.~Bartlett and
Elad Hazan},
institution = {arxiv.org},
number = {1011.1936},
url = {http://arxiv.org/abs/1011.1936},
year = {2010},
abstract = {We consider the celebrated Blackwell Approachability
Theorem for two-player games with vector payoffs. We show that
Blackwell's result is equivalent, via efficient reductions, to the
existence of 'no-regret' algorithms for Online Linear Optimization.
Indeed, we show that any algorithm for one such problem can be
efficiently converted into an algorithm for the other. We provide a
useful application of this reduction: the first efficient algorithm
for calibrated forecasting.}
}

@techreport{rbht-llfs-09tr,
title = {Learning in a large function space: Privacy preserving
mechanisms for {SVM} learning},
author = {Benjamin I.~P.~Rubinstein and Peter~L.~Bartlett
and Ling Huang and Nina Taft},
institution = {arxiv.org},
number = {0911.5708},
url = {http://arxiv.org/abs/0911.5708},
year = {2009},
abstract = {Several recent studies in privacy-preserving learning have
considered the trade-off between utility or risk and the level of
differential privacy guaranteed by mechanisms for statistical query
processing. In this paper we study this trade-off in private Support
Vector Machine (SVM) learning. We present two efficient mechanisms,
one for the case of finite-dimensional feature mappings and one for
potentially infinite-dimensional feature mappings with
translation-invariant kernels. For the case of translation-invariant
kernels, the proposed mechanism minimizes regularized empirical risk
in a random Reproducing Kernel Hilbert Space whose kernel uniformly
approximates the desired kernel with high probability. This technique,
borrowed from large-scale learning, allows the mechanism to respond
with a finite encoding of the classifier, even when the function class
is of infinite VC dimension. Differential privacy is established using
a proof technique from algorithmic stability. Utility--the mechanism's
response function is pointwise epsilon-close to non-private SVM with
probability 1-delta--is proven by appealing to the smoothness of
regularized empirical risk minimization with respect to small
perturbations to the feature mapping. We conclude with a lower bound
on the optimal differential privacy of the SVM. This negative result
states that for any delta, no mechanism can be simultaneously
(epsilon,delta)-useful and beta-differentially private for small
epsilon and small beta.}
}

@techreport{abd-oicams-12,
title = {Oracle inequalities for computationally adaptive model
selection},
author = {Alekh Agarwal and Peter L.~Bartlett and
John Duchi},
institution = {arxiv.org},
number = {1208.0129},
url = {http://arxiv.org/abs/1208.0129},
year = {2012},
abstract = {We analyze general model selection procedures using
penalized empirical loss minimization under computational constraints.
While classical model selection approaches do not consider
computational aspects of performing model selection, we argue that any
practical model selection procedure must not only trade off estimation
and approximation error, but also the computational effort required to
compute empirical minimizers for different function classes. We
provide a framework for analyzing such problems, and we give
algorithms for model selection under a computational budget. These
algorithms satisfy oracle inequalities that show that the risk of the
selected model is not much worse than if we had devoted all of our
computational budget to the optimal function class.}
}

@inproceedings{brsmsb-lbars-10,
title = {A learning-based approach to reactive security},
author = {A.~Barth and Benjamin I.~P.~Rubinstein
and M.~Sundararajan and J.~C.~Mitchell
and Dawn Song and Peter L.~Bartlett},
booktitle = {Proceedings of Financial Cryptography and Data Security (FC10)},
year = {2010},
month = jan,
pages = {192--206},
doi = {http://dx.doi.org/10.1007/978-3-642-14577-3_16}
}

@inproceedings{abd-oasdpp-10,
title = {Optimal Allocation Strategies for the Dark Pool Problem},
author = {Alekh Agarwal and Peter L.~Bartlett and Max Dama},
booktitle = {Proceedings of The Thirteenth
International Conference on Artificial Intelligence and Statistics
(AISTATS)},
editor = {Y.~W.~Teh and M.~Titterington},
year = {2010},
month = may,
seriestitle = {JMLR: W\&CP},
volume = {9},
pages = {9--16},
pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v9/agarwal10a/agarwal10a.pdf},
abstract = {We study the problem of allocating stocks to dark pools. We
propose and analyze an optimal approach for allocations, if
continuous-valued allocations are allowed. We also propose a
modification for the case when only integer-valued allocations are
possible. We extend the previous work on this problem to
adversarial scenarios, while
also improving on those results in the iid setup. The resulting
algorithms are efficient, and perform well in simulations.}
}

@article{bmp-osbeeem-08,
author = {Peter L. Bartlett and Shahar Mendelson and Petra
Philips},
title = {On the optimality of sample-based estimates of the
expectation of the empirical minimizer},
journal = {ESAIM: Probability and Statistics},
volume = {14},
pages = {315--337},
year = {2010},
month = jan,
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-08.pdf},
abstract = {We study sample-based estimates of the expectation of the
function produced by the empirical minimization algorithm.
We investigate the extent to which one can estimate
the rate of convergence of the empirical minimizer in a
data dependent manner. We establish three main results.
First, we provide an algorithm that upper bounds the
expectation of the empirical minimizer in a completely
data-dependent manner.  This bound is based on a structural
result in
http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf,
which relates expectations to
sample averages.  Second, we show that these structural
upper bounds can be loose. In particular, we demonstrate a
class for which the expectation of the empirical minimizer
decreases as $O(1/n)$ for sample size $n$, although the
upper bound based on structural properties is $\Omega(1)$.
Third, we show that this looseness of the bound is inevitable:
we present an example that shows that a sharp bound cannot
be universally recovered from empirical data.}
}

@article{b-laue-10,
title = {Learning to act in uncertain environments},
volume = {53},
number = {5},
year = {2010},
month = may,
pages = {98},
author = {Peter L.~Bartlett},
journal = {Communications of the ACM},
doi = {http://dx.doi.org/10.1145/1735223.1735246},
note = {(Invited one-page comment)}
}

@article{rbr-csoimbsc-10,
title = {Corrigendum to `Shifting: One-inclusion mistake bounds and
sample compression' {[J. Comput. System Sci 75 (1) (2009) 37-59]}},
volume = {76},
number = {3--4},
year = {2010},
month = may,
pages = {278--280},
author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
J.~Hyam Rubinstein},
journal = {Journal of Computer and System Sciences},
doi = {http://dx.doi.org/10.1016/j.jcss.2010.03.001}
}

@inproceedings{abbs-ramts-10,
title = {A Regularization Approach to Metrical Task Systems},
author = {Jacob Abernethy and Peter L.~Bartlett and Niv Buchbinder
and Isabelle Stanton},
year = {2010},
month = oct,
booktitle = {Algorithmic Learning Theory, 21st International
Conference, ALT 2010},
editor = {Marcus Hutter and Frank Stephan and Vladimir Vovk and Thomas
Zeugmann},
pages = {270--284},
doi = {http://dx.doi.org/10.1007/978-3-642-16108-7_23}
}

@inproceedings{b-oopae-10,
title = {Optimal Online Prediction in Adversarial
Environments},
author = {Peter L.~Bartlett},
year = {2010},
month = oct,
booktitle = {Algorithmic Learning Theory, 21st International
Conference, ALT 2010},
editor = {Marcus Hutter and Frank Stephan and Vladimir Vovk and Thomas
Zeugmann},
pages = {34},
doi = {http://dx.doi.org/10.1007/978-3-642-16184-1_26},
note = {(Plenary talk abstract)}
}

@inproceedings{krb-uvmkl-10,
title = {A Unifying View of Multiple Kernel Learning},
author = {Marius Kloft and Ulrich R\"uckert and Peter L.~Bartlett},
year = {2010},
month = sep,
booktitle = {Machine Learning and Knowledge Discovery in Databases,
European Conference, ECML PKDD},
editor = {Jos\'e L.~Balc\'azar and Francesco Bonchi and Aristides
Gionis and Mich{\`e}le Sebag},
note = {Part II, LNAI 6322},
pages = {66--81},
doi = {http://dx.doi.org/10.1007/978-3-642-15883-4_5}
}

@inproceedings{kb-iol-10,
title = {Implicit online learning},
author = {Brian Kulis and Peter L.~Bartlett},
booktitle = {Proceedings of the 27th International Conference on
Machine Learning (ICML-10)},
pdf = {http://www.cse.ohio-state.edu/~kulis/pubs/icml_implicit.pdf},
editor = {Johannes F{\"u}rnkranz and Thorsten Joachims},
year = {2010},
month = jun,
pages = {575--582}
}

@article{ab-mamssl-08,
title = {Margin-adaptive model selection in statistical learning},
author = {Sylvain Arlot and Peter L.~Bartlett},
journal = {Bernoulli},
volume = {17},
number = {2},
pages = {687--713},
year = {2011},
month = may,
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/ab-mamssl-08.pdf},
abstract = {}
}

@inproceedings{rab-lmf-11,
title = {Learning with Missing Features},
author = {Afshin Rostamizadeh and Alekh Agarwal and Peter L.~Bartlett},
booktitle = {Proceedings of the Conference on
Uncertainty in Artificial Intelligence (UAI2011)},
editor = {Avi Pfeffer and Fabio G.~Cozman},
year = {2011},
month = jul,
pages = {635--642},
abstract = {}
}

@inproceedings{abh-banrle-11,
title = {Blackwell Approachability and No-Regret Learning are Equivalent},
author = {Jacob Abernethy and Peter L.~Bartlett and
Elad Hazan},
booktitle = {Proceedings of the Conference on
Learning Theory (COLT2011)},
editor = {Sham Kakade and Ulrike von Luxburg},
year = {2011},
month = jul,
seriestitle = {JMLR: W\&CP},
volume = {19},
pdf = {http://colt2011.sztaki.hu/colt2011_submission_94.pdf},
pages = {27-46},
abstract = {}
}

@inproceedings{adbl-oicbms-11,
title = {Oracle inequalities for computationally budgeted model
selection},
author = {Alekh Agarwal and John Duchi and Peter L.~Bartlett and
Clement Levrard},
booktitle = {Proceedings of the Conference on
Learning Theory (COLT2011)},
editor = {Sham Kakade and Ulrike von Luxburg},
year = {2011},
month = jul,
seriestitle = {JMLR: W\&CP},
volume = {19},
pages = {69-86},
abstract = {We analyze general model selection procedures using
penalized empirical loss
minimization under computational constraints. While classical model
selection approaches do not consider computational aspects of
performing
model selection, we argue that any practical model selection
procedure must
not only trade off estimation and approximation error, but also the
effects
of computational effort required to compute empirical minimizers for
different function classes. We provide a framework for analyzing
such
problems, and we give algorithms for model selection under a
computational budget.},
pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v19/agarwal11a/agarwal11a.pdf}
}

@book{stzbpw-anips24-11,
editor = {John Shawe-Taylor and Richard Zemel and Peter L.~Bartlett and
Fernando Pereira and Kilian Weinberger},
title = {Advances in Neural Information
Processing Systems 24. Proceedings of the 2011 Conference},
url = {http://books.nips.cc/nips24.html},
publisher = {NIPS Foundation},
year = {2011},
month = dec,
conference-url = {http://nips.cc/Conferences/2011/}
}

@article{abrw-itlbocsco-12,
title = {Information-theoretic lower bounds on the oracle
complexity of stochastic convex optimization},
year = {2012},
month = may,
author = {Alekh Agarwal and Peter Bartlett and Pradeep Ravikumar and Martin
Wainwright},
journal = {IEEE Transactions on Information Theory},
abstract = {Relative to the large literature on upper bounds on
complexity of
convex optimization, lesser attention has been paid to the
fundamental hardness of these problems.  Given the extensive use of
convex optimization in machine learning and statistics, gaining an
understanding of these complexity-theoretic issues is important. In
this paper, we study the complexity of stochastic convex
optimization in an oracle model of computation. We improve upon
known results and obtain tight minimax complexity estimates for
various function classes.},
volume = {58},
number = {5},
pages = {3235--3249},
doi = {http://dx.doi.org/10.1109/TIT.2011.2182178},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrw-itlbocsco-12.pdf}
}

@article{dbw-rsso-12,
title = {Randomized smoothing for stochastic optimization},
author = {John Duchi and Peter L.~Bartlett and Martin J.~Wainwright},
year = {2012},
month = jun,
journal = {SIAM Journal on Optimization},
volume = {22},
number = {2},
pages = {674--701},
abstract = {We analyze convergence rates of stochastic optimization
algorithms for nonsmooth
convex optimization problems. By combining randomized smoothing
techniques with accelerated
gradient methods, we obtain convergence rates of stochastic
optimization procedures, both in expectation and with high probability,
that have optimal dependence on the variance of the gradient
estimates. To the best of our knowledge, these are the first
variance-based rates for nonsmooth
optimization. We give several applications of our results to
statistical estimation problems and provide experimental results
that demonstrate the effectiveness of the proposed algorithms. We also
describe how a combination of our algorithm with recent work on
decentralized optimization yields
a distributed stochastic optimization algorithm that is
order-optimal.}
}

@inproceedings{hb-ecosnmlbpjp-12,
title = {Exchangeability Characterizes Optimality of Sequential
Normalized Maximum Likelihood and {Bayesian} Prediction with {Jeffreys}
Prior},
author = {Fares Hedayati and Peter L.~Bartlett},
pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v22/hedayati12/hedayati12.pdf},
editor = {M.~Girolami and N.~Lawrence},
year = {2012},
month = apr,
seriestitle = {JMLR Workshop and Conference Proceedings},
booktitle = {Proceedings of the Fifteenth International Conference on Artificial Intelligence and Statistics
(AISTATS)},
volume = {22},
pages = {504--510},
abstract = {We study online prediction of individual sequences under
logarithmic
loss with parametric experts. The optimal strategy, normalized maximum
likelihood (NML), is computationally demanding and requires the length
of
the game to be known. We consider two simpler strategies:
sequential normalized maximum likelihood (SNML),
which computes the NML forecasts at each round as if it were the
last round, and Bayesian prediction.
Under appropriate conditions, both are known to achieve near-optimal
regret.  In this paper, we investigate when these strategies are
optimal.
We show that SNML is optimal iff the joint distribution on
sequences defined by SNML is exchangeable.
In the case of exponential families, this is equivalent to the
optimality of any Bayesian prediction strategy,
and the optimal prior is Jeffreys prior.}
}

@inproceedings{hb-ojpodeanmle-12,
title = {The Optimality of {J}effreys Prior for Online Density
Estimation and the Asymptotic Normality of Maximum
Likelihood Estimators},
author = {Fares Hedayati and Peter Bartlett},
pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v23/hedayati12/hedayati12.pdf},
seriestitle = {JMLR Workshop and Conference Proceedings},
booktitle = {Proceedings of the Conference on
Learning Theory (COLT2012)},
year = {2012},
month = jun,
volume = {23},
pages = {7.1--7.13},
abstract = {We study online learning under logarithmic loss with
regular parametric models. We show
that a Bayesian strategy predicts optimally only if it uses Jeffreys
prior. This result was
known for canonical exponential families; we extend it to parametric
models for which the
maximum likelihood estimator is asymptotically normal. The optimal
prediction strategy,
normalized maximum likelihood, depends on the number n of rounds of
the game, in
general. However, when a Bayesian strategy is optimal, normalized
maximum likelihood
becomes independent of n. Our proof uses this to exploit the
asymptotics of normalized
maximum likelihood. The asymptotic normality of the maximum likelihood
estimator is
responsible for the necessity of Jeffreys prior.}
}

@article{rbht-llfs-09,
title = {Learning in a large function space: Privacy preserving
mechanisms for {SVM} learning},
author = {Benjamin I.~P.~Rubinstein and Peter~L.~Bartlett
and Ling Huang and Nina Taft},
journal = {Journal of Privacy and Confidentiality},
volume = 4,
number = 1,
pages = {65--100},
year = {2012},
month = aug,
url = {http://arxiv.org/abs/0911.5708},
abstract = {}
}

@article{nabs-amlfdahu-12,
title = {Application of Machine Learning in the Fault Diagnostics of
Air Handling Units},
author = {Massieh Najafi and David M. Auslander and Peter L. Bartlett
and Philip Haves and Michael D.~Sohn},
journal = {Applied Energy},
doi = {http://dx.doi.org/10.1016/j.apenergy.2012.02.049},
year = {2012},
month = aug,
volume = 96,
pages = {347--358}
}

@article{bmn-lrlrpoi-09,
title = {$l_1$-regularized linear regression: Persistence and
oracle inequalities},
author = {Peter L.~Bartlett and Shahar Mendelson
and Joseph Neeman},
journal = {Probability Theory and Related Fields},
year = {2012},
month = oct,
volume = {154},
number = {1--2},
pages = {193--224},
abstract = {  We study the predictive performance of
$\ell_1$-regularized linear
regression in a model-free setting, including the case where the
number of covariates is substantially larger than the sample
size. We introduce a new analysis method that avoids the boundedness
problems that typically arise in model-free empirical minimization.
Our technique provides an answer to a conjecture of Greenshtein and
Ritov~\cite{GR} regarding the ``persistence'' rate for linear
regression and allows us to prove an oracle inequality for the error
of the regularized minimizer.
It also demonstrates that empirical risk minimization gives optimal
rates (up to log factors) of convex aggregation of a set of
estimators of a regression function.},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmn-lrlrpoi-09.pdf},
doi = {http://dx.doi.org/10.1007/s00440-011-0367-2}
}

@article{brsmsb-lbars-12,
title = {A learning-based approach to reactive security},
author = {A.~Barth and Benjamin I.~P.~Rubinstein
and M.~Sundararajan and J.~C.~Mitchell
and Dawn Song and Peter L.~Bartlett},
year = {2012},
month = jul,
journal = {IEEE Transactions on Dependable and Secure Computing},
url = {http://doi.ieeecomputersociety.org/10.1109/TDSC.2011.42},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/brsmsb-lbars-12.pdf},
volume = {9},
number = {4},
pages = {482--493},
abstract = {Despite the conventional wisdom that proactive security is
superior to reactive security, we show that reactive security can be
competitive with proactive security as long as the reactive defender
learns from past attacks instead of myopically overreacting to the
last attack.  Our game-theoretic model follows common practice in the
security literature by making worst-case assumptions about the
attacker: we grant the attacker complete knowledge of the defender's
strategy and do not require the attacker to act rationally. In this
model, we bound the competitive ratio between a reactive defense
algorithm (which is inspired by online learning theory) and the best
fixed proactive defense.  Additionally, we show that, unlike proactive
defenses, this reactive strategy is robust to a lack of information
about the attacker's incentives and knowledge.}
}

@inproceedings{dbw-rspsocdc-12,
author = {Duchi, John C. and Bartlett, Peter L. and Wainwright, Martin
J.},
title = {{Randomized Smoothing for (Parallel) Stochastic
Optimization}},
booktitle = {{2012 IEEE 51st Annual Conference on Decision and Control
(CDC)}},
series = {{IEEE Conference on Decision and Control}},
year = {2012},
pages = {{5442-5444}},
abstract = {{By combining randomized smoothing techniques with
methods, we obtain convergence rates for stochastic optimization
procedures, both in expectation and with high probability, that
have
optimal dependence on the variance of the gradient estimates. To
the
best of our knowledge, these are the first variance-based rates for
non-smooth optimization. A combination of our techniques with
recent
work on decentralized optimization yields order-optimal parallel
stochastic optimization algorithms. We give applications of our
results
to several statistical machine learning problems, providing
experimental
results (in the full version of the paper) demonstrating the
effectiveness of our algorithms.}},
publisher = {{IEEE}},
address = {{345 E 47th St, New York, NY 10017 USA}},
type = {{Proceedings Paper}},
issn = {{0191-2216}},
isbn = {{978-1-4673-2066-5}}
}

@book{bpbbw-anips25-12,
editor = {Peter L.~Bartlett and Fernando Pereira and Chris J.~C.~Burges and L\'eon Bottou and Kilian Q.~Weinberger},
title = {Advances in Neural Information
Processing Systems 25. Proceedings of the 2012 Conference},
url = {http://books.nips.cc/nips25.html},
publisher = {NIPS Foundation},
year = {2012},
month = dec
}

@inproceedings{bghhk-hiopllief-13,
author = {Peter L.~Bartlett and Peter Gr{\"u}nwald and Peter Harremo{\"e}s and
Fares Hedayati and Wojciech Kot{\l}owski},
title = {Horizon-Independent Optimal Prediction with Log-Loss in Exponential
Families},
seriestitle = {JMLR Workshop and Conference Proceedings},
booktitle = {Proceedings of the Conference on Learning Theory (COLT2013)},
year = {2013},
volume = {30},
pages = {639--661},
pdf = {http://www.jmlr.org/proceedings/papers/v30/Bartlett13.pdf},
abstract = {We study online learning under logarithmic loss with
regular parametric models. Hedayati and Bartlett (2012) showed that a
Bayesian prediction strategy with Jeffreys prior and sequential
normalized maximum likelihood (SNML) coincide and are optimal if and
only if the latter is exchangeable, which occurs if and only if the
optimal strategy can be calculated without knowing the time horizon in
advance. They put forward the question what families have exchangeable
SNML strategies. We answer this question for one-dimensional
exponential families: SNML is exchangeable only for three classes of
natural exponential family distributions, namely the Gaussian, the
gamma, and the Tweedie exponential family of order 3/2.}
}

@inproceedings{scb-opambla-13,
author = {Yevgeny Seldin and Koby Crammer and Peter L Bartlett},
title = {Open Problem: Adversarial Multiarmed Bandits with Limited
Advice},
seriestitle = {JMLR Workshop and Conference Proceedings},
booktitle = {Proceedings of the Conference on Learning Theory (COLT2013)},
year = {2013},
volume = {30},
pages = {1067--1072},
pdf = {http://www.jmlr.org/proceedings/papers/v30/Seldin13.pdf}
}

@inproceedings{abfw-hhoaa-13,
author = {Jacob  Abernethy and Peter L.~Bartlett and Rafael Frongillo
and Andre Wibisono},
title = {How to Hedge an Option Against an Adversary: {B}lack-{S}choles
Pricing is Minimax Optimal},
year = {2013},
abstract = {We consider a popular problem in finance, option pricing,
through the lens of an online learning game between Nature and an
Investor. In the Black-Scholes option pricing model from 1973, the
Investor can continuously hedge the risk of an option by trading the
underlying asset, assuming that the asset's price fluctuates according
to Geometric Brownian Motion (GBM). We consider a worst-case model, in
which Nature chooses a sequence of price fluctuations under a
cumulative quadratic volatility constraint, and the Investor can make
a sequence of hedging decisions. Our main result is to show that the
value of our proposed game, which is the ``regret'' of hedging
strategy, converges to the Black-Scholes option price. We use
significantly weaker assumptions than previous work---for instance, we
allow large jumps in the asset price---and show that the Black-Scholes
hedging strategy is near-optimal for the Investor even in this
non-stochastic framework.},
booktitle = {Advances in Neural Information Processing Systems 26},
pages = {2346--2354},
}

@inproceedings{abkss-olmdpactpd-13,
author = {Yasin Abbasi-Yadkori and Peter L.~Bartlett and
Varun Kanade and Yevgeny Seldin and Csaba Szepesvari},
title = {Online Learning in {M}arkov Decision Processes with
Adversarially Chosen Transition Probability Distributions},
year = {2013},
abstract = {We study the problem of online learning Markov Decision
Processes (MDPs) when both the transition distributions and loss functions
are chosen by an adversary. We present an algorithm that, under a
mixing assumption, achieves $O(\sqrt{T\log|\Pi|}+\log|\Pi|)$ regret
with respect to a comparison set of policies $\Pi$. The regret
is independent of the size of the state and action spaces. When
expectations over sample paths can be computed efficiently and
the comparison set $\Pi$ has polynomial size,
this algorithm is efficient.
We also consider the episodic adversarial online shortest path
problem. Here, in each episode an adversary may choose a weighted
directed acyclic graph with an identified start and finish node. The
goal of the learning algorithm is to choose a path that minimizes
the loss while traversing from the start to finish node. At the end
of each episode the loss function (given by weights on the edges)
is revealed to the learning algorithm. The goal is to minimize regret
with respect to a fixed policy for selecting paths. This problem is
a special case of the online MDP problem. For randomly chosen graphs
and adversarial losses, this problem can be efficiently solved. We
show that it also can be efficiently solved for adversarial
graphs and randomly chosen losses. When both graphs and losses
are adversarially chosen, we present an efficient algorithm whose
regret scales linearly with the number of distinct graphs.
Finally, we show that designing efficient algorithms for the
adversarial online shortest path problem (and hence for the
adversarial MDP problem) is as hard as learning parity with noise, a
notoriously difficult problem that has been used to design efficient
cryptographic schemes.},
booktitle = {Advances in Neural Information Processing Systems 26},
pages = {2508--2516}
}

@incollection{tb-lt-12,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {Learning Theory},
booktitle = {Signal Processing Theory and Machine Learning},
series = {Academic Press Library in Signal Processing},
volume = {1},
editor = {Paulo S.R. Diniz and Johan A.K. Suykens and
Rama Chellappa and Sergios Theodoridis},
publisher = {Elsevier},
year = {2014},
pages = {775--816}
}

@inproceedings{sbca-plambpo-14,
author = {Yevgeny Seldin and Peter L.~Bartlett and Koby Crammer and
Yasin Abbasi-Yadkori},
title = {Prediction with Limited Advice and Multiarmed Bandits with
Paid Observations},
year = {2014},
abstract = {We study two basic questions in online learning. The first
question is what happens between full-information and limited-feedback
games and the second question is the cost of information acquisition
in online learning. The questions are addressed by defining two
variations of standard online learning games. In the first variation,
\emph{prediction with limited advice}, we consider a game of
prediction with expert advice, where on each round of the game we
query the advice of a subset of $M$ out of $N$ experts. We present an
algorithm that achieves $O(\sqrt{(N/M)T\ln N})$ regret on $T$ rounds
of this game. The second variation, the \emph{multiarmed bandit with
paid observations}, is a variant of the adversarial $N$-armed bandit
game, where on round $t$ of the game, we can observe the reward of any
number of arms, but each observation has a cost $c$. We present an
algorithm that achieves $O((c N \ln N)^{1/3} T^{2/3})$ regret on $T$
rounds of this game. We present lower bounds that show that, apart
from the logarithmic factors, these regret bounds cannot be improved.},
booktitle = {Proceedings of the 31st International Conference on
Machine Learning (ICML-14)},
pages = {280--287},
url = {http://jmlr.org/proceedings/papers/v32/seldin14.html},
pdf = {http://jmlr.org/proceedings/papers/v32/seldin14.pdf}
}

@inproceedings{abk-tat-14,
author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Varun Kanade},
title = {Tracking Adversarial Targets},
year = {2014},
abstract = {We study linear quadratic control problems with
adversarially chosen tracking targets. We present an efficient
algorithm for this problem and show
that, under a stability condition, its regret grows as the logarithm
of the number of rounds of the game. We also study a problem with
adversarially chosen transition dynamics, for which an
exponentially-weighted average algorithm is proposed and analyzed.},
booktitle = {Proceedings of the 31st International Conference on
Machine Learning (ICML-14)},
pages = {369--377}
}

@techreport{rrb-bevcmc-14,
author = {J. Hyam Rubinstein and Benjamin Rubinstein and Peter Bartlett},
title = {Bounding Embeddings of {VC} Classes into Maximum Classes},
institution = {arXiv.org},
year = {2014},
number = {1401.7388},
url = {http://arxiv.org/abs/1401.7388},
abstract = {One of the earliest conjectures in computational learning
theory---the Sample Compression Conjecture---asserts that concept
classes (or set systems) admit compression schemes of size polynomial
in their VC dimension. To-date this statement is known to be true for
maximum classes---those that meet Sauer's Lemma, which bounds class
cardinality in terms of VC dimension, with equality. The most
promising approach to positively resolving the conjecture is by
embedding general VC classes into maximum classes without super-linear
increase to their VC dimensions, as such embeddings extend the known
compression schemes to all VC classes. We show that maximum classes
can be characterized by a local-connectivity property of the graph
obtained by viewing the class as a cubical complex. This geometric
characterization of maximum VC classes is applied to prove a negative
embedding result which demonstrates VC-$d$ classes that cannot be
embedded in any maximum class of VC dimension lower than $2d$. On the
other hand, we give a general recursive procedure for embedding VC-$d$
classes into VC-$(d+k)$ maximum classes for smallest $k$.}
}

@incollection{rrb-bevcmc-14a,
author = {J. Hyam Rubinstein and Benjamin Rubinstein and Peter Bartlett},
title = {Bounding Embeddings of {VC} Classes into Maximum Classes},
year = {2014},
booktitle = {Festschrift of Alexey Chervonenkis},
editor = {A. Gammerman and V. Vovk},
publisher = {Springer},
city = {Berlin},
abstract = {One of the earliest conjectures in computational learning
theory---the Sample Compression Conjecture---asserts that concept
classes (or set systems) admit compression schemes of size polynomial
in their VC dimension. To-date this statement is known to be true for
maximum classes---those that meet Sauer's Lemma, which bounds class
cardinality in terms of VC dimension, with equality. The most
promising approach to positively resolving the conjecture is by
embedding general VC classes into maximum classes without super-linear
increase to their VC dimensions, as such embeddings extend the known
compression schemes to all VC classes. We show that maximum classes
can be characterized by a local-connectivity property of the graph
obtained by viewing the class as a cubical complex. This geometric
characterization of maximum VC classes is applied to prove a negative
embedding result which demonstrates VC-$d$ classes that cannot be
embedded in any maximum class of VC dimension lower than $2d$. On the
other hand, we give a general recursive procedure for embedding VC-$d$
classes into VC-$(d+k)$ maximum classes for smallest $k$.},
url = {http://arxiv.org/abs/1401.7388}
}

@techreport{abm-lplsmdp-14,
author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Alan Malek},
title = {Linear programming for large-scale {M}arkov decision problems},
year = {2014},
abstract = {We consider the problem of controlling a Markov decision
process (MDP) with a large state space, so as to minimize average
cost. Since it is intractable to compete with the optimal policy for
large scale problems, we pursue the more modest goal of competing with
a low-dimensional family of policies. We use the dual linear
programming formulation of the MDP average cost problem, in which the
variable is a stationary distribution over state-action pairs, and we
consider a neighborhood of a low-dimensional subset of the set of
stationary distributions (defined in terms of state-action features)
as the comparison class. We propose two techniques, one based on
stochastic convex optimization, and one based on constraint sampling.
In both cases, we give bounds that show that the performance of our
algorithms approaches the best achievable by any policy in the
comparison class. Most importantly, these results depend on the size
of the comparison class, but not on the size of the state space.
Preliminary experiments show the effectiveness of the proposed
algorithms in a queuing application.},
institution = {arXiv.org},
number = {1402.6763},
url = {http://arxiv.org/abs/1402.6763}
}

@inproceedings{abm-lplsmdp-14a,
author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Alan Malek},
title = {Linear programming for large-scale {M}arkov decision problems},
year = {2014},
booktitle = {Proceedings of the 31st International Conference on
Machine Learning (ICML-14)},
pages = {496--504},
abstract = {We consider the problem of controlling a Markov decision
process (MDP) with a large state space, so as to minimize average
cost. Since it is intractable to compete with the optimal policy for
large scale problems, we pursue the more modest goal of competing with
a low-dimensional family of policies. We use the dual linear
programming formulation of the MDP average cost problem, in which the
variable is a stationary distribution over state-action pairs, and we
consider a neighborhood of a low-dimensional subset of the set of
stationary distributions (defined in terms of state-action features)
as the comparison class. We propose two techniques, one based on
stochastic convex optimization, and one based on constraint sampling.
In both cases, we give bounds that show that the performance of our
algorithms approaches the best achievable by any policy in the
comparison class. Most importantly, these results depend on the size
of the comparison class, but not on the size of the state space.
Preliminary experiments show the effectiveness of the proposed
algorithms in a queuing application.},
url = {http://jmlr.org/proceedings/papers/v32/malek14.html},
pdf = {http://jmlr.org/proceedings/papers/v32/malek14.pdf}
}

@inproceedings{kmb-emsslg-14,
title = {Efficient Minimax Strategies for Square Loss Games},
author = {Wouter M Koolen and Alan Malek and Peter L Bartlett},
booktitle = {Advances in Neural Information Processing Systems 27},
editor = {Z. Ghahramani and M. Welling and C. Cortes and N.D. Lawrence
and K.Q. Weinberger},
pages = {3230--3238},
year = {2014},
publisher = {Curran Associates, Inc.},
url = {http://papers.nips.cc/paper/5243-efficient-minimax-strategies-for-square-loss-games.pdf},
abstract = {We consider online prediction problems where the loss
between the prediction and the outcome is measured by the squared
Euclidean distance and its generalization, the squared Mahalanobis
distance. We derive the minimax solutions for the case where the
prediction and action spaces are the simplex (this setup is sometimes
called the Brier game) and the $\ell_2$ ball (this setup is related to
Gaussian density estimation). We show that in both cases the value of
each sub-game is a quadratic function of a simple statistic of the
state, with coefficients that can be efficiently computed using an
explicit recurrence relation. The resulting deterministic minimax
strategy and randomized maximin strategy are linear functions of the
statistic.}
}

@inproceedings{kthbjt-lmcpm-14,
title = {Large-Margin Convex Polytope Machine},
author = {Alex Kantchelian and Michael C Tschantz and Ling Huang
and Peter L Bartlett and Anthony D Joseph and J. Doug Tygar},
booktitle = {Advances in Neural Information Processing Systems 27},
editor = {Z. Ghahramani and M. Welling and C. Cortes and N.D. Lawrence
and K.Q. Weinberger},
pages = {3248--3256},
year = {2014},
publisher = {Curran Associates, Inc.},
url = {http://papers.nips.cc/paper/5511-large-margin-convex-polytope-machine.pdf},
abstract = {We present the Convex Polytope Machine (CPM), a novel
non-linear learning algorithm for large-scale binary classification
tasks. The CPM finds a large margin convex polytope separator which
encloses one class. We develop a stochastic gradient descent based
algorithm that is amenable to massive datasets, and augment it with a
heuristic procedure to avoid sub-optimal local minima. Our
experimental evaluations of the CPM on large-scale datasets from
distinct domains (MNIST handwritten digit recognition, text topic, and
web security) demonstrate that the CPM trains models faster, sometimes
several orders of magnitude, than state-of-the-art similar approaches
and kernel-SVM methods while achieving comparable or better
classification performance. Our empirical results suggest that, unlike
prior similar approaches, we do not need to control the number of
sub-classifiers (sides of the polytope) to avoid overfitting.}
}

@inproceedings{abcm-lsmdpklcc-15,
author = {Yasin Abbasi-Yadkori and Peter L Bartlett and Xi Chen and Alan Malek},
title = {Large-Scale {M}arkov Decision Problems with {KL} Control Cost},
year = {2015},
month = {June},
booktitle = {Proceedings of the 32nd International Conference on
Machine Learning (ICML-15)},
volume = {37},
pages = {1053--1062},
abstract = {}
}

@inproceedings{bkmtw-mfdlr-15,
author = {Peter L.~Bartlett and Wouter Koolen and Alan
Malek and Eiji Takimoto and Manfred Warmuth},
title = {Minimax Fixed-Design Linear Regression},
seriestitle = {JMLR Workshop and Conference Proceedings},
booktitle = {Proceedings of the Conference on Learning Theory (COLT2015)},
year = {2015},
volume = {40},
pages = {226--239},
month = {June},
url = {http://jmlr.org/proceedings/papers/v40/Bartlett15.pdf},
pdf = {http://jmlr.org/proceedings/papers/v40/Bartlett15.pdf},
abstract = {We consider a linear regression game in which the covariates are
known in advance: at each round, the learner predicts a real-value,
the adversary reveals a label, and the learner incurs a squared
error loss. The aim is to minimize the regret with respect to linear
predictions.  For a variety of constraints on the adversary's
labels, we show that the minimax optimal strategy is linear,
with a parameter choice that is reminiscent of ordinary least squares
(and as easy to compute).  The predictions depend on all covariates,
past and future, with a particular weighting assigned to future
covariates corresponding to the role that they play in the minimax
regret.  We study two families of label sequences: box constraints
(under a covariate compatibility condition), and a weighted 2-norm
constraint that emerges naturally from the analysis.  The strategy
is adaptive in the sense that it requires no knowledge of the
constraint set.  We obtain an explicit expression for the minimax
regret for these games.  For the case of uniform box constraints,
we show that, with worst case covariate sequences, the regret is
$O(d\log T)$, with no dependence on the scaling of the covariates.}
}

@inproceedings{aykmb-mtsp-15,
author = {Wouter Koolen and Alan Malek and Peter L. Bartlett and
Yasin Abbasi-Yadkori},
title = {Minimax time series prediction},
year = {2015},
booktitle = {Advances in Neural Information Processing Systems 28},
editor = {C. Cortes and N.D. Lawrence and D.D. Lee and M. Sugiyama and
R. Garnett and R. Garnett},
pages = {2548--2556},
publisher = {Curran Associates, Inc.},
url = {http://papers.nips.cc/paper/5730-minimax-time-series-prediction.pdf},
abstract = {We consider an adversarial formulation of
the problem of predicting a time series with square loss.
The aim is to predict an arbitrary sequence of vectors almost as well
as the best smooth comparator sequence in retrospect.
Our approach allows natural measures of smoothness, such as the
squared norm of increments. More generally, we can consider a linear
time series model and penalize the comparator sequence through
the energy of the implied driving noise terms.
We derive the minimax strategy for all problems of this type, and
we show that it can be implemented efficiently.
The optimal predictions are linear in the previous observations.
We obtain an explicit expression for the regret in terms of
the parameters defining the problem.
For typical, simple definitions of smoothness, the computation of the
optimal predictions involves only sparse matrices.
In the case of norm-constrained data, where the smoothness is defined
in terms of the squared norm of the comparator's increments, we show
that the regret grows as $T/\sqrt\lambda$, where $T$ is the length
of the game and $\lambda$ specifies the smoothness of the comparator.}
}

@techreport{aykmb-mtsp-15b,
author = {Yasin Abbasi-Yadkori and Wouter Koolen and Alan Malek and
Peter L. Bartlett},
title = {Minimax time series prediction},
year = {2015},
institution = {EECS Department,
University of California, Berkeley}
}

@techreport{kbb-amdcdt-15a,
author = {Walid Krichene and Alexandre Bayen and Peter L. Bartlett},
title = {Accelerating Mirror Descent in Continuous and Discrete time},
year = {2015},
institution = {EECS Department, University of California, Berkeley}
}

@inproceedings{kbb-amdcdt-15,
author = {Walid Krichene and Alexandre Bayen and Peter L. Bartlett},
title = {Accelerating Mirror Descent in Continuous and Discrete time},
booktitle = {Advances in Neural Information Processing Systems 28},
year = {2015},
editor = {C. Cortes and N.D. Lawrence and D.D. Lee and M. Sugiyama and
R. Garnett and R. Garnett},
pages = {2827--2835},
publisher = {Curran Associates, Inc.},
url = {http://papers.nips.cc/paper/5843-accelerated-mirror-descent-in-continuous-and-discrete-time.pdf},
abstract = {We study accelerated mirror descent dynamics in continuous
and discrete time. Combining the original continuous-time motivation of
mirror descent with a recent ODE interpretation of Nesterov's
accelerated method, we propose a family of continuous-time descent
dynamics for convex functions with Lipschitz gradients, such that the
solution trajectories are guaranteed to converge to the optimum at a
$\Ocal(1/t^2)$ rate. We then show that a large family of first-order
accelerated methods can be obtained as a discretization of the ODE,
and these methods converge at a $\Ocal(1/k^2)$ rate. This connection
between accelerated mirror descent and the ODE provides an intuitive
approach to the design and analysis of accelerated first-order
algorithms.}
}

@techreport{aybw-lramdp-15,
author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Stephen
Wright},
title = {A {L}agrangian Relaxation Approach to {M}arkov
Decision Problems},
institution = {UC Berkeley EECS},
year = {2015},
abstract = {We study Markov decision problems (MDPs) over a restricted
policy class, and show that a Lagrangian relaxation approach finds
near-optimal policies in this class efficiently.
In particular, the computational complexity depends on the number
of features used to define policies, and not on the size of the
state space. The statistical complexity also scales well: our method
requires only low-dimensional second order statistics.
Most value-function-based methods for MDPs return a policy that is
greedy with respect to the value function estimate. We discuss
drawbacks of this approach, and propose a new policy class defined for
some parameter vector $w$ by $\pi_w(a|x) = ( 1-Q_w(x,a) + \E_{\nu(.|x)} Q_w ) \nu(a|x)$,
where $Q_w$ is the state-action value function,
$\nu$ is a baseline policy, and the mean of $Q_w$ under $\nu(.|x)$
% (denoted by $\E_{\nu(.|x)} Q_w$)
acts as a normalizer. Similar to the
greedy and Gibbs policies, the proposed policy assigns larger
probabilities to actions with smaller value-function estimates.  We
demonstrate the effectiveness of our Lagrangian relaxation approach,
applied to this policy class, on a queueing problem and an energy
storage application.}
}

@techreport{b-op-16,
author = {Peter L. Bartlett},
title = {Online prediction},
year = {2015},
abstract = {We review game-theoretic models of prediction, in which
the process generating the data is modelled as an adversary with whom
the prediction method competes. We present a formulation that
encompasses a wide variety of decision problems, and focus on the
relationship between prediction in this game-theoretic setting and
prediction in the more standard probabilistic setting. In particular,
we present a view of standard prediction strategies as Bayesian
decision methods, and we show how the regret of optimal strategies
depends on complexity measures that are closely related to those that
appear in probabilistic settings.},
institution = {UC Berkeley EECS},
notetoomit = {Submitted to Annales de l'Institut Henri Poincar\'{e}},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-op-16.pdf}
}

@inproceedings{aybw-frpia-16,
author = {Yasin Abbasi-Yadkori and Peter L. Bartlett and Stephen
Wright},
title = {A Fast and Reliable Policy Improvement Algorithm},
booktitle = {Proceedings of AISTATS 2016},
pages = {1338--1346},
year = {2016},
abstract = {We introduce a simple, efficient method that improves
stochastic policies for {M}arkov decision processes. The computational
complexity is the same as that of the value estimation problem. We
prove that when the value estimation error is small, this method gives
an improvement in performance that increases with certain variance
properties of the initial policy and transition dynamics. Performance
in numerical experiments compares favorably with previous policy
improvement algorithms.},
}

@inproceedings{glgob-ilccpeb-16,
author = {Victor Gabillon and Alessandro Lazaric and Mohammad
Ghavamzadeh and Ronald Ortner and Peter L.~Bartlett},
title = {Improved Learning Complexity in Combinatorial Pure
Exploration Bandits},
booktitle = {Proceedings of AISTATS 2016},
pages = {1004--1012},
year = {2016},
abstract = {We study the problem of combinatorial pure exploration in
the stochastic multi-armed bandit problem. We first construct a new
measure of complexity that provably characterizes the learning
performance of the algorithms we propose for the fixed confidence and
the fixed budget setting. We show that this complexity is never higher
than the one in existing work and illustrate a number of
configurations in which it can be significantly smaller. While in
general this improvement comes at the cost of increased computational
complexity, we provide a series of examples, including a planning
problem, where this extra cost is not significant.},
url = {http://jmlr.org/proceedings/papers/v51/gabillon16.html},
pdf = {http://jmlr.org/proceedings/papers/v51/gabillon16.pdf}
}

@inproceedings{kbb-aaadd-16,
author = {Walid Krichene and Alexandre Bayen and Peter L. Bartlett},
title = {Adaptive Averaging in Accelerated Descent Dynamics},
booktitle = {Advances in Neural Information Processing Systems 29},
year = {2016},
pages = {2991--2999},
abstract = {We study accelerated descent dynamics for constrained
convex optimization. This dynamics can be described naturally as a
coupling of a dual variable accumulating gradients at a given rate
$\eta(t)$, and a primal variable obtained as the weighted average
of the mirrored dual trajectory, with weights $w(t)$. Using a Lyapunov
argument, we give sufficient conditions on $\eta$ and $w$ to achieve a
desired convergence rate. As an example, we show that the replicator
dynamics (an example of mirror descent on the simplex) can be
accelerated using a simple averaging scheme.  We then propose an
adaptive averaging heuristic that adaptively computes the weights to
speed up the decrease of the Lyapunov function. We provide guarantees
on adaptive averaging in continuous-time, prove that it preserves the
quadratic convergence rate of accelerated first-order methods in
discrete-time, and give numerical experiments to compare it with
existing heuristics, such as adaptive restarting. The experiments
indicate that adaptive averaging performs at least as well as adaptive
restarting, with significant improvements in some cases.},
}

@article{hb-ecosnmlbp-17,
author = {Fares Hedayati and Peter L. Bartlett},
title = {Exchangeability Characterizes Optimality of Sequential
Normalized Maximum Likelihood and {B}ayesian Prediction},
journal = {IEEE Transactions on Information Theory},
month = {October},
volume = {63},
number = {10},
pages = {6767--6773},
year = {2017},
abstract = {We study online learning under logarithmic loss with
regular parametric models. In this setting, each strategy corresponds
to a joint distribution on sequences. The minimax optimal strategy is the
normalized maximum likelihood (NML) strategy. We show
that the sequential normalized maximum likelihood (SNML) strategy
predicts minimax optimally (i.e. as NML) if and only if the
joint distribution on sequences defined by SNML is exchangeable. This
property also characterizes the optimality of a Bayesian
prediction strategy. In that case, the optimal prior distribution is
Jeffreys prior for a broad class of parametric models for which
the maximum likelihood estimator is asymptotically normal. The optimal
prediction strategy, normalized maximum likelihood,
depends on the number n of rounds of the game, in general. However,
when a Bayesian strategy is optimal, normalized maximum
likelihood becomes independent of n. Our proof uses this to exploit
the asymptotics of normalized maximum likelihood. The
asymptotic normality of the maximum likelihood estimator is
responsible for the necessity of Jeffreys prior.},
doi = {10.1109/TIT.2017.2735799},
url = {http://www.stat.berkeley.edu/~bartlett/papers/hb-ecosnmlbp-17.pdf},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/hb-ecosnmlbp-17.pdf}
}

@inproceedings{aymbg-hnr-17,
author = {Yasin Abbasi-Yadkori and Alan Malek and Peter L.~Bartlett
and Victor Gabillon},
title = {Hit-and-Run for Sampling and Planning in Non-Convex Spaces},
year = {2017},
abstract = {We propose the Hit-and-Run algorithm for planning and sampling
problems in
non-convex spaces. For sampling, we show the first analysis of the
Hit-and-Run algorithm in non-convex spaces and show that it mixes fast as
long as certain smoothness conditions are satisfied. In particular,
our analysis reveals an intriguing connection between fast mixing and
the existence of smooth measure-preserving mappings from a convex
space to the non-convex space. For planning, we show advantages of
Hit-and-Run compared to state-of-the-art planning methods such as
Rapidly-Exploring Random Trees.},
booktitle = {Proceedings of the 20th International Conference on
Artificial Intelligence and Statistics},
pages = {888--895},
editor = {Aarti Singh and Jerry Zhu},
volume = {54},
series = {Proceedings of Machine Learning Research},
address = {Fort Lauderdale, FL, USA},
}

@inproceedings{zsjbd-rgohlnn-17,
author = {Kai Zhong and Zhao Song and Prateek Jain
and Peter L.~Bartlett and Inderjit S. Dhillon},
title = {Recovery Guarantees for One-hidden-layer Neural Networks},
booktitle = {Proceedings of the 34th International Conference on
Machine Learning (ICML-17)},
pages = {4140--4149},
year = {2017},
editor = {Doina Precup and Yee Whye Teh},
volume = {70},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v70/zhong17a/zhong17a.pdf},
url = {http://proceedings.mlr.press/v70/zhong17a.html},
abstract = {In this paper, we consider regression problems with
one-hidden-layer neural networks (1NNs). We distill some properties of
activation functions that lead to   \emph{local strong convexity} in
the neighborhood of the ground-truth parameters for the 1NN
squared-loss objective and most popular nonlinear activation functions
satisfy the distilled properties, including rectified linear units
(ReLUs), leaky ReLUs, squared ReLUs and sigmoids. For activation
functions that are also smooth, we show \emph{local linear
convergence} guarantees of gradient descent under a resampling rule.
For homogeneous activations, we show tensor methods are able to
initialize the parameters to fall into the local strong convexity
region. As a result, tensor initialization followed by gradient
descent is guaranteed to recover the ground truth with sample
complexity $d \cdot \log(1/\epsilon) \cdot \mathrm{poly}(k,\lambda )$
and computational complexity $n\cdot d \cdot \mathrm{poly}(k,\lambda)$ for smooth  homogeneous activations with high probability, where $d$
is the dimension of the input, $k$ ($k\leq d$) is the number of hidden
nodes, $\lambda$ is a conditioning  property of the ground-truth
parameter matrix between the input layer and the hidden layer,
$\epsilon$ is the targeted precision and $n$ is the number of samples.
To the best of our knowledge, this is the first work that provides
recovery guarantees for 1NNs with both sample complexity and
computational complexity \emph{linear} in the input dimension and
\emph{logarithmic} in the precision.}
}

@inproceedings{pbbc-ftsmfamp-17,
author = {Martin P{\'e}ron and Kai Helge Becker and Peter L.~Bartlett
and Iadine Chad{\`e}s},
title = {Fast-Tracking Stationary {MOMDPs} for Adaptive Management
Problems},
booktitle = {Proceedings of the Thirty-First AAAI Conference on Artificial
Intelligence (AAAI-17)},
pages = {4531--4537},
year = {2017},
abstract = {Adaptive management is applied in conservation and
natural resource management, and consists of making sequential
decisions when the transition matrix is uncertain. Informally
described as `learning by doing', this approach aims to trade off
between decisions that help achieve the objective and decisions that
will yield a better knowledge of the true transition matrix. When
the true transition matrix is assumed to be an element of a finite
set of possible matrices, solving a mixed observability Markov
decision process (MOMDP) provides the optimal solution but is
computationally demanding.  Under the assumption (common in adaptive
management) that the true transition matrix is stationary, we
propose a polynomial-time algorithm to find a lower bound of the
value function. In the corners of the domain of the value function
(belief space), this lower bound is provably equal to the optimal
value function. We also show that under further assumptions, it is a
linear approximation of the optimal value function in a neighborhood
around the corners. We evaluate the benefits of our approach by
using it to initialize the solvers MO-SARSOP and Perseus on a novel
computational sustainability problem and a recent adaptive
management data challenge.  Our approach leads to an improved
initial value function and translates into significant computational
gains for both solvers.},
pdf = {https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14552/14063},
url = {https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14552}
}

@inproceedings{gayb-moa3epp-17,
author = {Yasin Abbasi-Yadkori and Peter L.~Bartlett and Victor Gabillon},
title = {Near Minimax Optimal Players for the Finite-Time 3-Expert
Prediction Problem},
booktitle = {Advances in Neural Information Processing Systems 30},
editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett},
pages = {3033--3042},
year = {2017},
publisher = {Curran Associates, Inc.},
url = {http://papers.nips.cc/paper/6896-near-minimax-optimal-players-for-the-finite-time-3-expert-prediction-problem.pdf},
pdf = {http://papers.nips.cc/paper/6896-near-minimax-optimal-players-for-the-finite-time-3-expert-prediction-problem.pdf},
abstract = {We study minimax strategies for the online prediction
problem with expert advice. It has been conjectured that a simple
adversary strategy, called Comb, is optimal in this game for any number of
experts including the non asymptotic case where the number of experts
is small. We make progress in this direction by showing that Comb is
minimax optimal within an additive logarithmic error in the finite
time three expert problems.}
}

@inproceedings{snmbnn-manng-17,
author = {Peter Bartlett and Dylan Foster and Matus Telgarsky},
title = {Spectrally-normalized margin bounds for neural networks},
booktitle = {Advances in Neural Information Processing Systems 30},
editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett},
pages = {6240--6249},
year = {2017},
publisher = {Curran Associates, Inc.},
abstract = {This paper presents a margin-based multiclass
generalization bound for neural networks that scales with their
margin-normalized spectral complexity: their Lipschitz constant,
meaning the product of the spectral norms of the weight matrices,
times a certain correction factor. This bound is empirically
investigated for a standard AlexNet network trained with SGD on the
mnist and cifar10 datasets, with both original and random labels; the
bound, the Lipschitz constants, and the excess risks are all in direct
correlation, suggesting both that SGD selects predictors whose
complexity scales with the difficulty of the learning task, and
secondly that the presented bound is sensitive to this complexity.},
pdf = {http://papers.nips.cc/paper/7204-spectrally-normalized-margin-bounds-for-neural-networks.pdf},
url = {http://papers.nips.cc/paper/7204-spectrally-normalized-margin-bounds-for-neural-networks.pdf}
}

@techreport{bhlm-ntvcdpdbfplnn-17,
author = {Peter L. Bartlett and Nick Harvey and Chris Liaw and Abbas
Mehrabian},
title = {Nearly-tight {VC}-dimension and pseudodimension bounds for
piecewise linear neural networks},
year = 2017,
institution = {arXiv.org},
number = {1703.02930},
abstract = {We prove new upper and lower bounds on the VC-dimension of
deep neural networks with the ReLU activation function. These bounds
are tight for almost the entire range of parameters. Letting $W$ be
the number of weights and $L$ be the number of layers, we prove that
the VC-dimension is $O(W L\log(W))$, and provide examples with
VC-dimension $\Omega(W L\log(W/L))$. This improves both the previously
known upper bounds and lower bounds. In terms of the number $U$ of
non-linear units, we prove a tight bound $\Theta(W U)$ on the
VC-dimension. All of these bounds generalize to arbitrary piecewise
linear activation functions, and also hold for the pseudodimensions of
these function classes.  Combined with previous results, this gives an
intriguing range of dependencies of the VC-dimension on depth for
networks with different non-linearities: there is no dependence for
piecewise-constant, linear dependence for piecewise-linear, and no
more than quadratic dependence for general piecewise-polynomial.},
url = {https://arxiv.org/abs/1703.02930},
pdf = {https://arxiv.org/pdf/1703.02930.pdf}
}

@inproceedings{kb-aasdd-17,
author = {Walid Krichene and Peter Bartlett},
title = {Acceleration and Averaging in Stochastic Descent Dynamics},
booktitle = {Advances in Neural Information Processing Systems 30},
editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett},
pages = {6796--6806},
year = {2017},
publisher = {Curran Associates, Inc.},
abstract = {We formulate and study a general family of
(continuous-time) stochastic dynamics for accelerated first-order
minimization of smooth convex functions.  Building on an averaging
formulation of accelerated mirror descent, we propose a stochastic
variant in which the gradient is contaminated by noise, and study the
resulting stochastic differential equation. We prove a bound on the
rate of change of an energy function associated with the problem, then
use it to derive estimates of convergence rates of the function values
(almost surely and in expectation), both for persistent and
asymptotically vanishing noise. We discuss the interaction between the
parameters of the dynamics (learning rate and averaging rates) and the
covariation of the noise process. In particular, we show how the
asymptotic rate of covariation affects the choice of parameters and,
ultimately, the convergence rate.},
pdf = {http://papers.nips.cc/paper/7256-acceleration-and-averaging-in-stochastic-descent-dynamics.pdf},
url = {http://papers.nips.cc/paper/7256-acceleration-and-averaging-in-stochastic-descent-dynamics.pdf}
}

@inproceedings{cb-amdlri-17,
author = {Niladri Chatterji and Peter Bartlett},
title = {Alternating minimization for dictionary learning with random
initialization},
booktitle = {Advances in Neural Information Processing Systems 30},
editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett},
pages = {1997--2006},
year = {2017},
publisher = {Curran Associates, Inc.},
pdf = {http://papers.nips.cc/paper/6795-alternating-minimization-for-dictionary-learning-with-random-initialization.pdf},
url = {http://papers.nips.cc/paper/6795-alternating-minimization-for-dictionary-learning-with-random-initialization.pdf}
}

@techreport{bel-rsfcnifidno-17,
author = {Peter L. Bartlett and Steven Evans and Philip M.~Long},
title = {Representing smooth functions as compositions of
near-identity functions with implications for deep network
optimization},
year = {2018},
institution = {arXiv.org},
number = {1804.05012},
abstract = {We show that any smooth bi-Lipschitz $h$ can be
represented exactly as a composition $h_m\circ\cdots\circ h_1$
of functions $h_1,\ldots,h_m$ that are close to the identity in the
sense that each $(h_i - Id)$ is Lipschitz, and the Lipschitz constant
decreases inversely with the number $m$ of functions composed. This
implies that $h$ can be represented to any accuracy by a deep residual
network whose nonlinear layers compute functions with a small
Lipschitz constant. Next, we consider nonlinear regression with a
composition of near-identity nonlinear maps. We show that, regarding
Fr\'echet derivatives with respect to the $h_1,\ldots,h_m$,
any critical point of a quadratic criterion in this near-identity
region must be a global minimizer. In contrast, if we consider
derivatives with respect to parameters of a fixed-size residual
network with sigmoid activation functions, we show that there are
near-identity critical points that are suboptimal, even in the
realizable case. Informally, this means that functional gradient
methods for residual networks cannot get stuck at suboptimal critical
points corresponding to near-identity layers, whereas parametric
gradient methods for sigmoidal residual networks suffer from
suboptimal critical points in the near-identity region.},
url = {https://arxiv.org/abs/1804.05012},
pdf = {https://arxiv.org/pdf/1804.05012.pdf}
}

@inproceedings{gdkisdl-yplprb-18,
author = {Dong Yin and Ashwin Pananjady and Max Lam and Dimitris
Papailiopoulos and Kannan Ramchandran and Peter Bartlett},
title = {Gradient Diversity: a Key Ingredient for Scalable Distributed
Learning},
year = {2018},
booktitle = {Proceedings of the 21st International Conference on
Artificial Intelligence and Statistics},
url = {http://proceedings.mlr.press/v84/yin18a.html},
pdf = {http://proceedings.mlr.press/v84/yin18a/yin18a.pdf},
pages = {1998--2007},
editor = {Amos Storkey and Fernando Perez-Cruz},
volume = {84},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
abstract = {It has been experimentally observed that distributed
implementations of mini-batch stochastic gradient descent (SGD)
algorithms exhibit speedup saturation and decaying generalization
ability beyond a particular batch-size. In this work, we present an
analysis hinting that high similarity between concurrently processed
gradients may be a cause of this performance degradation. We
introduce the notion of gradient diversity that measures the
dissimilarity between concurrent gradient updates, and show its key
role in the convergence and generalization performance of mini-batch
SGD.  We also establish that heuristics similar to DropConnect,
Langevin dynamics, and quantization, are provably diversity-inducing
mechanisms, and provide experimental evidence indicating that these
mechanisms can indeed enable the use of larger batches without
sacrificing accuracy and lead to faster training in distributed
learning. For example, in one of our experiments, for a
convolutional neural network to reach 95\% training accuracy on
MNIST, using the diversity-inducing mechanism can reduce the
training time by 30\% in the distributed setting.}
}

@inproceedings{crpbm-ffflcagm-18,
author = {Xiang Cheng and Fred Roosta and Stefan Palombo
and Peter Bartlett and Michael Mahoney},
title = {{FLAG} n' {FLARE}: Fast Linearly-Coupled Adaptive Gradient
Methods},
year = {2018},
pages = {404--414},
booktitle = {Proceedings of the 21st International Conference on
Artificial Intelligence and Statistics},
url = {http://proceedings.mlr.press/v84/cheng18b.html},
pdf = {http://proceedings.mlr.press/v84/cheng18b/cheng18b.pdf},
publisher = {PMLR},
editor = {Amos Storkey and Fernando Perez-Cruz},
volume = {84},
series = {Proceedings of Machine Learning Research},
abstract = {We consider first order gradient methods for effectively
optimizing a composite objective in the form of a sum of smooth and,
potentially, non-smooth functions. We present accelerated and adaptive
gradient methods, called FLAG and FLARE, which can offer the best of
both worlds. They can achieve the optimal convergence rate by
attaining the optimal first-order oracle complexity for smooth convex
available and conform to the geometry of the domain. We show
theoretically and empirically that, through the compounding effects of
acceleration and adaptivity, FLAG and FLARE can be highly effective
for many data fitting and machine learning applications.}
}

@inproceedings{cb-clmcmckld-18,
author = {Xiang Cheng and Peter Bartlett},
title = {Convergence of {L}angevin {MCMC} in {KL}-divergence},
year = {2018},
booktitle = {Proceedings of ALT2018},
url = {http://proceedings.mlr.press/v83/cheng18a.html},
pdf = {http://proceedings.mlr.press/v83/cheng18a/cheng18a.pdf},
pages = {186--211},
editor = {Firdaus Janoos and Mehryar Mohri and Karthik Sridharan},
volume = {83},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
abstract = {Langevin diffusion is a commonly used tool for
sampling from a given distribution. In this work, we establish that
when the target density $p^*$ is such that $\log p^*$ is $L$
smooth and $m$ strongly convex, discrete Langevin diffusion produces
a distribution $p$ with $\KL{p}{p^*}\leq\epsilon$ in
$\tilde{O}(\frac{d}{\epsilon})$ steps, where $d$ is the dimension of the
sample space.  We also study the convergence rate when the
strong-convexity assumption is absent. By considering the Langevin
diffusion as a gradient flow in the space of probability
distributions, we obtain an elegant analysis that applies to the
stronger property of convergence in KL-divergence and gives a
conceptually simpler proof of the best-known convergence results in
weaker metrics.}
}

@inproceedings{pbbhc-tadpamcsn-18,
author = {Martin P\'{e}ron and Peter Bartlett and Kai Helge Becker and Kate Helmstedt and Iadine Chad{\`e}s},
title = {Two approximate dynamic programming algorithms for managing
complete {SIS} networks},
year = {2018},
booktitle = {ACM SIGCAS Conference on Computing and
Sustainable Societies (COMPASS 2018)},
url = {https://dl.acm.org/citation.cfm?id=3209811.3209814},
}

@inproceedings{ycrb-brdltosr-18,
author = {Dong Yin and Yudong Chen and Kannan Ramchandran and Peter
Bartlett},
title = {Byzantine-Robust Distributed Learning: Towards Optimal
Statistical Rates},
abstract = {We develop distributed optimization algorithms that are
provably robust against Byzantine failures---arbitrary and potentially
adversarial behavior, in distributed computing systems, with a focus
on achieving optimal statistical performance. A main result of this
work is a sharp analysis of two robust distributed gradient descent
algorithms based on median and trimmed mean operations, respectively.
We prove statistical error rates for all of strongly convex,
non-strongly convex, and smooth non-convex population loss functions.
In particular, these algorithms are shown to achieve order-optimal
statistical error rates for strongly convex losses. To achieve better
communication efficiency, we further propose a median-based
distributed algorithm that is provably robust, and uses only one
communication round. For strongly convex quadratic loss, we show that
this algorithm achieves the same optimal error rate as the robust
distributed gradient descent algorithms.},
year = {2018},
booktitle = {Proceedings of the 35th International Conference on
Machine Learning (ICML-18)},
url = {http://proceedings.mlr.press/v80/yin18a.html},
pdf = {http://proceedings.mlr.press/v80/yin18a/yin18a.pdf},
pages = {5650--5659},
editor = {Dy, Jennifer and Krause, Andreas},
volume = {80},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
}

@inproceedings{cfmbj-otvrsgmc-18,
author = {Niladri Chatterji and Nicolas Flammarion and Yian Ma and
Peter Bartlett and Michael Jordan},
title = {On the Theory of Variance Reduction for Stochastic Gradient
{M}onte {C}arlo},
year = {2018},
booktitle = {Proceedings of the 35th International Conference on
Machine Learning (ICML-18)},
url = {http://proceedings.mlr.press/v80/chatterji18a.html},
pdf = {http://proceedings.mlr.press/v80/chatterji18a/chatterji18a.pdf},
pages = {764--773},
editor = {Dy, Jennifer and Krause, Andreas},
volume = {80},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
abstract = {We provide convergence guarantees in Wasserstein
distance for a variety of variance-reduction methods: SAGA Langevin
diffusion, SVRG Langevin diffusion and control-variate underdamped
Langevin diffusion. We analyze these methods under a uniform set of
assumptions on the log-posterior distribution, assuming it to be
smooth, strongly convex and Hessian Lipschitz. This is achieved by a
new proof technique combining ideas from finite-sum optimization and
the analysis of sampling methods. Our sharp theoretical bounds allow
us to identify regimes of interest where each method performs better
than the others. Our theory is verified with experiments on
real-world and synthetic datasets.}
}

@inproceedings{bhl-gdiielpdltdrn-18,
author = {Peter L.~Bartlett and David P.~Helmbold and Philip M.~Long},
title = {Gradient descent with identity initialization
efficiently learns positive definite linear transformations
by deep residual networks},
year = {2018},
url = {https://arxiv.org/abs/1802.06093},
booktitle = {Proceedings of the 35th International Conference on
Machine Learning (ICML-18)},
abstract = {We analyze algorithms for approximating a function $f(x) = \Phi x$ mapping $\Re^d$ to $\Re^d$ using deep linear neural networks,
i.e. that learn a function $h$ parameterized by matrices $\Theta_1, \ldots , \Theta_L$ and defined by $h(x) = \Theta_L\Theta_{L-1}\cdots\Theta_1x$. We focus on algorithms that
case that the distribution over the inputs is isotropic. We provide
polynomial bounds on the number of iterations for gradient descent to
approximate the least squares matrix $\Phi$, in the case where the
initial hypothesis $\Theta_1 = \cdots = \Theta_L = I$ has excess loss
bounded by a small enough constant. On the other hand, we show that
gradient descent fails to converge for $\Phi$ whose distance from the
identity is a larger constant, and we show that some forms of
regularization toward the identity in each layer do not help.
If $\Phi$ is symmetric positive definite, we show
that an algorithm that initializes $\Theta_i = I$ learns an
$\epsilon$-approximation of $f$ using a number of updates polynomial
in $L$, the condition number of $\Phi$, and $\log(d/\epsilon)$.
In contrast, we show that if the least squares matrix $\Phi$
is symmetric and has a negative eigenvalue, then all members of a
class of algorithms that perform gradient descent with identity
initialization, and optionally regularize toward the identity in each
layer, fail to converge. We analyze an algorithm for the case that
$\Phi$ satisfies $u^\top\Phi u>0$ for all $u$, but may not be symmetric.
This algorithm uses two regularizers: one that maintains the invariant
$u^\top\Theta_L\Theta_{L-1}\cdots\Theta_1 u>0$
for all $u$, and another that balances $\Theta_1, \ldots, \Theta_L$
so that they have the same singular values.},
url = {http://proceedings.mlr.press/v80/bartlett18a.html},
pdf = {http://proceedings.mlr.press/v80/bartlett18a/bartlett18a.pdf},
pages = {521--530},
editor = {Dy, Jennifer and Krause, Andreas},
volume = {80},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR}
}

@inproceedings{mb-himlr-18a,
author = {Alan Malek and Peter L.~Bartlett},
title = {Horizon-Independent Minimax Linear Regression},
booktitle = {Advances in Neural Information Processing Systems 31},
publisher = {Curran Associates, Inc.},
year = {2018},
abstract = {We consider a linear regression game: at each round, an
adversary reveals a covariate vector, the learner predicts a real
value, the adversary reveals a label, and the learner suffers the
squared prediction error. The aim is to minimize the difference
between the cumulative loss and that of the linear predictor that is
best in hindsight. Previous work demonstrated that the minimax optimal
strategy is easy to compute recursively from the end of the game; this
requires the entire sequence of covariate vectors in advance. We show
that, once provided with a measure of the scale of the problem, we can
invert the recursion and play the minimax strategy without knowing the
future covariates. Further, we show that this forward recursion
remains optimal even against adaptively chosen labels and covariates,
prevent misrepresentation of the scale of the problem. This strategy
is horizon-independent, i.e. it incurs no more regret than the optimal
strategy that knows in advance the number of rounds of the game. We
also provide an interpretation of the minimax algorithm as a
regularizer, and obtain an explicit expression for the minimax
regret.},
editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
pages = {5264--5273},
url = {http://papers.nips.cc/paper/7772-horizon-independent-minimax-linear-regression.pdf}
}

@inproceedings{ccbj-ulmanaa-18,
author = {Xiang Cheng and Niladri S.~Chatterji and Peter L.~Bartlett
and Michael I.~Jordan},
title = {Underdamped {L}angevin {MCMC}: A non-asymptotic analysis},
year = {2018},
booktitle = {Proceedings of the 31st Conference on Learning Theory (COLT2018)},
abstract = {We study the underdamped Langevin diffusion when the log
of the target distribution is smooth and strongly concave. We present
a MCMC algorithm based on its discretization and show that it achieves
$\epsilon$ error (in 2-Wasserstein distance) in $O(\sqrt d/\epsilon)$
steps. This is a significant improvement over the best known rate for
overdamped Langevin MCMC, which is $O(d/\epsilon^2)$ steps under the
same smoothness/concavity assumptions. The underdamped Langevin MCMC
scheme can be viewed as a version of Hamiltonian Monte Carlo (HMC)
which has been observed to outperform overdamped Langevin MCMC methods
in a number of application areas. We provide quantitative rates that
support this empirical wisdom.},
url = {http://proceedings.mlr.press/v75/cheng18a.html},
pdf = {http://proceedings.mlr.press/v75/cheng18a/cheng18a.pdf},
pages = {300--323},
editor = {Bubeck, S\'ebastien and Perchet, Vianney and Rigollet, Philippe},
volume = {75},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR}
}

@inproceedings{abgmv-bbwsabai-18,
author = {Yasin Abbasi-Yadkori and Peter L.~Bartlett and Victor
Gabillon and Alan Malek and Michal Valko},
title = {Best of Both Worlds:   Stochastic and Adversarial Best-Arm
Identification},
year = {2018},
abstract = {We study bandit best-arm identification with arbitrary and
potentially adversarial rewards. A simple random uniform learner
obtains the optimal rate of error in the adversarial scenario.
However, this type of strategy is suboptimal when the rewards are
sampled stochastically. Therefore, we ask: Can we design a learner
that performs optimally in both the stochastic and adversarial
problems while not being aware of the nature of the rewards? First, we
show that designing such a learner is impossible in general. In
particular, to be robust to adversarial rewards, we can only guarantee
optimal rates of error on a subset of the stochastic problems. We give
a lower bound that characterizes the optimal rate in stochastic
problems if the strategy is constrained to be robust to adversarial
rewards.  Finally, we design a simple parameter-free algorithm and
show that its probability of error matches (up to log factors) the
lower bound in stochastic problems, and it is also robust to
adversarial rewards.},
booktitle = {Proceedings of the 31st Conference on Learning Theory (COLT2018)},
pages = {918--949},
editor = {Bubeck, S\'ebastien and Perchet, Vianney and Rigollet, Philippe},
volume = {75},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR}
}

@inproceedings{bpfbj-gseasgec-18,
author = {Kush Bhatia and Aldo Pacchiano and Nicolas Flammarion and
Peter L.~Bartlett and Michael I.~Jordan},
title = {Gen-{O}ja: Simple and Efficient Algorithm for Streaming
Generalized Eigenvector Computation},
year = {2018},
booktitle = {Advances in Neural Information Processing Systems 31},
publisher = {Curran Associates, Inc.},
abstract = {In this paper, we study the problems of principle
Generalized Eigenvector computation and Canonical Correlation Analysis
in the stochastic setting. We propose a simple and efficient algorithm
for these problems. We prove the global convergence of our algorithm,
borrowing ideas from the theory of fast-mixing Markov chains and
two-Time-Scale Stochastic Approximation, showing that it achieves the
optimal rate of convergence. In the process, we develop tools for
understanding stochastic processes with Markovian noise which might be
of independent interest.},
editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
pages = {7016--7025},
url = {http://papers.nips.cc/paper/7933-gen-oja-simple-efficient-algorithm-for-streaming-generalized-eigenvector-computation.pdf}
}

@techreport{ccabj-scrldns-18,
title = {Sharp Convergence Rates for {L}angevin Dynamics in the
Nonconvex Setting},
author = {Xiang Cheng and Niladri S.~Chatterji and Yasin
Abbasi-Yadkori and Peter L.~Bartlett and Michael I.~Jordan},
institution = {arxiv.org},
number = {arXiv:1805.01648 [stat.ML]},
year = {2018},
url = {https://arxiv.org/abs/1805.01648}
}

@article{bhlm-ntvcdpdbfplnn-19,
author = {Peter L. Bartlett and Nick Harvey and Christopher Liaw and Abbas Mehrabian},
title = {Nearly-tight {VC}-dimension and Pseudodimension Bounds for Piecewise Linear Neural Networks},
journal = {Journal of Machine Learning Research},
year = {2019},
volume = {20},
number = {63},
pages = {1--17},
url = {http://jmlr.org/papers/v20/17-612.html}
}

@inproceedings{bgv-aspfaaoumlsa-19,
author = {Peter L.~Bartlett and Victor Gabillon and Michal Valko},
title = {A simple parameter-free and adaptive approach to optimization
under a minimal local smoothness assumption},
booktitle = {Proceedings of the 30th International Conference on Algorithmic Learning Theory},
pages = {184--206},
year = {2019},
editor = {Garivier, Aur\'elien and Kale, Satyen},
volume = {98},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v98/bartlett19a/bartlett19a.pdf},
url = {http://proceedings.mlr.press/v98/bartlett19a.html},
abstract = {We study the problem of optimizing a function under
a \emph{budgeted number of evaluations}. We only assume that the
function is \emph{locally} smooth around one of its global optima.
The difficulty of optimization is measured in terms of 1) the amount
of \emph{noise} $b$ of the function evaluation and 2)	the local
smoothness, $d$, of the function. A smaller $d$ results in smaller
optimization error. We come with a new, simple, and parameter-free
approach. First, for all values of $b$ and $d$, this approach
recovers at least the state-of-the-art regret guarantees. Second,
our approach additionally obtains these results while being
\textit{agnostic} to the values of both $b$ and $d$. This leads to
the first algorithm that naturally adapts to an \textit{unknown}
range of noise $b$ and leads to significant improvements in a
moderate and low-noise regime. Third, our approach also obtains a
remarkable improvement over the state-of-the-art SOO algorithm when
the noise is very low which includes the case of optimization under
deterministic feedback ($b=0$).	There, under our minimal local
smoothness assumption, this improvement is of exponential magnitude
and holds for a class of functions that covers the vast majority of
functions that practitioners optimize ($d=0$). We  show that our
algorithmic improvement is borne out in  experiments as we
empirically show faster convergence on common benchmarks.}
}

@inproceedings{mpbkbw-dfmfpogflqs-19,
title = {Derivative-Free Methods for Policy Optimization:
Guarantees for Linear Quadratic Systems},
author = {Dhruv Malik and Ashwin Pananjady and Kush Bhatia and
Koulik Khamaru and Peter L. Bartlett and Martin J. Wainwright},
booktitle = {Proceedings of the 22nd International Conference on Artificial
Intelligence and Statistics (AISTATS)},
pages = {2916--2925},
year = {2019},
editor = {Chaudhuri, Kamalika and Sugiyama, Masashi},
volume = {89},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v89/malik19a/malik19a.pdf},
url = {http://proceedings.mlr.press/v89/malik19a.html},
abstract = {We study derivative-free methods for policy
optimization over the class of linear policies. We focus on
characterizing the convergence rate of a canonical stochastic,
two-point, derivative-free method for linear-quadratic systems in
which the initial state of the system is drawn at random. In
particular, we show that for problems with effective dimension $D$,
such a method converges to an $\epsilon$-approximate solution within
$\widetilde{\mathcal{O}}(D/\epsilon)$ steps, with multiplicative
pre-factors that are explicit lower-order polynomial terms in the
curvature parameters of the problem. Along the way, we also derive
stochastic zero-order rates for a class of non-convex optimization
problems.}
}

@inproceedings{mrsb-bmwrmsosl-19,
title = {Best of many worlds: Robust model selection for online
supervised learning},
author = {Vidya Muthukumar and Mitas Ray and
Anant Sahai and Peter L. Bartlett},
booktitle = {Proceedings of the 22nd International Conference on Artificial
Intelligence and Statistics (AISTATS)},
pages = {3177--3186},
year = {2019},
editor = {Chaudhuri, Kamalika and Sugiyama, Masashi},
volume = {89},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v89/muthukumar19a/muthukumar19a.pdf},
url = {http://proceedings.mlr.press/v89/muthukumar19a.html},
abstract = {We introduce algorithms for online, full-information
prediction that   are computationally efficient and competitive with
contextual tree experts of unknown complexity, in both probabilistic
and adversarial settings.   We incorporate a novel probabilistic
framework of structural risk minimization into existing adaptive
algorithms and show that we can robustly learn not only the presence
of stochastic structure when it exists, but also the correct model
order. When the stochastic data is actually realized from a
predictor in the model class considered, we obtain regret bounds
that are competitive with the regret of an optimal algorithm that
possesses strong side information about both the true model order
and whether the process generating the data is stochastic or
adversarial. In cases where the data does not arise from any of the
models, our algorithm selects models of higher order as we play more
rounds.   We display empirically improved \textit{overall prediction
error} over other adversarially robust approaches.}
}

@inproceedings{pcb-olkl-19,
title = {Online learning with kernel losses},
author = {Chatterji, Niladri and Pacchiano, Aldo and Bartlett, Peter},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
pages = {971--980},
year = {2019},
editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
volume = {97},
series = {Proceedings of Machine Learning Research},
address = {Long Beach, California, USA},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v97/chatterji19a/chatterji19a.pdf},
url = {http://proceedings.mlr.press/v97/chatterji19a.html},
abstract = {We present a generalization of the adversarial
linear bandits framework, where the underlying losses are kernel
functions (with an associated reproducing kernel Hilbert space)
rather than linear functions. We study a version of the exponential
weights algorithm and bound its regret in this setting. Under
conditions on the eigen-decay of the kernel we provide a sharp
characterization of the regret for this algorithm. When we have
polynomial eigen-decay ($\mu_j \le \mathcal{O}(j^{-\beta})$), we
find that the regret is bounded by $\mathcal{R}_n \le \mathcal{O}(n^{\beta/2(\beta-1)})$. While under the assumption of
exponential eigen-decay ($\mu_j \le \mathcal{O}(e^{-\beta j })$) we
get an even tighter bound on the regret $\mathcal{R}_n \le \tilde{\mathcal{O}}(n^{1/2})$. When the eigen-decay is polynomial we
also show a \emph{non-matching} minimax lower bound on the regret of
$\mathcal{R}_n \ge \Omega(n^{(\beta+1)/2\beta})$ and a lower bound
of $\mathcal{R}_n \ge \Omega(n^{1/2})$ when the decay in the
eigen-values is exponentially fast. We also study the full
information setting when the underlying losses are kernel functions
and present an adapted exponential weights algorithm and a
conditional gradient descent algorithm.}
}

@inproceedings{bgv-sfapdddr-19,
author = {Peter L.~Bartlett and Victor Gabillon and Jennifer Healey
and Michal Valko},
title = {Scale-free adaptive planning for deterministic dynamics and
discounted rewards},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
pages = {495--504},
year = {2019},
editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
volume = {97},
series = {Proceedings of Machine Learning Research},
address = {Long Beach, California, USA},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v97/bartlett19a/bartlett19a.pdf},
url = {http://proceedings.mlr.press/v97/bartlett19a.html},
abstract = {We address the problem of planning in an environment
with deterministic dynamics and stochastic discounted rewards under
a limited numerical budget where the ranges of both rewards and
noise are unknown. We introduce PlaTypOOS, an adaptive, robust, and
efficient alternative to the OLOP (open-loop optimistic planning)
algorithm. Whereas OLOP requires a priori knowledge of the ranges of
both rewards and noise, PlaTypOOS dynamically adapts its behavior to
both. This allows PlaTypOOS to be immune to two vulnerabilities of
OLOP: failure when given underestimated ranges of noise and rewards
and inefficiency when these are overestimated. We prove that
PlaTypOOS acts in a provably more efficient manner vs. OLOP when
OLOP is given an overestimated reward and show that in the case of
no noise, PlaTypOOS learns exponentially faster.}
}

@inproceedings{abblsw-prbpiep-19,
title = {{POLITEX}: Regret Bounds for Policy Iteration using Expert Prediction},
author = {Abbasi-Yadkori, Yasin and Bartlett, Peter L. and Bhatia, Kush and Lazic, Nevena and Szepesvari, Csaba and Weisz, Gellert},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
pages = {3692--3702},
year = {2019},
editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
volume = {97},
series = {Proceedings of Machine Learning Research},
address = {Long Beach, California, USA},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v97/lazic19a/lazic19a.pdf},
url = {http://proceedings.mlr.press/v97/lazic19a.html},
abstract = {We present POLITEX (POLicy ITeration with EXpert
advice), a variant of policy iteration where each policy is a
Boltzmann distribution over the sum of action-value function
estimates of the previous policies, and analyze its regret in
continuing RL problems. We assume that the value function error
after running a policy for $\tau$ time steps scales as
$\epsilon(\tau) = \epsilon_0 + O(\sqrt{d/\tau})$, where $\epsilon_0$
is the worst-case approximation error and $d$ is the number of
features in a compressed representation of the state-action space.
We establish that this condition is satisfied by the LSPE algorithm
under certain assumptions on the MDP and policies. Under the error
assumption, we show that the regret of POLITEX in uniformly mixing
MDPs scales as $O(d^{1/2}T^{3/4} + \epsilon_0T)$, where $O(\cdot)$
hides logarithmic terms and problem-dependent constants. Thus, we
provide the first regret bound for a fully practical model-free
method which only scales in the number of features, and not in the
size of the underlying MDP. Experiments on a queuing problem confirm
that POLITEX is competitive with some of its alternatives, while
preliminary results on Ms Pacman (one of the standard Atari
benchmark problems) confirm the viability of POLITEX beyond linear
function approximation.}
}

@inproceedings{yckb-daspabrdl-19,
author = {Dong Yin and Yudong Chen and Ramchandran Kannan and
Peter L.~Bartlett},
title = {Defending Against Saddle Point Attack in {B}yzantine-Robust Distributed Learning},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
pages = {7074--7084},
year = {2019},
editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
volume = {97},
series = {Proceedings of Machine Learning Research},
address = {Long Beach, California, USA},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v97/yin19a/yin19a.pdf},
url = {http://proceedings.mlr.press/v97/yin19a.html},
abstract = {We study robust distributed learning that involves
minimizing a non-convex loss function with saddle points. We
consider the Byzantine setting where some worker machines have
abnormal or even arbitrary and adversarial behavior, and in this
setting, the Byzantine machines may create fake local minima near a
saddle point that is far away from any true local minimum, even when
robust gradient estimators are used. We develop ByzantinePGD, a
robust first-order algorithm that can provably escape saddle points
and fake local minima, and converge to an approximate true local
minimizer with low iteration complexity. As a by-product, we give a
simpler algorithm and analysis for escaping saddle points in the
usual non-Byzantine setting. We further discuss three robust
gradient estimators that can be used in ByzantinePGD, including
median, trimmed mean, and iterative filtering. We characterize their
performance in concrete statistical settings, and argue for their
near-optimality in low and high dimensional regimes.}
}

@inproceedings{ykb-rcarg-19,
author = {Dong Yin and Ramchandran Kannan and Peter L.~Bartlett},
title = {Rademacher Complexity for Adversarially Robust
Generalization},
booktitle = {Proceedings of the 36th International Conference on Machine Learning},
pages = {7085--7094},
year = {2019},
editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
volume = {97},
series = {Proceedings of Machine Learning Research},
address = {Long Beach, California, USA},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v97/yin19b/yin19b.pdf},
url = {http://proceedings.mlr.press/v97/yin19b.html},
abstract = {Many machine learning models are vulnerable to
adversarial attacks: small perturbations
that are imperceptible to humans can often make machine learning
models produce wrong predictions with high confidence; moreover,
although we may obtain robust models on the training dataset via
adversarial training, in some problems the learned models cannot
generalize well to the test data. In this paper, we focus on
$\ell_\infty$ attacks, and study the adversarially robust
generalization problem through the lens of Rademacher complexity.
For binary linear classifiers, we prove tight bounds for the
adversarial Rademacher complexity, and show that the adversarial
Rademacher complexity is never smaller than its natural counterpart,
and it has an unavoidable dimension dependence, unless the weight
vector has bounded $\ell_1$ norm, and our results also extend to
multi-class linear classifiers; in addition, for (nonlinear) neural
networks, we show that the dimension dependence in the adversarial
Rademacher complexity also exists. We further consider a surrogate
adversarial loss for one-hidden layer ReLU network and prove margin
bounds for this setting. Our results indicate that having $\ell_1$
norm constraints on the weight matrices might be a potential way to
improve generalization in the adversarial setting. We demonstrate
experimental results that validate our theoretical findings.}
}

@techreport{bmdbj-brnv-19,
title = {Bayesian Robustness: A Nonasymptotic Viewpoint},
author = {Kush Bhatia and Yi-An Ma and Anca D. Dragan and Peter L. Bartlett and Michael I. Jordan},
year = {2019},
institution = {arXiv},
number = {1907.11826}
}

@techreport{mhwbj-sbmm-19,
author = {Wenlong Mou and Nhat Ho and Martin J. Wainwright and
Peter L.~Bartlett and Michael I.~Jordan},
title = {Sampling for {B}ayesian Mixture Models: {MCMC} with Polynomial-Time Mixing},
institution = {arXiv},
number = {1912.05153},
year = {2019},
abstract = {We study the problem of sampling from the power posterior
distribution in Bayesian Gaussian mixture models. The power posterior
is a robust version of the classical posterior; it is known to be
non-log-concave and multi-modal, which leads to exponential mixing
times for some standard MCMC algorithms. We introduce and study the
Reflected Metropolis-Hastings Random Walk (RMRW) algorithm for
sampling from the power posterior distribution of two-component
location Gaussian mixtures. For symmetric mixtures with mean
parameters at $-\theta_0$ and $\theta_0$, we prove that its mixing
time is bounded as $d^{1.5}(d + \|\theta_0\|^2)^{4.5}$
as long as the sample size $n$ is of the order $d(d + \|\theta_0\|^2)$.
Notably, this result requires no conditions on the separation of the
two means $\theta_0$ and $-\theta_0$. En route to proving this bound,
we establish some new results, of possible independent interest, that
allow for combining Poincare inequalities for conditional and marginal
densities.}
}

@inproceedings{cfb-fmesgr-19,
title = {Fast Mean Estimation with Sub-Gaussian Rates},
author = {Cherapanamjeri, Yeshwanth and Flammarion, Nicolas and Bartlett, Peter L.},
booktitle = {Proceedings of the Thirty-Second Conference on Learning Theory},
pages = {786--806},
year = {2019},
editor = {Beygelzimer, Alina and Hsu, Daniel},
volume = {99},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v99/cherapanamjeri19b/cherapanamjeri19b.pdf},
url = {http://proceedings.mlr.press/v99/cherapanamjeri19b.html},
abstract = {We propose an estimator for the mean of a random
vector in $\mathbb{R}^d$ that can be computed in time
$O(n^{3.5}+n^2d)$ for $n$ i.i.d. samples and that has error bounds
matching the sub-Gaussian case. The only assumptions we make about
the data distribution are that it has finite mean and covariance; in
particular, we make no assumptions about higher-order moments. Like
the polynomial time estimator introduced by Hopkins (2018), which is
based on the sum-of-squares hierarchy, our estimator achieves
optimal statistical efficiency in this challenging setting, but it
has a significantly faster runtime and a simpler analysis.}
}

@inproceedings{cb-tmcwh-19,
author = {Yeshwanth Cherapanamjeri and Peter L.~Bartlett},
title = {Testing {M}arkov Chains Without Hitting},
year = {2019},
abstract = {We study the problem of identity testing of Markov chains.
In this setting, we are given access to a single trajectory from a
Markov chain with unknown transition matrix $Q$ and the goal is to
determine whether $Q = P$ for some known matrix $P$ or Dist$(P , Q)\ge\epsilon$, where Dist is suitably defined. In recent work by
Daskalakis et al.~(2018a), it was shown that it is possible to
distinguish between the two cases provided the length of the observed
trajectory is at least super-linear in the hitting time of $P$, which
may be arbitrarily large.  In this paper, we propose an algorithm that
avoids this dependence on hitting time, thus enabling efficient
testing of Markov chains even in cases where it is infeasible to
observe every state in the chain. Our algorithm is based on combining
classical ideas from approximation algorithms with techniques for the
spectral analysis of Markov chains.},
booktitle = {Proceedings of the 32nd Conference on Learning Theory (COLT2019)},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
pages = {758--785},
editor = {Beygelzimer, Alina and Hsu, Daniel},
volume = {99},
pdf = {http://proceedings.mlr.press/v99/cherapanamjeri19a/cherapanamjeri19a.pdf},
url = {http://proceedings.mlr.press/v99/cherapanamjeri19a.html}
}

@article{bhl-gdiielpdltdrn-19,
author = {Peter L.~Bartlett and David P.~Helmbold and Philip M.~Long},
title = {Gradient descent with identity initialization
efficiently learns positive definite linear transformations
by deep residual networks},
year = {2019},
journal = {Neural Computation},
volume = {31},
pages = {477--502}
}

@techreport{mfwb-esanscp-19,
title = {An Efficient Sampling Algorithm for Non-smooth Composite Potentials},
author = {Wenlong Mou and Nicolas Flammarion and Martin J. Wainwright and Peter L. Bartlett},
year = {2019},
institution = {arXiv},
number = {1910.00551}
}

@techreport{tb-borr-20,
title = {Benign overfitting in ridge regression},
author = {Alexander Tsigler and Peter L.~Bartlett},
year = {2020},
abstract = {Classical learning theory suggests that strong regularization is needed to learn a class with large complexity. This intuition is in contrast with the modern practice of machine learning, in particular learning neural networks, where the number of parameters often exceeds the number of data points. It has been observed empirically that such overparametrized models can show good generalization performance even if trained with vanishing or negative regularization. The aim of this work is to understand theoretically how this effect can occur, by studying the setting of ridge regression. We provide non-asymptotic generalization bounds for overparametrized ridge regression that depend on the arbitrary covariance structure of the data, and show that those bounds are tight for a range of regularization parameter values. To our knowledge this is the first work that studies overparametrized ridge regression in such a general setting. We identify when small or negative regularization is sufficient for obtaining small generalization error. On the technical side, our bounds only require the data vectors to be i.i.d. sub-gaussian, while most previous work assumes independence of the components of those vectors.},
institution = {arxiv.org},
number = {arXiv:2009.14286},
url = {https://arxiv.org/abs/2009.14286}
}

@inproceedings{bpbdw-plamc-20,
title = {Preference learning along multiple criteria: A game-theoretic
perspective},
author = {Kush Bhatia and Ashwin Pananjady and Peter L.~Bartlett and
Anca Dragan and Martin Wainwright},
year = {2020},
abstract = {The literature on ranking from ordinal data is vast, and
there are several ways to aggregate overall preferences from pairwise
comparisons between objects. In particular, it is well-known that any
Nash equilibrium of the zero-sum game induced by the preference matrix
defines a natural solution concept (winning distribution over objects)
known as a von Neumann winner. Many real-world problems, however, are
inevitably multi-criteria, with different pairwise preferences
governing the different criteria. In this work, we generalize the
notion of a von Neumann winner to the multi-criteria setting by taking
inspiration from Blackwell's approachability. Our framework allows for
non-linear aggregation of preferences across criteria, and generalizes
the linearization-based approach from multi-objective optimization.
From a theoretical standpoint, we show that the Blackwell winner of a
multi-criteria problem instance can be computed as the solution to a
convex optimization problem. Furthermore, given random samples of
pairwise comparisons, we show that a simple, ``plug-in'' estimator
achieves (near-)optimal minimax sample complexity. Finally, we
showcase the practical utility of our framework in a user study on
autonomous driving, where we find that the Blackwell winner
outperforms the von Neumann winner for the overall preferences.},
booktitle = {Advances in Neural Information Processing Systems},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
pages = {7413--7424},
publisher = {Curran Associates, Inc.},
url = {https://proceedings.neurips.cc/paper/2020/file/52f4691a4de70b3c441bca6c546979d9-Paper.pdf},
volume = {33}
}

@article{mpbkbw-dfmfpogflqs-20,
title = {Derivative-Free Methods for Policy Optimization: Guarantees
for Linear Quadratic Systems},
author = {Dhruv Malik and Ashwin Pananjady and Kush Bhatia and
Koulik Khamaru and Peter L.~Bartlett and Martin J.~Wainwright},
journal = {Journal of Machine Learning Research},
year = {2020},
volume = {21},
number = {21},
pages = {1--51},
url = {http://jmlr.org/papers/v21/19-198.html}
}

@inproceedings{cmb-oasoa-20,
title = {{OSOM}: A Simultaneously Optimal Algorithm for Multi-Armed and Linear
Contextual Bandits},
author = {Niladri Chatterji and Vidya Muthukumar and Peter L.~Bartlett},
year = {2020},
booktitle = {Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics},
pages = {1844--1854},
editor = {Chiappa, Silvia and Calandra, Roberto},
volume = {108},
series = {Proceedings of Machine Learning Research},
month = {26--28 Aug},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v108/chatterji20b/chatterji20b.pdf},
url = {http://proceedings.mlr.press/v108/chatterji20b.html},
abstract = {We consider the stochastic linear (multi-armed)
contextual bandit problem with the possibility of hidden simple
multi-armed bandit structure in which the rewards are independent of
the contextual information. Algorithms that are designed solely for
one of the regimes are known to be sub-optimal for their alternate
regime. We design a single computationally efficient algorithm that
simultaneously obtains problem-dependent optimal regret rates in the
simple multi-armed bandit regime and minimax optimal regret rates in
the linear contextual bandit regime, without knowing a priori which
of the two models generates the rewards. These results are proved
under the condition of stochasticity of contextual information over
multiple rounds. Our results should be viewed as a step towards
principled data-dependent policy class selection for contextual
bandits.}
}

@inproceedings{cdbj-lmcws-20,
title = {{L}angevin {M}onte {C}arlo without Smoothness},
author = {Chatterji, Niladri and Diakonikolas, Jelena and
Jordan, Michael I. and Bartlett, Peter L.},
booktitle = {Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics},
pages = {1716--1726},
year = {2020},
editor = {Chiappa, Silvia and Calandra, Roberto},
volume = {108},
series = {Proceedings of Machine Learning Research},
month = {26--28 Aug},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v108/chatterji20a/chatterji20a.pdf},
url = {http://proceedings.mlr.press/v108/chatterji20a.html},
abstract = {Langevin Monte Carlo (LMC) is an iterative algorithm
used to generate samples from a distribution that is known only up
to a normalizing constant. The nonasymptotic dependence of its
mixing time on the dimension and target accuracy is understood
mainly in the setting of smooth (gradient-Lipschitz) log-densities,
a serious limitation for applications in machine learning. In this
paper, we remove this limitation, providing polynomial-time
convergence guarantees for a variant of LMC in the setting of
nonsmooth log-concave distributions. At a high level, our results
follow by leveraging the implicit smoothing of the log-density that
comes from a small  Gaussian perturbation that we add to the
iterates of the algorithm and controlling the bias and variance that
are induced by this perturbation.}
}

@article{bllt-bolr-20,
author = {Peter L.~Bartlett and Philip M.~Long and G\'{a}bor Lugosi
and Alexander Tsigler},
title = {Benign Overfitting in Linear Regression},
journal = {Proceedings of the National Academy of Sciences},
note = {(arXiv:1906.11300)},
year = {2020},
volume = {117},
number = {48},
pages = {30063--30070},
doi = {10.1073/pnas.1907378117},
publisher = {National Academy of Sciences},
abstract = {The phenomenon of benign overfitting is one of the key mysteries uncovered by deep learning methodology: deep neural networks seem to predict well, even with a perfect fit to noisy training data. Motivated by this phenomenon, we consider when a perfect fit to training data in linear regression is compatible with accurate prediction. We give a characterization of linear regression problems for which the minimum norm interpolating prediction rule has near-optimal prediction accuracy. The characterization is in terms of two notions of the effective rank of the data covariance. It shows that overparameterization is essential for benign overfitting in this setting: the number of directions in parameter space that are unimportant for prediction must significantly exceed the sample size. By studying examples of data covariance properties that this characterization shows are required for benign overfitting, we find an important role for finite-dimensional data: the accuracy of the minimum norm interpolating prediction rule approaches the best possible accuracy for a much narrower range of properties of the data distribution when the data lie in an infinite-dimensional space vs. when the data lie in a finite-dimensional space with dimension that grows faster than the sample size.},
issn = {0027-8424},
url = {https://www.pnas.org/content/early/2020/04/22/1907378117},
eprint = {https://www.pnas.org/content/early/2020/04/22/1907378117.full.pdf}
}

@inproceedings{pmmb-otsla-20,
title = {On Approximate {T}hompson Sampling with {L}angevin Algorithms},
author = {Mazumdar, Eric and Pacchiano, Aldo and Ma, Yian and Jordan, Michael and Bartlett, Peter},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {6797--6807},
year = {2020},
editor = {Hal {Daum\'e} III and Aarti Singh},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/mazumdar20a/mazumdar20a.pdf},
url = {http://proceedings.mlr.press/v119/mazumdar20a.html},
abstract = {Thompson sampling for multi-armed bandit problems is
known to enjoy favorable performance in both theory and practice.
However, its wider deployment is restricted due to a significant
computational limitation: the need for samples from posterior
distributions at every iteration. In practice, this limitation is
alleviated by making use of approximate sampling methods, yet
provably incorporating approximate samples into Thompson Sampling
algorithms remains an open problem. In this work we address this by
proposing two efficient Langevin MCMC algorithms tailored to
Thompson sampling. The resulting approximate Thompson Sampling
algorithms are efficiently implementable and provably achieve
optimal instance-dependent regret for the Multi-Armed Bandit (MAB)
problem. To prove these results we derive novel posterior
concentration bounds and MCMC convergence rates for log-concave
distributions which may be of independent interest.}
}

@inproceedings{lpbj-ampermi-20,
author = {Jonathan Lee and Aldo Pacchiano and Peter L.~Bartlett and
Michael I.~Jordan},
title = {Accelerated Message Passing for Entropy-Regularized {MAP} Inference},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {5736--5746},
year = {2020},
editor = {Hal {Daum\'e} III and Aarti Singh},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/lee20e/lee20e.pdf},
url = {http://proceedings.mlr.press/v119/lee20e.html},
abstract = {Maximum a posteriori (MAP) inference in discrete-valued Markov random fields is a fundamental problem in machine learning that involves identifying the most likely configuration of random variables given a distribution. Due to the difficulty of this combinatorial problem, linear programming (LP) relaxations are commonly used to derive specialized message passing algorithms that are often interpreted as coordinate descent on the dual LP. To achieve more desirable computational properties, a number of methods regularize the LP with an entropy term, leading to a class of smooth message passing algorithms with convergence guarantees. In this paper, we present randomized methods for accelerating these algorithms by leveraging techniques that underlie classical accelerated gradient methods. The proposed algorithms incorporate the familiar steps of standard smooth message passing algorithms, which can be viewed as coordinate minimization steps. We show that these accelerated variants achieve faster rates for finding $\epsilon$-optimal points of the unregularized problem, and, when the LP is tight, we prove that the proposed algorithms recover the true MAP solution in fewer iterations than standard message passing algorithms.}
}

@inproceedings{pmlr-v119-cheng20e,
title = {Stochastic Gradient and {L}angevin Processes},
author = {Cheng, Xiang and Yin, Dong and Bartlett, Peter and Jordan, Michael},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {1810--1819},
year = {2020},
editor = {Daum{\'e}, III, Hal and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/cheng20e/cheng20e.pdf},
url = {http://proceedings.mlr.press/v119/cheng20e.html},
abstract = {We prove quantitative convergence rates at which
discrete Langevin-like processes converge to the invariant
distribution of a related stochastic differential equation. We study
the setup where the additive noise can be non-Gaussian and
state-dependent and the potential function can be non-convex. We
show that the key properties of these processes depend on the
potential function and the second moment of the additive noise. We
apply our theoretical findings to studying the convergence of
Stochastic Gradient Descent (SGD) for non-convex problems and
corroborate them with experiments using SGD to train deep neural
networks on the CIFAR-10 dataset.}
}

@inproceedings{mfb-sdarhs-20,
title = {Self-Distillation Amplifies Regularization in {H}ilbert Space},
author = {Hossein Mobahi and Mehrdad Farajtabar and Peter L. Bartlett},
year = {2020},
booktitle = {Advances in Neural Information Processing Systems},
volume = {33},
pages = {3351--3361},
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
publisher = {Curran Associates, Inc.},
abstract = {Knowledge distillation introduced in the deep learning
context is a method to transfer knowledge from one architecture to
another. In particular, when the architectures are identical, this is
called self-distillation. The idea is to feed in predictions of the
trained model as new target values for retraining (and iterate this
loop possibly a few times). It has been empirically observed that the
self-distilled model often achieves higher accuracy on held out data.
Why this happens, however, has been a mystery: the self-distillation
solely evolves by looping over training. To the best of our knowledge,
there is no rigorous understanding of why this happens. This work
provides the first theoretical analysis of self-distillation. We focus
on fitting a nonlinear function to training data, where the model
space is Hilbert space and fitting is subject to L2 regularization in
this function space. We show that self-distillation iterations modify
regularization by progressively limiting the number of basis functions
that can be used to represent the solution. This implies (as we also
verify empirically) that while a few rounds of self-distillation may
reduce over-fitting, further rounds may lead to under-fitting and thus
worse performance.}
}

@techreport{catjfb-orlrnlt-20,
title = {Optimal Robust Linear Regression in Nearly Linear Time},
author = {Yeshwanth Cherapanamjeri and Efe Aras and Nilesh Tripuraneni and Michael I. Jordan and Nicolas Flammarion and Peter L. Bartlett},
year = {2020},
institution = {arXiv},
number = {2007.08137},
url = {https://arxiv.org/abs/2007.08137}
}

@inproceedings{mlwbj-fgalsaa-20,
title = {On Linear Stochastic Approximation: Fine-grained
{P}olyak-{R}uppert and Non-Asymptotic Concentration},
author = {Mou, Wenlong and Li, Chris Junchi and Wainwright, Martin J.
and Bartlett, Peter L. and Jordan, Michael I.},
booktitle = {Proceedings of Thirty Third Conference on Learning Theory},
pages = {2947--2997},
year = {2020},
editor = {Jacob Abernethy and Shivani Agarwal},
volume = {125},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v125/mou20a/mou20a.pdf},
url = {http://proceedings.mlr.press/v125/mou20a.html},
abstract = { We undertake a precise study of the asymptotic and
non-asymptotic properties of stochastic approximation procedures
with Polyak-Ruppert averaging for solving a linear system $\bar{A} \theta = \bar{b}$. When the matrix $\bar{A}$ is Hurwitz, we prove a
central limit theorem (CLT) for the averaged iterates with fixed
step size and number of iterations going to infinity. The CLT
characterizes the exact asymptotic covariance matrix, which is the
sum of the classical Polyak-Ruppert covariance and a correction term
that scales with the step size. Under assumptions on the tail of the
noise distribution, we prove a non-asymptotic concentration
inequality whose main term matches the covariance in CLT in any
direction, up to universal constants. When the matrix $\bar{A}$ is
not Hurwitz but only has non-negative real parts in its eigenvalues,
we prove that the averaged LSA procedure actually achieves an
$O(1/T)$ rate in mean-squared error. Our results provide a more
refined understanding of linear stochastic approximation in both the
asymptotic and non-asymptotic settings. We also show various
applications of the main results, including the study of
momentum-based stochastic gradient methods as well as temporal
difference algorithms in reinforcement learning.}
}

@techreport{bl-fmdgblni-20,
title = {Failures of model-dependent generalization bounds for least-norm interpolation},
author = {Peter L. Bartlett and Philip M. Long},
year = {2020},
institution = {arXiv},
number = {2010.08479},
url = {https://arxiv.org/abs/2010.08479},
abstract = {We consider bounds on the generalization performance of the least-norm linear regressor, in the over-parameterized regime where it can interpolate the data. We describe a sense in which any generalization bound of a type that is commonly proved in statistical learning theory must sometimes be very loose when applied to analyze the least-norm interpolant. In particular, for a variety of natural joint distributions on training examples, any valid generalization bound that depends only on the output of the learning algorithm, the number of training examples, and the confidence parameter, and that satisfies a mild condition (substantially weaker than monotonicity in sample size), must sometimes be very loose---it can be bounded below by a constant when the true excess risk goes to zero.}
}

@techreport{ctbj-omewv-20,
title = {Optimal Mean Estimation without a Variance},
author = {Yeshwanth Cherapanamjeri and Nilesh Tripuraneni and Peter L. Bartlett and Michael I. Jordan},
year = {2020},
institution = {arXiv},
number = {2011.12433},
url = {https://arxiv.org/abs/2011.12433}
}

@article{bmr-dlasp-21,
author = {Peter L. Bartlett and Andrea Montanari and Alexander
Rakhlin},
title = {Deep learning: a statistical viewpoint},
journal = {Acta Numerica},
year = {2021},
volume = {30},
doi = {10.1017/S0962492921000027},
publisher = {Cambridge University Press},
pages = {87--201},
url = {https://arxiv.org/abs/2103.09177},
abstract = {The remarkable practical success of deep learning has revealed some
major surprises from a theoretical perspective.  In particular,
simple gradient methods easily find near-optimal solutions to
non-convex optimization problems, and despite giving a near-perfect
fit to training data without any explicit effort to control model
complexity, these methods exhibit excellent predictive accuracy.
We conjecture that specific principles underlie these phenomena: that
overparametrization allows gradient methods to find interpolating
solutions, that these methods implicitly impose regularization,
and that overparametrization leads to benign overfitting, that is,
accurate predictions despite overfitting training data.
We survey recent theoretical progress
that provides examples illustrating these principles in simpler
settings.  We first review classical uniform convergence results
and why they fall short of explaining aspects of the behavior of
deep learning methods.  We give examples of implicit regularization
functions that perfectly fit the training data.  Then we review
prediction methods that exhibit benign overfitting, focusing
on regression problems with quadratic loss.  For these methods,
we can decompose the prediction rule into a simple component that
is useful for prediction and a spiky component that is useful for
overfitting but, in a favorable setting, does not harm prediction
accuracy.  We focus specifically on the linear regime for neural
networks, where the network can be approximated by a linear model.
In this regime, we demonstrate the success of gradient flow, and
we consider benign overfitting with two-layer networks, giving an
exact asymptotic analysis that precisely demonstrates the impact of
overparametrization.  We conclude by highlighting the key challenges
that arise in extending these insights to realistic deep learning
settings.}
}

@article{bl-fmdgblni-21,
  author  = {Peter L. Bartlett and Philip M. Long},
  title   = {Failures of Model-dependent Generalization Bounds for Least-norm Interpolation},
  journal = {Journal of Machine Learning Research},
  volume  = {22},
  number  = {204},
  pages   = {1--15},
  year    = {2021},
  url     = {http://jmlr.org/papers/v22/20-1164.html},
  note    = {arXiv:2010.08479}
}

@article{mmbjw-holdyama-21,
author = {Wenlong Mou and Yi-An Ma and Martin J. Wainwright and
Peter L. Bartlett and Michael I. Jordan},
title = {High-Order {L}angevin Diffusion Yields an Accelerated
{MCMC} Algorithm},
journal = {Journal of Machine Learning Research},
year = {2021},
volume = {22},
number = {42},
pages = {1--41},
url = {http://jmlr.org/papers/v22/20-576.html}
}

@article{mccfbj-itanafgbm-21,
author = {Yi-An Ma and Niladri S. Chatterji and Xiang Cheng
and Nicolas Flammarion and Peter L. Bartlett and Michael I. Jordan},
title = {Is There an Analog of {N}esterov Acceleration for
Gradient-Based {MCMC}?},
year = {2021},
journal = {Bernoulli},
volume = {27},
number = {3},
pages = {1942--1992},
doi = {10.3150/20-BEJ1297},
abstract = {We formulate gradient-based Markov chain Monte Carlo
(MCMC) sampling as optimization on the space of probability measures,
with Kullback--Leibler (KL) divergence as the objective functional. We
show that an underdamped form of the Langevin algorithm performs
accelerated gradient descent in this metric. To characterize the
convergence of the algorithm, we construct a Lyapunov functional and
exploit hypocoercivity of the underdamped Langevin algorithm. As an
application, we show that accelerated rates can be obtained for a
class of nonconvex functions with the Langevin algorithm.}
}

@inproceedings{bbds-aluu-21,
author = {Kush Bhatia and Peter L. Bartlett and Anca D. Dragan and Jacob Steinhardt},
title = {Agnostic Learning with Unknown Utilities},
booktitle = {12th Innovations in Theoretical Computer Science Conference (ITCS 2021)},
pages = {55:1--55:20},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
isbn = {978-3-95977-177-1},
issn = {1868-8969},
year = {2021},
volume = {185},
editor = {James R. Lee},
publisher = {Schloss Dagstuhl--Leibniz-Zentrum f{\"u}r Informatik},
url = {https://drops.dagstuhl.de/opus/volltexte/2021/13594},
doi = {10.4230/LIPIcs.ITCS.2021.55}
}

@inproceedings{pgbj-sblc-21,
title = {Stochastic Bandits with Linear Constraints},
author = {Pacchiano, Aldo and Ghavamzadeh, Mohammad and Bartlett, Peter L. and Jiang, Heinrich},
booktitle = {Proceedings of The 24th International Conference on Artificial Intelligence and Statistics},
pages = {2827--2835},
year = {2021},
editor = {Banerjee, Arindam and Fukumizu, Kenji},
volume = {130},
series = {Proceedings of Machine Learning Research},
month = {13--15 Apr},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v130/pacchiano21a/pacchiano21a.pdf},
url = {http://proceedings.mlr.press/v130/pacchiano21a.html},
abstract = { We study a constrained contextual linear bandit setting, where the goal of the agent is to produce a sequence of policies, whose expected cumulative reward over the course of multiple rounds is maximum, and each one of them has an expected cost below a certain threshold. We propose an upper-confidence bound algorithm for this problem, called optimistic pessimistic linear bandit (OPLB), and prove a sublinear bound on its regret that is inversely proportional to the difference between the constraint threshold and the cost of a known feasible action. Our algorithm balances exploration and constraint satisfaction using a novel idea that scales the radii of the reward and cost confidence sets with different scaling factors. We further specialize our results to multi-armed bandits and propose a computationally efficient algorithm for this setting and prove a regret bound that is better than simply casting multi-armed bandits as an instance of linear bandits and using the regret bound of OPLB. We also prove a lower-bound for the problem studied in the paper and provide simulations to validate our theoretical results. Finally, we show how our algorithm and analysis can be extended to multiple constraints and to the case when the cost of the feasible action is unknown. }
}

@inproceedings{abms-defcc-21,
title = {Dropout: Explicit Forms and Capacity Control},
author = {Raman Arora and Peter L. Bartlett and Poorya Mianjy and
Nathan Srebro},
year = {2021},
booktitle = {Proceedings of the 38th International Conference on Machine Learning},
pages = {351--361},
editor = {Meila, Marina and Zhang, Tong},
volume = {139},
series = {Proceedings of Machine Learning Research},
month = {18--24 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v139/arora21a/arora21a.pdf},
url = {http://proceedings.mlr.press/v139/arora21a.html},
abstract = {We investigate the capacity control provided by
dropout in various machine learning problems. First, we study
dropout for matrix completion, where it induces a
distribution-dependent regularizer that equals the weighted
trace-norm of the product of the factors. In deep learning, we show
that the distribution-dependent regularizer due to dropout directly
controls the Rademacher complexity of the underlying class of deep
neural networks. These developments enable us to give concrete
generalization error bounds for the dropout algorithm in both matrix
completion as well as training deep neural networks.}
}

@inproceedings{clb-wdgdlli-21,
author = {Niladri S. Chatterji and Philip M. Long and Peter L. Bartlett},
title = {When does gradient descent with logistic loss interpolate
using deep networks with smoothed {ReLU} activations?},
year = {2021},
booktitle = {Proceedings of the 34th Conference on Learning Theory
(COLT2021)},
pages = {927--1027},
editor = {Belkin, Mikhail and Kpotufe, Samory},
volume = {134},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
url = {https://proceedings.mlr.press/v134/chatterji21a.html},
abstract = {We establish conditions under which gradient descent
applied to fixed-width deep networks drives the logistic loss to zero,
and prove bounds on the rate of convergence. Our analysis applies for
smoothed approximations to the ReLU, such as Swish and the Huberized
ReLU, proposed in previous applied work. We provide two sufficient
conditions for convergence. The first is simply a bound on the loss at
initialization. The second is a data separation condition used in
prior analyses.}
}

@inproceedings{psab-tdfualc-21,
author = {Juan Perdomo and Max Simchowitz and Alekh Agarwal and
Peter L. Bartlett},
title = {Towards a Dimension-Free Understanding of Adaptive Linear
Control},
year = {2021},
booktitle = {Proceedings of the 34th Conference on Learning Theory
(COLT2021)},
note = {To appear},
abstract = {We study the problem of adaptive control of the linear
quadratic regulator for systems in very high, or even infinite
dimension. We demonstrate that while sublinear regret requires finite
dimensional inputs,  the ambient state dimension of the system need
not be small in order to perform online control. We provide the first
regret bounds for LQR which hold for infinite dimensional systems,
replacing dependence on ambient dimension with more natural notions of
problem complexity. Our guarantees arise from a novel perturbation
bound for certainty equivalence which scales with the prediction
error in estimating the system parameters, without requiring
consistent parameter recovery in more stringent measures like the
operator norm. When specialized to finite dimensional settings, our
bounds recover near optimal dimension and time horizon dependence.}
}

@techreport{bbc-aemlrrn-21a,
author = {Peter L. Bartlett and Sebastien Bubeck and Yeshwanth
Cherapanamjeri},
title = {Adversarial Examples in Multi-Layer Random {ReLU} Networks},
year = {2021},
institution = {arXiv},
number = {2106.12611},
abstract = {We consider the phenomenon of adversarial examples in ReLU
networks with independent gaussian parameters. For networks of
constant depth and with a large range of widths (for instance, it
suffices if the width of each layer is polynomial in that of any other
layer), small perturbations of input vectors lead to large changes of
outputs. This generalizes results of Daniely and Schacham (2020) for
networks of rapidly decreasing width and of Bubeck et al (2021) for
two-layer networks. The proof shows that adversarial examples arise in
these networks because the functions that they compute are very close
to linear. Bottleneck layers in the network play a key role: the
minimal width up to some point in the network determines scales and
sensitivities of mappings computed up to that point. The main result
is for networks with constant depth, but we also show that some
constraint on depth is necessary for a result of this kind, because
there are suitably deep networks that, with constant probability,
compute a function that is close to constant.}
}

@article{clb-wdgdllfitl-21,
author = {Niladri S. Chatterji and Philip M. Long and Peter L. Bartlett},
title = {When does gradient descent with logistic loss find interpolating two-layer networks?},
year = {2021},
journal = {Journal of Machine Learning Research},
volume = {22},
number = {159},
pages = {1--48},
url = {https://arxiv.org/abs/2012.02409},
abstract = {We study the training of finite-width two-layer smoothed
ReLU networks for binary classification using the logistic loss. We
show that gradient descent drives the training loss to zero if the
initial loss is small enough. When the data satisfies certain cluster
and separation conditions and the network is wide enough, we show that
one step of gradient descent reduces the loss sufficiently that the
first result applies.}
}

@article{cbl-olbsgsa-21,
author = {Niladri S. Chatterji and Peter L. Bartlett
and Philip M. Long},
title = {Oracle lower bounds for stochastic gradient sampling
algorithms},
year = {2022},
journal = {Bernoulli},
volume = {28},
number = {2},
pages = {1074--1092},
note = {arXiv:2002.00291},
url = {https://arxiv.org/abs/2002.00291},
abstract = {We consider the problem of sampling from a strongly
log-concave density in R^d, and prove an information theoretic lower bound on the number of stochastic gradient queries of the log density needed. Several popular sampling algorithms (including many Markov chain Monte Carlo methods) operate by using stochastic gradients of the log density to generate a sample; our results establish an information theoretic limit for all these algorithms.
We show that for every algorithm, there exists a well-conditioned
strongly log-concave target density for which the distribution of
points generated by the algorithm would be at least $\epsilon$ away from the
target in total variation distance if the number of gradient queries
is less than Omega(sigma^2d/epsilon^2), where sigma^2d is the variance of the stochastic
gradient. Our lower bound follows by combining the ideas of Le Cam
deficiency routinely used in the comparison of statistical experiments
along with standard information theoretic tools used in lower bounding
Bayes risk functions. To the best of our knowledge our results provide
the first nontrivial dimension-dependent lower bound for this
problem.}
}

@article{mfwb-ibdldnorwc-21,
author = {Mou, Wenlong and Flammarion, Nicolas and Wainwright, Martin J.
and Bartlett, Peter L.},
title = {Improved Bounds for Discretization of {L}angevin Diffusions:
Near-Optimal Rates without Convexity},
year = {2022},
journal = {Bernoulli},
volume = {28},
number = {3},
pages = {1577--1601},
doi = {10.3150/21-BEJ1343},
url = {https://arxiv.org/abs/1907.11331},
abstract = {We present an improved analysis of the Euler-Maruyama discretization of the Langevin diffusion. Our analysis does not require global contractivity, and yields polynomial dependence on the time horizon. Compared to existing approaches, we make an additional smoothness assumption, and improve the existing rate from O(eta) to O(eta^2) in terms of the KL divergence. This result matches the correct order for numerical SDEs, without suffering from exponential time dependence. When applied to algorithms for sampling and learning, this result simultaneously improves all those methods based on Dalayan's approach.}
}

@inproceedings{mpwb-oidgmlsa-22,
author = {Wenlong Mou and Ashwin Pananjady and Martin J. Wainwright
and Peter L. Bartlett},
title = {Optimal and instance-dependent guarantees for {M}arkovian
linear stochastic approximation},
year = {2022},
booktitle = {Proceedings of the 35th Conference on Learning Theory (COLT2022)},
note = {To appear},
abstract = {We study stochastic approximation procedures for approximately solving a $d$-dimensional linear fixed point equation based on observing a trajectory of length $n$ from an ergodic Markov chain.  We first exhibit a non-asymptotic bound of the order $t_{\mathrm{mix}} \tfrac{d}{n}$ on the squared error of the last iterate of a standard scheme, where $t_{\mathrm{mix}}$ is a mixing time.  We then prove a non-asymptotic instance-dependent bound on a suitably averaged sequence of iterates, with a leading term that matches the local asymptotic minimax limit, including sharp dependence on the parameters $(d, t_{\mathrm{mix}})$ in the higher order terms. We complement these upper bounds with a non-asymptotic minimax lower bound that establishes the instance-optimality of the averaged SA estimator. We derive corollaries of these results for policy evaluation with Markov noise---covering the TD($\lambda$) family of algorithms for all $\lambda \in [0, 1)$---and linear autoregressive models. Our instance-dependent characterizations open the door to the design of fine-grained model selection procedures for hyperparameter tuning (e.g., choosing the value of $\lambda$ when running the TD($\lambda$) algorithm).}
}

@inproceedings{fcb-bowl-22,
author = {Spencer Frei and Niladri Chatterji and Peter L. Bartlett},
title = {Benign Overfitting without Linearity: Neural Network
Classifiers Trained by Gradient Descent for Noisy Linear Data},
year = {2022},
booktitle = {Proceedings of the 35th Conference on Learning Theory (COLT2022)},
note = {To appear},
abstract = {Benign overfitting, the phenomenon where interpolating models
generalize well in the presence of noisy data, was first observed in
neural network models trained with gradient descent.  To better
understand this empirical observation, we consider the generalization
error of two-layer neural networks trained to interpolation by
gradient descent on the logistic loss following random initialization.
We assume the data comes from well-separated class-conditional
log-concave distributions and allow for a constant fraction of the
training labels to be corrupted by an adversary.  We show that in this
setting, neural networks exhibit benign overfitting: they can be
driven to zero training error, perfectly fitting any noisy training
labels, and simultaneously achieve test error close to the
Bayes-optimal error.   In contrast to previous work on benign
overfitting that require linear or kernel-based predictors, our
analysis holds in a setting where both the model and learning dynamics
are fundamentally nonlinear.}
}

@inproceedings{biw-gbddnla-22,
author = {Peter L. Bartlett and Piotr Indyk and Tal Wagner},
title = {Generalization Bounds for Data-Driven Numerical Linear
Algebra},
year = {2022},
booktitle = {Proceedings of the 35th Conference on Learning Theory (COLT2022)},
note = {To appear},
abstract = {Data-driven algorithms can adapt their internal structure or parameters to inputs from unknown application-specific distributions, by learning from a training sample of inputs. Several recent works have applied this approach to problems in numerical linear algebra, obtaining significant empirical gains in performance. However, no theoretical explanation for their success was known.
In this work we prove generalization bounds for those algorithms,
within the PAC-learning framework for data-driven algorithm selection
proposed by Gupta and Roughgarden (SICOMP 2017). Our main result is an
almost tight bound on the fat shattering dimension of the
learning-based low rank approximation algorithm of Indyk et
al.~(NeurIPS 2019). Our techniques are general, and provide
generalization bounds for many other recently proposed data-driven
algorithms in numerical linear algebra, covering both sketching-based
and multigrid-based methods. This considerably broadens the class of
data-driven algorithms for which a PAC-learning analysis is
available.}
}

@inproceedings{ctbj-omewv-22,
title = {Optimal Mean Estimation without a Variance},
author = {Yeshwanth Cherapanamjeri and Nilesh Tripuraneni and Peter L. Bartlett and Michael I. Jordan},
year = {2022},
booktitle = {Proceedings of the 35th Conference on Learning Theory (COLT2022)},
note = {To appear},
abstract = {We study the problem of heavy-tailed mean estimation in settings where the variance of the data-generating distribution does not exist. Concretely, given a sample $\mathbf{X} = \{X_i\}_{i = 1}^n$ from a distribution $\mathcal{D}$ over $\mathbb{R}^d$ with mean $\mu$ which satisfies the following \emph{weak-moment} assumption for some ${\alpha \in [0, 1]}$:
$\forall \|v\| = 1: \mathbb{E}_{X \sim \mathcal{D}}[|\langle X - \mu, v\rangle|^{1 + \alpha}] \leq 1$,
and given a target failure probability, $\delta$, our goal is to design an estimator which attains the smallest possible confidence interval as a function of $n,d,\delta$. For the specific case of $\alpha = 1$, foundational work of Lugosi and Mendelson exhibits an estimator achieving subgaussian confidence intervals, and subsequent work has led to computationally efficient versions of this estimator. Here, we study the case of general $\alpha$, and establish the following information-theoretic lower bound on the optimal attainable confidence interval:
$\Omega \left(\sqrt{\frac{d}{n}} + \left(\frac{d}{n}\right)^{\frac{\alpha}{(1 + \alpha)}} + \left(\frac{\log 1 / \delta}{n}\right)^{\frac{\alpha}{(1 + \alpha)}}\right)$.
Moreover, we devise a computationally-efficient estimator which
achieves this lower bound.}
}

@article{clb-ibibbotlln-22,
author = {Niladri S. Chatterji and Philip M. Long and Peter L. Bartlett},
title = {The interplay between implicit bias and benign overfitting in
two-layer linear networks},
year = {2022},
journal = {Journal of Machine Learning Research},
note = {To appear}
}

This file was generated by
bibtex2html 1.98.