@comment{{This file has been generated by bib2bib 1.91}}
@comment{{Command line: bib2bib -ob pubs-93.bib -c 'year>=1993 and not note : "[pP]atent" ' /accounts/fac/bartlett/share/lib/bibtex/mypubs.bib}}
@article{b-vcdbttln-93,
author = {P. L. Bartlett},
title = {{V}apnik-{C}hervonenkis dimension bounds for two- and three-layer
networks},
journal = {Neural Computation},
volume = {5},
number = {3},
pages = {371--373},
year = {1993}
}
@inproceedings{lb-evbmlnn-93,
author = {D. R. Lovell and P. L. Bartlett},
title = {Error and variance bounds in multi-layer neural networks},
booktitle = {Proceedings of the Fourth Australian Conference
on Neural Networks},
pages = {161--164},
year = {1993}
}
@inproceedings{b-ssnlmln-93,
author = {P. L. Bartlett},
title = {The sample size necessary for learning in multi-layer networks},
booktitle = {Proceedings of the Fourth Australian Conference
on Neural Networks},
pages = {14--17},
year = {1993}
}
@inproceedings{b-lbvcdmltn-93,
author = {P. L. Bartlett},
title = {Lower bounds on the {V}apnik-{C}hervonenkis dimension of multi-layer
threshold networks},
booktitle = {Proceedings of the Sixth Annual ACM Conference
on Computational Learning Theory},
pages = {144--150},
publisher = {ACM Press},
city = {New York},
year = {1993}
}
@incollection{b-clt-94,
author = {P. L. Bartlett},
title = {Computational Learning Theory},
booktitle = {Encyclopedia of Computer Science and Technology},
volume = {31},
pages = {83--99},
editor = {A. Kent and J. G. Williams},
publisher = {Marcel Dekker},
city = {New York},
year = {1994}
}
@inproceedings{bfh-erwl-94,
author = {P. L. Bartlett and P. Fischer and K.-U. H{\"o}ffgen},
title = {Exploiting random walks for learning},
booktitle = {Proceedings of the Seventh Annual ACM Conference
on Computational Learning Theory},
pages = {318--327},
publisher = {ACM Press},
city = {New York},
year = {1994}
}
@inproceedings{blw-fslrvf-94,
author = {P. L. Bartlett and P. M. Long and R. C. Williamson},
title = {Fat-shattering and the learnability of real-valued
functions},
booktitle = {Proceedings of the Seventh Annual ACM Conference
on Computational Learning Theory},
pages = {299--310},
publisher = {ACM Press},
city = {New York},
year = {1994}
}
@unpublished{bw-scvae-94,
author = {P. L Bartlett and R. C. Williamson},
title = {Sample complexity versus approximation error},
note = {(unpublished), 22pp},
year = {1994}
}
@inproceedings{lbw-lbvdspfc-94,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {Lower bounds on the {VC}-dimension of smoothly parametrized function
classes},
booktitle = {Proceedings of the Seventh Annual ACM Conference
on Computational Learning Theory},
pages = {362--367},
publisher = {ACM Press},
city = {New York},
year = {1994}
}
@inproceedings{b-lqrvf-94,
author = {P. L. Bartlett},
title = {Learning quantized real-valued functions},
booktitle = {Proceedings of Computing: the Australian Theory Seminar},
pages = {24--35},
publisher = {University of Technology Sydney},
year = {1994}
}
@inproceedings{lbw-vcdnnrpr-94,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {The {V}apnik-{C}hervonenkis dimension of neural networks with restricted
parameter ranges},
booktitle = {Proceedings of the Fifth Australian Conference on Neural
Networks},
pages = {198--201},
year = {1994}
}
@article{lbw-lbvspfc-95,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {Lower bounds on the {VC}-dimension of smoothly parametrized function
classes},
journal = {Neural Computation},
volume = {7},
pages = {990--1002},
year = {1995},
note = {(See also correction, Neural Computation, 9: 765--769, 1997)}
}
@inproceedings{bd-ecgdacrnn-95,
author = {P. L. Bartlett and S. Dasgupta},
title = {Exponential convergence of a gradient descent algorithm for
a class of recurrent neural networks},
booktitle = {Proceedings of the 38th Midwest Symposium
on Circuits and Systems},
year = {1995}
}
@inproceedings{bl-mtssdl-95,
author = {P. L. Bartlett and P. M. Long},
title = {More theorems about scale sensitive dimensions and learning},
booktitle = {Proceedings of the Eighth Annual ACM Conference
on Computational Learning Theory},
pages = {392--401},
publisher = {ACM Press},
city = {New York},
year = {1995}
}
@inproceedings{lbw-eallcbf-95,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {On efficient agnostic learning of linear combinations of basis
functions},
booktitle = {Proceedings of the Eighth Annual ACM Conference
on Computational Learning Theory},
pages = {369--376},
publisher = {ACM Press},
city = {New York},
year = {1995}
}
@inproceedings{ab-aie-95,
author = {M. Anthony and P. L. Bartlett},
title = {Function learning from interpolation},
booktitle = {Computational Learning Theory: Second European
Conference, EUROCOLT 95, Barcelona Spain, March 1995,
Proceedings},
pages = {211--221},
year = {1995}
}
@inproceedings{bw-scnnldi-95,
author = {P. L. Bartlett and R. C. Williamson},
title = {The sample complexity of neural network learning with discrete
inputs},
booktitle = {Proceedings of the Sixth Australian Conference on
Neural Networks},
pages = {189--192},
year = {1995}
}
@inproceedings{lbw-ealnnbf-95,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {Efficient agnostic learning of neural networks with bounded
fan-in},
booktitle = {Proceedings of the Sixth Australian Conference
on Neural Networks},
pages = {201--204},
year = {1995}
}
@book{bbw-psacnn-96,
author = {Peter L. Bartlett and Anthony Burkitt and
Robert C. Williamson},
title = {Proceedings of the Seventh Australian Conference on Neural
Networks},
publisher = {Australian National University},
city = {Canberra},
year = {1996}
}
@article{lbw-ealnnbf-96,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {Efficient agnostic learning of neural networks with bounded
fan-in},
journal = {IEEE Transactions on Information Theory},
volume = {42},
number = {6},
pages = {2118--2132},
year = {1996}
}
@unpublished{bh-lcp-96,
author = {P. L. Bartlett and D. P. Helmbold},
title = {Learning changing problems},
note = {(unpublished), 36pp},
year = {1996}
}
@article{abist-vgai-96,
author = {M. Anthony and P. L. Bartlett and Y. Ishai
and J. Shawe-Taylor},
title = {Valid generalisation from approximate interpolation},
journal = {Combinatorics, Probability, and Computing},
volume = {5},
pages = {191--214},
year = {1996}
}
@article{blw-fslrvf-96,
author = {P. L. Bartlett and P. M. Long and R. C. Williamson},
title = {Fat-shattering and the learnability of real-valued
functions},
journal = {Journal of Computer and System Sciences},
note = {(special issue on COLT`94)},
volume = {52},
number = {3},
pages = {434--452},
year = {1996}
}
@article{bw-vcdptlnndi-96,
author = {P. L. Bartlett and R. C. Williamson},
title = {The {V}apnik-{C}hervonenkis dimension and pseudodimension of two-layer
neural networks with discrete inputs},
journal = {Neural Computation},
volume = {8},
pages = {653--656},
year = {1996}
}
@inproceedings{ksbw-elcmvf-96,
author = {A. Kowalczyk and J. Szymanski and P. L. Bartlett
and R. C. Williamson},
title = {Examples of learning curves from a modified {VC}-formalism},
booktitle = {Advances in Neural Information Processing Systems 8},
year = {1996}
}
@inproceedings{bk-cmcsnd-96,
author = {P. L. Bartlett and S. R. Kulkarni},
title = {The complexity of model classes, and smoothing noisy data
(Invited)},
booktitle = {Proceedings of the 35th IEEE Conference on Decision and
Control},
pages = {TM09-4, 2312--2317},
publisher = {IEEE},
year = {1996}
}
@inproceedings{kbb-sbtlc-96,
author = {L. Kammer and R. R. Bitmead and P. L. Bartlett},
title = {Signal-based testing of {LQ}-optimality of controllers},
booktitle = {Proceedings of the 35th IEEE Conference on Decision
and Control},
pages = {FA17-2, 3620--3623},
publisher = {IEEE},
year = {1996}
}
@inproceedings{bbdk-lccesc-96,
author = {P. L. Bartlett and S. Ben-David and S. R. Kulkarni},
title = {Learning changing concepts by exploiting the structure of
change},
booktitle = {Proceedings of the Ninth Annual Conference
on Computational Learning Theory},
pages = {131--139},
publisher = {ACM Press},
city = {New York},
year = {1996}
}
@inproceedings{stbwa-fsrm-96,
author = {J. Shawe-Taylor and P. L. Bartlett and R. C. Williamson
and M. Anthony},
title = {A framework for structural risk minimization},
booktitle = {Proceedings of the Ninth Annual Conference
on Computational Learning Theory},
pages = {68--76},
publisher = {ACM Press},
city = {New York},
year = {1996}
}
@inproceedings{lbw-iclsl-96,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {The importance of convexity in learning with squared loss},
booktitle = {Proceedings of the Ninth Annual Conference
on Computational Learning Theory},
pages = {140--146},
publisher = {ACM Press},
city = {New York},
year = {1996}
}
@inproceedings{kbb-atiad-96,
author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett},
title = {Adaptive tracking identification: the art of defalsification},
booktitle = {Proceedings of the 1996 IFAC World Congress},
year = {1996}
}
@article{bkp-cnrvfc-97,
author = {P. L. Bartlett and S. R. Kulkarni and S. E. Posner},
title = {Covering numbers for real-valued function classes},
journal = {IEEE Transactions on Information Theory},
volume = {43},
number = {5},
pages = {1721--1724},
year = {1997}
}
@article{b-brnnpr-97,
author = {P. L. Bartlett},
title = {Book Review: `{N}eural networks for pattern recognition,'
{C}hristopher {M}.~{B}ishop},
journal = {Statistics in Medicine},
volume = {16},
number = {20},
pages = {2385--2386},
year = {1997}
}
@article{lbw-clbvspfc-97,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {Correction to `Lower bounds on the {VC}-dimension of smoothly
parametrized function classes'},
journal = {Neural Computation},
volume = {9},
pages = {765--769},
year = {1997}
}
@inproceedings{b-fvgswmisn-97,
author = {P. L. Bartlett},
title = {For valid generalization, the size of the weights is
more important than the size of the network},
booktitle = {Advances in Neural Information Processing Systems 9},
pages = {134--140},
year = {1997}
}
@inproceedings{sfbl-bmneevm-97,
author = {R. E. Schapire and Y. Freund and P. L. Bartlett
and W. S. Lee},
title = {Boosting the margin: A new explanation for the effectiveness
of voting methods},
booktitle = {Machine Learning: Proceedings of the Fourteenth
International Conference},
pages = {322--330},
year = {1997}
}
@inproceedings{bll-mdreqd-97,
author = {P. L. Bartlett and T. Linder and G. Lugosi},
title = {The minimax distortion redundancy in empirical quantizer design
(abstract)},
booktitle = {Proceedings of the 1997 IEEE International Symposium
on Information Theory},
pages = {511},
year = {1997}
}
@inproceedings{bll-mlbeqd-97,
author = {P. L. Bartlett and T. Linder and G. Lugosi},
title = {A minimax lower bound for empirical quantizer design},
booktitle = {Proceedings of the Third European Conference on Computational
Learning Theory (EuroCOLT'97)},
pages = {220--222},
editor = {S. Ben-David},
publisher = {Springer},
year = {1997}
}
@inproceedings{bb-rrcncnsann-97,
author = {J. Baxter and P. L. Bartlett},
title = {A result relating convex $n$-widths to covering numbers with
some applications to neural networks},
booktitle = {Proceedings of the Third European Conference on Computational
Learning Theory (EuroCOLT'97)},
pages = {251--259},
editor = {S. Ben-David},
publisher = {Springer},
year = {1997}
}
@inproceedings{b-nnl-97,
author = {P. L. Bartlett},
title = {Neural network learning. (Abstract of invited talk.)},
booktitle = {CONTROL 97 Conference Proceedings, Institution of
Engineers Australia},
pages = {543},
year = {1997}
}
@inproceedings{lb-gswes-97,
author = {G. Loy and P. L. Bartlett},
title = {Generalization and the size of the weights: an
experimental study},
booktitle = {Proceedings of the Eighth Australian Conference
on Neural Networks},
pages = {60--64},
year = {1997}
}
@book{bm-peacclt-98,
editor = {Peter L. Bartlett and Yishay Mansour},
title = {Proceedings of the Eleventh Annual Conference on Computational
Learning Theory},
publisher = {ACM Press},
city = {New York},
year = {1998}
}
@article{sfbl-bmneevm-98,
author = {R. E. Schapire and Y. Freund and P. L. Bartlett
and W. S. Lee},
title = {Boosting the margin: a new explanation for the effectiveness of
voting methods},
journal = {Annals of Statistics},
volume = {26},
number = {5},
pages = {1651--1686},
year = {1998}
}
@article{bmm-alvdbppn-98,
author = {P. L. Bartlett and V. Maiorov and R. Meir},
title = {Almost linear {VC} dimension bounds for piecewise polynomial
networks},
journal = {Neural Computation},
volume = {10},
number = {8},
pages = {2159--2173},
year = {1998}
}
@article{lbw-iclsl-98,
author = {W. S. Lee and P. L. Bartlett and R. C. Williamson},
title = {The importance of convexity in learning with squared loss},
journal = {IEEE Transactions on Information Theory},
volume = {44},
number = {5},
pages = {1974--1980},
year = {1998}
}
@article{stbwa-srmddh-98,
author = {J. Shawe-Taylor and P. L. Bartlett and R. C. Williamson
and M. Anthony},
title = {Structural Risk Minimization over Data-Dependent
Hierarchies},
journal = {IEEE Transactions on Information Theory},
volume = {44},
number = {5},
pages = {1926--1940},
year = {1998}
}
@article{bll-mdreqd-98,
author = {P. L. Bartlett and T. Linder and G. Lugosi},
title = {The minimax distortion redundancy in empirical quantizer
design},
journal = {IEEE Transactions on Information Theory},
volume = {44},
number = {5},
pages = {1802--1813},
year = {1998}
}
@article{bk-cmcsnd-98,
author = {P. L. Bartlett and S. Kulkarni},
title = {The complexity of model classes, and smoothing noisy data},
journal = {Systems and Control Letters},
volume = {34},
number = {3},
pages = {133--140},
year = {1998}
}
@article{bv-isilt-98,
author = {P. L. Bartlett and M. Vidyasagar},
title = {Introduction to the special issue on learning theory},
journal = {Systems and Control Letters},
volume = {34},
pages = {113--114},
year = {1998}
}
@article{kbb-ocpcle-98,
author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett},
title = {Optimal controller properties from closed-loop experiments},
journal = {Automatica},
volume = {34},
number = {1},
pages = {83--91},
year = {1998}
}
@article{bl-plussd-98,
author = {P. L. Bartlett and P. M. Long},
title = {Prediction, learning, uniform convergence, and scale-sensitive
dimensions},
journal = {Journal of Computer and System Sciences},
volume = {56},
number = {2},
pages = {174--190},
year = {1998},
note = {(special issue on COLT`95)}
}
@article{b-scpcnn-98,
author = {P. L. Bartlett},
title = {The sample complexity of pattern classification
with neural networks: the size of the weights is more
important than the size of the network},
journal = {IEEE Transactions on Information Theory},
volume = {44},
number = {2},
pages = {525--536},
year = {1998}
}
@inproceedings{bb-cdmfs1nnc-98,
author = {J. Baxter and P. L. Bartlett},
title = {The canonical distortion measure in feature space and $1$-{NN}
classification},
booktitle = {Advances in Neural Information Processing Systems 10},
pages = {245--251},
year = {1998}
}
@inproceedings{gbl-gdtd-98,
author = {M. Golea and P. L. Bartlett and W. S. Lee},
title = {Generalization in decision trees and {DNF}: Does size matter?},
booktitle = {Advances in Neural Information Processing Systems 10},
pages = {259--265},
year = {1998}
}
@inproceedings{kbb-ditsa-98,
author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett},
title = {Direct iterative tuning via spectral analysis},
booktitle = {Proceedings of the IEEE Conference on Decision and
Control},
volume = {3},
pages = {2874--2879},
year = {1998}
}
@inproceedings{sbsw-svraac-98,
author = {B. Sch{\"o}lkopf and P. L. Bartlett and A. Smola
and R. Williamson},
title = {Support vector regression with automatic accuracy control},
editor = { L. Niklasson and M. Boden and T. Ziemke},
booktitle = {Perspectives in Neural Computing: Proceedings of the
8th International Conference on Artificial Neural Networks
(ICANN'98)},
pages = {111--116},
publisher = {Springer-Verlag},
city = {Berlin},
year = {1998}
}
@inproceedings{mbg-gtncdtcmp-98,
author = {L. Mason and P. L. Bartlett and M. Golea},
title = {Generalization in threshold networks, combined decision
trees and combined mask perceptrons},
editor = {T. Downs and M. Frean and M. Gallagher},
booktitle = {Proceedings of the Ninth Australian Conference
on Neural Networks (ACNN'98)},
publisher = {University of Queensland},
pages = {84--88},
year = {1998}
}
@book{ab-nnltf-99,
author = {Martin Anthony and Peter L. Bartlett},
title = {Neural Network Learning: Theoretical Foundations},
publisher = {Cambridge University Press},
year = {1999},
url = {http://www.stat.berkeley.edu/~bartlett/nnl/index.html},
isbn = {0 521 57353 X}
}
@article{bl-iudsam-99,
author = {P. L. Bartlett and G. Lugosi},
title = {An inequality for uniform deviations of sample averages
from their means},
journal = {Statistics and Probability Letters},
volume = {44},
number = {1},
pages = {55--62},
year = {1999}
}
@incollection{b-ennl-99,
author = {P. L. Bartlett},
title = {Efficient neural network learning},
booktitle = {Open Problems in Mathematical Systems Theory and
Control},
editor = {V. D. Blondel and E. D. Sontag and M. Vidyasagar
and J. C. Willems},
pages = {35--38},
publisher = {Springer Verlag},
city = {London},
year = {1999}
}
@incollection{bst-gpsvmopc-99,
author = {P. L. Bartlett and J. Shawe-Taylor},
title = {Generalization performance of support vector machines and other
pattern classifiers},
booktitle = {Advances in Kernel Methods -- Support Vector Learning},
editor = {B. Sch{\"o}lkopf and C. J. C. Burges and A. J. Smola},
pages = {43--54},
publisher = {MIT Press},
city = {Cambridge, MA},
year = {1999}
}
@inproceedings{bmm-alvdbppn-99,
author = {P. L. Bartlett and V. Maiorov and R. Meir},
title = {Almost linear {VC} dimension bounds for piecewise polynomial
networks},
booktitle = {Advances in Neural Information Processing Systems 11},
pages = {190--196},
year = {1999}
}
@inproceedings{mbb-domigcc-99,
author = {L. Mason and P. L. Bartlett and J. Baxter},
title = {Direct optimization of margins improves generalization in
combined classifiers},
booktitle = {Advances in Neural Information Processing Systems 11},
pages = {288--294},
year = {1999}
}
@inproceedings{sbsw-stnsvra-99,
author = {B. Sch{\"o}lkopf and P. L. Bartlett and A. Smola
and R. Williamson},
title = {Shrinking the tube: a new support vector regression
algorithm},
booktitle = {Advances in Neural Information Processing Systems 11},
pages = {330--336},
year = {1999}
}
@inproceedings{kbz-sfossgmbtmrpe-99,
author = {T. Koshizen and P. L. Bartlett and A. Zelinsky},
title = {Sensor fusion of odometry and sonar sensors by the {G}aussian
mixture {B}ayes' technique in mobile robot position estimation},
booktitle = {Proceedings of the 1999 IEEE International Conference on
Systems, Man and Cybernetics},
volume = {4},
pages = {742--747},
year = {1999}
}
@inproceedings{gbstw-cnsvm-99,
author = {Y. Guo and P. L. Bartlett and J. Shawe-Taylor
and R. C. Williamson},
title = {Covering Numbers for Support Vector Machines},
booktitle = {Proceedings of the Twelfth Annual Conference
on Computational Learning Theory},
pages = {267--277},
year = {1999}
}
@inproceedings{bbd-hrnnap-99,
author = {P. L. Bartlett and S. Ben-David},
title = {Hardness results for neural network approximation problems},
booktitle = {Proceedings of the Fourth European Conference
on Computational Learning Theory},
pages = {50--62},
year = {1999}
}
@inproceedings{mbb-ebvcmcf-99,
author = {L. Mason and P. L. Bartlett and J. Baxter},
title = {Error bounds for voting classifiers using margin cost
functions (Invited abstract)},
booktitle = {Proceedings of the IEEE Information Theory Workshop
on Detection, Estimation, Classification and Imaging},
pages = {36},
year = {1999}
}
@inproceedings{bb-vmds-99,
author = {P. L. Bartlett and J. Baxter},
title = {Voting methods for data segmentation},
booktitle = {Proceedings of the Advanced Investment Technology
Conference},
pages = {35--40},
publisher = {Bond University},
year = {1999}
}
@book{sbss-almc-00,
editor = {Alexander J. Smola and Peter L. Bartlett and
Bernard Sch{\"o}lkopf and Dale Schuurmans},
title = {Advances in Large Margin Classifiers},
publisher = {MIT Press},
city = {Cambridge},
country = {USA},
year = {2000},
isbn = {ISBN 0262 19448 1}
}
@article{ab-fli-00,
author = {M. Anthony and P. L. Bartlett},
title = {Function learning from interpolation},
journal = {Combinatorics, Probability, and Computing},
volume = {9},
pages = {213--225},
year = {2000}
}
@incollection{mbbf-fgtch-00,
author = {L. Mason and J. Baxter and P. L. Bartlett and M. Frean},
title = {Functional Gradient Techniques for Combining Hypotheses},
booktitle = {Advances in Large Margin Classifiers},
editor = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf and
D. Schuurmans},
pages = {221--246},
publisher = {MIT Press},
city = {Cambridge, MA},
year = {2000}
}
@incollection{sbss-ilmc-00,
author = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf
and D. Schuurmans},
title = {Introduction to Large Margin Classifiers},
booktitle = {Advances in Large Margin Classifiers},
editors = {A. J. Smola and P. L. Bartlett and B. Sch{\"o}lkopf
and D. Schuurmans},
pages = {1--29},
publisher = {MIT Press},
city = {Cambridge, MA},
year = {2000}
}
@article{bbdk-lccesc-00,
author = {P. L. Bartlett and S. Ben-David and S. R. Kulkarni},
title = {Learning changing concepts by exploiting the structure of
change},
journal = {Machine Learning},
volume = {41},
number = {2},
pages = {153--174},
year = {2000}
}
@article{ppb-pace-00,
author = {S. Parameswaran and M. F. Parkinson and P. L. Bartlett},
title = {Profiling in the {ASP} codesign environment},
journal = {Journal of Systems Architecture},
volume = {46},
number = {14},
pages = {1263--1274},
year = {2000}
}
@article{sswb-nsva-00,
author = {B. Sch{\"o}lkopf and A. Smola and R. C. Williamson
and P. L. Bartlett},
title = {New support vector algorithms},
journal = {Neural Computation},
volume = {12},
number = {5},
pages = {1207--1245},
year = {2000}
}
@article{kbb-ditsa-00,
author = {L. C. Kammer and R. R. Bitmead and P. L. Bartlett},
title = {Direct iterative tuning via spectral analysis},
journal = {Automatica},
volume = {36},
number = {9},
pages = {1301--1307},
year = {2000}
}
@article{mbb-igeom-00,
author = {L. Mason and P. L. Bartlett and J. Baxter},
title = {Improved generalization through explicit optimization of
margins},
journal = {Machine Learning},
volume = {38},
number = {3},
pages = {243--255},
year = {2000}
}
@inproceedings{bb-socpomdp-00,
author = {P. L. Bartlett and J. Baxter},
title = {Stochastic optimization of controlled partially
observable {M}arkov decision processes},
booktitle = {Proceedings of the IEEE Conference on Decision and
Control},
volume = {1},
pages = {124--129},
year = {2000}
}
@inproceedings{bb-dgbrl-00,
author = {J. Baxter and P. L. Bartlett},
title = {Direct gradient-based reinforcement learning (Invited)},
booktitle = {Proceedings of the International Symposium on Circuits and
Systems},
pages = {III-271--274},
year = {2000}
}
@inproceedings{mbbf-bagd-00,
author = {L. Mason and J. Baxter and P. L. Bartlett and M. Frean},
title = {Boosting algorithms as gradient descent},
booktitle = {Advances in Neural Information Processing Systems 12},
pages = {512--518},
year = {2000}
}
@inproceedings{bb-gaoaepgpa-00,
author = {J. Baxter and P. L. Bartlett},
title = {{GPOMDP}: An On-line Algorithm for Estimating Performance
Gradients in {POMDP}'s, with Applications},
booktitle = {Proceedings of the 2000 International
Conference on Machine Learning},
pages = {41--48},
year = {2000}
}
@inproceedings{bbl-msee-00,
author = {P. L. Bartlett and S. Boucheron and G. Lugosi},
title = {Model selection and error estimation},
booktitle = {Proceedings of the Thirteenth Annual Conference
on Computational Learning Theory},
pages = {286--297},
year = {2000}
}
@inproceedings{bb-eabgbrl-00,
author = {P. L. Bartlett and J. Baxter},
title = {Estimation and approximation bounds for gradient-based
reinforcement learning},
booktitle = {Proceedings of the Thirteenth Annual Conference on
Computational Learning Theory},
pages = {133--141},
year = {2000}
}
@article{bb-ihgbps-01,
author = {J. Baxter and P. L. Bartlett},
title = {Infinite-Horizon Gradient-Based Policy Search},
journal = {Journal of Artificial Intelligence Research},
volume = {15},
pages = {319--350},
year = {2001},
url = {http://www.cs.washington.edu/research/jair/abstracts/baxter01a.html}
}
@article{bbw-ihgbps2gaae-01,
author = {J. Baxter and P. L. Bartlett and L. Weaver},
title = {Experiments with Infinite-Horizon, Policy-Gradient
Estimation},
journal = {Journal of Artificial Intelligence Research},
volume = {15},
pages = {351--381},
year = {2001},
note = {http://www.cs.washington.edu/research/jair/abstracts/baxter01b.html},
url = {http://www.cs.washington.edu/research/jair/abstracts/baxter01b.html}
}
@inproceedings{bbbcefgswgedadvcwssw-asvmcpsmsdpcss-01,
author = {A. Ben-Hur and T. Barnes and P. L. Bartlett
and O. Chapelle and A. Elisseeff and H. Fristche
and I. Guyon and B. Sch{\"o}lkopf and J. Weston and E. Fung
and C. Enderwick and E. A. Dalmasso and B.-L. Adam
and J. W. Davis and A. Vlahou and L. Cazares and M. Ward
and P. F. Schellhammer and J. Semmes and G. L. Wright},
title = {Application of support vector machines to the classification of
ProteinChip System mass spectral data of prostate cancer serum
samples (abstract)},
booktitle = {Second Annual National Cancer Institute Early
Detection Research Network Scientific Workshop},
year = {2001}
}
@inproceedings{bm-rgcrbsr-01,
author = {P. L. Bartlett and S. Mendelson},
title = {Rademacher and {G}aussian complexities: Risk bounds and structural
results},
booktitle = {Proceedings of the Fourteenth Annual Conference on Computational
Learning Theory and Fifth European Conference on Computational
Learning Theory},
pages = {224--240},
year = {2001}
}
@inproceedings{sb-sggpr-01,
author = {A. J. Smola and P. L. Bartlett},
title = {Sparse greedy {G}aussian process regression},
booktitle = {Advances in Neural Information Processing Systems 13},
pages = {619--625},
year = {2001}
}
@article{gbstw-cnsvm-01,
author = {Y. Guo and P. L. Bartlett and J. Shawe-Taylor and R. C.
Williamson},
title = {Covering numbers for support vector machines},
journal = {IEEE Transactions on Information Theory},
volume = {48},
number = {1},
pages = {239--250},
year = {2002}
}
@unpublished{bbm-lrc-02b,
title = {Local {R}ademacher complexities},
institution = {U.C.\ Berkeley},
author = {Peter L.~Bartlett and Olivier Bousquet and Shahar Mendelson},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bbm-lrc-02b.ps.Z},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bbm-lrc-02b.pdf},
note = {Annals of Statistics. To appear.
http://www.stat.berkeley.edu/$\sim$bartlett/papers/bbm-lrc-02b.pdf)},
year = {2004},
abstract = {We propose new bounds on the error of
learning algorithms in terms of a data-dependent notion
of complexity. The estimates we establish give optimal
rates and are based on a local and empirical version of
Rademacher averages, in the sense that the Rademacher
averages are computed from the data, on a subset of
functions with small empirical error. We present some
applications to prediction with bounded loss, and to
regression with a convex loss function and a convex
function class.}
}
@unpublished{bg-ldpcc-02,
title = {Faster convergence rates for error probabilities
of message passing decoders for low-density parity-check codes},
author = {Peter L.~Bartlett and Evan Greensmith},
year = {2002},
note = {(unpublished)},
institution = {U.C.\ Berkeley},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bg-ldpcc-02.ps.Z},
abstract = {We consider the convergence of the error probability
of low-density parity-check codes under message passing
decoding, as a function of the length of the code. Suppose
that the number of decoding iterations is fixed. We show
that, for almost all bipartite graphs defining a LDPC code
and almost all received messages, the error probability
decreases to a constant times its asymptotic value at the
rate $\log n/n$, where $n$ is the length of the code.}
}
@article{bm-rgcrbsr-02,
author = {P. L. Bartlett and S. Mendelson},
title = {Rademacher and {G}aussian complexities: Risk bounds and structural
results},
journal = {Journal of Machine Learning Research},
volume = {3},
pages = {463--482},
note = {http://www.jmlr.org/papers/volume3/bartlett02a/bartlett02a.pdf},
pdf = {http://www.jmlr.org/papers/volume3/bartlett02a/bartlett02a.pdf},
year = {2002}
}
@article{bbl-msee-02,
author = {P. L. Bartlett and S. Boucheron and G. Lugosi},
title = {Model selection and error estimation},
journal = {Machine Learning},
note = {http://www.stat.berkeley.edu/$\sim$bartlett/papers/bbl-msee.ps.gz},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bbl-msee.ps.gz},
volume = {48},
pages = {85--113},
year = {2002}
}
@article{bb-eabgbrl-02,
author = {P. L. Bartlett and J. Baxter},
title = {Estimation and approximation bounds for gradient-based
reinforcement learning},
journal = {Journal of Computer and System Sciences},
volume = {64},
number = {1},
pages = {133--150},
year = {2002}
}
@article{bbd-hrnnap-02,
author = {P. L. Bartlett and S. Ben-David},
title = {Hardness results for neural network approximation problems},
journal = {Theoretical Computer Science},
note = {(special issue on Eurocolt'99)},
year = {2002},
volume = {284},
number = {1},
pages = {53--66},
url = {http://dx.doi.org/10.1016/S0304-3975(01)00057-3}
}
@article{bfh-erwl-02,
author = {P. L. Bartlett and P. Fischer and K.-U. H{\"o}ffgen},
title = {Exploiting random walks for learning},
journal = {Information and Computation},
year = {2002},
volume = {176},
number = {2},
pages = {121--135},
note = {http://dx.doi.org/10.1006/inco.2002.3083},
url = {http://dx.doi.org/10.1006/inco.2002.3083}
}
@article{mbg-gecc-02,
author = {L. Mason and P. L. Bartlett and M. Golea},
title = {Generalization error of combined classifiers},
journal = {Journal of Computer and System Sciences},
year = {2002},
volume = {65},
number = {2},
pages = {415--438},
note = {http://dx.doi.org/10.1006/jcss.2002.1854},
url = {http://dx.doi.org/10.1006/jcss.2002.1854}
}
@inproceedings{bbm-lrc-02,
author = {P. L. Bartlett and O. Bousquet and S. Mendelson},
title = {Localized {R}ademacher complexity},
booktitle = {Proceedings of the Conference on Computational Learning
Theory},
pages = {44-58},
year = {2002}
}
@inproceedings{lcbegj-lkmsdp-02,
author = {G. Lanckriet and N. Cristianini and P. L. Bartlett and L. El
Ghaoui and M. Jordan},
title = {Learning the kernel matrix with semi-definite programming},
booktitle = {Proceedings of the International Conference on Machine
Learning},
pages = {323--330},
year = {2002}
}
@inproceedings{gbb-vrtgerl-02,
author = {E. Greensmith and P. L. Bartlett and J. Baxter},
title = {Variance reduction techniques for gradient estimates in
reinforcement learning},
booktitle = {Advances in Neural Information Processing Systems 14},
note = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/CN02.ps.gz},
year = {2002},
ps = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/CN02.ps.gz}
}
@article{bjm-d-03,
title = {Discussion of boosting papers},
author = {Peter L.\ Bartlett and Michael I.\ Jordan
and Jon D.\ McAuliffe},
journal = {The Annals of Statistics},
year = {2004},
volume = {32},
number = {1},
pages = {85--91},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-d-03.ps.Z},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-d-03.pdf}
}
@inproceedings{b-paccc-03,
author = {Peter L.\ Bartlett},
title = {Prediction algorithms: complexity, concentration and
convexity},
booktitle = {Proceedings of the 13th IFAC Symposium on System
Identification},
year = {2003},
pages = {1507--1517},
note = {http://www.stat.berkeley.edu/$\sim$bartlett/papers/b-paccc-03.ps.Z},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/b-paccc-03.ps.Z},
abstract = {In this paper, we review two families of algorithms used
to estimate large-scale statistical models for prediction
problems, kernel methods and boosting algorithms. We
focus on the computational and statistical properties of
prediction algorithms of this kind. Convexity plays an
important role for these algorithms, since they exploit
the computational advantages of convex optimization
procedures. However, in addition to its computational
advantages, the use of convexity in these methods
also confers some attractive statistical properties.
We present some recent results that show the advantages
of convexity for estimation rates, the rates at which
the prediction accuracies approach their optimal values.
In addition, we present results that quantify the cost
of using a convex loss function in place of the real
loss function of interest.}
}
@inproceedings{bjm-lmcclln-03,
author = {Peter L.\ Bartlett and Michael I.\ Jordan and
Jon D.\ McAuliffe},
title = {Large margin classifiers: convex loss, low noise,
and convergence rates},
booktitle = {Advances in Neural Information Processing Systems, 16},
year = {2004},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-lmcclln-03.ps.gz},
abstract = {
Many classification algorithms, including the support vector
machine, boosting and logistic regression, can be viewed as minimum
contrast methods that minimize a convex surrogate of the 0-1 loss
function. We characterize the statistical consequences of using
such a surrogate by providing a general quantitative relationship between
the risk as assessed using the 0-1 loss and the risk as assessed
using any nonnegative surrogate loss function. We show that this
relationship gives nontrivial bounds under the weakest possible
condition on the loss function---that it satisfy a pointwise form
of Fisher consistency for classification. The relationship is
based on a variational transformation of the loss
function that is easy to compute in many applications.
We also present a refined version of this result in the
case of low noise. Finally, we present applications of
our results to the estimation of convergence rates in the
general setting of function classes that are scaled
hulls of a finite-dimensional base class.}
}
@article{bjm-ccrb-05,
title = {Convexity, classification, and risk bounds},
author = {Peter L.\ Bartlett and
Michael I.\ Jordan and Jon D.\ McAuliffe},
note = {(Was Department of Statistics, U.C.\ Berkeley Technical Report
number 638, 2003)},
year = {2006},
journal = {Journal of the American Statistical Association},
volume = {101},
number = {473},
pages = {138-156},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bjm-ccrb-05.pdf},
abstract = {Many of the classification algorithms developed in the
machine learning literature, including the support
vector machine and boosting, can be viewed as minimum
contrast methods that minimize a convex surrogate of the
0-1 loss function. The convexity makes these algorithms
computationally efficient. The use of a surrogate, however,
has statistical consequences that must be balanced against
the computational virtues of convexity. To study these
issues, we provide a general quantitative relationship between
the risk as assessed using the 0-1 loss and the risk as assessed
using any nonnegative surrogate loss function. We show that this
relationship gives nontrivial upper bounds on excess risk
under the weakest possible condition on the loss
function: that it satisfy a pointwise form of Fisher
consistency for classification. The relationship is
based on a simple variational transformation of the loss
function that is easy to compute in many applications.
We also present a refined version of this result in the
case of low noise. Finally, we present applications of
our results to the estimation of convergence rates in the
general setting of function classes that are scaled
convex hulls of a finite-dimensional base class,
with a variety of commonly used loss functions.}
}
@techreport{bjm-ccrb-03,
title = {Convexity, classification, and risk bounds},
author = {Peter L.\ Bartlett and
Michael I.\ Jordan and Jon D.\ McAuliffe},
institution = {Department of Statistics, U.C.\ Berkeley},
number = {638},
year = {2003},
pdf = {http://www.stat.berkeley.edu/tech-reports/638.pdf},
ps = {http://www.stat.berkeley.edu/tech-reports/638.ps.Z},
abstract = {Many of the classification algorithms developed in the
machine learning literature, including the support
vector machine and boosting, can be viewed as minimum
contrast methods that minimize a convex surrogate of the
0-1 loss function. The convexity makes these algorithms
computationally efficient. The use of a surrogate, however,
has statistical consequences that must be balanced against
the computational virtues of convexity. To study these
issues, we provide a general quantitative relationship between
the risk as assessed using the 0-1 loss and the risk as assessed
using any nonnegative surrogate loss function. We show that this
relationship gives nontrivial upper bounds on excess risk
under the weakest possible condition on the loss
function: that it satisfy a pointwise form of Fisher
consistency for classification. The relationship is
based on a simple variational transformation of the loss
function that is easy to compute in many applications.
We also present a refined version of this result in the
case of low noise. Finally, we present applications of
our results to the estimation of convergence rates in the
general setting of function classes that are scaled
convex hulls of a finite-dimensional base class,
with a variety of commonly used loss functions.}
}
@incollection{bm-vcdnn-03,
author = {Peter L. Bartlett and Wolfgang Maass},
title = {{V}apnik-{C}hervonenkis dimension of neural nets},
editor = {Michael A.~Arbib},
booktitle = {The Handbook of Brain Theory and Neural
Networks},
publisher = {MIT Press},
city = {Cambridge, MA},
note = {Second Edition},
pages = {1188--1192},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-vcdnn-03.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-vcdnn-03.pdf},
year = {2003}
}
@incollection{b-irltvfm-03,
author = {Peter L. Bartlett},
title = {An introduction to reinforcement learning theory: value
function methods},
booktitle = {Advanced Lectures on Machine Learning},
seriestitle = {Lecture Notes in Computer Science},
volume = {2600},
editor = {Shahar Mendelson and Alexander J. Smola},
isbn = {3-540-00529-3},
pages = {184--202},
publisher = {Springer},
html = {http://link.springer.de/link/service/series/0558/bibs/2600/26000184.htm},
year = {2003},
note = {http://link.springer.de/link/service/series/0558/bibs/2600/26000184.htm}
}
@unpublished{bm-em-03,
author = {Peter L.~Bartlett and Shahar Mendelson},
title = {Empirical minimization},
note = {(submitted.
http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf)},
year = {2003},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-03.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-03.pdf},
abstract = {We investigate the behavior of the empirical
minimization algorithm using various methods. We first analyze it
by comparing the empirical, random, structure and the original one
on the class, either in an additive sense, via the uniform law of
large numbers, or in a multiplicative sense, using isomorphic
coordinate projections. We then show that a direct analysis of the
empirical minimization algorithm yields a significantly better
bound, and that the estimates we obtain are essentially sharp. The
method of proof we use is based on Talagrand's concentration
inequality for empirical processes.}
}
@article{lcbegj-lkmsdp-04,
author = {G. Lanckriet and N. Cristianini and P. L. Bartlett and L. El
Ghaoui and M. Jordan},
title = {Learning the kernel matrix with semi-definite programming},
journal = {Journal of Machine Learning Research},
pdf = {http://www.jmlr.org/papers/volume5/lanckriet04a/lanckriet04a.pdf},
ps = {http://www.jmlr.org/papers/volume5/lanckriet04a/lanckriet04a.ps.gz},
year = {2004},
volume = {5},
pages = {27--72},
note = {(http://www.jmlr.org/papers/volume5/lanckriet04a/lanckriet04a.pdf)}
}
@article{gbb-vrtgerl-04,
author = {E. Greensmith and P. L. Bartlett and J. Baxter},
title = {Variance reduction techniques for gradient estimates in
reinforcement learning},
journal = {Journal of Machine Learning Research},
volume = {5},
year = {2004},
pages = {1471--1530},
url = {http://jmlr.csail.mit.edu/papers/volume5/greensmith04a/greensmith04a.pdf}
}
@inproceedings{bcmt-lmmsc-04,
author = {Peter L. {Bartlett} and Michael {Collins} and Ben
{Taskar} and David {McAllester}},
title = {Exponentiated Gradient Algorithms for Large-margin
Structured Classification},
booktitle = {Advances in Neural Information Processing
Systems 17},
editor = {Lawrence K. Saul and Yair Weiss and {L\'{e}on}
Bottou},
publisher = {MIT Press},
address = {Cambridge, MA},
pages = {113-120},
year = {2005},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bcmt-lmmsc-04.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bcmt-lmmsc-04.pdf},
abstract = {We consider the problem of {\em structured classification}, where the
task is to predict a label $y$ from an input $x$, and $y$
has meaningful internal structure. Our framework includes supervised
training of both Markov random fields and weighted context-free
grammars as special cases. We describe an
algorithm that solves the large-margin optimization problem defined by
Taskar et al, using an exponential-family (Gibbs distribution)
representation of structured objects. The algorithm is efficient --
even in cases where the number of labels $y$ is exponential in size --
provided that certain expectations under Gibbs distributions can
becalculated efficiently. The optimization method we use for
structured
labels relies on a more general result, specifically the application
of exponentiated gradient (EG) updates to quadratic programs (QPs).
We describe a new method for solving QPs based on these techniques,
and give bounds on its rate of convergence.
In addition to their application to the
structured-labels task, the EG updates lead to simple
algorithms for optimizing ``conventional'' binary or multiclass
SVM problems. Finally, we give a new generalization bound for structured
classification, using PAC-Bayesian methods for the analysis of large
margin classifiers.}
}
@inproceedings{bt-secp-04,
author = {Peter L.~Bartlett and Ambuj Tewari},
title = {Sparseness vs estimating conditional probabilities: Some
asymptotic results},
booktitle = {Proceedings of the 17th Annual Conference on Learning
Theory},
volume = {3120},
seriestitle = {Lecture Notes in Computer Science},
pages = {564--578},
publisher = {Springer},
year = {2004},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bt-secp-04.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-secp-04.pdf},
abstract = {One of the nice properties of kernel
classifiers such as SVMs is that they often produce
sparse solutions. However, the decision functions of
these classifiers cannot always be used to estimate the
conditional probability of the class label. We investigate
the relationship between these two properties and show that
these are intimately related: sparseness does not occur
when the conditional probabilities can be unambiguously
estimated. We consider a family of convex loss functions
and derive sharp asymptotic bounds for the number of
support vectors. This enables us to characterize the exact
trade-off between sparseness and the ability to estimate
conditional probabilities for these loss functions.}
}
@inproceedings{tb-ocmcm-05,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {On the consistency of multiclass classification
methods},
booktitle = {Proceedings of the 18th Annual Conference on Learning
Theory},
volume = {3559},
seriestitle = {Lecture Notes in Computer Science},
pages = {143--157},
publisher = {Springer},
year = {2005},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/tb-ocmcm-05.pdf},
notetoomit = {Winner of the Student Paper Award}
}
@inproceedings{bmp-lcerm-04,
author = {Peter L. Bartlett and Shahar Mendelson and Petra
Philips},
title = {Local Complexities for Empirical Risk Minimization},
booktitle = {Proceedings of the 17th Annual Conference on Computational
Learning Theory (COLT2004)},
volume = {3120},
seriestitle = {Lecture Notes in Computer Science},
pages = {270--284},
publisher = {Springer},
year = {2004},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-lcerm-04.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-lcerm-04.pdf},
abstract = {We present sharp bounds on the risk of the empirical
minimization algorithm under mild assumptions on the class. We introduce
the notion of isomorphic coordinate projections and show that this
leads to a sharper error bound than the best previously known. The
quantity which governs this bound on the empirical minimizer is
the fixed point of the function $\xxi(r) = \E \sup \left\{|\E f -
\E_n f|: f\in F,\, \E f = r \right\}$. We prove that this is the
best estimate one can obtain using `structural results', and that
it is possible to estimate the error rate from data. We then prove
that the bound on the empirical minimization algorithm can be
improved further by a direct analysis, and that the correct error
rate is the maximizer of $\oxi(r) -r$, where
$\oxi(r) = \E \sup \left\{\E f - \E_n f: f\in F,\, \E f = r
\right\}$.}
}
@inproceedings{jrsb-mletldp-04,
author = {Rafael Jim\'enez-Rodriguez and Nicholas Sitar
and Peter L.~Bartlett},
title = {Maximum likelihood estimation of trace length distribution
parameters using the {EM} algorithm},
booktitle = {Prediction, Analysis and Design in Geomechanical
Applications: Proceedings of the Eleventh International
Conference on Computer Methods and Advances in Geomechanics
(IACMAG-2005)},
editor = {G. Barla and M. Barla},
volume = {1},
pages = {619--626},
address = {Bologna},
publisher = {P\`atron Editore},
optnote = {Torino, Italy, 19-24 June 2005},
year = 2005
}
@article{bbm-lrc-05,
title = {Local {R}ademacher complexities},
author = {Peter L.~Bartlett and Olivier Bousquet and Shahar Mendelson},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bbm-lrc-05.ps},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bbm-lrc-05.pdf},
journal = {Annals of Statistics},
volume = {33},
number = {4},
pages = {1497--1537},
year = {2005},
abstract = {We propose new bounds on the error of
learning algorithms in terms of a data-dependent notion
of complexity. The estimates we establish give optimal
rates and are based on a local and empirical version of
Rademacher averages, in the sense that the Rademacher
averages are computed from the data, on a subset of
functions with small empirical error. We present some
applications to prediction with bounded loss, and to
regression with a convex loss function and a convex
function class.}
}
@article{bm-em-05,
author = {Peter L.~Bartlett and Shahar Mendelson},
title = {Empirical minimization},
journal = {Probability Theory and Related Fields},
year = {2006},
volume = 135,
number = 3,
pages = {311--334},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bm-em-05.pdf},
abstract = {We investigate the behavior of the empirical
minimization algorithm using various methods. We first analyze it
by comparing the empirical, random, structure and the original one
on the class, either in an additive sense, via the uniform law of
large numbers, or in a multiplicative sense, using isomorphic
coordinate projections. We then show that a direct analysis of the
empirical minimization algorithm yields a significantly better
bound, and that the estimates we obtain are essentially sharp. The
method of proof we use is based on Talagrand's concentration
inequality for empirical processes.}
}
@techreport{bmp-osbeeem-05,
author = {Peter L. Bartlett and Shahar Mendelson and Petra
Philips},
title = {Optimal sample-based estimates of the
expectation of the empirical minimizer},
year = {2007},
institution = {U.C.\ Berkeley},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.pdf},
abstract = {We study sample-based estimates of the expectation of the
function produced by the empirical minimization algorithm.
We investigate the extent to which one can estimate
the rate of convergence of the empirical minimizer in a
data dependent manner. We establish three main results.
First, we provide an algorithm that upper bounds the
expectation of the empirical minimizer in a completely
data-dependent manner. This bound is based on a structural
result in
http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf,
which relates expectations to
sample averages. Second, we show that these structural
upper bounds can be loose. In particular, we demonstrate a
class for which the expectation of the empirical minimizer
decreases as $O(1/n)$ for sample size $n$, although the
upper bound based on structural properties is $\Omega(1)$.
Third, we show that this looseness of the bound is inevitable:
we present an example that shows that a sharp bound cannot
be universally recovered from empirical data.}
}
@article{bmp-osbeeem-08,
author = {Peter L. Bartlett and Shahar Mendelson and Petra
Philips},
title = {Optimal sample-based estimates of the
expectation of the empirical minimizer},
journal = {ESAIM: Probability and Statistics},
year = {2008},
note = {(Accepted)},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bmp-osbeeem-05.pdf},
abstract = {We study sample-based estimates of the expectation of the
function produced by the empirical minimization algorithm.
We investigate the extent to which one can estimate
the rate of convergence of the empirical minimizer in a
data dependent manner. We establish three main results.
First, we provide an algorithm that upper bounds the
expectation of the empirical minimizer in a completely
data-dependent manner. This bound is based on a structural
result in
http://www.stat.berkeley.edu/$\sim$bartlett/papers/bm-em-03.pdf,
which relates expectations to
sample averages. Second, we show that these structural
upper bounds can be loose. In particular, we demonstrate a
class for which the expectation of the empirical minimizer
decreases as $O(1/n)$ for sample size $n$, although the
upper bound based on structural properties is $\Omega(1)$.
Third, we show that this looseness of the bound is inevitable:
we present an example that shows that a sharp bound cannot
be universally recovered from empirical data.}
}
@techreport{b-freeoims-06,
author = {Peter L. Bartlett},
title = {Fast rates for estimation error and oracle
inequalities for model selection},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-06.pdf},
institution = {Department of Statistics, U.C.\ Berkeley},
number = {729},
abstract = {We consider complexity penalization methods for model
selection. These methods aim to choose a model to
optimally trade off estimation and approximation errors
by minimizing the sum of an empirical risk term and a
complexity penalty. It is well known that if we use a
bound on the maximal deviation between empirical and
true risks as a complexity penalty,
then the risk of our choice is no more
than the approximation error plus twice the complexity
penalty. There are many cases, however, where complexity
penalties like this give loose upper bounds on the
estimation error. In particular, if we choose a function
from a suitably simple convex function class with a
strictly convex loss function, then the estimation error
(the difference between the risk of the empirical risk
minimizer and the minimal risk in the class) approaches zero at a
faster rate than the maximal deviation between empirical
and true risks. In this note, we address the question of
whether it is possible to design a complexity penalized
model selection method for these situations. We show that,
provided the sequence of models is ordered by inclusion, in
these cases we can use tight upper bounds on estimation
error as a complexity penalty. Surprisingly, this is the
case even in situations when the difference between the
empirical risk and true risk (and indeed the error of
any estimate of the approximation error) decreases
much more slowly than the
complexity penalty. We give an oracle inequality
showing that the resulting model selection method chooses
a function with risk no more than the approximation error
plus a constant times the complexity penalty.}
}
@article{b-freeoims-07,
author = {Peter L. Bartlett},
title = {Fast rates for estimation error and oracle
inequalities for model selection},
year = {2008},
journal = {Econometric Theory},
volume = {24},
number = {2},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/b-freeoims-07.pdf},
note = {(To appear. Was Department of Statistics,
U.C.\ Berkeley Technical Report number 729, 2007)},
abstract = {We consider complexity penalization methods for model
selection. These methods aim to choose a model to
optimally trade off estimation and approximation errors
by minimizing the sum of an empirical risk term and a
complexity penalty. It is well known that if we use a
bound on the maximal deviation between empirical and
true risks as a complexity penalty,
then the risk of our choice is no more
than the approximation error plus twice the complexity
penalty. There are many cases, however, where complexity
penalties like this give loose upper bounds on the
estimation error. In particular, if we choose a function
from a suitably simple convex function class with a
strictly convex loss function, then the estimation error
(the difference between the risk of the empirical risk
minimizer and the minimal risk in the class) approaches zero at a
faster rate than the maximal deviation between empirical
and true risks. In this note, we address the question of
whether it is possible to design a complexity penalized
model selection method for these situations. We show that,
provided the sequence of models is ordered by inclusion, in
these cases we can use tight upper bounds on estimation
error as a complexity penalty. Surprisingly, this is the
case even in situations when the difference between the
empirical risk and true risk (and indeed the error of
any estimate of the approximation error) decreases
much more slowly than the
complexity penalty. We give an oracle inequality
showing that the resulting model selection method chooses
a function with risk no more than the approximation error
plus a constant times the complexity penalty.}
}
@techreport{bw-crouhl-06,
author = {Peter L. Bartlett and Marten H. Wegkamp},
title = {Classification with a Reject Option using a Hinge
Loss},
year = {2006},
institution = {U.C.\ Berkeley},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.pdf},
abstract = {We consider the problem of binary classification where the
classifier can, for
a particular cost, choose not to classify an observation. Just as in
the
conventional classification problem, minimization of the sample
average of
the cost is a difficult optimization problem. As an alternative, we
propose
the optimization of a certain convex loss function f, analogous
to the
hinge loss used in support vector machines (SVMs). Its convexity
ensures that
the sample average of this surrogate loss can be efficiently
minimized. We
study its statistical properties. We show that minimizing the expected
surrogate loss---the f-risk---also minimizes the risk. We also
study
the rate at which the f-risk approaches its minimum value. We
show that
fast rates are possible when the conditional probability Pr(Y=1|X)
is
unlikely to be close to certain critical values.}
}
@article{bw-crouhl-08,
author = {Peter L. Bartlett and Marten H. Wegkamp},
title = {Classification with a Reject Option using a Hinge
Loss},
journal = {Journal of Machine Learning Research},
year = {2008},
note = {(To appear.)},
ps = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.ps.gz},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bw-crouhl-06.pdf},
abstract = {We consider the problem of binary classification where the
classifier can, for
a particular cost, choose not to classify an observation. Just as in
the
conventional classification problem, minimization of the sample
average of
the cost is a difficult optimization problem. As an alternative, we
propose
the optimization of a certain convex loss function f, analogous
to the
hinge loss used in support vector machines (SVMs). Its convexity
ensures that
the sample average of this surrogate loss can be efficiently
minimized. We
study its statistical properties. We show that minimizing the expected
surrogate loss---the f-risk---also minimizes the risk. We also
study
the rate at which the f-risk approaches its minimum value. We
show that
fast rates are possible when the conditional probability Pr(Y=1|X)
is
unlikely to be close to certain critical values.}
}
@inproceedings{bt-scpskd-06,
author = {Peter L.~Bartlett and Ambuj Tewari},
title = {Sample complexity of policy search with known dynamics},
booktitle = {Advances in Neural Information Processing Systems 19},
year = 2007,
editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
publisher = {MIT Press},
address = {Cambridge, MA},
pages = {97--104},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-scpskd-06.pdf}
}
@inproceedings{rbr-soimbtmerb-06,
author = {Benjamin I. P. Rubinstein and Peter L.~Bartlett and J. Hyam
Rubinstein},
title = {Shifting, one-inclusion mistake bounds and tight multiclass
expected risk bounds},
booktitle = {Advances in Neural Information Processing Systems 19},
note = {To appear},
editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
publisher = {MIT Press},
address = {Cambridge, MA},
pages = {1193--1200},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/rbr-soimbtmerb-06.pdf}
}
@inproceedings{rb-rcckc-06,
author = {David Rosenberg and Peter L.~Bartlett},
title = {The {R}ademacher Complexity of Co-Regularized Kernel Classes},
abstract = {In the multi-view approach to semi-supervised learning, we choose
one predictor from each of multiple hypothesis classes, and we
`co-regularize' our choices by penalizing disagreement among the
predictors on the unlabeled data. In this paper we examine the
co-regularization method used in the recently proposed
co-regularized least squares (CoRLS) algorithm.
In this method we have two hypothesis
classes, each a reproducing kernel Hilbert space (RKHS), and we
co-regularize by penalizing the average squared difference in
predictions on the unlabeled data. We get our final predictor by
taking the pointwise average of the predictors from each view. We
call the set of predictors that can result from this procedure the
co-regularized hypothesis class. The main result of this paper is a
tight bound on the Rademacher complexity of the co-regularized
hypothesis class in terms of the kernel matrices of each RKHS. We
find that the co-regularization reduces the Rademacher complexity of
the hypothesis class by an amount depending on how different the two
views are, measured by a data dependent metric. We then use standard
techniques to bound the gap between training error and test error
for the CoRLS algorithm. Experimentally, we find that the amount of
reduction in complexity introduced by co-regularization correlates
with the amount of improvement that co-regularization gives in the
CoRLS algorithm},
booktitle = {Proceedings of the Eleventh International Conference on
Artificial Intelligence and Statistics},
editor = {Marina Meila and Xiaotong Shen},
note = {To appear},
year = 2007,
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/rb-rcckc-06.pdf}
}
@inproceedings{bt-aic-06,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost is Consistent},
booktitle = {Advances in Neural Information Processing Systems 19},
editor = {B. Sch\"{o}lkopf and J. Platt and T. Hoffman},
publisher = {MIT Press},
address = {Cambridge, MA},
pages = {105--112},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-aic-06.pdf},
abstract = {The risk, or probability of error, of
the classifier produced by the AdaBoost algorithm is
investigated. In particular, we consider the stopping
strategy to be used in AdaBoost to achieve universal
consistency. We show that provided AdaBoost is stopped
after n^(1-a)$ iterations---for sample size n and
a>0---the sequence of risks of the classifiers it
produces approaches the Bayes risk.}
}
@techreport{bt-aic-06a,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost is Consistent},
institution = {U.\ C.\ Berkeley},
year = {2006}
}
@article{bjm-c-06,
author = {Peter L.\ Bartlett and
Michael I.\ Jordan and Jon D.\ McAuliffe},
title = {Comment},
journal = {Statistical Science},
year = {2006},
volume = {21},
number = {3},
pages = {341--346}
}
@inproceedings{bt-aolmc-06,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost and other Large Margin Classifiers:
Convexity in Pattern Classification},
booktitle = {Proceedings of the 5th Workshop on Defence Applications
of Signal Processing},
year = {2006},
note = {To appear}
}
@article{bm-d-06,
author = {Peter L.\ Bartlett and Shahar Mendelson},
title = {Discussion of ``2004 {IMS} {M}edallion {L}ecture:
{L}ocal {R}ademacher complexities and oracle inequalities in risk
minimization'' by {V}.\ {K}oltchinskii},
journal = {The Annals of Statistics},
year = {2006},
volume = {34},
number = {6},
pages = {2657--2663}
}
@inproceedings{tb-bpmdparc-07,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {Bounded parameter {M}arkov decision processes with average
reward criterion},
booktitle = {Proceedings of the Conference on Learning Theory},
year = {2007},
pages = {263--277}
}
@inproceedings{abr-mlea-07,
author = {Jacob Abernethy and Peter L.~Bartlett and Alexander Rakhlin},
title = {Multitask learning with expert advice},
booktitle = {Proceedings of the Conference on Learning Theory},
year = {2007},
pages = {484-498},
abstract = {We consider the problem of prediction with expert advice
in the setting where a forecaster is presented with several
online prediction tasks. Instead of competing against the best
expert separately on each task, we assume the tasks are
related, and thus we expect that a few experts will perform
well on the entire set of tasks. That is, our forecaster would
like, on each task, to compete against the best expert
chosen from a small set of experts. While we describe
the `ideal' algorithm and its performance bound, we show
that the computation required for this algorithm is as hard as
computation of a matrix permanent. We present an efficient
algorithm based on mixing priors, and prove a bound that is
nearly as good for the sequential task presentation case. We
also consider a harder case where the task may change
arbitrarily from round to round, and we develop an efficient
randomized algorithm based on Markov chain Monte Carlo
techniques.}
}
@inproceedings{arb--07,
author = {Alexander Rakhlin and Jacob Abernethy and
Peter L.~Bartlett},
title = {Online discovery of similarity mappings},
booktitle = {Proceedings of the 24th International Conference on Machine
Learning (ICML-2007)},
year = {2007},
note = {To appear},
pages = {},
abstract = {We consider the problem of choosing,
sequentially, a map which assigns elements of a set A
to a few elements of a set B. On each round, the
algorithm suffers some cost associated with the chosen
assignment, and the goal is to minimize the cumulative loss
of these choices relative to the best map on the entire
sequence. Even though the offline problem of finding the best
map is provably hard, we show that there is an equivalent
online approximation algorithm, Randomized Map Prediction
(RMP), that is efficient and performs nearly as well. While
drawing upon results from the `Online Prediction with
Expert Advice' setting, we show how RMP can be utilized as
an online approach to several standard batch problems. We
apply RMP to online clustering as well as online feature
selection and, surprisingly, RMP often outperforms the
standard batch algorithms on these problems.}
}
@article{bt-aic-07,
author = {Peter L.~Bartlett and Mikhail Traskin},
title = {AdaBoost is Consistent},
journal = {Journal of Machine Learning Research},
volume = {8},
pages = {2347--2368},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bt-aic-07.pdf},
abstract = {The risk, or probability of error, of
the classifier produced by the AdaBoost algorithm is
investigated. In particular, we consider the stopping
strategy to be used in AdaBoost to achieve universal
consistency. We show that provided AdaBoost is stopped
after n^(1-a) iterations---for sample size n and
0
@techreport{rbr-soimbtmerb-07,
author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
J.~Hyam Rubinstein},
title = {Shifting: one-inclusion mistake bounds and sample
compression},
institution = {EECS Department, University of California, Berkeley},
pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf},
year = {2007},
abstract = {We present new expected risk bounds for binary and
multiclass prediction, and resolve several recent conjectures
on sample compressibility due to Kuzmin and Warmuth. By
exploiting the combinatorial structure of concept class F,
Haussler et al. achieved a VC(F)/n bound for the natural one-inclusion
prediction strategy. The key step in their proof is a d=VC(F)
bound on the graph density of a subgraph of the
hypercube - one-inclusion graph. The first main result of
this report is a density bound of
n choose(n-1,<=d-1)/choose(n,<=d) < d, which positively resolves
a conjecture of Kuzmin and Warmuth relating to their unlabeled
Peeling compression scheme and also leads to an improved
one-inclusion mistake bound. The proof uses a new form of
VC-invariant shifting and a group-theoretic symmetrization.
Our second main result is an algebraic topological property of
maximum classes of VC-dimension d as being d-contractible
simplicial complexes, extending the well-known
characterization that d=1 maximum classes are trees. We
negatively resolve a minimum degree conjecture of Kuzmin and
Warmuth - the second part to a conjectured proof of
correctness for Peeling - that every class has one-inclusion
minimum degree at most its VC-dimension. Our final main result
is a k-class analogue of the d/n mistake bound, replacing the
VC-dimension by the Pollard pseudo-dimension and the
one-inclusion strategy by its natural hypergraph
generalization. This result improves on known PAC-based
expected risk bounds by a factor of O(log n) and is shown to
be optimal up to a O(log k) factor. The combinatorial
technique of shifting takes a central role in understanding
the one-inclusion (hyper)graph and is a running theme
throughout.}
}
@article{rbr-soimbtmerb-08,
author = {Benjamin I.~P.~Rubinstein and Peter L.~Bartlett and
J.~Hyam Rubinstein},
title = {Shifting: one-inclusion mistake bounds and sample
compression},
journal = {Journal of Computer and System Sciences},
note = {(To appear. Was University of California, Berkeley, EECS Department
Technical Report EECS-2007-86)},
pdf = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-86.pdf},
year = {2008}
}
@unpublished{rb-bbspngp-07,
author = {David Rosenberg and Peter L.~Bartlett},
title = {On bounds for {B}ayesian sequence prediction with
non-{G}aussian priors},
note = {Technical Report},
year = {2007},
abstract = {We present worst-case log-loss regret bounds for the
Bayesian model averaging algorithm in the regression setting.
Our work generalizes earlier work that gives bounds of a
similar form that only hold for Gaussian priors. Our bounds
hold for arbitrary priors, though the regret term now includes
a term involving the modulus of continuity of the prior, as
well as the value of the prior at the point of comparison.}
}
@unpublished{abrt-mlbocg-07,
author = {Jacob Abernethy and Peter L.~Bartlett and Alexander
Rakhlin and Ambuj Tewari},
title = {Minimax Lower Bounds for Online Convex Games},
note = {Technical Report},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrt-mlbocg-07.pdf},
abstract = {A number of learning problems can be cast as an Online
Convex Game: on each round, a learner makes a prediction $x$
from a convex set, the environment plays a loss function $f$,
and the learner's long-term goal is to minimize regret. Algorithms
have been proposed by Zinkevich, when $f$ is assumed to be
convex, and Hazan et al, when $f$ is assumed to be strongly
convex, that have provably low regret. We consider these two
settings and analyze such games from a minimax perspective,
proving lower bounds in each case. These results prove that the
existing algorithms are essentially optimal.}
}
@inproceedings{bhr-aogd-07,
author = {Peter L.~Bartlett and Elad Hazan and Alexander Rakhlin},
title = {Adaptive online gradient descent},
booktitle = {Advances in Neural Information Processing
Systems 20},
editor = {Daphne Koller and Yoram Singer and John Platt},
publisher = {MIT Press},
address = {Cambridge, MA},
year = {2008},
note = {To appear},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bhr-aogd-07.pdf},
abstract = {We study the rates of growth of the regret in online
convex optimization. First, we show that a simple extension of
the algorithm of Hazan et al eliminates the need for a priori
knowledge of the lower bound on the second derivatives of the
observed functions. We then provide an algorithm, Adaptive
Online Gradient Descent, which interpolates between the
results of Zinkevich for linear functions and of Hazan et al
for strongly convex functions, achieving intermediate rates
between $\sqrt{T}$ and $\log T$. Furthermore, we show strong
optimality of the algorithm. Finally, we provide an extension
of our results to general norms.}
}
@inproceedings{tb-olplrim-07,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {Optimistic linear programming gives logarithmic regret for
irreducible MDPs},
booktitle = {Advances in Neural Information Processing
Systems 20},
editor = {Daphne Koller and Yoram Singer and John Platt},
publisher = {MIT Press},
address = {Cambridge, MA},
year = {2008},
note = {To appear},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/tb-olplrim-07.pdf},
abstract = {We present an algorithm called Optimistic Linear
Programming (OLP) for learning to optimize average reward in
an irreducible but otherwise unknown Markov decision process
(MDP). OLP uses its experience so far to estimate the MDP. It
chooses actions by optimistically maximizing estimated future
rewards over a set of next-state transition probabilities that
are close to the estimates: a computation that corresponds to
solving linear programs. We show that the total expected
reward obtained by OLP up to time $T$ is within $C(P)\log T$
of the reward obtained by the optimal policy, where $C(P)$ is
an explicit, MDP-dependent constant. OLP is closely related to
an algorithm proposed by Burnetas and Katehakis with four key
differences: OLP is simpler, it does not require knowledge of
the supports of transition probabilities and the proof of the
regret bound is simpler, but our regret bound is a constant
factor larger than the regret of their algorithm. OLP is also
similar in flavor to an algorithm recently proposed by Auer
and Ortner. But OLP is simpler and its regret bound has a
better dependence on the size of the MDP.}
}
@techreport{abr-mlea-07a,
author = {Jacob Duncan Abernethy and Peter L.~Bartlett
and Alexander Rakhlin},
title = {Multitask Learning with Expert Advice},
institution = {EECS Department, University of California, Berkeley},
year = {2007},
url = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2007/EECS-2007-20.html},
number = {UCB/EECS-2007-20}
}
@techreport{cgkcb-egacrfmmmn-07,
author = {Michael Collins and Amir Globerson and Terry Koo and Xavier
Carreras and Peter L.~Bartlett},
title = {Exponentiated gradient algorithms for conditional random
fields and max-margin {M}arkov networks},
institution = {U.C.\ Berkeley},
year = {2007},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/cgkcb-egacrfmmmn-07.pdf},
abstract = {Log-linear and maximum-margin models are two
commonly used methods in supervised machine learning, and
are frequently used in structured prediction problems.
Efficient learning of parameters in these models is
therefore an important problem, and becomes a key factor
when learning from very large data sets. This paper
describes exponentiated gradient (EG) algorithms for
training such models, where EG updates are applied to
the convex dual of either the log-linear or max-margin
objective function; the dual in both the log-linear
and max-margin cases corresponds to minimizing a convex
function with simplex constraints. We study both batch
and online variants of the algorithm, and provide rates
of convergence for both cases. In the max-margin case,
$O({1\over\epsilon})$ EG updates are required to reach a
given accuracy $\epsilon$ in the dual; in contrast, for
log-linear models only $O(\log({ 1\over\epsilon}))$ updates
are required. For both the max-margin and log-linear cases,
our bounds suggest that the online algorithm requires
a factor of $n$ less computation to reach a desired
accuracy, where $n$ is the number of training examples. Our
experiments confirm that the online algorithms are much
faster than the batch algorithms in practice. We describe
how the EG updates factor in a convenient way for
structured prediction problems, allowing the algorithms
to be efficiently applied to problems such as sequence
learning or natural language parsing. We perform extensive
evaluation of the algorithms, comparing them to to L-BFGS
and stochastic gradient descent for log-linear models,
and to SVM-Struct for max-margin models. The algorithms are
applied to multi-class problems as well as a more complex
large-scale parsing task. In all these settings, the EG
algorithms presented here outperform the other methods.}
}
@article{cgkcb-egacrfmmmn-08,
author = {Michael Collins and Amir Globerson and Terry Koo and Xavier
Carreras and Peter L.~Bartlett},
title = {Exponentiated gradient algorithms for conditional random
fields and max-margin {M}arkov networks},
journal = {Journal of Machine Learning Research},
year = {2008},
note = {(Accepted)},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/cgkcb-egacrfmmmn-07.pdf},
abstract = {Log-linear and maximum-margin models are two
commonly used methods in supervised machine learning, and
are frequently used in structured prediction problems.
Efficient learning of parameters in these models is
therefore an important problem, and becomes a key factor
when learning from very large data sets. This paper
describes exponentiated gradient (EG) algorithms for
training such models, where EG updates are applied to
the convex dual of either the log-linear or max-margin
objective function; the dual in both the log-linear
and max-margin cases corresponds to minimizing a convex
function with simplex constraints. We study both batch
and online variants of the algorithm, and provide rates
of convergence for both cases. In the max-margin case,
$O({1\over\epsilon})$ EG updates are required to reach a
given accuracy $\epsilon$ in the dual; in contrast, for
log-linear models only $O(\log({ 1\over\epsilon}))$ updates
are required. For both the max-margin and log-linear cases,
our bounds suggest that the online algorithm requires
a factor of $n$ less computation to reach a desired
accuracy, where $n$ is the number of training examples. Our
experiments confirm that the online algorithms are much
faster than the batch algorithms in practice. We describe
how the EG updates factor in a convenient way for
structured prediction problems, allowing the algorithms
to be efficiently applied to problems such as sequence
learning or natural language parsing. We perform extensive
evaluation of the algorithms, comparing them to to L-BFGS
and stochastic gradient descent for log-linear models,
and to SVM-Struct for max-margin models. The algorithms are
applied to multi-class problems as well as a more complex
large-scale parsing task. In all these settings, the EG
algorithms presented here outperform the other methods.}
}
@inproceedings{bdhkrt-hprbbolo-08,
title = {High-Probability Regret Bounds for Bandit Online Linear
Optimization},
author = {Peter L.~Bartlett and Varsha Dani and Thomas Hayes and
Sham Kakade and Alexander Rakhlin and Ambuj Tewari},
booktitle = {Proceedings of the 21st Annual Conference on Learning
Theory (COLT 2008)},
year = {2008},
note = {(To appear)},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/bdhkrt-hprbbolo-08.pdf}
}
@inproceedings{abrt-osmlbocg-08,
title = {Optimal strategies and minimax lower bounds for online convex
games},
author = {Jacob Abernethy and Peter L. Bartlett and
Alexander Rakhlin and Ambuj Tewari},
booktitle = {Proceedings of the 21st Annual Conference on Learning
Theory (COLT 2008)},
year = {2008},
note = {(To appear)},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/abrt-osmlbocg-08.pdf}
}
@article{tewari07consistency,
author = {Ambuj Tewari and Peter L.~Bartlett},
title = {On the Consistency of Multiclass Classification Methods},
journal = {Journal of Machine Learning Research},
year = {2007},
volume = {8},
month = {May},
pages = {1007--1025},
note = {(Invited paper)},
url = {http://jmlr.csail.mit.edu/papers/v8/tewari07a.html}
}
@article{bartlett07sparseness,
author = {Peter L. Bartlett and Ambuj Tewari},
title = {Sparseness vs Estimating Conditional Probabilities: Some
Asymptotic Results},
journal = {Journal of Machine Learning Research},
year = {2007},
volume = {8},
month = {April},
pages = {775--790},
url = {http://jmlr.csail.mit.edu/papers/v8/bartlett07a.html}
}
@article{lbw-c-08,
title = {Correction to {\em The Importance of Convexity in Learning
with Squared Loss}},
author = {Wee Sun Lee and Peter L.~Bartlett and
Robert C.~Williamson},
journal = {IEEE Transactions on Information Theory},
year = {2008},
note = {(To appear)},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/lbw-c-08.pdf}
}
@unpublished{ab-mamssl-08,
title = {Margin-adaptive model selection in statistical learning},
author = {Sylvain Arlot and Peter L.~Bartlett},
note = {Submitted},
year = {2008},
pdf = {http://www.stat.berkeley.edu/~bartlett/papers/ab-mamssl-08.pdf}
}
This file was generated by
bibtex2html 1.91.