recent-pubs.bib

@comment{{This file has been generated by bib2bib 1.99}}

@comment{{Command line: bib2bib --remove funding --remove notetoomit -ob recent-pubs.bib -c 'year>=2017 and $type != "unpublished" and not note : "[pP]atent"' ../mypubs.bib}}

@article{hb-ecosnmlbp-17,
  author = {Fares Hedayati and Peter L. Bartlett},
  title = {Exchangeability Characterizes Optimality of Sequential
Normalized Maximum Likelihood and {B}ayesian Prediction},
  journal = {IEEE Transactions on Information Theory},
  month = {October},
  volume = {63},
  number = {10},
  pages = {6767--6773},
  year = {2017},
  abstract = {We study online learning under logarithmic loss with
regular parametric models. In this setting, each strategy corresponds
to a joint distribution on sequences. The minimax optimal strategy is the
normalized maximum likelihood (NML) strategy. We show 
that the sequential normalized maximum likelihood (SNML) strategy
predicts minimax optimally (i.e. as NML) if and only if the
joint distribution on sequences defined by SNML is exchangeable. This
property also characterizes the optimality of a Bayesian
prediction strategy. In that case, the optimal prior distribution is
Jeffreys prior for a broad class of parametric models for which
the maximum likelihood estimator is asymptotically normal. The optimal
prediction strategy, normalized maximum likelihood,
depends on the number n of rounds of the game, in general. However,
when a Bayesian strategy is optimal, normalized maximum
likelihood becomes independent of n. Our proof uses this to exploit
the asymptotics of normalized maximum likelihood. The
asymptotic normality of the maximum likelihood estimator is
responsible for the necessity of Jeffreys prior.},
  doi = {10.1109/TIT.2017.2735799},
  url = {http://www.stat.berkeley.edu/~bartlett/papers/hb-ecosnmlbp-17.pdf},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/hb-ecosnmlbp-17.pdf}
}

@inproceedings{aymbg-hnr-17,
  author = {Yasin Abbasi-Yadkori and Alan Malek and Peter L.~Bartlett
and Victor Gabillon},
  title = {Hit-and-Run for Sampling and Planning in Non-Convex Spaces},
  year = {2017},
  abstract = {We propose the Hit-and-Run algorithm for planning and sampling
problems in
non-convex spaces. For sampling, we show the first analysis of the
Hit-and-Run algorithm in non-convex spaces and show that it mixes fast as
long as certain smoothness conditions are satisfied. In particular,
our analysis reveals an intriguing connection between fast mixing and
the existence of smooth measure-preserving mappings from a convex
space to the non-convex space. For planning, we show advantages of
Hit-and-Run compared to state-of-the-art planning methods such as
Rapidly-Exploring Random Trees.},
  booktitle = {Proceedings of the 20th International Conference on
Artificial Intelligence and Statistics},
  pages = {888--895},
  editor = {Aarti Singh and Jerry Zhu},
  volume = {54},
  series = {Proceedings of Machine Learning Research},
  address = {Fort Lauderdale, FL, USA},
  pdf = {http://proceedings.mlr.press/v54/abbasi-yadkori17a/abbasi-yadkori17a.pdf}
}

@inproceedings{zsjbd-rgohlnn-17,
  author = {Kai Zhong and Zhao Song and Prateek Jain
and Peter L.~Bartlett and Inderjit S. Dhillon},
  title = {Recovery Guarantees for One-hidden-layer Neural Networks},
  booktitle = {Proceedings of the 34th International Conference on
Machine Learning (ICML-17)},
  pages = {4140--4149},
  year = {2017},
  editor = {Doina Precup and Yee Whye Teh},
  volume = {70},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v70/zhong17a/zhong17a.pdf},
  url = {http://proceedings.mlr.press/v70/zhong17a.html},
  abstract = {In this paper, we consider regression problems with
one-hidden-layer neural networks (1NNs). We distill some properties of
activation functions that lead to   \emph{local strong convexity} in
the neighborhood of the ground-truth parameters for the 1NN
squared-loss objective and most popular nonlinear activation functions
satisfy the distilled properties, including rectified linear units
(ReLUs), leaky ReLUs, squared ReLUs and sigmoids. For activation
functions that are also smooth, we show \emph{local linear
convergence} guarantees of gradient descent under a resampling rule.
For homogeneous activations, we show tensor methods are able to
initialize the parameters to fall into the local strong convexity
region. As a result, tensor initialization followed by gradient
descent is guaranteed to recover the ground truth with sample
complexity $ d \cdot \log(1/\epsilon) \cdot \mathrm{poly}(k,\lambda )$
and computational complexity $n\cdot d \cdot \mathrm{poly}(k,\lambda)
$ for smooth  homogeneous activations with high probability, where $d$
is the dimension of the input, $k$ ($k\leq d$) is the number of hidden
nodes, $\lambda$ is a conditioning  property of the ground-truth
parameter matrix between the input layer and the hidden layer,
$\epsilon$ is the targeted precision and $n$ is the number of samples.
To the best of our knowledge, this is the first work that provides
recovery guarantees for 1NNs with both sample complexity and
computational complexity \emph{linear} in the input dimension and
\emph{logarithmic} in the precision.}
}

@inproceedings{pbbc-ftsmfamp-17,
  author = {Martin P{\'e}ron and Kai Helge Becker and Peter L.~Bartlett
    and Iadine Chad{\`e}s},
  title = {Fast-Tracking Stationary {MOMDPs} for Adaptive Management
    Problems},
  booktitle = {Proceedings of the Thirty-First AAAI Conference on Artificial
    Intelligence (AAAI-17)},
  pages = {4531--4537},
  year = {2017},
  abstract = {Adaptive management is applied in conservation and
  natural resource management, and consists of making sequential
  decisions when the transition matrix is uncertain. Informally
  described as ’learning by doing’, this approach aims to trade off
  between decisions that help achieve the objective and decisions that
  will yield a better knowledge of the true transition matrix. When
  the true transition matrix is assumed to be an element of a finite
  set of possible matrices, solving a mixed observability Markov
  decision process (MOMDP) leads to an optimal trade-off but is very
  computationally demanding.  Under the assumption (common in adaptive
  management) that the true transition matrix is stationary, we
  propose a polynomial-time algorithm to find a lower bound of the
  value function. In the corners of the domain of the value function
  (belief space), this lower bound is provably equal to the optimal
  value function. We also show that under further assumptions, it is a
  linear approximation of the optimal value function in a neighborhood
  around the corners. We evaluate the benefits of our approach by
  using it to initialize the solvers MO-SARSOP and Perseus on a novel
  computational sustainability problem and a recent adaptive
  management data challenge.  Our approach leads to an improved
  initial value function and translates into significant computational
  gains for both solvers.},
  pdf = {https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14552/14063},
  url = {https://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14552}
}

@inproceedings{gayb-moa3epp-17,
  author = {Yasin Abbasi-Yadkori and Peter L.~Bartlett and Victor Gabillon},
  title = {Near Minimax Optimal Players for the Finite-Time 3-Expert
Prediction Problem},
  booktitle = {Advances in Neural Information Processing Systems 30},
  editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett},
  pages = {3033--3042},
  year = {2017},
  publisher = {Curran Associates, Inc.},
  url = {http://papers.nips.cc/paper/6896-near-minimax-optimal-players-for-the-finite-time-3-expert-prediction-problem.pdf},
  pdf = {http://papers.nips.cc/paper/6896-near-minimax-optimal-players-for-the-finite-time-3-expert-prediction-problem.pdf},
  abstract = {We study minimax strategies for the online prediction
problem with expert advice. It is conjectured that a simple adversary
strategy, called Comb, is optimal in this game for any number of
experts including the non asymptotic case where the number of experts
is small. We make progress in this direction by showing that Comb is
minimax optimal within an additive logarithmic error in the finite
time three expert problems.}
}

@inproceedings{snmbnn-manng-17,
  author = {Peter Bartlett and Dylan Foster and Matus Telgarsky},
  title = {Spectrally-normalized margin bounds for neural networks},
  booktitle = {Advances in Neural Information Processing Systems 30},
  editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett},
  pages = {6240--6249},
  year = {2017},
  publisher = {Curran Associates, Inc.},
  abstract = {This paper presents a margin-based multiclass
generalization bound for neural networks that scales with their
margin-normalized spectral complexity: their Lipschitz constant,
meaning the product of the spectral norms of the weight matrices,
times a certain correction factor. This bound is empirically
investigated for a standard AlexNet network trained with SGD on the
mnist and cifar10 datasets, with both original and random labels; the
bound, the Lipschitz constants, and the excess risks are all in direct
correlation, suggesting both that SGD selects predictors whose
complexity scales with the difficulty of the learning task, and
secondly that the presented bound is sensitive to this complexity.},
  pdf = {http://papers.nips.cc/paper/7204-spectrally-normalized-margin-bounds-for-neural-networks.pdf},
  url = {http://papers.nips.cc/paper/7204-spectrally-normalized-margin-bounds-for-neural-networks.pdf}
}

@techreport{bhlm-ntvcdpdbfplnn-17,
  author = {Peter L. Bartlett and Nick Harvey and Chris Liaw and Abbas
Mehrabian},
  title = {Nearly-tight {VC}-dimension and pseudodimension bounds for
piecewise linear neural networks},
  year = 2017,
  institution = {arXiv.org},
  number = {1703.02930},
  abstract = {We prove new upper and lower bounds on the VC-dimension of
deep neural networks with the ReLU activation function. These bounds
are tight for almost the entire range of parameters. Letting $W$ be
the number of weights and $L$ be the number of layers, we prove that
the VC-dimension is $O(W Llog(W))$, and provide examples with
VC-dimension $\Omega(W L\log(W/L))$. This improves both the previously
known upper bounds and lower bounds. In terms of the number $U$ of
non-linear units, we prove a tight bound $\Theta(W U)$ on the
VC-dimension. All of these bounds generalize to arbitrary piecewise
linear activation functions, and also hold for the pseudodimensions of
these function classes.  Combined with previous results, this gives an
intriguing range of dependencies of the VC-dimension on depth for
networks with different non-linearities: there is no dependence for
piecewise-constant, linear dependence for piecewise-linear, and no
more than quadratic dependence for general piecewise-polynomial.},
  url = {https://arxiv.org/abs/1703.02930},
  pdf = {https://arxiv.org/pdf/1703.02930.pdf}
}

@inproceedings{kb-aasdd-17,
  author = {Walid Krichene and Peter Bartlett},
  title = {Acceleration and Averaging in Stochastic Descent Dynamics},
  booktitle = {Advances in Neural Information Processing Systems 30},
  editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett},
  pages = {6796--6806},
  year = {2017},
  publisher = {Curran Associates, Inc.},
  abstract = {We formulate and study a general family of
(continuous-time) stochastic dynamics for accelerated first-order
minimization of smooth convex functions.  Building on an averaging
formulation of accelerated mirror descent, we propose a stochastic
variant in which the gradient is contaminated by noise, and study the
resulting stochastic differential equation. We prove a bound on the
rate of change of an energy function associated with the problem, then
use it to derive estimates of convergence rates of the function values
(almost surely and in expectation), both for persistent and
asymptotically vanishing noise. We discuss the interaction between the
parameters of the dynamics (learning rate and averaging rates) and the
covariation of the noise process. In particular, we show how the
asymptotic rate of covariation affects the choice of parameters and,
ultimately, the convergence rate.},
  pdf = {http://papers.nips.cc/paper/7256-acceleration-and-averaging-in-stochastic-descent-dynamics.pdf},
  url = {http://papers.nips.cc/paper/7256-acceleration-and-averaging-in-stochastic-descent-dynamics.pdf}
}

@inproceedings{cb-amdlri-17,
  author = {Niladri Chatterji and Peter Bartlett},
  title = {Alternating minimization for dictionary learning with random
initialization},
  booktitle = {Advances in Neural Information Processing Systems 30},
  editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and
R. Fergus and S. Vishwanathan and R. Garnett},
  pages = {1997--2006},
  year = {2017},
  publisher = {Curran Associates, Inc.},
  pdf = {http://papers.nips.cc/paper/6795-alternating-minimization-for-dictionary-learning-with-random-initialization.pdf},
  url = {http://papers.nips.cc/paper/6795-alternating-minimization-for-dictionary-learning-with-random-initialization.pdf}
}

@techreport{bel-rsfcnifidno-17,
  author = {Peter L. Bartlett and Steven Evans and Philip M.~Long},
  title = {Representing smooth functions as compositions of
near-identity functions with implications for deep network
optimization},
  year = {2018},
  institution = {arXiv.org},
  number = {1804.05012},
  abstract = {We show that any smooth bi-Lipschitz $h$ can be
represented exactly as a composition $h_m\circ\cdots h_1$
of functions $h_1,\ldots,h_m$ that are close to the identity in the
sense that each $(h_i − Id)$ is Lipschitz, and the Lipschitz constant
decreases inversely with the number $m$ of functions composed. This
implies that $h$ can be represented to any accuracy by a deep residual
network whose nonlinear layers compute functions with a small
Lipschitz constant. Next, we consider nonlinear regression with a
composition of near-identity nonlinear maps. We show that, regarding
Fr\'echet derivatives with respect to the $h_1,\ldots,h_m$,
any critical point of a quadratic criterion in this near-identity
region must be a global minimizer. In contrast, if we consider
derivatives with respect to parameters of a fixed-size residual
network with sigmoid activation functions, we show that there are
near-identity critical points that are suboptimal, even in the
realizable case. Informally, this means that functional gradient
methods for residual networks cannot get stuck at suboptimal critical
points corresponding to near-identity layers, whereas parametric
gradient methods for sigmoidal residual networks suffer from
suboptimal critical points in the near-identity region.},
  url = {https://arxiv.org/abs/1804.05012},
  pdf = {https://arxiv.org/pdf/1804.05012.pdf}
}

@inproceedings{gdkisdl-yplprb-18,
  author = {Dong Yin and Ashwin Pananjady and Max Lam and Dimitris
Papailiopoulos and Kannan Ramchandran and Peter Bartlett},
  title = {Gradient Diversity: a Key Ingredient for Scalable Distributed
Learning},
  year = {2018},
  booktitle = {Proceedings of the 21st International Conference on
Artificial Intelligence and Statistics},
  url = {http://proceedings.mlr.press/v84/yin18a.html},
  pdf = {http://proceedings.mlr.press/v84/yin18a/yin18a.pdf},
  pages = {1998--2007},
  editor = {Amos Storkey and Fernando Perez-Cruz},
  volume = {84},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  abstract = {It has been experimentally observed that distributed
implementations of mini-batch stochastic gradient descent (SGD)
algorithms exhibit speedup saturation and decaying generalization
ability beyond a particular batch-size. In this work, we present an
analysis hinting that high similarity between concurrently processed
gradients may be a cause of this performance degradation. We
introduce the notion of gradient diversity that measures the
dissimilarity between concurrent gradient updates, and show its key
role in the convergence and generalization performance of mini-batch
SGD.  We also establish that heuristics similar to DropConnect,
Langevin dynamics, and quantization, are provably diversity-inducing
mechanisms, and provide experimental evidence indicating that these
mechanisms can indeed enable the use of larger batches without
sacrificing accuracy and lead to faster training in distributed
learning. For example, in one of our experiments, for a
convolutional neural network to reach 95\% training accuracy on
MNIST, using the diversity-inducing mechanism can reduce the
training time by 30\% in the distributed setting.}
}

@inproceedings{crpbm-ffflcagm-18,
  author = {Xiang Cheng and Fred Roosta and Stefan Palombo
    and Peter Bartlett and Michael Mahoney},
  title = {FLAG n’ FLARE: Fast Linearly-Coupled Adaptive Gradient
Methods},
  year = {2018},
  pages = {404--414},
  booktitle = {Proceedings of the 21st International Conference on
Artificial Intelligence and Statistics},
  url = {http://proceedings.mlr.press/v84/cheng18b.html},
  pdf = {http://proceedings.mlr.press/v84/cheng18b/cheng18b.pdf},
  publisher = {PMLR},
  editor = {Amos Storkey and Fernando Perez-Cruz},
  volume = {84},
  series = {Proceedings of Machine Learning Research},
  abstract = {We consider first order gradient methods for effectively
optimizing a composite objective in the form of a sum of smooth and,
potentially, non-smooth functions. We present accelerated and adaptive
gradient methods, called FLAG and FLARE, which can offer the best of
both worlds. They can achieve the optimal convergence rate by
attaining the optimal first-order oracle complexity for smooth convex
optimization. Additionally, they can adaptively and non-uniformly
re-scale the gradient direction to adapt to the limited curvature
available and conform to the geometry of the domain. We show
theoretically and empirically that, through the compounding effects of
acceleration and adaptivity, FLAG and FLARE can be highly effective
for many data fitting and machine learning applications.}
}

@inproceedings{cb-clmcmckld-18,
  author = {Xiang Cheng and Peter Bartlett},
  title = {Convergence of {L}angevin {MCMC} in {KL}-divergence},
  year = {2018},
  booktitle = {Proceedings of ALT2018},
  url = {http://proceedings.mlr.press/v83/cheng18a.html},
  pdf = {http://proceedings.mlr.press/v83/cheng18a/cheng18a.pdf},
  pages = {186--211},
  editor = {Firdaus Janoos and Mehryar Mohri and Karthik Sridharan},
  volume = {83},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  abstract = {Langevin diffusion is a commonly used tool for
sampling from a given distribution. In this work, we establish that
when the target density $p^*$ is such that $\log p^*$ is $L$
smooth and $m$ strongly convex, discrete Langevin diffusion produces
a distribution $p$ with $\KL{p}{p^*}≤\epsilon$ in
$\tilde{O}(\frac{d}{\epsilon})$ steps, where $d$ is the dimension of the
sample space.  We also study the convergence rate when the
strong-convexity assumption is absent. By considering the Langevin
diffusion as a gradient flow in the space of probability
distributions, we obtain an elegant analysis that applies to the
stronger property of convergence in KL-divergence and gives a
conceptually simpler proof of the best-known convergence results in
weaker metrics.}
}

@inproceedings{pbbhc-tadpamcsn-18,
  author = {Martin P\'{e}ron and Peter Bartlett and Kai Helge Becker and Kate Helmstedt and Iadine Chad\`{e}s},
  title = {Two approximate dynamic programming algorithms for managing
complete {SIS} networks},
  year = {2018},
  booktitle = {ACM SIGCAS Conference on Computing and
Sustainable Societies (COMPASS 2018)},
  url = {https://dl.acm.org/citation.cfm?id=3209811.3209814},
  pdf = {http://www.stat.berkeley.edu/~bartlett/papers/pbbhc-tadpamcsn-18.pdf}
}

@inproceedings{ycrb-brdltosr-18,
  author = {Dong Yin and Yudong Chen and Kannan Ramchandran and Peter
Bartlett},
  title = {Byzantine-Robust Distributed Learning: Towards Optimal
Statistical Rates},
  abstract = {we develop distributed optimization algorithms that are
provably robust against Byzantine failures---arbitrary and potentially
adversarial behavior, in distributed computing systems, with a focus
on achieving optimal statistical performance. A main result of this
work is a sharp analysis of two robust distributed gradient descent
algorithms based on median and trimmed mean operations, respectively.
We prove statistical error rates for all of strongly convex,
non-strongly convex, and smooth non-convex population loss functions.
In particular, these algorithms are shown to achieve order-optimal
statistical error rates for strongly convex losses. To achieve better
communication efficiency, we further propose a median-based
distributed algorithm that is provably robust, and uses only one
communication round. For strongly convex quadratic loss, we show that
this algorithm achieves the same optimal error rate as the robust
distributed gradient descent algorithms.},
  year = {2018},
  booktitle = {Proceedings of the 35th International Conference on
Machine Learning (ICML-18)},
  url = {http://proceedings.mlr.press/v80/yin18a.html},
  pdf = {http://proceedings.mlr.press/v80/yin18a/yin18a.pdf},
  pages = {5650--5659},
  editor = {Dy, Jennifer and Krause, Andreas},
  volume = {80},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  abstract = {In this paper, we develop distributed optimization
algorithms that are provably robust against Byzantine
failures—arbitrary and potentially adversarial behavior, in
distributed computing systems, with a focus on achieving optimal
statistical performance. A main result of this work is a sharp
analysis of two robust distributed gradient descent algorithms based
on median and trimmed mean operations, respectively. We prove
statistical error rates for all of strongly convex, non-strongly
convex, and smooth non-convex population loss functions. In
particular, these algorithms are shown to achieve order-optimal
statistical error rates for strongly convex losses. To achieve
better communication efficiency, we further propose a median-based
distributed algorithm that is provably robust, and uses only one
communication round. For strongly convex quadratic loss, we show
that this algorithm achieves the same optimal error rate as the
robust distributed gradient descent algorithms.}
}

@inproceedings{cfmbj-otvrsgmc-18,
  author = {Niladri Chatterji and Nicolas Flammarion and Yian Ma and
Peter Bartlett and Michael Jordan},
  title = {On the Theory of Variance Reduction for Stochastic Gradient
{M}onte {C}arlo},
  year = {2018},
  booktitle = {Proceedings of the 35th International Conference on
Machine Learning (ICML-18)},
  url = {http://proceedings.mlr.press/v80/chatterji18a.html},
  pdf = {http://proceedings.mlr.press/v80/chatterji18a/chatterji18a.pdf},
  pages = {764--773},
  editor = {Dy, Jennifer and Krause, Andreas},
  volume = {80},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  abstract = {We provide convergence guarantees in Wasserstein
distance for a variety of variance-reduction methods: SAGA Langevin
diffusion, SVRG Langevin diffusion and control-variate underdamped
Langevin diffusion. We analyze these methods under a uniform set of
assumptions on the log-posterior distribution, assuming it to be
smooth, strongly convex and Hessian Lipschitz. This is achieved by a
new proof technique combining ideas from finite-sum optimization and
the analysis of sampling methods. Our sharp theoretical bounds allow
us to identify regimes of interest where each method performs better
than the others. Our theory is verified with experiments on
real-world and synthetic datasets.}
}

@inproceedings{bhl-gdiielpdltdrn-18,
  author = {Peter L.~Bartlett and David P.~Helmbold and Philip M.~Long},
  title = {Gradient descent with identity initialization
efficiently learns positive definite linear transformations
by deep residual networks},
  year = {2018},
  url = {https://arxiv.org/abs/1802.06093},
  booktitle = {Proceedings of the 35th International Conference on
Machine Learning (ICML-18)},
  abstract = {We analyze algorithms for approximating a function $f(x) =
\Phi x$ mapping $\Re^d$ to $\Re^d$ using deep linear neural networks,
i.e. that learn a function $h$ parameterized by matrices $\Theta_1,
\ldots , \Theta_L$ and defined by $h(x) =
\Theta_L\Theta_{L−1}\cdots\Theta_1x$. We focus on algorithms that
learn through gradient descent on the population quadratic loss in the
case that the distribution over the inputs is isotropic. We provide
polynomial bounds on the number of iterations for gradient descent to
approximate the least squares matrix $\Phi$, in the case where the
initial hypothesis $\Theta_1 = \cdots = \Theta_L = I$ has excess loss
bounded by a small enough constant. On the other hand, we show that
gradient descent fails to converge for $\Phi$ whose distance from the
identity is a larger constant, and we show that some forms of
regularization toward the identity in each layer do not help.
If $\Phi$ is symmetric positive definite, we show
that an algorithm that initializes $\Theta_i = I$ learns an
$\epsilon$-approximation of $f$ using a number of updates polynomial
in $L$, the condition number of $\Phi$, and $\log(d/\epsilon)$.
In contrast, we show that if the least squares matrix $\Phi$
is symmetric and has a negative eigenvalue, then all members of a
class of algorithms that perform gradient descent with identity
initialization, and optionally regularize toward the identity in each
layer, fail to converge. We analyze an algorithm for the case that
$\Phi$ satisfies $u^\top\Phi u>0$ for all $u$, but may not be symmetric.
This algorithm uses two regularizers: one that maintains the invariant
$u^\top\Theta_L\Theta_{L-1}\cdots\Theta_1 u>0$
for all $u$, and another that `balances' $\Theta_1, \ldots, \Theta_L$
so that they have the same singular values.},
  url = {http://proceedings.mlr.press/v80/bartlett18a.html},
  pdf = {http://proceedings.mlr.press/v80/bartlett18a/bartlett18a.pdf},
  pages = {521--530},
  editor = {Dy, Jennifer and Krause, Andreas},
  volume = {80},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR}
}

@inproceedings{mb-himlr-18a,
  author = {Alan Malek and Peter L.~Bartlett},
  title = {Horizon-Independent Minimax Linear Regression},
  booktitle = {Advances in Neural Information Processing Systems 31},
  publisher = {Curran Associates, Inc.},
  year = {2018},
  abstract = {We consider a linear regression game: at each round, an
adversary reveals a covariate vector, the learner predicts a real
value, the adversary reveals a label, and the learner suffers the
squared prediction error. The aim is to minimize the difference
between the cumulative loss and that of the linear predictor that is
best in hindsight. Previous work demonstrated that the minimax optimal
strategy is easy to compute recursively from the end of the game; this
requires the entire sequence of covariate vectors in advance. We show
that, once provided with a measure of the scale of the problem, we can
invert the recursion and play the minimax strategy without knowing the
future covariates. Further, we show that this forward recursion
remains optimal even against adaptively chosen labels and covariates,
provided that the adversary adheres to a set of constraints that
prevent misrepresentation of the scale of the problem. This strategy
is horizon-independent, i.e. it incurs no more regret than the optimal
strategy that knows in advance the number of rounds of the game. We
also provide an interpretation of the minimax algorithm as a
follow-the-regularized-leader strategy with a data-dependent
regularizer, and obtain an explicit expression for the minimax
regret.},
  editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
  pages = {5264--5273},
  url = {http://papers.nips.cc/paper/7772-horizon-independent-minimax-linear-regression.pdf}
}

@inproceedings{ccbj-ulmanaa-18,
  author = {Xiang Cheng and Niladri S.~Chatterji and Peter L.~Bartlett
and Michael I.~Jordan},
  title = {Underdamped {L}angevin {MCMC}: A non-asymptotic analysis},
  year = {2018},
  booktitle = {Proceedings of the 31st Conference on Learning Theory (COLT2018)},
  abstract = {We study the underdamped Langevin diffusion when the log
of the target distribution is smooth and strongly concave. We present
a MCMC algorithm based on its discretization and show that it achieves
$\epsilon$ error (in 2-Wasserstein distance) in $O(\sqrt d/\epsilon)$
steps. This is a significant improvement over the best known rate for
overdamped Langevin MCMC, which is $O(d/\epsilon^2)$ steps under the
same smoothness/concavity assumptions. The underdamped Langevin MCMC
scheme can be viewed as a version of Hamiltonian Monte Carlo (HMC)
which has been observed to outperform overdamped Langevin MCMC methods
in a number of application areas. We provide quantitative rates that
support this empirical wisdom.},
  url = {http://proceedings.mlr.press/v75/cheng18a.html},
  pdf = {http://proceedings.mlr.press/v75/cheng18a/cheng18a.pdf},
  pages = {300--323},
  editor = {Bubeck, S\'ebastien and Perchet, Vianney and Rigollet, Philippe},
  volume = {75},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR}
}

@inproceedings{abgmv-bbwsabai-18,
  author = {Yasin Abbasi-Yadkori and Peter L.~Bartlett and Victor
Gabillon and Alan Malek and Michal Valko},
  title = {Best of Both Worlds:   Stochastic and Adversarial Best-Arm
Identification},
  year = {2018},
  abstract = {We study bandit best-arm identification with arbitrary and
potentially adversarial rewards. A simple random uniform learner
obtains the optimal rate of error in the adversarial scenario.
However, this type of strategy is suboptimal when the rewards are
sampled stochastically. Therefore, we ask: Can we design a learner
that performs optimally in both the stochastic and adversarial
problems while not being aware of the nature of the rewards? First, we
show that designing such a learner is impossible in general. In
particular, to be robust to adversarial rewards, we can only guarantee
optimal rates of error on a subset of the stochastic problems. We give
a lower bound that characterizes the optimal rate in stochastic
problems if the strategy is constrained to be robust to adversarial
rewards.  Finally, we design a simple parameter-free algorithm and
show that its probability of error matches (up to log factors) the
lower bound in stochastic problems, and it is also robust to
adversarial ones.},
  booktitle = {Proceedings of the 31st Conference on Learning Theory (COLT2018)},
  url = {http://proceedings.mlr.press/v75/abbasi-yadkori18a},
  pdf = {http://proceedings.mlr.press/v75/abbasi-yadkori18a/abbasi-yadkori18a.pdf},
  pages = {918--949},
  editor = {Bubeck, S\'ebastien and Perchet, Vianney and Rigollet, Philippe},
  volume = {75},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR}
}

@inproceedings{bpfbj-gseasgec-18,
  author = {Kush Bhatia and Aldo Pacchiano and Nicolas Flammarion and
Peter L.~Bartlett and Michael I.~Jordan},
  title = {Gen-{O}ja: Simple and Efficient Algorithm for Streaming
Generalized Eigenvector Computation},
  year = {2018},
  booktitle = {Advances in Neural Information Processing Systems 31},
  publisher = {Curran Associates, Inc.},
  abstract = {In this paper, we study the problems of principle
Generalized Eigenvector computation and Canonical Correlation Analysis
in the stochastic setting. We propose a simple and efficient algorithm
for these problems. We prove the global convergence of our algorithm,
borrowing ideas from the theory of fast-mixing Markov chains and
two-Time-Scale Stochastic Approximation, showing that it achieves the
optimal rate of convergence. In the process, we develop tools for
understanding stochastic processes with Markovian noise which might be
of independent interest.},
  editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
  pages = {7016--7025},
  url = {http://papers.nips.cc/paper/7933-gen-oja-simple-efficient-algorithm-for-streaming-generalized-eigenvector-computation.pdf}
}

@techreport{ccabj-scrldns-18,
  title = {Sharp Convergence Rates for {L}angevin Dynamics in the
Nonconvex Setting},
  author = {Xiang Cheng and Niladri S.~Chatterji and Yasin
Abbasi-Yadkori and Peter L.~Bartlett and Michael I.~Jordan},
  institution = {arxiv.org},
  number = {arXiv:1805.01648 [stat.ML]},
  year = {2018},
  url = {https://arxiv.org/abs/1805.01648}
}

@article{bhlm-ntvcdpdbfplnn-19,
  author = {Peter L. Bartlett and Nick Harvey and Christopher Liaw and Abbas Mehrabian},
  title = {Nearly-tight {VC}-dimension and Pseudodimension Bounds for Piecewise Linear Neural Networks},
  journal = {Journal of Machine Learning Research},
  year = {2019},
  volume = {20},
  number = {63},
  pages = {1-17},
  url = {http://jmlr.org/papers/v20/17-612.html}
}

@inproceedings{bgv-aspfaaoumlsa-19,
  author = {Peter L.~Bartlett and Victor Gabillon and Michal Valko},
  title = {A simple parameter-free and adaptive approach to optimization
under a minimal local smoothness assumption},
  booktitle = {Proceedings of the 30th International Conference on Algorithmic Learning Theory},
  pages = {184--206},
  year = {2019},
  editor = {Garivier, Aur\'elien and Kale, Satyen},
  volume = {98},
  series = {Proceedings of Machine Learning Research},
  address = {Chicago, Illinois},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v98/bartlett19a/bartlett19a.pdf},
  url = {http://proceedings.mlr.press/v98/bartlett19a.html},
  abstract = {We study the problem of optimizing a function under
a \emph{budgeted number of evaluations}. We only assume that the
function is \emph{locally} smooth around one of its global optima.
The difficulty of optimization is measured in terms of 1) the amount
of \emph{noise} $b$ of the function evaluation and 2)	the local
smoothness, $d$, of the function. A smaller $d$ results in smaller
optimization error. We come with a new, simple, and parameter-free
approach. First, for all values of $b$ and $d$, this approach
recovers at least the state-of-the-art regret guarantees. Second,
our approach additionally obtains these results while being
\textit{agnostic} to the values of both $b$ and $d$. This leads to
the first algorithm that naturally adapts to an \textit{unknown}
range of noise $b$ and leads to significant improvements in a
moderate and low-noise regime. Third, our approach also obtains a
remarkable improvement over the state-of-the-art SOO algorithm when
the noise is very low which includes the case of optimization under
deterministic feedback ($b=0$).	There, under our minimal local
smoothness assumption, this improvement is of exponential magnitude
and holds for a class of functions that covers the vast majority of
functions that practitioners optimize ($d=0$). We  show that our
algorithmic improvement is borne out in  experiments as we
empirically show faster convergence on common benchmarks.}
}

@inproceedings{mpbkbw-dfmfpogflqs-19,
  title = {Derivative-Free Methods for Policy Optimization:
Guarantees for Linear Quadratic Systems},
  author = {Dhruv Malik and Ashwin Pananjady and Kush Bhatia and
Koulik Khamaru and Peter L. Bartlett and Martin J. Wainwright},
  booktitle = {Proceedings of the 22nd International Conference on Artificial
Intelligence and Statistics (AISTATS)},
  pages = {2916--2925},
  year = {2019},
  editor = {Chaudhuri, Kamalika and Sugiyama, Masashi},
  volume = {89},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v89/malik19a/malik19a.pdf},
  url = {http://proceedings.mlr.press/v89/malik19a.html},
  abstract = {We study derivative-free methods for policy
optimization over the class of linear policies. We focus on
characterizing the convergence rate of a canonical stochastic,
two-point, derivative-free method for linear-quadratic systems in
which the initial state of the system is drawn at random. In
particular, we show that for problems with effective dimension $D$,
such a method converges to an $\epsilon$-approximate solution within
$\widetilde{\mathcal{O}}(D/\epsilon)$ steps, with multiplicative
pre-factors that are explicit lower-order polynomial terms in the
curvature parameters of the problem. Along the way, we also derive
stochastic zero-order rates for a class of non-convex optimization
problems.}
}

@inproceedings{mrsb-bmwrmsosl-19,
  title = {Best of many worlds: Robust model selection for online
supervised learning},
  author = {Vidya Muthukumar and Mitas Ray and
Anant Sahai and Peter L. Bartlett},
  booktitle = {Proceedings of the 22nd International Conference on Artificial
Intelligence and Statistics (AISTATS)},
  pages = {3177--3186},
  year = {2019},
  editor = {Chaudhuri, Kamalika and Sugiyama, Masashi},
  volume = {89},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v89/muthukumar19a/muthukumar19a.pdf},
  url = {http://proceedings.mlr.press/v89/muthukumar19a.html},
  abstract = {We introduce algorithms for online, full-information
prediction that   are computationally efficient and competitive with
contextual tree experts of unknown complexity, in both probabilistic
and adversarial settings.   We incorporate a novel probabilistic
framework of structural risk minimization into existing adaptive
algorithms and show that we can robustly learn not only the presence
of stochastic structure when it exists, but also the correct model
order. When the stochastic data is actually realized from a
predictor in the model class considered, we obtain regret bounds
that are competitive with the regret of an optimal algorithm that
possesses strong side information about both the true model order
and whether the process generating the data is stochastic or
adversarial. In cases where the data does not arise from any of the
models, our algorithm selects models of higher order as we play more
rounds.   We display empirically improved \textit{overall prediction
error} over other adversarially robust approaches.}
}

@inproceedings{pcb-olkl-19,
  title = {Online learning with kernel losses},
  author = {Chatterji, Niladri and Pacchiano, Aldo and Bartlett, Peter},
  booktitle = {Proceedings of the 36th International Conference on Machine Learning},
  pages = {971--980},
  year = {2019},
  editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  volume = {97},
  series = {Proceedings of Machine Learning Research},
  address = {Long Beach, California, USA},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v97/chatterji19a/chatterji19a.pdf},
  url = {http://proceedings.mlr.press/v97/chatterji19a.html},
  abstract = {We present a generalization of the adversarial
linear bandits framework, where the underlying losses are kernel
functions (with an associated reproducing kernel Hilbert space)
rather than linear functions. We study a version of the exponential
weights algorithm and bound its regret in this setting. Under
conditions on the eigen-decay of the kernel we provide a sharp
characterization of the regret for this algorithm. When we have
polynomial eigen-decay ($\mu_j \le \mathcal{O}(j^{-\beta})$), we
find that the regret is bounded by $\mathcal{R}_n \le
\mathcal{O}(n^{\beta/2(\beta-1)})$. While under the assumption of
exponential eigen-decay ($\mu_j \le \mathcal{O}(e^{-\beta j })$) we
get an even tighter bound on the regret $\mathcal{R}_n \le
\tilde{\mathcal{O}}(n^{1/2})$. When the eigen-decay is polynomial we
also show a \emph{non-matching} minimax lower bound on the regret of
$\mathcal{R}_n \ge \Omega(n^{(\beta+1)/2\beta})$ and a lower bound
of $\mathcal{R}_n \ge \Omega(n^{1/2})$ when the decay in the
eigen-values is exponentially fast. We also study the full
information setting when the underlying losses are kernel functions
and present an adapted exponential weights algorithm and a
conditional gradient descent algorithm.}
}

@inproceedings{bgv-sfapdddr-19,
  author = {Peter L.~Bartlett and Victor Gabillon and Jennifer Healey
and Michal Valko},
  title = {Scale-free adaptive planning for deterministic dynamics and
discounted rewards},
  booktitle = {Proceedings of the 36th International Conference on Machine Learning},
  pages = {495--504},
  year = {2019},
  editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  volume = {97},
  series = {Proceedings of Machine Learning Research},
  address = {Long Beach, California, USA},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v97/bartlett19a/bartlett19a.pdf},
  url = {http://proceedings.mlr.press/v97/bartlett19a.html},
  abstract = {We address the problem of planning in an environment
with deterministic dynamics and stochastic discounted rewards under
a limited numerical budget where the ranges of both rewards and
noise are unknown. We introduce PlaTypOOS, an adaptive, robust, and
efficient alternative to the OLOP (open-loop optimistic planning)
algorithm. Whereas OLOP requires a priori knowledge of the ranges of
both rewards and noise, PlaTypOOS dynamically adapts its behavior to
both. This allows PlaTypOOS to be immune to two vulnerabilities of
OLOP: failure when given underestimated ranges of noise and rewards
and inefficiency when these are overestimated. PlaTypOOS
additionally adapts to the global smoothness of the value function.
PlaTypOOS acts in a provably more efficient manner vs. OLOP when
OLOP is given an overestimated reward and show that in the case of
no noise, PlaTypOOS learns exponentially faster.}
}

@inproceedings{abblsw-prbpiep-19,
  title = {{POLITEX}: Regret Bounds for Policy Iteration using Expert Prediction},
  author = {Abbasi-Yadkori, Yasin and Bartlett, Peter L. and Bhatia, Kush and Lazic, Nevena and Szepesvari, Csaba and Weisz, Gellert},
  booktitle = {Proceedings of the 36th International Conference on Machine Learning},
  pages = {3692--3702},
  year = {2019},
  editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  volume = {97},
  series = {Proceedings of Machine Learning Research},
  address = {Long Beach, California, USA},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v97/lazic19a/lazic19a.pdf},
  url = {http://proceedings.mlr.press/v97/lazic19a.html},
  abstract = {We present POLITEX (POLicy ITeration with EXpert
advice), a variant of policy iteration where each policy is a
Boltzmann distribution over the sum of action-value function
estimates of the previous policies, and analyze its regret in
continuing RL problems. We assume that the value function error
after running a policy for $\tau$ time steps scales as
$\epsilon(\tau) = \epsilon_0 + O(\sqrt{d/\tau})$, where $\epsilon_0$
is the worst-case approximation error and $d$ is the number of
features in a compressed representation of the state-action space.
We establish that this condition is satisfied by the LSPE algorithm
under certain assumptions on the MDP and policies. Under the error
assumption, we show that the regret of POLITEX in uniformly mixing
MDPs scales as $O(d^{1/2}T^{3/4} + \epsilon_0T)$, where $O(\cdot)$
hides logarithmic terms and problem-dependent constants. Thus, we
provide the first regret bound for a fully practical model-free
method which only scales in the number of features, and not in the
size of the underlying MDP. Experiments on a queuing problem confirm
that POLITEX is competitive with some of its alternatives, while
preliminary results on Ms Pacman (one of the standard Atari
benchmark problems) confirm the viability of POLITEX beyond linear
function approximation.}
}

@inproceedings{yckb-daspabrdl-19,
  author = {Dong Yin and Yudong Chen and Ramchandran Kannan and
Peter L.~Bartlett},
  title = {Defending Against Saddle Point Attack in {B}yzantine-Robust Distributed Learning},
  booktitle = {Proceedings of the 36th International Conference on Machine Learning},
  pages = {7074--7084},
  year = {2019},
  editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  volume = {97},
  series = {Proceedings of Machine Learning Research},
  address = {Long Beach, California, USA},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v97/yin19a/yin19a.pdf},
  url = {http://proceedings.mlr.press/v97/yin19a.html},
  abstract = {We study robust distributed learning that involves
minimizing a non-convex loss function with saddle points. We
consider the Byzantine setting where some worker machines have
abnormal or even arbitrary and adversarial behavior, and in this
setting, the Byzantine machines may create fake local minima near a
saddle point that is far away from any true local minimum, even when
robust gradient estimators are used. We develop ByzantinePGD, a
robust first-order algorithm that can provably escape saddle points
and fake local minima, and converge to an approximate true local
minimizer with low iteration complexity. As a by-product, we give a
simpler algorithm and analysis for escaping saddle points in the
usual non-Byzantine setting. We further discuss three robust
gradient estimators that can be used in ByzantinePGD, including
median, trimmed mean, and iterative filtering. We characterize their
performance in concrete statistical settings, and argue for their
near-optimality in low and high dimensional regimes.}
}

@inproceedings{ykb-rcarg-19,
  author = {Dong Yin and Ramchandran Kannan and Peter L.~Bartlett},
  title = {Rademacher Complexity for Adversarially Robust
Generalization},
  booktitle = {Proceedings of the 36th International Conference on Machine Learning},
  pages = {7085--7094},
  year = {2019},
  editor = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  volume = {97},
  series = {Proceedings of Machine Learning Research},
  address = {Long Beach, California, USA},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v97/yin19b/yin19b.pdf},
  url = {http://proceedings.mlr.press/v97/yin19b.html},
  abstract = {Many machine learning models are vulnerable to
adversarial attacks; for example, adding adversarial perturbations
that are imperceptible to humans can often make machine learning
models produce wrong predictions with high confidence; moreover,
although we may obtain robust models on the training dataset via
adversarial training, in some problems the learned models cannot
generalize well to the test data. In this paper, we focus on
$\ell_\infty$ attacks, and study the adversarially robust
generalization problem through the lens of Rademacher complexity.
For binary linear classifiers, we prove tight bounds for the
adversarial Rademacher complexity, and show that the adversarial
Rademacher complexity is never smaller than its natural counterpart,
and it has an unavoidable dimension dependence, unless the weight
vector has bounded $\ell_1$ norm, and our results also extend to
multi-class linear classifiers; in addition, for (nonlinear) neural
networks, we show that the dimension dependence in the adversarial
Rademacher complexity also exists. We further consider a surrogate
adversarial loss for one-hidden layer ReLU network and prove margin
bounds for this setting. Our results indicate that having $\ell_1$
norm constraints on the weight matrices might be a potential way to
improve generalization in the adversarial setting. We demonstrate
experimental results that validate our theoretical findings.}
}

@techreport{bmdbj-brnv-19,
  title = {Bayesian Robustness: A Nonasymptotic Viewpoint},
  author = {Kush Bhatia and Yi-An Ma and Anca D. Dragan and Peter L. Bartlett and Michael I. Jordan},
  year = {2019},
  institution = {arXiv},
  number = {1907.11826}
}

@techreport{mhwbj-sbmm-19,
  author = {Wenlong Mou and Nhat Ho and Martin J. Wainwright and
Peter L.~Bartlett and Michael I.~Jordan},
  title = {Sampling for {B}ayesian Mixture Models: {MCMC} with Polynomial-Time Mixing},
  institution = {arXiv},
  number = {1912.05153},
  year = {2019},
  abstract = {We study the problem of sampling from the power posterior
distribution in Bayesian Gaussian mixture models. The power posterior
is a robust version of the classical posterior; it is known to be
non-log-concave and multi-modal, which leads to exponential mixing
times for some standard MCMC algorithms. We introduce and study the
Reflected Metropolis-Hastings Random Walk (RMRW) algorithm for
sampling from the power posterior distribution of two-component
location Gaussian mixtures. For symmetric mixtures with mean
parameters at $−\theta_0$ and $\theta_0$, we prove that its mixing
time is bounded as $d^{1.5}(d + \|\theta_0\|^2)^{4.5}$
as long as the sample size $n$ is of the order $d(d +
\|\theta_0\|^2)$.
Notably, this result requires no conditions on the separation of the
two means $\theta_0$ and $-\theta_0$. En route to proving this bound,
we establish some new results, of possible independent interest, that
allow for combining Poincare inequalities for conditional and marginal
densities.}
}

@inproceedings{cfb-fmesgr-19,
  title = {Fast Mean Estimation with Sub-Gaussian Rates},
  author = {Cherapanamjeri, Yeshwanth and Flammarion, Nicolas and Bartlett, Peter L.},
  booktitle = {Proceedings of the Thirty-Second Conference on Learning Theory},
  pages = {786--806},
  year = {2019},
  editor = {Beygelzimer, Alina and Hsu, Daniel},
  volume = {99},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v99/cherapanamjeri19b/cherapanamjeri19b.pdf},
  url = {http://proceedings.mlr.press/v99/cherapanamjeri19b.html},
  abstract = {We propose an estimator for the mean of a random
  vector in $\mathbb{R}^d$ that can be computed in time
  $O(n^{3.5}+n^2d)$ for $n$ i.i.d. samples and that has error bounds
  matching the sub-Gaussian case. The only assumptions we make about
  the data distribution are that it has finite mean and covariance; in
  particular, we make no assumptions about higher-order moments. Like
  the polynomial time estimator introduced by Hopkins (2018), which is
  based on the sum-of-squares hierarchy, our estimator achieves
  optimal statistical efficiency in this challenging setting, but it
  has a significantly faster runtime and a simpler analysis.}
}

@inproceedings{cb-tmcwh-19,
  author = {Yeshwanth Cherapanamjeri and Peter L.~Bartlett},
  title = {Testing {M}arkov Chains Without Hitting},
  year = {2019},
  abstract = {We study the problem of identity testing of Markov chains.
In this setting, we are given access to a single trajectory from a
Markov chain with unknown transition matrix $Q$ and the goal is to
determine whether $Q = P$ for some known matrix $P$ or Dist$(P ,
Q)\ge\epsilon$, where Dist is suitably defined. In recent work by
Daskalakis et al.~(2018a), it was shown that it is possible to
distinguish between the two cases provided the length of the observed
trajectory is at least super-linear in the hitting time of $P$, which
may be arbitrarily large.  In this paper, we propose an algorithm that
avoids this dependence on hitting time, thus enabling efficient
testing of Markov chains even in cases where it is infeasible to
observe every state in the chain. Our algorithm is based on combining
classical ideas from approximation algorithms with techniques for the
spectral analysis of Markov chains.},
  booktitle = {Proceedings of the 32nd Conference on Learning Theory (COLT2019)},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  pages = {758--785},
  editor = {Beygelzimer, Alina and Hsu, Daniel},
  volume = {99},
  pdf = {http://proceedings.mlr.press/v99/cherapanamjeri19a/cherapanamjeri19a.pdf},
  url = {http://proceedings.mlr.press/v99/cherapanamjeri19a.html}
}

@article{bhl-gdiielpdltdrn-19,
  author = {Peter L.~Bartlett and David P.~Helmbold and Philip M.~Long},
  title = {Gradient descent with identity initialization
efficiently learns positive definite linear transformations
by deep residual networks},
  year = {2019},
  journal = {Neural Computation},
  volume = {31},
  pages = {477--502}
}

@article{mfwb-esanscp-22,
  author = {Wenlong Mou and Nicolas Flammarion and Martin J.~Wainwright and Peter L.~Bartlett},
  title = {An Efficient Sampling Algorithm for Non-smooth Composite Potentials},
  journal = {Journal of Machine Learning Research},
  year = {2022},
  volume = {23},
  number = {233},
  pages = {1--50},
  url = {http://jmlr.org/papers/v23/20-527.html}
}

@techreport{mfwb-esanscp-19,
  title = {An Efficient Sampling Algorithm for Non-smooth Composite Potentials},
  author = {Wenlong Mou and Nicolas Flammarion and Martin J. Wainwright and Peter L. Bartlett},
  year = {2019},
  institution = {arXiv},
  number = {1910.00551}
}

@techreport{tb-borr-20,
  title = {Benign overfitting in ridge regression},
  author = {Alexander Tsigler and Peter L.~Bartlett},
  year = {2020},
  abstract = {Classical learning theory suggests that strong regularization is needed to learn a class with large complexity. This intuition is in contrast with the modern practice of machine learning, in particular learning neural networks, where the number of parameters often exceeds the number of data points. It has been observed empirically that such overparametrized models can show good generalization performance even if trained with vanishing or negative regularization. The aim of this work is to understand theoretically how this effect can occur, by studying the setting of ridge regression. We provide non-asymptotic generalization bounds for overparametrized ridge regression that depend on the arbitrary covariance structure of the data, and show that those bounds are tight for a range of regularization parameter values. To our knowledge this is the first work that studies overparametrized ridge regression in such a general setting. We identify when small or negative regularization is sufficient for obtaining small generalization error. On the technical side, our bounds only require the data vectors to be i.i.d. sub-gaussian, while most previous work assumes independence of the components of those vectors.},
  institution = {arxiv.org},
  number = {arXiv:2009.14286},
  url = {https://arxiv.org/abs/2009.14286}
}

@article{tb-borr-23,
  author = {Alexander Tsigler and Peter L. Bartlett},
  title = {Benign overfitting in ridge regression},
  journal = {Journal of Machine Learning Research},
  year = {2023},
  volume = {24},
  number = {123},
  pages = {1--76},
  url = {http://jmlr.org/papers/v24/22-1398.html},
  note = {(See also arXiv:2009.14286)}
}

@inproceedings{bpbdw-plamc-20,
  title = {Preference learning along multiple criteria: A game-theoretic
perspective},
  author = {Kush Bhatia and Ashwin Pananjady and Peter L.~Bartlett and
Anca Dragan and Martin Wainwright},
  year = {2020},
  abstract = {The literature on ranking from ordinal data is vast, and
there are several ways to aggregate overall preferences from pairwise
comparisons between objects. In particular, it is well-known that any
Nash equilibrium of the zero-sum game induced by the preference matrix
defines a natural solution concept (winning distribution over objects)
known as a von Neumann winner. Many real-world problems, however, are
inevitably multi-criteria, with different pairwise preferences
governing the different criteria. In this work, we generalize the
notion of a von Neumann winner to the multi-criteria setting by taking
inspiration from Blackwell’s approachability. Our framework allows for
non-linear aggregation of preferences across criteria, and generalizes
the linearization-based approach from multi-objective optimization.
From a theoretical standpoint, we show that the Blackwell winner of a
multi-criteria problem instance can be computed as the solution to a
convex optimization problem. Furthermore, given random samples of
pairwise comparisons, we show that a simple, "plug-in" estimator
achieves (near-)optimal minimax sample complexity. Finally, we
showcase the practical utility of our framework in a user study on
autonomous driving, where we find that the Blackwell winner
outperforms the von Neumann winner for the overall preferences.},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
  pages = {7413--7424},
  publisher = {Curran Associates, Inc.},
  url = {https://proceedings.neurips.cc/paper/2020/file/52f4691a4de70b3c441bca6c546979d9-Paper.pdf},
  volume = {33}
}

@article{mpbkbw-dfmfpogflqs-20,
  title = {Derivative-Free Methods for Policy Optimization: Guarantees
for Linear Quadratic Systems},
  author = {Dhruv Malik and Ashwin Pananjady and Kush Bhatia and
Koulik Khamaru and Peter L.~Bartlett and Martin J.~Wainwright},
  journal = {Journal of Machine Learning Research},
  year = {2020},
  volume = {21},
  number = {21},
  pages = {1-51},
  url = {http://jmlr.org/papers/v21/19-198.html}
}

@inproceedings{cmb-oasoa-20,
  title = {{OSOM}: A Simultaneously Optimal Algorithm for Multi-Armed and Linear
Contextual Bandits},
  author = {Niladri Chatterji and Vidya Muthukumar and Peter L.~Bartlett},
  year = {2020},
  booktitle = {Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics},
  pages = {1844--1854},
  editor = {Chiappa, Silvia and Calandra, Roberto},
  volume = {108},
  series = {Proceedings of Machine Learning Research},
  month = {26--28 Aug},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v108/chatterji20b/chatterji20b.pdf},
  url = {http://proceedings.mlr.press/v108/chatterji20b.html},
  abstract = {We consider the stochastic linear (multi-armed)
  contextual bandit problem with the possibility of hidden simple
  multi-armed bandit structure in which the rewards are independent of
  the contextual information. Algorithms that are designed solely for
  one of the regimes are known to be sub-optimal for their alternate
  regime. We design a single computationally efficient algorithm that
  simultaneously obtains problem-dependent optimal regret rates in the
  simple multi-armed bandit regime and minimax optimal regret rates in
  the linear contextual bandit regime, without knowing a priori which
  of the two models generates the rewards. These results are proved
  under the condition of stochasticity of contextual information over
  multiple rounds. Our results should be viewed as a step towards
  principled data-dependent policy class selection for contextual
  bandits.}
}

@inproceedings{cdbj-lmcws-20,
  title = {{L}angevin {M}onte {C}arlo without Smoothness},
  author = {Chatterji, Niladri and Diakonikolas, Jelena and
  Jordan, Michael I. and Bartlett, Peter L.},
  booktitle = {Proceedings of the Twenty Third International Conference on Artificial Intelligence and Statistics},
  pages = {1716--1726},
  year = {2020},
  editor = {Chiappa, Silvia and Calandra, Roberto},
  volume = {108},
  series = {Proceedings of Machine Learning Research},
  month = {26--28 Aug},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v108/chatterji20a/chatterji20a.pdf},
  url = {http://proceedings.mlr.press/v108/chatterji20a.html},
  abstract = {Langevin Monte Carlo (LMC) is an iterative algorithm
  used to generate samples from a distribution that is known only up
  to a normalizing constant. The nonasymptotic dependence of its
  mixing time on the dimension and target accuracy is understood
  mainly in the setting of smooth (gradient-Lipschitz) log-densities,
  a serious limitation for applications in machine learning. In this
  paper, we remove this limitation, providing polynomial-time
  convergence guarantees for a variant of LMC in the setting of
  nonsmooth log-concave distributions. At a high level, our results
  follow by leveraging the implicit smoothing of the log-density that
  comes from a small  Gaussian perturbation that we add to the
  iterates of the algorithm and controlling the bias and variance that
  are induced by this perturbation.}
}

@article{bllt-bolr-20,
  author = {Peter L.~Bartlett and Philip M.~Long and G\'{a}bor Lugosi
and Alexander Tsigler},
  title = {Benign Overfitting in Linear Regression},
  journal = {Proceedings of the National Academy of Sciences},
  note = {(arXiv:1906.11300)},
  year = {2020},
  volume = {117},
  number = {48},
  pages = {30063--30070},
  doi = {10.1073/pnas.1907378117},
  publisher = {National Academy of Sciences},
  abstract = {The phenomenon of benign overfitting is one of the key mysteries uncovered by deep learning methodology: deep neural networks seem to predict well, even with a perfect fit to noisy training data. Motivated by this phenomenon, we consider when a perfect fit to training data in linear regression is compatible with accurate prediction. We give a characterization of linear regression problems for which the minimum norm interpolating prediction rule has near-optimal prediction accuracy. The characterization is in terms of two notions of the effective rank of the data covariance. It shows that overparameterization is essential for benign overfitting in this setting: the number of directions in parameter space that are unimportant for prediction must significantly exceed the sample size. By studying examples of data covariance properties that this characterization shows are required for benign overfitting, we find an important role for finite-dimensional data: the accuracy of the minimum norm interpolating prediction rule approaches the best possible accuracy for a much narrower range of properties of the data distribution when the data lie in an infinite-dimensional space vs. when the data lie in a finite-dimensional space with dimension that grows faster than the sample size.},
  issn = {0027-8424},
  url = {https://www.pnas.org/content/early/2020/04/22/1907378117},
  eprint = {https://www.pnas.org/content/early/2020/04/22/1907378117.full.pdf}
}

@inproceedings{pmmb-otsla-20,
  title = {On Approximate {T}hompson Sampling with {L}angevin Algorithms},
  author = {Mazumdar, Eric and Pacchiano, Aldo and Ma, Yian and Jordan, Michael and Bartlett, Peter},
  booktitle = {Proceedings of the 37th International Conference on Machine Learning},
  pages = {6797--6807},
  year = {2020},
  editor = {Hal {Daum\'e} III and Aarti Singh},
  volume = {119},
  series = {Proceedings of Machine Learning Research},
  month = {13--18 Jul},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v119/mazumdar20a/mazumdar20a.pdf},
  url = {http://proceedings.mlr.press/v119/mazumdar20a.html},
  abstract = {Thompson sampling for multi-armed bandit problems is
  known to enjoy favorable performance in both theory and practice.
  However, its wider deployment is restricted due to a significant
  computational limitation: the need for samples from posterior
  distributions at every iteration. In practice, this limitation is
  alleviated by making use of approximate sampling methods, yet
  provably incorporating approximate samples into Thompson Sampling
  algorithms remains an open problem. In this work we address this by
  proposing two efficient Langevin MCMC algorithms tailored to
  Thompson sampling. The resulting approximate Thompson Sampling
  algorithms are efficiently implementable and provably achieve
  optimal instance-dependent regret for the Multi-Armed Bandit (MAB)
  problem. To prove these results we derive novel posterior
  concentration bounds and MCMC convergence rates for log-concave
  distributions which may be of independent interest.}
}

@inproceedings{lpbj-ampermi-20,
  author = {Jonathan Lee and Aldo Pacchiano and Peter L.~Bartlett and
Michael I.~Jordan},
  title = {Accelerated Message Passing for Entropy-Regularized {MAP} Inference},
  booktitle = {Proceedings of the 37th International Conference on Machine Learning},
  pages = {5736--5746},
  year = {2020},
  editor = {Hal {Daum\'e} III and Aarti Singh},
  volume = {119},
  series = {Proceedings of Machine Learning Research},
  month = {13--18 Jul},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v119/lee20e/lee20e.pdf},
  url = {http://proceedings.mlr.press/v119/lee20e.html},
  abstract = {Maximum a posteriori (MAP) inference in discrete-valued Markov random fields is a fundamental problem in machine learning that involves identifying the most likely configuration of random variables given a distribution. Due to the difficulty of this combinatorial problem, linear programming (LP) relaxations are commonly used to derive specialized message passing algorithms that are often interpreted as coordinate descent on the dual LP. To achieve more desirable computational properties, a number of methods regularize the LP with an entropy term, leading to a class of smooth message passing algorithms with convergence guarantees. In this paper, we present randomized methods for accelerating these algorithms by leveraging techniques that underlie classical accelerated gradient methods. The proposed algorithms incorporate the familiar steps of standard smooth message passing algorithms, which can be viewed as coordinate minimization steps. We show that these accelerated variants achieve faster rates for finding $\epsilon$-optimal points of the unregularized problem, and, when the LP is tight, we prove that the proposed algorithms recover the true MAP solution in fewer iterations than standard message passing algorithms.}
}

@inproceedings{pmlr-v119-cheng20e,
  title = {Stochastic Gradient and {L}angevin Processes},
  author = {Cheng, Xiang and Yin, Dong and Bartlett, Peter and Jordan, Michael},
  booktitle = {Proceedings of the 37th International Conference on Machine Learning},
  pages = {1810--1819},
  year = {2020},
  editor = {Hal {Daum\'e} III and Aarti Singh},
  volume = {119},
  series = {Proceedings of Machine Learning Research},
  month = {13--18 Jul},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v119/cheng20e/cheng20e.pdf},
  url = {http://proceedings.mlr.press/v119/cheng20e.html},
  abstract = {We prove quantitative convergence rates at which
  discrete Langevin-like processes converge to the invariant
  distribution of a related stochastic differential equation. We study
  the setup where the additive noise can be non-Gaussian and
  state-dependent and the potential function can be non-convex. We
  show that the key properties of these processes depend on the
  potential function and the second moment of the additive noise. We
  apply our theoretical findings to studying the convergence of
  Stochastic Gradient Descent (SGD) for non-convex problems and
  corroborate them with experiments using SGD to train deep neural
  networks on the CIFAR-10 dataset.}
}

@inproceedings{mfb-sdarhs-20,
  title = {Self-Distillation Amplifies Regularization in {H}ilbert Space},
  author = {Hossein Mobahi and Mehrdad Farajtabar and Peter L.~Bartlett},
  year = {2020},
  booktitle = {Advances in Neural Information Processing Systems 33},
  abstract = {Knowledge distillation introduced in the deep learning
context is a method to transfer knowledge from one architecture to
another. In particular, when the architectures are identical, this is
called self-distillation. The idea is to feed in predictions of the
trained model as new target values for retraining (and iterate this
loop possibly a few times). It has been empirically observed that the
self-distilled model often achieves higher accuracy on held out data.
Why this happens, however, has been a mystery: the self-distillation
dynamics does not receive any new information about the task and
solely evolves by looping over training. To the best of our knowledge,
there is no rigorous understanding of why this happens. This work
provides the first theoretical analysis of self-distillation. We focus
on fitting a nonlinear function to training data, where the model
space is Hilbert space and fitting is subject to L2 regularization in
this function space. We show that self-distillation iterations modify
regularization by progressively limiting the number of basis functions
that can be used to represent the solution. This implies (as we also
verify empirically) that while a few rounds of self-distillation may
reduce over-fitting, further rounds may lead to under-fitting and thus
worse performance.},
  booktitle = {Advances in Neural Information Processing Systems},
  editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
  pages = {3351--3361},
  publisher = {Curran Associates, Inc.},
  url = {https://proceedings.neurips.cc/paper/2020/file/2288f691b58edecadcc9a8691762b4fd-Paper.pdf},
  volume = {33}
}

@techreport{catjfb-orlrnlt-20,
  title = {Optimal Robust Linear Regression in Nearly Linear Time},
  author = {Yeshwanth Cherapanamjeri and Efe Aras and Nilesh Tripuraneni and Michael I. Jordan and Nicolas Flammarion and Peter L. Bartlett},
  year = {2020},
  institution = {arXiv},
  number = {2007.08137}
}

@inproceedings{mlwbj-fgalsaa-20,
  title = {On Linear Stochastic Approximation: Fine-grained
  {P}olyak-{R}uppert and Non-Asymptotic Concentration},
  author = {Mou, Wenlong and Li, Chris Junchi and Wainwright, Martin J
  and Bartlett, Peter L and Jordan, Michael I},
  booktitle = {Proceedings of Thirty Third Conference on Learning Theory},
  pages = {2947--2997},
  year = {2020},
  editor = {Jacob Abernethy and Shivani Agarwal},
  volume = {125},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v125/mou20a/mou20a.pdf},
  url = {http://proceedings.mlr.press/v125/mou20a.html},
  abstract = { We undertake a precise study of the asymptotic and
  non-asymptotic properties of stochastic approximation procedures
  with Polyak-Ruppert averaging for solving a linear system $\bar{A}
  \theta = \bar{b}$. When the matrix $\bar{A}$ is Hurwitz, we prove a
  central limit theorem (CLT) for the averaged iterates with fixed
  step size and number of iterations going to infinity. The CLT
  characterizes the exact asymptotic covariance matrix, which is the
  sum of the classical Polyak-Ruppert covariance and a correction term
  that scales with the step size. Under assumptions on the tail of the
  noise distribution, we prove a non-asymptotic concentration
  inequality whose main term matches the covariance in CLT in any
  direction, up to universal constants. When the matrix $\bar{A}$ is
  not Hurwitz but only has non-negative real parts in its eigenvalues,
  we prove that the averaged LSA procedure actually achieves an
  $O(1/T)$ rate in mean-squared error. Our results provide a more
  refined understanding of linear stochastic approximation in both the
  asymptotic and non-asymptotic settings. We also show various
  applications of the main results, including the study of
  momentum-based stochastic gradient methods as well as temporal
  difference algorithms in reinforcement learning.}
}

@techreport{bl-fmdgblni-20,
  title = {Failures of model-dependent generalization bounds for least-norm interpolation},
  author = {Peter L. Bartlett and Philip M. Long},
  year = {2020},
  institution = {arxiv.org},
  number = {arXiv:2010.08479},
  url = {https://arxiv.org/abs/2010.08479},
  abstract = {We consider bounds on the generalization performance of the least-norm linear regressor, in the over-parameterized regime where it can interpolate the data. We describe a sense in which any generalization bound of a type that is commonly proved in statistical learning theory must sometimes be very loose when applied to analyze the least-norm interpolant. In particular, for a variety of natural joint distributions on training examples, any valid generalization bound that depends only on the output of the learning algorithm, the number of training examples, and the confidence parameter, and that satisfies a mild condition (substantially weaker than monotonicity in sample size), must sometimes be very loose---it can be bounded below by a constant when the true excess risk goes to zero.}
}

@techreport{ctbj-omewv-20,
  title = {Optimal Mean Estimation without a Variance},
  author = {Yeshwanth Cherapanamjeri and Nilesh Tripuraneni and Peter L. Bartlett and Michael I. Jordan},
  year = {2020},
  institution = {arXiv},
  number = {2011.12433}
}

@article{bmr-dlasp-21,
  author = {Peter L. Bartlett and Andrea Montanari and Alexander
  Rakhlin},
  title = {Deep learning: a statistical viewpoint},
  journal = {Acta Numerica},
  year = {2021},
  volume = {30},
  doi = {10.1017/S0962492921000027},
  publisher = {Cambridge University Press},
  pages = {87–201},
  url = {https://arxiv.org/abs/2103.09177},
  abstract = {The remarkable practical success of deep learning has revealed some
major surprises from a theoretical perspective.  In particular,
simple gradient methods easily find near-optimal solutions to
non-convex optimization problems, and despite giving a near-perfect
fit to training data without any explicit effort to control model
complexity, these methods exhibit excellent predictive accuracy.
We conjecture that specific principles underlie these phenomena: that
overparametrization allows gradient methods to find interpolating
solutions, that these methods implicitly impose regularization,
and that overparametrization leads to benign overfitting, that is,
accurate predictions despite overfitting training data.
 We survey recent theoretical progress
that provides examples illustrating these principles in simpler
settings.  We first review classical uniform convergence results
and why they fall short of explaining aspects of the behavior of
deep learning methods.  We give examples of implicit regularization
in simple settings, where gradient methods lead to minimal norm
functions that perfectly fit the training data.  Then we review
prediction methods that exhibit benign overfitting, focusing
on regression problems with quadratic loss.  For these methods,
we can decompose the prediction rule into a simple component that
is useful for prediction and a spiky component that is useful for
overfitting but, in a favorable setting, does not harm prediction
accuracy.  We focus specifically on the linear regime for neural
networks, where the network can be approximated by a linear model.
In this regime, we demonstrate the success of gradient flow, and
we consider benign overfitting with two-layer networks, giving an
exact asymptotic analysis that precisely demonstrates the impact of
overparametrization.  We conclude by highlighting the key challenges
that arise in extending these insights to realistic deep learning
settings.}
}

@article{bl-fmdgblni-21,
  author = {Peter L. Bartlett and Philip M. Long},
  title = {Failures of Model-dependent Generalization Bounds for Least-norm Interpolation},
  journal = {Journal of Machine Learning Research},
  year = {2021},
  volume = {22},
  number = {204},
  pages = {1--15},
  url = {http://jmlr.org/papers/v22/20-1164.html},
  note = {arXiv:2010.08479}
}

@article{mmbjw-holdyama-21,
  author = {Wenlong Mou and Yi-An Ma and Martin J. Wainwright and
  Peter L. Bartlett and Michael I. Jordan},
  title = {High-Order {L}angevin Diffusion Yields an Accelerated
  {MCMC} Algorithm},
  journal = {Journal of Machine Learning Research},
  year = {2021},
  volume = {22},
  number = {42},
  pages = {1-41},
  url = {http://jmlr.org/papers/v22/20-576.html}
}

@article{mccfbj-itanafgbm-21,
  author = {Yi-An Ma and Niladri S.~Chatterji and Xiang Cheng
and Nicolas Flammarion and Peter L.~Bartlett and Michael I.~Jordan},
  title = {Is There an Analog of {N}esterov Acceleration for
Gradient-Based {MCMC}?},
  year = {2021},
  journal = {Bernoulli},
  volume = {27},
  number = {3},
  pages = {1942--1992},
  doi = {http://dx.doi.org/10.3150/20-BEJ1297},
  abstract = {We formulate gradient-based Markov chain Monte Carlo
(MCMC) sampling as optimization on the space of probability measures,
with Kullback–Leibler (KL) divergence as the objective functional. We
show that an underdamped form of the Langevin algorithm performs
accelerated gradient descent in this metric. To characterize the
convergence of the algorithm, we construct a Lyapunov functional and
exploit hypocoercivity of the underdamped Langevin algorithm. As an
application, we show that accelerated rates can be obtained for a
class of nonconvex functions with the Langevin algorithm.}
}

@inproceedings{bbds-aluu-21,
  author = {Kush Bhatia and Peter L. Bartlett and Anca D. Dragan and Jacob Steinhardt},
  title = {{Agnostic Learning with Unknown Utilities}},
  booktitle = {12th Innovations in Theoretical Computer Science Conference (ITCS 2021)},
  pages = {55:1--55:20},
  series = {Leibniz International Proceedings in Informatics (LIPIcs)},
  isbn = {978-3-95977-177-1},
  issn = {1868-8969},
  year = {2021},
  volume = {185},
  editor = {James R. Lee},
  publisher = {Schloss Dagstuhl--Leibniz-Zentrum f{\"u}r Informatik},
  address = {Dagstuhl, Germany},
  url = {https://drops.dagstuhl.de/opus/volltexte/2021/13594},
  doi = {10.4230/LIPIcs.ITCS.2021.55}
}

@inproceedings{pgbj-sblc-21,
  title = { Stochastic Bandits with Linear Constraints },
  author = {Pacchiano, Aldo and Ghavamzadeh, Mohammad and
  Bartlett, Peter L. and Jiang, Heinrich},
  booktitle = {Proceedings of The 24th International Conference on Artificial Intelligence and Statistics},
  pages = {2827--2835},
  year = {2021},
  editor = {Banerjee, Arindam and Fukumizu, Kenji},
  volume = {130},
  series = {Proceedings of Machine Learning Research},
  month = {13--15 Apr},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v130/pacchiano21a/pacchiano21a.pdf},
  url = {http://proceedings.mlr.press/v130/pacchiano21a.html},
  abstract = { We study a constrained contextual linear bandit setting, where the goal of the agent is to produce a sequence of policies, whose expected cumulative reward over the course of multiple rounds is maximum, and each one of them has an expected cost below a certain threshold. We propose an upper-confidence bound algorithm for this problem, called optimistic pessimistic linear bandit (OPLB), and prove a sublinear bound on its regret that is inversely proportional to the difference between the constraint threshold and the cost of a known feasible action. Our algorithm balances exploration and constraint satisfaction using a novel idea that scales the radii of the reward and cost confidence sets with different scaling factors. We further specialize our results to multi-armed bandits and propose a computationally efficient algorithm for this setting and prove a a regret bound that is better than simply casting multi-armed bandits as an instance of linear bandits and using the regret bound of OPLB. We also prove a lower-bound for the problem studied in the paper and provide simulations to validate our theoretical results. Finally, we show how our algorithm and analysis can be extended to multiple constraints and to the case when the cost of the feasible action is unknown. }
}

@inproceedings{abms-defcc-21,
  title = {Dropout: Explicit Forms and Capacity Control},
  author = {Raman Arora and Peter L.~Bartlett and Poorya Mianjy and
Nathan Srebro},
  year = {2021},
  booktitle = {Proceedings of the 38th International Conference on Machine Learning},
  pages = {351--361},
  editor = {Meila, Marina and Zhang, Tong},
  volume = {139},
  series = {Proceedings of Machine Learning Research},
  month = {18--24 Jul},
  publisher = {PMLR},
  pdf = {http://proceedings.mlr.press/v139/arora21a/arora21a.pdf},
  url = {http://proceedings.mlr.press/v139/arora21a.html},
  abstract = {We investigate the capacity control provided by
  dropout in various machine learning problems. First, we study
  dropout for matrix completion, where it induces a
  distribution-dependent regularizer that equals the weighted
  trace-norm of the product of the factors. In deep learning, we show
  that the distribution-dependent regularizer due to dropout directly
  controls the Rademacher complexity of the underlying class of deep
  neural networks. These developments enable us to give concrete
  generalization error bounds for the dropout algorithm in both matrix
  completion as well as training deep neural networks.}
}

@inproceedings{clb-wdgdlli-21,
  author = {Niladri S.~Chatterji and Philip M.~Long and Peter L.~Bartlett},
  title = {When does gradient descent with logistic loss interpolate
using deep networks with smoothed {ReLU} activations?},
  year = {2021},
  booktitle = {Proceedings of the 34th Conference on Learning Theory
(COLT2021)},
  pages = {927--1027},
  editor = {Belkin, Mikhail and Kpotufe, Samory},
  volume = {134},
  series = {Proceedings of Machine Learning Research},
  url = {https://proceedings.mlr.press/v134/chatterji21a.html},
  abstract = {We establish conditions under which gradient descent
applied to fixed-width deep networks drives the logistic loss to zero,
and prove bounds on the rate of convergence. Our analysis applies for
smoothed approximations to the ReLU, such as Swish and the Huberized
ReLU, proposed in previous applied work. We provide two sufficient
conditions for convergence. The first is simply a bound on the loss at
initialization. The second is a data separation condition used in
prior analyses.}
}

@inproceedings{psab-tdfualc-21,
  author = {Juan Perdomo and Max Simchowitz and Alekh Agarwal and
Peter L.~Bartlett},
  title = {Towards a Dimension-Free Understanding of Adaptive Linear
Control},
  year = {2021},
  booktitle = {Proceedings of the 34th Conference on Learning Theory
(COLT2021)},
  abstract = {We study the problem of adaptive control of the linear
quadratic regulator for systems in very high, or even infinite
dimension. We demonstrate that while sublinear regret requires finite
dimensional inputs,  the ambient state dimension of the system need
not be small in order to perform online control. We provide the first
regret bounds for LQR which hold for infinite dimensional systems,
replacing dependence on ambient dimension with more natural notions of
problem complexity. Our guarantees arise from a novel perturbation
bound for certainty equivalence which scales with the prediction
error in estimating the system parameters, without requiring
consistent parameter recovery in more stringent measures like the
operator norm. When specialized to finite dimensional settings, our
bounds recover near optimal dimension and time horizon dependence.}
}

@techreport{bbc-aemlrrn-21a,
  author = {Peter L.~Bartlett and Sebastien Bubeck and Yeshwanth
Cherapanamjeri},
  title = {Adversarial Examples in Multi-Layer Random {ReLU} Networks},
  year = {2021},
  institution = {arXiv},
  number = {2106.12611},
  abstract = {We consider the phenomenon of adversarial examples in ReLU
networks with independent gaussian parameters. For networks of
constant depth and with a large range of widths (for instance, it
suffices if the width of each layer is polynomial in that of any other
layer), small perturbations of input vectors lead to large changes of
outputs. This generalizes results of Daniely and Schacham (2020) for
networks of rapidly decreasing width and of Bubeck et al (2021) for
two-layer networks. The proof shows that adversarial examples arise in
these networks because the functions that they compute are very close
to linear. Bottleneck layers in the network play a key role: the
minimal width up to some point in the network determines scales and
sensitivities of mappings computed up to that point. The main result
is for networks with constant depth, but we also show that some
constraint on depth is necessary for a result of this kind, because
there are suitably deep networks that, with constant probability,
compute a function that is close to constant.}
}

@inproceedings{bbc-aemlrrn-21,
  author = {Peter L.~Bartlett and Sebastien Bubeck and Yeshwanth
Cherapanamjeri},
  title = {Adversarial Examples in Multi-Layer Random {ReLU} Networks},
  year = {2021},
  booktitle = {Advances in Neural Information Processing Systems 34},
  abstract = {We consider the phenomenon of adversarial examples in ReLU
networks with independent gaussian parameters. For networks of
constant depth and with a large range of widths (for instance, it
suffices if the width of each layer is polynomial in that of any other
layer), small perturbations of input vectors lead to large changes of
outputs. This generalizes results of Daniely and Schacham (2020) for
networks of rapidly decreasing width and of Bubeck et al (2021) for
two-layer networks. The proof shows that adversarial examples arise in
these networks because the functions that they compute are very close
to linear. Bottleneck layers in the network play a key role: the
minimal width up to some point in the network determines scales and
sensitivities of mappings computed up to that point. The main result
is for networks with constant depth, but we also show that some
constraint on depth is necessary for a result of this kind, because
there are suitably deep networks that, with constant probability,
compute a function that is close to constant.}
}

@inproceedings{plbn-nopovr-21,
  author = {Aldo Pacchiano and Jonathan Lee and Peter L.~Bartlett and 
Ofir Nachum},
  title = {Near Optimal Policy Optimization via REPS},
  booktitle = {Advances in Neural Information Processing Systems 34},
  abstract = {Since its introduction a decade ago, relative entropy policy
search (REPS) has demonstrated successful policy learning on a number
of simulated and real-world robotic domains, not to mention providing
algorithmic components used by many recently proposed reinforcement
learning (RL) algorithms. While REPS is commonly known in the
community, there exist no guarantees on its performance when using
stochastic and gradient-based solvers. In this paper we aim to fill
this gap by providing guarantees and convergence rates for the
sub-optimality of a policy learned using first-order optimization
methods applied to the REPS objective. We first consider the setting
in which we are given access to exact gradients and demonstrate how
near-optimality of the objective translates to near-optimality of the
policy. We then consider the practical setting of stochastic
gradients, and introduce a technique that uses generative access to
the underlying Markov decision process to compute parameter updates
that maintain favorable convergence to the optimal regularized
policy.},
  year = {2021}
}

@inproceedings{cpbj-otrlof-21,
  author = {Niladri Chatterji and Aldo Pacchiano
and Peter L.~Bartlett and Michael I.~Jordan},
  title = {On the Theory of Reinforcement Learning with Once-per-Episode
Feedback},
  booktitle = {Advances in Neural Information Processing Systems 34},
  abstract = {We introduce a theory of reinforcement learning (RL) in which the learner receives feedback only once at the end of an episode. While this is an extreme test case for theory, it is also arguably more representative of real-world applications than the traditional requirement in RL practice that the learner receive feedback at every time step. Indeed, in many real-world applications of reinforcement learning, such as self driving cars and robotics, it is easier to evaluate whether a learner's complete trajectory was either `good' or `bad,' but harder to provide a reward signal at each step. To show that learning is possible in this more challenging setting, we study the case where trajectory labels are generated by an unknown parametric model, and provide a statistically and computationally efficient algorithm that achieves sub-linear regret.},
  year = {2021}
}

@article{clb-wdgdllfitl-21,
  author = {Niladri S.~Chatterji and Philip M.~Long and Peter L.~Bartlett},
  title = {When does gradient descent with logistic loss find interpolating two-layer networks?},
  year = {2021},
  journal = {Journal of Machine Learning Research},
  url = {https://arxiv.org/abs/2012.02409},
  abstract = {We study the training of finite-width two-layer smoothed
ReLU networks for binary classification using the logistic loss. We
show that gradient descent drives the training loss to zero if the
initial loss is small enough. When the data satisfies certain cluster
and separation conditions and the network is wide enough, we show that
one step of gradient descent reduces the loss sufficiently that the
first result applies.},
  volume = {22},
  number = {159},
  pages = {1-48}
}

@article{cbl-olbsgsa-21,
  author = {Niladri S.~Chatterji and Peter L.~Bartlett
    and Philip M.~Long},
  title = {Oracle lower bounds for stochastic gradient sampling
algorithms},
  year = {2022},
  journal = {Bernoulli},
  volume = {28},
  number = {2},
  pages = {1074--1092},
  note = {arXiv:2002.00291},
  url = {https://arxiv.org/abs/2002.00291},
  abstract = {We consider the problem of sampling from a strongly
log-concave density in R^d, and prove an information theoretic lower bound on the number of stochastic gradient queries of the log density needed. Several popular sampling algorithms (including many Markov chain Monte Carlo methods) operate by using stochastic gradients of the log density to generate a sample; our results establish an information theoretic limit for all these algorithms.
We show that for every algorithm, there exists a well-conditioned
strongly log-concave target density for which the distribution of
points generated by the algorithm would be at least ε away from the
target in total variation distance if the number of gradient queries
is less than Omega(sigma^2d/epsilon^2), where sigma^2d is the variance of the stochastic
gradient. Our lower bound follows by combining the ideas of Le Cam
deficiency routinely used in the comparison of statistical experiments
along with standard information theoretic tools used in lower bounding
Bayes risk functions. To the best of our knowledge our results provide
the first nontrivial dimension-dependent lower bound for this
problem.}
}

@article{mfwb-ibdldnorwc-21,
  author = {Mou, Wenlong and Flammarion, Nicolas and Wainwright, Martin J.
and Bartlett, Peter L.},
  title = {Improved Bounds for Discretization of {L}angevin Diffusions:
Near-Optimal Rates without Convexity},
  year = {2022},
  journal = {Bernoulli},
  volume = {28},
  number = {3},
  pages = {1577--1601},
  doi = {http://dx.doi.org/10.3150/21-BEJ1343},
  url = {https://arxiv.org/abs/1907.11331},
  abstract = {We present an improved analysis of the Euler-Maruyama discretization of the Langevin diffusion. Our analysis does not require global contractivity, and yields polynomial dependence on the time horizon. Compared to existing approaches, we make an additional smoothness assumption, and improve the existing rate from O(eta) to O(eta^2) in terms of the KL divergence. This result matches the correct order for numerical SDEs, without suffering from exponential time dependence. When applied to algorithms for sampling and learning, this result simultaneously improves all those methods based on Dalayan's approach.}
}

@inproceedings{mpwb-oidgmlsa-22,
  author = {Wenlong Mou and Ashwin Pananjady and Martin J.~Wainwright
and Peter L.~Bartlett},
  title = {Optimal and instance-dependent guarantees for {M}arkovian
      linear stochastic approximation},
  year = {2022},
  abstract = {We study stochastic approximation procedures for approximately solving a $d$-dimensional linear fixed point equation based on observing a trajectory of length $n$ from an ergodic Markov chain.  We first exhibit a non-asymptotic bound of the order $t_{\mathrm{mix}} \tfrac{d}{n}$ on the squared error of the last iterate of a standard scheme, where $t_{\mathrm{mix}}$ is a mixing time.  We then prove a non-asymptotic instance-dependent bound on a suitably averaged sequence of iterates, with a leading term that matches the local asymptotic minimax limit, including sharp dependence on the parameters $(d, t_{\mathrm{mix}})$ in the higher order terms. We complement these upper bounds with a non-asymptotic minimax lower bound that establishes the instance-optimality of the averaged SA estimator. We derive corollaries of these results for policy evaluation with Markov noise---covering the TD($\lambda$) family of algorithms for all $\lambda \in [0, 1)$---and linear autoregressive models. Our instance-dependent characterizations open the door to the design of fine-grained model selection procedures for hyperparameter tuning (e.g., choosing the value of $\lambda$ when running the TD($\lambda$) algorithm).},
  booktitle = {Proceedings of the 35th Conference on Learning Theory (COLT2022)}
}

@inproceedings{fcb-bowl-22,
  author = {Spencer Frei and Niladri Chatterji and Peter L.~Bartlett},
  title = {Benign Overfitting without Linearity: Neural Network
Classifiers Trained by Gradient Descent for Noisy Linear Data},
  abstract = {Benign overfitting, the phenomenon where interpolating models
generalize well in the presence of noisy data, was first observed in
neural network models trained with gradient descent.  To better
understand this empirical observation, we consider the generalization
error of two-layer neural networks trained to interpolation by
gradient descent on the logistic loss following random initialization.
We assume the data comes from well-separated class-conditional
log-concave distributions and allow for a constant fraction of the
training labels to be corrupted by an adversary.  We show that in this
setting, neural networks exhibit benign overfitting: they can be
driven to zero training error, perfectly fitting any noisy training
labels, and simultaneously achieve test error close to the
Bayes-optimal error.   In contrast to previous work on benign
overfitting that require linear or kernel-based predictors, our
analysis holds in a setting where both the model and learning dynamics
are fundamentally nonlinear.},
  year = {2022},
  booktitle = {Proceedings of the 35th Conference on Learning Theory (COLT2022)}
}

@inproceedings{biw-gbddnla-22,
  author = {Peter L.~Bartlett and Piotr Indyk and Tal Wagner},
  title = {Generalization Bounds for Data-Driven Numerical Linear
Algebra},
  abstract = {Data-driven algorithms can adapt their internal structure or parameters to inputs from unknown application-specific distributions, by learning from a training sample of inputs. Several recent works have applied this approach to problems in numerical linear algebra, obtaining significant empirical gains in performance. However, no theoretical explanation for their success was known.
In this work we prove generalization bounds for those algorithms,
within the PAC-learning framework for data-driven algorithm selection
proposed by Gupta and Roughgarden (SICOMP 2017). Our main result is an
almost tight bound on the fat shattering dimension of the
learning-based low rank approximation algorithm of Indyk et
al.~(NeurIPS 2019). Our techniques are general, and provide
generalization bounds for many other recently proposed data-driven
algorithms in numerical linear algebra, covering both sketching-based
and multigrid-based methods. This considerably broadens the class of
data-driven algorithms for which a PAC-learning analysis is
available.},
  year = {2022},
  booktitle = {Proceedings of the 35th Conference on Learning Theory (COLT2022)}
}

@inproceedings{ctbj-omewv-22,
  title = {Optimal Mean Estimation without a Variance},
  author = {Yeshwanth Cherapanamjeri and Nilesh Tripuraneni and Peter L.~Bartlett and Michael I.~Jordan},
  abstract = {We study the problem of heavy-tailed mean estimation in settings where the variance of the data-generating distribution does not exist. Concretely, given a sample $\bm{X} = \{X_i\}_{i = 1}^n$ from a distribution $\mc{D}$ over $\mb{R}^d$ with mean $\mu$ which satisfies the following \emph{weak-moment} assumption for some ${\alpha \in [0, 1]}$:
    $ \forall \norm{v} = 1: \mb{E}_{X \ts \mc{D}}[\abs{\inp{X -
    \mu}{v}}^{1 + \alpha}] \leq 1$,
and given a target failure probability, $\delta$, our goal is to design an estimator which attains the smallest possible confidence interval as a function of $n,d,\delta$. For the specific case of $\alpha = 1$, foundational work of Lugosi and Mendelson exhibits an estimator achieving subgaussian confidence intervals, and subsequent work has led to computationally efficient versions of this estimator. Here, we study the case of general $\alpha$, and establish the following information-theoretic lower bound on the optimal attainable confidence interval:
    $ \Omega \lprp{\sqrt{\frac{d}{n}} +
    \lprp{\frac{d}{n}}^{\frac{\alpha}{(1 + \alpha)}} +
    \lprp{\frac{\log 1 / \delta}{n}}^{\frac{\alpha}{(1 + \alpha)}}}$.
Moreover, we devise a computationally-efficient estimator which
achieves this lower bound.},
  year = {2022},
  booktitle = {Proceedings of the 35th Conference on Learning Theory (COLT2022)}
}

@article{clb-ibibbotlln-22,
  author = {Niladri S.~Chatterji and Philip M.~Long and Peter L.~Bartlett},
  title = {The Interplay Between Implicit Bias and Benign Overfitting in Two-Layer Linear Networks},
  journal = {Journal of Machine Learning Research},
  year = {2022},
  volume = {23},
  number = {263},
  pages = {1--48},
  url = {http://jmlr.org/papers/v23/21-1011.html}
}

@article{fcb-rfa-22a,
  author = {Spencer Frei and Niladri S.~Chatterji and Peter L.~Bartlett},
  title = {Random Feature Amplification: Feature Learning and
Generalization in Neural Networks},
  abstract = {In this work, we provide a characterization of the feature-learning process in two-layer ReLU networks trained by gradient descent on the logistic loss following random initialization. We consider data with binary labels that are generated by an XOR-like function of the input features. We permit a constant fraction of the training labels to be corrupted by an adversary. We show that, although linear classifiers are no better than random guessing for the distribution we consider, two-layer ReLU networks trained by gradient descent achieve generalization error close to the label noise rate. We develop a novel proof technique that shows that at initialization, the vast majority of neurons function as random features that are only weakly correlated with useful features, and the gradient descent dynamics `amplify' these weak, random features to strong, useful features.},
  year = {2023},
  journal = {Journal of Machine Learning Research},
  volume = {24},
  number = {303},
  pages = {1--49},
  url = {http://jmlr.org/papers/v24/22-1132.html}
}

@techreport{mwb-opelf-22,
  author = {Wenlong Mou and Martin J. Wainwright and Peter L. Bartlett},
  title = {Off-policy estimation of linear functionals: Non-asymptotic theory for semi-parametric efficiency},
  year = {2022},
  institution = {arXiv},
  number = {2209.13075},
  abstract = {The problem of estimating a linear functional based on
observational data is canonical in both the causal inference and
bandit literatures. We analyze a broad class of two-stage procedures
that first estimate the treatment effect function, and then use this
quantity to estimate the linear functional. We prove non-asymptotic
upper bounds on the mean-squared error of such procedures: these
bounds reveal that in order to obtain non-asymptotically optimal
procedures, the error in estimating the treatment effect should be
minimized in a certain weighted L2-norm. We analyze a two-stage
procedure based on constrained regression in this weighted norm, and
establish its instance-dependent optimality in finite samples via
matching non-asymptotic local minimax lower bounds. These results show
that the optimal non-asymptotic risk, in addition to depending on the
asymptotically efficient variance, depends on the weighted norm
distance between the true outcome function and its approximation by
the richest function class supported by the sample size.}
}

@techreport{mdwb-kbopewo-23,
  title = {Kernel-based off-policy estimation without overlap: Instance optimality beyond semiparametric efficiency},
  author = {Wenlong Mou and Peng Ding and Martin J. Wainwright and Peter L. Bartlett},
  year = {2023},
  institution = {arXiv},
  number = {2301.06240},
  abstract = {We study optimal procedures for estimating a linear
functional based on observational data. In many problems of this kind,
a widely used assumption is strict overlap, i.e., uniform boundedness
of the importance ratio, which measures how well the observational
data covers the directions of interest. When it is violated, the
classical semi-parametric efficiency bound can easily become infinite,
so that the instance-optimal risk depends on the function class used
to model the regression function. For any convex and symmetric
function class F, we derive a non-asymptotic local minimax bound on
the mean-squared error in estimating a broad class of linear
functionals. This lower bound refines the classical semi-parametric
one, and makes connections to moduli of continuity in functional
estimation. When F is a reproducing kernel Hilbert space, we prove
that this lower bound can be achieved up to a constant factor by
analyzing a computationally simple regression estimator. We apply our
general results to various families of examples, thereby uncovering a
spectrum of rates that interpolate between the classical theories of
semi-parametric efficiency (with sqrt n-consistency) and the slower
minimax rates associated with non-parametric function estimation.}
}

@inproceedings{fvbsh-iblrnthdd-23,
  title = {Implicit Bias in Leaky {ReLU} Networks Trained on High-Dimensional Data},
  author = {Spencer Frei and Gal Vardi and Peter L.~Bartlett and Nathan
Srebro and Wei Hu},
  abstract = {The implicit biases of gradient-based optimization algorithms are conjectured to be a major factor in the success of modern deep learning. In this work, we investigate the implicit bias of gradient flow and gradient descent in two-layer fully-connected neural networks with leaky ReLU activations when the training data are nearly-orthogonal, a common property of high-dimensional data. For gradient flow, we leverage recent work on the implicit bias for homogeneous neural networks to show that asymptotically, gradient flow produces a neural network with rank at most two. Moreover, this network is an $\ell_2$-max-margin solution (in parameter space), and has a linear decision boundary that corresponds to an approximate-max-margin linear predictor. For gradient descent, provided the random initialization variance is small enough, we show that a single step of gradient descent suffices to drastically reduce the rank of the network, and that the rank remains small throughout training. We provide experiments which suggest that a small initialization scale is important for finding low-rank neural networks with gradient descent.},
  year = {2023},
  booktitle = {Proceedings of ICLR 2023},
  note = {To appear}
}

@article{pkbk-scleope-23,
  author = {Juan C.~Perdomo and Akshay Krishnamurthy and Peter L.~Bartlett
and Sham Kakade},
  title = {A Complete Characterization of Linear Estimators for Offline Policy Evaluation},
  abstract = {Offline policy evaluation is a fundamental statistical problem in reinforcement learning that involves estimating the value function of some decision-making policy given data collected by a potentially different policy. In order to tackle problems with complex, high-dimensional observations, there has been significant interest from theoreticians and practitioners alike in understanding the possibility of function approximation in reinforcement learning. Despite significant study, a sharp characterization of when we might expect offline policy evaluation to be tractable, even in the simplest setting of linear function approximation, has so far remained elusive, with a surprising number of strong negative results recently appearing in the literature.
In this work, we identify simple control-theoretic and linear-algebraic conditions that are necessary and sufficient for classical methods, in particular Fitted Q-iteration (FQI) and least squares temporal difference learning (LSTD), to succeed at offline policy evaluation. Using this characterization, we establish a precise hierarchy of regimes under which these estimators succeed. We prove that LSTD works under strictly weaker conditions than FQI. Furthermore, we establish that if a problem is not solvable via LSTD, then it cannot be solved by a broad class of linear estimators, even in the limit of infinite data. Taken together, our results provide a complete picture of the behavior of linear estimators for offline policy evaluation, unify previously disparate analyses of canonical algorithms, and provide significantly sharper notions of the underlying statistical complexity of offline policy evaluation.},
  year = {2023},
  journal = {Journal of Machine Learning Research},
  volume = {24},
  number = {284},
  pages = {1--50},
  url = {http://jmlr.org/papers/v24/22-0341.html}
}

@inproceedings{pbj-idacmpmab-23,
  author = {Aldo Pacchiano and Peter L.~Bartlett and Michael I.~Jordan},
  title = {An Instance-Dependent Analysis for the Cooperative
Multi-Player Multi-Armed Bandit},
  abstract = {We study the problem of information sharing and
cooperation in Multi-Player Multi-Armed bandits. We propose the first
algorithm that achieves logarithmic regret for this problem. Our
results are based on two innovations. First, we show that a simple
modification to a successive elimination strategy can be used to allow
the players to estimate their suboptimality gaps, up to constant
factors, in the absence of collisions. Second, we leverage the first
result to design a communication protocol that successfully uses the
small reward of collisions to coordinate among players, while
preserving meaningful instance-dependent logarithmic regret
guarantees.},
  year = {2023},
  booktitle = {Proceedings of the 34th International Conference on Algorithmic Learning Theory},
  pages = {1166--1215},
  editor = {Agrawal, Shipra and Orabona, Francesco},
  volume = {201},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  pdf = {https://proceedings.mlr.press/v201/pacchiano23a/pacchiano23a.pdf},
  url = {https://proceedings.mlr.press/v201/pacchiano23a.html}
}

@inproceedings{fvbs-bolclrnfkcmm-23,
  title = {Benign Overfitting in Linear Classifiers and Leaky {ReLU}
      Networks from {KKT} Conditions for Margin Maximization},
  author = {Spencer Frei and Gal Vardi and Peter L. Bartlett and Nathan Srebro},
  year = {2023},
  note = {To Appear. Also arXiv:2303.01462.},
  abstract = {Linear classifiers and leaky ReLU networks trained by gradient flow on the logistic loss have an implicit bias towards solutions which satisfy the Karush-Kuhn-Tucker (KKT) conditions for margin maximization. In this work we establish a number of settings where the satisfaction of these KKT conditions implies benign overfitting in linear classifiers and in two-layer leaky ReLU networks: the estimators interpolate noisy training data and simultaneously generalize well to test data. The settings include variants of the noisy class-conditional Gaussians considered in previous work as well as new distributional settings where benign overfitting has not been previously observed. The key ingredient to our proof is the observation that when the training data is nearly-orthogonal, both linear classifiers and leaky ReLU networks satisfying the KKT conditions for their respective margin maximization problems behave like a nearly uniform average of the training examples.},
  booktitle = {Proceedings of the 36th Conference on Learning Theory (COLT2023)}
}

@techreport{blb-dsam-22,
  title = {The Dynamics of Sharpness-Aware Minimization: Bouncing Across Ravines
and Drifting Towards Wide Minima},
  author = {Peter L.~Bartlett and Philip M.~Long and Olivier Bousquet},
  abstract = {We consider Sharpness-Aware Minimization (SAM), a gradient-based optimization method for deep networks that has exhibited performance improvements on image and language prediction problems. We show that when SAM is applied with a convex quadratic objective, for most random initializations it converges to a cycle that oscillates between either side of the minimum in the direction with the largest curvature, and we provide bounds on the rate of convergence.
In the non-quadratic case, we show that such oscillations effectively
perform gradient descent, with a smaller step-size, on the spectral
norm of the Hessian. In such cases, SAM’s update may be regarded as a
third derivative—the derivative of the Hessian in the leading
eigenvector direction—that encourages drift toward wider minima.},
  year = {2022},
  institution = {arXiv},
  number = {2210.01513}
}

@article{blb-dsam-23,
  title = {The Dynamics of Sharpness-Aware Minimization: Bouncing Across Ravines
and Drifting Towards Wide Minima},
  author = {Peter L.~Bartlett and Philip M.~Long and Olivier Bousquet},
  abstract = {We consider Sharpness-Aware Minimization (SAM), a gradient-based optimization method for deep networks that has exhibited performance improvements on image and language prediction problems. We show that when SAM is applied with a convex quadratic objective, for most random initializations it converges to a cycle that oscillates between either side of the minimum in the direction with the largest curvature, and we provide bounds on the rate of convergence.
In the non-quadratic case, we show that such oscillations effectively
perform gradient descent, with a smaller step-size, on the spectral
norm of the Hessian. In such cases, SAM’s update may be regarded as a
third derivative—the derivative of the Hessian in the leading
eigenvector direction—that encourages drift toward wider minima.},
  year = {2023},
  journal = {Journal of Machine Learning Research},
  volume = {24},
  number = {316},
  pages = {1--36},
  url = {http://jmlr.org/papers/v24/23-043.html}
}

@techreport{fvbs-desibgrrn-23,
  title = {The Double-Edged Sword of Implicit Bias: Generalization vs.
Robustness in {ReLU} Networks},
  author = {Spencer Frei and Gal Vardi and Peter L.~Bartlett and
Nathan Srebro},
  abstract = {In this work, we study the implications of the implicit
bias of gradient flow on generalization and adversarial robustness in
ReLU networks. We focus on a setting where the data consists of
clusters and the correlations between cluster means are small, and
show that in two-layer ReLU networks gradient flow is biased towards
solutions that generalize well, but are highly vulnerable to
adversarial examples. Our results hold even in cases where the network
has many more parameters than training examples. Despite the potential
for harmful overfitting in such overparameterized settings, we prove
that the implicit bias of gradient flow prevents it. However, the
implicit bias also leads to non-robust solutions (susceptible to small
adversarial l2-perturbations), even though robust networks that fit
the data exist.},
  year = {2023},
  institution = {arXiv},
  number = {2303.01456}
}

@techreport{zfb-ttllmic-23,
  title = {Trained Transformers Learn Linear Models In-Context},
  author = {Ruiqi Zhang and Spencer Frei and Peter L.~Bartlett},
  abstract = {Attention-based neural networks such as transformers have demonstrated a remarkable ability to exhibit in-context learning (ICL): Given a short prompt sequence of tokens from an unseen task, they can formulate relevant per-token and next-token predictions without any parameter updates. By embedding a sequence of labeled training data and unlabeled test data as a prompt, this allows for transformers to behave like supervised learning algorithms. Indeed, recent work has shown that when training transformer architectures over random instances of linear regression problems, these models’ predictions mimic those of ordinary least squares. Towards understanding the mechanisms underlying this phenomenon, we investigate the dynamics of ICL in transformers with a single linear self-attention layer trained by gradient flow on linear regression tasks. We show that despite non-convexity, gradient flow with a suitable random initialization finds a global minimum of the objective function. At this global minimum, when given a test prompt of labeled examples from a new prediction task, the transformer achieves prediction error competitive with the best linear predictor over the test prompt distribution. We additionally characterize the robustness of the trained transformer to a variety of distribution shifts and show that although a number of shifts are tolerated, shifts in the covariate distribution of the prompts are not. Motivated by this, we consider a generalized ICL setting where the covariate distributions can vary across prompts. We show that although gradient flow succeeds at finding a global minimum in this setting, the trained transformer is still brittle under mild covariate shifts},
  year = {2023},
  institution = {arXiv},
  number = {2306.09927}
}

@techreport{lb-sames-23,
  title = {Sharpness-Aware Minimization and the Edge of Stability},
  author = {Philip M. Long and Peter L. Bartlett},
  year = {2023},
  number = {2309.12488},
  institution = {arXiv}
}

@article{bl-cplucssd-24,
  title = {Corrigendum to 'Prediction, Learning, Uniform Convergence, and
Scale-Sensitive Dimensions'},
  journal = {Journal of Computer and System Sciences},
  author = {Peter L.~Bartlett and Philip M.~Long},
  volume = {140},
  pages = {103465},
  year = {2024}
}

@inproceedings{fvbs-desibgrrn-23a,
  title = {The Double-Edged Sword of Implicit Bias: Generalization vs.
Robustness in {ReLU} Networks},
  author = {Spencer Frei and Gal Vardi and Peter L.~Bartlett and
Nathan Srebro},
  year = {2023},
  booktitle = {Advances in Neural Information Processing Systems 36}
}

@article{mhwbj-dpppcrp-24,
  author = {Wenlong Mou and Nhat Ho and Martin Wainwright and
Peter L.~Bartlett and Michael Jordan},
  title = {A Diffusion Process Perspective on Posterior Contraction Rates for Parameters},
  journal = {SIAM Journal on Mathematics of Data Science},
  year = {2024},
  note = {(To appear)}
}

@article{zfb-ttllmic-24,
  title = {Trained Transformers Learn Linear Models In-Context},
  author = {Ruiqi Zhang and Spencer Frei and Peter L.~Bartlett},
  abstract = {Attention-based neural networks such as transformers have demonstrated a remarkable ability to exhibit in-context learning (ICL): Given a short prompt sequence of tokens from an unseen task, they can formulate relevant per-token and next-token predictions without any parameter updates. By embedding a sequence of labeled training data and unlabeled test data as a prompt, this allows for transformers to behave like supervised learning algorithms. Indeed, recent work has shown that when training transformer architectures over random instances of linear regression problems, these models’ predictions mimic those of ordinary least squares. Towards understanding the mechanisms underlying this phenomenon, we investigate the dynamics of ICL in transformers with a single linear self-attention layer trained by gradient flow on linear regression tasks. We show that despite non-convexity, gradient flow with a suitable random initialization finds a global minimum of the objective function. At this global minimum, when given a test prompt of labeled examples from a new prediction task, the transformer achieves prediction error competitive with the best linear predictor over the test prompt distribution. We additionally characterize the robustness of the trained transformer to a variety of distribution shifts and show that although a number of shifts are tolerated, shifts in the covariate distribution of the prompts are not. Motivated by this, we consider a generalized ICL setting where the covariate distributions can vary across prompts. We show that although gradient flow succeeds at finding a global minimum in this setting, the trained transformer is still brittle under mild covariate shifts},
  year = {2024},
  journal = {Journal of Machine Learning Research},
  volume = {25},
  number = {49},
  pages = {1--55},
  url = {http://jmlr.org/papers/v25/23-1042.html}
}

@inproceedings{wzcbgb-hmptanicllr-24,
  title = {How Many Pretraining Tasks Are Needed for In-Context Learning of Linear Regression?},
  author = {Jingfeng Wu and Difan Zou and Zixiang Chen and Vladimir
Braverman and Quanquan Gu and Peter L.~Bartlett},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year = {2024},
  url = {https://openreview.net/forum?id=vSh5ePa0ph}
}

@inproceedings{cb-sawafildd-24,
  title = {A Statistical Analysis of {W}asserstein Autoencoders for
Intrinsically Low-dimensional Data},
  author = {Saptarshi Chakraborty and Peter L. Bartlett},
  booktitle = {The Twelfth International Conference on Learning Representations},
  year = {2024},
  url = {https://openreview.net/pdf?id=WjRPZsfeBO}
}

@techreport{mkbbbdllpb-ideoss-24,
  title = {Implicit Diffusion: Efficient Optimization through
Stochastic Sampling},
  author = {Pierre Marion and Anna Korba and Peter L.~Bartlett and Mathieu
Blondel and Valentin De Bortoli and Arnaud Doucet and Felipe
Llinares-López and Courtney Paquette and Quentin Berthet},
  abstract = {We present a new algorithm to optimize distributions defined
implicitly by parameterized stochastic diffusions. Doing so allows us
to modify the outcome distribution of sampling processes by optimizing
over their parameters. We introduce a general framework for
first-order optimization of these processes, that performs jointly, in
a single loop, optimization and sampling steps. This approach is
inspired by recent advances in bilevel optimization and automatic
implicit differentiation, leveraging the point of view of sampling as
optimization over the space of probability distributions. We provide
theoretical guarantees on the performance of our method, as well as
experimental results demonstrating its effectiveness in real-world
settings.},
  year = {2024},
  number = {2402.05468},
  institution = {arXiv}
}

@inproceedings{wbty-lsgdll-24,
  author = {Jingfeng Wu and Peter L.~Bartlett and Matus Telgarsky and
Bin Yu},
  title = {Large Stepsize Gradient Descent for Logistic Loss:
Non-Monotonicity of the Loss Improves Optimization Efficiency},
  abstract = {We consider gradient descent (GD) with a constant stepsize
applied to logistic regression with linearly separable data, where the
constant stepsize $\eta$ is so large that the loss initially
oscillates. We show that GD exits this initial oscillatory phase
rapidly --- in $O(\eta)$ steps, and subsequently achieves an
$\tilde{O}(1 / (\eta t) )$ convergence rate after $t$ additional
steps. Our results imply that, given a budget of $T$ steps, GD can
achieve an accelerated loss of $\tilde{O}(1/T^2)$ with an aggressive
stepsize $\eta:= \Theta( T)$, without any use of momentum or variable
stepsize schedulers. Our proof technique is versatile and also handles
general classification loss functions (where exponential tails are
needed for the $\tilde{O}(1/T^2)$ acceleration), nonlinear predictors
in the neural tangent kernel regime, and online stochastic gradient
descent (SGD) with a large stepsize, under suitable separability
conditions.},
  year = {2024},
  note = {(To appear)},
  booktitle = {Proceedings of the 37th Conference on Learning Theory (COLT2024)}
}

@article{bmdbj-brnv-23,
  title = {Bayesian Robustness: A Nonasymptotic Viewpoint},
  author = {Kush Bhatia and Yi-An Ma and Anca D. Dragan and Peter L. Bartlett and Michael I. Jordan},
  year = {2023},
  volume = {119},
  number = {546},
  pages = {1112--1123},
  journal = {Journal of the American Statistical Association},
  doi = {http://doi.org/10.1080/01621459.2023.2174121}
}

This file was generated by bibtex2html 1.99.

recent-pubs.bib

Books

Journal papers, book chapters

All Publications