% GolGe22: workshop paper entry. `cat` and `abstract` are non-standard fields
% that standard bibliography styles ignore (presumably read by other tooling
% such as a publications page -- confirm before renaming or removing them).
@inproceedings{GolGe22,
    author    = {Goldwaser, Adrian and Ge, Hong},
    title     = {Learning Deep Neural Networks Through Iterative Linearisation},
    booktitle = {{NeurIPS} 2022 Workshop Optimisation in Machine Learning},
    year      = {2022},
    month     = nov,
    url       = {https://arxiv.org/abs/2211.12345},
    cat       = {deep},
    abstract  = {The excellent real-world performance of deep neural networks has
                 received increasing attention. Despite the capacity to overfit
                 significantly, such large models work better than smaller ones.
                 This phenomenon is often referred to as the scaling law by
                 practitioners. It is of fundamental interest to study why the
                 scaling law exists and how it avoids/controls overfitting. One
                 approach has been looking at infinite width limits of neural
                 networks (e.g., Neural Tangent Kernels, Gaussian Processes);
                 however, in practice, these do not fully explain finite networks
                 as their infinite counterparts do not learn features.
                 Furthermore, the empirical kernel for finite networks (i.e., the
                 inner product of feature vectors) changes significantly during
                 training in contrast to infinite width networks. In this work we
                 derive an iterative linearised training method. We justify
                 iterative linearisation as an interpolation between finite
                 analogs of the infinite width regime, which do not learn
                 features, and standard gradient descent training which does. We
                 show some preliminary results where iterative linearised training
                 works well, noting in particular how much feature learning is
                 required to achieve comparable performance. We also provide novel
                 insights into the training behaviour of neural networks.},
}