Merge pull request #11 from KGrewal1/paper
Add paper for JoSS submission
KGrewal1 authored Dec 20, 2023
2 parents 04bb3a6 + 06da660 commit a91c355
Showing 3 changed files with 209 additions and 1 deletion.
4 changes: 3 additions & 1 deletion .gitignore
@@ -5,4 +5,6 @@ out.txt
lbfgs_testbed.ipynb
test*.ipynb
References.md
pseudo/*
pseudo/*
paper/paper.pdf
paper/paper.jats
146 changes: 146 additions & 0 deletions paper/paper.bib
@@ -0,0 +1,146 @@
@article{adadelta,
author = {Matthew D. Zeiler},
title = {{ADADELTA:} An Adaptive Learning Rate Method},
journal = {CoRR},
volume = {abs/1212.5701},
year = {2012},
url = {http://arxiv.org/abs/1212.5701},
eprinttype = {arXiv},
eprint = {1212.5701},
timestamp = {Mon, 13 Aug 2018 16:45:57 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1212-5701.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
doi = {10.48550/arXiv.1212.5701}
}

@article{adagrad,
author = {John Duchi and Elad Hazan and Yoram Singer},
title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
journal = {Journal of Machine Learning Research},
year = {2011},
volume = {12},
number = {61},
pages = {2121--2159},
  url = {http://jmlr.org/papers/v12/duchi11a.html}
}

@inproceedings{adam,
author = {Diederik P. Kingma and
Jimmy Ba},
editor = {Yoshua Bengio and
Yann LeCun},
title = {Adam: {A} Method for Stochastic Optimization},
booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015,
San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
year = {2015},
url = {http://arxiv.org/abs/1412.6980},
timestamp = {Thu, 25 Jul 2019 14:25:37 +0200},
biburl = {https://dblp.org/rec/journals/corr/KingmaB14.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
doi = {10.48550/arXiv.1412.6980}
}

@article{weightdecay,
author = {Ilya Loshchilov and
Frank Hutter},
title = {Fixing Weight Decay Regularization in Adam},
journal = {CoRR},
volume = {abs/1711.05101},
year = {2017},
url = {http://arxiv.org/abs/1711.05101},
eprinttype = {arXiv},
eprint = {1711.05101},
timestamp = {Mon, 13 Aug 2018 16:48:18 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1711-05101.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
doi = {10.48550/arXiv.1711.05101}
}

@inproceedings{amsgrad,
title={On the Convergence of Adam and Beyond},
author={Sashank J. Reddi and Satyen Kale and Sanjiv Kumar},
booktitle={International Conference on Learning Representations},
year={2018},
url={https://openreview.net/forum?id=ryQu7f-RZ}
}

@inproceedings{nmomentum,
title = {On the importance of initialization and momentum in deep learning},
author = {Sutskever, Ilya and Martens, James and Dahl, George and Hinton, Geoffrey},
booktitle = {Proceedings of the 30th International Conference on Machine Learning},
pages = {1139--1147},
year = {2013},
editor = {Dasgupta, Sanjoy and McAllester, David},
volume = {28},
number = {3},
series = {Proceedings of Machine Learning Research},
address = {Atlanta, Georgia, USA},
month = {17--19 Jun},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v28/sutskever13.pdf},
url = {https://proceedings.mlr.press/v28/sutskever13.html},
abstract = {Deep and recurrent neural networks (DNNs and RNNs respectively) are powerful models that were considered to be almost impossible to train using stochastic gradient descent with momentum. In this paper, we show that when stochastic gradient descent with momentum uses a well-designed random initialization and a particular type of slowly increasing schedule for the momentum parameter, it can train both DNNs and RNNs (on datasets with long-term dependencies) to levels of performance that were previously achievable only with Hessian-Free optimization. We find that both the initialization and the momentum are crucial since poorly initialized networks cannot be trained with momentum and well-initialized networks perform markedly worse when the momentum is absent or poorly tuned. Our success training these models suggests that previous attempts to train deep and recurrent neural networks from random initializations have likely failed due to poor initialization schemes. Furthermore, carefully tuned momentum methods suffice for dealing with the curvature issues in deep and recurrent network training objectives without the need for sophisticated second-order methods. }
}


@article{LBFGS,
title={On the limited memory BFGS method for large scale optimization},
author={Liu, Dong C and Nocedal, Jorge},
journal={Mathematical programming},
volume={45},
number={1-3},
pages={503--528},
year={1989},
publisher={Springer},
doi = {10.1007/BF01589116}
}

@inproceedings{nadam,
title = {Incorporating {Nesterov Momentum into Adam}},
author = {Dozat, Timothy},
booktitle = {Proceedings of the 4th International Conference on Learning Representations},
pages = {1--4},
  year = {2016},
url = {https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ},
biburl = {https://bibbase.org/network/publication/dozat-incorporatingnesterovmomentumintoadam}
}

@article{radam,
author = {Liyuan Liu and
Haoming Jiang and
Pengcheng He and
Weizhu Chen and
Xiaodong Liu and
Jianfeng Gao and
Jiawei Han},
title = {On the Variance of the Adaptive Learning Rate and Beyond},
journal = {CoRR},
volume = {abs/1908.03265},
year = {2019},
url = {http://arxiv.org/abs/1908.03265},
eprinttype = {arXiv},
eprint = {1908.03265},
timestamp = {Mon, 30 May 2022 13:48:56 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-1908-03265.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
doi = {10.48550/arXiv.1908.03265}
}

@misc{rmsprop,
author = {Geoffrey Hinton and
Nitish Srivastava and
Kevin Swersky},
title = {Neural Networks for Machine Learning},
year = {2012},
url = {https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf}
}

@misc{candle,
author = {Laurent Mazare and
Nicolas Patry and
             others},
title = {Candle},
howpublished = {https://github.com/huggingface/candle},
year = {2023}
}
60 changes: 60 additions & 0 deletions paper/paper.md
@@ -0,0 +1,60 @@
---
title: 'Candle Optimisers: A Rust crate for optimisation algorithms'
tags:
- Rust
- optimisation
- optimization
- machine learning
authors:
- name: Kirpal Grewal
orcid: 0009-0001-7923-9975
affiliation: 1
affiliations:
- name: Yusuf Hamied Department of Chemistry, University of Cambridge
index: 1
date: 12 December 2023
bibliography: paper.bib
---

# Summary

`candle-optimisers` is a crate of optimisers written in Rust for use with candle (@candle), a lightweight machine learning framework. The crate offers a set of
optimisers for training neural networks, allowing networks to be trained with far lower overhead than a full Python framework such as PyTorch or TensorFlow.
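
As a rough illustration of the intended workflow, training a small model might look like the sketch below. This is a minimal example rather than the crate's documented API: the module path `candle_optimisers::adam`, the `Adam`/`ParamsAdam` names and the `candle_nn::Optimizer`-style `new`/`backward_step` interface are assumptions made for illustration.

```rust
use candle_core::{DType, Device, Result, Tensor};
use candle_nn::{linear, Linear, Module, Optimizer, VarBuilder, VarMap};
// NOTE: module path and type names below are assumed for illustration.
use candle_optimisers::adam::{Adam, ParamsAdam};

fn main() -> Result<()> {
    let device = Device::Cpu;

    // Trainable variables are tracked in a VarMap and handed to the optimiser.
    let varmap = VarMap::new();
    let vb = VarBuilder::from_varmap(&varmap, DType::F32, &device);
    let model: Linear = linear(2, 1, vb)?;

    // Toy regression data: learn y = x0 + x1.
    let xs = Tensor::new(&[[1f32, 2.], [3., 4.], [5., 6.]], &device)?;
    let ys = Tensor::new(&[[3f32], [7.], [11.]], &device)?;

    // Construct the optimiser over the model's variables with default hyperparameters.
    let mut opt = Adam::new(varmap.all_vars(), ParamsAdam::default())?;
    for _step in 0..200 {
        let loss = (model.forward(&xs)? - &ys)?.sqr()?.mean_all()?;
        // Compute gradients and apply one optimisation step.
        opt.backward_step(&loss)?;
    }
    Ok(())
}
```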

# Statement of need

Rust offers the opportunity to build high-performance machine learning libraries with a leaner runtime. However, few optimisation algorithms have been implemented in Rust,
with existing libraries currently providing only some combination of Adam, AdamW, SGD and RMSProp.
This crate aims to provide a complete set of optimisation algorithms for use with candle,
making it easier to train models in Rust.

# Features

This library implements the following optimisation algorithms:

* SGD (including momentum and Nesterov momentum (@nmomentum)); the momentum update is sketched after this list

* AdaDelta (@adadelta)

* AdaGrad (@adagrad)

* AdaMax (@adam)

* Adam (@adam) including AMSGrad (@amsgrad)

* AdamW (@weightdecay) (as decoupled weight decay of Adam)

* NAdam (@nadam)

* RAdam (@radam)

* RMSProp (@rmsprop)

* LBFGS (@LBFGS)

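For reference, one common (PyTorch-style) formulation of the SGD momentum update, with learning rate $\eta$, momentum $\mu$, gradient $g_t$ and momentum buffer $b_t$, is given below; this is a sketch of the standard method rather than a transcription of this crate's code:

$$
\begin{aligned}
b_t &= \mu\, b_{t-1} + g_t, \\
\theta_t &= \theta_{t-1} - \eta\, b_t && \text{(momentum)}, \\
\theta_t &= \theta_{t-1} - \eta\,(g_t + \mu\, b_t) && \text{(Nesterov momentum)}.
\end{aligned}
$$
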
Furthermore, decoupled weight decay (@weightdecay) is implemented for SGD and all of the adaptive methods listed,
allowing the technique to be used beyond AdamW alone, as sketched below.
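
As an illustration of the distinction (standard notation, not this crate's exact code), let $\eta$ be the learning rate, $\lambda$ the decay rate, and $u_t(g_t)$ a method's usual update computed from the gradient $g_t$. Coupled $L_2$ regularisation folds the decay into the gradient before the adaptive machinery:

$$
g_t = \nabla f(\theta_{t-1}) + \lambda\,\theta_{t-1}, \qquad \theta_t = \theta_{t-1} - \eta\, u_t(g_t),
$$

whereas decoupled weight decay applies the decay directly to the parameters:

$$
g_t = \nabla f(\theta_{t-1}), \qquad \theta_t = \theta_{t-1} - \eta\, u_t(g_t) - \eta\lambda\,\theta_{t-1}.
$$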

# References
