diff --git a/Changelog.md b/Changelog.md index b38d775d0d..c00a6f9d48 100644 --- a/Changelog.md +++ b/Changelog.md @@ -5,6 +5,16 @@ All notable Changes to the Julia package `Manopt.jl` will be documented in this The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.4.41] - 02/11/2023 + +### Changed + +– `trust_regions` is now more flexible and the sub solver (Steinhaug-Toint tCG by default) + can now be exchanged. +- `adaptive_regularization_with_cubics` is now more flexible as well, where it previously was a bit too + much tightened to the Lanczos solver as well. +- Unified documentation notation and bumped dependencies to use DocumenterCitations 1.3 + ## [0.4.40] – 24/10/2023 ### Added diff --git a/Project.toml b/Project.toml index 535eaff242..cab0e81dd0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Manopt" uuid = "0fc0a36d-df90-57f3-8f93-d78a9fc72bb5" authors = ["Ronny Bergmann "] -version = "0.4.40" +version = "0.4.41" [deps] ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4" @@ -38,13 +38,16 @@ ColorSchemes = "3.5.0" ColorTypes = "0.9.1, 0.10, 0.11" Colors = "0.11.2, 0.12" DataStructures = "0.17, 0.18" +LinearAlgebra = "1.6" LRUCache = "1.4" -ManifoldDiff = "0.2, 0.3.3" +ManifoldDiff = "0.3.8" Manifolds = "0.9" ManifoldsBase = "0.15" +Markdown = "1.6" PolynomialRoots = "1" +Random = "1.6" Requires = "0.5, 1" -Statistics = "1" +Statistics = "1.6" julia = "1.6" [extras] diff --git a/Readme.md b/Readme.md index 1f1d3d7315..fb1ff0c82b 100644 --- a/Readme.md +++ b/Readme.md @@ -74,7 +74,24 @@ To refer to a certain version or the source code in general we recommend to cite ``` for the most recent version or a corresponding version specific DOI, see [the list of all versions](https://zenodo.org/search?page=1&size=20&q=conceptrecid:%224290905%22&sort=-version&all_versions=True). -Note that both citations are in [BibLaTeX](https://ctan.org/pkg/biblatex) format. + +If you are also using [`Manifolds.jl`](https://juliamanifolds.github.io/Manifolds.jl/stable/) please consider to cite + +``` +@article{AxenBaranBergmannRzecki:2023, + AUTHOR = {Seth D. Axen and Mateusz Baran and Ronny Bergmann and Krzysztof Rzecki}, + DOI = {10.1145/3618296}, + EPRINT = {2021.08777}, + EPRINTTYPE = {arXiv}, + JOURNAL = {AMS Transactions on Mathematical Software}, + NOTE = {accepted for publication}, + TITLE = {Manifolds.jl: An Extensible {J}ulia Framework for Data Analysis on Manifolds}, + YEAR = {2023} +} +``` + +as well. +Note that all citations are in [BibLaTeX](https://ctan.org/pkg/biblatex) format. ## Further and Similar Packages & Links diff --git a/docs/make.jl b/docs/make.jl index 430d51e03c..234b23dbdc 100755 --- a/docs/make.jl +++ b/docs/make.jl @@ -108,7 +108,7 @@ tutorials_menu = bib = CitationBibliography(joinpath(@__DIR__, "src", "references.bib"); style=:alpha) makedocs(; format=Documenter.HTML(; - mathengine=MathJax3(), prettyurls=get(ENV, "CI", nothing) == "true" + prettyurls=false, assets=["assets/favicon.ico", "assets/citations.css"] ), modules=[ Manopt, diff --git a/docs/src/assets/citations.css b/docs/src/assets/citations.css new file mode 100644 index 0000000000..5ea611881e --- /dev/null +++ b/docs/src/assets/citations.css @@ -0,0 +1,19 @@ +/* Taken from https://juliadocs.org/DocumenterCitations.jl/v1.2/styling/ */ + +.citation dl { + display: grid; + grid-template-columns: max-content auto; } +.citation dt { + grid-column-start: 1; } +.citation dd { + grid-column-start: 2; + margin-bottom: 0.75em; } +.citation ul { + padding: 0 0 2.25em 0; + margin: 0; + list-style: none;} +.citation ul li { + text-indent: -2.25em; + margin: 0.33em 0.5em 0.5em 2.25em;} +.citation ol li { + padding-left:0.75em;} diff --git a/docs/src/functions/adjoint_differentials.md b/docs/src/functions/adjoint_differentials.md index 5d1fdfeb4e..640d2e4457 100644 --- a/docs/src/functions/adjoint_differentials.md +++ b/docs/src/functions/adjoint_differentials.md @@ -8,6 +8,6 @@ Pages = ["adjoint_differentials.jl"] # Literature ```@bibliography -Pages = ["functions/adjoint_differentials.md"] +Pages = ["adjoint_differentials.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/functions/bezier.md b/docs/src/functions/bezier.md index c1bcc49aa5..229215fb38 100644 --- a/docs/src/functions/bezier.md +++ b/docs/src/functions/bezier.md @@ -8,6 +8,6 @@ Pages = ["bezier_curves.jl"] # Literature ```@bibliography -Pages = ["functions/bezier.md"] +Pages = ["bezier.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/functions/gradients.md b/docs/src/functions/gradients.md index a02ebc81a1..245dc17a20 100644 --- a/docs/src/functions/gradients.md +++ b/docs/src/functions/gradients.md @@ -1,13 +1,12 @@ # [Gradients](@id GradientFunctions) -For a function $f:\mathcal M→ℝ$ -the Riemannian gradient $\operatorname{grad}f(x)$ at $x∈\mathcal M$ +For a function ``f:\mathcal M→ℝ`` +the Riemannian gradient ``\operatorname{grad}f(x)`` at ``x∈\mathcal M`` is given by the unique tangent vector fulfilling - -$\langle \operatorname{grad}f(x), ξ\rangle_x = D_xf[ξ],\quad -\forall ξ ∈ T_x\mathcal M,$ -where $D_xf[ξ]$ denotes the differential of $f$ at $x$ with respect to -the tangent direction (vector) $ξ$ or in other words the directional +``\langle \operatorname{grad}f(x), ξ\rangle_x = D_xf[ξ],\quad +\forall ξ ∈ T_x\mathcal M,`` +where ``D_xf[ξ]`` denotes the differential of ``f`` at ``x`` with respect to +the tangent direction (vector) ``ξ`` or in other words the directional derivative. This page collects the available gradients. @@ -20,6 +19,6 @@ Pages = ["gradients.jl"] # Literature ```@bibliography -Pages = ["functions/gradients.md"] +Pages = ["gradients.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/functions/proximal_maps.md b/docs/src/functions/proximal_maps.md index 553df3ec8d..a3eb4912c1 100644 --- a/docs/src/functions/proximal_maps.md +++ b/docs/src/functions/proximal_maps.md @@ -29,6 +29,6 @@ Pages = ["proximal_maps.jl"] # Literature ```@bibliography -Pages = ["functions/proximal_maps.md"] +Pages = ["proximal_maps.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/helpers/checks.md b/docs/src/helpers/checks.md index df15b0a7b0..8fc5eae5a6 100644 --- a/docs/src/helpers/checks.md +++ b/docs/src/helpers/checks.md @@ -10,6 +10,6 @@ Pages = ["checks.jl"] ## Literature ```@bibliography -Pages = ["helpers/checks.md"] +Pages = ["checks.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/helpers/data.md b/docs/src/helpers/data.md index 924dd9a1ce..15e23d0a8e 100644 --- a/docs/src/helpers/data.md +++ b/docs/src/helpers/data.md @@ -13,6 +13,6 @@ Pages = ["artificialDataFunctions.jl"] # Literature ```@bibliography -Pages = ["helpers/data.md"] +Pages = ["data.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/plans/index.md b/docs/src/plans/index.md index bbeffad462..195f1fc905 100644 --- a/docs/src/plans/index.md +++ b/docs/src/plans/index.md @@ -17,5 +17,30 @@ Still there might be the need to set certain parameters within any of these stru ```@docs set_manopt_parameter! +get_manopt_parameter Manopt.status_summary -``` \ No newline at end of file +``` + +Where the following Symbols are used + +The following symbols are used. +The column “generic” refers to a short hand that might be used – for readability if clear from context. + +| Symbol | Used in | Description | generic | +| :----------- | :------: | ;-------------------------------------------------------- | :------ | +| `:active` | [`DebugWhenActive`](@ref) | activity of the debug action stored within | | +| `:Basepoint` | [`TangentSpace`]() | the point the tangent space is at | `:p` | +| `:Cost` | generic |the cost function (e.g. within an objective, as pass down) | | +| `:Debug` | [`DebugSolverState`](@ref) | the stored `debugDictionary` | | +| `:Gradient` | generic |the gradient function (e.g. within an objective, as pass down) | | +| `:Iterate` | generic | the (current) iterate – similar to [`set_iterate!`](@ref) – within a state | | +| `:Manifold` | generic |the manifold (e.g. within a problem, as pass down) | | +| `:Objective` | generic | the objective (e.g. within a problem, as pass down) | | +| `:SubProblem` | generic | the sub problem (e.g. within a state, as pass down) | | +| `:SubState` | generic | the sub state (e.g. within a state, as pass down) | | +| `:λ` | [`ProximalDCCost`](@ref), [`ProximalDCGrad`](@ref) | set the proximal parameter within the proximal sub objective elements | | +| `:p` | generic | a certain point | | +| `:X` | generic | a certain tangent vector | | +| `:TrustRegionRadius` | [`TrustRegionsState`](@ref) | the trust region radius | `:σ` | +| `:ρ`, `:u` | [`ExactPenaltyCost`](@ref), [`ExactPenaltyGrad`](@ref) | Parameters within the exact penalty objetive | | +| `:ρ`, `:μ`, `:λ` | [`AugmentedLagrangianCost`](@ref) and [`AugmentedLagrangianGrad`](@ref) | Parameters of the Lagrangian function | | diff --git a/docs/src/plans/objective.md b/docs/src/plans/objective.md index 8ebe00d2e2..fdaa7aa39c 100644 --- a/docs/src/plans/objective.md +++ b/docs/src/plans/objective.md @@ -160,10 +160,10 @@ ManifoldProximalMapObjective get_proximal_map ``` - ### Hessian Objective ```@docs +AbstractManifoldHessianObjective ManifoldHessianObjective ``` @@ -234,3 +234,21 @@ get_grad_inequality_constraint! get_grad_inequality_constraints get_grad_inequality_constraints! ``` + +### Subproblem Objective + +This objective can be use when the objective of a sub problem +solver still needs access to the (outer/main) objective. + +```@docs +AbstractManifoldSubObjective +``` + +#### Access functions + +```@docs +Manopt.get_objective_cost +Manopt.get_objective_gradient +Manopt.get_objective_hessian +Manopt.get_objective_preconditioner +``` \ No newline at end of file diff --git a/docs/src/plans/stepsize.md b/docs/src/plans/stepsize.md index f30fe69ecd..7979393e2e 100644 --- a/docs/src/plans/stepsize.md +++ b/docs/src/plans/stepsize.md @@ -29,6 +29,6 @@ Filter = t -> t != Stepsize ## Literature ```@bibliography -Pages = ["plans/stepsize.md"] +Pages = ["stepsize.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/references.bib b/docs/src/references.bib index 93e54c4971..588a65f25d 100644 --- a/docs/src/references.bib +++ b/docs/src/references.bib @@ -203,7 +203,7 @@ @book{Boumal:2023 EDITION = {First}, PUBLISHER = {Cambridge University Press}, DOI = {10.1017/9781009166164}, - URLDATE = {2023-07-13}, + NOTE = {Homepage to the book: [nicolasboumal.net/book/index.html](https://www.nicolasboumal.net/book/index.html).}, ABSTRACT = {Optimization on Riemannian manifolds-the result of smooth geometry and optimization merging into one elegant modern framework-spans many areas of science and engineering, including machine learning, computer vision, signal processing, dynamical systems and scientific computing. This text introduces the differential geometry and Riemannian geometry concepts that will help students and researchers in applied mathematics, computer science and engineering gain a firm mathematical grounding to use these tools confidently in their research. Its charts-last approach will prove more intuitive from an optimizer's viewpoint, and all definitions and theorems are motivated to build time-tested optimization algorithms. Starting from first principles, the text goes on to cover current research on topics including worst-case complexity and geodesic convexity. Readers will appreciate the tricks of the trade for conducting research and for numerical implementations sprinkled throughout the book.}, ISBN = {978-1-00-916616-4} } @@ -215,7 +215,7 @@ @book{doCarmo:1992 TITLE = {Riemannian Geometry}, AUTHOR = {do Carmo, Manfredo Perdigão}, YEAR = {1992}, - SERIES = {Mathematics: Theory & Applications}, + SERIES = {Mathematics: Theory \& Applications}, PAGES = {xiv+300}, PUBLISHER = {Birkhäuser Boston, Inc., Boston, MA}, DOI = {10.1007/978-1-4757-2201-7}, @@ -266,7 +266,7 @@ @article{DaiYuan:1999 MONTH = jan, NUMBER = {1}, PAGES = {177--182}, - PUBLISHER = {Society for Industrial & Applied Mathematics (SIAM)}, + PUBLISHER = {Society for Industrial \& Applied Mathematics (SIAM)}, TITLE = {A Nonlinear Conjugate Gradient Method with a Strong Global Convergence Property}, VOLUME = {10}, YEAR = {1999} @@ -279,7 +279,7 @@ @article{DiepeveenLellmann:2021 NOTE = {arXiv: [2102.10309](https://arxiv.org/abs/2102.10309)}, NUMBER = {4}, PAGES = {1565--1600}, - PUBLISHER = {Society for Industrial & Applied Mathematics ({SIAM})}, + PUBLISHER = {Society for Industrial \& Applied Mathematics ({SIAM})}, TITLE = {An Inexact Semismooth Newton Method on Riemannian Manifolds with Application to Duality-Based Total Variation Denoising}, VOLUME = {14}, YEAR = {2021}, @@ -307,7 +307,7 @@ @book{Fletcher:1987 AUTHOR = {Fletcher, R.}, EDITION = {2}, LOCATION = {Chichester}, - PUBLISHER = {John Wiley & Sons Ltd.}, + PUBLISHER = {John Wiley \& Sons Ltd.}, SERIES = {A Wiley-Interscience Publication}, TITLE = {Practical Methods of Optimization}, YEAR = {1987} @@ -358,7 +358,7 @@ @article{HagerZhang:2005 MONTH = jan, NUMBER = {1}, PAGES = {170--192}, - PUBLISHER = {Society for Industrial & Applied Mathematics (SIAM)}, + PUBLISHER = {Society for Industrial \& Applied Mathematics (SIAM)}, TITLE = {A New Conjugate Gradient Method with Guaranteed Descent and an Efficient Line Search}, VOLUME = {16}, YEAR = {2005} @@ -425,7 +425,7 @@ @article{IannazzoPorcelli:2017 NUMBER = {1}, PAGES = {495--517}, PUBLISHER = {Oxford University Press ({OUP})}, - TITLE = {The Riemannian Barzilai{\textendash}Borwein method with nonmonotone line search and the matrix geometric mean computation}, + TITLE = {The Riemannian Barzilai–Borwein method with nonmonotone line search and the matrix geometric mean computation}, VOLUME = {38}, YEAR = {2017}, } @@ -467,7 +467,7 @@ @article{LiuBoumal:2019 YEAR = {2019}, MONTH = mar, DOI = {10.1007/s00245-019-09564-3}, - JOURNAL = {Applied Mathematics & Optimization}, + JOURNAL = {Applied Mathematics \& Optimization}, TITLE = {Simple algorithms for optimization on Riemannian manifolds with constraints}, URL = {https://arxiv.org/abs/1901.10000} } diff --git a/docs/src/solvers/ChambollePock.md b/docs/src/solvers/ChambollePock.md index 8d8503f76a..0fc28d8b95 100644 --- a/docs/src/solvers/ChambollePock.md +++ b/docs/src/solvers/ChambollePock.md @@ -3,31 +3,34 @@ The Riemannian Chambolle–Pock is a generalization of the Chambolle–Pock algorithm [ChambollePock:2011](@citet*) It is also known as primal-dual hybrid gradient (PDHG) or primal-dual proximal splitting (PDPS) algorithm. -In order to minimize over $p∈\mathcal M$ the cost function consisting of +In order to minimize over ``p∈\mathcal M`` the cost function consisting of +In order to minimize a cost function consisting of ```math F(p) + G(Λ(p)), ``` -where $F:\mathcal M → \overline{ℝ}$, $G:\mathcal N → \overline{ℝ}$, and -$Λ:\mathcal M →\mathcal N$. -If the manifolds $\mathcal M$ or $\mathcal N$ are not Hadamard, it has to be considered locally, -i.e. on geodesically convex sets $\mathcal C \subset \mathcal M$ and $\mathcal D \subset\mathcal N$ -such that $Λ(\mathcal C) \subset \mathcal D$. + over ``p∈\mathcal M`` + +where ``F:\mathcal M → \overline{ℝ}``, ``G:\mathcal N → \overline{ℝ}``, and +``Λ:\mathcal M →\mathcal N``. +If the manifolds ``\mathcal M`` or ``\mathcal N`` are not Hadamard, it has to be considered locally, +i.e. on geodesically convex sets ``\mathcal C \subset \mathcal M`` and ``\mathcal D \subset\mathcal N`` +such that ``Λ(\mathcal C) \subset \mathcal D``. The algorithm is available in four variants: exact versus linearized (see `variant`) as well as with primal versus dual relaxation (see `relax`). For more details, see [BergmannHerzogSilvaLouzeiroTenbrinckVidalNunez:2021](@citet*). In the following we note the case of the exact, primal relaxed Riemannian Chambolle–Pock algorithm. -Given base points $m∈\mathcal C$, $n=Λ(m)∈\mathcal D$, -initial primal and dual values $p^{(0)} ∈\mathcal C$, $ξ_n^{(0)} ∈T_n^*\mathcal N$, -and primal and dual step sizes $\sigma_0$, $\tau_0$, relaxation $\theta_0$, -as well as acceleration $\gamma$. +Given base points ``m∈\mathcal C``, ``n=Λ(m)∈\mathcal D``, +initial primal and dual values ``p^{(0)} ∈\mathcal C``, ``ξ_n^{(0)} ∈T_n^*\mathcal N``, +and primal and dual step sizes ``\sigma_0``, ``\tau_0``, relaxation ``\theta_0``, +as well as acceleration ``\gamma``. -As an initialization, perform $\bar p^{(0)} \gets p^{(0)}$. +As an initialization, perform ``\bar p^{(0)} \gets p^{(0)}``. -The algorithms performs the steps $k=1,…,$ (until a [`StoppingCriterion`](@ref) is fulfilled with) +The algorithms performs the steps ``k=1,…,`` (until a [`StoppingCriterion`](@ref) is fulfilled with) 1. ```math ξ^{(k+1)}_n = \operatorname{prox}_{\tau_k G_n^*}\Bigl(ξ_n^{(k)} + \tau_k \bigl(\log_n Λ (\bar p^{(k)})\bigr)^\flat\Bigr) @@ -46,9 +49,9 @@ The algorithms performs the steps $k=1,…,$ (until a [`StoppingCriterion`](@ref Furthermore you can exchange the exponential map, the logarithmic map, and the parallel transport by a retraction, an inverse retraction, and a vector transport. -Finally you can also update the base points $m$ and $n$ during the iterations. +Finally you can also update the base points ``m`` and ``n`` during the iterations. This introduces a few additional vector transports. The same holds for the case -$Λ(m^{(k)})\neq n^{(k)}$ at some point. All these cases are covered in the algorithm. +``Λ(m^{(k)})\neq n^{(k)}`` at some point. All these cases are covered in the algorithm. ```@meta CurrentModule = Manopt @@ -110,6 +113,6 @@ Manopt.update_prox_parameters! ## Literature ```@bibliography -Pages = ["solvers/ChambollePock.md"] +Pages = ["ChambollePock.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/DouglasRachford.md b/docs/src/solvers/DouglasRachford.md index 98c548b858..3a4dacf0f5 100644 --- a/docs/src/solvers/DouglasRachford.md +++ b/docs/src/solvers/DouglasRachford.md @@ -65,5 +65,5 @@ For specific [`DebugAction`](@ref)s and [`RecordAction`](@ref)s see also ## Literature ```@bibliography -Pages = ["solvers/DouglasRachford.md"] +Pages = ["DouglasRachford.md"] ``` diff --git a/docs/src/solvers/FrankWolfe.md b/docs/src/solvers/FrankWolfe.md index a0f068389d..abfe2b6b50 100644 --- a/docs/src/solvers/FrankWolfe.md +++ b/docs/src/solvers/FrankWolfe.md @@ -25,6 +25,6 @@ FrankWolfeGradient ``` ```@bibliography -Pages = ["solvers/FrankWolfe.md"] +Pages = ["FrankWolfe.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/LevenbergMarquardt.md b/docs/src/solvers/LevenbergMarquardt.md index a3fde242cd..82f743be57 100644 --- a/docs/src/solvers/LevenbergMarquardt.md +++ b/docs/src/solvers/LevenbergMarquardt.md @@ -18,6 +18,6 @@ LevenbergMarquardtState ## Literature ```@bibliography -Pages = ["solvers/LevenbergMarquardt.md"] +Pages = ["LevenbergMarquardt.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/adaptive-regularization-with-cubics.md b/docs/src/solvers/adaptive-regularization-with-cubics.md index f47bf034be..ad6ab2cc9c 100644 --- a/docs/src/solvers/adaptive-regularization-with-cubics.md +++ b/docs/src/solvers/adaptive-regularization-with-cubics.md @@ -29,19 +29,17 @@ Manopt.LanczosState ## (Conjugate) Gradient Descent -There are two generic functors, that implement the sub problem +There is a generic objective, that implements the sub problem ```@docs -AdaptiveRegularizationCubicCost -AdaptiveRegularizationCubicGrad +AdaptiveRagularizationWithCubicsModelObjective ``` Since the sub problem is given on the tangent space, you have to provide ``` -g = AdaptiveRegularizationCubicCost(M, mho, σ) -grad_g = AdaptiveRegularizationCubicGrad(M, mho, σ) -sub_problem = DefaultProblem(TangentSpaceAt(M,p), ManifoldGradienObjective(g, grad_g)) +arc_obj = AdaptiveRagularizationWithCubicsModelObjective(mho, σ) +sub_problem = DefaultProblem(TangentSpaceAt(M,p), arc_obj) ``` where `mho` is the hessian objective of `f` to solve. @@ -59,6 +57,6 @@ StopWhenFirstOrderProgress ## Literature ```@bibliography -Pages = ["solvers/adaptive-regularization-with-cubics.md"] +Pages = ["adaptive-regularization-with-cubics.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/solvers/augmented_Lagrangian_method.md b/docs/src/solvers/augmented_Lagrangian_method.md index 2640fe4adf..ada799cf92 100644 --- a/docs/src/solvers/augmented_Lagrangian_method.md +++ b/docs/src/solvers/augmented_Lagrangian_method.md @@ -25,6 +25,6 @@ AugmentedLagrangianGrad ## Literature ```@bibliography -Pages = ["solvers/augmented_Lagrangian_method.md"] +Pages = ["augmented_Lagrangian_method.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/conjugate_gradient_descent.md b/docs/src/solvers/conjugate_gradient_descent.md index 88e11096ec..289907d363 100644 --- a/docs/src/solvers/conjugate_gradient_descent.md +++ b/docs/src/solvers/conjugate_gradient_descent.md @@ -35,6 +35,6 @@ SteepestDirectionUpdateRule # Literature ```@bibliography -Pages = ["solvers/conjugate_gradient_descent.md"] +Pages = ["conjugate_gradient_descent.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/cyclic_proximal_point.md b/docs/src/solvers/cyclic_proximal_point.md index 7f6b5af8a2..018eee9a92 100644 --- a/docs/src/solvers/cyclic_proximal_point.md +++ b/docs/src/solvers/cyclic_proximal_point.md @@ -6,11 +6,11 @@ The Cyclic Proximal Point (CPP) algorithm aims to minimize F(x) = \sum_{i=1}^c f_i(x) ``` -assuming that the [proximal maps](@ref proximalMapFunctions) $\operatorname{prox}_{λ f_i}(x)$ +assuming that the [proximal maps](@ref proximalMapFunctions) ``\operatorname{prox}_{λ f_i}(x)`` are given in closed form or can be computed efficiently (at least approximately). The algorithm then cycles through these proximal maps, where the type of cycle -might differ and the proximal parameter $λ_k$ changes after each cycle $k$. +might differ and the proximal parameter ``λ_k`` changes after each cycle ``k``. For a convergence result on [Hadamard manifolds](https://en.wikipedia.org/wiki/Hadamard_manifold) @@ -42,6 +42,6 @@ RecordProximalParameter ## Literature ```@bibliography -Pages = ["solvers/cyclic_proximal_point.md"] +Pages = ["cyclic_proximal_point.md"] Canonical=false ``` diff --git a/docs/src/solvers/difference_of_convex.md b/docs/src/solvers/difference_of_convex.md index ad1f7aab58..89a314cdd1 100644 --- a/docs/src/solvers/difference_of_convex.md +++ b/docs/src/solvers/difference_of_convex.md @@ -58,6 +58,6 @@ get_subtrahend_gradient ## Literature ```@bibliography -Pages = ["solvers/difference_of_convex.md"] +Pages = ["difference_of_convex.md"] Canonical=false ``` \ No newline at end of file diff --git a/docs/src/solvers/exact_penalty_method.md b/docs/src/solvers/exact_penalty_method.md index 22ccdd959d..9f17a3f32c 100644 --- a/docs/src/solvers/exact_penalty_method.md +++ b/docs/src/solvers/exact_penalty_method.md @@ -29,6 +29,6 @@ LogarithmicSumOfExponentials ## Literature ```@bibliography -Pages = ["solvers/exact_penalty_method.md"] +Pages = ["exact_penalty_method.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/gradient_descent.md b/docs/src/solvers/gradient_descent.md index 60677c02e9..fa5516f2c8 100644 --- a/docs/src/solvers/gradient_descent.md +++ b/docs/src/solvers/gradient_descent.md @@ -43,10 +43,18 @@ RecordGradientNorm RecordStepsize ``` +## Technical Details + +The [`gradient_descent`](@ref) solver requires the following functions of your manifold to be available + +* A retraction; if you do not want to specify them directly, [`default_retraction_method`](https://juliamanifolds.github.io/ManifoldsBase.jl/stable/retractions/#ManifoldsBase.default_retraction_method-Tuple{AbstractManifold}) should be implemented as well. +* By default gradient descent uses [`ArmijoLinesearch`](@ref) which requires [`max_stepsize`](@ref)`(M)` to be set and an implementation of [`norm`](https://juliamanifolds.github.io/ManifoldsBase.jl/stable/functions/#LinearAlgebra.norm-Tuple{AbstractManifold,%20Any,%20Any})`(M, p, X)`. +* By default the stopping criterion uses the [`norm`](https://juliamanifolds.github.io/ManifoldsBase.jl/stable/functions/#LinearAlgebra.norm-Tuple{AbstractManifold,%20Any,%20Any}) as well, to check for a small gradient + ## Literature ```@bibliography -Pages = ["solvers/gradient_descent.md"] +Pages = ["gradient_descent.md"] Canonical=false Luenberger:1972 diff --git a/docs/src/solvers/particle_swarm.md b/docs/src/solvers/particle_swarm.md index c215fca2aa..a219a817ba 100644 --- a/docs/src/solvers/particle_swarm.md +++ b/docs/src/solvers/particle_swarm.md @@ -18,6 +18,6 @@ ParticleSwarmState ## Literature ```@bibliography -Pages = ["solvers/particle_swarm.md"] +Pages = ["particle_swarm.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/primal_dual_semismooth_Newton.md b/docs/src/solvers/primal_dual_semismooth_Newton.md index e56dc64d12..e007783d05 100644 --- a/docs/src/solvers/primal_dual_semismooth_Newton.md +++ b/docs/src/solvers/primal_dual_semismooth_Newton.md @@ -8,13 +8,13 @@ The aim is to solve an optimization problem on a manifold with a cost function o F(p) + G(Λ(p)), ``` -where $F:\mathcal M → \overline{ℝ}$, $G:\mathcal N → \overline{ℝ}$, and -$Λ:\mathcal M →\mathcal N$. -If the manifolds $\mathcal M$ or $\mathcal N$ are not Hadamard, it has to be considered locally, -i.e. on geodesically convex sets $\mathcal C \subset \mathcal M$ and $\mathcal D \subset\mathcal N$ -such that $Λ(\mathcal C) \subset \mathcal D$. +where ``F:\mathcal M → \overline{ℝ}``, ``G:\mathcal N → \overline{ℝ}``, and +``Λ:\mathcal M →\mathcal N``. +If the manifolds ``\mathcal M`` or ``\mathcal N`` are not Hadamard, it has to be considered locally, +i.e. on geodesically convex sets ``\mathcal C \subset \mathcal M`` and ``\mathcal D \subset\mathcal N`` +such that ``Λ(\mathcal C) \subset \mathcal D``. -The algorithm comes down to applying the Riemannian semismooth Newton method to the rewritten primal-dual optimality conditions, i.e., we define the vector field $X: \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N} \rightarrow \mathcal{T} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}$ as +The algorithm comes down to applying the Riemannian semismooth Newton method to the rewritten primal-dual optimality conditions, i.e., we define the vector field ``X: \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N} \rightarrow \mathcal{T} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}`` as ```math X\left(p, \xi_{n}\right):=\left(\begin{array}{c} @@ -23,13 +23,13 @@ X\left(p, \xi_{n}\right):=\left(\begin{array}{c} \end{array}\right) ``` -and solve for $X(p,ξ_{n})=0$. +and solve for ``X(p,ξ_{n})=0``. -Given base points $m∈\mathcal C$, $n=Λ(m)∈\mathcal D$, -initial primal and dual values $p^{(0)} ∈\mathcal C$, $ξ_{n}^{(0)} ∈ \mathcal T_{n}^{*}\mathcal N$, -and primal and dual step sizes $\sigma$, $\tau$. +Given base points ``m∈\mathcal C``, ``n=Λ(m)∈\mathcal D``, +initial primal and dual values ``p^{(0)} ∈\mathcal C``, ``ξ_{n}^{(0)} ∈ \mathcal T_{n}^{*}\mathcal N``, +and primal and dual step sizes ``\sigma``, ``\tau``. -The algorithms performs the steps $k=1,…,$ (until a [`StoppingCriterion`](@ref) is reached) +The algorithms performs the steps ``k=1,…,`` (until a [`StoppingCriterion`](@ref) is reached) 1. Choose any element ```math @@ -40,7 +40,7 @@ The algorithms performs the steps $k=1,…,$ (until a [`StoppingCriterion`](@ref ```math V^{(k)} [(d_p^{(k)}, d_n^{(k)})] = - X(p^{(k)},ξ_n^{(k)}) ``` - in the vector space $\mathcal{T}_{p^{(k)}} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}$ + in the vector space ``\mathcal{T}_{p^{(k)}} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}`` 3. Update ```math p^{(k+1)} := \exp_{p^{(k)}}(d_p^{(k)}) @@ -53,9 +53,9 @@ The algorithms performs the steps $k=1,…,$ (until a [`StoppingCriterion`](@ref Furthermore you can exchange the exponential map, the logarithmic map, and the parallel transport by a retraction, an inverse retraction and a vector transport. -Finally you can also update the base points $m$ and $n$ during the iterations. +Finally you can also update the base points ``m`` and ``n`` during the iterations. This introduces a few additional vector transports. The same holds for the case that -$Λ(m^{(k)})\neq n^{(k)}$ at some point. All these cases are covered in the algorithm. +``Λ(m^{(k)})\neq n^{(k)}`` at some point. All these cases are covered in the algorithm. ```@meta CurrentModule = Manopt @@ -75,6 +75,6 @@ PrimalDualSemismoothNewtonState ## Literature ```@bibliography -Pages = ["solvers/primal_dual_semismooth_Newton.md"] +Pages = ["primal_dual_semismooth_Newton.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/quasi_Newton.md b/docs/src/solvers/quasi_Newton.md index 7d71d4104c..a14e992344 100644 --- a/docs/src/solvers/quasi_Newton.md +++ b/docs/src/solvers/quasi_Newton.md @@ -115,6 +115,6 @@ QuasiNewtonState ## Literature ```@bibliography -Pages = ["solvers/quasi_Newton.md"] +Pages = ["quasi_Newton.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/truncated_conjugate_gradient_descent.md b/docs/src/solvers/truncated_conjugate_gradient_descent.md index c6d797fccc..31da535568 100644 --- a/docs/src/solvers/truncated_conjugate_gradient_descent.md +++ b/docs/src/solvers/truncated_conjugate_gradient_descent.md @@ -1,128 +1,18 @@ # [Steihaug-Toint Truncated Conjugate-Gradient Method](@id tCG) -The aim is to solve the trust-region subproblem +Solve the constraint optimization problem on the tangent space ```math -\operatorname*{arg\,min}_{η ∈ T_{x}\mathcal{M}} m_{x}(η) = F(x) + -⟨\operatorname{grad}F(x), η⟩_{x} + \frac{1}{2} ⟨ -\mathcal{H}[η], η⟩_{x} +\begin{align*} +\operatorname*{arg\,min}_{Y ∈ T_p\mathcal{M}}&\ m_p(Y) = f(p) + +⟨\operatorname{grad}f(p), Y⟩_p + \frac{1}{2} ⟨\mathcal{H}_p[Y], Y⟩_p\\ +\text{such that}& \ \lVert Y \rVert_p ≤ Δ +\end{align*} ``` -```math -\text{s.t.} \; ⟨η, η⟩_{x} \leq {Δ}^2 -``` - -on a manifold by using the Steihaug-Toint truncated conjugate-gradient method, -abbreviated tCG-method. -All terms involving the trust-region radius use an inner product w.r.t. the -preconditioner; this is because the iterates grow in length w.r.t. the -preconditioner, guaranteeing that we do not re-enter the trust-region. - -## Initialization - -Initialize ``η_0 = η`` if using randomized approach and -``η`` the zero tangent vector otherwise, ``r_0 = \operatorname{grad}F(x)``, -``z_0 = \operatorname{P}(r_0)``, ``δ_0 = z_0`` and ``k=0`` - -## Iteration - -Repeat until a convergence criterion is reached - -1. Set ``α =\frac{⟨r_k, z_k⟩_x}{⟨δ_k, \mathcal{H}[δ_k]⟩_x}`` and - ``⟨η_k, η_k⟩_{x}^* = ⟨η_k, \operatorname{P}(η_k)⟩_x + - 2α ⟨η_k, \operatorname{P}(δ_k)⟩_{x} + {α}^2 - ⟨δ_k, \operatorname{P}(δ_k)⟩_{x}``. -2. If ``⟨δ_k, \mathcal{H}[δ_k]⟩_x ≤ 0`` or ``⟨η_k, η_k⟩_x^* ≥ Δ^2`` - return ``η_{k+1} = η_k + τ δ_k`` and stop. -3. Set ``η_{k}^*= η_k + α δ_k``, if - ``⟨η_k, η_k⟩_{x} + \frac{1}{2} ⟨η_k, - \operatorname{Hess}[F] (η_k)_{x}⟩_{x} ≤ ⟨η_k^*, - η_k^*⟩_{x} + \frac{1}{2} ⟨η_k^*, - \operatorname{Hess}[F] (η_k)_ {x}⟩_{x}`` - set ``η_{k+1} = η_k`` else set ``η_{k+1} = η_{k}^*``. -4. Set ``r_{k+1} = r_k + α \mathcal{H}[δ_k]``, - ``z_{k+1} = \operatorname{P}(r_{k+1})``, - ``β = \frac{⟨r_{k+1}, z_{k+1}⟩_{x}}{⟨r_k, z_k - ⟩_{x}}`` and ``δ_{k+1} = -z_{k+1} + β δ_k``. -5. Set ``k=k+1``. - -## Result - -The result is given by the last computed ``η_k``. - -## Remarks - -The ``\operatorname{P}(⋅)`` denotes the symmetric, positive definite -preconditioner. It is required if a randomized approach is used i.e. using -a random tangent vector ``η_0`` as the initial -vector. The idea behind it is to avoid saddle points. Preconditioning is -simply a rescaling of the variables and thus a redefinition of the shape of -the trust region. Ideally ``\operatorname{P}(⋅)`` is a cheap, positive -approximation of the inverse of the Hessian of ``F`` at ``x``. On -default, the preconditioner is just the identity. - -To step number 2: obtain ``τ`` from the positive root of -``\left\lVert η_k + τ δ_k \right\rVert_{\operatorname{P}, x} = Δ`` -what becomes after the conversion of the equation to - -````math - τ = \frac{-⟨η_k, \operatorname{P}(δ_k)⟩_{x} + - \sqrt{⟨η_k, \operatorname{P}(δ_k)⟩_{x}^{2} + - ⟨δ_k, \operatorname{P}(δ_k)⟩_{x} ( Δ^2 - - ⟨η_k, \operatorname{P}(η_k)⟩_{x})}} - {⟨δ_k, \operatorname{P}(δ_k)⟩_{x}}. -```` - -It can occur that ``⟨δ_k, \operatorname{Hess}[F] (δ_k)_{x}⟩_{x} -= κ ≤ 0`` at iteration ``k``. In this case, the model is not strictly -convex, and the stepsize ``α =\frac{⟨r_k, z_k⟩_{x}} -{κ}`` computed in step 1. does not give a reduction in the model function -``m_x(⋅)``. Indeed, ``m_x(⋅)`` is unbounded from below along the -line ``η_k + α δ_k``. If our aim is to minimize the model within -the trust-region, it makes far more sense to reduce ``m_x(⋅)`` along -``η_k + α δ_k`` as much as we can while staying within the -trust-region, and this means moving to the trust-region boundary along this -line. Thus, when ``κ ≤ 0`` at iteration k, we replace -``α = \frac{⟨r_k, z_k⟩_{x}}{κ}`` with ``τ`` described as above. -The other possibility is that ``η_{k+1}`` would lie outside the trust-region at -iteration k (i.e. ``⟨η_k, η_k⟩_{x}^{* } ≥ {Δ}^2`` -that can be identified with the norm of ``η_{k+1}``). In -particular, when ``\operatorname{Hess}[F] (⋅)_{x}`` is positive definite -and ``η_{k+1}`` lies outside the trust region, the solution to the -trust-region problem must lie on the trust-region boundary. Thus, there -is no reason to continue with the conjugate gradient iteration, as it -stands, as subsequent iterates will move further outside the trust-region -boundary. A sensible strategy, just as in the case considered above, is to -move to the trust-region boundary by finding ``τ``. - -Although it is virtually impossible in practice to know how many iterations are -necessary to provide a good estimate ``η_{k}`` of the trust-region subproblem, -the method stops after a certain number of iterations, which is realised by -[`StopAfterIteration`](@ref). In order to increase the convergence rate of -the underlying trust-region method, see -[`trust_regions`](@ref), a typical stopping criterion -is to stop as soon as an iteration ``k`` is reached for which - -```math - \Vert r_k \Vert_x \leqq \Vert r_0 \Vert_x \min \left( \Vert r_0 \Vert^{θ}_x, κ \right) -``` - -holds, where ``0 < κ < 1`` and ``θ > 0`` are chosen in advance. This is -realized in this method by [`StopWhenResidualIsReducedByFactorOrPower`](@ref). -It can be shown that under appropriate conditions the iterates ``x_k`` -of the underlying trust-region method converge to nondegenerate critical -points with an order of convergence of at least ``\min \left( θ + 1, 2 \right)``, -see [Absil, Mahony, Sepulchre, Princeton University Press, 2008](@cite AbsilMahonySepulchre:2008). -The method also aborts if the curvature of the model is negative, i.e. if -``\langle \delta_k, \mathcal{H}[δ_k] \rangle_x \leqq 0``, which is realised by -[`StopWhenCurvatureIsNegative`](@ref). If the next possible approximate -solution ``η_{k}^{*}`` calculated in iteration ``k`` lies outside the -trust region, i.e. if ``\lVert η_{k}^{*} \rVert_x \geq Δ``, then the method -aborts, which is realised by [`StopWhenTrustRegionIsExceeded`](@ref). -Furthermore, the method aborts if the new model value evaluated at ``η_{k}^{*}`` -is greater than the previous model value evaluated at ``η_{k}``, which -is realised by [`StopWhenModelIncreased`](@ref). - +on the tangent space ``T_p\mathcal M`` of a Riemannian manifold ``\mathcal M`` by using the Steihaug-Toint truncated conjugate-gradient (tCG) method, +see [AbsilBakerGallivan:2006](@cite), Algorithm 2, and [ConnGouldToint:2000](@cite). +Here ``\mathcal H_p`` is either the Hessian ``\operatorname{Hess} f(p)`` or a linear symmetric operator on the tangent space approximating the Hessian. ## Interface @@ -148,9 +38,15 @@ update_stopping_criterion!(::StopWhenResidualIsReducedByFactorOrPower, ::Val{:Re update_stopping_criterion!(::StopWhenResidualIsReducedByFactorOrPower, ::Val{:ResidualFactor}, ::Any) ``` +## Trust Region Model + +```@docs +TrustRegionModelObjective +``` + ## Literature ```@bibliography -Pages = ["solvers/truncated_conjugate_gradient_descent.md"] +Pages = ["truncated_conjugate_gradient_descent.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/solvers/trust_regions.md b/docs/src/solvers/trust_regions.md index 2e0f4c9c27..8cf54a4c56 100644 --- a/docs/src/solvers/trust_regions.md +++ b/docs/src/solvers/trust_regions.md @@ -1,131 +1,34 @@ # [The Riemannian Trust-Regions Solver](@id trust_regions) -The aim is to solve an optimization problem on a manifold +Minimize a function ```math -\operatorname*{min}_{x ∈ \mathcal{M}} F(x) +\operatorname*{\arg\,min}_{p ∈ \mathcal{M}}\ f(p) ``` -by using the Riemannian trust-regions solver. It is number one choice for smooth -optimization. This trust-region method uses the Steihaug-Toint truncated -conjugate-gradient method [`truncated_conjugate_gradient_descent`](@ref) -to solve the inner minimization problem called the -trust-regions subproblem. This inner solver can be preconditioned by providing -a preconditioner (symmetric and positive definite, an approximation of the -inverse of the Hessian of ``F``). If no Hessian of the cost function ``F`` is -provided, a standard approximation of the Hessian based on the gradient -``\operatorname{grad}F`` with [`ApproxHessianFiniteDifference`](@ref) will be computed. - -## Initialization - -Initialize ``x_0 = x`` with an initial point ``x`` on the manifold. It can be -given by the caller or set randomly. Set the initial trust-region radius -``\Delta =\frac{1}{8} \bar{\Delta}`` where ``\bar{\Delta}`` is the maximum radius -the trust-region can have. Usually one uses -the root of the manifold's dimension ``\operatorname{dim}(\mathcal{M})``. -For accepting the next iterate and evaluating the new trust-region radius, one -needs an accept/reject threshold ``\rho' ∈ [0,\frac{1}{4})``, which is -``\rho' = 0.1`` on default. Set ``k=0``. - -## Iteration - -Repeat until a convergence criterion is reached - -1. Set ``η`` as a random tangent vector if using randomized approach. Else - set ``η`` as the zero vector in the tangential space ``T_{x_k}\mathcal{M}``. -2. Set ``η^*`` as the solution of the trust-region subproblem, computed by - the tcg-method with ``η`` as initial vector. -3. If using randomized approach, compare ``η^*`` with the Cauchy point - ``η_{c}^* = -\tau_{c} \frac{\Delta}{\lVert \operatorname{Grad}[F] (x_k) \rVert_{x_k}} \operatorname{Grad}[F] (x_k)`` by the model function ``m_{x_k}(⋅)``. If the - model decrease is larger by using the Cauchy point, set - ``η^* = η_{c}^*``. -4. Set ``{x}^* = \operatorname{retr}_{x_k}(η^*)``. -5. Set ``\rho = \frac{F(x_k)-F({x}^*)}{m_{x_k}(η)-m_{x_k}(η^*)}``, where - ``m_{x_k}(⋅)`` describes the quadratic model function. -6. Update the trust-region radius:``\Delta = \begin{cases}\frac{1}{4} \Delta &\text{ if } \rho < \frac{1}{4} \, \text{or} \, m_{x_k}(η)-m_{x_k}(η^*) \leq 0 \, \text{or} \, \rho = \pm ∈ fty , \\\operatorname{min}(2 \Delta, \bar{\Delta}) &\text{ if } \rho > \frac{3}{4} \, \text{and the tcg-method stopped because of negative curvature or exceeding the trust-region},\\\Delta & \, \text{otherwise.}\end{cases}`` -7. If ``m_{x_k}(η)-m_{x_k}(η^*) \geq 0`` and ``\rho > \rho'`` set - ``x_k = {x}^*``. -8. Set ``k = k+1``. - -## Result - -The result is given by the last computed ``x_k``. - -## Remarks - -To the initialization: a random point on the manifold. - -To step number 1: using a randomized approach means using a random tangent -vector as initial vector for the approximate solve of the trust-regions -subproblem. If this is the case, keep in mind that the vector must be in the -trust-region radius. This is achieved by multiplying -`η` by `sqrt(4,eps(Float64))` as long as -its norm is greater than the current trust-region radius ``\Delta``. -For not using randomized approach, one can get the zero tangent vector. - -To step number 2: obtain ``η^*`` by (approximately) solving the -trust-regions subproblem +by using the Riemannian trust-regions solver following [AbsilBakerGallivan:2006](@cite), +i.e. by building a lifted model at the ``k``th iterate ``p_k`` by locally mapping the +cost function ``f`` to the tangent space as ``f_k: T_{p_k}\mathcal M → \mathbb R`` as +``f_k(X) = f(\operatorname{retr}_{p_k}(X))``. +We then define the trust region subproblem as ```math -\operatorname*{arg\,min}_{η ∈ T_{x_k}\mathcal{M}} m_{x_k}(η) = F(x_k) + -\langle \operatorname{grad}F(x_k), η \rangle_{x_k} + \frac{1}{2} \langle -\operatorname{Hess}[F](η)_ {x_k}, η \rangle_{x_k} +\operatorname*{arg\,min}_{X ∈ T_{p_k}\mathcal M}\ m_k(X), ``` -```math -\text{s.t.} \; \langle η, η \rangle_{x_k} \leq {\Delta}^2 -``` - -with the Steihaug-Toint truncated conjugate-gradient (tcg) method. The problem -as well as the solution method is described in the -[`truncated_conjugate_gradient_descent`](@ref). In this inner solver, the -stopping criterion [`StopWhenResidualIsReducedByFactorOrPower`](@ref) so that superlinear or at least linear convergence in the trust-region method can be achieved. - -To step number 3: if using a random tangent vector as an initial vector, compare -the result of the tcg-method with the Cauchy point. Convergence proofs assume -that one achieves at least (a fraction of) the reduction of the Cauchy point. -The idea is to go in the direction of the gradient to an optimal point. This -can be on the edge, but also before. -The parameter ``\tau_{c}`` for the optimal length is defined by +where ```math -\tau_{c} = \begin{cases} 1 & \langle \operatorname{Grad}[F] (x_k), \, -\operatorname{Hess}[F] (η_k)_ {x_k}\rangle_{x_k} \leq 0 , \\ -\operatorname{min}(\frac{{\operatorname{norm}(\operatorname{Grad}[F] (x_k))}^3} -{\Delta \langle \operatorname{Grad}[F] (x_k), \, -\operatorname{Hess}[F] (η_k)_ {x_k}\rangle_{x_k}}, 1) & \, \text{otherwise.} -\end{cases} +\begin{align*} +m_k&: T_{p_K}\mathcal M → \mathbb R,\\ +m_k(X) &= f(p_k) + ⟨\operatorname{grad} f(p_k), X⟩_{p_k} + \frac{1}{2}\langle \mathcal H_k(X),X⟩_{p_k}\\ +\text{such that}&\ \lVert X \rVert_{p_k} ≤ Δ_k. +\end{align*} ``` -To check the model decrease one compares +Here ``Δ_k`` is a trust region radius, that is adapted every iteration, and ``\mathcal H_k`` is +some symmetric linear operator that approximates the Hessian ``\operatorname{Hess} f`` of ``f``. -```math -m_{x_k}(η_{c}^*) = F(x_k) + \langle η_{c}^*, -\operatorname{Grad}[F] (x_k)\rangle_{x_k} + \frac{1}{2}\langle η_{c}^*, -\operatorname{Hess}[F] (η_{c}^*)_ {x_k}\rangle_{x_k} -``` - -with - -```math -m_{x_k}(η^*) = F(x_k) + \langle η^*, -\operatorname{Grad}[F] (x_k)\rangle_{x_k} + \frac{1}{2}\langle η^*, -\operatorname{Hess}[F] (η^*)_ {x_k}\rangle_{x_k}. -``` - -If ``m_{x_k}(η_{c}^*) < m_{x_k}(η^*)`` then ``m_{x_k}(η_{c}^*)`` is the better choice. - -To step number 4: ``\operatorname{retr}_{x_k}(⋅)`` denotes the retraction, a -mapping ``\operatorname{retr}_{x_k}:T_{x_k}\mathcal{M} \rightarrow \mathcal{M}`` -which approximates the exponential map. In some cases it is cheaper to use this -instead of the exponential. - -To step number 6: one knows that the [`truncated_conjugate_gradient_descent`](@ref) algorithm stopped for -these reasons when the stopping criteria [`StopWhenCurvatureIsNegative`](@ref), -[`StopWhenTrustRegionIsExceeded`](@ref) are activated. - -To step number 7: the last step is to decide if the new point ``{x}^*`` is -accepted. ## Interface @@ -159,6 +62,6 @@ Manopt.AbstractApproxHessian ## Literature ```@bibliography -Pages = ["solvers/trust_regions.md"] +Pages = ["trust_regions.md"] Canonical=false -``` \ No newline at end of file +``` diff --git a/docs/src/tutorials/GeodesicRegression.md b/docs/src/tutorials/GeodesicRegression.md index 477d931807..8681d2ce15 100644 --- a/docs/src/tutorials/GeodesicRegression.md +++ b/docs/src/tutorials/GeodesicRegression.md @@ -514,7 +514,6 @@ Note that the geodesics from the data to the regression geodesic meet at a nearl ## Literature ```@bibliography -Pages = ["tutorials/GeodesicRegression.md"] +Pages = ["GeodesicRegression.md"] Canonical=false ``` - diff --git a/docs/src/tutorials/HowToDebug.md b/docs/src/tutorials/HowToDebug.md index 10b870cb10..fb5c1e3f6b 100644 --- a/docs/src/tutorials/HowToDebug.md +++ b/docs/src/tutorials/HowToDebug.md @@ -74,7 +74,7 @@ There is two more advanced variants that can be used. The first is a tuple of a We can for example change the way the `:ϵ` is printed by adding a format string and use [`DebugCost`](@ref)`()` which is equivalent to using `:Cost`. -Especially with the format change, the lines are more consistent in length. +Especially with the format change, the lines are more coniststent in length. ``` julia p2 = exact_penalty_method( diff --git a/src/Manopt.jl b/src/Manopt.jl index 6b6095170d..83a99a2dc3 100644 --- a/src/Manopt.jl +++ b/src/Manopt.jl @@ -162,6 +162,7 @@ include("solvers/difference_of_convex_algorithm.jl") include("solvers/difference-of-convex-proximal-point.jl") include("solvers/DouglasRachford.jl") include("solvers/exact_penalty_method.jl") +include("solvers/Lanczos.jl") include("solvers/NelderMead.jl") include("solvers/FrankWolfe.jl") include("solvers/gradient_descent.jl") @@ -266,7 +267,7 @@ export AbstractGradientSolverState, TruncatedConjugateGradientState, TrustRegionsState -export FrankWolfeCost, FrankWolfeGradient +# Objectives and Costs export NelderMeadSimplex export AlternatingGradient # @@ -324,6 +325,8 @@ export ConstraintType, FunctionConstraint, VectorConstraint # Subproblem cost/grad export AugmentedLagrangianCost, AugmentedLagrangianGrad, ExactPenaltyCost, ExactPenaltyGrad export ProximalDCCost, ProximalDCGrad, LinearizedDCCost, LinearizedDCGrad +export FrankWolfeCost, FrankWolfeGradient +export TrustRegionModelObjective export QuasiNewtonState, QuasiNewtonLimitedMemoryDirectionUpdate export QuasiNewtonMatrixDirectionUpdate @@ -399,7 +402,7 @@ export solve! export ApproxHessianFiniteDifference, ApproxHessianSymmetricRankOne, ApproxHessianBFGS export update_hessian!, update_hessian_basis! export ExactPenaltyCost, ExactPenaltyGrad, AugmentedLagrangianCost, AugmentedLagrangianGrad -export AdaptiveRegularizationCubicCost, AdaptiveRegularizationCubicGrad +export AdaptiveRagularizationWithCubicsModelObjective # # Stepsize export Stepsize diff --git a/src/deprecated.jl b/src/deprecated.jl index 53d39698b1..03e7a566d9 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -2,3 +2,14 @@ Base.@deprecate_binding HeestenesStiefelCoefficient HestenesStiefelCoefficient export HeestenesStiefelCoefficient Base.@deprecate_binding SimpleCacheObjective SimpleManifoldCachedObjective export SimpleCacheObjective +# +# Deprecated tCG calls +# +@deprecate truncated_conjugate_gradient_descent( + M::AbstractManifold, F, gradF, x, Y, H::TH; kwargs... +) where {TH<:Function} truncated_conjugate_gradient_descent(M, F, gradF, H, x, Y; kwargs...) +@deprecate truncated_conjugate_gradient_descent!( + M::AbstractManifold, F::TF, gradF::TG, x, Y, H::TH; kwargs... +) where {TF<:Function,TG<:Function,TH<:Function} truncated_conjugate_gradient_descent!( + M, F, gradF, H, x, Y; kwargs... +) diff --git a/src/functions/gradients.jl b/src/functions/gradients.jl index d1dec11b92..e77c784c3e 100644 --- a/src/functions/gradients.jl +++ b/src/functions/gradients.jl @@ -500,7 +500,7 @@ function grad_TV2!(M::PowerManifold, X, q, p::Int=1) c = costTV2(M, q, p, false) for i in R # iterate over all pixel di = 0.0 - for k in 1:d # for all direction combinations (TODO) + for k in 1:d # for all direction combinations ek = CartesianIndex(ntuple(i -> (i == k) ? 1 : 0, d)) #k th unit vector jF = i + ek # compute forward neighbor jB = i - ek # compute backward neighbor diff --git a/src/functions/manifold_functions.jl b/src/functions/manifold_functions.jl index e384273476..4f3851fcaf 100644 --- a/src/functions/manifold_functions.jl +++ b/src/functions/manifold_functions.jl @@ -28,7 +28,7 @@ Reflect the point `x` from the manifold `M` at point `p`, i.e. \operatorname{refl}_p(x) = \operatorname{retr}_p(-\operatorname{retr}^{-1}_p x). ```` -where $\operatorname{retr}$ and $\operatorname{retr}^{-1}$ denote a retraction and an inverse +where ``\operatorname{retr}`` and ``\operatorname{retr}^{-1}`` denote a retraction and an inverse retraction, respectively. This can also be done in place of `q`. @@ -55,8 +55,6 @@ function reflect( M, p, -inverse_retract(M, p, x, inverse_retraction_method), retraction_method ) end - -# TODO: adapt function reflect!( M::AbstractManifold, q, diff --git a/src/helpers/exports/Asymptote.jl b/src/helpers/exports/Asymptote.jl index 220bec7ad7..d047ee0152 100644 --- a/src/helpers/exports/Asymptote.jl +++ b/src/helpers/exports/Asymptote.jl @@ -1,6 +1,6 @@ @doc raw""" asymptote_export_S2_signals(filename; points, curves, tangent_vectors, colors, options...) -Export given `points`, `curves`, and `tangent_vectors` on the sphere $\mathbb S^2$ +Export given `points`, `curves`, and `tangent_vectors` on the sphere ``\mathbb S^2`` to Asymptote. # Input diff --git a/src/plans/adabtive_regularization_with_cubics_plan.jl b/src/plans/adabtive_regularization_with_cubics_plan.jl index 7dda3f8d7e..916d070f9a 100644 --- a/src/plans/adabtive_regularization_with_cubics_plan.jl +++ b/src/plans/adabtive_regularization_with_cubics_plan.jl @@ -1,99 +1,106 @@ @doc raw""" - AdaptiveRegularizationCubicCost + AdaptiveRagularizationWithCubicsModelObjective -We define the model ``m(X)`` in the tangent space of the current iterate ``p=p_k`` as +A model for the adaptive regularization with Cubics ```math - m(X) = f(p) + - + \frac{1}{2} + \frac{σ}{3} \lVert X \rVert^3 +m(X) = f(p) + ⟨\operatorname{grad} f(p), X ⟩_p + \frac{1}{2} ⟨\operatorname{Hess} f(p)[X], X⟩_p + + \frac{σ}{3} \lVert X \rVert^3, ``` +cf. Eq. (33) in [AgarwalBoumalBullinsCartis:2020](@cite) + # Fields -* `mho` – an [`AbstractManifoldObjective`](@ref) that should provide at least [`get_cost`](@ref), [`get_gradient`](@ref) and [`get_hessian`](@ref). -* `σ` – the current regularization parameter -* `X` – a storage for the gradient at `p` of the original cost -# Constructors +* `objective` – an [`AbstractManifoldHessianObjective`](@ref) proving ``f``, its gradient and Hessian +* `σ` – the current (cubic) regularization parameter - AdaptiveRegularizationCubicCost(mho, σ, X) - AdaptiveRegularizationCubicCost(M, mho, σ; p=rand(M), X=get_gradient(M, mho, p)) +# Constructors -Initialize the cubic cost to the objective `mho`, regularization parameter `σ`, and -(temporary) gradient `X`. + AdaptiveRagularizationWithCubicsModelObjective(mho, σ=1.0) -!!! note - For this gradient function to work, we require the [`TangentSpaceAtPoint`](https://juliamanifolds.github.io/Manifolds.jl/latest/manifolds/vector_bundle.html#Manifolds.TangentSpaceAtPoint) - from `Manifolds.jl` +with either an [`AbstractManifoldHessianObjective`](@ref) `objective` or an decorator containing such an objective. """ -mutable struct AdaptiveRegularizationCubicCost{T,R,O<:AbstractManifoldObjective} - mho::O +mutable struct AdaptiveRagularizationWithCubicsModelObjective{ + E<:AbstractEvaluationType, + O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective}, + R, +} <: AbstractManifoldSubObjective{E,O} + objective::O σ::R - X::T end -function AdaptiveRegularizationCubicCost( - M, mho::O, σ::R=1.0; p::P=rand(M), X::T=get_gradient(M, mho, p) -) where {P,T,R,O} - return AdaptiveRegularizationCubicCost{T,R,O}(mho, σ, X) +function AdaptiveRagularizationWithCubicsModelObjective( + mho::O, σ::R=1.0 +) where { + E,O<:Union{AbstractManifoldHessianObjective{E},AbstractDecoratedManifoldObjective{E}},R +} + return AdaptiveRagularizationWithCubicsModelObjective{E,O,R}(mho, σ) end -function set_manopt_parameter!(f::AdaptiveRegularizationCubicCost, ::Val{:X}, X) - f.X = X - return f -end -function set_manopt_parameter!(f::AdaptiveRegularizationCubicCost, ::Val{:σ}, σ) +function set_manopt_parameter!( + f::AdaptiveRagularizationWithCubicsModelObjective, + ::Union{Val{:σ},Val{:RegularizationParameter}}, + σ, +) f.σ = σ return f end +get_objective(arcmo::AdaptiveRagularizationWithCubicsModelObjective) = arcmo.objective + @doc raw""" - AdaptiveRegularizationCubicGrad + get_cost(TpM, trmo::AdaptiveRagularizationWithCubicsModelObjective, X) -We define the model ``m(X)`` in the tangent space of the current iterate ``p=p_k`` as +Evaluate the tangent space [`AdaptiveRagularizationWithCubicsModelObjective`](@ref) ```math - m(X) = f(p) + - + \frac{1}{2} + \frac{σ}{3} \lVert X \rVert^3 +m(X) = f(p) + ⟨\operatorname{grad} f(p), X ⟩_p + \frac{1}{2} ⟨\operatorname{Hess} f(p)[X], X⟩_p + + \frac{σ}{3} \lVert X \rVert^3, ``` -This struct represents its gradient, given by +at `X`, cf. Eq. (33) in [AgarwalBoumalBullinsCartis:2020](@cite). +""" +function get_cost( + TpM::TangentSpace, arcmo::AdaptiveRagularizationWithCubicsModelObjective, X +) + M = base_manifold(TpM) + p = TpM.point + c = get_objective_cost(M, arcmo, p) + G = get_objective_gradient(M, arcmo, p) + Y = get_objective_hessian(M, arcmo, p, X) + return c + inner(M, p, G, X) + 1 / 2 * inner(M, p, Y, X) + arcmo.σ / 3 * norm(M, p, X)^3 +end +function get_cost_function(arcmo::AdaptiveRagularizationWithCubicsModelObjective) + return (TpM, X) -> get_cost(TpM, arcmo, X) +end +@doc raw""" + get_gradient(TpM, trmo::AdaptiveRagularizationWithCubicsModelObjective, X) + +Evaluate the gradient of the [`AdaptiveRagularizationWithCubicsModelObjective`](@ref) ```math - \operatorname{grad} m(X) = \operatorname{grad}f(p) + \operatorname{Hess} f(p)[X] + σ \lVert X \rVert X +\operatorname{grad} m(X) = \operatorname{grad} f(p) + \operatorname{Hess} f(p)[X] + + σ\lVert X \rVert X, ``` -# Fields - -* `mho` – an [`AbstractManifoldObjective`](@ref) that should provide at least [`get_cost`](@ref), [`get_gradient`](@ref) and [`get_hessian`](@ref). -* `σ` – the current regularization parameter -* `X` – a storage for the gradient at `p` of the original cost - -# Constructors - - AdaptiveRegularizationCubicGrad(mho, σ, X) - AdaptiveRegularizationCubicGrad(M, mho, σ; p=rand(M), X=get_gradient(M, mho, p)) - -Initialize the cubic cost to the original objective `mho`, regularization parameter `σ`, and -(temporary) gradient `X`. - -!!! note - * For this gradient function to work, we require the [`TangentSpaceAtPoint`](https://juliamanifolds.github.io/Manifolds.jl/latest/manifolds/vector_bundle.html#Manifolds.TangentSpaceAtPoint) - from `Manifolds.jl` - * The gradient functor provides both an allocating as well as an in-place variant. +at `X`, cf. Eq. (37) in [AgarwalBoumalBullinsCartis:2020](@cite). """ -mutable struct AdaptiveRegularizationCubicGrad{T,R,O<:AbstractManifoldObjective} - mho::O - σ::R - X::T -end -function AdaptiveRegularizationCubicGrad( - M, mho::O, σ::R=1.0; p::P=rand(M), X::T=get_gradient(M, mho, p) -) where {P,T,R,O} - return AdaptiveRegularizationCubicGrad{T,R,O}(mho, σ, X) +function get_gradient( + TpM::TangentSpace, arcmo::AdaptiveRagularizationWithCubicsModelObjective, X +) + M = base_manifold(TpM) + p = TpM.point + G = get_objective_gradient(M, arcmo, p) + return G + get_objective_hessian(M, arcmo, p, X) + arcmo.σ * norm(M, p, X) * X end -function set_manopt_parameter!(f::AdaptiveRegularizationCubicGrad, ::Val{:X}, X) - f.X = X - return f +function get_gradient!( + TpM::TangentSpace, Y, arcmo::AdaptiveRagularizationWithCubicsModelObjective, X +) + M = base_manifold(TpM) + p = TpM.point + get_objective_hessian!(M, Y, arcmo, p, X) + Y .= Y + get_objective_gradient(M, arcmo, p) + arcmo.σ * norm(M, p, X) * X + return Y end -function set_manopt_parameter!(f::AdaptiveRegularizationCubicGrad, ::Val{:σ}, σ) - f.σ = σ - return f +function get_gradient_function(arcmo::AdaptiveRagularizationWithCubicsModelObjective) + return (TpM, X) -> get_gradient(TpM, arcmo, X) end diff --git a/src/plans/debug.jl b/src/plans/debug.jl index cd1a2a0a89..b68e19d402 100644 --- a/src/plans/debug.jl +++ b/src/plans/debug.jl @@ -69,6 +69,18 @@ function DebugSolverState( ) where {S<:AbstractManoptSolverState} return DebugSolverState{S}(st, DebugFactory(format)) end + +""" + set_manopt_parameter!(ams::DebugSolverState, ::Val{:Debug}, args...) + +Set certain values specified by `args...` into the elements of the `debugDictionary` +""" +function set_manopt_parameter!(dss::DebugSolverState, ::Val{:Debug}, args...) + for d in values(dss.debugDictionary) + set_manopt_parameter!(d, args...) + end + return dss +end function status_summary(dst::DebugSolverState) if length(dst.debugDictionary) > 0 s = "" diff --git a/src/plans/gradient_plan.jl b/src/plans/gradient_plan.jl index 3b2efa594b..17148c8e04 100644 --- a/src/plans/gradient_plan.jl +++ b/src/plans/gradient_plan.jl @@ -1,7 +1,7 @@ @doc raw""" AbstractManifoldGradientObjective{E<:AbstractEvaluationType, TC, TG} <: AbstractManifoldCostObjective{E, TC} -An abstract type for all functions that provide a (full) gradient, where +An abstract type for all objectives that provide a (full) gradient, where `T` is a [`AbstractEvaluationType`](@ref) for the gradient function. """ abstract type AbstractManifoldGradientObjective{E<:AbstractEvaluationType,TC,TG} <: diff --git a/src/plans/hessian_plan.jl b/src/plans/hessian_plan.jl index 646ad2eeaa..6ab5a49283 100644 --- a/src/plans/hessian_plan.jl +++ b/src/plans/hessian_plan.jl @@ -6,21 +6,31 @@ These options are assumed to have a field (`gradient`) to store the current grad """ abstract type AbstractHessianSolverState <: AbstractGradientSolverState end +""" + AbstractManifoldHessianObjective{T<:AbstractEvaluationType,TC,TG,TH} <: AbstractManifoldGradientObjective{T,TC,TG} + +An abstract type for all objectives that provide a (full) Hessian, where +`T` is a [`AbstractEvaluationType`](@ref) for the gradient and Hessian functions. +""" +abstract type AbstractManifoldHessianObjective{E<:AbstractEvaluationType,TC,TG,TH} <: + AbstractManifoldGradientObjective{E,TC,TG} end + @doc raw""" - ManifoldHessianObjective{T<:AbstractEvaluationType,C,G,H,Pre} <: AbstractManifoldGradientObjective{T} + ManifoldHessianObjective{T<:AbstractEvaluationType,C,G,H,Pre} <: AbstractManifoldHessianObjective{T,C,G,H} specify a problem for hessian based algorithms. # Fields -* `cost` : a function $F:\mathcal M→ℝ$ to minimize -* `gradient` : the gradient $\operatorname{grad}F:\mathcal M - → \mathcal T\mathcal M$ of the cost function $F$ -* `hessian` : the hessian $\operatorname{Hess}F(x)[⋅]: \mathcal T_{x} \mathcal M - → \mathcal T_{x} \mathcal M$ of the cost function $F$ -* `preconditioner` : the symmetric, positive definite preconditioner - as an approximation of the inverse of the Hessian of $f$, i.e. as a map with the same - input variables as the `hessian`. +* `cost` : a function ``f:\mathcal M→ℝ`` to minimize +* `gradient` : the gradient ``\operatorname{grad}f:\mathcal M + → \mathcal T\mathcal M`` of the cost function ``f`` +* `hessian` : the hessian ``\operatorname{Hess}f(x)[⋅]: \mathcal T_{x} \mathcal M + → \mathcal T_{x} \mathcal M`` of the cost function ``f`` +* `preconditioner` : the symmetric, positive definite preconditioner + as an approximation of the inverse of the Hessian of ``f``, i.e. as a map with the same + input variables as the `hessian` to numerically stabilize iterations when the Hessian is + ill-conditioned Depending on the [`AbstractEvaluationType`](@ref) `T` the gradient and can have to forms @@ -36,7 +46,7 @@ Depending on the [`AbstractEvaluationType`](@ref) `T` the gradient and can have [`truncated_conjugate_gradient_descent`](@ref), [`trust_regions`](@ref) """ struct ManifoldHessianObjective{T<:AbstractEvaluationType,C,G,H,Pre} <: - AbstractManifoldGradientObjective{T,C,G} + AbstractManifoldHessianObjective{T,C,G,H} cost::C gradient!!::G hessian!!::H diff --git a/src/plans/higher_order_primal_dual_plan.jl b/src/plans/higher_order_primal_dual_plan.jl index 3e2c21f1d4..55f5fe57a3 100644 --- a/src/plans/higher_order_primal_dual_plan.jl +++ b/src/plans/higher_order_primal_dual_plan.jl @@ -5,13 +5,13 @@ Describes a Problem for the Primal-dual Riemannian semismooth Newton algorithm. # Fields -* `cost` $F + G(Λ(⋅))$ to evaluate interims cost function values -* `linearized_operator` the linearization $DΛ(⋅)[⋅]$ of the operator $Λ(⋅)$. -* `linearized_adjoint_operator` The adjoint differential $(DΛ)^* \colon \mathcal N \to T\mathcal M$ -* `prox_F` the proximal map belonging to $f$ -* `diff_prox_F` the (Clarke Generalized) differential of the proximal maps of $F$ -* `prox_G_dual` the proximal map belonging to $g_n^*$ -* `diff_prox_dual_G` the (Clarke Generalized) differential of the proximal maps of $G^\ast_n$ +* `cost` ``F + G(Λ(⋅))`` to evaluate interims cost function values +* `linearized_operator` the linearization ``DΛ(⋅)[⋅]`` of the operator ``Λ(⋅)``. +* `linearized_adjoint_operator` The adjoint differential ``(DΛ)^* \colon \mathcal N \to T\mathcal M`` +* `prox_F` the proximal map belonging to ``F`` +* `diff_prox_F` the (Clarke Generalized) differential of the proximal maps of ``F`` +* `prox_G_dual` the proximal map belonging to ``g_n^*`` +* `diff_prox_dual_G` the (Clarke Generalized) differential of the proximal maps of ``G^\ast_n`` * `Λ` – the exact forward operator. This operator is required if `Λ(m)=n` does not hold. @@ -67,10 +67,10 @@ end @doc raw""" PrimalDualSemismoothNewtonState <: AbstractPrimalDualSolverState -* `m` - base point on $ \mathcal M $ -* `n` - base point on $ \mathcal N $ -* `x` - an initial point on $x^{(0)} \in \mathcal M$ (and its previous iterate) -* `ξ` - an initial tangent vector $\xi^{(0)}\in T_{n}^*\mathcal N$ (and its previous iterate) +* `m` - base point on ``\mathcal M`` +* `n` - base point on ``\mathcal N`` +* `x` - an initial point on ``x^{(0)} \in \mathcal M`` (and its previous iterate) +* `ξ` - an initial tangent vector ``\xi^{(0)}\in T_{n}^*\mathcal N`` (and its previous iterate) * `primal_stepsize` – (`1/sqrt(8)`) proximal parameter of the primal prox * `dual_stepsize` – (`1/sqrt(8)`) proximal parameter of the dual prox * `reg_param` – (`1e-5`) regularisation parameter for the Newton matrix diff --git a/src/plans/plan.jl b/src/plans/plan.jl index 9cee213cdb..215fd6184f 100644 --- a/src/plans/plan.jl +++ b/src/plans/plan.jl @@ -14,6 +14,8 @@ status_summary(e) = "$(e)" For any `f` and a `Symbol` `e` we dispatch on its value so by default, to set some `args...` in `f` or one of uts sub elements. + + """ function set_manopt_parameter!(f, e::Symbol, args...) return set_manopt_parameter!(f, Val(e), args...) @@ -37,7 +39,9 @@ include("hessian_plan.jl") include("proximal_plan.jl") include("subgradient_plan.jl") +include("subsolver_plan.jl") include("constrained_plan.jl") +include("trust_regions_plan.jl") include("adabtive_regularization_with_cubics_plan.jl") include("alternating_gradient_plan.jl") @@ -55,7 +59,6 @@ include("higher_order_primal_dual_plan.jl") include("stochastic_gradient_plan.jl") include("embedded_objective.jl") -include("subsolver_plan.jl") include("cache.jl") include("count.jl") diff --git a/src/plans/problem.jl b/src/plans/problem.jl index ba6341ac3f..36b42072b4 100644 --- a/src/plans/problem.jl +++ b/src/plans/problem.jl @@ -95,7 +95,10 @@ function set_manopt_parameter!(amp::AbstractManoptProblem, ::Val{:Manifold}, arg set_manopt_parameter!(get_manifold(amp), args...) return amp end - +function set_manopt_parameter!(TpM::TangentSpace, ::Union{Val{:Basepoint},Val{:p}}, p) + copyto!(TpM.manifold, TpM.point, p) + return TpM +end function set_manopt_parameter!(amp::AbstractManoptProblem, ::Val{:Objective}, args...) set_manopt_parameter!(get_objective(amp), args...) return amp diff --git a/src/plans/solver_state.jl b/src/plans/solver_state.jl index f24211ba2f..b807c1722f 100644 --- a/src/plans/solver_state.jl +++ b/src/plans/solver_state.jl @@ -203,18 +203,15 @@ end _get_iterate(s::AbstractManoptSolverState, ::Val{true}) = get_iterate(s.state) """ - set_iterate!(s::AbstractManoptSolverState, M::AbstractManifold, p) + get_manopt_parameter(ams::AbstractManoptSolverState, element::Symbol, args...) -set the iterate within an [`AbstractManoptSolverState`](@ref) to some (start) value `p`. +Obtain a certain field or semantic element from the [`AbstractManoptSolverState`](@ref) `ams`. +This function passes to `Val(element)` and specific setters should dispatch on `Val{element}`. """ -function set_iterate!(s::AbstractManoptSolverState, M, p) - return _set_iterate!(s, M, p, dispatch_state_decorator(s)) -end -function _set_iterate!(s::AbstractManoptSolverState, ::Any, ::Any, ::Val{false}) - return error( - "It seems the AbstractManoptSolverState $s do not provide (write) access to an iterate", - ) +function get_manopt_parameter(ams::AbstractManoptSolverState, e::Symbol, args...) + return get_manopt_parameter(ams, Val(e), args...) end + _set_iterate!(s::AbstractManoptSolverState, M, p, ::Val{true}) = set_iterate!(s.state, M, p) """ @@ -249,6 +246,20 @@ end _get_solver_result(s::AbstractManoptSolverState, ::Val{false}) = get_iterate(s) _get_solver_result(s::AbstractManoptSolverState, ::Val{true}) = get_solver_result(s.state) +""" + set_iterate!(s::AbstractManoptSolverState, M::AbstractManifold, p) + +set the iterate within an [`AbstractManoptSolverState`](@ref) to some (start) value `p`. +""" +function set_iterate!(s::AbstractManoptSolverState, M, p) + return _set_iterate!(s, M, p, dispatch_state_decorator(s)) +end +function _set_iterate!(s::AbstractManoptSolverState, ::Any, ::Any, ::Val{false}) + return error( + "It seems the AbstractManoptSolverState $s do not provide (write) access to an iterate", + ) +end + """ struct PointStorageKey{key} end diff --git a/src/plans/stepsize.jl b/src/plans/stepsize.jl index 7c0fa36c29..800eb2aa48 100644 --- a/src/plans/stepsize.jl +++ b/src/plans/stepsize.jl @@ -127,7 +127,7 @@ Alternatively one can also use the following keyword. ) initializes all fields above, where none of them is mandatory and the length is set to -half and to $1$ if the injectivity radius is infinite. +half and to ``1`` if the injectivity radius is infinite. """ mutable struct DecreasingStepsize <: Stepsize length::Float64 @@ -216,9 +216,9 @@ last step size. Furthermore the following fields act as safeguards -* `stop_when_stepsize_less - (`0.0`) smallest stepsize when to stop (the last one before is taken) +* `stop_when_stepsize_less` - (`0.0`) smallest stepsize when to stop (the last one before is taken) * `stop_when_stepsize_exceeds - ([`max_stepsize`](@ref)`(M, p)`) – largest stepsize when to stop. -* `stop_increasing_at_step` - (^100`) last step to increase the stepsize (phase 1), +* `stop_increasing_at_step` - (`100`) last step to increase the stepsize (phase 1), * `stop_decreasing_at_step` - (`1000`) last step size to decrease the stepsize (phase 2), Pass `:Messages` to a `debug=` to see `@info`s when these happen. @@ -467,16 +467,16 @@ and ``γ`` is the sufficient decrease parameter ``∈(0,1)``. We can then find t ``` # Fields -* `initial_stepsize` – (`1.0`) the step size we start the search with -* `memory_size` – (`10`) number of iterations after which the cost value needs to be lower than the current one -* `bb_min_stepsize` – (`1e-3`) lower bound for the Barzilai-Borwein step size greater than zero -* `bb_max_stepsize` – (`1e3`) upper bound for the Barzilai-Borwein step size greater than min_stepsize -* `retraction_method` – (`ExponentialRetraction()`) the retraction to use -* `strategy` – (`direct`) defines if the new step size is computed using the direct, indirect or alternating strategy -* `storage` – (for `:Iterate` and `:Gradient`) a [`StoreStateAction`](@ref) -* `stepsize_reduction` – (`0.5`) step size reduction factor contained in the interval (0,1) -* `sufficient_decrease` – (`1e-4`) sufficient decrease parameter contained in the interval (0,1) -* `vector_transport_method` – (`ParallelTransport()`) the vector transport method to use +* `initial_stepsize` – (`1.0`) the step size we start the search with +* `memory_size` – (`10`) number of iterations after which the cost value needs to be lower than the current one +* `bb_min_stepsize` – (`1e-3`) lower bound for the Barzilai-Borwein step size greater than zero +* `bb_max_stepsize` – (`1e3`) upper bound for the Barzilai-Borwein step size greater than min_stepsize +* `retraction_method` – (`ExponentialRetraction()`) the retraction to use +* `strategy` – (`direct`) defines if the new step size is computed using the direct, indirect or alternating strategy +* `storage` – (for `:Iterate` and `:Gradient`) a [`StoreStateAction`](@ref) +* `stepsize_reduction` – (`0.5`) step size reduction factor contained in the interval (0,1) +* `sufficient_decrease` – (`1e-4`) sufficient decrease parameter contained in the interval (0,1) +* `vector_transport_method` – (`ParallelTransport()`) the vector transport method to use Furthermore the following fields act as safeguards @@ -988,7 +988,7 @@ Represent an adaptive gradient method introduced by [Grapiglia,Stella, J. Optim. Given a positive threshold ``\hat c \mathbb N``, an minimal bound ``b_{\mathrm{min}} > 0``, an initial ``b_0 ≥ b_{\mathrm{min}}``, and a -gradient reduction factor threshold ``\alpha \in [0,1). +gradient reduction factor threshold ``\alpha \in [0,1)``. Set ``c_0=0`` and use ``\omega_0 = \lVert \operatorname{grad} f(p_0) \rvert_{p_0}``. diff --git a/src/plans/subgradient_plan.jl b/src/plans/subgradient_plan.jl index f66daaf598..cefb8b9060 100644 --- a/src/plans/subgradient_plan.jl +++ b/src/plans/subgradient_plan.jl @@ -4,8 +4,8 @@ A structure to store information about a objective for a subgradient based optimization problem # Fields -* `cost` – the function $F$ to be minimized -* `subgradient` – a function returning a subgradient $\partial F$ of $F$ +* `cost` – the function ``f`` to be minimized +* `subgradient` – a function returning a subgradient ``\partial f`` of ``f`` # Constructor diff --git a/src/plans/subsolver_plan.jl b/src/plans/subsolver_plan.jl index 3e9940d03a..2de55158fc 100644 --- a/src/plans/subsolver_plan.jl +++ b/src/plans/subsolver_plan.jl @@ -1,10 +1,94 @@ """ AbstractSubProblemSolverState <: AbstractManoptSolverState -An abstract type for problems that involve a subsolver +An abstract type for solvers that involve a subsolver. """ abstract type AbstractSubProblemSolverState <: AbstractManoptSolverState end +""" + AbstractManifoldSubObjective{O<:AbstractManifoldObjective} <: AbstractManifoldObjective + +An abstract type for objectives of sub problems within a solver but still store the +original objective internally to generate generic objectives for sub solvers. +""" +abstract type AbstractManifoldSubObjective{ + E<:AbstractEvaluationType,O<:AbstractManifoldObjective +} <: AbstractManifoldObjective{E} end + +function get_gradient_function(cgo::AbstractManifoldSubObjective) + return (M, p) -> get_gradient(M, cgo, p) +end + +@doc raw""" + get_objective(amso::AbstractManifoldSubObjective) + +Return the (original) objective stored the sub obective is build on. +""" +get_objective(amso::AbstractManifoldSubObjective) + +@doc raw""" + get_objective_cost(M, amso::AbstractManifoldSubObjective, p) + +Evaluate the cost of the (original) objective stored within the subobjective. +""" +function get_objective_cost( + M::AbstractManifold, amso::AbstractManifoldSubObjective{E,O}, p +) where {E,O<:AbstractManifoldCostObjective} + return get_cost(M, get_objective(amso), p) +end + +@doc raw""" + X = get_objective_gradient(M, amso::AbstractManifoldSubObjective, p) + get_objective_gradient!(M, X, amso::AbstractManifoldSubObjective, p) + +Evaluate the gradient of the (original) objective stored within the subobjective `amso`. +""" +function get_objective_gradient( + M::AbstractManifold, amso::AbstractManifoldSubObjective{E,O}, p +) where {E,O<:AbstractManifoldObjective{E}} + return get_gradient(M, get_objective(amso), p) +end +function get_objective_gradient!( + M::AbstractManifold, X, amso::AbstractManifoldSubObjective{E,O}, p +) where {E,O<:AbstractManifoldObjective{E}} + return get_gradient!(M, X, get_objective(amso), p) +end + +@doc raw""" + Y = get_objective_Hessian(M, amso::AbstractManifoldSubObjective, p, X) + get_objective_Hessian!(M, Y, amso::AbstractManifoldSubObjective, p, X) + +Evaluate the Hessian of the (original) objective stored within the subobjective `amso`. +""" +function get_objective_hessian( + M::AbstractManifold, amso::AbstractManifoldSubObjective{E,O}, p, X +) where {E,O<:AbstractManifoldObjective{E}} + return get_hessian(M, get_objective(amso), p, X) +end +function get_objective_hessian!( + M::AbstractManifold, Y, amso::AbstractManifoldSubObjective{E,O}, p, X +) where {E,O<:AbstractManifoldObjective{E}} + get_hessian!(M, Y, get_objective(amso), p, X) + return Y +end + +@doc raw""" + Y = get_objective_preconditioner(M, amso::AbstractManifoldSubObjective, p, X) + get_objective_preconditioner(M, Y, amso::AbstractManifoldSubObjective, p, X) + +Evaluate the Hessian of the (original) objective stored within the subobjective `amso`. +""" +function get_objective_preconditioner( + M::AbstractManifold, amso::AbstractManifoldSubObjective{E,O}, p, X +) where {E,O<:AbstractManifoldHessianObjective{E}} + return get_preconditioner(M, get_objective(amso), p, X) +end +function get_objective_preconditioner!( + M::AbstractManifold, Y, amso::AbstractManifoldSubObjective{E,O}, p, X +) where {E,O<:AbstractManifoldHessianObjective{E}} + return get_preconditioner!(M, Y, get_objective(amso), p, X) +end + @doc raw""" get_sub_problem(ams::AbstractSubProblemSolverState) @@ -24,8 +108,10 @@ get_sub_state(ams::AbstractSubProblemSolverState) = ams.sub_state """ set_manopt_parameter!(ams::AbstractManoptSolverState, element::Symbol, args...) -Set a certain field/element from the [`AbstractManoptSolverState`](@ref) `ams` to `value. -This function dispatches on `Val(element)`. +Set a certain field or semantic element from the [`AbstractManoptSolverState`](@ref) `ams` to `value`. +This function passes to `Val(element)` and specific setters should dispatch on `Val{element}`. + +By default, this function just does nothing. """ function set_manopt_parameter!(ams::AbstractManoptSolverState, e::Symbol, args...) return set_manopt_parameter!(ams, Val(e), args...) @@ -34,17 +120,6 @@ end function set_manopt_parameter!(ams::AbstractManoptSolverState, ::Val, args...) return ams end -""" - set_manopt_parameter!(ams::DebugSolverState, ::Val{:Debug}, args...) - -Set certain values specified by `args...` into the elements of the `debugDictionary` -""" -function set_manopt_parameter!(dss::DebugSolverState, ::Val{:Debug}, args...) - for d in values(dss.debugDictionary) - set_manopt_parameter!(d, args...) - end - return dss -end """ set_manopt_parameter!(ams::DebugSolverState, ::Val{:SubProblem}, args...) diff --git a/src/plans/trust_regions_plan.jl b/src/plans/trust_regions_plan.jl new file mode 100644 index 0000000000..fab1875cab --- /dev/null +++ b/src/plans/trust_regions_plan.jl @@ -0,0 +1,93 @@ + +@doc raw""" + TrustRegionModelObjective{O<:AbstractManifoldHessianObjective} <: AbstractManifoldSubObjective{O} + +A trust region model of the form + +```math + m(X) = f(p) + ⟨\operatorname{grad} f(p), X⟩_p + \frac{1}(2} ⟨\operatorname{Hess} f(p)[X], X⟩_p +``` + +# Fields + +* `objective` – an [`AbstractManifoldHessianObjective`](@ref) proving ``f``, its gradient and Hessian + +# Constructors + + TrustRegionModelObjective(objective) + +with either an [`AbstractManifoldHessianObjective`](@ref) `objective` or an decorator containing such an objective +""" +struct TrustRegionModelObjective{ + E<:AbstractEvaluationType, + O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective}, +} <: AbstractManifoldSubObjective{E,O} + objective::O +end +function TrustRegionModelObjective( + mho::O +) where { + E,O<:Union{AbstractManifoldHessianObjective{E},AbstractDecoratedManifoldObjective{E}} +} + return TrustRegionModelObjective{E,O}(mho) +end + +get_objective(trmo::TrustRegionModelObjective) = trmo.objective + +@doc raw""" + get_cost(TpM, trmo::TrustRegionModelObjective, X) + +Evaluate the tangent space [`TrustRegionModelObjective`](@ref) + +```math +m(X) = f(p) + ⟨\operatorname{grad} f(p), X ⟩_p + \frac{1}{2} ⟨\operatorname{Hess} f(p)[X], X⟩_p. +``` +""" +function get_cost(TpM::TangentSpace, trmo::TrustRegionModelObjective, X) + M = base_manifold(TpM) + p = TpM.point + c = get_objective_cost(M, trmo, p) + G = get_objective_gradient(M, trmo, p) + Y = get_objective_hessian(M, trmo, p, X) + return c + inner(M, p, G, X) + 1 / 2 * inner(M, p, Y, X) +end +@doc raw""" + get_gradient(TpM, trmo::TrustRegionModelObjective, X) + +Evaluate the gradient of the [`TrustRegionModelObjective`](@ref) + +```math +\operatorname{grad} m(X) = \operatorname{grad} f(p) + \operatorname{Hess} f(p)[X]. +``` +""" +function get_gradient(TpM::TangentSpace, trmo::TrustRegionModelObjective, X) + M = base_manifold(TpM) + p = TpM.point + return get_objective_gradient(M, trmo, p) + get_objective_hessian(M, trmo, p, X) +end +function get_gradient!(TpM::TangentSpace, Y, trmo::TrustRegionModelObjective, X) + M = base_manifold(TpM) + p = TpM.point + get_objective_hessian!(M, Y, trmo, p, X) + Y .+= get_objective_gradient(M, trmo, p) + return Y +end +@doc raw""" + get_hessian(TpM, trmo::TrustRegionModelObjective, X) + +Evaluate the Hessian of the [`TrustRegionModelObjective`](@ref) + +```math +\operatorname{Hess} m(X)[Y] = \operatorname{Hess} f(p)[Y]. +``` +""" +function get_hessian(TpM::TangentSpace, trmo::TrustRegionModelObjective, X, V) + M = base_manifold(TpM) + p = TpM.point + return get_objective_hessian(M, trmo, p, V) +end +function get_hessian!(TpM::TangentSpace, W, trmo::TrustRegionModelObjective, X, V) + M = base_manifold(TpM) + p = TpM.point + return get_objective_hessian!(M, W, trmo, p, V) +end diff --git a/src/solvers/ChambollePock.jl b/src/solvers/ChambollePock.jl index e9aed1d406..6ae6fb740e 100644 --- a/src/solvers/ChambollePock.jl +++ b/src/solvers/ChambollePock.jl @@ -175,17 +175,17 @@ end Perform the Riemannian Chambolle–Pock algorithm. -Given a `cost` function $\mathcal E:\mathcal M → ℝ$ of the form +Given a `cost` function ``\mathcal E:\mathcal M → ℝ`` of the form ```math \mathcal E(p) = F(p) + G( Λ(p) ), ``` -where $F:\mathcal M → ℝ$, $G:\mathcal N → ℝ$, -and $Λ:\mathcal M → \mathcal N$. The remaining input parameters are +where ``F:\mathcal M → ℝ``, ``G:\mathcal N → ℝ``, +and ``Λ:\mathcal M → \mathcal N``. The remaining input parameters are -* `p, X` primal and dual start points $x∈\mathcal M$ and $ξ∈T_n\mathcal N$ -* `m,n` base points on $\mathcal M$ and $\mathcal N$, respectively. -* `adjoint_linearized_operator` the adjoint $DΛ^*$ of the linearized operator $DΛ(m): T_{m}\mathcal M → T_{Λ(m)}\mathcal N$ -* `prox_F, prox_G_Dual` the proximal maps of $F$ and $G^\ast_n$ +* `p, X` primal and dual start points ``x∈\mathcal M`` and ``ξ∈T_n\mathcal N`` +* `m,n` base points on ``\mathcal M`` and ``\mathcal N``, respectively. +* `adjoint_linearized_operator` the adjoint ``DΛ^*`` of the linearized operator ``DΛ(m): T_{m}\mathcal M → T_{Λ(m)}\mathcal N`` +* `prox_F, prox_G_Dual` the proximal maps of ``F`` and ``G^\ast_n`` note that depending on the [`AbstractEvaluationType`](@ref) `evaluation` the last three parameters as well as the forward_operator `Λ` and the `linearized_forward_operator` can be given as @@ -204,8 +204,8 @@ For more details on the algorithm, see [Bergmann et al., Found. Comput. Math., 2 * `evaluation` ([`AllocatingEvaluation`](@ref)`()) specify whether the proximal maps and operators are allocating functions `(Manifolds, parameters) -> result` or given as mutating functions `(Manifold, result, parameters)` -> result` to spare allocations. -* `Λ` (`missing`) the (forward) operator $Λ(⋅)$ (required for the `:exact` variant) -* `linearized_forward_operator` (`missing`) its linearization $DΛ(⋅)[⋅]$ (required for the `:linearized` variant) +* `Λ` (`missing`) the (forward) operator ``Λ(⋅)`` (required for the `:exact` variant) +* `linearized_forward_operator` (`missing`) its linearization ``DΛ(⋅)[⋅]`` (required for the `:linearized` variant) * `primal_stepsize` – (`1/sqrt(8)`) proximal parameter of the dual prox * `relaxation` – (`1.`) * `relax` – (`:primal`) whether to relax the primal or dual diff --git a/src/solvers/DouglasRachford.jl b/src/solvers/DouglasRachford.jl index 088370b721..d66bf0d10e 100644 --- a/src/solvers/DouglasRachford.jl +++ b/src/solvers/DouglasRachford.jl @@ -171,7 +171,7 @@ If you provide a [`ManifoldProximalMapObjective`](@ref) `mpo` instead, the proxi * `retraction_method` - (`default_retraction_metiod(M, typeof(p))`) the retraction to use in - the reflection (ignored, if you set `R` directly) - the relaxation step -* `stopping_criterion` – ([`StopWhenAny`](@ref)`(`[`StopAfterIteration`](@ref)`(200),`[`StopWhenChangeLess`](@ref)`(10.0^-5))`) +* `stopping_criterion` – ([`StopAfterIteration`](@ref)`(200) | `[`StopWhenChangeLess`](@ref)`(1e-5)`) a [`StoppingCriterion`](@ref). * `parallel` – (`false`) clarify that we are doing a parallel DR, i.e. on a `PowerManifold` manifold with two proxes. This can be used to trigger @@ -307,9 +307,8 @@ function DouglasRachford!( ) end, parallel::Int=0, - stopping_criterion::StoppingCriterion=StopWhenAny( - StopAfterIteration(200), StopWhenChangeLess(10.0^-5) - ), + stopping_criterion::StoppingCriterion=StopAfterIteration(200) | + StopWhenChangeLess(1e-5), kwargs..., #especially may contain decorator options ) where { Tλ, diff --git a/src/solvers/Lanczos.jl b/src/solvers/Lanczos.jl new file mode 100644 index 0000000000..4f4cf522c5 --- /dev/null +++ b/src/solvers/Lanczos.jl @@ -0,0 +1,350 @@ + +# +# Lanczos sub solver +# +@doc raw""" + LanczosState{P,T,SC,B,I,R,TM,V,Y} <: AbstractManoptSolverState + +Solve the adaptive regularized subproblem with a Lanczos iteration + +# Fields + +* `stop` – the stopping criterion +* `σ` – the current regularization parameter +* `X` the Iterate +* `Lanczos_vectors` – the obtained Lanczos vectors +* `tridig_matrix` the tridiagonal coefficient matrix T +* `coefficients` the coefficients `y_1,...y_k`` that determine the solution +* `Hp` – a temporary vector containing the evaluation of the Hessian +* `Hp_residual` – a temporary vector containing the residual to the Hessian +* `S` – the current obtained / approximated solution +""" +mutable struct LanczosState{T,R,SC,SCN,B,TM,C} <: AbstractManoptSolverState + X::T + σ::R + stop::SC + stop_newton::SCN + Lanczos_vectors::B # qi + tridig_matrix::TM # T + coefficients::C # y + Hp::T # Hess_f A temporary vector for evaluations of the hessian + Hp_residual::T # A residual vector + # Maybe not necessary? + S::T # store the tangent vector that solves the minimization problem +end +function LanczosState( + TpM::TangentSpace; + X::T=zero_vector(TpM.manifold, TpM.point), + maxIterLanczos=200, + θ=0.5, + stopping_criterion::SC=StopAfterIteration(maxIterLanczos) | + StopWhenFirstOrderProgress(θ), + stopping_criterion_newtown::SCN=StopAfterIteration(200), + σ::R=10.0, +) where {T,SC<:StoppingCriterion,SCN<:StoppingCriterion,R} + tridig = spdiagm(maxIterLanczos, maxIterLanczos, [0.0]) + coeffs = zeros(maxIterLanczos) + Lanczos_vectors = typeof(X)[] + return LanczosState{T,R,SC,SCN,typeof(Lanczos_vectors),typeof(tridig),typeof(coeffs)}( + X, + σ, + stopping_criterion, + stopping_criterion_newtown, + Lanczos_vectors, + tridig, + coeffs, + copy(TpM, X), + copy(TpM, X), + copy(TpM, X), + ) +end +function get_solver_result(ls::LanczosState) + return ls.S +end +function set_iterate!(ls::LanczosState, M, X) + ls.X .= X + return ls +end +function set_manopt_parameter!(ls::LanczosState, ::Val{:σ}, σ) + ls.σ = σ + return ls +end + +function show(io::IO, ls::LanczosState) + i = get_count(ls, :Iterations) + Iter = (i > 0) ? "After $i iterations\n" : "" + Conv = indicates_convergence(ls.stop) ? "Yes" : "No" + s = """ + # Solver state for `Manopt.jl`s Lanczos Iteration + $Iter + ## Parameters + * σ : $(ls.σ) + * # of Lanczos vectors used : $(length(ls.Lanczos_vectors)) + + ## Stopping Criteria + (a) For the Lanczos Iteration + $(status_summary(ls.stop)) + (b) For the Newton sub solver + $(status_summary(ls.stop_newton)) + This indicates convergence: $Conv""" + return print(io, s) +end + +# +# The Lanczos Subsolver implementation +# +function initialize_solver!(dmp::AbstractManoptProblem{<:TangentSpace}, ls::LanczosState) + TpM = get_manifold(dmp) + M = base_manifold(TpM) + p = TpM.point + maxIterLanczos = size(ls.tridig_matrix, 1) + ls.tridig_matrix = spdiagm(maxIterLanczos, maxIterLanczos, [0.0]) + ls.coefficients = zeros(maxIterLanczos) + for X in ls.Lanczos_vectors + zero_vector!(M, X, p) + end + zero_vector!(M, ls.Hp, p) + zero_vector!(M, ls.Hp_residual, p) + return ls +end + +function step_solver!(dmp::AbstractManoptProblem{<:TangentSpace}, ls::LanczosState, i) + TpM = get_manifold(dmp) + p = TpM.point + M = base_manifold(TpM) + arcmo = get_objective(dmp) + if i == 1 #we can easily compute the first Lanczos vector + nX = norm(M, p, ls.X) + if length(ls.Lanczos_vectors) == 0 + push!(ls.Lanczos_vectors, ls.X ./ nX) + else + copyto!(M, ls.Lanczos_vectors[1], p, ls.X ./ nX) + end + get_objective_hessian!(M, ls.Hp, arcmo, p, ls.Lanczos_vectors[1]) + α = inner(M, p, ls.Lanczos_vectors[1], ls.Hp) + # This is also the first coefficient in the tridiagonal matrix + ls.tridig_matrix[1, 1] = α + ls.Hp_residual .= ls.Hp - α * ls.Lanczos_vectors[1] + #argmin of one dimensional model + ls.coefficients[1] = (α - sqrt(α^2 + 4 * ls.σ * nX)) / (2 * ls.σ) + else # i > 1 + β = norm(M, p, ls.Hp_residual) + if β > 1e-12 # Obtained new orth Lanczos long enough cf. to num stability + if length(ls.Lanczos_vectors) < i + push!(ls.Lanczos_vectors, ls.Hp_residual ./ β) + else + copyto!(M, ls.Lanczos_vectors[i], p, ls.Hp_residual ./ β) + end + else # Generate new random vec and MGS of new vec wrt. Q + rand!(M, ls.Hp_residual; vector_at=p) + for k in 1:(i - 1) + ls.Hp_residual .= + ls.Hp_residual - + inner(M, p, ls.Lanczos_vectors[k], ls.Hp_residual) * + ls.Lanczos_vectors[k] + end + if length(ls.Lanczos_vectors) < i + push!(ls.Lanczos_vectors, ls.Hp_residual ./ norm(M, p, ls.Hp_residual)) + else + copyto!( + M, + ls.Lanczos_vectors[i], + p, + ls.Hp_residual ./ norm(M, p, ls.Hp_residual), + ) + end + end + # Update Hessian and residual + get_objective_hessian!(M, ls.Hp, arcmo, p, ls.Lanczos_vectors[i]) + ls.Hp_residual .= ls.Hp - β * ls.Lanczos_vectors[i - 1] + α = inner(M, p, ls.Hp_residual, ls.Lanczos_vectors[i]) + ls.Hp_residual .= ls.Hp_residual - α * ls.Lanczos_vectors[i] + # Update tridiagonal matric + ls.tridig_matrix[i, i] = α + ls.tridig_matrix[i - 1, i] = β + ls.tridig_matrix[i, i - 1] = β + min_cubic_Newton!(dmp, ls, i) + end + copyto!(M, ls.S, p, sum(ls.Lanczos_vectors[k] * ls.coefficients[k] for k in 1:i)) + return ls +end +# +# Solve Lanczos sub problem +# +function min_cubic_Newton!(mp::AbstractManoptProblem{<:TangentSpace}, ls::LanczosState, i) + TpM = get_manifold(mp) + p = TpM.point + M = base_manifold(TpM) + tol = 1e-16 # TODO: Put into a stopping criterion + + gvec = zeros(i) + gvec[1] = norm(M, p, ls.X) + λ = opnorm(Array(@view ls.tridig_matrix[1:i, 1:i])) + 2 + T_λ = @view(ls.tridig_matrix[1:i, 1:i]) + λ * I + + λ_min = eigmin(Array(@view ls.tridig_matrix[1:i, 1:i])) + lower_barrier = max(0, -λ_min) + k = 0 + y = zeros(i) + while !ls.stop_newton(mp, ls, k) + k += 1 + y = -(T_λ \ gvec) + ynorm = norm(y, 2) + ϕ = 1 / ynorm - ls.σ / λ #when ϕ is "zero", y is the solution. + (abs(ϕ) < tol * ynorm) && break + #compute the newton step + ψ = ynorm^2 + Δy = -(T_λ) \ y + ψ_prime = 2 * dot(y, Δy) + # Quadratic polynomial coefficients + p0 = 2 * ls.σ * ψ^(1.5) + p1 = -2 * ψ - λ * ψ_prime + p2 = ψ_prime + #Polynomial roots + r1 = (-p1 + sqrt(p1^2 - 4 * p2 * p0)) / (2 * p2) + r2 = (-p1 - sqrt(p1^2 - 4 * p2 * p0)) / (2 * p2) + + Δλ = max(r1, r2) - λ + + #if we jumped past the lower barrier for λ, jump to midpoint between current and lower λ. + (λ + Δλ <= lower_barrier) && (Δλ = -0.5 * (λ - lower_barrier)) + #if the steps we make are to small, terminate + (abs(Δλ) <= eps(λ)) && break + T_λ = T_λ + Δλ * I + λ = λ + Δλ + end + ls.coefficients[1:i] .= y + return ls.coefficients +end + +# +# Stopping Criteria +# +@doc raw""" + StopWhenFirstOrderProgress <: StoppingCriterion + +A stopping criterion related to the Riemannian adaptive regularization with cubics (ARC) +solver indicating that the model function at the current (outer) iterate, i.e. + +```math + m(X) = f(p) + + + \frac{1}{2} + \frac{σ}{3} \lVert X \rVert^3, +``` + +defined on the tangent space ``T_{p}\mathcal M`` +fulfills at the current iterate ``X_k`` that + +```math +m(X_k) \leq m(0) +\quad\text{ and }\quad +\lVert \operatorname{grad} m(X_k) \rVert ≤ θ \lVert X_k \rVert^2 +``` + +# Fields + +* `θ` – the factor ``θ`` in the second condition above +* `reason` – a String indicating the reason if the criterion indicated to stop +""" +mutable struct StopWhenFirstOrderProgress <: StoppingCriterion + θ::Float64 #θ + reason::String + StopWhenFirstOrderProgress(θ::Float64) = new(θ, "") +end +function (c::StopWhenFirstOrderProgress)( + dmp::AbstractManoptProblem{<:TangentSpace}, ls::LanczosState, i::Int +) + if (i == 0) + c.reason = "" + return false + end + #Update Gradient + TpM = get_manifold(dmp) + p = TpM.point + M = base_manifold(TpM) + nX = norm(M, p, get_gradient(dmp, p)) + y = @view(ls.coefficients[1:(i - 1)]) + Ty = @view(ls.tridig_matrix[1:i, 1:(i - 1)]) * y + ny = norm(y) + model_grad_norm = norm(nX .* [1, zeros(i - 1)...] + Ty + ls.σ * ny * [y..., 0]) + prog = (model_grad_norm <= c.θ * ny^2) + prog && (c.reason = "The algorithm has reduced the model grad norm by a factor $(c.θ).") + return prog +end +function (c::StopWhenFirstOrderProgress)( + dmp::AbstractManoptProblem{<:TangentSpace}, ams::AbstractManoptSolverState, i::Int +) + if (i == 0) + c.reason = "" + return false + end + TpM = get_manifold(dmp) + p = TpM.point + q = get_iterate(ams) + # Update Gradient and compute norm + nG = norm(base_manifold(TpM), p, get_gradient(dmp, q)) + # norm of current iterate + nX = norm(base_manifold(TpM), p, q) + prog = (nG <= c.θ * nX^2) + prog && (c.reason = "The algorithm has reduced the model grad norm by a factor $(c.θ).") + return prog +end + +function status_summary(c::StopWhenFirstOrderProgress) + has_stopped = length(c.reason) > 0 + s = has_stopped ? "reached" : "not reached" + return "First order progress with θ=$(c.θ):\t$s" +end +indicates_convergence(c::StopWhenFirstOrderProgress) = true +function show(io::IO, c::StopWhenFirstOrderProgress) + return print(io, "StopWhenFirstOrderProgress($(repr(c.θ)))\n $(status_summary(c))") +end + +@doc raw""" + StopWhenAllLanczosVectorsUsed <: StoppingCriterion + +When an inner iteration has used up all Lanczos vectors, then this stopping criterion is +a fallback / security stopping criterion in order to not access a non-existing field +in the array allocated for vectors. + +Note that this stopping criterion (for now) is only implemented for the case that an +[`AdaptiveRegularizationState`](@ref) when using a [`LanczosState`](@ref) subsolver + +# Fields + +* `maxLanczosVectors` – maximal number of Lanczos vectors +* `reason` – a String indicating the reason if the criterion indicated to stop + +# Constructor + + StopWhenAllLanczosVectorsUsed(maxLancosVectors::Int) + +""" +mutable struct StopWhenAllLanczosVectorsUsed <: StoppingCriterion + maxLanczosVectors::Int + reason::String + StopWhenAllLanczosVectorsUsed(maxLanczosVectors::Int) = new(maxLanczosVectors, "") +end +function (c::StopWhenAllLanczosVectorsUsed)( + ::AbstractManoptProblem, + arcs::AdaptiveRegularizationState{P,T,Pr,<:LanczosState}, + i::Int, +) where {P,T,Pr} + (i == 0) && (c.reason = "") # reset on init + if (i > 0) && length(arcs.sub_state.Lanczos_vectors) == c.maxLanczosVectors + c.reason = "The algorithm used all ($(c.maxLanczosVectors)) preallocated Lanczos vectors and may have stagnated.\n Consider increasing this value.\n" + return true + end + return false +end +function status_summary(c::StopWhenAllLanczosVectorsUsed) + has_stopped = length(c.reason) > 0 + s = has_stopped ? "reached" : "not reached" + return "All Lanczos vectors ($(c.maxLanczosVectors)) used:\t$s" +end +indicates_convergence(c::StopWhenAllLanczosVectorsUsed) = false +function show(io::IO, c::StopWhenAllLanczosVectorsUsed) + return print( + io, + "StopWhenAllLanczosVectorsUsed($(repr(c.maxLanczosVectors)))\n $(status_summary(c))", + ) +end diff --git a/src/solvers/LevenbergMarquardt.jl b/src/solvers/LevenbergMarquardt.jl index e89fd37d35..fc9580dc37 100644 --- a/src/solvers/LevenbergMarquardt.jl +++ b/src/solvers/LevenbergMarquardt.jl @@ -31,7 +31,7 @@ then the keyword `jacobian_tangent_basis` below is ignored * `evaluation` – ([`AllocatingEvaluation`](@ref)) specify whether the gradient works by allocation (default) form `gradF(M, x)` or [`InplaceEvaluation`](@ref) in place, i.e. is of the form `gradF!(M, X, x)`. * `retraction_method` – (`default_retraction_method(M, typeof(p))`) a `retraction(M,x,ξ)` to use. -* `stopping_criterion` – ([`StopWhenAny`](@ref)`(`[`StopAfterIteration`](@ref)`(200), `[`StopWhenGradientNormLess`](@ref)`(1e-12))`) +* `stopping_criterion` – ([`StopAfterIteration`](@ref)`(200) | `[`StopWhenGradientNormLess`](@ref)`(1e-12)`) a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop. * `expect_zero_residual` – (`false`) whether or not the algorithm might expect that the value of residual (objective) at minimum is equal to 0. diff --git a/src/solvers/adaptive_regularization_with_cubics.jl b/src/solvers/adaptive_regularization_with_cubics.jl index adce490d9e..2a7f75f1ec 100644 --- a/src/solvers/adaptive_regularization_with_cubics.jl +++ b/src/solvers/adaptive_regularization_with_cubics.jl @@ -26,7 +26,7 @@ a default value is given in brackets if a parameter can be left out in initializ Furthermore the following integral fields are defined * `q` - (`copy(M,p)`) a point for the candidates to evaluate model and ρ -* `H` – (`copy(M, p, X)`) the current hessian, $\operatorname{Hess}F(p)[⋅]$ +* `H` – (`copy(M, p, X)`) the current hessian, ``\operatorname{Hess}F(p)[⋅]`` * `S` – (`copy(M, p, X)`) the current solution from the subsolver * `ρ` – the current regularized ratio of actual improvement and model improvement. * `ρ_denominator` – (`one(ρ)`) a value to store the denominator from the computation of ρ @@ -41,8 +41,8 @@ Construct the solver state with all fields stated above as keyword arguments. mutable struct AdaptiveRegularizationState{ P, T, - Pr<:AbstractManoptProblem, - St<:AbstractManoptSolverState, + Pr<:Union{AbstractManoptProblem,<:Function}, + St<:Union{AbstractManoptSolverState,<:AbstractEvaluationType}, TStop<:StoppingCriterion, R, TRTM<:AbstractRetractionMethod, @@ -56,7 +56,7 @@ mutable struct AdaptiveRegularizationState{ S::T σ::R ρ::R - ρ_denonimator::R + ρ_denominator::R ρ_regularization::R stop::TStop retraction_method::TRTM @@ -75,9 +75,13 @@ function AdaptiveRegularizationState( sub_problem::Pr=if isnothing(sub_objective) nothing else - DefaultManoptProblem(M, sub_objective) + DefaultManoptProblem(TangentSpace(M, copy(M, p)), sub_objective) + end, + sub_state::St=if sub_problem isa Function + AllocatingEvaluation() + else + LanczosState(TangentSpace(M, copy(M, p))) end, - sub_state::St=sub_problem isa Function ? AllocatingEvaluation() : LanczosState(M, p), σ::R=100.0 / sqrt(manifold_dimension(M)),# Had this to initial value of 0.01. However try same as in MATLAB: 100/sqrt(dim(M)) ρ_regularization::R=1e3, stopping_criterion::SC=StopAfterIteration(100), @@ -229,7 +233,7 @@ All other keyword arguments are passed to [`decorate_state!`](@ref) for state de [`decorate_objective!`](@ref) for objective, respectively. If you provide the [`ManifoldGradientObjective`](@ref) directly, these decorations can still be specified -By default the `debug=` keyword is set to [`DebugIfEntry`](@ref)`(:ρ_denonimator, >(0); message="Denominator nonpositive", type=:error)`` +By default the `debug=` keyword is set to [`DebugIfEntry`](@ref)`(:ρ_denominator, >(0); message="Denominator nonpositive", type=:error)`` to avoid that by rounding errors the denominator in the computation of `ρ` gets nonpositive. """ adaptive_regularization_with_cubics(M::AbstractManifold, args...; kwargs...) @@ -372,7 +376,7 @@ function adaptive_regularization_with_cubics!( mho::O, p=rand(M); debug=DebugIfEntry( - :ρ_denonimator, >(0); message="Denominator nonpositive", type=:error + :ρ_denominator, >(-1e-8); message="denominator nonpositive", type=:error ), evaluation::AbstractEvaluationType=AllocatingEvaluation(), initial_tangent_vector::T=zero_vector(M, p), @@ -391,15 +395,14 @@ function adaptive_regularization_with_cubics!( sub_stopping_criterion::StoppingCriterion=StopAfterIteration(maxIterLanczos) | StopWhenFirstOrderProgress(θ), sub_state::Union{<:AbstractManoptSolverState,<:AbstractEvaluationType}=LanczosState( - M, - copy(M, p); + TangentSpace(M, copy(M, p)); maxIterLanczos=maxIterLanczos, σ=σ, θ=θ, stopping_criterion=sub_stopping_criterion, ), - sub_objective=decorate_objective!(M, mho; objective_type=objective_type, sub_kwargs...), - sub_problem=DefaultManoptProblem(M, sub_objective), + sub_objective=nothing, + sub_problem=nothing, stopping_criterion::StoppingCriterion=if sub_state isa LanczosState StopAfterIteration(40) | StopWhenGradientNormLess(1e-9) | @@ -409,8 +412,14 @@ function adaptive_regularization_with_cubics!( end, kwargs..., ) where {T,R,O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective}} - X = copy(M, p, initial_tangent_vector) dmho = decorate_objective!(M, mho; objective_type=objective_type, kwargs...) + if isnothing(sub_objective) + sub_objective = AdaptiveRagularizationWithCubicsModelObjective(dmho, σ) + end + if isnothing(sub_problem) + sub_problem = DefaultManoptProblem(TangentSpace(M, copy(M, p)), sub_objective) + end + X = copy(M, p, initial_tangent_vector) dmp = DefaultManoptProblem(M, dmho) arcs = AdaptiveRegularizationState( M, @@ -445,15 +454,9 @@ function step_solver!(dmp::AbstractManoptProblem, arcs::AdaptiveRegularizationSt get_gradient!(M, arcs.X, mho, arcs.p) # Update base point in manifold set_manopt_parameter!(arcs.sub_problem, :Manifold, :p, copy(M, arcs.p)) - set_manopt_parameter!(arcs.sub_problem, :Objective, :Cost, :X, copy(M, arcs.p, arcs.X)) - set_manopt_parameter!(arcs.sub_problem, :Objective, :Cost, :σ, arcs.σ) - set_manopt_parameter!( - arcs.sub_problem, :Objective, :Gradient, :X, copy(M, arcs.p, arcs.X) - ) - set_manopt_parameter!(arcs.sub_problem, :Objective, :Gradient, :σ, arcs.σ) + set_manopt_parameter!(arcs.sub_problem, :Objective, :σ, arcs.σ) set_iterate!(arcs.sub_state, M, copy(M, arcs.p, arcs.X)) set_manopt_parameter!(arcs.sub_state, :σ, arcs.σ) - set_manopt_parameter!(arcs.sub_state, :p, copy(M, arcs.p)) #Solve the sub_problem – via dispatch depending on type solve_arc_subproblem!(M, arcs.S, arcs.sub_problem, arcs.sub_state, arcs.p) # Compute ρ @@ -463,9 +466,8 @@ function step_solver!(dmp::AbstractManoptProblem, arcs::AdaptiveRegularizationSt ρ_vec = arcs.X + 0.5 * get_hessian(M, mho, arcs.p, arcs.S) ρ_den = -inner(M, arcs.p, arcs.S, ρ_vec) ρ_reg = arcs.ρ_regularization * eps(Float64) * max(abs(cost), 1) - arcs.ρ_denonimator = ρ_den + ρ_reg # <= 0 -> the default debug kicks in + arcs.ρ_denominator = ρ_den + ρ_reg # <= 0 -> the default debug kicks in arcs.ρ = (ρ_num + ρ_reg) / (ρ_den + ρ_reg) - #Update iterate if arcs.ρ >= arcs.η1 copyto!(M, arcs.p, arcs.q) @@ -500,388 +502,3 @@ function solve_arc_subproblem!( problem!(M, s, p) return s end - -# -# Lanczos sub solver -# - -@doc raw""" - LanczosState{P,T,SC,B,I,R,TM,V,Y} <: AbstractManoptSolverState - -Solve the adaptive regularized subproblem with a Lanczos iteration - -# Fields - -* `p` the current iterate -* `stop` – the stopping criterion -* `σ` – the current regularization parameter -* `X` the current gradient -* `Lanczos_vectors` – the obtained Lanczos vectors -* `tridig_matrix` the tridiagonal coefficient matrix T -* `coefficients` the coefficients `y_1,...y_k`` that determine the solution -* `Hp` – a temporary vector containing the evaluation of the Hessian -* `Hp_residual` – a temporary vector containing the residual to the Hessian -* `S` – the current obtained / approximated solution -""" -mutable struct LanczosState{P,T,R,SC,SCN,B,TM,C} <: AbstractManoptSolverState - p::P - X::T - σ::R - stop::SC # Notation in ABBC - stop_newton::SCN - Lanczos_vectors::B # qi - tridig_matrix::TM # T - coefficients::C # y - Hp::T # Hess_f A temporary vector for evaluations of the hessian - Hp_residual::T # A residual vector - # Maybe not necessary? - S::T # store the tangent vector that solves the minimization problem -end -function LanczosState( - M::AbstractManifold, - p::P=rand(M); - X::T=zero_vector(M, p), - maxIterLanczos=200, - θ=0.5, - stopping_criterion::SC=StopAfterIteration(maxIterLanczos) | - StopWhenFirstOrderProgress(θ), - stopping_criterion_newtown::SCN=StopAfterIteration(200), - σ::R=10.0, -) where {P,T,SC<:StoppingCriterion,SCN<:StoppingCriterion,R} - tridig = spdiagm(maxIterLanczos, maxIterLanczos, [0.0]) - coeffs = zeros(maxIterLanczos) - Lanczos_vectors = typeof(X)[] - return LanczosState{P,T,R,SC,SCN,typeof(Lanczos_vectors),typeof(tridig),typeof(coeffs)}( - p, - X, - σ, - stopping_criterion, - stopping_criterion_newtown, - Lanczos_vectors, - tridig, - coeffs, - copy(M, p, X), - copy(M, p, X), - copy(M, p, X), - ) -end -function get_solver_result(ls::LanczosState) - return ls.S -end -function set_manopt_parameter!(ls::LanczosState, ::Val{:p}, p) - ls.p = p - return ls -end -function set_iterate!(ls::LanczosState, M, X) - ls.X = X - return ls -end -function set_manopt_parameter!(ls::LanczosState, ::Val{:σ}, σ) - ls.σ = σ - return ls -end - -function show(io::IO, ls::LanczosState) - i = get_count(ls, :Iterations) - Iter = (i > 0) ? "After $i iterations\n" : "" - Conv = indicates_convergence(ls.stop) ? "Yes" : "No" - s = """ - # Solver state for `Manopt.jl`s Lanczos Iteration - $Iter - ## Parameters - * σ : $(ls.σ) - * # of Lanczos vectors used : $(length(ls.Lanczos_vectors)) - - ## Stopping Criteria - (a) For the Lanczos Iteration - $(status_summary(ls.stop)) - (b) For the Newton sub solver - $(status_summary(ls.stop_newton)) - This indicates convergence: $Conv""" - return print(io, s) -end - -# -# The Lanczos Subsolver implementation -# -function initialize_solver!(dmp::AbstractManoptProblem, ls::LanczosState) - M = get_manifold(dmp) - # Maybe better to allocate once and just reset the number of vectors k? - maxIterLanczos = size(ls.tridig_matrix, 1) - ls.tridig_matrix = spdiagm(maxIterLanczos, maxIterLanczos, [0.0]) - ls.coefficients = zeros(maxIterLanczos) - for X in ls.Lanczos_vectors - zero_vector!(M, X, ls.p) - end - zero_vector!(M, ls.Hp, ls.p) - zero_vector!(M, ls.Hp_residual, ls.p) - return ls -end - -function step_solver!(dmp::AbstractManoptProblem, ls::LanczosState, i) - M = get_manifold(dmp) - mho = get_objective(dmp) - if i == 1 #we can easily compute the first Lanczos vector - nX = norm(M, ls.p, ls.X) - if length(ls.Lanczos_vectors) == 0 - push!(ls.Lanczos_vectors, ls.X ./ nX) - else - copyto!(M, ls.Lanczos_vectors[1], ls.p, ls.X ./ nX) - end - get_hessian!(M, ls.Hp, mho, ls.p, ls.Lanczos_vectors[1]) - α = inner(M, ls.p, ls.Lanczos_vectors[1], ls.Hp) - # This is also the first coefficient in the tridiagonal matrix - ls.tridig_matrix[1, 1] = α - ls.Hp_residual .= ls.Hp - α * ls.Lanczos_vectors[1] - #argmin of one dimensional model - ls.coefficients[1] = (α - sqrt(α^2 + 4 * ls.σ * nX)) / (2 * ls.σ) - else # i > 1 - β = norm(M, ls.p, ls.Hp_residual) - if β > 1e-12 # Obtained new orth Lanczos long enough cf. to num stability - if length(ls.Lanczos_vectors) < i - push!(ls.Lanczos_vectors, ls.Hp_residual ./ β) - else - copyto!(M, ls.Lanczos_vectors[i], ls.p, ls.Hp_residual ./ β) - end - else # Generate new random vec and MGS of new vec wrt. Q - rand!(M, ls.Hp_residual; vector_at=ls.p) - for k in 1:(i - 1) - ls.Hp_residual .= - ls.Hp_residual - - inner(M, ls.p, ls.Lanczos_vectors[k], ls.Hp_residual) * - ls.Lanczos_vectors[k] - end - if length(ls.Lanczos_vectors) < i - push!(ls.Lanczos_vectors, ls.Hp_residual ./ norm(M, ls.p, ls.Hp_residual)) - else - copyto!( - M, - ls.Lanczos_vectors[i], - ls.p, - ls.Hp_residual ./ norm(M, ls.p, ls.Hp_residual), - ) - end - end - # Update Hessian and residual - get_hessian!(M, ls.Hp, mho, ls.p, ls.Lanczos_vectors[i]) - ls.Hp_residual .= ls.Hp - β * ls.Lanczos_vectors[i - 1] - α = inner(M, ls.p, ls.Hp_residual, ls.Lanczos_vectors[i]) - ls.Hp_residual .= ls.Hp_residual - α * ls.Lanczos_vectors[i] - # Update tridiagonal matric - ls.tridig_matrix[i, i] = α - ls.tridig_matrix[i - 1, i] = β - ls.tridig_matrix[i, i - 1] = β - min_cubic_Newton!(dmp, ls, i) - end - copyto!(M, ls.S, ls.p, sum(ls.Lanczos_vectors[k] * ls.coefficients[k] for k in 1:i)) - return ls -end -# -# Solve Lanczos sub problem -# -function min_cubic_Newton!(mp::AbstractManoptProblem, ls::LanczosState, i) - M = get_manifold(mp) - tol = 1e-16 # TODO: Put into a stopping criterion - - gvec = zeros(i) - gvec[1] = norm(M, ls.p, ls.X) - λ = opnorm(Array(@view ls.tridig_matrix[1:i, 1:i])) + 2 - T_λ = @view(ls.tridig_matrix[1:i, 1:i]) + λ * I - - λ_min = eigmin(Array(@view ls.tridig_matrix[1:i, 1:i])) - lower_barrier = max(0, -λ_min) - k = 0 - y = zeros(i) - while !ls.stop_newton(mp, ls, k) - k += 1 - y = -(T_λ \ gvec) - ynorm = norm(y, 2) - ϕ = 1 / ynorm - ls.σ / λ #when ϕ is "zero", y is the solution. - (abs(ϕ) < tol * ynorm) && break - #compute the newton step - ψ = ynorm^2 - Δy = -(T_λ) \ y - ψ_prime = 2 * dot(y, Δy) - # Quadratic polynomial coefficients - p0 = 2 * ls.σ * ψ^(1.5) - p1 = -2 * ψ - λ * ψ_prime - p2 = ψ_prime - #Polynomial roots - r1 = (-p1 + sqrt(p1^2 - 4 * p2 * p0)) / (2 * p2) - r2 = (-p1 - sqrt(p1^2 - 4 * p2 * p0)) / (2 * p2) - - Δλ = max(r1, r2) - λ - - #if we jumped past the lower barrier for λ, jump to midpoint between current and lower λ. - (λ + Δλ <= lower_barrier) && (Δλ = -0.5 * (λ - lower_barrier)) - #if the steps we make are to small, terminate - (abs(Δλ) <= eps(λ)) && break - T_λ = T_λ + Δλ * I - λ = λ + Δλ - end - ls.coefficients[1:i] .= y - return ls.coefficients -end - -# -# Stopping Criteria -# -@doc raw""" - StopWhenFirstOrderProgress <: StoppingCriterion - -A stopping criterion related to the Riemannian adaptive regularization with cubics (ARC) -solver indicating that the model function at the current (outer) iterate, i.e. - -```math - m(X) = f(p) + - + \frac{1}{2} + \frac{σ}{3} \lVert X \rVert^3, -``` - -defined on the tangent space ``T_{p}\mathcal M`` -fulfills at the current iterate ``X_k`` that - -```math -m(X_k) \leq m(0) -\quad\text{ and }\quad -\lVert \operatorname{grad} m(X_k) \rVert ≤ θ \lVert X_k \rVert^2 -``` - -# Fields - -* `θ` – the factor ``θ`` in the second condition above -* `reason` – a String indicating the reason if the criterion indicated to stop -""" -mutable struct StopWhenFirstOrderProgress <: StoppingCriterion - θ::Float64 #θ - reason::String - StopWhenFirstOrderProgress(θ::Float64) = new(θ, "") -end -function (c::StopWhenFirstOrderProgress)( - dmp::AbstractManoptProblem, ls::LanczosState, i::Int -) - if (i == 0) - c.reason = "" - return false - end - #Update Gradient - M = get_manifold(dmp) - get_gradient!(dmp, ls.X, ls.p) - nX = norm(M, ls.p, ls.X) - y = @view(ls.coefficients[1:(i - 1)]) - Ty = @view(ls.tridig_matrix[1:i, 1:(i - 1)]) * y - ny = norm(y) - model_grad_norm = norm(nX .* [1, zeros(i - 1)...] + Ty + ls.σ * ny * [y..., 0]) - if (i > 0) && (model_grad_norm <= c.θ * ny^2) - c.reason = "The subproblem has reached a point with ||grad m(X)|| ≤ θ ||X||^2, θ = $(c.θ)." - return true - end - return false -end -function status_summary(c::StopWhenFirstOrderProgress) - has_stopped = length(c.reason) > 0 - s = has_stopped ? "reached" : "not reached" - return "First order progress with θ=$(c.θ):\t$s" -end -indicates_convergence(c::StopWhenFirstOrderProgress) = true -function show(io::IO, c::StopWhenFirstOrderProgress) - return print(io, "StopWhenFirstOrderProgress($(repr(c.θ)))\n $(status_summary(c))") -end - -#A new stopping criterion that deals with the scenario when a step needs more Lanczos vectors than preallocated. -#Previously this would just cause an error due to out of bounds error. So this stopping criterion deals both with the scenario -#of too few allocated vectors and stagnation in the solver. -@doc raw""" - StopWhenAllLanczosVectorsUsed <: StoppingCriterion - -When an inner iteration has used up all Lanczos vectors, then this stopping criterion is -a fallback / security stopping criterion in order to not access a non-existing field -in the array allocated for vectors. - -Note that this stopping criterion (for now) is only implemented for the case that an -[`AdaptiveRegularizationState`](@ref) when using a [`LanczosState`](@ref) subsolver - -# Fields - -* `maxLanczosVectors` – maximal number of Lanczos vectors -* `reason` – a String indicating the reason if the criterion indicated to stop - -# Constructor - - StopWhenAllLanczosVectorsUsed(maxLancosVectors::Int) - -""" -mutable struct StopWhenAllLanczosVectorsUsed <: StoppingCriterion - maxLanczosVectors::Int - reason::String - StopWhenAllLanczosVectorsUsed(maxLanczosVectors::Int) = new(maxLanczosVectors, "") -end -function (c::StopWhenAllLanczosVectorsUsed)( - ::AbstractManoptProblem, - arcs::AdaptiveRegularizationState{P,T,Pr,<:LanczosState}, - i::Int, -) where {P,T,Pr} - (i == 0) && (c.reason = "") # reset on init - if (i > 0) && length(arcs.sub_state.Lanczos_vectors) == c.maxLanczosVectors - c.reason = "The algorithm used all ($(c.maxLanczosVectors)) preallocated Lanczos vectors and may have stagnated.\n Consider increasing this value.\n" - return true - end - return false -end -function status_summary(c::StopWhenAllLanczosVectorsUsed) - has_stopped = length(c.reason) > 0 - s = has_stopped ? "reached" : "not reached" - return "All Lanczos vectors ($(c.maxLanczosVectors)) used:\t$s" -end -indicates_convergence(c::StopWhenAllLanczosVectorsUsed) = false -function show(io::IO, c::StopWhenAllLanczosVectorsUsed) - return print( - io, - "StopWhenAllLanczosVectorsUsed($(repr(c.maxLanczosVectors)))\n $(status_summary(c))", - ) -end -# -# -function set_manopt_parameter!(M::TangentSpace, ::Val{:p}, v) - M.point .= v - return M -end -function (f::Manopt.AdaptiveRegularizationCubicCost)(M::TangentSpace, X) - ## (33) in Agarwal et al. - return get_cost(base_manifold(M), f.mho, M.point) + - inner(base_manifold(M), M.point, X, f.X) + - 1 / 2 * inner( - base_manifold(M), - M.point, - X, - get_hessian(base_manifold(M), f.mho, M.point, X), - ) + - f.σ / 3 * norm(base_manifold(M), M.point, X)^3 -end -function (grad_f::Manopt.AdaptiveRegularizationCubicGrad)(M::TangentSpace, X) - # (37) in Agarwal et - return grad_f.X + - get_hessian(base_manifold(M), grad_f.mho, M.point, X) + - grad_f.σ * norm(base_manifold(M), M.point, X) * X -end -function (grad_f::Manopt.AdaptiveRegularizationCubicGrad)(M::TangentSpace, Y, X) - get_hessian!(base_manifold(M), Y, grad_f.mho, M.point, X) - Y .= Y + grad_f.X + grad_f.σ * norm(base_manifold(M), M.point, X) * X - return Y -end -function (c::StopWhenFirstOrderProgress)( - dmp::AbstractManoptProblem{<:TangentSpace}, ams::AbstractManoptSolverState, i::Int -) - if (i == 0) - c.reason = "" - return false - end - #Update Gradient - TpM = get_manifold(dmp) - nG = norm(base_manifold(TpM), TpM.point, get_gradient(dmp, ams.p)) - nX = norm(base_manifold(TpM), TpM.point, ams.p) - if (i > 0) && (nG <= c.θ * nX^2) - c.reason = "The algorithm has reduced the model grad norm by $(c.θ).\n" - return true - end - return false -end diff --git a/src/solvers/conjugate_gradient_descent.jl b/src/solvers/conjugate_gradient_descent.jl index f4e8b6bbd7..cd3f497dc1 100644 --- a/src/solvers/conjugate_gradient_descent.jl +++ b/src/solvers/conjugate_gradient_descent.jl @@ -161,9 +161,8 @@ function conjugate_gradient_descent!( stepsize::Stepsize=default_stepsize( M, ConjugateGradientDescentState; retraction_method=retraction_method ), - stopping_criterion::StoppingCriterion=StopWhenAny( - StopAfterIteration(500), StopWhenGradientNormLess(10^(-8)) - ), + stopping_criterion::StoppingCriterion=StopAfterIteration(500) | + StopWhenGradientNormLess(1e-8), vector_transport_method=default_vector_transport_method(M, typeof(p)), initial_gradient=zero_vector(M, p), kwargs..., diff --git a/src/solvers/cyclic_proximal_point.jl b/src/solvers/cyclic_proximal_point.jl index 54419f130b..8b2b8fad3d 100644 --- a/src/solvers/cyclic_proximal_point.jl +++ b/src/solvers/cyclic_proximal_point.jl @@ -38,7 +38,7 @@ where `f` and the proximal maps `proxes_f` can also be given directly as a [`Man cycle permuted sequence (`:Random`) or the default linear one. * `λ` – ( `iter -> 1/iter` ) a function returning the (square summable but not summable) sequence of λi -* `stopping_criterion` – ([`StopWhenAny`](@ref)`(`[`StopAfterIteration`](@ref)`(5000),`[`StopWhenChangeLess`](@ref)`(10.0^-8))`) a [`StoppingCriterion`](@ref). +* `stopping_criterion` – ([`StopAfterIteration`](@ref)`(5000) | `[`StopWhenChangeLess`](@ref)`(1e-12)`) a [`StoppingCriterion`](@ref). All other keyword arguments are passed to [`decorate_state!`](@ref) for decorators or [`decorate_objective!`](@ref), respectively. @@ -121,9 +121,8 @@ function cyclic_proximal_point!( mpo::O, p; evaluation_order::Symbol=:Linear, - stopping_criterion::StoppingCriterion=StopWhenAny( - StopAfterIteration(5000), StopWhenChangeLess(10.0^-12) - ), + stopping_criterion::StoppingCriterion=StopAfterIteration(5000) | + StopWhenChangeLess(1e-12), λ=i -> 1 / i, kwargs..., ) where {O<:Union{ManifoldProximalMapObjective,AbstractDecoratedManifoldObjective}} diff --git a/src/solvers/difference-of-convex-proximal-point.jl b/src/solvers/difference-of-convex-proximal-point.jl index b36f254c62..4f82789ccb 100644 --- a/src/solvers/difference-of-convex-proximal-point.jl +++ b/src/solvers/difference-of-convex-proximal-point.jl @@ -77,6 +77,12 @@ mutable struct DifferenceOfConvexProximalState{ ) end end +# no point -> add point +function DifferenceOfConvexProximalState( + M::AbstractManifold, sub_problem, sub_state; kwargs... +) + return DifferenceOfConvexProximalState(M, rand(M), sub_problem, sub_state; kwargs...) +end get_iterate(dcps::DifferenceOfConvexProximalState) = dcps.p function set_iterate!(dcps::DifferenceOfConvexProximalState, M, p) copyto!(M, dcps.p, p) @@ -352,10 +358,12 @@ function difference_of_convex_proximal_point!( DefaultManoptProblem(M, sub_objective) end end, - sub_state::Union{AbstractEvaluationType,AbstractManoptSolverState}=if !isnothing( + sub_state::Union{AbstractEvaluationType,AbstractManoptSolverState,Nothing}=if !isnothing( prox_g ) evaluation + elseif isnothing(sub_objective) + nothing else decorate_state!( if isnothing(sub_hess) @@ -363,7 +371,15 @@ function difference_of_convex_proximal_point!( M, copy(M, p); stopping_criterion=sub_stopping_criterion ) else - TrustRegionsState(M, copy(M, p); stopping_criterion=sub_stopping_criterion) + TrustRegionsState( + M, + copy(M, p), + DefaultManoptProblem( + TangentSpace(M, copy(M, p)), + TrustRegionModelObjective(sub_objective), + ), + TruncatedConjugateGradientState(TangentSpace(M, p)), + ) end; sub_kwargs..., ) diff --git a/src/solvers/difference_of_convex_algorithm.jl b/src/solvers/difference_of_convex_algorithm.jl index 936776a28b..fbc01be1de 100644 --- a/src/solvers/difference_of_convex_algorithm.jl +++ b/src/solvers/difference_of_convex_algorithm.jl @@ -63,6 +63,7 @@ mutable struct DifferenceOfConvexState{Pr,St,P,T,SC<:StoppingCriterion} <: p, initial_vector, sub_problem, sub_state, stopping_criterion ) end + # Function function DifferenceOfConvexState( M::AbstractManifold, p::P, @@ -323,9 +324,11 @@ function difference_of_convex_algorithm!( else DefaultManoptProblem(M, sub_objective) end, - sub_state::Union{AbstractManoptSolverState,AbstractEvaluationType}=if sub_problem isa + sub_state::Union{AbstractManoptSolverState,AbstractEvaluationType,Nothing}=if sub_problem isa Function evaluation + elseif isnothing(sub_objective) + nothing else decorate_state!( if isnothing(sub_hess) @@ -333,7 +336,9 @@ function difference_of_convex_algorithm!( M, copy(M, p); stopping_criterion=sub_stopping_criterion ) else - TrustRegionsState(M, copy(M, p); stopping_criterion=sub_stopping_criterion) + TrustRegionsState( + M, copy(M, p), sub_objective; stopping_criterion=sub_stopping_criterion + ) end; sub_kwargs..., ) @@ -342,7 +347,6 @@ function difference_of_convex_algorithm!( ) where {O<:Union{ManifoldDifferenceOfConvexObjective,AbstractDecoratedManifoldObjective}} dmdco = decorate_objective!(M, mdco; objective_type=objective_type, kwargs...) dmp = DefaultManoptProblem(M, dmdco) - # For now only subsolvers - TODO closed form solution init here if isnothing(sub_problem) error( """ diff --git a/src/solvers/exact_penalty_method.jl b/src/solvers/exact_penalty_method.jl index 8da9ddeee7..ce0062f7e2 100644 --- a/src/solvers/exact_penalty_method.jl +++ b/src/solvers/exact_penalty_method.jl @@ -15,8 +15,7 @@ a default value is given in brackets if a parameter can be left out in initializ * `u_min` – (`1e-6`) the lower bound for the smoothing parameter and threshold for violation of the constraints * `ρ` – (`1.0`) the penalty parameter * `θ_ρ` – (`0.3`) the scaling factor of the penalty parameter -* `stopping_criterion` – ([`StopWhenAny`](@ref)`(`[`StopAfterIteration`](@ref)`(300), `[`StopWhenAll`](@ref)`(`[`StopWhenSmallerOrEqual`](@ref)`(ϵ, ϵ_min), `[`StopWhenChangeLess`](@ref)`(min_stepsize)))`) a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop. - +* `stopping_criterion` – ([`StopAfterIteration`](@ref)`(300) | (`[`StopWhenSmallerOrEqual`](@ref)`(ϵ, ϵ_min) & `[`StopWhenChangeLess`](@ref)`(min_stepsize))`) a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop. # Constructor @@ -59,9 +58,8 @@ mutable struct ExactPenaltyMethodState{P,Pr,St,R<:Real,TStopping<:StoppingCriter θ_u=(u_min / u)^(u_exponent), ρ::R=1.0, θ_ρ::R=0.3, - stopping_criterion::SC=StopWhenAny( - StopAfterIteration(300), - StopWhenAll(StopWhenSmallerOrEqual(:ϵ, ϵ_min), StopWhenChangeLess(1e-10)), + stopping_criterion::SC=StopAfterIteration(300) | ( + StopWhenSmallerOrEqual(:ϵ, ϵ_min) | StopWhenChangeLess(1e-10) ), ) where { P, diff --git a/src/solvers/gradient_descent.jl b/src/solvers/gradient_descent.jl index e322329886..72861ad76d 100644 --- a/src/solvers/gradient_descent.jl +++ b/src/solvers/gradient_descent.jl @@ -7,7 +7,7 @@ Describes a Gradient based descent algorithm, with # Fields a default value is given in brackets if a parameter can be left out in initialization. -* `p – (`rand(M)` the current iterate +* `p` – (`rand(M)` the current iterate * `X` – (`zero_vector(M,p)`) the current gradient ``\operatorname{grad}f(p)``, initialised to zero vector. * `stopping_criterion` – ([`StopAfterIteration`](@ref)`(100)`) a [`StoppingCriterion`](@ref) * `stepsize` – ([`default_stepsize`](@ref)`(M, GradientDescentState)`) a [`Stepsize`](@ref) @@ -129,7 +129,7 @@ with different choices of the stepsize ``s_k`` available (see `stepsize` option * `M` – a manifold ``\mathcal M`` * `f` – a cost function ``f: \mathcal M→ℝ`` to find a minimizer ``p^*`` for * `grad_f` – the gradient ``\operatorname{grad}f: \mathcal M → T\mathcal M`` of f - - as a function `(M, p) -> X` or a function `(M, X, p) -> X` + as a function `(M, p) -> X` or a function `(M, X, p) -> X` * `p` – an initial value `p` ``= p_0 ∈ \mathcal M`` Alternatively to `f` and `grad_f` you can provide @@ -139,10 +139,9 @@ the [`AbstractManifoldGradientObjective`](@ref) `gradient_objective` directly. * `direction` – ([`IdentityUpdateRule`](@ref)) perform a processing of the direction, e.g. * `evaluation` – ([`AllocatingEvaluation`](@ref)) specify whether the gradient works by allocation (default) form `grad_f(M, p)` or [`InplaceEvaluation`](@ref) in place, i.e. is of the form `grad_f!(M, X, p)`. -* `retraction_method` – (`default_retraction_method(M, typeof(p))`) a retraction to use -* `stepsize` – ([`ConstantStepsize`](@ref)`(1.)`) specify a [`Stepsize`](@ref) - functor. -* `stopping_criterion` – ([`StopWhenAny`](@ref)`(`[`StopAfterIteration`](@ref)`(200), `[`StopWhenGradientNormLess`](@ref)`(10.0^-8))`) +* `retraction_method` – ([`default_retraction_method`](https://juliamanifolds.github.io/ManifoldsBase.jl/stable/retractions/#ManifoldsBase.default_retraction_method-Tuple{AbstractManifold})(M, typeof(p))`) a retraction to use +* `stepsize` – ([`default_stepsize`](@ref)`(M, GradientDescentState)`) a [`Stepsize`](@ref) +* `stopping_criterion` – ([`StopAfterIteration`](@ref)`(200) | `[`StopWhenGradientNormLess`](@ref)`(1e-8)`) a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop. If you provide the [`ManifoldGradientObjective`](@ref) directly, `evaluation` is ignored. diff --git a/src/solvers/particle_swarm.jl b/src/solvers/particle_swarm.jl index b9be3eda94..0a101681ad 100644 --- a/src/solvers/particle_swarm.jl +++ b/src/solvers/particle_swarm.jl @@ -198,7 +198,7 @@ Instead of a cost function `f` you can also provide an [`AbstractManifoldCostObj * `swarm_size` - (`100`) number of random initial positions of x0 * `retraction_method` – (`default_retraction_method(M, eltype(x))`) a `retraction(M,x,ξ)` to use. * `social_weight` – (`1.4`) a social weight factor -* `stopping_criterion` – ([`StopWhenAny`](@ref)`(`[`StopAfterIteration`](@ref)`(500)`, [`StopWhenChangeLess`](@ref)`(10^{-4})))` +* `stopping_criterion` – ([`StopAfterIteration`](@ref)`(500) | `[`StopWhenChangeLess`](@ref)`(1e-4)`) a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop. * `vector_transport_mthod` - (`default_vector_transport_method(M, eltype(x))`) a vector transport method to use. * `velocity` – a set of tangent vectors (of type `AbstractVector{T}`) representing the velocities of the particles, per default a random tangent vector per initial position diff --git a/src/solvers/primal_dual_semismooth_Newton.jl b/src/solvers/primal_dual_semismooth_Newton.jl index 3976e58bed..5092bcd88e 100644 --- a/src/solvers/primal_dual_semismooth_Newton.jl +++ b/src/solvers/primal_dual_semismooth_Newton.jl @@ -3,19 +3,19 @@ Perform the Primal-Dual Riemannian Semismooth Newton algorithm. -Given a `cost` function $\mathcal E\colon\mathcal M \to \overline{ℝ}$ of the form +Given a `cost` function ``\mathcal E\colon\mathcal M \to \overline{ℝ}`` of the form ```math \mathcal E(p) = F(p) + G( Λ(p) ), ``` -where $F\colon\mathcal M \to \overline{ℝ}$, $G\colon\mathcal N \to \overline{ℝ}$, -and $\Lambda\colon\mathcal M \to \mathcal N$. The remaining input parameters are +where ``F\colon\mathcal M \to \overline{ℝ}``, ``G\colon\mathcal N \to \overline{ℝ}``, +and ``\Lambda\colon\mathcal M \to \mathcal N``. The remaining input parameters are -* `p, X` primal and dual start points $x\in\mathcal M$ and $\xi\in T_n\mathcal N$ -* `m,n` base points on $\mathcal M$ and $\mathcal N$, respectively. -* `linearized_forward_operator` the linearization $DΛ(⋅)[⋅]$ of the operator $Λ(⋅)$. -* `adjoint_linearized_operator` the adjoint $DΛ^*$ of the linearized operator $DΛ(m)\colon T_{m}\mathcal M \to T_{Λ(m)}\mathcal N$ -* `prox_F, prox_G_Dual` the proximal maps of $F$ and $G^\ast_n$ -* `diff_prox_F, diff_prox_dual_G` the (Clarke Generalized) differentials of the proximal maps of $F$ and $G^\ast_n$ +* `p, X` primal and dual start points ``x\in\mathcal M`` and ``ξ ∈ T_n\mathcal N`` +* `m,n` base points on ``\mathcal M`` and ``\mathcal N``, respectively. +* `linearized_forward_operator` the linearization ``DΛ(⋅)[⋅]`` of the operator ``Λ(⋅)``. +* `adjoint_linearized_operator` the adjoint ``DΛ^*`` of the linearized operator ``DΛ(m)\colon T_{m}\mathcal M \to T_{Λ(m)}\mathcal N`` +* `prox_F, prox_G_Dual` the proximal maps of ``F`` and ``G^\ast_n`` +* `diff_prox_F, diff_prox_dual_G` the (Clarke Generalized) differentials of the proximal maps of ``F`` and ``G^\ast_n`` For more details on the algorithm, see [Diepeveen, Lellmann, SIAM J. Imag. Sci., 2021](@cite DiepeveenLellmann:2021). @@ -201,7 +201,7 @@ end raw""" construct_primal_dual_residual_vector(p, o) -Constructs the vector representation of $X(p^{(k)}, ξ_{n}^{(k)}) \in \mathcal{T}_{p^{(k)}} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}$ +Constructs the vector representation of ``X(p^{(k)}, ξ_{n}^{(k)}) \in \mathcal{T}_{p^{(k)}} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}`` """ function construct_primal_dual_residual_vector( tmp::TwoManifoldProblem, pdsn::PrimalDualSemismoothNewtonState @@ -264,7 +264,7 @@ end raw""" onstruct_primal_dual_residual_covariant_derivative_matrix(p, o) -Constructs the matrix representation of $V^{(k)}:\mathcal{T}_{p^{(k)}} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}\rightarrow \mathcal{T}_{p^{(k)}} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}$ +Constructs the matrix representation of ``V^{(k)}:\mathcal{T}_{p^{(k)}} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}\rightarrow \mathcal{T}_{p^{(k)}} \mathcal{M} \times \mathcal{T}_{n}^{*} \mathcal{N}`` """ function construct_primal_dual_residual_covariant_derivative_matrix( tmp::TwoManifoldProblem, pdsn::PrimalDualSemismoothNewtonState @@ -287,7 +287,7 @@ function construct_primal_dual_residual_covariant_derivative_matrix( q₄ = retract(M, qb, q₅, pdsn.retraction_method) q₃ = -inverse_retract(M, pdsn.p, q₄, pdsn.inverse_retraction_method) q₂ = retract(M, pdsn.p, q₃, pdsn.retraction_method) - q₁ = get_primal_prox(tmp, pdsn.primal_stepsize, q₂) # TODO hier gebleven met debuggen + q₁ = get_primal_prox(tmp, pdsn.primal_stepsize, q₂) # (1) compute update direction η₁ = linearized_forward_operator( diff --git a/src/solvers/quasi_Newton.jl b/src/solvers/quasi_Newton.jl index 30bb5f8bff..6019b2d5de 100644 --- a/src/solvers/quasi_Newton.jl +++ b/src/solvers/quasi_Newton.jl @@ -185,7 +185,7 @@ The ``k``th iteration consists of directions to the tangent space to reduce numerical errors * `stepsize` – ([`WolfePowellLinesearch`](@ref)`(retraction_method, vector_transport_method)`) specify a [`Stepsize`](@ref). -* `stopping_criterion` - (`StopWhenAny(StopAfterIteration(max(1000, memory_size)), StopWhenGradientNormLess(10^(-6))`) +* `stopping_criterion` - ([`StopAfterIteration`](@ref)`(max(1000, memory_size)) | `[`StopWhenGradientNormLess`](@ref)`(1e-6)`) specify a [`StoppingCriterion`](@ref) * `vector_transport_method` – (`default_vector_transport_method(M, typeof(p))`) a vector transport to use. diff --git a/src/solvers/truncated_conjugate_gradient_descent.jl b/src/solvers/truncated_conjugate_gradient_descent.jl index d962de7a20..e00cff29b5 100644 --- a/src/solvers/truncated_conjugate_gradient_descent.jl +++ b/src/solvers/truncated_conjugate_gradient_descent.jl @@ -6,82 +6,75 @@ describe the Steihaug-Toint truncated conjugate-gradient method, with # Fields a default value is given in brackets if a parameter can be left out in initialization. -* `x` : a point, where the trust-region subproblem needs - to be solved -* `η` : a tangent vector (called update vector), which solves the - trust-region subproblem after successful calculation by the algorithm -* `stop` : a [`StoppingCriterion`](@ref). -* `gradient` : the gradient at the current iterate -* `δ` : search direction -* `trust_region_radius` : (`injectivity_radius(M)/4`) the trust-region radius -* `residual` : the gradient -* `randomize` : indicates if the trust-region solve and so the algorithm is to be - initiated with a random tangent vector. If set to true, no - preconditioner will be used. This option is set to true in some - scenarios to escape saddle points, but is otherwise seldom activated. -* `project!` : (`copyto!`) specify a projection operation for tangent vectors - for numerical stability. A function `(M, Y, p, X) -> ...` working in place of `Y`. - per default, no projection is perfomed, set it to `project!` to activate projection. +* `p` – (`rand(M)` a point, where we use the tangent space to solve the trust-region subproblem +* `Y` – (`zero_vector(M,p)`) Current iterate, whose type is also used for the other, internal, tangent vector fields +* `stop` – a [`StoppingCriterion`](@ref). +* `X` – the gradient ``\operatorname{grad}f(p)``` +* `δ` – the conjugate gradient search direction +* `θ` – (`1.0`) 1+θ is the superlinear convergence target rate. +* `κ` – (`0.1`) the linear convergence target rate. +* `trust_region_radius` – (`injectivity_radius(M)/4`) the trust-region radius +* `residual` – the gradient of the model ``m(Y)`` +* `randomize` … (`false`) +* `project!` – (`copyto!`) for numerical stability it is possivle to project onto + the tangent space after every iteration. By default this only copies instead. + +# Internal (temporary) Fields + +* `Hδ`, `HY` – temporary results of the Hessian applied to `δ` and `Y`, respectively. +* `δHδ`, `YPδ`, `δPδ`, `YPδ` – temporary inner products with `Hδ` and preconditioned inner products. +* `z` – the preconditioned residual +* `z_r` - inner product of the residual and `z` # Constructor - TruncatedConjugateGradientState(M, p=rand(M), η=zero_vector(M,p); - trust_region_radius=injectivity_radius(M)/4, - randomize=false, - θ=1.0, - κ=0.1, - project!=copyto!, - ) - - and a slightly involved `stopping_criterion` + TruncatedConjugateGradientState(M, p=rand(M), Y=zero_vector(M,p); kwargs...) # See also [`truncated_conjugate_gradient_descent`](@ref), [`trust_regions`](@ref) """ -mutable struct TruncatedConjugateGradientState{P,T,R<:Real,SC<:StoppingCriterion,Proj} <: +mutable struct TruncatedConjugateGradientState{T,R<:Real,SC<:StoppingCriterion,Proj} <: AbstractHessianSolverState - p::P stop::SC X::T - η::T - Hη::T + Y::T + HY::T δ::T Hδ::T δHδ::R - ηPδ::R + YPδ::R δPδ::R - ηPη::R + YPY::R z::T z_r::R residual::T trust_region_radius::R model_value::R - new_model_value::R randomize::Bool project!::Proj initialResidualNorm::Float64 function TruncatedConjugateGradientState( - M::AbstractManifold, - p::P=rand(M), - η::T=zero_vector(M, p); - trust_region_radius::R=injectivity_radius(M) / 4.0, + TpM::TangentSpace, + Y::T=rand(TpM); + trust_region_radius::R=injectivity_radius(base_manifold(TpM)) / 4.0, randomize::Bool=false, project!::F=copyto!, θ::Float64=1.0, κ::Float64=0.1, - stopping_criterion::StoppingCriterion=StopAfterIteration(manifold_dimension(M)) | + stopping_criterion::StoppingCriterion=StopAfterIteration( + manifold_dimension(base_manifold(TpM)) + ) | StopWhenResidualIsReducedByFactorOrPower(; κ=κ, θ=θ ) | StopWhenTrustRegionIsExceeded() | StopWhenCurvatureIsNegative() | StopWhenModelIncreased(), - ) where {P,T,R<:Real,F} - tcgs = new{P,T,R,typeof(stopping_criterion),F}() - tcgs.p = p + ) where {T,R<:Real,F} + tcgs = new{T,R,typeof(stopping_criterion),F}() tcgs.stop = stopping_criterion - tcgs.η = η + tcgs.Y = Y tcgs.trust_region_radius = trust_region_radius tcgs.randomize = randomize tcgs.project! = project! @@ -105,27 +98,47 @@ function show(io::IO, tcgs::TruncatedConjugateGradientState) This indicates convergence: $Conv""" return print(io, s) end +function set_manopt_parameter!(tcgs::TruncatedConjugateGradientState, ::Val{:Iterate}, Y) + return tcgs.Y = Y +end +function set_manopt_parameter!( + tcgs::TruncatedConjugateGradientState, ::Val{:TrustRegionRadius}, r +) + return tcgs.trust_region_radius = r +end + +function get_manopt_parameter( + tcgs::TruncatedConjugateGradientState, ::Val{:TrustRegionExceeded} +) + return (tcgs.YPY >= tcgs.trust_region_radius^2) +end # # Special stopping Criteria # - @doc raw""" StopWhenResidualIsReducedByFactorOrPower <: StoppingCriterion + A functor for testing if the norm of residual at the current iterate is reduced either by a power of 1+θ or by a factor κ compared to the norm of the initial residual, i.e. $\Vert r_k \Vert_x \leqq \Vert r_0 \Vert_{x} \ \min \left( \kappa, \Vert r_0 \Vert_{x}^{\theta} \right)$. + # Fields + * `κ` – the reduction factor * `θ` – part of the reduction power * `reason` – stores a reason of stopping if the stopping criterion has one be reached, see [`get_reason`](@ref). + # Constructor + StopWhenResidualIsReducedByFactorOrPower(; κ=0.1, θ=1.0) -initialize the StopWhenResidualIsReducedByFactorOrPower functor to indicate to stop after + +Initialize the StopWhenResidualIsReducedByFactorOrPower functor to indicate to stop after the norm of the current residual is lesser than either the norm of the initial residual to the power of 1+θ or the norm of the initial residual times κ. + # See also [`truncated_conjugate_gradient_descent`](@ref), [`trust_regions`](@ref) @@ -146,7 +159,10 @@ function (c::StopWhenResidualIsReducedByFactorOrPower)( c.reason = "" c.at_iteration = 0 end - if norm(get_manifold(mp), tcgstate.p, tcgstate.residual) <= + TpM = get_manifold(mp) + M = base_manifold(TpM) + p = TpM.point + if norm(M, p, tcgstate.residual) <= tcgstate.initialResidualNorm * min(c.κ, tcgstate.initialResidualNorm^(c.θ)) && i > 0 c.reason = "The norm of the residual is less than or equal either to κ=$(c.κ) times the norm of the initial residual or to the norm of the initial residual to the power 1 + θ=$(1+(c.θ)). \n" return true @@ -191,10 +207,11 @@ end StopWhenTrustRegionIsExceeded <: StoppingCriterion A functor for testing if the norm of the next iterate in the Steihaug-Toint tcg -method is larger than the trust-region radius, i.e. $\Vert η_{k}^{*} \Vert_x +method is larger than the trust-region radius, i.e. $\Vert Y_{k}^{*} \Vert_x ≧ trust_region_radius$. terminate the algorithm when the trust region has been left. # Fields + * `reason` – stores a reason of stopping if the stopping criterion has been reached, see [`get_reason`](@ref). @@ -221,8 +238,8 @@ function (c::StopWhenTrustRegionIsExceeded)( c.reason = "" c.at_iteration = 0 end - if tcgs.ηPη >= tcgs.trust_region_radius^2 && i >= 0 - c.reason = "Trust-region radius violation (‖η‖² = $(tcgs.ηPη)) >= $(tcgs.trust_region_radius^2) = trust_region_radius²). \n" + if tcgs.YPY >= tcgs.trust_region_radius^2 && i >= 0 + c.reason = "Trust-region radius violation (‖Y‖² = $(tcgs.YPY)) >= $(tcgs.trust_region_radius^2) = trust_region_radius²). \n" c.at_iteration = i return true end @@ -236,6 +253,7 @@ end function show(io::IO, c::StopWhenTrustRegionIsExceeded) return print(io, "StopWhenTrustRegionIsExceeded()\n $(status_summary(c))") end + @doc raw""" StopWhenCurvatureIsNegative <: StoppingCriterion @@ -304,19 +322,22 @@ A functor for testing if the curvature of the model value increased. mutable struct StopWhenModelIncreased <: StoppingCriterion reason::String at_iteration::Int + model_value::Float64 end -StopWhenModelIncreased() = StopWhenModelIncreased("", 0) +StopWhenModelIncreased() = StopWhenModelIncreased("", 0, Inf) function (c::StopWhenModelIncreased)( ::AbstractManoptProblem, tcgs::TruncatedConjugateGradientState, i::Int ) if i == 0 # reset on init c.reason = "" c.at_iteration = 0 + c.model_value = Inf end - if i > 0 && (tcgs.new_model_value > tcgs.model_value) - c.reason = "Model value increased from $(tcgs.model_value) to $(tcgs.new_model_value).\n" + if i > 0 && (tcgs.model_value > c.model_value) + c.reason = "Model value increased from $(c.model_value) to $(tcgs.model_value).\n" return true end + c.model_value = tcgs.model_value return false end function status_summary(c::StopWhenModelIncreased) @@ -335,69 +356,58 @@ end truncated_conjugate_gradient_descent(M, f, grad_f, Hess_f, p; kwargs...) truncated_conjugate_gradient_descent(M, f, grad_f, Hess_f, p, X; kwargs...) truncated_conjugate_gradient_descent(M, mho::ManifoldHessianObjective, p, X; kwargs...) + truncated_conjugate_gradient_descent(M, trmo::TrustRegionModelObjective, p, X; kwargs...) solve the trust-region subproblem ```math -\operatorname*{arg\,min}_{η ∈ T_pM} -m_p(η) \quad\text{where} -m_p(η) = f(p) + ⟨\operatorname{grad} f(p),η⟩_x + \frac{1}{2}⟨\operatorname{Hess} f(p)[η],η⟩_x, +\begin{align*} +\operatorname*{arg\,min}_{Y ∈ T_p\mathcal{M}}&\ m_p(Y) = f(p) + +⟨\operatorname{grad}f(p), Y⟩_p + \frac{1}{2} ⟨\mathcal{H}_p[Y], Y⟩_p\\ +\text{such that}& \ \lVert Y \rVert_p ≤ Δ +\end{align*} ``` -```math -\text{such that}\quad ⟨η,η⟩_x ≤ Δ^2 -``` - -on a manifold M by using the Steihaug-Toint truncated conjugate-gradient method, -abbreviated tCG-method. +on a manifold M by using the Steihaug-Toint truncated conjugate-gradient (tCG) method. For a description of the algorithm and theorems offering convergence guarantees, -see the reference: - -* P.-A. Absil, C.G. Baker, K.A. Gallivan, - Trust-region methods on Riemannian manifolds, FoCM, 2007. - doi: [10.1007/s10208-005-0179-9](https://doi.org/10.1007/s10208-005-0179-9) -* A. R. Conn, N. I. M. Gould, P. L. Toint, Trust-region methods, SIAM, - MPS, 2000. doi: [10.1137/1.9780898719857](https://doi.org/10.1137/1.9780898719857) +see [Absil, Baker, Gallivan, FoCM, 2007](@cite AbsilBakerGallivan:2006), [Conn, Gould, Toint, 2000](@cite ConnGouldToint:2000). # Input See signatures above, you can leave out only the Hessian, the vector, the point and the vector, or all 3. -* `M` – a manifold ``\mathcal M`` -* `f` – a cost function ``F: \mathcal M → ℝ`` to minimize +* `M` – a manifold ``\mathcal M`` +* `f` – a cost function ``f: \mathcal M → ℝ`` to minimize * `grad_f` – the gradient ``\operatorname{grad}f: \mathcal M → T\mathcal M`` of `F` * `Hess_f` – (optional, cf. [`ApproxHessianFiniteDifference`](@ref)) the hessian ``\operatorname{Hess}f: T_p\mathcal M → T_p\mathcal M``, ``X ↦ \operatorname{Hess}F(p)[X] = ∇_X\operatorname{grad}f(p)`` -* `p` – a point on the manifold ``p ∈ \mathcal M`` -* `X` – an update tangential vector ``X ∈ T_p\mathcal M`` +* `p` – a point on the manifold ``p ∈ \mathcal M`` +* `X` – an initial tangential vector ``X ∈ T_p\mathcal M`` + +Instead of the three functions, you either provide a [`ManifoldHessianObjective`](@ref) `mho` +which is then used to build the trust region model, or a [`TrustRegionModelObjective`](@ref) `trmo` +directly. # Optional -* `evaluation` – ([`AllocatingEvaluation`](@ref)) specify whether the gradient and hessian work by +* `evaluation` – ([`AllocatingEvaluation`](@ref)) specify whether the gradient and hessian work by allocation (default) or [`InplaceEvaluation`](@ref) in place -* `preconditioner` – a preconditioner for the hessian H -* `θ` – (`1.0`) 1+θ is the superlinear convergence target rate. The method aborts - if the residual is less than or equal to the initial residual to the power of 1+θ. -* `κ` – (`0.1`) the linear convergence target rate. The method aborts if the - residual is less than or equal to κ times the initial residual. -* `randomize` – set to true if the trust-region solve is to be initiated with a - random tangent vector. If set to true, no preconditioner will be - used. This option is set to true in some scenarios to escape saddle - points, but is otherwise seldom activated. +* `preconditioner` – a preconditioner for the hessian H +* `θ` – (`1.0`) 1+θ is the superlinear convergence target rate. +* `κ` – (`0.1`) the linear convergence target rate. +* `randomize` – set to true if the trust-region solve is initialized to a random tangent vector. + This disables preconditioning. * `trust_region_radius` – (`injectivity_radius(M)/4`) a trust-region radius -* `project!` : (`copyto!`) specify a projection operation for tangent vectors - for numerical stability. A function `(M, Y, p, X) -> ...` working in place of `Y`. - per default, no projection is perfomed, set it to `project!` to activate projection. -* `stopping_criterion` – ([`StopAfterIteration`](@ref)` | [`StopWhenResidualIsReducedByFactorOrPower`](@ref)` | '[`StopWhenCurvatureIsNegative`](@ref)` | `[`StopWhenTrustRegionIsExceeded`](@ref) ) - a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop, - where for the default, the maximal number of iterations is set to the dimension of the - manifold, the power factor is `θ`, the reduction factor is `κ`. +* `project!` – (`copyto!`) for numerical stability it is possivle to project onto + the tangent space after every iteration. By default this only copies instead. +* `stopping_criterion` – ([`StopAfterIteration`](@ref)`(manifol_dimension(M)) | `[`StopWhenResidualIsReducedByFactorOrPower`](@ref)`(;κ=κ, θ=θ) | `[`StopWhenCurvatureIsNegative`](@ref)`() | `[`StopWhenTrustRegionIsExceeded`](@ref)`() | `[`StopWhenModelIncreased`](@ref)`()`) + a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop, and the ones that are passed to [`decorate_state!`](@ref) for decorators. # Output -the obtained (approximate) minimizer ``\eta^*``, see [`get_solver_return`](@ref) for details +the obtained (approximate) minimizer ``Y^*``, see [`get_solver_return`](@ref) for details # see also [`trust_regions`](@ref) @@ -515,21 +525,29 @@ function truncated_conjugate_gradient_descent( return (typeof(q) == typeof(rs)) ? rs[] : rs end # -# Objective -> Allocate and call ! -# +# Objective I -> generate model function truncated_conjugate_gradient_descent( M::AbstractManifold, mho::O, p, X; kwargs... ) where {O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective}} + trmo = TrustRegionModelObjective(mho) + TpM = TangentSpace(M, copy(M, p)) + return truncated_conjugate_gradient_descent(TpM, trmo, p, X; kwargs...) +end +# +# Objective II; We have a tangent space model -> Allocate and call ! +function truncated_conjugate_gradient_descent( + M::AbstractManifold, mho::O, p, X; kwargs... +) where { + E, + O<:Union{ + AbstractManifoldSubObjective, + AbstractDecoratedManifoldObjective{E,<:AbstractManifoldSubObjective}, + }, +} q = copy(M, p) Y = copy(M, p, X) return truncated_conjugate_gradient_descent!(M, mho, q, Y; kwargs...) end -# -# Deprecated - even keeping old notation. -# -@deprecate truncated_conjugate_gradient_descent( - M::AbstractManifold, F, gradF, x, η, H::TH; kwargs... -) where {TH<:Function} truncated_conjugate_gradient_descent(M, F, gradF, H, x, η; kwargs...) @doc raw""" truncated_conjugate_gradient_descent!(M, f, grad_f, Hess_f, p, X; kwargs...) @@ -595,16 +613,24 @@ function truncated_conjugate_gradient_descent!( M, mho, p, X; evaluation=evaluation, kwargs... ) end +# Main Initiliaization I: We have an mho -> generate tangent space model objective function truncated_conjugate_gradient_descent!( - M::AbstractManifold, - mho::O, + M::AbstractManifold, mho::O, p, X; kwargs... +) where {O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective}} + trmo = TrustRegionModelObjective(mho) + TpM = TangentSpace(M, copy(M, p)) + return truncated_conjugate_gradient_descent!(TpM, trmo, p, X; kwargs...) +end +function truncated_conjugate_gradient_descent!( + TpM::TangentSpace, + trm::TrustRegionModelObjective, p, X; - trust_region_radius::Float64=injectivity_radius(M) / 4, + trust_region_radius::Float64=injectivity_radius(TpM) / 4, θ::Float64=1.0, κ::Float64=0.1, randomize::Bool=false, - stopping_criterion::StoppingCriterion=StopAfterIteration(manifold_dimension(M)) | + stopping_criterion::StoppingCriterion=StopAfterIteration(manifold_dimension(TpM)) | StopWhenResidualIsReducedByFactorOrPower(; κ=κ, θ=θ ) | @@ -613,12 +639,11 @@ function truncated_conjugate_gradient_descent!( StopWhenModelIncreased(), project!::Proj=copyto!, kwargs..., #collect rest -) where {Proj,O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective}} - dmho = decorate_objective!(M, mho; kwargs...) - mp = DefaultManoptProblem(M, dmho) +) where {Proj} + dtrm = decorate_objective!(TpM, trm; kwargs...) + mp = DefaultManoptProblem(TpM, dtrm) tcgs = TruncatedConjugateGradientState( - M, - p, + TpM, X; trust_region_radius=trust_region_radius, randomize=randomize, @@ -631,13 +656,6 @@ function truncated_conjugate_gradient_descent!( solve!(mp, dtcgs) return get_solver_return(get_objective(mp), dtcgs) end -# -# Deprecated - even kept old notation -@deprecate truncated_conjugate_gradient_descent!( - M::AbstractManifold, F::TF, gradF::TG, x, η, H::TH; kwargs... -) where {TF<:Function,TG<:Function,TH<:Function} truncated_conjugate_gradient_descent!( - M, F, gradF, H, x, η; kwargs... -) # # Maybe these could be improved a bit in readability some time @@ -645,71 +663,90 @@ end function initialize_solver!( mp::AbstractManoptProblem, tcgs::TruncatedConjugateGradientState ) - M = get_manifold(mp) - (tcgs.randomize) || zero_vector!(M, tcgs.η, tcgs.p) - tcgs.Hη = tcgs.randomize ? get_hessian(mp, tcgs.p, tcgs.η) : zero_vector(M, tcgs.p) - tcgs.X = get_gradient(mp, tcgs.p) - tcgs.residual = tcgs.randomize ? tcgs.X + tcgs.Hη : tcgs.X - tcgs.z = tcgs.randomize ? tcgs.residual : get_preconditioner(mp, tcgs.p, tcgs.residual) - tcgs.δ = -copy(M, tcgs.p, tcgs.z) - tcgs.Hδ = zero_vector(M, tcgs.p) - tcgs.δHδ = real(inner(M, tcgs.p, tcgs.δ, tcgs.Hδ)) - tcgs.ηPδ = tcgs.randomize ? real(inner(M, tcgs.p, tcgs.η, tcgs.δ)) : zero(tcgs.δHδ) - tcgs.δPδ = real(inner(M, tcgs.p, tcgs.residual, tcgs.z)) - tcgs.ηPη = tcgs.randomize ? real(inner(M, tcgs.p, tcgs.η, tcgs.η)) : zero(tcgs.δHδ) + TpM = get_manifold(mp) + M = base_manifold(TpM) + p = TpM.point + trmo = get_objective(mp) + # TODO Reworked until here + (tcgs.randomize) || zero_vector!(M, tcgs.Y, p) + tcgs.HY = tcgs.randomize ? get_objective_hessian(M, trmo, p, tcgs.Y) : zero_vector(M, p) + tcgs.X = get_objective_gradient(M, trmo, p) # Initialize gradient + tcgs.residual = tcgs.randomize ? tcgs.X + tcgs.HY : tcgs.X + tcgs.z = if tcgs.randomize + tcgs.residual + else + get_objective_preconditioner(M, trmo, p, tcgs.residual) + end + tcgs.δ = -copy(M, p, tcgs.z) + tcgs.Hδ = zero_vector(M, p) + tcgs.δHδ = real(inner(M, p, tcgs.δ, tcgs.Hδ)) + tcgs.YPδ = tcgs.randomize ? real(inner(M, p, tcgs.Y, tcgs.δ)) : zero(tcgs.δHδ) + tcgs.δPδ = real(inner(M, p, tcgs.residual, tcgs.z)) + tcgs.YPY = tcgs.randomize ? real(inner(M, p, tcgs.Y, tcgs.Y)) : zero(tcgs.δHδ) if tcgs.randomize tcgs.model_value = - real(inner(M, tcgs.p, tcgs.η, tcgs.X)) + - 0.5 * real(inner(M, tcgs.p, tcgs.η, tcgs.Hη)) + real(inner(M, p, tcgs.Y, tcgs.X)) + 0.5 * real(inner(M, p, tcgs.Y, tcgs.HY)) else tcgs.model_value = 0 end - tcgs.z_r = real(inner(M, tcgs.p, tcgs.z, tcgs.residual)) - tcgs.initialResidualNorm = norm(M, tcgs.p, tcgs.residual) + tcgs.z_r = real(inner(M, p, tcgs.z, tcgs.residual)) + tcgs.initialResidualNorm = norm(M, p, tcgs.residual) return tcgs end function step_solver!( mp::AbstractManoptProblem, tcgs::TruncatedConjugateGradientState, ::Any ) - M = get_manifold(mp) - get_hessian!(mp, tcgs.Hδ, tcgs.p, tcgs.δ) - tcgs.δHδ = real(inner(M, tcgs.p, tcgs.δ, tcgs.Hδ)) + TpM = get_manifold(mp) + M = base_manifold(TpM) + p = TpM.point + trmo = get_objective(mp) + get_objective_hessian!(M, tcgs.Hδ, trmo, p, tcgs.δ) + tcgs.δHδ = real(inner(M, p, tcgs.δ, tcgs.Hδ)) α = tcgs.z_r / tcgs.δHδ - ηPη_new = tcgs.ηPη + 2 * α * tcgs.ηPδ + α^2 * tcgs.δPδ + YPY_new = tcgs.YPY + 2 * α * tcgs.YPδ + α^2 * tcgs.δPδ # Check against negative curvature and trust-region radius violation. - if tcgs.δHδ <= 0 || ηPη_new >= tcgs.trust_region_radius^2 + if tcgs.δHδ <= 0 || YPY_new >= tcgs.trust_region_radius^2 τ = ( - -tcgs.ηPδ + - sqrt(tcgs.ηPδ^2 + tcgs.δPδ * (tcgs.trust_region_radius^2 - tcgs.ηPη)) + -tcgs.YPδ + + sqrt(tcgs.YPδ^2 + tcgs.δPδ * (tcgs.trust_region_radius^2 - tcgs.YPY)) ) / tcgs.δPδ - copyto!(M, tcgs.η, tcgs.p, tcgs.η + τ * tcgs.δ) - copyto!(M, tcgs.Hη, tcgs.p, tcgs.Hη + τ * tcgs.Hδ) - tcgs.ηPη = ηPη_new + copyto!(M, tcgs.Y, p, tcgs.Y + τ * tcgs.δ) + copyto!(M, tcgs.HY, p, tcgs.HY + τ * tcgs.Hδ) + tcgs.YPY = YPY_new + return tcgs + end + tcgs.YPY = YPY_new + new_Y = tcgs.Y + α * tcgs.δ # Update iterate Y + new_HY = tcgs.HY + α * tcgs.Hδ # Update HY + new_model_value = + real(inner(M, p, new_Y, tcgs.X)) + 0.5 * real(inner(M, p, new_Y, new_HY)) + # If we did not improved the model with this iterate -> end iteration + if new_model_value >= tcgs.model_value + tcgs.model_value = new_model_value return tcgs end - tcgs.ηPη = ηPη_new - new_η = tcgs.η + α * tcgs.δ - new_Hη = tcgs.Hη + α * tcgs.Hδ - # No negative curvature and s.η - α * (s.δ) inside TR: accept it. - tcgs.new_model_value = - real(inner(M, tcgs.p, new_η, tcgs.X)) + 0.5 * real(inner(M, tcgs.p, new_η, new_Hη)) - tcgs.new_model_value >= tcgs.model_value && return tcgs - copyto!(M, tcgs.η, tcgs.p, new_η) - tcgs.model_value = tcgs.new_model_value - copyto!(M, tcgs.Hη, tcgs.p, new_Hη) + # otherweise accept step + copyto!(M, tcgs.Y, p, new_Y) + tcgs.model_value = new_model_value + copyto!(M, tcgs.HY, p, new_HY) tcgs.residual = tcgs.residual + α * tcgs.Hδ - # Precondition the residual. - tcgs.z = tcgs.randomize ? tcgs.residual : get_preconditioner(mp, tcgs.p, tcgs.residual) - zr = real(inner(M, tcgs.p, tcgs.z, tcgs.residual)) + # Precondition the residual if we are not running in random mode + tcgs.z = if tcgs.randomize + tcgs.residual + else + get_objective_preconditioner(M, trmo, p, tcgs.residual) + end + zr = real(inner(M, p, tcgs.z, tcgs.residual)) # Compute new search direction. β = zr / tcgs.z_r tcgs.z_r = zr tcgs.δ = -tcgs.z + β * tcgs.δ - tcgs.project!(M, tcgs.δ, tcgs.p, tcgs.δ) - tcgs.ηPδ = β * (α * tcgs.δPδ + tcgs.ηPδ) + # potentially stabilize step by projecting. + tcgs.project!(M, tcgs.δ, p, tcgs.δ) + tcgs.YPδ = β * (α * tcgs.δPδ + tcgs.YPδ) tcgs.δPδ = tcgs.z_r + β^2 * tcgs.δPδ return tcgs end -get_solver_result(s::TruncatedConjugateGradientState) = s.η +get_solver_result(s::TruncatedConjugateGradientState) = s.Y diff --git a/src/solvers/trust_regions.jl b/src/solvers/trust_regions.jl index 05d838e31d..b47a9252a8 100644 --- a/src/solvers/trust_regions.jl +++ b/src/solvers/trust_regions.jl @@ -2,64 +2,74 @@ @doc raw""" TrustRegionsState <: AbstractHessianSolverState -describe the trust-regions solver, with - +Store the state of the trust-regions solver. # Fields -where all but `p` are keyword arguments in the constructor -* `p` : the current iterate -* `stop` : (`StopAfterIteration(1000) | StopWhenGradientNormLess(1e-6)) -* `max_trust_region_radius` : (`sqrt(manifold_dimension(M))`) the maximum trust-region radius -* `project!` : (`copyto!`) specify a projection operation for tangent vectors +All the following fields (besides `p`) can be set by specifying them as keywords. + +* `acceptance_rate` – (`0.1`) a lower bound of the performance ratio for the iterate that + decides if the iteration will be accepted or not. +* `max_trust_region_radius` – (`sqrt(manifold_dimension(M))`) the maximum trust-region radius +* `p` – (`rand(M)` if a manifold is provided) the current iterate +* `project!` – (`copyto!`) specify a projection operation for tangent vectors for numerical stability. A function `(M, Y, p, X) -> ...` working in place of `Y`. per default, no projection is perfomed, set it to `project!` to activate projection. -* `randomize` : (`false`) indicates if the trust-region solve is to be initiated with a - random tangent vector. If set to true, no preconditioner will be - used. This option is set to true in some scenarios to escape saddle - points, but is otherwise seldom activated. -* `ρ_prime` : (`0.1`) a lower bound of the performance ratio for the iterate that - decides if the iteration will be accepted or not. If not, the - trust-region radius will have been decreased. To ensure this, - ρ'>= 0 must be strictly smaller than 1/4. If ρ' is negative, - the algorithm is not guaranteed to produce monotonically decreasing - cost values. It is strongly recommended to set ρ' > 0, to aid - convergence. -* `ρ_regularization` : (`10000.0`) Close to convergence, evaluating the performance ratio ρ - is numerically challenging. Meanwhile, close to convergence, the - quadratic model should be a good fit and the steps should be - accepted. Regularization lets ρ go to 1 as the model decrease and - the actual decrease go to zero. Set this option to zero to disable - regularization (not recommended). When this is not zero, it may happen - that the iterates produced are not monotonically improving the cost - when very close to convergence. This is because the corrected cost - improvement could change sign if it is negative but very small. -* `trust_region_radius` : the (initial) trust-region radius - -# Constructor - - TrustRegionsState(M, - p=rand(M), - X=zero_vector(M,p), - sub_state=TruncatedConjugateGradientState(M, p, X), -) +* `stop` – ([`StopAfterIteration`](@ref)`(1000) | `[`StopWhenGradientNormLess`](@ref)`(1e-6)`) +* `randomize` – (`false`) indicates if the trust-region solve is to be initiated with a + random tangent vector. If set to true, no preconditioner will be + used. This option is set to true in some scenarios to escape saddle + points, but is otherwise seldom activated. +* `ρ_regularization` – (`10000.0`) regularize the model fitness ``ρ`` to avoid division by zero +* `sub_problem` – an [`AbstractManoptProblem`](@ref) problem or a function `(M, p, X) -> q` or `(M, q, p, X)` for the a closed form solution of the sub problem +* `sub_state` – ([`TruncatedConjugateGradientState`](@ref)`(M, p, X)`) +* `σ` – (`0.0` or `1e-6` depending on `randomize`) Gaussian standard deviation when creating the random initial tangent vector +* `trust_region_radius` – (`max_trust_region_radius / 8`) the (initial) trust-region radius +* `X` – (`zero_vector(M,p)`) the current gradient `grad_f(p)` + Use this default to specify the type of tangent vector to allocate also for the internal (tangent vector) fields. + +# Internal Fields + +* `HX`, `HY`, `HZ` – interims storage (to avoid allocation) of ``\operatorname{Hess} f(p)[\cdot]` of `X`, `Y`, `Z` +* `Y` – the solution (tangent vector) of the subsolver +* `Z` – the Cauchy point (only used if random is activated) + + +# Constructors + +All the following constructors have the above fields as keyword arguents with the defaults +given in brackets. If no initial point `p` is provided, `p=rand(M)` is used + + + + TrustRegionsState(M, mho; kwargs...) + TrustRegionsState(M, p, mho; kwargs...) + +A trust region state, where the sub problem is set to a [`DefaultManoptProblem`](@ref) on the +tangent space using the [`TrustRegionModelObjective`](@ref) to be solved with [`truncated_conjugate_gradient_descent!`](@ref) +or in other words the sub state is set to [`TruncatedConjugateGradientState`](@ref). + + TrustRegionsState(M, sub_problem, sub_state; kwargs...) + TrustRegionsState(M, p, sub_problem, sub_state; kwargs...) + +A trust region state, where the sub problem is solved using a [`AbstractManoptProblem`](@ref) `sub_problem` +and an [`AbstractManoptSolverState`](@ref) `sub_state`. + + TrustRegionsState(M, f::Function; evaluation=AllocatingEvaluation, kwargs...) + TrustRegionsState(M, p, f; evaluation=AllocatingEvaluation, kwargs...) + +A trust region state, where the sub problem is solved in closed form by a funtion +`f(M, p, Y, Δ)`, where `p` is the current iterate, `Y` the inital tangent vector at `p` and +`Δ` the current trust region radius. -construct a trust-regions Option with all other fields from above being -keyword arguments # See also -[`trust_regions`](@ref) +[`trust_regions`](@ref), [`trust_regions!`](@ref) """ mutable struct TrustRegionsState{ - P, - T, - SC<:StoppingCriterion, - RTR<:AbstractRetractionMethod, - R<:Real, - Proj, - Op<:AbstractHessianSolverState, -} <: AbstractHessianSolverState + P,T,Pr,St,SC<:StoppingCriterion,RTR<:AbstractRetractionMethod,R<:Real,Proj +} <: AbstractSubProblemSolverState p::P X::T stop::SC @@ -68,66 +78,104 @@ mutable struct TrustRegionsState{ retraction_method::RTR randomize::Bool project!::Proj - ρ_prime::R + acceptance_rate::R ρ_regularization::R - sub_state::Op + sub_problem::Pr + sub_state::St p_proposal::P f_proposal::R - # Random - Hgrad::T - η::T - Hη::T - η_Cauchy::T - Hη_Cauchy::T + # Only required for Random mode Random + HX::T + Y::T + HY::T + Z::T + HZ::T τ::R + σ::R reduction_threshold::R + reduction_factor::R augmentation_threshold::R - function TrustRegionsState{P,T,SC,RTR,R,Proj,Op}( + augmentation_factor::R + function TrustRegionsState{P,T,Pr,St,SC,RTR,R,Proj}( p::P, X::T, trust_region_radius::R, max_trust_region_radius::R, - ρ_prime::R, + acceptance_rate::R, ρ_regularization::R, randomize::Bool, stopping_citerion::SC, retraction_method::RTR, reduction_threshold::R, augmentation_threshold::R, - sub_state::Op, + sub_problem::Pr, + sub_state::St, project!::Proj=copyto!, - ) where { - P, - T, - SC<:StoppingCriterion, - RTR<:AbstractRetractionMethod, - R<:Real, - Proj, - Op<:AbstractHessianSolverState, - } - trs = new{P,T,SC,RTR,R,Proj,Op}() + reduction_factor=0.25, + augmentation_factor=2.0, + σ::R=random ? 1e-6 : 0.0, + ) where {P,T,Pr,St,SC<:StoppingCriterion,RTR<:AbstractRetractionMethod,R<:Real,Proj} + trs = new{P,T,Pr,St,SC,RTR,R,Proj}() trs.p = p trs.X = X trs.stop = stopping_citerion trs.retraction_method = retraction_method trs.trust_region_radius = trust_region_radius trs.max_trust_region_radius = max_trust_region_radius::R - trs.ρ_prime = ρ_prime + trs.acceptance_rate = acceptance_rate trs.ρ_regularization = ρ_regularization trs.randomize = randomize + trs.sub_problem = sub_problem trs.sub_state = sub_state trs.reduction_threshold = reduction_threshold + trs.reduction_factor = reduction_factor trs.augmentation_threshold = augmentation_threshold + trs.augmentation_factor = augmentation_factor trs.project! = project! + trs.σ = σ return trs end end +# No point, no state -> add point +function TrustRegionsState( + M, sub_problem::Pr; kwargs... +) where {Pr<:Union{AbstractManoptProblem,<:Function,AbstractManifoldHessianObjective}} + return TrustRegionsState(M, rand(M), sub_problem; kwargs...) +end +# No point but state -> add point +function TrustRegionsState( + M, sub_problem::Pr, sub_state::St; kwargs... +) where {Pr<:Union{AbstractManoptProblem,<:Function},St} + return TrustRegionsState(M, rand(M), sub_problem, sub_state; kwargs...) +end +# point, but no state for a function -> add evaluation as state +function TrustRegionsState( + M, + p, + sub_problem::Pr; + evaluation::AbstractEvaluationType=AllocatingEvaluation(), + kwargs..., +) where {Pr<:Function} + return TrustRegionsState(M, p, sub_problem, evaluation; kwargs...) +end +function TrustRegionsState( + M, p, mho::H; kwargs... +) where {H<:AbstractManifoldHessianObjective} + TpM = TangentSpace(M, copy(M, p)) + problem = DefaultManoptProblem(TpM, TrustRegionModelObjective(mho)) + state = TruncatedConjugateGradientState(TpM, get_gradient(M, mho, p)) + return TrustRegionsState(M, p, problem, state; kwargs...) +end function TrustRegionsState( M::TM, - p::P=rand(M), + p::P, + sub_problem::Pr, + sub_state::St=TruncatedConjugateGradientState( + TangentSpace(M, copy(M, p)), zero_vector(M, p) + ); X::T=zero_vector(M, p), - sub_state::Op=TruncatedConjugateGradientState(M, p, X); - ρ_prime::R=0.1, + ρ_prime::R=0.1, #deprecated, remove on next breaking change + acceptance_rate=ρ_prime, ρ_regularization::R=1000.0, randomize::Bool=false, stopping_criterion::SC=StopAfterIteration(1000) | StopWhenGradientNormLess(1e-6), @@ -135,32 +183,40 @@ function TrustRegionsState( trust_region_radius::R=max_trust_region_radius / 8, retraction_method::RTR=default_retraction_method(M, typeof(p)), reduction_threshold::R=0.1, + reduction_factor=0.25, augmentation_threshold::R=0.75, + augmentation_factor=2.0, project!::Proj=copyto!, + σ=randomize ? 1e-4 : 0.0, ) where { TM<:AbstractManifold, + Pr<:AbstractManoptProblem, + St, P, T, R<:Real, SC<:StoppingCriterion, RTR<:AbstractRetractionMethod, Proj, - Op<:AbstractHessianSolverState, } - return TrustRegionsState{P,T,SC,RTR,R,Proj,Op}( + return TrustRegionsState{P,T,Pr,St,SC,RTR,R,Proj}( p, X, trust_region_radius, max_trust_region_radius, - ρ_prime, + acceptance_rate, ρ_regularization, randomize, stopping_criterion, retraction_method, reduction_threshold, augmentation_threshold, + sub_problem, sub_state, project!, + reduction_factor, + augmentation_factor, + σ, ) end get_iterate(trs::TrustRegionsState) = trs.p @@ -168,22 +224,35 @@ function set_iterate!(trs::TrustRegionsState, M, p) copyto!(M, trs.p, p) return trs end +get_gradient(agst::TrustRegionsState) = agst.X +function set_gradient!(agst::TrustRegionsState, M, p, X) + copyto!(M, agst.X, p, X) + return agst +end +function get_message(dcs::TrustRegionsState) + # for now only the sub solver might have messages + return get_message(dcs.sub_state) +end function show(io::IO, trs::TrustRegionsState) i = get_count(trs, :Iterations) Iter = (i > 0) ? "After $i iterations\n" : "" Conv = indicates_convergence(trs.stop) ? "Yes" : "No" + sub = repr(trs.sub_state) + sub = replace(sub, "\n" => "\n | ") s = """ # Solver state for `Manopt.jl`s Trust Region Method $Iter ## Parameters - * augmentation threshold: $(trs.augmentation_threshold) - * randomize: $(trs.randomize) - * reduction threshold: $(trs.reduction_threshold) - * retraction method: $(trs.retraction_method) - * ρ‘: $(trs.ρ_prime) - * ρ_regularization: $(trs.ρ_regularization) - * trust region radius: $(trs.trust_region_radius) (max: $(trs.max_trust_region_radius)) + * acceptance_rate (ρ'): $(trs.acceptance_rate) + * augmentation threshold: $(trs.augmentation_threshold) (factor: $(trs.augmentation_factor)) + * randomize: $(trs.randomize) + * reduction threshold: $(trs.reduction_threshold) (factor: $(trs.reduction_factor)) + * retraction method: $(trs.retraction_method) + * ρ_regularization: $(trs.ρ_regularization) + * trust region radius: $(trs.trust_region_radius) (max: $(trs.max_trust_region_radius)) + * sub solver state : + | $(sub) ## Stopping Criterion $(status_summary(trs.stop)) @@ -192,71 +261,58 @@ function show(io::IO, trs::TrustRegionsState) end @doc raw""" - trust_regions(M, f, grad_f, hess_f, p) - trust_regions(M, f, grad_f, p) + trust_regions(M, f, grad_f, hess_f, p=rand(M)) + trust_regions(M, f, grad_f, p=rand(M)) run the Riemannian trust-regions solver for optimization on manifolds to minimize `f` cf. [[Absil, Baker, Gallivan, FoCM, 2006](@cite AbsilBakerGallivan:2006); [Conn, Gould, Toint, SIAM, 2000](@cite ConnGouldToint:2000)]. -For the case that no hessian is provided, the Hessian is computed using finite difference, see -[`ApproxHessianFiniteDifference`](@ref). +For the case that no hessian is provided, the Hessian is computed using finite differences, +see [`ApproxHessianFiniteDifference`](@ref). For solving the the inner trust-region subproblem of finding an update-vector, -see [`truncated_conjugate_gradient_descent`](@ref). +by default the [`truncated_conjugate_gradient_descent`](@ref) is used. # Input -* `M` – a manifold ``\mathcal M`` -* `f` – a cost function ``F : \mathcal M → ℝ`` to minimize -* `grad_f`- the gradient ``\operatorname{grad}F : \mathcal M → T \mathcal M`` of ``F`` +* `M` – a manifold ``\mathcal M`` +* `f` – a cost function ``f : \mathcal M → ℝ`` to minimize +* `grad_f` – the gradient ``\operatorname{grad}F : \mathcal M → T \mathcal M`` of ``F`` * `Hess_f` – (optional), the hessian ``\operatorname{Hess}F(x): T_x\mathcal M → T_x\mathcal M``, ``X ↦ \operatorname{Hess}F(x)[X] = ∇_ξ\operatorname{grad}f(x)`` -* `p` – an initial value ``x ∈ \mathcal M`` +* `p` – (`rand(M)`) an initial value ``x ∈ \mathcal M`` # Optional -* `evaluation` – ([`AllocatingEvaluation`](@ref)) specify whether the gradient and hessian work by +* `evaluation` – ([`AllocatingEvaluation`](@ref)) specify whether the gradient and hessian work by allocation (default) or [`InplaceEvaluation`](@ref) in place * `max_trust_region_radius` – the maximum trust-region radius -* `preconditioner` – a preconditioner (a symmetric, positive definite operator +* `preconditioner` – a preconditioner (a symmetric, positive definite operator that should approximate the inverse of the Hessian) -* `randomize` – set to true if the trust-region solve is to be initiated with a - random tangent vector. If set to true, no preconditioner will be - used. This option is set to true in some scenarios to escape saddle - points, but is otherwise seldom activated. -* `project!` : (`copyto!`) specify a projection operation for tangent vectors within the TCG - for numerical stability. A function `(M, Y, p, X) -> ...` working in place of `Y`. - per default, no projection is perfomed, set it to `project!` to activate projection. -* `retraction` – (`default_retraction_method(M, typeof(p))`) approximation of the exponential map -* `stopping_criterion` – ([`StopWhenAny`](@ref)([`StopAfterIteration`](@ref)`(1000)`, - [`StopWhenGradientNormLess`](@ref)`(10^(-6))`) a functor inheriting +* `randomize` – set to true if the trust-region solve is to be initiated with a + random tangent vector and no preconditioner will be used. +* `project!` – (`copyto!`) specify a projection operation for tangent vectors + within the subsolver for numerical stability. + this means we require a function `(M, Y, p, X) -> ...` working in place of `Y`. +* `retraction` – (`default_retraction_method(M, typeof(p))`) a retraction to use +* `stopping_criterion` – ([`StopAfterIteration`](@ref)`(1000) | `[`StopWhenGradientNormLess`](@ref)`(1e-6)`) a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop. -* `trust_region_radius` - the initial trust-region radius -* `ρ_prime` – Accept/reject threshold: if ρ (the performance ratio for the - iterate) is at least ρ', the outer iteration is accepted. - Otherwise, it is rejected. In case it is rejected, the trust-region - radius will have been decreased. To ensure this, ρ' >= 0 must be - strictly smaller than 1/4. If ρ_prime is negative, the algorithm is not - guaranteed to produce monotonically decreasing cost values. It is - strongly recommended to set ρ' > 0, to aid convergence. -* `ρ_regularization` – Close to convergence, evaluating the performance ratio ρ - is numerically challenging. Meanwhile, close to convergence, the - quadratic model should be a good fit and the steps should be - accepted. Regularization lets ρ go to 1 as the model decrease and - the actual decrease go to zero. Set this option to zero to disable - regularization (not recommended). When this is not zero, it may happen - that the iterates produced are not monotonically improving the cost - when very close to convergence. This is because the corrected cost - improvement could change sign if it is negative but very small. -* `θ` – (`1.0`) 1+θ is the superlinear convergence target rate of the tCG-method - [`truncated_conjugate_gradient_descent`](@ref), which computes an - approximate solution for the trust-region subproblem. The tCG-method aborts - if the residual is less than or equal to the initial residual to the power of 1+θ. -* `κ` – (`0.1`) the linear convergence target rate of the tCG-method - [`truncated_conjugate_gradient_descent`](@ref), which computes an - approximate solution for the trust-region subproblem. The method aborts if the - residual is less than or equal to κ times the initial residual. -* `reduction_threshold` – (`0.1`) Trust-region reduction threshold: if ρ (the performance ratio for - the iterate) is less than this bound, the trust-region radius and thus the trust-regions - decreases. -* `augmentation_threshold` – (`0.75`) Trust-region augmentation threshold: if ρ (the performance ratio for - the iterate) is greater than this and further conditions apply, the trust-region radius and thus the trust-regions increases. +* `trust_region_radius` – the initial trust-region radius +* `acceptance_rate` – Accept/reject threshold: if ρ (the performance ratio for the + iterate) is at least the acceptance rate ρ', the candidate is accepted. + This value should be between ``0`` and ``\frac{1}{4}`` + (formerly this was called `ρ_prime, which will be removed on the next breaking change) +* `ρ_regularization` – (`1e3`) regularize the performance evaluation ``ρ`` + to avoid numerical inaccuracies. +* `θ` – (`1.0`) 1+θ is the superlinear convergence target rate of the tCG-method + [`truncated_conjugate_gradient_descent`](@ref), and is used in a stopping crierion therein +* `κ` – (`0.1`) the linear convergence target rate of the tCG method + [`truncated_conjugate_gradient_descent`](@ref), and is used in a stopping crierion therein +* `reduction_threshold` – (`0.1`) trust-region reduction threshold: if ρ is below this threshold, + the trust region radius is reduced by `reduction_factor`. +* `reduction_factor` – (`0.25`) trust-region reduction factor +* `augmentation_threshold` – (`0.75`) trust-region augmentation threshold: if ρ is above this threshold, + we have a solution on the trust region boundary and negative curvature, we extend (augment) the radius +* `augmentation_factor` – (`2.0`) trust-region augmentation factor + +For the case that no hessian is provided, the Hessian is computed using finite difference, see +[`ApproxHessianFiniteDifference`](@ref). # Output @@ -266,11 +322,13 @@ the obtained (approximate) minimizer ``p^*``, see [`get_solver_return`](@ref) fo [`truncated_conjugate_gradient_descent`](@ref) """ trust_regions(M::AbstractManifold, args...; kwargs...) +# Hesian (Function) but no point function trust_regions( M::AbstractManifold, f, grad_f, Hess_f::TH; kwargs... ) where {TH<:Function} return trust_regions(M, f, grad_f, Hess_f, rand(M); kwargs...) end +# Hesian (Function) and point function trust_regions( M::AbstractManifold, f, @@ -288,6 +346,7 @@ function trust_regions( mho = ManifoldHessianObjective(f, grad_f, Hess_f, preconditioner; evaluation=evaluation) return trust_regions(M, mho, p; evaluation=evaluation, kwargs...) end +# Hesian (Function) and point (but a number) function trust_regions( M::AbstractManifold, f, @@ -317,9 +376,11 @@ function trust_regions( ) return (typeof(q) == typeof(rs)) ? rs[] : rs end +# neither Hesian (Function) nor point function trust_regions(M::AbstractManifold, f, grad_f; kwargs...) return trust_regions(M, f, grad_f, rand(M); kwargs...) end +# no Hesian (Function), but point (any) function trust_regions( M::AbstractManifold, f::TF, @@ -343,6 +404,7 @@ function trust_regions( kwargs..., ) end +# Objective function trust_regions( M::AbstractManifold, mho::O, p=rand(M); kwargs... ) where {O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective}} @@ -357,11 +419,11 @@ end evaluate the Riemannian trust-regions solver in place of `p`. # Input -* `M` – a manifold ``\mathcal M`` -* `f` – a cost function ``F: \mathcal M → ℝ`` to minimize -* `grad_f`- the gradient ``\operatorname{grad}F: \mathcal M → T \mathcal M`` of ``F`` -* `Hess_f` – (optional) the hessian ``H( \mathcal M, x, ξ)`` of ``F`` -* `p` – an initial value ``p ∈ \mathcal M`` +* `M` – a manifold ``\mathcal M`` +* `f` – a cost function ``f: \mathcal M → ℝ`` to minimize +* `grad_f` – the gradient ``\operatorname{grad}f: \mathcal M → T \mathcal M`` of ``F`` +* `Hess_f` – (optional) the hessian ``\operatorname{Hess} f`` +* `p` – an initial value ``p ∈ \mathcal M`` For the case that no hessian is provided, the Hessian is computed using finite difference, see [`ApproxHessianFiniteDifference`](@ref). @@ -369,6 +431,7 @@ For the case that no hessian is provided, the Hessian is computed using finite d for more details and all options, see [`trust_regions`](@ref) """ trust_regions!(M::AbstractManifold, args...; kwargs...) +# No Hessian but a point (Any) function trust_regions!( M::AbstractManifold, f, @@ -392,6 +455,7 @@ function trust_regions!( kwargs..., ) end +# Hessian and point function trust_regions!( M::AbstractManifold, f, @@ -409,6 +473,7 @@ function trust_regions!( mho = ManifoldHessianObjective(f, grad_f, Hess_f, preconditioner; evaluation=evaluation) return trust_regions!(M, mho, p; evaluation=evaluation, kwargs...) end +# Objective function trust_regions!( M::AbstractManifold, mho::O, @@ -416,19 +481,24 @@ function trust_regions!( retraction_method::AbstractRetractionMethod=default_retraction_method(M, typeof(p)), stopping_criterion::StoppingCriterion=StopAfterIteration(1000) | StopWhenGradientNormLess(1e-6), - max_trust_region_radius=sqrt(manifold_dimension(M)), - trust_region_radius=max_trust_region_radius / 8, - randomize::Bool=false, + max_trust_region_radius::R=sqrt(manifold_dimension(M)), + trust_region_radius::R=max_trust_region_radius / 8, + randomize::Bool=false, # Deprecated, remove on next release (use just σ) project!::Proj=copyto!, - ρ_prime::Float64=0.1, - ρ_regularization=1000.0, - θ::Float64=1.0, - κ::Float64=0.1, - reduction_threshold::Float64=0.1, - augmentation_threshold::Float64=0.75, - sub_state::AbstractHessianSolverState=TruncatedConjugateGradientState( - M, - p, + ρ_prime::R=0.1, # Deprecated, remove on next breaking change (use acceptance_rate) + acceptance_rate::R=ρ_prime, + ρ_regularization=1e3, + θ::R=1.0, + κ::R=0.1, + σ=randomize ? 1e-3 : 0.0, + reduction_threshold::R=0.1, + reduction_factor::R=0.25, + augmentation_threshold::R=0.75, + augmentation_factor::R=2.0, + sub_objective=TrustRegionModelObjective(mho), + sub_problem=DefaultManoptProblem(TangentSpace(M, p), sub_objective), + sub_state::Union{AbstractHessianSolverState,AbstractEvaluationType}=TruncatedConjugateGradientState( + TangentSpace(M, copy(M, p)), zero_vector(M, p); θ=θ, κ=κ, @@ -437,10 +507,7 @@ function trust_regions!( (project!)=project!, ), kwargs..., #collect rest -) where {Proj,O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective}} - (ρ_prime >= 0.25) && throw( - ErrorException("ρ_prime must be strictly smaller than 0.25 but it is $ρ_prime.") - ) +) where {Proj,O<:Union{ManifoldHessianObjective,AbstractDecoratedManifoldObjective},R} (max_trust_region_radius <= 0) && throw( ErrorException( "max_trust_region_radius must be positive but it is $max_trust_region_radius.", @@ -456,120 +523,119 @@ function trust_regions!( trs = TrustRegionsState( M, p, - get_gradient(dmp, p), + sub_problem, sub_state; + X=get_gradient(dmp, p), trust_region_radius=trust_region_radius, max_trust_region_radius=max_trust_region_radius, - ρ_prime=ρ_prime, + acceptance_rate=acceptance_rate, ρ_regularization=ρ_regularization, randomize=randomize, stopping_criterion=stopping_criterion, retraction_method=retraction_method, reduction_threshold=reduction_threshold, + reduction_factor=reduction_factor, augmentation_threshold=augmentation_threshold, + augmentation_factor=augmentation_factor, (project!)=project!, + σ=σ, ) dtrs = decorate_state!(trs; kwargs...) solve!(dmp, dtrs) return get_solver_return(get_objective(dmp), dtrs) end + function initialize_solver!(mp::AbstractManoptProblem, trs::TrustRegionsState) M = get_manifold(mp) get_gradient!(mp, trs.X, trs.p) - trs.η = zero_vector(M, trs.p) - trs.Hη = zero_vector(M, trs.p) + trs.Y = zero_vector(M, trs.p) + trs.HY = zero_vector(M, trs.p) trs.p_proposal = deepcopy(trs.p) trs.f_proposal = zero(trs.trust_region_radius) - - trs.η_Cauchy = zero_vector(M, trs.p) - trs.Hη_Cauchy = zero_vector(M, trs.p) - trs.τ = zero(trs.trust_region_radius) - trs.Hgrad = zero_vector(M, trs.p) + if trs.randomize #only init if necessary + trs.Z = zero_vector(M, trs.p) + trs.HZ = zero_vector(M, trs.p) + trs.τ = zero(trs.trust_region_radius) + trs.HX = zero_vector(M, trs.p) + end return trs end function step_solver!(mp::AbstractManoptProblem, trs::TrustRegionsState, i) M = get_manifold(mp) mho = get_objective(mp) - # Determine eta0 - if trs.randomize - # Random vector in T_x M (this has to be very small) - trs.η = 10.0^(-6) * rand(M; vector_at=trs.p) - while norm(M, trs.p, trs.η) > trs.trust_region_radius - # inside trust-region - trs.η *= sqrt(sqrt(eps(Float64))) + # Determine the initial tangent vector used as start point for the subsolvereta0 + if trs.σ > 0 + rand!(M, trs.Y; vector_at=trs.p, σ=trs.σ) + nY = norm(M, trs.p, trs.Y) + if nY > trs.trust_region_radius # move inside if outside + trs.Y *= trs.trust_region_radius / (2 * nY) end else - zero_vector!(M, trs.η, trs.p) + zero_vector!(M, trs.Y, trs.p) end - # Solve TR subproblem - update options - trs.sub_state.p = trs.p - trs.sub_state.η = trs.η - trs.sub_state.trust_region_radius = trs.trust_region_radius - solve!(mp, trs.sub_state) + # Update the current gradient + get_gradient!(M, trs.X, mho, trs.p) + set_manopt_parameter!(trs.sub_problem, :Manifold, :Basepoint, copy(M, trs.p)) + set_manopt_parameter!(trs.sub_state, :Iterate, copy(M, trs.p, trs.Y)) + set_manopt_parameter!(trs.sub_state, :TrustRegionRadius, trs.trust_region_radius) + solve!(trs.sub_problem, trs.sub_state) # - trs.η = trs.sub_state.η - trs.Hη = trs.sub_state.Hη - - # Initialize the cost function F und the gradient of the cost function - # gradF at the point x - trs.X = trs.sub_state.X - fx = get_cost(mp, trs.p) - # If using randomized approach, compare result with the Cauchy point. - if trs.randomize - norm_grad = norm(M, trs.p, trs.X) + copyto!(M, trs.Y, trs.p, get_solver_result(trs.sub_state)) + f = get_cost(mp, trs.p) + if trs.σ > 0 # randomized approach: compare result with the Cauchy point. + nX = norm(M, trs.p, trs.X) + get_hessian!(M, trs.HY, mho, trs.p, trs.Y) # Check the curvature, - get_hessian!(mp, trs.Hgrad, trs.p, trs.X) - trs.τ = real(inner(M, trs.p, trs.X, trs.Hgrad)) + get_hessian!(mp, trs.HX, trs.p, trs.X) + trs.τ = real(inner(M, trs.p, trs.X, trs.HX)) trs.τ = if (trs.τ <= 0) one(trs.τ) else - min(norm_grad^3 / (trs.trust_region_radius * trs.τ), 1) + min(nX^3 / (trs.trust_region_radius * trs.τ), 1) end # compare to Cauchy point and store best model_value = - fx + - real(inner(M, trs.p, trs.X, trs.η)) + - 0.5 * real(inner(M, trs.p, trs.Hη, trs.η)) + f + + real(inner(M, trs.p, trs.X, trs.Y)) + + 0.5 * real(inner(M, trs.p, trs.HY, trs.Y)) model_value_Cauchy = - fx - trs.τ * trs.trust_region_radius * norm_grad + - 0.5 * trs.τ^2 * trs.trust_region_radius^2 / (norm_grad^2) * - real(inner(M, trs.p, trs.Hgrad, trs.X)) + f - trs.τ * trs.trust_region_radius * nX + + 0.5 * trs.τ^2 * trs.trust_region_radius^2 / (nX^2) * + real(inner(M, trs.p, trs.HX, trs.X)) if model_value_Cauchy < model_value - copyto!(M, trs.η, (-trs.τ * trs.trust_region_radius / norm_grad) * trs.X) - copyto!(M, trs.Hη, (-trs.τ * trs.trust_region_radius / norm_grad) * trs.Hgrad) + copyto!(M, trs.Y, (-trs.τ * trs.trust_region_radius / nX) * trs.X) + copyto!(M, trs.HY, (-trs.τ * trs.trust_region_radius / nX) * trs.HX) end end # Compute the tentative next iterate (the proposal) - retract!(M, trs.p_proposal, trs.p, trs.η, trs.retraction_method) - # Check the performance of the quadratic model against the actual cost. - ρ_reg = max(1, abs(fx)) * eps(Float64) * trs.ρ_regularization - ρnum = fx - get_cost(mp, trs.p_proposal) - ρden = -real(inner(M, trs.p, trs.η, trs.X)) - 0.5 * real(inner(M, trs.p, trs.η, trs.Hη)) + retract!(M, trs.p_proposal, trs.p, trs.Y, trs.retraction_method) + # Compute ρ_k as in (8) of ABG2007 + ρ_reg = max(1, abs(f)) * eps(Float64) * trs.ρ_regularization + ρnum = f - get_cost(mp, trs.p_proposal) + ρden = -real(inner(M, trs.p, trs.Y, trs.X)) - 0.5 * real(inner(M, trs.p, trs.Y, trs.HY)) ρnum = ρnum + ρ_reg ρden = ρden + ρ_reg - ρ = (abs(ρnum / fx) < sqrt(eps(Float64))) ? 1 : ρnum / ρden # stability for small absolute relative model change - + ρ = ρnum / ρden model_decreased = ρden ≥ 0 # Update the Hessian approximation - i.e. really unwrap the original Hessian function # and update it if it is an approximate Hessian. - update_hessian!(M, get_hessian_function(mho, true), trs.p, trs.p_proposal, trs.η) + update_hessian!(M, get_hessian_function(mho, true), trs.p, trs.p_proposal, trs.Y) # Choose the new TR radius based on the model performance. - # If the actual decrease is smaller than reduction_threshold of the predicted decrease, - # then reduce the TR radius. + # Case (a) we performe poorly -> decrease radius if ρ < trs.reduction_threshold || !model_decreased || isnan(ρ) - trs.trust_region_radius /= 4 - elseif ρ > trs.augmentation_threshold / 4 && - ((trs.sub_state.ηPη >= trs.trust_region_radius^2) || (trs.sub_state.δHδ <= 0)) + trs.trust_region_radius *= trs.reduction_factor + elseif ρ > trs.augmentation_threshold && + get_manopt_parameter(trs.sub_state, :TrustRegionExceeded) + # (b) We perform great and exceed/reach the trust region boundary -> increase radius trs.trust_region_radius = min( - 2 * trs.trust_region_radius, trs.max_trust_region_radius + trs.augmentation_factor * trs.trust_region_radius, trs.max_trust_region_radius ) end - # Choose to accept or reject the proposed step based on the model - # performance. Note the strict inequality. - if model_decreased && - (ρ > trs.ρ_prime || (abs((ρnum) / (abs(fx) + 1)) < sqrt(eps(Float64)) && 0 < ρnum)) + # (c) if we decreased and perform well enough -> accept step + if model_decreased && (ρ > trs.acceptance_rate) copyto!(trs.p, trs.p_proposal) + # If we work with approximate hessians -> update base point there update_hessian_basis!(M, get_hessian_function(mho, true), trs.p) end return trs diff --git a/test/plans/test_higher_order_primal_dual_plan.jl b/test/plans/test_higher_order_primal_dual_plan.jl index 35bb5a21bd..702a0bea09 100644 --- a/test/plans/test_higher_order_primal_dual_plan.jl +++ b/test/plans/test_higher_order_primal_dual_plan.jl @@ -151,7 +151,7 @@ using Manopt, Manifolds, ManifoldsBase, Test n = m p0 = deepcopy(data) ξ0 = zero_vector(M, m) - X = log(M, p0, m)# TODO construct tangent vector + X = log(M, p0, m) Ξ = X @testset "test Mutating/Allocation Problem Variants" begin diff --git a/test/plans/test_stopping_criteria.jl b/test/plans/test_stopping_criteria.jl index fd2e729ffb..f0267dd95d 100644 --- a/test/plans/test_stopping_criteria.jl +++ b/test/plans/test_stopping_criteria.jl @@ -26,7 +26,7 @@ struct DummyStoppingCriterion <: StoppingCriterion end sn = StopWhenAny(StopAfterIteration(10), s3) @test !Manopt.indicates_convergence(sn) # since it might stop after 10 iters @test startswith(repr(sn), "StopWhenAny with the") - sn2 = StopWhenAny([StopAfterIteration(10), s3]) + sn2 = StopAfterIteration(10) | s3 @test get_stopping_criteria(sn)[1].maxIter == get_stopping_criteria(sn2)[1].maxIter @test get_stopping_criteria(sn)[2].threshold == get_stopping_criteria(sn2)[2].threshold # then s3 is the only active one @@ -80,7 +80,7 @@ end @test typeof(d) === typeof((a & b) & c) update_stopping_criterion!(d, :MinIterateChange, 1e-8) @test d.criteria[2].threshold == 1e-8 - e = StopWhenAny(a, b, c) + e = a | b | c @test typeof(e) === typeof(a | b | c) @test typeof(e) === typeof(a | (b | c)) @test typeof(e) === typeof((a | b) | c) @@ -128,13 +128,13 @@ end ho = ManifoldHessianObjective(x -> x, (M, x) -> x, (M, x) -> x, x -> x) hp = DefaultManoptProblem(Euclidean(), ho) tcgs = TruncatedConjugateGradientState( - Euclidean(), 1.0, 0.0; trust_region_radius=2.0, randomize=false + TangentSpace(Euclidean(), 1.0), 0.0; trust_region_radius=2.0, randomize=false ) - tcgs.new_model_value = 2.0 tcgs.model_value = 1.0 s = StopWhenModelIncreased() @test !s(hp, tcgs, 0) @test s.reason == "" + s.model_value = 0.5 # twweak @test s(hp, tcgs, 1) @test length(s.reason) > 0 s2 = StopWhenCurvatureIsNegative() diff --git a/test/solvers/test_adaptive_regularization_with_cubics.jl b/test/solvers/test_adaptive_regularization_with_cubics.jl index 2cd1802424..a685203e14 100644 --- a/test/solvers/test_adaptive_regularization_with_cubics.jl +++ b/test/solvers/test_adaptive_regularization_with_cubics.jl @@ -8,7 +8,6 @@ include("../utils/example_tasks.jl") n = 8 k = 3 A = Symmetric(randn(n, n)) - M = Grassmann(n, k) f(M, p) = -0.5 * tr(p' * A * p) @@ -16,22 +15,34 @@ include("../utils/example_tasks.jl") Hess_f(M, p, X) = -A * X + p * p' * A * X + X * p' * A * p p0 = Matrix{Float64}(I, n, n)[:, 1:k] - M2 = TangentSpace(M, p0) - + M2 = TangentSpace(M, copy(M, p0)) mho = ManifoldHessianObjective(f, grad_f, Hess_f) - g = AdaptiveRegularizationCubicCost(M2, mho) - grad_g = AdaptiveRegularizationCubicGrad(M2, mho) + arcmo = AdaptiveRagularizationWithCubicsModelObjective(mho) + + @testset "Accessors for the Objective" begin + isapprox( + M, p0, Manopt.get_objective_gradient(M, arcmo, p0), get_gradient(M, mho, p0) + ) + X0 = zero_vector(M, p0) + Manopt.get_objective_gradient!(M, X0, arcmo, p0) + isapprox(M, p0, X0, get_gradient(M, mho, p0)) + + g = get_gradient_function(arcmo) + isapprox(M, p0, g(M2, p0), get_gradient(M, mho, p0)) + X0 = zero_vector(M, p0) + X1 = similar(X0) + Manopt.get_objective_preconditioner!(M, X1, arcmo, p0, X0) + isapprox(M, p0, X1, get_preconditioner(M, mho, p0, X0)) + end @testset "State and repr" begin # if we neither provide a problem nor an objective, we expect an error @test_throws ErrorException AdaptiveRegularizationState(ℝ^2) - g = AdaptiveRegularizationCubicCost(M2, mho) - grad_g = AdaptiveRegularizationCubicGrad(M2, mho) arcs = AdaptiveRegularizationState( M, p0; - sub_problem=DefaultManoptProblem(M2, ManifoldGradientObjective(g, grad_g)), + sub_problem=DefaultManoptProblem(M2, arcmo), sub_state=GradientDescentState(M2, zero_vector(M, p0)), ) @test startswith( @@ -47,8 +58,8 @@ include("../utils/example_tasks.jl") arcs2 = AdaptiveRegularizationState( M, p0; - sub_objective=mho, - sub_state=LanczosState(M; maxIterLanczos=1), + sub_objective=arcmo, + sub_state=LanczosState(M2; maxIterLanczos=1), stopping_criterion=StopWhenAllLanczosVectorsUsed(1), ) #add a fake Lanczos @@ -58,9 +69,10 @@ include("../utils/example_tasks.jl") @test stop_solver!(arcs2.sub_problem, arcs2, 1) arcs3 = AdaptiveRegularizationState( - M, p0; sub_objective=mho, sub_state=LanczosState(M; maxIterLanczos=2) + M, p0; sub_objective=arcmo, sub_state=LanczosState(M2; maxIterLanczos=2) ) #add a fake Lanczos + initialize_solver!(arcs3.sub_problem, arcs3.sub_state) push!(arcs3.sub_state.Lanczos_vectors, copy(M, p1, X1)) step_solver!(arcs3.sub_problem, arcs3.sub_state, 2) # to introduce a random new one # test orthogonality of the new 2 ones @@ -76,7 +88,7 @@ include("../utils/example_tasks.jl") ) # a second that copies arcs4 = AdaptiveRegularizationState( - M, p0; sub_objective=mho, sub_state=LanczosState(M; maxIterLanczos=2) + M, p0; sub_objective=arcmo, sub_state=LanczosState(M2; maxIterLanczos=2) ) #add a fake Lanczos push!(arcs4.sub_state.Lanczos_vectors, copy(M, p1, X1)) @@ -112,6 +124,9 @@ include("../utils/example_tasks.jl") r = copy(M, p1) Manopt.solve_arc_subproblem!(M, r, f1!, InplaceEvaluation(), p0) @test r == p0 + # Dummy construction with a function for the sub_problem + arcs4 = AdaptiveRegularizationState(M, p0; sub_problem=f1) + @test arcs4.sub_state isa AbstractEvaluationType end @testset "A few solver runs" begin @@ -163,12 +178,12 @@ include("../utils/example_tasks.jl") # test both inplace and allocating variants of grad_g X0 = grad_f(M, p0) - X1 = grad_g(M2, X0) + X1 = get_gradient(M2, arcmo, X0) X2 = zero_vector(M, p0) - grad_g(M2, X2, X0) + get_gradient!(M2, X2, arcmo, X0) @test isapprox(M, p0, X1, X2) - sub_problem = DefaultManoptProblem(M2, ManifoldGradientObjective(g, grad_g)) + sub_problem = DefaultManoptProblem(M2, arcmo) sub_state = GradientDescentState( M2, zero_vector(M, p0); @@ -191,6 +206,7 @@ include("../utils/example_tasks.jl") ) @test isapprox(M, p1, q3) end + @testset "A short solver run on the circle" begin Mc, fc, grad_fc, pc0, pc_star = Circle_mean_task() hess_fc(Mc, p, X) = 1.0 diff --git a/test/solvers/test_difference_of_convex.jl b/test/solvers/test_difference_of_convex.jl index 145c19f9f8..b9195e699f 100644 --- a/test/solvers/test_difference_of_convex.jl +++ b/test/solvers/test_difference_of_convex.jl @@ -62,8 +62,10 @@ import Manifolds: inner dcppa_sub_problem = DefaultManoptProblem(M, dcppa_sub_objective) dcppa_sub_state = GradientDescentState(M, copy(M, p0)) - dcps = DifferenceOfConvexProximalState( - M, copy(M, p0), dcppa_sub_problem, dcppa_sub_objective + dcps = DifferenceOfConvexProximalState( #Initialize with random point + M, + dcppa_sub_problem, + dcppa_sub_objective, ) set_iterate!(dcps, M, p1) @test dcps.p == p1 @@ -115,7 +117,7 @@ import Manifolds: inner @test Manopt.get_message(s1) == "" # no message in last step @test isapprox(M, p1, p2) @test isapprox(M, p2, p3) - @test isapprox(f(M, p1), 0.0; atol=2e-16) + @test isapprox(f(M, p1), 0.0; atol=1e-8) # not provided grad_g or problem nothing @test_throws ErrorException difference_of_convex_algorithm( M, f, g, grad_h, p0; sub_problem=nothing @@ -154,12 +156,12 @@ import Manifolds: inner @test isapprox(M, p4, p5) @test isapprox(M, p5, p6) @test isapprox(f(M, p5b), 0.0; atol=2e-16) # bit might be a different min due to rand - @test isapprox(f(M, p5c), 0.0; atol=1e-9) # might be a bit imprecise - @test isapprox(f(M, p4), 0.0; atol=2e-16) + @test isapprox(f(M, p5c), 0.0; atol=1e-10) + @test isapprox(f(M, p4), 0.0; atol=1e-14) Random.seed!(23) p7 = difference_of_convex_algorithm(M, f, g, grad_h; grad_g=grad_g) - @test isapprox(f(M, p7), 0.0; atol=2e-16) + @test isapprox(f(M, p7), 0.0; atol=1e-8) p8 = copy(M, p0) # Same call as p2 inplace difference_of_convex_algorithm!(M, f, g, grad_h, p8; grad_g=grad_g) @@ -218,7 +220,7 @@ import Manifolds: inner trust_regions!(M, prox, grad_prox, hess_prox, q) return q end - @test_logs (:warn,) difference_of_convex_proximal_point(M, prox_g, grad_h, p0;) + # @test_logs (:warn,) difference_of_convex_proximal_point(M, prox_g, grad_h, p0;) p13 = difference_of_convex_proximal_point(M, grad_h, p0; prox_g=prox_g) p13b = copy(M, p0) difference_of_convex_proximal_point!(M, grad_h, p13b; prox_g=prox_g) @@ -231,9 +233,9 @@ import Manifolds: inner trust_regions!(M, prox, grad_prox, hess_prox, q) return q end - @test_logs (:warn,) difference_of_convex_proximal_point( - M, prox_g!, grad_h!, p0; evaluation=InplaceEvaluation() - ) + #@test_logs (:warn,) difference_of_convex_proximal_point( + # M, prox_g!, grad_h!, p0; evaluation=InplaceEvaluation() + #) p14 = difference_of_convex_proximal_point( M, grad_h!, p0; prox_g=prox_g!, evaluation=InplaceEvaluation() ) diff --git a/test/solvers/test_gradient_descent.jl b/test/solvers/test_gradient_descent.jl index fe899f8a07..de84700e19 100644 --- a/test/solvers/test_gradient_descent.jl +++ b/test/solvers/test_gradient_descent.jl @@ -89,9 +89,7 @@ using Manopt, Manifolds, Test, Random f, grad_f, data[1]; - stopping_criterion=StopWhenAny( - StopAfterIteration(1000), StopWhenChangeLess(1e-16) - ), + stopping_criterion=StopAfterIteration(1000) | StopWhenChangeLess(1e-16), direction=Nesterov(M, data[1]), ) @test isapprox(M, p, p6; atol=1e-13) diff --git a/test/solvers/test_primal_dual_semismooth_Newton.jl b/test/solvers/test_primal_dual_semismooth_Newton.jl index 1df5b6b8c1..f0c3583168 100644 --- a/test/solvers/test_primal_dual_semismooth_Newton.jl +++ b/test/solvers/test_primal_dual_semismooth_Newton.jl @@ -85,7 +85,7 @@ using Manopt, Manifolds, ManifoldsBase, Test return Y elseif p == 2 norms = norm.(Ref(N.manifold), x, Ξ) - norms_ = sqrt.(sum(norms .^ 2; dims=length(pdims))) # TODO check size + norms_ = sqrt.(sum(norms .^ 2; dims=length(pdims))) for i in R # iterate over all pixel for k in 1:d # for all direction combinations diff --git a/test/solvers/test_truncated_cg.jl b/test/solvers/test_truncated_cg.jl index 35b63f9fa5..a64fbe3b3e 100644 --- a/test/solvers/test_truncated_cg.jl +++ b/test/solvers/test_truncated_cg.jl @@ -4,7 +4,7 @@ using Manifolds, Manopt, ManifoldsBase, Test M = Grassmann(3, 2) p = [1.0 0.0; 0.0 1.0; 0.0 0.0] η = zero_vector(M, p) - s = TruncatedConjugateGradientState(M, p, η) + s = TruncatedConjugateGradientState(TangentSpace(M, p), η) @test startswith( repr(s), "# Solver state for `Manopt.jl`s Truncated Conjugate Gradient Descent\n" ) diff --git a/test/solvers/test_trust_regions.jl b/test/solvers/test_trust_regions.jl index 66e6c4e873..f74b64c5ff 100644 --- a/test/solvers/test_trust_regions.jl +++ b/test/solvers/test_trust_regions.jl @@ -1,4 +1,4 @@ -using Manifolds, Manopt, Test, LinearAlgebra +using Manifolds, Manopt, Test, LinearAlgebra, Random include("trust_region_model.jl") include("../utils/example_tasks.jl") @@ -13,7 +13,6 @@ include("../utils/example_tasks.jl") p[:, :, 1] = [1.0 0.0; 0.0 1.0; 0.0 0.0] p[:, :, 2] = [0.0 0.0; 1.0 0.0; 0.0 1.0] - @test_throws ErrorException trust_regions(M, f, rgrad, rhess, p; ρ_prime=0.3) @test_throws ErrorException trust_regions( M, f, rgrad, rhess, p; max_trust_region_radius=-0.1 ) @@ -23,7 +22,33 @@ include("../utils/example_tasks.jl") @test_throws ErrorException trust_regions( M, f, rgrad, rhess, p; max_trust_region_radius=0.1, trust_region_radius=0.11 ) - + @testset "State Constructors" begin + X = rgrad(M, p) + TpM = TangentSpace(M, copy(M, p)) + mho = ManifoldHessianObjective(f, rgrad, rhess) + sub_problem = DefaultManoptProblem(TpM, TrustRegionModelObjective(mho)) + sub_state = TruncatedConjugateGradientState(TpM, get_gradient(M, mho, p)) + trs1 = TrustRegionsState(M, sub_problem) + trs2 = TrustRegionsState(M, sub_problem, sub_state) + trs3 = TrustRegionsState(M, p, sub_problem) + end + @testset "Objective accessors" begin + mho = ManifoldHessianObjective(f, rgrad, rhess) + X = rgrad(M, p) + TpM = TangentSpace(M, copy(M, p)) + trmo = TrustRegionModelObjective(mho) + c = f(M, p) + g = inner(M, p, X, X) + H = rhess(M, p, X) + @test get_cost(TpM, trmo, X) == c + g + 1 / 2 * inner(M, p, H, X) + @test get_gradient(TpM, trmo, X) == X + H + Y = similar(X) + get_gradient!(TpM, Y, trmo, X) + @test Y == X + H + @test get_hessian(TpM, trmo, Y, X) == H + get_hessian!(TpM, Y, trmo, Y, X) + @test Y == H + end @testset "Allocating Variant" begin s = trust_regions( M, f, rgrad, rhess, p; max_trust_region_radius=8.0, return_state=true @@ -35,6 +60,7 @@ include("../utils/example_tasks.jl") @test norm(M, p, get_gradient(s)) ≈ 0.0 trust_regions!(M, f, rgrad, rhess, q; max_trust_region_radius=8.0) @test isapprox(M, p1, q) + Random.seed!(42) p2 = trust_regions( M, f, rgrad, rhess, p; max_trust_region_radius=8.0, randomize=true ) @@ -63,8 +89,6 @@ include("../utils/example_tasks.jl") end @testset "TCG" begin X = zero_vector(M, p) - - @test_logs (:warn,) truncated_conjugate_gradient_descent(M, f, rgrad, p, X, rhess) Y = truncated_conjugate_gradient_descent( M, f, rgrad, rhess, p, X; trust_region_radius=0.5 ) @@ -112,7 +136,6 @@ include("../utils/example_tasks.jl") Y7 = copy(M, p, X) truncated_conjugate_gradient_descent!(M, f, rgrad, p, Y7; trust_region_radius=0.5) @test Y7 != X - @test_logs (:warn,) truncated_conjugate_gradient_descent!(M, f, rgrad, p, Y4, rhess) end @testset "Mutating" begin g = RGrad(M, A) @@ -206,9 +229,7 @@ include("../utils/example_tasks.jl") M, p, grad_f!; nu=eps(Float64)^2, evaluation=InplaceEvaluation() ), p; - stopping_criterion=StopWhenAny( - StopAfterIteration(10000), StopWhenGradientNormLess(10^(-6)) - ), + stopping_criterion=StopAfterIteration(100) | StopWhenGradientNormLess(1e-8), trust_region_radius=1.0, θ=0.1, κ=0.9, @@ -224,9 +245,8 @@ include("../utils/example_tasks.jl") grad_f, ApproxHessianSymmetricRankOne(M, qaHSR1_2, grad_f; nu=eps(Float64)^2), qaHSR1_2; - stopping_criterion=StopWhenAny( - StopAfterIteration(10000), StopWhenGradientNormLess(10^(-6)) - ), + stopping_criterion=StopAfterIteration(10000) | + StopWhenGradientNormLess(1e-6), trust_region_radius=1.0, θ=0.1, κ=0.9, @@ -240,9 +260,8 @@ include("../utils/example_tasks.jl") grad_f, ApproxHessianBFGS(M, p, grad_f), p; - stopping_criterion=StopWhenAny( - StopAfterIteration(10000), StopWhenGradientNormLess(10^(-6)) - ), + stopping_criterion=StopAfterIteration(10000) | + StopWhenGradientNormLess(1e-6), trust_region_radius=1.0, θ=0.1, κ=0.9, @@ -257,9 +276,8 @@ include("../utils/example_tasks.jl") grad_f, ApproxHessianBFGS(M, qaHBFGS_2, grad_f), qaHBFGS_2; - stopping_criterion=StopWhenAny( - StopAfterIteration(10000), StopWhenGradientNormLess(10^(-6)) - ), + stopping_criterion=StopAfterIteration(10000) | + StopWhenGradientNormLess(1e-6), trust_region_radius=1.0, θ=0.1, κ=0.9, @@ -302,9 +320,8 @@ include("../utils/example_tasks.jl") M, qaHSR1_3, grad_f!; nu=eps(Float64)^2, evaluation=InplaceEvaluation() ), qaHSR1_3; - stopping_criterion=StopWhenAny( - StopAfterIteration(10000), StopWhenGradientNormLess(10^(-6)) - ), + stopping_criterion=StopAfterIteration(10000) | + StopWhenGradientNormLess(1e-6), trust_region_radius=1.0, θ=0.1, κ=0.9, @@ -320,9 +337,8 @@ include("../utils/example_tasks.jl") grad_f!, ApproxHessianBFGS(M, qaHBFGS_3, grad_f!; evaluation=InplaceEvaluation()), qaHBFGS_3; - stopping_criterion=StopWhenAny( - StopAfterIteration(10000), StopWhenGradientNormLess(10^(-6)) - ), + stopping_criterion=StopAfterIteration(10000) | + StopWhenGradientNormLess(1e-6), trust_region_radius=1.0, θ=0.1, κ=0.9, @@ -362,24 +378,22 @@ include("../utils/example_tasks.jl") end @testset "Euclidean Embedding" begin Random.seed!(42) - n = 5 + n = 2 A = Symmetric(randn(n + 1, n + 1)) # Euclidean variant with conversion M = Sphere(n) - p0 = rand(M) + p0 = [1.0, zeros(n)...] f(E, p) = p' * A * p ∇f(E, p) = A * p ∇²f(M, p, X) = A * X λ = min(eigvals(A)...) - q = trust_regions(M, f, ∇f, p0; objective_type=:Euclidean) - q2 = trust_regions(M, f, ∇f, ∇²f, p0; objective_type=:Euclidean) - @test λ ≈ f(M, q) - @test λ ≈ f(M, q2) + q = trust_regions(M, f, ∇f, p0; objective_type=:Euclidean, (project!)=project!) + @test f(M, q) ≈ λ atol = 1 * 1e-1 # a bit inprecise? grad_f(M, p) = A * p - (p' * A * p) * p Hess_f(M, p, X) = A * X - (p' * A * X) .* p - (p' * A * p) .* X q3 = trust_regions(M, f, grad_f, p0) q4 = trust_regions(M, f, grad_f, Hess_f, p0) - @test λ ≈ f(M, q3) atol = 2e-1 # Riemannian Hessian a bit imprecise? - @test λ ≈ f(M, q4) + @test f(M, q3) ≈ λ atol = 5 * 1e-8 + @test f(M, q4) ≈ λ atol = 5 * 1e-10 end end diff --git a/test/solvers/trust_region_model.jl b/test/solvers/trust_region_model.jl index df3e67a255..d28c8c691b 100644 --- a/test/solvers/trust_region_model.jl +++ b/test/solvers/trust_region_model.jl @@ -1,3 +1,4 @@ +using Manifolds A = [1.0 2.0 3.0; 4.0 5.0 6.0; 7.0 8.0 9.0] f(M::PowerManifold, p) = -0.5 * norm(transpose(p[M, 1]) * A * p[M, 2])^2 diff --git a/tutorials/ConstrainedOptimization.qmd b/tutorials/ConstrainedOptimization.qmd index 4088f217bc..339b2934f6 100644 --- a/tutorials/ConstrainedOptimization.qmd +++ b/tutorials/ConstrainedOptimization.qmd @@ -254,7 +254,7 @@ maximum(g(M, w1)) ````{=commonmark} ```@bibliography -Pages = ["tutorials/ConstrainedOptimization.md"] +Pages = ["ConstrainedOptimization.md"] Canonical=false ``` ```` diff --git a/tutorials/EmbeddingObjectives.qmd b/tutorials/EmbeddingObjectives.qmd index d1738953af..dada11ba4a 100644 --- a/tutorials/EmbeddingObjectives.qmd +++ b/tutorials/EmbeddingObjectives.qmd @@ -193,7 +193,7 @@ to a Riemannian one. ````{=commonmark} ```@bibliography -Pages = ["tutorials/EmbeddingObjectives.md"] +Pages = ["EmbeddingObjectives.md"] Canonical=false ``` ```` diff --git a/tutorials/GeodesicRegression.qmd b/tutorials/GeodesicRegression.qmd index 41b3f44762..4f9906784e 100644 --- a/tutorials/GeodesicRegression.qmd +++ b/tutorials/GeodesicRegression.qmd @@ -515,7 +515,7 @@ Note that the geodesics from the data to the regression geodesic meet at a nearl ````{=commonmark} ```@bibliography -Pages = ["tutorials/GeodesicRegression.md"] +Pages = ["GeodesicRegression.md"] Canonical=false ``` ```` diff --git a/tutorials/Optimize!.qmd b/tutorials/Optimize!.qmd index 436716d6d7..743aec7e7f 100644 --- a/tutorials/Optimize!.qmd +++ b/tutorials/Optimize!.qmd @@ -223,7 +223,7 @@ plot(x,y,xaxis=:log, label="CPPA Cost") ````{=commonmark} ```@bibliography -Pages = ["tutorials/Optimize!.md"] +Pages = ["Optimize!.md"] Canonical=false ``` ```` \ No newline at end of file