cMHN 1.2
C++ library for learning MHNs with pRC
Loading...
Searching...
No Matches
lbfgsb.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: BSD-2-Clause
2
3// References:
4// Authors: Richard H. Byrd, Peihuang Lu, Jorge Nocedal, Ciyou Zhu
5// Title: A Limited Memory Algorithm for Bound Constrained Optimization
6// Year: 1995
7// URL: https://doi.org/10.1137/0916069
8
9#ifndef pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H
10#define pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H
11
20
21namespace pRC::Optimizer
22{
23 template<class LS = LineSearch::MoreThuente, Size M = 5>
24 class LBFGSB
25 {
26 private:
27 static constexpr Size defaultMaxIterations()
28 {
29 return 1000;
30 }
31
32 template<class X, class G, class L, class U, class T>
33 static constexpr auto projectedGradientConverged(X const &x, G const &g,
34 L const &lowerBound, U const &upperBound, T const &tolerance)
35 {
36 auto const projG = where(g < zero(), max(x - upperBound, g),
37 min(x - lowerBound, g));
38
39 auto const infNorm = norm<2, 0>(projG)();
40
41 return infNorm <= tolerance * identity<T>(1e-3);
42 }
43
44 template<class F, class T>
45 static constexpr auto valueConverged(F const &f0, F const &f,
46 T const &tolerance)
47 {
48 auto const scale = max(abs(f0), abs(f), identity<T>());
49
50 return delta(f0, f) <= tolerance * scale;
51 }
52
53 template<class F>
54 static constexpr auto valueDiverged(F const &f)
55 {
56 return !isFinite(f);
57 }
58
59 template<class F>
60 static constexpr auto valueIncreased(F const &f0, F const &f)
61 {
62 return f > f0;
63 }
64
65 template<class S, class Y, class STY, class J, class T>
66 static constexpr auto resetHistory(S &s, Y &y, STY &sTy, J &j, T &theta)
67 {
68 s.clear();
69 y.clear();
70 sTy = zero();
71 j = zero();
72 theta = identity();
73
74 return;
75 }
76
77 template<class T>
78 static constexpr auto applyMiddleMatrix(Tensor<T, M, M> const &sTy,
79 Tensor<T, M, M> const &j, Tensor<T, 2 * M> const &v,
80 Size const size)
81 {
82 auto v1 = block<2>(v, 0);
83 auto v2 = block<2>(v, 1);
84
86 auto p1 = block<2>(p, 0);
87 auto p2 = block<2>(p, 1);
88
89 p1 = -v1 / extractDiagonal(sTy);
90 p2 = v2;
91 for(Index k = 0; k < size; ++k)
92 {
93 for(Index i = k + 1; i < size; ++i)
94 {
95 p2(i) -= sTy(i, k) * p1(k);
96 }
97 }
98
101
102 for(Index i = 0; i < size; ++i)
103 {
104 auto const scale = rcp(sTy(i, i));
105 for(Index k = i + 1; k < size; ++k)
106 {
107 p1(i) += sTy(k, i) * p2(k) * scale;
108 }
109 }
110
111 return p;
112 }
113
114 template<class X, class G, class LB, class UB, class T, class S,
115 class Y, class STY, class J, class C>
116 static constexpr auto computeGeneralizedCauchyPoint(X const &x,
117 G const &g, LB const &lowerBound, UB const &upperBound,
118 T const &theta, S const &s, Y const &y, STY const &sTy, J const &j,
119 C &c)
120 {
121 using TX = typename X::Type;
122
123 Tensor const t = where(g == zero(),
125 where(g < zero(), (x - upperBound) / g, (x - lowerBound) / g));
126
127 Tensor d = where(t == zero(), zero<G>(), -g);
128
129 Array<Index, X::size()> sortedIndices;
130 for(Index i = 0; i < X::size(); ++i)
131 {
132 sortedIndices[i] = i;
133 }
134
135 sort(
136 [&t](auto const i, auto const j)
137 {
138 return t[i] < t[j];
139 },
140 sortedIndices);
141
143 auto p0 = block<2>(p, 0);
144 auto p1 = block<2>(p, 1);
145 for(Index m = 0; m < s.size(); ++m)
146 {
147 p0(m) = scalarProduct(y.front(m), d)();
148 p1(m) = theta * scalarProduct(s.front(m), d)();
149 }
150
151 auto f1 = -norm<2, 1>(d)();
152
153 auto const mp = applyMiddleMatrix(sTy, j, p, s.size());
154 auto f2 = -theta * f1 - scalarProduct(p, mp)();
155
156 auto deltaTMin = -f1 / f2;
157
158 Index sorted = 0;
159 auto b = sortedIndices[sorted];
160 while(t[b] <= zero())
161 {
162 d[b] = zero();
163 ++sorted;
164 b = sortedIndices[sorted];
165 }
166
167 auto deltaT = t[b];
168
169 Tensor xCP = x;
170 c = zero();
171 TX t0 = zero();
172 while(deltaTMin >= deltaT)
173 {
174 if(d[b] > zero())
175 {
176 xCP[b] = upperBound[b];
177 }
178
179 if(d[b] < zero())
180 {
181 xCP[b] = lowerBound[b];
182 }
183
184 auto const z = xCP[b] - x[b];
185
186 c += deltaT * p;
187
188 f1 += deltaT * f2 + square(d[b]) - theta * d[b] * z;
189 f2 -= theta * square(d[b]);
190
192 auto w0 = block<2>(w, 0);
193 auto w1 = block<2>(w, 1);
194 for(Index m = 0; m < s.size(); ++m)
195 {
196 w0(m) = y.front(m)[b];
197 w1(m) = theta * s.front(m)[b];
198 }
199
200 auto wM = applyMiddleMatrix(sTy, j, w, s.size());
201
202 f1 += d[b] * scalarProduct(wM, c)();
203 f2 += identity<TX>(2) * d[b] * scalarProduct(wM, p)() -
204 square(d[b]) * scalarProduct(wM, w)();
205
206 p -= d[b] * wM;
207
208 deltaTMin = -f1 / f2;
209 d[b] = zero();
210 t0 = t[b];
211 ++sorted;
212 if(sorted < X::size())
213 {
214 b = sortedIndices[sorted];
215 deltaT = t[b] - t0;
216 }
217 else
218 {
219 break;
220 }
221 }
222
223 deltaTMin = max(deltaTMin, zero<TX>());
224 c += deltaTMin * p;
225
226 auto const scale = t0 + deltaTMin;
227
228 return xCP += scale * d;
229 }
230
231 template<class X, class CP, class G, class LB, class UB, class T,
232 class S, class Y, class STY, class J, class C>
233 static constexpr auto minimizeSubspace(X const &x, CP &xCP, G const &g,
234 LB const &lowerBound, UB const &upperBound, T const &theta,
235 S const &s, Y const &y, STY const &sTy, J const &j, C const &c)
236 {
237 using TX = typename X::Type;
238
239 Tensor r = -theta * (xCP - x) - g;
240 Tensor const mc = applyMiddleMatrix(sTy, j, c, s.size());
241 for(Index m = 0; m < s.size(); ++m)
242 {
243 r += y.front(m) * mc(m);
244 r += s.front(m) * theta * mc(M + m);
245 }
246
247 Size nFree = 0;
248 Array<Index, X::size()> freeIndices;
249 for(Index i = 0; i < X::size(); ++i)
250 {
251 if((xCP != upperBound && xCP != lowerBound)[i])
252 {
253 freeIndices[nFree++] = i;
254 }
255 }
256
257 decltype(r) zTr = zero();
258 for(Index f = 0; f < nFree; ++f)
259 {
260 zTr[f] = r[freeIndices[f]];
261 }
262
263 Deque<X, M> zTs;
264 for(Index m = 0; m < s.size(); ++m)
265 {
266 zTs.emplaceBack(zero<X>());
267 for(Index f = 0; f < nFree; ++f)
268 {
269 zTs.back()[f] = s.front(m)[freeIndices[f]];
270 }
271 }
272
273 Deque<X, M> zTy;
274 for(Index m = 0; m < y.size(); ++m)
275 {
276 zTy.emplaceBack(zero<X>());
277 for(Index f = 0; f < nFree; ++f)
278 {
279 zTy.back()[f] = y.front(m)[freeIndices[f]];
280 }
281 }
282
284 auto l00 = block<2, 2>(l, 0, 0);
285 auto l10 = block<2, 2>(l, 1, 0);
286 auto l01 = block<2, 2>(l, 0, 1);
287 auto l11 = block<2, 2>(l, 1, 1);
288
289 {
290 Tensor<TX, M, M> yTzzTy = zero();
291 for(Index m = 0; m < zTy.size(); ++m)
292 {
293 for(Index n = 0; n < zTy.size(); ++n)
294 {
295 yTzzTy(m, n) =
296 scalarProduct(zTy.front(m), zTy.front(n))();
297 }
298 }
299
300 l00 = cholesky(diagonal(sTy) + yTzzTy / theta);
301 }
302
303 l10 = zero();
304
305 {
306 Tensor<TX, M, M> yTzzTs = zero();
307 for(Index m = 0; m < zTy.size(); ++m)
308 {
309 for(Index n = 0; n < zTs.size(); ++n)
310 {
311 yTzzTs(m, n) =
312 scalarProduct(zTy.front(m), zTs.front(n))();
313 }
314 }
315
316 auto const rhs =
317 -transpose(strictlyLowerTriangular(sTy)) + yTzzTs;
319 }
320
321 {
322 Tensor<TX, M, M> sTaaTs = zero();
323 for(Index m = 0; m < s.size(); ++m)
324 {
325 for(Index n = 0; n < s.size(); ++n)
326 {
327 sTaaTs(m, n) = (scalarProduct(s.front(m), s.front(n)) -
328 scalarProduct(zTs.front(m), zTs.front(n)))();
329 }
330 }
331
332 l11 = cholesky(theta * sTaaTs + transpose(l01) * l01);
333 }
334
335 Tensor<TX, 2 * M> rTw = zero();
336 auto rTw0 = block<2>(rTw, 0);
337 auto rTw1 = block<2>(rTw, 1);
338
339 for(Index m = 0; m < zTy.size(); ++m)
340 {
341 rTw0(m) = scalarProduct(zTr, zTy.front(m))();
342 }
343 for(Index m = 0; m < zTs.size(); ++m)
344 {
345 rTw1(m) = theta * scalarProduct(zTr, zTs.front(m))();
346 }
347
349 rTw0 = -rTw0;
351
352 Tensor d = zero<X>();
353 for(Index m = 0; m < zTy.size(); ++m)
354 {
355 d += zTy.front(m) * rTw0(m);
356 d += zTs.front(m) * rTw1(m) * theta;
357 }
358
359 auto const rTheta = rcp(theta);
360 d = (zTr + d * rTheta) * rTheta;
361
362 {
363 Tensor xCPcopy = xCP;
364
365 for(Index f = 0; f < nFree; ++f)
366 {
367 auto const i = freeIndices[f];
368 xCP[i] =
369 min(upperBound[i], max(lowerBound[i], xCP[i] + d[f]));
370 }
371
372 xCP -= x;
373 auto const dd = scalarProduct(xCP, g)();
374
375 if(dd < zero())
376 {
377 return dd;
378 }
379
380 xCP = xCPcopy;
381 }
382
383 TX alpha = identity();
384 for(Index f = 0; f < nFree; ++f)
385 {
386 auto const i = freeIndices[f];
387 if(d[f] > zero())
388 {
389 alpha = min(alpha, (upperBound[i] - xCP[i]) / d[f]);
390 }
391 if(d[f] < zero())
392 {
393 alpha = min(alpha, (lowerBound[i] - xCP[i]) / d[f]);
394 }
395 }
396
397 for(Index f = 0; f < nFree; ++f)
398 {
399 xCP[freeIndices[f]] += alpha * d[f];
400 }
401
402 xCP -= x;
403 return scalarProduct(xCP, g)();
404 }
405
406 template<class X, class P, class LB, class UB>
407 static constexpr auto getLineSearchParameters(X const &x, P const &p,
408 LB const &lowerBound, UB const &upperBound)
409 {
410 using TX = typename X::Type;
411
412 auto alphaMax = NumericLimits<TX>::max();
413
414 for(Index i = 0; i < X::size(); ++i)
415 {
416 if(p[i] < zero())
417 {
418 auto const d = lowerBound[i] - x[i];
419 if(d > zero())
420 {
421 alphaMax = zero();
422 }
423 else
424 {
425 if(p[i] * alphaMax < d)
426 {
427 alphaMax = d / p[i];
428 }
429 }
430 }
431 if(p[i] > zero())
432 {
433 auto const d = upperBound[i] - x[i];
434 if(d < zero())
435 {
436 alphaMax = zero();
437 }
438 else
439 {
440 if(p[i] * alphaMax > d)
441 {
442 alphaMax = d / p[i];
443 }
444 }
445 }
446 }
447
448 auto const alpha = min(rcp(norm<2>(p))(), alphaMax);
449
450 return Tuple<TX, TX, TX>(alpha, zero<TX>(), alphaMax);
451 }
452
453 public:
454 constexpr LBFGSB(LS const &lineSearch,
455 Size const maxIterations = defaultMaxIterations())
456 : mLineSearch(lineSearch)
457 , mMaxIterations(maxIterations)
458 {
459 }
460
461 constexpr LBFGSB(Size const maxIterations = defaultMaxIterations())
462 : mMaxIterations(maxIterations)
463 {
464 }
465
466 constexpr auto &lineSearch() const
467 {
468 return mLineSearch;
469 }
470
471 constexpr auto maxIterations() const
472 {
473 return mMaxIterations;
474 }
475
476 template<class XX, class FF, class FC,
479 IsFloat VX = Value<RX>,
480 class XL = decltype(unit<RX>(NumericLimits<VX>::lowest())),
482 class XU = decltype(unit<RX>(NumericLimits<VX>::max())),
487 IsInvocable<FC, RXE const &> && (RX::Dimension == RL::Dimension) &&
488 (RX::Dimension == RU::Dimension) &&
493 RXE(RXE const &), RXE const &, VX, VX, VX>
494 inline constexpr auto operator()(XX &&x0, FF &&function, FC &&callback,
495 VT const &tolerance = NumericLimits<VT>::tolerance(),
496 XL &&lowerBound = unit<RL>(NumericLimits<VL>::lowest()),
497 XU &&upperBound = unit<RU>(NumericLimits<VU>::max())) const
498 {
499 using TX = typename RX::Type;
500
501 if constexpr(cDebugLevel >= DebugLevel::High)
502 {
503 if(reduce<LogicalOr>(lowerBound > upperBound)())
504 {
505 Logging::error("L-BFGS-B: Lower bound > upper bound.");
506 }
507 }
508
509 if constexpr(cDebugLevel >= DebugLevel::Low)
510 {
511 if(reduce<LogicalOr>(x0 < lowerBound || x0 > upperBound)())
512 {
514 "L-BFGS-B: Initial parameters not in range "
515 "[lowerBound, "
516 "upperBound]");
517 }
518 }
519
520 decltype(auto) x =
521 copy<!(!IsReference<XX> && !IsConst<RX>)>(eval(x0));
522
523 RXE g;
524 auto f = function(x, g);
525
526 Logging::info("L-BFGS-B initial f(x) =", f);
527
528 if(projectedGradientConverged(x, g, lowerBound, upperBound,
529 tolerance))
530 {
531 return x;
532 }
533
536 Tensor<TX, M, M> sTy = zero();
538 TX theta = identity();
539
540 for(Index iteration = 0;;)
541 {
543
544 Tensor xCP = computeGeneralizedCauchyPoint(x, g, lowerBound,
545 upperBound, theta, s, y, sTy, j, c);
546
547 auto d = minimizeSubspace(x, xCP, g, lowerBound, upperBound,
548 theta, s, y, sTy, j, c);
549
550 if(d >= zero())
551 {
552 resetHistory(s, y, sTy, j, theta);
553 continue;
554 }
555
556 auto const &p = xCP;
557 auto [alphaInit, alphaMin, alphaMax] =
558 getLineSearchParameters(x, p, lowerBound, upperBound);
559
560 auto const f0 = f;
561 auto const g0 = g;
562 auto const d0 = d;
563 auto const x0 = x;
564
565 auto alpha = lineSearch()(
566 x, f, g, d, function,
567 [&lowerBound, &upperBound](auto &&x)
568 {
569 return min(upperBound,
570 max(lowerBound, forward<decltype(x)>(x)));
571 },
572 p, alphaInit, alphaMin, alphaMax);
573
574 callback(x);
575
576 if(valueDiverged(f))
577 {
578 Logging::info("L-BFGS-B diverged at f(x) =", f);
579 x = x0;
580 break;
581 }
582
583 if(valueIncreased(f))
584 {
585 Logging::info("L-BFGS-B made no further progress at f(x) =",
586 f);
587 x = x0;
588 break;
589 }
590
591 if(++iteration; !(iteration < maxIterations()))
592 {
593 Logging::info("L-BFGS-B max iterations reached at f(x) =",
594 f);
595 break;
596 }
597
598 if(valueConverged(f0, f, tolerance))
599 {
600 Logging::info("L-BFGS-B converged at f(x) =", f);
601 break;
602 }
603
604 if(projectedGradientConverged(x, g, lowerBound, upperBound,
605 tolerance))
606 {
607 Logging::info("L-BFGS-B converged at f(x) =", f);
608 break;
609 }
610
611 Logging::info("L-BFGS-B current f(x) =", f);
612
613 auto const skTyk = (d - d0) * alpha;
614
615 if(skTyk <= -NumericLimits<VX>::epsilon() * d0 * alpha)
616 {
617 continue;
618 }
619
620 s.pushBack(alpha * p);
621 y.pushBack(g - g0);
622
623 theta = norm<2, 1>(y.back())() / skTyk;
624
625 for(Index n = 0; n < y.size(); ++n)
626 {
627 for(Index m = 0; m < s.size(); ++m)
628 {
629 sTy(m, n) = scalarProduct(s.front(m), y.front(n))();
630 }
631 }
632
633 j = zero();
634 for(Index m = 0; m < s.size(); ++m)
635 {
636 for(Index n = m; n < s.size(); ++n)
637 {
638 j(m, n) =
639 theta * scalarProduct(s.front(m), s.front(n))();
640 for(Index k = 0; k < m; ++k)
641 {
642 j(m, n) += sTy(m, k) * sTy(n, k) / sTy(k, k);
643 }
644 }
645 }
646
648 }
649
650 callback(x);
651
652 if constexpr(IsReference<decltype(x)>)
653 {
654 return forward<XX>(x0);
655 }
656 else
657 {
658 return x;
659 }
660 }
661
662 private:
663 LS const mLineSearch;
664 Size const mMaxIterations;
665 };
666}
667#endif // pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H
Definition deque.hpp:15
constexpr auto emplaceBack(Args &&...args) &&
Definition deque.hpp:166
constexpr auto size() const
Definition deque.hpp:28
constexpr decltype(auto) front(Index const position=0) &&
Definition deque.hpp:33
constexpr decltype(auto) back(Index const position=0) &&
Definition deque.hpp:53
constexpr auto pushBack(R const &element) &&
Definition deque.hpp:135
Definition value.hpp:12
Definition lbfgsb.hpp:25
constexpr LBFGSB(Size const maxIterations=defaultMaxIterations())
Definition lbfgsb.hpp:461
constexpr LBFGSB(LS const &lineSearch, Size const maxIterations=defaultMaxIterations())
Definition lbfgsb.hpp:454
constexpr auto & lineSearch() const
Definition lbfgsb.hpp:466
constexpr auto operator()(XX &&x0, FF &&function, FC &&callback, VT const &tolerance=NumericLimits< VT >::tolerance(), XL &&lowerBound=unit< RL >(NumericLimits< VL >::lowest()), XU &&upperBound=unit< RU >(NumericLimits< VU >::max())) const
Definition lbfgsb.hpp:494
constexpr auto maxIterations() const
Definition lbfgsb.hpp:471
Definition tensor.hpp:25
static constexpr auto size()
Definition tensor.hpp:39
Definition concepts.hpp:25
Definition value.hpp:24
Definition concepts.hpp:31
Definition concepts.hpp:19
Definition concepts.hpp:28
Definition declarations.hpp:27
Definition declarations.hpp:45
TN::Subscripts S
Definition externs_nonTT.hpp:9
pRC::Float<> T
Definition externs_nonTT.hpp:1
int i
Definition gmock-matchers-comparisons_test.cc:603
Uncopyable z
Definition gmock-matchers-containers_test.cc:378
const double y
Definition gmock-matchers-containers_test.cc:377
int x
Definition gmock-matchers-containers_test.cc:376
const char * p
Definition gmock-matchers-containers_test.cc:379
static void info(Xs &&...args)
Definition log.hpp:27
static void error(Xs &&...args)
Definition log.hpp:14
Definition bfgs.hpp:11
static constexpr auto isFinite(T const &a)
Definition is_finite.hpp:13
static constexpr decltype(auto) solve(Solver &&solver, XA &&A, XB &&b)
Definition solve.hpp:18
static constexpr auto unit()
Definition unit.hpp:13
static constexpr Conditional< C, RemoveConstReference< X >, RemoveConst< X > > copy(X &&a)
Definition copy.hpp:13
static constexpr decltype(auto) where(TE const e, XA &&a, XB &&b)
Definition where.hpp:11
static constexpr auto rcp(T const &b)
Definition rcp.hpp:12
static constexpr auto diagonal(X &&a)
Definition diagonal.hpp:15
Size Index
Definition basics.hpp:32
std::size_t Size
Definition basics.hpp:31
std::tuple< Ts... > Tuple
Definition basics.hpp:23
std::invoke_result_t< F, Args... > ResultOf
Definition basics.hpp:59
std::remove_reference_t< T > RemoveReference
Definition basics.hpp:41
static constexpr auto strictlyLowerTriangular(X &&a)
Definition strictly_lower_triangular.hpp:14
std::common_type_t< Ts... > Common
Definition basics.hpp:53
static constexpr auto cholesky(X &&a)
Definition cholesky.hpp:14
static constexpr auto block(X &&a, Os const ... offsets)
Definition block.hpp:17
static constexpr auto transpose(JacobiRotation< T > const &a)
Definition jacobi_rotation.hpp:306
static constexpr auto abs(T const &a)
Definition abs.hpp:11
typename ValueType< T >::Type Value
Definition value.hpp:72
static constexpr decltype(auto) min(X &&a)
Definition min.hpp:13
Conditional<((Ns *... *1) *sizeof(T) > cHugepageSizeByte), HeapArray< T, Ns... >, StackArray< T, Ns... > > Array
Definition declarations.hpp:21
static constexpr auto scalarProduct(TA const &a, TB const &b)
Definition scalar_product.hpp:11
static constexpr auto reduce(Sequence< T, I1, I2, Is... > const)
Definition sequence.hpp:458
static constexpr auto delta(TA const &a, TB const &b)
Definition delta.hpp:11
static constexpr auto extractDiagonal(X &&a)
Definition extract_diagonal.hpp:16
constexpr auto cDebugLevel
Definition config.hpp:48
RemoveConst< RemoveReference< T > > RemoveConstReference
Definition basics.hpp:47
static constexpr auto square(T const &a)
Definition square.hpp:11
static constexpr auto identity()
Definition identity.hpp:13
static constexpr auto zero()
Definition zero.hpp:12
static constexpr void sort(C const &compare, T &a, Size const k=T::size(), Size const d=0)
Definition sort.hpp:15
static constexpr decltype(auto) eval(X &&a)
Definition eval.hpp:12
static constexpr auto norm(T const &a)
Definition norm.hpp:12
static constexpr decltype(auto) max(X &&a)
Definition max.hpp:13
Definition limits.hpp:13