pRC
multi-purpose Tensor Train library for C++
Loading...
Searching...
No Matches
lbfgsb.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: BSD-2-Clause
2
3// References:
4// Authors: Richard H. Byrd, Peihuang Lu, Jorge Nocedal, Ciyou Zhu
5// Title: A Limited Memory Algorithm for Bound Constrained Optimization
6// Year: 1995
7// URL: https://doi.org/10.1137/0916069
8
9#ifndef pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H
10#define pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H
11
12#include <prc/config.hpp>
27
28namespace pRC::Optimizer
29{
30 template<class LS = LineSearch::MoreThuente, Size M = 5>
31 class LBFGSB
32 {
33 private:
34 static constexpr Size defaultMaxIterations()
35 {
36 return 1000;
37 }
38
39 template<class X, class G, class L, class U, class T>
40 static constexpr auto projectedGradientConverged(X const &x, G const &g,
41 L const &lowerBound, U const &upperBound, T const &tolerance)
42 {
43 auto const projG = where(g < zero(), max(x - upperBound, g),
44 min(x - lowerBound, g));
45
46 auto const infNorm = norm<2, 0>(projG)();
47
49 }
50
51 template<class F, class T>
52 static constexpr auto valueConverged(F const &f0, F const &f,
53 T const &tolerance)
54 {
55 auto const scale = max(abs(f0), abs(f), identity<T>());
56
57 return delta(f0, f) <= tolerance * scale;
58 }
59
60 template<class S, class Y, class STY, class J, class T>
61 static constexpr auto resetHistory(S &s, Y &y, STY &sTy, J &j, T &theta)
62 {
63 s.clear();
64 y.clear();
65 sTy = zero();
66 j = zero();
67 theta = identity();
68
69 return;
70 }
71
72 template<class T>
73 static constexpr auto applyMiddleMatrix(Tensor<T, M, M> const &sTy,
74 Tensor<T, M, M> const &j, Tensor<T, 2 * M> const &v,
75 Size const size)
76 {
77 auto v1 = block<2>(v, 0);
78 auto v2 = block<2>(v, 1);
79
81 auto p1 = block<2>(p, 0);
82 auto p2 = block<2>(p, 1);
83
85 p2 = v2;
86 for(Index k = 0; k < size; ++k)
87 {
88 for(Index i = k + 1; i < size; ++i)
89 {
90 p2(i) -= sTy(i, k) * p1(k);
91 }
92 }
93
96
97 for(Index i = 0; i < size; ++i)
98 {
99 auto const scale = rcp(sTy(i, i));
100 for(Index k = i + 1; k < size; ++k)
101 {
102 p1(i) += sTy(k, i) * p2(k) * scale;
103 }
104 }
105
106 return p;
107 }
108
109 template<class X, class G, class LB, class UB, class T, class S,
110 class Y, class STY, class J, class C>
111 static constexpr auto computeGeneralizedCauchyPoint(X const &x,
112 G const &g, LB const &lowerBound, UB const &upperBound,
113 T const &theta, S const &s, Y const &y, STY const &sTy, J const &j,
114 C &c)
115 {
116 using TX = typename X::Type;
117
118 Tensor const t = where(g == zero(),
120 where(g < zero(), (x - upperBound) / g, (x - lowerBound) / g));
121
122 Tensor d = where(t == zero(), zero<G>(), -g);
123
124 Array<Index, X::size()> sortedIndices;
125 for(Index i = 0; i < X::size(); ++i)
126 {
127 sortedIndices[i] = i;
128 }
129
130 sort(
131 [&t](auto const i, auto const j)
132 {
133 return t[i] < t[j];
134 },
136
138 auto p0 = block<2>(p, 0);
139 auto p1 = block<2>(p, 1);
140 for(Index m = 0; m < s.size(); ++m)
141 {
142 p0(m) = scalarProduct(y.front(m), d)();
143 p1(m) = theta * scalarProduct(s.front(m), d)();
144 }
145
146 auto f1 = -norm<2, 1>(d)();
147
148 auto const mp = applyMiddleMatrix(sTy, j, p, s.size());
149 auto f2 = -theta * f1 - scalarProduct(p, mp)();
150
151 auto deltaTMin = -f1 / f2;
152
153 Index sorted = 0;
154 auto b = sortedIndices[sorted];
155 while(t[b] <= zero())
156 {
157 d[b] = zero();
158 ++sorted;
160 }
161
162 auto deltaT = t[b];
163
164 Tensor xCP = x;
165 c = zero();
166 TX t0 = zero();
167 while(deltaTMin >= deltaT)
168 {
169 if(d[b] > zero())
170 {
171 xCP[b] = upperBound[b];
172 }
173
174 if(d[b] < zero())
175 {
176 xCP[b] = lowerBound[b];
177 }
178
179 auto const z = xCP[b] - x[b];
180
181 c += deltaT * p;
182
183 f1 += deltaT * f2 + square(d[b]) - theta * d[b] * z;
184 f2 -= theta * square(d[b]);
185
187 auto w0 = block<2>(w, 0);
188 auto w1 = block<2>(w, 1);
189 for(Index m = 0; m < s.size(); ++m)
190 {
191 w0(m) = y.front(m)[b];
192 w1(m) = theta * s.front(m)[b];
193 }
194
195 auto wM = applyMiddleMatrix(sTy, j, w, s.size());
196
197 f1 += d[b] * scalarProduct(wM, c)();
198 f2 += identity<TX>(2) * d[b] * scalarProduct(wM, p)() -
199 square(d[b]) * scalarProduct(wM, w)();
200
201 p -= d[b] * wM;
202
203 deltaTMin = -f1 / f2;
204 d[b] = zero();
205 t0 = t[b];
206 ++sorted;
207 if(sorted < X::size())
208 {
210 deltaT = t[b] - t0;
211 }
212 else
213 {
214 break;
215 }
216 }
217
219 c += deltaTMin * p;
220
221 auto const scale = t0 + deltaTMin;
222
223 return xCP += scale * d;
224 }
225
226 template<class X, class CP, class G, class LB, class UB, class T,
227 class S, class Y, class STY, class J, class C>
228 static constexpr auto minimizeSubspace(X const &x, CP &xCP, G const &g,
229 LB const &lowerBound, UB const &upperBound, T const &theta,
230 S const &s, Y const &y, STY const &sTy, J const &j, C const &c)
231 {
232 using TX = typename X::Type;
233
234 Tensor r = -theta * (xCP - x) - g;
235 Tensor const mc = applyMiddleMatrix(sTy, j, c, s.size());
236 for(Index m = 0; m < s.size(); ++m)
237 {
238 r += y.front(m) * mc(m);
239 r += s.front(m) * theta * mc(M + m);
240 }
241
242 Size nFree = 0;
243 Array<Index, X::size()> freeIndices;
244 for(Index i = 0; i < X::size(); ++i)
245 {
246 if((xCP != upperBound && xCP != lowerBound)[i])
247 {
248 freeIndices[nFree++] = i;
249 }
250 }
251
252 decltype(r) zTr = zero();
253 for(Index f = 0; f < nFree; ++f)
254 {
255 zTr[f] = r[freeIndices[f]];
256 }
257
259 for(Index m = 0; m < s.size(); ++m)
260 {
262 for(Index f = 0; f < nFree; ++f)
263 {
264 zTs.back()[f] = s.front(m)[freeIndices[f]];
265 }
266 }
267
269 for(Index m = 0; m < y.size(); ++m)
270 {
272 for(Index f = 0; f < nFree; ++f)
273 {
274 zTy.back()[f] = y.front(m)[freeIndices[f]];
275 }
276 }
277
279 auto l00 = block<2, 2>(l, 0, 0);
280 auto l10 = block<2, 2>(l, 1, 0);
281 auto l01 = block<2, 2>(l, 0, 1);
282 auto l11 = block<2, 2>(l, 1, 1);
283
284 {
286 for(Index m = 0; m < zTy.size(); ++m)
287 {
288 for(Index n = 0; n < zTy.size(); ++n)
289 {
290 yTzzTy(m, n) =
291 scalarProduct(zTy.front(m), zTy.front(n))();
292 }
293 }
294
296 }
297
298 l10 = zero();
299
300 {
302 for(Index m = 0; m < zTy.size(); ++m)
303 {
304 for(Index n = 0; n < zTs.size(); ++n)
305 {
306 yTzzTs(m, n) =
307 scalarProduct(zTy.front(m), zTs.front(n))();
308 }
309 }
310
311 auto const rhs =
314 }
315
316 {
318 for(Index m = 0; m < s.size(); ++m)
319 {
320 for(Index n = 0; n < s.size(); ++n)
321 {
322 sTaaTs(m, n) = (scalarProduct(s.front(m), s.front(n)) -
323 scalarProduct(zTs.front(m), zTs.front(n)))();
324 }
325 }
326
328 }
329
331 auto rTw0 = block<2>(rTw, 0);
332 auto rTw1 = block<2>(rTw, 1);
333
334 for(Index m = 0; m < zTy.size(); ++m)
335 {
336 rTw0(m) = scalarProduct(zTr, zTy.front(m))();
337 }
338 for(Index m = 0; m < zTs.size(); ++m)
339 {
340 rTw1(m) = theta * scalarProduct(zTr, zTs.front(m))();
341 }
342
344 rTw0 = -rTw0;
346
347 Tensor d = zero<X>();
348 for(Index m = 0; m < zTy.size(); ++m)
349 {
350 d += zTy.front(m) * rTw0(m);
351 d += zTs.front(m) * rTw1(m) * theta;
352 }
353
354 auto const rTheta = rcp(theta);
355 d = (zTr + d * rTheta) * rTheta;
356
357 {
359
360 for(Index f = 0; f < nFree; ++f)
361 {
362 auto const i = freeIndices[f];
363 xCP[i] =
364 min(upperBound[i], max(lowerBound[i], xCP[i] + d[f]));
365 }
366
367 xCP -= x;
368 auto const dd = scalarProduct(xCP, g)();
369
370 if(dd < zero())
371 {
372 return dd;
373 }
374
375 xCP = xCPcopy;
376 }
377
378 TX alpha = identity();
379 for(Index f = 0; f < nFree; ++f)
380 {
381 auto const i = freeIndices[f];
382 if(d[f] > zero())
383 {
384 alpha = min(alpha, (upperBound[i] - xCP[i]) / d[f]);
385 }
386 if(d[f] < zero())
387 {
388 alpha = min(alpha, (lowerBound[i] - xCP[i]) / d[f]);
389 }
390 }
391
392 for(Index f = 0; f < nFree; ++f)
393 {
394 xCP[freeIndices[f]] += alpha * d[f];
395 }
396
397 xCP -= x;
398 return scalarProduct(xCP, g)();
399 }
400
401 template<class X, class P, class LB, class UB>
402 static constexpr auto getLineSearchParameters(X const &x, P const &p,
403 LB const &lowerBound, UB const &upperBound)
404 {
405 using TX = typename X::Type;
406
408
409 for(Index i = 0; i < X::size(); ++i)
410 {
411 if(p[i] < zero())
412 {
413 auto const d = lowerBound[i] - x[i];
414 if(d > zero())
415 {
416 alphaMax = zero();
417 }
418 else
419 {
420 if(p[i] * alphaMax < d)
421 {
422 alphaMax = d / p[i];
423 }
424 }
425 }
426 if(p[i] > zero())
427 {
428 auto const d = upperBound[i] - x[i];
429 if(d < zero())
430 {
431 alphaMax = zero();
432 }
433 else
434 {
435 if(p[i] * alphaMax > d)
436 {
437 alphaMax = d / p[i];
438 }
439 }
440 }
441 }
442
443 auto const alpha = min(rcp(norm<2>(p))(), alphaMax);
444
446 }
447
448 public:
449 constexpr LBFGSB(LS const &lineSearch,
450 Size const maxIterations = defaultMaxIterations())
451 : mLineSearch(lineSearch)
452 , mMaxIterations(maxIterations)
453 {
454 }
455
456 constexpr LBFGSB(Size const maxIterations = defaultMaxIterations())
457 : mMaxIterations(maxIterations)
458 {
459 }
460
461 constexpr auto &lineSearch() const
462 {
463 return mLineSearch;
464 }
465
466 constexpr auto maxIterations() const
467 {
468 return mMaxIterations;
469 }
470
471 template<class XX, class RX = RemoveReference<XX>,
472 class TX = typename RX::Type, class VX = typename TX::Value,
473 If<IsTensorish<RX>> = 0,
474 class RXE = RemoveConstReference<ResultOf<Eval, XX>>, class FF,
475 If<IsInvocable<FF, RXE const &, RXE &>> = 0,
476 If<IsFloat<ResultOf<FF, RXE const &, RXE &>>> = 0, class FC,
477 If<IsInvocable<FC, RXE>> = 0,
478 class XL = decltype(unit<RX>(NumericLimits<VX>::lowest())),
479 class RL = RemoveReference<XL>, class TL = typename RL::Type,
480 class VL = typename TL::Value, If<IsTensorish<RL>> = 0,
481 class XU = decltype(unit<RX>(NumericLimits<VX>::max())),
482 class RU = RemoveReference<XU>, class TU = typename RU::Type,
483 class VU = typename TU::Value, If<IsTensorish<RU>> = 0,
484 class VT = Common<VX, VL, VU>,
485 If<All<IsFloat<VX>, IsFloat<VT>>> = 0,
486 If<IsSame<typename RX::Dimension, typename RL::Dimension,
487 typename RU::Dimension>> = 0,
488 If<IsSame<typename RX::Sizes, typename RL::Sizes,
489 typename RU::Sizes>> = 0,
490 If<IsInvocable<LS, RXE &, ResultOf<FF, RXE const &, RXE &> &, RXE &,
491 VX &, FF, RXE(RXE const &), RXE const &, VX, VX, VX>> = 0>
492 inline constexpr auto operator()(XX &&x0, FF &&function, FC &&callback,
493 VT const &tolerance = NumericLimits<VT>::tolerance(),
496 {
497 if constexpr(cDebugLevel >= DebugLevel::High)
498 {
500 {
501 Logging::error("L-BFGS-B: Lower bound > upper bound.");
502 }
503 }
504
505 if constexpr(cDebugLevel >= DebugLevel::Low)
506 {
508 {
510 "L-BFGS-B: Initial parameters not in range "
511 "[lowerBound, "
512 "upperBound]");
513 }
514 }
515
516 decltype(auto) x =
517 copy<!(!IsReference<XX>() && !IsConst<RX>())>(eval(x0));
518
519 RXE g;
520 auto f = function(x, g);
521
522 Logging::info("L-BFGS-B initial f(x) =", f);
523
524 if(projectedGradientConverged(x, g, lowerBound, upperBound,
525 tolerance))
526 {
527 return x;
528 }
529
534 TX theta = identity();
535
536 for(Index iteration = 0;;)
537 {
539
540 Tensor xCP = computeGeneralizedCauchyPoint(x, g, lowerBound,
541 upperBound, theta, s, y, sTy, j, c);
542
543 auto d = minimizeSubspace(x, xCP, g, lowerBound, upperBound,
544 theta, s, y, sTy, j, c);
545
546 if(d >= zero())
547 {
548 resetHistory(s, y, sTy, j, theta);
549 continue;
550 }
551
552 auto const &p = xCP;
553 auto [alphaInit, alphaMin, alphaMax] =
554 getLineSearchParameters(x, p, lowerBound, upperBound);
555
556 auto const f0 = f;
557 auto const g0 = g;
558 auto const d0 = d;
559
560 auto alpha = lineSearch()(
561 x, f, g, d, function,
562 [&lowerBound, &upperBound](auto &&x)
563 {
564 return min(upperBound,
565 max(lowerBound, forward<decltype(x)>(x)));
566 },
568
569 callback(x);
570
571 if(++iteration; !(iteration < maxIterations()))
572 {
573 Logging::info("L-BFGS-B max iterations reached at f(x) =",
574 f);
575 break;
576 }
577
578 if(valueConverged(f0, f, tolerance))
579 {
580 Logging::info("L-BFGS-B converged at f(x) =", f);
581 break;
582 }
583
584 if(projectedGradientConverged(x, g, lowerBound, upperBound,
585 tolerance))
586 {
587 Logging::info("L-BFGS-B converged at f(x) =", f);
588 break;
589 }
590
591 Logging::info("L-BFGS-B current f(x) =", f);
592
593 auto const skTyk = (d - d0) * alpha;
594
596 {
597 continue;
598 }
599
600 s.pushBack(alpha * p);
601 y.pushBack(g - g0);
602
603 theta = norm<2, 1>(y.back())() / skTyk;
604
605 for(Index n = 0; n < y.size(); ++n)
606 {
607 for(Index m = 0; m < s.size(); ++m)
608 {
609 sTy(m, n) = scalarProduct(s.front(m), y.front(n))();
610 }
611 }
612
613 j = zero();
614 for(Index m = 0; m < s.size(); ++m)
615 {
616 for(Index n = m; n < s.size(); ++n)
617 {
618 j(m, n) =
619 theta * scalarProduct(s.front(m), s.front(n))();
620 for(Index k = 0; k < m; ++k)
621 {
622 j(m, n) += sTy(m, k) * sTy(n, k) / sTy(k, k);
623 }
624 }
625 }
626
628 }
629
630 if constexpr(IsReference<decltype(x)>())
631 {
632 return forward<XX>(x0);
633 }
634 else
635 {
636 return x;
637 }
638 }
639
640 private:
641 LS const mLineSearch;
642 Size const mMaxIterations;
643 };
644}
645#endif // pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H
Definition deque.hpp:15
constexpr auto size() const
Definition deque.hpp:28
constexpr decltype(auto) front(Index const position=0) &&
Definition deque.hpp:33
constexpr auto pushBack(R const &element) &&
Definition deque.hpp:133
constexpr auto emplaceBack(Args &&...args) &&
Definition deque.hpp:163
Definition lbfgsb.hpp:32
constexpr LBFGSB(Size const maxIterations=defaultMaxIterations())
Definition lbfgsb.hpp:456
constexpr LBFGSB(LS const &lineSearch, Size const maxIterations=defaultMaxIterations())
Definition lbfgsb.hpp:449
constexpr auto & lineSearch() const
Definition lbfgsb.hpp:461
constexpr auto maxIterations() const
Definition lbfgsb.hpp:466
constexpr auto operator()(XX &&x0, FF &&function, FC &&callback, VT const &tolerance=NumericLimits< VT >::tolerance(), XL &&lowerBound=unit< RL >(NumericLimits< VL >::lowest()), XU &&upperBound=unit< RU >(NumericLimits< VU >::max())) const
Definition lbfgsb.hpp:492
Class storing tensors.
Definition tensor.hpp:44
static void info(Xs &&...args)
Definition log.hpp:27
static void error(Xs &&...args)
Definition log.hpp:14
Definition bfgs.hpp:16
static constexpr auto extractDiagonal(X &&a)
Extracts the diagonal of a Tensor.
Definition extract_diagonal.hpp:28
static constexpr X eval(X &&a)
Definition eval.hpp:11
static constexpr X min(X &&a)
Definition min.hpp:13
static constexpr auto diagonal(X &&a)
Transforms a Tensor into diagonal form.
Definition diagonal.hpp:26
static constexpr auto rcp(Complex< T > const &b)
Definition rcp.hpp:13
static constexpr auto cholesky(X &&a)
Definition cholesky.hpp:24
static constexpr auto zero()
Definition zero.hpp:12
std::size_t Size
Definition type_traits.hpp:20
Conditional< IsSatisfied<((Ns *... *1) *sizeof(T) > cHugepageSizeByte)>, HeapArray< T, Ns... >, StackArray< T, Ns... > > Array
Definition type_traits.hpp:60
static constexpr auto transpose(JacobiRotation< T > const &a)
Definition jacobi_rotation.hpp:319
static constexpr T where(TE const e, T &&a, T &&b)
Definition where.hpp:12
static constexpr void sort(C const &compare, T &a, Size const k=T::size(), Size const d=0)
Definition sort.hpp:15
static constexpr auto square(Complex< T > const &a)
Definition square.hpp:14
static constexpr auto strictlyLowerTriangular(X &&a)
Transforms a Tensor into strictly lower triangular form.
Definition strictly_lower_triangular.hpp:22
static constexpr auto delta(Complex< TA > const &a, Complex< TB > const &b)
Definition delta.hpp:12
static constexpr auto abs(Complex< T > const &a)
Definition abs.hpp:12
static constexpr Conditional< IsSatisfied< C >, RemoveConstReference< X >, X > copy(X &&a)
Definition copy.hpp:13
constexpr auto cDebugLevel
Definition config.hpp:46
std::is_reference< T > IsReference
Definition type_traits.hpp:47
static constexpr auto scalarProduct(Complex< TA > const &a, Complex< TB > const &b)
Definition scalar_product.hpp:13
Size Index
Definition type_traits.hpp:21
static constexpr auto identity()
Definition identity.hpp:12
static constexpr X max(X &&a)
Definition max.hpp:13
Definition limits.hpp:13