documentation_v0.1/html/lbfgsb_8hpp_source.html

// SPDX-License-Identifier: BSD-2-Clause


// References:

//  Authors:    Richard H. Byrd, Peihuang Lu, Jorge Nocedal, Ciyou Zhu

//  Title:      A Limited Memory Algorithm for Bound Constrained Optimization

//  Year:       1995

//  URL:        https://doi.org/10.1137/0916069


#ifndef pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H

#define pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H


#include <prc/config.hpp>

#include <prc/algorithms/cholesky.hpp>

#include <prc/algorithms/optimizer/line_search/more_thuente.hpp>

#include <prc/algorithms/solve.hpp>

#include <prc/algorithms/solver/backward_substitution.hpp>

#include <prc/algorithms/solver/forward_substitution.hpp>

#include <prc/algorithms/sort.hpp>

#include <prc/core/basic/functions/copy.hpp>

#include <prc/core/functors/eval.hpp>

#include <prc/core/functors/logical_or.hpp>

#include <prc/core/tensor/functions/block.hpp>

#include <prc/core/tensor/functions/norm.hpp>

#include <prc/core/tensor/operator/functions/diagonal.hpp>

#include <prc/core/tensor/type_traits.hpp>

#include <prc/core/value/limits.hpp>


namespace pRC::Optimizer

{

    template<class LS = LineSearch::MoreThuente, Size M = 5>


    class LBFGSB

    {

    private:

        // Iteration limit used by the constructors when the caller does not
        // supply one.
        static constexpr Size defaultMaxIterations()
        {
            constexpr Size limit = 1000;

            return limit;
        }


        template<class X, class G, class L, class U, class T>

        static constexpr auto projectedGradientConverged(X const &x, G const &g,

            L const &lowerBound, U const &upperBound, T const &tolerance)

        {

            auto const projG = where(g < zero(), max(x - upperBound, g),

                min(x - lowerBound, g));


            auto const infNorm = norm<2, 0>(projG)();


            return infNorm <= tolerance * identity<T>(1e-3);

        }


        // Stopping test on the objective value: true when delta(f0, f) is at
        // most the tolerance times max(|f0|, |f|, 1).
        template<class F, class T>
        static constexpr auto valueConverged(F const &f0, F const &f,
            T const &tolerance)
        {
            auto const change = delta(f0, f);
            auto const magnitude = max(abs(f0), abs(f), identity<T>());

            return change <= tolerance * magnitude;
        }


        // Discards the limited-memory state: both correction deques are
        // cleared, sTy and j are zeroed and theta is set back to identity.
        template<class S, class Y, class STY, class J, class T>
        static constexpr auto resetHistory(S &s, Y &y, STY &sTy, J &j, T &theta)
        {
            // Scaling and matrices first, then the stored pairs; the
            // assignments are independent of each other.
            theta = identity();
            sTy = zero();
            j = zero();

            s.clear();
            y.clear();
        }


        // Applies the middle matrix of the compact limited-memory
        // representation to v and returns the resulting 2M-vector.
        //
        // sTy  : matrix of scalar products s_m . y_n; its diagonal and its
        //        strictly lower part are read here.
        // j    : triangular factor used in the two substitution solves.
        // v    : right-hand side, treated as two halves of length M.
        // size : number of leading entries the explicit loops operate on.
        //
        // NOTE(review): the element-wise division below uses the complete
        // diagonal of sTy, including entries at positions >= size - confirm
        // that callers keep those entries safe to divide by.
        template<class T>
        static constexpr auto applyMiddleMatrix(Tensor<T, M, M> const &sTy,
            Tensor<T, M, M> const &j, Tensor<T, 2 * M> const &v,
            Size const size)
        {
            auto vTop = block<2>(v, 0);
            auto vBottom = block<2>(v, 1);

            Tensor<T, 2 * M> result;
            auto top = block<2>(result, 0);
            auto bottom = block<2>(result, 1);

            // top = -D^{-1} v_top, bottom = v_bottom - L * top, where D is the
            // diagonal and L the strictly lower part of sTy.
            top = -vTop / extractDiagonal(sTy);
            bottom = vBottom;
            for(Index row = 1; row < size; ++row)
            {
                for(Index col = 0; col < row; ++col)
                {
                    bottom(row) -= sTy(row, col) * top(col);
                }
            }

            // Two triangular solves with j^T and j.
            bottom = solve<Solver::ForwardSubstitution>(transpose(j), bottom);
            bottom = solve<Solver::BackwardSubstitution>(j, bottom);

            // top += D^{-1} L^T bottom.
            for(Index row = 0; row < size; ++row)
            {
                auto const invDiagonal = rcp(sTy(row, row));
                for(Index below = row + 1; below < size; ++below)
                {
                    top(row) += sTy(below, row) * bottom(below) * invDiagonal;
                }
            }

            return result;
        }


        // Computes the generalized Cauchy point: the first local minimizer of
        // the quadratic model along the projected steepest-descent path
        // (Algorithm CP of Byrd, Lu, Nocedal, Zhu 1995).
        //
        // x, g                   : current point and gradient.
        // lowerBound, upperBound : box constraints.
        // theta, s, y, sTy, j    : limited-memory model (scaling, correction
        //                          pairs, and the matrices consumed by
        //                          applyMiddleMatrix).
        // c                      : output; overwritten with the accumulated
        //                          step-length-weighted sum of p over all
        //                          traversed segments.
        //
        // Returns the Cauchy point.
        //
        // NOTE(review): the search for the first positive breakpoint assumes
        // that at least one coordinate has t > 0; presumably the caller's
        // projected-gradient convergence test guarantees this - confirm.
        template<class X, class G, class LB, class UB, class T, class S,
            class Y, class STY, class J, class C>
        static constexpr auto computeGeneralizedCauchyPoint(X const &x,
            G const &g, LB const &lowerBound, UB const &upperBound,
            T const &theta, S const &s, Y const &y, STY const &sTy, J const &j,
            C &c)
        {
            using TX = typename X::Type;

            // Breakpoint per coordinate: the step length along -g at which the
            // coordinate reaches its bound (largest representable value when
            // the gradient component is zero).
            Tensor const t = where(g == zero(),
                unit<G>(NumericLimits<typename G::Value>::max()),
                where(g < zero(), (x - upperBound) / g, (x - lowerBound) / g));

            // Search direction: -g, with coordinates whose breakpoint is zero
            // held fixed.
            Tensor d = where(t == zero(), zero<G>(), -g);

            // Coordinate indices ordered by ascending breakpoint.
            Array<Index, X::size()> sortedIndices;
            for(Index i = 0; i < X::size(); ++i)
            {
                sortedIndices[i] = i;
            }

            sort(
                [&t](auto const i, auto const j)
                {
                    return t[i] < t[j];
                },
                sortedIndices);

            // p = W^T d with W = [Y, theta * S].
            Tensor<TX, 2 * M> p = zero();
            auto p0 = block<2>(p, 0);
            auto p1 = block<2>(p, 1);
            for(Index m = 0; m < s.size(); ++m)
            {
                p0(m) = scalarProduct(y.front(m), d)();
                p1(m) = theta * scalarProduct(s.front(m), d)();
            }

            // First and second directional derivative of the model along d.
            auto f1 = -norm<2, 1>(d)();

            auto const mp = applyMiddleMatrix(sTy, j, p, s.size());
            auto f2 = -theta * f1 - scalarProduct(p, mp)();

            auto deltaTMin = -f1 / f2;

            // Skip coordinates with non-positive breakpoints.
            Index sorted = 0;
            auto b = sortedIndices[sorted];
            while(t[b] <= zero())
            {
                d[b] = zero();
                ++sorted;
                b = sortedIndices[sorted];
            }

            auto deltaT = t[b];

            Tensor xCP = x;
            c = zero();
            TX t0 = zero();

            // Walk the segments while the minimizer of the current segment
            // lies beyond the segment's end.
            while(deltaTMin >= deltaT)
            {
                // Coordinate b reaches its bound at the end of this segment.
                if(d[b] > zero())
                {
                    xCP[b] = upperBound[b];
                }

                if(d[b] < zero())
                {
                    xCP[b] = lowerBound[b];
                }

                auto const z = xCP[b] - x[b];

                c += deltaT * p;

                f1 += deltaT * f2 + square(d[b]) - theta * d[b] * z;
                f2 -= theta * square(d[b]);

                // w is row b of W.
                Tensor<TX, 2 * M> w = zero();
                auto w0 = block<2>(w, 0);
                auto w1 = block<2>(w, 1);
                for(Index m = 0; m < s.size(); ++m)
                {
                    w0(m) = y.front(m)[b];
                    w1(m) = theta * s.front(m)[b];
                }

                auto const wM = applyMiddleMatrix(sTy, j, w, s.size());

                f1 += d[b] * scalarProduct(wM, c)();
                f2 += identity<TX>(2) * d[b] * scalarProduct(wM, p)() -
                    square(d[b]) * scalarProduct(wM, w)();

                // Keep p equal to W^T d once d[b] is zeroed below: remove the
                // contribution of row b of W. This must be the raw row w, not
                // the middle-matrix-applied wM.
                p -= d[b] * w;

                deltaTMin = -f1 / f2;
                d[b] = zero();
                t0 = t[b];
                ++sorted;
                if(sorted < X::size())
                {
                    b = sortedIndices[sorted];
                    deltaT = t[b] - t0;
                }
                else
                {
                    break;
                }
            }

            // Finish inside the last segment; never step backwards.
            deltaTMin = max(deltaTMin, zero<TX>());
            c += deltaTMin * p;

            auto const scale = t0 + deltaTMin;

            return xCP += scale * d;
        }


        // Subspace minimization step: refines the Cauchy point over the
        // coordinates that are not at a bound.
        //
        // x, g                   : current point and gradient.
        // xCP                    : in: Cauchy point; out: the step, i.e. the
        //                          refined point minus x.
        // lowerBound, upperBound : box constraints.
        // theta, s, y, sTy, j    : limited-memory model.
        // c                      : vector produced by
        //                          computeGeneralizedCauchyPoint.
        //
        // Returns the scalar product of the resulting step with g.
        template<class X, class CP, class G, class LB, class UB, class T,

            class S, class Y, class STY, class J, class C>

        static constexpr auto minimizeSubspace(X const &x, CP &xCP, G const &g,

            LB const &lowerBound, UB const &upperBound, T const &theta,

            S const &s, Y const &y, STY const &sTy, J const &j, C const &c)

        {

            using TX = typename X::Type;


            // r = -theta * (xCP - x) - g + W * M * c with W = [Y, theta * S];
            // M * c comes from applyMiddleMatrix.
            Tensor r = -theta * (xCP - x) - g;

            Tensor const mc = applyMiddleMatrix(sTy, j, c, s.size());

            for(Index m = 0; m < s.size(); ++m)

            {

                r += y.front(m) * mc(m);

                r += s.front(m) * theta * mc(M + m);

            }


            // Free coordinates: those where xCP equals neither bound.
            Size nFree = 0;

            Array<Index, X::size()> freeIndices;

            for(Index i = 0; i < X::size(); ++i)

            {

                if((xCP != upperBound && xCP != lowerBound)[i])

                {

                    freeIndices[nFree++] = i;

                }

            }


            // r restricted to the free coordinates, packed into the first
            // nFree entries; the remaining entries stay zero.
            decltype(r) zTr = zero();

            for(Index f = 0; f < nFree; ++f)

            {

                zTr[f] = r[freeIndices[f]];

            }


            // Every stored s vector restricted to the free coordinates, packed
            // the same way.
            Deque<X, M> zTs;

            for(Index m = 0; m < s.size(); ++m)

            {

                zTs.emplaceBack(zero<X>());

                for(Index f = 0; f < nFree; ++f)

                {

                    zTs.back()[f] = s.front(m)[freeIndices[f]];

                }

            }


            // Every stored y vector restricted to the free coordinates.
            Deque<X, M> zTy;

            for(Index m = 0; m < y.size(); ++m)

            {

                zTy.emplaceBack(zero<X>());

                for(Index f = 0; f < nFree; ++f)

                {

                    zTy.back()[f] = y.front(m)[freeIndices[f]];

                }

            }


            // 2M x 2M factor l, assembled from four M x M blocks.
            Tensor<TX, 2 * M, 2 * M> l;

            auto l00 = block<2, 2>(l, 0, 0);

            auto l10 = block<2, 2>(l, 1, 0);

            auto l01 = block<2, 2>(l, 0, 1);

            auto l11 = block<2, 2>(l, 1, 1);


            // Block (0, 0): Cholesky factor of diag(sTy) + (restricted y
            // Gram matrix) / theta.
            {

                Tensor<TX, M, M> yTzzTy = zero();

                for(Index m = 0; m < zTy.size(); ++m)

                {

                    for(Index n = 0; n < zTy.size(); ++n)

                    {

                        yTzzTy(m, n) =

                            scalarProduct(zTy.front(m), zTy.front(n))();

                    }

                }


                l00 = cholesky(diagonal(sTy) + yTzzTy / theta);

            }


            // Block (1, 0) is zero.
            l10 = zero();


            // Block (0, 1): triangular solve with l00^T against the
            // restricted y-s products minus the transposed strictly lower part
            // of sTy.
            {

                Tensor<TX, M, M> yTzzTs = zero();

                for(Index m = 0; m < zTy.size(); ++m)

                {

                    for(Index n = 0; n < zTs.size(); ++n)

                    {

                        yTzzTs(m, n) =

                            scalarProduct(zTy.front(m), zTs.front(n))();

                    }

                }


                auto const rhs =

                    -transpose(strictlyLowerTriangular(sTy)) + yTzzTs;

                l01 = solve<Solver::ForwardSubstitution>(transpose(l00), rhs);

            }


            // Block (1, 1): Cholesky factor of theta * (s products over all
            // coordinates minus s products over the free coordinates) plus
            // l01^T * l01.
            {

                Tensor<TX, M, M> sTaaTs = zero();

                for(Index m = 0; m < s.size(); ++m)

                {

                    for(Index n = 0; n < s.size(); ++n)

                    {

                        sTaaTs(m, n) = (scalarProduct(s.front(m), s.front(n)) -

                            scalarProduct(zTs.front(m), zTs.front(n)))();

                    }

                }


                l11 = cholesky(theta * sTaaTs + transpose(l01) * l01);

            }


            // Right-hand side: restricted r projected onto the restricted y
            // vectors (first half) and theta times the restricted s vectors
            // (second half).
            Tensor<TX, 2 *M> rTw = zero();

            auto rTw0 = block<2>(rTw, 0);

            auto rTw1 = block<2>(rTw, 1);


            for(Index m = 0; m < zTy.size(); ++m)

            {

                rTw0(m) = scalarProduct(zTr, zTy.front(m))();

            }

            for(Index m = 0; m < zTs.size(); ++m)

            {

                rTw1(m) = theta * scalarProduct(zTr, zTs.front(m))();

            }


            // Solve with l^T, flip the sign of the first half, then solve
            // with l. rTw0 is a view into rTw, so the negation acts in place
            // between the two solves.
            rTw = solve<Solver::ForwardSubstitution>(transpose(l), rTw);

            rTw0 = -rTw0;

            rTw = solve<Solver::BackwardSubstitution>(l, rTw);


            // Map the solution back through the restricted y and theta * s
            // vectors.
            Tensor d = zero<X>();

            for(Index m = 0; m < zTy.size(); ++m)

            {

                d += zTy.front(m) * rTw0(m);

                d += zTs.front(m) * rTw1(m) * theta;

            }


            // Subspace step in packed (free-coordinate) indexing.
            auto const rTheta = rcp(theta);

            d = (zTr + d * rTheta) * rTheta;


            // First attempt: take the full step and clamp every free
            // coordinate into the box. Accept it if the resulting step has a
            // negative scalar product with g.
            {

                Tensor xCPcopy = xCP;


                for(Index f = 0; f < nFree; ++f)

                {

                    auto const i = freeIndices[f];

                    xCP[i] =

                        min(upperBound[i], max(lowerBound[i], xCP[i] + d[f]));

                }


                xCP -= x;

                auto const dd = scalarProduct(xCP, g)();


                if(dd < zero())

                {

                    return dd;

                }


                // Rejected: restore the Cauchy point and fall through.
                xCP = xCPcopy;

            }


            // Fallback: shrink the step by the largest factor alpha <= 1 that
            // keeps every free coordinate within its bounds.
            TX alpha = identity();

            for(Index f = 0; f < nFree; ++f)

            {

                auto const i = freeIndices[f];

                if(d[f] > zero())

                {

                    alpha = min(alpha, (upperBound[i] - xCP[i]) / d[f]);

                }

                if(d[f] < zero())

                {

                    alpha = min(alpha, (lowerBound[i] - xCP[i]) / d[f]);

                }

            }


            for(Index f = 0; f < nFree; ++f)

            {

                xCP[freeIndices[f]] += alpha * d[f];

            }


            // Convert the point into a step relative to x and report its
            // scalar product with g.
            xCP -= x;

            return scalarProduct(xCP, g)();

        }


        // Derives the step-length parameters handed to the line search.
        //
        // Returns a tuple (initial step, minimum step, maximum step). The
        // maximum is the largest multiple of p that keeps x + alpha * p inside
        // [lowerBound, upperBound]; it collapses to zero when x already
        // violates a bound in the direction of p. The minimum is zero and the
        // initial step is min(1 / ||p||, maximum).
        template<class X, class P, class LB, class UB>
        static constexpr auto getLineSearchParameters(X const &x, P const &p,
            LB const &lowerBound, UB const &upperBound)
        {
            using TX = typename X::Type;

            auto alphaMax = NumericLimits<TX>::max();

            for(Index i = 0; i < X::size(); ++i)
            {
                if(p[i] < zero())
                {
                    // Moving towards the lower bound.
                    auto const room = lowerBound[i] - x[i];
                    if(room > zero())
                    {
                        alphaMax = zero();
                    }
                    else if(p[i] * alphaMax < room)
                    {
                        alphaMax = room / p[i];
                    }
                }
                else if(p[i] > zero())
                {
                    // Moving towards the upper bound.
                    auto const room = upperBound[i] - x[i];
                    if(room < zero())
                    {
                        alphaMax = zero();
                    }
                    else if(p[i] * alphaMax > room)
                    {
                        alphaMax = room / p[i];
                    }
                }
            }

            auto const alphaInit = min(rcp(norm<2>(p))(), alphaMax);

            return tuple<TX, TX, TX>(alphaInit, zero<TX>(), alphaMax);
        }


    public:


        // Constructs the optimizer with a copy of the given line search and an
        // iteration limit, which defaults to defaultMaxIterations().
        constexpr LBFGSB(LS const &lineSearch,

            Size const maxIterations = defaultMaxIterations())

            : mLineSearch(lineSearch)

            , mMaxIterations(maxIterations)

        {

        }


        // Constructs the optimizer with a default-initialized line search
        // (mLineSearch is not named in the initializer list) and an iteration
        // limit, which defaults to defaultMaxIterations().
        constexpr LBFGSB(Size const maxIterations = defaultMaxIterations())

            : mMaxIterations(maxIterations)

        {

        }


        // Returns a reference to the stored line search object.
        constexpr auto &lineSearch() const

        {

            return mLineSearch;

        }


        // Returns the configured iteration limit by value.
        constexpr auto maxIterations() const

        {

            return mMaxIterations;

        }


        // Minimizes `function` subject to box constraints, starting at x0.
        //
        // x0         : initial parameters; expected to lie within the bounds
        //              (checked only when cDebugLevel is at least Low).
        // function   : called as function(x, g); returns the objective value.
        //              g is passed in uninitialized, so presumably the
        //              callable writes the gradient into it - confirm.
        // callback   : invoked with the current parameters after every line
        //              search.
        // tolerance  : threshold used by both convergence tests.
        // lowerBound : per-coordinate lower bounds (default: lowest
        //              representable value).
        // upperBound : per-coordinate upper bounds (default: largest
        //              representable value).
        //
        // Returns the final parameters. If the working variable x is a
        // reference, x0 is forwarded back instead.
        //
        // The loop ends when the iteration limit is reached, when either
        // convergence test passes, or when no descent direction can be found
        // even with an empty history.
        template<class XX, class RX = RemoveReference<XX>,
            class TX = typename RX::Type, class VX = typename TX::Value,
            If<IsTensorish<RX>> = 0,
            class RXE = RemoveConstReference<ResultOf<Eval, XX>>, class FF,
            If<IsInvocable<FF, RXE const &, RXE &>> = 0,
            If<IsFloat<ResultOf<FF, RXE const &, RXE &>>> = 0, class FC,
            If<IsInvocable<FC, RXE>> = 0,
            class XL = decltype(unit<RX>(NumericLimits<VX>::lowest())),
            class RL = RemoveReference<XL>, class TL = typename RL::Type,
            class VL = typename TL::Value, If<IsTensorish<RL>> = 0,
            class XU = decltype(unit<RX>(NumericLimits<VX>::max())),
            class RU = RemoveReference<XU>, class TU = typename RU::Type,
            class VU = typename TU::Value, If<IsTensorish<RU>> = 0,
            class VT = Common<VX, VL, VU>,
            If<All<IsFloat<VX>, IsFloat<VT>>> = 0,
            If<IsSame<typename RX::Dimension, typename RL::Dimension,
                typename RU::Dimension>> = 0,
            If<IsSame<typename RX::Sizes, typename RL::Sizes,
                typename RU::Sizes>> = 0,
            If<IsInvocable<LS, RXE &, ResultOf<FF, RXE const &, RXE &> &, RXE &,
                VX &, FF, RXE(RXE const &), RXE const &, VX, VX, VX>> = 0>
        inline constexpr auto operator()(XX &&x0, FF &&function, FC &&callback,
            VT const &tolerance = NumericLimits<VT>::tolerance(),
            XL &&lowerBound = unit<RL>(NumericLimits<VL>::lowest()),
            XU &&upperBound = unit<RU>(NumericLimits<VU>::max())) const
        {
            if constexpr(cDebugLevel >= DebugLevel::High)
            {
                if(reduce<LogicalOr>(lowerBound > upperBound)())
                {
                    Logging::error("L-BFGS-B: Lower bound > upper bound.");
                }
            }

            if constexpr(cDebugLevel >= DebugLevel::Low)
            {
                if(reduce<LogicalOr>(x0 < lowerBound || x0 > upperBound)())
                {
                    Logging::error(
                        "L-BFGS-B: Initial parameters not in range "
                        "[lowerBound, "
                        "upperBound]");
                }
            }

            decltype(auto) x =
                copy<!(!IsReference<XX>() && !IsConst<RX>())>(eval(x0));

            RXE g;
            auto f = function(x, g);

            Logging::info("L-BFGS-B initial f(x) =", f);

            if(projectedGradientConverged(x, g, lowerBound, upperBound,
                   tolerance))
            {
                return x;
            }

            // Limited-memory state: up to M correction pairs (s, y), their
            // scalar products sTy, the factor j consumed by applyMiddleMatrix
            // and the scaling theta.
            Deque<RXE, M> s;
            Deque<RXE, M> y;
            Tensor<TX, M, M> sTy = zero();
            Tensor<TX, M, M> j = zero();
            TX theta = identity();

            for(Index iteration = 0;;)
            {
                Tensor<TX, 2 * M> c;

                Tensor xCP = computeGeneralizedCauchyPoint(x, g, lowerBound,
                    upperBound, theta, s, y, sTy, j, c);

                // On return xCP holds the search direction and d its scalar
                // product with g.
                auto d = minimizeSubspace(x, xCP, g, lowerBound, upperBound,
                    theta, s, y, sTy, j, c);

                if(d >= zero())
                {
                    // Not a descent direction. With an empty history the next
                    // pass would see exactly the same x, g and model and
                    // produce the same result, so retrying would never
                    // terminate. Give up instead.
                    if(s.size() == 0)
                    {
                        Logging::info(
                            "L-BFGS-B no descent direction found at f(x) =",
                            f);
                        break;
                    }

                    // Otherwise discard the history and retry from the
                    // unscaled model.
                    resetHistory(s, y, sTy, j, theta);
                    continue;
                }

                auto const &p = xCP;
                auto [alphaInit, alphaMin, alphaMax] =
                    getLineSearchParameters(x, p, lowerBound, upperBound);

                auto const f0 = f;
                auto const g0 = g;
                auto const d0 = d;

                // The line search updates x, f, g and d in place; the lambda
                // projects trial points back into the box.
                auto alpha = lineSearch()(
                    x, f, g, d, function,
                    [&lowerBound, &upperBound](auto &&point)
                    {
                        return min(upperBound,
                            max(lowerBound, forward<decltype(point)>(point)));
                    },
                    p, alphaInit, alphaMin, alphaMax);

                callback(x);

                if(++iteration; !(iteration < maxIterations()))
                {
                    Logging::info("L-BFGS-B max iterations reached at f(x) =",
                        f);
                    break;
                }

                if(valueConverged(f0, f, tolerance))
                {
                    Logging::info("L-BFGS-B converged at f(x) =", f);
                    break;
                }

                if(projectedGradientConverged(x, g, lowerBound, upperBound,
                       tolerance))
                {
                    Logging::info("L-BFGS-B converged at f(x) =", f);
                    break;
                }

                Logging::info("L-BFGS-B current f(x) =", f);

                // s^T y for the new pair, expressed through the directional
                // derivatives before and after the line search.
                auto const skTyk = (d - d0) * alpha;

                // Skip the update when the curvature along the step is not
                // sufficiently positive.
                if(skTyk <= -NumericLimits<VX>::epsilon() * d0 * alpha)
                {
                    continue;
                }

                s.pushBack(alpha * p);
                y.pushBack(g - g0);

                theta = norm<2, 1>(y.back())() / skTyk;

                for(Index n = 0; n < y.size(); ++n)
                {
                    for(Index m = 0; m < s.size(); ++m)
                    {
                        sTy(m, n) = scalarProduct(s.front(m), y.front(n))();
                    }
                }

                // Upper triangle of theta * S^T S + L D^{-1} L^T (D diagonal,
                // L strictly lower part of sTy), then its Cholesky factor.
                j = zero();
                for(Index m = 0; m < s.size(); ++m)
                {
                    for(Index n = m; n < s.size(); ++n)
                    {
                        j(m, n) =
                            theta * scalarProduct(s.front(m), s.front(n))();
                        for(Index k = 0; k < m; ++k)
                        {
                            j(m, n) += sTy(m, k) * sTy(n, k) / sTy(k, k);
                        }
                    }
                }

                j = cholesky<Operator::Hint::UpperTriangular>(move(j));
            }

            if constexpr(IsReference<decltype(x)>())
            {
                return forward<XX>(x0);
            }
            else
            {
                return x;
            }
        }


    private:

        LS const mLineSearch;

        Size const mMaxIterations;

    };


}

#endif // pRC_ALGORITHMS_OPTIMIZER_LBFGSB_H

backward_substitution.hpp

cholesky.hpp

pRC::Deque
Definition deque.hpp:15

pRC::Deque::size
constexpr auto size() const
Definition deque.hpp:28

pRC::Deque::front
constexpr decltype(auto) front(Index const position=0) &&
Definition deque.hpp:33

pRC::Deque::pushBack
constexpr auto pushBack(R const &element) &&
Definition deque.hpp:133

pRC::Deque::emplaceBack
constexpr auto emplaceBack(Args &&...args) &&
Definition deque.hpp:163

pRC::Optimizer::LBFGSB
Definition lbfgsb.hpp:32

pRC::Optimizer::LBFGSB::LBFGSB
constexpr LBFGSB(Size const maxIterations=defaultMaxIterations())
Definition lbfgsb.hpp:456

pRC::Optimizer::LBFGSB::LBFGSB
constexpr LBFGSB(LS const &lineSearch, Size const maxIterations=defaultMaxIterations())
Definition lbfgsb.hpp:449

pRC::Optimizer::LBFGSB::lineSearch
constexpr auto & lineSearch() const
Definition lbfgsb.hpp:461

pRC::Optimizer::LBFGSB::maxIterations
constexpr auto maxIterations() const
Definition lbfgsb.hpp:466

pRC::Optimizer::LBFGSB::operator()
constexpr auto operator()(XX &&x0, FF &&function, FC &&callback, VT const &tolerance=NumericLimits< VT >::tolerance(), XL &&lowerBound=unit< RL >(NumericLimits< VL >::lowest()), XU &&upperBound=unit< RU >(NumericLimits< VU >::max())) const
Definition lbfgsb.hpp:492

pRC::Tensor
Class storing tensors.
Definition tensor.hpp:44

config.hpp

copy.hpp

eval.hpp

norm.hpp

type_traits.hpp

forward_substitution.hpp

logical_or.hpp

more_thuente.hpp

pRC::Logging::info
static void info(Xs &&...args)
Definition log.hpp:27

pRC::Logging::error
static void error(Xs &&...args)
Definition log.hpp:14

pRC::Optimizer
Definition bfgs.hpp:16

pRC::extractDiagonal
static constexpr auto extractDiagonal(X &&a)
Extracts the diagonal of a Tensor.
Definition extract_diagonal.hpp:28

pRC::eval
static constexpr X eval(X &&a)
Definition eval.hpp:11

pRC::min
static constexpr X min(X &&a)
Definition min.hpp:13

pRC::diagonal
static constexpr auto diagonal(X &&a)
Transforms a Tensor into diagonal form.
Definition diagonal.hpp:26

pRC::DebugLevel::Low
@ Low

pRC::DebugLevel::High
@ High

pRC::rcp
static constexpr auto rcp(Complex< T > const &b)
Definition rcp.hpp:13

pRC::cholesky
static constexpr auto cholesky(X &&a)
Definition cholesky.hpp:24

pRC::zero
static constexpr auto zero()
Definition zero.hpp:12

pRC::Size
std::size_t Size
Definition type_traits.hpp:20

pRC::Array
Conditional< IsSatisfied<((Ns *... *1) *sizeof(T) > cHugepageSizeByte)>, HeapArray< T, Ns... >, StackArray< T, Ns... > > Array
Definition type_traits.hpp:60

pRC::transpose
static constexpr auto transpose(JacobiRotation< T > const &a)
Definition jacobi_rotation.hpp:319

pRC::where
static constexpr T where(TE const e, T &&a, T &&b)
Definition where.hpp:12

pRC::sort
static constexpr void sort(C const &compare, T &a, Size const k=T::size(), Size const d=0)
Definition sort.hpp:15

pRC::square
static constexpr auto square(Complex< T > const &a)
Definition square.hpp:14

pRC::strictlyLowerTriangular
static constexpr auto strictlyLowerTriangular(X &&a)
Transforms a Tensor into strictly lower triangular form.
Definition strictly_lower_triangular.hpp:22

pRC::delta
static constexpr auto delta(Complex< TA > const &a, Complex< TB > const &b)
Definition delta.hpp:12

pRC::abs
static constexpr auto abs(Complex< T > const &a)
Definition abs.hpp:12

pRC::copy
static constexpr Conditional< IsSatisfied< C >, RemoveConstReference< X >, X > copy(X &&a)
Definition copy.hpp:13

pRC::cDebugLevel
constexpr auto cDebugLevel
Definition config.hpp:46

pRC::IsReference
std::is_reference< T > IsReference
Definition type_traits.hpp:47

pRC::scalarProduct
static constexpr auto scalarProduct(Complex< TA > const &a, Complex< TB > const &b)
Definition scalar_product.hpp:13

pRC::Index
Size Index
Definition type_traits.hpp:21

pRC::identity
static constexpr auto identity()
Definition identity.hpp:12

pRC::max
static constexpr X max(X &&a)
Definition max.hpp:13

solve.hpp

sort.hpp

pRC::NumericLimits
Definition limits.hpp:13

block.hpp

diagonal.hpp

limits.hpp