// ******************************************************************************
//
// Title:       Force Field X.
// Description: Force Field X - Software for Molecular Biophysics.
// Copyright:   Copyright (c) Michael J. Schnieders 2001-2025.
//
// This file is part of Force Field X.
//
// Force Field X is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 3 as published by
// the Free Software Foundation.
//
// Force Field X is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General Public License along with
// Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
// Place, Suite 330, Boston, MA 02111-1307 USA
//
// Linking this library statically or dynamically with other modules is making a
// combined work based on this library. Thus, the terms and conditions of the
// GNU General Public License cover the whole combination.
//
// As a special exception, the copyright holders of this library give you
// permission to link this library with independent modules to produce an
// executable, regardless of the license terms of these independent modules, and
// to copy and distribute the resulting executable under terms of your choice,
// provided that you also meet, for each linked independent module, the terms
// and conditions of the license of that module. An independent module is a
// module which is not derived from or based on this library. If you modify this
// library, you may extend this exception to your version of the library, but
// you are not obligated to do so. If you do not wish to do so, delete this
// exception statement from your version.
//
// ******************************************************************************
package ffx.numerics.optimization;

import ffx.numerics.OptimizationInterface;
import ffx.numerics.optimization.LineSearch.LineSearchResult;

import javax.annotation.Nullable;
import java.util.logging.Logger;

import static java.lang.Double.isInfinite;
import static java.lang.Double.isNaN;
import static java.lang.Math.fma;
import static java.lang.String.format;
import static java.lang.System.arraycopy;
import static java.util.Arrays.fill;
import static org.apache.commons.math3.util.FastMath.abs;
import static org.apache.commons.math3.util.FastMath.min;
import static org.apache.commons.math3.util.FastMath.sqrt;

/**
 * This class implements the limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) algorithm for
 * large-scale multidimensional unconstrained optimization problems.
 *
 * <p>The L-BFGS algorithm is a quasi-Newton method that approximates the Broyden–Fletcher–Goldfarb–Shanno
 * (BFGS) algorithm using a limited amount of computer memory. It is particularly well suited for
 * optimization problems with a large number of variables.
 *
 * <p>Rather than forming the dense approximate inverse Hessian of the original BFGS method, L-BFGS
 * stores only a few correction vectors that represent the approximation implicitly, reducing the
 * memory requirement from O(n^2) to O(m*n) for m saved corrections.
 *
 * <p>This implementation uses a line search procedure to ensure sufficient decrease in the objective
 * function and to maintain positive definiteness of the Hessian approximation.
 *
 * @author Michael J. Schnieders
 * <br> Derived from:
 * <br> Robert Dodier's Java translation of original FORTRAN code by Jorge Nocedal.
 * <br> J. Nocedal, "Updating Quasi-Newton Matrices with Limited Storage", Mathematics of Computation, 35, 773-782 (1980)
 * <br> D. C. Liu and J. Nocedal, "On the Limited Memory BFGS Method for Large Scale Optimization", Mathematical Programming, 45, 503-528 (1989)
 * <br> J. Nocedal and S. J. Wright, "Numerical Optimization", Springer-Verlag, New York, 1999, Section 9.1
 * @since 1.0
 */
public class LBFGS {

  /**
   * Controls the accuracy of the line search.
   * <p>
   * If the function and gradient evaluations are inexpensive with respect to the cost of the
   * iteration (which is sometimes the case when solving very large problems) it may be advantageous
   * to set <code>CAPPA</code> to a small value.
   * <p>
   * A typical small value is 0.1.
   * <p>
   * Restriction: <code>CAPPA</code> should be greater than 1e-4.
   */
  public static final double DEFAULT_CAPPA = 0.9;

  /**
   * This specifies the default lower bound for the step in the line search.
   * <p>
   * The default value of 1.0e-16 does not need to be modified unless the problem is extremely badly
   * scaled (in which case the exponent should be increased).
   */
  public static final double DEFAULT_STEPMIN = 1.0e-16;

  /**
   * This specifies the default upper bound for the step in the line search.
   * <p>
   * The default value of 5.0e0 does not need to be modified unless the problem is extremely badly
   * scaled (in which case the exponent should be increased).
   */
  public static final double DEFAULT_STEPMAX = 5.0e0;

  /**
   * The default projected gradient above which step size is reduced.
   */
  public static final double DEFAULT_SLOPEMAX = 1.0e4;

  /**
   * The default maximum angle between search direction and gradient.
   */
  public static final double DEFAULT_ANGLEMAX = 180.0;

  /**
   * The default maximum number of interpolations during line search.
   */
  public static final int DEFAULT_INTMAX = 5;

  private static final Logger logger = Logger.getLogger(LBFGS.class.getName());

  /**
   * Make the constructor private so that LBFGS cannot be instantiated.
   */
  private LBFGS() {
    // Private constructor to prevent instantiation.
  }

  /**
   * This method solves the unconstrained minimization problem
   *
   * <pre>
   *     min f(x),    x = (x1,x2,...,x_n),
   * </pre>
   * <p>
   * using the limited-memory BFGS method. The routine is especially effective on problems involving
   * a large number of variables. In a typical iteration of this method an approximation <code>Hk
   * </code> to the inverse of the Hessian is obtained by applying <code>m</code> BFGS updates to a
   * diagonal matrix <code>Hk0</code>, using information from the previous <code>m</code> steps.
   *
   * <p>The user specifies the number <code>mSave</code>, which determines the amount of storage
   * required by the routine. This is the number of previous steps that will be used to approximate
   * the Hessian. Larger values of <code>mSave</code> can lead to better convergence but require more
   * memory and computation per iteration.
   *
   * <p>The algorithm works as follows:
   * <ol>
   *   <li>Initialize with the current point, function value, and gradient</li>
   *   <li>Compute a search direction using the L-BFGS approximation of the inverse Hessian</li>
   *   <li>Perform a line search along this direction to find a new point with sufficient decrease in the function value</li>
   *   <li>Update the L-BFGS approximation using the new point and gradient</li>
   *   <li>Repeat until convergence or the maximum number of iterations is reached</li>
   * </ol>
   *
   * <p>The step length is determined at each iteration by means of the line search routine <code>
   * lineSearch</code>, which is a slight modification of the routine <code>CSRCH</code> written by
   * Moré and Thuente. This ensures that the function value decreases sufficiently and that the
   * Hessian approximation remains positive definite.
   *
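   * <p>A minimal usage sketch is shown below. <code>MyPotential</code> is a hypothetical class,
   * and the accessor methods shown ({@code getNumberOfVariables()}, {@code getCoordinates(double[])},
   * and {@code energyAndGradient(double[], double[])}) are assumed to be available on the interface:
   * <pre>{@code
   * // MyPotential is a hypothetical implementation of OptimizationInterface.
   * OptimizationInterface potential = new MyPotential();
   * int n = potential.getNumberOfVariables();
   * double[] x = potential.getCoordinates(new double[n]);
   * double[] g = new double[n];
   * double f = potential.energyAndGradient(x, g);
   * // Minimize to an RMS gradient of 1.0e-4 using 5 saved corrections and no listener.
   * int status = LBFGS.minimize(n, 5, x, f, g, 1.0e-4, 1000, potential, null);
   * }</pre>
   *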
   * @param n             The number of variables in the minimization problem. Must be positive.
   * @param mSave         The number of corrections used in the BFGS update. Values of <code>mSave</code>
   *                      less than 3 are not recommended; large values of <code>mSave</code> will result in excessive
   *                      computing time. <code>3 &lt;= mSave &lt;= 7</code> is recommended.
   *                      Must be non-negative. If <code>mSave</code> is 0, the method will use steepest descent.
   * @param x             On initial entry this must be set by the user to the values of the initial estimate
   *                      of the solution vector. On exit, it contains the values of the variables at the best point
   *                      found (usually a solution). The array must have length at least <code>n</code>.
   * @param f             The value of the function <code>f</code> at the point <code>x</code>.
   * @param g             The components of the gradient <code>g</code> at the point <code>x</code>.
   *                      On exit, it contains the gradient at the final point. The array must have length at least <code>n</code>.
   * @param eps           Determines the accuracy with which the solution is to be found. The subroutine
   *                      terminates when <code>G RMS &lt; EPS</code>. Should be positive.
   * @param maxIterations Maximum number of optimization steps. Must be positive.
   * @param potential     Implements the {@link OptimizationInterface} to supply function
   *                      values and gradients. Cannot be null.
   * @param listener      Implements the {@link OptimizationListener} interface and will be notified
   *                      after each successful step. Can be null, in which case progress will be logged
   *                      but no callbacks will be made.
   * @return status code:
   * <ul>
   *   <li>0 = success (convergence achieved)</li>
   *   <li>1 = maximum iterations reached without convergence, or the listener requested termination</li>
   *   <li>-1 = optimization failed (e.g., line search failure, invalid inputs)</li>
   * </ul>
   * @since 1.0
   */
  public static int minimize(final int n, int mSave, final double[] x, double f, double[] g,
                             final double eps, final int maxIterations, OptimizationInterface potential,
                             @Nullable OptimizationListener listener) {

    // Validate input parameters with explicit checks instead of assertions.
    if (n <= 0) {
      logger.severe("Number of variables must be positive.");
      return -1;
    }
    if (mSave < 0) {
      logger.severe("Number of correction vectors must be non-negative.");
      return -1;
    }
    if (maxIterations <= 0) {
      logger.severe("Maximum number of iterations must be positive.");
      return -1;
    }
    if (x == null || x.length < n) {
      logger.severe("Coordinate array is null or too small.");
      return -1;
    }
    if (g == null || g.length < n) {
      logger.severe("Gradient array is null or too small.");
      return -1;
    }
    if (potential == null) {
      logger.severe("Potential interface cannot be null.");
      return -1;
    }
    if (eps <= 0.0) {
      logger.warning("Convergence criterion (eps) should be positive.");
    }

    if (mSave > n) {
      logger.fine(format(" Resetting the number of saved L-BFGS vectors to %d.", n));
      mSave = n;
    }

    int iterations = 0;
    int evaluations = 1;
    int nErrors = 0;
    int maxErrors = 2;

    double rms = sqrt(n);
    double[] scaling = potential.getScaling();
    if (scaling == null) {
      scaling = new double[n];
      fill(scaling, 1.0);
    }

    // The initial search direction is the steepest descent direction.
    double[][] s = new double[mSave][n];
    double[][] y = new double[mSave][n];
    if (mSave > 0) {
      for (int i = 0; i < n; i++) {
        s[0][i] = -g[i];
      }
    }

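    // Compute the 2-norm of the unscaled gradient and the RMS gradient over the scaled
    // variables; the latter is the convergence measure compared against eps.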
    double grms = 0.0;
    double gnorm = 0.0;
    for (int i = 0; i < n; i++) {
      double gi = g[i];
      if (isNaN(gi) || isInfinite(gi)) {
        logger.warning(format("The gradient of variable %d is %8.3f.", i, gi));
        return -1;
      }
      double gis = gi * scaling[i];
      gnorm += gi * gi;
      grms += gis * gis;
    }
    gnorm = sqrt(gnorm);
    grms = sqrt(grms) / rms;

    // Notify the listeners of initial conditions.
    if (listener != null) {
      if (!listener.optimizationUpdate(iterations, mSave, evaluations, grms, 0.0, f, 0.0, 0.0, null)) {
        // Terminate the optimization.
        return 1;
      }
    } else {
      log(iterations, evaluations, grms, 0.0, f, 0.0, 0.0, null);
    }

    // The convergence criteria may already be satisfied.
    if (grms <= eps) {
      return 0;
    }

    final double[] prevX = new double[n];
    final double[] prevG = new double[n];
    final double[] r = new double[n];
    final double[] p = new double[n];
    final double[] h0 = new double[n];
    final double[] q = new double[n];
    final double[] alpha = new double[mSave];
    final double[] rho = new double[mSave];
    double gamma = 1.0;

    // Line search parameters.
    final LineSearch lineSearch = new LineSearch(n);
    final LineSearchResult[] info = {LineSearchResult.Success};
    final int[] nFunctionEvals = {0};
    final double[] angle = {0.0};
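    // Initial estimate of the expected function decrease, passed to the line search;
    // after each iteration it is replaced by the actual decrease (prevF - f).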
    double df = 0.5 * DEFAULT_STEPMAX * gnorm;
    int m = -1;

    while (true) {
      iterations++;
      if (iterations > maxIterations) {
        logger.info(format(" Maximum number of iterations reached: %d.", maxIterations));
        return 1;
      }

      if (mSave > 0) {
        int muse = min(iterations - 1, mSave);
        m++;
        if (m > mSave - 1) {
          m = 0;
        }

        // Estimate the diagonal of the initial inverse Hessian as gamma times the identity.
        fill(h0, gamma);
        arraycopy(g, 0, q, 0, n);
        int k = m;
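        // First loop of the L-BFGS two-loop recursion: walk backward through the stored
        // corrections, computing alpha[k] = rho[k] * (s[k] . q) and updating q -= alpha[k] * y[k].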
        for (int j = 0; j < muse; j++) {
          k--;
          if (k < 0) {
            k = mSave - 1;
          }
          alpha[k] = v1DotV2(n, s[k], 0, 1, q, 0, 1);
          alpha[k] *= rho[k];
          aV1PlusV2(n, -alpha[k], y[k], 0, 1, q, 0, 1);
        }
        for (int i = 0; i < n; i++) {
          r[i] = h0[i] * q[i];
        }
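        // Second loop: walk forward through the corrections, computing beta = rho[k] * (y[k] . r)
        // and updating r += (alpha[k] - beta) * s[k] to complete the inverse Hessian product.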
        for (int j = 0; j < muse; j++) {
          double beta = v1DotV2(n, r, 0, 1, y[k], 0, 1);
          beta *= rho[k];
          aV1PlusV2(n, alpha[k] - beta, s[k], 0, 1, r, 0, 1);
          k++;
          if (k > mSave - 1) {
            k = 0;
          }
        }

        // Set the search direction.
        for (int i = 0; i < n; i++) {
          p[i] = -r[i];
        }
      } else {
        // Steepest descent.
        for (int i = 0; i < n; i++) {
          p[i] = -g[i];
        }
      }

      arraycopy(x, 0, prevX, 0, n);
      arraycopy(g, 0, prevG, 0, n);

      // Perform the line search along the new search direction.
      nFunctionEvals[0] = 0;
      double prevF = f;
      f = lineSearch.search(n, x, f, g, p, angle, df, info, nFunctionEvals, potential);
      evaluations += nFunctionEvals[0];

      // Check for NaN or infinite gradient values after the line search.
      for (int i = 0; i < n; i++) {
        if (isNaN(g[i]) || isInfinite(g[i])) {
          logger.warning(format("The gradient of variable %d is %8.3f after line search. Terminating optimization.", i, g[i]));
          return -1;
        }
      }

      // Update variables based on the results of this iteration.
      if (mSave > 0) {
        for (int i = 0; i < n; i++) {
          s[m][i] = x[i] - prevX[i];
          y[m][i] = g[i] - prevG[i];
        }
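        // ys is the curvature along the step; its reciprocal becomes rho[m], and
        // gamma = |ys / yy| scales the initial inverse Hessian for the next iteration.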
        double ys = v1DotV2(n, y[m], 0, 1, s[m], 0, 1);
        double yy = v1DotV2(n, y[m], 0, 1, y[m], 0, 1);
        gamma = abs(ys / yy);
        rho[m] = 1.0 / ys;
      }

      // Get the sizes of the moves made during this iteration.
      df = prevF - f;
      double xrms = 0.0;
      grms = 0.0;
      for (int i = 0; i < n; i++) {
        double dx = (x[i] - prevX[i]) / scaling[i];
        xrms += dx * dx;
        double gx = g[i] * scaling[i];
        grms += gx * gx;
      }
      xrms = sqrt(xrms) / rms;
      grms = sqrt(grms) / rms;

      boolean done = false;
      if (info[0] == LineSearchResult.BadIntpln || info[0] == LineSearchResult.IntplnErr) {
        nErrors++;
        if (nErrors >= maxErrors) {
          done = true;
        }
      } else {
        nErrors = 0;
      }

      if (listener != null) {
        if (!listener.optimizationUpdate(iterations, mSave, evaluations, grms, xrms, f, df, angle[0],
            info[0])) {
          // Terminate the optimization.
          return 1;
        }
      } else {
        log(iterations, evaluations, grms, xrms, f, df, angle[0], info[0]);
      }

      /*
       Terminate the optimization if the line search failed or upon
       satisfying the convergence criteria.
      */
      if (done) {
        return -1;
      } else if (grms <= eps) {
        return 0;
      }
    }
  }

  /**
   * This method solves the unconstrained minimization problem
   *
   * <pre>
   *     min f(x),    x = (x1,x2,...,x_n),
   * </pre>
   * <p>
   * using the limited-memory BFGS method. The routine is especially effective on problems involving
   * a large number of variables. In a typical iteration of this method an approximation <code>Hk
   * </code> to the inverse of the Hessian is obtained by applying <code>m</code> BFGS updates to a
   * diagonal matrix <code>Hk0</code>, using information from the previous <code>m</code> steps.
   *
   * <p>The user specifies the number <code>m</code>, which determines the amount of storage
   * required by the routine.
   *
   * <p>The user is required to calculate the function value <code>f</code> and its gradient <code>g
   * </code>.
   *
   * <p>The step length is determined at each iteration by means of the line search routine <code>
   * lineSearch</code>, which is a slight modification of the routine <code>CSRCH</code> written by
   * Moré and Thuente.
   *
   * <p>This convenience method imposes no effective limit on the number of iterations.
   *
   * @param n         The number of variables in the minimization problem. Restriction: <code>n &gt; 0
   *                  </code>.
   * @param mSave     The number of corrections used in the BFGS update. Values of <code>mSave</code>
   *                  less than 3 are not recommended; large values of <code>mSave</code> will result in excessive
   *                  computing time. <code>3 &lt;= mSave &lt;= 7</code> is recommended. Restriction:
   *                  <code>mSave &gt;= 0</code>.
   * @param x         On initial entry this must be set by the user to the values of the initial estimate
   *                  of the solution vector. On exit, it contains the values of the variables at the best point
   *                  found (usually a solution).
   * @param f         The value of the function <code>f</code> at the point <code>x</code>.
   * @param g         The components of the gradient <code>g</code> at the point <code>x</code>.
   * @param eps       Determines the accuracy with which the solution is to be found. The subroutine
   *                  terminates when <code>G RMS &lt; EPS</code>.
   * @param potential Implements the {@link OptimizationInterface} to supply function
   *                  values and gradients.
   * @param listener  Implements the {@link OptimizationListener} interface and will be notified
   *                  after each successful step.
   * @return status code (0 = success, 1 = terminated without convergence, -1 = failed)
   * @since 1.0
   */
  public static int minimize(int n, int mSave, double[] x, double f, double[] g, double eps,
                             OptimizationInterface potential, OptimizationListener listener) {
    return minimize(n, mSave, x, f, g, eps, Integer.MAX_VALUE - 1, potential, listener);
  }

  /**
   * Compute the sum of a vector times a scalar plus another vector.
   * <p>
   * This operation computes: v2 = a * v1 + v2
   * <p>
   * The method is optimized for the common case where both step sizes are 1 and
   * both start indices are 0, which is the most frequent usage pattern in the LBFGS algorithm.
   *
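   * <p>For example, with illustrative values (equivalent to the BLAS <code>daxpy</code> operation):
   * <pre>{@code
   * double[] v1 = {1.0, 2.0, 3.0};
   * double[] v2 = {4.0, 5.0, 6.0};
   * // v2 becomes {6.0, 9.0, 12.0}: v2[i] = 2.0 * v1[i] + v2[i].
   * aV1PlusV2(3, 2.0, v1, 0, 1, v2, 0, 1);
   * }</pre>
   *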
   * @param n       The number of points.
   * @param a       The scalar.
   * @param v1      The X array.
   * @param v1Start The first point in the X array.
   * @param v1Step  The X array increment.
   * @param v2      The Y array.
   * @param v2Start The first point in the Y array.
   * @param v2Step  The Y array increment.
   * @since 1.0
   */
  static void aV1PlusV2(final int n, final double a, final double[] v1, final int v1Start,
                        final int v1Step, final double[] v2, final int v2Start, final int v2Step) {
    /*
     Require the number of entries (n) to be greater than zero. If the
     scalar (a) is zero, then the v2 array is unchanged.
    */
    if (n <= 0 || a == 0) {
      return;
    }

    // Optimize for the common case where both step sizes are 1 and both start indices are 0.
    if (v1Step == 1 && v2Step == 1 && v1Start == 0 && v2Start == 0) {
      for (int i = 0; i < n; i++) {
        v2[i] = fma(a, v1[i], v2[i]);
      }
      return;
    }

    // General case with arbitrary step sizes and start indices.
    int stop = v1Start + v1Step * n;
    for (int i = v1Start, j = v2Start; i != stop; i += v1Step, j += v2Step) {
      v2[j] = fma(a, v1[i], v2[j]);
    }
  }

  /**
   * Print status messages for <code>LBFGS</code> if there is no listener.
   *
   * @param iter  Number of iterations so far.
   * @param nfun  Number of function evaluations so far.
   * @param grms  Gradient RMS at current solution.
   * @param xrms  Coordinate change RMS at current solution.
   * @param f     Function value at current solution.
   * @param df    Change in the function value compared to the previous solution.
   * @param angle Current angle between gradient and search direction.
   * @param info  Result of the most recent line search, or null on the initial call.
   * @since 1.0
   */
  private static void log(int iter, int nfun, double grms, double xrms, double f, double df,
                          double angle, LineSearchResult info) {
    if (iter == 0) {
      logger.info("\n Limited Memory BFGS Quasi-Newton Optimization: \n");
      logger.info(
          " QN Iter    F Value      G RMS     F Move    X Move    Angle  FG Call  Comment\n");
    }
    if (info == null) {
      logger.info(
          format("%6d%13.4f%11.4f%11.4f%10.4f%9.2f%7d", iter, f, grms, df, xrms, angle, nfun));
    } else {
      logger.info(
          format("%6d%13.4f%11.4f%11.4f%10.4f%9.2f%7d   %8s", iter, f, grms, df, xrms, angle, nfun,
              info));
    }
  }

  /**
   * Compute the dot product of two vectors.
   * <p>
   * This operation computes: sum(v1[i] * v2[i]) for i = 0 to n-1
   * <p>
   * The method is optimized for the common case where both step sizes are 1 and
   * both start indices are 0, which is the most frequent usage pattern in the LBFGS algorithm.
   *
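   * <p>For example, with illustrative values:
   * <pre>{@code
   * double[] v1 = {1.0, 2.0, 3.0};
   * double[] v2 = {4.0, 5.0, 6.0};
   * // Returns 32.0: 1*4 + 2*5 + 3*6.
   * double dot = v1DotV2(3, v1, 0, 1, v2, 0, 1);
   * }</pre>
   *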
   * @param n       Number of entries to include.
   * @param v1      The X array.
   * @param v1Start The first point in the X array.
   * @param v1Step  The X array increment.
   * @param v2      The Y array.
   * @param v2Start The first point in the Y array.
   * @param v2Step  The Y array increment.
   * @return dot product
   * @since 1.0
   */
  static double v1DotV2(final int n, final double[] v1, final int v1Start, final int v1Step,
                        final double[] v2, final int v2Start, final int v2Step) {

    // Require the number of entries to be greater than zero.
    if (n <= 0) {
      return 0;
    }

    // Optimize for the common case where both step sizes are 1 and both start indices are 0.
    if (v1Step == 1 && v2Step == 1 && v1Start == 0 && v2Start == 0) {
      double sum = 0.0;
      for (int i = 0; i < n; i++) {
        sum = fma(v1[i], v2[i], sum);
      }
      return sum;
    }

    // General case with arbitrary step sizes and start indices.
    double sum = 0.0;
    int stop = v1Start + v1Step * n;
    for (int i = v1Start, j = v2Start; i != stop; i += v1Step, j += v2Step) {
      sum = fma(v1[i], v2[j], sum);
    }
    return sum;
  }
}