View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2024.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.math;
39  
40  import static java.lang.String.format;
41  import static java.util.Arrays.fill;
42  import static org.apache.commons.math3.util.FastMath.abs;
43  import static org.apache.commons.math3.util.FastMath.max;
44  import static org.apache.commons.math3.util.FastMath.min;
45  import static org.apache.commons.math3.util.FastMath.sqrt;
46  
47  import org.apache.commons.math3.distribution.TDistribution;
48  
49  import javax.annotation.Nullable;
50  
51  /**
52   * The SummaryStatistics class uses online, stable algorithms to calculate summary statistics from
53   * double arrays/lists, including mean, variance, standard deviation, max, min, sum, and count.
54   *
55   * <p>This is intended for accuracy and numerical stability, not necessarily for performance (e.g.
56   * using Kahan summation).
57   *
58   * @author Michael J. Schnieders
59   * @author Jacob M. Litman
60   * @since 1.0
61   */
62  public class SummaryStatistics {
63  
64    // Weight-sensitive values.
65    public final double mean;
66    public final double var;
67    public final double varPopulation;
68    public final double sd;
69    public final double sdPopulation;
70    public final double sumWeights;
71    // Weight-insensitive values.
72    public final double min;
73    public final double max;
74    public final long count;
75    public final double sum;
76    public final long dof;
77    private final TDistribution tDist;
78    private final String descString;
79  
80    /**
81     * Builds a static view of a running statistic.
82     *
83     * @param rs Running statistic.
84     */
85    public SummaryStatistics(RunningStatistics rs) {
86      mean = rs.getMean();
87      var = rs.getVariance();
88      varPopulation = rs.getPopulationVariance();
89      sumWeights = rs.getWeight();
90      min = rs.getMin();
91      max = rs.getMax();
92      count = rs.getCount();
93      sum = rs.getSum();
94      dof = rs.getDOF();
95      tDist = (dof > 0) ? new TDistribution(dof) : null;
96      sd = rs.getStandardDeviation();
97      sdPopulation = rs.getPopulationStandardDeviation();
98      descString =
99          format(
100             " Summary of %d observations: sum is %17.14g, mean is %17.14g, min is %17.14g, "
101                 + "max is %17.14g, and the sum of weights is %17.14g"
102                 + "\nSample standard deviation: %17.14g (dof = %d)"
103                 + "\nPopulation standard deviation: %17.14g (dof = %d)",
104             count, sum, mean, min, max, sumWeights, sd, dof, sdPopulation, count);
105   }
106 
107   /**
108    * Constructs a static summary of a statistic from provided values. Assumes weights are all
109    * constant (1.0). Assumes all values will be used.
110    *
111    * @param values Values to summarize.
112    */
113   public SummaryStatistics(double[] values) {
114     this(values, null, 0, values.length, 1);
115   }
116 
117   /**
118    * Constructs a static summary of a statistic from provided values. Assumes weights are all
119    * constant (1.0). Assumes all values from first to end will be used.
120    *
121    * @param values Values to summarize.
122    * @param first  First value to use.
123    */
124   public SummaryStatistics(double[] values, int first) {
125     this(values, null, first, values.length, 1);
126   }
127 
128   /**
129    * Constructs a static summary of a statistic from provided values. Assumes weights are all
130    * constant (1.0). Assumes a stride of 1.
131    *
132    * @param values Values to summarize.
133    * @param first  First value to use.
134    * @param last   Last value to use.
135    */
136   public SummaryStatistics(double[] values, int first, int last) {
137     this(values, null, first, last, 1);
138   }
139 
140   /**
141    * Constructs a static summary of a statistic from provided values. Assumes weights are all
142    * constant (1.0).
143    *
144    * @param values Values to summarize.
145    * @param first  First value to use.
146    * @param last   Last value to use.
147    * @param stride Stride between values used.
148    */
149   public SummaryStatistics(double[] values, int first, int last, int stride) {
150     this(values, null, first, last, stride);
151   }
152 
153   /**
154    * Constructs a static summary of a statistic from provided values.
155    *
156    * @param values  Values to summarize.
157    * @param weights Weights for each value.
158    * @param first   First value to use.
159    * @param last    Last value to use.
160    * @param stride  Stride between values used.
161    */
162   public SummaryStatistics(@Nullable double[] values, @Nullable double[] weights, int first, int last, int stride) {
163     if (values == null) {
164       throw new IllegalArgumentException(" Cannot have null values!");
165     }
166     int nVals = values.length;
167 
168     if (first < 0 || first > (nVals - 1)) {
169       throw new IllegalArgumentException(
170           format(" First entry %d was not in valid range 0-%d (0 to length of values - 1)",
171               first, nVals - 1));
172     }
173     if (last <= first || last > nVals) {
174       throw new IllegalArgumentException(
175           format(" Last entry %d was not in valid range %d-%d (first+1 to length of values",
176               last, (first + 1), nVals));
177     }
178 
179     if (weights == null) {
180       weights = new double[nVals];
181       fill(weights, 1.0);
182     }
183 
184     int tempCount = (last - first);
185     if (tempCount % stride == 0) {
186       count = tempCount / stride;
187     } else {
188       count = (tempCount / stride) + 1;
189     }
190     assert count > 0;
191 
192     if (count == 1) {
193       mean = values[first];
194       var = Double.NaN;
195       varPopulation = 0;
196       sd = Double.NaN;
197       sdPopulation = 0;
198       min = mean;
199       max = mean;
200       sum = mean;
201       sumWeights = weights[first];
202       dof = 0;
203       tDist = null;
204 
205       descString = format(" Summary of single observation: value is %17.14g", mean);
206     } else {
207       double meanAcc = 0;
208       double varAcc = 0;
209       double minAcc = Double.MAX_VALUE;
210       double maxAcc = Double.MIN_VALUE;
211       double sumAcc = 0;
212       double comp = 0;
213       double weightAcc = 0;
214 
215       for (int i = 0; i < count; i++) {
216         int ii = first + (i * stride);
217         assert ii < last;
218         double val = values[ii];
219         double priorMean = meanAcc;
220         double weight = weights[ii];
221         weightAcc += weight;
222 
223         double y = val - comp;
224         double t = sumAcc + y;
225         comp = (t - sumAcc) - y;
226         sumAcc = t;
227 
228         minAcc = min(minAcc, val);
229         maxAcc = max(maxAcc, val);
230 
231         double invCount = weight / weightAcc;
232         meanAcc += ((val - meanAcc) * invCount);
233         // TODO: Check correctness of variance accumulation when weight != 1.0.
234         varAcc += ((val - priorMean) * (val - meanAcc)) * weight;
235       }
236 
237       min = minAcc;
238       max = maxAcc;
239       mean = meanAcc;
240       sum = sumAcc;
241       sumWeights = weightAcc;
242       // Mean via Kahan summation and online mean estimation should be pretty close to each other.
243       assert abs(mean - (sum / count)) < 1.0E-11;
244       varPopulation = varAcc / count;
245       sdPopulation = sqrt(varPopulation);
246       dof = count - 1;
247       var = varAcc / dof;
248       sd = sqrt(var);
249       tDist = new TDistribution(dof);
250 
251       descString =
252           format(
253               " Summary of %d observations: sum is %17.14g, mean is %17.14g, min is %17.14g, "
254                   + "max is %17.14g, and the sum of weights is %17.14g"
255                   + "\nSample standard deviation: %17.14g (dof = %d)"
256                   + "\nPopulation standard deviation: %17.14g (dof = %d)",
257               count, sum, mean, min, max, sumWeights, sd, dof, sdPopulation, count);
258     }
259   }
260 
261   /**
262    * Computes a 95% confidence interval based on a Student's T-distribution.
263    *
264    * @return 95% confidence interval.
265    */
266   public double confidenceInterval() {
267     return confidenceInterval(0.05);
268   }
269 
270   /**
271    * Computes a confidence interval based on a Student's T-distribution.
272    *
273    * @param alpha Alpha (e.g. 0.05 for a 95% CI).
274    * @return Confidence interval.
275    */
276   public double confidenceInterval(double alpha) {
277     if (dof == 0) {
278       throw new IllegalArgumentException(
279           " Cannot calculate confidence intervals when there are no degrees of freedom!");
280     }
281     double critVal = tDist.inverseCumulativeProbability(0.5 * (1.0 - alpha));
282     return critVal * sd / sqrt(count);
283   }
284 
285   /**
286    * The mean.
287    *
288    * @return Return the mean.
289    */
290   public double getMean() {
291     return mean;
292   }
293 
294   /**
295    * The standard deviation.
296    *
297    * @return Return the standard deviation.
298    */
299   public double getSd() {
300     return sd;
301   }
302 
303   /**
304    * The variance.
305    *
306    * @return Return the variance.
307    */
308   public double getVar() {
309     return var;
310   }
311 
312   /**
313    * ${@inheritDoc}
314    */
315   @Override
316   public String toString() {
317     return descString;
318   }
319 
320   /**
321    * Describe the Summary Statistics.
322    *
323    * @return Return the description.
324    */
325   public String describe() {
326     return format(" Mean: %12.6f +/-%12.6f, Min/Max: %12.6f/%12.6f", mean, sd, min, max);
327   }
328 }