View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2026.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  import jdk.incubator.vector.VectorSpecies;
42  
43  import static jdk.incubator.vector.DoubleVector.SPECIES_128;
44  import static jdk.incubator.vector.DoubleVector.SPECIES_256;
45  import static jdk.incubator.vector.DoubleVector.SPECIES_512;
46  import static jdk.incubator.vector.DoubleVector.broadcast;
47  import static jdk.incubator.vector.DoubleVector.fromArray;
48  
49  /**
50   * The MixedRadixFactor4 class handles factors of 4 in the FFT.
51   */
52  public class MixedRadixFactor4 extends MixedRadixFactor {
53  
54    /**
55     * Available SIMD sizes for Pass 4.
56     */
57    private static int[] simdSizes = {8, 4, 2};
58  
59    private final int di2;
60    private final int di3;
61    private final int dj2;
62    private final int dj3;
63  
64    /**
65     * Construct a MixedRadixFactor4.
66     *
67     * @param passConstants PassConstants.
68     */
69    public MixedRadixFactor4(PassConstants passConstants) {
70      super(passConstants);
71      di2 = 2 * di;
72      di3 = 3 * di;
73      dj2 = 2 * dj;
74      dj3 = 3 * dj;
75    }
76  
77    /**
78     * Check if the requested SIMD length is valid.
79     *
80     * @param width Requested SIMD species width.
81     * @return True if this width is supported.
82     */
83    @Override
84    public boolean isValidSIMDWidth(int width) {
85      // Must be a supported width.
86      if (width != 2 && width != 4 && width != 8) {
87        return false;
88      }
89      if (im == 1) {
90        // Interleaved
91        return innerLoopLimit % (width / 2) == 0;
92      } else {
93        // Blocked
94        return innerLoopLimit % width == 0;
95      }
96    }
97  
98    /**
99     * Determine the optimal SIMD width. Currently supported widths are 2, 4 and 8.
100    * If no SIMD width is valid, return 0 to indicate use of the scalar path.
101    *
102    * @return The optimal SIMD width.
103    */
104   public int getOptimalSIMDWidth() {
105     // Check the platform specific preferred width.
106     if (isValidSIMDWidth(LENGTH)) {
107       return LENGTH;
108     }
109     // Fall back to a smaller SIMD vector that fits the inner loop limit.
110     for (int size : simdSizes) {
111       if (size >= LENGTH) {
112         // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
113         continue;
114       }
115       if (isValidSIMDWidth(size)) {
116         return size;
117       }
118     }
119     // No valid SIMD width is found.
120     return 0;
121   }
122 
123   /**
124    * Handle factors of 4.
125    */
126   @Override
127   protected void passScalar(PassData passData) {
128     final double[] data = passData.in;
129     final double[] ret = passData.out;
130     int sign = passData.sign;
131     int i = passData.inOffset;
132     int j = passData.outOffset;
133     // First pass of the 4-point FFT has no twiddle factors.
134     for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
135       final double z0_r = data[i];
136       final double z1_r = data[i + di];
137       final double z2_r = data[i + di2];
138       final double z3_r = data[i + di3];
139       final double z0_i = data[i + im];
140       final double z1_i = data[i + di + im];
141       final double z2_i = data[i + di2 + im];
142       final double z3_i = data[i + di3 + im];
143       final double t1_r = z0_r + z2_r;
144       final double t1_i = z0_i + z2_i;
145       final double t2_r = z1_r + z3_r;
146       final double t2_i = z1_i + z3_i;
147       final double t3_r = z0_r - z2_r;
148       final double t3_i = z0_i - z2_i;
149       final double t4_r = sign * (z1_r - z3_r);
150       final double t4_i = sign * (z1_i - z3_i);
151       ret[j] = t1_r + t2_r;
152       ret[j + im] = t1_i + t2_i;
153       ret[j + dj] = t3_r - t4_i;
154       ret[j + dj + im] = t3_i + t4_r;
155       ret[j + dj2] = t1_r - t2_r;
156       ret[j + dj2 + im] = t1_i - t2_i;
157       ret[j + dj3] = t3_r + t4_i;
158       ret[j + dj3 + im] = t3_i - t4_r;
159     }
160     j += jstep;
161     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
162       final int index = k * 3;
163       final double w1_r = wr[index];
164       final double w2_r = wr[index + 1];
165       final double w3_r = wr[index + 2];
166       final double w1_i = -sign * wi[index];
167       final double w2_i = -sign * wi[index + 1];
168       final double w3_i = -sign * wi[index + 2];
169       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
170         final double z0_r = data[i];
171         final double z1_r = data[i + di];
172         final double z2_r = data[i + di2];
173         final double z3_r = data[i + di3];
174         final double z0_i = data[i + im];
175         final double z1_i = data[i + di + im];
176         final double z2_i = data[i + di2 + im];
177         final double z3_i = data[i + di3 + im];
178         final double t1_r = z0_r + z2_r;
179         final double t1_i = z0_i + z2_i;
180         final double t2_r = z1_r + z3_r;
181         final double t2_i = z1_i + z3_i;
182         final double t3_r = z0_r - z2_r;
183         final double t3_i = z0_i - z2_i;
184         final double t4_r = sign * (z1_r - z3_r);
185         final double t4_i = sign * (z1_i - z3_i);
186         ret[j] = t1_r + t2_r;
187         ret[j + im] = t1_i + t2_i;
188         multiplyAndStore(t3_r - t4_i, t3_i + t4_r, w1_r, w1_i, ret, j + dj, j + dj + im);
189         multiplyAndStore(t1_r - t2_r, t1_i - t2_i, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
190         multiplyAndStore(t3_r + t4_i, t3_i - t4_r, w3_r, w3_i, ret, j + dj3, j + dj3 + im);
191       }
192     }
193   }
194 
195   /**
196    * Handle factors of 4 using SIMD vectors.
197    *
198    * @param passData The pass data.
199    */
200   @Override
201   protected void passSIMD(PassData passData) {
202     if (!isValidSIMDWidth(simdWidth)) {
203       passScalar(passData);
204     } else {
205       if (im == 1) {
206         interleaved(passData, simdWidth);
207       } else {
208         blocked(passData, simdWidth);
209       }
210     }
211   }
212 
213   /**
214    * Handle factors of 4 using the chosen SIMD vector.
215    *
216    * @param passData   The pass data.
217    * @param simdLength The SIMD vector length.
218    */
219   private void interleaved(PassData passData, int simdLength) {
220     // Use the preferred SIMD vector.
221     switch (simdLength) {
222       case 2:
223         // 1 complex number per loop iteration.
224         interleaved128(passData);
225         break;
226       case 4:
227         // 2 complex numbers per loop iteration.
228         interleaved256(passData);
229         break;
230       case 8:
231         // 4 complex numbers per loop iteration.
232         interleaved512(passData);
233         break;
234       default:
235         passScalar(passData);
236     }
237   }
238 
239   /**
240    * Handle factors of 4 using the chosen SIMD vector.
241    *
242    * @param passData   The pass data.
243    * @param simdLength The SIMD vector length.
244    */
245   private void blocked(PassData passData, int simdLength) {
246     // Use the preferred SIMD vector.
247     switch (simdLength) {
248       case 2:
249         // 2 complex numbers per loop iteration.
250         blocked128(passData);
251         break;
252       case 4:
253         // 4 complex numbers per loop iteration.
254         blocked256(passData);
255         break;
256       case 8:
257         // 8 complex numbers per loop iteration.
258         blocked512(passData);
259         break;
260       default:
261         passScalar(passData);
262     }
263   }
264 
265   /**
266    * ButterFly for radix-2 with blocked data.
267    *
268    * @param data The input array to retrieve data from.
269    * @param i    The index to read the first real input.
270    * @param w1r  The real part of the first twiddle factor.
271    * @param w1i  The imaginary part of the first twiddle factor.
272    * @param w2r  The real part of the first twiddle factor.
273    * @param w2i  The imaginary part of the first twiddle factor.
274    * @param w3r  The real part of the first twiddle factor.
275    * @param w3i  The imaginary part of the first twiddle factor.
276    * @param ret  The array to store into.
277    * @param j    The index to store the first real output.
278    */
279   private void butterFly4Blocked(
280       VectorSpecies<Double> species,
281       double[] data, int i, double sign,
282       double w1r, double w1i,
283       double w2r, double w2i,
284       double w3r, double w3i,
285       double[] ret, int j) {
286     final DoubleVector
287         z0_r = fromArray(species, data, i),
288         z0_i = fromArray(species, data, i + im),
289         z1_r = fromArray(species, data, i + di),
290         z1_i = fromArray(species, data, i + di + im),
291         z2_r = fromArray(species, data, i + di2),
292         z2_i = fromArray(species, data, i + di2 + im),
293         z3_r = fromArray(species, data, i + di3),
294         z3_i = fromArray(species, data, i + di3 + im);
295     final DoubleVector
296         t1_r = z0_r.add(z2_r),
297         t1_i = z0_i.add(z2_i),
298         t2_r = z1_r.add(z3_r),
299         t2_i = z1_i.add(z3_i),
300         t3_r = z0_r.sub(z2_r),
301         t3_i = z0_i.sub(z2_i),
302         t4_r = z1_r.sub(z3_r).mul(sign),
303         t4_i = z1_i.sub(z3_i).mul(sign);
304     t1_r.add(t2_r).intoArray(ret, j);
305     t1_i.add(t2_i).intoArray(ret, j + im);
306     final DoubleVector
307         x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
308         x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
309         x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
310     x1_r.mul(w1r).sub(x1_i.mul(w1i)).intoArray(ret, j + dj);
311     x2_r.mul(w2r).sub(x2_i.mul(w2i)).intoArray(ret, j + dj2);
312     x3_r.mul(w3r).sub(x3_i.mul(w3i)).intoArray(ret, j + dj3);
313     x1_i.mul(w1r).add(x1_r.mul(w1i)).intoArray(ret, j + dj + im);
314     x2_i.mul(w2r).add(x2_r.mul(w2i)).intoArray(ret, j + dj2 + im);
315     x3_i.mul(w3r).add(x3_r.mul(w3i)).intoArray(ret, j + dj3 + im);
316   }
317 
318   /**
319    * Handle factors of 4 using the 128-bit SIMD vectors.
320    */
321   private void blocked128(PassData passData) {
322     final double[] data = passData.in;
323     final double[] ret = passData.out;
324     final int sign = passData.sign;
325     int i = passData.inOffset;
326     int j = passData.outOffset;
327     // First pass of the 4-point FFT has no twiddle factors.
328     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
329       final DoubleVector
330           z0_r = fromArray(SPECIES_128, data, i),
331           z0_i = fromArray(SPECIES_128, data, i + im),
332           z1_r = fromArray(SPECIES_128, data, i + di),
333           z1_i = fromArray(SPECIES_128, data, i + di + im),
334           z2_r = fromArray(SPECIES_128, data, i + di2),
335           z2_i = fromArray(SPECIES_128, data, i + di2 + im),
336           z3_r = fromArray(SPECIES_128, data, i + di3),
337           z3_i = fromArray(SPECIES_128, data, i + di3 + im);
338       final DoubleVector t1_r = z0_r.add(z2_r), t1_i = z0_i.add(z2_i);
339       final DoubleVector t2_r = z1_r.add(z3_r), t2_i = z1_i.add(z3_i);
340       t1_r.add(t2_r).intoArray(ret, j);
341       t1_i.add(t2_i).intoArray(ret, j + im);
342       final DoubleVector t3_r = z0_r.sub(z2_r), t3_i = z0_i.sub(z2_i);
343       final DoubleVector t4_r = z1_r.sub(z3_r).mul(sign), t4_i = z1_i.sub(z3_i).mul(sign);
344       t3_r.sub(t4_i).intoArray(ret, j + dj);
345       t1_r.sub(t2_r).intoArray(ret, j + dj2);
346       t3_r.add(t4_i).intoArray(ret, j + dj3);
347       t3_i.add(t4_r).intoArray(ret, j + dj + im);
348       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
349       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
350     }
351 
352     j += jstep;
353     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
354       final int index = 3 * k;
355       final double
356           w1r = wr[index],
357           w2r = wr[index + 1],
358           w3r = wr[index + 2],
359           w1i = -sign * wi[index],
360           w2i = -sign * wi[index + 1],
361           w3i = -sign * wi[index + 2];
362       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
363         final DoubleVector
364             z0_r = fromArray(SPECIES_128, data, i),
365             z1_r = fromArray(SPECIES_128, data, i + di),
366             z2_r = fromArray(SPECIES_128, data, i + di2),
367             z3_r = fromArray(SPECIES_128, data, i + di3),
368             z0_i = fromArray(SPECIES_128, data, i + im),
369             z1_i = fromArray(SPECIES_128, data, i + di + im),
370             z2_i = fromArray(SPECIES_128, data, i + di2 + im),
371             z3_i = fromArray(SPECIES_128, data, i + di3 + im);
372         final DoubleVector
373             t1_r = z0_r.add(z2_r),
374             t1_i = z0_i.add(z2_i),
375             t2_r = z1_r.add(z3_r),
376             t2_i = z1_i.add(z3_i),
377             t3_r = z0_r.sub(z2_r),
378             t3_i = z0_i.sub(z2_i),
379             t4_r = z1_r.sub(z3_r).mul(sign),
380             t4_i = z1_i.sub(z3_i).mul(sign);
381         t1_r.add(t2_r).intoArray(ret, j);
382         t1_i.add(t2_i).intoArray(ret, j + im);
383         final DoubleVector
384             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
385             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
386             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
387         x1_r.mul(w1r).sub(x1_i.mul(w1i)).intoArray(ret, j + dj);
388         x2_r.mul(w2r).sub(x2_i.mul(w2i)).intoArray(ret, j + dj2);
389         x3_r.mul(w3r).sub(x3_i.mul(w3i)).intoArray(ret, j + dj3);
390         x1_i.mul(w1r).add(x1_r.mul(w1i)).intoArray(ret, j + dj + im);
391         x2_i.mul(w2r).add(x2_r.mul(w2i)).intoArray(ret, j + dj2 + im);
392         x3_i.mul(w3r).add(x3_r.mul(w3i)).intoArray(ret, j + dj3 + im);
393       }
394     }
395   }
396 
397   /**
398    * Handle factors of 4 using the 256-bit SIMD vectors.
399    */
400   private void blocked256(PassData passData) {
401     final double[] data = passData.in;
402     final double[] ret = passData.out;
403     final int sign = passData.sign;
404     int i = passData.inOffset;
405     int j = passData.outOffset;
406     // First pass of the 4-point FFT has no twiddle factors.
407     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
408       final DoubleVector
409           z0_r = fromArray(SPECIES_256, data, i),
410           z1_r = fromArray(SPECIES_256, data, i + di),
411           z2_r = fromArray(SPECIES_256, data, i + di2),
412           z3_r = fromArray(SPECIES_256, data, i + di3),
413           z0_i = fromArray(SPECIES_256, data, i + im),
414           z1_i = fromArray(SPECIES_256, data, i + di + im),
415           z2_i = fromArray(SPECIES_256, data, i + di2 + im),
416           z3_i = fromArray(SPECIES_256, data, i + di3 + im);
417       final DoubleVector
418           t1_r = z0_r.add(z2_r),
419           t1_i = z0_i.add(z2_i),
420           t2_r = z1_r.add(z3_r),
421           t2_i = z1_i.add(z3_i),
422           t3_r = z0_r.sub(z2_r),
423           t3_i = z0_i.sub(z2_i),
424           t4_r = z1_r.sub(z3_r).mul(sign),
425           t4_i = z1_i.sub(z3_i).mul(sign);
426       t1_r.add(t2_r).intoArray(ret, j);
427       t3_r.sub(t4_i).intoArray(ret, j + dj);
428       t1_r.sub(t2_r).intoArray(ret, j + dj2);
429       t3_r.add(t4_i).intoArray(ret, j + dj3);
430       t1_i.add(t2_i).intoArray(ret, j + im);
431       t3_i.add(t4_r).intoArray(ret, j + dj + im);
432       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
433       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
434     }
435 
436     j += jstep;
437     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
438       final int index = 3 * k;
439       final double
440           w1r = wr[index],
441           w2r = wr[index + 1],
442           w3r = wr[index + 2],
443           w1i = -sign * wi[index],
444           w2i = -sign * wi[index + 1],
445           w3i = -sign * wi[index + 2];
446       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
447         final DoubleVector
448             z0_r = fromArray(SPECIES_256, data, i),
449             z1_r = fromArray(SPECIES_256, data, i + di),
450             z2_r = fromArray(SPECIES_256, data, i + di2),
451             z3_r = fromArray(SPECIES_256, data, i + di3),
452             z0_i = fromArray(SPECIES_256, data, i + im),
453             z1_i = fromArray(SPECIES_256, data, i + di + im),
454             z2_i = fromArray(SPECIES_256, data, i + di2 + im),
455             z3_i = fromArray(SPECIES_256, data, i + di3 + im);
456         final DoubleVector
457             t1_r = z0_r.add(z2_r),
458             t1_i = z0_i.add(z2_i),
459             t2_r = z1_r.add(z3_r),
460             t2_i = z1_i.add(z3_i),
461             t3_r = z0_r.sub(z2_r),
462             t3_i = z0_i.sub(z2_i),
463             t4_r = z1_r.sub(z3_r).mul(sign),
464             t4_i = z1_i.sub(z3_i).mul(sign);
465         t1_r.add(t2_r).intoArray(ret, j);
466         t1_i.add(t2_i).intoArray(ret, j + im);
467         final DoubleVector
468             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
469             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
470             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
471         x1_r.mul(w1r).sub(x1_i.mul(w1i)).intoArray(ret, j + dj);
472         x2_r.mul(w2r).sub(x2_i.mul(w2i)).intoArray(ret, j + dj2);
473         x3_r.mul(w3r).sub(x3_i.mul(w3i)).intoArray(ret, j + dj3);
474         x1_i.mul(w1r).add(x1_r.mul(w1i)).intoArray(ret, j + dj + im);
475         x2_i.mul(w2r).add(x2_r.mul(w2i)).intoArray(ret, j + dj2 + im);
476         x3_i.mul(w3r).add(x3_r.mul(w3i)).intoArray(ret, j + dj3 + im);
477       }
478     }
479   }
480 
481   /**
482    * Handle factors of 4 using the 512-bit SIMD vectors.
483    */
484   private void blocked512(PassData passData) {
485     final double[] data = passData.in;
486     final double[] ret = passData.out;
487     final int sign = passData.sign;
488     int i = passData.inOffset;
489     int j = passData.outOffset;
490     // First pass of the 4-point FFT has no twiddle factors.
491     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
492       final DoubleVector
493           z0_r = fromArray(SPECIES_512, data, i),
494           z1_r = fromArray(SPECIES_512, data, i + di),
495           z2_r = fromArray(SPECIES_512, data, i + di2),
496           z3_r = fromArray(SPECIES_512, data, i + di3),
497           z0_i = fromArray(SPECIES_512, data, i + im),
498           z1_i = fromArray(SPECIES_512, data, i + di + im),
499           z2_i = fromArray(SPECIES_512, data, i + di2 + im),
500           z3_i = fromArray(SPECIES_512, data, i + di3 + im);
501       final DoubleVector
502           t1_r = z0_r.add(z2_r),
503           t1_i = z0_i.add(z2_i),
504           t2_r = z1_r.add(z3_r),
505           t2_i = z1_i.add(z3_i),
506           t3_r = z0_r.sub(z2_r),
507           t3_i = z0_i.sub(z2_i),
508           t4_r = z1_r.sub(z3_r).mul(sign),
509           t4_i = z1_i.sub(z3_i).mul(sign);
510       t1_r.add(t2_r).intoArray(ret, j);
511       t3_r.sub(t4_i).intoArray(ret, j + dj);
512       t1_r.sub(t2_r).intoArray(ret, j + dj2);
513       t3_r.add(t4_i).intoArray(ret, j + dj3);
514       t1_i.add(t2_i).intoArray(ret, j + im);
515       t3_i.add(t4_r).intoArray(ret, j + dj + im);
516       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
517       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
518     }
519 
520     j += jstep;
521     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
522       final int index = 3 * k;
523       final double
524           w1r = wr[index],
525           w2r = wr[index + 1],
526           w3r = wr[index + 2],
527           w1i = -sign * wi[index],
528           w2i = -sign * wi[index + 1],
529           w3i = -sign * wi[index + 2];
530       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
531         final DoubleVector
532             z0_r = fromArray(SPECIES_512, data, i),
533             z1_r = fromArray(SPECIES_512, data, i + di),
534             z2_r = fromArray(SPECIES_512, data, i + di2),
535             z3_r = fromArray(SPECIES_512, data, i + di3),
536             z0_i = fromArray(SPECIES_512, data, i + im),
537             z1_i = fromArray(SPECIES_512, data, i + di + im),
538             z2_i = fromArray(SPECIES_512, data, i + di2 + im),
539             z3_i = fromArray(SPECIES_512, data, i + di3 + im);
540         final DoubleVector
541             t1_r = z0_r.add(z2_r),
542             t1_i = z0_i.add(z2_i),
543             t2_r = z1_r.add(z3_r),
544             t2_i = z1_i.add(z3_i),
545             t3_r = z0_r.sub(z2_r),
546             t3_i = z0_i.sub(z2_i),
547             t4_r = z1_r.sub(z3_r).mul(sign),
548             t4_i = z1_i.sub(z3_i).mul(sign);
549         t1_r.add(t2_r).intoArray(ret, j);
550         t1_i.add(t2_i).intoArray(ret, j + im);
551         final DoubleVector
552             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
553             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
554             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
555         x1_r.mul(w1r).sub(x1_i.mul(w1i)).intoArray(ret, j + dj);
556         x2_r.mul(w2r).sub(x2_i.mul(w2i)).intoArray(ret, j + dj2);
557         x3_r.mul(w3r).sub(x3_i.mul(w3i)).intoArray(ret, j + dj3);
558         x1_i.mul(w1r).add(x1_r.mul(w1i)).intoArray(ret, j + dj + im);
559         x2_i.mul(w2r).add(x2_r.mul(w2i)).intoArray(ret, j + dj2 + im);
560         x3_i.mul(w3r).add(x3_r.mul(w3i)).intoArray(ret, j + dj3 + im);
561       }
562     }
563   }
564 
565   /**
566    * Handle factors of 4 using the 128-bit SIMD vectors.
567    */
568   private void interleaved128(PassData passData) {
569     final double[] data = passData.in;
570     final double[] ret = passData.out;
571     final int sign = passData.sign;
572     int i = passData.inOffset;
573     int j = passData.outOffset;
574     // First pass of the 4-point FFT has no twiddle factors.
575     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
576       final DoubleVector
577           z0 = fromArray(SPECIES_128, data, i),
578           z1 = fromArray(SPECIES_128, data, i + di),
579           z2 = fromArray(SPECIES_128, data, i + di2),
580           z3 = fromArray(SPECIES_128, data, i + di3);
581       final DoubleVector
582           t1 = z0.add(z2),
583           t2 = z1.add(z3),
584           t3 = z0.sub(z2),
585           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
586       t1.add(t2).intoArray(ret, j);
587       t3.add(t4.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
588       t1.sub(t2).intoArray(ret, j + dj2);
589       t3.add(t4.mul(NEGATE_IM_128)).intoArray(ret, j + dj3);
590     }
591 
592     j += jstep;
593     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
594       final int index = 3 * k;
595       final DoubleVector
596           w1r = broadcast(SPECIES_128, wr[index]),
597           w2r = broadcast(SPECIES_128, wr[index + 1]),
598           w3r = broadcast(SPECIES_128, wr[index + 2]),
599           w1i = broadcast(SPECIES_128, -sign * wi[index]).mul(NEGATE_IM_128),
600           w2i = broadcast(SPECIES_128, -sign * wi[index + 1]).mul(NEGATE_IM_128),
601           w3i = broadcast(SPECIES_128, -sign * wi[index + 2]).mul(NEGATE_IM_128);
602       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
603         final DoubleVector
604             z0 = fromArray(SPECIES_128, data, i),
605             z1 = fromArray(SPECIES_128, data, i + di),
606             z2 = fromArray(SPECIES_128, data, i + di2),
607             z3 = fromArray(SPECIES_128, data, i + di3);
608         final DoubleVector
609             t1 = z0.add(z2),
610             t2 = z1.add(z3),
611             t3 = z0.sub(z2),
612             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
613         t1.add(t2).intoArray(ret, j);
614         final DoubleVector
615             x1 = t3.add(t4.mul(NEGATE_RE_128)),
616             x2 = t1.sub(t2),
617             x3 = t3.add(t4.mul(NEGATE_IM_128));
618         w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
619         w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
620         w3r.fma(x3, w3i.mul(x3).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj3);
621       }
622     }
623   }
624 
625   /**
626    * Handle factors of 4 using the 256-bit SIMD vectors.
627    */
628   private void interleaved256(PassData passData) {
629     final double[] data = passData.in;
630     final double[] ret = passData.out;
631     final int sign = passData.sign;
632     int i = passData.inOffset;
633     int j = passData.outOffset;
634     // First pass of the 4-point FFT has no twiddle factors.
635     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
636       final DoubleVector
637           z0 = fromArray(SPECIES_256, data, i),
638           z1 = fromArray(SPECIES_256, data, i + di),
639           z2 = fromArray(SPECIES_256, data, i + di2),
640           z3 = fromArray(SPECIES_256, data, i + di3);
641       final DoubleVector
642           t1 = z0.add(z2),
643           t2 = z1.add(z3),
644           t3 = z0.sub(z2),
645           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
646       t1.add(t2).intoArray(ret, j);
647       t3.add(t4.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
648       t1.sub(t2).intoArray(ret, j + dj2);
649       t3.add(t4.mul(NEGATE_IM_256)).intoArray(ret, j + dj3);
650     }
651 
652     j += jstep;
653     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
654       final int index = 3 * k;
655       final DoubleVector
656           w1r = broadcast(SPECIES_256, wr[index]),
657           w2r = broadcast(SPECIES_256, wr[index + 1]),
658           w3r = broadcast(SPECIES_256, wr[index + 2]),
659           w1i = broadcast(SPECIES_256, -sign * wi[index]).mul(NEGATE_IM_256),
660           w2i = broadcast(SPECIES_256, -sign * wi[index + 1]).mul(NEGATE_IM_256),
661           w3i = broadcast(SPECIES_256, -sign * wi[index + 2]).mul(NEGATE_IM_256);
662       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
663         final DoubleVector
664             z0 = fromArray(SPECIES_256, data, i),
665             z1 = fromArray(SPECIES_256, data, i + di),
666             z2 = fromArray(SPECIES_256, data, i + di2),
667             z3 = fromArray(SPECIES_256, data, i + di3);
668         final DoubleVector
669             t1 = z0.add(z2),
670             t2 = z1.add(z3),
671             t3 = z0.sub(z2),
672             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
673         t1.add(t2).intoArray(ret, j);
674         final DoubleVector
675             x1 = t3.add(t4.mul(NEGATE_RE_256)),
676             x2 = t1.sub(t2),
677             x3 = t3.add(t4.mul(NEGATE_IM_256));
678         w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
679         w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
680         w3r.fma(x3, w3i.mul(x3).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj3);
681       }
682     }
683   }
684 
685   /**
686    * Handle factors of 4 using the 512-bit SIMD vectors.
687    */
688   private void interleaved512(PassData passData) {
689     final double[] data = passData.in;
690     final double[] ret = passData.out;
691     final int sign = passData.sign;
692     int i = passData.inOffset;
693     int j = passData.outOffset;
694     // First pass of the 4-point FFT has no twiddle factors.
695     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
696       final DoubleVector
697           z0 = fromArray(SPECIES_512, data, i),
698           z1 = fromArray(SPECIES_512, data, i + di),
699           z2 = fromArray(SPECIES_512, data, i + di2),
700           z3 = fromArray(SPECIES_512, data, i + di3);
701       final DoubleVector
702           t1 = z0.add(z2),
703           t2 = z1.add(z3),
704           t3 = z0.sub(z2),
705           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
706       t1.add(t2).intoArray(ret, j);
707       t3.add(t4.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
708       t1.sub(t2).intoArray(ret, j + dj2);
709       t3.add(t4.mul(NEGATE_IM_512)).intoArray(ret, j + dj3);
710     }
711 
712     j += jstep;
713     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
714       final int index = 3 * k;
715       final DoubleVector
716           w1r = broadcast(SPECIES_512, wr[index]),
717           w2r = broadcast(SPECIES_512, wr[index + 1]),
718           w3r = broadcast(SPECIES_512, wr[index + 2]),
719           w1i = broadcast(SPECIES_512, -sign * wi[index]).mul(NEGATE_IM_512),
720           w2i = broadcast(SPECIES_512, -sign * wi[index + 1]).mul(NEGATE_IM_512),
721           w3i = broadcast(SPECIES_512, -sign * wi[index + 2]).mul(NEGATE_IM_512);
722       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
723         final DoubleVector
724             z0 = fromArray(SPECIES_512, data, i),
725             z1 = fromArray(SPECIES_512, data, i + di),
726             z2 = fromArray(SPECIES_512, data, i + di2),
727             z3 = fromArray(SPECIES_512, data, i + di3);
728         final DoubleVector
729             t1 = z0.add(z2),
730             t2 = z1.add(z3),
731             t3 = z0.sub(z2),
732             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
733         t1.add(t2).intoArray(ret, j);
734         final DoubleVector
735             x1 = t3.add(t4.mul(NEGATE_RE_512)),
736             x2 = t1.sub(t2),
737             x3 = t3.add(t4.mul(NEGATE_IM_512));
738         w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
739         w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
740         w3r.fma(x3, w3i.mul(x3).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj3);
741       }
742     }
743   }
744 }