View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2025.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  
42  import static jdk.incubator.vector.DoubleVector.SPECIES_128;
43  import static jdk.incubator.vector.DoubleVector.SPECIES_256;
44  import static jdk.incubator.vector.DoubleVector.SPECIES_512;
45  import static jdk.incubator.vector.DoubleVector.broadcast;
46  import static jdk.incubator.vector.DoubleVector.fromArray;
47  
48  /**
49   * The MixedRadixFactor4 class handles factors of 4 in the FFT.
50   */
51  public class MixedRadixFactor4 extends MixedRadixFactor {
52  
53    private final int di2;
54    private final int di3;
55    private final int dj2;
56    private final int dj3;
57  
58    /**
59     * Construct a MixedRadixFactor4.
60     * @param passConstants PassConstants.
61     */
62    public MixedRadixFactor4(PassConstants passConstants) {
63      super(passConstants);
64      di2 = 2 * di;
65      di3 = 3 * di;
66      dj2 = 2 * dj;
67      dj3 = 3 * dj;
68    }
69  
70    /**
71     * Handle factors of 4.
72     */
73    @Override
74    protected void passScalar(PassData passData) {
75      final double[] data = passData.in;
76      final double[] ret = passData.out;
77      int sign = passData.sign;
78      int i = passData.inOffset;
79      int j = passData.outOffset;
80      // First pass of the 4-point FFT has no twiddle factors.
81      for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
82        final double z0_r = data[i];
83        final double z1_r = data[i + di];
84        final double z2_r = data[i + di2];
85        final double z3_r = data[i + di3];
86        final double z0_i = data[i + im];
87        final double z1_i = data[i + di + im];
88        final double z2_i = data[i + di2 + im];
89        final double z3_i = data[i + di3 + im];
90        final double t1_r = z0_r + z2_r;
91        final double t1_i = z0_i + z2_i;
92        final double t2_r = z1_r + z3_r;
93        final double t2_i = z1_i + z3_i;
94        final double t3_r = z0_r - z2_r;
95        final double t3_i = z0_i - z2_i;
96        final double t4_r = sign * (z1_r - z3_r);
97        final double t4_i = sign * (z1_i - z3_i);
98        ret[j] = t1_r + t2_r;
99        ret[j + im] = t1_i + t2_i;
100       ret[j + dj] = t3_r - t4_i;
101       ret[j + dj + im] = t3_i + t4_r;
102       ret[j + dj2] = t1_r - t2_r;
103       ret[j + dj2 + im] = t1_i - t2_i;
104       ret[j + dj3] = t3_r + t4_i;
105       ret[j + dj3 + im] = t3_i - t4_r;
106     }
107     j += jstep;
108     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
109       final double[] twids = twiddles[k];
110       final double w1_r = twids[0];
111       final double w1_i = -sign * twids[1];
112       final double w2_r = twids[2];
113       final double w2_i = -sign * twids[3];
114       final double w3_r = twids[4];
115       final double w3_i = -sign * twids[5];
116       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
117         final double z0_r = data[i];
118         final double z1_r = data[i + di];
119         final double z2_r = data[i + di2];
120         final double z3_r = data[i + di3];
121         final double z0_i = data[i + im];
122         final double z1_i = data[i + di + im];
123         final double z2_i = data[i + di2 + im];
124         final double z3_i = data[i + di3 + im];
125         final double t1_r = z0_r + z2_r;
126         final double t1_i = z0_i + z2_i;
127         final double t2_r = z1_r + z3_r;
128         final double t2_i = z1_i + z3_i;
129         final double t3_r = z0_r - z2_r;
130         final double t3_i = z0_i - z2_i;
131         final double t4_r = sign * (z1_r - z3_r);
132         final double t4_i = sign * (z1_i - z3_i);
133         ret[j] = t1_r + t2_r;
134         ret[j + im] = t1_i + t2_i;
135         multiplyAndStore(t3_r - t4_i, t3_i + t4_r, w1_r, w1_i, ret, j + dj, j + dj + im);
136         multiplyAndStore(t1_r - t2_r, t1_i - t2_i, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
137         multiplyAndStore(t3_r + t4_i, t3_i - t4_r, w3_r, w3_i, ret, j + dj3, j + dj3 + im);
138       }
139     }
140   }
141 
142   /**
143    * Handle factors of 4 using SIMD vectors.
144    */
145   @Override
146   protected void passSIMD(PassData passData) {
147     if (im == 1) {
148       interleaved(passData);
149     } else {
150       blocked(passData);
151     }
152   }
153 
154   /**
155    * Available SIMD sizes for Pass 4.
156    */
157   private static int[] simdSizes = {8, 4, 2};
158 
159   /**
160    * Handle factors of 4 using the chosen SIMD vector.
161    * @param passData The pass data.
162    * @param simdLength The SIMD vector length.
163    */
164   private void interleaved(PassData passData, int simdLength) {
165     // Use the preferred SIMD vector.
166     switch (simdLength) {
167       case 2:
168         // 1 complex number per loop iteration.
169         interleaved128(passData);
170         break;
171       case 4:
172         // 2 complex numbers per loop iteration.
173         interleaved256(passData);
174         break;
175       case 8:
176         // 4 complex numbers per loop iteration.
177         interleaved512(passData);
178         break;
179       default:
180         passScalar(passData);
181     }
182   }
183 
184   /**
185    * Handle factors of 4 using the chosen SIMD vector.
186    * @param passData The pass data.
187    * @param simdLength The SIMD vector length.
188    */
189   private void blocked(PassData passData, int simdLength) {
190     // Use the preferred SIMD vector.
191     switch (simdLength) {
192       case 2:
193         // 2 complex numbers per loop iteration.
194         blocked128(passData);
195         break;
196       case 4:
197         // 4 complex numbers per loop iteration.
198         blocked256(passData);
199         break;
200       case 8:
201         // 8 complex numbers per loop iteration.
202         blocked512(passData);
203         break;
204       default:
205         passScalar(passData);
206     }
207   }
208 
209   /**
210    * Handle factors of 4 using the 128-bit SIMD vectors.
211    * @param passData The interleaved pass data.
212    */
213   private void interleaved(PassData passData) {
214     if (innerLoopLimit % LOOP == 0) {
215       // Use the preferred SIMD vector.
216       interleaved(passData, LENGTH);
217     } else {
218       // If the inner loop limit is odd, use the scalar method unless the inner loop limit is 1.
219       if (innerLoopLimit % 2 != 0 && innerLoopLimit != 1) {
220         passScalar(passData);
221         return;
222       }
223       // Fall back to a smaller SIMD vector that fits the inner loop limit.
224       for (int size : simdSizes) {
225         if (size >= LENGTH) {
226           // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
227           continue;
228         }
229         // Divide the SIMD size by two because for interleaved a single SIMD vectors stores both real and imaginary parts.
230         if (innerLoopLimit % (size / 2) == 0) {
231           interleaved(passData, size);
232         }
233       }
234     }
235   }
236 
237   /**
238    * Handle factors of 4 using the 128-bit SIMD vectors.
239    * @param passData The pass blocked data.
240    */
241   private void blocked(PassData passData) {
242     if (innerLoopLimit % BLOCK_LOOP == 0) {
243       // Use the preferred SIMD vector.
244       blocked(passData, LENGTH);
245     } else {
246       // If the inner loop limit is odd, use the scalar method unless the inner loop limit is 1.
247       if (innerLoopLimit % 2 != 0) {
248         passScalar(passData);
249         return;
250       }
251       // Fall back to a smaller SIMD vector that fits the inner loop limit.
252       for (int size : simdSizes) {
253         if (size >= LENGTH) {
254           // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
255           continue;
256         }
257         // Divide the SIMD size by two because for interleaved a single SIMD vectors stores both real and imaginary parts.
258         if (innerLoopLimit % size == 0) {
259           blocked(passData, size);
260         }
261       }
262     }
263   }
264 
265   /**
266    * Handle factors of 4 using the 128-bit SIMD vectors.
267    */
268   private void blocked128(PassData passData) {
269     final double[] data = passData.in;
270     final double[] ret = passData.out;
271     int sign = passData.sign;
272     int i = passData.inOffset;
273     int j = passData.outOffset;
274     // First pass of the 4-point FFT has no twiddle factors.
275     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
276       final DoubleVector
277           z0_r = fromArray(SPECIES_128, data, i),
278           z0_i = fromArray(SPECIES_128, data, i + im),
279           z1_r = fromArray(SPECIES_128, data, i + di),
280           z1_i = fromArray(SPECIES_128, data, i + di + im),
281           z2_r = fromArray(SPECIES_128, data, i + di2),
282           z2_i = fromArray(SPECIES_128, data, i + di2 + im),
283           z3_r = fromArray(SPECIES_128, data, i + di3),
284           z3_i = fromArray(SPECIES_128, data, i + di3 + im);
285       final DoubleVector
286           t1_r = z0_r.add(z2_r),
287           t1_i = z0_i.add(z2_i),
288           t2_r = z1_r.add(z3_r),
289           t2_i = z1_i.add(z3_i),
290           t3_r = z0_r.sub(z2_r),
291           t3_i = z0_i.sub(z2_i),
292           t4_r = z1_r.sub(z3_r).mul(sign),
293           t4_i = z1_i.sub(z3_i).mul(sign);
294       t1_r.add(t2_r).intoArray(ret, j);
295       t1_i.add(t2_i).intoArray(ret, j + im);
296       t3_r.sub(t4_i).intoArray(ret, j + dj);
297       t3_i.add(t4_r).intoArray(ret, j + dj + im);
298       t1_r.sub(t2_r).intoArray(ret, j + dj2);
299       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
300       t3_r.add(t4_i).intoArray(ret, j + dj3);
301       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
302     }
303 
304     j += jstep;
305     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
306       final double[] twids = twiddles[k];
307       DoubleVector
308           w1r = broadcast(SPECIES_128, twids[0]),
309           w1i = broadcast(SPECIES_128, -sign * twids[1]),
310           w2r = broadcast(SPECIES_128, twids[2]),
311           w2i = broadcast(SPECIES_128, -sign * twids[3]),
312           w3r = broadcast(SPECIES_128, twids[4]),
313           w3i = broadcast(SPECIES_128, -sign * twids[5]);
314       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
315         final DoubleVector
316             z0_r = fromArray(SPECIES_128, data, i),
317             z0_i = fromArray(SPECIES_128, data, i + im),
318             z1_r = fromArray(SPECIES_128, data, i + di),
319             z1_i = fromArray(SPECIES_128, data, i + di + im),
320             z2_r = fromArray(SPECIES_128, data, i + di2),
321             z2_i = fromArray(SPECIES_128, data, i + di2 + im),
322             z3_r = fromArray(SPECIES_128, data, i + di3),
323             z3_i = fromArray(SPECIES_128, data, i + di3 + im);
324         final DoubleVector
325             t1_r = z0_r.add(z2_r),
326             t1_i = z0_i.add(z2_i),
327             t2_r = z1_r.add(z3_r),
328             t2_i = z1_i.add(z3_i),
329             t3_r = z0_r.sub(z2_r),
330             t3_i = z0_i.sub(z2_i),
331             t4_r = z1_r.sub(z3_r).mul(sign),
332             t4_i = z1_i.sub(z3_i).mul(sign);
333         t1_r.add(t2_r).intoArray(ret, j);
334         t1_i.add(t2_i).intoArray(ret, j + im);
335         DoubleVector
336             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
337             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
338             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
339         w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
340         w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
341         w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
342         w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
343         w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
344         w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
345       }
346     }
347   }
348 
349   /**
350    * Handle factors of 4 using the 256-bit SIMD vectors.
351    */
352   private void blocked256(PassData passData) {
353     final double[] data = passData.in;
354     final double[] ret = passData.out;
355     int sign = passData.sign;
356     int i = passData.inOffset;
357     int j = passData.outOffset;
358     // First pass of the 4-point FFT has no twiddle factors.
359     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
360       final DoubleVector
361           z0_r = fromArray(SPECIES_256, data, i),
362           z1_r = fromArray(SPECIES_256, data, i + di),
363           z2_r = fromArray(SPECIES_256, data, i + di2),
364           z3_r = fromArray(SPECIES_256, data, i + di3),
365           z0_i = fromArray(SPECIES_256, data, i + im),
366           z1_i = fromArray(SPECIES_256, data, i + di + im),
367           z2_i = fromArray(SPECIES_256, data, i + di2 + im),
368           z3_i = fromArray(SPECIES_256, data, i + di3 + im);
369       final DoubleVector
370           t1_r = z0_r.add(z2_r),
371           t1_i = z0_i.add(z2_i),
372           t2_r = z1_r.add(z3_r),
373           t2_i = z1_i.add(z3_i),
374           t3_r = z0_r.sub(z2_r),
375           t3_i = z0_i.sub(z2_i),
376           t4_r = z1_r.sub(z3_r).mul(sign),
377           t4_i = z1_i.sub(z3_i).mul(sign);
378       t1_r.add(t2_r).intoArray(ret, j);
379       t3_r.sub(t4_i).intoArray(ret, j + dj);
380       t1_r.sub(t2_r).intoArray(ret, j + dj2);
381       t3_r.add(t4_i).intoArray(ret, j + dj3);
382       t1_i.add(t2_i).intoArray(ret, j + im);
383       t3_i.add(t4_r).intoArray(ret, j + dj + im);
384       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
385       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
386     }
387 
388     j += jstep;
389     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
390       final double[] twids = twiddles[k];
391       DoubleVector
392           w1r = broadcast(SPECIES_256, twids[0]),
393           w1i = broadcast(SPECIES_256, -sign * twids[1]),
394           w2r = broadcast(SPECIES_256, twids[2]),
395           w2i = broadcast(SPECIES_256, -sign * twids[3]),
396           w3r = broadcast(SPECIES_256, twids[4]),
397           w3i = broadcast(SPECIES_256, -sign * twids[5]);
398       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
399         final DoubleVector
400             z0_r = fromArray(SPECIES_256, data, i),
401             z1_r = fromArray(SPECIES_256, data, i + di),
402             z2_r = fromArray(SPECIES_256, data, i + di2),
403             z3_r = fromArray(SPECIES_256, data, i + di3),
404             z0_i = fromArray(SPECIES_256, data, i + im),
405             z1_i = fromArray(SPECIES_256, data, i + di + im),
406             z2_i = fromArray(SPECIES_256, data, i + di2 + im),
407             z3_i = fromArray(SPECIES_256, data, i + di3 + im);
408         final DoubleVector
409             t1_r = z0_r.add(z2_r),
410             t1_i = z0_i.add(z2_i),
411             t2_r = z1_r.add(z3_r),
412             t2_i = z1_i.add(z3_i),
413             t3_r = z0_r.sub(z2_r),
414             t3_i = z0_i.sub(z2_i),
415             t4_r = z1_r.sub(z3_r).mul(sign),
416             t4_i = z1_i.sub(z3_i).mul(sign);
417         t1_r.add(t2_r).intoArray(ret, j);
418         t1_i.add(t2_i).intoArray(ret, j + im);
419         DoubleVector
420             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
421             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
422             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
423         w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
424         w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
425         w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
426         w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
427         w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
428         w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
429       }
430     }
431   }
432 
433   /**
434    * Handle factors of 4 using the 512-bit SIMD vectors.
435    */
436   private void blocked512(PassData passData) {
437     final double[] data = passData.in;
438     final double[] ret = passData.out;
439     int sign = passData.sign;
440     int i = passData.inOffset;
441     int j = passData.outOffset;
442     // First pass of the 4-point FFT has no twiddle factors.
443     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
444       final DoubleVector
445           z0_r = fromArray(SPECIES_512, data, i),
446           z1_r = fromArray(SPECIES_512, data, i + di),
447           z2_r = fromArray(SPECIES_512, data, i + di2),
448           z3_r = fromArray(SPECIES_512, data, i + di3),
449           z0_i = fromArray(SPECIES_512, data, i + im),
450           z1_i = fromArray(SPECIES_512, data, i + di + im),
451           z2_i = fromArray(SPECIES_512, data, i + di2 + im),
452           z3_i = fromArray(SPECIES_512, data, i + di3 + im);
453       final DoubleVector
454           t1_r = z0_r.add(z2_r),
455           t1_i = z0_i.add(z2_i),
456           t2_r = z1_r.add(z3_r),
457           t2_i = z1_i.add(z3_i),
458           t3_r = z0_r.sub(z2_r),
459           t3_i = z0_i.sub(z2_i),
460           t4_r = z1_r.sub(z3_r).mul(sign),
461           t4_i = z1_i.sub(z3_i).mul(sign);
462       t1_r.add(t2_r).intoArray(ret, j);
463       t3_r.sub(t4_i).intoArray(ret, j + dj);
464       t1_r.sub(t2_r).intoArray(ret, j + dj2);
465       t3_r.add(t4_i).intoArray(ret, j + dj3);
466       t1_i.add(t2_i).intoArray(ret, j + im);
467       t3_i.add(t4_r).intoArray(ret, j + dj + im);
468       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
469       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
470     }
471 
472     j += jstep;
473     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
474       final double[] twids = twiddles[k];
475       DoubleVector
476           w1r = broadcast(SPECIES_512, twids[0]),
477           w1i = broadcast(SPECIES_512, -sign * twids[1]),
478           w2r = broadcast(SPECIES_512, twids[2]),
479           w2i = broadcast(SPECIES_512, -sign * twids[3]),
480           w3r = broadcast(SPECIES_512, twids[4]),
481           w3i = broadcast(SPECIES_512, -sign * twids[5]);
482       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
483         final DoubleVector
484             z0_r = fromArray(SPECIES_512, data, i),
485             z1_r = fromArray(SPECIES_512, data, i + di),
486             z2_r = fromArray(SPECIES_512, data, i + di2),
487             z3_r = fromArray(SPECIES_512, data, i + di3),
488             z0_i = fromArray(SPECIES_512, data, i + im),
489             z1_i = fromArray(SPECIES_512, data, i + di + im),
490             z2_i = fromArray(SPECIES_512, data, i + di2 + im),
491             z3_i = fromArray(SPECIES_512, data, i + di3 + im);
492         final DoubleVector
493             t1_r = z0_r.add(z2_r),
494             t1_i = z0_i.add(z2_i),
495             t2_r = z1_r.add(z3_r),
496             t2_i = z1_i.add(z3_i),
497             t3_r = z0_r.sub(z2_r),
498             t3_i = z0_i.sub(z2_i),
499             t4_r = z1_r.sub(z3_r).mul(sign),
500             t4_i = z1_i.sub(z3_i).mul(sign);
501         t1_r.add(t2_r).intoArray(ret, j);
502         t1_i.add(t2_i).intoArray(ret, j + im);
503         DoubleVector
504             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
505             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
506             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
507         w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
508         w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
509         w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
510         w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
511         w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
512         w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
513       }
514     }
515   }
516 
517   /**
518    * Handle factors of 4 using the 128-bit SIMD vectors.
519    */
520   private void interleaved128(PassData passData) {
521     final double[] data = passData.in;
522     final double[] ret = passData.out;
523     int sign = passData.sign;
524     int i = passData.inOffset;
525     int j = passData.outOffset;
526     // First pass of the 4-point FFT has no twiddle factors.
527     for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_128, i += LENGTH_128, j += LENGTH_128) {
528       DoubleVector
529           z0 = fromArray(SPECIES_128, data, i),
530           z1 = fromArray(SPECIES_128, data, i + di),
531           z2 = fromArray(SPECIES_128, data, i + di2),
532           z3 = fromArray(SPECIES_128, data, i + di3);
533       DoubleVector
534           t1 = z0.add(z2),
535           t2 = z1.add(z3),
536           t3 = z0.sub(z2),
537           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
538       t1.add(t2).intoArray(ret, j);
539       t3.add(t4.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
540       t1.sub(t2).intoArray(ret, j + dj2);
541       t3.add(t4.mul(NEGATE_IM_128)).intoArray(ret, j + dj3);
542     }
543 
544     j += jstep;
545     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
546       final double[] twids = twiddles[k];
547       DoubleVector
548           w1r = broadcast(SPECIES_128, twids[0]),
549           w1i = broadcast(SPECIES_128, -sign * twids[1]).mul(NEGATE_IM_128),
550           w2r = broadcast(SPECIES_128, twids[2]),
551           w2i = broadcast(SPECIES_128, -sign * twids[3]).mul(NEGATE_IM_128),
552           w3r = broadcast(SPECIES_128, twids[4]),
553           w3i = broadcast(SPECIES_128, -sign * twids[5]).mul(NEGATE_IM_128);
554       for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_128, i += LENGTH_128, j += LENGTH_128) {
555         DoubleVector
556             z0 = fromArray(SPECIES_128, data, i),
557             z1 = fromArray(SPECIES_128, data, i + di),
558             z2 = fromArray(SPECIES_128, data, i + di2),
559             z3 = fromArray(SPECIES_128, data, i + di3);
560         DoubleVector
561             t1 = z0.add(z2),
562             t2 = z1.add(z3),
563             t3 = z0.sub(z2),
564             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
565         t1.add(t2).intoArray(ret, j);
566         DoubleVector
567             x1 = t3.add(t4.mul(NEGATE_RE_128)),
568             x2 = t1.sub(t2),
569             x3 = t3.add(t4.mul(NEGATE_IM_128));
570         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
571         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
572         w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj3);
573       }
574     }
575   }
576 
577   /**
578    * Handle factors of 4 using the 256-bit SIMD vectors.
579    */
580   private void interleaved256(PassData passData) {
581     final double[] data = passData.in;
582     final double[] ret = passData.out;
583     int sign = passData.sign;
584     int i = passData.inOffset;
585     int j = passData.outOffset;
586     // First pass of the 4-point FFT has no twiddle factors.
587     for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_256, i += LENGTH_256, j += LENGTH_256) {
588       DoubleVector
589           z0 = fromArray(SPECIES_256, data, i),
590           z1 = fromArray(SPECIES_256, data, i + di),
591           z2 = fromArray(SPECIES_256, data, i + di2),
592           z3 = fromArray(SPECIES_256, data, i + di3);
593       DoubleVector
594           t1 = z0.add(z2),
595           t2 = z1.add(z3),
596           t3 = z0.sub(z2),
597           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
598       t1.add(t2).intoArray(ret, j);
599       t3.add(t4.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
600       t1.sub(t2).intoArray(ret, j + dj2);
601       t3.add(t4.mul(NEGATE_IM_256)).intoArray(ret, j + dj3);
602     }
603 
604     j += jstep;
605     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
606       final double[] twids = twiddles[k];
607       DoubleVector
608           w1r = broadcast(SPECIES_256, twids[0]),
609           w1i = broadcast(SPECIES_256, -sign * twids[1]).mul(NEGATE_IM_256),
610           w2r = broadcast(SPECIES_256, twids[2]),
611           w2i = broadcast(SPECIES_256, -sign * twids[3]).mul(NEGATE_IM_256),
612           w3r = broadcast(SPECIES_256, twids[4]),
613           w3i = broadcast(SPECIES_256, -sign * twids[5]).mul(NEGATE_IM_256);
614       for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_256, i += LENGTH_256, j += LENGTH_256) {
615         DoubleVector
616             z0 = fromArray(SPECIES_256, data, i),
617             z1 = fromArray(SPECIES_256, data, i + di),
618             z2 = fromArray(SPECIES_256, data, i + di2),
619             z3 = fromArray(SPECIES_256, data, i + di3);
620         DoubleVector
621             t1 = z0.add(z2),
622             t2 = z1.add(z3),
623             t3 = z0.sub(z2),
624             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
625         t1.add(t2).intoArray(ret, j);
626         DoubleVector
627             x1 = t3.add(t4.mul(NEGATE_RE_256)),
628             x2 = t1.sub(t2),
629             x3 = t3.add(t4.mul(NEGATE_IM_256));
630         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
631         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
632         w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj3);
633       }
634     }
635   }
636 
637   /**
638    * Handle factors of 4 using the 512-bit SIMD vectors.
639    */
640   private void interleaved512(PassData passData) {
641     final double[] data = passData.in;
642     final double[] ret = passData.out;
643     int sign = passData.sign;
644     int i = passData.inOffset;
645     int j = passData.outOffset;
646     // First pass of the 4-point FFT has no twiddle factors.
647     for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_512, i += LENGTH_512, j += LENGTH_512) {
648       DoubleVector
649           z0 = fromArray(SPECIES_512, data, i),
650           z1 = fromArray(SPECIES_512, data, i + di),
651           z2 = fromArray(SPECIES_512, data, i + di2),
652           z3 = fromArray(SPECIES_512, data, i + di3);
653       DoubleVector
654           t1 = z0.add(z2),
655           t2 = z1.add(z3),
656           t3 = z0.sub(z2),
657           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
658       t1.add(t2).intoArray(ret, j);
659       t3.add(t4.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
660       t1.sub(t2).intoArray(ret, j + dj2);
661       t3.add(t4.mul(NEGATE_IM_512)).intoArray(ret, j + dj3);
662     }
663 
664     j += jstep;
665     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
666       final double[] twids = twiddles[k];
667       DoubleVector
668           w1r = broadcast(SPECIES_512, twids[0]),
669           w1i = broadcast(SPECIES_512, -sign * twids[1]).mul(NEGATE_IM_512),
670           w2r = broadcast(SPECIES_512, twids[2]),
671           w2i = broadcast(SPECIES_512, -sign * twids[3]).mul(NEGATE_IM_512),
672           w3r = broadcast(SPECIES_512, twids[4]),
673           w3i = broadcast(SPECIES_512, -sign * twids[5]).mul(NEGATE_IM_512);
674       for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_512, i += LENGTH_512, j += LENGTH_512) {
675         DoubleVector
676             z0 = fromArray(SPECIES_512, data, i),
677             z1 = fromArray(SPECIES_512, data, i + di),
678             z2 = fromArray(SPECIES_512, data, i + di2),
679             z3 = fromArray(SPECIES_512, data, i + di3);
680         DoubleVector
681             t1 = z0.add(z2),
682             t2 = z1.add(z3),
683             t3 = z0.sub(z2),
684             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
685         t1.add(t2).intoArray(ret, j);
686         DoubleVector
687             x1 = t3.add(t4.mul(NEGATE_RE_512)),
688             x2 = t1.sub(t2),
689             x3 = t3.add(t4.mul(NEGATE_IM_512));
690         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
691         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
692         w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj3);
693       }
694     }
695   }
696 }