View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2025.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  
42  import static jdk.incubator.vector.DoubleVector.SPECIES_128;
43  import static jdk.incubator.vector.DoubleVector.SPECIES_256;
44  import static jdk.incubator.vector.DoubleVector.SPECIES_512;
45  import static jdk.incubator.vector.DoubleVector.broadcast;
46  import static jdk.incubator.vector.DoubleVector.fromArray;
47  
48  /**
49   * The MixedRadixFactor4 class handles factors of 4 in the FFT.
50   */
51  public class MixedRadixFactor4 extends MixedRadixFactor {
52  
53    private final int di2;
54    private final int di3;
55    private final int dj2;
56    private final int dj3;
57  
58    /**
59     * Construct a MixedRadixFactor4.
60     * @param passConstants PassConstants.
61     */
62    public MixedRadixFactor4(PassConstants passConstants) {
63      super(passConstants);
64      di2 = 2 * di;
65      di3 = 3 * di;
66      dj2 = 2 * dj;
67      dj3 = 3 * dj;
68    }
69  
70    /**
71     * Handle factors of 4.
72     */
73    @Override
74    protected void passScalar(PassData passData) {
75      final double[] data = passData.in;
76      final double[] ret = passData.out;
77      int sign = passData.sign;
78      int i = passData.inOffset;
79      int j = passData.outOffset;
80      // First pass of the 4-point FFT has no twiddle factors.
81      for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
82        final double z0_r = data[i];
83        final double z1_r = data[i + di];
84        final double z2_r = data[i + di2];
85        final double z3_r = data[i + di3];
86        final double z0_i = data[i + im];
87        final double z1_i = data[i + di + im];
88        final double z2_i = data[i + di2 + im];
89        final double z3_i = data[i + di3 + im];
90        final double t1_r = z0_r + z2_r;
91        final double t1_i = z0_i + z2_i;
92        final double t2_r = z1_r + z3_r;
93        final double t2_i = z1_i + z3_i;
94        final double t3_r = z0_r - z2_r;
95        final double t3_i = z0_i - z2_i;
96        final double t4_r = sign * (z1_r - z3_r);
97        final double t4_i = sign * (z1_i - z3_i);
98        ret[j] = t1_r + t2_r;
99        ret[j + im] = t1_i + t2_i;
100       ret[j + dj] = t3_r - t4_i;
101       ret[j + dj + im] = t3_i + t4_r;
102       ret[j + dj2] = t1_r - t2_r;
103       ret[j + dj2 + im] = t1_i - t2_i;
104       ret[j + dj3] = t3_r + t4_i;
105       ret[j + dj3 + im] = t3_i - t4_r;
106     }
107     j += jstep;
108     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
109       final int index = k * 3;
110       final double w1_r = wr[index];
111       final double w2_r = wr[index + 1];
112       final double w3_r = wr[index + 2];
113       final double w1_i = -sign * wi[index];
114       final double w2_i = -sign * wi[index + 1];
115       final double w3_i = -sign * wi[index + 2];
116       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
117         final double z0_r = data[i];
118         final double z1_r = data[i + di];
119         final double z2_r = data[i + di2];
120         final double z3_r = data[i + di3];
121         final double z0_i = data[i + im];
122         final double z1_i = data[i + di + im];
123         final double z2_i = data[i + di2 + im];
124         final double z3_i = data[i + di3 + im];
125         final double t1_r = z0_r + z2_r;
126         final double t1_i = z0_i + z2_i;
127         final double t2_r = z1_r + z3_r;
128         final double t2_i = z1_i + z3_i;
129         final double t3_r = z0_r - z2_r;
130         final double t3_i = z0_i - z2_i;
131         final double t4_r = sign * (z1_r - z3_r);
132         final double t4_i = sign * (z1_i - z3_i);
133         ret[j] = t1_r + t2_r;
134         ret[j + im] = t1_i + t2_i;
135         multiplyAndStore(t3_r - t4_i, t3_i + t4_r, w1_r, w1_i, ret, j + dj, j + dj + im);
136         multiplyAndStore(t1_r - t2_r, t1_i - t2_i, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
137         multiplyAndStore(t3_r + t4_i, t3_i - t4_r, w3_r, w3_i, ret, j + dj3, j + dj3 + im);
138       }
139     }
140   }
141 
142   /**
143    * Handle factors of 4 using SIMD vectors.
144    */
145   @Override
146   protected void passSIMD(PassData passData) {
147     if (im == 1) {
148       interleaved(passData);
149     } else {
150       blocked(passData);
151     }
152   }
153 
154   /**
155    * Available SIMD sizes for Pass 4.
156    */
157   private static int[] simdSizes = {8, 4, 2};
158 
159   /**
160    * Handle factors of 4 using the chosen SIMD vector.
161    * @param passData The pass data.
162    * @param simdLength The SIMD vector length.
163    */
164   private void interleaved(PassData passData, int simdLength) {
165     // Use the preferred SIMD vector.
166     switch (simdLength) {
167       case 2:
168         // 1 complex number per loop iteration.
169         interleaved128(passData);
170         break;
171       case 4:
172         // 2 complex numbers per loop iteration.
173         interleaved256(passData);
174         break;
175       case 8:
176         // 4 complex numbers per loop iteration.
177         interleaved512(passData);
178         break;
179       default:
180         passScalar(passData);
181     }
182   }
183 
184   /**
185    * Handle factors of 4 using the chosen SIMD vector.
186    * @param passData The pass data.
187    * @param simdLength The SIMD vector length.
188    */
189   private void blocked(PassData passData, int simdLength) {
190     // Use the preferred SIMD vector.
191     switch (simdLength) {
192       case 2:
193         // 2 complex numbers per loop iteration.
194         blocked128(passData);
195         break;
196       case 4:
197         // 4 complex numbers per loop iteration.
198         blocked256(passData);
199         break;
200       case 8:
201         // 8 complex numbers per loop iteration.
202         blocked512(passData);
203         break;
204       default:
205         passScalar(passData);
206     }
207   }
208 
209   /**
210    * Handle factors of 4 using the 128-bit SIMD vectors.
211    * @param passData The interleaved pass data.
212    */
213   private void interleaved(PassData passData) {
214     if (innerLoopLimit % INTERLEAVED_LOOP == 0) {
215       // Use the preferred SIMD vector.
216       interleaved(passData, LENGTH);
217     } else {
218       // If the inner loop limit is odd, use the scalar method unless the inner loop limit is 1.
219       if (innerLoopLimit % 2 != 0 && innerLoopLimit != 1) {
220         passScalar(passData);
221         return;
222       }
223       // Fall back to a smaller SIMD vector that fits the inner loop limit.
224       for (int size : simdSizes) {
225         if (size >= LENGTH) {
226           // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
227           continue;
228         }
229         // Divide the SIMD size by two because for interleaved a single SIMD vectors stores both real and imaginary parts.
230         if (innerLoopLimit % (size / 2) == 0) {
231           interleaved(passData, size);
232         }
233       }
234     }
235   }
236 
237   /**
238    * Handle factors of 4 using the 128-bit SIMD vectors.
239    * @param passData The pass blocked data.
240    */
241   private void blocked(PassData passData) {
242     if (innerLoopLimit % BLOCK_LOOP == 0) {
243       // Use the preferred SIMD vector.
244       blocked(passData, LENGTH);
245     } else {
246       // If the inner loop limit is odd, use the scalar method unless the inner loop limit is 1.
247       if (innerLoopLimit % 2 != 0) {
248         passScalar(passData);
249         return;
250       }
251       // Fall back to a smaller SIMD vector that fits the inner loop limit.
252       for (int size : simdSizes) {
253         if (size >= LENGTH) {
254           // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
255           continue;
256         }
257         // Divide the SIMD size by two because for interleaved a single SIMD vectors stores both real and imaginary parts.
258         if (innerLoopLimit % size == 0) {
259           blocked(passData, size);
260         }
261       }
262     }
263   }
264 
265   /**
266    * Handle factors of 4 using the 128-bit SIMD vectors.
267    */
268   private void blocked128(PassData passData) {
269     final double[] data = passData.in;
270     final double[] ret = passData.out;
271     int sign = passData.sign;
272     int i = passData.inOffset;
273     int j = passData.outOffset;
274     // First pass of the 4-point FFT has no twiddle factors.
275     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
276       final DoubleVector
277           z0_r = fromArray(SPECIES_128, data, i),
278           z0_i = fromArray(SPECIES_128, data, i + im),
279           z1_r = fromArray(SPECIES_128, data, i + di),
280           z1_i = fromArray(SPECIES_128, data, i + di + im),
281           z2_r = fromArray(SPECIES_128, data, i + di2),
282           z2_i = fromArray(SPECIES_128, data, i + di2 + im),
283           z3_r = fromArray(SPECIES_128, data, i + di3),
284           z3_i = fromArray(SPECIES_128, data, i + di3 + im);
285       final DoubleVector
286           t1_r = z0_r.add(z2_r),
287           t1_i = z0_i.add(z2_i),
288           t2_r = z1_r.add(z3_r),
289           t2_i = z1_i.add(z3_i),
290           t3_r = z0_r.sub(z2_r),
291           t3_i = z0_i.sub(z2_i),
292           t4_r = z1_r.sub(z3_r).mul(sign),
293           t4_i = z1_i.sub(z3_i).mul(sign);
294       t1_r.add(t2_r).intoArray(ret, j);
295       t1_i.add(t2_i).intoArray(ret, j + im);
296       t3_r.sub(t4_i).intoArray(ret, j + dj);
297       t3_i.add(t4_r).intoArray(ret, j + dj + im);
298       t1_r.sub(t2_r).intoArray(ret, j + dj2);
299       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
300       t3_r.add(t4_i).intoArray(ret, j + dj3);
301       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
302     }
303 
304     j += jstep;
305     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
306       final int index = 3 * k;
307       final DoubleVector
308           w1r = broadcast(SPECIES_128, wr[index]),
309           w2r = broadcast(SPECIES_128, wr[index + 1]),
310           w3r = broadcast(SPECIES_128, wr[index + 2]),
311           w1i = broadcast(SPECIES_128, -sign * wi[index]),
312           w2i = broadcast(SPECIES_128, -sign * wi[index + 1]),
313           w3i = broadcast(SPECIES_128, -sign * wi[index + 2]);
314       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
315         final DoubleVector
316             z0_r = fromArray(SPECIES_128, data, i),
317             z0_i = fromArray(SPECIES_128, data, i + im),
318             z1_r = fromArray(SPECIES_128, data, i + di),
319             z1_i = fromArray(SPECIES_128, data, i + di + im),
320             z2_r = fromArray(SPECIES_128, data, i + di2),
321             z2_i = fromArray(SPECIES_128, data, i + di2 + im),
322             z3_r = fromArray(SPECIES_128, data, i + di3),
323             z3_i = fromArray(SPECIES_128, data, i + di3 + im);
324         final DoubleVector
325             t1_r = z0_r.add(z2_r),
326             t1_i = z0_i.add(z2_i),
327             t2_r = z1_r.add(z3_r),
328             t2_i = z1_i.add(z3_i),
329             t3_r = z0_r.sub(z2_r),
330             t3_i = z0_i.sub(z2_i),
331             t4_r = z1_r.sub(z3_r).mul(sign),
332             t4_i = z1_i.sub(z3_i).mul(sign);
333         t1_r.add(t2_r).intoArray(ret, j);
334         t1_i.add(t2_i).intoArray(ret, j + im);
335         DoubleVector
336             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
337             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
338             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
339         w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
340         w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
341         w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
342         w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
343         w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
344         w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
345       }
346     }
347   }
348 
349   /**
350    * Handle factors of 4 using the 256-bit SIMD vectors.
351    */
352   private void blocked256(PassData passData) {
353     final double[] data = passData.in;
354     final double[] ret = passData.out;
355     int sign = passData.sign;
356     int i = passData.inOffset;
357     int j = passData.outOffset;
358     // First pass of the 4-point FFT has no twiddle factors.
359     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
360       final DoubleVector
361           z0_r = fromArray(SPECIES_256, data, i),
362           z1_r = fromArray(SPECIES_256, data, i + di),
363           z2_r = fromArray(SPECIES_256, data, i + di2),
364           z3_r = fromArray(SPECIES_256, data, i + di3),
365           z0_i = fromArray(SPECIES_256, data, i + im),
366           z1_i = fromArray(SPECIES_256, data, i + di + im),
367           z2_i = fromArray(SPECIES_256, data, i + di2 + im),
368           z3_i = fromArray(SPECIES_256, data, i + di3 + im);
369       final DoubleVector
370           t1_r = z0_r.add(z2_r),
371           t1_i = z0_i.add(z2_i),
372           t2_r = z1_r.add(z3_r),
373           t2_i = z1_i.add(z3_i),
374           t3_r = z0_r.sub(z2_r),
375           t3_i = z0_i.sub(z2_i),
376           t4_r = z1_r.sub(z3_r).mul(sign),
377           t4_i = z1_i.sub(z3_i).mul(sign);
378       t1_r.add(t2_r).intoArray(ret, j);
379       t3_r.sub(t4_i).intoArray(ret, j + dj);
380       t1_r.sub(t2_r).intoArray(ret, j + dj2);
381       t3_r.add(t4_i).intoArray(ret, j + dj3);
382       t1_i.add(t2_i).intoArray(ret, j + im);
383       t3_i.add(t4_r).intoArray(ret, j + dj + im);
384       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
385       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
386     }
387 
388     j += jstep;
389     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
390       final int index = 3 * k;
391       final DoubleVector
392           w1r = broadcast(SPECIES_256, wr[index]),
393           w2r = broadcast(SPECIES_256, wr[index + 1]),
394           w3r = broadcast(SPECIES_256, wr[index + 2]),
395           w1i = broadcast(SPECIES_256, -sign * wi[index]),
396           w2i = broadcast(SPECIES_256, -sign * wi[index + 1]),
397           w3i = broadcast(SPECIES_256, -sign * wi[index + 2]);
398       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
399         final DoubleVector
400             z0_r = fromArray(SPECIES_256, data, i),
401             z1_r = fromArray(SPECIES_256, data, i + di),
402             z2_r = fromArray(SPECIES_256, data, i + di2),
403             z3_r = fromArray(SPECIES_256, data, i + di3),
404             z0_i = fromArray(SPECIES_256, data, i + im),
405             z1_i = fromArray(SPECIES_256, data, i + di + im),
406             z2_i = fromArray(SPECIES_256, data, i + di2 + im),
407             z3_i = fromArray(SPECIES_256, data, i + di3 + im);
408         final DoubleVector
409             t1_r = z0_r.add(z2_r),
410             t1_i = z0_i.add(z2_i),
411             t2_r = z1_r.add(z3_r),
412             t2_i = z1_i.add(z3_i),
413             t3_r = z0_r.sub(z2_r),
414             t3_i = z0_i.sub(z2_i),
415             t4_r = z1_r.sub(z3_r).mul(sign),
416             t4_i = z1_i.sub(z3_i).mul(sign);
417         t1_r.add(t2_r).intoArray(ret, j);
418         t1_i.add(t2_i).intoArray(ret, j + im);
419         DoubleVector
420             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
421             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
422             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
423         w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
424         w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
425         w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
426         w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
427         w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
428         w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
429       }
430     }
431   }
432 
433   /**
434    * Handle factors of 4 using the 512-bit SIMD vectors.
435    */
436   private void blocked512(PassData passData) {
437     final double[] data = passData.in;
438     final double[] ret = passData.out;
439     int sign = passData.sign;
440     int i = passData.inOffset;
441     int j = passData.outOffset;
442     // First pass of the 4-point FFT has no twiddle factors.
443     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
444       final DoubleVector
445           z0_r = fromArray(SPECIES_512, data, i),
446           z1_r = fromArray(SPECIES_512, data, i + di),
447           z2_r = fromArray(SPECIES_512, data, i + di2),
448           z3_r = fromArray(SPECIES_512, data, i + di3),
449           z0_i = fromArray(SPECIES_512, data, i + im),
450           z1_i = fromArray(SPECIES_512, data, i + di + im),
451           z2_i = fromArray(SPECIES_512, data, i + di2 + im),
452           z3_i = fromArray(SPECIES_512, data, i + di3 + im);
453       final DoubleVector
454           t1_r = z0_r.add(z2_r),
455           t1_i = z0_i.add(z2_i),
456           t2_r = z1_r.add(z3_r),
457           t2_i = z1_i.add(z3_i),
458           t3_r = z0_r.sub(z2_r),
459           t3_i = z0_i.sub(z2_i),
460           t4_r = z1_r.sub(z3_r).mul(sign),
461           t4_i = z1_i.sub(z3_i).mul(sign);
462       t1_r.add(t2_r).intoArray(ret, j);
463       t3_r.sub(t4_i).intoArray(ret, j + dj);
464       t1_r.sub(t2_r).intoArray(ret, j + dj2);
465       t3_r.add(t4_i).intoArray(ret, j + dj3);
466       t1_i.add(t2_i).intoArray(ret, j + im);
467       t3_i.add(t4_r).intoArray(ret, j + dj + im);
468       t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
469       t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
470     }
471 
472     j += jstep;
473     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
474       final int index = 3 * k;
475       final DoubleVector
476           w1r = broadcast(SPECIES_512, wr[index]),
477           w2r = broadcast(SPECIES_512, wr[index + 1]),
478           w3r = broadcast(SPECIES_512, wr[index + 2]),
479           w1i = broadcast(SPECIES_512, -sign * wi[index]),
480           w2i = broadcast(SPECIES_512, -sign * wi[index + 1]),
481           w3i = broadcast(SPECIES_512, -sign * wi[index + 2]);
482       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
483         final DoubleVector
484             z0_r = fromArray(SPECIES_512, data, i),
485             z1_r = fromArray(SPECIES_512, data, i + di),
486             z2_r = fromArray(SPECIES_512, data, i + di2),
487             z3_r = fromArray(SPECIES_512, data, i + di3),
488             z0_i = fromArray(SPECIES_512, data, i + im),
489             z1_i = fromArray(SPECIES_512, data, i + di + im),
490             z2_i = fromArray(SPECIES_512, data, i + di2 + im),
491             z3_i = fromArray(SPECIES_512, data, i + di3 + im);
492         final DoubleVector
493             t1_r = z0_r.add(z2_r),
494             t1_i = z0_i.add(z2_i),
495             t2_r = z1_r.add(z3_r),
496             t2_i = z1_i.add(z3_i),
497             t3_r = z0_r.sub(z2_r),
498             t3_i = z0_i.sub(z2_i),
499             t4_r = z1_r.sub(z3_r).mul(sign),
500             t4_i = z1_i.sub(z3_i).mul(sign);
501         t1_r.add(t2_r).intoArray(ret, j);
502         t1_i.add(t2_i).intoArray(ret, j + im);
503         DoubleVector
504             x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
505             x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
506             x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
507         w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
508         w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
509         w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
510         w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
511         w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
512         w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
513       }
514     }
515   }
516 
517   /**
518    * Handle factors of 4 using the 128-bit SIMD vectors.
519    */
520   private void interleaved128(PassData passData) {
521     final double[] data = passData.in;
522     final double[] ret = passData.out;
523     int sign = passData.sign;
524     int i = passData.inOffset;
525     int j = passData.outOffset;
526     // First pass of the 4-point FFT has no twiddle factors.
527     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
528       DoubleVector
529           z0 = fromArray(SPECIES_128, data, i),
530           z1 = fromArray(SPECIES_128, data, i + di),
531           z2 = fromArray(SPECIES_128, data, i + di2),
532           z3 = fromArray(SPECIES_128, data, i + di3);
533       DoubleVector
534           t1 = z0.add(z2),
535           t2 = z1.add(z3),
536           t3 = z0.sub(z2),
537           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
538       t1.add(t2).intoArray(ret, j);
539       t3.add(t4.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
540       t1.sub(t2).intoArray(ret, j + dj2);
541       t3.add(t4.mul(NEGATE_IM_128)).intoArray(ret, j + dj3);
542     }
543 
544     j += jstep;
545     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
546 //      final double[] twids = twiddles[k];
547 //      DoubleVector
548 //          w1r = broadcast(SPECIES_128, twids[0]),
549 //          w1i = broadcast(SPECIES_128, -sign * twids[1]).mul(NEGATE_IM_128),
550 //          w2r = broadcast(SPECIES_128, twids[2]),
551 //          w2i = broadcast(SPECIES_128, -sign * twids[3]).mul(NEGATE_IM_128),
552 //          w3r = broadcast(SPECIES_128, twids[4]),
553 //          w3i = broadcast(SPECIES_128, -sign * twids[5]).mul(NEGATE_IM_128);
554       final int index = 3 * k;
555       DoubleVector
556           w1r = broadcast(SPECIES_128, wr[index]),
557           w2r = broadcast(SPECIES_128, wr[index + 1]),
558           w3r = broadcast(SPECIES_128, wr[index + 2]),
559           w1i = broadcast(SPECIES_128, -sign * wi[index]).mul(NEGATE_IM_128),
560           w2i = broadcast(SPECIES_128, -sign * wi[index + 1]).mul(NEGATE_IM_128),
561           w3i = broadcast(SPECIES_128, -sign * wi[index + 2]).mul(NEGATE_IM_128);
562       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
563         DoubleVector
564             z0 = fromArray(SPECIES_128, data, i),
565             z1 = fromArray(SPECIES_128, data, i + di),
566             z2 = fromArray(SPECIES_128, data, i + di2),
567             z3 = fromArray(SPECIES_128, data, i + di3);
568         DoubleVector
569             t1 = z0.add(z2),
570             t2 = z1.add(z3),
571             t3 = z0.sub(z2),
572             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
573         t1.add(t2).intoArray(ret, j);
574         DoubleVector
575             x1 = t3.add(t4.mul(NEGATE_RE_128)),
576             x2 = t1.sub(t2),
577             x3 = t3.add(t4.mul(NEGATE_IM_128));
578         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
579         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
580         w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj3);
581       }
582     }
583   }
584 
585   /**
586    * Handle factors of 4 using the 256-bit SIMD vectors.
587    */
588   private void interleaved256(PassData passData) {
589     final double[] data = passData.in;
590     final double[] ret = passData.out;
591     int sign = passData.sign;
592     int i = passData.inOffset;
593     int j = passData.outOffset;
594     // First pass of the 4-point FFT has no twiddle factors.
595     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
596       DoubleVector
597           z0 = fromArray(SPECIES_256, data, i),
598           z1 = fromArray(SPECIES_256, data, i + di),
599           z2 = fromArray(SPECIES_256, data, i + di2),
600           z3 = fromArray(SPECIES_256, data, i + di3);
601       DoubleVector
602           t1 = z0.add(z2),
603           t2 = z1.add(z3),
604           t3 = z0.sub(z2),
605           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
606       t1.add(t2).intoArray(ret, j);
607       t3.add(t4.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
608       t1.sub(t2).intoArray(ret, j + dj2);
609       t3.add(t4.mul(NEGATE_IM_256)).intoArray(ret, j + dj3);
610     }
611 
612     j += jstep;
613     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
614 //      final double[] twids = twiddles[k];
615 //      DoubleVector
616 //          w1r = broadcast(SPECIES_256, twids[0]),
617 //          w1i = broadcast(SPECIES_256, -sign * twids[1]).mul(NEGATE_IM_256),
618 //          w2r = broadcast(SPECIES_256, twids[2]),
619 //          w2i = broadcast(SPECIES_256, -sign * twids[3]).mul(NEGATE_IM_256),
620 //          w3r = broadcast(SPECIES_256, twids[4]),
621 //          w3i = broadcast(SPECIES_256, -sign * twids[5]).mul(NEGATE_IM_256);
622       final int index = 3 * k;
623       DoubleVector
624           w1r = broadcast(SPECIES_256, wr[index]),
625           w2r = broadcast(SPECIES_256, wr[index + 1]),
626           w3r = broadcast(SPECIES_256, wr[index + 2]),
627           w1i = broadcast(SPECIES_256, -sign * wi[index]).mul(NEGATE_IM_256),
628           w2i = broadcast(SPECIES_256, -sign * wi[index + 1]).mul(NEGATE_IM_256),
629           w3i = broadcast(SPECIES_256, -sign * wi[index + 2]).mul(NEGATE_IM_256);
630       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
631         DoubleVector
632             z0 = fromArray(SPECIES_256, data, i),
633             z1 = fromArray(SPECIES_256, data, i + di),
634             z2 = fromArray(SPECIES_256, data, i + di2),
635             z3 = fromArray(SPECIES_256, data, i + di3);
636         DoubleVector
637             t1 = z0.add(z2),
638             t2 = z1.add(z3),
639             t3 = z0.sub(z2),
640             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
641         t1.add(t2).intoArray(ret, j);
642         DoubleVector
643             x1 = t3.add(t4.mul(NEGATE_RE_256)),
644             x2 = t1.sub(t2),
645             x3 = t3.add(t4.mul(NEGATE_IM_256));
646         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
647         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
648         w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj3);
649       }
650     }
651   }
652 
653   /**
654    * Handle factors of 4 using the 512-bit SIMD vectors.
655    */
656   private void interleaved512(PassData passData) {
657     final double[] data = passData.in;
658     final double[] ret = passData.out;
659     int sign = passData.sign;
660     int i = passData.inOffset;
661     int j = passData.outOffset;
662     // First pass of the 4-point FFT has no twiddle factors.
663     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
664       DoubleVector
665           z0 = fromArray(SPECIES_512, data, i),
666           z1 = fromArray(SPECIES_512, data, i + di),
667           z2 = fromArray(SPECIES_512, data, i + di2),
668           z3 = fromArray(SPECIES_512, data, i + di3);
669       DoubleVector
670           t1 = z0.add(z2),
671           t2 = z1.add(z3),
672           t3 = z0.sub(z2),
673           t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
674       t1.add(t2).intoArray(ret, j);
675       t3.add(t4.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
676       t1.sub(t2).intoArray(ret, j + dj2);
677       t3.add(t4.mul(NEGATE_IM_512)).intoArray(ret, j + dj3);
678     }
679 
680     j += jstep;
681     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
682 //      final double[] twids = twiddles[k];
683 //      DoubleVector
684 //          w1r = broadcast(SPECIES_512, twids[0]),
685 //          w1i = broadcast(SPECIES_512, -sign * twids[1]).mul(NEGATE_IM_512),
686 //          w2r = broadcast(SPECIES_512, twids[2]),
687 //          w2i = broadcast(SPECIES_512, -sign * twids[3]).mul(NEGATE_IM_512),
688 //          w3r = broadcast(SPECIES_512, twids[4]),
689 //          w3i = broadcast(SPECIES_512, -sign * twids[5]).mul(NEGATE_IM_512);
690       final int index = 3 * k;
691       DoubleVector
692           w1r = broadcast(SPECIES_512, wr[index]),
693           w2r = broadcast(SPECIES_512, wr[index + 1]),
694           w3r = broadcast(SPECIES_512, wr[index + 2]),
695           w1i = broadcast(SPECIES_512, -sign * wi[index]).mul(NEGATE_IM_512),
696           w2i = broadcast(SPECIES_512, -sign * wi[index + 1]).mul(NEGATE_IM_512),
697           w3i = broadcast(SPECIES_512, -sign * wi[index + 2]).mul(NEGATE_IM_512);
698       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
699         DoubleVector
700             z0 = fromArray(SPECIES_512, data, i),
701             z1 = fromArray(SPECIES_512, data, i + di),
702             z2 = fromArray(SPECIES_512, data, i + di2),
703             z3 = fromArray(SPECIES_512, data, i + di3);
704         DoubleVector
705             t1 = z0.add(z2),
706             t2 = z1.add(z3),
707             t3 = z0.sub(z2),
708             t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
709         t1.add(t2).intoArray(ret, j);
710         DoubleVector
711             x1 = t3.add(t4.mul(NEGATE_RE_512)),
712             x2 = t1.sub(t2),
713             x3 = t3.add(t4.mul(NEGATE_IM_512));
714         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
715         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
716         w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj3);
717       }
718     }
719   }
720 }