View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2026.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  
42  import static java.lang.Math.fma;
43  import static jdk.incubator.vector.DoubleVector.SPECIES_128;
44  import static jdk.incubator.vector.DoubleVector.SPECIES_256;
45  import static jdk.incubator.vector.DoubleVector.SPECIES_512;
46  import static jdk.incubator.vector.DoubleVector.broadcast;
47  import static jdk.incubator.vector.DoubleVector.fromArray;
48  import static org.apache.commons.math3.util.FastMath.sqrt;
49  
50  /**
51   * The MixedRadixFactor3 class handles factors of 3 in the FFT.
52   */
53  public class MixedRadixFactor3 extends MixedRadixFactor {
54  
55    private static final double sqrt3_2 = sqrt(3.0) / 2.0;
56    private final int di2;
57    private final int dj2;
58  
59    /**
60     * Available SIMD sizes for Pass 3.
61     */
62    private static int[] simdSizes = {8, 4, 2};
63  
64    /**
65     * Create a new MixedRadixFactor3 instance.
66     *
67     * @param passConstants The pass constants.
68     */
69    public MixedRadixFactor3(PassConstants passConstants) {
70      super(passConstants);
71      di2 = 2 * di;
72      dj2 = 2 * dj;
73    }
74  
75    /**
76     * Check if the requested SIMD length is valid.
77     * @param width Requested SIMD species width.
78     * @return True if this width is supported.
79     */
80    public boolean isValidSIMDWidth(int width) {
81      // Must be a supported width.
82      if (width != 2 && width != 4 && width != 8) {
83        return false;
84      }
85      if (im == 1) {
86        // Interleaved
87        return innerLoopLimit % (width / 2) == 0;
88      } else {
89        // Blocked
90        return innerLoopLimit % width == 0;
91      }
92    }
93  
94    /**
95     * Determine the optimal SIMD width. Currently supported widths are 2, 4 and 8.
96     * If no SIMD width is valid, return 0 to indicate use of the scalar path.
97     * @return The optimal SIMD width.
98     */
99    public int getOptimalSIMDWidth() {
100     // Check the platform specific preferred width.
101     if (isValidSIMDWidth(LENGTH)) {
102       return LENGTH;
103     }
104     // Fall back to a smaller SIMD vector that fits the inner loop limit.
105     for (int size : simdSizes) {
106       if (size >= LENGTH) {
107         // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
108         continue;
109       }
110       if (isValidSIMDWidth(size)) {
111         return size;
112       }
113     }
114     // No valid SIMD width is found.
115     return 0;
116   }
117 
118   /**
119    * Handle factors of 3.
120    */
121   @Override
122   protected void passScalar(PassData passData) {
123     final double[] data = passData.in;
124     final double[] ret = passData.out;
125     int sign = passData.sign;
126     int i = passData.inOffset;
127     int j = passData.outOffset;
128     final double tau = sign * sqrt3_2;
129 
130     // First pass of the 3-point FFT has no twiddle factors.
131     for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
132       final double z0_r = data[i];
133       final double z1_r = data[i + di];
134       final double z2_r = data[i + di2];
135       final double z0_i = data[i + im];
136       final double z1_i = data[i + di + im];
137       final double z2_i = data[i + di2 + im];
138       final double t1_r = z1_r + z2_r;
139       final double t1_i = z1_i + z2_i;
140       final double t2_r = fma(-0.5, t1_r, z0_r);
141       final double t2_i = fma(-0.5, t1_i, z0_i);
142       final double t3_r = tau * (z1_r - z2_r);
143       final double t3_i = tau * (z1_i - z2_i);
144       ret[j] = z0_r + t1_r;
145       ret[j + im] = z0_i + t1_i;
146       ret[j + dj] = t2_r - t3_i;
147       ret[j + dj + im] = t2_i + t3_r;
148       ret[j + dj2] = t2_r + t3_i;
149       ret[j + dj2 + im] = t2_i - t3_r;
150     }
151     j += jstep;
152     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
153       final int index = k * 2;
154       final double w1_r = wr[index];
155       final double w2_r = wr[index + 1];
156       final double w1_i = -sign * wi[index];
157       final double w2_i = -sign * wi[index + 1];
158       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
159         final double z0_r = data[i];
160         final double z1_r = data[i + di];
161         final double z2_r = data[i + di2];
162         final double z0_i = data[i + im];
163         final double z1_i = data[i + di + im];
164         final double z2_i = data[i + di2 + im];
165         final double t1_r = z1_r + z2_r;
166         final double t1_i = z1_i + z2_i;
167         final double t2_r = fma(-0.5, t1_r, z0_r);
168         final double t2_i = fma(-0.5, t1_i, z0_i);
169         final double t3_r = tau * (z1_r - z2_r);
170         final double t3_i = tau * (z1_i - z2_i);
171         ret[j] = z0_r + t1_r;
172         ret[j + im] = z0_i + t1_i;
173         multiplyAndStore(t2_r - t3_i, t2_i + t3_r, w1_r, w1_i, ret, j + dj, j + dj + im);
174         multiplyAndStore(t2_r + t3_i, t2_i - t3_r, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
175       }
176     }
177   }
178 
179   /**
180    * Handle factors of 3 using SIMD vectors.
181    *
182    * @param passData The pass data.
183    */
184   @Override
185   protected void passSIMD(PassData passData) {
186     if (!isValidSIMDWidth(simdWidth)) {
187       passScalar(passData);
188     } else {
189       if (im == 1) {
190         interleaved(passData, simdWidth);
191       } else {
192         blocked(passData, simdWidth);
193       }
194     }
195   }
196 
197   /**
198    * Handle factors of 3 using the chosen SIMD vector.
199    *
200    * @param passData   The pass data.
201    * @param simdLength The SIMD vector length.
202    */
203   private void interleaved(PassData passData, int simdLength) {
204     // Use the preferred SIMD vector.
205     switch (simdLength) {
206       case 2:
207         // 1 complex number per loop iteration.
208         interleaved128(passData);
209         break;
210       case 4:
211         // 2 complex numbers per loop iteration.
212         interleaved256(passData);
213         break;
214       case 8:
215         // 4 complex numbers per loop iteration.
216         interleaved512(passData);
217         break;
218       default:
219         passScalar(passData);
220     }
221   }
222 
223   /**
224    * Handle factors of 3 using the chosen SIMD vector.
225    *
226    * @param passData   The pass data.
227    * @param simdLength The SIMD vector length.
228    */
229   private void blocked(PassData passData, int simdLength) {
230     // Use the preferred SIMD vector.
231     switch (simdLength) {
232       case 2:
233         // 2 complex numbers per loop iteration.
234         blocked128(passData);
235         break;
236       case 4:
237         // 4 complex numbers per loop iteration.
238         blocked256(passData);
239         break;
240       case 8:
241         // 8 complex numbers per loop iteration.
242         blocked512(passData);
243         break;
244       default:
245         passScalar(passData);
246     }
247   }
248 
249   /**
250    * Handle factors of 3 using the 128-bit SIMD vectors.
251    */
252   private void blocked128(PassData passData) {
253     final double[] data = passData.in;
254     final double[] ret = passData.out;
255     int sign = passData.sign;
256     int i = passData.inOffset;
257     int j = passData.outOffset;
258     final double tau = sign * sqrt3_2;
259 
260     // First pass of the 3-point FFT has no twiddle factors.
261     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
262       final DoubleVector
263           z0_r = fromArray(SPECIES_128, data, i),
264           z1_r = fromArray(SPECIES_128, data, i + di),
265           z2_r = fromArray(SPECIES_128, data, i + di2),
266           z0_i = fromArray(SPECIES_128, data, i + im),
267           z1_i = fromArray(SPECIES_128, data, i + di + im),
268           z2_i = fromArray(SPECIES_128, data, i + di2 + im);
269       final DoubleVector
270           t1_r = z1_r.add(z2_r),
271           t1_i = z1_i.add(z2_i),
272           t2_r = t1_r.mul(-0.5).add(z0_r),
273           t2_i = t1_i.mul(-0.5).add(z0_i),
274           t3_r = z1_r.sub(z2_r).mul(tau),
275           t3_i = z1_i.sub(z2_i).mul(tau);
276       z0_r.add(t1_r).intoArray(ret, j);
277       z0_i.add(t1_i).intoArray(ret, j + im);
278       t2_r.sub(t3_i).intoArray(ret, j + dj);
279       t2_i.add(t3_r).intoArray(ret, j + dj + im);
280       t2_r.add(t3_i).intoArray(ret, j + dj2);
281       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
282     }
283 
284     j += jstep;
285     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
286       final int index = k * 2;
287       final double
288           w1_r = wr[index],
289           w2_r = wr[index + 1],
290           w1_i = -sign * wi[index],
291           w2_i = -sign * wi[index + 1];
292       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
293         final DoubleVector
294             z0_r = fromArray(SPECIES_128, data, i),
295             z1_r = fromArray(SPECIES_128, data, i + di),
296             z2_r = fromArray(SPECIES_128, data, i + di2),
297             z0_i = fromArray(SPECIES_128, data, i + im),
298             z1_i = fromArray(SPECIES_128, data, i + di + im),
299             z2_i = fromArray(SPECIES_128, data, i + di2 + im);
300         final DoubleVector
301             t1_r = z1_r.add(z2_r),
302             t1_i = z1_i.add(z2_i),
303             t2_r = t1_r.mul(-0.5).add(z0_r),
304             t2_i = t1_i.mul(-0.5).add(z0_i),
305             t3_r = z1_r.sub(z2_r).mul(tau),
306             t3_i = z1_i.sub(z2_i).mul(tau);
307         z0_r.add(t1_r).intoArray(ret, j);
308         z0_i.add(t1_i).intoArray(ret, j + im);
309         final DoubleVector
310             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
311             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
312         x1_r.mul(w1_r).sub(x1_i.mul(w1_i)).intoArray(ret, j + dj);
313         x2_r.mul(w2_r).sub(x2_i.mul(w2_i)).intoArray(ret, j + dj2);
314         x1_i.mul(w1_r).add(x1_r.mul(w1_i)).intoArray(ret, j + dj + im);
315         x2_i.mul(w2_r).add(x2_r.mul(w2_i)).intoArray(ret, j + dj2 + im);
316       }
317     }
318   }
319 
320   /**
321    * Handle factors of 3 using the 256-bit SIMD vectors.
322    */
323   private void blocked256(PassData passData) {
324     final double[] data = passData.in;
325     final double[] ret = passData.out;
326     int sign = passData.sign;
327     int i = passData.inOffset;
328     int j = passData.outOffset;
329     final double tau = sign * sqrt3_2;
330 
331     // First pass of the 3-point FFT has no twiddle factors.
332     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
333       final DoubleVector
334           z0_r = fromArray(SPECIES_256, data, i),
335           z1_r = fromArray(SPECIES_256, data, i + di),
336           z2_r = fromArray(SPECIES_256, data, i + di2),
337           z0_i = fromArray(SPECIES_256, data, i + im),
338           z1_i = fromArray(SPECIES_256, data, i + di + im),
339           z2_i = fromArray(SPECIES_256, data, i + di2 + im);
340       final DoubleVector
341           t1_r = z1_r.add(z2_r),
342           t1_i = z1_i.add(z2_i),
343           t2_r = t1_r.mul(-0.5).add(z0_r),
344           t2_i = t1_i.mul(-0.5).add(z0_i),
345           t3_r = z1_r.sub(z2_r).mul(tau),
346           t3_i = z1_i.sub(z2_i).mul(tau);
347       z0_r.add(t1_r).intoArray(ret, j);
348       z0_i.add(t1_i).intoArray(ret, j + im);
349       t2_r.sub(t3_i).intoArray(ret, j + dj);
350       t2_i.add(t3_r).intoArray(ret, j + dj + im);
351       t2_r.add(t3_i).intoArray(ret, j + dj2);
352       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
353     }
354 
355     j += jstep;
356     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
357       final int index = k * 2;
358       final double
359           w1_r = wr[index],
360           w2_r = wr[index + 1],
361           w1_i = -sign * wi[index],
362           w2_i = -sign * wi[index + 1];
363       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
364         final DoubleVector
365             z0_r = fromArray(SPECIES_256, data, i),
366             z1_r = fromArray(SPECIES_256, data, i + di),
367             z2_r = fromArray(SPECIES_256, data, i + di2),
368             z0_i = fromArray(SPECIES_256, data, i + im),
369             z1_i = fromArray(SPECIES_256, data, i + di + im),
370             z2_i = fromArray(SPECIES_256, data, i + di2 + im);
371         final DoubleVector
372             t1_r = z1_r.add(z2_r),
373             t1_i = z1_i.add(z2_i),
374             t2_r = t1_r.mul(-0.5).add(z0_r),
375             t2_i = t1_i.mul(-0.5).add(z0_i),
376             t3_r = z1_r.sub(z2_r).mul(tau),
377             t3_i = z1_i.sub(z2_i).mul(tau);
378         z0_r.add(t1_r).intoArray(ret, j);
379         z0_i.add(t1_i).intoArray(ret, j + im);
380         final DoubleVector
381             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
382             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
383         x1_r.mul(w1_r).sub(x1_i.mul(w1_i)).intoArray(ret, j + dj);
384         x2_r.mul(w2_r).sub(x2_i.mul(w2_i)).intoArray(ret, j + dj2);
385         x1_i.mul(w1_r).add(x1_r.mul(w1_i)).intoArray(ret, j + dj + im);
386         x2_i.mul(w2_r).add(x2_r.mul(w2_i)).intoArray(ret, j + dj2 + im);
387       }
388     }
389   }
390 
391   /**
392    * Handle factors of 3 using the 512-bit SIMD vectors.
393    */
394   private void blocked512(PassData passData) {
395     final double[] data = passData.in;
396     final double[] ret = passData.out;
397     int sign = passData.sign;
398     int i = passData.inOffset;
399     int j = passData.outOffset;
400     final double tau = sign * sqrt3_2;
401 
402     // First pass of the 3-point FFT has no twiddle factors.
403     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
404       final DoubleVector
405           z0_r = fromArray(SPECIES_512, data, i),
406           z1_r = fromArray(SPECIES_512, data, i + di),
407           z2_r = fromArray(SPECIES_512, data, i + di2),
408           z0_i = fromArray(SPECIES_512, data, i + im),
409           z1_i = fromArray(SPECIES_512, data, i + di + im),
410           z2_i = fromArray(SPECIES_512, data, i + di2 + im);
411       final DoubleVector
412           t1_r = z1_r.add(z2_r),
413           t1_i = z1_i.add(z2_i),
414           t2_r = t1_r.mul(-0.5).add(z0_r),
415           t2_i = t1_i.mul(-0.5).add(z0_i),
416           t3_r = z1_r.sub(z2_r).mul(tau),
417           t3_i = z1_i.sub(z2_i).mul(tau);
418       z0_r.add(t1_r).intoArray(ret, j);
419       z0_i.add(t1_i).intoArray(ret, j + im);
420       t2_r.sub(t3_i).intoArray(ret, j + dj);
421       t2_i.add(t3_r).intoArray(ret, j + dj + im);
422       t2_r.add(t3_i).intoArray(ret, j + dj2);
423       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
424     }
425 
426     j += jstep;
427     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
428       final int index = k * 2;
429       final double
430           w1_r = wr[index],
431           w2_r = wr[index + 1],
432           w1_i = -sign * wi[index],
433           w2_i = -sign * wi[index + 1];
434       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
435         final DoubleVector
436             z0_r = fromArray(SPECIES_512, data, i),
437             z1_r = fromArray(SPECIES_512, data, i + di),
438             z2_r = fromArray(SPECIES_512, data, i + di2),
439             z0_i = fromArray(SPECIES_512, data, i + im),
440             z1_i = fromArray(SPECIES_512, data, i + di + im),
441             z2_i = fromArray(SPECIES_512, data, i + di2 + im);
442         final DoubleVector
443             t1_r = z1_r.add(z2_r),
444             t1_i = z1_i.add(z2_i),
445             t2_r = t1_r.mul(-0.5).add(z0_r),
446             t2_i = t1_i.mul(-0.5).add(z0_i),
447             t3_r = z1_r.sub(z2_r).mul(tau),
448             t3_i = z1_i.sub(z2_i).mul(tau);
449         z0_r.add(t1_r).intoArray(ret, j);
450         z0_i.add(t1_i).intoArray(ret, j + im);
451         final DoubleVector
452             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
453             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
454         x1_r.mul(w1_r).sub(x1_i.mul(w1_i)).intoArray(ret, j + dj);
455         x2_r.mul(w2_r).sub(x2_i.mul(w2_i)).intoArray(ret, j + dj2);
456         x1_i.mul(w1_r).add(x1_r.mul(w1_i)).intoArray(ret, j + dj + im);
457         x2_i.mul(w2_r).add(x2_r.mul(w2_i)).intoArray(ret, j + dj2 + im);
458       }
459     }
460   }
461 
462   /**
463    * Handle factors of 3 using the 128-bit SIMD vectors.
464    */
465   private void interleaved128(PassData passData) {
466     final double[] data = passData.in;
467     final double[] ret = passData.out;
468     int sign = passData.sign;
469     int i = passData.inOffset;
470     int j = passData.outOffset;
471     final double tau = sign * sqrt3_2;
472     // First pass of the 3-point FFT has no twiddle factors.
473     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
474       final DoubleVector
475           z0 = fromArray(SPECIES_128, data, i),
476           z1 = fromArray(SPECIES_128, data, i + di),
477           z2 = fromArray(SPECIES_128, data, i + di2);
478       final DoubleVector
479           t1 = z1.add(z2),
480           t2 = t1.mul(-0.5).add(z0),
481           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
482       z0.add(t1).intoArray(ret, j);
483       t2.add(t3.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
484       t2.add(t3.mul(NEGATE_IM_128)).intoArray(ret, j + dj2);
485     }
486 
487     j += jstep;
488     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
489       final int index = k * 2;
490       final DoubleVector
491           w1r = broadcast(SPECIES_128, wr[index]),
492           w2r = broadcast(SPECIES_128, wr[index + 1]),
493           w1i = broadcast(SPECIES_128, -sign * wi[index]).mul(NEGATE_IM_128),
494           w2i = broadcast(SPECIES_128, -sign * wi[index + 1]).mul(NEGATE_IM_128);
495       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
496         final DoubleVector
497             z0 = fromArray(SPECIES_128, data, i),
498             z1 = fromArray(SPECIES_128, data, i + di),
499             z2 = fromArray(SPECIES_128, data, i + di2);
500         final DoubleVector
501             t1 = z1.add(z2),
502             t2 = t1.mul(-0.5).add(z0),
503             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
504         z0.add(t1).intoArray(ret, j);
505         z0.add(t1).intoArray(ret, j);
506         final DoubleVector
507             x1 = t3.fma(NEGATE_RE_128, t2),
508             x2 = t3.fma(NEGATE_IM_128, t2);
509         w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
510         w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
511       }
512     }
513   }
514 
515   /**
516    * Handle factors of 3 using the 256-bit SIMD vectors.
517    */
518   private void interleaved256(PassData passData) {
519     final double[] data = passData.in;
520     final double[] ret = passData.out;
521     int sign = passData.sign;
522     int i = passData.inOffset;
523     int j = passData.outOffset;
524     final double tau = sign * sqrt3_2;
525     // First pass of the 3-point FFT has no twiddle factors.
526     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
527       final DoubleVector
528           z0 = fromArray(SPECIES_256, data, i),
529           z1 = fromArray(SPECIES_256, data, i + di),
530           z2 = fromArray(SPECIES_256, data, i + di2);
531       final DoubleVector
532           t1 = z1.add(z2),
533           t2 = t1.mul(-0.5).add(z0),
534           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
535       z0.add(t1).intoArray(ret, j);
536       t2.add(t3.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
537       t2.add(t3.mul(NEGATE_IM_256)).intoArray(ret, j + dj2);
538     }
539 
540     j += jstep;
541     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
542       final int index = k * 2;
543       final DoubleVector
544           w1r = broadcast(SPECIES_256, wr[index]),
545           w2r = broadcast(SPECIES_256, wr[index + 1]),
546           w1i = broadcast(SPECIES_256, -sign * wi[index]).mul(NEGATE_IM_256),
547           w2i = broadcast(SPECIES_256, -sign * wi[index + 1]).mul(NEGATE_IM_256);
548       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
549         final DoubleVector
550             z0 = fromArray(SPECIES_256, data, i),
551             z1 = fromArray(SPECIES_256, data, i + di),
552             z2 = fromArray(SPECIES_256, data, i + di2);
553         final DoubleVector
554             t1 = z1.add(z2),
555             t2 = t1.mul(-0.5).add(z0),
556             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
557         z0.add(t1).intoArray(ret, j);
558         final DoubleVector
559             x1 = t3.fma(NEGATE_RE_256, t2),
560             x2 = t3.fma(NEGATE_IM_256, t2);
561         w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
562         w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
563       }
564     }
565   }
566 
567   /**
568    * Handle factors of 3 using the 512-bit SIMD vectors.
569    */
570   private void interleaved512(PassData passData) {
571     final double[] data = passData.in;
572     final double[] ret = passData.out;
573     int sign = passData.sign;
574     int i = passData.inOffset;
575     int j = passData.outOffset;
576     final double tau = sign * sqrt3_2;
577     // First pass of the 3-point FFT has no twiddle factors.
578     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
579       final DoubleVector
580           z0 = fromArray(SPECIES_512, data, i),
581           z1 = fromArray(SPECIES_512, data, i + di),
582           z2 = fromArray(SPECIES_512, data, i + di2);
583       final DoubleVector
584           t1 = z1.add(z2),
585           t2 = t1.mul(-0.5).add(z0),
586           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
587       z0.add(t1).intoArray(ret, j);
588       t2.add(t3.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
589       t2.add(t3.mul(NEGATE_IM_512)).intoArray(ret, j + dj2);
590     }
591 
592     j += jstep;
593     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
594       final int index = k * 2;
595       final DoubleVector
596           w1r = broadcast(SPECIES_512, wr[index]),
597           w2r = broadcast(SPECIES_512, wr[index + 1]),
598           w1i = broadcast(SPECIES_512, -sign * wi[index]).mul(NEGATE_IM_512),
599           w2i = broadcast(SPECIES_512, -sign * wi[index + 1]).mul(NEGATE_IM_512);
600       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
601         final DoubleVector
602             z0 = fromArray(SPECIES_512, data, i),
603             z1 = fromArray(SPECIES_512, data, i + di),
604             z2 = fromArray(SPECIES_512, data, i + di2);
605         final DoubleVector
606             t1 = z1.add(z2),
607             t2 = t1.mul(-0.5).add(z0),
608             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
609         z0.add(t1).intoArray(ret, j);
610         final DoubleVector
611             x1 = t3.fma(NEGATE_RE_512, t2),
612             x2 = t3.fma(NEGATE_IM_512, t2);
613         w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
614         w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
615       }
616     }
617   }
618 }