View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2025.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  
42  import static java.lang.Math.fma;
43  import static jdk.incubator.vector.DoubleVector.SPECIES_128;
44  import static jdk.incubator.vector.DoubleVector.SPECIES_256;
45  import static jdk.incubator.vector.DoubleVector.SPECIES_512;
46  import static jdk.incubator.vector.DoubleVector.broadcast;
47  import static jdk.incubator.vector.DoubleVector.fromArray;
48  import static org.apache.commons.math3.util.FastMath.sqrt;
49  
50  /**
51   * The MixedRadixFactor3 class handles factors of 3 in the FFT.
52   */
53  public class MixedRadixFactor3 extends MixedRadixFactor {
54  
55    private static final double sqrt3_2 = sqrt(3.0) / 2.0;
56    private final int di2;
57    private final int dj2;
58  
59    /**
60     * Create a new MixedRadixFactor3 instance.
61     *
62     * @param passConstants The pass constants.
63     */
64    public MixedRadixFactor3(PassConstants passConstants) {
65      super(passConstants);
66      di2 = 2 * di;
67      dj2 = 2 * dj;
68    }
69  
70    /**
71     * Handle factors of 3.
72     */
73    @Override
74    protected void passScalar(PassData passData) {
75      final double[] data = passData.in;
76      final double[] ret = passData.out;
77      int sign = passData.sign;
78      int i = passData.inOffset;
79      int j = passData.outOffset;
80      final double tau = sign * sqrt3_2;
81  
82      // First pass of the 3-point FFT has no twiddle factors.
83      for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
84        final double z0_r = data[i];
85        final double z1_r = data[i + di];
86        final double z2_r = data[i + di2];
87        final double z0_i = data[i + im];
88        final double z1_i = data[i + di + im];
89        final double z2_i = data[i + di2 + im];
90        final double t1_r = z1_r + z2_r;
91        final double t1_i = z1_i + z2_i;
92        final double t2_r = fma(-0.5, t1_r, z0_r);
93        final double t2_i = fma(-0.5, t1_i, z0_i);
94        final double t3_r = tau * (z1_r - z2_r);
95        final double t3_i = tau * (z1_i - z2_i);
96        ret[j] = z0_r + t1_r;
97        ret[j + im] = z0_i + t1_i;
98        ret[j + dj] = t2_r - t3_i;
99        ret[j + dj + im] = t2_i + t3_r;
100       ret[j + dj2] = t2_r + t3_i;
101       ret[j + dj2 + im] = t2_i - t3_r;
102     }
103     j += jstep;
104     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
105       final int index = k * 2;
106       final double w1_r = wr[index];
107       final double w2_r = wr[index + 1];
108       final double w1_i = -sign * wi[index];
109       final double w2_i = -sign * wi[index + 1];
110       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
111         final double z0_r = data[i];
112         final double z1_r = data[i + di];
113         final double z2_r = data[i + di2];
114         final double z0_i = data[i + im];
115         final double z1_i = data[i + di + im];
116         final double z2_i = data[i + di2 + im];
117         final double t1_r = z1_r + z2_r;
118         final double t1_i = z1_i + z2_i;
119         final double t2_r = fma(-0.5, t1_r, z0_r);
120         final double t2_i = fma(-0.5, t1_i, z0_i);
121         final double t3_r = tau * (z1_r - z2_r);
122         final double t3_i = tau * (z1_i - z2_i);
123         ret[j] = z0_r + t1_r;
124         ret[j + im] = z0_i + t1_i;
125         multiplyAndStore(t2_r - t3_i, t2_i + t3_r, w1_r, w1_i, ret, j + dj, j + dj + im);
126         multiplyAndStore(t2_r + t3_i, t2_i - t3_r, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
127       }
128     }
129   }
130 
131   /**
132    * Handle factors of 3 using SIMD vectors.
133    */
134   @Override
135   protected void passSIMD(PassData passData) {
136     if (im == 1) {
137       interleaved(passData);
138     } else {
139       blocked(passData);
140     }
141   }
142 
143   /**
144    * Available SIMD sizes for Pass 3.
145    */
146   private static int[] simdSizes = {8, 4, 2};
147 
148   /**
149    * Handle factors of 3 using the chosen SIMD vector.
150    *
151    * @param passData   The pass data.
152    * @param simdLength The SIMD vector length.
153    */
154   private void interleaved(PassData passData, int simdLength) {
155     // Use the preferred SIMD vector.
156     switch (simdLength) {
157       case 2:
158         // 1 complex number per loop iteration.
159         interleaved128(passData);
160         break;
161       case 4:
162         // 2 complex numbers per loop iteration.
163         interleaved256(passData);
164         break;
165       case 8:
166         // 4 complex numbers per loop iteration.
167         interleaved512(passData);
168         break;
169       default:
170         passScalar(passData);
171     }
172   }
173 
174   /**
175    * Handle factors of 3 using the chosen SIMD vector.
176    *
177    * @param passData   The pass data.
178    * @param simdLength The SIMD vector length.
179    */
180   private void blocked(PassData passData, int simdLength) {
181     // Use the preferred SIMD vector.
182     switch (simdLength) {
183       case 2:
184         // 2 complex numbers per loop iteration.
185         blocked128(passData);
186         break;
187       case 4:
188         // 4 complex numbers per loop iteration.
189         blocked256(passData);
190         break;
191       case 8:
192         // 8 complex numbers per loop iteration.
193         blocked512(passData);
194         break;
195       default:
196         passScalar(passData);
197     }
198   }
199 
200   /**
201    * Handle factors of 3 using the 128-bit SIMD vectors.
202    *
203    * @param passData The interleaved pass data.
204    */
205   private void interleaved(PassData passData) {
206     if (innerLoopLimit % INTERLEAVED_LOOP == 0) {
207       // Use the preferred SIMD vector.
208       interleaved(passData, LENGTH);
209     } else {
210       // If the inner loop limit is odd, use the scalar method unless the inner loop limit is 1.
211       if (innerLoopLimit % 2 != 0 && innerLoopLimit != 1) {
212         passScalar(passData);
213         return;
214       }
215       // Fall back to a smaller SIMD vector that fits the inner loop limit.
216       for (int size : simdSizes) {
217         if (size >= LENGTH) {
218           // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
219           continue;
220         }
221         // Divide the SIMD size by two because for interleaved a single SIMD vectors stores both real and imaginary parts.
222         if (innerLoopLimit % (size / 2) == 0) {
223           interleaved(passData, size);
224         }
225       }
226     }
227   }
228 
229   /**
230    * Handle factors of 3 using the 128-bit SIMD vectors.
231    *
232    * @param passData The pass blocked data.
233    */
234   private void blocked(PassData passData) {
235     if (innerLoopLimit % BLOCK_LOOP == 0) {
236       // Use the preferred SIMD vector.
237       blocked(passData, LENGTH);
238     } else {
239       // If the inner loop limit is odd, use the scalar method unless the inner loop limit is 1.
240       if (innerLoopLimit % 2 != 0) {
241         passScalar(passData);
242         return;
243       }
244       // Fall back to a smaller SIMD vector that fits the inner loop limit.
245       for (int size : simdSizes) {
246         if (size >= LENGTH) {
247           // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
248           continue;
249         }
250         // Divide the SIMD size by two because for interleaved a single SIMD vectors stores both real and imaginary parts.
251         if (innerLoopLimit % size == 0) {
252           blocked(passData, size);
253         }
254       }
255     }
256   }
257 
258   /**
259    * Handle factors of 3 using the 128-bit SIMD vectors.
260    */
261   private void blocked128(PassData passData) {
262     final double[] data = passData.in;
263     final double[] ret = passData.out;
264     int sign = passData.sign;
265     int i = passData.inOffset;
266     int j = passData.outOffset;
267     final double tau = sign * sqrt3_2;
268 
269     // First pass of the 3-point FFT has no twiddle factors.
270     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
271       final DoubleVector
272           z0_r = fromArray(SPECIES_128, data, i),
273           z1_r = fromArray(SPECIES_128, data, i + di),
274           z2_r = fromArray(SPECIES_128, data, i + di2),
275           z0_i = fromArray(SPECIES_128, data, i + im),
276           z1_i = fromArray(SPECIES_128, data, i + di + im),
277           z2_i = fromArray(SPECIES_128, data, i + di2 + im);
278       final DoubleVector
279           t1_r = z1_r.add(z2_r),
280           t1_i = z1_i.add(z2_i),
281           t2_r = t1_r.mul(-0.5).add(z0_r),
282           t2_i = t1_i.mul(-0.5).add(z0_i),
283           t3_r = z1_r.sub(z2_r).mul(tau),
284           t3_i = z1_i.sub(z2_i).mul(tau);
285       z0_r.add(t1_r).intoArray(ret, j);
286       z0_i.add(t1_i).intoArray(ret, j + im);
287       t2_r.sub(t3_i).intoArray(ret, j + dj);
288       t2_i.add(t3_r).intoArray(ret, j + dj + im);
289       t2_r.add(t3_i).intoArray(ret, j + dj2);
290       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
291     }
292 
293     j += jstep;
294     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
295       final int index = k * 2;
296       final DoubleVector
297           w1_r = broadcast(SPECIES_128, wr[index]),
298           w2_r = broadcast(SPECIES_128, wr[index + 1]),
299           w1_i = broadcast(SPECIES_128, -sign * wi[index]),
300           w2_i = broadcast(SPECIES_128, -sign * wi[index + 1]);
301       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
302         final DoubleVector
303             z0_r = fromArray(SPECIES_128, data, i),
304             z1_r = fromArray(SPECIES_128, data, i + di),
305             z2_r = fromArray(SPECIES_128, data, i + di2),
306             z0_i = fromArray(SPECIES_128, data, i + im),
307             z1_i = fromArray(SPECIES_128, data, i + di + im),
308             z2_i = fromArray(SPECIES_128, data, i + di2 + im);
309         final DoubleVector
310             t1_r = z1_r.add(z2_r),
311             t1_i = z1_i.add(z2_i),
312             t2_r = t1_r.mul(-0.5).add(z0_r),
313             t2_i = t1_i.mul(-0.5).add(z0_i),
314             t3_r = z1_r.sub(z2_r).mul(tau),
315             t3_i = z1_i.sub(z2_i).mul(tau);
316         z0_r.add(t1_r).intoArray(ret, j);
317         z0_i.add(t1_i).intoArray(ret, j + im);
318         DoubleVector
319             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
320             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
321         w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
322         w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
323         w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
324         w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
325       }
326     }
327   }
328 
329   /**
330    * Handle factors of 3 using the 256-bit SIMD vectors.
331    */
332   private void blocked256(PassData passData) {
333     final double[] data = passData.in;
334     final double[] ret = passData.out;
335     int sign = passData.sign;
336     int i = passData.inOffset;
337     int j = passData.outOffset;
338     final double tau = sign * sqrt3_2;
339 
340     // First pass of the 3-point FFT has no twiddle factors.
341     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
342       final DoubleVector
343           z0_r = fromArray(SPECIES_256, data, i),
344           z1_r = fromArray(SPECIES_256, data, i + di),
345           z2_r = fromArray(SPECIES_256, data, i + di2),
346           z0_i = fromArray(SPECIES_256, data, i + im),
347           z1_i = fromArray(SPECIES_256, data, i + di + im),
348           z2_i = fromArray(SPECIES_256, data, i + di2 + im);
349       final DoubleVector
350           t1_r = z1_r.add(z2_r),
351           t1_i = z1_i.add(z2_i),
352           t2_r = t1_r.mul(-0.5).add(z0_r),
353           t2_i = t1_i.mul(-0.5).add(z0_i),
354           t3_r = z1_r.sub(z2_r).mul(tau),
355           t3_i = z1_i.sub(z2_i).mul(tau);
356       z0_r.add(t1_r).intoArray(ret, j);
357       z0_i.add(t1_i).intoArray(ret, j + im);
358       t2_r.sub(t3_i).intoArray(ret, j + dj);
359       t2_i.add(t3_r).intoArray(ret, j + dj + im);
360       t2_r.add(t3_i).intoArray(ret, j + dj2);
361       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
362     }
363 
364     j += jstep;
365     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
366       final int index = k * 2;
367       final DoubleVector
368           w1_r = broadcast(SPECIES_256, wr[index]),
369           w2_r = broadcast(SPECIES_256, wr[index + 1]),
370           w1_i = broadcast(SPECIES_256, -sign * wi[index]),
371           w2_i = broadcast(SPECIES_256, -sign * wi[index + 1]);
372       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
373         final DoubleVector
374             z0_r = fromArray(SPECIES_256, data, i),
375             z1_r = fromArray(SPECIES_256, data, i + di),
376             z2_r = fromArray(SPECIES_256, data, i + di2),
377             z0_i = fromArray(SPECIES_256, data, i + im),
378             z1_i = fromArray(SPECIES_256, data, i + di + im),
379             z2_i = fromArray(SPECIES_256, data, i + di2 + im);
380         final DoubleVector
381             t1_r = z1_r.add(z2_r),
382             t1_i = z1_i.add(z2_i),
383             t2_r = t1_r.mul(-0.5).add(z0_r),
384             t2_i = t1_i.mul(-0.5).add(z0_i),
385             t3_r = z1_r.sub(z2_r).mul(tau),
386             t3_i = z1_i.sub(z2_i).mul(tau);
387         z0_r.add(t1_r).intoArray(ret, j);
388         z0_i.add(t1_i).intoArray(ret, j + im);
389         DoubleVector
390             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
391             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
392         w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
393         w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
394         w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
395         w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
396       }
397     }
398   }
399 
400   /**
401    * Handle factors of 3 using the 512-bit SIMD vectors.
402    */
403   private void blocked512(PassData passData) {
404     final double[] data = passData.in;
405     final double[] ret = passData.out;
406     int sign = passData.sign;
407     int i = passData.inOffset;
408     int j = passData.outOffset;
409     final double tau = sign * sqrt3_2;
410 
411     // First pass of the 3-point FFT has no twiddle factors.
412     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
413       final DoubleVector
414           z0_r = fromArray(SPECIES_512, data, i),
415           z1_r = fromArray(SPECIES_512, data, i + di),
416           z2_r = fromArray(SPECIES_512, data, i + di2),
417           z0_i = fromArray(SPECIES_512, data, i + im),
418           z1_i = fromArray(SPECIES_512, data, i + di + im),
419           z2_i = fromArray(SPECIES_512, data, i + di2 + im);
420       final DoubleVector
421           t1_r = z1_r.add(z2_r),
422           t1_i = z1_i.add(z2_i),
423           t2_r = t1_r.mul(-0.5).add(z0_r),
424           t2_i = t1_i.mul(-0.5).add(z0_i),
425           t3_r = z1_r.sub(z2_r).mul(tau),
426           t3_i = z1_i.sub(z2_i).mul(tau);
427       z0_r.add(t1_r).intoArray(ret, j);
428       z0_i.add(t1_i).intoArray(ret, j + im);
429       t2_r.sub(t3_i).intoArray(ret, j + dj);
430       t2_i.add(t3_r).intoArray(ret, j + dj + im);
431       t2_r.add(t3_i).intoArray(ret, j + dj2);
432       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
433     }
434 
435     j += jstep;
436     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
437       final int index = k * 2;
438       final DoubleVector
439           w1_r = broadcast(SPECIES_512, wr[index]),
440           w2_r = broadcast(SPECIES_512, wr[index + 1]),
441           w1_i = broadcast(SPECIES_512, -sign * wi[index]),
442           w2_i = broadcast(SPECIES_512, -sign * wi[index + 1]);
443       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
444         final DoubleVector
445             z0_r = fromArray(SPECIES_512, data, i),
446             z1_r = fromArray(SPECIES_512, data, i + di),
447             z2_r = fromArray(SPECIES_512, data, i + di2),
448             z0_i = fromArray(SPECIES_512, data, i + im),
449             z1_i = fromArray(SPECIES_512, data, i + di + im),
450             z2_i = fromArray(SPECIES_512, data, i + di2 + im);
451         final DoubleVector
452             t1_r = z1_r.add(z2_r),
453             t1_i = z1_i.add(z2_i),
454             t2_r = t1_r.mul(-0.5).add(z0_r),
455             t2_i = t1_i.mul(-0.5).add(z0_i),
456             t3_r = z1_r.sub(z2_r).mul(tau),
457             t3_i = z1_i.sub(z2_i).mul(tau);
458         z0_r.add(t1_r).intoArray(ret, j);
459         z0_i.add(t1_i).intoArray(ret, j + im);
460         DoubleVector
461             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
462             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
463         w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
464         w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
465         w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
466         w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
467       }
468     }
469   }
470 
471   /**
472    * Handle factors of 3 using the 128-bit SIMD vectors.
473    */
474   private void interleaved128(PassData passData) {
475     final double[] data = passData.in;
476     final double[] ret = passData.out;
477     int sign = passData.sign;
478     int i = passData.inOffset;
479     int j = passData.outOffset;
480     final double tau = sign * sqrt3_2;
481     // First pass of the 3-point FFT has no twiddle factors.
482     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
483       DoubleVector
484           z0 = fromArray(SPECIES_128, data, i),
485           z1 = fromArray(SPECIES_128, data, i + di),
486           z2 = fromArray(SPECIES_128, data, i + di2);
487       DoubleVector
488           t1 = z1.add(z2),
489           t2 = t1.mul(-0.5).add(z0),
490           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
491       z0.add(t1).intoArray(ret, j);
492       t2.add(t3.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
493       t2.add(t3.mul(NEGATE_IM_128)).intoArray(ret, j + dj2);
494     }
495 
496     j += jstep;
497     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
498       final int index = k * 2;
499       final DoubleVector
500           w1r = broadcast(SPECIES_128, wr[index]),
501           w2r = broadcast(SPECIES_128, wr[index + 1]),
502           w1i = broadcast(SPECIES_128, -sign * wi[index]).mul(NEGATE_IM_128),
503           w2i = broadcast(SPECIES_128, -sign * wi[index + 1]).mul(NEGATE_IM_128);
504       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
505         DoubleVector
506             z0 = fromArray(SPECIES_128, data, i),
507             z1 = fromArray(SPECIES_128, data, i + di),
508             z2 = fromArray(SPECIES_128, data, i + di2);
509         DoubleVector
510             t1 = z1.add(z2),
511             t2 = t1.mul(-0.5).add(z0),
512             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
513         z0.add(t1).intoArray(ret, j);
514         z0.add(t1).intoArray(ret, j);
515         DoubleVector
516             x1 = t2.add(t3.mul(NEGATE_RE_128)),
517             x2 = t2.add(t3.mul(NEGATE_IM_128));
518         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
519         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
520       }
521     }
522   }
523 
524   /**
525    * Handle factors of 3 using the 256-bit SIMD vectors.
526    */
527   private void interleaved256(PassData passData) {
528     final double[] data = passData.in;
529     final double[] ret = passData.out;
530     int sign = passData.sign;
531     int i = passData.inOffset;
532     int j = passData.outOffset;
533     final double tau = sign * sqrt3_2;
534     // First pass of the 3-point FFT has no twiddle factors.
535     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
536       DoubleVector
537           z0 = fromArray(SPECIES_256, data, i),
538           z1 = fromArray(SPECIES_256, data, i + di),
539           z2 = fromArray(SPECIES_256, data, i + di2);
540       DoubleVector
541           t1 = z1.add(z2),
542           t2 = t1.mul(-0.5).add(z0),
543           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
544       z0.add(t1).intoArray(ret, j);
545       t2.add(t3.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
546       t2.add(t3.mul(NEGATE_IM_256)).intoArray(ret, j + dj2);
547     }
548 
549     j += jstep;
550     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
551       final int index = k * 2;
552       final DoubleVector
553           w1r = broadcast(SPECIES_256, wr[index]),
554           w2r = broadcast(SPECIES_256, wr[index + 1]),
555           w1i = broadcast(SPECIES_256, -sign * wi[index]).mul(NEGATE_IM_256),
556           w2i = broadcast(SPECIES_256, -sign * wi[index + 1]).mul(NEGATE_IM_256);
557       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
558         DoubleVector
559             z0 = fromArray(SPECIES_256, data, i),
560             z1 = fromArray(SPECIES_256, data, i + di),
561             z2 = fromArray(SPECIES_256, data, i + di2);
562         DoubleVector
563             t1 = z1.add(z2),
564             t2 = t1.mul(-0.5).add(z0),
565             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
566         z0.add(t1).intoArray(ret, j);
567         DoubleVector
568             x1 = t2.add(t3.mul(NEGATE_RE_256)),
569             x2 = t2.add(t3.mul(NEGATE_IM_256));
570         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
571         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
572       }
573     }
574   }
575 
576   /**
577    * Handle factors of 3 using the 512-bit SIMD vectors.
578    */
579   private void interleaved512(PassData passData) {
580     final double[] data = passData.in;
581     final double[] ret = passData.out;
582     int sign = passData.sign;
583     int i = passData.inOffset;
584     int j = passData.outOffset;
585     final double tau = sign * sqrt3_2;
586     // First pass of the 3-point FFT has no twiddle factors.
587     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
588       DoubleVector
589           z0 = fromArray(SPECIES_512, data, i),
590           z1 = fromArray(SPECIES_512, data, i + di),
591           z2 = fromArray(SPECIES_512, data, i + di2);
592       DoubleVector
593           t1 = z1.add(z2),
594           t2 = t1.mul(-0.5).add(z0),
595           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
596       z0.add(t1).intoArray(ret, j);
597       t2.add(t3.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
598       t2.add(t3.mul(NEGATE_IM_512)).intoArray(ret, j + dj2);
599     }
600 
601     j += jstep;
602     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
603       final int index = k * 2;
604       final DoubleVector
605           w1r = broadcast(SPECIES_512, wr[index]),
606           w2r = broadcast(SPECIES_512, wr[index + 1]),
607           w1i = broadcast(SPECIES_512, -sign * wi[index]).mul(NEGATE_IM_512),
608           w2i = broadcast(SPECIES_512, -sign * wi[index + 1]).mul(NEGATE_IM_512);
609       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
610         DoubleVector
611             z0 = fromArray(SPECIES_512, data, i),
612             z1 = fromArray(SPECIES_512, data, i + di),
613             z2 = fromArray(SPECIES_512, data, i + di2);
614         DoubleVector
615             t1 = z1.add(z2),
616             t2 = t1.mul(-0.5).add(z0),
617             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
618         z0.add(t1).intoArray(ret, j);
619         DoubleVector
620             x1 = t2.add(t3.mul(NEGATE_RE_512)),
621             x2 = t2.add(t3.mul(NEGATE_IM_512));
622         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
623         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
624       }
625     }
626   }
627 }