View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2025.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  
42  import static java.lang.Math.fma;
43  import static jdk.incubator.vector.DoubleVector.SPECIES_128;
44  import static jdk.incubator.vector.DoubleVector.SPECIES_256;
45  import static jdk.incubator.vector.DoubleVector.SPECIES_512;
46  import static jdk.incubator.vector.DoubleVector.broadcast;
47  import static jdk.incubator.vector.DoubleVector.fromArray;
48  import static org.apache.commons.math3.util.FastMath.sqrt;
49  
50  /**
51   * The MixedRadixFactor3 class handles factors of 3 in the FFT.
52   */
53  public class MixedRadixFactor3 extends MixedRadixFactor {
54  
55    private static final double sqrt3_2 = sqrt(3.0) / 2.0;
56    private final int di2;
57    private final int dj2;
58  
59    /**
60     * Create a new MixedRadixFactor3 instance.
61     *
62     * @param passConstants The pass constants.
63     */
64    public MixedRadixFactor3(PassConstants passConstants) {
65      super(passConstants);
66      di2 = 2 * di;
67      dj2 = 2 * dj;
68    }
69  
70    /**
71     * Handle factors of 3.
72     */
73    @Override
74    protected void passScalar(PassData passData) {
75      final double[] data = passData.in;
76      final double[] ret = passData.out;
77      int sign = passData.sign;
78      int i = passData.inOffset;
79      int j = passData.outOffset;
80      final double tau = sign * sqrt3_2;
81      // First pass of the 3-point FFT has no twiddle factors.
82      for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
83        final double z0_r = data[i];
84        final double z1_r = data[i + di];
85        final double z2_r = data[i + di2];
86        final double z0_i = data[i + im];
87        final double z1_i = data[i + di + im];
88        final double z2_i = data[i + di2 + im];
89        final double t1_r = z1_r + z2_r;
90        final double t1_i = z1_i + z2_i;
91        final double t2_r = fma(-0.5, t1_r, z0_r);
92        final double t2_i = fma(-0.5, t1_i, z0_i);
93        final double t3_r = tau * (z1_r - z2_r);
94        final double t3_i = tau * (z1_i - z2_i);
95        ret[j] = z0_r + t1_r;
96        ret[j + im] = z0_i + t1_i;
97        ret[j + dj] = t2_r - t3_i;
98        ret[j + dj + im] = t2_i + t3_r;
99        ret[j + dj2] = t2_r + t3_i;
100       ret[j + dj2 + im] = t2_i - t3_r;
101     }
102     j += jstep;
103     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
104       final double[] twids = twiddles[k];
105       final double w1_r = twids[0];
106       final double w1_i = -sign * twids[1];
107       final double w2_r = twids[2];
108       final double w2_i = -sign * twids[3];
109       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
110         final double z0_r = data[i];
111         final double z1_r = data[i + di];
112         final double z2_r = data[i + di2];
113         final double z0_i = data[i + im];
114         final double z1_i = data[i + di + im];
115         final double z2_i = data[i + di2 + im];
116         final double t1_r = z1_r + z2_r;
117         final double t1_i = z1_i + z2_i;
118         final double t2_r = fma(-0.5, t1_r, z0_r);
119         final double t2_i = fma(-0.5, t1_i, z0_i);
120         final double t3_r = tau * (z1_r - z2_r);
121         final double t3_i = tau * (z1_i - z2_i);
122         ret[j] = z0_r + t1_r;
123         ret[j + im] = z0_i + t1_i;
124         multiplyAndStore(t2_r - t3_i, t2_i + t3_r, w1_r, w1_i, ret, j + dj, j + dj + im);
125         multiplyAndStore(t2_r + t3_i, t2_i - t3_r, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
126       }
127     }
128   }
129 
130   /**
131    * Handle factors of 3 using SIMD vectors.
132    */
133   @Override
134   protected void passSIMD(PassData passData) {
135     if (im == 1) {
136       interleaved(passData);
137     } else {
138       blocked(passData);
139     }
140   }
141 
142   /**
143    * Available SIMD sizes for Pass 3.
144    */
145   private static int[] simdSizes = {8, 4, 2};
146 
147   /**
148    * Handle factors of 3 using the chosen SIMD vector.
149    *
150    * @param passData   The pass data.
151    * @param simdLength The SIMD vector length.
152    */
153   private void interleaved(PassData passData, int simdLength) {
154     // Use the preferred SIMD vector.
155     switch (simdLength) {
156       case 2:
157         // 1 complex number per loop iteration.
158         interleaved128(passData);
159         break;
160       case 4:
161         // 2 complex numbers per loop iteration.
162         interleaved256(passData);
163         break;
164       case 8:
165         // 4 complex numbers per loop iteration.
166         interleaved512(passData);
167         break;
168       default:
169         passScalar(passData);
170     }
171   }
172 
173   /**
174    * Handle factors of 3 using the chosen SIMD vector.
175    *
176    * @param passData   The pass data.
177    * @param simdLength The SIMD vector length.
178    */
179   private void blocked(PassData passData, int simdLength) {
180     // Use the preferred SIMD vector.
181     switch (simdLength) {
182       case 2:
183         // 2 complex numbers per loop iteration.
184         blocked128(passData);
185         break;
186       case 4:
187         // 4 complex numbers per loop iteration.
188         blocked256(passData);
189         break;
190       case 8:
191         // 8 complex numbers per loop iteration.
192         blocked512(passData);
193         break;
194       default:
195         passScalar(passData);
196     }
197   }
198 
199   /**
200    * Handle factors of 3 using the 128-bit SIMD vectors.
201    *
202    * @param passData The interleaved pass data.
203    */
204   private void interleaved(PassData passData) {
205     if (innerLoopLimit % LOOP == 0) {
206       // Use the preferred SIMD vector.
207       interleaved(passData, LENGTH);
208     } else {
209       // If the inner loop limit is odd, use the scalar method unless the inner loop limit is 1.
210       if (innerLoopLimit % 2 != 0 && innerLoopLimit != 1) {
211         passScalar(passData);
212         return;
213       }
214       // Fall back to a smaller SIMD vector that fits the inner loop limit.
215       for (int size : simdSizes) {
216         if (size >= LENGTH) {
217           // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
218           continue;
219         }
220         // Divide the SIMD size by two because for interleaved a single SIMD vectors stores both real and imaginary parts.
221         if (innerLoopLimit % (size / 2) == 0) {
222           interleaved(passData, size);
223         }
224       }
225     }
226   }
227 
228   /**
229    * Handle factors of 3 using the 128-bit SIMD vectors.
230    *
231    * @param passData The pass blocked data.
232    */
233   private void blocked(PassData passData) {
234     if (innerLoopLimit % BLOCK_LOOP == 0) {
235       // Use the preferred SIMD vector.
236       blocked(passData, LENGTH);
237     } else {
238       // If the inner loop limit is odd, use the scalar method unless the inner loop limit is 1.
239       if (innerLoopLimit % 2 != 0) {
240         passScalar(passData);
241         return;
242       }
243       // Fall back to a smaller SIMD vector that fits the inner loop limit.
244       for (int size : simdSizes) {
245         if (size >= LENGTH) {
246           // Skip anything greater than or equal to the preferred SIMD vector size (which was too big).
247           continue;
248         }
249         // Divide the SIMD size by two because for interleaved a single SIMD vectors stores both real and imaginary parts.
250         if (innerLoopLimit % size == 0) {
251           blocked(passData, size);
252         }
253       }
254     }
255   }
256 
257   /**
258    * Handle factors of 3 using the 128-bit SIMD vectors.
259    */
260   private void blocked128(PassData passData) {
261     final double[] data = passData.in;
262     final double[] ret = passData.out;
263     int sign = passData.sign;
264     int i = passData.inOffset;
265     int j = passData.outOffset;
266     final double tau = sign * sqrt3_2;
267 
268     // First pass of the 3-point FFT has no twiddle factors.
269     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
270       final DoubleVector
271           z0_r = fromArray(SPECIES_128, data, i),
272           z1_r = fromArray(SPECIES_128, data, i + di),
273           z2_r = fromArray(SPECIES_128, data, i + di2),
274           z0_i = fromArray(SPECIES_128, data, i + im),
275           z1_i = fromArray(SPECIES_128, data, i + di + im),
276           z2_i = fromArray(SPECIES_128, data, i + di2 + im);
277       final DoubleVector
278           t1_r = z1_r.add(z2_r),
279           t1_i = z1_i.add(z2_i),
280           t2_r = t1_r.mul(-0.5).add(z0_r),
281           t2_i = t1_i.mul(-0.5).add(z0_i),
282           t3_r = z1_r.sub(z2_r).mul(tau),
283           t3_i = z1_i.sub(z2_i).mul(tau);
284       z0_r.add(t1_r).intoArray(ret, j);
285       z0_i.add(t1_i).intoArray(ret, j + im);
286       t2_r.sub(t3_i).intoArray(ret, j + dj);
287       t2_i.add(t3_r).intoArray(ret, j + dj + im);
288       t2_r.add(t3_i).intoArray(ret, j + dj2);
289       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
290     }
291 
292     j += jstep;
293     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
294       final double[] twids = twiddles[k];
295       DoubleVector
296           w1_r = broadcast(SPECIES_128, twids[0]),
297           w1_i = broadcast(SPECIES_128, -sign * twids[1]),
298           w2_r = broadcast(SPECIES_128, twids[2]),
299           w2_i = broadcast(SPECIES_128, -sign * twids[3]);
300       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
301         final DoubleVector
302             z0_r = fromArray(SPECIES_128, data, i),
303             z1_r = fromArray(SPECIES_128, data, i + di),
304             z2_r = fromArray(SPECIES_128, data, i + di2),
305             z0_i = fromArray(SPECIES_128, data, i + im),
306             z1_i = fromArray(SPECIES_128, data, i + di + im),
307             z2_i = fromArray(SPECIES_128, data, i + di2 + im);
308         final DoubleVector
309             t1_r = z1_r.add(z2_r),
310             t1_i = z1_i.add(z2_i),
311             t2_r = t1_r.mul(-0.5).add(z0_r),
312             t2_i = t1_i.mul(-0.5).add(z0_i),
313             t3_r = z1_r.sub(z2_r).mul(tau),
314             t3_i = z1_i.sub(z2_i).mul(tau);
315         z0_r.add(t1_r).intoArray(ret, j);
316         z0_i.add(t1_i).intoArray(ret, j + im);
317         DoubleVector
318             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
319             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
320         w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
321         w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
322         w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
323         w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
324       }
325     }
326   }
327 
328   /**
329    * Handle factors of 3 using the 256-bit SIMD vectors.
330    */
331   private void blocked256(PassData passData) {
332     final double[] data = passData.in;
333     final double[] ret = passData.out;
334     int sign = passData.sign;
335     int i = passData.inOffset;
336     int j = passData.outOffset;
337     final double tau = sign * sqrt3_2;
338 
339     // First pass of the 3-point FFT has no twiddle factors.
340     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
341       final DoubleVector
342           z0_r = fromArray(SPECIES_256, data, i),
343           z1_r = fromArray(SPECIES_256, data, i + di),
344           z2_r = fromArray(SPECIES_256, data, i + di2),
345           z0_i = fromArray(SPECIES_256, data, i + im),
346           z1_i = fromArray(SPECIES_256, data, i + di + im),
347           z2_i = fromArray(SPECIES_256, data, i + di2 + im);
348       final DoubleVector
349           t1_r = z1_r.add(z2_r),
350           t1_i = z1_i.add(z2_i),
351           t2_r = t1_r.mul(-0.5).add(z0_r),
352           t2_i = t1_i.mul(-0.5).add(z0_i),
353           t3_r = z1_r.sub(z2_r).mul(tau),
354           t3_i = z1_i.sub(z2_i).mul(tau);
355       z0_r.add(t1_r).intoArray(ret, j);
356       z0_i.add(t1_i).intoArray(ret, j + im);
357       t2_r.sub(t3_i).intoArray(ret, j + dj);
358       t2_i.add(t3_r).intoArray(ret, j + dj + im);
359       t2_r.add(t3_i).intoArray(ret, j + dj2);
360       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
361     }
362 
363     j += jstep;
364     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
365       final double[] twids = twiddles[k];
366       DoubleVector
367           w1_r = broadcast(SPECIES_256, twids[0]),
368           w1_i = broadcast(SPECIES_256, -sign * twids[1]),
369           w2_r = broadcast(SPECIES_256, twids[2]),
370           w2_i = broadcast(SPECIES_256, -sign * twids[3]);
371       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
372         final DoubleVector
373             z0_r = fromArray(SPECIES_256, data, i),
374             z1_r = fromArray(SPECIES_256, data, i + di),
375             z2_r = fromArray(SPECIES_256, data, i + di2),
376             z0_i = fromArray(SPECIES_256, data, i + im),
377             z1_i = fromArray(SPECIES_256, data, i + di + im),
378             z2_i = fromArray(SPECIES_256, data, i + di2 + im);
379         final DoubleVector
380             t1_r = z1_r.add(z2_r),
381             t1_i = z1_i.add(z2_i),
382             t2_r = t1_r.mul(-0.5).add(z0_r),
383             t2_i = t1_i.mul(-0.5).add(z0_i),
384             t3_r = z1_r.sub(z2_r).mul(tau),
385             t3_i = z1_i.sub(z2_i).mul(tau);
386         z0_r.add(t1_r).intoArray(ret, j);
387         z0_i.add(t1_i).intoArray(ret, j + im);
388         DoubleVector
389             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
390             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
391         w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
392         w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
393         w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
394         w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
395       }
396     }
397   }
398 
399   /**
400    * Handle factors of 3 using the 512-bit SIMD vectors.
401    */
402   private void blocked512(PassData passData) {
403     final double[] data = passData.in;
404     final double[] ret = passData.out;
405     int sign = passData.sign;
406     int i = passData.inOffset;
407     int j = passData.outOffset;
408     final double tau = sign * sqrt3_2;
409 
410     // First pass of the 3-point FFT has no twiddle factors.
411     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
412       final DoubleVector
413           z0_r = fromArray(SPECIES_512, data, i),
414           z1_r = fromArray(SPECIES_512, data, i + di),
415           z2_r = fromArray(SPECIES_512, data, i + di2),
416           z0_i = fromArray(SPECIES_512, data, i + im),
417           z1_i = fromArray(SPECIES_512, data, i + di + im),
418           z2_i = fromArray(SPECIES_512, data, i + di2 + im);
419       final DoubleVector
420           t1_r = z1_r.add(z2_r),
421           t1_i = z1_i.add(z2_i),
422           t2_r = t1_r.mul(-0.5).add(z0_r),
423           t2_i = t1_i.mul(-0.5).add(z0_i),
424           t3_r = z1_r.sub(z2_r).mul(tau),
425           t3_i = z1_i.sub(z2_i).mul(tau);
426       z0_r.add(t1_r).intoArray(ret, j);
427       z0_i.add(t1_i).intoArray(ret, j + im);
428       t2_r.sub(t3_i).intoArray(ret, j + dj);
429       t2_i.add(t3_r).intoArray(ret, j + dj + im);
430       t2_r.add(t3_i).intoArray(ret, j + dj2);
431       t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
432     }
433 
434     j += jstep;
435     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
436       final double[] twids = twiddles[k];
437       DoubleVector
438           w1_r = broadcast(SPECIES_512, twids[0]),
439           w1_i = broadcast(SPECIES_512, -sign * twids[1]),
440           w2_r = broadcast(SPECIES_512, twids[2]),
441           w2_i = broadcast(SPECIES_512, -sign * twids[3]);
442       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
443         final DoubleVector
444             z0_r = fromArray(SPECIES_512, data, i),
445             z1_r = fromArray(SPECIES_512, data, i + di),
446             z2_r = fromArray(SPECIES_512, data, i + di2),
447             z0_i = fromArray(SPECIES_512, data, i + im),
448             z1_i = fromArray(SPECIES_512, data, i + di + im),
449             z2_i = fromArray(SPECIES_512, data, i + di2 + im);
450         final DoubleVector
451             t1_r = z1_r.add(z2_r),
452             t1_i = z1_i.add(z2_i),
453             t2_r = t1_r.mul(-0.5).add(z0_r),
454             t2_i = t1_i.mul(-0.5).add(z0_i),
455             t3_r = z1_r.sub(z2_r).mul(tau),
456             t3_i = z1_i.sub(z2_i).mul(tau);
457         z0_r.add(t1_r).intoArray(ret, j);
458         z0_i.add(t1_i).intoArray(ret, j + im);
459         DoubleVector
460             x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
461             x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
462         w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
463         w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
464         w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
465         w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
466       }
467     }
468   }
469 
470   /**
471    * Handle factors of 3 using the 128-bit SIMD vectors.
472    */
473   private void interleaved128(PassData passData) {
474     final double[] data = passData.in;
475     final double[] ret = passData.out;
476     int sign = passData.sign;
477     int i = passData.inOffset;
478     int j = passData.outOffset;
479     final double tau = sign * sqrt3_2;
480     // First pass of the 3-point FFT has no twiddle factors.
481     for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_128, i += LENGTH_128, j += LENGTH_128) {
482       DoubleVector
483           z0 = fromArray(SPECIES_128, data, i),
484           z1 = fromArray(SPECIES_128, data, i + di),
485           z2 = fromArray(SPECIES_128, data, i + di2);
486       DoubleVector
487           t1 = z1.add(z2),
488           t2 = t1.mul(-0.5).add(z0),
489           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
490       z0.add(t1).intoArray(ret, j);
491       t2.add(t3.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
492       t2.add(t3.mul(NEGATE_IM_128)).intoArray(ret, j + dj2);
493     }
494 
495     j += jstep;
496     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
497       final double[] twids = twiddles[k];
498       DoubleVector
499           w1r = broadcast(SPECIES_128, twids[0]),
500           w1i = broadcast(SPECIES_128, -sign * twids[1]).mul(NEGATE_IM_128),
501           w2r = broadcast(SPECIES_128, twids[2]),
502           w2i = broadcast(SPECIES_128, -sign * twids[3]).mul(NEGATE_IM_128);
503       for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_128, i += LENGTH_128, j += LENGTH_128) {
504         DoubleVector
505             z0 = fromArray(SPECIES_128, data, i),
506             z1 = fromArray(SPECIES_128, data, i + di),
507             z2 = fromArray(SPECIES_128, data, i + di2);
508         DoubleVector
509             t1 = z1.add(z2),
510             t2 = t1.mul(-0.5).add(z0),
511             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
512         z0.add(t1).intoArray(ret, j);
513         z0.add(t1).intoArray(ret, j);
514         DoubleVector
515             x1 = t2.add(t3.mul(NEGATE_RE_128)),
516             x2 = t2.add(t3.mul(NEGATE_IM_128));
517         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
518         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
519       }
520     }
521   }
522 
523   /**
524    * Handle factors of 3 using the 256-bit SIMD vectors.
525    */
526   private void interleaved256(PassData passData) {
527     final double[] data = passData.in;
528     final double[] ret = passData.out;
529     int sign = passData.sign;
530     int i = passData.inOffset;
531     int j = passData.outOffset;
532     final double tau = sign * sqrt3_2;
533     // First pass of the 3-point FFT has no twiddle factors.
534     for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_256, i += LENGTH_256, j += LENGTH_256) {
535       DoubleVector
536           z0 = fromArray(SPECIES_256, data, i),
537           z1 = fromArray(SPECIES_256, data, i + di),
538           z2 = fromArray(SPECIES_256, data, i + di2);
539       DoubleVector
540           t1 = z1.add(z2),
541           t2 = t1.mul(-0.5).add(z0),
542           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
543       z0.add(t1).intoArray(ret, j);
544       t2.add(t3.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
545       t2.add(t3.mul(NEGATE_IM_256)).intoArray(ret, j + dj2);
546     }
547 
548     j += jstep;
549     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
550       final double[] twids = twiddles[k];
551       DoubleVector
552           w1r = broadcast(SPECIES_256, twids[0]),
553           w1i = broadcast(SPECIES_256, -sign * twids[1]).mul(NEGATE_IM_256),
554           w2r = broadcast(SPECIES_256, twids[2]),
555           w2i = broadcast(SPECIES_256, -sign * twids[3]).mul(NEGATE_IM_256);
556       for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_256, i += LENGTH_256, j += LENGTH_256) {
557         DoubleVector
558             z0 = fromArray(SPECIES_256, data, i),
559             z1 = fromArray(SPECIES_256, data, i + di),
560             z2 = fromArray(SPECIES_256, data, i + di2);
561         DoubleVector
562             t1 = z1.add(z2),
563             t2 = t1.mul(-0.5).add(z0),
564             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
565         z0.add(t1).intoArray(ret, j);
566         DoubleVector
567             x1 = t2.add(t3.mul(NEGATE_RE_256)),
568             x2 = t2.add(t3.mul(NEGATE_IM_256));
569         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
570         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
571       }
572     }
573   }
574 
575   /**
576    * Handle factors of 3 using the 512-bit SIMD vectors.
577    */
578   private void interleaved512(PassData passData) {
579     final double[] data = passData.in;
580     final double[] ret = passData.out;
581     int sign = passData.sign;
582     int i = passData.inOffset;
583     int j = passData.outOffset;
584     final double tau = sign * sqrt3_2;
585     // First pass of the 3-point FFT has no twiddle factors.
586     for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_512, i += LENGTH_512, j += LENGTH_512) {
587       DoubleVector
588           z0 = fromArray(SPECIES_512, data, i),
589           z1 = fromArray(SPECIES_512, data, i + di),
590           z2 = fromArray(SPECIES_512, data, i + di2);
591       DoubleVector
592           t1 = z1.add(z2),
593           t2 = t1.mul(-0.5).add(z0),
594           t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
595       z0.add(t1).intoArray(ret, j);
596       t2.add(t3.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
597       t2.add(t3.mul(NEGATE_IM_512)).intoArray(ret, j + dj2);
598     }
599 
600     j += jstep;
601     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
602       final double[] twids = twiddles[k];
603       DoubleVector
604           w1r = broadcast(SPECIES_512, twids[0]),
605           w1i = broadcast(SPECIES_512, -sign * twids[1]).mul(NEGATE_IM_512),
606           w2r = broadcast(SPECIES_512, twids[2]),
607           w2i = broadcast(SPECIES_512, -sign * twids[3]).mul(NEGATE_IM_512);
608       for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_512, i += LENGTH_512, j += LENGTH_512) {
609         DoubleVector
610             z0 = fromArray(SPECIES_512, data, i),
611             z1 = fromArray(SPECIES_512, data, i + di),
612             z2 = fromArray(SPECIES_512, data, i + di2);
613         DoubleVector
614             t1 = z1.add(z2),
615             t2 = t1.mul(-0.5).add(z0),
616             t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
617         z0.add(t1).intoArray(ret, j);
618         DoubleVector
619             x1 = t2.add(t3.mul(NEGATE_RE_512)),
620             x2 = t2.add(t3.mul(NEGATE_IM_512));
621         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
622         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
623       }
624     }
625   }
626 }