View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2025.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  
42  import static jdk.incubator.vector.DoubleVector.broadcast;
43  import static jdk.incubator.vector.DoubleVector.fromArray;
44  import static org.apache.commons.math3.util.FastMath.PI;
45  import static org.apache.commons.math3.util.FastMath.cos;
46  import static org.apache.commons.math3.util.FastMath.sin;
47  
48  /**
49   * The MixedRadixFactor7 class handles factors of 7 in the FFT.
50   */
51  public class MixedRadixFactor7 extends MixedRadixFactor {
52  
53    private static final double oneThird = 1.0 / 3.0;
54    private static final double sin2PI_7 = sin(2.0 * PI / 7.0);
55    private static final double sin4PI_7 = sin(4.0 * PI / 7.0);
56    private static final double sin6PI_7 = sin(6.0 * PI / 7.0);
57    private static final double cos2PI_7 = cos(2.0 * PI / 7.0);
58    private static final double cos4PI_7 = cos(4.0 * PI / 7.0);
59    private static final double cos6PI_7 = cos(6.0 * PI / 7.0);
60    private static final double c1 = cos2PI_7;
61    private static final double c2 = cos4PI_7;
62    private static final double c3 = cos6PI_7;
63    private static final double v1 = (c1 + c2 + c3) * oneThird - 1.0;
64    private static final double v2 = (2.0 * c1 - c2 - c3) * oneThird;
65    private static final double v3 = (c1 - 2.0 * c2 + c3) * oneThird;
66    private static final double v4 = (c1 + c2 - 2.0 * c3) * oneThird;
67  
68    private final int di2;
69    private final int di3;
70    private final int di4;
71    private final int di5;
72    private final int di6;
73    private final int dj2;
74    private final int dj3;
75    private final int dj4;
76    private final int dj5;
77    private final int dj6;
78  
79    /**
80     * Construct a MixedRadixFactor7.
81     *
82     * @param passConstants PassConstants.
83     */
84    public MixedRadixFactor7(PassConstants passConstants) {
85      super(passConstants);
86      di2 = 2 * di;
87      di3 = 3 * di;
88      di4 = 4 * di;
89      di5 = 5 * di;
90      di6 = 6 * di;
91      dj2 = 2 * dj;
92      dj3 = 3 * dj;
93      dj4 = 4 * dj;
94      dj5 = 5 * dj;
95      dj6 = 6 * dj;
96    }
97  
98    /**
99     * Handle factors of 7.
100    *
101    * @param passData PassData.
102    */
103   protected void passScalar(PassData passData) {
104     final double[] data = passData.in;
105     final double[] ret = passData.out;
106     int sign = passData.sign;
107     int i = passData.inOffset;
108     int j = passData.outOffset;
109     final double s1 = (-sign) * sin2PI_7;
110     final double s2 = (-sign) * sin4PI_7;
111     final double s3 = (-sign) * sin6PI_7;
112     final double v5 = (s1 + s2 - s3) * oneThird;
113     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
114     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
115     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
116     // First pass of the 7-point FFT has no twiddle factors.
117     for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
118       final double z0r = data[i];
119       final double z1r = data[i + di];
120       final double z2r = data[i + di2];
121       final double z3r = data[i + di3];
122       final double z4r = data[i + di4];
123       final double z5r = data[i + di5];
124       final double z6r = data[i + di6];
125       final double z0i = data[i + im];
126       final double z1i = data[i + di + im];
127       final double z2i = data[i + di2 + im];
128       final double z3i = data[i + di3 + im];
129       final double z4i = data[i + di4 + im];
130       final double z5i = data[i + di5 + im];
131       final double z6i = data[i + di6 + im];
132       final double t0r = z1r + z6r;
133       final double t0i = z1i + z6i;
134       final double t1r = z1r - z6r;
135       final double t1i = z1i - z6i;
136       final double t2r = z2r + z5r;
137       final double t2i = z2i + z5i;
138       final double t3r = z2r - z5r;
139       final double t3i = z2i - z5i;
140       final double t4r = z4r + z3r;
141       final double t4i = z4i + z3i;
142       final double t5r = z4r - z3r;
143       final double t5i = z4i - z3i;
144       final double t6r = t2r + t0r;
145       final double t6i = t2i + t0i;
146       final double t7r = t5r + t3r;
147       final double t7i = t5i + t3i;
148       final double b0r = z0r + t6r + t4r;
149       final double b0i = z0i + t6i + t4i;
150       final double b1r = v1 * (t6r + t4r);
151       final double b1i = v1 * (t6i + t4i);
152       final double b2r = v2 * (t0r - t4r);
153       final double b2i = v2 * (t0i - t4i);
154       final double b3r = v3 * (t4r - t2r);
155       final double b3i = v3 * (t4i - t2i);
156       final double b4r = v4 * (t2r - t0r);
157       final double b4i = v4 * (t2i - t0i);
158       final double b5r = v5 * (t7r + t1r);
159       final double b5i = v5 * (t7i + t1i);
160       final double b6r = v6 * (t1r - t5r);
161       final double b6i = v6 * (t1i - t5i);
162       final double b7r = v7 * (t5r - t3r);
163       final double b7i = v7 * (t5i - t3i);
164       final double b8r = v8 * (t3r - t1r);
165       final double b8i = v8 * (t3i - t1i);
166       final double u0r = b0r + b1r;
167       final double u0i = b0i + b1i;
168       final double u1r = b2r + b3r;
169       final double u1i = b2i + b3i;
170       final double u2r = b4r - b3r;
171       final double u2i = b4i - b3i;
172       final double u3r = -b2r - b4r;
173       final double u3i = -b2i - b4i;
174       final double u4r = b6r + b7r;
175       final double u4i = b6i + b7i;
176       final double u5r = b8r - b7r;
177       final double u5i = b8i - b7i;
178       final double u6r = -b8r - b6r;
179       final double u6i = -b8i - b6i;
180       final double u7r = u0r + u1r;
181       final double u7i = u0i + u1i;
182       final double u8r = u0r + u2r;
183       final double u8i = u0i + u2i;
184       final double u9r = u0r + u3r;
185       final double u9i = u0i + u3i;
186       final double u10r = u4r + b5r;
187       final double u10i = u4i + b5i;
188       final double u11r = u5r + b5r;
189       final double u11i = u5i + b5i;
190       final double u12r = u6r + b5r;
191       final double u12i = u6i + b5i;
192       ret[j] = b0r;
193       ret[j + im] = b0i;
194       ret[j + dj] = u7r + u10i;
195       ret[j + dj + im] = u7i - u10r;
196       ret[j + dj2] = u9r + u12i;
197       ret[j + dj2 + im] = u9i - u12r;
198       ret[j + dj3] = u8r - u11i;
199       ret[j + dj3 + im] = u8i + u11r;
200       ret[j + dj4] = u8r + u11i;
201       ret[j + dj4 + im] = u8i - u11r;
202       ret[j + dj5] = u9r - u12i;
203       ret[j + dj5 + im] = u9i + u12r;
204       ret[j + dj6] = u7r - u10i;
205       ret[j + dj6 + im] = u7i + u10r;
206     }
207 
208     j += jstep;
209     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
210       final double[] twids = twiddles[k];
211       final double w1r = twids[0];
212       final double w1i = -sign * twids[1];
213       final double w2r = twids[2];
214       final double w2i = -sign * twids[3];
215       final double w3r = twids[4];
216       final double w3i = -sign * twids[5];
217       final double w4r = twids[6];
218       final double w4i = -sign * twids[7];
219       final double w5r = twids[8];
220       final double w5i = -sign * twids[9];
221       final double w6r = twids[10];
222       final double w6i = -sign * twids[11];
223       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
224         final double z0r = data[i];
225         final double z1r = data[i + di];
226         final double z2r = data[i + di2];
227         final double z3r = data[i + di3];
228         final double z4r = data[i + di4];
229         final double z5r = data[i + di5];
230         final double z6r = data[i + di6];
231         final double z0i = data[i + im];
232         final double z1i = data[i + di + im];
233         final double z2i = data[i + di2 + im];
234         final double z3i = data[i + di3 + im];
235         final double z4i = data[i + di4 + im];
236         final double z5i = data[i + di5 + im];
237         final double z6i = data[i + di6 + im];
238         final double t0r = z1r + z6r;
239         final double t0i = z1i + z6i;
240         final double t1r = z1r - z6r;
241         final double t1i = z1i - z6i;
242         final double t2r = z2r + z5r;
243         final double t2i = z2i + z5i;
244         final double t3r = z2r - z5r;
245         final double t3i = z2i - z5i;
246         final double t4r = z4r + z3r;
247         final double t4i = z4i + z3i;
248         final double t5r = z4r - z3r;
249         final double t5i = z4i - z3i;
250         final double t6r = t2r + t0r;
251         final double t6i = t2i + t0i;
252         final double t7r = t5r + t3r;
253         final double t7i = t5i + t3i;
254         final double b0r = z0r + t6r + t4r;
255         final double b0i = z0i + t6i + t4i;
256         final double b1r = v1 * (t6r + t4r);
257         final double b1i = v1 * (t6i + t4i);
258         final double b2r = v2 * (t0r - t4r);
259         final double b2i = v2 * (t0i - t4i);
260         final double b3r = v3 * (t4r - t2r);
261         final double b3i = v3 * (t4i - t2i);
262         final double b4r = v4 * (t2r - t0r);
263         final double b4i = v4 * (t2i - t0i);
264         final double b5r = v5 * (t7r + t1r);
265         final double b5i = v5 * (t7i + t1i);
266         final double b6r = v6 * (t1r - t5r);
267         final double b6i = v6 * (t1i - t5i);
268         final double b7r = v7 * (t5r - t3r);
269         final double b7i = v7 * (t5i - t3i);
270         final double b8r = v8 * (t3r - t1r);
271         final double b8i = v8 * (t3i - t1i);
272         final double u0r = b0r + b1r;
273         final double u0i = b0i + b1i;
274         final double u1r = b2r + b3r;
275         final double u1i = b2i + b3i;
276         final double u2r = b4r - b3r;
277         final double u2i = b4i - b3i;
278         final double u3r = -b2r - b4r;
279         final double u3i = -b2i - b4i;
280         final double u4r = b6r + b7r;
281         final double u4i = b6i + b7i;
282         final double u5r = b8r - b7r;
283         final double u5i = b8i - b7i;
284         final double u6r = -b8r - b6r;
285         final double u6i = -b8i - b6i;
286         final double u7r = u0r + u1r;
287         final double u7i = u0i + u1i;
288         final double u8r = u0r + u2r;
289         final double u8i = u0i + u2i;
290         final double u9r = u0r + u3r;
291         final double u9i = u0i + u3i;
292         final double u10r = u4r + b5r;
293         final double u10i = u4i + b5i;
294         final double u11r = u5r + b5r;
295         final double u11i = u5i + b5i;
296         final double u12r = u6r + b5r;
297         final double u12i = u6i + b5i;
298         ret[j] = b0r;
299         ret[j + im] = b0i;
300         multiplyAndStore(u7r + u10i, u7i - u10r, w1r, w1i, ret, j + dj, j + dj + im);
301         multiplyAndStore(u9r + u12i, u9i - u12r, w2r, w2i, ret, j + dj2, j + dj2 + im);
302         multiplyAndStore(u8r - u11i, u8i + u11r, w3r, w3i, ret, j + dj3, j + dj3 + im);
303         multiplyAndStore(u8r + u11i, u8i - u11r, w4r, w4i, ret, j + dj4, j + dj4 + im);
304         multiplyAndStore(u9r - u12i, u9i + u12r, w5r, w5i, ret, j + dj5, j + dj5 + im);
305         multiplyAndStore(u7r - u10i, u7i + u10r, w6r, w6i, ret, j + dj6, j + dj6 + im);
306       }
307     }
308   }
309 
310   /**
311    * Handle factors of 6 using SIMD vectors.
312    *
313    * @param passData PassData.
314    */
315   @Override
316   protected void passSIMD(PassData passData) {
317     if (im == 1) {
318       // If the inner loop limit is not divisible by the loop increment, use the scalar method.
319       if (innerLoopLimit % LOOP != 0) {
320         passScalar(passData);
321       } else {
322         interleaved(passData);
323       }
324     } else {
325       // If the inner loop limit is not divisible by the loop increment, use the scalar method.
326       if (innerLoopLimit % BLOCK_LOOP != 0) {
327         passScalar(passData);
328       } else {
329         blocked(passData);
330       }
331     }
332   }
333 
334   /**
335    * Handle factors of 7.
336    *
337    * @param passData PassData.
338    */
339   private void interleaved(PassData passData) {
340     final double[] data = passData.in;
341     final double[] ret = passData.out;
342     int sign = passData.sign;
343     int i = passData.inOffset;
344     int j = passData.outOffset;
345     final double s1 = (-sign) * sin2PI_7;
346     final double s2 = (-sign) * sin4PI_7;
347     final double s3 = (-sign) * sin6PI_7;
348     final double v5 = (s1 + s2 - s3) * oneThird;
349     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
350     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
351     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
352     // First pass of the 7-point FFT has no twiddle factors.
353     for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP, i += LENGTH, j += LENGTH) {
354       DoubleVector
355           z0 = fromArray(DOUBLE_SPECIES, data, i),
356           z1 = fromArray(DOUBLE_SPECIES, data, i + di),
357           z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
358           z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
359           z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
360           z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
361           z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
362       DoubleVector
363           t0 = z1.add(z6),
364           t1 = z1.sub(z6),
365           t2 = z2.add(z5),
366           t3 = z2.sub(z5),
367           t4 = z4.add(z3),
368           t5 = z4.sub(z3),
369           t6 = t2.add(t0),
370           t7 = t5.add(t3);
371       DoubleVector
372           b0 = z0.add(t6).add(t4),
373           b1 = t6.add(t4).mul(v1),
374           b2 = t0.sub(t4).mul(v2),
375           b3 = t4.sub(t2).mul(v3),
376           b4 = t2.sub(t0).mul(v4),
377           b5 = t7.add(t1).mul(v5),
378           b6 = t1.sub(t5).mul(v6),
379           b7 = t5.sub(t3).mul(v7),
380           b8 = t3.sub(t1).mul(v8);
381       DoubleVector
382           u0 = b0.add(b1),
383           u1 = b2.add(b3),
384           u2 = b4.sub(b3),
385           u3 = b2.add(b4).neg(),
386           u4 = b6.add(b7),
387           u5 = b8.sub(b7),
388           u6 = b8.add(b6).neg(),
389           u7 = u0.add(u1),
390           u8 = u0.add(u2),
391           u9 = u0.add(u3),
392           u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
393           u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
394           u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
395       b0.intoArray(ret, j);
396       u7.add(u10.mul(NEGATE_IM)).intoArray(ret, j + dj);
397       u9.add(u12.mul(NEGATE_IM)).intoArray(ret, j + dj2);
398       u8.add(u11.mul(NEGATE_RE)).intoArray(ret, j + dj3);
399       u8.add(u11.mul(NEGATE_IM)).intoArray(ret, j + dj4);
400       u9.add(u12.mul(NEGATE_RE)).intoArray(ret, j + dj5);
401       u7.add(u10.mul(NEGATE_RE)).intoArray(ret, j + dj6);
402     }
403 
404     j += jstep;
405     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
406       final double[] twids = twiddles[k];
407       DoubleVector
408           w1r = broadcast(DOUBLE_SPECIES, twids[0]),
409           w1i = broadcast(DOUBLE_SPECIES, -sign * twids[1]).mul(NEGATE_IM),
410           w2r = broadcast(DOUBLE_SPECIES, twids[2]),
411           w2i = broadcast(DOUBLE_SPECIES, -sign * twids[3]).mul(NEGATE_IM),
412           w3r = broadcast(DOUBLE_SPECIES, twids[4]),
413           w3i = broadcast(DOUBLE_SPECIES, -sign * twids[5]).mul(NEGATE_IM),
414           w4r = broadcast(DOUBLE_SPECIES, twids[6]),
415           w4i = broadcast(DOUBLE_SPECIES, -sign * twids[7]).mul(NEGATE_IM),
416           w5r = broadcast(DOUBLE_SPECIES, twids[8]),
417           w5i = broadcast(DOUBLE_SPECIES, -sign * twids[9]).mul(NEGATE_IM),
418           w6r = broadcast(DOUBLE_SPECIES, twids[10]),
419           w6i = broadcast(DOUBLE_SPECIES, -sign * twids[11]).mul(NEGATE_IM);
420       for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP, i += LENGTH, j += LENGTH) {
421         DoubleVector
422             z0 = fromArray(DOUBLE_SPECIES, data, i),
423             z1 = fromArray(DOUBLE_SPECIES, data, i + di),
424             z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
425             z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
426             z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
427             z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
428             z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
429         DoubleVector
430             t0 = z1.add(z6),
431             t1 = z1.sub(z6),
432             t2 = z2.add(z5),
433             t3 = z2.sub(z5),
434             t4 = z4.add(z3),
435             t5 = z4.sub(z3),
436             t6 = t2.add(t0),
437             t7 = t5.add(t3);
438         DoubleVector
439             b0 = z0.add(t6).add(t4),
440             b1 = t6.add(t4).mul(v1),
441             b2 = t0.sub(t4).mul(v2),
442             b3 = t4.sub(t2).mul(v3),
443             b4 = t2.sub(t0).mul(v4),
444             b5 = t7.add(t1).mul(v5),
445             b6 = t1.sub(t5).mul(v6),
446             b7 = t5.sub(t3).mul(v7),
447             b8 = t3.sub(t1).mul(v8);
448         DoubleVector
449             u0 = b0.add(b1),
450             u1 = b2.add(b3),
451             u2 = b4.sub(b3),
452             u3 = b2.add(b4).neg(),
453             u4 = b6.add(b7),
454             u5 = b8.sub(b7),
455             u6 = b8.add(b6).neg(),
456             u7 = u0.add(u1),
457             u8 = u0.add(u2),
458             u9 = u0.add(u3),
459             u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
460             u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
461             u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
462         b0.intoArray(ret, j);
463         DoubleVector
464             x1 = u7.add(u10.mul(NEGATE_IM)),
465             x2 = u9.add(u12.mul(NEGATE_IM)),
466             x3 = u8.add(u11.mul(NEGATE_RE)),
467             x4 = u8.add(u11.mul(NEGATE_IM)),
468             x5 = u9.add(u12.mul(NEGATE_RE)),
469             x6 = u7.add(u10.mul(NEGATE_RE));
470         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj);
471         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj2);
472         w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj3);
473         w4r.mul(x4).add(w4i.mul(x4).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj4);
474         w5r.mul(x5).add(w5i.mul(x5).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj5);
475         w6r.mul(x6).add(w6i.mul(x6).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj6);
476       }
477     }
478   }
479 
480   /**
481    * Handle factors of 7.
482    *
483    * @param passData PassData.
484    */
485   private void blocked(PassData passData) {
486     final double[] data = passData.in;
487     final double[] ret = passData.out;
488     int sign = passData.sign;
489     int i = passData.inOffset;
490     int j = passData.outOffset;
491     final double s1 = (-sign) * sin2PI_7;
492     final double s2 = (-sign) * sin4PI_7;
493     final double s3 = (-sign) * sin6PI_7;
494     final double v5 = (s1 + s2 - s3) * oneThird;
495     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
496     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
497     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
498     // First pass of the 7-point FFT has no twiddle factors.
499     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
500       final DoubleVector
501           z0r = fromArray(DOUBLE_SPECIES, data, i),
502           z1r = fromArray(DOUBLE_SPECIES, data, i + di),
503           z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
504           z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
505           z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
506           z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
507           z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
508           z0i = fromArray(DOUBLE_SPECIES, data, i + im),
509           z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
510           z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
511           z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
512           z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
513           z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
514           z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
515       final DoubleVector
516           t0r = z1r.add(z6r), t0i = z1i.add(z6i),
517           t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
518           t2r = z2r.add(z5r), t2i = z2i.add(z5i),
519           t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
520           t4r = z4r.add(z3r), t4i = z4i.add(z3i),
521           t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
522           t6r = t2r.add(t0r), t6i = t2i.add(t0i),
523           t7r = t5r.add(t3r), t7i = t5i.add(t3i);
524       final DoubleVector
525           b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
526           b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
527           b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
528           b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
529           b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
530           b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
531           b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
532           b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
533           b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
534       final DoubleVector
535           u0r = b0r.add(b1r), u0i = b0i.add(b1i),
536           u1r = b2r.add(b3r), u1i = b2i.add(b3i),
537           u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
538           u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
539           u4r = b6r.add(b7r), u4i = b6i.add(b7i),
540           u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
541           u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
542           u7r = u0r.add(u1r), u7i = u0i.add(u1i),
543           u8r = u0r.add(u2r), u8i = u0i.add(u2i),
544           u9r = u0r.add(u3r), u9i = u0i.add(u3i),
545           u10r = u4r.add(b5r), u10i = u4i.add(b5i),
546           u11r = u5r.add(b5r), u11i = u5i.add(b5i),
547           u12r = u6r.add(b5r), u12i = u6i.add(b5i);
548       b0r.intoArray(ret, j);
549       b0i.intoArray(ret, j + im);
550       u7r.add(u10i).intoArray(ret, j + dj);
551       u7i.sub(u10r).intoArray(ret, j + dj + im);
552       u9r.add(u12i).intoArray(ret, j + dj2);
553       u9i.sub(u12r).intoArray(ret, j + dj2 + im);
554       u8r.sub(u11i).intoArray(ret, j + dj3);
555       u8i.add(u11r).intoArray(ret, j + dj3 + im);
556       u8r.add(u11i).intoArray(ret, j + dj4);
557       u8i.sub(u11r).intoArray(ret, j + dj4 + im);
558       u9r.sub(u12i).intoArray(ret, j + dj5);
559       u9i.add(u12r).intoArray(ret, j + dj5 + im);
560       u7r.sub(u10i).intoArray(ret, j + dj6);
561       u7i.add(u10r).intoArray(ret, j + dj6 + im);
562     }
563 
564     j += jstep;
565     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
566       final double[] twids = twiddles[k];
567       DoubleVector
568           w1r = broadcast(DOUBLE_SPECIES, twids[0]),
569           w1i = broadcast(DOUBLE_SPECIES, -sign * twids[1]),
570           w2r = broadcast(DOUBLE_SPECIES, twids[2]),
571           w2i = broadcast(DOUBLE_SPECIES, -sign * twids[3]),
572           w3r = broadcast(DOUBLE_SPECIES, twids[4]),
573           w3i = broadcast(DOUBLE_SPECIES, -sign * twids[5]),
574           w4r = broadcast(DOUBLE_SPECIES, twids[6]),
575           w4i = broadcast(DOUBLE_SPECIES, -sign * twids[7]),
576           w5r = broadcast(DOUBLE_SPECIES, twids[8]),
577           w5i = broadcast(DOUBLE_SPECIES, -sign * twids[9]),
578           w6r = broadcast(DOUBLE_SPECIES, twids[10]),
579           w6i = broadcast(DOUBLE_SPECIES, -sign * twids[11]);
580       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
581         final DoubleVector
582             z0r = fromArray(DOUBLE_SPECIES, data, i),
583             z1r = fromArray(DOUBLE_SPECIES, data, i + di),
584             z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
585             z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
586             z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
587             z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
588             z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
589             z0i = fromArray(DOUBLE_SPECIES, data, i + im),
590             z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
591             z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
592             z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
593             z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
594             z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
595             z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
596         final DoubleVector
597             t0r = z1r.add(z6r), t0i = z1i.add(z6i),
598             t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
599             t2r = z2r.add(z5r), t2i = z2i.add(z5i),
600             t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
601             t4r = z4r.add(z3r), t4i = z4i.add(z3i),
602             t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
603             t6r = t2r.add(t0r), t6i = t2i.add(t0i),
604             t7r = t5r.add(t3r), t7i = t5i.add(t3i);
605         final DoubleVector
606             b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
607             b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
608             b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
609             b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
610             b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
611             b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
612             b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
613             b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
614             b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
615         final DoubleVector
616             u0r = b0r.add(b1r), u0i = b0i.add(b1i),
617             u1r = b2r.add(b3r), u1i = b2i.add(b3i),
618             u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
619             u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
620             u4r = b6r.add(b7r), u4i = b6i.add(b7i),
621             u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
622             u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
623             u7r = u0r.add(u1r), u7i = u0i.add(u1i),
624             u8r = u0r.add(u2r), u8i = u0i.add(u2i),
625             u9r = u0r.add(u3r), u9i = u0i.add(u3i),
626             u10r = u4r.add(b5r), u10i = u4i.add(b5i),
627             u11r = u5r.add(b5r), u11i = u5i.add(b5i),
628             u12r = u6r.add(b5r), u12i = u6i.add(b5i);
629         b0r.intoArray(ret, j);
630         b0i.intoArray(ret, j + im);
631         DoubleVector
632             x1r = u7r.add(u10i), x1i = u7i.sub(u10r),
633             x2r = u9r.add(u12i), x2i = u9i.sub(u12r),
634             x3r = u8r.sub(u11i), x3i = u8i.add(u11r),
635             x4r = u8r.add(u11i), x4i = u8i.sub(u11r),
636             x5r = u9r.sub(u12i), x5i = u9i.add(u12r),
637             x6r = u7r.sub(u10i), x6i = u7i.add(u10r);
638         w1r.mul(x1r).add(w1i.neg().mul(x1i)).intoArray(ret, j + dj);
639         w2r.mul(x2r).add(w2i.neg().mul(x2i)).intoArray(ret, j + dj2);
640         w3r.mul(x3r).add(w3i.neg().mul(x3i)).intoArray(ret, j + dj3);
641         w4r.mul(x4r).add(w4i.neg().mul(x4i)).intoArray(ret, j + dj4);
642         w5r.mul(x5r).add(w5i.neg().mul(x5i)).intoArray(ret, j + dj5);
643         w6r.mul(x6r).add(w6i.neg().mul(x6i)).intoArray(ret, j + dj6);
644         w1r.mul(x1i).add(w1i.mul(x1r)).intoArray(ret, j + dj + im);
645         w2r.mul(x2i).add(w2i.mul(x2r)).intoArray(ret, j + dj2 + im);
646         w3r.mul(x3i).add(w3i.mul(x3r)).intoArray(ret, j + dj3 + im);
647         w4r.mul(x4i).add(w4i.mul(x4r)).intoArray(ret, j + dj4 + im);
648         w5r.mul(x5i).add(w5i.mul(x5r)).intoArray(ret, j + dj5 + im);
649         w6r.mul(x6i).add(w6i.mul(x6r)).intoArray(ret, j + dj6 + im);
650       }
651     }
652   }
653 }