View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2026.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  
42  import static jdk.incubator.vector.DoubleVector.broadcast;
43  import static jdk.incubator.vector.DoubleVector.fromArray;
44  import static org.apache.commons.math3.util.FastMath.PI;
45  import static org.apache.commons.math3.util.FastMath.cos;
46  import static org.apache.commons.math3.util.FastMath.sin;
47  
48  /**
49   * The MixedRadixFactor7 class handles factors of 7 in the FFT.
50   */
51  public class MixedRadixFactor7 extends MixedRadixFactor {
52  
53    private static final double oneThird = 1.0 / 3.0;
54    private static final double sin2PI_7 = sin(2.0 * PI / 7.0);
55    private static final double sin4PI_7 = sin(4.0 * PI / 7.0);
56    private static final double sin6PI_7 = sin(6.0 * PI / 7.0);
57    private static final double cos2PI_7 = cos(2.0 * PI / 7.0);
58    private static final double cos4PI_7 = cos(4.0 * PI / 7.0);
59    private static final double cos6PI_7 = cos(6.0 * PI / 7.0);
60    private static final double c1 = cos2PI_7;
61    private static final double c2 = cos4PI_7;
62    private static final double c3 = cos6PI_7;
63    private static final double v1 = (c1 + c2 + c3) * oneThird - 1.0;
64    private static final double v2 = (2.0 * c1 - c2 - c3) * oneThird;
65    private static final double v3 = (c1 - 2.0 * c2 + c3) * oneThird;
66    private static final double v4 = (c1 + c2 - 2.0 * c3) * oneThird;
67  
68    private final int di2;
69    private final int di3;
70    private final int di4;
71    private final int di5;
72    private final int di6;
73    private final int dj2;
74    private final int dj3;
75    private final int dj4;
76    private final int dj5;
77    private final int dj6;
78  
79    /**
80     * Construct a MixedRadixFactor7.
81     *
82     * @param passConstants PassConstants.
83     */
84    public MixedRadixFactor7(PassConstants passConstants) {
85      super(passConstants);
86      di2 = 2 * di;
87      di3 = 3 * di;
88      di4 = 4 * di;
89      di5 = 5 * di;
90      di6 = 6 * di;
91      dj2 = 2 * dj;
92      dj3 = 3 * dj;
93      dj4 = 4 * dj;
94      dj5 = 5 * dj;
95      dj6 = 6 * dj;
96    }
97  
98    /**
99     * Handle factors of 7.
100    *
101    * @param passData PassData.
102    */
103   protected void passScalar(PassData passData) {
104     final double[] data = passData.in;
105     final double[] ret = passData.out;
106     int sign = passData.sign;
107     int i = passData.inOffset;
108     int j = passData.outOffset;
109     final double s1 = (-sign) * sin2PI_7;
110     final double s2 = (-sign) * sin4PI_7;
111     final double s3 = (-sign) * sin6PI_7;
112     final double v5 = (s1 + s2 - s3) * oneThird;
113     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
114     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
115     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
116     // First pass of the 7-point FFT has no twiddle factors.
117     for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
118       final double z0r = data[i];
119       final double z1r = data[i + di];
120       final double z2r = data[i + di2];
121       final double z3r = data[i + di3];
122       final double z4r = data[i + di4];
123       final double z5r = data[i + di5];
124       final double z6r = data[i + di6];
125       final double z0i = data[i + im];
126       final double z1i = data[i + di + im];
127       final double z2i = data[i + di2 + im];
128       final double z3i = data[i + di3 + im];
129       final double z4i = data[i + di4 + im];
130       final double z5i = data[i + di5 + im];
131       final double z6i = data[i + di6 + im];
132       final double t0r = z1r + z6r;
133       final double t0i = z1i + z6i;
134       final double t1r = z1r - z6r;
135       final double t1i = z1i - z6i;
136       final double t2r = z2r + z5r;
137       final double t2i = z2i + z5i;
138       final double t3r = z2r - z5r;
139       final double t3i = z2i - z5i;
140       final double t4r = z4r + z3r;
141       final double t4i = z4i + z3i;
142       final double t5r = z4r - z3r;
143       final double t5i = z4i - z3i;
144       final double t6r = t2r + t0r;
145       final double t6i = t2i + t0i;
146       final double t7r = t5r + t3r;
147       final double t7i = t5i + t3i;
148       final double b0r = z0r + t6r + t4r;
149       final double b0i = z0i + t6i + t4i;
150       final double b1r = v1 * (t6r + t4r);
151       final double b1i = v1 * (t6i + t4i);
152       final double b2r = v2 * (t0r - t4r);
153       final double b2i = v2 * (t0i - t4i);
154       final double b3r = v3 * (t4r - t2r);
155       final double b3i = v3 * (t4i - t2i);
156       final double b4r = v4 * (t2r - t0r);
157       final double b4i = v4 * (t2i - t0i);
158       final double b5r = v5 * (t7r + t1r);
159       final double b5i = v5 * (t7i + t1i);
160       final double b6r = v6 * (t1r - t5r);
161       final double b6i = v6 * (t1i - t5i);
162       final double b7r = v7 * (t5r - t3r);
163       final double b7i = v7 * (t5i - t3i);
164       final double b8r = v8 * (t3r - t1r);
165       final double b8i = v8 * (t3i - t1i);
166       final double u0r = b0r + b1r;
167       final double u0i = b0i + b1i;
168       final double u1r = b2r + b3r;
169       final double u1i = b2i + b3i;
170       final double u2r = b4r - b3r;
171       final double u2i = b4i - b3i;
172       final double u3r = -b2r - b4r;
173       final double u3i = -b2i - b4i;
174       final double u4r = b6r + b7r;
175       final double u4i = b6i + b7i;
176       final double u5r = b8r - b7r;
177       final double u5i = b8i - b7i;
178       final double u6r = -b8r - b6r;
179       final double u6i = -b8i - b6i;
180       final double u7r = u0r + u1r;
181       final double u7i = u0i + u1i;
182       final double u8r = u0r + u2r;
183       final double u8i = u0i + u2i;
184       final double u9r = u0r + u3r;
185       final double u9i = u0i + u3i;
186       final double u10r = u4r + b5r;
187       final double u10i = u4i + b5i;
188       final double u11r = u5r + b5r;
189       final double u11i = u5i + b5i;
190       final double u12r = u6r + b5r;
191       final double u12i = u6i + b5i;
192       ret[j] = b0r;
193       ret[j + im] = b0i;
194       ret[j + dj] = u7r + u10i;
195       ret[j + dj + im] = u7i - u10r;
196       ret[j + dj2] = u9r + u12i;
197       ret[j + dj2 + im] = u9i - u12r;
198       ret[j + dj3] = u8r - u11i;
199       ret[j + dj3 + im] = u8i + u11r;
200       ret[j + dj4] = u8r + u11i;
201       ret[j + dj4 + im] = u8i - u11r;
202       ret[j + dj5] = u9r - u12i;
203       ret[j + dj5 + im] = u9i + u12r;
204       ret[j + dj6] = u7r - u10i;
205       ret[j + dj6 + im] = u7i + u10r;
206     }
207 
208     j += jstep;
209     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
210       final int index = k * 6;
211       final double w1r = wr[index];
212       final double w2r = wr[index + 1];
213       final double w3r = wr[index + 2];
214       final double w4r = wr[index + 3];
215       final double w5r = wr[index + 4];
216       final double w6r = wr[index + 5];
217       final double w1i = -sign * wi[index];
218       final double w2i = -sign * wi[index + 1];
219       final double w3i = -sign * wi[index + 2];
220       final double w4i = -sign * wi[index + 3];
221       final double w5i = -sign * wi[index + 4];
222       final double w6i = -sign * wi[index + 5];
223       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
224         final double z0r = data[i];
225         final double z1r = data[i + di];
226         final double z2r = data[i + di2];
227         final double z3r = data[i + di3];
228         final double z4r = data[i + di4];
229         final double z5r = data[i + di5];
230         final double z6r = data[i + di6];
231         final double z0i = data[i + im];
232         final double z1i = data[i + di + im];
233         final double z2i = data[i + di2 + im];
234         final double z3i = data[i + di3 + im];
235         final double z4i = data[i + di4 + im];
236         final double z5i = data[i + di5 + im];
237         final double z6i = data[i + di6 + im];
238         final double t0r = z1r + z6r;
239         final double t0i = z1i + z6i;
240         final double t1r = z1r - z6r;
241         final double t1i = z1i - z6i;
242         final double t2r = z2r + z5r;
243         final double t2i = z2i + z5i;
244         final double t3r = z2r - z5r;
245         final double t3i = z2i - z5i;
246         final double t4r = z4r + z3r;
247         final double t4i = z4i + z3i;
248         final double t5r = z4r - z3r;
249         final double t5i = z4i - z3i;
250         final double t6r = t2r + t0r;
251         final double t6i = t2i + t0i;
252         final double t7r = t5r + t3r;
253         final double t7i = t5i + t3i;
254         final double b0r = z0r + t6r + t4r;
255         final double b0i = z0i + t6i + t4i;
256         final double b1r = v1 * (t6r + t4r);
257         final double b1i = v1 * (t6i + t4i);
258         final double b2r = v2 * (t0r - t4r);
259         final double b2i = v2 * (t0i - t4i);
260         final double b3r = v3 * (t4r - t2r);
261         final double b3i = v3 * (t4i - t2i);
262         final double b4r = v4 * (t2r - t0r);
263         final double b4i = v4 * (t2i - t0i);
264         final double b5r = v5 * (t7r + t1r);
265         final double b5i = v5 * (t7i + t1i);
266         final double b6r = v6 * (t1r - t5r);
267         final double b6i = v6 * (t1i - t5i);
268         final double b7r = v7 * (t5r - t3r);
269         final double b7i = v7 * (t5i - t3i);
270         final double b8r = v8 * (t3r - t1r);
271         final double b8i = v8 * (t3i - t1i);
272         final double u0r = b0r + b1r;
273         final double u0i = b0i + b1i;
274         final double u1r = b2r + b3r;
275         final double u1i = b2i + b3i;
276         final double u2r = b4r - b3r;
277         final double u2i = b4i - b3i;
278         final double u3r = -b2r - b4r;
279         final double u3i = -b2i - b4i;
280         final double u4r = b6r + b7r;
281         final double u4i = b6i + b7i;
282         final double u5r = b8r - b7r;
283         final double u5i = b8i - b7i;
284         final double u6r = -b8r - b6r;
285         final double u6i = -b8i - b6i;
286         final double u7r = u0r + u1r;
287         final double u7i = u0i + u1i;
288         final double u8r = u0r + u2r;
289         final double u8i = u0i + u2i;
290         final double u9r = u0r + u3r;
291         final double u9i = u0i + u3i;
292         final double u10r = u4r + b5r;
293         final double u10i = u4i + b5i;
294         final double u11r = u5r + b5r;
295         final double u11i = u5i + b5i;
296         final double u12r = u6r + b5r;
297         final double u12i = u6i + b5i;
298         ret[j] = b0r;
299         ret[j + im] = b0i;
300         multiplyAndStore(u7r + u10i, u7i - u10r, w1r, w1i, ret, j + dj, j + dj + im);
301         multiplyAndStore(u9r + u12i, u9i - u12r, w2r, w2i, ret, j + dj2, j + dj2 + im);
302         multiplyAndStore(u8r - u11i, u8i + u11r, w3r, w3i, ret, j + dj3, j + dj3 + im);
303         multiplyAndStore(u8r + u11i, u8i - u11r, w4r, w4i, ret, j + dj4, j + dj4 + im);
304         multiplyAndStore(u9r - u12i, u9i + u12r, w5r, w5i, ret, j + dj5, j + dj5 + im);
305         multiplyAndStore(u7r - u10i, u7i + u10r, w6r, w6i, ret, j + dj6, j + dj6 + im);
306       }
307     }
308   }
309 
310   /**
311    * Handle factors of 6 using SIMD vectors.
312    *
313    * @param passData PassData.
314    */
315   @Override
316   protected void passSIMD(PassData passData) {
317     if (!isValidSIMDWidth(simdWidth)) {
318       passScalar(passData);
319     } else {
320       if (im == 1) {
321         interleaved(passData);
322       } else {
323         blocked(passData);
324       }
325     }
326   }
327 
328   /**
329    * Handle factors of 7.
330    *
331    * @param passData PassData.
332    */
333   private void interleaved(PassData passData) {
334     final double[] data = passData.in;
335     final double[] ret = passData.out;
336     int sign = passData.sign;
337     int i = passData.inOffset;
338     int j = passData.outOffset;
339     final double s1 = (-sign) * sin2PI_7;
340     final double s2 = (-sign) * sin4PI_7;
341     final double s3 = (-sign) * sin6PI_7;
342     final double v5 = (s1 + s2 - s3) * oneThird;
343     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
344     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
345     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
346     // First pass of the 7-point FFT has no twiddle factors.
347     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP, i += LENGTH, j += LENGTH) {
348       final DoubleVector
349           z0 = fromArray(DOUBLE_SPECIES, data, i),
350           z1 = fromArray(DOUBLE_SPECIES, data, i + di),
351           z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
352           z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
353           z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
354           z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
355           z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
356       final DoubleVector
357           t0 = z1.add(z6),
358           t1 = z1.sub(z6),
359           t2 = z2.add(z5),
360           t3 = z2.sub(z5),
361           t4 = z4.add(z3),
362           t5 = z4.sub(z3),
363           t6 = t2.add(t0),
364           t7 = t5.add(t3);
365       final DoubleVector
366           b0 = z0.add(t6).add(t4),
367           b1 = t6.add(t4).mul(v1),
368           b2 = t0.sub(t4).mul(v2),
369           b3 = t4.sub(t2).mul(v3),
370           b4 = t2.sub(t0).mul(v4),
371           b5 = t7.add(t1).mul(v5),
372           b6 = t1.sub(t5).mul(v6),
373           b7 = t5.sub(t3).mul(v7),
374           b8 = t3.sub(t1).mul(v8);
375       final DoubleVector
376           u0 = b0.add(b1),
377           u1 = b2.add(b3),
378           u2 = b4.sub(b3),
379           u3 = b2.add(b4).neg(),
380           u4 = b6.add(b7),
381           u5 = b8.sub(b7),
382           u6 = b8.add(b6).neg(),
383           u7 = u0.add(u1),
384           u8 = u0.add(u2),
385           u9 = u0.add(u3),
386           u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
387           u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
388           u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
389       b0.intoArray(ret, j);
390       u10.fma(NEGATE_IM, u7).intoArray(ret, j + dj);
391       u12.fma(NEGATE_IM, u9).intoArray(ret, j + dj2);
392       u11.fma(NEGATE_RE, u8).intoArray(ret, j + dj3);
393       u11.fma(NEGATE_IM, u8).intoArray(ret, j + dj4);
394       u12.fma(NEGATE_RE, u9).intoArray(ret, j + dj5);
395       u10.fma(NEGATE_RE, u7).intoArray(ret, j + dj6);
396     }
397 
398     j += jstep;
399     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
400       final int index = k * 6;
401       final DoubleVector
402           w1r = broadcast(DOUBLE_SPECIES, wr[index]),
403           w2r = broadcast(DOUBLE_SPECIES, wr[index + 1]),
404           w3r = broadcast(DOUBLE_SPECIES, wr[index + 2]),
405           w4r = broadcast(DOUBLE_SPECIES, wr[index + 3]),
406           w5r = broadcast(DOUBLE_SPECIES, wr[index + 4]),
407           w6r = broadcast(DOUBLE_SPECIES, wr[index + 5]),
408           w1i = broadcast(DOUBLE_SPECIES, -sign * wi[index]).mul(NEGATE_IM),
409           w2i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 1]).mul(NEGATE_IM),
410           w3i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 2]).mul(NEGATE_IM),
411           w4i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 3]).mul(NEGATE_IM),
412           w5i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 4]).mul(NEGATE_IM),
413           w6i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 5]).mul(NEGATE_IM);
414       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP, i += LENGTH, j += LENGTH) {
415         final DoubleVector
416             z0 = fromArray(DOUBLE_SPECIES, data, i),
417             z1 = fromArray(DOUBLE_SPECIES, data, i + di),
418             z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
419             z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
420             z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
421             z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
422             z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
423         final DoubleVector
424             t0 = z1.add(z6),
425             t1 = z1.sub(z6),
426             t2 = z2.add(z5),
427             t3 = z2.sub(z5),
428             t4 = z4.add(z3),
429             t5 = z4.sub(z3),
430             t6 = t2.add(t0),
431             t7 = t5.add(t3);
432         final DoubleVector
433             b0 = z0.add(t6).add(t4),
434             b1 = t6.add(t4).mul(v1),
435             b2 = t0.sub(t4).mul(v2),
436             b3 = t4.sub(t2).mul(v3),
437             b4 = t2.sub(t0).mul(v4),
438             b5 = t7.add(t1).mul(v5),
439             b6 = t1.sub(t5).mul(v6),
440             b7 = t5.sub(t3).mul(v7),
441             b8 = t3.sub(t1).mul(v8);
442         final DoubleVector
443             u0 = b0.add(b1),
444             u1 = b2.add(b3),
445             u2 = b4.sub(b3),
446             u3 = b2.add(b4).neg(),
447             u4 = b6.add(b7),
448             u5 = b8.sub(b7),
449             u6 = b8.add(b6).neg(),
450             u7 = u0.add(u1),
451             u8 = u0.add(u2),
452             u9 = u0.add(u3),
453             u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
454             u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
455             u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
456         b0.intoArray(ret, j);
457         final DoubleVector
458             x1 = u10.fma(NEGATE_IM, u7),
459             x2 = u12.fma(NEGATE_IM, u9),
460             x3 = u11.fma(NEGATE_RE, u8),
461             x4 = u11.fma(NEGATE_IM, u8),
462             x5 = u12.fma(NEGATE_RE, u9),
463             x6 = u10.fma(NEGATE_RE, u7);
464         w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj);
465         w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj2);
466         w3r.fma(x3, w3i.mul(x3).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj3);
467         w4r.fma(x4, w4i.mul(x4).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj4);
468         w5r.fma(x5, w5i.mul(x5).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj5);
469         w6r.fma(x6, w6i.mul(x6).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj6);
470       }
471     }
472   }
473 
474   /**
475    * Handle factors of 7.
476    *
477    * @param passData PassData.
478    */
479   private void blocked(PassData passData) {
480     final double[] data = passData.in;
481     final double[] ret = passData.out;
482     int sign = passData.sign;
483     int i = passData.inOffset;
484     int j = passData.outOffset;
485     final double s1 = (-sign) * sin2PI_7;
486     final double s2 = (-sign) * sin4PI_7;
487     final double s3 = (-sign) * sin6PI_7;
488     final double v5 = (s1 + s2 - s3) * oneThird;
489     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
490     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
491     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
492     // First pass of the 7-point FFT has no twiddle factors.
493     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
494       final DoubleVector
495           z0r = fromArray(DOUBLE_SPECIES, data, i),
496           z1r = fromArray(DOUBLE_SPECIES, data, i + di),
497           z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
498           z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
499           z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
500           z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
501           z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
502           z0i = fromArray(DOUBLE_SPECIES, data, i + im),
503           z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
504           z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
505           z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
506           z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
507           z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
508           z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
509       final DoubleVector
510           t0r = z1r.add(z6r), t0i = z1i.add(z6i),
511           t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
512           t2r = z2r.add(z5r), t2i = z2i.add(z5i),
513           t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
514           t4r = z4r.add(z3r), t4i = z4i.add(z3i),
515           t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
516           t6r = t2r.add(t0r), t6i = t2i.add(t0i),
517           t7r = t5r.add(t3r), t7i = t5i.add(t3i);
518       final DoubleVector
519           b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
520           b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
521           b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
522           b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
523           b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
524           b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
525           b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
526           b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
527           b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
528       final DoubleVector
529           u0r = b0r.add(b1r), u0i = b0i.add(b1i),
530           u1r = b2r.add(b3r), u1i = b2i.add(b3i),
531           u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
532           u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
533           u4r = b6r.add(b7r), u4i = b6i.add(b7i),
534           u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
535           u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
536           u7r = u0r.add(u1r), u7i = u0i.add(u1i),
537           u8r = u0r.add(u2r), u8i = u0i.add(u2i),
538           u9r = u0r.add(u3r), u9i = u0i.add(u3i),
539           u10r = u4r.add(b5r), u10i = u4i.add(b5i),
540           u11r = u5r.add(b5r), u11i = u5i.add(b5i),
541           u12r = u6r.add(b5r), u12i = u6i.add(b5i);
542       b0r.intoArray(ret, j);
543       b0i.intoArray(ret, j + im);
544       u7r.add(u10i).intoArray(ret, j + dj);
545       u7i.sub(u10r).intoArray(ret, j + dj + im);
546       u9r.add(u12i).intoArray(ret, j + dj2);
547       u9i.sub(u12r).intoArray(ret, j + dj2 + im);
548       u8r.sub(u11i).intoArray(ret, j + dj3);
549       u8i.add(u11r).intoArray(ret, j + dj3 + im);
550       u8r.add(u11i).intoArray(ret, j + dj4);
551       u8i.sub(u11r).intoArray(ret, j + dj4 + im);
552       u9r.sub(u12i).intoArray(ret, j + dj5);
553       u9i.add(u12r).intoArray(ret, j + dj5 + im);
554       u7r.sub(u10i).intoArray(ret, j + dj6);
555       u7i.add(u10r).intoArray(ret, j + dj6 + im);
556     }
557 
558     j += jstep;
559     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
560       final int index = k * 6;
561       final double
562           w1r = wr[index],
563           w2r = wr[index + 1],
564           w3r = wr[index + 2],
565           w4r = wr[index + 3],
566           w5r = wr[index + 4],
567           w6r = wr[index + 5],
568           w1i = -sign * wi[index],
569           w2i = -sign * wi[index + 1],
570           w3i = -sign * wi[index + 2],
571           w4i = -sign * wi[index + 3],
572           w5i = -sign * wi[index + 4],
573           w6i = -sign * wi[index + 5];
574       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
575         final DoubleVector
576             z0r = fromArray(DOUBLE_SPECIES, data, i),
577             z1r = fromArray(DOUBLE_SPECIES, data, i + di),
578             z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
579             z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
580             z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
581             z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
582             z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
583             z0i = fromArray(DOUBLE_SPECIES, data, i + im),
584             z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
585             z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
586             z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
587             z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
588             z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
589             z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
590         final DoubleVector
591             t0r = z1r.add(z6r), t0i = z1i.add(z6i),
592             t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
593             t2r = z2r.add(z5r), t2i = z2i.add(z5i),
594             t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
595             t4r = z4r.add(z3r), t4i = z4i.add(z3i),
596             t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
597             t6r = t2r.add(t0r), t6i = t2i.add(t0i),
598             t7r = t5r.add(t3r), t7i = t5i.add(t3i);
599         final DoubleVector
600             b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
601             b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
602             b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
603             b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
604             b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
605             b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
606             b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
607             b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
608             b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
609         final DoubleVector
610             u0r = b0r.add(b1r), u0i = b0i.add(b1i),
611             u1r = b2r.add(b3r), u1i = b2i.add(b3i),
612             u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
613             u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
614             u4r = b6r.add(b7r), u4i = b6i.add(b7i),
615             u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
616             u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
617             u7r = u0r.add(u1r), u7i = u0i.add(u1i),
618             u8r = u0r.add(u2r), u8i = u0i.add(u2i),
619             u9r = u0r.add(u3r), u9i = u0i.add(u3i),
620             u10r = u4r.add(b5r), u10i = u4i.add(b5i),
621             u11r = u5r.add(b5r), u11i = u5i.add(b5i),
622             u12r = u6r.add(b5r), u12i = u6i.add(b5i);
623         b0r.intoArray(ret, j);
624         b0i.intoArray(ret, j + im);
625         final DoubleVector
626             x1r = u7r.add(u10i), x1i = u7i.sub(u10r),
627             x2r = u9r.add(u12i), x2i = u9i.sub(u12r),
628             x3r = u8r.sub(u11i), x3i = u8i.add(u11r),
629             x4r = u8r.add(u11i), x4i = u8i.sub(u11r),
630             x5r = u9r.sub(u12i), x5i = u9i.add(u12r),
631             x6r = u7r.sub(u10i), x6i = u7i.add(u10r);
632         x1r.mul(w1r).sub(x1i.mul(w1i)).intoArray(ret, j + dj);
633         x2r.mul(w2r).sub(x2i.mul(w2i)).intoArray(ret, j + dj2);
634         x3r.mul(w3r).sub(x3i.mul(w3i)).intoArray(ret, j + dj3);
635         x4r.mul(w4r).sub(x4i.mul(w4i)).intoArray(ret, j + dj4);
636         x5r.mul(w5r).sub(x5i.mul(w5i)).intoArray(ret, j + dj5);
637         x6r.mul(w6r).sub(x6i.mul(w6i)).intoArray(ret, j + dj6);
638         x1i.mul(w1r).add(x1r.mul(w1i)).intoArray(ret, j + dj + im);
639         x2i.mul(w2r).add(x2r.mul(w2i)).intoArray(ret, j + dj2 + im);
640         x3i.mul(w3r).add(x3r.mul(w3i)).intoArray(ret, j + dj3 + im);
641         x4i.mul(w4r).add(x4r.mul(w4i)).intoArray(ret, j + dj4 + im);
642         x5i.mul(w5r).add(x5r.mul(w5i)).intoArray(ret, j + dj5 + im);
643         x6i.mul(w6r).add(x6r.mul(w6i)).intoArray(ret, j + dj6 + im);
644       }
645     }
646   }
647 }