View Javadoc
1   // ******************************************************************************
2   //
3   // Title:       Force Field X.
4   // Description: Force Field X - Software for Molecular Biophysics.
5   // Copyright:   Copyright (c) Michael J. Schnieders 2001-2025.
6   //
7   // This file is part of Force Field X.
8   //
9   // Force Field X is free software; you can redistribute it and/or modify it
10  // under the terms of the GNU General Public License version 3 as published by
11  // the Free Software Foundation.
12  //
13  // Force Field X is distributed in the hope that it will be useful, but WITHOUT
14  // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  // FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
16  // details.
17  //
18  // You should have received a copy of the GNU General Public License along with
19  // Force Field X; if not, write to the Free Software Foundation, Inc., 59 Temple
20  // Place, Suite 330, Boston, MA 02111-1307 USA
21  //
22  // Linking this library statically or dynamically with other modules is making a
23  // combined work based on this library. Thus, the terms and conditions of the
24  // GNU General Public License cover the whole combination.
25  //
26  // As a special exception, the copyright holders of this library give you
27  // permission to link this library with independent modules to produce an
28  // executable, regardless of the license terms of these independent modules, and
29  // to copy and distribute the resulting executable under terms of your choice,
30  // provided that you also meet, for each linked independent module, the terms
31  // and conditions of the license of that module. An independent module is a
32  // module which is not derived from or based on this library. If you modify this
33  // library, you may extend this exception to your version of the library, but
34  // you are not obligated to do so. If you do not wish to do so, delete this
35  // exception statement from your version.
36  //
37  // ******************************************************************************
38  package ffx.numerics.fft;
39  
40  import jdk.incubator.vector.DoubleVector;
41  
42  import static jdk.incubator.vector.DoubleVector.broadcast;
43  import static jdk.incubator.vector.DoubleVector.fromArray;
44  import static org.apache.commons.math3.util.FastMath.PI;
45  import static org.apache.commons.math3.util.FastMath.cos;
46  import static org.apache.commons.math3.util.FastMath.sin;
47  
48  /**
49   * The MixedRadixFactor7 class handles factors of 7 in the FFT.
50   */
51  public class MixedRadixFactor7 extends MixedRadixFactor {
52  
53    private static final double oneThird = 1.0 / 3.0;
54    private static final double sin2PI_7 = sin(2.0 * PI / 7.0);
55    private static final double sin4PI_7 = sin(4.0 * PI / 7.0);
56    private static final double sin6PI_7 = sin(6.0 * PI / 7.0);
57    private static final double cos2PI_7 = cos(2.0 * PI / 7.0);
58    private static final double cos4PI_7 = cos(4.0 * PI / 7.0);
59    private static final double cos6PI_7 = cos(6.0 * PI / 7.0);
60    private static final double c1 = cos2PI_7;
61    private static final double c2 = cos4PI_7;
62    private static final double c3 = cos6PI_7;
63    private static final double v1 = (c1 + c2 + c3) * oneThird - 1.0;
64    private static final double v2 = (2.0 * c1 - c2 - c3) * oneThird;
65    private static final double v3 = (c1 - 2.0 * c2 + c3) * oneThird;
66    private static final double v4 = (c1 + c2 - 2.0 * c3) * oneThird;
67  
68    private final int di2;
69    private final int di3;
70    private final int di4;
71    private final int di5;
72    private final int di6;
73    private final int dj2;
74    private final int dj3;
75    private final int dj4;
76    private final int dj5;
77    private final int dj6;
78  
79    /**
80     * Construct a MixedRadixFactor7.
81     *
82     * @param passConstants PassConstants.
83     */
84    public MixedRadixFactor7(PassConstants passConstants) {
85      super(passConstants);
86      di2 = 2 * di;
87      di3 = 3 * di;
88      di4 = 4 * di;
89      di5 = 5 * di;
90      di6 = 6 * di;
91      dj2 = 2 * dj;
92      dj3 = 3 * dj;
93      dj4 = 4 * dj;
94      dj5 = 5 * dj;
95      dj6 = 6 * dj;
96    }
97  
98    /**
99     * Handle factors of 7.
100    *
101    * @param passData PassData.
102    */
103   protected void passScalar(PassData passData) {
104     final double[] data = passData.in;
105     final double[] ret = passData.out;
106     int sign = passData.sign;
107     int i = passData.inOffset;
108     int j = passData.outOffset;
109     final double s1 = (-sign) * sin2PI_7;
110     final double s2 = (-sign) * sin4PI_7;
111     final double s3 = (-sign) * sin6PI_7;
112     final double v5 = (s1 + s2 - s3) * oneThird;
113     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
114     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
115     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
116     // First pass of the 7-point FFT has no twiddle factors.
117     for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
118       final double z0r = data[i];
119       final double z1r = data[i + di];
120       final double z2r = data[i + di2];
121       final double z3r = data[i + di3];
122       final double z4r = data[i + di4];
123       final double z5r = data[i + di5];
124       final double z6r = data[i + di6];
125       final double z0i = data[i + im];
126       final double z1i = data[i + di + im];
127       final double z2i = data[i + di2 + im];
128       final double z3i = data[i + di3 + im];
129       final double z4i = data[i + di4 + im];
130       final double z5i = data[i + di5 + im];
131       final double z6i = data[i + di6 + im];
132       final double t0r = z1r + z6r;
133       final double t0i = z1i + z6i;
134       final double t1r = z1r - z6r;
135       final double t1i = z1i - z6i;
136       final double t2r = z2r + z5r;
137       final double t2i = z2i + z5i;
138       final double t3r = z2r - z5r;
139       final double t3i = z2i - z5i;
140       final double t4r = z4r + z3r;
141       final double t4i = z4i + z3i;
142       final double t5r = z4r - z3r;
143       final double t5i = z4i - z3i;
144       final double t6r = t2r + t0r;
145       final double t6i = t2i + t0i;
146       final double t7r = t5r + t3r;
147       final double t7i = t5i + t3i;
148       final double b0r = z0r + t6r + t4r;
149       final double b0i = z0i + t6i + t4i;
150       final double b1r = v1 * (t6r + t4r);
151       final double b1i = v1 * (t6i + t4i);
152       final double b2r = v2 * (t0r - t4r);
153       final double b2i = v2 * (t0i - t4i);
154       final double b3r = v3 * (t4r - t2r);
155       final double b3i = v3 * (t4i - t2i);
156       final double b4r = v4 * (t2r - t0r);
157       final double b4i = v4 * (t2i - t0i);
158       final double b5r = v5 * (t7r + t1r);
159       final double b5i = v5 * (t7i + t1i);
160       final double b6r = v6 * (t1r - t5r);
161       final double b6i = v6 * (t1i - t5i);
162       final double b7r = v7 * (t5r - t3r);
163       final double b7i = v7 * (t5i - t3i);
164       final double b8r = v8 * (t3r - t1r);
165       final double b8i = v8 * (t3i - t1i);
166       final double u0r = b0r + b1r;
167       final double u0i = b0i + b1i;
168       final double u1r = b2r + b3r;
169       final double u1i = b2i + b3i;
170       final double u2r = b4r - b3r;
171       final double u2i = b4i - b3i;
172       final double u3r = -b2r - b4r;
173       final double u3i = -b2i - b4i;
174       final double u4r = b6r + b7r;
175       final double u4i = b6i + b7i;
176       final double u5r = b8r - b7r;
177       final double u5i = b8i - b7i;
178       final double u6r = -b8r - b6r;
179       final double u6i = -b8i - b6i;
180       final double u7r = u0r + u1r;
181       final double u7i = u0i + u1i;
182       final double u8r = u0r + u2r;
183       final double u8i = u0i + u2i;
184       final double u9r = u0r + u3r;
185       final double u9i = u0i + u3i;
186       final double u10r = u4r + b5r;
187       final double u10i = u4i + b5i;
188       final double u11r = u5r + b5r;
189       final double u11i = u5i + b5i;
190       final double u12r = u6r + b5r;
191       final double u12i = u6i + b5i;
192       ret[j] = b0r;
193       ret[j + im] = b0i;
194       ret[j + dj] = u7r + u10i;
195       ret[j + dj + im] = u7i - u10r;
196       ret[j + dj2] = u9r + u12i;
197       ret[j + dj2 + im] = u9i - u12r;
198       ret[j + dj3] = u8r - u11i;
199       ret[j + dj3 + im] = u8i + u11r;
200       ret[j + dj4] = u8r + u11i;
201       ret[j + dj4 + im] = u8i - u11r;
202       ret[j + dj5] = u9r - u12i;
203       ret[j + dj5 + im] = u9i + u12r;
204       ret[j + dj6] = u7r - u10i;
205       ret[j + dj6 + im] = u7i + u10r;
206     }
207 
208     j += jstep;
209     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
210 //      final double[] twids = twiddles[k];
211 //      final double w1r = twids[0];
212 //      final double w1i = -sign * twids[1];
213 //      final double w2r = twids[2];
214 //      final double w2i = -sign * twids[3];
215 //      final double w3r = twids[4];
216 //      final double w3i = -sign * twids[5];
217 //      final double w4r = twids[6];
218 //      final double w4i = -sign * twids[7];
219 //      final double w5r = twids[8];
220 //      final double w5i = -sign * twids[9];
221 //      final double w6r = twids[10];
222 //      final double w6i = -sign * twids[11];
223       final int index = k * 6;
224       final double w1r = wr[index];
225       final double w2r = wr[index + 1];
226       final double w3r = wr[index + 2];
227       final double w4r = wr[index + 3];
228       final double w5r = wr[index + 4];
229       final double w6r = wr[index + 5];
230       final double w1i = -sign * wi[index];
231       final double w2i = -sign * wi[index + 1];
232       final double w3i = -sign * wi[index + 2];
233       final double w4i = -sign * wi[index + 3];
234       final double w5i = -sign * wi[index + 4];
235       final double w6i = -sign * wi[index + 5];
236       for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
237         final double z0r = data[i];
238         final double z1r = data[i + di];
239         final double z2r = data[i + di2];
240         final double z3r = data[i + di3];
241         final double z4r = data[i + di4];
242         final double z5r = data[i + di5];
243         final double z6r = data[i + di6];
244         final double z0i = data[i + im];
245         final double z1i = data[i + di + im];
246         final double z2i = data[i + di2 + im];
247         final double z3i = data[i + di3 + im];
248         final double z4i = data[i + di4 + im];
249         final double z5i = data[i + di5 + im];
250         final double z6i = data[i + di6 + im];
251         final double t0r = z1r + z6r;
252         final double t0i = z1i + z6i;
253         final double t1r = z1r - z6r;
254         final double t1i = z1i - z6i;
255         final double t2r = z2r + z5r;
256         final double t2i = z2i + z5i;
257         final double t3r = z2r - z5r;
258         final double t3i = z2i - z5i;
259         final double t4r = z4r + z3r;
260         final double t4i = z4i + z3i;
261         final double t5r = z4r - z3r;
262         final double t5i = z4i - z3i;
263         final double t6r = t2r + t0r;
264         final double t6i = t2i + t0i;
265         final double t7r = t5r + t3r;
266         final double t7i = t5i + t3i;
267         final double b0r = z0r + t6r + t4r;
268         final double b0i = z0i + t6i + t4i;
269         final double b1r = v1 * (t6r + t4r);
270         final double b1i = v1 * (t6i + t4i);
271         final double b2r = v2 * (t0r - t4r);
272         final double b2i = v2 * (t0i - t4i);
273         final double b3r = v3 * (t4r - t2r);
274         final double b3i = v3 * (t4i - t2i);
275         final double b4r = v4 * (t2r - t0r);
276         final double b4i = v4 * (t2i - t0i);
277         final double b5r = v5 * (t7r + t1r);
278         final double b5i = v5 * (t7i + t1i);
279         final double b6r = v6 * (t1r - t5r);
280         final double b6i = v6 * (t1i - t5i);
281         final double b7r = v7 * (t5r - t3r);
282         final double b7i = v7 * (t5i - t3i);
283         final double b8r = v8 * (t3r - t1r);
284         final double b8i = v8 * (t3i - t1i);
285         final double u0r = b0r + b1r;
286         final double u0i = b0i + b1i;
287         final double u1r = b2r + b3r;
288         final double u1i = b2i + b3i;
289         final double u2r = b4r - b3r;
290         final double u2i = b4i - b3i;
291         final double u3r = -b2r - b4r;
292         final double u3i = -b2i - b4i;
293         final double u4r = b6r + b7r;
294         final double u4i = b6i + b7i;
295         final double u5r = b8r - b7r;
296         final double u5i = b8i - b7i;
297         final double u6r = -b8r - b6r;
298         final double u6i = -b8i - b6i;
299         final double u7r = u0r + u1r;
300         final double u7i = u0i + u1i;
301         final double u8r = u0r + u2r;
302         final double u8i = u0i + u2i;
303         final double u9r = u0r + u3r;
304         final double u9i = u0i + u3i;
305         final double u10r = u4r + b5r;
306         final double u10i = u4i + b5i;
307         final double u11r = u5r + b5r;
308         final double u11i = u5i + b5i;
309         final double u12r = u6r + b5r;
310         final double u12i = u6i + b5i;
311         ret[j] = b0r;
312         ret[j + im] = b0i;
313         multiplyAndStore(u7r + u10i, u7i - u10r, w1r, w1i, ret, j + dj, j + dj + im);
314         multiplyAndStore(u9r + u12i, u9i - u12r, w2r, w2i, ret, j + dj2, j + dj2 + im);
315         multiplyAndStore(u8r - u11i, u8i + u11r, w3r, w3i, ret, j + dj3, j + dj3 + im);
316         multiplyAndStore(u8r + u11i, u8i - u11r, w4r, w4i, ret, j + dj4, j + dj4 + im);
317         multiplyAndStore(u9r - u12i, u9i + u12r, w5r, w5i, ret, j + dj5, j + dj5 + im);
318         multiplyAndStore(u7r - u10i, u7i + u10r, w6r, w6i, ret, j + dj6, j + dj6 + im);
319       }
320     }
321   }
322 
323   /**
324    * Handle factors of 6 using SIMD vectors.
325    *
326    * @param passData PassData.
327    */
328   @Override
329   protected void passSIMD(PassData passData) {
330     if (im == 1) {
331       // If the inner loop limit is not divisible by the loop increment, use the scalar method.
332       if (innerLoopLimit % INTERLEAVED_LOOP != 0) {
333         passScalar(passData);
334       } else {
335         interleaved(passData);
336       }
337     } else {
338       // If the inner loop limit is not divisible by the loop increment, use the scalar method.
339       if (innerLoopLimit % BLOCK_LOOP != 0) {
340         passScalar(passData);
341       } else {
342         blocked(passData);
343       }
344     }
345   }
346 
347   /**
348    * Handle factors of 7.
349    *
350    * @param passData PassData.
351    */
352   private void interleaved(PassData passData) {
353     final double[] data = passData.in;
354     final double[] ret = passData.out;
355     int sign = passData.sign;
356     int i = passData.inOffset;
357     int j = passData.outOffset;
358     final double s1 = (-sign) * sin2PI_7;
359     final double s2 = (-sign) * sin4PI_7;
360     final double s3 = (-sign) * sin6PI_7;
361     final double v5 = (s1 + s2 - s3) * oneThird;
362     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
363     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
364     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
365     // First pass of the 7-point FFT has no twiddle factors.
366     for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP, i += LENGTH, j += LENGTH) {
367       DoubleVector
368           z0 = fromArray(DOUBLE_SPECIES, data, i),
369           z1 = fromArray(DOUBLE_SPECIES, data, i + di),
370           z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
371           z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
372           z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
373           z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
374           z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
375       DoubleVector
376           t0 = z1.add(z6),
377           t1 = z1.sub(z6),
378           t2 = z2.add(z5),
379           t3 = z2.sub(z5),
380           t4 = z4.add(z3),
381           t5 = z4.sub(z3),
382           t6 = t2.add(t0),
383           t7 = t5.add(t3);
384       DoubleVector
385           b0 = z0.add(t6).add(t4),
386           b1 = t6.add(t4).mul(v1),
387           b2 = t0.sub(t4).mul(v2),
388           b3 = t4.sub(t2).mul(v3),
389           b4 = t2.sub(t0).mul(v4),
390           b5 = t7.add(t1).mul(v5),
391           b6 = t1.sub(t5).mul(v6),
392           b7 = t5.sub(t3).mul(v7),
393           b8 = t3.sub(t1).mul(v8);
394       DoubleVector
395           u0 = b0.add(b1),
396           u1 = b2.add(b3),
397           u2 = b4.sub(b3),
398           u3 = b2.add(b4).neg(),
399           u4 = b6.add(b7),
400           u5 = b8.sub(b7),
401           u6 = b8.add(b6).neg(),
402           u7 = u0.add(u1),
403           u8 = u0.add(u2),
404           u9 = u0.add(u3),
405           u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
406           u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
407           u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
408       b0.intoArray(ret, j);
409       u7.add(u10.mul(NEGATE_IM)).intoArray(ret, j + dj);
410       u9.add(u12.mul(NEGATE_IM)).intoArray(ret, j + dj2);
411       u8.add(u11.mul(NEGATE_RE)).intoArray(ret, j + dj3);
412       u8.add(u11.mul(NEGATE_IM)).intoArray(ret, j + dj4);
413       u9.add(u12.mul(NEGATE_RE)).intoArray(ret, j + dj5);
414       u7.add(u10.mul(NEGATE_RE)).intoArray(ret, j + dj6);
415     }
416 
417     j += jstep;
418     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
419 //      final double[] twids = twiddles[k];
420 //      DoubleVector
421 //          w1r = broadcast(DOUBLE_SPECIES, twids[0]),
422 //          w1i = broadcast(DOUBLE_SPECIES, -sign * twids[1]).mul(NEGATE_IM),
423 //          w2r = broadcast(DOUBLE_SPECIES, twids[2]),
424 //          w2i = broadcast(DOUBLE_SPECIES, -sign * twids[3]).mul(NEGATE_IM),
425 //          w3r = broadcast(DOUBLE_SPECIES, twids[4]),
426 //          w3i = broadcast(DOUBLE_SPECIES, -sign * twids[5]).mul(NEGATE_IM),
427 //          w4r = broadcast(DOUBLE_SPECIES, twids[6]),
428 //          w4i = broadcast(DOUBLE_SPECIES, -sign * twids[7]).mul(NEGATE_IM),
429 //          w5r = broadcast(DOUBLE_SPECIES, twids[8]),
430 //          w5i = broadcast(DOUBLE_SPECIES, -sign * twids[9]).mul(NEGATE_IM),
431 //          w6r = broadcast(DOUBLE_SPECIES, twids[10]),
432 //          w6i = broadcast(DOUBLE_SPECIES, -sign * twids[11]).mul(NEGATE_IM);
433       final int index = k * 6;
434       final DoubleVector
435           w1r = broadcast(DOUBLE_SPECIES, wr[index]),
436           w2r = broadcast(DOUBLE_SPECIES, wr[index + 1]),
437           w3r = broadcast(DOUBLE_SPECIES, wr[index + 2]),
438           w4r = broadcast(DOUBLE_SPECIES, wr[index + 3]),
439           w5r = broadcast(DOUBLE_SPECIES, wr[index + 4]),
440           w6r = broadcast(DOUBLE_SPECIES, wr[index + 5]),
441           w1i = broadcast(DOUBLE_SPECIES, -sign * wi[index]).mul(NEGATE_IM),
442           w2i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 1]).mul(NEGATE_IM),
443           w3i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 2]).mul(NEGATE_IM),
444           w4i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 3]).mul(NEGATE_IM),
445           w5i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 4]).mul(NEGATE_IM),
446           w6i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 5]).mul(NEGATE_IM);
447       for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP, i += LENGTH, j += LENGTH) {
448         DoubleVector
449             z0 = fromArray(DOUBLE_SPECIES, data, i),
450             z1 = fromArray(DOUBLE_SPECIES, data, i + di),
451             z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
452             z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
453             z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
454             z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
455             z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
456         DoubleVector
457             t0 = z1.add(z6),
458             t1 = z1.sub(z6),
459             t2 = z2.add(z5),
460             t3 = z2.sub(z5),
461             t4 = z4.add(z3),
462             t5 = z4.sub(z3),
463             t6 = t2.add(t0),
464             t7 = t5.add(t3);
465         DoubleVector
466             b0 = z0.add(t6).add(t4),
467             b1 = t6.add(t4).mul(v1),
468             b2 = t0.sub(t4).mul(v2),
469             b3 = t4.sub(t2).mul(v3),
470             b4 = t2.sub(t0).mul(v4),
471             b5 = t7.add(t1).mul(v5),
472             b6 = t1.sub(t5).mul(v6),
473             b7 = t5.sub(t3).mul(v7),
474             b8 = t3.sub(t1).mul(v8);
475         DoubleVector
476             u0 = b0.add(b1),
477             u1 = b2.add(b3),
478             u2 = b4.sub(b3),
479             u3 = b2.add(b4).neg(),
480             u4 = b6.add(b7),
481             u5 = b8.sub(b7),
482             u6 = b8.add(b6).neg(),
483             u7 = u0.add(u1),
484             u8 = u0.add(u2),
485             u9 = u0.add(u3),
486             u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
487             u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
488             u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
489         b0.intoArray(ret, j);
490         DoubleVector
491             x1 = u7.add(u10.mul(NEGATE_IM)),
492             x2 = u9.add(u12.mul(NEGATE_IM)),
493             x3 = u8.add(u11.mul(NEGATE_RE)),
494             x4 = u8.add(u11.mul(NEGATE_IM)),
495             x5 = u9.add(u12.mul(NEGATE_RE)),
496             x6 = u7.add(u10.mul(NEGATE_RE));
497         w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj);
498         w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj2);
499         w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj3);
500         w4r.mul(x4).add(w4i.mul(x4).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj4);
501         w5r.mul(x5).add(w5i.mul(x5).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj5);
502         w6r.mul(x6).add(w6i.mul(x6).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj6);
503       }
504     }
505   }
506 
507   /**
508    * Handle factors of 7.
509    *
510    * @param passData PassData.
511    */
512   private void blocked(PassData passData) {
513     final double[] data = passData.in;
514     final double[] ret = passData.out;
515     int sign = passData.sign;
516     int i = passData.inOffset;
517     int j = passData.outOffset;
518     final double s1 = (-sign) * sin2PI_7;
519     final double s2 = (-sign) * sin4PI_7;
520     final double s3 = (-sign) * sin6PI_7;
521     final double v5 = (s1 + s2 - s3) * oneThird;
522     final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
523     final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
524     final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
525     // First pass of the 7-point FFT has no twiddle factors.
526     for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
527       final DoubleVector
528           z0r = fromArray(DOUBLE_SPECIES, data, i),
529           z1r = fromArray(DOUBLE_SPECIES, data, i + di),
530           z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
531           z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
532           z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
533           z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
534           z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
535           z0i = fromArray(DOUBLE_SPECIES, data, i + im),
536           z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
537           z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
538           z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
539           z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
540           z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
541           z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
542       final DoubleVector
543           t0r = z1r.add(z6r), t0i = z1i.add(z6i),
544           t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
545           t2r = z2r.add(z5r), t2i = z2i.add(z5i),
546           t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
547           t4r = z4r.add(z3r), t4i = z4i.add(z3i),
548           t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
549           t6r = t2r.add(t0r), t6i = t2i.add(t0i),
550           t7r = t5r.add(t3r), t7i = t5i.add(t3i);
551       final DoubleVector
552           b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
553           b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
554           b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
555           b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
556           b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
557           b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
558           b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
559           b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
560           b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
561       final DoubleVector
562           u0r = b0r.add(b1r), u0i = b0i.add(b1i),
563           u1r = b2r.add(b3r), u1i = b2i.add(b3i),
564           u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
565           u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
566           u4r = b6r.add(b7r), u4i = b6i.add(b7i),
567           u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
568           u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
569           u7r = u0r.add(u1r), u7i = u0i.add(u1i),
570           u8r = u0r.add(u2r), u8i = u0i.add(u2i),
571           u9r = u0r.add(u3r), u9i = u0i.add(u3i),
572           u10r = u4r.add(b5r), u10i = u4i.add(b5i),
573           u11r = u5r.add(b5r), u11i = u5i.add(b5i),
574           u12r = u6r.add(b5r), u12i = u6i.add(b5i);
575       b0r.intoArray(ret, j);
576       b0i.intoArray(ret, j + im);
577       u7r.add(u10i).intoArray(ret, j + dj);
578       u7i.sub(u10r).intoArray(ret, j + dj + im);
579       u9r.add(u12i).intoArray(ret, j + dj2);
580       u9i.sub(u12r).intoArray(ret, j + dj2 + im);
581       u8r.sub(u11i).intoArray(ret, j + dj3);
582       u8i.add(u11r).intoArray(ret, j + dj3 + im);
583       u8r.add(u11i).intoArray(ret, j + dj4);
584       u8i.sub(u11r).intoArray(ret, j + dj4 + im);
585       u9r.sub(u12i).intoArray(ret, j + dj5);
586       u9i.add(u12r).intoArray(ret, j + dj5 + im);
587       u7r.sub(u10i).intoArray(ret, j + dj6);
588       u7i.add(u10r).intoArray(ret, j + dj6 + im);
589     }
590 
591     j += jstep;
592     for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
593 //      final double[] twids = twiddles[k];
594 //      DoubleVector
595 //          w1r = broadcast(DOUBLE_SPECIES, twids[0]),
596 //          w1i = broadcast(DOUBLE_SPECIES, -sign * twids[1]),
597 //          w2r = broadcast(DOUBLE_SPECIES, twids[2]),
598 //          w2i = broadcast(DOUBLE_SPECIES, -sign * twids[3]),
599 //          w3r = broadcast(DOUBLE_SPECIES, twids[4]),
600 //          w3i = broadcast(DOUBLE_SPECIES, -sign * twids[5]),
601 //          w4r = broadcast(DOUBLE_SPECIES, twids[6]),
602 //          w4i = broadcast(DOUBLE_SPECIES, -sign * twids[7]),
603 //          w5r = broadcast(DOUBLE_SPECIES, twids[8]),
604 //          w5i = broadcast(DOUBLE_SPECIES, -sign * twids[9]),
605 //          w6r = broadcast(DOUBLE_SPECIES, twids[10]),
606 //          w6i = broadcast(DOUBLE_SPECIES, -sign * twids[11]);
607       final int index = k * 6;
608       final DoubleVector
609           w1r = broadcast(DOUBLE_SPECIES, wr[index]),
610           w2r = broadcast(DOUBLE_SPECIES, wr[index + 1]),
611           w3r = broadcast(DOUBLE_SPECIES, wr[index + 2]),
612           w4r = broadcast(DOUBLE_SPECIES, wr[index + 3]),
613           w5r = broadcast(DOUBLE_SPECIES, wr[index + 4]),
614           w6r = broadcast(DOUBLE_SPECIES, wr[index + 5]),
615           w1i = broadcast(DOUBLE_SPECIES, -sign * wi[index]),
616           w2i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 1]),
617           w3i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 2]),
618           w4i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 3]),
619           w5i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 4]),
620           w6i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 5]);
621       for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
622         final DoubleVector
623             z0r = fromArray(DOUBLE_SPECIES, data, i),
624             z1r = fromArray(DOUBLE_SPECIES, data, i + di),
625             z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
626             z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
627             z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
628             z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
629             z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
630             z0i = fromArray(DOUBLE_SPECIES, data, i + im),
631             z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
632             z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
633             z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
634             z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
635             z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
636             z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
637         final DoubleVector
638             t0r = z1r.add(z6r), t0i = z1i.add(z6i),
639             t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
640             t2r = z2r.add(z5r), t2i = z2i.add(z5i),
641             t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
642             t4r = z4r.add(z3r), t4i = z4i.add(z3i),
643             t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
644             t6r = t2r.add(t0r), t6i = t2i.add(t0i),
645             t7r = t5r.add(t3r), t7i = t5i.add(t3i);
646         final DoubleVector
647             b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
648             b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
649             b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
650             b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
651             b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
652             b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
653             b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
654             b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
655             b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
656         final DoubleVector
657             u0r = b0r.add(b1r), u0i = b0i.add(b1i),
658             u1r = b2r.add(b3r), u1i = b2i.add(b3i),
659             u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
660             u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
661             u4r = b6r.add(b7r), u4i = b6i.add(b7i),
662             u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
663             u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
664             u7r = u0r.add(u1r), u7i = u0i.add(u1i),
665             u8r = u0r.add(u2r), u8i = u0i.add(u2i),
666             u9r = u0r.add(u3r), u9i = u0i.add(u3i),
667             u10r = u4r.add(b5r), u10i = u4i.add(b5i),
668             u11r = u5r.add(b5r), u11i = u5i.add(b5i),
669             u12r = u6r.add(b5r), u12i = u6i.add(b5i);
670         b0r.intoArray(ret, j);
671         b0i.intoArray(ret, j + im);
672         DoubleVector
673             x1r = u7r.add(u10i), x1i = u7i.sub(u10r),
674             x2r = u9r.add(u12i), x2i = u9i.sub(u12r),
675             x3r = u8r.sub(u11i), x3i = u8i.add(u11r),
676             x4r = u8r.add(u11i), x4i = u8i.sub(u11r),
677             x5r = u9r.sub(u12i), x5i = u9i.add(u12r),
678             x6r = u7r.sub(u10i), x6i = u7i.add(u10r);
679         w1r.mul(x1r).add(w1i.neg().mul(x1i)).intoArray(ret, j + dj);
680         w2r.mul(x2r).add(w2i.neg().mul(x2i)).intoArray(ret, j + dj2);
681         w3r.mul(x3r).add(w3i.neg().mul(x3i)).intoArray(ret, j + dj3);
682         w4r.mul(x4r).add(w4i.neg().mul(x4i)).intoArray(ret, j + dj4);
683         w5r.mul(x5r).add(w5i.neg().mul(x5i)).intoArray(ret, j + dj5);
684         w6r.mul(x6r).add(w6i.neg().mul(x6i)).intoArray(ret, j + dj6);
685         w1r.mul(x1i).add(w1i.mul(x1r)).intoArray(ret, j + dj + im);
686         w2r.mul(x2i).add(w2i.mul(x2r)).intoArray(ret, j + dj2 + im);
687         w3r.mul(x3i).add(w3i.mul(x3r)).intoArray(ret, j + dj3 + im);
688         w4r.mul(x4i).add(w4i.mul(x4r)).intoArray(ret, j + dj4 + im);
689         w5r.mul(x5i).add(w5i.mul(x5r)).intoArray(ret, j + dj5 + im);
690         w6r.mul(x6i).add(w6i.mul(x6r)).intoArray(ret, j + dj6 + im);
691       }
692     }
693   }
694 }