1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 package ffx.numerics.fft;
39
40 import jdk.incubator.vector.DoubleVector;
41
42 import static jdk.incubator.vector.DoubleVector.broadcast;
43 import static jdk.incubator.vector.DoubleVector.fromArray;
44 import static org.apache.commons.math3.util.FastMath.PI;
45 import static org.apache.commons.math3.util.FastMath.cos;
46 import static org.apache.commons.math3.util.FastMath.sin;
47
48
49
50
51 public class MixedRadixFactor7 extends MixedRadixFactor {
52
53 private static final double oneThird = 1.0 / 3.0;
54 private static final double sin2PI_7 = sin(2.0 * PI / 7.0);
55 private static final double sin4PI_7 = sin(4.0 * PI / 7.0);
56 private static final double sin6PI_7 = sin(6.0 * PI / 7.0);
57 private static final double cos2PI_7 = cos(2.0 * PI / 7.0);
58 private static final double cos4PI_7 = cos(4.0 * PI / 7.0);
59 private static final double cos6PI_7 = cos(6.0 * PI / 7.0);
60 private static final double c1 = cos2PI_7;
61 private static final double c2 = cos4PI_7;
62 private static final double c3 = cos6PI_7;
63 private static final double v1 = (c1 + c2 + c3) * oneThird - 1.0;
64 private static final double v2 = (2.0 * c1 - c2 - c3) * oneThird;
65 private static final double v3 = (c1 - 2.0 * c2 + c3) * oneThird;
66 private static final double v4 = (c1 + c2 - 2.0 * c3) * oneThird;
67
68 private final int di2;
69 private final int di3;
70 private final int di4;
71 private final int di5;
72 private final int di6;
73 private final int dj2;
74 private final int dj3;
75 private final int dj4;
76 private final int dj5;
77 private final int dj6;
78
79
80
81
82
83
84 public MixedRadixFactor7(PassConstants passConstants) {
85 super(passConstants);
86 di2 = 2 * di;
87 di3 = 3 * di;
88 di4 = 4 * di;
89 di5 = 5 * di;
90 di6 = 6 * di;
91 dj2 = 2 * dj;
92 dj3 = 3 * dj;
93 dj4 = 4 * dj;
94 dj5 = 5 * dj;
95 dj6 = 6 * dj;
96 }
97
98
99
100
101
102
103 protected void passScalar(PassData passData) {
104 final double[] data = passData.in;
105 final double[] ret = passData.out;
106 int sign = passData.sign;
107 int i = passData.inOffset;
108 int j = passData.outOffset;
109 final double s1 = (-sign) * sin2PI_7;
110 final double s2 = (-sign) * sin4PI_7;
111 final double s3 = (-sign) * sin6PI_7;
112 final double v5 = (s1 + s2 - s3) * oneThird;
113 final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
114 final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
115 final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
116
117 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
118 final double z0r = data[i];
119 final double z1r = data[i + di];
120 final double z2r = data[i + di2];
121 final double z3r = data[i + di3];
122 final double z4r = data[i + di4];
123 final double z5r = data[i + di5];
124 final double z6r = data[i + di6];
125 final double z0i = data[i + im];
126 final double z1i = data[i + di + im];
127 final double z2i = data[i + di2 + im];
128 final double z3i = data[i + di3 + im];
129 final double z4i = data[i + di4 + im];
130 final double z5i = data[i + di5 + im];
131 final double z6i = data[i + di6 + im];
132 final double t0r = z1r + z6r;
133 final double t0i = z1i + z6i;
134 final double t1r = z1r - z6r;
135 final double t1i = z1i - z6i;
136 final double t2r = z2r + z5r;
137 final double t2i = z2i + z5i;
138 final double t3r = z2r - z5r;
139 final double t3i = z2i - z5i;
140 final double t4r = z4r + z3r;
141 final double t4i = z4i + z3i;
142 final double t5r = z4r - z3r;
143 final double t5i = z4i - z3i;
144 final double t6r = t2r + t0r;
145 final double t6i = t2i + t0i;
146 final double t7r = t5r + t3r;
147 final double t7i = t5i + t3i;
148 final double b0r = z0r + t6r + t4r;
149 final double b0i = z0i + t6i + t4i;
150 final double b1r = v1 * (t6r + t4r);
151 final double b1i = v1 * (t6i + t4i);
152 final double b2r = v2 * (t0r - t4r);
153 final double b2i = v2 * (t0i - t4i);
154 final double b3r = v3 * (t4r - t2r);
155 final double b3i = v3 * (t4i - t2i);
156 final double b4r = v4 * (t2r - t0r);
157 final double b4i = v4 * (t2i - t0i);
158 final double b5r = v5 * (t7r + t1r);
159 final double b5i = v5 * (t7i + t1i);
160 final double b6r = v6 * (t1r - t5r);
161 final double b6i = v6 * (t1i - t5i);
162 final double b7r = v7 * (t5r - t3r);
163 final double b7i = v7 * (t5i - t3i);
164 final double b8r = v8 * (t3r - t1r);
165 final double b8i = v8 * (t3i - t1i);
166 final double u0r = b0r + b1r;
167 final double u0i = b0i + b1i;
168 final double u1r = b2r + b3r;
169 final double u1i = b2i + b3i;
170 final double u2r = b4r - b3r;
171 final double u2i = b4i - b3i;
172 final double u3r = -b2r - b4r;
173 final double u3i = -b2i - b4i;
174 final double u4r = b6r + b7r;
175 final double u4i = b6i + b7i;
176 final double u5r = b8r - b7r;
177 final double u5i = b8i - b7i;
178 final double u6r = -b8r - b6r;
179 final double u6i = -b8i - b6i;
180 final double u7r = u0r + u1r;
181 final double u7i = u0i + u1i;
182 final double u8r = u0r + u2r;
183 final double u8i = u0i + u2i;
184 final double u9r = u0r + u3r;
185 final double u9i = u0i + u3i;
186 final double u10r = u4r + b5r;
187 final double u10i = u4i + b5i;
188 final double u11r = u5r + b5r;
189 final double u11i = u5i + b5i;
190 final double u12r = u6r + b5r;
191 final double u12i = u6i + b5i;
192 ret[j] = b0r;
193 ret[j + im] = b0i;
194 ret[j + dj] = u7r + u10i;
195 ret[j + dj + im] = u7i - u10r;
196 ret[j + dj2] = u9r + u12i;
197 ret[j + dj2 + im] = u9i - u12r;
198 ret[j + dj3] = u8r - u11i;
199 ret[j + dj3 + im] = u8i + u11r;
200 ret[j + dj4] = u8r + u11i;
201 ret[j + dj4 + im] = u8i - u11r;
202 ret[j + dj5] = u9r - u12i;
203 ret[j + dj5 + im] = u9i + u12r;
204 ret[j + dj6] = u7r - u10i;
205 ret[j + dj6 + im] = u7i + u10r;
206 }
207
208 j += jstep;
209 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
210 final double[] twids = twiddles[k];
211 final double w1r = twids[0];
212 final double w1i = -sign * twids[1];
213 final double w2r = twids[2];
214 final double w2i = -sign * twids[3];
215 final double w3r = twids[4];
216 final double w3i = -sign * twids[5];
217 final double w4r = twids[6];
218 final double w4i = -sign * twids[7];
219 final double w5r = twids[8];
220 final double w5i = -sign * twids[9];
221 final double w6r = twids[10];
222 final double w6i = -sign * twids[11];
223 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
224 final double z0r = data[i];
225 final double z1r = data[i + di];
226 final double z2r = data[i + di2];
227 final double z3r = data[i + di3];
228 final double z4r = data[i + di4];
229 final double z5r = data[i + di5];
230 final double z6r = data[i + di6];
231 final double z0i = data[i + im];
232 final double z1i = data[i + di + im];
233 final double z2i = data[i + di2 + im];
234 final double z3i = data[i + di3 + im];
235 final double z4i = data[i + di4 + im];
236 final double z5i = data[i + di5 + im];
237 final double z6i = data[i + di6 + im];
238 final double t0r = z1r + z6r;
239 final double t0i = z1i + z6i;
240 final double t1r = z1r - z6r;
241 final double t1i = z1i - z6i;
242 final double t2r = z2r + z5r;
243 final double t2i = z2i + z5i;
244 final double t3r = z2r - z5r;
245 final double t3i = z2i - z5i;
246 final double t4r = z4r + z3r;
247 final double t4i = z4i + z3i;
248 final double t5r = z4r - z3r;
249 final double t5i = z4i - z3i;
250 final double t6r = t2r + t0r;
251 final double t6i = t2i + t0i;
252 final double t7r = t5r + t3r;
253 final double t7i = t5i + t3i;
254 final double b0r = z0r + t6r + t4r;
255 final double b0i = z0i + t6i + t4i;
256 final double b1r = v1 * (t6r + t4r);
257 final double b1i = v1 * (t6i + t4i);
258 final double b2r = v2 * (t0r - t4r);
259 final double b2i = v2 * (t0i - t4i);
260 final double b3r = v3 * (t4r - t2r);
261 final double b3i = v3 * (t4i - t2i);
262 final double b4r = v4 * (t2r - t0r);
263 final double b4i = v4 * (t2i - t0i);
264 final double b5r = v5 * (t7r + t1r);
265 final double b5i = v5 * (t7i + t1i);
266 final double b6r = v6 * (t1r - t5r);
267 final double b6i = v6 * (t1i - t5i);
268 final double b7r = v7 * (t5r - t3r);
269 final double b7i = v7 * (t5i - t3i);
270 final double b8r = v8 * (t3r - t1r);
271 final double b8i = v8 * (t3i - t1i);
272 final double u0r = b0r + b1r;
273 final double u0i = b0i + b1i;
274 final double u1r = b2r + b3r;
275 final double u1i = b2i + b3i;
276 final double u2r = b4r - b3r;
277 final double u2i = b4i - b3i;
278 final double u3r = -b2r - b4r;
279 final double u3i = -b2i - b4i;
280 final double u4r = b6r + b7r;
281 final double u4i = b6i + b7i;
282 final double u5r = b8r - b7r;
283 final double u5i = b8i - b7i;
284 final double u6r = -b8r - b6r;
285 final double u6i = -b8i - b6i;
286 final double u7r = u0r + u1r;
287 final double u7i = u0i + u1i;
288 final double u8r = u0r + u2r;
289 final double u8i = u0i + u2i;
290 final double u9r = u0r + u3r;
291 final double u9i = u0i + u3i;
292 final double u10r = u4r + b5r;
293 final double u10i = u4i + b5i;
294 final double u11r = u5r + b5r;
295 final double u11i = u5i + b5i;
296 final double u12r = u6r + b5r;
297 final double u12i = u6i + b5i;
298 ret[j] = b0r;
299 ret[j + im] = b0i;
300 multiplyAndStore(u7r + u10i, u7i - u10r, w1r, w1i, ret, j + dj, j + dj + im);
301 multiplyAndStore(u9r + u12i, u9i - u12r, w2r, w2i, ret, j + dj2, j + dj2 + im);
302 multiplyAndStore(u8r - u11i, u8i + u11r, w3r, w3i, ret, j + dj3, j + dj3 + im);
303 multiplyAndStore(u8r + u11i, u8i - u11r, w4r, w4i, ret, j + dj4, j + dj4 + im);
304 multiplyAndStore(u9r - u12i, u9i + u12r, w5r, w5i, ret, j + dj5, j + dj5 + im);
305 multiplyAndStore(u7r - u10i, u7i + u10r, w6r, w6i, ret, j + dj6, j + dj6 + im);
306 }
307 }
308 }
309
310
311
312
313
314
315 @Override
316 protected void passSIMD(PassData passData) {
317 if (im == 1) {
318
319 if (innerLoopLimit % LOOP != 0) {
320 passScalar(passData);
321 } else {
322 interleaved(passData);
323 }
324 } else {
325
326 if (innerLoopLimit % BLOCK_LOOP != 0) {
327 passScalar(passData);
328 } else {
329 blocked(passData);
330 }
331 }
332 }
333
334
335
336
337
338
339 private void interleaved(PassData passData) {
340 final double[] data = passData.in;
341 final double[] ret = passData.out;
342 int sign = passData.sign;
343 int i = passData.inOffset;
344 int j = passData.outOffset;
345 final double s1 = (-sign) * sin2PI_7;
346 final double s2 = (-sign) * sin4PI_7;
347 final double s3 = (-sign) * sin6PI_7;
348 final double v5 = (s1 + s2 - s3) * oneThird;
349 final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
350 final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
351 final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
352
353 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP, i += LENGTH, j += LENGTH) {
354 DoubleVector
355 z0 = fromArray(DOUBLE_SPECIES, data, i),
356 z1 = fromArray(DOUBLE_SPECIES, data, i + di),
357 z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
358 z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
359 z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
360 z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
361 z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
362 DoubleVector
363 t0 = z1.add(z6),
364 t1 = z1.sub(z6),
365 t2 = z2.add(z5),
366 t3 = z2.sub(z5),
367 t4 = z4.add(z3),
368 t5 = z4.sub(z3),
369 t6 = t2.add(t0),
370 t7 = t5.add(t3);
371 DoubleVector
372 b0 = z0.add(t6).add(t4),
373 b1 = t6.add(t4).mul(v1),
374 b2 = t0.sub(t4).mul(v2),
375 b3 = t4.sub(t2).mul(v3),
376 b4 = t2.sub(t0).mul(v4),
377 b5 = t7.add(t1).mul(v5),
378 b6 = t1.sub(t5).mul(v6),
379 b7 = t5.sub(t3).mul(v7),
380 b8 = t3.sub(t1).mul(v8);
381 DoubleVector
382 u0 = b0.add(b1),
383 u1 = b2.add(b3),
384 u2 = b4.sub(b3),
385 u3 = b2.add(b4).neg(),
386 u4 = b6.add(b7),
387 u5 = b8.sub(b7),
388 u6 = b8.add(b6).neg(),
389 u7 = u0.add(u1),
390 u8 = u0.add(u2),
391 u9 = u0.add(u3),
392 u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
393 u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
394 u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
395 b0.intoArray(ret, j);
396 u7.add(u10.mul(NEGATE_IM)).intoArray(ret, j + dj);
397 u9.add(u12.mul(NEGATE_IM)).intoArray(ret, j + dj2);
398 u8.add(u11.mul(NEGATE_RE)).intoArray(ret, j + dj3);
399 u8.add(u11.mul(NEGATE_IM)).intoArray(ret, j + dj4);
400 u9.add(u12.mul(NEGATE_RE)).intoArray(ret, j + dj5);
401 u7.add(u10.mul(NEGATE_RE)).intoArray(ret, j + dj6);
402 }
403
404 j += jstep;
405 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
406 final double[] twids = twiddles[k];
407 DoubleVector
408 w1r = broadcast(DOUBLE_SPECIES, twids[0]),
409 w1i = broadcast(DOUBLE_SPECIES, -sign * twids[1]).mul(NEGATE_IM),
410 w2r = broadcast(DOUBLE_SPECIES, twids[2]),
411 w2i = broadcast(DOUBLE_SPECIES, -sign * twids[3]).mul(NEGATE_IM),
412 w3r = broadcast(DOUBLE_SPECIES, twids[4]),
413 w3i = broadcast(DOUBLE_SPECIES, -sign * twids[5]).mul(NEGATE_IM),
414 w4r = broadcast(DOUBLE_SPECIES, twids[6]),
415 w4i = broadcast(DOUBLE_SPECIES, -sign * twids[7]).mul(NEGATE_IM),
416 w5r = broadcast(DOUBLE_SPECIES, twids[8]),
417 w5i = broadcast(DOUBLE_SPECIES, -sign * twids[9]).mul(NEGATE_IM),
418 w6r = broadcast(DOUBLE_SPECIES, twids[10]),
419 w6i = broadcast(DOUBLE_SPECIES, -sign * twids[11]).mul(NEGATE_IM);
420 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP, i += LENGTH, j += LENGTH) {
421 DoubleVector
422 z0 = fromArray(DOUBLE_SPECIES, data, i),
423 z1 = fromArray(DOUBLE_SPECIES, data, i + di),
424 z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
425 z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
426 z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
427 z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
428 z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
429 DoubleVector
430 t0 = z1.add(z6),
431 t1 = z1.sub(z6),
432 t2 = z2.add(z5),
433 t3 = z2.sub(z5),
434 t4 = z4.add(z3),
435 t5 = z4.sub(z3),
436 t6 = t2.add(t0),
437 t7 = t5.add(t3);
438 DoubleVector
439 b0 = z0.add(t6).add(t4),
440 b1 = t6.add(t4).mul(v1),
441 b2 = t0.sub(t4).mul(v2),
442 b3 = t4.sub(t2).mul(v3),
443 b4 = t2.sub(t0).mul(v4),
444 b5 = t7.add(t1).mul(v5),
445 b6 = t1.sub(t5).mul(v6),
446 b7 = t5.sub(t3).mul(v7),
447 b8 = t3.sub(t1).mul(v8);
448 DoubleVector
449 u0 = b0.add(b1),
450 u1 = b2.add(b3),
451 u2 = b4.sub(b3),
452 u3 = b2.add(b4).neg(),
453 u4 = b6.add(b7),
454 u5 = b8.sub(b7),
455 u6 = b8.add(b6).neg(),
456 u7 = u0.add(u1),
457 u8 = u0.add(u2),
458 u9 = u0.add(u3),
459 u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
460 u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
461 u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
462 b0.intoArray(ret, j);
463 DoubleVector
464 x1 = u7.add(u10.mul(NEGATE_IM)),
465 x2 = u9.add(u12.mul(NEGATE_IM)),
466 x3 = u8.add(u11.mul(NEGATE_RE)),
467 x4 = u8.add(u11.mul(NEGATE_IM)),
468 x5 = u9.add(u12.mul(NEGATE_RE)),
469 x6 = u7.add(u10.mul(NEGATE_RE));
470 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj);
471 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj2);
472 w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj3);
473 w4r.mul(x4).add(w4i.mul(x4).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj4);
474 w5r.mul(x5).add(w5i.mul(x5).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj5);
475 w6r.mul(x6).add(w6i.mul(x6).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj6);
476 }
477 }
478 }
479
480
481
482
483
484
485 private void blocked(PassData passData) {
486 final double[] data = passData.in;
487 final double[] ret = passData.out;
488 int sign = passData.sign;
489 int i = passData.inOffset;
490 int j = passData.outOffset;
491 final double s1 = (-sign) * sin2PI_7;
492 final double s2 = (-sign) * sin4PI_7;
493 final double s3 = (-sign) * sin6PI_7;
494 final double v5 = (s1 + s2 - s3) * oneThird;
495 final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
496 final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
497 final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
498
499 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
500 final DoubleVector
501 z0r = fromArray(DOUBLE_SPECIES, data, i),
502 z1r = fromArray(DOUBLE_SPECIES, data, i + di),
503 z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
504 z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
505 z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
506 z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
507 z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
508 z0i = fromArray(DOUBLE_SPECIES, data, i + im),
509 z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
510 z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
511 z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
512 z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
513 z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
514 z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
515 final DoubleVector
516 t0r = z1r.add(z6r), t0i = z1i.add(z6i),
517 t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
518 t2r = z2r.add(z5r), t2i = z2i.add(z5i),
519 t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
520 t4r = z4r.add(z3r), t4i = z4i.add(z3i),
521 t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
522 t6r = t2r.add(t0r), t6i = t2i.add(t0i),
523 t7r = t5r.add(t3r), t7i = t5i.add(t3i);
524 final DoubleVector
525 b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
526 b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
527 b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
528 b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
529 b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
530 b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
531 b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
532 b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
533 b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
534 final DoubleVector
535 u0r = b0r.add(b1r), u0i = b0i.add(b1i),
536 u1r = b2r.add(b3r), u1i = b2i.add(b3i),
537 u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
538 u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
539 u4r = b6r.add(b7r), u4i = b6i.add(b7i),
540 u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
541 u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
542 u7r = u0r.add(u1r), u7i = u0i.add(u1i),
543 u8r = u0r.add(u2r), u8i = u0i.add(u2i),
544 u9r = u0r.add(u3r), u9i = u0i.add(u3i),
545 u10r = u4r.add(b5r), u10i = u4i.add(b5i),
546 u11r = u5r.add(b5r), u11i = u5i.add(b5i),
547 u12r = u6r.add(b5r), u12i = u6i.add(b5i);
548 b0r.intoArray(ret, j);
549 b0i.intoArray(ret, j + im);
550 u7r.add(u10i).intoArray(ret, j + dj);
551 u7i.sub(u10r).intoArray(ret, j + dj + im);
552 u9r.add(u12i).intoArray(ret, j + dj2);
553 u9i.sub(u12r).intoArray(ret, j + dj2 + im);
554 u8r.sub(u11i).intoArray(ret, j + dj3);
555 u8i.add(u11r).intoArray(ret, j + dj3 + im);
556 u8r.add(u11i).intoArray(ret, j + dj4);
557 u8i.sub(u11r).intoArray(ret, j + dj4 + im);
558 u9r.sub(u12i).intoArray(ret, j + dj5);
559 u9i.add(u12r).intoArray(ret, j + dj5 + im);
560 u7r.sub(u10i).intoArray(ret, j + dj6);
561 u7i.add(u10r).intoArray(ret, j + dj6 + im);
562 }
563
564 j += jstep;
565 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
566 final double[] twids = twiddles[k];
567 DoubleVector
568 w1r = broadcast(DOUBLE_SPECIES, twids[0]),
569 w1i = broadcast(DOUBLE_SPECIES, -sign * twids[1]),
570 w2r = broadcast(DOUBLE_SPECIES, twids[2]),
571 w2i = broadcast(DOUBLE_SPECIES, -sign * twids[3]),
572 w3r = broadcast(DOUBLE_SPECIES, twids[4]),
573 w3i = broadcast(DOUBLE_SPECIES, -sign * twids[5]),
574 w4r = broadcast(DOUBLE_SPECIES, twids[6]),
575 w4i = broadcast(DOUBLE_SPECIES, -sign * twids[7]),
576 w5r = broadcast(DOUBLE_SPECIES, twids[8]),
577 w5i = broadcast(DOUBLE_SPECIES, -sign * twids[9]),
578 w6r = broadcast(DOUBLE_SPECIES, twids[10]),
579 w6i = broadcast(DOUBLE_SPECIES, -sign * twids[11]);
580 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
581 final DoubleVector
582 z0r = fromArray(DOUBLE_SPECIES, data, i),
583 z1r = fromArray(DOUBLE_SPECIES, data, i + di),
584 z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
585 z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
586 z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
587 z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
588 z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
589 z0i = fromArray(DOUBLE_SPECIES, data, i + im),
590 z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
591 z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
592 z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
593 z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
594 z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
595 z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
596 final DoubleVector
597 t0r = z1r.add(z6r), t0i = z1i.add(z6i),
598 t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
599 t2r = z2r.add(z5r), t2i = z2i.add(z5i),
600 t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
601 t4r = z4r.add(z3r), t4i = z4i.add(z3i),
602 t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
603 t6r = t2r.add(t0r), t6i = t2i.add(t0i),
604 t7r = t5r.add(t3r), t7i = t5i.add(t3i);
605 final DoubleVector
606 b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
607 b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
608 b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
609 b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
610 b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
611 b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
612 b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
613 b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
614 b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
615 final DoubleVector
616 u0r = b0r.add(b1r), u0i = b0i.add(b1i),
617 u1r = b2r.add(b3r), u1i = b2i.add(b3i),
618 u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
619 u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
620 u4r = b6r.add(b7r), u4i = b6i.add(b7i),
621 u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
622 u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
623 u7r = u0r.add(u1r), u7i = u0i.add(u1i),
624 u8r = u0r.add(u2r), u8i = u0i.add(u2i),
625 u9r = u0r.add(u3r), u9i = u0i.add(u3i),
626 u10r = u4r.add(b5r), u10i = u4i.add(b5i),
627 u11r = u5r.add(b5r), u11i = u5i.add(b5i),
628 u12r = u6r.add(b5r), u12i = u6i.add(b5i);
629 b0r.intoArray(ret, j);
630 b0i.intoArray(ret, j + im);
631 DoubleVector
632 x1r = u7r.add(u10i), x1i = u7i.sub(u10r),
633 x2r = u9r.add(u12i), x2i = u9i.sub(u12r),
634 x3r = u8r.sub(u11i), x3i = u8i.add(u11r),
635 x4r = u8r.add(u11i), x4i = u8i.sub(u11r),
636 x5r = u9r.sub(u12i), x5i = u9i.add(u12r),
637 x6r = u7r.sub(u10i), x6i = u7i.add(u10r);
638 w1r.mul(x1r).add(w1i.neg().mul(x1i)).intoArray(ret, j + dj);
639 w2r.mul(x2r).add(w2i.neg().mul(x2i)).intoArray(ret, j + dj2);
640 w3r.mul(x3r).add(w3i.neg().mul(x3i)).intoArray(ret, j + dj3);
641 w4r.mul(x4r).add(w4i.neg().mul(x4i)).intoArray(ret, j + dj4);
642 w5r.mul(x5r).add(w5i.neg().mul(x5i)).intoArray(ret, j + dj5);
643 w6r.mul(x6r).add(w6i.neg().mul(x6i)).intoArray(ret, j + dj6);
644 w1r.mul(x1i).add(w1i.mul(x1r)).intoArray(ret, j + dj + im);
645 w2r.mul(x2i).add(w2i.mul(x2r)).intoArray(ret, j + dj2 + im);
646 w3r.mul(x3i).add(w3i.mul(x3r)).intoArray(ret, j + dj3 + im);
647 w4r.mul(x4i).add(w4i.mul(x4r)).intoArray(ret, j + dj4 + im);
648 w5r.mul(x5i).add(w5i.mul(x5r)).intoArray(ret, j + dj5 + im);
649 w6r.mul(x6i).add(w6i.mul(x6r)).intoArray(ret, j + dj6 + im);
650 }
651 }
652 }
653 }