1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 package ffx.numerics.fft;
39
40 import jdk.incubator.vector.DoubleVector;
41
42 import static jdk.incubator.vector.DoubleVector.broadcast;
43 import static jdk.incubator.vector.DoubleVector.fromArray;
44 import static org.apache.commons.math3.util.FastMath.PI;
45 import static org.apache.commons.math3.util.FastMath.cos;
46 import static org.apache.commons.math3.util.FastMath.sin;
47
48
49
50
51 public class MixedRadixFactor7 extends MixedRadixFactor {
52
53 private static final double oneThird = 1.0 / 3.0;
54 private static final double sin2PI_7 = sin(2.0 * PI / 7.0);
55 private static final double sin4PI_7 = sin(4.0 * PI / 7.0);
56 private static final double sin6PI_7 = sin(6.0 * PI / 7.0);
57 private static final double cos2PI_7 = cos(2.0 * PI / 7.0);
58 private static final double cos4PI_7 = cos(4.0 * PI / 7.0);
59 private static final double cos6PI_7 = cos(6.0 * PI / 7.0);
60 private static final double c1 = cos2PI_7;
61 private static final double c2 = cos4PI_7;
62 private static final double c3 = cos6PI_7;
63 private static final double v1 = (c1 + c2 + c3) * oneThird - 1.0;
64 private static final double v2 = (2.0 * c1 - c2 - c3) * oneThird;
65 private static final double v3 = (c1 - 2.0 * c2 + c3) * oneThird;
66 private static final double v4 = (c1 + c2 - 2.0 * c3) * oneThird;
67
68 private final int di2;
69 private final int di3;
70 private final int di4;
71 private final int di5;
72 private final int di6;
73 private final int dj2;
74 private final int dj3;
75 private final int dj4;
76 private final int dj5;
77 private final int dj6;
78
79
80
81
82
83
84 public MixedRadixFactor7(PassConstants passConstants) {
85 super(passConstants);
86 di2 = 2 * di;
87 di3 = 3 * di;
88 di4 = 4 * di;
89 di5 = 5 * di;
90 di6 = 6 * di;
91 dj2 = 2 * dj;
92 dj3 = 3 * dj;
93 dj4 = 4 * dj;
94 dj5 = 5 * dj;
95 dj6 = 6 * dj;
96 }
97
98
99
100
101
102
103 protected void passScalar(PassData passData) {
104 final double[] data = passData.in;
105 final double[] ret = passData.out;
106 int sign = passData.sign;
107 int i = passData.inOffset;
108 int j = passData.outOffset;
109 final double s1 = (-sign) * sin2PI_7;
110 final double s2 = (-sign) * sin4PI_7;
111 final double s3 = (-sign) * sin6PI_7;
112 final double v5 = (s1 + s2 - s3) * oneThird;
113 final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
114 final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
115 final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
116
117 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
118 final double z0r = data[i];
119 final double z1r = data[i + di];
120 final double z2r = data[i + di2];
121 final double z3r = data[i + di3];
122 final double z4r = data[i + di4];
123 final double z5r = data[i + di5];
124 final double z6r = data[i + di6];
125 final double z0i = data[i + im];
126 final double z1i = data[i + di + im];
127 final double z2i = data[i + di2 + im];
128 final double z3i = data[i + di3 + im];
129 final double z4i = data[i + di4 + im];
130 final double z5i = data[i + di5 + im];
131 final double z6i = data[i + di6 + im];
132 final double t0r = z1r + z6r;
133 final double t0i = z1i + z6i;
134 final double t1r = z1r - z6r;
135 final double t1i = z1i - z6i;
136 final double t2r = z2r + z5r;
137 final double t2i = z2i + z5i;
138 final double t3r = z2r - z5r;
139 final double t3i = z2i - z5i;
140 final double t4r = z4r + z3r;
141 final double t4i = z4i + z3i;
142 final double t5r = z4r - z3r;
143 final double t5i = z4i - z3i;
144 final double t6r = t2r + t0r;
145 final double t6i = t2i + t0i;
146 final double t7r = t5r + t3r;
147 final double t7i = t5i + t3i;
148 final double b0r = z0r + t6r + t4r;
149 final double b0i = z0i + t6i + t4i;
150 final double b1r = v1 * (t6r + t4r);
151 final double b1i = v1 * (t6i + t4i);
152 final double b2r = v2 * (t0r - t4r);
153 final double b2i = v2 * (t0i - t4i);
154 final double b3r = v3 * (t4r - t2r);
155 final double b3i = v3 * (t4i - t2i);
156 final double b4r = v4 * (t2r - t0r);
157 final double b4i = v4 * (t2i - t0i);
158 final double b5r = v5 * (t7r + t1r);
159 final double b5i = v5 * (t7i + t1i);
160 final double b6r = v6 * (t1r - t5r);
161 final double b6i = v6 * (t1i - t5i);
162 final double b7r = v7 * (t5r - t3r);
163 final double b7i = v7 * (t5i - t3i);
164 final double b8r = v8 * (t3r - t1r);
165 final double b8i = v8 * (t3i - t1i);
166 final double u0r = b0r + b1r;
167 final double u0i = b0i + b1i;
168 final double u1r = b2r + b3r;
169 final double u1i = b2i + b3i;
170 final double u2r = b4r - b3r;
171 final double u2i = b4i - b3i;
172 final double u3r = -b2r - b4r;
173 final double u3i = -b2i - b4i;
174 final double u4r = b6r + b7r;
175 final double u4i = b6i + b7i;
176 final double u5r = b8r - b7r;
177 final double u5i = b8i - b7i;
178 final double u6r = -b8r - b6r;
179 final double u6i = -b8i - b6i;
180 final double u7r = u0r + u1r;
181 final double u7i = u0i + u1i;
182 final double u8r = u0r + u2r;
183 final double u8i = u0i + u2i;
184 final double u9r = u0r + u3r;
185 final double u9i = u0i + u3i;
186 final double u10r = u4r + b5r;
187 final double u10i = u4i + b5i;
188 final double u11r = u5r + b5r;
189 final double u11i = u5i + b5i;
190 final double u12r = u6r + b5r;
191 final double u12i = u6i + b5i;
192 ret[j] = b0r;
193 ret[j + im] = b0i;
194 ret[j + dj] = u7r + u10i;
195 ret[j + dj + im] = u7i - u10r;
196 ret[j + dj2] = u9r + u12i;
197 ret[j + dj2 + im] = u9i - u12r;
198 ret[j + dj3] = u8r - u11i;
199 ret[j + dj3 + im] = u8i + u11r;
200 ret[j + dj4] = u8r + u11i;
201 ret[j + dj4 + im] = u8i - u11r;
202 ret[j + dj5] = u9r - u12i;
203 ret[j + dj5 + im] = u9i + u12r;
204 ret[j + dj6] = u7r - u10i;
205 ret[j + dj6 + im] = u7i + u10r;
206 }
207
208 j += jstep;
209 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
210
211
212
213
214
215
216
217
218
219
220
221
222
223 final int index = k * 6;
224 final double w1r = wr[index];
225 final double w2r = wr[index + 1];
226 final double w3r = wr[index + 2];
227 final double w4r = wr[index + 3];
228 final double w5r = wr[index + 4];
229 final double w6r = wr[index + 5];
230 final double w1i = -sign * wi[index];
231 final double w2i = -sign * wi[index + 1];
232 final double w3i = -sign * wi[index + 2];
233 final double w4i = -sign * wi[index + 3];
234 final double w5i = -sign * wi[index + 4];
235 final double w6i = -sign * wi[index + 5];
236 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
237 final double z0r = data[i];
238 final double z1r = data[i + di];
239 final double z2r = data[i + di2];
240 final double z3r = data[i + di3];
241 final double z4r = data[i + di4];
242 final double z5r = data[i + di5];
243 final double z6r = data[i + di6];
244 final double z0i = data[i + im];
245 final double z1i = data[i + di + im];
246 final double z2i = data[i + di2 + im];
247 final double z3i = data[i + di3 + im];
248 final double z4i = data[i + di4 + im];
249 final double z5i = data[i + di5 + im];
250 final double z6i = data[i + di6 + im];
251 final double t0r = z1r + z6r;
252 final double t0i = z1i + z6i;
253 final double t1r = z1r - z6r;
254 final double t1i = z1i - z6i;
255 final double t2r = z2r + z5r;
256 final double t2i = z2i + z5i;
257 final double t3r = z2r - z5r;
258 final double t3i = z2i - z5i;
259 final double t4r = z4r + z3r;
260 final double t4i = z4i + z3i;
261 final double t5r = z4r - z3r;
262 final double t5i = z4i - z3i;
263 final double t6r = t2r + t0r;
264 final double t6i = t2i + t0i;
265 final double t7r = t5r + t3r;
266 final double t7i = t5i + t3i;
267 final double b0r = z0r + t6r + t4r;
268 final double b0i = z0i + t6i + t4i;
269 final double b1r = v1 * (t6r + t4r);
270 final double b1i = v1 * (t6i + t4i);
271 final double b2r = v2 * (t0r - t4r);
272 final double b2i = v2 * (t0i - t4i);
273 final double b3r = v3 * (t4r - t2r);
274 final double b3i = v3 * (t4i - t2i);
275 final double b4r = v4 * (t2r - t0r);
276 final double b4i = v4 * (t2i - t0i);
277 final double b5r = v5 * (t7r + t1r);
278 final double b5i = v5 * (t7i + t1i);
279 final double b6r = v6 * (t1r - t5r);
280 final double b6i = v6 * (t1i - t5i);
281 final double b7r = v7 * (t5r - t3r);
282 final double b7i = v7 * (t5i - t3i);
283 final double b8r = v8 * (t3r - t1r);
284 final double b8i = v8 * (t3i - t1i);
285 final double u0r = b0r + b1r;
286 final double u0i = b0i + b1i;
287 final double u1r = b2r + b3r;
288 final double u1i = b2i + b3i;
289 final double u2r = b4r - b3r;
290 final double u2i = b4i - b3i;
291 final double u3r = -b2r - b4r;
292 final double u3i = -b2i - b4i;
293 final double u4r = b6r + b7r;
294 final double u4i = b6i + b7i;
295 final double u5r = b8r - b7r;
296 final double u5i = b8i - b7i;
297 final double u6r = -b8r - b6r;
298 final double u6i = -b8i - b6i;
299 final double u7r = u0r + u1r;
300 final double u7i = u0i + u1i;
301 final double u8r = u0r + u2r;
302 final double u8i = u0i + u2i;
303 final double u9r = u0r + u3r;
304 final double u9i = u0i + u3i;
305 final double u10r = u4r + b5r;
306 final double u10i = u4i + b5i;
307 final double u11r = u5r + b5r;
308 final double u11i = u5i + b5i;
309 final double u12r = u6r + b5r;
310 final double u12i = u6i + b5i;
311 ret[j] = b0r;
312 ret[j + im] = b0i;
313 multiplyAndStore(u7r + u10i, u7i - u10r, w1r, w1i, ret, j + dj, j + dj + im);
314 multiplyAndStore(u9r + u12i, u9i - u12r, w2r, w2i, ret, j + dj2, j + dj2 + im);
315 multiplyAndStore(u8r - u11i, u8i + u11r, w3r, w3i, ret, j + dj3, j + dj3 + im);
316 multiplyAndStore(u8r + u11i, u8i - u11r, w4r, w4i, ret, j + dj4, j + dj4 + im);
317 multiplyAndStore(u9r - u12i, u9i + u12r, w5r, w5i, ret, j + dj5, j + dj5 + im);
318 multiplyAndStore(u7r - u10i, u7i + u10r, w6r, w6i, ret, j + dj6, j + dj6 + im);
319 }
320 }
321 }
322
323
324
325
326
327
328 @Override
329 protected void passSIMD(PassData passData) {
330 if (im == 1) {
331
332 if (innerLoopLimit % INTERLEAVED_LOOP != 0) {
333 passScalar(passData);
334 } else {
335 interleaved(passData);
336 }
337 } else {
338
339 if (innerLoopLimit % BLOCK_LOOP != 0) {
340 passScalar(passData);
341 } else {
342 blocked(passData);
343 }
344 }
345 }
346
347
348
349
350
351
352 private void interleaved(PassData passData) {
353 final double[] data = passData.in;
354 final double[] ret = passData.out;
355 int sign = passData.sign;
356 int i = passData.inOffset;
357 int j = passData.outOffset;
358 final double s1 = (-sign) * sin2PI_7;
359 final double s2 = (-sign) * sin4PI_7;
360 final double s3 = (-sign) * sin6PI_7;
361 final double v5 = (s1 + s2 - s3) * oneThird;
362 final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
363 final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
364 final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
365
366 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP, i += LENGTH, j += LENGTH) {
367 DoubleVector
368 z0 = fromArray(DOUBLE_SPECIES, data, i),
369 z1 = fromArray(DOUBLE_SPECIES, data, i + di),
370 z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
371 z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
372 z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
373 z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
374 z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
375 DoubleVector
376 t0 = z1.add(z6),
377 t1 = z1.sub(z6),
378 t2 = z2.add(z5),
379 t3 = z2.sub(z5),
380 t4 = z4.add(z3),
381 t5 = z4.sub(z3),
382 t6 = t2.add(t0),
383 t7 = t5.add(t3);
384 DoubleVector
385 b0 = z0.add(t6).add(t4),
386 b1 = t6.add(t4).mul(v1),
387 b2 = t0.sub(t4).mul(v2),
388 b3 = t4.sub(t2).mul(v3),
389 b4 = t2.sub(t0).mul(v4),
390 b5 = t7.add(t1).mul(v5),
391 b6 = t1.sub(t5).mul(v6),
392 b7 = t5.sub(t3).mul(v7),
393 b8 = t3.sub(t1).mul(v8);
394 DoubleVector
395 u0 = b0.add(b1),
396 u1 = b2.add(b3),
397 u2 = b4.sub(b3),
398 u3 = b2.add(b4).neg(),
399 u4 = b6.add(b7),
400 u5 = b8.sub(b7),
401 u6 = b8.add(b6).neg(),
402 u7 = u0.add(u1),
403 u8 = u0.add(u2),
404 u9 = u0.add(u3),
405 u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
406 u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
407 u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
408 b0.intoArray(ret, j);
409 u7.add(u10.mul(NEGATE_IM)).intoArray(ret, j + dj);
410 u9.add(u12.mul(NEGATE_IM)).intoArray(ret, j + dj2);
411 u8.add(u11.mul(NEGATE_RE)).intoArray(ret, j + dj3);
412 u8.add(u11.mul(NEGATE_IM)).intoArray(ret, j + dj4);
413 u9.add(u12.mul(NEGATE_RE)).intoArray(ret, j + dj5);
414 u7.add(u10.mul(NEGATE_RE)).intoArray(ret, j + dj6);
415 }
416
417 j += jstep;
418 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433 final int index = k * 6;
434 final DoubleVector
435 w1r = broadcast(DOUBLE_SPECIES, wr[index]),
436 w2r = broadcast(DOUBLE_SPECIES, wr[index + 1]),
437 w3r = broadcast(DOUBLE_SPECIES, wr[index + 2]),
438 w4r = broadcast(DOUBLE_SPECIES, wr[index + 3]),
439 w5r = broadcast(DOUBLE_SPECIES, wr[index + 4]),
440 w6r = broadcast(DOUBLE_SPECIES, wr[index + 5]),
441 w1i = broadcast(DOUBLE_SPECIES, -sign * wi[index]).mul(NEGATE_IM),
442 w2i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 1]).mul(NEGATE_IM),
443 w3i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 2]).mul(NEGATE_IM),
444 w4i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 3]).mul(NEGATE_IM),
445 w5i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 4]).mul(NEGATE_IM),
446 w6i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 5]).mul(NEGATE_IM);
447 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP, i += LENGTH, j += LENGTH) {
448 DoubleVector
449 z0 = fromArray(DOUBLE_SPECIES, data, i),
450 z1 = fromArray(DOUBLE_SPECIES, data, i + di),
451 z2 = fromArray(DOUBLE_SPECIES, data, i + di2),
452 z3 = fromArray(DOUBLE_SPECIES, data, i + di3),
453 z4 = fromArray(DOUBLE_SPECIES, data, i + di4),
454 z5 = fromArray(DOUBLE_SPECIES, data, i + di5),
455 z6 = fromArray(DOUBLE_SPECIES, data, i + di6);
456 DoubleVector
457 t0 = z1.add(z6),
458 t1 = z1.sub(z6),
459 t2 = z2.add(z5),
460 t3 = z2.sub(z5),
461 t4 = z4.add(z3),
462 t5 = z4.sub(z3),
463 t6 = t2.add(t0),
464 t7 = t5.add(t3);
465 DoubleVector
466 b0 = z0.add(t6).add(t4),
467 b1 = t6.add(t4).mul(v1),
468 b2 = t0.sub(t4).mul(v2),
469 b3 = t4.sub(t2).mul(v3),
470 b4 = t2.sub(t0).mul(v4),
471 b5 = t7.add(t1).mul(v5),
472 b6 = t1.sub(t5).mul(v6),
473 b7 = t5.sub(t3).mul(v7),
474 b8 = t3.sub(t1).mul(v8);
475 DoubleVector
476 u0 = b0.add(b1),
477 u1 = b2.add(b3),
478 u2 = b4.sub(b3),
479 u3 = b2.add(b4).neg(),
480 u4 = b6.add(b7),
481 u5 = b8.sub(b7),
482 u6 = b8.add(b6).neg(),
483 u7 = u0.add(u1),
484 u8 = u0.add(u2),
485 u9 = u0.add(u3),
486 u10 = u4.add(b5).rearrange(SHUFFLE_RE_IM),
487 u11 = u5.add(b5).rearrange(SHUFFLE_RE_IM),
488 u12 = u6.add(b5).rearrange(SHUFFLE_RE_IM);
489 b0.intoArray(ret, j);
490 DoubleVector
491 x1 = u7.add(u10.mul(NEGATE_IM)),
492 x2 = u9.add(u12.mul(NEGATE_IM)),
493 x3 = u8.add(u11.mul(NEGATE_RE)),
494 x4 = u8.add(u11.mul(NEGATE_IM)),
495 x5 = u9.add(u12.mul(NEGATE_RE)),
496 x6 = u7.add(u10.mul(NEGATE_RE));
497 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj);
498 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj2);
499 w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj3);
500 w4r.mul(x4).add(w4i.mul(x4).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj4);
501 w5r.mul(x5).add(w5i.mul(x5).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj5);
502 w6r.mul(x6).add(w6i.mul(x6).rearrange(SHUFFLE_RE_IM)).intoArray(ret, j + dj6);
503 }
504 }
505 }
506
507
508
509
510
511
512 private void blocked(PassData passData) {
513 final double[] data = passData.in;
514 final double[] ret = passData.out;
515 int sign = passData.sign;
516 int i = passData.inOffset;
517 int j = passData.outOffset;
518 final double s1 = (-sign) * sin2PI_7;
519 final double s2 = (-sign) * sin4PI_7;
520 final double s3 = (-sign) * sin6PI_7;
521 final double v5 = (s1 + s2 - s3) * oneThird;
522 final double v6 = (2.0 * s1 - s2 + s3) * oneThird;
523 final double v7 = (s1 - 2.0 * s2 - s3) * oneThird;
524 final double v8 = (s1 + s2 + 2.0 * s3) * oneThird;
525
526 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
527 final DoubleVector
528 z0r = fromArray(DOUBLE_SPECIES, data, i),
529 z1r = fromArray(DOUBLE_SPECIES, data, i + di),
530 z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
531 z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
532 z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
533 z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
534 z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
535 z0i = fromArray(DOUBLE_SPECIES, data, i + im),
536 z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
537 z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
538 z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
539 z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
540 z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
541 z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
542 final DoubleVector
543 t0r = z1r.add(z6r), t0i = z1i.add(z6i),
544 t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
545 t2r = z2r.add(z5r), t2i = z2i.add(z5i),
546 t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
547 t4r = z4r.add(z3r), t4i = z4i.add(z3i),
548 t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
549 t6r = t2r.add(t0r), t6i = t2i.add(t0i),
550 t7r = t5r.add(t3r), t7i = t5i.add(t3i);
551 final DoubleVector
552 b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
553 b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
554 b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
555 b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
556 b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
557 b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
558 b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
559 b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
560 b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
561 final DoubleVector
562 u0r = b0r.add(b1r), u0i = b0i.add(b1i),
563 u1r = b2r.add(b3r), u1i = b2i.add(b3i),
564 u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
565 u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
566 u4r = b6r.add(b7r), u4i = b6i.add(b7i),
567 u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
568 u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
569 u7r = u0r.add(u1r), u7i = u0i.add(u1i),
570 u8r = u0r.add(u2r), u8i = u0i.add(u2i),
571 u9r = u0r.add(u3r), u9i = u0i.add(u3i),
572 u10r = u4r.add(b5r), u10i = u4i.add(b5i),
573 u11r = u5r.add(b5r), u11i = u5i.add(b5i),
574 u12r = u6r.add(b5r), u12i = u6i.add(b5i);
575 b0r.intoArray(ret, j);
576 b0i.intoArray(ret, j + im);
577 u7r.add(u10i).intoArray(ret, j + dj);
578 u7i.sub(u10r).intoArray(ret, j + dj + im);
579 u9r.add(u12i).intoArray(ret, j + dj2);
580 u9i.sub(u12r).intoArray(ret, j + dj2 + im);
581 u8r.sub(u11i).intoArray(ret, j + dj3);
582 u8i.add(u11r).intoArray(ret, j + dj3 + im);
583 u8r.add(u11i).intoArray(ret, j + dj4);
584 u8i.sub(u11r).intoArray(ret, j + dj4 + im);
585 u9r.sub(u12i).intoArray(ret, j + dj5);
586 u9i.add(u12r).intoArray(ret, j + dj5 + im);
587 u7r.sub(u10i).intoArray(ret, j + dj6);
588 u7i.add(u10r).intoArray(ret, j + dj6 + im);
589 }
590
591 j += jstep;
592 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607 final int index = k * 6;
608 final DoubleVector
609 w1r = broadcast(DOUBLE_SPECIES, wr[index]),
610 w2r = broadcast(DOUBLE_SPECIES, wr[index + 1]),
611 w3r = broadcast(DOUBLE_SPECIES, wr[index + 2]),
612 w4r = broadcast(DOUBLE_SPECIES, wr[index + 3]),
613 w5r = broadcast(DOUBLE_SPECIES, wr[index + 4]),
614 w6r = broadcast(DOUBLE_SPECIES, wr[index + 5]),
615 w1i = broadcast(DOUBLE_SPECIES, -sign * wi[index]),
616 w2i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 1]),
617 w3i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 2]),
618 w4i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 3]),
619 w5i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 4]),
620 w6i = broadcast(DOUBLE_SPECIES, -sign * wi[index + 5]);
621 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP, i += LENGTH, j += LENGTH) {
622 final DoubleVector
623 z0r = fromArray(DOUBLE_SPECIES, data, i),
624 z1r = fromArray(DOUBLE_SPECIES, data, i + di),
625 z2r = fromArray(DOUBLE_SPECIES, data, i + di2),
626 z3r = fromArray(DOUBLE_SPECIES, data, i + di3),
627 z4r = fromArray(DOUBLE_SPECIES, data, i + di4),
628 z5r = fromArray(DOUBLE_SPECIES, data, i + di5),
629 z6r = fromArray(DOUBLE_SPECIES, data, i + di6),
630 z0i = fromArray(DOUBLE_SPECIES, data, i + im),
631 z1i = fromArray(DOUBLE_SPECIES, data, i + di + im),
632 z2i = fromArray(DOUBLE_SPECIES, data, i + di2 + im),
633 z3i = fromArray(DOUBLE_SPECIES, data, i + di3 + im),
634 z4i = fromArray(DOUBLE_SPECIES, data, i + di4 + im),
635 z5i = fromArray(DOUBLE_SPECIES, data, i + di5 + im),
636 z6i = fromArray(DOUBLE_SPECIES, data, i + di6 + im);
637 final DoubleVector
638 t0r = z1r.add(z6r), t0i = z1i.add(z6i),
639 t1r = z1r.sub(z6r), t1i = z1i.sub(z6i),
640 t2r = z2r.add(z5r), t2i = z2i.add(z5i),
641 t3r = z2r.sub(z5r), t3i = z2i.sub(z5i),
642 t4r = z4r.add(z3r), t4i = z4i.add(z3i),
643 t5r = z4r.sub(z3r), t5i = z4i.sub(z3i),
644 t6r = t2r.add(t0r), t6i = t2i.add(t0i),
645 t7r = t5r.add(t3r), t7i = t5i.add(t3i);
646 final DoubleVector
647 b0r = z0r.add(t6r).add(t4r), b0i = z0i.add(t6i).add(t4i),
648 b1r = t6r.add(t4r).mul(v1), b1i = t6i.add(t4i).mul(v1),
649 b2r = t0r.sub(t4r).mul(v2), b2i = t0i.sub(t4i).mul(v2),
650 b3r = t4r.sub(t2r).mul(v3), b3i = t4i.sub(t2i).mul(v3),
651 b4r = t2r.sub(t0r).mul(v4), b4i = t2i.sub(t0i).mul(v4),
652 b5r = t7r.add(t1r).mul(v5), b5i = t7i.add(t1i).mul(v5),
653 b6r = t1r.sub(t5r).mul(v6), b6i = t1i.sub(t5i).mul(v6),
654 b7r = t5r.sub(t3r).mul(v7), b7i = t5i.sub(t3i).mul(v7),
655 b8r = t3r.sub(t1r).mul(v8), b8i = t3i.sub(t1i).mul(v8);
656 final DoubleVector
657 u0r = b0r.add(b1r), u0i = b0i.add(b1i),
658 u1r = b2r.add(b3r), u1i = b2i.add(b3i),
659 u2r = b4r.sub(b3r), u2i = b4i.sub(b3i),
660 u3r = b2r.add(b4r).neg(), u3i = b2i.add(b4i).neg(),
661 u4r = b6r.add(b7r), u4i = b6i.add(b7i),
662 u5r = b8r.sub(b7r), u5i = b8i.sub(b7i),
663 u6r = b8r.add(b6r).neg(), u6i = b8i.add(b6i).neg(),
664 u7r = u0r.add(u1r), u7i = u0i.add(u1i),
665 u8r = u0r.add(u2r), u8i = u0i.add(u2i),
666 u9r = u0r.add(u3r), u9i = u0i.add(u3i),
667 u10r = u4r.add(b5r), u10i = u4i.add(b5i),
668 u11r = u5r.add(b5r), u11i = u5i.add(b5i),
669 u12r = u6r.add(b5r), u12i = u6i.add(b5i);
670 b0r.intoArray(ret, j);
671 b0i.intoArray(ret, j + im);
672 DoubleVector
673 x1r = u7r.add(u10i), x1i = u7i.sub(u10r),
674 x2r = u9r.add(u12i), x2i = u9i.sub(u12r),
675 x3r = u8r.sub(u11i), x3i = u8i.add(u11r),
676 x4r = u8r.add(u11i), x4i = u8i.sub(u11r),
677 x5r = u9r.sub(u12i), x5i = u9i.add(u12r),
678 x6r = u7r.sub(u10i), x6i = u7i.add(u10r);
679 w1r.mul(x1r).add(w1i.neg().mul(x1i)).intoArray(ret, j + dj);
680 w2r.mul(x2r).add(w2i.neg().mul(x2i)).intoArray(ret, j + dj2);
681 w3r.mul(x3r).add(w3i.neg().mul(x3i)).intoArray(ret, j + dj3);
682 w4r.mul(x4r).add(w4i.neg().mul(x4i)).intoArray(ret, j + dj4);
683 w5r.mul(x5r).add(w5i.neg().mul(x5i)).intoArray(ret, j + dj5);
684 w6r.mul(x6r).add(w6i.neg().mul(x6i)).intoArray(ret, j + dj6);
685 w1r.mul(x1i).add(w1i.mul(x1r)).intoArray(ret, j + dj + im);
686 w2r.mul(x2i).add(w2i.mul(x2r)).intoArray(ret, j + dj2 + im);
687 w3r.mul(x3i).add(w3i.mul(x3r)).intoArray(ret, j + dj3 + im);
688 w4r.mul(x4i).add(w4i.mul(x4r)).intoArray(ret, j + dj4 + im);
689 w5r.mul(x5i).add(w5i.mul(x5r)).intoArray(ret, j + dj5 + im);
690 w6r.mul(x6i).add(w6i.mul(x6r)).intoArray(ret, j + dj6 + im);
691 }
692 }
693 }
694 }