1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 package ffx.numerics.fft;
39
40 import jdk.incubator.vector.DoubleVector;
41 import jdk.incubator.vector.VectorSpecies;
42
43 import static jdk.incubator.vector.DoubleVector.SPECIES_128;
44 import static jdk.incubator.vector.DoubleVector.SPECIES_256;
45 import static jdk.incubator.vector.DoubleVector.SPECIES_512;
46 import static jdk.incubator.vector.DoubleVector.broadcast;
47 import static jdk.incubator.vector.DoubleVector.fromArray;
48
49
50
51
52 public class MixedRadixFactor4 extends MixedRadixFactor {
53
54
55
56
57 private static int[] simdSizes = {8, 4, 2};
58
59 private final int di2;
60 private final int di3;
61 private final int dj2;
62 private final int dj3;
63
64
65
66
67
68
69 public MixedRadixFactor4(PassConstants passConstants) {
70 super(passConstants);
71 di2 = 2 * di;
72 di3 = 3 * di;
73 dj2 = 2 * dj;
74 dj3 = 3 * dj;
75 }
76
77
78
79
80
81
82
83 @Override
84 public boolean isValidSIMDWidth(int width) {
85
86 if (width != 2 && width != 4 && width != 8) {
87 return false;
88 }
89 if (im == 1) {
90
91 return innerLoopLimit % (width / 2) == 0;
92 } else {
93
94 return innerLoopLimit % width == 0;
95 }
96 }
97
98
99
100
101
102
103
104 public int getOptimalSIMDWidth() {
105
106 if (isValidSIMDWidth(LENGTH)) {
107 return LENGTH;
108 }
109
110 for (int size : simdSizes) {
111 if (size >= LENGTH) {
112
113 continue;
114 }
115 if (isValidSIMDWidth(size)) {
116 return size;
117 }
118 }
119
120 return 0;
121 }
122
123
124
125
126 @Override
127 protected void passScalar(PassData passData) {
128 final double[] data = passData.in;
129 final double[] ret = passData.out;
130 int sign = passData.sign;
131 int i = passData.inOffset;
132 int j = passData.outOffset;
133
134 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
135 final double z0_r = data[i];
136 final double z1_r = data[i + di];
137 final double z2_r = data[i + di2];
138 final double z3_r = data[i + di3];
139 final double z0_i = data[i + im];
140 final double z1_i = data[i + di + im];
141 final double z2_i = data[i + di2 + im];
142 final double z3_i = data[i + di3 + im];
143 final double t1_r = z0_r + z2_r;
144 final double t1_i = z0_i + z2_i;
145 final double t2_r = z1_r + z3_r;
146 final double t2_i = z1_i + z3_i;
147 final double t3_r = z0_r - z2_r;
148 final double t3_i = z0_i - z2_i;
149 final double t4_r = sign * (z1_r - z3_r);
150 final double t4_i = sign * (z1_i - z3_i);
151 ret[j] = t1_r + t2_r;
152 ret[j + im] = t1_i + t2_i;
153 ret[j + dj] = t3_r - t4_i;
154 ret[j + dj + im] = t3_i + t4_r;
155 ret[j + dj2] = t1_r - t2_r;
156 ret[j + dj2 + im] = t1_i - t2_i;
157 ret[j + dj3] = t3_r + t4_i;
158 ret[j + dj3 + im] = t3_i - t4_r;
159 }
160 j += jstep;
161 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
162 final int index = k * 3;
163 final double w1_r = wr[index];
164 final double w2_r = wr[index + 1];
165 final double w3_r = wr[index + 2];
166 final double w1_i = -sign * wi[index];
167 final double w2_i = -sign * wi[index + 1];
168 final double w3_i = -sign * wi[index + 2];
169 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
170 final double z0_r = data[i];
171 final double z1_r = data[i + di];
172 final double z2_r = data[i + di2];
173 final double z3_r = data[i + di3];
174 final double z0_i = data[i + im];
175 final double z1_i = data[i + di + im];
176 final double z2_i = data[i + di2 + im];
177 final double z3_i = data[i + di3 + im];
178 final double t1_r = z0_r + z2_r;
179 final double t1_i = z0_i + z2_i;
180 final double t2_r = z1_r + z3_r;
181 final double t2_i = z1_i + z3_i;
182 final double t3_r = z0_r - z2_r;
183 final double t3_i = z0_i - z2_i;
184 final double t4_r = sign * (z1_r - z3_r);
185 final double t4_i = sign * (z1_i - z3_i);
186 ret[j] = t1_r + t2_r;
187 ret[j + im] = t1_i + t2_i;
188 multiplyAndStore(t3_r - t4_i, t3_i + t4_r, w1_r, w1_i, ret, j + dj, j + dj + im);
189 multiplyAndStore(t1_r - t2_r, t1_i - t2_i, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
190 multiplyAndStore(t3_r + t4_i, t3_i - t4_r, w3_r, w3_i, ret, j + dj3, j + dj3 + im);
191 }
192 }
193 }
194
195
196
197
198
199
200 @Override
201 protected void passSIMD(PassData passData) {
202 if (!isValidSIMDWidth(simdWidth)) {
203 passScalar(passData);
204 } else {
205 if (im == 1) {
206 interleaved(passData, simdWidth);
207 } else {
208 blocked(passData, simdWidth);
209 }
210 }
211 }
212
213
214
215
216
217
218
219 private void interleaved(PassData passData, int simdLength) {
220
221 switch (simdLength) {
222 case 2:
223
224 interleaved128(passData);
225 break;
226 case 4:
227
228 interleaved256(passData);
229 break;
230 case 8:
231
232 interleaved512(passData);
233 break;
234 default:
235 passScalar(passData);
236 }
237 }
238
239
240
241
242
243
244
245 private void blocked(PassData passData, int simdLength) {
246
247 switch (simdLength) {
248 case 2:
249
250 blocked128(passData);
251 break;
252 case 4:
253
254 blocked256(passData);
255 break;
256 case 8:
257
258 blocked512(passData);
259 break;
260 default:
261 passScalar(passData);
262 }
263 }
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279 private void butterFly4Blocked(
280 VectorSpecies<Double> species,
281 double[] data, int i, double sign,
282 double w1r, double w1i,
283 double w2r, double w2i,
284 double w3r, double w3i,
285 double[] ret, int j) {
286 final DoubleVector
287 z0_r = fromArray(species, data, i),
288 z0_i = fromArray(species, data, i + im),
289 z1_r = fromArray(species, data, i + di),
290 z1_i = fromArray(species, data, i + di + im),
291 z2_r = fromArray(species, data, i + di2),
292 z2_i = fromArray(species, data, i + di2 + im),
293 z3_r = fromArray(species, data, i + di3),
294 z3_i = fromArray(species, data, i + di3 + im);
295 final DoubleVector
296 t1_r = z0_r.add(z2_r),
297 t1_i = z0_i.add(z2_i),
298 t2_r = z1_r.add(z3_r),
299 t2_i = z1_i.add(z3_i),
300 t3_r = z0_r.sub(z2_r),
301 t3_i = z0_i.sub(z2_i),
302 t4_r = z1_r.sub(z3_r).mul(sign),
303 t4_i = z1_i.sub(z3_i).mul(sign);
304 t1_r.add(t2_r).intoArray(ret, j);
305 t1_i.add(t2_i).intoArray(ret, j + im);
306 final DoubleVector
307 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
308 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
309 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
310 x1_r.mul(w1r).sub(x1_i.mul(w1i)).intoArray(ret, j + dj);
311 x2_r.mul(w2r).sub(x2_i.mul(w2i)).intoArray(ret, j + dj2);
312 x3_r.mul(w3r).sub(x3_i.mul(w3i)).intoArray(ret, j + dj3);
313 x1_i.mul(w1r).add(x1_r.mul(w1i)).intoArray(ret, j + dj + im);
314 x2_i.mul(w2r).add(x2_r.mul(w2i)).intoArray(ret, j + dj2 + im);
315 x3_i.mul(w3r).add(x3_r.mul(w3i)).intoArray(ret, j + dj3 + im);
316 }
317
318
319
320
321 private void blocked128(PassData passData) {
322 final double[] data = passData.in;
323 final double[] ret = passData.out;
324 final int sign = passData.sign;
325 int i = passData.inOffset;
326 int j = passData.outOffset;
327
328 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
329 final DoubleVector
330 z0_r = fromArray(SPECIES_128, data, i),
331 z0_i = fromArray(SPECIES_128, data, i + im),
332 z1_r = fromArray(SPECIES_128, data, i + di),
333 z1_i = fromArray(SPECIES_128, data, i + di + im),
334 z2_r = fromArray(SPECIES_128, data, i + di2),
335 z2_i = fromArray(SPECIES_128, data, i + di2 + im),
336 z3_r = fromArray(SPECIES_128, data, i + di3),
337 z3_i = fromArray(SPECIES_128, data, i + di3 + im);
338 final DoubleVector t1_r = z0_r.add(z2_r), t1_i = z0_i.add(z2_i);
339 final DoubleVector t2_r = z1_r.add(z3_r), t2_i = z1_i.add(z3_i);
340 t1_r.add(t2_r).intoArray(ret, j);
341 t1_i.add(t2_i).intoArray(ret, j + im);
342 final DoubleVector t3_r = z0_r.sub(z2_r), t3_i = z0_i.sub(z2_i);
343 final DoubleVector t4_r = z1_r.sub(z3_r).mul(sign), t4_i = z1_i.sub(z3_i).mul(sign);
344 t3_r.sub(t4_i).intoArray(ret, j + dj);
345 t1_r.sub(t2_r).intoArray(ret, j + dj2);
346 t3_r.add(t4_i).intoArray(ret, j + dj3);
347 t3_i.add(t4_r).intoArray(ret, j + dj + im);
348 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
349 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
350 }
351
352 j += jstep;
353 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
354 final int index = 3 * k;
355 final double
356 w1r = wr[index],
357 w2r = wr[index + 1],
358 w3r = wr[index + 2],
359 w1i = -sign * wi[index],
360 w2i = -sign * wi[index + 1],
361 w3i = -sign * wi[index + 2];
362 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
363 final DoubleVector
364 z0_r = fromArray(SPECIES_128, data, i),
365 z1_r = fromArray(SPECIES_128, data, i + di),
366 z2_r = fromArray(SPECIES_128, data, i + di2),
367 z3_r = fromArray(SPECIES_128, data, i + di3),
368 z0_i = fromArray(SPECIES_128, data, i + im),
369 z1_i = fromArray(SPECIES_128, data, i + di + im),
370 z2_i = fromArray(SPECIES_128, data, i + di2 + im),
371 z3_i = fromArray(SPECIES_128, data, i + di3 + im);
372 final DoubleVector
373 t1_r = z0_r.add(z2_r),
374 t1_i = z0_i.add(z2_i),
375 t2_r = z1_r.add(z3_r),
376 t2_i = z1_i.add(z3_i),
377 t3_r = z0_r.sub(z2_r),
378 t3_i = z0_i.sub(z2_i),
379 t4_r = z1_r.sub(z3_r).mul(sign),
380 t4_i = z1_i.sub(z3_i).mul(sign);
381 t1_r.add(t2_r).intoArray(ret, j);
382 t1_i.add(t2_i).intoArray(ret, j + im);
383 final DoubleVector
384 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
385 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
386 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
387 x1_r.mul(w1r).sub(x1_i.mul(w1i)).intoArray(ret, j + dj);
388 x2_r.mul(w2r).sub(x2_i.mul(w2i)).intoArray(ret, j + dj2);
389 x3_r.mul(w3r).sub(x3_i.mul(w3i)).intoArray(ret, j + dj3);
390 x1_i.mul(w1r).add(x1_r.mul(w1i)).intoArray(ret, j + dj + im);
391 x2_i.mul(w2r).add(x2_r.mul(w2i)).intoArray(ret, j + dj2 + im);
392 x3_i.mul(w3r).add(x3_r.mul(w3i)).intoArray(ret, j + dj3 + im);
393 }
394 }
395 }
396
397
398
399
400 private void blocked256(PassData passData) {
401 final double[] data = passData.in;
402 final double[] ret = passData.out;
403 final int sign = passData.sign;
404 int i = passData.inOffset;
405 int j = passData.outOffset;
406
407 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
408 final DoubleVector
409 z0_r = fromArray(SPECIES_256, data, i),
410 z1_r = fromArray(SPECIES_256, data, i + di),
411 z2_r = fromArray(SPECIES_256, data, i + di2),
412 z3_r = fromArray(SPECIES_256, data, i + di3),
413 z0_i = fromArray(SPECIES_256, data, i + im),
414 z1_i = fromArray(SPECIES_256, data, i + di + im),
415 z2_i = fromArray(SPECIES_256, data, i + di2 + im),
416 z3_i = fromArray(SPECIES_256, data, i + di3 + im);
417 final DoubleVector
418 t1_r = z0_r.add(z2_r),
419 t1_i = z0_i.add(z2_i),
420 t2_r = z1_r.add(z3_r),
421 t2_i = z1_i.add(z3_i),
422 t3_r = z0_r.sub(z2_r),
423 t3_i = z0_i.sub(z2_i),
424 t4_r = z1_r.sub(z3_r).mul(sign),
425 t4_i = z1_i.sub(z3_i).mul(sign);
426 t1_r.add(t2_r).intoArray(ret, j);
427 t3_r.sub(t4_i).intoArray(ret, j + dj);
428 t1_r.sub(t2_r).intoArray(ret, j + dj2);
429 t3_r.add(t4_i).intoArray(ret, j + dj3);
430 t1_i.add(t2_i).intoArray(ret, j + im);
431 t3_i.add(t4_r).intoArray(ret, j + dj + im);
432 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
433 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
434 }
435
436 j += jstep;
437 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
438 final int index = 3 * k;
439 final double
440 w1r = wr[index],
441 w2r = wr[index + 1],
442 w3r = wr[index + 2],
443 w1i = -sign * wi[index],
444 w2i = -sign * wi[index + 1],
445 w3i = -sign * wi[index + 2];
446 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
447 final DoubleVector
448 z0_r = fromArray(SPECIES_256, data, i),
449 z1_r = fromArray(SPECIES_256, data, i + di),
450 z2_r = fromArray(SPECIES_256, data, i + di2),
451 z3_r = fromArray(SPECIES_256, data, i + di3),
452 z0_i = fromArray(SPECIES_256, data, i + im),
453 z1_i = fromArray(SPECIES_256, data, i + di + im),
454 z2_i = fromArray(SPECIES_256, data, i + di2 + im),
455 z3_i = fromArray(SPECIES_256, data, i + di3 + im);
456 final DoubleVector
457 t1_r = z0_r.add(z2_r),
458 t1_i = z0_i.add(z2_i),
459 t2_r = z1_r.add(z3_r),
460 t2_i = z1_i.add(z3_i),
461 t3_r = z0_r.sub(z2_r),
462 t3_i = z0_i.sub(z2_i),
463 t4_r = z1_r.sub(z3_r).mul(sign),
464 t4_i = z1_i.sub(z3_i).mul(sign);
465 t1_r.add(t2_r).intoArray(ret, j);
466 t1_i.add(t2_i).intoArray(ret, j + im);
467 final DoubleVector
468 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
469 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
470 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
471 x1_r.mul(w1r).sub(x1_i.mul(w1i)).intoArray(ret, j + dj);
472 x2_r.mul(w2r).sub(x2_i.mul(w2i)).intoArray(ret, j + dj2);
473 x3_r.mul(w3r).sub(x3_i.mul(w3i)).intoArray(ret, j + dj3);
474 x1_i.mul(w1r).add(x1_r.mul(w1i)).intoArray(ret, j + dj + im);
475 x2_i.mul(w2r).add(x2_r.mul(w2i)).intoArray(ret, j + dj2 + im);
476 x3_i.mul(w3r).add(x3_r.mul(w3i)).intoArray(ret, j + dj3 + im);
477 }
478 }
479 }
480
481
482
483
484 private void blocked512(PassData passData) {
485 final double[] data = passData.in;
486 final double[] ret = passData.out;
487 final int sign = passData.sign;
488 int i = passData.inOffset;
489 int j = passData.outOffset;
490
491 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
492 final DoubleVector
493 z0_r = fromArray(SPECIES_512, data, i),
494 z1_r = fromArray(SPECIES_512, data, i + di),
495 z2_r = fromArray(SPECIES_512, data, i + di2),
496 z3_r = fromArray(SPECIES_512, data, i + di3),
497 z0_i = fromArray(SPECIES_512, data, i + im),
498 z1_i = fromArray(SPECIES_512, data, i + di + im),
499 z2_i = fromArray(SPECIES_512, data, i + di2 + im),
500 z3_i = fromArray(SPECIES_512, data, i + di3 + im);
501 final DoubleVector
502 t1_r = z0_r.add(z2_r),
503 t1_i = z0_i.add(z2_i),
504 t2_r = z1_r.add(z3_r),
505 t2_i = z1_i.add(z3_i),
506 t3_r = z0_r.sub(z2_r),
507 t3_i = z0_i.sub(z2_i),
508 t4_r = z1_r.sub(z3_r).mul(sign),
509 t4_i = z1_i.sub(z3_i).mul(sign);
510 t1_r.add(t2_r).intoArray(ret, j);
511 t3_r.sub(t4_i).intoArray(ret, j + dj);
512 t1_r.sub(t2_r).intoArray(ret, j + dj2);
513 t3_r.add(t4_i).intoArray(ret, j + dj3);
514 t1_i.add(t2_i).intoArray(ret, j + im);
515 t3_i.add(t4_r).intoArray(ret, j + dj + im);
516 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
517 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
518 }
519
520 j += jstep;
521 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
522 final int index = 3 * k;
523 final double
524 w1r = wr[index],
525 w2r = wr[index + 1],
526 w3r = wr[index + 2],
527 w1i = -sign * wi[index],
528 w2i = -sign * wi[index + 1],
529 w3i = -sign * wi[index + 2];
530 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
531 final DoubleVector
532 z0_r = fromArray(SPECIES_512, data, i),
533 z1_r = fromArray(SPECIES_512, data, i + di),
534 z2_r = fromArray(SPECIES_512, data, i + di2),
535 z3_r = fromArray(SPECIES_512, data, i + di3),
536 z0_i = fromArray(SPECIES_512, data, i + im),
537 z1_i = fromArray(SPECIES_512, data, i + di + im),
538 z2_i = fromArray(SPECIES_512, data, i + di2 + im),
539 z3_i = fromArray(SPECIES_512, data, i + di3 + im);
540 final DoubleVector
541 t1_r = z0_r.add(z2_r),
542 t1_i = z0_i.add(z2_i),
543 t2_r = z1_r.add(z3_r),
544 t2_i = z1_i.add(z3_i),
545 t3_r = z0_r.sub(z2_r),
546 t3_i = z0_i.sub(z2_i),
547 t4_r = z1_r.sub(z3_r).mul(sign),
548 t4_i = z1_i.sub(z3_i).mul(sign);
549 t1_r.add(t2_r).intoArray(ret, j);
550 t1_i.add(t2_i).intoArray(ret, j + im);
551 final DoubleVector
552 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
553 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
554 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
555 x1_r.mul(w1r).sub(x1_i.mul(w1i)).intoArray(ret, j + dj);
556 x2_r.mul(w2r).sub(x2_i.mul(w2i)).intoArray(ret, j + dj2);
557 x3_r.mul(w3r).sub(x3_i.mul(w3i)).intoArray(ret, j + dj3);
558 x1_i.mul(w1r).add(x1_r.mul(w1i)).intoArray(ret, j + dj + im);
559 x2_i.mul(w2r).add(x2_r.mul(w2i)).intoArray(ret, j + dj2 + im);
560 x3_i.mul(w3r).add(x3_r.mul(w3i)).intoArray(ret, j + dj3 + im);
561 }
562 }
563 }
564
565
566
567
568 private void interleaved128(PassData passData) {
569 final double[] data = passData.in;
570 final double[] ret = passData.out;
571 final int sign = passData.sign;
572 int i = passData.inOffset;
573 int j = passData.outOffset;
574
575 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
576 final DoubleVector
577 z0 = fromArray(SPECIES_128, data, i),
578 z1 = fromArray(SPECIES_128, data, i + di),
579 z2 = fromArray(SPECIES_128, data, i + di2),
580 z3 = fromArray(SPECIES_128, data, i + di3);
581 final DoubleVector
582 t1 = z0.add(z2),
583 t2 = z1.add(z3),
584 t3 = z0.sub(z2),
585 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
586 t1.add(t2).intoArray(ret, j);
587 t3.add(t4.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
588 t1.sub(t2).intoArray(ret, j + dj2);
589 t3.add(t4.mul(NEGATE_IM_128)).intoArray(ret, j + dj3);
590 }
591
592 j += jstep;
593 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
594 final int index = 3 * k;
595 final DoubleVector
596 w1r = broadcast(SPECIES_128, wr[index]),
597 w2r = broadcast(SPECIES_128, wr[index + 1]),
598 w3r = broadcast(SPECIES_128, wr[index + 2]),
599 w1i = broadcast(SPECIES_128, -sign * wi[index]).mul(NEGATE_IM_128),
600 w2i = broadcast(SPECIES_128, -sign * wi[index + 1]).mul(NEGATE_IM_128),
601 w3i = broadcast(SPECIES_128, -sign * wi[index + 2]).mul(NEGATE_IM_128);
602 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
603 final DoubleVector
604 z0 = fromArray(SPECIES_128, data, i),
605 z1 = fromArray(SPECIES_128, data, i + di),
606 z2 = fromArray(SPECIES_128, data, i + di2),
607 z3 = fromArray(SPECIES_128, data, i + di3);
608 final DoubleVector
609 t1 = z0.add(z2),
610 t2 = z1.add(z3),
611 t3 = z0.sub(z2),
612 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
613 t1.add(t2).intoArray(ret, j);
614 final DoubleVector
615 x1 = t3.add(t4.mul(NEGATE_RE_128)),
616 x2 = t1.sub(t2),
617 x3 = t3.add(t4.mul(NEGATE_IM_128));
618 w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
619 w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
620 w3r.fma(x3, w3i.mul(x3).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj3);
621 }
622 }
623 }
624
625
626
627
628 private void interleaved256(PassData passData) {
629 final double[] data = passData.in;
630 final double[] ret = passData.out;
631 final int sign = passData.sign;
632 int i = passData.inOffset;
633 int j = passData.outOffset;
634
635 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
636 final DoubleVector
637 z0 = fromArray(SPECIES_256, data, i),
638 z1 = fromArray(SPECIES_256, data, i + di),
639 z2 = fromArray(SPECIES_256, data, i + di2),
640 z3 = fromArray(SPECIES_256, data, i + di3);
641 final DoubleVector
642 t1 = z0.add(z2),
643 t2 = z1.add(z3),
644 t3 = z0.sub(z2),
645 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
646 t1.add(t2).intoArray(ret, j);
647 t3.add(t4.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
648 t1.sub(t2).intoArray(ret, j + dj2);
649 t3.add(t4.mul(NEGATE_IM_256)).intoArray(ret, j + dj3);
650 }
651
652 j += jstep;
653 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
654 final int index = 3 * k;
655 final DoubleVector
656 w1r = broadcast(SPECIES_256, wr[index]),
657 w2r = broadcast(SPECIES_256, wr[index + 1]),
658 w3r = broadcast(SPECIES_256, wr[index + 2]),
659 w1i = broadcast(SPECIES_256, -sign * wi[index]).mul(NEGATE_IM_256),
660 w2i = broadcast(SPECIES_256, -sign * wi[index + 1]).mul(NEGATE_IM_256),
661 w3i = broadcast(SPECIES_256, -sign * wi[index + 2]).mul(NEGATE_IM_256);
662 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
663 final DoubleVector
664 z0 = fromArray(SPECIES_256, data, i),
665 z1 = fromArray(SPECIES_256, data, i + di),
666 z2 = fromArray(SPECIES_256, data, i + di2),
667 z3 = fromArray(SPECIES_256, data, i + di3);
668 final DoubleVector
669 t1 = z0.add(z2),
670 t2 = z1.add(z3),
671 t3 = z0.sub(z2),
672 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
673 t1.add(t2).intoArray(ret, j);
674 final DoubleVector
675 x1 = t3.add(t4.mul(NEGATE_RE_256)),
676 x2 = t1.sub(t2),
677 x3 = t3.add(t4.mul(NEGATE_IM_256));
678 w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
679 w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
680 w3r.fma(x3, w3i.mul(x3).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj3);
681 }
682 }
683 }
684
685
686
687
688 private void interleaved512(PassData passData) {
689 final double[] data = passData.in;
690 final double[] ret = passData.out;
691 final int sign = passData.sign;
692 int i = passData.inOffset;
693 int j = passData.outOffset;
694
695 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
696 final DoubleVector
697 z0 = fromArray(SPECIES_512, data, i),
698 z1 = fromArray(SPECIES_512, data, i + di),
699 z2 = fromArray(SPECIES_512, data, i + di2),
700 z3 = fromArray(SPECIES_512, data, i + di3);
701 final DoubleVector
702 t1 = z0.add(z2),
703 t2 = z1.add(z3),
704 t3 = z0.sub(z2),
705 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
706 t1.add(t2).intoArray(ret, j);
707 t3.add(t4.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
708 t1.sub(t2).intoArray(ret, j + dj2);
709 t3.add(t4.mul(NEGATE_IM_512)).intoArray(ret, j + dj3);
710 }
711
712 j += jstep;
713 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
714 final int index = 3 * k;
715 final DoubleVector
716 w1r = broadcast(SPECIES_512, wr[index]),
717 w2r = broadcast(SPECIES_512, wr[index + 1]),
718 w3r = broadcast(SPECIES_512, wr[index + 2]),
719 w1i = broadcast(SPECIES_512, -sign * wi[index]).mul(NEGATE_IM_512),
720 w2i = broadcast(SPECIES_512, -sign * wi[index + 1]).mul(NEGATE_IM_512),
721 w3i = broadcast(SPECIES_512, -sign * wi[index + 2]).mul(NEGATE_IM_512);
722 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
723 final DoubleVector
724 z0 = fromArray(SPECIES_512, data, i),
725 z1 = fromArray(SPECIES_512, data, i + di),
726 z2 = fromArray(SPECIES_512, data, i + di2),
727 z3 = fromArray(SPECIES_512, data, i + di3);
728 final DoubleVector
729 t1 = z0.add(z2),
730 t2 = z1.add(z3),
731 t3 = z0.sub(z2),
732 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
733 t1.add(t2).intoArray(ret, j);
734 final DoubleVector
735 x1 = t3.add(t4.mul(NEGATE_RE_512)),
736 x2 = t1.sub(t2),
737 x3 = t3.add(t4.mul(NEGATE_IM_512));
738 w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
739 w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
740 w3r.fma(x3, w3i.mul(x3).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj3);
741 }
742 }
743 }
744 }