1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 package ffx.numerics.fft;
39
40 import jdk.incubator.vector.DoubleVector;
41
42 import static jdk.incubator.vector.DoubleVector.SPECIES_128;
43 import static jdk.incubator.vector.DoubleVector.SPECIES_256;
44 import static jdk.incubator.vector.DoubleVector.SPECIES_512;
45 import static jdk.incubator.vector.DoubleVector.broadcast;
46 import static jdk.incubator.vector.DoubleVector.fromArray;
47
48
49
50
51 public class MixedRadixFactor4 extends MixedRadixFactor {
52
53 private final int di2;
54 private final int di3;
55 private final int dj2;
56 private final int dj3;
57
58
59
60
61
62 public MixedRadixFactor4(PassConstants passConstants) {
63 super(passConstants);
64 di2 = 2 * di;
65 di3 = 3 * di;
66 dj2 = 2 * dj;
67 dj3 = 3 * dj;
68 }
69
70
71
72
73 @Override
74 protected void passScalar(PassData passData) {
75 final double[] data = passData.in;
76 final double[] ret = passData.out;
77 int sign = passData.sign;
78 int i = passData.inOffset;
79 int j = passData.outOffset;
80
81 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
82 final double z0_r = data[i];
83 final double z1_r = data[i + di];
84 final double z2_r = data[i + di2];
85 final double z3_r = data[i + di3];
86 final double z0_i = data[i + im];
87 final double z1_i = data[i + di + im];
88 final double z2_i = data[i + di2 + im];
89 final double z3_i = data[i + di3 + im];
90 final double t1_r = z0_r + z2_r;
91 final double t1_i = z0_i + z2_i;
92 final double t2_r = z1_r + z3_r;
93 final double t2_i = z1_i + z3_i;
94 final double t3_r = z0_r - z2_r;
95 final double t3_i = z0_i - z2_i;
96 final double t4_r = sign * (z1_r - z3_r);
97 final double t4_i = sign * (z1_i - z3_i);
98 ret[j] = t1_r + t2_r;
99 ret[j + im] = t1_i + t2_i;
100 ret[j + dj] = t3_r - t4_i;
101 ret[j + dj + im] = t3_i + t4_r;
102 ret[j + dj2] = t1_r - t2_r;
103 ret[j + dj2 + im] = t1_i - t2_i;
104 ret[j + dj3] = t3_r + t4_i;
105 ret[j + dj3 + im] = t3_i - t4_r;
106 }
107 j += jstep;
108 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
109 final double[] twids = twiddles[k];
110 final double w1_r = twids[0];
111 final double w1_i = -sign * twids[1];
112 final double w2_r = twids[2];
113 final double w2_i = -sign * twids[3];
114 final double w3_r = twids[4];
115 final double w3_i = -sign * twids[5];
116 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
117 final double z0_r = data[i];
118 final double z1_r = data[i + di];
119 final double z2_r = data[i + di2];
120 final double z3_r = data[i + di3];
121 final double z0_i = data[i + im];
122 final double z1_i = data[i + di + im];
123 final double z2_i = data[i + di2 + im];
124 final double z3_i = data[i + di3 + im];
125 final double t1_r = z0_r + z2_r;
126 final double t1_i = z0_i + z2_i;
127 final double t2_r = z1_r + z3_r;
128 final double t2_i = z1_i + z3_i;
129 final double t3_r = z0_r - z2_r;
130 final double t3_i = z0_i - z2_i;
131 final double t4_r = sign * (z1_r - z3_r);
132 final double t4_i = sign * (z1_i - z3_i);
133 ret[j] = t1_r + t2_r;
134 ret[j + im] = t1_i + t2_i;
135 multiplyAndStore(t3_r - t4_i, t3_i + t4_r, w1_r, w1_i, ret, j + dj, j + dj + im);
136 multiplyAndStore(t1_r - t2_r, t1_i - t2_i, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
137 multiplyAndStore(t3_r + t4_i, t3_i - t4_r, w3_r, w3_i, ret, j + dj3, j + dj3 + im);
138 }
139 }
140 }
141
142
143
144
145 @Override
146 protected void passSIMD(PassData passData) {
147 if (im == 1) {
148 interleaved(passData);
149 } else {
150 blocked(passData);
151 }
152 }
153
154
155
156
157 private static int[] simdSizes = {8, 4, 2};
158
159
160
161
162
163
164 private void interleaved(PassData passData, int simdLength) {
165
166 switch (simdLength) {
167 case 2:
168
169 interleaved128(passData);
170 break;
171 case 4:
172
173 interleaved256(passData);
174 break;
175 case 8:
176
177 interleaved512(passData);
178 break;
179 default:
180 passScalar(passData);
181 }
182 }
183
184
185
186
187
188
189 private void blocked(PassData passData, int simdLength) {
190
191 switch (simdLength) {
192 case 2:
193
194 blocked128(passData);
195 break;
196 case 4:
197
198 blocked256(passData);
199 break;
200 case 8:
201
202 blocked512(passData);
203 break;
204 default:
205 passScalar(passData);
206 }
207 }
208
209
210
211
212
213 private void interleaved(PassData passData) {
214 if (innerLoopLimit % LOOP == 0) {
215
216 interleaved(passData, LENGTH);
217 } else {
218
219 if (innerLoopLimit % 2 != 0 && innerLoopLimit != 1) {
220 passScalar(passData);
221 return;
222 }
223
224 for (int size : simdSizes) {
225 if (size >= LENGTH) {
226
227 continue;
228 }
229
230 if (innerLoopLimit % (size / 2) == 0) {
231 interleaved(passData, size);
232 }
233 }
234 }
235 }
236
237
238
239
240
241 private void blocked(PassData passData) {
242 if (innerLoopLimit % BLOCK_LOOP == 0) {
243
244 blocked(passData, LENGTH);
245 } else {
246
247 if (innerLoopLimit % 2 != 0) {
248 passScalar(passData);
249 return;
250 }
251
252 for (int size : simdSizes) {
253 if (size >= LENGTH) {
254
255 continue;
256 }
257
258 if (innerLoopLimit % size == 0) {
259 blocked(passData, size);
260 }
261 }
262 }
263 }
264
265
266
267
268 private void blocked128(PassData passData) {
269 final double[] data = passData.in;
270 final double[] ret = passData.out;
271 int sign = passData.sign;
272 int i = passData.inOffset;
273 int j = passData.outOffset;
274
275 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
276 final DoubleVector
277 z0_r = fromArray(SPECIES_128, data, i),
278 z0_i = fromArray(SPECIES_128, data, i + im),
279 z1_r = fromArray(SPECIES_128, data, i + di),
280 z1_i = fromArray(SPECIES_128, data, i + di + im),
281 z2_r = fromArray(SPECIES_128, data, i + di2),
282 z2_i = fromArray(SPECIES_128, data, i + di2 + im),
283 z3_r = fromArray(SPECIES_128, data, i + di3),
284 z3_i = fromArray(SPECIES_128, data, i + di3 + im);
285 final DoubleVector
286 t1_r = z0_r.add(z2_r),
287 t1_i = z0_i.add(z2_i),
288 t2_r = z1_r.add(z3_r),
289 t2_i = z1_i.add(z3_i),
290 t3_r = z0_r.sub(z2_r),
291 t3_i = z0_i.sub(z2_i),
292 t4_r = z1_r.sub(z3_r).mul(sign),
293 t4_i = z1_i.sub(z3_i).mul(sign);
294 t1_r.add(t2_r).intoArray(ret, j);
295 t1_i.add(t2_i).intoArray(ret, j + im);
296 t3_r.sub(t4_i).intoArray(ret, j + dj);
297 t3_i.add(t4_r).intoArray(ret, j + dj + im);
298 t1_r.sub(t2_r).intoArray(ret, j + dj2);
299 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
300 t3_r.add(t4_i).intoArray(ret, j + dj3);
301 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
302 }
303
304 j += jstep;
305 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
306 final double[] twids = twiddles[k];
307 DoubleVector
308 w1r = broadcast(SPECIES_128, twids[0]),
309 w1i = broadcast(SPECIES_128, -sign * twids[1]),
310 w2r = broadcast(SPECIES_128, twids[2]),
311 w2i = broadcast(SPECIES_128, -sign * twids[3]),
312 w3r = broadcast(SPECIES_128, twids[4]),
313 w3i = broadcast(SPECIES_128, -sign * twids[5]);
314 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
315 final DoubleVector
316 z0_r = fromArray(SPECIES_128, data, i),
317 z0_i = fromArray(SPECIES_128, data, i + im),
318 z1_r = fromArray(SPECIES_128, data, i + di),
319 z1_i = fromArray(SPECIES_128, data, i + di + im),
320 z2_r = fromArray(SPECIES_128, data, i + di2),
321 z2_i = fromArray(SPECIES_128, data, i + di2 + im),
322 z3_r = fromArray(SPECIES_128, data, i + di3),
323 z3_i = fromArray(SPECIES_128, data, i + di3 + im);
324 final DoubleVector
325 t1_r = z0_r.add(z2_r),
326 t1_i = z0_i.add(z2_i),
327 t2_r = z1_r.add(z3_r),
328 t2_i = z1_i.add(z3_i),
329 t3_r = z0_r.sub(z2_r),
330 t3_i = z0_i.sub(z2_i),
331 t4_r = z1_r.sub(z3_r).mul(sign),
332 t4_i = z1_i.sub(z3_i).mul(sign);
333 t1_r.add(t2_r).intoArray(ret, j);
334 t1_i.add(t2_i).intoArray(ret, j + im);
335 DoubleVector
336 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
337 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
338 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
339 w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
340 w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
341 w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
342 w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
343 w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
344 w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
345 }
346 }
347 }
348
349
350
351
352 private void blocked256(PassData passData) {
353 final double[] data = passData.in;
354 final double[] ret = passData.out;
355 int sign = passData.sign;
356 int i = passData.inOffset;
357 int j = passData.outOffset;
358
359 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
360 final DoubleVector
361 z0_r = fromArray(SPECIES_256, data, i),
362 z1_r = fromArray(SPECIES_256, data, i + di),
363 z2_r = fromArray(SPECIES_256, data, i + di2),
364 z3_r = fromArray(SPECIES_256, data, i + di3),
365 z0_i = fromArray(SPECIES_256, data, i + im),
366 z1_i = fromArray(SPECIES_256, data, i + di + im),
367 z2_i = fromArray(SPECIES_256, data, i + di2 + im),
368 z3_i = fromArray(SPECIES_256, data, i + di3 + im);
369 final DoubleVector
370 t1_r = z0_r.add(z2_r),
371 t1_i = z0_i.add(z2_i),
372 t2_r = z1_r.add(z3_r),
373 t2_i = z1_i.add(z3_i),
374 t3_r = z0_r.sub(z2_r),
375 t3_i = z0_i.sub(z2_i),
376 t4_r = z1_r.sub(z3_r).mul(sign),
377 t4_i = z1_i.sub(z3_i).mul(sign);
378 t1_r.add(t2_r).intoArray(ret, j);
379 t3_r.sub(t4_i).intoArray(ret, j + dj);
380 t1_r.sub(t2_r).intoArray(ret, j + dj2);
381 t3_r.add(t4_i).intoArray(ret, j + dj3);
382 t1_i.add(t2_i).intoArray(ret, j + im);
383 t3_i.add(t4_r).intoArray(ret, j + dj + im);
384 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
385 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
386 }
387
388 j += jstep;
389 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
390 final double[] twids = twiddles[k];
391 DoubleVector
392 w1r = broadcast(SPECIES_256, twids[0]),
393 w1i = broadcast(SPECIES_256, -sign * twids[1]),
394 w2r = broadcast(SPECIES_256, twids[2]),
395 w2i = broadcast(SPECIES_256, -sign * twids[3]),
396 w3r = broadcast(SPECIES_256, twids[4]),
397 w3i = broadcast(SPECIES_256, -sign * twids[5]);
398 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
399 final DoubleVector
400 z0_r = fromArray(SPECIES_256, data, i),
401 z1_r = fromArray(SPECIES_256, data, i + di),
402 z2_r = fromArray(SPECIES_256, data, i + di2),
403 z3_r = fromArray(SPECIES_256, data, i + di3),
404 z0_i = fromArray(SPECIES_256, data, i + im),
405 z1_i = fromArray(SPECIES_256, data, i + di + im),
406 z2_i = fromArray(SPECIES_256, data, i + di2 + im),
407 z3_i = fromArray(SPECIES_256, data, i + di3 + im);
408 final DoubleVector
409 t1_r = z0_r.add(z2_r),
410 t1_i = z0_i.add(z2_i),
411 t2_r = z1_r.add(z3_r),
412 t2_i = z1_i.add(z3_i),
413 t3_r = z0_r.sub(z2_r),
414 t3_i = z0_i.sub(z2_i),
415 t4_r = z1_r.sub(z3_r).mul(sign),
416 t4_i = z1_i.sub(z3_i).mul(sign);
417 t1_r.add(t2_r).intoArray(ret, j);
418 t1_i.add(t2_i).intoArray(ret, j + im);
419 DoubleVector
420 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
421 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
422 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
423 w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
424 w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
425 w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
426 w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
427 w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
428 w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
429 }
430 }
431 }
432
433
434
435
436 private void blocked512(PassData passData) {
437 final double[] data = passData.in;
438 final double[] ret = passData.out;
439 int sign = passData.sign;
440 int i = passData.inOffset;
441 int j = passData.outOffset;
442
443 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
444 final DoubleVector
445 z0_r = fromArray(SPECIES_512, data, i),
446 z1_r = fromArray(SPECIES_512, data, i + di),
447 z2_r = fromArray(SPECIES_512, data, i + di2),
448 z3_r = fromArray(SPECIES_512, data, i + di3),
449 z0_i = fromArray(SPECIES_512, data, i + im),
450 z1_i = fromArray(SPECIES_512, data, i + di + im),
451 z2_i = fromArray(SPECIES_512, data, i + di2 + im),
452 z3_i = fromArray(SPECIES_512, data, i + di3 + im);
453 final DoubleVector
454 t1_r = z0_r.add(z2_r),
455 t1_i = z0_i.add(z2_i),
456 t2_r = z1_r.add(z3_r),
457 t2_i = z1_i.add(z3_i),
458 t3_r = z0_r.sub(z2_r),
459 t3_i = z0_i.sub(z2_i),
460 t4_r = z1_r.sub(z3_r).mul(sign),
461 t4_i = z1_i.sub(z3_i).mul(sign);
462 t1_r.add(t2_r).intoArray(ret, j);
463 t3_r.sub(t4_i).intoArray(ret, j + dj);
464 t1_r.sub(t2_r).intoArray(ret, j + dj2);
465 t3_r.add(t4_i).intoArray(ret, j + dj3);
466 t1_i.add(t2_i).intoArray(ret, j + im);
467 t3_i.add(t4_r).intoArray(ret, j + dj + im);
468 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
469 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
470 }
471
472 j += jstep;
473 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
474 final double[] twids = twiddles[k];
475 DoubleVector
476 w1r = broadcast(SPECIES_512, twids[0]),
477 w1i = broadcast(SPECIES_512, -sign * twids[1]),
478 w2r = broadcast(SPECIES_512, twids[2]),
479 w2i = broadcast(SPECIES_512, -sign * twids[3]),
480 w3r = broadcast(SPECIES_512, twids[4]),
481 w3i = broadcast(SPECIES_512, -sign * twids[5]);
482 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
483 final DoubleVector
484 z0_r = fromArray(SPECIES_512, data, i),
485 z1_r = fromArray(SPECIES_512, data, i + di),
486 z2_r = fromArray(SPECIES_512, data, i + di2),
487 z3_r = fromArray(SPECIES_512, data, i + di3),
488 z0_i = fromArray(SPECIES_512, data, i + im),
489 z1_i = fromArray(SPECIES_512, data, i + di + im),
490 z2_i = fromArray(SPECIES_512, data, i + di2 + im),
491 z3_i = fromArray(SPECIES_512, data, i + di3 + im);
492 final DoubleVector
493 t1_r = z0_r.add(z2_r),
494 t1_i = z0_i.add(z2_i),
495 t2_r = z1_r.add(z3_r),
496 t2_i = z1_i.add(z3_i),
497 t3_r = z0_r.sub(z2_r),
498 t3_i = z0_i.sub(z2_i),
499 t4_r = z1_r.sub(z3_r).mul(sign),
500 t4_i = z1_i.sub(z3_i).mul(sign);
501 t1_r.add(t2_r).intoArray(ret, j);
502 t1_i.add(t2_i).intoArray(ret, j + im);
503 DoubleVector
504 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
505 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
506 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
507 w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
508 w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
509 w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
510 w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
511 w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
512 w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
513 }
514 }
515 }
516
517
518
519
520 private void interleaved128(PassData passData) {
521 final double[] data = passData.in;
522 final double[] ret = passData.out;
523 int sign = passData.sign;
524 int i = passData.inOffset;
525 int j = passData.outOffset;
526
527 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_128, i += LENGTH_128, j += LENGTH_128) {
528 DoubleVector
529 z0 = fromArray(SPECIES_128, data, i),
530 z1 = fromArray(SPECIES_128, data, i + di),
531 z2 = fromArray(SPECIES_128, data, i + di2),
532 z3 = fromArray(SPECIES_128, data, i + di3);
533 DoubleVector
534 t1 = z0.add(z2),
535 t2 = z1.add(z3),
536 t3 = z0.sub(z2),
537 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
538 t1.add(t2).intoArray(ret, j);
539 t3.add(t4.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
540 t1.sub(t2).intoArray(ret, j + dj2);
541 t3.add(t4.mul(NEGATE_IM_128)).intoArray(ret, j + dj3);
542 }
543
544 j += jstep;
545 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
546 final double[] twids = twiddles[k];
547 DoubleVector
548 w1r = broadcast(SPECIES_128, twids[0]),
549 w1i = broadcast(SPECIES_128, -sign * twids[1]).mul(NEGATE_IM_128),
550 w2r = broadcast(SPECIES_128, twids[2]),
551 w2i = broadcast(SPECIES_128, -sign * twids[3]).mul(NEGATE_IM_128),
552 w3r = broadcast(SPECIES_128, twids[4]),
553 w3i = broadcast(SPECIES_128, -sign * twids[5]).mul(NEGATE_IM_128);
554 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_128, i += LENGTH_128, j += LENGTH_128) {
555 DoubleVector
556 z0 = fromArray(SPECIES_128, data, i),
557 z1 = fromArray(SPECIES_128, data, i + di),
558 z2 = fromArray(SPECIES_128, data, i + di2),
559 z3 = fromArray(SPECIES_128, data, i + di3);
560 DoubleVector
561 t1 = z0.add(z2),
562 t2 = z1.add(z3),
563 t3 = z0.sub(z2),
564 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
565 t1.add(t2).intoArray(ret, j);
566 DoubleVector
567 x1 = t3.add(t4.mul(NEGATE_RE_128)),
568 x2 = t1.sub(t2),
569 x3 = t3.add(t4.mul(NEGATE_IM_128));
570 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
571 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
572 w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj3);
573 }
574 }
575 }
576
577
578
579
580 private void interleaved256(PassData passData) {
581 final double[] data = passData.in;
582 final double[] ret = passData.out;
583 int sign = passData.sign;
584 int i = passData.inOffset;
585 int j = passData.outOffset;
586
587 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_256, i += LENGTH_256, j += LENGTH_256) {
588 DoubleVector
589 z0 = fromArray(SPECIES_256, data, i),
590 z1 = fromArray(SPECIES_256, data, i + di),
591 z2 = fromArray(SPECIES_256, data, i + di2),
592 z3 = fromArray(SPECIES_256, data, i + di3);
593 DoubleVector
594 t1 = z0.add(z2),
595 t2 = z1.add(z3),
596 t3 = z0.sub(z2),
597 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
598 t1.add(t2).intoArray(ret, j);
599 t3.add(t4.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
600 t1.sub(t2).intoArray(ret, j + dj2);
601 t3.add(t4.mul(NEGATE_IM_256)).intoArray(ret, j + dj3);
602 }
603
604 j += jstep;
605 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
606 final double[] twids = twiddles[k];
607 DoubleVector
608 w1r = broadcast(SPECIES_256, twids[0]),
609 w1i = broadcast(SPECIES_256, -sign * twids[1]).mul(NEGATE_IM_256),
610 w2r = broadcast(SPECIES_256, twids[2]),
611 w2i = broadcast(SPECIES_256, -sign * twids[3]).mul(NEGATE_IM_256),
612 w3r = broadcast(SPECIES_256, twids[4]),
613 w3i = broadcast(SPECIES_256, -sign * twids[5]).mul(NEGATE_IM_256);
614 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_256, i += LENGTH_256, j += LENGTH_256) {
615 DoubleVector
616 z0 = fromArray(SPECIES_256, data, i),
617 z1 = fromArray(SPECIES_256, data, i + di),
618 z2 = fromArray(SPECIES_256, data, i + di2),
619 z3 = fromArray(SPECIES_256, data, i + di3);
620 DoubleVector
621 t1 = z0.add(z2),
622 t2 = z1.add(z3),
623 t3 = z0.sub(z2),
624 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
625 t1.add(t2).intoArray(ret, j);
626 DoubleVector
627 x1 = t3.add(t4.mul(NEGATE_RE_256)),
628 x2 = t1.sub(t2),
629 x3 = t3.add(t4.mul(NEGATE_IM_256));
630 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
631 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
632 w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj3);
633 }
634 }
635 }
636
637
638
639
640 private void interleaved512(PassData passData) {
641 final double[] data = passData.in;
642 final double[] ret = passData.out;
643 int sign = passData.sign;
644 int i = passData.inOffset;
645 int j = passData.outOffset;
646
647 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_512, i += LENGTH_512, j += LENGTH_512) {
648 DoubleVector
649 z0 = fromArray(SPECIES_512, data, i),
650 z1 = fromArray(SPECIES_512, data, i + di),
651 z2 = fromArray(SPECIES_512, data, i + di2),
652 z3 = fromArray(SPECIES_512, data, i + di3);
653 DoubleVector
654 t1 = z0.add(z2),
655 t2 = z1.add(z3),
656 t3 = z0.sub(z2),
657 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
658 t1.add(t2).intoArray(ret, j);
659 t3.add(t4.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
660 t1.sub(t2).intoArray(ret, j + dj2);
661 t3.add(t4.mul(NEGATE_IM_512)).intoArray(ret, j + dj3);
662 }
663
664 j += jstep;
665 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
666 final double[] twids = twiddles[k];
667 DoubleVector
668 w1r = broadcast(SPECIES_512, twids[0]),
669 w1i = broadcast(SPECIES_512, -sign * twids[1]).mul(NEGATE_IM_512),
670 w2r = broadcast(SPECIES_512, twids[2]),
671 w2i = broadcast(SPECIES_512, -sign * twids[3]).mul(NEGATE_IM_512),
672 w3r = broadcast(SPECIES_512, twids[4]),
673 w3i = broadcast(SPECIES_512, -sign * twids[5]).mul(NEGATE_IM_512);
674 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_512, i += LENGTH_512, j += LENGTH_512) {
675 DoubleVector
676 z0 = fromArray(SPECIES_512, data, i),
677 z1 = fromArray(SPECIES_512, data, i + di),
678 z2 = fromArray(SPECIES_512, data, i + di2),
679 z3 = fromArray(SPECIES_512, data, i + di3);
680 DoubleVector
681 t1 = z0.add(z2),
682 t2 = z1.add(z3),
683 t3 = z0.sub(z2),
684 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
685 t1.add(t2).intoArray(ret, j);
686 DoubleVector
687 x1 = t3.add(t4.mul(NEGATE_RE_512)),
688 x2 = t1.sub(t2),
689 x3 = t3.add(t4.mul(NEGATE_IM_512));
690 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
691 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
692 w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj3);
693 }
694 }
695 }
696 }