1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 package ffx.numerics.fft;
39
40 import jdk.incubator.vector.DoubleVector;
41
42 import static jdk.incubator.vector.DoubleVector.SPECIES_128;
43 import static jdk.incubator.vector.DoubleVector.SPECIES_256;
44 import static jdk.incubator.vector.DoubleVector.SPECIES_512;
45 import static jdk.incubator.vector.DoubleVector.broadcast;
46 import static jdk.incubator.vector.DoubleVector.fromArray;
47
48
49
50
51 public class MixedRadixFactor4 extends MixedRadixFactor {
52
53 private final int di2;
54 private final int di3;
55 private final int dj2;
56 private final int dj3;
57
58
59
60
61
62 public MixedRadixFactor4(PassConstants passConstants) {
63 super(passConstants);
64 di2 = 2 * di;
65 di3 = 3 * di;
66 dj2 = 2 * dj;
67 dj3 = 3 * dj;
68 }
69
70
71
72
73 @Override
74 protected void passScalar(PassData passData) {
75 final double[] data = passData.in;
76 final double[] ret = passData.out;
77 int sign = passData.sign;
78 int i = passData.inOffset;
79 int j = passData.outOffset;
80
81 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
82 final double z0_r = data[i];
83 final double z1_r = data[i + di];
84 final double z2_r = data[i + di2];
85 final double z3_r = data[i + di3];
86 final double z0_i = data[i + im];
87 final double z1_i = data[i + di + im];
88 final double z2_i = data[i + di2 + im];
89 final double z3_i = data[i + di3 + im];
90 final double t1_r = z0_r + z2_r;
91 final double t1_i = z0_i + z2_i;
92 final double t2_r = z1_r + z3_r;
93 final double t2_i = z1_i + z3_i;
94 final double t3_r = z0_r - z2_r;
95 final double t3_i = z0_i - z2_i;
96 final double t4_r = sign * (z1_r - z3_r);
97 final double t4_i = sign * (z1_i - z3_i);
98 ret[j] = t1_r + t2_r;
99 ret[j + im] = t1_i + t2_i;
100 ret[j + dj] = t3_r - t4_i;
101 ret[j + dj + im] = t3_i + t4_r;
102 ret[j + dj2] = t1_r - t2_r;
103 ret[j + dj2 + im] = t1_i - t2_i;
104 ret[j + dj3] = t3_r + t4_i;
105 ret[j + dj3 + im] = t3_i - t4_r;
106 }
107 j += jstep;
108 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
109 final int index = k * 3;
110 final double w1_r = wr[index];
111 final double w2_r = wr[index + 1];
112 final double w3_r = wr[index + 2];
113 final double w1_i = -sign * wi[index];
114 final double w2_i = -sign * wi[index + 1];
115 final double w3_i = -sign * wi[index + 2];
116 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
117 final double z0_r = data[i];
118 final double z1_r = data[i + di];
119 final double z2_r = data[i + di2];
120 final double z3_r = data[i + di3];
121 final double z0_i = data[i + im];
122 final double z1_i = data[i + di + im];
123 final double z2_i = data[i + di2 + im];
124 final double z3_i = data[i + di3 + im];
125 final double t1_r = z0_r + z2_r;
126 final double t1_i = z0_i + z2_i;
127 final double t2_r = z1_r + z3_r;
128 final double t2_i = z1_i + z3_i;
129 final double t3_r = z0_r - z2_r;
130 final double t3_i = z0_i - z2_i;
131 final double t4_r = sign * (z1_r - z3_r);
132 final double t4_i = sign * (z1_i - z3_i);
133 ret[j] = t1_r + t2_r;
134 ret[j + im] = t1_i + t2_i;
135 multiplyAndStore(t3_r - t4_i, t3_i + t4_r, w1_r, w1_i, ret, j + dj, j + dj + im);
136 multiplyAndStore(t1_r - t2_r, t1_i - t2_i, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
137 multiplyAndStore(t3_r + t4_i, t3_i - t4_r, w3_r, w3_i, ret, j + dj3, j + dj3 + im);
138 }
139 }
140 }
141
142
143
144
145 @Override
146 protected void passSIMD(PassData passData) {
147 if (im == 1) {
148 interleaved(passData);
149 } else {
150 blocked(passData);
151 }
152 }
153
154
155
156
157 private static int[] simdSizes = {8, 4, 2};
158
159
160
161
162
163
164 private void interleaved(PassData passData, int simdLength) {
165
166 switch (simdLength) {
167 case 2:
168
169 interleaved128(passData);
170 break;
171 case 4:
172
173 interleaved256(passData);
174 break;
175 case 8:
176
177 interleaved512(passData);
178 break;
179 default:
180 passScalar(passData);
181 }
182 }
183
184
185
186
187
188
189 private void blocked(PassData passData, int simdLength) {
190
191 switch (simdLength) {
192 case 2:
193
194 blocked128(passData);
195 break;
196 case 4:
197
198 blocked256(passData);
199 break;
200 case 8:
201
202 blocked512(passData);
203 break;
204 default:
205 passScalar(passData);
206 }
207 }
208
209
210
211
212
213 private void interleaved(PassData passData) {
214 if (innerLoopLimit % INTERLEAVED_LOOP == 0) {
215
216 interleaved(passData, LENGTH);
217 } else {
218
219 if (innerLoopLimit % 2 != 0 && innerLoopLimit != 1) {
220 passScalar(passData);
221 return;
222 }
223
224 for (int size : simdSizes) {
225 if (size >= LENGTH) {
226
227 continue;
228 }
229
230 if (innerLoopLimit % (size / 2) == 0) {
231 interleaved(passData, size);
232 }
233 }
234 }
235 }
236
237
238
239
240
241 private void blocked(PassData passData) {
242 if (innerLoopLimit % BLOCK_LOOP == 0) {
243
244 blocked(passData, LENGTH);
245 } else {
246
247 if (innerLoopLimit % 2 != 0) {
248 passScalar(passData);
249 return;
250 }
251
252 for (int size : simdSizes) {
253 if (size >= LENGTH) {
254
255 continue;
256 }
257
258 if (innerLoopLimit % size == 0) {
259 blocked(passData, size);
260 }
261 }
262 }
263 }
264
265
266
267
268 private void blocked128(PassData passData) {
269 final double[] data = passData.in;
270 final double[] ret = passData.out;
271 int sign = passData.sign;
272 int i = passData.inOffset;
273 int j = passData.outOffset;
274
275 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
276 final DoubleVector
277 z0_r = fromArray(SPECIES_128, data, i),
278 z0_i = fromArray(SPECIES_128, data, i + im),
279 z1_r = fromArray(SPECIES_128, data, i + di),
280 z1_i = fromArray(SPECIES_128, data, i + di + im),
281 z2_r = fromArray(SPECIES_128, data, i + di2),
282 z2_i = fromArray(SPECIES_128, data, i + di2 + im),
283 z3_r = fromArray(SPECIES_128, data, i + di3),
284 z3_i = fromArray(SPECIES_128, data, i + di3 + im);
285 final DoubleVector
286 t1_r = z0_r.add(z2_r),
287 t1_i = z0_i.add(z2_i),
288 t2_r = z1_r.add(z3_r),
289 t2_i = z1_i.add(z3_i),
290 t3_r = z0_r.sub(z2_r),
291 t3_i = z0_i.sub(z2_i),
292 t4_r = z1_r.sub(z3_r).mul(sign),
293 t4_i = z1_i.sub(z3_i).mul(sign);
294 t1_r.add(t2_r).intoArray(ret, j);
295 t1_i.add(t2_i).intoArray(ret, j + im);
296 t3_r.sub(t4_i).intoArray(ret, j + dj);
297 t3_i.add(t4_r).intoArray(ret, j + dj + im);
298 t1_r.sub(t2_r).intoArray(ret, j + dj2);
299 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
300 t3_r.add(t4_i).intoArray(ret, j + dj3);
301 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
302 }
303
304 j += jstep;
305 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
306 final int index = 3 * k;
307 final DoubleVector
308 w1r = broadcast(SPECIES_128, wr[index]),
309 w2r = broadcast(SPECIES_128, wr[index + 1]),
310 w3r = broadcast(SPECIES_128, wr[index + 2]),
311 w1i = broadcast(SPECIES_128, -sign * wi[index]),
312 w2i = broadcast(SPECIES_128, -sign * wi[index + 1]),
313 w3i = broadcast(SPECIES_128, -sign * wi[index + 2]);
314 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
315 final DoubleVector
316 z0_r = fromArray(SPECIES_128, data, i),
317 z0_i = fromArray(SPECIES_128, data, i + im),
318 z1_r = fromArray(SPECIES_128, data, i + di),
319 z1_i = fromArray(SPECIES_128, data, i + di + im),
320 z2_r = fromArray(SPECIES_128, data, i + di2),
321 z2_i = fromArray(SPECIES_128, data, i + di2 + im),
322 z3_r = fromArray(SPECIES_128, data, i + di3),
323 z3_i = fromArray(SPECIES_128, data, i + di3 + im);
324 final DoubleVector
325 t1_r = z0_r.add(z2_r),
326 t1_i = z0_i.add(z2_i),
327 t2_r = z1_r.add(z3_r),
328 t2_i = z1_i.add(z3_i),
329 t3_r = z0_r.sub(z2_r),
330 t3_i = z0_i.sub(z2_i),
331 t4_r = z1_r.sub(z3_r).mul(sign),
332 t4_i = z1_i.sub(z3_i).mul(sign);
333 t1_r.add(t2_r).intoArray(ret, j);
334 t1_i.add(t2_i).intoArray(ret, j + im);
335 DoubleVector
336 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
337 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
338 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
339 w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
340 w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
341 w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
342 w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
343 w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
344 w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
345 }
346 }
347 }
348
349
350
351
352 private void blocked256(PassData passData) {
353 final double[] data = passData.in;
354 final double[] ret = passData.out;
355 int sign = passData.sign;
356 int i = passData.inOffset;
357 int j = passData.outOffset;
358
359 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
360 final DoubleVector
361 z0_r = fromArray(SPECIES_256, data, i),
362 z1_r = fromArray(SPECIES_256, data, i + di),
363 z2_r = fromArray(SPECIES_256, data, i + di2),
364 z3_r = fromArray(SPECIES_256, data, i + di3),
365 z0_i = fromArray(SPECIES_256, data, i + im),
366 z1_i = fromArray(SPECIES_256, data, i + di + im),
367 z2_i = fromArray(SPECIES_256, data, i + di2 + im),
368 z3_i = fromArray(SPECIES_256, data, i + di3 + im);
369 final DoubleVector
370 t1_r = z0_r.add(z2_r),
371 t1_i = z0_i.add(z2_i),
372 t2_r = z1_r.add(z3_r),
373 t2_i = z1_i.add(z3_i),
374 t3_r = z0_r.sub(z2_r),
375 t3_i = z0_i.sub(z2_i),
376 t4_r = z1_r.sub(z3_r).mul(sign),
377 t4_i = z1_i.sub(z3_i).mul(sign);
378 t1_r.add(t2_r).intoArray(ret, j);
379 t3_r.sub(t4_i).intoArray(ret, j + dj);
380 t1_r.sub(t2_r).intoArray(ret, j + dj2);
381 t3_r.add(t4_i).intoArray(ret, j + dj3);
382 t1_i.add(t2_i).intoArray(ret, j + im);
383 t3_i.add(t4_r).intoArray(ret, j + dj + im);
384 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
385 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
386 }
387
388 j += jstep;
389 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
390 final int index = 3 * k;
391 final DoubleVector
392 w1r = broadcast(SPECIES_256, wr[index]),
393 w2r = broadcast(SPECIES_256, wr[index + 1]),
394 w3r = broadcast(SPECIES_256, wr[index + 2]),
395 w1i = broadcast(SPECIES_256, -sign * wi[index]),
396 w2i = broadcast(SPECIES_256, -sign * wi[index + 1]),
397 w3i = broadcast(SPECIES_256, -sign * wi[index + 2]);
398 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
399 final DoubleVector
400 z0_r = fromArray(SPECIES_256, data, i),
401 z1_r = fromArray(SPECIES_256, data, i + di),
402 z2_r = fromArray(SPECIES_256, data, i + di2),
403 z3_r = fromArray(SPECIES_256, data, i + di3),
404 z0_i = fromArray(SPECIES_256, data, i + im),
405 z1_i = fromArray(SPECIES_256, data, i + di + im),
406 z2_i = fromArray(SPECIES_256, data, i + di2 + im),
407 z3_i = fromArray(SPECIES_256, data, i + di3 + im);
408 final DoubleVector
409 t1_r = z0_r.add(z2_r),
410 t1_i = z0_i.add(z2_i),
411 t2_r = z1_r.add(z3_r),
412 t2_i = z1_i.add(z3_i),
413 t3_r = z0_r.sub(z2_r),
414 t3_i = z0_i.sub(z2_i),
415 t4_r = z1_r.sub(z3_r).mul(sign),
416 t4_i = z1_i.sub(z3_i).mul(sign);
417 t1_r.add(t2_r).intoArray(ret, j);
418 t1_i.add(t2_i).intoArray(ret, j + im);
419 DoubleVector
420 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
421 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
422 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
423 w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
424 w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
425 w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
426 w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
427 w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
428 w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
429 }
430 }
431 }
432
433
434
435
436 private void blocked512(PassData passData) {
437 final double[] data = passData.in;
438 final double[] ret = passData.out;
439 int sign = passData.sign;
440 int i = passData.inOffset;
441 int j = passData.outOffset;
442
443 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
444 final DoubleVector
445 z0_r = fromArray(SPECIES_512, data, i),
446 z1_r = fromArray(SPECIES_512, data, i + di),
447 z2_r = fromArray(SPECIES_512, data, i + di2),
448 z3_r = fromArray(SPECIES_512, data, i + di3),
449 z0_i = fromArray(SPECIES_512, data, i + im),
450 z1_i = fromArray(SPECIES_512, data, i + di + im),
451 z2_i = fromArray(SPECIES_512, data, i + di2 + im),
452 z3_i = fromArray(SPECIES_512, data, i + di3 + im);
453 final DoubleVector
454 t1_r = z0_r.add(z2_r),
455 t1_i = z0_i.add(z2_i),
456 t2_r = z1_r.add(z3_r),
457 t2_i = z1_i.add(z3_i),
458 t3_r = z0_r.sub(z2_r),
459 t3_i = z0_i.sub(z2_i),
460 t4_r = z1_r.sub(z3_r).mul(sign),
461 t4_i = z1_i.sub(z3_i).mul(sign);
462 t1_r.add(t2_r).intoArray(ret, j);
463 t3_r.sub(t4_i).intoArray(ret, j + dj);
464 t1_r.sub(t2_r).intoArray(ret, j + dj2);
465 t3_r.add(t4_i).intoArray(ret, j + dj3);
466 t1_i.add(t2_i).intoArray(ret, j + im);
467 t3_i.add(t4_r).intoArray(ret, j + dj + im);
468 t1_i.sub(t2_i).intoArray(ret, j + dj2 + im);
469 t3_i.sub(t4_r).intoArray(ret, j + dj3 + im);
470 }
471
472 j += jstep;
473 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
474 final int index = 3 * k;
475 final DoubleVector
476 w1r = broadcast(SPECIES_512, wr[index]),
477 w2r = broadcast(SPECIES_512, wr[index + 1]),
478 w3r = broadcast(SPECIES_512, wr[index + 2]),
479 w1i = broadcast(SPECIES_512, -sign * wi[index]),
480 w2i = broadcast(SPECIES_512, -sign * wi[index + 1]),
481 w3i = broadcast(SPECIES_512, -sign * wi[index + 2]);
482 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
483 final DoubleVector
484 z0_r = fromArray(SPECIES_512, data, i),
485 z1_r = fromArray(SPECIES_512, data, i + di),
486 z2_r = fromArray(SPECIES_512, data, i + di2),
487 z3_r = fromArray(SPECIES_512, data, i + di3),
488 z0_i = fromArray(SPECIES_512, data, i + im),
489 z1_i = fromArray(SPECIES_512, data, i + di + im),
490 z2_i = fromArray(SPECIES_512, data, i + di2 + im),
491 z3_i = fromArray(SPECIES_512, data, i + di3 + im);
492 final DoubleVector
493 t1_r = z0_r.add(z2_r),
494 t1_i = z0_i.add(z2_i),
495 t2_r = z1_r.add(z3_r),
496 t2_i = z1_i.add(z3_i),
497 t3_r = z0_r.sub(z2_r),
498 t3_i = z0_i.sub(z2_i),
499 t4_r = z1_r.sub(z3_r).mul(sign),
500 t4_i = z1_i.sub(z3_i).mul(sign);
501 t1_r.add(t2_r).intoArray(ret, j);
502 t1_i.add(t2_i).intoArray(ret, j + im);
503 DoubleVector
504 x1_r = t3_r.sub(t4_i), x1_i = t3_i.add(t4_r),
505 x2_r = t1_r.sub(t2_r), x2_i = t1_i.sub(t2_i),
506 x3_r = t3_r.add(t4_i), x3_i = t3_i.sub(t4_r);
507 w1r.mul(x1_r).add(w1i.neg().mul(x1_i)).intoArray(ret, j + dj);
508 w2r.mul(x2_r).add(w2i.neg().mul(x2_i)).intoArray(ret, j + dj2);
509 w3r.mul(x3_r).add(w3i.neg().mul(x3_i)).intoArray(ret, j + dj3);
510 w1r.mul(x1_i).add(w1i.mul(x1_r)).intoArray(ret, j + dj + im);
511 w2r.mul(x2_i).add(w2i.mul(x2_r)).intoArray(ret, j + dj2 + im);
512 w3r.mul(x3_i).add(w3i.mul(x3_r)).intoArray(ret, j + dj3 + im);
513 }
514 }
515 }
516
517
518
519
520 private void interleaved128(PassData passData) {
521 final double[] data = passData.in;
522 final double[] ret = passData.out;
523 int sign = passData.sign;
524 int i = passData.inOffset;
525 int j = passData.outOffset;
526
527 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
528 DoubleVector
529 z0 = fromArray(SPECIES_128, data, i),
530 z1 = fromArray(SPECIES_128, data, i + di),
531 z2 = fromArray(SPECIES_128, data, i + di2),
532 z3 = fromArray(SPECIES_128, data, i + di3);
533 DoubleVector
534 t1 = z0.add(z2),
535 t2 = z1.add(z3),
536 t3 = z0.sub(z2),
537 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
538 t1.add(t2).intoArray(ret, j);
539 t3.add(t4.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
540 t1.sub(t2).intoArray(ret, j + dj2);
541 t3.add(t4.mul(NEGATE_IM_128)).intoArray(ret, j + dj3);
542 }
543
544 j += jstep;
545 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
546
547
548
549
550
551
552
553
554 final int index = 3 * k;
555 DoubleVector
556 w1r = broadcast(SPECIES_128, wr[index]),
557 w2r = broadcast(SPECIES_128, wr[index + 1]),
558 w3r = broadcast(SPECIES_128, wr[index + 2]),
559 w1i = broadcast(SPECIES_128, -sign * wi[index]).mul(NEGATE_IM_128),
560 w2i = broadcast(SPECIES_128, -sign * wi[index + 1]).mul(NEGATE_IM_128),
561 w3i = broadcast(SPECIES_128, -sign * wi[index + 2]).mul(NEGATE_IM_128);
562 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
563 DoubleVector
564 z0 = fromArray(SPECIES_128, data, i),
565 z1 = fromArray(SPECIES_128, data, i + di),
566 z2 = fromArray(SPECIES_128, data, i + di2),
567 z3 = fromArray(SPECIES_128, data, i + di3);
568 DoubleVector
569 t1 = z0.add(z2),
570 t2 = z1.add(z3),
571 t3 = z0.sub(z2),
572 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_128);
573 t1.add(t2).intoArray(ret, j);
574 DoubleVector
575 x1 = t3.add(t4.mul(NEGATE_RE_128)),
576 x2 = t1.sub(t2),
577 x3 = t3.add(t4.mul(NEGATE_IM_128));
578 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
579 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
580 w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj3);
581 }
582 }
583 }
584
585
586
587
588 private void interleaved256(PassData passData) {
589 final double[] data = passData.in;
590 final double[] ret = passData.out;
591 int sign = passData.sign;
592 int i = passData.inOffset;
593 int j = passData.outOffset;
594
595 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
596 DoubleVector
597 z0 = fromArray(SPECIES_256, data, i),
598 z1 = fromArray(SPECIES_256, data, i + di),
599 z2 = fromArray(SPECIES_256, data, i + di2),
600 z3 = fromArray(SPECIES_256, data, i + di3);
601 DoubleVector
602 t1 = z0.add(z2),
603 t2 = z1.add(z3),
604 t3 = z0.sub(z2),
605 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
606 t1.add(t2).intoArray(ret, j);
607 t3.add(t4.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
608 t1.sub(t2).intoArray(ret, j + dj2);
609 t3.add(t4.mul(NEGATE_IM_256)).intoArray(ret, j + dj3);
610 }
611
612 j += jstep;
613 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
614
615
616
617
618
619
620
621
622 final int index = 3 * k;
623 DoubleVector
624 w1r = broadcast(SPECIES_256, wr[index]),
625 w2r = broadcast(SPECIES_256, wr[index + 1]),
626 w3r = broadcast(SPECIES_256, wr[index + 2]),
627 w1i = broadcast(SPECIES_256, -sign * wi[index]).mul(NEGATE_IM_256),
628 w2i = broadcast(SPECIES_256, -sign * wi[index + 1]).mul(NEGATE_IM_256),
629 w3i = broadcast(SPECIES_256, -sign * wi[index + 2]).mul(NEGATE_IM_256);
630 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
631 DoubleVector
632 z0 = fromArray(SPECIES_256, data, i),
633 z1 = fromArray(SPECIES_256, data, i + di),
634 z2 = fromArray(SPECIES_256, data, i + di2),
635 z3 = fromArray(SPECIES_256, data, i + di3);
636 DoubleVector
637 t1 = z0.add(z2),
638 t2 = z1.add(z3),
639 t3 = z0.sub(z2),
640 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_256);
641 t1.add(t2).intoArray(ret, j);
642 DoubleVector
643 x1 = t3.add(t4.mul(NEGATE_RE_256)),
644 x2 = t1.sub(t2),
645 x3 = t3.add(t4.mul(NEGATE_IM_256));
646 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
647 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
648 w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj3);
649 }
650 }
651 }
652
653
654
655
656 private void interleaved512(PassData passData) {
657 final double[] data = passData.in;
658 final double[] ret = passData.out;
659 int sign = passData.sign;
660 int i = passData.inOffset;
661 int j = passData.outOffset;
662
663 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
664 DoubleVector
665 z0 = fromArray(SPECIES_512, data, i),
666 z1 = fromArray(SPECIES_512, data, i + di),
667 z2 = fromArray(SPECIES_512, data, i + di2),
668 z3 = fromArray(SPECIES_512, data, i + di3);
669 DoubleVector
670 t1 = z0.add(z2),
671 t2 = z1.add(z3),
672 t3 = z0.sub(z2),
673 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
674 t1.add(t2).intoArray(ret, j);
675 t3.add(t4.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
676 t1.sub(t2).intoArray(ret, j + dj2);
677 t3.add(t4.mul(NEGATE_IM_512)).intoArray(ret, j + dj3);
678 }
679
680 j += jstep;
681 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
682
683
684
685
686
687
688
689
690 final int index = 3 * k;
691 DoubleVector
692 w1r = broadcast(SPECIES_512, wr[index]),
693 w2r = broadcast(SPECIES_512, wr[index + 1]),
694 w3r = broadcast(SPECIES_512, wr[index + 2]),
695 w1i = broadcast(SPECIES_512, -sign * wi[index]).mul(NEGATE_IM_512),
696 w2i = broadcast(SPECIES_512, -sign * wi[index + 1]).mul(NEGATE_IM_512),
697 w3i = broadcast(SPECIES_512, -sign * wi[index + 2]).mul(NEGATE_IM_512);
698 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
699 DoubleVector
700 z0 = fromArray(SPECIES_512, data, i),
701 z1 = fromArray(SPECIES_512, data, i + di),
702 z2 = fromArray(SPECIES_512, data, i + di2),
703 z3 = fromArray(SPECIES_512, data, i + di3);
704 DoubleVector
705 t1 = z0.add(z2),
706 t2 = z1.add(z3),
707 t3 = z0.sub(z2),
708 t4 = z1.sub(z3).mul(sign).rearrange(SHUFFLE_RE_IM_512);
709 t1.add(t2).intoArray(ret, j);
710 DoubleVector
711 x1 = t3.add(t4.mul(NEGATE_RE_512)),
712 x2 = t1.sub(t2),
713 x3 = t3.add(t4.mul(NEGATE_IM_512));
714 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
715 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
716 w3r.mul(x3).add(w3i.mul(x3).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj3);
717 }
718 }
719 }
720 }