1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 package ffx.numerics.fft;
39
40 import jdk.incubator.vector.DoubleVector;
41
42 import static java.lang.Math.fma;
43 import static jdk.incubator.vector.DoubleVector.SPECIES_128;
44 import static jdk.incubator.vector.DoubleVector.SPECIES_256;
45 import static jdk.incubator.vector.DoubleVector.SPECIES_512;
46 import static jdk.incubator.vector.DoubleVector.broadcast;
47 import static jdk.incubator.vector.DoubleVector.fromArray;
48 import static org.apache.commons.math3.util.FastMath.sqrt;
49
50
51
52
53 public class MixedRadixFactor3 extends MixedRadixFactor {
54
55 private static final double sqrt3_2 = sqrt(3.0) / 2.0;
56 private final int di2;
57 private final int dj2;
58
59
60
61
62 private static int[] simdSizes = {8, 4, 2};
63
64
65
66
67
68
69 public MixedRadixFactor3(PassConstants passConstants) {
70 super(passConstants);
71 di2 = 2 * di;
72 dj2 = 2 * dj;
73 }
74
75
76
77
78
79
80 public boolean isValidSIMDWidth(int width) {
81
82 if (width != 2 && width != 4 && width != 8) {
83 return false;
84 }
85 if (im == 1) {
86
87 return innerLoopLimit % (width / 2) == 0;
88 } else {
89
90 return innerLoopLimit % width == 0;
91 }
92 }
93
94
95
96
97
98
99 public int getOptimalSIMDWidth() {
100
101 if (isValidSIMDWidth(LENGTH)) {
102 return LENGTH;
103 }
104
105 for (int size : simdSizes) {
106 if (size >= LENGTH) {
107
108 continue;
109 }
110 if (isValidSIMDWidth(size)) {
111 return size;
112 }
113 }
114
115 return 0;
116 }
117
118
119
120
121 @Override
122 protected void passScalar(PassData passData) {
123 final double[] data = passData.in;
124 final double[] ret = passData.out;
125 int sign = passData.sign;
126 int i = passData.inOffset;
127 int j = passData.outOffset;
128 final double tau = sign * sqrt3_2;
129
130
131 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
132 final double z0_r = data[i];
133 final double z1_r = data[i + di];
134 final double z2_r = data[i + di2];
135 final double z0_i = data[i + im];
136 final double z1_i = data[i + di + im];
137 final double z2_i = data[i + di2 + im];
138 final double t1_r = z1_r + z2_r;
139 final double t1_i = z1_i + z2_i;
140 final double t2_r = fma(-0.5, t1_r, z0_r);
141 final double t2_i = fma(-0.5, t1_i, z0_i);
142 final double t3_r = tau * (z1_r - z2_r);
143 final double t3_i = tau * (z1_i - z2_i);
144 ret[j] = z0_r + t1_r;
145 ret[j + im] = z0_i + t1_i;
146 ret[j + dj] = t2_r - t3_i;
147 ret[j + dj + im] = t2_i + t3_r;
148 ret[j + dj2] = t2_r + t3_i;
149 ret[j + dj2 + im] = t2_i - t3_r;
150 }
151 j += jstep;
152 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
153 final int index = k * 2;
154 final double w1_r = wr[index];
155 final double w2_r = wr[index + 1];
156 final double w1_i = -sign * wi[index];
157 final double w2_i = -sign * wi[index + 1];
158 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
159 final double z0_r = data[i];
160 final double z1_r = data[i + di];
161 final double z2_r = data[i + di2];
162 final double z0_i = data[i + im];
163 final double z1_i = data[i + di + im];
164 final double z2_i = data[i + di2 + im];
165 final double t1_r = z1_r + z2_r;
166 final double t1_i = z1_i + z2_i;
167 final double t2_r = fma(-0.5, t1_r, z0_r);
168 final double t2_i = fma(-0.5, t1_i, z0_i);
169 final double t3_r = tau * (z1_r - z2_r);
170 final double t3_i = tau * (z1_i - z2_i);
171 ret[j] = z0_r + t1_r;
172 ret[j + im] = z0_i + t1_i;
173 multiplyAndStore(t2_r - t3_i, t2_i + t3_r, w1_r, w1_i, ret, j + dj, j + dj + im);
174 multiplyAndStore(t2_r + t3_i, t2_i - t3_r, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
175 }
176 }
177 }
178
179
180
181
182
183
184 @Override
185 protected void passSIMD(PassData passData) {
186 if (!isValidSIMDWidth(simdWidth)) {
187 passScalar(passData);
188 } else {
189 if (im == 1) {
190 interleaved(passData, simdWidth);
191 } else {
192 blocked(passData, simdWidth);
193 }
194 }
195 }
196
197
198
199
200
201
202
203 private void interleaved(PassData passData, int simdLength) {
204
205 switch (simdLength) {
206 case 2:
207
208 interleaved128(passData);
209 break;
210 case 4:
211
212 interleaved256(passData);
213 break;
214 case 8:
215
216 interleaved512(passData);
217 break;
218 default:
219 passScalar(passData);
220 }
221 }
222
223
224
225
226
227
228
229 private void blocked(PassData passData, int simdLength) {
230
231 switch (simdLength) {
232 case 2:
233
234 blocked128(passData);
235 break;
236 case 4:
237
238 blocked256(passData);
239 break;
240 case 8:
241
242 blocked512(passData);
243 break;
244 default:
245 passScalar(passData);
246 }
247 }
248
249
250
251
252 private void blocked128(PassData passData) {
253 final double[] data = passData.in;
254 final double[] ret = passData.out;
255 int sign = passData.sign;
256 int i = passData.inOffset;
257 int j = passData.outOffset;
258 final double tau = sign * sqrt3_2;
259
260
261 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
262 final DoubleVector
263 z0_r = fromArray(SPECIES_128, data, i),
264 z1_r = fromArray(SPECIES_128, data, i + di),
265 z2_r = fromArray(SPECIES_128, data, i + di2),
266 z0_i = fromArray(SPECIES_128, data, i + im),
267 z1_i = fromArray(SPECIES_128, data, i + di + im),
268 z2_i = fromArray(SPECIES_128, data, i + di2 + im);
269 final DoubleVector
270 t1_r = z1_r.add(z2_r),
271 t1_i = z1_i.add(z2_i),
272 t2_r = t1_r.mul(-0.5).add(z0_r),
273 t2_i = t1_i.mul(-0.5).add(z0_i),
274 t3_r = z1_r.sub(z2_r).mul(tau),
275 t3_i = z1_i.sub(z2_i).mul(tau);
276 z0_r.add(t1_r).intoArray(ret, j);
277 z0_i.add(t1_i).intoArray(ret, j + im);
278 t2_r.sub(t3_i).intoArray(ret, j + dj);
279 t2_i.add(t3_r).intoArray(ret, j + dj + im);
280 t2_r.add(t3_i).intoArray(ret, j + dj2);
281 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
282 }
283
284 j += jstep;
285 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
286 final int index = k * 2;
287 final double
288 w1_r = wr[index],
289 w2_r = wr[index + 1],
290 w1_i = -sign * wi[index],
291 w2_i = -sign * wi[index + 1];
292 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
293 final DoubleVector
294 z0_r = fromArray(SPECIES_128, data, i),
295 z1_r = fromArray(SPECIES_128, data, i + di),
296 z2_r = fromArray(SPECIES_128, data, i + di2),
297 z0_i = fromArray(SPECIES_128, data, i + im),
298 z1_i = fromArray(SPECIES_128, data, i + di + im),
299 z2_i = fromArray(SPECIES_128, data, i + di2 + im);
300 final DoubleVector
301 t1_r = z1_r.add(z2_r),
302 t1_i = z1_i.add(z2_i),
303 t2_r = t1_r.mul(-0.5).add(z0_r),
304 t2_i = t1_i.mul(-0.5).add(z0_i),
305 t3_r = z1_r.sub(z2_r).mul(tau),
306 t3_i = z1_i.sub(z2_i).mul(tau);
307 z0_r.add(t1_r).intoArray(ret, j);
308 z0_i.add(t1_i).intoArray(ret, j + im);
309 final DoubleVector
310 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
311 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
312 x1_r.mul(w1_r).sub(x1_i.mul(w1_i)).intoArray(ret, j + dj);
313 x2_r.mul(w2_r).sub(x2_i.mul(w2_i)).intoArray(ret, j + dj2);
314 x1_i.mul(w1_r).add(x1_r.mul(w1_i)).intoArray(ret, j + dj + im);
315 x2_i.mul(w2_r).add(x2_r.mul(w2_i)).intoArray(ret, j + dj2 + im);
316 }
317 }
318 }
319
320
321
322
323 private void blocked256(PassData passData) {
324 final double[] data = passData.in;
325 final double[] ret = passData.out;
326 int sign = passData.sign;
327 int i = passData.inOffset;
328 int j = passData.outOffset;
329 final double tau = sign * sqrt3_2;
330
331
332 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
333 final DoubleVector
334 z0_r = fromArray(SPECIES_256, data, i),
335 z1_r = fromArray(SPECIES_256, data, i + di),
336 z2_r = fromArray(SPECIES_256, data, i + di2),
337 z0_i = fromArray(SPECIES_256, data, i + im),
338 z1_i = fromArray(SPECIES_256, data, i + di + im),
339 z2_i = fromArray(SPECIES_256, data, i + di2 + im);
340 final DoubleVector
341 t1_r = z1_r.add(z2_r),
342 t1_i = z1_i.add(z2_i),
343 t2_r = t1_r.mul(-0.5).add(z0_r),
344 t2_i = t1_i.mul(-0.5).add(z0_i),
345 t3_r = z1_r.sub(z2_r).mul(tau),
346 t3_i = z1_i.sub(z2_i).mul(tau);
347 z0_r.add(t1_r).intoArray(ret, j);
348 z0_i.add(t1_i).intoArray(ret, j + im);
349 t2_r.sub(t3_i).intoArray(ret, j + dj);
350 t2_i.add(t3_r).intoArray(ret, j + dj + im);
351 t2_r.add(t3_i).intoArray(ret, j + dj2);
352 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
353 }
354
355 j += jstep;
356 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
357 final int index = k * 2;
358 final double
359 w1_r = wr[index],
360 w2_r = wr[index + 1],
361 w1_i = -sign * wi[index],
362 w2_i = -sign * wi[index + 1];
363 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
364 final DoubleVector
365 z0_r = fromArray(SPECIES_256, data, i),
366 z1_r = fromArray(SPECIES_256, data, i + di),
367 z2_r = fromArray(SPECIES_256, data, i + di2),
368 z0_i = fromArray(SPECIES_256, data, i + im),
369 z1_i = fromArray(SPECIES_256, data, i + di + im),
370 z2_i = fromArray(SPECIES_256, data, i + di2 + im);
371 final DoubleVector
372 t1_r = z1_r.add(z2_r),
373 t1_i = z1_i.add(z2_i),
374 t2_r = t1_r.mul(-0.5).add(z0_r),
375 t2_i = t1_i.mul(-0.5).add(z0_i),
376 t3_r = z1_r.sub(z2_r).mul(tau),
377 t3_i = z1_i.sub(z2_i).mul(tau);
378 z0_r.add(t1_r).intoArray(ret, j);
379 z0_i.add(t1_i).intoArray(ret, j + im);
380 final DoubleVector
381 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
382 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
383 x1_r.mul(w1_r).sub(x1_i.mul(w1_i)).intoArray(ret, j + dj);
384 x2_r.mul(w2_r).sub(x2_i.mul(w2_i)).intoArray(ret, j + dj2);
385 x1_i.mul(w1_r).add(x1_r.mul(w1_i)).intoArray(ret, j + dj + im);
386 x2_i.mul(w2_r).add(x2_r.mul(w2_i)).intoArray(ret, j + dj2 + im);
387 }
388 }
389 }
390
391
392
393
394 private void blocked512(PassData passData) {
395 final double[] data = passData.in;
396 final double[] ret = passData.out;
397 int sign = passData.sign;
398 int i = passData.inOffset;
399 int j = passData.outOffset;
400 final double tau = sign * sqrt3_2;
401
402
403 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
404 final DoubleVector
405 z0_r = fromArray(SPECIES_512, data, i),
406 z1_r = fromArray(SPECIES_512, data, i + di),
407 z2_r = fromArray(SPECIES_512, data, i + di2),
408 z0_i = fromArray(SPECIES_512, data, i + im),
409 z1_i = fromArray(SPECIES_512, data, i + di + im),
410 z2_i = fromArray(SPECIES_512, data, i + di2 + im);
411 final DoubleVector
412 t1_r = z1_r.add(z2_r),
413 t1_i = z1_i.add(z2_i),
414 t2_r = t1_r.mul(-0.5).add(z0_r),
415 t2_i = t1_i.mul(-0.5).add(z0_i),
416 t3_r = z1_r.sub(z2_r).mul(tau),
417 t3_i = z1_i.sub(z2_i).mul(tau);
418 z0_r.add(t1_r).intoArray(ret, j);
419 z0_i.add(t1_i).intoArray(ret, j + im);
420 t2_r.sub(t3_i).intoArray(ret, j + dj);
421 t2_i.add(t3_r).intoArray(ret, j + dj + im);
422 t2_r.add(t3_i).intoArray(ret, j + dj2);
423 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
424 }
425
426 j += jstep;
427 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
428 final int index = k * 2;
429 final double
430 w1_r = wr[index],
431 w2_r = wr[index + 1],
432 w1_i = -sign * wi[index],
433 w2_i = -sign * wi[index + 1];
434 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
435 final DoubleVector
436 z0_r = fromArray(SPECIES_512, data, i),
437 z1_r = fromArray(SPECIES_512, data, i + di),
438 z2_r = fromArray(SPECIES_512, data, i + di2),
439 z0_i = fromArray(SPECIES_512, data, i + im),
440 z1_i = fromArray(SPECIES_512, data, i + di + im),
441 z2_i = fromArray(SPECIES_512, data, i + di2 + im);
442 final DoubleVector
443 t1_r = z1_r.add(z2_r),
444 t1_i = z1_i.add(z2_i),
445 t2_r = t1_r.mul(-0.5).add(z0_r),
446 t2_i = t1_i.mul(-0.5).add(z0_i),
447 t3_r = z1_r.sub(z2_r).mul(tau),
448 t3_i = z1_i.sub(z2_i).mul(tau);
449 z0_r.add(t1_r).intoArray(ret, j);
450 z0_i.add(t1_i).intoArray(ret, j + im);
451 final DoubleVector
452 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
453 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
454 x1_r.mul(w1_r).sub(x1_i.mul(w1_i)).intoArray(ret, j + dj);
455 x2_r.mul(w2_r).sub(x2_i.mul(w2_i)).intoArray(ret, j + dj2);
456 x1_i.mul(w1_r).add(x1_r.mul(w1_i)).intoArray(ret, j + dj + im);
457 x2_i.mul(w2_r).add(x2_r.mul(w2_i)).intoArray(ret, j + dj2 + im);
458 }
459 }
460 }
461
462
463
464
465 private void interleaved128(PassData passData) {
466 final double[] data = passData.in;
467 final double[] ret = passData.out;
468 int sign = passData.sign;
469 int i = passData.inOffset;
470 int j = passData.outOffset;
471 final double tau = sign * sqrt3_2;
472
473 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
474 final DoubleVector
475 z0 = fromArray(SPECIES_128, data, i),
476 z1 = fromArray(SPECIES_128, data, i + di),
477 z2 = fromArray(SPECIES_128, data, i + di2);
478 final DoubleVector
479 t1 = z1.add(z2),
480 t2 = t1.mul(-0.5).add(z0),
481 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
482 z0.add(t1).intoArray(ret, j);
483 t2.add(t3.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
484 t2.add(t3.mul(NEGATE_IM_128)).intoArray(ret, j + dj2);
485 }
486
487 j += jstep;
488 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
489 final int index = k * 2;
490 final DoubleVector
491 w1r = broadcast(SPECIES_128, wr[index]),
492 w2r = broadcast(SPECIES_128, wr[index + 1]),
493 w1i = broadcast(SPECIES_128, -sign * wi[index]).mul(NEGATE_IM_128),
494 w2i = broadcast(SPECIES_128, -sign * wi[index + 1]).mul(NEGATE_IM_128);
495 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
496 final DoubleVector
497 z0 = fromArray(SPECIES_128, data, i),
498 z1 = fromArray(SPECIES_128, data, i + di),
499 z2 = fromArray(SPECIES_128, data, i + di2);
500 final DoubleVector
501 t1 = z1.add(z2),
502 t2 = t1.mul(-0.5).add(z0),
503 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
504 z0.add(t1).intoArray(ret, j);
505 z0.add(t1).intoArray(ret, j);
506 final DoubleVector
507 x1 = t3.fma(NEGATE_RE_128, t2),
508 x2 = t3.fma(NEGATE_IM_128, t2);
509 w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
510 w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
511 }
512 }
513 }
514
515
516
517
518 private void interleaved256(PassData passData) {
519 final double[] data = passData.in;
520 final double[] ret = passData.out;
521 int sign = passData.sign;
522 int i = passData.inOffset;
523 int j = passData.outOffset;
524 final double tau = sign * sqrt3_2;
525
526 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
527 final DoubleVector
528 z0 = fromArray(SPECIES_256, data, i),
529 z1 = fromArray(SPECIES_256, data, i + di),
530 z2 = fromArray(SPECIES_256, data, i + di2);
531 final DoubleVector
532 t1 = z1.add(z2),
533 t2 = t1.mul(-0.5).add(z0),
534 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
535 z0.add(t1).intoArray(ret, j);
536 t2.add(t3.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
537 t2.add(t3.mul(NEGATE_IM_256)).intoArray(ret, j + dj2);
538 }
539
540 j += jstep;
541 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
542 final int index = k * 2;
543 final DoubleVector
544 w1r = broadcast(SPECIES_256, wr[index]),
545 w2r = broadcast(SPECIES_256, wr[index + 1]),
546 w1i = broadcast(SPECIES_256, -sign * wi[index]).mul(NEGATE_IM_256),
547 w2i = broadcast(SPECIES_256, -sign * wi[index + 1]).mul(NEGATE_IM_256);
548 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
549 final DoubleVector
550 z0 = fromArray(SPECIES_256, data, i),
551 z1 = fromArray(SPECIES_256, data, i + di),
552 z2 = fromArray(SPECIES_256, data, i + di2);
553 final DoubleVector
554 t1 = z1.add(z2),
555 t2 = t1.mul(-0.5).add(z0),
556 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
557 z0.add(t1).intoArray(ret, j);
558 final DoubleVector
559 x1 = t3.fma(NEGATE_RE_256, t2),
560 x2 = t3.fma(NEGATE_IM_256, t2);
561 w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
562 w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
563 }
564 }
565 }
566
567
568
569
570 private void interleaved512(PassData passData) {
571 final double[] data = passData.in;
572 final double[] ret = passData.out;
573 int sign = passData.sign;
574 int i = passData.inOffset;
575 int j = passData.outOffset;
576 final double tau = sign * sqrt3_2;
577
578 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
579 final DoubleVector
580 z0 = fromArray(SPECIES_512, data, i),
581 z1 = fromArray(SPECIES_512, data, i + di),
582 z2 = fromArray(SPECIES_512, data, i + di2);
583 final DoubleVector
584 t1 = z1.add(z2),
585 t2 = t1.mul(-0.5).add(z0),
586 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
587 z0.add(t1).intoArray(ret, j);
588 t2.add(t3.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
589 t2.add(t3.mul(NEGATE_IM_512)).intoArray(ret, j + dj2);
590 }
591
592 j += jstep;
593 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
594 final int index = k * 2;
595 final DoubleVector
596 w1r = broadcast(SPECIES_512, wr[index]),
597 w2r = broadcast(SPECIES_512, wr[index + 1]),
598 w1i = broadcast(SPECIES_512, -sign * wi[index]).mul(NEGATE_IM_512),
599 w2i = broadcast(SPECIES_512, -sign * wi[index + 1]).mul(NEGATE_IM_512);
600 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
601 final DoubleVector
602 z0 = fromArray(SPECIES_512, data, i),
603 z1 = fromArray(SPECIES_512, data, i + di),
604 z2 = fromArray(SPECIES_512, data, i + di2);
605 final DoubleVector
606 t1 = z1.add(z2),
607 t2 = t1.mul(-0.5).add(z0),
608 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
609 z0.add(t1).intoArray(ret, j);
610 final DoubleVector
611 x1 = t3.fma(NEGATE_RE_512, t2),
612 x2 = t3.fma(NEGATE_IM_512, t2);
613 w1r.fma(x1, w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
614 w2r.fma(x2, w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
615 }
616 }
617 }
618 }