1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 package ffx.numerics.fft;
39
40 import jdk.incubator.vector.DoubleVector;
41
42 import static java.lang.Math.fma;
43 import static jdk.incubator.vector.DoubleVector.SPECIES_128;
44 import static jdk.incubator.vector.DoubleVector.SPECIES_256;
45 import static jdk.incubator.vector.DoubleVector.SPECIES_512;
46 import static jdk.incubator.vector.DoubleVector.broadcast;
47 import static jdk.incubator.vector.DoubleVector.fromArray;
48 import static org.apache.commons.math3.util.FastMath.sqrt;
49
50
51
52
53 public class MixedRadixFactor3 extends MixedRadixFactor {
54
55 private static final double sqrt3_2 = sqrt(3.0) / 2.0;
56 private final int di2;
57 private final int dj2;
58
59
60
61
62
63
64 public MixedRadixFactor3(PassConstants passConstants) {
65 super(passConstants);
66 di2 = 2 * di;
67 dj2 = 2 * dj;
68 }
69
70
71
72
73 @Override
74 protected void passScalar(PassData passData) {
75 final double[] data = passData.in;
76 final double[] ret = passData.out;
77 int sign = passData.sign;
78 int i = passData.inOffset;
79 int j = passData.outOffset;
80 final double tau = sign * sqrt3_2;
81
82 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
83 final double z0_r = data[i];
84 final double z1_r = data[i + di];
85 final double z2_r = data[i + di2];
86 final double z0_i = data[i + im];
87 final double z1_i = data[i + di + im];
88 final double z2_i = data[i + di2 + im];
89 final double t1_r = z1_r + z2_r;
90 final double t1_i = z1_i + z2_i;
91 final double t2_r = fma(-0.5, t1_r, z0_r);
92 final double t2_i = fma(-0.5, t1_i, z0_i);
93 final double t3_r = tau * (z1_r - z2_r);
94 final double t3_i = tau * (z1_i - z2_i);
95 ret[j] = z0_r + t1_r;
96 ret[j + im] = z0_i + t1_i;
97 ret[j + dj] = t2_r - t3_i;
98 ret[j + dj + im] = t2_i + t3_r;
99 ret[j + dj2] = t2_r + t3_i;
100 ret[j + dj2 + im] = t2_i - t3_r;
101 }
102 j += jstep;
103 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
104 final double[] twids = twiddles[k];
105 final double w1_r = twids[0];
106 final double w1_i = -sign * twids[1];
107 final double w2_r = twids[2];
108 final double w2_i = -sign * twids[3];
109 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
110 final double z0_r = data[i];
111 final double z1_r = data[i + di];
112 final double z2_r = data[i + di2];
113 final double z0_i = data[i + im];
114 final double z1_i = data[i + di + im];
115 final double z2_i = data[i + di2 + im];
116 final double t1_r = z1_r + z2_r;
117 final double t1_i = z1_i + z2_i;
118 final double t2_r = fma(-0.5, t1_r, z0_r);
119 final double t2_i = fma(-0.5, t1_i, z0_i);
120 final double t3_r = tau * (z1_r - z2_r);
121 final double t3_i = tau * (z1_i - z2_i);
122 ret[j] = z0_r + t1_r;
123 ret[j + im] = z0_i + t1_i;
124 multiplyAndStore(t2_r - t3_i, t2_i + t3_r, w1_r, w1_i, ret, j + dj, j + dj + im);
125 multiplyAndStore(t2_r + t3_i, t2_i - t3_r, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
126 }
127 }
128 }
129
130
131
132
133 @Override
134 protected void passSIMD(PassData passData) {
135 if (im == 1) {
136 interleaved(passData);
137 } else {
138 blocked(passData);
139 }
140 }
141
142
143
144
145 private static int[] simdSizes = {8, 4, 2};
146
147
148
149
150
151
152
153 private void interleaved(PassData passData, int simdLength) {
154
155 switch (simdLength) {
156 case 2:
157
158 interleaved128(passData);
159 break;
160 case 4:
161
162 interleaved256(passData);
163 break;
164 case 8:
165
166 interleaved512(passData);
167 break;
168 default:
169 passScalar(passData);
170 }
171 }
172
173
174
175
176
177
178
179 private void blocked(PassData passData, int simdLength) {
180
181 switch (simdLength) {
182 case 2:
183
184 blocked128(passData);
185 break;
186 case 4:
187
188 blocked256(passData);
189 break;
190 case 8:
191
192 blocked512(passData);
193 break;
194 default:
195 passScalar(passData);
196 }
197 }
198
199
200
201
202
203
204 private void interleaved(PassData passData) {
205 if (innerLoopLimit % LOOP == 0) {
206
207 interleaved(passData, LENGTH);
208 } else {
209
210 if (innerLoopLimit % 2 != 0 && innerLoopLimit != 1) {
211 passScalar(passData);
212 return;
213 }
214
215 for (int size : simdSizes) {
216 if (size >= LENGTH) {
217
218 continue;
219 }
220
221 if (innerLoopLimit % (size / 2) == 0) {
222 interleaved(passData, size);
223 }
224 }
225 }
226 }
227
228
229
230
231
232
233 private void blocked(PassData passData) {
234 if (innerLoopLimit % BLOCK_LOOP == 0) {
235
236 blocked(passData, LENGTH);
237 } else {
238
239 if (innerLoopLimit % 2 != 0) {
240 passScalar(passData);
241 return;
242 }
243
244 for (int size : simdSizes) {
245 if (size >= LENGTH) {
246
247 continue;
248 }
249
250 if (innerLoopLimit % size == 0) {
251 blocked(passData, size);
252 }
253 }
254 }
255 }
256
257
258
259
260 private void blocked128(PassData passData) {
261 final double[] data = passData.in;
262 final double[] ret = passData.out;
263 int sign = passData.sign;
264 int i = passData.inOffset;
265 int j = passData.outOffset;
266 final double tau = sign * sqrt3_2;
267
268
269 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
270 final DoubleVector
271 z0_r = fromArray(SPECIES_128, data, i),
272 z1_r = fromArray(SPECIES_128, data, i + di),
273 z2_r = fromArray(SPECIES_128, data, i + di2),
274 z0_i = fromArray(SPECIES_128, data, i + im),
275 z1_i = fromArray(SPECIES_128, data, i + di + im),
276 z2_i = fromArray(SPECIES_128, data, i + di2 + im);
277 final DoubleVector
278 t1_r = z1_r.add(z2_r),
279 t1_i = z1_i.add(z2_i),
280 t2_r = t1_r.mul(-0.5).add(z0_r),
281 t2_i = t1_i.mul(-0.5).add(z0_i),
282 t3_r = z1_r.sub(z2_r).mul(tau),
283 t3_i = z1_i.sub(z2_i).mul(tau);
284 z0_r.add(t1_r).intoArray(ret, j);
285 z0_i.add(t1_i).intoArray(ret, j + im);
286 t2_r.sub(t3_i).intoArray(ret, j + dj);
287 t2_i.add(t3_r).intoArray(ret, j + dj + im);
288 t2_r.add(t3_i).intoArray(ret, j + dj2);
289 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
290 }
291
292 j += jstep;
293 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
294 final double[] twids = twiddles[k];
295 DoubleVector
296 w1_r = broadcast(SPECIES_128, twids[0]),
297 w1_i = broadcast(SPECIES_128, -sign * twids[1]),
298 w2_r = broadcast(SPECIES_128, twids[2]),
299 w2_i = broadcast(SPECIES_128, -sign * twids[3]);
300 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
301 final DoubleVector
302 z0_r = fromArray(SPECIES_128, data, i),
303 z1_r = fromArray(SPECIES_128, data, i + di),
304 z2_r = fromArray(SPECIES_128, data, i + di2),
305 z0_i = fromArray(SPECIES_128, data, i + im),
306 z1_i = fromArray(SPECIES_128, data, i + di + im),
307 z2_i = fromArray(SPECIES_128, data, i + di2 + im);
308 final DoubleVector
309 t1_r = z1_r.add(z2_r),
310 t1_i = z1_i.add(z2_i),
311 t2_r = t1_r.mul(-0.5).add(z0_r),
312 t2_i = t1_i.mul(-0.5).add(z0_i),
313 t3_r = z1_r.sub(z2_r).mul(tau),
314 t3_i = z1_i.sub(z2_i).mul(tau);
315 z0_r.add(t1_r).intoArray(ret, j);
316 z0_i.add(t1_i).intoArray(ret, j + im);
317 DoubleVector
318 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
319 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
320 w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
321 w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
322 w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
323 w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
324 }
325 }
326 }
327
328
329
330
331 private void blocked256(PassData passData) {
332 final double[] data = passData.in;
333 final double[] ret = passData.out;
334 int sign = passData.sign;
335 int i = passData.inOffset;
336 int j = passData.outOffset;
337 final double tau = sign * sqrt3_2;
338
339
340 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
341 final DoubleVector
342 z0_r = fromArray(SPECIES_256, data, i),
343 z1_r = fromArray(SPECIES_256, data, i + di),
344 z2_r = fromArray(SPECIES_256, data, i + di2),
345 z0_i = fromArray(SPECIES_256, data, i + im),
346 z1_i = fromArray(SPECIES_256, data, i + di + im),
347 z2_i = fromArray(SPECIES_256, data, i + di2 + im);
348 final DoubleVector
349 t1_r = z1_r.add(z2_r),
350 t1_i = z1_i.add(z2_i),
351 t2_r = t1_r.mul(-0.5).add(z0_r),
352 t2_i = t1_i.mul(-0.5).add(z0_i),
353 t3_r = z1_r.sub(z2_r).mul(tau),
354 t3_i = z1_i.sub(z2_i).mul(tau);
355 z0_r.add(t1_r).intoArray(ret, j);
356 z0_i.add(t1_i).intoArray(ret, j + im);
357 t2_r.sub(t3_i).intoArray(ret, j + dj);
358 t2_i.add(t3_r).intoArray(ret, j + dj + im);
359 t2_r.add(t3_i).intoArray(ret, j + dj2);
360 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
361 }
362
363 j += jstep;
364 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
365 final double[] twids = twiddles[k];
366 DoubleVector
367 w1_r = broadcast(SPECIES_256, twids[0]),
368 w1_i = broadcast(SPECIES_256, -sign * twids[1]),
369 w2_r = broadcast(SPECIES_256, twids[2]),
370 w2_i = broadcast(SPECIES_256, -sign * twids[3]);
371 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
372 final DoubleVector
373 z0_r = fromArray(SPECIES_256, data, i),
374 z1_r = fromArray(SPECIES_256, data, i + di),
375 z2_r = fromArray(SPECIES_256, data, i + di2),
376 z0_i = fromArray(SPECIES_256, data, i + im),
377 z1_i = fromArray(SPECIES_256, data, i + di + im),
378 z2_i = fromArray(SPECIES_256, data, i + di2 + im);
379 final DoubleVector
380 t1_r = z1_r.add(z2_r),
381 t1_i = z1_i.add(z2_i),
382 t2_r = t1_r.mul(-0.5).add(z0_r),
383 t2_i = t1_i.mul(-0.5).add(z0_i),
384 t3_r = z1_r.sub(z2_r).mul(tau),
385 t3_i = z1_i.sub(z2_i).mul(tau);
386 z0_r.add(t1_r).intoArray(ret, j);
387 z0_i.add(t1_i).intoArray(ret, j + im);
388 DoubleVector
389 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
390 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
391 w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
392 w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
393 w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
394 w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
395 }
396 }
397 }
398
399
400
401
402 private void blocked512(PassData passData) {
403 final double[] data = passData.in;
404 final double[] ret = passData.out;
405 int sign = passData.sign;
406 int i = passData.inOffset;
407 int j = passData.outOffset;
408 final double tau = sign * sqrt3_2;
409
410
411 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
412 final DoubleVector
413 z0_r = fromArray(SPECIES_512, data, i),
414 z1_r = fromArray(SPECIES_512, data, i + di),
415 z2_r = fromArray(SPECIES_512, data, i + di2),
416 z0_i = fromArray(SPECIES_512, data, i + im),
417 z1_i = fromArray(SPECIES_512, data, i + di + im),
418 z2_i = fromArray(SPECIES_512, data, i + di2 + im);
419 final DoubleVector
420 t1_r = z1_r.add(z2_r),
421 t1_i = z1_i.add(z2_i),
422 t2_r = t1_r.mul(-0.5).add(z0_r),
423 t2_i = t1_i.mul(-0.5).add(z0_i),
424 t3_r = z1_r.sub(z2_r).mul(tau),
425 t3_i = z1_i.sub(z2_i).mul(tau);
426 z0_r.add(t1_r).intoArray(ret, j);
427 z0_i.add(t1_i).intoArray(ret, j + im);
428 t2_r.sub(t3_i).intoArray(ret, j + dj);
429 t2_i.add(t3_r).intoArray(ret, j + dj + im);
430 t2_r.add(t3_i).intoArray(ret, j + dj2);
431 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
432 }
433
434 j += jstep;
435 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
436 final double[] twids = twiddles[k];
437 DoubleVector
438 w1_r = broadcast(SPECIES_512, twids[0]),
439 w1_i = broadcast(SPECIES_512, -sign * twids[1]),
440 w2_r = broadcast(SPECIES_512, twids[2]),
441 w2_i = broadcast(SPECIES_512, -sign * twids[3]);
442 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
443 final DoubleVector
444 z0_r = fromArray(SPECIES_512, data, i),
445 z1_r = fromArray(SPECIES_512, data, i + di),
446 z2_r = fromArray(SPECIES_512, data, i + di2),
447 z0_i = fromArray(SPECIES_512, data, i + im),
448 z1_i = fromArray(SPECIES_512, data, i + di + im),
449 z2_i = fromArray(SPECIES_512, data, i + di2 + im);
450 final DoubleVector
451 t1_r = z1_r.add(z2_r),
452 t1_i = z1_i.add(z2_i),
453 t2_r = t1_r.mul(-0.5).add(z0_r),
454 t2_i = t1_i.mul(-0.5).add(z0_i),
455 t3_r = z1_r.sub(z2_r).mul(tau),
456 t3_i = z1_i.sub(z2_i).mul(tau);
457 z0_r.add(t1_r).intoArray(ret, j);
458 z0_i.add(t1_i).intoArray(ret, j + im);
459 DoubleVector
460 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
461 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
462 w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
463 w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
464 w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
465 w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
466 }
467 }
468 }
469
470
471
472
473 private void interleaved128(PassData passData) {
474 final double[] data = passData.in;
475 final double[] ret = passData.out;
476 int sign = passData.sign;
477 int i = passData.inOffset;
478 int j = passData.outOffset;
479 final double tau = sign * sqrt3_2;
480
481 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_128, i += LENGTH_128, j += LENGTH_128) {
482 DoubleVector
483 z0 = fromArray(SPECIES_128, data, i),
484 z1 = fromArray(SPECIES_128, data, i + di),
485 z2 = fromArray(SPECIES_128, data, i + di2);
486 DoubleVector
487 t1 = z1.add(z2),
488 t2 = t1.mul(-0.5).add(z0),
489 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
490 z0.add(t1).intoArray(ret, j);
491 t2.add(t3.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
492 t2.add(t3.mul(NEGATE_IM_128)).intoArray(ret, j + dj2);
493 }
494
495 j += jstep;
496 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
497 final double[] twids = twiddles[k];
498 DoubleVector
499 w1r = broadcast(SPECIES_128, twids[0]),
500 w1i = broadcast(SPECIES_128, -sign * twids[1]).mul(NEGATE_IM_128),
501 w2r = broadcast(SPECIES_128, twids[2]),
502 w2i = broadcast(SPECIES_128, -sign * twids[3]).mul(NEGATE_IM_128);
503 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_128, i += LENGTH_128, j += LENGTH_128) {
504 DoubleVector
505 z0 = fromArray(SPECIES_128, data, i),
506 z1 = fromArray(SPECIES_128, data, i + di),
507 z2 = fromArray(SPECIES_128, data, i + di2);
508 DoubleVector
509 t1 = z1.add(z2),
510 t2 = t1.mul(-0.5).add(z0),
511 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
512 z0.add(t1).intoArray(ret, j);
513 z0.add(t1).intoArray(ret, j);
514 DoubleVector
515 x1 = t2.add(t3.mul(NEGATE_RE_128)),
516 x2 = t2.add(t3.mul(NEGATE_IM_128));
517 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
518 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
519 }
520 }
521 }
522
523
524
525
526 private void interleaved256(PassData passData) {
527 final double[] data = passData.in;
528 final double[] ret = passData.out;
529 int sign = passData.sign;
530 int i = passData.inOffset;
531 int j = passData.outOffset;
532 final double tau = sign * sqrt3_2;
533
534 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_256, i += LENGTH_256, j += LENGTH_256) {
535 DoubleVector
536 z0 = fromArray(SPECIES_256, data, i),
537 z1 = fromArray(SPECIES_256, data, i + di),
538 z2 = fromArray(SPECIES_256, data, i + di2);
539 DoubleVector
540 t1 = z1.add(z2),
541 t2 = t1.mul(-0.5).add(z0),
542 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
543 z0.add(t1).intoArray(ret, j);
544 t2.add(t3.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
545 t2.add(t3.mul(NEGATE_IM_256)).intoArray(ret, j + dj2);
546 }
547
548 j += jstep;
549 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
550 final double[] twids = twiddles[k];
551 DoubleVector
552 w1r = broadcast(SPECIES_256, twids[0]),
553 w1i = broadcast(SPECIES_256, -sign * twids[1]).mul(NEGATE_IM_256),
554 w2r = broadcast(SPECIES_256, twids[2]),
555 w2i = broadcast(SPECIES_256, -sign * twids[3]).mul(NEGATE_IM_256);
556 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_256, i += LENGTH_256, j += LENGTH_256) {
557 DoubleVector
558 z0 = fromArray(SPECIES_256, data, i),
559 z1 = fromArray(SPECIES_256, data, i + di),
560 z2 = fromArray(SPECIES_256, data, i + di2);
561 DoubleVector
562 t1 = z1.add(z2),
563 t2 = t1.mul(-0.5).add(z0),
564 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
565 z0.add(t1).intoArray(ret, j);
566 DoubleVector
567 x1 = t2.add(t3.mul(NEGATE_RE_256)),
568 x2 = t2.add(t3.mul(NEGATE_IM_256));
569 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
570 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
571 }
572 }
573 }
574
575
576
577
578 private void interleaved512(PassData passData) {
579 final double[] data = passData.in;
580 final double[] ret = passData.out;
581 int sign = passData.sign;
582 int i = passData.inOffset;
583 int j = passData.outOffset;
584 final double tau = sign * sqrt3_2;
585
586 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_512, i += LENGTH_512, j += LENGTH_512) {
587 DoubleVector
588 z0 = fromArray(SPECIES_512, data, i),
589 z1 = fromArray(SPECIES_512, data, i + di),
590 z2 = fromArray(SPECIES_512, data, i + di2);
591 DoubleVector
592 t1 = z1.add(z2),
593 t2 = t1.mul(-0.5).add(z0),
594 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
595 z0.add(t1).intoArray(ret, j);
596 t2.add(t3.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
597 t2.add(t3.mul(NEGATE_IM_512)).intoArray(ret, j + dj2);
598 }
599
600 j += jstep;
601 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
602 final double[] twids = twiddles[k];
603 DoubleVector
604 w1r = broadcast(SPECIES_512, twids[0]),
605 w1i = broadcast(SPECIES_512, -sign * twids[1]).mul(NEGATE_IM_512),
606 w2r = broadcast(SPECIES_512, twids[2]),
607 w2i = broadcast(SPECIES_512, -sign * twids[3]).mul(NEGATE_IM_512);
608 for (int k1 = 0; k1 < innerLoopLimit; k1 += LOOP_512, i += LENGTH_512, j += LENGTH_512) {
609 DoubleVector
610 z0 = fromArray(SPECIES_512, data, i),
611 z1 = fromArray(SPECIES_512, data, i + di),
612 z2 = fromArray(SPECIES_512, data, i + di2);
613 DoubleVector
614 t1 = z1.add(z2),
615 t2 = t1.mul(-0.5).add(z0),
616 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
617 z0.add(t1).intoArray(ret, j);
618 DoubleVector
619 x1 = t2.add(t3.mul(NEGATE_RE_512)),
620 x2 = t2.add(t3.mul(NEGATE_IM_512));
621 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
622 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
623 }
624 }
625 }
626 }