1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 package ffx.numerics.fft;
39
40 import jdk.incubator.vector.DoubleVector;
41
42 import static java.lang.Math.fma;
43 import static jdk.incubator.vector.DoubleVector.SPECIES_128;
44 import static jdk.incubator.vector.DoubleVector.SPECIES_256;
45 import static jdk.incubator.vector.DoubleVector.SPECIES_512;
46 import static jdk.incubator.vector.DoubleVector.broadcast;
47 import static jdk.incubator.vector.DoubleVector.fromArray;
48 import static org.apache.commons.math3.util.FastMath.sqrt;
49
50
51
52
53 public class MixedRadixFactor3 extends MixedRadixFactor {
54
55 private static final double sqrt3_2 = sqrt(3.0) / 2.0;
56 private final int di2;
57 private final int dj2;
58
59
60
61
62
63
64 public MixedRadixFactor3(PassConstants passConstants) {
65 super(passConstants);
66 di2 = 2 * di;
67 dj2 = 2 * dj;
68 }
69
70
71
72
73 @Override
74 protected void passScalar(PassData passData) {
75 final double[] data = passData.in;
76 final double[] ret = passData.out;
77 int sign = passData.sign;
78 int i = passData.inOffset;
79 int j = passData.outOffset;
80 final double tau = sign * sqrt3_2;
81
82
83 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
84 final double z0_r = data[i];
85 final double z1_r = data[i + di];
86 final double z2_r = data[i + di2];
87 final double z0_i = data[i + im];
88 final double z1_i = data[i + di + im];
89 final double z2_i = data[i + di2 + im];
90 final double t1_r = z1_r + z2_r;
91 final double t1_i = z1_i + z2_i;
92 final double t2_r = fma(-0.5, t1_r, z0_r);
93 final double t2_i = fma(-0.5, t1_i, z0_i);
94 final double t3_r = tau * (z1_r - z2_r);
95 final double t3_i = tau * (z1_i - z2_i);
96 ret[j] = z0_r + t1_r;
97 ret[j + im] = z0_i + t1_i;
98 ret[j + dj] = t2_r - t3_i;
99 ret[j + dj + im] = t2_i + t3_r;
100 ret[j + dj2] = t2_r + t3_i;
101 ret[j + dj2 + im] = t2_i - t3_r;
102 }
103 j += jstep;
104 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
105 final int index = k * 2;
106 final double w1_r = wr[index];
107 final double w2_r = wr[index + 1];
108 final double w1_i = -sign * wi[index];
109 final double w2_i = -sign * wi[index + 1];
110 for (int k1 = 0; k1 < innerLoopLimit; k1++, i += ii, j += ii) {
111 final double z0_r = data[i];
112 final double z1_r = data[i + di];
113 final double z2_r = data[i + di2];
114 final double z0_i = data[i + im];
115 final double z1_i = data[i + di + im];
116 final double z2_i = data[i + di2 + im];
117 final double t1_r = z1_r + z2_r;
118 final double t1_i = z1_i + z2_i;
119 final double t2_r = fma(-0.5, t1_r, z0_r);
120 final double t2_i = fma(-0.5, t1_i, z0_i);
121 final double t3_r = tau * (z1_r - z2_r);
122 final double t3_i = tau * (z1_i - z2_i);
123 ret[j] = z0_r + t1_r;
124 ret[j + im] = z0_i + t1_i;
125 multiplyAndStore(t2_r - t3_i, t2_i + t3_r, w1_r, w1_i, ret, j + dj, j + dj + im);
126 multiplyAndStore(t2_r + t3_i, t2_i - t3_r, w2_r, w2_i, ret, j + dj2, j + dj2 + im);
127 }
128 }
129 }
130
131
132
133
134 @Override
135 protected void passSIMD(PassData passData) {
136 if (im == 1) {
137 interleaved(passData);
138 } else {
139 blocked(passData);
140 }
141 }
142
143
144
145
146 private static int[] simdSizes = {8, 4, 2};
147
148
149
150
151
152
153
154 private void interleaved(PassData passData, int simdLength) {
155
156 switch (simdLength) {
157 case 2:
158
159 interleaved128(passData);
160 break;
161 case 4:
162
163 interleaved256(passData);
164 break;
165 case 8:
166
167 interleaved512(passData);
168 break;
169 default:
170 passScalar(passData);
171 }
172 }
173
174
175
176
177
178
179
180 private void blocked(PassData passData, int simdLength) {
181
182 switch (simdLength) {
183 case 2:
184
185 blocked128(passData);
186 break;
187 case 4:
188
189 blocked256(passData);
190 break;
191 case 8:
192
193 blocked512(passData);
194 break;
195 default:
196 passScalar(passData);
197 }
198 }
199
200
201
202
203
204
205 private void interleaved(PassData passData) {
206 if (innerLoopLimit % INTERLEAVED_LOOP == 0) {
207
208 interleaved(passData, LENGTH);
209 } else {
210
211 if (innerLoopLimit % 2 != 0 && innerLoopLimit != 1) {
212 passScalar(passData);
213 return;
214 }
215
216 for (int size : simdSizes) {
217 if (size >= LENGTH) {
218
219 continue;
220 }
221
222 if (innerLoopLimit % (size / 2) == 0) {
223 interleaved(passData, size);
224 }
225 }
226 }
227 }
228
229
230
231
232
233
234 private void blocked(PassData passData) {
235 if (innerLoopLimit % BLOCK_LOOP == 0) {
236
237 blocked(passData, LENGTH);
238 } else {
239
240 if (innerLoopLimit % 2 != 0) {
241 passScalar(passData);
242 return;
243 }
244
245 for (int size : simdSizes) {
246 if (size >= LENGTH) {
247
248 continue;
249 }
250
251 if (innerLoopLimit % size == 0) {
252 blocked(passData, size);
253 }
254 }
255 }
256 }
257
258
259
260
261 private void blocked128(PassData passData) {
262 final double[] data = passData.in;
263 final double[] ret = passData.out;
264 int sign = passData.sign;
265 int i = passData.inOffset;
266 int j = passData.outOffset;
267 final double tau = sign * sqrt3_2;
268
269
270 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
271 final DoubleVector
272 z0_r = fromArray(SPECIES_128, data, i),
273 z1_r = fromArray(SPECIES_128, data, i + di),
274 z2_r = fromArray(SPECIES_128, data, i + di2),
275 z0_i = fromArray(SPECIES_128, data, i + im),
276 z1_i = fromArray(SPECIES_128, data, i + di + im),
277 z2_i = fromArray(SPECIES_128, data, i + di2 + im);
278 final DoubleVector
279 t1_r = z1_r.add(z2_r),
280 t1_i = z1_i.add(z2_i),
281 t2_r = t1_r.mul(-0.5).add(z0_r),
282 t2_i = t1_i.mul(-0.5).add(z0_i),
283 t3_r = z1_r.sub(z2_r).mul(tau),
284 t3_i = z1_i.sub(z2_i).mul(tau);
285 z0_r.add(t1_r).intoArray(ret, j);
286 z0_i.add(t1_i).intoArray(ret, j + im);
287 t2_r.sub(t3_i).intoArray(ret, j + dj);
288 t2_i.add(t3_r).intoArray(ret, j + dj + im);
289 t2_r.add(t3_i).intoArray(ret, j + dj2);
290 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
291 }
292
293 j += jstep;
294 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
295 final int index = k * 2;
296 final DoubleVector
297 w1_r = broadcast(SPECIES_128, wr[index]),
298 w2_r = broadcast(SPECIES_128, wr[index + 1]),
299 w1_i = broadcast(SPECIES_128, -sign * wi[index]),
300 w2_i = broadcast(SPECIES_128, -sign * wi[index + 1]);
301 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
302 final DoubleVector
303 z0_r = fromArray(SPECIES_128, data, i),
304 z1_r = fromArray(SPECIES_128, data, i + di),
305 z2_r = fromArray(SPECIES_128, data, i + di2),
306 z0_i = fromArray(SPECIES_128, data, i + im),
307 z1_i = fromArray(SPECIES_128, data, i + di + im),
308 z2_i = fromArray(SPECIES_128, data, i + di2 + im);
309 final DoubleVector
310 t1_r = z1_r.add(z2_r),
311 t1_i = z1_i.add(z2_i),
312 t2_r = t1_r.mul(-0.5).add(z0_r),
313 t2_i = t1_i.mul(-0.5).add(z0_i),
314 t3_r = z1_r.sub(z2_r).mul(tau),
315 t3_i = z1_i.sub(z2_i).mul(tau);
316 z0_r.add(t1_r).intoArray(ret, j);
317 z0_i.add(t1_i).intoArray(ret, j + im);
318 DoubleVector
319 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
320 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
321 w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
322 w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
323 w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
324 w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
325 }
326 }
327 }
328
329
330
331
332 private void blocked256(PassData passData) {
333 final double[] data = passData.in;
334 final double[] ret = passData.out;
335 int sign = passData.sign;
336 int i = passData.inOffset;
337 int j = passData.outOffset;
338 final double tau = sign * sqrt3_2;
339
340
341 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
342 final DoubleVector
343 z0_r = fromArray(SPECIES_256, data, i),
344 z1_r = fromArray(SPECIES_256, data, i + di),
345 z2_r = fromArray(SPECIES_256, data, i + di2),
346 z0_i = fromArray(SPECIES_256, data, i + im),
347 z1_i = fromArray(SPECIES_256, data, i + di + im),
348 z2_i = fromArray(SPECIES_256, data, i + di2 + im);
349 final DoubleVector
350 t1_r = z1_r.add(z2_r),
351 t1_i = z1_i.add(z2_i),
352 t2_r = t1_r.mul(-0.5).add(z0_r),
353 t2_i = t1_i.mul(-0.5).add(z0_i),
354 t3_r = z1_r.sub(z2_r).mul(tau),
355 t3_i = z1_i.sub(z2_i).mul(tau);
356 z0_r.add(t1_r).intoArray(ret, j);
357 z0_i.add(t1_i).intoArray(ret, j + im);
358 t2_r.sub(t3_i).intoArray(ret, j + dj);
359 t2_i.add(t3_r).intoArray(ret, j + dj + im);
360 t2_r.add(t3_i).intoArray(ret, j + dj2);
361 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
362 }
363
364 j += jstep;
365 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
366 final int index = k * 2;
367 final DoubleVector
368 w1_r = broadcast(SPECIES_256, wr[index]),
369 w2_r = broadcast(SPECIES_256, wr[index + 1]),
370 w1_i = broadcast(SPECIES_256, -sign * wi[index]),
371 w2_i = broadcast(SPECIES_256, -sign * wi[index + 1]);
372 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
373 final DoubleVector
374 z0_r = fromArray(SPECIES_256, data, i),
375 z1_r = fromArray(SPECIES_256, data, i + di),
376 z2_r = fromArray(SPECIES_256, data, i + di2),
377 z0_i = fromArray(SPECIES_256, data, i + im),
378 z1_i = fromArray(SPECIES_256, data, i + di + im),
379 z2_i = fromArray(SPECIES_256, data, i + di2 + im);
380 final DoubleVector
381 t1_r = z1_r.add(z2_r),
382 t1_i = z1_i.add(z2_i),
383 t2_r = t1_r.mul(-0.5).add(z0_r),
384 t2_i = t1_i.mul(-0.5).add(z0_i),
385 t3_r = z1_r.sub(z2_r).mul(tau),
386 t3_i = z1_i.sub(z2_i).mul(tau);
387 z0_r.add(t1_r).intoArray(ret, j);
388 z0_i.add(t1_i).intoArray(ret, j + im);
389 DoubleVector
390 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
391 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
392 w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
393 w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
394 w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
395 w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
396 }
397 }
398 }
399
400
401
402
403 private void blocked512(PassData passData) {
404 final double[] data = passData.in;
405 final double[] ret = passData.out;
406 int sign = passData.sign;
407 int i = passData.inOffset;
408 int j = passData.outOffset;
409 final double tau = sign * sqrt3_2;
410
411
412 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
413 final DoubleVector
414 z0_r = fromArray(SPECIES_512, data, i),
415 z1_r = fromArray(SPECIES_512, data, i + di),
416 z2_r = fromArray(SPECIES_512, data, i + di2),
417 z0_i = fromArray(SPECIES_512, data, i + im),
418 z1_i = fromArray(SPECIES_512, data, i + di + im),
419 z2_i = fromArray(SPECIES_512, data, i + di2 + im);
420 final DoubleVector
421 t1_r = z1_r.add(z2_r),
422 t1_i = z1_i.add(z2_i),
423 t2_r = t1_r.mul(-0.5).add(z0_r),
424 t2_i = t1_i.mul(-0.5).add(z0_i),
425 t3_r = z1_r.sub(z2_r).mul(tau),
426 t3_i = z1_i.sub(z2_i).mul(tau);
427 z0_r.add(t1_r).intoArray(ret, j);
428 z0_i.add(t1_i).intoArray(ret, j + im);
429 t2_r.sub(t3_i).intoArray(ret, j + dj);
430 t2_i.add(t3_r).intoArray(ret, j + dj + im);
431 t2_r.add(t3_i).intoArray(ret, j + dj2);
432 t2_i.sub(t3_r).intoArray(ret, j + dj2 + im);
433 }
434
435 j += jstep;
436 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
437 final int index = k * 2;
438 final DoubleVector
439 w1_r = broadcast(SPECIES_512, wr[index]),
440 w2_r = broadcast(SPECIES_512, wr[index + 1]),
441 w1_i = broadcast(SPECIES_512, -sign * wi[index]),
442 w2_i = broadcast(SPECIES_512, -sign * wi[index + 1]);
443 for (int k1 = 0; k1 < innerLoopLimit; k1 += BLOCK_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
444 final DoubleVector
445 z0_r = fromArray(SPECIES_512, data, i),
446 z1_r = fromArray(SPECIES_512, data, i + di),
447 z2_r = fromArray(SPECIES_512, data, i + di2),
448 z0_i = fromArray(SPECIES_512, data, i + im),
449 z1_i = fromArray(SPECIES_512, data, i + di + im),
450 z2_i = fromArray(SPECIES_512, data, i + di2 + im);
451 final DoubleVector
452 t1_r = z1_r.add(z2_r),
453 t1_i = z1_i.add(z2_i),
454 t2_r = t1_r.mul(-0.5).add(z0_r),
455 t2_i = t1_i.mul(-0.5).add(z0_i),
456 t3_r = z1_r.sub(z2_r).mul(tau),
457 t3_i = z1_i.sub(z2_i).mul(tau);
458 z0_r.add(t1_r).intoArray(ret, j);
459 z0_i.add(t1_i).intoArray(ret, j + im);
460 DoubleVector
461 x1_r = t2_r.sub(t3_i), x1_i = t2_i.add(t3_r),
462 x2_r = t2_r.add(t3_i), x2_i = t2_i.sub(t3_r);
463 w1_r.mul(x1_r).add(w1_i.neg().mul(x1_i)).intoArray(ret, j + dj);
464 w2_r.mul(x2_r).add(w2_i.neg().mul(x2_i)).intoArray(ret, j + dj2);
465 w1_r.mul(x1_i).add(w1_i.mul(x1_r)).intoArray(ret, j + dj + im);
466 w2_r.mul(x2_i).add(w2_i.mul(x2_r)).intoArray(ret, j + dj2 + im);
467 }
468 }
469 }
470
471
472
473
474 private void interleaved128(PassData passData) {
475 final double[] data = passData.in;
476 final double[] ret = passData.out;
477 int sign = passData.sign;
478 int i = passData.inOffset;
479 int j = passData.outOffset;
480 final double tau = sign * sqrt3_2;
481
482 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
483 DoubleVector
484 z0 = fromArray(SPECIES_128, data, i),
485 z1 = fromArray(SPECIES_128, data, i + di),
486 z2 = fromArray(SPECIES_128, data, i + di2);
487 DoubleVector
488 t1 = z1.add(z2),
489 t2 = t1.mul(-0.5).add(z0),
490 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
491 z0.add(t1).intoArray(ret, j);
492 t2.add(t3.mul(NEGATE_RE_128)).intoArray(ret, j + dj);
493 t2.add(t3.mul(NEGATE_IM_128)).intoArray(ret, j + dj2);
494 }
495
496 j += jstep;
497 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
498 final int index = k * 2;
499 final DoubleVector
500 w1r = broadcast(SPECIES_128, wr[index]),
501 w2r = broadcast(SPECIES_128, wr[index + 1]),
502 w1i = broadcast(SPECIES_128, -sign * wi[index]).mul(NEGATE_IM_128),
503 w2i = broadcast(SPECIES_128, -sign * wi[index + 1]).mul(NEGATE_IM_128);
504 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_128, i += LENGTH_128, j += LENGTH_128) {
505 DoubleVector
506 z0 = fromArray(SPECIES_128, data, i),
507 z1 = fromArray(SPECIES_128, data, i + di),
508 z2 = fromArray(SPECIES_128, data, i + di2);
509 DoubleVector
510 t1 = z1.add(z2),
511 t2 = t1.mul(-0.5).add(z0),
512 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_128);
513 z0.add(t1).intoArray(ret, j);
514 z0.add(t1).intoArray(ret, j);
515 DoubleVector
516 x1 = t2.add(t3.mul(NEGATE_RE_128)),
517 x2 = t2.add(t3.mul(NEGATE_IM_128));
518 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj);
519 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_128)).intoArray(ret, j + dj2);
520 }
521 }
522 }
523
524
525
526
527 private void interleaved256(PassData passData) {
528 final double[] data = passData.in;
529 final double[] ret = passData.out;
530 int sign = passData.sign;
531 int i = passData.inOffset;
532 int j = passData.outOffset;
533 final double tau = sign * sqrt3_2;
534
535 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
536 DoubleVector
537 z0 = fromArray(SPECIES_256, data, i),
538 z1 = fromArray(SPECIES_256, data, i + di),
539 z2 = fromArray(SPECIES_256, data, i + di2);
540 DoubleVector
541 t1 = z1.add(z2),
542 t2 = t1.mul(-0.5).add(z0),
543 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
544 z0.add(t1).intoArray(ret, j);
545 t2.add(t3.mul(NEGATE_RE_256)).intoArray(ret, j + dj);
546 t2.add(t3.mul(NEGATE_IM_256)).intoArray(ret, j + dj2);
547 }
548
549 j += jstep;
550 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
551 final int index = k * 2;
552 final DoubleVector
553 w1r = broadcast(SPECIES_256, wr[index]),
554 w2r = broadcast(SPECIES_256, wr[index + 1]),
555 w1i = broadcast(SPECIES_256, -sign * wi[index]).mul(NEGATE_IM_256),
556 w2i = broadcast(SPECIES_256, -sign * wi[index + 1]).mul(NEGATE_IM_256);
557 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_256, i += LENGTH_256, j += LENGTH_256) {
558 DoubleVector
559 z0 = fromArray(SPECIES_256, data, i),
560 z1 = fromArray(SPECIES_256, data, i + di),
561 z2 = fromArray(SPECIES_256, data, i + di2);
562 DoubleVector
563 t1 = z1.add(z2),
564 t2 = t1.mul(-0.5).add(z0),
565 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_256);
566 z0.add(t1).intoArray(ret, j);
567 DoubleVector
568 x1 = t2.add(t3.mul(NEGATE_RE_256)),
569 x2 = t2.add(t3.mul(NEGATE_IM_256));
570 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj);
571 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_256)).intoArray(ret, j + dj2);
572 }
573 }
574 }
575
576
577
578
579 private void interleaved512(PassData passData) {
580 final double[] data = passData.in;
581 final double[] ret = passData.out;
582 int sign = passData.sign;
583 int i = passData.inOffset;
584 int j = passData.outOffset;
585 final double tau = sign * sqrt3_2;
586
587 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
588 DoubleVector
589 z0 = fromArray(SPECIES_512, data, i),
590 z1 = fromArray(SPECIES_512, data, i + di),
591 z2 = fromArray(SPECIES_512, data, i + di2);
592 DoubleVector
593 t1 = z1.add(z2),
594 t2 = t1.mul(-0.5).add(z0),
595 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
596 z0.add(t1).intoArray(ret, j);
597 t2.add(t3.mul(NEGATE_RE_512)).intoArray(ret, j + dj);
598 t2.add(t3.mul(NEGATE_IM_512)).intoArray(ret, j + dj2);
599 }
600
601 j += jstep;
602 for (int k = 1; k < outerLoopLimit; k++, j += jstep) {
603 final int index = k * 2;
604 final DoubleVector
605 w1r = broadcast(SPECIES_512, wr[index]),
606 w2r = broadcast(SPECIES_512, wr[index + 1]),
607 w1i = broadcast(SPECIES_512, -sign * wi[index]).mul(NEGATE_IM_512),
608 w2i = broadcast(SPECIES_512, -sign * wi[index + 1]).mul(NEGATE_IM_512);
609 for (int k1 = 0; k1 < innerLoopLimit; k1 += INTERLEAVED_LOOP_512, i += LENGTH_512, j += LENGTH_512) {
610 DoubleVector
611 z0 = fromArray(SPECIES_512, data, i),
612 z1 = fromArray(SPECIES_512, data, i + di),
613 z2 = fromArray(SPECIES_512, data, i + di2);
614 DoubleVector
615 t1 = z1.add(z2),
616 t2 = t1.mul(-0.5).add(z0),
617 t3 = z1.sub(z2).mul(tau).rearrange(SHUFFLE_RE_IM_512);
618 z0.add(t1).intoArray(ret, j);
619 DoubleVector
620 x1 = t2.add(t3.mul(NEGATE_RE_512)),
621 x2 = t2.add(t3.mul(NEGATE_IM_512));
622 w1r.mul(x1).add(w1i.mul(x1).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj);
623 w2r.mul(x2).add(w2i.mul(x2).rearrange(SHUFFLE_RE_IM_512)).intoArray(ret, j + dj2);
624 }
625 }
626 }
627 }