#ifndef NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H
#define NE10_FFT_GENERIC_INT32_NEONINTRINSIC_H

#include <assert.h>
#include <math.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.neonintrinsic.h"
#include "NE10_fft_generic_int32.h"
/*
 * NEON vector aliases for the fixed-point (Q31) generic FFT kernels.
 * CPLX holds four complex int32 samples in planar (de-interleaved) form:
 * val[0] = the four real parts, val[1] = the four imaginary parts.
 * REAL is a single vector of four int32 lanes.
 */
typedef int32x4x2_t CPLX;
typedef int32x4_t REAL;
// Broadcast a scalar into all four lanes of a REAL vector.
#define NE10_REAL_DUP_NEON_S32 vdupq_n_s32
#ifndef NE10_INLINE_ASM_OPT
// Portable intrinsic versions: vld2q/vst2q de-interleave (interleave) four
// complex int32 values between memory (r,i pairs) and planar CPLX registers.
#define NE10_CPLX_LOAD(PTR) vld2q_s32 ((ne10_int32_t*) (PTR))
#define NE10_CPLX_STORE(PTR,OUT) \
    do { \
        vst2q_s32 ((ne10_int32_t*) (PTR), OUT); \
    } while (0)
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__

// Hand-written AArch64 LD2 load: de-interleaves four complex int32 values
// from *ptr into a planar int32x4x2_t (real lanes, then imaginary lanes).
// NOTE(review): the operand lists were reconstructed from the visible
// constraint fragments — confirm against the upstream file.
template<class T>
inline static int32x4x2_t NE10_CPLX_LOAD (T *ptr)
{
    int32x4x2_t result;
    asm volatile (
        "ld2 {v0.4s, v1.4s}, [%[pin]] \n\t"
        "mov %[r].16b, v0.16b \n\t"
        "mov %[i].16b, v1.16b \n\t"
    : [r]"+w"(result.val[0]),
      [i]"+w"(result.val[1])
    : [pin]"r"(ptr)
    : "memory", "v0", "v1");
    return result;
}

// Hand-written AArch64 ST2 store: interleaves a planar int32x4x2_t back
// into memory as four (r,i) complex int32 pairs at *ptr.
template<class T>
inline static void NE10_CPLX_STORE (T *ptr, int32x4x2_t out)
{
    asm volatile (
        "mov v0.16b, %[r].16b \n\t"
        "mov v1.16b, %[i].16b \n\t"
        "st2 {v0.4s, v1.4s}, [%[pout]] \n\t"
    : [r]"+w"(out.val[0]),
      [i]"+w"(out.val[1])
    : [pout]"r"(ptr)
    : "memory", "v0", "v1");
}
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
// Specialization of the generic single-complex load for the NEON CPLX type:
// one CPLX here carries four complex values, loaded de-interleaved.
template<>
inline CPLX NE10_CPX_LOAD_S<CPLX> (const CPLX *ptr)
{
    return NE10_CPLX_LOAD (ptr);
}
// Specialization of the generic single-complex store for the NEON CPLX type:
// writes four complex values back to memory in interleaved (r,i) order.
template<>
inline void NE10_CPX_STORE_S<CPLX> (CPLX *ptr,
        const CPLX out)
{
    NE10_CPLX_STORE (ptr, out);
}
// Recursion terminator for the strided gather: load the single remaining
// CPLX element. The stride parameter is unused at depth 1.
template<>
inline void NE10_LOAD_BY_STEP<1, CPLX> (CPLX out[1],
        const CPLX *Fin,
        const ne10_int32_t)
{
    out[0] = NE10_CPX_LOAD_S (Fin);
}
// Recursion terminator for the strided scatter: store the single remaining
// CPLX element. The stride parameter is unused at depth 1.
template<>
inline void NE10_STORE_BY_STEP<1, CPLX> (CPLX *Fout,
        const CPLX out[1],
        const ne10_int32_t)
{
    NE10_CPX_STORE_S (Fout, out[0]);
}
// Q31 scalar-by-vector multiply: broadcasts the scalar and uses the
// saturating, rounding, doubling high-half multiply (vqrdmulhq_s32), the
// standard fixed-point product for Q31 operands.
static inline REAL NE10_S_MUL_NEON_S32 (const REAL vec,
        const ne10_int32_t scalar)
{
    REAL scalar_neon = NE10_REAL_DUP_NEON_S32 (scalar);
    REAL result = vqrdmulhq_s32 (scalar_neon, vec);
    return result;
}
// Q31 complex multiply of four complex pairs at once:
//   result = A * B, i.e. (ArBr - AiBi) + j(ArBi + AiBr),
// each partial product formed with the rounding Q31 multiply.
static inline void NE10_CPX_MUL_NEON_S32 (CPLX &result,
        const CPLX A,
        const CPLX B)
{
    REAL ARBR = vqrdmulhq_s32 (A.val[0], B.val[0]); // Ar * Br
    REAL ARBI = vqrdmulhq_s32 (A.val[0], B.val[1]); // Ar * Bi
    REAL AIBR = vqrdmulhq_s32 (A.val[1], B.val[0]); // Ai * Br
    REAL AIBI = vqrdmulhq_s32 (A.val[1], B.val[1]); // Ai * Bi
    result.val[0] = ARBR - AIBI;
    result.val[1] = ARBI + AIBR;
}
// Recursively loads one twiddle factor per butterfly leg (legs RADIX-1 down
// to 1) and multiplies it into the corresponding scratch element in place.
// The twiddle for leg k lives at ptr_in + (k - 1) * step; vld1_s32 reads its
// (r, i) pair, which is then broadcast across all four lanes.
// NOTE(review): ptr_in element type assumed ne10_fft_cpx_int32_t (an r,i
// int32 pair) — consistent with the 2-lane vld1_s32 load; confirm upstream.
template<int RADIX>
inline void NE10_LOAD_TW_AND_MUL (CPLX scratch_in[RADIX],
        const ne10_fft_cpx_int32_t *ptr_in,
        const ne10_int32_t step)
{
    CPLX scratch_tw;
    int32x2_t d2_tmp = vld1_s32 ((ne10_int32_t *)(ptr_in + (RADIX - 2) * step));

    scratch_tw.val[0] = NE10_REAL_DUP_NEON_S32 (d2_tmp[0]); // twiddle real
    scratch_tw.val[1] = NE10_REAL_DUP_NEON_S32 (d2_tmp[1]); // twiddle imag

    NE10_CPX_MUL_NEON_S32 (scratch_in[RADIX - 1], scratch_in[RADIX - 1], scratch_tw);

    // Recurse to the next lower leg; the <1> specialization terminates.
    NE10_LOAD_TW_AND_MUL<RADIX - 1> (scratch_in, ptr_in, step);
}
166 inline void NE10_LOAD_TW_AND_MUL<1> (CPLX [1],
// Complex conjugate of four packed complex values: negate the imaginary
// lane vector in place.
template<>
inline void NE10_CONJ_S<CPLX> (CPLX &cplx)
{
    cplx.val[1] = -cplx.val[1];
}
// Recursion terminator for the array-conjugate helper: conjugate the single
// remaining element.
template<>
inline void NE10_CONJ<1, CPLX> (CPLX in[1])
{
    NE10_CONJ_S<CPLX> (in[0]);
}
// Compile-time-unrolled 1/RADIX scaling of a butterfly's outputs, active
// only when NE10_DSP_CFFT_SCALING is defined. Each element is multiplied by
// round(NE10_F2I32_MAX / RADIX) in Q31 via vqrdmulhq_s32, walking SIZE down
// to 1 through the <RADIX, 1> specialization below.
// NOTE(review): the struct wrapper and recursive tail call were dropped by
// the extraction; reconstructed here — confirm the functor name upstream.
template<
    int RADIX,
    int SIZE = RADIX>
struct NE10_FFT_SCALING
{
    inline void operator() (CPLX scratch_out[RADIX])
    {
#ifdef NE10_DSP_CFFT_SCALING
        // 1/RADIX in Q31, replicated across all four lanes.
        const int32x4_t one_by_RADIX =
        {
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f)
        };
        scratch_out[SIZE - 1].val[0] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[0], one_by_RADIX);
        scratch_out[SIZE - 1].val[1] = vqrdmulhq_s32 (scratch_out[SIZE - 1].val[1], one_by_RADIX);
        // Scale the next element down; terminates at the SIZE == 1 case.
        NE10_FFT_SCALING<RADIX, SIZE - 1> () (scratch_out);
#endif
    }
};
// Terminating specialization of the scaling functor: scales element 0 and
// stops the recursion. No-op unless NE10_DSP_CFFT_SCALING is defined.
// NOTE(review): struct wrapper reconstructed (dropped by extraction).
template<int RADIX>
struct NE10_FFT_SCALING<RADIX, 1>
{
    inline void operator () (CPLX scratch_out[1])
    {
#ifdef NE10_DSP_CFFT_SCALING
        const int32x4_t one_by_RADIX =
        {
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f),
            (ne10_int32_t) floor(1.0 / RADIX * NE10_F2I32_MAX + 0.5f)
        };
        scratch_out[0].val[0] = vqrdmulhq_s32 (scratch_out[0].val[0], one_by_RADIX);
        scratch_out[0].val[1] = vqrdmulhq_s32 (scratch_out[0].val[1], one_by_RADIX);
#endif
    }
};
// Lane-wise complex addition of four packed complex values: result = a + b.
inline void NE10_CPX_ADD_NEON_S32 (CPLX &result,
        const CPLX a,
        const CPLX b)
{
    result.val[0] = vaddq_s32 (a.val[0], b.val[0]);
    result.val[1] = vaddq_s32 (a.val[1], b.val[1]);
}
// Lane-wise complex subtraction of four packed complex values: result = a - b.
inline void NE10_CPX_SUB_NEON_S32 (CPLX &result,
        const CPLX a,
        const CPLX b)
{
    result.val[0] = vsubq_s32 (a.val[0], b.val[0]);
    result.val[1] = vsubq_s32 (a.val[1], b.val[1]);
}
// Halve each lane via an arithmetic shift right by 1 (vshlq_s32 with a
// vector of -1 shifts right). Rounds toward negative infinity.
inline REAL NE10_HALF (REAL src)
{
    const int32x4_t CONST_HALF_NEON = { -1, -1, -1, -1};
    src = vshlq_s32 (src, CONST_HALF_NEON);
    return src;
}
// Forward declaration of the radix-RADIX butterfly compute unit; the
// radix 2/3/4/5 specializations follow.
template<int RADIX>
inline void NE10_FFT_FCU_NEON_S32 (CPLX [RADIX],
        const CPLX [RADIX]);
// Radix-2 butterfly: out[0] = in[0] + in[1], out[1] = in[0] - in[1].
template<>
inline void NE10_FFT_FCU_NEON_S32<2> (CPLX scratch_out[2],
        const CPLX scratch_in[2])
{
    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch_in[0], scratch_in[1]);
    NE10_CPX_SUB_NEON_S32 (scratch_out[1], scratch_in[0], scratch_in[1]);
}
// Radix-3 butterfly, computed in place in Fout. Uses the radix-3 twiddle
// constant TW_3IN_S32 (declared in NE10_fft_generic_int32.h).
// NOTE(review): the Fin -> Fout prologue copies were dropped by the
// extraction and are reconstructed here.
template<>
inline void NE10_FFT_FCU_NEON_S32<3> (CPLX Fout[3],
        const CPLX Fin[3])
{
    CPLX scratch[4];

    Fout[0] = Fin[0];
    Fout[1] = Fin[1];
    Fout[2] = Fin[2];

    scratch[1] = Fout[1];
    scratch[2] = Fout[2];

    NE10_CPX_ADD_NEON_S32 (scratch[3], scratch[1], scratch[2]);
    NE10_CPX_SUB_NEON_S32 (scratch[0], scratch[1], scratch[2]);

    Fout[1].val[0] = Fout[0].val[0] - NE10_HALF (scratch[3].val[0]);
    Fout[1].val[1] = Fout[0].val[1] - NE10_HALF (scratch[3].val[1]);

    scratch[0].val[0] = NE10_S_MUL_NEON_S32 (scratch[0].val[0], TW_3IN_S32);
    scratch[0].val[1] = NE10_S_MUL_NEON_S32 (scratch[0].val[1], TW_3IN_S32);

    Fout[0].val[0] += scratch[3].val[0];
    Fout[0].val[1] += scratch[3].val[1];

    Fout[2].val[0] = Fout[1].val[0] + scratch[0].val[1];
    Fout[2].val[1] = Fout[1].val[1] - scratch[0].val[0];

    Fout[1].val[0] -= scratch[0].val[1];
    Fout[1].val[1] += scratch[0].val[0];
}
// Radix-4 butterfly: two layers of add/sub plus a multiply-free rotation
// by -j (swap real/imag with sign flips) on the odd outputs.
template<>
inline void NE10_FFT_FCU_NEON_S32<4> (CPLX scratch_out[4],
        const CPLX scratch_in[4])
{
    CPLX scratch[4];

    NE10_CPX_ADD_NEON_S32 (scratch[0], scratch_in[0], scratch_in[2]);
    NE10_CPX_SUB_NEON_S32 (scratch[1], scratch_in[0], scratch_in[2]);
    NE10_CPX_ADD_NEON_S32 (scratch[2], scratch_in[1], scratch_in[3]);
    NE10_CPX_SUB_NEON_S32 (scratch[3], scratch_in[1], scratch_in[3]);

    NE10_CPX_SUB_NEON_S32 (scratch_out[2], scratch[0], scratch[2]);
    NE10_CPX_ADD_NEON_S32 (scratch_out[0], scratch[0], scratch[2]);

    scratch_out[1].val[0] = scratch[1].val[0] + scratch[3].val[1];
    scratch_out[1].val[1] = scratch[1].val[1] - scratch[3].val[0];
    scratch_out[3].val[0] = scratch[1].val[0] - scratch[3].val[1];
    scratch_out[3].val[1] = scratch[1].val[1] + scratch[3].val[0];
}
// Radix-5 butterfly using the TW_5A/TW_5B twiddle constants (declared in
// NE10_fft_generic_int32.h). Follows the classic 5-point DFT factorization:
// symmetric sums/differences of legs (1,4) and (2,3), then two
// rotation/reflection pairs for outputs 1&4 and 2&3.
template<>
inline void NE10_FFT_FCU_NEON_S32<5> (CPLX Fout[5],
        const CPLX Fin[5])
{
    CPLX scratch[13], scratch_in[5];

    scratch_in[0] = Fin[0];
    scratch_in[1] = Fin[1];
    scratch_in[2] = Fin[2];
    scratch_in[3] = Fin[3];
    scratch_in[4] = Fin[4];

    scratch[0] = scratch_in[0];
    scratch[1] = scratch_in[1];
    scratch[2] = scratch_in[2];
    scratch[3] = scratch_in[3];
    scratch[4] = scratch_in[4];

    NE10_CPX_ADD_NEON_S32 (scratch[ 7], scratch[1], scratch[4]);
    NE10_CPX_SUB_NEON_S32 (scratch[10], scratch[1], scratch[4]);
    NE10_CPX_ADD_NEON_S32 (scratch[ 8], scratch[2], scratch[3]);
    NE10_CPX_SUB_NEON_S32 (scratch[ 9], scratch[2], scratch[3]);

    // Output 0: plain sum of all inputs.
    scratch_in[0].val[0] += scratch[7].val[0] + scratch[8].val[0];
    scratch_in[0].val[1] += scratch[7].val[1] + scratch[8].val[1];

    scratch[5].val[0] = scratch[0].val[0]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5A_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5B_S32.r);
    scratch[5].val[1] = scratch[0].val[1]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5A_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5B_S32.r);

    scratch[6].val[0] = NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5A_S32.i)
        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5B_S32.i);
    scratch[6].val[1] = -NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5A_S32.i)
        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5B_S32.i);

    NE10_CPX_SUB_NEON_S32 (scratch_in[1], scratch[5], scratch[6]);
    NE10_CPX_ADD_NEON_S32 (scratch_in[4], scratch[5], scratch[6]);

    scratch[11].val[0] = scratch[0].val[0]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[0], TW_5B_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[0], TW_5A_S32.r);
    scratch[11].val[1] = scratch[0].val[1]
        + NE10_S_MUL_NEON_S32 (scratch[7].val[1], TW_5B_S32.r)
        + NE10_S_MUL_NEON_S32 (scratch[8].val[1], TW_5A_S32.r);

    scratch[12].val[0] = -NE10_S_MUL_NEON_S32 (scratch[10].val[1], TW_5B_S32.i)
        + NE10_S_MUL_NEON_S32 (scratch[9].val[1], TW_5A_S32.i);
    scratch[12].val[1] = NE10_S_MUL_NEON_S32 (scratch[10].val[0], TW_5B_S32.i)
        - NE10_S_MUL_NEON_S32 (scratch[9].val[0], TW_5A_S32.i);

    NE10_CPX_ADD_NEON_S32 (scratch_in[2], scratch[11], scratch[12]);
    NE10_CPX_SUB_NEON_S32 (scratch_in[3], scratch[11], scratch[12]);

    Fout[0] = scratch_in[0];
    Fout[1] = scratch_in[1];
    Fout[2] = scratch_in[2];
    Fout[3] = scratch_in[3];
    Fout[4] = scratch_in[4];
}
// One FFT stage: applies the radix-RADIX butterfly to every group of the
// stage. Template flags select first-stage behavior (no twiddles), inverse
// transform (conjugate in and out), and per-stage 1/RADIX scaling.
//
//   Fout/Fin   - planar CPLX output/input arrays (4 transforms in parallel)
//   twiddles   - per-leg twiddle factors (unused when is_first_stage)
//   fstride    - number of butterfly groups in this stage
//   out_step   - output stride (mstride of this stage)
//   nfft       - total transform length; in_step = nfft / RADIX
//
// NOTE(review): the parameter lines, local in/out arrays, conditional
// wrappers and pointer-advance statements were dropped by the extraction
// and are reconstructed here from the visible calls — verify upstream.
template<ne10_int32_t RADIX, bool is_first_stage, bool is_inverse, bool is_scaled>
static __attribute__ ((noinline)) void ne10_radix_butterfly_int32_neon (
        CPLX *Fout,
        const CPLX *Fin,
        const ne10_fft_cpx_int32_t *twiddles,
        const ne10_int32_t fstride,
        const ne10_int32_t out_step,
        const ne10_int32_t nfft)
{
    const ne10_int32_t in_step = nfft / RADIX;
    ne10_int32_t f_count;
    ne10_int32_t m_count;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        for (m_count = out_step; m_count > 0; m_count--)
        {
            CPLX in[RADIX];
            CPLX out[RADIX];

            NE10_LOAD_BY_STEP<RADIX, CPLX> (in, Fin, in_step);

            if (is_inverse)
            {
                NE10_CONJ<RADIX> (in); // IFFT via conjugate-FFT-conjugate
            }

            if (is_scaled)
            {
                NE10_FFT_SCALING<RADIX> () (in); // optional 1/RADIX scaling
            }

            if (!is_first_stage)
            {
                NE10_LOAD_TW_AND_MUL<RADIX> (in, twiddles, out_step);
            }

            NE10_FFT_FCU_NEON_S32<RADIX> (out, in);

            if (is_inverse)
            {
                NE10_CONJ<RADIX> (out);
            }

            NE10_STORE_BY_STEP<RADIX, CPLX> (Fout, out, out_step);

            Fin++;

            if (!is_first_stage)
            {
                Fout++;
                twiddles++;
            }
            else
            {
                Fout += RADIX;
            }
        }
        if (!is_first_stage)
        {
            // Rewind twiddles for the next group; skip past this group's
            // RADIX-1 extra output rows.
            twiddles -= out_step;
            Fout += (RADIX - 1) * out_step;
        }
    }
}
// Drives a complete mixed-radix (2/3/4/5) forward or inverse FFT over the
// factor list produced by the planner:
//   factors[0] = stage count, factors[1] = fstride of the first stage,
//   factors[2k] = radix of stage k (counted from the last stage down).
// Ping-pongs between Fout and buffer so the final stage lands in Fout.
//
// NOTE(review): the first-stage switch skeleton, stage-count bookkeeping and
// early-return were dropped by the extraction and are reconstructed here
// from the visible template instantiations — verify against upstream.
template<
    bool is_inverse,
    bool is_scaled>
static void ne10_mixed_radix_generic_butterfly_int32_neon_impl (CPLX *Fout,
        const CPLX *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_int32_t *twiddles,
        CPLX *buffer)
{
    ne10_int32_t fstride, mstride, radix;
    ne10_int32_t stage_count;
    ne10_int32_t nfft;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride = factors[1];
    mstride = 1;
    radix = factors[ stage_count << 1 ]; // radix of the first stage
    nfft = fstride * radix;

    // Swap so that an even number of remaining ping-pongs ends in Fout.
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // First stage: reads Fin, no twiddles.
    switch (radix)
    {
    case 2:
        ne10_radix_butterfly_int32_neon<2, true, is_inverse, is_scaled> (Fout, Fin,
                NULL, // no twiddles in the first stage
                fstride, 1, nfft);
        break;
    case 4:
        ne10_radix_butterfly_int32_neon<4, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    case 3:
        ne10_radix_butterfly_int32_neon<3, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    case 5:
        ne10_radix_butterfly_int32_neon<5, true, is_inverse, is_scaled> (Fout, Fin,
                NULL,
                fstride, 1, nfft);
        break;
    }

    stage_count--;
    if (!stage_count) // single-stage transform: done
    {
        return;
    }

    mstride *= radix;
    radix = factors[ stage_count << 1 ];

    // Remaining stages: read from the previous stage's output.
    while (stage_count > 0)
    {
        // Only radices 2..5 are supported by this kernel set.
        assert ((radix > 1) && (radix < 6));

        ne10_swap_ptr (buffer, Fout);

        fstride /= radix;
        switch (radix)
        {
        case 2:
            ne10_radix_butterfly_int32_neon<2, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 3:
            ne10_radix_butterfly_int32_neon<3, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 4:
            ne10_radix_butterfly_int32_neon<4, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles,
                    fstride, mstride, nfft);
            break;
        case 5:
            ne10_radix_butterfly_int32_neon<5, false, is_inverse, is_scaled> (Fout, buffer,
                    twiddles, fstride, mstride, nfft);
            break;
        }

        // Advance past this stage's twiddle table and grow the stride.
        twiddles += mstride * (radix - 1);
        mstride *= radix;

        stage_count--;
        radix = factors[ stage_count << 1 ];
    }
}
// Last stage of the 1-D complex-to-complex transform: applies the final
// radix-4 butterfly with twiddles and writes results in natural order.
// Vector main loop handles out_step/NE10_FFT_PARA_LEVEL groups of 4 complex
// values; a scalar tail handles out_step % 4 leftovers.
//
// NOTE(review): this extraction dropped many lines (opening braces, local
// declarations such as Fin/scratch_in/scratch_tw/Fout_cpx/Fin_s/Fout_s,
// the #ifdef NE10_DSP_CFFT_SCALING guards, is_inverse conditionals, and
// pointer-advance statements). The surviving lines are kept byte-identical;
// reconstruct the gaps from the upstream Ne10 sources before compiling.
template<
bool is_inverse,
bool is_scaled>
554 static void ne10_c2c_1d_last_stage_neon (CPLX *Fout,
// (missing here: const CPLX *Fin and twiddle-pointer parameters)
557 const ne10_int32_t fstride,
558 const ne10_int32_t out_step,
561 ne10_int32_t f_count;
562 ne10_int32_t m_count;
// Outer loop: one iteration per butterfly group of the last stage.
564 for (f_count = fstride; f_count > 0; f_count--)
// Vector path: NE10_FFT_PARA_LEVEL (4) complex values per iteration.
569 for (m_count = out_step / NE10_FFT_PARA_LEVEL; m_count > 0; m_count--)
// De-interleaving loads of 4 consecutive CPLX groups.
571 scratch_in[0] = NE10_CPLX_LOAD (Fin + 0);
572 scratch_in[1] = NE10_CPLX_LOAD (Fin + 1);
573 scratch_in[2] = NE10_CPLX_LOAD (Fin + 2);
574 scratch_in[3] = NE10_CPLX_LOAD (Fin + 3);
// The 4x4 transpose macro below is written for float vectors, so the
// int32 data is reinterpreted to float32 (bit-pattern preserving), ...
583 float32x4x2_t scratch0, scratch_in0;
584 float32x4x2_t scratch1, scratch_in1;
585 float32x4x2_t scratch2, scratch_in2;
586 float32x4x2_t scratch3, scratch_in3;
588 scratch_in0.val[0] = vreinterpretq_f32_s32 (scratch_in[0].val[0]);
589 scratch_in1.val[0] = vreinterpretq_f32_s32 (scratch_in[1].val[0]);
590 scratch_in2.val[0] = vreinterpretq_f32_s32 (scratch_in[2].val[0]);
591 scratch_in3.val[0] = vreinterpretq_f32_s32 (scratch_in[3].val[0]);
592 scratch_in0.val[1] = vreinterpretq_f32_s32 (scratch_in[0].val[1]);
593 scratch_in1.val[1] = vreinterpretq_f32_s32 (scratch_in[1].val[1]);
594 scratch_in2.val[1] = vreinterpretq_f32_s32 (scratch_in[2].val[1]);
595 scratch_in3.val[1] = vreinterpretq_f32_s32 (scratch_in[3].val[1]);
// ... transposed (the macro token-pastes scratch0..3 / scratch_in0..3
// from these bare prefixes), ...
597 NE10_RADIX4X4C_TRANSPOSE_NEON (scratch, scratch_in);
// ... and reinterpreted back to int32 afterwards.
599 scratch_in[0].val[0] = vreinterpretq_s32_f32 (scratch0.val[0]);
600 scratch_in[1].val[0] = vreinterpretq_s32_f32 (scratch1.val[0]);
601 scratch_in[2].val[0] = vreinterpretq_s32_f32 (scratch2.val[0]);
602 scratch_in[3].val[0] = vreinterpretq_s32_f32 (scratch3.val[0]);
603 scratch_in[0].val[1] = vreinterpretq_s32_f32 (scratch0.val[1]);
604 scratch_in[1].val[1] = vreinterpretq_s32_f32 (scratch1.val[1]);
605 scratch_in[2].val[1] = vreinterpretq_s32_f32 (scratch2.val[1]);
606 scratch_in[3].val[1] = vreinterpretq_s32_f32 (scratch3.val[1]);
// Inverse transform: conjugate inputs (guard dropped by extraction).
611 NE10_CONJ<4, CPLX> (scratch_in);
// Load the three twiddles for legs 1..3 and multiply them in.
618 scratch_tw[0] = NE10_CPLX_LOAD (twiddles + 0 * out_step);
619 scratch_tw[1] = NE10_CPLX_LOAD (twiddles + 1 * out_step);
620 scratch_tw[2] = NE10_CPLX_LOAD (twiddles + 2 * out_step);
622 NE10_CPX_MUL_NEON_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
623 NE10_CPX_MUL_NEON_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
624 NE10_CPX_MUL_NEON_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);
// Final radix-4 butterfly on the four parallel transforms.
627 NE10_FFT_FCU_NEON_S32<4> (scratch_out, scratch_in);
// Inverse transform: conjugate outputs (guard dropped by extraction).
631 NE10_CONJ<4, CPLX> (scratch_out);
// Interleaved stores to the natural-order output positions.
639 NE10_CPLX_STORE (Fout_cpx + 0 * out_step, scratch_out[0]);
640 NE10_CPLX_STORE (Fout_cpx + 1 * out_step, scratch_out[1]);
641 NE10_CPLX_STORE (Fout_cpx + 2 * out_step, scratch_out[2]);
642 NE10_CPLX_STORE (Fout_cpx + 3 * out_step, scratch_out[3]);
// Scalar tail: out_step is not always a multiple of 4.
651 ne10_int32_t left_over = out_step % 4;
660 for (m_count = out_step % 4; m_count > 0; m_count--)
// Scalar (ne10_fft_cpx_int32_t) copies of the remaining 4 legs.
665 scratch_in[0] = Fin_s[0];
666 scratch_in[1] = Fin_s[1];
667 scratch_in[2] = Fin_s[2];
668 scratch_in[3] = Fin_s[3];
// Pre-scale by 1/4 (>> 2) — presumably under NE10_DSP_CFFT_SCALING,
// matching the vector path; the #ifdef was dropped by extraction.
672 scratch_in[0].r = scratch_in[0].r >> 2;
673 scratch_in[1].r = scratch_in[1].r >> 2;
674 scratch_in[2].r = scratch_in[2].r >> 2;
675 scratch_in[3].r = scratch_in[3].r >> 2;
677 scratch_in[0].i = scratch_in[0].i >> 2;
678 scratch_in[1].i = scratch_in[1].i >> 2;
679 scratch_in[2].i = scratch_in[2].i >> 2;
680 scratch_in[3].i = scratch_in[3].i >> 2;
// Inverse transform: conjugate inputs (guard dropped by extraction).
686 scratch_in[0].i = -scratch_in[0].i;
687 scratch_in[1].i = -scratch_in[1].i;
688 scratch_in[2].i = -scratch_in[2].i;
689 scratch_in[3].i = -scratch_in[3].i;
// Scalar twiddle multiply for legs 1..3.
692 scratch_tw[0] = twiddles[0 * out_step];
693 scratch_tw[1] = twiddles[1 * out_step];
694 scratch_tw[2] = twiddles[2 * out_step];
696 NE10_CPX_MUL_S32 (scratch_in[1], scratch_in[1], scratch_tw[0]);
697 NE10_CPX_MUL_S32 (scratch_in[2], scratch_in[2], scratch_tw[1]);
698 NE10_CPX_MUL_S32 (scratch_in[3], scratch_in[3], scratch_tw[2]);
// Scalar radix-4 butterfly, in place.
700 FFT_FCU<4> (scratch_in, scratch_in);
// Inverse transform: conjugate outputs (guard dropped by extraction).
704 scratch_in[0].i = -scratch_in[0].i;
705 scratch_in[1].i = -scratch_in[1].i;
706 scratch_in[2].i = -scratch_in[2].i;
707 scratch_in[3].i = -scratch_in[3].i;
// Scatter results to the four output rows.
710 Fout_s[0 * out_step] = scratch_in[0];
711 Fout_s[1 * out_step] = scratch_in[1];
712 Fout_s[2 * out_step] = scratch_in[2];
713 Fout_s[3 * out_step] = scratch_in[3];
// Generic mixed-radix butterfly structure for the 32-bit fixed-point FFT function.