Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft_int16.neon.c
/*
 * Copyright 2013-15 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of ARM Limited nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NE10 Library : dsp/NE10_fft_int16.neon.c
 */

#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"

static inline void ne10_fft4_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int16_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;
    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
}

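/*
 * Illustrative usage sketch (editorial, not part of the library): the kernel
 * above is a direct 4-point forward DFT. For a constant input the energy
 * collapses into bin 0:
 *
 *     ne10_fft_cpx_int16_t in[4]  = { {100, 0}, {100, 0}, {100, 0}, {100, 0} };
 *     ne10_fft_cpx_int16_t out[4];
 *     ne10_fft4_forward_int16_unscaled (out, in);
 *     // out[0] = {400, 0}; out[1] = out[2] = out[3] = {0, 0}
 */
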
static inline void ne10_fft4_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int16_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r - s1_i;
    Fout[1].i = s2_i + s1_r;
    Fout[3].r = s2_r + s1_i;
    Fout[3].i = s2_i - s1_r;
}

static inline void ne10_fft4_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int16_t tmp_r, tmp_i;

    s2_r = (Fin[0].r - Fin[2].r) >> 2;
    s2_i = (Fin[0].i - Fin[2].i) >> 2;
    tmp_r = (Fin[0].r + Fin[2].r) >> 2;
    tmp_i = (Fin[0].i + Fin[2].i) >> 2;

    s0_r = (Fin[1].r + Fin[3].r) >> 2;
    s0_i = (Fin[1].i + Fin[3].i) >> 2;
    s1_r = (Fin[1].r - Fin[3].r) >> 2;
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
}

static inline void ne10_fft4_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int16_t tmp_r, tmp_i;

    s2_r = (Fin[0].r - Fin[2].r) >> 2;
    s2_i = (Fin[0].i - Fin[2].i) >> 2;
    tmp_r = (Fin[0].r + Fin[2].r) >> 2;
    tmp_i = (Fin[0].i + Fin[2].i) >> 2;

    s0_r = (Fin[1].r + Fin[3].r) >> 2;
    s0_i = (Fin[1].i + Fin[3].i) >> 2;
    s1_r = (Fin[1].r - Fin[3].r) >> 2;
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r - s1_i;
    Fout[1].i = s2_i + s1_r;
    Fout[3].r = s2_r + s1_i;
    Fout[3].i = s2_i - s1_r;
}
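
/*
 * Note (editorial): the "_scaled" kernels pre-shift every term right by
 * log2(N) -- ">> 2" in the 4-point kernels above, ">> 3" in the 8-point
 * kernels below -- so intermediate sums stay inside 16 bits. For example,
 * four inputs of amplitude 32000 would overflow the unscaled Fout[0]
 * (4 * 32000 = 128000), while the scaled kernel first reduces each pair sum
 * to 16000 and produces Fout[0].r = 32000.
 */
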
static inline void ne10_fft8_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int16_t TW_81 = 23169;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);

    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
}
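
/*
 * Note (editorial): TW_81 = 23169 is the radix-8 twiddle cos(pi/4) = sqrt(2)/2
 * expressed in Q15 (0.7071 * 2^15), assuming NE10_F2I16_SHIFT is the Q15 shift
 * of 15. A product such as
 *
 *     ((NE10_F2I16_SAMPPROD) x * TW_81) >> NE10_F2I16_SHIFT
 *
 * therefore computes approximately 0.7071 * x in the wider accumulator type
 * before truncating back to 16 bits.
 */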

static inline void ne10_fft8_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int16_t TW_81 = 23169;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r - t3_i;
    Fout[2].i = t0_i + t3_r;
    Fout[6].r = t0_r + t3_i;
    Fout[6].i = t0_i - t3_r;

    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);

    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r - t3_i;
    Fout[3].i = t0_i + t3_r;
    Fout[7].r = t0_r + t3_i;
    Fout[7].i = t0_i - t3_r;
}

static inline void ne10_fft8_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int16_t TW_81 = 23169;

    s0_r = (Fin[0].r + Fin[4].r) >> 3;
    s0_i = (Fin[0].i + Fin[4].i) >> 3;
    s1_r = (Fin[0].r - Fin[4].r) >> 3;
    s1_i = (Fin[0].i - Fin[4].i) >> 3;
    s2_r = (Fin[1].r + Fin[5].r) >> 3;
    s2_i = (Fin[1].i + Fin[5].i) >> 3;
    s3_r = (Fin[1].r - Fin[5].r) >> 3;
    s3_i = (Fin[1].i - Fin[5].i) >> 3;
    s4_r = (Fin[2].r + Fin[6].r) >> 3;
    s4_i = (Fin[2].i + Fin[6].i) >> 3;
    s5_r = (Fin[2].r - Fin[6].r) >> 3;
    s5_i = (Fin[2].i - Fin[6].i) >> 3;
    s6_r = (Fin[3].r + Fin[7].r) >> 3;
    s6_i = (Fin[3].i + Fin[7].i) >> 3;
    s7_r = (Fin[3].r - Fin[7].r) >> 3;
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);

    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
}

static inline void ne10_fft8_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int16_t TW_81 = 23169;

    s0_r = (Fin[0].r + Fin[4].r) >> 3;
    s0_i = (Fin[0].i + Fin[4].i) >> 3;
    s1_r = (Fin[0].r - Fin[4].r) >> 3;
    s1_i = (Fin[0].i - Fin[4].i) >> 3;
    s2_r = (Fin[1].r + Fin[5].r) >> 3;
    s2_i = (Fin[1].i + Fin[5].i) >> 3;
    s3_r = (Fin[1].r - Fin[5].r) >> 3;
    s3_i = (Fin[1].i - Fin[5].i) >> 3;
    s4_r = (Fin[2].r + Fin[6].r) >> 3;
    s4_i = (Fin[2].i + Fin[6].i) >> 3;
    s5_r = (Fin[2].r - Fin[6].r) >> 3;
    s5_i = (Fin[2].i - Fin[6].i) >> 3;
    s6_r = (Fin[3].r + Fin[7].r) >> 3;
    s6_i = (Fin[3].i + Fin[7].i) >> 3;
    s7_r = (Fin[3].r - Fin[7].r) >> 3;
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r - t3_i;
    Fout[2].i = t0_i + t3_r;
    Fout[6].r = t0_r + t3_i;
    Fout[6].i = t0_i - t3_r;

    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);

    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r - t3_i;
    Fout[3].i = t0_i + t3_r;
    Fout[7].r = t0_r + t3_i;
    Fout[7].i = t0_i - t3_r;
}

static void ne10_fft_split_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
        const ne10_fft_cpx_int16_t *src,
        ne10_fft_cpx_int16_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int16_t fpnk, fpk, f1k, f2k, tw, tdc;
    int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    int16x8_t q_fpnk_r, q_fpnk_i;
    int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    int16x8_t q_tw_r, q_tw_i;
    int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_dst2_r, q_dst2_i;
    int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;

    if (scaled_flag)
        NE10_F2I16_FIXDIV (tdc, 2);

    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;

    if (count >= 8)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fpk  = vld2q_s16 (p_src);
                q2_fpnk = vld2q_s16 (p_src2);

                q2_tw = vld2q_s16 (p_twiddles);
                q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
                q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
                q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
                q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s16 (q_fpnk_i);

                q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
        else
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fpk  = vld2q_s16 (p_src);
                q2_fpnk = vld2q_s16 (p_src2);

                q2_tw = vld2q_s16 (p_twiddles);
                q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
                q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
                q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
                q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s16 (q_fpnk_i);

                q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
    }
    else
    {
        for (k = 1; k <= ncfft / 2 ; ++k)
        {
            fpk = src[k];
            fpnk.r = src[ncfft - k].r;
            fpnk.i = - src[ncfft - k].i;
            if (scaled_flag)
            {
                NE10_F2I16_FIXDIV (fpk, 2);
                NE10_F2I16_FIXDIV (fpnk, 2);
            }

            f1k.r = fpk.r + fpnk.r;
            f1k.i = fpk.i + fpnk.i;

            f2k.r = fpk.r - fpnk.r;
            f2k.i = fpk.i - fpnk.i;

            tw.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).r
                                      - (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
            tw.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).i
                                      + (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> NE10_F2I16_SHIFT);

            dst[k].r = (f1k.r + tw.r) >> 1;
            dst[k].i = (f1k.i + tw.i) >> 1;
            dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
            dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
        }
    }
}
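
/*
 * Note (editorial): both paths above implement the standard real-to-complex
 * post-processing. With Z[k] the half-length complex FFT of the packed real
 * input, the scalar tail computes
 *
 *     f1k    = Z[k] + conj(Z[ncfft - k])        (even part)
 *     f2k    = Z[k] - conj(Z[ncfft - k])        (odd part)
 *     dst[k] = (f1k + twiddles[k - 1] * f2k) / 2
 *
 * The NEON path does the same eight bins at a time: the vrev32q/vrev64q/
 * vcombine sequence reverses the eight lanes so the mirrored bins
 * Z[ncfft - k] line up with Z[k], vqdmulhq_s16 performs the Q15 twiddle
 * multiply (effectively (a * b) >> 15), and the halving vhaddq_s16/vhsubq_s16
 * intrinsics fold the final divide-by-two into the add/subtract.
 */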

static void ne10_fft_split_c2r_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
        const ne10_fft_cpx_int16_t *src,
        ne10_fft_cpx_int16_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int16_t fk, fnkc, fek, fok, tmp;
    int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    int16x8_t q_fnkc_r, q_fnkc_i;
    int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_dst2_r, q_dst2_i;
    int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    dst[0].r = src[0].r + src[ncfft].r;
    dst[0].i = src[0].r - src[ncfft].r;

    if (scaled_flag)
        NE10_F2I16_FIXDIV (dst[0], 2);

    if (count >= 8)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fk   = vld2q_s16 (p_src);
                q2_fnkc = vld2q_s16 (p_src2);
                q2_tw   = vld2q_s16 (p_twiddles);
                q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
                q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
                q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
                q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s16 (q_fnkc_i);

                q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);

                q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
        else
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fk   = vld2q_s16 (p_src);
                q2_fnkc = vld2q_s16 (p_src2);
                q2_tw   = vld2q_s16 (p_twiddles);
                q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
                q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
                q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
                q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s16 (q_fnkc_i);

                q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);

                q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
    }
    else
    {
        for (k = 1; k <= ncfft / 2; k++)
        {
            fk = src[k];
            fnkc.r = src[ncfft - k].r;
            fnkc.i = -src[ncfft - k].i;
            if (scaled_flag)
            {
                NE10_F2I16_FIXDIV (fk, 2);
                NE10_F2I16_FIXDIV (fnkc, 2);
            }

            fek.r = fk.r + fnkc.r;
            fek.i = fk.i + fnkc.i;

            tmp.r = fk.r - fnkc.r;
            tmp.i = fk.i - fnkc.i;

            fok.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).r
                                       + (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
            fok.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).r
                                       - (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);

            dst[k].r = fek.r + fok.r;
            dst[k].i = fek.i + fok.i;

            dst[ncfft - k].r = fek.r - fok.r;
            dst[ncfft - k].i = fok.i - fek.i;
        }
    }
}
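
/*
 * Note (editorial): this is the inverse of the r2c split above. The even part
 * fek = fk + conj(fnk) and the odd part fok = (fk - conj(fnk)) * conj(twiddle)
 * are recombined into dst[k] = fek + fok and dst[ncfft - k] = conj(fek - fok),
 * which the following complex IFFT turns back into ncfft packed real pairs.
 */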

/**
 * Mixed radix-2/4 complex FFT/IFFT of 16-bit fixed point data.
 */
void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
                                 ne10_fft_cpx_int16_t *fin,
                                 ne10_fft_cfg_int16_t cfg,
                                 ne10_int32_t inverse_fft,
                                 ne10_int32_t scaled_flag)
{
    if (scaled_flag)
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int16_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int16_scaled (fout, fin);
                break;
            default:
                ne10_mixed_radix_fft_backward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int16_scaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int16_scaled (fout, fin);
                break;
            default:
                ne10_mixed_radix_fft_forward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
    else
    {
        if (inverse_fft)
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_backward_int16_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_backward_int16_unscaled (fout, fin);
                break;
            default:
                ne10_mixed_radix_fft_backward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
        else
        {
            switch (cfg->nfft)
            {
            case 4:
                ne10_fft4_forward_int16_unscaled (fout, fin);
                break;
            case 8:
                ne10_fft8_forward_int16_unscaled (fout, fin);
                break;
            default:
                ne10_mixed_radix_fft_forward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer);
                break;
            }
        }
    }
}
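
/*
 * Illustrative usage sketch (editorial, not part of this file). The allocation
 * helper name is an assumption based on the NE10 int16 C2C API; consult
 * NE10_dsp.h for the authoritative prototype.
 *
 *     ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (1024);
 *     ne10_fft_cpx_int16_t in[1024], out[1024];
 *     // ... fill in[] ...
 *     ne10_fft_c2c_1d_int16_neon (out, in, cfg, 0, 1);  // inverse_fft = 0, scaled_flag = 1
 */
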
// end of C2C_FFT_IFFT group

/**
 * Mixed radix-2/4 FFT (real to complex) of int16 data.
 */
void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
                                 ne10_int16_t *fin,
                                 ne10_fft_r2c_cfg_int16_t cfg,
                                 ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int16_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_c2c_1d_int16_neon (tmpbuf1, (ne10_fft_cpx_int16_t*) fin, &c2c_state, 0, scaled_flag);
    ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
}

/**
 * Mixed radix-2/4 IFFT (complex to real) of int16 data.
 */
void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout,
                                 ne10_fft_cpx_int16_t *fin,
                                 ne10_fft_r2c_cfg_int16_t cfg,
                                 ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int16_t * tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int16_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int16_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
    ne10_fft_c2c_1d_int16_neon ( (ne10_fft_cpx_int16_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
}
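
/*
 * Illustrative usage sketch (editorial, not part of this file). The r2c/c2r
 * pair share one configuration; the allocation helper name below is an
 * assumption, consult NE10_dsp.h for the authoritative prototype.
 *
 *     ne10_fft_r2c_cfg_int16_t cfg = ne10_fft_alloc_r2c_int16 (512);
 *     ne10_int16_t time[512], time2[512];
 *     ne10_fft_cpx_int16_t freq[512 / 2 + 1];
 *     // ... fill time[] ...
 *     ne10_fft_r2c_1d_int16_neon (freq, time, cfg, 1);   // scaled forward transform
 *     ne10_fft_c2r_1d_int16_neon (time2, freq, cfg, 1);  // scaled inverse transform
 */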