4 static const __m128
twoTo23 = (
const __m128) { 0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f };
9 __m128
b = (__m128) _mm_srli_epi32( _mm_slli_epi32( (__m128i) v, 1 ), 1 );
13 __m128 largeMaskE = _mm_cmpgt_ps( b,
twoTo23 );
15 __m128
g = _mm_cmplt_ps( v, d );
17 __m128
h = _mm_cvtepi32_ps( (__m128i) g );
19 __m128
t = _mm_add_ps( d, h );
21 v = _mm_and_ps( v, largeMaskE );
22 t = _mm_andnot_ps( largeMaskE, t );
23 return _mm_or_ps( t, v );
27 static const __m128
pi = {0x3.243f6a8885a308d4p0f, 0x3.243f6a8885a308d4p0f, 0x3.243f6a8885a308d4p0f, 0x3.243f6a8885a308d4p0f};
28 static const __m128
twopi = {0x6.487ed5110b4611a8p0f, 0x6.487ed5110b4611a8p0f, 0x6.487ed5110b4611a8p0f, 0x6.487ed5110b4611a8p0f};
29 static const __m128
pi_over_two = {0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f};
30 static const __m128
pi_over_four = {0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f};
32 static const __m128
sqrt2_minus_1 = {0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f};
33 static const __m128
sqrt2_plus_1 = {0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f};
35 static const __m128
zero = {0., 0., 0., 0.};
36 static const __m128
one = {1., 1., 1., 1.};
37 static const __m128
three = {3., 3., 3., 3.};
38 static const __m128
negone = {-1., -1., -1., -1.};
39 static const __m128
one_o_2 = {0.5,0.5,0.5,0.5};
41 static const unsigned int sign_int[4]
__attribute__((aligned(16))) = {0x80000000,0x80000000,0x80000000,0x80000000};
43 static const __m128
atanC0 = {0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f};
44 static const __m128
atanC1 = {-0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f};
45 static const __m128
atanC2 = {0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f};
46 static const __m128
atanC3 = {-0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f};
49 inline __m128
__attribute__((always_inline)) _vec_rsqrt_ps(__m128
x)
51 __m128 x0 = _mm_rsqrt_ps(x);
52 return _mm_mul_ps( one_o_2, _mm_mul_ps( x0, _mm_sub_ps( three, _mm_mul_ps( x0, _mm_mul_ps(x0, x) ) ) ) );
56 inline __m128
__attribute__((always_inline)) _vec_sqrt_ps(__m128 x)
58 __m128 x0 = _mm_rsqrt_ps(x);
59 return _mm_mul_ps(_mm_mul_ps( one_o_2, _mm_mul_ps( x0, _mm_sub_ps( three, _mm_mul_ps( x0, _mm_mul_ps(x0, x) ) ) ) ), x);
63 inline __m128
__attribute__((always_inline)) _vec_rec_ps(__m128 x)
65 __m128
a = _vec_rsqrt_ps(x);
66 return _mm_mul_ps(a, a);
70 inline __m128
__attribute__((always_inline)) _vec_atan_ps(__m128 x)
81 vec_sgn = _mm_load_si128((__m128i*)sign_int);
82 x_sign = _mm_and_ps((__m128)vec_sgn, x);
83 x = _mm_xor_ps(x_sign, x);
88 gr1 = _mm_cmpgt_ps(x, sqrt2_plus_1);
89 gr2 = _mm_cmpgt_ps(x, sqrt2_minus_1);
90 z1 = _mm_div_ps(negone, x);
91 z1 = _mm_and_ps(z1, gr1);
92 z2 = _mm_add_ps(x, negone);
93 x2 = _mm_sub_ps(x, negone);
94 z2 = _mm_div_ps(z2, x2);
95 x = _mm_andnot_ps(gr2, x);
96 gr2 = _mm_xor_ps(gr2, gr1);
97 z2 = _mm_and_ps(z2, gr2);
103 x2 = _mm_mul_ps(x, x);
104 z1 = _mm_mul_ps(x2, atanC3);
105 z1 = _mm_add_ps(z1, atanC2);
106 z2 = _mm_mul_ps(x2, z1);
107 z2 = _mm_add_ps(z2, atanC1);
108 z1 = _mm_mul_ps(x2, z2);
109 z1 = _mm_add_ps(z1, atanC0);
110 z1 = _mm_mul_ps(z1, x);
113 x = _mm_and_ps(gr1, pi_over_two);
114 x2 = _mm_and_ps(gr2, pi_over_four);
115 x2 = _mm_or_ps(x2, x);
116 x2 = _mm_add_ps(x2, z1);
119 x2 = _mm_xor_ps(x2, x_sign);
125 inline __m128
__attribute__((always_inline)) _vec_atan2_ps(__m128
y, __m128 x)
127 __m128 eq0 = _mm_cmpeq_ps(x, zero);
131 __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
132 __m128
y_sign = _mm_and_ps((__m128)vec_sgn, y);
133 __m128
less0 = _mm_cmplt_ps(x, zero);
134 zero_pio2 = _mm_xor_ps(y_sign, zero_pio2);
136 zero_pi = _mm_xor_ps(y_sign, zero_pi);
137 atanval = _mm_andnot_ps(eq0, atanval);
138 atanval = _mm_add_ps(zero_pio2, atanval);
139 atanval = _mm_add_ps(zero_pi, atanval);
145 inline void __attribute__((always_inline)) _vec_fabs_ps(__m128 &
v)
147 __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
148 v = _mm_andnot_ps((__m128)vec_sgn, v);
151 static const __m128
one_over_twopi = {0x2.8be60db9391054ap-4
f, 0x2.8be60db9391054ap-4
f, 0x2.8be60db9391054ap-4
f, 0x2.8be60db9391054ap-4
f};
153 static const __m128
fourth = {0x0.4p0f, 0x0.4p0f, 0x0.4p0f, 0x0.4p0f};
154 static const __m128
half = {0x0.8p0f, 0x0.8p0f, 0x0.8p0f, 0x0.8p0f};
156 static const __m128
sinC0 = {0x6.487c58e5205cd791p0f, 0x6.487c58e5205cd791p0f, 0x6.487c58e5205cd791p0f, 0x6.487c58e5205cd791p0f};
157 static const __m128
sinC1 = {-0x2.955c385f44a6765fp4f, -0x2.955c385f44a6765fp4f, -0x2.955c385f44a6765fp4f, -0x2.955c385f44a6765fp4f};
158 static const __m128
sinC2 = {0x5.145d3f35fa67830ep4f, 0x5.145d3f35fa67830ep4f, 0x5.145d3f35fa67830ep4f, 0x5.145d3f35fa67830ep4f};
159 static const __m128
sinC3 = {-0x4.65f4793b5cd9628fp4f, -0x4.65f4793b5cd9628fp4f, -0x4.65f4793b5cd9628fp4f, -0x4.65f4793b5cd9628fp4f};
166 x = _mm_mul_ps(x, one_over_twopi);
168 x = _mm_sub_ps(x, _vec_floor_ps(x));
171 __m128
t1 = _mm_andnot_ps(mod_half, zero);
172 __m128
t2 = _mm_and_ps(mod_half, half);
173 t1 = _mm_xor_ps(t1, t2);
174 x = _mm_sub_ps(x, t1);
177 t1 = _mm_andnot_ps(mod_fourth, zero);
178 t2 = _mm_and_ps(mod_fourth, fourth);
179 t1 = _mm_xor_ps(t1, t2);
180 x = _mm_sub_ps(x, t1);
182 __m128
z = _mm_sub_ps(fourth, x);
187 x2 = _mm_mul_ps(x, x);
188 z2 = _mm_mul_ps(z, z);
189 t1 = _mm_mul_ps(x2, sinC3);
190 k1 = _mm_mul_ps(z2, sinC3);
191 t1 = _mm_add_ps(t1, sinC2);
192 k1 = _mm_add_ps(k1, sinC2);
193 t2 = _mm_mul_ps(x2, t1);
194 k2 = _mm_mul_ps(z2, k1);
195 t2 = _mm_add_ps(t2, sinC1);
196 k2 = _mm_add_ps(k2, sinC1);
197 t1 = _mm_mul_ps(x2, t2);
198 k1 = _mm_mul_ps(z2, k2);
199 t1 = _mm_add_ps(t1, sinC0);
200 k1 = _mm_add_ps(k1, sinC0);
201 t1 = _mm_mul_ps(t1, x);
202 k1 = _mm_mul_ps(k1, z);
205 s = _mm_andnot_ps(mod_fourth, t1);
206 t2 = _mm_and_ps(mod_fourth, k1);
207 s = _mm_xor_ps(s, t2);
209 c = _mm_andnot_ps(mod_fourth, k1);
210 k2 = _mm_and_ps(mod_fourth, t1);
211 c = _mm_xor_ps(c, k2);
214 __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
215 __m128 x_sign = _mm_and_ps(mod_half, (__m128)vec_sgn);
216 s = _mm_xor_ps(s, x_sign);
218 __m128
mod_cos = _mm_xor_ps(mod_fourth, mod_half);
219 x_sign = _mm_and_ps(mod_cos, (__m128)vec_sgn);
220 c = _mm_xor_ps(c, x_sign);
224 static const __m128d
pid = {0x3.243f6a8885a308d4p0, 0x3.243f6a8885a308d4p0};
225 static const __m128d
twopid = {0x6.487ed5110b4611a8p0, 0x6.487ed5110b4611a8p0};
226 static const __m128d
pi_over_twod = {0x1.921fb54442d1846ap0, 0x1.921fb54442d1846ap0};
227 static const __m128d
pi_over_fourd = {0xc.90fdaa22168c2350p-4, 0xc.90fdaa22168c2350p-4};
229 static const unsigned int sign_int_d[4]
__attribute__((aligned(16))) = {0x80000000,0x00000000,0x80000000,0x00000000};
231 static const __m128d
atanD0 = {0xf.fffffffffffffffffab9f803d2af0fb0p-4, 0xf.fffffffffffffffffab9f803d2af0fb0p-4};
232 static const __m128d
atanD1 = {-0x5.5555555555555540c86e6b5fd8e468b0p-4, -0x5.5555555555555540c86e6b5fd8e468b0p-4};
233 static const __m128d
atanD2 = {0x3.3333333333331b3d286002f2c41b81e0p-4, 0x3.3333333333331b3d286002f2c41b81e0p-4};
234 static const __m128d
atanD3 = {-0x2.4924924924851ef4ced41f651e628f40p-4, -0x2.4924924924851ef4ced41f651e628f40p-4};
235 static const __m128d
atanD4 = {0x1.c71c71c7184d82416ebb498f58e3a070p-4, 0x1.c71c71c7184d82416ebb498f58e3a070p-4};
236 static const __m128d
atanD5 = {-0x1.745d1744fcf2617d04fabeb866a4f6dep-4, -0x1.745d1744fcf2617d04fabeb866a4f6dep-4};
237 static const __m128d
atanD6 = {0x1.3b13b11e1c3da8ad9c8d73ccfc6c3722p-4, 0x1.3b13b11e1c3da8ad9c8d73ccfc6c3722p-4};
238 static const __m128d
atanD7 = {-0x1.11110e44194b2e57596c2fdb8bfd10a0p-4, -0x1.11110e44194b2e57596c2fdb8bfd10a0p-4};
239 static const __m128d
atanD8 = {0xf.0f0be69f64251bcf8af8470c615a7c40p-8, 0xf.0f0be69f64251bcf8af8470c615a7c40p-8};
240 static const __m128d
atanD9 = {-0xd.79192575eea6d23daa4764613eb39850p-8, -0xd.79192575eea6d23daa4764613eb39850p-8};
241 static const __m128d
atanD10 = {0xc.2f1d52fbd8638e0fd3d27cdf0e6e13c0p-8, 0xc.2f1d52fbd8638e0fd3d27cdf0e6e13c0p-8};
242 static const __m128d
atanD11 = {-0xb.1518a42d3671c2ee1bf46f36650357a0p-8, -0xb.1518a42d3671c2ee1bf46f36650357a0p-8};
243 static const __m128d
atanD12 = {0x9.f9678bbe523adaab81aff27ebf0ec070p-8, 0x9.f9678bbe523adaab81aff27ebf0ec070p-8};
244 static const __m128d
atanD13 = {-0x8.68d5692c1b536ea0b6afe3a59bdae0b0p-8, -0x8.68d5692c1b536ea0b6afe3a59bdae0b0p-8};
245 static const __m128d
atanD14 = {0x5.c3234a24f6727d6cacaf7b5c9647c2b8p-8, 0x5.c3234a24f6727d6cacaf7b5c9647c2b8p-8};
246 static const __m128d
atanD15 = {-0x2.46516323aab74d114a581091dd99ed84p-8, -0x2.46516323aab74d114a581091dd99ed84p-8};
248 static const __m128d
sqrt2_minus_1d = {0x6.a09e667f3bcc9080p-4, 0x6.a09e667f3bcc9080p-4};
249 static const __m128d
sqrt2_plus_1d = {0x2.6a09e667f3bcc9080p0, 0x2.6a09e667f3bcc9080p0};
251 static const __m128d
zerod = {0., 0.};
252 static const __m128d
oned = {1., 1.};
257 inline __m128d
__attribute__((always_inline)) _vec_atan_dfull_ps(__m128d x)
268 vec_sgn = _mm_load_si128((__m128i*)sign_int_d);
269 x_sign = _mm_and_pd((__m128d)vec_sgn, x);
270 x = _mm_xor_pd(x_sign, x);
274 gr1 = _mm_cmpgt_pd(x, sqrt2_plus_1d);
275 gr2 = _mm_cmpgt_pd(x, sqrt2_minus_1d);
276 z1 = _mm_div_pd(negoned, x);
277 z1 = _mm_and_pd(z1, gr1);
278 z2 = _mm_add_pd(x, negoned);
279 x2 = _mm_sub_pd(x, negoned);
280 z2 = _mm_div_pd(z2, x2);
281 x = _mm_andnot_pd(gr2, x);
282 gr2 = _mm_xor_pd(gr2, gr1);
283 z2 = _mm_and_pd(z2, gr2);
284 x = _mm_or_pd(x, z1);
285 x = _mm_or_pd(x, z2);
288 x2 = _mm_mul_pd(x, x);
289 z1 = _mm_mul_pd(x2, atanD15);
290 z1 = _mm_add_pd(z1, atanD14);
291 z2 = _mm_mul_pd(x2, z1);
292 z2 = _mm_add_pd(z2, atanD13);
293 z1 = _mm_mul_pd(x2, z2);
294 z1 = _mm_add_pd(z1, atanD12);
295 z2 = _mm_mul_pd(x2, z1);
296 z2 = _mm_add_pd(z2, atanD11);
297 z1 = _mm_mul_pd(x2, z2);
298 z1 = _mm_add_pd(z1, atanD10);
299 z2 = _mm_mul_pd(x2, z1);
300 z2 = _mm_add_pd(z2, atanD9);
301 z1 = _mm_mul_pd(x2, z2);
302 z1 = _mm_add_pd(z1, atanD8);
303 z2 = _mm_mul_pd(x2, z1);
304 z2 = _mm_add_pd(z2, atanD7);
305 z1 = _mm_mul_pd(x2, z2);
306 z1 = _mm_add_pd(z1, atanD6);
307 z2 = _mm_mul_pd(x2, z1);
308 z2 = _mm_add_pd(z2, atanD5);
309 z1 = _mm_mul_pd(x2, z2);
310 z1 = _mm_add_pd(z1, atanD4);
311 z2 = _mm_mul_pd(x2, z1);
312 z2 = _mm_add_pd(z2, atanD3);
313 z1 = _mm_mul_pd(x2, z2);
314 z1 = _mm_add_pd(z1, atanD2);
315 z2 = _mm_mul_pd(x2, z1);
316 z2 = _mm_add_pd(z2, atanD1);
317 z1 = _mm_mul_pd(x2, z2);
318 z1 = _mm_add_pd(z1, atanD0);
319 z1 = _mm_mul_pd(z1, x);
322 x = _mm_and_pd(gr1, pi_over_twod);
323 x2 = _mm_and_pd(gr2, pi_over_fourd);
324 x2 = _mm_or_pd(x2, x);
325 x2 = _mm_add_pd(x2, z1);
328 x2 = _mm_xor_pd(x2, x_sign);