6 static const __m256 pi_over_two_256 = {0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f};
7 static const __m256 pi_over_four_256 = {0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f, 0xc.90fdaa22168c2350p-4
f};
8 static const __m256 three_256 = {3., 3., 3., 3., 3., 3., 3., 3.};
9 static const __m256 one_o_2_256 = {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5};
10 static const __m256 zero_256 = {0., 0., 0., 0., 0., 0., 0., 0.};
11 static const __m256 one_256 = {1., 1., 1., 1., 1., 1., 1., 1.};
12 static const __m256 negone_256 = {-1., -1., -1., -1., -1., -1., -1., -1.};
13 static const __m256 atanC0_256 = {0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f, 0xf.fffb771eba87d370p-4
f};
14 static const __m256 atanC1_256 = {-0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f, -0x5.542eef19db937268p-4
f};
15 static const __m256 atanC2_256 = {0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f, 0x3.1dcf607e2808c0d4p-4
f};
16 static const __m256 atanC3_256 = {-0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f, -0x1.ab85dd26f5264feep-4
f};
17 static const __m256 sqrt2_minus_1_256 = {0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f, 0x6.a09e667f3bcc9080p-4
f};
18 static const __m256 sqrt2_plus_1_256 = {0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f};
20 static const unsigned int sign_int_256[8]
__attribute__((aligned(32))) = {0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000};
23 static inline __m256
__attribute__((always_inline)) _mm256_cmplt_ps(__m256
a, __m256
b){
return _mm256_cmp_ps(a, b, _CMP_LT_OQ); }
24 static inline __m256
__attribute__((always_inline)) _mm256_cmpgt_ps(__m256
a, __m256
b){
return _mm256_cmp_ps(a, b, _CMP_GT_OQ); }
25 static inline __m256
__attribute__((always_inline)) _mm256_cmpeq_ps(__m256
a, __m256
b){
return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
28 static inline __m256
__attribute__((always_inline)) _mm256_load1_ps(
float x){
return _mm256_set_ps(x, x, x, x, x, x, x, x); }
31 inline __m256
__attribute__((always_inline)) _vec256_rsqrt_ps(__m256
x)
33 __m256 x0 = _mm256_rsqrt_ps(x);
34 return _mm256_mul_ps( one_o_2_256, _mm256_mul_ps( x0, _mm256_sub_ps( three_256, _mm256_mul_ps( x0, _mm256_mul_ps(x0, x) ) ) ) );
38 inline __m256
__attribute__((always_inline)) _vec256_sqrt_ps(__m256 x)
40 __m256 x0 = _mm256_rsqrt_ps(x);
41 return _mm256_mul_ps(_mm256_mul_ps( one_o_2_256, _mm256_mul_ps( x0, _mm256_sub_ps( three_256, _mm256_mul_ps( x0, _mm256_mul_ps(x0, x) ) ) ) ), x);
45 inline __m256
__attribute__((always_inline)) _vec256_rec_ps(__m256 x)
47 __m256
a = _vec256_rsqrt_ps(x);
48 return _mm256_mul_ps(a, a);
52 inline void __attribute__((always_inline)) _vec256_fabs_ps(__m256 &
v)
54 __m256i
vec_sgn = _mm256_load_si256((__m256i*)sign_int_256);
55 v = _mm256_andnot_ps((__m256)vec_sgn, v);
61 inline __m256
__attribute__((always_inline)) _vec256_atan_ps(__m256 x)
72 vec_sgn = _mm256_load_si256((__m256i*)sign_int_256);
73 x_sign = _mm256_and_ps((__m256)vec_sgn, x);
74 x = _mm256_xor_ps(x_sign, x);
79 gr1 = _mm256_cmpgt_ps(x, sqrt2_plus_1_256);
80 gr2 = _mm256_cmpgt_ps(x, sqrt2_minus_1_256);
81 z1 = _mm256_div_ps(negone_256, x);
82 z1 = _mm256_and_ps(z1, gr1);
83 z2 = _mm256_add_ps(x, negone_256);
84 x2 = _mm256_sub_ps(x, negone_256);
85 z2 = _mm256_div_ps(z2, x2);
86 x = _mm256_andnot_ps(gr2, x);
87 gr2 = _mm256_xor_ps(gr2, gr1);
88 z2 = _mm256_and_ps(z2, gr2);
89 x = _mm256_or_ps(x, z1);
90 x = _mm256_or_ps(x, z2);
94 x2 = _mm256_mul_ps(x, x);
95 z1 = _mm256_mul_ps(x2, atanC3_256);
96 z1 = _mm256_add_ps(z1, atanC2_256);
97 z2 = _mm256_mul_ps(x2, z1);
98 z2 = _mm256_add_ps(z2, atanC1_256);
99 z1 = _mm256_mul_ps(x2, z2);
100 z1 = _mm256_add_ps(z1, atanC0_256);
101 z1 = _mm256_mul_ps(z1, x);
104 x = _mm256_and_ps(gr1, pi_over_two_256);
105 x2 = _mm256_and_ps(gr2, pi_over_four_256);
106 x2 = _mm256_or_ps(x2, x);
107 x2 = _mm256_add_ps(x2, z1);
110 x2 = _mm256_xor_ps(x2, x_sign);