ECCE @ EIC Software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
vector_math_inline_avx.h
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file vector_math_inline_avx.h
1 #ifdef AVXHOUGH
2 
3 #include <immintrin.h>
4 
5 
6 static const __m256 pi_over_two_256 = {0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f};
7 static const __m256 pi_over_four_256 = {0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f};
8 static const __m256 three_256 = {3., 3., 3., 3., 3., 3., 3., 3.};
9 static const __m256 one_o_2_256 = {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5};
10 static const __m256 zero_256 = {0., 0., 0., 0., 0., 0., 0., 0.};
11 static const __m256 one_256 = {1., 1., 1., 1., 1., 1., 1., 1.};
12 static const __m256 negone_256 = {-1., -1., -1., -1., -1., -1., -1., -1.};
13 static const __m256 atanC0_256 = {0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f};
14 static const __m256 atanC1_256 = {-0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f};
15 static const __m256 atanC2_256 = {0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f};
16 static const __m256 atanC3_256 = {-0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f};
17 static const __m256 sqrt2_minus_1_256 = {0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f};
18 static const __m256 sqrt2_plus_1_256 = {0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f};
19 
20 static const unsigned int sign_int_256[8] __attribute__((aligned(32))) = {0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000,0x80000000};
21 
22 
23 static inline __m256 __attribute__((always_inline)) _mm256_cmplt_ps(__m256 a, __m256 b){ return _mm256_cmp_ps(a, b, _CMP_LT_OQ); }
24 static inline __m256 __attribute__((always_inline)) _mm256_cmpgt_ps(__m256 a, __m256 b){ return _mm256_cmp_ps(a, b, _CMP_GT_OQ); }
25 static inline __m256 __attribute__((always_inline)) _mm256_cmpeq_ps(__m256 a, __m256 b){ return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
26 
27 
28 static inline __m256 __attribute__((always_inline)) _mm256_load1_ps(float x){ return _mm256_set_ps(x, x, x, x, x, x, x, x); }
29 
30 
31 inline __m256 __attribute__((always_inline)) _vec256_rsqrt_ps(__m256 x)
32 {
33  __m256 x0 = _mm256_rsqrt_ps(x);
34  return _mm256_mul_ps( one_o_2_256, _mm256_mul_ps( x0, _mm256_sub_ps( three_256, _mm256_mul_ps( x0, _mm256_mul_ps(x0, x) ) ) ) );
35 }
36 
37 
38 inline __m256 __attribute__((always_inline)) _vec256_sqrt_ps(__m256 x)
39 {
40  __m256 x0 = _mm256_rsqrt_ps(x);
41  return _mm256_mul_ps(_mm256_mul_ps( one_o_2_256, _mm256_mul_ps( x0, _mm256_sub_ps( three_256, _mm256_mul_ps( x0, _mm256_mul_ps(x0, x) ) ) ) ), x);
42 }
43 
44 
45 inline __m256 __attribute__((always_inline)) _vec256_rec_ps(__m256 x)
46 {
47  __m256 a = _vec256_rsqrt_ps(x);
48  return _mm256_mul_ps(a, a);
49 }
50 
51 
52 inline void __attribute__((always_inline)) _vec256_fabs_ps(__m256 &v)
53 {
54  __m256i vec_sgn = _mm256_load_si256((__m256i*)sign_int_256);
55  v = _mm256_andnot_ps((__m256)vec_sgn, v);
56 }
57 
58 
59 
60 
61 inline __m256 __attribute__((always_inline)) _vec256_atan_ps(__m256 x)
62 {
63  __m256i vec_sgn;
64  __m256 x_sign;
65  __m256 gr1;
66  __m256 gr2;
67  __m256 z1;
68  __m256 z2;
69  __m256 x2;
70 
71  //extract the sign of x and make x positive
72  vec_sgn = _mm256_load_si256((__m256i*)sign_int_256);
73  x_sign = _mm256_and_ps((__m256)vec_sgn, x);
74  x = _mm256_xor_ps(x_sign, x);
75 
76 
77  //if x > sqrt(2)+1, then set x = -1/x
78  //else if x > sqrt(2)-1, then set x = (x-1)/(x+1)
79  gr1 = _mm256_cmpgt_ps(x, sqrt2_plus_1_256);
80  gr2 = _mm256_cmpgt_ps(x, sqrt2_minus_1_256);
81  z1 = _mm256_div_ps(negone_256, x);
82  z1 = _mm256_and_ps(z1, gr1);
83  z2 = _mm256_add_ps(x, negone_256);
84  x2 = _mm256_sub_ps(x, negone_256);
85  z2 = _mm256_div_ps(z2, x2);
86  x = _mm256_andnot_ps(gr2, x);
87  gr2 = _mm256_xor_ps(gr2, gr1);
88  z2 = _mm256_and_ps(z2, gr2);
89  x = _mm256_or_ps(x, z1);
90  x = _mm256_or_ps(x, z2);
91 
92 
93  //compute z1 = atan(x) from a Chebyshev polynomial (in monomial form) using Horner's scheme
94  x2 = _mm256_mul_ps(x, x);
95  z1 = _mm256_mul_ps(x2, atanC3_256);
96  z1 = _mm256_add_ps(z1, atanC2_256);
97  z2 = _mm256_mul_ps(x2, z1);
98  z2 = _mm256_add_ps(z2, atanC1_256);
99  z1 = _mm256_mul_ps(x2, z2);
100  z1 = _mm256_add_ps(z1, atanC0_256);
101  z1 = _mm256_mul_ps(z1, x);
102 
103  //add either pi/4 or pi/2 to z1 and set result to x2, depending on the initial value of x
104  x = _mm256_and_ps(gr1, pi_over_two_256);
105  x2 = _mm256_and_ps(gr2, pi_over_four_256);
106  x2 = _mm256_or_ps(x2, x);
107  x2 = _mm256_add_ps(x2, z1);
108 
109  //recover the original sign of x
110  x2 = _mm256_xor_ps(x2, x_sign);
111 
112  return x2;
113 }
114 
115 
116 #endif