ECCE @ EIC Software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
vector_math_inline.h
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file vector_math_inline.h
1 #include <xmmintrin.h>
2 #include <emmintrin.h>
3 
4 static const __m128 twoTo23 = (const __m128) { 0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f };
5 
6 inline __m128 __attribute__((always_inline)) _vec_floor_ps(__m128 v)
7 {
8  // b = fabs(v)
9  __m128 b = (__m128) _mm_srli_epi32( _mm_slli_epi32( (__m128i) v, 1 ), 1 );
10  // The essence of the floor routine
11  __m128 d = _mm_sub_ps( _mm_add_ps( _mm_add_ps( _mm_sub_ps( v, twoTo23 ), twoTo23 ), twoTo23 ), twoTo23 );
12  // ?1 if v >= 2**23
13  __m128 largeMaskE = _mm_cmpgt_ps( b, twoTo23 );
14  // Check for possible off by one error
15  __m128 g = _mm_cmplt_ps( v, d );
16  // Convert positive check result to -1.0, negative to 0.0
17  __m128 h = _mm_cvtepi32_ps( (__m128i) g );
18  // Add in the error if there is one
19  __m128 t = _mm_add_ps( d, h );
20  //Select between output result and input value based on v >= 2**23
21  v = _mm_and_ps( v, largeMaskE );
22  t = _mm_andnot_ps( largeMaskE, t );
23  return _mm_or_ps( t, v );
24 }
25 
26 
27 static const __m128 pi = {0x3.243f6a8885a308d4p0f, 0x3.243f6a8885a308d4p0f, 0x3.243f6a8885a308d4p0f, 0x3.243f6a8885a308d4p0f};
28 static const __m128 twopi = {0x6.487ed5110b4611a8p0f, 0x6.487ed5110b4611a8p0f, 0x6.487ed5110b4611a8p0f, 0x6.487ed5110b4611a8p0f};
29 static const __m128 pi_over_two = {0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f, 0x1.921fb54442d1846ap0f};
30 static const __m128 pi_over_four = {0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f, 0xc.90fdaa22168c2350p-4f};
31 
32 static const __m128 sqrt2_minus_1 = {0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f, 0x6.a09e667f3bcc9080p-4f};
33 static const __m128 sqrt2_plus_1 = {0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f, 0x2.6a09e667f3bcc9080p0f};
34 
35 static const __m128 zero = {0., 0., 0., 0.};
36 static const __m128 one = {1., 1., 1., 1.};
37 static const __m128 three = {3., 3., 3., 3.};
38 static const __m128 negone = {-1., -1., -1., -1.};
39 static const __m128 one_o_2 = {0.5,0.5,0.5,0.5};
40 
41 static const unsigned int sign_int[4] __attribute__((aligned(16))) = {0x80000000,0x80000000,0x80000000,0x80000000};
42 
43 static const __m128 atanC0 = {0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f, 0xf.fffb771eba87d370p-4f};
44 static const __m128 atanC1 = {-0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f, -0x5.542eef19db937268p-4f};
45 static const __m128 atanC2 = {0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f, 0x3.1dcf607e2808c0d4p-4f};
46 static const __m128 atanC3 = {-0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f, -0x1.ab85dd26f5264feep-4f};
47 
48 
49 inline __m128 __attribute__((always_inline)) _vec_rsqrt_ps(__m128 x)
50 {
51  __m128 x0 = _mm_rsqrt_ps(x);
52  return _mm_mul_ps( one_o_2, _mm_mul_ps( x0, _mm_sub_ps( three, _mm_mul_ps( x0, _mm_mul_ps(x0, x) ) ) ) );
53 }
54 
55 
56 inline __m128 __attribute__((always_inline)) _vec_sqrt_ps(__m128 x)
57 {
58  __m128 x0 = _mm_rsqrt_ps(x);
59  return _mm_mul_ps(_mm_mul_ps( one_o_2, _mm_mul_ps( x0, _mm_sub_ps( three, _mm_mul_ps( x0, _mm_mul_ps(x0, x) ) ) ) ), x);
60 }
61 
62 
63 inline __m128 __attribute__((always_inline)) _vec_rec_ps(__m128 x)
64 {
65  __m128 a = _vec_rsqrt_ps(x);
66  return _mm_mul_ps(a, a);
67 }
68 
69 
70 inline __m128 __attribute__((always_inline)) _vec_atan_ps(__m128 x)
71 {
72  __m128i vec_sgn;
73  __m128 x_sign;
74  __m128 gr1;
75  __m128 gr2;
76  __m128 z1;
77  __m128 z2;
78  __m128 x2;
79 
80  //extract the sign of x and make x positive
81  vec_sgn = _mm_load_si128((__m128i*)sign_int);
82  x_sign = _mm_and_ps((__m128)vec_sgn, x);
83  x = _mm_xor_ps(x_sign, x);
84 
85 
86  //if x > sqrt(2)+1, then set x = -1/x
87  //else if x > sqrt(2)-1, then set x = (x-1)/(x+1)
88  gr1 = _mm_cmpgt_ps(x, sqrt2_plus_1);
89  gr2 = _mm_cmpgt_ps(x, sqrt2_minus_1);
90  z1 = _mm_div_ps(negone, x);
91  z1 = _mm_and_ps(z1, gr1);
92  z2 = _mm_add_ps(x, negone);
93  x2 = _mm_sub_ps(x, negone);
94  z2 = _mm_div_ps(z2, x2);
95  x = _mm_andnot_ps(gr2, x);
96  gr2 = _mm_xor_ps(gr2, gr1);
97  z2 = _mm_and_ps(z2, gr2);
98  x = _mm_or_ps(x, z1);
99  x = _mm_or_ps(x, z2);
100 
101 
102  //compute z1 = atan(x) from a Chebyshev polynomial (in monomial form) using Horner's scheme
103  x2 = _mm_mul_ps(x, x);
104  z1 = _mm_mul_ps(x2, atanC3);
105  z1 = _mm_add_ps(z1, atanC2);
106  z2 = _mm_mul_ps(x2, z1);
107  z2 = _mm_add_ps(z2, atanC1);
108  z1 = _mm_mul_ps(x2, z2);
109  z1 = _mm_add_ps(z1, atanC0);
110  z1 = _mm_mul_ps(z1, x);
111 
112  //add either pi/4 or pi/2 to z1 and set result to x2, depending on the initial value of x
113  x = _mm_and_ps(gr1, pi_over_two);
114  x2 = _mm_and_ps(gr2, pi_over_four);
115  x2 = _mm_or_ps(x2, x);
116  x2 = _mm_add_ps(x2, z1);
117 
118  //recover the original sign of x
119  x2 = _mm_xor_ps(x2, x_sign);
120 
121  return x2;
122 }
123 
124 
125 inline __m128 __attribute__((always_inline)) _vec_atan2_ps(__m128 y, __m128 x)
126 {
127  __m128 eq0 = _mm_cmpeq_ps(x, zero);
128  __m128 ratio = _mm_div_ps(y, x);
129  __m128 atanval = _vec_atan_ps(ratio);
130  __m128 zero_pio2 = _mm_and_ps(eq0, pi_over_two);
131  __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
132  __m128 y_sign = _mm_and_ps((__m128)vec_sgn, y);
133  __m128 less0 = _mm_cmplt_ps(x, zero);
134  zero_pio2 = _mm_xor_ps(y_sign, zero_pio2);
135  __m128 zero_pi = _mm_and_ps(less0, pi);
136  zero_pi = _mm_xor_ps(y_sign, zero_pi);
137  atanval = _mm_andnot_ps(eq0, atanval);
138  atanval = _mm_add_ps(zero_pio2, atanval);
139  atanval = _mm_add_ps(zero_pi, atanval);
140 
141  return atanval;
142 }
143 
144 
145 inline void __attribute__((always_inline)) _vec_fabs_ps(__m128 &v)
146 {
147  __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
148  v = _mm_andnot_ps((__m128)vec_sgn, v);
149 }
150 
151 static const __m128 one_over_twopi = {0x2.8be60db9391054ap-4f, 0x2.8be60db9391054ap-4f, 0x2.8be60db9391054ap-4f, 0x2.8be60db9391054ap-4f};
152 
153 static const __m128 fourth = {0x0.4p0f, 0x0.4p0f, 0x0.4p0f, 0x0.4p0f};
154 static const __m128 half = {0x0.8p0f, 0x0.8p0f, 0x0.8p0f, 0x0.8p0f};
155 
156 static const __m128 sinC0 = {0x6.487c58e5205cd791p0f, 0x6.487c58e5205cd791p0f, 0x6.487c58e5205cd791p0f, 0x6.487c58e5205cd791p0f};
157 static const __m128 sinC1 = {-0x2.955c385f44a6765fp4f, -0x2.955c385f44a6765fp4f, -0x2.955c385f44a6765fp4f, -0x2.955c385f44a6765fp4f};
158 static const __m128 sinC2 = {0x5.145d3f35fa67830ep4f, 0x5.145d3f35fa67830ep4f, 0x5.145d3f35fa67830ep4f, 0x5.145d3f35fa67830ep4f};
159 static const __m128 sinC3 = {-0x4.65f4793b5cd9628fp4f, -0x4.65f4793b5cd9628fp4f, -0x4.65f4793b5cd9628fp4f, -0x4.65f4793b5cd9628fp4f};
160 
161 
162 inline void __attribute__((always_inline)) _vec_sin_cos_ps(__m128 x, __m128 &s, __m128 &c)
163 {
164  //we will compute via a polynomial for sin(2*pi*x)
165  //sin(x) = sin(2*pi*(x/2*pi)), so first we divide x by 2*pi
166  x = _mm_mul_ps(x, one_over_twopi);
167  //set x to the fractional part of x
168  x = _mm_sub_ps(x, _vec_floor_ps(x));
169  //subtract 1/2 from x if x>1/2
170  __m128 mod_half = _mm_cmpgt_ps(x, half);
171  __m128 t1 = _mm_andnot_ps(mod_half, zero);
172  __m128 t2 = _mm_and_ps(mod_half, half);
173  t1 = _mm_xor_ps(t1, t2);
174  x = _mm_sub_ps(x, t1);
175  //subtract 1/4 from x if x>1/4
176  __m128 mod_fourth = _mm_cmpgt_ps(x, fourth);
177  t1 = _mm_andnot_ps(mod_fourth, zero);
178  t2 = _mm_and_ps(mod_fourth, fourth);
179  t1 = _mm_xor_ps(t1, t2);
180  x = _mm_sub_ps(x, t1);
181  //z = 1/4 - x
182  __m128 z = _mm_sub_ps(fourth, x);
183  //compute t1 = sin(x) from a Chebyshev polynomial (in monomial form) using Horner's scheme
184  //if we were using higher precision we would want to calculate sqrt(1 - [sin(x)]^2), but for this low precision we will just use the polynomial again
185  //compute k1 = sin(z) from a Chebyshev polynomial (in monomial form) using Horner's scheme
186  __m128 k1, k2, x2, z2;
187  x2 = _mm_mul_ps(x, x);
188  z2 = _mm_mul_ps(z, z);
189  t1 = _mm_mul_ps(x2, sinC3);
190  k1 = _mm_mul_ps(z2, sinC3);
191  t1 = _mm_add_ps(t1, sinC2);
192  k1 = _mm_add_ps(k1, sinC2);
193  t2 = _mm_mul_ps(x2, t1);
194  k2 = _mm_mul_ps(z2, k1);
195  t2 = _mm_add_ps(t2, sinC1);
196  k2 = _mm_add_ps(k2, sinC1);
197  t1 = _mm_mul_ps(x2, t2);
198  k1 = _mm_mul_ps(z2, k2);
199  t1 = _mm_add_ps(t1, sinC0);
200  k1 = _mm_add_ps(k1, sinC0);
201  t1 = _mm_mul_ps(t1, x);
202  k1 = _mm_mul_ps(k1, z);
203 
204  //set s and c to the appropriate magnitudes of sin and cosine
205  s = _mm_andnot_ps(mod_fourth, t1);
206  t2 = _mm_and_ps(mod_fourth, k1);
207  s = _mm_xor_ps(s, t2);
208 
209  c = _mm_andnot_ps(mod_fourth, k1);
210  k2 = _mm_and_ps(mod_fourth, t1);
211  c = _mm_xor_ps(c, k2);
212 
213  //set the appropriate signs of s and c
214  __m128i vec_sgn = _mm_load_si128((__m128i*)sign_int);
215  __m128 x_sign = _mm_and_ps(mod_half, (__m128)vec_sgn);
216  s = _mm_xor_ps(s, x_sign);
217 
218  __m128 mod_cos = _mm_xor_ps(mod_fourth, mod_half);
219  x_sign = _mm_and_ps(mod_cos, (__m128)vec_sgn);
220  c = _mm_xor_ps(c, x_sign);
221 }
222 
223 
224 static const __m128d pid = {0x3.243f6a8885a308d4p0, 0x3.243f6a8885a308d4p0};
225 static const __m128d twopid = {0x6.487ed5110b4611a8p0, 0x6.487ed5110b4611a8p0};
226 static const __m128d pi_over_twod = {0x1.921fb54442d1846ap0, 0x1.921fb54442d1846ap0};
227 static const __m128d pi_over_fourd = {0xc.90fdaa22168c2350p-4, 0xc.90fdaa22168c2350p-4};
228 
229 static const unsigned int sign_int_d[4] __attribute__((aligned(16))) = {0x80000000,0x00000000,0x80000000,0x00000000};
230 
231 static const __m128d atanD0 = {0xf.fffffffffffffffffab9f803d2af0fb0p-4, 0xf.fffffffffffffffffab9f803d2af0fb0p-4};
232 static const __m128d atanD1 = {-0x5.5555555555555540c86e6b5fd8e468b0p-4, -0x5.5555555555555540c86e6b5fd8e468b0p-4};
233 static const __m128d atanD2 = {0x3.3333333333331b3d286002f2c41b81e0p-4, 0x3.3333333333331b3d286002f2c41b81e0p-4};
234 static const __m128d atanD3 = {-0x2.4924924924851ef4ced41f651e628f40p-4, -0x2.4924924924851ef4ced41f651e628f40p-4};
235 static const __m128d atanD4 = {0x1.c71c71c7184d82416ebb498f58e3a070p-4, 0x1.c71c71c7184d82416ebb498f58e3a070p-4};
236 static const __m128d atanD5 = {-0x1.745d1744fcf2617d04fabeb866a4f6dep-4, -0x1.745d1744fcf2617d04fabeb866a4f6dep-4};
237 static const __m128d atanD6 = {0x1.3b13b11e1c3da8ad9c8d73ccfc6c3722p-4, 0x1.3b13b11e1c3da8ad9c8d73ccfc6c3722p-4};
238 static const __m128d atanD7 = {-0x1.11110e44194b2e57596c2fdb8bfd10a0p-4, -0x1.11110e44194b2e57596c2fdb8bfd10a0p-4};
239 static const __m128d atanD8 = {0xf.0f0be69f64251bcf8af8470c615a7c40p-8, 0xf.0f0be69f64251bcf8af8470c615a7c40p-8};
240 static const __m128d atanD9 = {-0xd.79192575eea6d23daa4764613eb39850p-8, -0xd.79192575eea6d23daa4764613eb39850p-8};
241 static const __m128d atanD10 = {0xc.2f1d52fbd8638e0fd3d27cdf0e6e13c0p-8, 0xc.2f1d52fbd8638e0fd3d27cdf0e6e13c0p-8};
242 static const __m128d atanD11 = {-0xb.1518a42d3671c2ee1bf46f36650357a0p-8, -0xb.1518a42d3671c2ee1bf46f36650357a0p-8};
243 static const __m128d atanD12 = {0x9.f9678bbe523adaab81aff27ebf0ec070p-8, 0x9.f9678bbe523adaab81aff27ebf0ec070p-8};
244 static const __m128d atanD13 = {-0x8.68d5692c1b536ea0b6afe3a59bdae0b0p-8, -0x8.68d5692c1b536ea0b6afe3a59bdae0b0p-8};
245 static const __m128d atanD14 = {0x5.c3234a24f6727d6cacaf7b5c9647c2b8p-8, 0x5.c3234a24f6727d6cacaf7b5c9647c2b8p-8};
246 static const __m128d atanD15 = {-0x2.46516323aab74d114a581091dd99ed84p-8, -0x2.46516323aab74d114a581091dd99ed84p-8};
247 
248 static const __m128d sqrt2_minus_1d = {0x6.a09e667f3bcc9080p-4, 0x6.a09e667f3bcc9080p-4};
249 static const __m128d sqrt2_plus_1d = {0x2.6a09e667f3bcc9080p0, 0x2.6a09e667f3bcc9080p0};
250 
251 static const __m128d zerod = {0., 0.};
252 static const __m128d oned = {1., 1.};
253 static const __m128d negoned = {-1., -1.};
254 
255 
256 
257 inline __m128d __attribute__((always_inline)) _vec_atan_dfull_ps(__m128d x)
258 {
259  __m128i vec_sgn;
260  __m128d x_sign;
261  __m128d gr1;
262  __m128d gr2;
263  __m128d z1;
264  __m128d z2;
265  __m128d x2;
266 
267  //extract the sign of x and make x positive
268  vec_sgn = _mm_load_si128((__m128i*)sign_int_d);
269  x_sign = _mm_and_pd((__m128d)vec_sgn, x);
270  x = _mm_xor_pd(x_sign, x);
271 
272  //if x > sqrt(2)+1, then set x = -1/x
273  //else if x > sqrt(2)-1, then set x = (x-1)/(x+1)
274  gr1 = _mm_cmpgt_pd(x, sqrt2_plus_1d);
275  gr2 = _mm_cmpgt_pd(x, sqrt2_minus_1d);
276  z1 = _mm_div_pd(negoned, x);
277  z1 = _mm_and_pd(z1, gr1);
278  z2 = _mm_add_pd(x, negoned);
279  x2 = _mm_sub_pd(x, negoned);
280  z2 = _mm_div_pd(z2, x2);
281  x = _mm_andnot_pd(gr2, x);
282  gr2 = _mm_xor_pd(gr2, gr1);
283  z2 = _mm_and_pd(z2, gr2);
284  x = _mm_or_pd(x, z1);
285  x = _mm_or_pd(x, z2);
286 
287  //compute z1 = atan(x) from a Chebyshev polynomial (in monomial form) using Horner's scheme
288  x2 = _mm_mul_pd(x, x);
289  z1 = _mm_mul_pd(x2, atanD15);
290  z1 = _mm_add_pd(z1, atanD14);
291  z2 = _mm_mul_pd(x2, z1);
292  z2 = _mm_add_pd(z2, atanD13);
293  z1 = _mm_mul_pd(x2, z2);
294  z1 = _mm_add_pd(z1, atanD12);
295  z2 = _mm_mul_pd(x2, z1);
296  z2 = _mm_add_pd(z2, atanD11);
297  z1 = _mm_mul_pd(x2, z2);
298  z1 = _mm_add_pd(z1, atanD10);
299  z2 = _mm_mul_pd(x2, z1);
300  z2 = _mm_add_pd(z2, atanD9);
301  z1 = _mm_mul_pd(x2, z2);
302  z1 = _mm_add_pd(z1, atanD8);
303  z2 = _mm_mul_pd(x2, z1);
304  z2 = _mm_add_pd(z2, atanD7);
305  z1 = _mm_mul_pd(x2, z2);
306  z1 = _mm_add_pd(z1, atanD6);
307  z2 = _mm_mul_pd(x2, z1);
308  z2 = _mm_add_pd(z2, atanD5);
309  z1 = _mm_mul_pd(x2, z2);
310  z1 = _mm_add_pd(z1, atanD4);
311  z2 = _mm_mul_pd(x2, z1);
312  z2 = _mm_add_pd(z2, atanD3);
313  z1 = _mm_mul_pd(x2, z2);
314  z1 = _mm_add_pd(z1, atanD2);
315  z2 = _mm_mul_pd(x2, z1);
316  z2 = _mm_add_pd(z2, atanD1);
317  z1 = _mm_mul_pd(x2, z2);
318  z1 = _mm_add_pd(z1, atanD0);
319  z1 = _mm_mul_pd(z1, x);
320 
321  //add either pi/4 or pi/2 to z1 and set result to x2, depending on the initial value of x
322  x = _mm_and_pd(gr1, pi_over_twod);
323  x2 = _mm_and_pd(gr2, pi_over_fourd);
324  x2 = _mm_or_pd(x2, x);
325  x2 = _mm_add_pd(x2, z1);
326 
327  //recover the original sign of x
328  x2 = _mm_xor_pd(x2, x_sign);
329 
330  return x2;
331 }
332 
333 
334 
335 
336 
337 
338 
339 
340 
341