ECCE @ EIC Software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HelixHough_allButKappaRange_sse.cpp
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file HelixHough_allButKappaRange_sse.cpp
1 #include "HelixHough.h"
2 
3 #include "vector_math_inline.h"
4 
5 #include <emmintrin.h>
6 #include <xmmintrin.h>
7 
8 using namespace std;
9 
10 
11 static const __m128 one_o_4 = {0.25,0.25,0.25,0.25};
12 static const __m128 two = {2., 2., 2., 2.};
13 static const __m128 one_o_100 = {0.01,0.01,0.01,0.01};
14 static const __m128 close_one = {0.999,0.999,0.999,0.999};
15 static const __m128 four = {4., 4., 4., 4.};
16 static const __m128 one_o_3 = {0.3333333333333333333, 0.3333333333333333333, 0.3333333333333333333, 0.3333333333333333333};
17 static const __m128 _3_o_20 = {0.15, 0.15, 0.15, 0.15};
18 static const __m128 _5_o_56 = {8.92857142857142877e-02, 8.92857142857142877e-02, 8.92857142857142877e-02, 8.92857142857142877e-02};
19 static const __m128 SIGNMASK = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
20 static const __m128 three_pi_over_two = {3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f};
21 
22 
23 static inline void __attribute__((always_inline)) calculate_phi_d(__m128& k, __m128& /*Delta*/, __m128& Delta2, __m128& /*Delta_inv*/, __m128& ux, __m128& uy, __m128& x3, __m128& y3, __m128& phi_1, __m128& phi_2, __m128& d_1, __m128& d_2)
24 {
25  __m128 k_inv = _vec_rec_ps(k);
26  __m128 k2 = _mm_mul_ps(k, k);
27 
28  __m128 uscale2 = _mm_mul_ps(Delta2, k2);
29  uscale2 = _mm_mul_ps(one_o_4, uscale2);
30  uscale2 = _mm_sub_ps(one, uscale2);
31  __m128 uscale = _vec_sqrt_ps(uscale2);
32 
33  __m128 tmp1 = _mm_mul_ps(k, y3);
34  __m128 tmp2 = _mm_mul_ps(uscale, uy);
35  __m128 tmp3 = _mm_mul_ps(k, x3);
36  __m128 tmp4 = _mm_mul_ps(uscale, ux);
37 
38  __m128 tmp5 = _mm_add_ps(tmp1, tmp2);
39  __m128 tmp6 = _mm_add_ps(tmp3, tmp4);
40  phi_1 = _vec_atan2_ps(tmp5, tmp6);
41  tmp5 = _mm_sub_ps(tmp1, tmp2);
42  tmp6 = _mm_sub_ps(tmp3, tmp4);
43  phi_2 = _vec_atan2_ps(tmp5, tmp6);
44 
45  tmp1 = _mm_cmplt_ps(phi_1, zero);
46  tmp2 = _mm_and_ps(tmp1, twopi);
47  tmp1 = _mm_andnot_ps(tmp1, zero);
48  tmp1 = _mm_xor_ps(tmp1, tmp2);
49  phi_1 = _mm_add_ps(phi_1, tmp1);
50 
51  tmp1 = _mm_cmplt_ps(phi_2, zero);
52  tmp2 = _mm_and_ps(tmp1, twopi);
53  tmp1 = _mm_andnot_ps(tmp1, zero);
54  tmp1 = _mm_xor_ps(tmp1, tmp2);
55  phi_2 = _mm_add_ps(phi_2, tmp1);
56 
57  tmp1 = _mm_mul_ps(x3, x3);
58  tmp2 = _mm_mul_ps(y3, y3);
59  tmp1 = _mm_add_ps(tmp1, tmp2);
60  __m128 kd1 = _mm_mul_ps(k2, tmp1);
61  kd1 = _mm_add_ps(kd1, uscale2);
62  tmp1 = _mm_mul_ps(two, k);
63  tmp1 = _mm_mul_ps(tmp1, uscale);
64  __m128 k0val = _mm_mul_ps(x3, ux); // value of d when k = 0
65  tmp3 = _mm_mul_ps(y3, uy);
66  k0val = _mm_add_ps(k0val, tmp3);
67  tmp1 = _mm_mul_ps(tmp1, k0val);
68  __m128 kd2 = _mm_sub_ps(kd1, tmp1);
69  kd1 = _mm_add_ps(kd1, tmp1);
70  kd1 = _vec_sqrt_ps(kd1);
71  kd2 = _vec_sqrt_ps(kd2);
72 
73  //helicity 1, k 1
74  d_1 = kd1;
75  d_1 = _mm_sub_ps(d_1, one);
76  d_1 = _mm_mul_ps(d_1, k_inv);
77  //helicity 2, k 1
78  d_2 = kd2;
79  d_2 = _mm_sub_ps(d_2, one);
80  d_2 = _mm_mul_ps(d_2, k_inv);
81  // if k=0 , set d to k0val
82  tmp1 = _mm_cmpeq_ps(k, zero);
83  tmp2 = _mm_and_ps(tmp1, k0val);
84  tmp3 = _mm_andnot_ps(tmp1, d_1);
85  d_1 = _mm_xor_ps(tmp2, tmp3);
86  tmp3 = _mm_andnot_ps(tmp1, d_2);
87  d_2 = _mm_xor_ps(tmp2, tmp3);
88 }
89 
90 
91 static inline __m128 __attribute__((always_inline)) calculate_dzdl(__m128& Delta, __m128& z1, __m128& z2, __m128& k)
92 {
93  __m128 v = _mm_mul_ps(one_o_2, k);
94  v = _mm_mul_ps(v, Delta);
95  //if(v > 0.999){v = 0.999;}
96  __m128 tmp1 = _mm_cmpgt_ps(v, close_one);
97  __m128 tmp2 = _mm_and_ps(tmp1, close_one);
98  __m128 tmp3 = _mm_andnot_ps(tmp1, v);
99  v = _mm_xor_ps(tmp2, tmp3);
100  __m128 one_o_v = _vec_rec_ps(v);
101  //power series assuming v_v is small
102  __m128 s = zero;
103  __m128 temp1 = _mm_mul_ps(v, v);
104  __m128 temp2 = _mm_mul_ps(one_o_2, Delta);
105  tmp1 = _mm_mul_ps(two, temp2);
106  s = _mm_add_ps(s, tmp1);
107  temp2 = _mm_mul_ps(temp2, temp1);
108  tmp1 = _mm_mul_ps(temp2, one_o_3);
109  s = _mm_add_ps(s, tmp1);
110  temp2 = _mm_mul_ps(temp2, temp1);
111  tmp1 = _mm_mul_ps(temp2, _3_o_20);
112  s = _mm_add_ps(s, tmp1);
113  temp2 = _mm_mul_ps(temp2, temp1);
114  tmp1 = _mm_mul_ps(temp2, _5_o_56);
115  s = _mm_add_ps(s, tmp1);
117  //otherwise we calculate an arcsin
118  //asin(x) = 2*atan( x/( 1 + sqrt( 1 - x*x ) ) )
119  //s = 2*asin(v)/k
120  tmp1 = _mm_mul_ps(v, v);
121  tmp1 = _mm_sub_ps(one, tmp1);
122  tmp1 = _vec_sqrt_ps(tmp1);
123  tmp1 = _mm_add_ps(one, tmp1);
124  tmp1 = _mm_mul_ps(tmp1, one_o_v);
125  tmp2 = _vec_atan_ps(tmp1);
126  tmp2 = _mm_sub_ps(pi_over_two, tmp2);
127  tmp2 = _mm_mul_ps(four, tmp2);
128  tmp2 = _mm_mul_ps(tmp2, one_o_v);
129  tmp2 = _mm_mul_ps(tmp2, Delta);
130  tmp2 = _mm_mul_ps(tmp2, one_o_2);
132  //choose between the two methods to calculate s
133  tmp1 = _mm_cmpgt_ps(v, one_o_100);
134  tmp3 = _mm_and_ps(tmp1, tmp2);
135  tmp2 = _mm_andnot_ps(tmp1, s);
136  __m128 s1 = _mm_xor_ps(tmp3, tmp2);
137 
138  __m128 dz2 = _mm_sub_ps(z2, z1);
139  dz2 = _mm_mul_ps(dz2, dz2);
140  tmp1 = _mm_mul_ps(s1, s1);
141  tmp1 = _mm_add_ps(tmp1, dz2);
142  __m128 dzdl = _mm_div_ps(dz2, tmp1);
143  dzdl = _vec_sqrt_ps(dzdl);
144  //if z2 < z1, dzdl = -dzdl
145  tmp1 = _mm_cmplt_ps(z2, z1);
146  tmp2 = _mm_xor_ps(dzdl, SIGNMASK);
147  tmp3 = _mm_and_ps(tmp1, tmp2);
148  dzdl = _mm_andnot_ps(tmp1, dzdl);
149  dzdl = _mm_xor_ps(dzdl, tmp3);
150 
151  return dzdl;
152 }
153 
154 
155 static inline __m128 __attribute__((always_inline)) calculate_z0(__m128& x, __m128& y, __m128& z, __m128& k, __m128& d, __m128& phi, __m128& dzdl)
156 {
157  __m128 cs, sn;
158  _vec_sin_cos_ps(phi, sn, cs);
159  __m128 dx = d*cs;
160  __m128 dy = d*sn;
161 
162  __m128 Dx = dx - x;
163  __m128 Dy = dy - y;
164  __m128 Delta = _vec_sqrt_ps(Dx*Dx + Dy*Dy);
165 
166  __m128 v = _mm_mul_ps(one_o_2, k);
167  v = _mm_mul_ps(v, Delta);
168  //if(v > 0.999){v = 0.999;}
169  __m128 tmp1 = _mm_cmpgt_ps(v, close_one);
170  __m128 tmp2 = _mm_and_ps(tmp1, close_one);
171  __m128 tmp3 = _mm_andnot_ps(tmp1, v);
172  v = _mm_xor_ps(tmp2, tmp3);
173  __m128 one_o_v = _vec_rec_ps(v);
174  //power series assuming v_v is small
175  __m128 s = zero;
176  __m128 temp1 = _mm_mul_ps(v, v);
177  __m128 temp2 = _mm_mul_ps(one_o_2, Delta);
178  tmp1 = _mm_mul_ps(two, temp2);
179  s = _mm_add_ps(s, tmp1);
180  temp2 = _mm_mul_ps(temp2, temp1);
181  tmp1 = _mm_mul_ps(temp2, one_o_3);
182  s = _mm_add_ps(s, tmp1);
183  temp2 = _mm_mul_ps(temp2, temp1);
184  tmp1 = _mm_mul_ps(temp2, _3_o_20);
185  s = _mm_add_ps(s, tmp1);
186  temp2 = _mm_mul_ps(temp2, temp1);
187  tmp1 = _mm_mul_ps(temp2, _5_o_56);
188  s = _mm_add_ps(s, tmp1);
190  //otherwise we calculate an arcsin
191  //asin(x) = 2*atan( x/( 1 + sqrt( 1 - x*x ) ) )
192  //s = 2*asin(v)/k
193  tmp1 = _mm_mul_ps(v, v);
194  tmp1 = _mm_sub_ps(one, tmp1);
195  tmp1 = _vec_sqrt_ps(tmp1);
196  tmp1 = _mm_add_ps(one, tmp1);
197  tmp1 = _mm_mul_ps(tmp1, one_o_v);
198  tmp2 = _vec_atan_ps(tmp1);
199  tmp2 = _mm_sub_ps(pi_over_two, tmp2);
200  tmp2 = _mm_mul_ps(four, tmp2);
201  tmp2 = _mm_mul_ps(tmp2, one_o_v);
202  tmp2 = _mm_mul_ps(tmp2, Delta);
203  tmp2 = _mm_mul_ps(tmp2, one_o_2);
205  //choose between the two methods to calculate s
206  tmp1 = _mm_cmpgt_ps(v, one_o_100);
207  tmp3 = _mm_and_ps(tmp1, tmp2);
208  tmp2 = _mm_andnot_ps(tmp1, s);
209  __m128 s1 = _mm_xor_ps(tmp3, tmp2);
210 
211  // dz/ds = dzdl/(1 - dzdl^2)
212  __m128 dzds = dzdl*_vec_rsqrt_ps(one - dzdl*dzdl);
213 
214  return (z - dzds*s1);
215 }
216 
217 
218 static inline void __attribute__((always_inline)) find_min_max(__m128 val1, __m128 val2, __m128& min, __m128& max)
219 {
220  __m128 tmp1 = _mm_cmplt_ps(val1, val2);
221  __m128 tmp2 = _mm_and_ps(tmp1, val1);
222  __m128 tmp3 = _mm_andnot_ps(tmp1, val2);
223  min = _mm_xor_ps(tmp2, tmp3);
224  tmp2 = _mm_and_ps(tmp1, val2);
225  tmp3 = _mm_andnot_ps(tmp1, val1);
226  max = _mm_xor_ps(tmp2, tmp3);
227 }
228 
229 
230 void HelixHough::allButKappaRange_sse(float* x1_a,float* x2_a,float* y1_a,float* y2_a,float* z1_a,float* z2_a, float* min_k_a,float* max_k_a, float* min_phi_1_a,float* max_phi_1_a,float* min_phi_2_a,float* max_phi_2_a, float* min_d_1_a,float* max_d_1_a,float* min_d_2_a,float* max_d_2_a, float* min_dzdl_a,float* max_dzdl_a, float* min_z0_1_a,float* max_z0_1_a,float* min_z0_2_a,float* max_z0_2_a)
231 {
232  __m128 x1 = _mm_load_ps(x1_a);
233  __m128 y1 = _mm_load_ps(y1_a);
234  __m128 z1 = _mm_load_ps(z1_a);
235 
236  __m128 x2 = _mm_load_ps(x2_a);
237  __m128 y2 = _mm_load_ps(y2_a);
238  __m128 z2 = _mm_load_ps(z2_a);
239 
240  __m128 x3 = _mm_add_ps(x1, x2);
241  x3 = _mm_mul_ps(one_o_2, x3);
242  __m128 y3 = _mm_add_ps(y1, y2);
243  y3 = _mm_mul_ps(one_o_2, y3);
244  __m128 Delta2 = _mm_sub_ps(x2, x1);
245  Delta2 = _mm_mul_ps(Delta2, Delta2);
246  __m128 tmp1 = _mm_sub_ps(y2, y1);
247  tmp1 = _mm_mul_ps(tmp1, tmp1);
248  Delta2 = _mm_add_ps(Delta2, tmp1);
249  __m128 Delta = _vec_sqrt_ps(Delta2);
250  __m128 Delta_inv = _vec_rec_ps(Delta);
251  __m128 ux = _mm_sub_ps(y2, y1);
252  ux = _mm_mul_ps(ux, Delta_inv);
253  __m128 uy = _mm_sub_ps(x1, x2);
254  uy = _mm_mul_ps(uy, Delta_inv);
255 
256 
257  __m128 k = _mm_load_ps(min_k_a);
258  __m128 phi_1_1, phi_2_1;
259  __m128 d_1_1, d_2_1;
260  calculate_phi_d(k, Delta, Delta2, Delta_inv, ux, uy, x3, y3, phi_1_1, phi_2_1, d_1_1, d_2_1);
261 
262  __m128 dzdl_1 = calculate_dzdl(Delta, z1, z2, k);
263  __m128 z0_1_1 = calculate_z0(x1, y1, z1, k, d_1_1, phi_1_1, dzdl_1);
264  __m128 z0_2_1 = calculate_z0(x1, y1, z1, k, d_2_1, phi_2_1, dzdl_1);
265 
266 
267  k = _mm_load_ps(max_k_a);
268  __m128 phi_1_2, phi_2_2;
269  __m128 d_1_2, d_2_2;
270  calculate_phi_d(k, Delta, Delta2, Delta_inv, ux, uy, x3, y3, phi_1_2, phi_2_2, d_1_2, d_2_2);
271 
272  __m128 dzdl_2 = calculate_dzdl(Delta, z1, z2, k);
273  __m128 z0_1_2 = calculate_z0(x1, y1, z1, k, d_1_2, phi_1_2, dzdl_2);
274  __m128 z0_2_2 = calculate_z0(x1, y1, z1, k, d_2_2, phi_2_2, dzdl_2);
275 
276  // choose the min and max for each helicity
277  // start with helicity 1 :
278  // check if phi overlaps the 0,2pi jump
279  tmp1 = _mm_cmplt_ps(phi_1_1, pi_over_two);
280  __m128 tmp2 = _mm_cmplt_ps(phi_1_2, pi_over_two);
281  tmp1 = _mm_or_ps(tmp1, tmp2);
282  tmp2 = _mm_cmpgt_ps(phi_1_1, three_pi_over_two);
283  __m128 tmp3 = _mm_cmpgt_ps(phi_1_2, three_pi_over_two);
284  tmp2 = _mm_or_ps(tmp2, tmp3);
285  tmp1 = _mm_and_ps(tmp1, tmp2);
286  // tmp1 is now all ones if phi_r overlaps the jump, all zeros otherwise
287  // if tmp1 is true, then subtract 2*pi from all of the phi values > 3*pi/2
288  tmp2 = _mm_and_ps(tmp1, twopi);
289  tmp3 = _mm_andnot_ps(tmp1, zero);
290  tmp2 = _mm_xor_ps(tmp2, tmp3);
291  __m128 tmp4 = _mm_cmpgt_ps(phi_1_1, three_pi_over_two);
292  tmp3 = _mm_and_ps(tmp4, tmp2);
293  __m128 tmp5 = _mm_andnot_ps(tmp4, zero);
294  tmp3 = _mm_xor_ps(tmp3, tmp5);
295  phi_1_1 = _mm_sub_ps(phi_1_1, tmp3);
296  tmp4 = _mm_cmpgt_ps(phi_1_2, three_pi_over_two);
297  tmp3 = _mm_and_ps(tmp4, tmp2);
298  tmp5 = _mm_andnot_ps(tmp4, zero);
299  tmp3 = _mm_xor_ps(tmp3, tmp5);
300  phi_1_2 = _mm_sub_ps(phi_1_2, tmp3);
301 
302  __m128 phi_1_min, phi_1_max;
303  find_min_max(phi_1_1, phi_1_2, phi_1_min, phi_1_max);
304 
305  __m128 d_1_min, d_1_max;
306  find_min_max(d_1_1, d_1_2, d_1_min, d_1_max);
307 
308  __m128 z0_1_min, z0_1_max;
309  find_min_max(z0_1_1, z0_1_2, z0_1_min, z0_1_max);
310 
311  _mm_store_ps(min_phi_1_a, phi_1_min);
312  _mm_store_ps(max_phi_1_a, phi_1_max);
313  _mm_store_ps(min_d_1_a, d_1_min);
314  _mm_store_ps(max_d_1_a, d_1_max);
315  _mm_store_ps(min_z0_1_a, z0_1_min);
316  _mm_store_ps(max_z0_1_a, z0_1_max);
317 
318 
319  // choose the min and max for each helicity
320  // now for helicity 2 :
321  // check if phi overlaps the 0,2pi jump
322  tmp1 = _mm_cmplt_ps(phi_2_1, pi_over_two);
323  tmp2 = _mm_cmplt_ps(phi_2_2, pi_over_two);
324  tmp1 = _mm_or_ps(tmp1, tmp2);
325  tmp2 = _mm_cmpgt_ps(phi_2_1, three_pi_over_two);
326  tmp3 = _mm_cmpgt_ps(phi_2_2, three_pi_over_two);
327  tmp2 = _mm_or_ps(tmp2, tmp3);
328  tmp1 = _mm_and_ps(tmp1, tmp2);
329  // tmp1 is now all ones if phi_r overlaps the jump, all zeros otherwise
330  // if tmp1 is true, then subtract 2*pi from all of the phi values > 3*pi/2
331  tmp2 = _mm_and_ps(tmp1, twopi);
332  tmp3 = _mm_andnot_ps(tmp1, zero);
333  tmp2 = _mm_xor_ps(tmp2, tmp3);
334  tmp4 = _mm_cmpgt_ps(phi_2_1, three_pi_over_two);
335  tmp3 = _mm_and_ps(tmp4, tmp2);
336  tmp5 = _mm_andnot_ps(tmp4, zero);
337  tmp3 = _mm_xor_ps(tmp3, tmp5);
338  phi_2_1 = _mm_sub_ps(phi_2_1, tmp3);
339  tmp4 = _mm_cmpgt_ps(phi_2_2, three_pi_over_two);
340  tmp3 = _mm_and_ps(tmp4, tmp2);
341  tmp5 = _mm_andnot_ps(tmp4, zero);
342  tmp3 = _mm_xor_ps(tmp3, tmp5);
343  phi_2_2 = _mm_sub_ps(phi_2_2, tmp3);
344 
345  __m128 phi_2_min, phi_2_max;
346  find_min_max(phi_2_1, phi_2_2, phi_2_min, phi_2_max);
347 
348  __m128 d_2_min, d_2_max;
349  find_min_max(d_2_1, d_2_2, d_2_min, d_2_max);
350 
351  __m128 z0_2_min, z0_2_max;
352  find_min_max(z0_2_1, z0_2_2, z0_2_min, z0_2_max);
353 
354  _mm_store_ps(min_phi_2_a, phi_2_min);
355  _mm_store_ps(max_phi_2_a, phi_2_max);
356  _mm_store_ps(min_d_2_a, d_2_min);
357  _mm_store_ps(max_d_2_a, d_2_max);
358  _mm_store_ps(min_z0_2_a, z0_2_min);
359  _mm_store_ps(max_z0_2_a, z0_2_max);
360 
361  __m128 dzdl_min, dzdl_max;
362  find_min_max(dzdl_1, dzdl_2, dzdl_min, dzdl_max);
363 
364  _mm_store_ps(min_dzdl_a, dzdl_min);
365  _mm_store_ps(max_dzdl_a, dzdl_max);
366 }
367 
368