static const __m128 one_o_4 = {0.25, 0.25, 0.25, 0.25};
static const __m128 two = {2., 2., 2., 2.};
static const __m128 one_o_100 = {0.01, 0.01, 0.01, 0.01};
static const __m128 close_one = {0.999, 0.999, 0.999, 0.999};
static const __m128 four = {4., 4., 4., 4.};
static const __m128 one_o_3 = {0.3333333333333333333, 0.3333333333333333333, 0.3333333333333333333, 0.3333333333333333333};
static const __m128 _3_o_20 = {0.15, 0.15, 0.15, 0.15};
static const __m128 _5_o_56 = {8.92857142857142877e-02, 8.92857142857142877e-02, 8.92857142857142877e-02, 8.92857142857142877e-02};
static const __m128 SIGNMASK = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
static const __m128 three_pi_over_two = {3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f, 3.*0x1.921fb54442d1846ap0f};
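
// Given the curvature k and the chord between two hits -- length Delta,
// midpoint (x3,y3), unit normal (ux,uy) -- compute the two candidate helix
// parameters (phi_1,d_1) and (phi_2,d_2), one for each of the two circle
// centers consistent with that curvature.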
static inline void __attribute__((always_inline)) calculate_phi_d(__m128& k, __m128& Delta, __m128& Delta2, __m128& Delta_inv, __m128& ux, __m128& uy, __m128& x3, __m128& y3, __m128& phi_1, __m128& phi_2, __m128& d_1, __m128& d_2)
{
  __m128 k_inv = _vec_rec_ps(k);
  __m128 k2 = _mm_mul_ps(k, k);

  // uscale2 = 1 - (k*Delta/2)^2; the circle centers sit at
  // (x3,y3) +/- (uscale/k)*(ux,uy)
  __m128 uscale2 = _mm_mul_ps(Delta2, k2);
  uscale2 = _mm_mul_ps(one_o_4, uscale2);
  uscale2 = _mm_sub_ps(one, uscale2);
  __m128 uscale = _vec_sqrt_ps(uscale2);
  __m128 tmp1 = _mm_mul_ps(k, y3);
  __m128 tmp2 = _mm_mul_ps(uscale, uy);
  __m128 tmp3 = _mm_mul_ps(k, x3);
  __m128 tmp4 = _mm_mul_ps(uscale, ux);

  __m128 tmp5 = _mm_add_ps(tmp1, tmp2);
  __m128 tmp6 = _mm_add_ps(tmp3, tmp4);
  phi_1 = _vec_atan2_ps(tmp5, tmp6);
  tmp5 = _mm_sub_ps(tmp1, tmp2);
  tmp6 = _mm_sub_ps(tmp3, tmp4);
  phi_2 = _vec_atan2_ps(tmp5, tmp6);
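
  // Branchless wrap into [0,2pi): the cmplt mask selects twopi where phi < 0
  // and zero elsewhere (the and/andnot/xor triple is this file's select idiom).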
  tmp1 = _mm_cmplt_ps(phi_1, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_1 = _mm_add_ps(phi_1, tmp1);

  tmp1 = _mm_cmplt_ps(phi_2, zero);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp1 = _mm_andnot_ps(tmp1, zero);
  tmp1 = _mm_xor_ps(tmp1, tmp2);
  phi_2 = _mm_add_ps(phi_2, tmp1);
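
  // kd1 and kd2 end up holding k*|c| for the two candidate centers
  // c = (x3,y3) +/- (uscale/k)*(ux,uy); before the sqrt they hold
  // k^2*|c|^2 = k^2*(x3^2 + y3^2) + uscale^2 +/- 2*k*uscale*k0val,
  // where k0val = x3*ux + y3*uy is the midpoint projected on the normal.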
  tmp1 = _mm_mul_ps(x3, x3);
  tmp2 = _mm_mul_ps(y3, y3);
  tmp1 = _mm_add_ps(tmp1, tmp2);
  __m128 kd1 = _mm_mul_ps(k2, tmp1);
  kd1 = _mm_add_ps(kd1, uscale2);
  tmp1 = _mm_mul_ps(two, k);
  tmp1 = _mm_mul_ps(tmp1, uscale);
  __m128 k0val = _mm_mul_ps(x3, ux);
  tmp3 = _mm_mul_ps(y3, uy);
  k0val = _mm_add_ps(k0val, tmp3);
  tmp1 = _mm_mul_ps(tmp1, k0val);
  __m128 kd2 = _mm_sub_ps(kd1, tmp1);
  kd1 = _mm_add_ps(kd1, tmp1);
  kd1 = _vec_sqrt_ps(kd1);
  kd2 = _vec_sqrt_ps(kd2);
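
  // d = (k*|c| - 1)/k = |c| - r, the signed distance of closest approach
  // to the origin for each candidate circle.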
  d_1 = kd1;
  d_1 = _mm_sub_ps(d_1, one);
  d_1 = _mm_mul_ps(d_1, k_inv);

  d_2 = kd2;
  d_2 = _mm_sub_ps(d_2, one);
  d_2 = _mm_mul_ps(d_2, k_inv);

  // in the straight-line limit k == 0, d reduces to k0val; select it there
  tmp1 = _mm_cmpeq_ps(k, zero);
  tmp2 = _mm_and_ps(tmp1, k0val);
  tmp3 = _mm_andnot_ps(tmp1, d_1);
  d_1 = _mm_xor_ps(tmp2, tmp3);
  tmp3 = _mm_andnot_ps(tmp1, d_2);
  d_2 = _mm_xor_ps(tmp2, tmp3);
}
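
// calculate_dzdl: the z component of the unit tangent (dz per unit 3D path
// length) for the helix segment connecting the two hits. The transverse arc
// length s is recovered from the chord length Delta and the curvature k.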
static inline __m128 __attribute__((always_inline)) calculate_dzdl(__m128& Delta, __m128& z1, __m128& z2, __m128& k)
{
  // v = k*Delta/2 = sin(half the turning angle subtended by the chord)
  __m128 v = _mm_mul_ps(one_o_2, k);
  v = _mm_mul_ps(v, Delta);
  // clamp v to just below 1 to keep sqrt(1 - v*v) and 1/v well behaved
  __m128 tmp1 = _mm_cmpgt_ps(v, one);
  __m128 tmp2 = _mm_and_ps(tmp1, close_one);
  __m128 tmp3 = _mm_andnot_ps(tmp1, v);
  v = _mm_xor_ps(tmp2, tmp3);
  __m128 one_o_v = _vec_rec_ps(v);
  // power series for the arc length, used when v is small:
  // s = (Delta/v)*asin(v) = Delta*(1 + v^2/6 + 3*v^4/40 + 5*v^6/112 + ...)
  __m128 s = zero;
  __m128 temp1 = _mm_mul_ps(v, v);
  __m128 temp2 = _mm_mul_ps(one_o_2, Delta);
  tmp1 = _mm_mul_ps(two, temp2);
  s = _mm_add_ps(s, tmp1);
  temp2 = _mm_mul_ps(temp2, temp1);
  tmp1 = _mm_mul_ps(temp2, one_o_3);
  s = _mm_add_ps(s, tmp1);
  temp2 = _mm_mul_ps(temp2, temp1);
  tmp1 = _mm_mul_ps(temp2, _3_o_20);
  s = _mm_add_ps(s, tmp1);
  temp2 = _mm_mul_ps(temp2, temp1);
  tmp1 = _mm_mul_ps(temp2, _5_o_56);
  s = _mm_add_ps(s, tmp1);
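
  // For larger v the series is replaced by a closed form built from
  // atan((1 + sqrt(1 - v*v))/v), evaluated below and selected where v > 0.01.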
  tmp1 = _mm_mul_ps(v, v);
  tmp1 = _mm_sub_ps(one, tmp1);
  tmp1 = _vec_sqrt_ps(tmp1);
  tmp1 = _mm_add_ps(one, tmp1);
  tmp1 = _mm_mul_ps(tmp1, one_o_v);
  tmp2 = _vec_atan_ps(tmp1);
  tmp2 = _mm_mul_ps(four, tmp2);
  tmp2 = _mm_mul_ps(tmp2, one_o_v);
  tmp2 = _mm_mul_ps(tmp2, Delta);
  tmp2 = _mm_mul_ps(tmp2, one_o_2);

  // select the closed form where v > 0.01, the series otherwise
  tmp1 = _mm_cmpgt_ps(v, one_o_100);
  tmp3 = _mm_and_ps(tmp1, tmp2);
  tmp2 = _mm_andnot_ps(tmp1, s);
  __m128 s1 = _mm_xor_ps(tmp3, tmp2);
  // dzdl = dz/sqrt(s^2 + dz^2); negated when z decreases from hit 1 to hit 2
  __m128 dz2 = _mm_sub_ps(z2, z1);
  dz2 = _mm_mul_ps(dz2, dz2);
  tmp1 = _mm_mul_ps(s1, s1);
  tmp1 = _mm_add_ps(tmp1, dz2);
  __m128 dzdl = _mm_div_ps(dz2, tmp1);
  dzdl = _vec_sqrt_ps(dzdl);

  tmp1 = _mm_cmplt_ps(z2, z1);
  tmp2 = _mm_xor_ps(dzdl, SIGNMASK);
  tmp3 = _mm_and_ps(tmp1, tmp2);
  dzdl = _mm_andnot_ps(tmp1, dzdl);
  dzdl = _mm_xor_ps(dzdl, tmp3);

  return dzdl;
}
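
// calculate_z0: propagate z from the hit back to the point of closest
// approach. s1 is the transverse arc length from that point to the hit,
// so z0 = z - (dz/ds)*s1.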
static inline __m128 __attribute__((always_inline)) calculate_z0(__m128& x, __m128& y, __m128& z, __m128& k, __m128& d, __m128& phi, __m128& dzdl)
{
  // (Dx, Dy): vector from the point of closest approach, (d*cos(phi), d*sin(phi)),
  // to the hit at (x, y); the vectorized sin/cos helper is assumed to follow
  // the _vec_*_ps conventions used elsewhere in this file
  __m128 cs, sn;
  _vec_sin_cos_ps(phi, sn, cs);
  __m128 Dx = x - d*cs;
  __m128 Dy = y - d*sn;
  __m128 Delta = _vec_sqrt_ps(Dx*Dx + Dy*Dy);
  __m128 v = _mm_mul_ps(one_o_2, k);
  v = _mm_mul_ps(v, Delta);
  // clamp v to just below 1, as in calculate_dzdl
  __m128 tmp1 = _mm_cmpgt_ps(v, one);
  __m128 tmp2 = _mm_and_ps(tmp1, close_one);
  __m128 tmp3 = _mm_andnot_ps(tmp1, v);
  v = _mm_xor_ps(tmp2, tmp3);
  __m128 one_o_v = _vec_rec_ps(v);
  // same small-v power series for the transverse arc length as in calculate_dzdl
  __m128 s = zero;
  __m128 temp1 = _mm_mul_ps(v, v);
  __m128 temp2 = _mm_mul_ps(one_o_2, Delta);
  tmp1 = _mm_mul_ps(two, temp2);
  s = _mm_add_ps(s, tmp1);
  temp2 = _mm_mul_ps(temp2, temp1);
  tmp1 = _mm_mul_ps(temp2, one_o_3);
  s = _mm_add_ps(s, tmp1);
  temp2 = _mm_mul_ps(temp2, temp1);
  tmp1 = _mm_mul_ps(temp2, _3_o_20);
  s = _mm_add_ps(s, tmp1);
  temp2 = _mm_mul_ps(temp2, temp1);
  tmp1 = _mm_mul_ps(temp2, _5_o_56);
  s = _mm_add_ps(s, tmp1);
  // closed form via atan, as in calculate_dzdl
  tmp1 = _mm_mul_ps(v, v);
  tmp1 = _mm_sub_ps(one, tmp1);
  tmp1 = _vec_sqrt_ps(tmp1);
  tmp1 = _mm_add_ps(one, tmp1);
  tmp1 = _mm_mul_ps(tmp1, one_o_v);
  tmp2 = _vec_atan_ps(tmp1);
  tmp2 = _mm_mul_ps(four, tmp2);
  tmp2 = _mm_mul_ps(tmp2, one_o_v);
  tmp2 = _mm_mul_ps(tmp2, Delta);
  tmp2 = _mm_mul_ps(tmp2, one_o_2);

  // select the closed form where v > 0.01, the series otherwise
  tmp1 = _mm_cmpgt_ps(v, one_o_100);
  tmp3 = _mm_and_ps(tmp1, tmp2);
  tmp2 = _mm_andnot_ps(tmp1, s);
  __m128 s1 = _mm_xor_ps(tmp3, tmp2);
  // convert dz/dl to dz/ds (z change per unit transverse arc length)
  __m128 dzds = dzdl*_vec_rsqrt_ps(one - dzdl*dzdl);
  return (z - dzds*s1);
}
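
// find_min_max: branchless elementwise min and max of two 4-wide vectors.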
static inline void __attribute__((always_inline)) find_min_max(__m128& val1, __m128& val2, __m128& min, __m128& max)
{
  __m128 tmp1 = _mm_cmplt_ps(val1, val2);
  __m128 tmp2 = _mm_and_ps(tmp1, val1);
  __m128 tmp3 = _mm_andnot_ps(tmp1, val2);
  min = _mm_xor_ps(tmp2, tmp3);
  tmp2 = _mm_and_ps(tmp1, val2);
  tmp3 = _mm_andnot_ps(tmp1, val1);
  max = _mm_xor_ps(tmp2, tmp3);
}
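
// For four hit pairs at a time (one per SSE lane): evaluate phi, d, dzdl and
// z0 (for both circle solutions) at k = min_k and k = max_k, and store the
// elementwise min/max of the two evaluations as the allowed range of each
// parameter over the curvature bin.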
void HelixHough::allButKappaRange_sse(float* x1_a, float* x2_a,
                                      float* y1_a, float* y2_a,
                                      float* z1_a, float* z2_a,
                                      float* min_k_a, float* max_k_a,
                                      float* min_phi_1_a, float* max_phi_1_a,
                                      float* min_phi_2_a, float* max_phi_2_a,
                                      float* min_d_1_a, float* max_d_1_a,
                                      float* min_d_2_a, float* max_d_2_a,
                                      float* min_dzdl_a, float* max_dzdl_a,
                                      float* min_z0_1_a, float* max_z0_1_a,
                                      float* min_z0_2_a, float* max_z0_2_a)
{
  __m128 x1 = _mm_load_ps(x1_a);
  __m128 y1 = _mm_load_ps(y1_a);
  __m128 z1 = _mm_load_ps(z1_a);

  __m128 x2 = _mm_load_ps(x2_a);
  __m128 y2 = _mm_load_ps(y2_a);
  __m128 z2 = _mm_load_ps(z2_a);
  // (x3, y3) = midpoint of the chord from hit 1 to hit 2
  __m128 x3 = _mm_add_ps(x1, x2);
  x3 = _mm_mul_ps(x3, one_o_2);
  __m128 y3 = _mm_add_ps(y1, y2);
  y3 = _mm_mul_ps(y3, one_o_2);

  // chord length Delta and unit normal (ux, uy) to the chord
  __m128 Delta2 = _mm_sub_ps(x2, x1);
  Delta2 = _mm_mul_ps(Delta2, Delta2);
  __m128 tmp1 = _mm_sub_ps(y2, y1);
  tmp1 = _mm_mul_ps(tmp1, tmp1);
  Delta2 = _mm_add_ps(Delta2, tmp1);
  __m128 Delta = _vec_sqrt_ps(Delta2);
  __m128 Delta_inv = _vec_rec_ps(Delta);
  __m128 ux = _mm_sub_ps(y2, y1);
  ux = _mm_mul_ps(ux, Delta_inv);
  __m128 uy = _mm_sub_ps(x1, x2);
  uy = _mm_mul_ps(uy, Delta_inv);
  // helix parameters at the lower curvature edge of the bin
  __m128 k = _mm_load_ps(min_k_a);
  __m128 phi_1_1, phi_2_1;
  __m128 d_1_1, d_2_1;
  calculate_phi_d(k, Delta, Delta2, Delta_inv, ux, uy, x3, y3, phi_1_1, phi_2_1, d_1_1, d_2_1);
  __m128 dzdl_1 = calculate_dzdl(Delta, z1, z2, k);
  __m128 z0_1_1 = calculate_z0(x1, y1, z1, k, d_1_1, phi_1_1, dzdl_1);
  __m128 z0_2_1 = calculate_z0(x1, y1, z1, k, d_2_1, phi_2_1, dzdl_1);

  // and at the upper curvature edge
  k = _mm_load_ps(max_k_a);
  __m128 phi_1_2, phi_2_2;
  __m128 d_1_2, d_2_2;
  calculate_phi_d(k, Delta, Delta2, Delta_inv, ux, uy, x3, y3, phi_1_2, phi_2_2, d_1_2, d_2_2);
  __m128 dzdl_2 = calculate_dzdl(Delta, z1, z2, k);
  __m128 z0_1_2 = calculate_z0(x1, y1, z1, k, d_1_2, phi_1_2, dzdl_2);
  __m128 z0_2_2 = calculate_z0(x1, y1, z1, k, d_2_2, phi_2_2, dzdl_2);
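
  // If the two phi_1 endpoints straddle the 0/2pi boundary (one above 3pi/2,
  // the other below pi/2), subtract 2pi from the one(s) above 3pi/2 so the
  // min/max below describe one contiguous angular interval.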
  tmp1 = _mm_cmpgt_ps(phi_1_1, three_pi_over_two);
  __m128 tmp2 = _mm_cmpgt_ps(phi_1_2, three_pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);
  tmp2 = _mm_cmplt_ps(phi_1_1, pi_over_two);
  __m128 tmp3 = _mm_cmplt_ps(phi_1_2, pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);
  tmp1 = _mm_and_ps(tmp1, tmp2);
  // tmp2 = 2pi where the pair straddles the boundary, 0 elsewhere
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp3 = _mm_andnot_ps(tmp1, zero);
  tmp2 = _mm_xor_ps(tmp2, tmp3);
  __m128 tmp4 = _mm_cmpgt_ps(phi_1_1, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  __m128 tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_1_1 = _mm_sub_ps(phi_1_1, tmp3);
  tmp4 = _mm_cmpgt_ps(phi_1_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_1_2 = _mm_sub_ps(phi_1_2, tmp3);
  __m128 phi_1_min, phi_1_max;
  find_min_max(phi_1_1, phi_1_2, phi_1_min, phi_1_max);

  __m128 d_1_min, d_1_max;
  find_min_max(d_1_1, d_1_2, d_1_min, d_1_max);

  __m128 z0_1_min, z0_1_max;
  find_min_max(z0_1_1, z0_1_2, z0_1_min, z0_1_max);

  _mm_store_ps(min_phi_1_a, phi_1_min);
  _mm_store_ps(max_phi_1_a, phi_1_max);
  _mm_store_ps(min_d_1_a, d_1_min);
  _mm_store_ps(max_d_1_a, d_1_max);
  _mm_store_ps(min_z0_1_a, z0_1_min);
  _mm_store_ps(max_z0_1_a, z0_1_max);
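
  // Same boundary handling and range extraction for the phi_2 solutions.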
  tmp1 = _mm_cmpgt_ps(phi_2_1, three_pi_over_two);
  tmp2 = _mm_cmpgt_ps(phi_2_2, three_pi_over_two);
  tmp1 = _mm_or_ps(tmp1, tmp2);
  tmp2 = _mm_cmplt_ps(phi_2_1, pi_over_two);
  tmp3 = _mm_cmplt_ps(phi_2_2, pi_over_two);
  tmp2 = _mm_or_ps(tmp2, tmp3);
  tmp1 = _mm_and_ps(tmp1, tmp2);
  tmp2 = _mm_and_ps(tmp1, twopi);
  tmp3 = _mm_andnot_ps(tmp1, zero);
  tmp2 = _mm_xor_ps(tmp2, tmp3);
  tmp4 = _mm_cmpgt_ps(phi_2_1, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_2_1 = _mm_sub_ps(phi_2_1, tmp3);
  tmp4 = _mm_cmpgt_ps(phi_2_2, three_pi_over_two);
  tmp3 = _mm_and_ps(tmp4, tmp2);
  tmp5 = _mm_andnot_ps(tmp4, zero);
  tmp3 = _mm_xor_ps(tmp3, tmp5);
  phi_2_2 = _mm_sub_ps(phi_2_2, tmp3);
  __m128 phi_2_min, phi_2_max;
  find_min_max(phi_2_1, phi_2_2, phi_2_min, phi_2_max);

  __m128 d_2_min, d_2_max;
  find_min_max(d_2_1, d_2_2, d_2_min, d_2_max);

  __m128 z0_2_min, z0_2_max;
  find_min_max(z0_2_1, z0_2_2, z0_2_min, z0_2_max);

  _mm_store_ps(min_phi_2_a, phi_2_min);
  _mm_store_ps(max_phi_2_a, phi_2_max);
  _mm_store_ps(min_d_2_a, d_2_min);
  _mm_store_ps(max_d_2_a, d_2_max);
  _mm_store_ps(min_z0_2_a, z0_2_min);
  _mm_store_ps(max_z0_2_a, z0_2_max);
  __m128 dzdl_min, dzdl_max;
  find_min_max(dzdl_1, dzdl_2, dzdl_min, dzdl_max);

  _mm_store_ps(min_dzdl_a, dzdl_min);
  _mm_store_ps(max_dzdl_a, dzdl_max);
}