ECCE @ EIC Software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
HelixHough_vote_sse.cpp
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file HelixHough_vote_sse.cpp
1 #include "HelixHough.h"
2 
3 #include "fastvec.h"
4 #include "SimpleHit3D.h"
5 #include "vector_math_inline.h"
6 
7 #include <cmath>
8 #include <cstddef>
9 #include <emmintrin.h>
10 #include <memory>
11 #include <sys/time.h>
12 #include <vector>
13 #include <xmmintrin.h>
14 
15 using namespace std;
16 
17 //static const unsigned int digits = 2;
18 //static const unsigned int r = 7;
19 //static const unsigned int radix = 1 << r;
20 //static const unsigned int mask = radix - 1;
21 
22 // static void radix_sort(std::vector<BinEntryPair5D>& A) {
23 // unsigned int SIZE = A.size();
24 // std::vector<BinEntryPair5D> B(SIZE);
25 // std::vector<unsigned int> cnt(radix);
26 
27 // for (unsigned int i = 0, shift = 0; i < digits; i++, shift += r) {
28 // for (unsigned int j = 0; j < radix; ++j) {
29 // cnt[j] = 0;
30 // }
31 
32 // for (unsigned int j = 0; j < SIZE; ++j) {
33 // ++cnt[(A[j].bin >> shift) & mask];
34 // }
35 
36 // for (unsigned int j = 1; j < radix; ++j) {
37 // cnt[j] += cnt[j - 1];
38 // }
39 
40 // for (long j = SIZE - 1; j >= 0; --j) {
41 // B[--cnt[(A[j].bin >> shift) & mask]] = A[j];
42 // }
43 
44 // for (unsigned int j = 0; j < SIZE; ++j) {
45 // A[j] = B[j];
46 // }
47 // }
48 // }
49 
50 // static void radix_sort(unsigned int* A, unsigned int* B, unsigned int SIZE) {
51 // unsigned int cnt[1 << 7];
52 
53 // for (unsigned int i = 0, shift = 0; i < digits; i++, shift += r) {
54 // for (unsigned int j = 0; j < radix; ++j) {
55 // cnt[j] = 0;
56 // }
57 
58 // for (unsigned int j = 0; j < SIZE; ++j) {
59 // ++cnt[((A[j] >> 20) >> shift) & mask];
60 // }
61 
62 // for (unsigned int j = 1; j < radix; ++j) {
63 // cnt[j] += cnt[j - 1];
64 // }
65 
66 // for (long j = SIZE - 1; j >= 0; --j) {
67 // B[--cnt[((A[j] >> 20) >> shift) & mask]] = A[j];
68 // }
69 
70 // for (unsigned int j = 0; j < SIZE; ++j) {
71 // A[j] = B[j];
72 // }
73 // }
74 // }
75 
76 static void counting_sort(unsigned int MAX, std::vector<BinEntryPair5D>& A,
77  unsigned int* C, unsigned int* bins_start,
78  unsigned int* bins_end) {
79  unsigned int SIZE = A.size();
80  std::vector<BinEntryPair5D> B(SIZE);
81 
82  for (unsigned int i = 0; i < SIZE; ++i) {
83  C[A[i].bin] += 1;
84  }
85 
86  bins_start[0] = 0;
87  bins_end[0] = C[0] - 1;
88  for (unsigned int i = 1; i < MAX; ++i) {
89  bins_start[i] = C[i - 1];
90  C[i] += C[i - 1];
91  bins_end[i] = C[i] - 1;
92  }
93 
94  for (int i = SIZE - 1; i >= 0; --i) {
95  B[C[A[i].bin] - 1] = A[i];
96  --C[A[i].bin];
97  }
98 
99  for (unsigned int i = 0; i < SIZE; ++i) {
100  A[i] = B[i];
101  }
102 }
103 
104 static void counting_sort(unsigned int SIZE, unsigned int MAX, unsigned int* A,
105  unsigned int* B, unsigned int* C,
106  unsigned int* bins_start, unsigned int* bins_end) {
107  for (unsigned int i = 0; i < SIZE; ++i) {
108  C[(A[i]) >> 20] += 1;
109  }
110  bins_start[0] = 0;
111  bins_end[0] = C[0] - 1;
112  for (unsigned int i = 1; i < MAX; ++i) {
113  bins_start[i] = C[i - 1];
114  C[i] += C[i - 1];
115  bins_end[i] = C[i] - 1;
116  }
117  for (int i = (SIZE - 1); i >= 0; --i) {
118  B[C[A[i] >> 20] - 1] = A[i];
119  --C[A[i] >> 20];
120  }
121  for (unsigned int i = 0; i < SIZE; ++i) {
122  A[i] = B[i];
123  }
124 }
125 
126 void fillBins4_sse(float* min_phi_a, float* max_phi_a, float n_phi_val,
127  float inv_phi_range_val, float low_phi_a, float high_phi_a,
128  unsigned int* philow1, unsigned int* philow2,
129  unsigned int* phihi1, unsigned int* phihi2) {
130  const unsigned int zero_int_a[4] __attribute__((aligned(16))) = {
131  0x00000000, 0x00000000, 0x00000000, 0x00000000};
132  __m128i zero_int = _mm_load_si128((__m128i*)zero_int_a);
133  const unsigned int neg1_int_a[4] __attribute__((aligned(16))) = {
134  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
135  __m128i neg1_int = _mm_load_si128((__m128i*)neg1_int_a);
136  const unsigned int one_int_a[4] __attribute__((aligned(16))) = {
137  0x00000001, 0x00000001, 0x00000001, 0x00000001};
138  __m128i one_int = _mm_load_si128((__m128i*)one_int_a);
139 
140  __m128 min_phi = _mm_load_ps(min_phi_a);
141  __m128 max_phi = _mm_load_ps(max_phi_a);
142  __m128 n_phi = _mm_load1_ps(&(n_phi_val));
143  __m128i n_phi_min1 = _mm_cvtps_epi32(n_phi);
144  n_phi_min1 = _mm_sub_epi32(n_phi_min1, one_int);
145  __m128 inv_phi_range = _mm_load1_ps(&(inv_phi_range_val));
146  __m128 low_phi = _mm_load1_ps(&(low_phi_a));
147  __m128 high_phi = _mm_load1_ps(&(high_phi_a));
148 
149  __m128i low_phi_bin_1 =
150  _mm_cvttps_epi32((((min_phi - low_phi) * inv_phi_range) * ((n_phi))));
151  __m128i high_phi_bin_1 =
152  _mm_cvttps_epi32((((max_phi - low_phi) * inv_phi_range) * ((n_phi))));
153  __m128i low_phi_bin_2 = _mm_cvttps_epi32(
154  (((min_phi + twopi - low_phi) * inv_phi_range) * ((n_phi))));
155 
156  __m128i cmp1 = (__m128i)(_mm_cmplt_ps(min_phi, low_phi));
157  __m128i cmp2 = (__m128i)(_mm_cmplt_ps(high_phi, max_phi));
158  __m128i cmp3 = (__m128i)(_mm_cmplt_ps(min_phi + twopi, low_phi));
159  __m128i cmp4 = (__m128i)(_mm_cmplt_ps(zero, min_phi));
160  // ( high_phi < ( min_phi_a[i] + 2.*M_PI) ) && ( low_phi > max_phi_a[i] )
161  __m128i cmp5 = (__m128i)(_mm_cmplt_ps(high_phi, min_phi + twopi));
162  __m128i cmptmp1 = (__m128i)(_mm_cmplt_ps(max_phi, low_phi));
163  cmp5 = _mm_and_si128(cmp5, cmptmp1);
164  // ( high_phi >= ( min_phi_a[i] + 2.*M_PI) ) && ( low_phi > max_phi_a[i] )
165  __m128i cmp5_1 = (__m128i)(_mm_cmplt_ps(min_phi + twopi, high_phi));
166  cmptmp1 = (__m128i)(_mm_cmplt_ps(max_phi, low_phi));
167  cmp5_1 = _mm_and_si128(cmp5_1, cmptmp1);
168  // ( high_phi < ( min_phi_a[i] + 2.*M_PI) ) && ( low_phi <= max_phi_a[i] )
169  __m128i cmp5_2 = (__m128i)(_mm_cmplt_ps(high_phi, min_phi + twopi));
170  cmptmp1 = (__m128i)(_mm_cmplt_ps(low_phi, max_phi));
171  cmp5_2 = _mm_and_si128(cmp5_2, cmptmp1);
172  // ( high_phi >= ( min_phi_a[i] + 2.*M_PI) ) && ( low_phi <= max_phi_a[i] )
173  __m128i cmp5_3 = (__m128i)(_mm_cmplt_ps(min_phi + twopi, high_phi));
174  cmptmp1 = (__m128i)(_mm_cmplt_ps(low_phi, max_phi));
175  cmp5_3 = _mm_and_si128(cmp5_3, cmptmp1);
176  cmp5_1 = _mm_or_si128(cmp5_1, cmp5_3);
177  cmp5_2 = _mm_or_si128(cmp5_2, cmp5_3);
178  // ( max_phi_a[i] < low_phi ) || ( min_phi_a[i] > high_phi )
179  __m128i cmp6 = (__m128i)(_mm_cmplt_ps(max_phi, low_phi));
180  cmptmp1 = (__m128i)(_mm_cmplt_ps(high_phi, min_phi));
181  cmp6 = _mm_or_si128(cmp6, cmptmp1);
182 
183  // split into two cases due to 0,2pi wraparound
184 
185  // low bin :
186 
187  // choose between zero and low_phi_bin_2
188  __m128i tmp1 = _mm_and_si128(cmp3, zero_int);
189  __m128i tmp2 = _mm_andnot_si128(cmp3, low_phi_bin_2);
190  __m128i lowphi_sel_1 = _mm_xor_si128(tmp1, tmp2);
191  // set result to negative based on cmp5
192  tmp1 = _mm_andnot_si128(cmp5_1, neg1_int);
193  tmp2 = _mm_and_si128(cmp5_1, lowphi_sel_1);
194  lowphi_sel_1 = _mm_xor_si128(tmp1, tmp2);
195  // choose between above result and ((low_phi_bin_1 or 0), or neg if cmp6)
196  tmp1 = _mm_and_si128(cmp1, zero_int);
197  tmp2 = _mm_andnot_si128(cmp1, low_phi_bin_1);
198  __m128i tmp4 = _mm_xor_si128(tmp1, tmp2);
199  tmp1 = _mm_and_si128(cmp6, neg1_int);
200  tmp2 = _mm_andnot_si128(cmp6, tmp4);
201  __m128i tmp3 = _mm_xor_si128(tmp1, tmp2);
202  tmp1 = _mm_and_si128(cmp4, tmp3);
203  tmp2 = _mm_andnot_si128(cmp4, lowphi_sel_1);
204  lowphi_sel_1 = _mm_xor_si128(tmp1, tmp2);
205 
206  // high bin :
207  tmp1 = _mm_and_si128(cmp2, n_phi_min1);
208  tmp2 = _mm_andnot_si128(cmp2, high_phi_bin_1);
209  tmp4 = _mm_xor_si128(tmp1, tmp2);
210  tmp1 = _mm_and_si128(cmp6, neg1_int);
211  tmp2 = _mm_andnot_si128(cmp6, tmp4);
212  tmp4 = _mm_xor_si128(tmp1, tmp2);
213  tmp1 = _mm_and_si128(cmp5, neg1_int);
214  tmp2 = _mm_andnot_si128(cmp5, n_phi_min1);
215  tmp3 = _mm_xor_si128(tmp1, tmp2);
216  tmp1 = _mm_and_si128(cmp4, tmp4);
217  tmp2 = _mm_andnot_si128(cmp4, tmp3);
218  __m128i highphi_sel_1 = _mm_xor_si128(tmp1, tmp2);
219 
220  tmp1 = _mm_and_si128(cmp5_2, zero_int);
221  tmp2 = _mm_andnot_si128(cmp5_2, neg1_int);
222  tmp4 = _mm_xor_si128(tmp1, tmp2);
223  tmp1 = _mm_and_si128(cmp4, neg1_int);
224  tmp2 = _mm_andnot_si128(cmp4, tmp4);
225  __m128i lowphi_sel_2 = _mm_xor_si128(tmp1, tmp2);
226  tmp1 = _mm_and_si128(cmp2, n_phi_min1);
227  tmp2 = _mm_andnot_si128(cmp2, high_phi_bin_1);
228  tmp4 = _mm_xor_si128(tmp1, tmp2);
229  tmp1 = _mm_and_si128(cmp4, neg1_int);
230  tmp2 = _mm_andnot_si128(cmp4, tmp4);
231  __m128i highphi_sel_2 = _mm_xor_si128(tmp1, tmp2);
232 
233  _mm_store_si128((__m128i*)philow1, lowphi_sel_1);
234  _mm_store_si128((__m128i*)philow2, lowphi_sel_2);
235  _mm_store_si128((__m128i*)phihi1, highphi_sel_1);
236  _mm_store_si128((__m128i*)phihi2, highphi_sel_2);
237 }
238 
239 void HelixHough::fillBins(unsigned int /*total_bins*/, unsigned int hit_counter,
240  float* min_phi_a, float* max_phi_a,
241  vector<SimpleHit3D>& four_hits, fastvec2d& z_bins,
242  unsigned int n_d, unsigned int n_k,
243  unsigned int n_dzdl, unsigned int n_z0,
244  unsigned int d_bin, unsigned int k_bin,
245  unsigned int n_phi, unsigned int /*zoomlevel*/,
246  float low_phi, float high_phi, float inv_phi_range,
247  fastvec& vote_array) {
248  unsigned int buffer[(1 << 10)];
249  unsigned int bufnum = 0;
250  unsigned int zbuffer[1 << 10];
251  unsigned int zbufnum[8];
252  unsigned int size2 = n_z0 * n_dzdl;
253 
254  unsigned int philow1[4] __attribute__((aligned(16))) = {
255  0x00000000, 0x00000000, 0x00000000, 0x00000000};
256  unsigned int philow2[4] __attribute__((aligned(16))) = {
257  0x00000000, 0x00000000, 0x00000000, 0x00000000};
258  unsigned int phihi1[4] __attribute__((aligned(16))) = {
259  0x00000000, 0x00000000, 0x00000000, 0x00000000};
260  unsigned int phihi2[4] __attribute__((aligned(16))) = {
261  0x00000000, 0x00000000, 0x00000000, 0x00000000};
262 
263  z_bins.fetch(four_hits[0].get_id(), four_hits[hit_counter - 1].get_id(), zbuffer,
264  zbufnum);
265 
266  unsigned int zoff = n_z0 * n_dzdl * (k_bin + n_k * d_bin);
267  unsigned int binprod = n_z0 * n_dzdl * n_k * n_d;
268 
269  unsigned int count = hit_counter;
270  unsigned int offset = 0;
271  unsigned int cur = 4;
272  if (count < 4) {
273  cur = count;
274  }
275  while (true) {
276  float minphi_a[4] __attribute__((aligned(16)));
277  float maxphi_a[4] __attribute__((aligned(16)));
278  for (unsigned int i = 0; i < cur; ++i) {
279  minphi_a[i] = min_phi_a[i + offset];
280  maxphi_a[i] = max_phi_a[i + offset];
281  }
282 
283  fillBins4_sse(minphi_a, maxphi_a, (float)n_phi, inv_phi_range, low_phi,
284  high_phi, philow1, philow2, phihi1, phihi2);
285 
286  for (unsigned int i = 0; i < cur; ++i) {
287  unsigned int index = four_hits[i + offset].get_id();
288 
289  unsigned int pos = (i + offset) * size2;
290  for (unsigned int zbin = 0; zbin < zbufnum[i + offset]; ++zbin) {
291  unsigned int zint = zbuffer[pos];
292  pos += 1;
293  unsigned int zstart = (zint & 65535) + n_z0 * (zint >> 16) + zoff;
294 
295  for (unsigned int b = philow1[i]; b <= phihi1[i]; ++b) {
296  if (philow1[i] == 4294967295) {
297  break;
298  }
299  if (b >= n_phi) {
300  break;
301  }
302  unsigned int zadd = binprod * b;
303  unsigned int bin = zstart + zadd;
304  buffer[bufnum] = ((bin << 20) ^ index);
305  bufnum += 1;
306  }
307  for (unsigned int b = philow2[i]; b <= phihi2[i]; ++b) {
308  if (philow2[i] == 4294967295) {
309  break;
310  }
311  if (b >= n_phi) {
312  break;
313  }
314  unsigned int zadd = binprod * b;
315  unsigned int bin = zstart + zadd;
316  buffer[bufnum] = ((bin << 20) ^ index);
317  bufnum += 1;
318  }
319  }
320  }
321  if (count == cur) {
322  break;
323  }
324  count -= cur;
325  offset += cur;
326  cur = 4;
327  if (count < 4) {
328  cur = count;
329  }
330  }
331  vote_array.push_back(buffer, bufnum);
332 }
333 
334 void HelixHough::vote_z(unsigned int zoomlevel, unsigned int /*n_phi*/,
335  unsigned int /*n_d*/, unsigned int /*n_k*/, unsigned int n_dzdl,
336  unsigned int n_z0, fastvec2d& z_bins) {
337  float z0_size =
338  (zoomranges[zoomlevel].max_z0 - zoomranges[zoomlevel].min_z0) /
339  ((float)n_z0);
340  float dzdl_size =
341  (zoomranges[zoomlevel].max_dzdl - zoomranges[zoomlevel].min_dzdl) /
342  ((float)n_dzdl);
343  float low_phi = zoomranges[zoomlevel].min_phi;
344  float high_phi = zoomranges[zoomlevel].max_phi;
345  float low_dzdl = zoomranges[zoomlevel].min_dzdl;
346  float high_dzdl = zoomranges[zoomlevel].max_dzdl;
347  float dzdl_bin_size_inv = 1. / dzdl_size;
348 
349  // cache cosine and sine calculations
350  float min_cos = cos(zoomranges[zoomlevel].min_phi);
351  float max_cos = cos(zoomranges[zoomlevel].max_phi);
352  float min_sin = sin(zoomranges[zoomlevel].min_phi);
353  float max_sin = sin(zoomranges[zoomlevel].max_phi);
354  if ((high_phi - low_phi) > M_PI) {
355  min_cos = 1.;
356  max_cos = -1.;
357  min_sin = 0.;
358  max_sin = 0.;
359  }
360 
361  unsigned int one_z_bin;
362  float pwr = 1.0;
363  float min_kappa = pow(zoomranges[zoomlevel].min_k, pwr);
364  float max_kappa = pow(zoomranges[zoomlevel].max_k, pwr);
365 
366  unsigned int hit_counter = 0;
367  float x_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
368  float y_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
369  float z_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
370  float min_dzdl_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
371  float max_dzdl_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
372  float min_z0_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
373  float max_z0_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
374  float dz_a[4] = {0., 0., 0., 0.};
375  vector<SimpleHit3D> four_hits;
376  SimpleHit3D temphit;
377  four_hits.assign(4, temphit);
378  unsigned int temp_zcount[4];
379  unsigned buffer[4][1 << 8];
380  for (unsigned int i = 0; i < hits_vec[zoomlevel]->size(); i++) {
381  x_a[hit_counter] = (*(hits_vec[zoomlevel]))[i].get_x();
382  y_a[hit_counter] = (*(hits_vec[zoomlevel]))[i].get_y();
383  z_a[hit_counter] = (*(hits_vec[zoomlevel]))[i].get_z();
384 
385  dz_a[hit_counter] = (2.0*sqrt((*(hits_vec[zoomlevel]))[i].get_size(2,2)));
386  four_hits[hit_counter] = (*(hits_vec[zoomlevel]))[i];
387 
388  hit_counter++;
389 
390  if (hit_counter == 4) {
391  for (unsigned int h = 0; h < hit_counter; ++h) {
392  temp_zcount[h] = 0.;
393  }
394 
395  for (unsigned int zz = 0; zz < n_z0; ++zz) {
396  float min_z0 = zoomranges[zoomlevel].min_z0 + ((float)(zz)) * z0_size;
397  float max_z0 = min_z0 + z0_size;
398 
399  float avg = 0.5 * (min_z0 + max_z0);
400  float width = 0.5 * (max_z0 - min_z0);
401  max_z0 = avg + width * z_bin_scale;
402  min_z0 = avg - width * z_bin_scale;
403 
404  for (unsigned int h = 0; h < hit_counter; ++h) {
405  float dz = dz_a[h];
406  min_z0_a[h] = min_z0 - dz;
407  max_z0_a[h] = max_z0 + dz;
408  }
409 
410  dzdlRange_sse(x_a, y_a, z_a, min_cos, min_sin, max_cos, max_sin,
411  min_kappa, max_kappa, zoomranges[zoomlevel].min_d,
412  zoomranges[zoomlevel].max_d, min_z0_a, max_z0_a,
413  min_dzdl_a, max_dzdl_a);
414 
415  unsigned int low_bin = 0;
416  unsigned int high_bin = 0;
417 
418  for (unsigned int h = 0; h < hit_counter; ++h) {
419  float d_dzdl = dzdlError(
420  four_hits[h], min_kappa, max_kappa, zoomranges[zoomlevel].min_d,
421  zoomranges[zoomlevel].max_d, min_z0, max_z0,
422  zoomranges[zoomlevel].min_dzdl, zoomranges[zoomlevel].max_dzdl);
423 
424  float min_dzdl = min_dzdl_a[h] - d_dzdl;
425  float max_dzdl = max_dzdl_a[h] + d_dzdl;
426 
427  if ((min_dzdl > high_dzdl) || (max_dzdl < low_dzdl)) {
428  low_bin = 1;
429  high_bin = 0;
430  } else {
431  if (min_dzdl > low_dzdl) {
432  low_bin =
433  (unsigned int)((min_dzdl - low_dzdl) * dzdl_bin_size_inv);
434  } else {
435  low_bin = 0;
436  }
437 
438  if (max_dzdl < high_dzdl) {
439  high_bin =
440  (unsigned int)((max_dzdl - low_dzdl) * dzdl_bin_size_inv);
441  } else {
442  high_bin = n_dzdl - 1;
443  }
444  }
445  for (unsigned int bb = low_bin; bb <= high_bin; bb++) {
446  if (bb >= n_dzdl) {
447  continue;
448  }
449  one_z_bin = (zz ^ (bb << 16));
450  buffer[h][temp_zcount[h]] = one_z_bin;
451  temp_zcount[h] += 1.;
452  }
453  }
454  }
455  for (unsigned int h = 0; h < hit_counter; ++h) {
456  z_bins.fill(buffer[h], temp_zcount[h]);
457  }
458  hit_counter = 0;
459  }
460  }
461  if (hit_counter != 0) {
462  for (unsigned int h = 0; h < hit_counter; ++h) {
463  temp_zcount[h] = 0.;
464  }
465 
466  for (unsigned int zz = 0; zz < n_z0; ++zz) {
467  float min_z0 = zoomranges[zoomlevel].min_z0 + ((float)(zz)) * z0_size;
468  float max_z0 = min_z0 + z0_size;
469 
470  float avg = 0.5 * (min_z0 + max_z0);
471  float width = 0.5 * (max_z0 - min_z0);
472  max_z0 = avg + width * z_bin_scale;
473  min_z0 = avg - width * z_bin_scale;
474 
475  for (unsigned int h = 0; h < hit_counter; ++h) {
476  float dz = dz_a[h];
477  min_z0_a[h] = min_z0 - dz;
478  max_z0_a[h] = max_z0 + dz;
479  }
480 
481  dzdlRange_sse(x_a, y_a, z_a, min_cos, min_sin, max_cos, max_sin,
482  min_kappa, max_kappa, zoomranges[zoomlevel].min_d,
483  zoomranges[zoomlevel].max_d, min_z0_a, max_z0_a, min_dzdl_a,
484  max_dzdl_a);
485 
486  unsigned int low_bin = 0;
487  unsigned int high_bin = 0;
488 
489  for (unsigned int h = 0; h < hit_counter; ++h) {
490  float d_dzdl = dzdlError(
491  four_hits[h], min_kappa, max_kappa, zoomranges[zoomlevel].min_d,
492  zoomranges[zoomlevel].max_d, min_z0, max_z0,
493  zoomranges[zoomlevel].min_dzdl, zoomranges[zoomlevel].max_dzdl);
494 
495  float min_dzdl = min_dzdl_a[h] - d_dzdl;
496  float max_dzdl = max_dzdl_a[h] + d_dzdl;
497 
498  if ((min_dzdl > high_dzdl) || (max_dzdl < low_dzdl)) {
499  low_bin = 1;
500  high_bin = 0;
501  } else {
502  if (min_dzdl > low_dzdl) {
503  low_bin = (unsigned int)((min_dzdl - low_dzdl) * dzdl_bin_size_inv);
504  } else {
505  low_bin = 0;
506  }
507 
508  if (max_dzdl < high_dzdl) {
509  high_bin =
510  (unsigned int)((max_dzdl - low_dzdl) * dzdl_bin_size_inv);
511  } else {
512  high_bin = n_dzdl - 1;
513  }
514  }
515  for (unsigned int bb = low_bin; bb <= high_bin; bb++) {
516  if (bb >= n_dzdl) {
517  continue;
518  }
519  one_z_bin = (zz ^ (bb << 16));
520  buffer[h][temp_zcount[h]] = one_z_bin;
521  temp_zcount[h] += 1.;
522  }
523  }
524  }
525  for (unsigned int h = 0; h < hit_counter; ++h) {
526  z_bins.fill(buffer[h], temp_zcount[h]);
527  }
528  hit_counter = 0;
529  }
530 }
531 
532 void HelixHough::vote(unsigned int zoomlevel) {
533  bins_vec[zoomlevel]->clear();
534  fastvec vote_array;
535 
536  unsigned int n_phi = n_phi_bins[zoomlevel];
537  unsigned int n_d = n_d_bins[zoomlevel];
538  unsigned int n_k = n_k_bins[zoomlevel];
539  unsigned int n_dzdl = n_dzdl_bins[zoomlevel];
540  unsigned int n_z0 = n_z0_bins[zoomlevel];
541 
542  fastvec2d z_bins(n_dzdl * n_z0);
543 
544  unsigned int total_bins = n_phi * n_d * n_k * n_dzdl * n_z0;
545 
546  float d_size = (zoomranges[zoomlevel].max_d - zoomranges[zoomlevel].min_d) /
547  ((float)n_d);
548  float k_size = (zoomranges[zoomlevel].max_k - zoomranges[zoomlevel].min_k) /
549  ((float)n_k);
550  float low_phi = zoomranges[zoomlevel].min_phi;
551  float high_phi = zoomranges[zoomlevel].max_phi;
552  float inv_phi_range = 1. / (high_phi - low_phi);
553 
554  float pwr = 1.0;
555  float min_kappa = pow(zoomranges[zoomlevel].min_k, pwr);
556  float max_kappa = pow(zoomranges[zoomlevel].max_k, pwr);
557 
558  timeval t1, t2;
559  double time1 = 0.;
560  double time2 = 0.;
561  if (print_timings == true) {
562  gettimeofday(&t1, NULL);
563  }
564  vote_z(zoomlevel, n_phi, n_d, n_k, n_dzdl, n_z0, z_bins);
565  if (print_timings == true) {
566  gettimeofday(&t2, NULL);
567  time1 = ((double)(t1.tv_sec) + (double)(t1.tv_usec) / 1000000.);
568  time2 = ((double)(t2.tv_sec) + (double)(t2.tv_usec) / 1000000.);
569  z_vote_time += (time2 - time1);
570  }
571 
572  // now vote in xy
573  __m128 phi_3_in;
574  __m128 phi_4_in;
575  __m128 phi_3_out;
576  __m128 phi_4_out;
577  __m128 phi_3_in_2;
578  __m128 phi_4_in_2;
579  __m128 phi_3_out_2;
580  __m128 phi_4_out_2;
581  unsigned int hit_counter = 0;
582  float x_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
583  float y_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
584  float x_a_2[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
585  float y_a_2[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
586  vector<SimpleHit3D> four_hits;
587  SimpleHit3D temphit;
588  four_hits.assign(4, temphit);
589  vector<SimpleHit3D> four_hits_2;
590  four_hits_2.assign(4, temphit);
591  vector<SimpleHit3D> eight_hits;
592  eight_hits.assign(8, temphit);
593  float min_phi_1_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
594  float max_phi_1_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
595  float min_phi_2_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
596  float max_phi_2_a[4] __attribute__((aligned(16))) = {0., 0., 0., 0.};
597  float min_phi_8[8];
598  float max_phi_8[8];
599  if (print_timings == true) {
600  gettimeofday(&t1, NULL);
601  }
602  float min_k_array[1 << 8];
603  float max_k_array[1 << 8];
604  min_k_array[0] = zoomranges[zoomlevel].min_k;
605  max_k_array[0] = zoomranges[zoomlevel].min_k + k_size;
606  for (unsigned int k_bin = 1; k_bin < n_k; ++k_bin) {
607  min_k_array[k_bin] = min_k_array[k_bin - 1] + k_size;
608  max_k_array[k_bin] = max_k_array[k_bin - 1] + k_size;
609  }
610  for (unsigned int k_bin = 0; k_bin < n_k; ++k_bin) {
611  min_k_array[k_bin] = pow(min_k_array[k_bin], pwr);
612  max_k_array[k_bin] = pow(max_k_array[k_bin], pwr);
613  }
614  float min_d_array[1 << 8];
615  float max_d_array[1 << 8];
616  min_d_array[0] = zoomranges[zoomlevel].min_d;
617  max_d_array[0] = zoomranges[zoomlevel].min_d + d_size;
618  for (unsigned int d_bin = 1; d_bin < n_d; ++d_bin) {
619  min_d_array[d_bin] = min_d_array[d_bin - 1] + d_size;
620  max_d_array[d_bin] = max_d_array[d_bin - 1] + d_size;
621  }
622 
623  for (unsigned int k_bin = 0; k_bin < n_k; ++k_bin) {
624  float avg = 0.5 * (max_k_array[k_bin] + min_k_array[k_bin]);
625  float width = 0.5 * (max_k_array[k_bin] - min_k_array[k_bin]);
626  max_k_array[k_bin] = avg + width * bin_scale;
627  min_k_array[k_bin] = avg - width * bin_scale;
628  }
629  for (unsigned int d_bin = 0; d_bin < n_d; ++d_bin) {
630  float avg = 0.5 * (max_d_array[d_bin] + min_d_array[d_bin]);
631  float width = 0.5 * (max_d_array[d_bin] - min_d_array[d_bin]);
632  max_d_array[d_bin] = avg + width * bin_scale;
633  min_d_array[d_bin] = avg - width * bin_scale;
634  }
635 
636  for (unsigned int i = 0; i < hits_vec[zoomlevel]->size(); i++) {
637  if (hit_counter < 4) {
638  four_hits[hit_counter] = ((*(hits_vec[zoomlevel]))[i]);
639  x_a[hit_counter] = four_hits[hit_counter].get_x();
640  y_a[hit_counter] = four_hits[hit_counter].get_y();
641  four_hits[hit_counter].set_id(i);
642  eight_hits[hit_counter] = four_hits[hit_counter];
643  } else {
644  four_hits_2[hit_counter - 4] = ((*(hits_vec[zoomlevel]))[i]);
645  x_a_2[hit_counter - 4] = four_hits_2[hit_counter - 4].get_x();
646  y_a_2[hit_counter - 4] = four_hits_2[hit_counter - 4].get_y();
647  four_hits_2[hit_counter - 4].set_id(i);
648  eight_hits[hit_counter] = four_hits_2[hit_counter - 4];
649  }
650  hit_counter++;
651  if ((hit_counter == 8) && (separate_by_helicity == true)) {
652  for (unsigned int d_bin = 0; d_bin < n_d; ++d_bin) {
653  float min_d_a[4] __attribute__((aligned(16))) = {
654  min_d_array[d_bin], min_d_array[d_bin], min_d_array[d_bin],
655  min_d_array[d_bin]};
656  float max_d_a[4] __attribute__((aligned(16))) = {
657  max_d_array[d_bin], max_d_array[d_bin], max_d_array[d_bin],
658  max_d_array[d_bin]};
659 
660  for (unsigned int k_bin = 0; k_bin < n_k; ++k_bin) {
661  float min_k_a[4] __attribute__((aligned(16))) = {
662  min_k_array[k_bin], min_k_array[k_bin], min_k_array[k_bin],
663  min_k_array[k_bin]};
664  float max_k_a[4] __attribute__((aligned(16))) = {
665  max_k_array[k_bin], max_k_array[k_bin], max_k_array[k_bin],
666  max_k_array[k_bin]};
667  float hel = -1.;
668  if (helicity == true) {
669  hel = 1.;
670  }
671  if (k_bin == 0) {
673  x_a, y_a, min_d_a, max_d_a, min_k_a, max_k_a, min_phi_1_a,
674  max_phi_1_a, min_phi_2_a, max_phi_2_a, hel, phi_3_out,
675  phi_4_out, x_a_2, y_a_2, phi_3_out_2, phi_4_out_2);
676 
677  phi_3_in = phi_3_out;
678  phi_4_in = phi_4_out;
679  phi_3_in_2 = phi_3_out_2;
680  phi_4_in_2 = phi_4_out_2;
681  } else {
683  x_a, y_a, min_d_a, max_d_a, min_k_a, max_k_a, min_phi_1_a,
684  max_phi_1_a, min_phi_2_a, max_phi_2_a, hel, phi_3_in, phi_4_in,
685  phi_3_out, phi_4_out, x_a_2, y_a_2, phi_3_in_2, phi_4_in_2,
686  phi_3_out_2, phi_4_out_2);
687 
688  phi_3_in = phi_3_out;
689  phi_4_in = phi_4_out;
690  phi_3_in_2 = phi_3_out_2;
691  phi_4_in_2 = phi_4_out_2;
692  }
693  for (unsigned int h = 0; h < hit_counter; ++h) {
694  if (h < 4) {
695 
696  float dphi = sqrt(((2.0*sqrt(four_hits[h].get_size(0,0))) *
697  (2.0*sqrt(four_hits[h].get_size(0,0))) +
698  (2.0*sqrt(four_hits[h].get_size(1,1))) *
699  (2.0*sqrt(four_hits[h].get_size(1,1)))) /
700  (four_hits[h].get_x() * four_hits[h].get_x() +
701  four_hits[h].get_y() * four_hits[h].get_y()));
702  dphi += phiError(
703  four_hits[h], min_kappa, max_kappa, min_d_array[d_bin],
704  max_d_array[d_bin], zoomranges[zoomlevel].min_z0,
705  zoomranges[zoomlevel].max_z0, zoomranges[zoomlevel].min_dzdl,
706  zoomranges[zoomlevel].max_dzdl);
707 
708  min_phi_1_a[h] -= dphi;
709  max_phi_1_a[h] += dphi;
710  min_phi_8[h] = min_phi_1_a[h];
711  max_phi_8[h] = max_phi_1_a[h];
712  } else {
713  float dphi =
714  sqrt(((2.0*sqrt(four_hits_2[h - 4].get_size(0,0))) *
715  (2.0*sqrt(four_hits_2[h - 4].get_size(0,0))) +
716  (2.0*sqrt(four_hits_2[h - 4].get_size(1,1))) *
717  (2.0*sqrt(four_hits_2[h - 4].get_size(1,1)))) /
718  (four_hits_2[h - 4].get_x() * four_hits_2[h - 4].get_x() +
719  four_hits_2[h - 4].get_y() * four_hits_2[h - 4].get_y()));
720  dphi += phiError(
721  four_hits[h - 4], min_kappa, max_kappa, min_d_array[d_bin],
722  max_d_array[d_bin], zoomranges[zoomlevel].min_z0,
723  zoomranges[zoomlevel].max_z0, zoomranges[zoomlevel].min_dzdl,
724  zoomranges[zoomlevel].max_dzdl);
725 
726  min_phi_2_a[h - 4] -= dphi;
727  max_phi_2_a[h - 4] += dphi;
728  min_phi_8[h] = min_phi_2_a[h - 4];
729  max_phi_8[h] = max_phi_2_a[h - 4];
730  }
731  }
732  fillBins(total_bins, 8, min_phi_8, max_phi_8, eight_hits, z_bins, n_d,
733  n_k, n_dzdl, n_z0, d_bin, k_bin, n_phi, zoomlevel, low_phi,
734  high_phi, inv_phi_range, vote_array);
735  }
736  }
737  hit_counter = 0;
738  }
739  if ((hit_counter == 4) && (((hits_vec[zoomlevel]->size() - (i + 1)) < 4) ||
740  (separate_by_helicity == false))) {
741  for (unsigned int d_bin = 0; d_bin < n_d; ++d_bin) {
742  float min_d_a[4] __attribute__((aligned(16))) = {
743  min_d_array[d_bin], min_d_array[d_bin], min_d_array[d_bin],
744  min_d_array[d_bin]};
745  float max_d_a[4] __attribute__((aligned(16))) = {
746  max_d_array[d_bin], max_d_array[d_bin], max_d_array[d_bin],
747  max_d_array[d_bin]};
748 
749  for (unsigned int k_bin = 0; k_bin < n_k; ++k_bin) {
750  float min_k_a[4] __attribute__((aligned(16))) = {
751  min_k_array[k_bin], min_k_array[k_bin], min_k_array[k_bin],
752  min_k_array[k_bin]};
753  float max_k_a[4] __attribute__((aligned(16))) = {
754  max_k_array[k_bin], max_k_array[k_bin], max_k_array[k_bin],
755  max_k_array[k_bin]};
756  if (separate_by_helicity == true) {
757  float hel = -1.;
758  if (helicity == true) {
759  hel = 1.;
760  }
761  if (k_bin == 0) {
762  HelixHough::phiRange_sse(x_a, y_a, min_d_a, max_d_a, min_k_a,
763  max_k_a, min_phi_1_a, max_phi_1_a, hel,
764  phi_3_out, phi_4_out);
765  phi_3_in = phi_3_out;
766  phi_4_in = phi_4_out;
767  } else {
768  HelixHough::phiRange_sse(x_a, y_a, min_d_a, max_d_a, max_k_a,
769  min_phi_1_a, max_phi_1_a, hel, phi_3_in,
770  phi_4_in, phi_3_out, phi_4_out);
771  phi_3_in = phi_3_out;
772  phi_4_in = phi_4_out;
773  }
774  } else {
775  HelixHough::phiRange_sse(x_a, y_a, min_d_a, max_d_a, min_k_a,
776  max_k_a, min_phi_1_a, max_phi_1_a,
777  min_phi_2_a, max_phi_2_a);
778  }
779  for (unsigned int h = 0; h < hit_counter; ++h) {
780 
781  float dphi = sqrt((2.0*sqrt(four_hits[h].get_size(0,0))) *
782  (2.0*sqrt(four_hits[h].get_size(0,0))) +
783  (2.0*sqrt(four_hits[h].get_size(1,1))) *
784  (2.0*sqrt(four_hits[h].get_size(1,1)))) /
785  (four_hits[h].get_x() * four_hits[h].get_x() +
786  four_hits[h].get_y() * four_hits[h].get_y());
787  dphi += phiError(
788  four_hits[h], min_kappa, max_kappa, min_d_array[d_bin],
789  max_d_array[d_bin], zoomranges[zoomlevel].min_z0,
790  zoomranges[zoomlevel].max_z0, zoomranges[zoomlevel].min_dzdl,
791  zoomranges[zoomlevel].max_dzdl);
792 
793  min_phi_1_a[h] -= dphi;
794  min_phi_2_a[h] -= dphi;
795  max_phi_1_a[h] += dphi;
796  max_phi_2_a[h] += dphi;
797  }
798  if (separate_by_helicity == true) {
799  fillBins(total_bins, hit_counter, min_phi_1_a, max_phi_1_a,
800  four_hits, z_bins, n_d, n_k, n_dzdl, n_z0, d_bin, k_bin,
801  n_phi, zoomlevel, low_phi, high_phi, inv_phi_range,
802  vote_array);
803  } else {
804  fillBins(total_bins, hit_counter, min_phi_1_a, max_phi_1_a,
805  four_hits, z_bins, n_d, n_k, n_dzdl, n_z0, d_bin, k_bin,
806  n_phi, zoomlevel, low_phi, high_phi, inv_phi_range,
807  vote_array);
808  fillBins(total_bins, hit_counter, min_phi_2_a, max_phi_2_a,
809  four_hits, z_bins, n_d, n_k, n_dzdl, n_z0, d_bin, k_bin,
810  n_phi, zoomlevel, low_phi, high_phi, inv_phi_range,
811  vote_array);
812  }
813  }
814  }
815  hit_counter = 0;
816  }
817  }
818  if (hit_counter != 0) {
819  for (unsigned int d_bin = 0; d_bin < n_d; ++d_bin) {
820  float min_d_a[4] __attribute__((aligned(16))) = {
821  min_d_array[d_bin], min_d_array[d_bin], min_d_array[d_bin],
822  min_d_array[d_bin]};
823  float max_d_a[4] __attribute__((aligned(16))) = {
824  max_d_array[d_bin], max_d_array[d_bin], max_d_array[d_bin],
825  max_d_array[d_bin]};
826 
827  for (unsigned int k_bin = 0; k_bin < n_k; ++k_bin) {
828  float min_k_a[4] __attribute__((aligned(16))) = {
829  min_k_array[k_bin], min_k_array[k_bin], min_k_array[k_bin],
830  min_k_array[k_bin]};
831  float max_k_a[4] __attribute__((aligned(16))) = {
832  max_k_array[k_bin], max_k_array[k_bin], max_k_array[k_bin],
833  max_k_array[k_bin]};
834  if (separate_by_helicity == true) {
835  float hel = -1.;
836  if (helicity == true) {
837  hel = 1.;
838  }
839  if (k_bin == 0) {
840  HelixHough::phiRange_sse(x_a, y_a, min_d_a, max_d_a, min_k_a,
841  max_k_a, min_phi_1_a, max_phi_1_a, hel,
842  phi_3_out, phi_4_out);
843 
844  phi_3_in = phi_3_out;
845  phi_4_in = phi_4_out;
846  } else {
847  HelixHough::phiRange_sse(x_a, y_a, min_d_a, max_d_a, max_k_a,
848  min_phi_1_a, max_phi_1_a, hel, phi_3_in,
849  phi_4_in, phi_3_out, phi_4_out);
850  phi_3_in = phi_3_out;
851  phi_4_in = phi_4_out;
852  }
853  } else {
854  HelixHough::phiRange_sse(x_a, y_a, min_d_a, max_d_a, min_k_a, max_k_a,
855  min_phi_1_a, max_phi_1_a, min_phi_2_a,
856  max_phi_2_a);
857  }
858  for (unsigned int h = 0; h < hit_counter; ++h) {
859 
860  float dphi = sqrt(((2.0*sqrt(four_hits[h].get_size(0,0))) *
861  (2.0*sqrt(four_hits[h].get_size(0,0))) +
862  (2.0*sqrt(four_hits[h].get_size(1,1))) *
863  (2.0*sqrt(four_hits[h].get_size(1,1)))) /
864  (four_hits[h].get_x() * four_hits[h].get_x() +
865  four_hits[h].get_y() * four_hits[h].get_y()));
866  dphi += phiError(
867  four_hits[h], min_kappa, max_kappa, min_d_array[d_bin],
868  max_d_array[d_bin], zoomranges[zoomlevel].min_z0,
869  zoomranges[zoomlevel].max_z0, zoomranges[zoomlevel].min_dzdl,
870  zoomranges[zoomlevel].max_dzdl);
871 
872  min_phi_1_a[h] -= dphi;
873  min_phi_2_a[h] -= dphi;
874  max_phi_1_a[h] += dphi;
875  max_phi_2_a[h] += dphi;
876  }
877  if (separate_by_helicity == true) {
878  fillBins(total_bins, hit_counter, min_phi_1_a, max_phi_1_a, four_hits,
879  z_bins, n_d, n_k, n_dzdl, n_z0, d_bin, k_bin, n_phi,
880  zoomlevel, low_phi, high_phi, inv_phi_range, vote_array);
881  } else {
882  fillBins(total_bins, hit_counter, min_phi_1_a, max_phi_1_a, four_hits,
883  z_bins, n_d, n_k, n_dzdl, n_z0, d_bin, k_bin, n_phi,
884  zoomlevel, low_phi, high_phi, inv_phi_range, vote_array);
885  fillBins(total_bins, hit_counter, min_phi_2_a, max_phi_2_a, four_hits,
886  z_bins, n_d, n_k, n_dzdl, n_z0, d_bin, k_bin, n_phi,
887  zoomlevel, low_phi, high_phi, inv_phi_range, vote_array);
888  }
889  }
890  }
891  hit_counter = 0;
892  }
893  if (print_timings == true) {
894  gettimeofday(&t2, NULL);
895  time1 = ((double)(t1.tv_sec) + (double)(t1.tv_usec) / 1000000.);
896  time2 = ((double)(t2.tv_sec) + (double)(t2.tv_usec) / 1000000.);
897  xy_vote_time += (time2 - time1);
898  }
899  if (vote_array.size == 0) {
900  return;
901  }
902  unsigned int C[(1 << 12)] = {0};
903  if (vote_array.size < (1 << 14)) {
904  unsigned int B[1 << 14];
905  counting_sort(vote_array.size, total_bins, vote_array.arr, B, C, bins_start,
906  bins_end);
907  bins_vec[zoomlevel]->resize(vote_array.size, BinEntryPair5D(0, 0));
908  for (unsigned int i = 0; i < vote_array.size; ++i) {
909  (*(bins_vec[zoomlevel]))[i].bin = (vote_array.arr[i]) >> 20;
910  (*(bins_vec[zoomlevel]))[i].entry =
911  (vote_array.arr[i]) & ((unsigned int)1048575);
912  (*(bins_vec[zoomlevel]))[i].is_seed = false;
913  }
914  } else {
915  bins_vec[zoomlevel]->resize(vote_array.size, BinEntryPair5D(0, 0));
916  for (unsigned int i = 0; i < vote_array.size; ++i) {
917  (*(bins_vec[zoomlevel]))[i].bin = (vote_array[i]) >> 20;
918  (*(bins_vec[zoomlevel]))[i].entry =
919  (vote_array[i]) & ((unsigned int)1048575);
920  (*(bins_vec[zoomlevel]))[i].is_seed = false;
921  }
922  counting_sort(total_bins, *(bins_vec[zoomlevel]), C, bins_start, bins_end);
923  // if(bins_vec[zoomlevel]->size() >
924  // total_bins){counting_sort(total_bins, *(bins_vec[zoomlevel]), C,
925  // bins_start, bins_end);}
926  // else{sort(bins_vec[zoomlevel]->begin(), bins_vec[zoomlevel]->end());}
927  }
928 }