// ECCE @ EIC software — Doxygen documentation export of BenchmarkTools.cpp.
// This page mirrors the file's annotated source listing; the newest version
// of BenchmarkTools.cpp is maintained in the sPHENIX GitHub repository.
1 // This file is part of the Acts project.
2 //
3 // Copyright (C) 2020 CERN for the benefit of the Acts project
4 //
5 // This Source Code Form is subject to the terms of the Mozilla Public
6 // License, v. 2.0. If a copy of the MPL was not distributed with this
7 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 
#include <boost/test/data/test_case.hpp>
#include <boost/test/unit_test.hpp>

#include <chrono>
#include <cmath>
#include <complex>
#include <iostream>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>

// Restored project headers (dropped in the documentation export): they
// declare microBenchmark, assumeAccessed/assumeRead/assumeWritten and the
// CHECK_CLOSE_REL floating-point comparison macro used throughout this file.
#include "Acts/Tests/CommonHelpers/BenchmarkTools.hpp"
#include "Acts/Tests/CommonHelpers/FloatComparisons.hpp"

21 namespace Acts {
22 namespace Test {
23 
24 // Basic non-timing tests do not validate the core performance aspects of the
25 // benchmark tools, but have the advantage of being runnable on any system.
26 BOOST_AUTO_TEST_SUITE(benchmark_tools)
27 
28 BOOST_AUTO_TEST_CASE(assume_accessed) {
29  int x = 42;
30  assumeAccessed(x);
31  BOOST_CHECK_EQUAL(x, 42);
32 }
33 
34 BOOST_AUTO_TEST_CASE(assume_read) {
35  float x = 4.2f;
36  assumeRead(x);
37  BOOST_CHECK_EQUAL(x, 4.2f);
38 
39  const std::string y = "LOL";
40  assumeRead(x);
41  BOOST_CHECK_EQUAL(y, "LOL");
42 
43  assumeRead(std::make_tuple(1, false, 3.5));
44 }
45 
46 BOOST_AUTO_TEST_CASE(assume_written) {
47  std::complex c(1.2, 3.4);
48  assumeWritten(c);
49  BOOST_CHECK_EQUAL(c, std::complex(1.2, 3.4));
50 }
51 
52 BOOST_AUTO_TEST_CASE(micro_benchmark_result) {
54  res.iters_per_run = 42;
55  res.run_timings = {
56  std::chrono::microseconds(420), std::chrono::microseconds(21),
57  std::chrono::milliseconds(4), std::chrono::microseconds(84),
58  std::chrono::microseconds(294), std::chrono::microseconds(378),
59  std::chrono::microseconds(126), std::chrono::milliseconds(42)};
60 
61  CHECK_CLOSE_REL(res.totalTime().count() / 1'000'000., 47.323, 1e-6);
62 
63  auto sorted = res.sortedRunTimes();
64  BOOST_CHECK_EQUAL(sorted.size(), res.run_timings.size());
65  BOOST_CHECK_EQUAL(sorted[0].count(), 21'000.);
66  BOOST_CHECK_EQUAL(sorted[1].count(), 84'000.);
67  BOOST_CHECK_EQUAL(sorted[2].count(), 126'000.);
68  BOOST_CHECK_EQUAL(sorted[3].count(), 294'000.);
69  BOOST_CHECK_EQUAL(sorted[4].count(), 378'000.);
70  BOOST_CHECK_EQUAL(sorted[5].count(), 420'000.);
71  BOOST_CHECK_EQUAL(sorted[6].count(), 4'000'000.);
72  BOOST_CHECK_EQUAL(sorted[7].count(), 42'000'000.);
73 
74  CHECK_CLOSE_REL(res.runTimeMedian().count() / 1000., (294. + 378.) / 2.,
75  1e-6);
76 
77  auto [firstq, thirdq] = res.runTimeQuartiles();
78  CHECK_CLOSE_REL(firstq.count() / 1000., (84. + 126.) / 2., 1e-6);
79  CHECK_CLOSE_REL(thirdq.count() / 1000., (420. + 4000.) / 2., 1e-6);
80 
82  (thirdq - firstq).count() / 1.349, 1e-3);
83 
84  CHECK_CLOSE_REL(res.iterTimeAverage().count(),
85  res.runTimeMedian().count() / res.iters_per_run, 1e-6);
86 
88  res.iterTimeError().count(),
89  res.runTimeRobustStddev().count() / std::sqrt(res.iters_per_run), 1e-6);
90 
91  std::ostringstream os;
92  os << res;
93  BOOST_CHECK_EQUAL(os.str(),
94  "8 runs of 42 iteration(s), 47.3ms total, "
95  "336.0000+/-1560.4388µs per run, "
96  "8000.000+/-240780.940ns per iteration");
97 }
98 
99 BOOST_AUTO_TEST_CASE(micro_benchmark) {
100  int counter = 0;
101  microBenchmark([&] { ++counter; }, 15, 7, std::chrono::milliseconds(0));
102  BOOST_CHECK_EQUAL(counter, 15 * 7);
103 
104  counter = 0;
106  [&] {
107  ++counter;
108  return counter;
109  },
110  17, 11, std::chrono::milliseconds(0));
111  BOOST_CHECK_EQUAL(counter, 17 * 11);
112 
113  counter = 0;
114  int previous = 64;
115  std::vector<int> ints{1, 2, 4, 8, 16, 32, 64};
117  [&](int input) {
118  if (input == 1) {
119  BOOST_CHECK_EQUAL(previous, 64);
120  counter = 1;
121  } else {
122  BOOST_CHECK_EQUAL(input, previous * 2);
123  counter += input;
124  }
125  previous = input;
126  },
127  ints, 123, std::chrono::milliseconds(3));
128  BOOST_CHECK_EQUAL(counter, 127);
129 
130  counter = 0;
131  previous = -81;
132  std::vector<char> chars{-1, 3, -9, 27, -81};
134  [&](int input) {
135  if (input == -1) {
136  BOOST_CHECK_EQUAL(previous, -81);
137  counter = -1;
138  } else {
139  BOOST_CHECK_EQUAL(input, -previous * 3);
140  counter += input;
141  }
142  previous = input;
143  return &previous;
144  },
145  chars, 456, std::chrono::milliseconds(8));
146  BOOST_CHECK_EQUAL(counter, -61);
147 }
148 
149 BOOST_AUTO_TEST_SUITE_END()
150 
151 // Timing tests are perhaps the most important ones for validation of
152 // benchmarking tools, but they cannot be run by default for two reasons:
153 // - They take a while to run, and therefore slow down the testing cycle
154 // - They require a quiet system to succeed, and will likely fail when invoked
155 // by a parallel run of CTest or when run on a continuous integration VM.
156 //
157 // If you can ensure both of these preconditions, you can run the test with
158 // ./BenchmarkTools --run_test=benchmark_timings
159 BOOST_AUTO_TEST_SUITE(benchmark_timings, *boost::unit_test::disabled())
160 
161 constexpr size_t bench_iters = 1'000;
162 
163 BOOST_AUTO_TEST_CASE(micro_benchmark) {
164  using namespace std::literals::chrono_literals;
165 
166  // For simple microbenchmarking needs, plain use of microBenchmark is enough.
167  //
168  // For example, here, the microbenchmark loop isn't optimized out even though
169  // each iteration does literally nothing. If it were optimized out, the time
170  // per iteration would change, since we wouldn't get linear scaling anymore.
171  const auto nop_x10 = microBenchmark([] {}, 10 * bench_iters);
172  std::cout << "nop (10x iters): " << nop_x10 << std::endl;
173  const auto nop_x100 = microBenchmark([] {}, 100 * bench_iters);
174  std::cout << "nop (100x iters): " << nop_x100 << std::endl;
175  const double nop_x10_iter_ns = nop_x10.iterTimeAverage().count();
176  const double nop_x100_iter_ns = nop_x100.iterTimeAverage().count();
177  CHECK_CLOSE_REL(nop_x10_iter_ns, nop_x100_iter_ns, 0.1);
178 
179 // These tests reason about the performance characteristics of _optimized_ code,
180 // and should therefore be compiled out of debug/coverage builds.
181 #ifdef __OPTIMIZE__
182  // The microbenchmarking harness is super low overhead, less than 1
183  // nanosecond per iteration on a modern CPU.
184  BOOST_CHECK_LT(nop_x100_iter_ns, 1.0);
185 
186  // With a well-chosen iteration count that keeps per-run times under the OS
187  // scheduling quantum (typically 1ms), the noise is also super low.
188  BOOST_CHECK_LT(nop_x100.iterTimeError().count(), 0.1);
189 
190  // You can measure the overhead of any operation as long as it's not
191  // _obnoxiously_ amenable to compiler const-propagation or dead code
192  // elimination. For example, this sqrt throughput microbenchmark works,
193  // because microBenchmark forces the compiler to assume that "x", "y" and "z"
194  // are modified on every benchmark iteration...
195  const double x = 1.2, y = 3.4, z = 5.6;
196  auto sqrt = microBenchmark(
197  [&] { return std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x); },
198  bench_iters);
199  std::cout << "sqrt (correct): " << sqrt << std::endl;
200  BOOST_CHECK_GT(sqrt.iterTimeAverage().count(), 10. * nop_x100_iter_ns);
201 
202  // ...but this variant doesn't work, because the compiler can trivially
203  // precompute the square root when optimizing the inner lambda...
204  const auto sqrt_constprop = microBenchmark(
205  [] {
206  return std::sqrt(1.2 * 3.4) + std::sqrt(3.4 * 5.6) +
207  std::sqrt(5.6 * 1.2);
208  },
209  bench_iters * 20);
210  std::cout << "sqrt (constprop'd): " << sqrt_constprop << std::endl;
211  BOOST_CHECK_LT(sqrt_constprop.iterTimeAverage().count(),
212  sqrt.iterTimeAverage().count() / 5.);
213 
214  // ...and this one doesn't work either, because the compiler can trivially
215  // infer that the result of the computation is unused and stop computing it.
216  //
217  // The lower tolerance of this test is needed because current GCC doesn't
218  // optimize _everything_ out in its default configuration, as sqrt could still
219  // have side-effects like setting the errno thread-local variable...
220  const auto sqrt_deadcode = microBenchmark(
221  [&] { (void)(std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x)); },
222  bench_iters * 10);
223  std::cout << "sqrt (deadcode'd): " << sqrt_deadcode << std::endl;
224  BOOST_CHECK_LT(sqrt_deadcode.iterTimeAverage().count(),
225  sqrt.iterTimeAverage().count() / 3.);
226 #endif
227 }
228 
229 // These tests reason about the performance characteristics of _optimized_ code,
230 // and should therefore be compiled out of debug/coverage builds.
231 #ifdef __OPTIMIZE__
232 BOOST_AUTO_TEST_CASE(assume_read) {
233  // You can use assumeRead when you want the compiler to assume that the result
234  // of some computation has been read and therefore the computation shouldn't
235  // be optimized out. This is what microBenchmark implicitly does to the value
236  // returned by the benchmark iteration function, if any.
237  //
238  // For example, these two computations are almost equivalent. Notice that
239  // assumeRead can be used on temporaries.
240  const double x = 1.2, y = 3.4, z = 5.6;
241  const auto tuple_return = microBenchmark(
242  [&] {
243  return std::make_tuple(
244  std::sqrt(x * y), std::complex(std::sqrt(y * z), std::sqrt(z * x)));
245  },
246  bench_iters);
247  std::cout << "tuple return: " << tuple_return << std::endl;
248  const auto assumeread = microBenchmark(
249  [&] {
250  assumeRead(std::sqrt(x * y));
251  assumeRead(std::complex(std::sqrt(y * z), std::sqrt(z * x)));
252  },
253  bench_iters);
254  std::cout << "assumeRead: " << assumeread << std::endl;
255  const double tuple_return_iter_ns = tuple_return.iterTimeAverage().count();
256  const double assumeRead_iter_ns = assumeread.iterTimeAverage().count();
257  BOOST_CHECK_LT(std::abs(tuple_return_iter_ns - assumeRead_iter_ns),
258  5. * tuple_return.iterTimeError().count());
259 }
260 #endif
261 
262 BOOST_AUTO_TEST_CASE(assume_written) {
263  // You can use assumeWritten when you want the compiler to assume that some
264  // variables have been written to, and every dependent computation must
265  // therefore be recomputed. This is what microBenchmark implicitly does to
266  // every variable captured by the benchmark iteration lambda.
267  //
268  // Since assumeWritten operates on variables in memory, it cannot be used on
269  // temporaries, but only on mutable variables.
270  double x = 1.2, y = 3.4, z = 5.6;
271  auto sqrt_sum = microBenchmark(
272  [&] { return std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x); },
273  bench_iters);
274  std::cout << "sqrt sum: " << sqrt_sum << std::endl;
275  auto sqrt_2sums = microBenchmark(
276  [&] {
277  double tmp = std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x);
278  assumeWritten(x);
279  assumeWritten(y);
280  assumeWritten(z);
281  return tmp + std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x);
282  },
283  bench_iters);
284  std::cout << "2x(sqrt sum): " << sqrt_2sums << std::endl;
285  const double sqrt_sum_iter_ns = sqrt_sum.iterTimeAverage().count();
286  const double sqrt_2sums_iter_ns = sqrt_2sums.iterTimeAverage().count();
287  BOOST_CHECK_LT(std::abs(2. * sqrt_sum_iter_ns - sqrt_2sums_iter_ns),
288  5. * sqrt_sum.iterTimeError().count());
289 }
290 
291 BOOST_AUTO_TEST_SUITE_END()
292 
293 } // namespace Test
294 } // namespace Acts