nanobench.h
1 // __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2 // | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3 // | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4 //
5 // Microbenchmark framework for C++11/14/17/20
6 // https://github.com/martinus/nanobench
7 //
8 // Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9 // SPDX-License-Identifier: MIT
10 // Copyright (c) 2019-2020 Martin Ankerl <martin.ankerl@gmail.com>
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining a copy
13 // of this software and associated documentation files (the "Software"), to deal
14 // in the Software without restriction, including without limitation the rights
15 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 // copies of the Software, and to permit persons to whom the Software is
17 // furnished to do so, subject to the following conditions:
18 //
19 // The above copyright notice and this permission notice shall be included in all
20 // copies or substantial portions of the Software.
21 //
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 // SOFTWARE.
29 
30 #ifndef ANKERL_NANOBENCH_H_INCLUDED
31 #define ANKERL_NANOBENCH_H_INCLUDED
32 
33 // see https://semver.org/
34 #define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35 #define ANKERL_NANOBENCH_VERSION_MINOR 0 // backwards-compatible changes
36 #define ANKERL_NANOBENCH_VERSION_PATCH 0 // backwards-compatible bug fixes
37 
39 // public-facing API - as minimal as possible
41 
42 #include <chrono> // high_resolution_clock
43 #include <cstring> // memcpy
44 #include <iosfwd> // for std::ostream* custom output target in Config
45 #include <string> // all names
46 #include <vector> // holds all results
47 
48 #define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
49 
50 #define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
51 #define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
52 #define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
53 #define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
54 #define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
55 
56 #if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
57 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
58 #else
59 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
60 #endif
61 
62 #if defined(__clang__)
63 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
64  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
65 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
66 #else
67 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
68 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
69 #endif
70 
71 #if defined(__GNUC__)
72 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
73 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
74 #else
75 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
76 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
77 #endif
78 
79 #if defined(ANKERL_NANOBENCH_LOG_ENABLED)
80 # include <iostream>
81 # define ANKERL_NANOBENCH_LOG(x) std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl
82 #else
83 # define ANKERL_NANOBENCH_LOG(x)
84 #endif
85 
86 #if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
87 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
88 #else
89 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
90 #endif
91 
92 #if defined(__clang__)
93 # define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
94 #else
95 # define ANKERL_NANOBENCH_NO_SANITIZE(...)
96 #endif
97 
98 #if defined(_MSC_VER)
99 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
100 #else
101 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
102 #endif
103 
104 // workaround missing "is_trivially_copyable" in g++ < 5.0
105 // See https://stackoverflow.com/a/31798726/48181
106 #if defined(__GNUC__) && __GNUC__ < 5
107 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
108 #else
109 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
110 #endif
111 
112 // declarations ///////////////////////////////////////////////////////////////////////////////////
113 
114 namespace ankerl {
115 namespace nanobench {
116 
117 using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
118  std::chrono::steady_clock>::type;
119 class Bench;
120 struct Config;
121 class Result;
122 class Rng;
123 class BigO;
124 
271 void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
272 
281 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
282 
283 // Contains mustache-like templates
284 namespace templates {
285 
295 char const* csv() noexcept;
296 
307 char const* htmlBoxplot() noexcept;
308 
318 char const* json() noexcept;
319 
320 } // namespace templates
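
// Usage sketch (illustrative): render a finished benchmark as CSV to stdout,
// using the free render() function declared above together with one of the
// built-in templates.
//
//   ankerl::nanobench::Bench bench;
//   bench.run("my op", [] { /* code to measure */ });
//   ankerl::nanobench::render(ankerl::nanobench::templates::csv(), bench, std::cout);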
321 
322 namespace detail {
323 
324 template <typename T>
325 struct PerfCountSet;
326 
327 class IterationLogic;
328 class PerformanceCounters;
329 
330 #if ANKERL_NANOBENCH(PERF_COUNTERS)
331 class LinuxPerformanceCounters;
332 #endif
333 
334 } // namespace detail
335 } // namespace nanobench
336 } // namespace ankerl
337 
338 // definitions ////////////////////////////////////////////////////////////////////////////////////
339 
340 namespace ankerl {
341 namespace nanobench {
342 namespace detail {
343 
344 template <typename T>
345 struct PerfCountSet {
346  T pageFaults{};
347  T cpuCycles{};
348  T contextSwitches{};
349  T instructions{};
350  T branchInstructions{};
351  T branchMisses{};
352 };
353 
354 } // namespace detail
355 
356 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
357 struct Config {
358  // actual benchmark config
359  std::string mBenchmarkTitle = "benchmark";
360  std::string mBenchmarkName = "noname";
361  std::string mUnit = "op";
362  double mBatch = 1.0;
363  double mComplexityN = -1.0;
364  size_t mNumEpochs = 11;
365  size_t mClockResolutionMultiple = static_cast<size_t>(1000);
366  std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100);
367  std::chrono::nanoseconds mMinEpochTime{};
368  uint64_t mMinEpochIterations{1};
369  uint64_t mEpochIterations{0}; // If not 0, run *exactly* this number of iterations per epoch.
370  uint64_t mWarmup = 0;
371  std::ostream* mOut = nullptr;
372  bool mShowPerformanceCounters = true;
373  bool mIsRelative = false;
374 
375  Config();
376  ~Config();
377  Config& operator=(Config const&);
378  Config& operator=(Config&&);
379  Config(Config const&);
380  Config(Config&&) noexcept;
381 };
382 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
383 
384 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
385 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
386 class Result {
387 public:
388  enum class Measure : size_t {
389  elapsed,
390  iterations,
391  pagefaults,
392  cpucycles,
393  contextswitches,
394  instructions,
395  branchinstructions,
396  branchmisses,
397  _size
398  };
399 
400  explicit Result(Config const& benchmarkConfig);
401 
402  ~Result();
403  Result& operator=(Result const&);
404  Result& operator=(Result&&);
405  Result(Result const&);
406  Result(Result&&) noexcept;
407 
408  // adds new measurement results
409  // all values are scaled by iters (except iters...)
410  void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
411 
412  ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
413 
414  ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
415  ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
416  ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
417  ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
418  ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
419  ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
420  ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
421 
422  ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
423  ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
424  ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
425  ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
426 
427  // Finds the Measure matching the given string; returns _size if not found.
428  static Measure fromString(std::string const& str);
429 
430 private:
431  Config mConfig{};
432  std::vector<std::vector<double>> mNameToMeasurements{};
433 };
434 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
435 
436 
453 class Rng final {
454 public:
458  using result_type = uint64_t;
459 
460  static constexpr uint64_t(min)();
461  static constexpr uint64_t(max)();
462 
468  Rng(Rng const&) = delete;
469 
473  Rng& operator=(Rng const&) = delete;
474 
475  // moving is ok
476  Rng(Rng&&) noexcept = default;
477  Rng& operator=(Rng&&) noexcept = default;
478  ~Rng() noexcept = default;
479 
487  Rng();
488 
505  explicit Rng(uint64_t seed) noexcept;
506  Rng(uint64_t x, uint64_t y) noexcept;
507 
511  ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
512 
520  inline uint64_t operator()() noexcept;
521 
522  // This is slightly biased.
523 
538  inline uint32_t bounded(uint32_t range) noexcept;
539 
540  // random double in range [0, 1)
541  // see http://prng.di.unimi.it/
542 
549  inline double uniform01() noexcept;
550 
558  template <typename Container>
559  void shuffle(Container& container) noexcept;
560 
561 private:
562  static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
563 
564  uint64_t mX;
565  uint64_t mY;
566 };
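
// Usage sketch (illustrative): Rng is a fast, non-cryptographic random number
// generator, well suited for producing benchmark inputs.
//
//   ankerl::nanobench::Rng rng(42);     // deterministic seed
//   uint64_t r = rng();                 // full 64-bit random value
//   uint32_t die = rng.bounded(6) + 1;  // roll in [1, 6]
//   double p = rng.uniform01();         // double in [0, 1)
//   std::vector<int> v{1, 2, 3, 4};
//   rng.shuffle(v);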
567 
582 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
583 class Bench {
584 public:
588  Bench();
589 
590  Bench(Bench&& other);
591  Bench& operator=(Bench&& other);
592  Bench(Bench const& other);
593  Bench& operator=(Bench const& other);
594  ~Bench() noexcept;
595 
614  template <typename Op>
615  ANKERL_NANOBENCH(NOINLINE)
616  Bench& run(char const* benchmarkName, Op&& op);
617 
618  template <typename Op>
619  ANKERL_NANOBENCH(NOINLINE)
620  Bench& run(std::string const& benchmarkName, Op&& op);
621 
626  template <typename Op>
627  ANKERL_NANOBENCH(NOINLINE)
628  Bench& run(Op&& op);
629 
635  Bench& title(char const* benchmarkTitle);
636  Bench& title(std::string const& benchmarkTitle);
637  ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
638 
640  Bench& name(char const* benchmarkName);
641  Bench& name(std::string const& benchmarkName);
642  ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
643 
653  template <typename T>
654  Bench& batch(T b) noexcept;
655  ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
656 
665  Bench& unit(char const* unit);
666  Bench& unit(std::string const& unit);
667  ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
668 
676  Bench& output(std::ostream* outstream) noexcept;
677  ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
678 
699  Bench& clockResolutionMultiple(size_t multiple) noexcept;
700  ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
701 
717  Bench& epochs(size_t numEpochs) noexcept;
718  ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
719 
730  Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
731  ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
732 
743  Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
744  ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
745 
756  Bench& minEpochIterations(uint64_t numIters) noexcept;
757  ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
758 
765  Bench& epochIterations(uint64_t numIters) noexcept;
766  ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
767 
777  Bench& warmup(uint64_t numWarmupIters) noexcept;
778  ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
779 
797  Bench& relative(bool isRelativeEnabled) noexcept;
798  ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
799 
808  Bench& performanceCounters(bool showPerformanceCounters) noexcept;
809  ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
810 
819  ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
820 
828  template <typename Arg>
829  Bench& doNotOptimizeAway(Arg&& arg);
830 
845  template <typename T>
846  Bench& complexityN(T b) noexcept;
847  ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
848 
880  std::vector<BigO> complexityBigO() const;
881 
905  template <typename Op>
906  BigO complexityBigO(char const* name, Op op) const;
907 
908  template <typename Op>
909  BigO complexityBigO(std::string const& name, Op op) const;
910 
918  Bench& render(char const* templateContent, std::ostream& os);
919 
920  Bench& config(Config const& benchmarkConfig);
921  ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
922 
923 private:
924  Config mConfig{};
925  std::vector<Result> mResults{};
926 };
927 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
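
// Usage sketch (illustrative): the fluent Bench interface. Each setter returns
// *this, so a benchmark is usually configured and run in a single expression.
//
//   ankerl::nanobench::Bench()
//       .title("container insert")
//       .unit("insert")
//       .warmup(100)
//       .relative(true)
//       .run("std::vector push_back", [] {
//           std::vector<int> v;
//           for (int i = 0; i < 1000; ++i) {
//               v.push_back(i);
//           }
//           ankerl::nanobench::doNotOptimizeAway(v);
//       });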
928 
929 
935 template <typename Arg>
936 void doNotOptimizeAway(Arg&& arg);
937 
938 namespace detail {
939 
940 #if defined(_MSC_VER)
941 void doNotOptimizeAwaySink(void const*);
942 
943 template <typename T>
944 void doNotOptimizeAway(T const& val);
945 
946 #else
947 
948 // see folly's Benchmark.h
949 template <typename T>
950 constexpr bool doNotOptimizeNeedsIndirect() {
951  using Decayed = typename std::decay<T>::type;
952  return !ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(Decayed) || sizeof(Decayed) > sizeof(long) || std::is_pointer<Decayed>::value;
953 }
954 
955 template <typename T>
956 typename std::enable_if<!doNotOptimizeNeedsIndirect<T>()>::type doNotOptimizeAway(T const& val) {
957  // NOLINTNEXTLINE(hicpp-no-assembler)
958  asm volatile("" ::"r"(val));
959 }
960 
961 template <typename T>
962 typename std::enable_if<doNotOptimizeNeedsIndirect<T>()>::type doNotOptimizeAway(T const& val) {
963  // NOLINTNEXTLINE(hicpp-no-assembler)
964  asm volatile("" ::"m"(val) : "memory");
965 }
966 #endif
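
// The empty asm statements above act as optimization barriers: the "r" / "m"
// constraints force the compiler to materialize `val` in a register or in
// memory, so a computation whose result is only passed to doNotOptimizeAway()
// cannot be eliminated as dead code. Usage sketch (illustrative, assuming some
// numeric container `data` is in scope):
//
//   ankerl::nanobench::Bench().run("checksum", [&] {
//       uint64_t sum = 0;
//       for (auto x : data) {
//           sum += x;
//       }
//       ankerl::nanobench::doNotOptimizeAway(sum);
//   });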
967 
968 // internally used, but visible because run() is templated.
969 // Not movable/copy-able, so we simply use a raw pointer instead of unique_ptr. This saves us from
970 // having to include <memory>, and from the template instantiation overhead of unique_ptr, which is unfortunately quite significant.
971 ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
972 class IterationLogic {
973 public:
974  explicit IterationLogic(Bench const& config) noexcept;
975  ~IterationLogic();
976 
977  ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
978  void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
979  void moveResultTo(std::vector<Result>& results) noexcept;
980 
981 private:
982  struct Impl;
983  Impl* mPimpl;
984 };
985 ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
986 
987 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
988 class PerformanceCounters {
989 public:
990  PerformanceCounters(PerformanceCounters const&) = delete;
991  PerformanceCounters& operator=(PerformanceCounters const&) = delete;
992 
993  PerformanceCounters();
994  ~PerformanceCounters();
995 
996  void beginMeasure();
997  void endMeasure();
998  void updateResults(uint64_t numIters);
999 
1000  ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1001  ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1002 
1003 private:
1004 #if ANKERL_NANOBENCH(PERF_COUNTERS)
1005  LinuxPerformanceCounters* mPc = nullptr;
1006 #endif
1007  PerfCountSet<uint64_t> mVal{};
1008  PerfCountSet<bool> mHas{};
1009 };
1010 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1011 
1012 // Gets the singleton
1013 PerformanceCounters& performanceCounters();
1014 
1015 } // namespace detail
1016 
1017 class BigO {
1018 public:
1019  using RangeMeasure = std::vector<std::pair<double, double>>;
1020 
1021  template <typename Op>
1022  static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
1023  for (auto& rangeMeasure : data) {
1024  rangeMeasure.first = op(rangeMeasure.first);
1025  }
1026  return data;
1027  }
1028 
1029  static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1030 
1031  template <typename Op>
1032  BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1033  : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1034 
1035  template <typename Op>
1036  BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1037  : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1038 
1039  BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1040  BigO(std::string const& bigOName, RangeMeasure const& scaledRangeMeasure);
1041  ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1042  ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1043  ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1044  ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1045 
1046 private:
1047  std::string mName{};
1048  double mConstant{};
1049  double mNormalizedRootMeanSquare{};
1050 };
1051 std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1052 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
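
// Usage sketch (illustrative): asymptotic complexity estimation. Run the same
// benchmark for several problem sizes, tag each run with complexityN(), and let
// BigO fit the collected measurements.
//
//   ankerl::nanobench::Bench bench;
//   for (size_t n = 10; n <= 10000; n *= 10) {
//       std::set<uint64_t> s;
//       bench.complexityN(n).run("std::set insert", [&] {
//           s.insert(s.size());
//       });
//   }
//   std::cout << bench.complexityBigO() << std::endl;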
1053 
1054 } // namespace nanobench
1055 } // namespace ankerl
1056 
1057 // implementation /////////////////////////////////////////////////////////////////////////////////
1058 
1059 namespace ankerl {
1060 namespace nanobench {
1061 
1062 constexpr uint64_t(Rng::min)() {
1063  return 0;
1064 }
1065 
1066 constexpr uint64_t(Rng::max)() {
1067  return (std::numeric_limits<uint64_t>::max)();
1068 }
1069 
1071 uint64_t Rng::operator()() noexcept {
1072  auto x = mX;
1073 
1074  mX = UINT64_C(15241094284759029579) * mY;
1075  mY = rotl(mY - x, 27);
1076 
1077  return x;
1078 }
1079 
1081 uint32_t Rng::bounded(uint32_t range) noexcept {
1082  uint64_t r32 = static_cast<uint32_t>(operator()());
1083  auto multiresult = r32 * range;
1084  return static_cast<uint32_t>(multiresult >> 32U);
1085 }
1086 
1087 double Rng::uniform01() noexcept {
1088  auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1089  // can't use a union in C++ here for type punning, that's undefined behavior.
1090  // std::memcpy is well optimized anyway.
1091  double d;
1092  std::memcpy(&d, &i, sizeof(double));
1093  return d - 1.0;
1094 }
1095 
1096 template <typename Container>
1097 void Rng::shuffle(Container& container) noexcept {
1098  auto size = static_cast<uint32_t>(container.size());
1099  for (auto i = size; i > 1U; --i) {
1100  using std::swap;
1101  auto p = bounded(i); // number in [0, i)
1102  swap(container[i - 1], container[p]);
1103  }
1104 }
1105 
1106 constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1107  return (x << k) | (x >> (64U - k));
1108 }
1109 
1110 template <typename Op>
1112 Bench& Bench::run(Op&& op) {
1113  // It is important that this method is kept short so the compiler can better optimize and inline op().
1114  detail::IterationLogic iterationLogic(*this);
1115  auto& pc = detail::performanceCounters();
1116 
1117  while (auto n = iterationLogic.numIters()) {
1118  pc.beginMeasure();
1119  Clock::time_point before = Clock::now();
1120  while (n-- > 0) {
1121  op();
1122  }
1123  Clock::time_point after = Clock::now();
1124  pc.endMeasure();
1125  pc.updateResults(iterationLogic.numIters());
1126  iterationLogic.add(after - before, pc);
1127  }
1128  iterationLogic.moveResultTo(mResults);
1129  return *this;
1130 }
1131 
1132 // Performs all evaluations.
1133 template <typename Op>
1134 Bench& Bench::run(char const* benchmarkName, Op&& op) {
1135  name(benchmarkName);
1136  return run(std::forward<Op>(op));
1137 }
1138 
1139 template <typename Op>
1140 Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1141  name(benchmarkName);
1142  return run(std::forward<Op>(op));
1143 }
1144 
1145 template <typename Op>
1146 BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1147  return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1148 }
1149 
1150 template <typename Op>
1151 BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1152  return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1153 }
1154 
1155 // Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
1156 // Any argument is cast to double.
1157 template <typename T>
1158 Bench& Bench::batch(T b) noexcept {
1159  mConfig.mBatch = static_cast<double>(b);
1160  return *this;
1161 }
1162 
1163 // Sets the computation complexity of the next run. Any argument is cast to double.
1164 template <typename T>
1165 Bench& Bench::complexityN(T n) noexcept {
1166  mConfig.mComplexityN = static_cast<double>(n);
1167  return *this;
1168 }
1169 
1170 // Convenience: makes sure the given argument is not optimized away by the compiler.
1171 template <typename Arg>
1172 Bench& Bench::doNotOptimizeAway(Arg&& arg) {
1173  detail::doNotOptimizeAway(std::forward<Arg>(arg));
1174  return *this;
1175 }
1176 
1177 // Makes sure the given argument is not optimized away by the compiler.
1178 template <typename Arg>
1179 void doNotOptimizeAway(Arg&& arg) {
1180  detail::doNotOptimizeAway(std::forward<Arg>(arg));
1181 }
1182 
1183 namespace detail {
1184 
1185 #if defined(_MSC_VER)
1186 template <typename T>
1187 void doNotOptimizeAway(T const& val) {
1188  doNotOptimizeAwaySink(&val);
1189 }
1190 
1191 #endif
1192 
1193 } // namespace detail
1194 } // namespace nanobench
1195 } // namespace ankerl
1196 
1197 #if defined(ANKERL_NANOBENCH_IMPLEMENT)
1198 
1200 // implementation part - only visible in .cpp
1202 
1203 # include <algorithm> // sort, reverse
1204 # include <atomic> // compare_exchange_strong in loop overhead
1205 # include <cstdlib> // getenv
1206 # include <cstring> // strstr, strncmp
1207 # include <fstream> // ifstream to parse proc files
1208 # include <iomanip> // setw, setprecision
1209 # include <iostream> // cout
1210 # include <numeric> // accumulate
1211 # include <random> // random_device
1212 # include <sstream> // to_s in Number
1213 # include <stdexcept> // throw for rendering templates
1214 # include <tuple> // std::tie
1215 # if defined(__linux__)
1216 # include <unistd.h> //sysconf
1217 # endif
1218 # if ANKERL_NANOBENCH(PERF_COUNTERS)
1219 # include <map> // map
1220 
1221 # include <linux/perf_event.h>
1222 # include <sys/ioctl.h>
1223 # include <sys/syscall.h>
1224 # include <unistd.h>
1225 # endif
1226 
1227 // declarations ///////////////////////////////////////////////////////////////////////////////////
1228 
1229 namespace ankerl {
1230 namespace nanobench {
1231 
1232 // helper stuff that is only intended to be used internally
1233 namespace detail {
1234 
1235 struct TableInfo;
1236 
1237 // formatting utilities
1238 namespace fmt {
1239 
1240 class NumSep;
1241 class StreamStateRestorer;
1242 class Number;
1243 class MarkDownColumn;
1244 class MarkDownCode;
1245 
1246 } // namespace fmt
1247 } // namespace detail
1248 } // namespace nanobench
1249 } // namespace ankerl
1250 
1251 // definitions ////////////////////////////////////////////////////////////////////////////////////
1252 
1253 namespace ankerl {
1254 namespace nanobench {
1255 
1256 uint64_t splitMix64(uint64_t& state) noexcept;
1257 
1258 namespace detail {
1259 
1260 // helpers to get double values
1261 template <typename T>
1262 inline double d(T t) noexcept {
1263  return static_cast<double>(t);
1264 }
1265 inline double d(Clock::duration duration) noexcept {
1266  return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
1267 }
1268 
1269 // Calculates clock resolution once, and remembers the result
1270 inline Clock::duration clockResolution() noexcept;
1271 
1272 } // namespace detail
1273 
1274 namespace templates {
1275 
1276 char const* csv() noexcept {
1277  return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1278 {{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1279 {{/result}})DELIM";
1280 }
1281 
1282 char const* htmlBoxplot() noexcept {
1283  return R"DELIM(<html>
1284 
1285 <head>
1286  <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1287 </head>
1288 
1289 <body>
1290  <div id="myDiv"></div>
1291  <script>
1292  var data = [
1293  {{#result}}{
1294  name: '{{name}}',
1295  y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/-last}}{{/measurement}}],
1296  },
1297  {{/result}}
1298  ];
1299  var title = '{{title}}';
1300 
1301  data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1302  var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1303  </script>
1304 </body>
1305 
1306 </html>)DELIM";
1307 }
1308 
1309 char const* json() noexcept {
1310  return R"DELIM({
1311  "results": [
1312 {{#result}} {
1313  "title": "{{title}}",
1314  "name": "{{name}}",
1315  "unit": "{{unit}}",
1316  "batch": {{batch}},
1317  "complexityN": {{complexityN}},
1318  "epochs": {{epochs}},
1319  "clockResolution": {{clockResolution}},
1320  "clockResolutionMultiple": {{clockResolutionMultiple}},
1321  "maxEpochTime": {{maxEpochTime}},
1322  "minEpochTime": {{minEpochTime}},
1323  "minEpochIterations": {{minEpochIterations}},
1324  "epochIterations": {{epochIterations}},
1325  "warmup": {{warmup}},
1326  "relative": {{relative}},
1327  "median(elapsed)": {{median(elapsed)}},
1328  "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
1329  "median(instructions)": {{median(instructions)}},
1330  "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
1331  "median(cpucycles)": {{median(cpucycles)}},
1332  "median(contextswitches)": {{median(contextswitches)}},
1333  "median(pagefaults)": {{median(pagefaults)}},
1334  "median(branchinstructions)": {{median(branchinstructions)}},
1335  "median(branchmisses)": {{median(branchmisses)}},
1336  "totalTime": {{sumProduct(iterations, elapsed)}},
1337  "measurements": [
1338 {{#measurement}} {
1339  "iterations": {{iterations}},
1340  "elapsed": {{elapsed}},
1341  "pagefaults": {{pagefaults}},
1342  "cpucycles": {{cpucycles}},
1343  "contextswitches": {{contextswitches}},
1344  "instructions": {{instructions}},
1345  "branchinstructions": {{branchinstructions}},
1346  "branchmisses": {{branchmisses}}
1347  }{{^-last}},{{/-last}}
1348 {{/measurement}} ]
1349  }{{^-last}},{{/-last}}
1350 {{/result}} ]
1351 })DELIM";
1352 }
1353 
1354 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1355 struct Node {
1356  enum class Type { tag, content, section, inverted_section };
1357 
1358  char const* begin;
1359  char const* end;
1360  std::vector<Node> children;
1361  Type type;
1362 
1363  template <size_t N>
1364  // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
1365  bool operator==(char const (&str)[N]) const noexcept {
1366  return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1367  }
1368 };
1369 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1370 
1371 static std::vector<Node> parseMustacheTemplate(char const** tpl) {
1372  std::vector<Node> nodes;
1373 
1374  while (true) {
1375  auto begin = std::strstr(*tpl, "{{");
1376  auto end = begin;
1377  if (begin != nullptr) {
1378  begin += 2;
1379  end = std::strstr(begin, "}}");
1380  }
1381 
1382  if (begin == nullptr || end == nullptr) {
1383  // nothing found, finish node
1384  nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1385  return nodes;
1386  }
1387 
1388  nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1389 
1390  // we found a tag
1391  *tpl = end + 2;
1392  switch (*begin) {
1393  case '/':
1394  // finished! bail out
1395  return nodes;
1396 
1397  case '#':
1398  nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1399  break;
1400 
1401  case '^':
1402  nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1403  break;
1404 
1405  default:
1406  nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1407  break;
1408  }
1409  }
1410 }
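
// For example, parsing "{{#result}}{{name}}{{/result}}" yields a content node
// for the (empty) text before the tag, a section node "result" whose children
// are parsed recursively (here: a tag node "name"), and a trailing content
// node. A tag starting with '/' ends the innermost section.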
1411 
1412 static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1413  bool matchFirst = n == "-first";
1414  bool matchLast = n == "-last";
1415  if (!matchFirst && !matchLast) {
1416  return false;
1417  }
1418 
1419  bool doWrite = false;
1420  if (n.type == Node::Type::section) {
1421  doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1422  } else if (n.type == Node::Type::inverted_section) {
1423  doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1424  }
1425 
1426  if (doWrite) {
1427  for (auto const& child : n.children) {
1428  if (child.type == Node::Type::content) {
1429  out.write(child.begin, std::distance(child.begin, child.end));
1430  }
1431  }
1432  }
1433  return true;
1434 }
1435 
1436 static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
1437  matchResult.clear();
1438  auto idxOpen = str.find('(');
1439  auto idxClose = str.find(')', idxOpen);
1440  if (idxClose == std::string::npos) {
1441  return false;
1442  }
1443 
1444  matchResult.emplace_back(str.substr(0, idxOpen));
1445 
1446  // split by comma
1447  matchResult.emplace_back(std::string{});
1448  for (size_t i = idxOpen + 1; i != idxClose; ++i) {
1449  if (str[i] == ' ' || str[i] == '\t') {
1450  // skip whitespace
1451  continue;
1452  }
1453  if (str[i] == ',') {
1454  // got a comma => new string
1455  matchResult.emplace_back(std::string{});
1456  continue;
1457  }
1458  // no whitespace no comma, append
1459  matchResult.back() += str[i];
1460  }
1461  return true;
1462 }
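
// For example, "sumProduct(iterations, elapsed)" is split into
// {"sumProduct", "iterations", "elapsed"}: the text before '(' becomes the
// command name, and the comma-separated, whitespace-stripped pieces inside the
// parentheses become its arguments.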
1463 
1464 static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1465  using detail::d;
1466 
1467  if (n == "title") {
1468  out << config.mBenchmarkTitle;
1469  return true;
1470  } else if (n == "name") {
1471  out << config.mBenchmarkName;
1472  return true;
1473  } else if (n == "unit") {
1474  out << config.mUnit;
1475  return true;
1476  } else if (n == "batch") {
1477  out << config.mBatch;
1478  return true;
1479  } else if (n == "complexityN") {
1480  out << config.mComplexityN;
1481  return true;
1482  } else if (n == "epochs") {
1483  out << config.mNumEpochs;
1484  return true;
1485  } else if (n == "clockResolution") {
1486  out << d(detail::clockResolution());
1487  return true;
1488  } else if (n == "clockResolutionMultiple") {
1489  out << config.mClockResolutionMultiple;
1490  return true;
1491  } else if (n == "maxEpochTime") {
1492  out << d(config.mMaxEpochTime);
1493  return true;
1494  } else if (n == "minEpochTime") {
1495  out << d(config.mMinEpochTime);
1496  return true;
1497  } else if (n == "minEpochIterations") {
1498  out << config.mMinEpochIterations;
1499  return true;
1500  } else if (n == "epochIterations") {
1501  out << config.mEpochIterations;
1502  return true;
1503  } else if (n == "warmup") {
1504  out << config.mWarmup;
1505  return true;
1506  } else if (n == "relative") {
1507  out << config.mIsRelative;
1508  return true;
1509  }
1510  return false;
1511 }
1512 
1513 static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1514  if (generateConfigTag(n, r.config(), out)) {
1515  return out;
1516  }
1517  // match e.g. "median(elapsed)"
1518  // g++ 4.8 doesn't implement std::regex :(
1519  // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$");
1520  // std::cmatch matchResult;
1521  // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1522  std::vector<std::string> matchResult;
1523  if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1524  if (matchResult.size() == 2) {
1525  auto m = Result::fromString(matchResult[1]);
1526  if (m == Result::Measure::_size) {
1527  return out << 0.0;
1528  }
1529 
1530  if (matchResult[0] == "median") {
1531  return out << r.median(m);
1532  }
1533  if (matchResult[0] == "average") {
1534  return out << r.average(m);
1535  }
1536  if (matchResult[0] == "medianAbsolutePercentError") {
1537  return out << r.medianAbsolutePercentError(m);
1538  }
1539  if (matchResult[0] == "sum") {
1540  return out << r.sum(m);
1541  }
1542  if (matchResult[0] == "minimum") {
1543  return out << r.minimum(m);
1544  }
1545  if (matchResult[0] == "maximum") {
1546  return out << r.maximum(m);
1547  }
1548  } else if (matchResult.size() == 3) {
1549  auto m1 = Result::fromString(matchResult[1]);
1550  auto m2 = Result::fromString(matchResult[2]);
1551  if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
1552  return out << 0.0;
1553  }
1554 
1555  if (matchResult[0] == "sumProduct") {
1556  return out << r.sumProduct(m1, m2);
1557  }
1558  }
1559  }
1560 
1561  // match e.g. "sumProduct(elapsed, iterations)"
1562  // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$");
1563 
1564  // nothing matches :(
1565  throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1566 }
1567 
1568 static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1569  for (auto const& n : nodes) {
1570  if (!generateFirstLast(n, idx, r.size(), out)) {
1571  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1572  switch (n.type) {
1573  case Node::Type::content:
1574  out.write(n.begin, std::distance(n.begin, n.end));
1575  break;
1576 
1577  case Node::Type::inverted_section:
1578  throw std::runtime_error("got an inverted section inside measurement");
1579 
1580  case Node::Type::section:
1581  throw std::runtime_error("got a section inside measurement");
1582 
1583  case Node::Type::tag: {
1584  auto m = Result::fromString(std::string(n.begin, n.end));
1585  if (m == Result::Measure::_size || !r.has(m)) {
1586  out << 0.0;
1587  } else {
1588  out << r.get(idx, m);
1589  }
1590  break;
1591  }
1592  }
1593  }
1594  }
1595 }
1596 
1597 static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1598  auto const& r = results[idx];
1599  for (auto const& n : nodes) {
1600  if (!generateFirstLast(n, idx, results.size(), out)) {
1601  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1602  switch (n.type) {
1603  case Node::Type::content:
1604  out.write(n.begin, std::distance(n.begin, n.end));
1605  break;
1606 
1607  case Node::Type::inverted_section:
1608  throw std::runtime_error("got an inverted section inside result");
1609 
1610  case Node::Type::section:
1611  if (n == "measurement") {
1612  for (size_t i = 0; i < r.size(); ++i) {
1613  generateResultMeasurement(n.children, i, r, out);
1614  }
1615  } else {
1616  throw std::runtime_error("got a section inside result");
1617  }
1618  break;
1619 
1620  case Node::Type::tag:
1621  generateResultTag(n, r, out);
1622  break;
1623  }
1624  }
1625  }
1626 }
1627 
1628 } // namespace templates
1629 
1630 // helper stuff that is only intended to be used internally
1631 namespace detail {
1632 
1633 char const* getEnv(char const* name);
1634 bool isEndlessRunning(std::string const& name);
1635 
1636 template <typename T>
1637 T parseFile(std::string const& filename);
1638 
1639 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1640 void printStabilityInformationOnce(std::ostream* os);
1641 
1642 // remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
1643 uint64_t& singletonHeaderHash() noexcept;
1644 
1645 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
1646 Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1647 
1648 // formatting utilities
1649 namespace fmt {
1650 
1651 // adds thousands separator to numbers
1652 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1653 class NumSep : public std::numpunct<char> {
1654 public:
1655  explicit NumSep(char sep);
1656  char do_thousands_sep() const override;
1657  std::string do_grouping() const override;
1658 
1659 private:
1660  char mSep;
1661 };
1662 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1663 
1664 // RAII to save & restore a stream's state
1665 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1666 class StreamStateRestorer {
1667 public:
1668  explicit StreamStateRestorer(std::ostream& s);
1669  ~StreamStateRestorer();
1670 
1671  // sets back all stream info that we remembered at construction
1672  void restore();
1673 
1674  // don't allow copying / moving
1675  StreamStateRestorer(StreamStateRestorer const&) = delete;
1676  StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
1677  StreamStateRestorer(StreamStateRestorer&&) = delete;
1678  StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;
1679 
1680 private:
1681  std::ostream& mStream;
1682  std::locale mLocale;
1683  std::streamsize const mPrecision;
1684  std::streamsize const mWidth;
1685  std::ostream::char_type const mFill;
1686  std::ostream::fmtflags const mFmtFlags;
1687 };
1688 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1689 
1690 // Number formatter
1691 class Number {
1692 public:
1693  Number(int width, int precision, double value);
1694  Number(int width, int precision, int64_t value);
1695  std::string to_s() const;
1696 
1697 private:
1698  friend std::ostream& operator<<(std::ostream& os, Number const& n);
1699  std::ostream& write(std::ostream& os) const;
1700 
1701  int mWidth;
1702  int mPrecision;
1703  double mValue;
1704 };
1705 
1706 // helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
1707 std::string to_s(uint64_t s);
1708 
1709 std::ostream& operator<<(std::ostream& os, Number const& n);
1710 
1711 class MarkDownColumn {
1712 public:
1713  MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val);
1714  std::string title() const;
1715  std::string separator() const;
1716  std::string invalid() const;
1717  std::string value() const;
1718 
1719 private:
1720  int mWidth;
1721  int mPrecision;
1722  std::string mTitle;
1723  std::string mSuffix;
1724  double mValue;
1725 };
1726 
1727 // Formats any text as markdown code, escaping backticks.
1728 class MarkDownCode {
1729 public:
1730  explicit MarkDownCode(std::string const& what);
1731 
1732 private:
1733  friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1734  std::ostream& write(std::ostream& os) const;
1735 
1736  std::string mWhat{};
1737 };
1738 
1739 std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1740 
1741 } // namespace fmt
1742 } // namespace detail
1743 } // namespace nanobench
1744 } // namespace ankerl
1745 
1746 // implementation /////////////////////////////////////////////////////////////////////////////////
1747 
1748 namespace ankerl {
1749 namespace nanobench {
1750 
1751 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1752  detail::fmt::StreamStateRestorer restorer(out);
1753 
1754  out.precision(std::numeric_limits<double>::digits10);
1755  auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1756 
1757  for (auto const& n : nodes) {
1758  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1759  switch (n.type) {
1760  case templates::Node::Type::content:
1761  out.write(n.begin, std::distance(n.begin, n.end));
1762  break;
1763 
1764  case templates::Node::Type::inverted_section:
1765  throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
1766 
1767  case templates::Node::Type::section:
1768  if (n == "result") {
1769  const size_t nbResults = results.size();
1770  for (size_t i = 0; i < nbResults; ++i) {
1771  generateResult(n.children, i, results, out);
1772  }
1773  } else {
1774  throw std::runtime_error("unknown section '" + std::string(n.begin, n.end) + "'");
1775  }
1776  break;
1777 
1778  case templates::Node::Type::tag:
1779  // This just uses the last result's config.
1780  if (!generateConfigTag(n, results.back().config(), out)) {
1781  throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
1782  }
1783  break;
1784  }
1785  }
1786 }
1787 
1788 void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
1789  render(mustacheTemplate, bench.results(), out);
1790 }
1791 
1792 namespace detail {
1793 
1794 PerformanceCounters& performanceCounters() {
1795 # if defined(__clang__)
1796 # pragma clang diagnostic push
1797 # pragma clang diagnostic ignored "-Wexit-time-destructors"
1798 # endif
1799  static PerformanceCounters pc;
1800 # if defined(__clang__)
1801 # pragma clang diagnostic pop
1802 # endif
1803  return pc;
1804 }
1805 
1806 // Windows version of doNotOptimizeAway
1807 // see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
1808 // see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280
1809 // see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize
1810 # if defined(_MSC_VER)
1811 # pragma optimize("", off)
1812 void doNotOptimizeAwaySink(void const*) {}
1813 # pragma optimize("", on)
1814 # endif
1815 
1816 template <typename T>
1817 T parseFile(std::string const& filename) {
1818  std::ifstream fin(filename);
1819  T num{};
1820  fin >> num;
1821  return num;
1822 }
1823 
1824 char const* getEnv(char const* name) {
1825 # if defined(_MSC_VER)
1826 # pragma warning(push)
1827 # pragma warning(disable : 4996) // 'getenv': This function or variable may be unsafe.
1828 # endif
1829  return std::getenv(name);
1830 # if defined(_MSC_VER)
1831 # pragma warning(pop)
1832 # endif
1833 }
1834 
1835 bool isEndlessRunning(std::string const& name) {
1836  auto endless = getEnv("NANOBENCH_ENDLESS");
1837  return nullptr != endless && endless == name;
1838 }
1839 
1840 void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
1841  warnings.clear();
1842  recommendations.clear();
1843 
1844  bool recommendCheckFlags = false;
1845 
1846 # if defined(DEBUG)
1847  warnings.emplace_back("DEBUG defined");
1848  recommendCheckFlags = true;
1849 # endif
1850 
1851  bool recommendPyPerf = false;
1852 # if defined(__linux__)
1853  auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
1854  if (nprocs <= 0) {
1855  warnings.emplace_back("couldn't figure out the number of processors - governor/turbo check not possible");
1856  } else {
1857 
1858  // check frequency scaling
1859  for (long id = 0; id < nprocs; ++id) {
1860  auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
1861  auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
1862  auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq");
1863  auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq");
1864  if (minFreq != maxFreq) {
1865  auto minMHz = static_cast<double>(minFreq) / 1000.0;
1866  auto maxMHz = static_cast<double>(maxFreq) / 1000.0;
1867  warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
1868  detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
1869  " MHz");
1870  recommendPyPerf = true;
1871  break;
1872  }
1873  }
1874 
1875  auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
1876  if ("performance" != currentGovernor) {
1877  warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
1878  recommendPyPerf = true;
1879  }
1880 
1881  if (0 == parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo")) {
1882  warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
1883  recommendPyPerf = true;
1884  }
1885  }
1886 # endif
1887 
1888  if (recommendCheckFlags) {
1889  recommendations.emplace_back("Make sure you compile for Release");
1890  }
1891  if (recommendPyPerf) {
1892  recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/vstinner/pyperf");
1893  }
1894 }
1895 
1896 void printStabilityInformationOnce(std::ostream* outStream) {
1897  static bool shouldPrint = true;
1898  if (shouldPrint && outStream) {
1899  auto& os = *outStream;
1900  shouldPrint = false;
1901  std::vector<std::string> warnings;
1902  std::vector<std::string> recommendations;
1903  gatherStabilityInformation(warnings, recommendations);
1904  if (warnings.empty()) {
1905  return;
1906  }
1907 
1908  os << "Warning, results might be unstable:" << std::endl;
1909  for (auto const& w : warnings) {
1910  os << "* " << w << std::endl;
1911  }
1912 
1913  os << std::endl << "Recommendations" << std::endl;
1914  for (auto const& r : recommendations) {
1915  os << "* " << r << std::endl;
1916  }
1917  }
1918 }
1919 
1920 // remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
1921 uint64_t& singletonHeaderHash() noexcept {
1922  static uint64_t sHeaderHash{};
1923  return sHeaderHash;
1924 }
1925 
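// FNV-1a string hash; used below to detect whether the markdown table header
// has to be printed again.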
1927 inline uint64_t fnv1a(std::string const& str) noexcept {
1928  auto val = UINT64_C(14695981039346656037);
1929  for (auto c : str) {
1930  val = (val ^ static_cast<uint8_t>(c)) * UINT64_C(1099511628211);
1931  }
1932  return val;
1933 }
1934 
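// combines two 64-bit hashes using the golden-ratio constant, in the style of
// boost::hash_combine.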
1936 inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
1937  return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
1938 }
1939 
1940 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
1941 Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
1942  auto bestDuration = Clock::duration::max();
1943  Clock::time_point tBegin;
1944  Clock::time_point tEnd;
1945  for (size_t i = 0; i < numEvaluations; ++i) {
1946  tBegin = Clock::now();
1947  do {
1948  tEnd = Clock::now();
1949  } while (tBegin == tEnd);
1950  bestDuration = (std::min)(bestDuration, tEnd - tBegin);
1951  }
1952  return bestDuration;
1953 }
1954 
1955 // Calculates clock resolution once, and remembers the result
1956 Clock::duration clockResolution() noexcept {
1957  static Clock::duration sResolution = calcClockResolution(20);
1958  return sResolution;
1959 }
1960 
1961 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1962 struct IterationLogic::Impl {
1963  enum class State { warmup, upscaling_runtime, measuring, endless };
1964 
1965  explicit Impl(Bench const& bench)
1966  : mBench(bench)
1967  , mResult(bench.config()) {
1968  printStabilityInformationOnce(mBench.output());
1969 
1970  // determine target runtime per epoch
1971  mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
1972  if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
1973  mTargetRuntimePerEpoch = mBench.maxEpochTime();
1974  }
1975  if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
1976  mTargetRuntimePerEpoch = mBench.minEpochTime();
1977  }
1978 
1979  if (isEndlessRunning(mBench.name())) {
1980  std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
1981  mNumIters = (std::numeric_limits<uint64_t>::max)();
1982  mState = State::endless;
1983  } else if (0 != mBench.warmup()) {
1984  mNumIters = mBench.warmup();
1985  mState = State::warmup;
1986  } else if (0 != mBench.epochIterations()) {
1987  // exact number of iterations
1988  mNumIters = mBench.epochIterations();
1989  mState = State::measuring;
1990  } else {
1991  mNumIters = mBench.minEpochIterations();
1992  mState = State::upscaling_runtime;
1993  }
1994  }
1995 
1996  // directly calculates new iters based on elapsed & iters, and adds up to 20% random noise (10% on average). Makes sure we don't underflow.
1997  ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
1998  auto doubleElapsed = d(elapsed);
1999  auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2000  auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2001 
2002  auto doubleMinEpochIters = d(mBench.minEpochIterations());
2003  if (doubleNewIters < doubleMinEpochIters) {
2004  doubleNewIters = doubleMinEpochIters;
2005  }
2006  doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2007 
2008  // +0.5 for correct rounding when casting
2009  // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2010  return static_cast<uint64_t>(doubleNewIters + 0.5);
2011  }
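
 // Worked example (illustrative): with a target of 1 ms per epoch and a
 // measurement of 0.1 ms for 100 iterations, the scaled estimate is
 // 1.0 / 0.1 * 100 = 1000 iterations; after adding noise the next epoch runs
 // between 1000 and 1200 iterations (rounded).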
2012 
2013  ANKERL_NANOBENCH_NO_SANITIZE("integer") void upscale(std::chrono::nanoseconds elapsed) {
2014  if (elapsed * 10 < mTargetRuntimePerEpoch) {
2015  // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
2016  if (mNumIters * 10 < mNumIters) {
2017  // overflow :-(
2018  showResult("iterations overflow. Maybe your code got optimized away?");
2019  mNumIters = 0;
2020  return;
2021  }
2022  mNumIters *= 10;
2023  } else {
2024  mNumIters = calcBestNumIters(elapsed, mNumIters);
2025  }
2026  }
2027 
2028  void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2029 # if defined(ANKERL_NANOBENCH_LOG_ENABLED)
2030  auto oldIters = mNumIters;
2031 # endif
2032 
2033  switch (mState) {
2034  case State::warmup:
2035  if (isCloseEnoughForMeasurements(elapsed)) {
2036  // if elapsed is close enough, we can skip upscaling and go right to measurements
2037  // still, we don't add the result to the measurements.
2038  mState = State::measuring;
2039  mNumIters = calcBestNumIters(elapsed, mNumIters);
2040  } else {
2041  // not close enough: switch to upscaling
2042  mState = State::upscaling_runtime;
2043  upscale(elapsed);
2044  }
2045  break;
2046 
2047  case State::upscaling_runtime:
2048  if (isCloseEnoughForMeasurements(elapsed)) {
2049  // if we are close enough, add measurement and switch to always measuring
2050  mState = State::measuring;
2051  mTotalElapsed += elapsed;
2052  mTotalNumIters += mNumIters;
2053  mResult.add(elapsed, mNumIters, pc);
2054  mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2055  } else {
2056  upscale(elapsed);
2057  }
2058  break;
2059 
2060  case State::measuring:
2061  // just add measurements, no questions asked, even when the runtime is low. We can't ignore
2062  // such fluctuations, or else we would bias the result.
2063  mTotalElapsed += elapsed;
2064  mTotalNumIters += mNumIters;
2065  mResult.add(elapsed, mNumIters, pc);
2066  if (0 != mBench.epochIterations()) {
2067  mNumIters = mBench.epochIterations();
2068  } else {
2069  mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2070  }
2071  break;
2072 
2073  case State::endless:
2074  mNumIters = (std::numeric_limits<uint64_t>::max)();
2075  break;
2076  }
2077 
2078  if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
2079  // we got all the results that we need, finish it
2080  showResult("");
2081  mNumIters = 0;
2082  }
2083 
2084  ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) << " elapsed, "
2085  << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
2086  << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters
2087  << ", mState=" << static_cast<int>(mState));
2088  }
2089 
2090  void showResult(std::string const& errorMessage) const {
2091  ANKERL_NANOBENCH_LOG(errorMessage);
2092 
2093  if (mBench.output() != nullptr) {
2094  // prepare column data ///////
2095  std::vector<fmt::MarkDownColumn> columns;
2096 
2097  auto rMedian = mResult.median(Result::Measure::elapsed);
2098 
2099  if (mBench.relative()) {
2100  double d = 100.0;
2101  if (!mBench.results().empty()) {
2102  d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
2103  }
2104  columns.emplace_back(11, 1, "relative", "%", d);
2105  }
2106 
2107  if (mBench.complexityN() > 0) {
2108  columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
2109  }
2110 
2111  columns.emplace_back(22, 2, "ns/" + mBench.unit(), "", 1e9 * rMedian / mBench.batch());
2112  columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2113 
2114  double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
2115  columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);
2116 
2117  double rInsMedian = -1.0;
2118  if (mResult.has(Result::Measure::instructions)) {
2119  rInsMedian = mResult.median(Result::Measure::instructions);
2120  columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
2121  }
2122 
2123  double rCycMedian = -1.0;
2124  if (mResult.has(Result::Measure::cpucycles)) {
2125  rCycMedian = mResult.median(Result::Measure::cpucycles);
2126  columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
2127  }
2128  if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2129  columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2130  }
2131  if (mResult.has(Result::Measure::branchinstructions)) {
2132  double rBraMedian = mResult.median(Result::Measure::branchinstructions);
2133  columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
2134  if (mResult.has(Result::Measure::branchmisses)) {
2135  double p = 0.0;
2136  if (rBraMedian >= 1e-9) {
2137  p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
2138  }
2139  columns.emplace_back(10, 1, "miss%", "%", p);
2140  }
2141  }
2142 
2143  columns.emplace_back(12, 2, "total", "", mResult.sum(Result::Measure::elapsed));
2144 
2145  // write everything
2146  auto& os = *mBench.output();
2147 
2148  uint64_t hash = 0;
2149  hash = hash_combine(fnv1a(mBench.unit()), hash);
2150  hash = hash_combine(fnv1a(mBench.title()), hash);
2151  hash = hash_combine(mBench.relative(), hash);
2152  hash = hash_combine(mBench.performanceCounters(), hash);
2153 
2154  if (hash != singletonHeaderHash()) {
2155  singletonHeaderHash() = hash;
2156 
2157  // table settings changed (or first result): print a new header
2158  os << std::endl;
2159  for (auto const& col : columns) {
2160  os << col.title();
2161  }
2162  os << "| " << mBench.title() << std::endl;
2163 
2164  for (auto const& col : columns) {
2165  os << col.separator();
2166  }
2167  os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
2168  }
2169 
2170  if (!errorMessage.empty()) {
2171  for (auto const& col : columns) {
2172  os << col.invalid();
2173  }
2174  os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
2175  } else {
2176  for (auto const& col : columns) {
2177  os << col.value();
2178  }
2179  os << "| ";
2180  auto showUnstable = rErrorMedian >= 0.05;
2181  if (showUnstable) {
2182  os << ":wavy_dash: ";
2183  }
2184  os << fmt::MarkDownCode(mBench.name());
2185  if (showUnstable) {
2186  auto avgIters = static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
2187  // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2188  auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);
2189 
2190  os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2191  << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
2192  }
2193  os << std::endl;
2194  }
2195  }
2196  }
2197 
2198  ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
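 // i.e. the epoch ran for at least two thirds of the target runtime per epoch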
2199  return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
2200  }
2201 
2202  uint64_t mNumIters = 1;
2203  Bench const& mBench;
2204  std::chrono::nanoseconds mTargetRuntimePerEpoch{};
2205  Result mResult;
2206  Rng mRng{123};
2207  std::chrono::nanoseconds mTotalElapsed{};
2208  uint64_t mTotalNumIters = 0;
2209 
2210  State mState = State::upscaling_runtime;
2211 };
2212 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2213 
2214 IterationLogic::IterationLogic(Bench const& bench) noexcept
2215  : mPimpl(new Impl(bench)) {}
2216 
2217 IterationLogic::~IterationLogic() {
2218  if (mPimpl) {
2219  delete mPimpl;
2220  }
2221 }
2222 
2223 uint64_t IterationLogic::numIters() const noexcept {
2224  ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
2225  return mPimpl->mNumIters;
2226 }
2227 
2228 void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2229  mPimpl->add(elapsed, pc);
2230 }
2231 
2232 void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2233  results.emplace_back(std::move(mPimpl->mResult));
2234 }
2235 
2236 # if ANKERL_NANOBENCH(PERF_COUNTERS)
2237 
2238 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2239 class LinuxPerformanceCounters {
2240 public:
2241  struct Target {
2242  Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
2243  : targetValue(targetValue_)
2244  , correctMeasuringOverhead(correctMeasuringOverhead_)
2245  , correctLoopOverhead(correctLoopOverhead_) {}
2246 
2247  uint64_t* targetValue{};
2248  bool correctMeasuringOverhead{};
2249  bool correctLoopOverhead{};
2250  };
2251 
2252  ~LinuxPerformanceCounters();
2253 
2254  // quick operation
2255  inline void start() {}
2256 
2257  inline void stop() {}
2258 
2259  bool monitor(perf_sw_ids swId, Target target);
2260  bool monitor(perf_hw_id hwId, Target target);
2261 
2262  bool hasError() const noexcept {
2263  return mHasError;
2264  }
2265 
2266  // Just reading the data is faster than enabling & disabling the counters;
2267  // we subtract the calibrated overhead from the readings ourselves.
2268  inline void beginMeasure() {
2269  if (mHasError) {
2270  return;
2271  }
2272 
2273  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2274  mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2275  if (mHasError) {
2276  return;
2277  }
2278 
2279  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2280  mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2281  }
2282 
2283  inline void endMeasure() {
2284  if (mHasError) {
2285  return;
2286  }
2287 
2288  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2289  mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2290  if (mHasError) {
2291  return;
2292  }
2293 
2294  auto const numBytes = sizeof(uint64_t) * mCounters.size();
2295  auto ret = read(mFd, mCounters.data(), numBytes);
2296  mHasError = ret != static_cast<ssize_t>(numBytes);
2297  }
2298 
2299  void updateResults(uint64_t numIters);
2300 
2301  // rounded integer division
2302  template <typename T>
2303  static inline T divRounded(T a, T divisor) {
2304  return (a + divisor / 2) / divisor;
2305  }
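 // Worked examples: divRounded(7, 2) == 4 because (7 + 1) / 2 == 4, and divRounded(10, 4) == 3
 // because (10 + 2) / 4 == 3; plain integer division would truncate these to 3 and 2.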
2306 
2307  template <typename Op>
2308  ANKERL_NANOBENCH_NO_SANITIZE("integer")
2309  void calibrate(Op&& op) {
2310  // clear current calibration data,
2311  for (auto& v : mCalibratedOverhead) {
2312  v = UINT64_C(0);
2313  }
2314 
2315  // create new calibration data
2316  auto newCalibration = mCalibratedOverhead;
2317  for (auto& v : newCalibration) {
2318  v = (std::numeric_limits<uint64_t>::max)();
2319  }
2320  for (size_t iter = 0; iter < 100; ++iter) {
2321  beginMeasure();
2322  op();
2323  endMeasure();
2324  if (mHasError) {
2325  return;
2326  }
2327 
2328  for (size_t i = 0; i < newCalibration.size(); ++i) {
2329  auto diff = mCounters[i];
2330  if (newCalibration[i] > diff) {
2331  newCalibration[i] = diff;
2332  }
2333  }
2334  }
2335 
2336  mCalibratedOverhead = std::move(newCalibration);
2337 
2338  {
2339  // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
2340  // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
2341  // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
2342  // see https://godbolt.org/z/49RVQ5
2343  uint64_t const numIters = 100000U + (std::random_device{}() & 3);
2344  uint64_t n = numIters;
2345  uint32_t x = 1234567;
2346  auto fn = [&]() {
2347  x ^= x << 13;
2348  x ^= x >> 17;
2349  x ^= x << 5;
2350  };
2351 
2352  beginMeasure();
2353  while (n-- > 0) {
2354  fn();
2355  }
2356  endMeasure();
2357  detail::doNotOptimizeAway(x);
2358  auto measure1 = mCounters;
2359 
2360  n = numIters;
2361  beginMeasure();
2362  while (n-- > 0) {
2363  // we now run *twice* so we can easily calculate the overhead
2364  fn();
2365  fn();
2366  }
2367  endMeasure();
2368  detail::doNotOptimizeAway(x);
2369  auto measure2 = mCounters;
2370 
2371  for (size_t i = 0; i < mCounters.size(); ++i) {
2372  // factor 2 because the second run performs two fn() calls per loop iteration
2373  auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2374  auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2375  auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2376 
2377  mLoopOverhead[i] = divRounded(overhead, numIters);
2378  }
2379  }
2380  }
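 // Why "m1 * 2 - m2" above: per loop iteration the first run costs roughly
 // loop_overhead + cost(fn) and the second run loop_overhead + 2 * cost(fn), both summed
 // over numIters iterations (minus the calibrated measuring overhead). Therefore
 //   2 * measure1 - measure2 ~= numIters * loop_overhead,
 // and divRounded(overhead, numIters) yields the per-iteration loop overhead that
 // updateResults() can later subtract when correctLoopOverhead is set.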
2381 
2382 private:
2383  bool monitor(uint32_t type, uint64_t eventid, Target target);
2384 
2385  std::map<uint64_t, Target> mIdToTarget{};
2386 
2387  // start with minimum size of 3 for read_format
2388  std::vector<uint64_t> mCounters{3};
2389  std::vector<uint64_t> mCalibratedOverhead{3};
2390  std::vector<uint64_t> mLoopOverhead{3};
2391 
2392  uint64_t mTimeEnabledNanos = 0;
2393  uint64_t mTimeRunningNanos = 0;
2394  int mFd = -1;
2395  bool mHasError = false;
2396 };
2397 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2398 
2399 LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2400  if (-1 != mFd) {
2401  close(mFd);
2402  }
2403 }
2404 
2405 bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2406  return monitor(PERF_TYPE_SOFTWARE, swId, target);
2407 }
2408 
2409 bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2410  return monitor(PERF_TYPE_HARDWARE, hwId, target);
2411 }
2412 
2413 // overflow is ok, it's checked
2414 ANKERL_NANOBENCH_NO_SANITIZE("integer")
2415 void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
2416  // clear old data
2417  for (auto& id_value : mIdToTarget) {
2418  *id_value.second.targetValue = UINT64_C(0);
2419  }
2420 
2421  if (mHasError) {
2422  return;
2423  }
2424 
2425  mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2426  mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2427 
2428  for (uint64_t i = 0; i < mCounters[0]; ++i) {
2429  auto idx = static_cast<size_t>(3 + i * 2 + 0);
2430  auto id = mCounters[idx + 1U];
2431 
2432  auto it = mIdToTarget.find(id);
2433  if (it != mIdToTarget.end()) {
2434 
2435  auto& tgt = it->second;
2436  *tgt.targetValue = mCounters[idx];
2437  if (tgt.correctMeasuringOverhead) {
2438  if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2439  *tgt.targetValue -= mCalibratedOverhead[idx];
2440  } else {
2441  *tgt.targetValue = 0U;
2442  }
2443  }
2444  if (tgt.correctLoopOverhead) {
2445  auto correctionVal = mLoopOverhead[idx] * numIters;
2446  if (*tgt.targetValue >= correctionVal) {
2447  *tgt.targetValue -= correctionVal;
2448  } else {
2449  *tgt.targetValue = 0U;
2450  }
2451  }
2452  }
2453  }
2454 }
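// Layout of the data read above: with PERF_FORMAT_GROUP | PERF_FORMAT_ID |
// PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING, a read() on the group
// leader fills the buffer as
//   { u64 nr; u64 time_enabled; u64 time_running; struct { u64 value; u64 id; } cnt[nr]; }
// which is why mCounters[0] is the number of counters, mCounters[1] and [2] are the times,
// and counter i lives at index 3 + 2*i (value) with its id at index 3 + 2*i + 1.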
2455 
2456 bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
2457  *target.targetValue = (std::numeric_limits<uint64_t>::max)();
2458  if (mHasError) {
2459  return false;
2460  }
2461 
2462  auto pea = perf_event_attr();
2463  std::memset(&pea, 0, sizeof(perf_event_attr));
2464  pea.type = type;
2465  pea.size = sizeof(perf_event_attr);
2466  pea.config = eventid;
2467  pea.disabled = 1; // start counter as disabled
2468  pea.exclude_kernel = 1;
2469  pea.exclude_hv = 1;
2470 
2471  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2472  pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
2473 
2474  const int pid = 0; // the current process
2475  const int cpu = -1; // all CPUs
2476 # if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
2477  const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2478 # else
2479  const unsigned long flags = 0;
2480 # endif
2481 
2482  auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
2483  if (-1 == fd) {
2484  return false;
2485  }
2486  if (-1 == mFd) {
2487  // first call: set to fd, and use this from now on
2488  mFd = fd;
2489  }
2490  uint64_t id = 0;
2491  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2492  if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
2493  // couldn't get id
2494  return false;
2495  }
2496 
2497  // insert into the map, relying on the fact that references into a std::map remain valid after further insertions.
2498  mIdToTarget.emplace(id, target);
2499 
2500  // prepare the read_format buffer with the correct size (after the insert)
2501  auto size = 3 + 2 * mIdToTarget.size();
2502  mCounters.resize(size);
2503  mCalibratedOverhead.resize(size);
2504  mLoopOverhead.resize(size);
2505 
2506  return true;
2507 }
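// How the event group is built: the first successful monitor() call passes group_fd == -1,
// so the kernel creates a new group and that fd becomes mFd, the group leader. Every later
// call passes mFd as group_fd, attaching the new event to the same group. This is what lets
// beginMeasure()/endMeasure() reset, enable, disable and read all counters with a single
// ioctl()/read() using PERF_IOC_FLAG_GROUP, and PERF_EVENT_IOC_ID provides the id used to
// match each value back to its Target in updateResults().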
2508 
2509 PerformanceCounters::PerformanceCounters()
2510  : mPc(new LinuxPerformanceCounters())
2511  , mVal()
2512  , mHas() {
2513 
2514  mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
2515  mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
2516  mHas.contextSwitches =
2517  mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
2518  mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
2519  mHas.branchInstructions =
2520  mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
2521  mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
2522  // mHas.branchMisses = false;
2523 
2524  mPc->start();
2525  mPc->calibrate([] {
2526  auto before = ankerl::nanobench::Clock::now();
2527  auto after = ankerl::nanobench::Clock::now();
2528  (void)before;
2529  (void)after;
2530  });
2531 
2532  if (mPc->hasError()) {
2533  // something failed, don't monitor anything.
2534  mHas = PerfCountSet<bool>{};
2535  }
2536 }
2537 
2538 PerformanceCounters::~PerformanceCounters() {
2539  if (nullptr != mPc) {
2540  delete mPc;
2541  }
2542 }
2543 
2544 void PerformanceCounters::beginMeasure() {
2545  mPc->beginMeasure();
2546 }
2547 
2548 void PerformanceCounters::endMeasure() {
2549  mPc->endMeasure();
2550 }
2551 
2552 void PerformanceCounters::updateResults(uint64_t numIters) {
2553  mPc->updateResults(numIters);
2554 }
2555 
2556 # else
2557 
2558 PerformanceCounters::PerformanceCounters() = default;
2559 PerformanceCounters::~PerformanceCounters() = default;
2560 void PerformanceCounters::beginMeasure() {}
2561 void PerformanceCounters::endMeasure() {}
2562 void PerformanceCounters::updateResults(uint64_t) {}
2563 
2564 # endif
2565 
2566 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
2567  return mVal;
2568 }
2569 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
2570  return mHas;
2571 }
2572 
2573 // formatting utilities
2574 namespace fmt {
2575 
2576 // adds thousands separator to numbers
2577 NumSep::NumSep(char sep)
2578  : mSep(sep) {}
2579 
2580 char NumSep::do_thousands_sep() const {
2581  return mSep;
2582 }
2583 
2584 std::string NumSep::do_grouping() const {
2585  return "\003";
2586 }
2587 
2588 // RAII to save & restore a stream's state
2589 StreamStateRestorer::StreamStateRestorer(std::ostream& s)
2590  : mStream(s)
2591  , mLocale(s.getloc())
2592  , mPrecision(s.precision())
2593  , mWidth(s.width())
2594  , mFill(s.fill())
2595  , mFmtFlags(s.flags()) {}
2596 
2597 StreamStateRestorer::~StreamStateRestorer() {
2598  restore();
2599 }
2600 
2601 // sets back all stream info that we remembered at construction
2602 void StreamStateRestorer::restore() {
2603  mStream.imbue(mLocale);
2604  mStream.precision(mPrecision);
2605  mStream.width(mWidth);
2606  mStream.fill(mFill);
2607  mStream.flags(mFmtFlags);
2608 }
2609 
2610 Number::Number(int width, int precision, int64_t value)
2611  : mWidth(width)
2612  , mPrecision(precision)
2613  , mValue(static_cast<double>(value)) {}
2614 
2615 Number::Number(int width, int precision, double value)
2616  : mWidth(width)
2617  , mPrecision(precision)
2618  , mValue(value) {}
2619 
2620 std::ostream& Number::write(std::ostream& os) const {
2621  StreamStateRestorer restorer(os);
2622  os.imbue(std::locale(os.getloc(), new NumSep(',')));
2623  os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
2624  return os;
2625 }
2626 
2627 std::string Number::to_s() const {
2628  std::stringstream ss;
2629  write(ss);
2630  return ss.str();
2631 }
2632 
2633 std::string to_s(uint64_t n) {
2634  std::string str;
2635  do {
2636  str += static_cast<char>('0' + static_cast<char>(n % 10));
2637  n /= 10;
2638  } while (n != 0);
2639  std::reverse(str.begin(), str.end());
2640  return str;
2641 }
2642 
2643 std::ostream& operator<<(std::ostream& os, Number const& n) {
2644  return n.write(os);
2645 }
2646 
2647 MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val)
2648  : mWidth(w)
2649  , mPrecision(prec)
2650  , mTitle(tit)
2651  , mSuffix(suff)
2652  , mValue(val) {}
2653 
2654 std::string MarkDownColumn::title() const {
2655  std::stringstream ss;
2656  ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
2657  return ss.str();
2658 }
2659 
2660 std::string MarkDownColumn::separator() const {
2661  std::string sep(static_cast<size_t>(mWidth), '-');
2662  sep.front() = '|';
2663  sep.back() = ':';
2664  return sep;
2665 }
2666 
2667 std::string MarkDownColumn::invalid() const {
2668  std::string sep(static_cast<size_t>(mWidth), ' ');
2669  sep.front() = '|';
2670  sep[sep.size() - 2] = '-';
2671  return sep;
2672 }
2673 
2674 std::string MarkDownColumn::value() const {
2675  std::stringstream ss;
2676  auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
2677  ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
2678  return ss.str();
2679 }
2680 
2681 // Formats any text as markdown code, escaping backticks.
2682 MarkDownCode::MarkDownCode(std::string const& what) {
2683  mWhat.reserve(what.size() + 2);
2684  mWhat.push_back('`');
2685  for (char c : what) {
2686  mWhat.push_back(c);
2687  if ('`' == c) {
2688  mWhat.push_back('`');
2689  }
2690  }
2691  mWhat.push_back('`');
2692 }
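// Example: MarkDownCode("a`b") stores "`a``b`", i.e. the surrounding backticks are added and
// any embedded backtick is doubled so it cannot terminate the code span early.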
2693 
2694 std::ostream& MarkDownCode::write(std::ostream& os) const {
2695  return os << mWhat;
2696 }
2697 
2698 std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
2699  return mdCode.write(os);
2700 }
2701 } // namespace fmt
2702 } // namespace detail
2703 
2704 // provide implementation here so it's only generated once
2705 Config::Config() = default;
2706 Config::~Config() = default;
2707 Config& Config::operator=(Config const&) = default;
2708 Config& Config::operator=(Config&&) = default;
2709 Config::Config(Config const&) = default;
2710 Config::Config(Config&&) noexcept = default;
2711 
2712 // provide implementation here so it's only generated once
2713 Result::~Result() = default;
2714 Result& Result::operator=(Result const&) = default;
2715 Result& Result::operator=(Result&&) = default;
2716 Result::Result(Result const&) = default;
2717 Result::Result(Result&&) noexcept = default;
2718 
2719 namespace detail {
2720 template <typename T>
2721 inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
2722  return static_cast<typename std::underlying_type<T>::type>(val);
2723 }
2724 } // namespace detail
2725 
2726 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
2727 Result::Result(Config const& benchmarkConfig)
2728  : mConfig(benchmarkConfig)
2729  , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
2730 
2731 void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
2732  using detail::d;
2733  using detail::u;
2734 
2735  double dIters = d(iters);
2736  mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
2737 
2738  mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
2739  if (pc.has().pageFaults) {
2740  mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
2741  }
2742  if (pc.has().cpuCycles) {
2743  mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
2744  }
2745  if (pc.has().contextSwitches) {
2746  mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
2747  }
2748  if (pc.has().instructions) {
2749  mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
2750  }
2751  if (pc.has().branchInstructions) {
2752  double branchInstructions = 0.0;
2753  // correcting branches: remove branch introduced by the while (...) loop for each iteration.
2754  if (pc.val().branchInstructions > iters + 1U) {
2755  branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
2756  }
2757  mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
2758 
2759  if (pc.has().branchMisses) {
2760  // correcting branch misses
2761  double branchMisses = d(pc.val().branchMisses);
2762  if (branchMisses > branchInstructions) {
2763  // can't have branch misses when there were branches...
2764  branchMisses = branchInstructions;
2765  }
2766 
2767  // assuming at least one missed branch for the loop
2768  branchMisses -= 1.0;
2769  if (branchMisses < 1.0) {
2770  branchMisses = 1.0;
2771  }
2772  mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
2773  }
2774  }
2775 }
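// Rationale for the corrections above: the measurement loop itself executes one conditional
// branch per iteration plus its final exit branch, so iters + 1 branches are removed before
// normalizing. Likewise one branch miss (the mispredicted loop exit) is subtracted from the
// miss count, and the result is floored at 1 so it never becomes zero or negative.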
2776 
2777 Config const& Result::config() const noexcept {
2778  return mConfig;
2779 }
2780 
2781 inline double calcMedian(std::vector<double>& data) {
2782  if (data.empty()) {
2783  return 0.0;
2784  }
2785  std::sort(data.begin(), data.end());
2786 
2787  auto midIdx = data.size() / 2U;
2788  if (1U == (data.size() & 1U)) {
2789  return data[midIdx];
2790  }
2791  return (data[midIdx - 1U] + data[midIdx]) / 2U;
2792 }
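// Worked examples: {3, 1, 2} sorts to {1, 2, 3}; the size is odd and midIdx == 1, so the
// median is 2. {4, 1, 3, 2} sorts to {1, 2, 3, 4}; the size is even and midIdx == 2, so the
// median is (2 + 3) / 2 == 2.5.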
2793 
2794 double Result::median(Measure m) const {
2795  // create a copy so we can sort
2796  auto data = mNameToMeasurements[detail::u(m)];
2797  return calcMedian(data);
2798 }
2799 
2800 double Result::average(Measure m) const {
2801  using detail::d;
2802  auto const& data = mNameToMeasurements[detail::u(m)];
2803  if (data.empty()) {
2804  return 0.0;
2805  }
2806 
2807  // average = sum of all measurements divided by their count
2808  return sum(m) / d(data.size());
2809 }
2810 
2811 double Result::medianAbsolutePercentError(Measure m) const {
2812  // create copy
2813  auto data = mNameToMeasurements[detail::u(m)];
2814 
2815  // calculates MdAPE which is the median of percentage error
2816  // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape
2817  auto med = calcMedian(data);
2818 
2819  // transform the data to absolute error
2820  for (auto& x : data) {
2821  x = (x - med) / x;
2822  if (x < 0) {
2823  x = -x;
2824  }
2825  }
2826  return calcMedian(data);
2827 }
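// Example: for measurements {9, 10, 12} the median is 10, the absolute percentage errors
// |x - med| / x are roughly {0.111, 0, 0.167}, and their median 0.111 is what ends up as
// 11.1% in the "err%" output column.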
2828 
2829 double Result::sum(Measure m) const noexcept {
2830  auto const& data = mNameToMeasurements[detail::u(m)];
2831  return std::accumulate(data.begin(), data.end(), 0.0);
2832 }
2833 
2834 double Result::sumProduct(Measure m1, Measure m2) const noexcept {
2835  auto const& data1 = mNameToMeasurements[detail::u(m1)];
2836  auto const& data2 = mNameToMeasurements[detail::u(m2)];
2837 
2838  if (data1.size() != data2.size()) {
2839  return 0.0;
2840  }
2841 
2842  double result = 0.0;
2843  for (size_t i = 0, s = data1.size(); i != s; ++i) {
2844  result += data1[i] * data2[i];
2845  }
2846  return result;
2847 }
2848 
2849 bool Result::has(Measure m) const noexcept {
2850  return !mNameToMeasurements[detail::u(m)].empty();
2851 }
2852 
2853 double Result::get(size_t idx, Measure m) const {
2854  auto const& data = mNameToMeasurements[detail::u(m)];
2855  return data.at(idx);
2856 }
2857 
2858 bool Result::empty() const noexcept {
2859  return 0U == size();
2860 }
2861 
2862 size_t Result::size() const noexcept {
2863  auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
2864  return data.size();
2865 }
2866 
2867 double Result::minimum(Measure m) const noexcept {
2868  auto const& data = mNameToMeasurements[detail::u(m)];
2869  if (data.empty()) {
2870  return 0.0;
2871  }
2872 
2873  // here it's safe to assume that at least one element is there
2874  return *std::min_element(data.begin(), data.end());
2875 }
2876 
2877 double Result::maximum(Measure m) const noexcept {
2878  auto const& data = mNameToMeasurements[detail::u(m)];
2879  if (data.empty()) {
2880  return 0.0;
2881  }
2882 
2883  // here it's safe to assume that at least one element is there
2884  return *std::max_element(data.begin(), data.end());
2885 }
2886 
2887 Result::Measure Result::fromString(std::string const& str) {
2888  if (str == "elapsed") {
2889  return Measure::elapsed;
2890  } else if (str == "iterations") {
2891  return Measure::iterations;
2892  } else if (str == "pagefaults") {
2893  return Measure::pagefaults;
2894  } else if (str == "cpucycles") {
2895  return Measure::cpucycles;
2896  } else if (str == "contextswitches") {
2897  return Measure::contextswitches;
2898  } else if (str == "instructions") {
2899  return Measure::instructions;
2900  } else if (str == "branchinstructions") {
2901  return Measure::branchinstructions;
2902  } else if (str == "branchmisses") {
2903  return Measure::branchmisses;
2904  } else {
2905  // not found, return _size
2906  return Measure::_size;
2907  }
2908 }
2909 
2910 // Configuration of a microbenchmark.
2911 Bench::Bench() {
2912  mConfig.mOut = &std::cout;
2913 }
2914 
2915 Bench::Bench(Bench&&) = default;
2916 Bench& Bench::operator=(Bench&&) = default;
2917 Bench::Bench(Bench const&) = default;
2918 Bench& Bench::operator=(Bench const&) = default;
2919 Bench::~Bench() noexcept = default;
2920 
2921 double Bench::batch() const noexcept {
2922  return mConfig.mBatch;
2923 }
2924 
2925 double Bench::complexityN() const noexcept {
2926  return mConfig.mComplexityN;
2927 }
2928 
2929 // Set a baseline to compare against. 100% means it is exactly as fast as the baseline, >100% means it is faster than the
2930 // baseline, <100% means it is slower than the baseline.
2931 Bench& Bench::relative(bool isRelativeEnabled) noexcept {
2932  mConfig.mIsRelative = isRelativeEnabled;
2933  return *this;
2934 }
2935 bool Bench::relative() const noexcept {
2936  return mConfig.mIsRelative;
2937 }
2938 
2939 Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
2940  mConfig.mShowPerformanceCounters = showPerformanceCounters;
2941  return *this;
2942 }
2943 bool Bench::performanceCounters() const noexcept {
2944  return mConfig.mShowPerformanceCounters;
2945 }
2946 
2947 // Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
2948 // If u differs from the currently set unit, the stored results will be cleared.
2949 // Use singular (byte, not bytes).
2950 Bench& Bench::unit(char const* u) {
2951  if (u != mConfig.mUnit) {
2952  mResults.clear();
2953  }
2954  mConfig.mUnit = u;
2955  return *this;
2956 }
2957 
2958 Bench& Bench::unit(std::string const& u) {
2959  return unit(u.c_str());
2960 }
2961 
2962 std::string const& Bench::unit() const noexcept {
2963  return mConfig.mUnit;
2964 }
2965 
2966 // If benchmarkTitle differs from the currently set title, the stored results will be cleared.
2967 Bench& Bench::title(const char* benchmarkTitle) {
2968  if (benchmarkTitle != mConfig.mBenchmarkTitle) {
2969  mResults.clear();
2970  }
2971  mConfig.mBenchmarkTitle = benchmarkTitle;
2972  return *this;
2973 }
2974 Bench& Bench::title(std::string const& benchmarkTitle) {
2975  if (benchmarkTitle != mConfig.mBenchmarkTitle) {
2976  mResults.clear();
2977  }
2978  mConfig.mBenchmarkTitle = benchmarkTitle;
2979  return *this;
2980 }
2981 
2982 std::string const& Bench::title() const noexcept {
2983  return mConfig.mBenchmarkTitle;
2984 }
2985 
2986 Bench& Bench::name(const char* benchmarkName) {
2987  mConfig.mBenchmarkName = benchmarkName;
2988  return *this;
2989 }
2990 
2991 Bench& Bench::name(std::string const& benchmarkName) {
2992  mConfig.mBenchmarkName = benchmarkName;
2993  return *this;
2994 }
2995 
2996 std::string const& Bench::name() const noexcept {
2997  return mConfig.mBenchmarkName;
2998 }
2999 
3000 // Number of epochs to evaluate. The reported result is the median over the per-epoch evaluations.
3001 Bench& Bench::epochs(size_t numEpochs) noexcept {
3002  mConfig.mNumEpochs = numEpochs;
3003  return *this;
3004 }
3005 size_t Bench::epochs() const noexcept {
3006  return mConfig.mNumEpochs;
3007 }
3008 
3009 // The desired evaluation time is a multiple of the clock resolution. The default is 1000 times the measured clock resolution.
3010 Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
3011  mConfig.mClockResolutionMultiple = multiple;
3012  return *this;
3013 }
3014 size_t Bench::clockResolutionMultiple() const noexcept {
3015  return mConfig.mClockResolutionMultiple;
3016 }
3017 
3018 // Sets the maximum time each epoch should take. Default is 100ms.
3019 Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
3020  mConfig.mMaxEpochTime = t;
3021  return *this;
3022 }
3023 std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3024  return mConfig.mMaxEpochTime;
3025 }
3026 
3027 // Sets the minimum time each epoch should take.
3028 Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
3029  mConfig.mMinEpochTime = t;
3030  return *this;
3031 }
3032 std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3033  return mConfig.mMinEpochTime;
3034 }
3035 
3036 Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
3037  mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3038  return *this;
3039 }
3040 uint64_t Bench::minEpochIterations() const noexcept {
3041  return mConfig.mMinEpochIterations;
3042 }
3043 
3044 Bench& Bench::epochIterations(uint64_t numIters) noexcept {
3045  mConfig.mEpochIterations = numIters;
3046  return *this;
3047 }
3048 uint64_t Bench::epochIterations() const noexcept {
3049  return mConfig.mEpochIterations;
3050 }
3051 
3052 Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
3053  mConfig.mWarmup = numWarmupIters;
3054  return *this;
3055 }
3056 uint64_t Bench::warmup() const noexcept {
3057  return mConfig.mWarmup;
3058 }
3059 
3060 Bench& Bench::config(Config const& benchmarkConfig) {
3061  mConfig = benchmarkConfig;
3062  return *this;
3063 }
3064 Config const& Bench::config() const noexcept {
3065  return mConfig;
3066 }
3067 
3068 Bench& Bench::output(std::ostream* outstream) noexcept {
3069  mConfig.mOut = outstream;
3070  return *this;
3071 }
3072 
3073 ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
3074  return mConfig.mOut;
3075 }
3076 
3077 std::vector<Result> const& Bench::results() const noexcept {
3078  return mResults;
3079 }
3080 
3081 Bench& Bench::render(char const* templateContent, std::ostream& os) {
3082  ::ankerl::nanobench::render(templateContent, *this, os);
3083  return *this;
3084 }
3085 
3086 std::vector<BigO> Bench::complexityBigO() const {
3087  std::vector<BigO> bigOs;
3088  auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3089  bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
3090  return 1.0;
3091  });
3092  bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
3093  return n;
3094  });
3095  bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
3096  return std::log2(n);
3097  });
3098  bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
3099  return n * std::log2(n);
3100  });
3101  bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
3102  return n * n;
3103  });
3104  bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
3105  return n * n * n;
3106  });
3107  std::sort(bigOs.begin(), bigOs.end());
3108  return bigOs;
3109 }
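// A minimal usage sketch (benchmark name, sizes and the summation body are illustrative,
// not taken from this file): run the same benchmark for several problem sizes with
// complexityN() set, then let complexityBigO() pick the best-fitting curve:
//
//   ankerl::nanobench::Bench bench;
//   for (size_t n = 100; n <= 100000; n *= 10) {
//       std::vector<uint64_t> v(n, 1);
//       bench.complexityN(n).run("sum", [&] {
//           uint64_t s = 0;
//           for (auto x : v) { s += x; }
//           ankerl::nanobench::doNotOptimizeAway(s);
//       });
//   }
//   std::cout << bench.complexityBigO() << std::endl;
//
// complexityBigO() sorts its candidates by normalized root mean square error, so the first
// printed row is the best fit (here it should come out as O(n)).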
3110 
3111 Rng::Rng()
3112  : mX(0)
3113  , mY(0) {
3114  std::random_device rd;
3115  std::uniform_int_distribution<uint64_t> dist;
3116  do {
3117  mX = dist(rd);
3118  mY = dist(rd);
3119  } while (mX == 0 && mY == 0);
3120 }
3121 
3122 ANKERL_NANOBENCH_NO_SANITIZE("integer")
3123 uint64_t splitMix64(uint64_t& state) noexcept {
3124  uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3125  z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3126  z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3127  return z ^ (z >> 31U);
3128 }
3129 
3130 // Seeded as described in the Romu paper (April 2020 update).
3131 Rng::Rng(uint64_t seed) noexcept
3132  : mX(splitMix64(seed))
3133  , mY(splitMix64(seed)) {
3134  for (size_t i = 0; i < 10; ++i) {
3135  operator()();
3136  }
3137 }
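// Note: splitMix64() advances the shared seed on every call, so mX and mY receive two
// distinct, well-mixed 64-bit words from a single user-supplied seed, and the ten discarded
// outputs above further scramble the initial state before the generator is used.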
3138 
3139 // only internally used to copy the RNG.
3140 Rng::Rng(uint64_t x, uint64_t y) noexcept
3141  : mX(x)
3142  , mY(y) {}
3143 
3144 Rng Rng::copy() const noexcept {
3145  return Rng{mX, mY};
3146 }
3147 
3148 BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
3149  BigO::RangeMeasure rangeMeasure;
3150  for (auto const& result : results) {
3151  if (result.config().mComplexityN > 0.0) {
3152  rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3153  }
3154  }
3155  return rangeMeasure;
3156 }
3157 
3158 BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure)
3159  : mName(bigOName) {
3160 
3161  // estimate the constant factor
3162  double sumRangeMeasure = 0.0;
3163  double sumRangeRange = 0.0;
3164 
3165  for (size_t i = 0; i < rangeMeasure.size(); ++i) {
3166  sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
3167  sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
3168  }
3169  mConstant = sumRangeMeasure / sumRangeRange;
3170 
3171  // calculate root mean square
3172  double err = 0.0;
3173  double sumMeasure = 0.0;
3174  for (size_t i = 0; i < rangeMeasure.size(); ++i) {
3175  auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
3176  err += diff * diff;
3177 
3178  sumMeasure += rangeMeasure[i].second;
3179  }
3180 
3181  auto n = static_cast<double>(rangeMeasure.size());
3182  auto mean = sumMeasure / n;
3183  mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3184 }
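// Derivation of mConstant above: fitting the measurements t_i with c * f(n_i) in the least
// squares sense minimizes sum_i (c * f(n_i) - t_i)^2, and setting the derivative with
// respect to c to zero gives
//   c = sum_i(f(n_i) * t_i) / sum_i(f(n_i)^2),
// which is exactly sumRangeMeasure / sumRangeRange (rangeMeasure[i].first holds the problem
// size, already mapped through the complexity function by the delegating constructor).
// The reported error is the root mean square of the residuals divided by the mean
// measurement, so fits for differently scaled measures remain comparable.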
3185 
3186 BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
3187  : BigO(std::string(bigOName), rangeMeasure) {}
3188 
3189 std::string const& BigO::name() const noexcept {
3190  return mName;
3191 }
3192 
3193 double BigO::constant() const noexcept {
3194  return mConstant;
3195 }
3196 
3197 double BigO::normalizedRootMeanSquare() const noexcept {
3198  return mNormalizedRootMeanSquare;
3199 }
3200 
3201 bool BigO::operator<(BigO const& other) const noexcept {
3202  return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3203 }
3204 
3205 std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
3206  return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
3207 }
3208 
3209 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
3210  detail::fmt::StreamStateRestorer restorer(os);
3211  os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
3212  for (auto const& bigO : bigOs) {
3213  os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
3214  os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
3215  os << "| " << bigO.name();
3216  os << std::endl;
3217  }
3218  return os;
3219 }
3220 
3221 } // namespace nanobench
3222 } // namespace ankerl
3223 
3224 #endif // ANKERL_NANOBENCH_IMPLEMENT
3225 #endif // ANKERL_NANOBENCH_H_INCLUDED