Bitcoin Core  22.99.0
P2P Digital Currency
nanobench.h
Go to the documentation of this file.
1 // __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2 // | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3 // | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4 //
5 // Microbenchmark framework for C++11/14/17/20
6 // https://github.com/martinus/nanobench
7 //
8 // Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9 // SPDX-License-Identifier: MIT
10 // Copyright (c) 2019-2021 Martin Ankerl <martin.ankerl@gmail.com>
11 //
12 // Permission is hereby granted, free of charge, to any person obtaining a copy
13 // of this software and associated documentation files (the "Software"), to deal
14 // in the Software without restriction, including without limitation the rights
15 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 // copies of the Software, and to permit persons to whom the Software is
17 // furnished to do so, subject to the following conditions:
18 //
19 // The above copyright notice and this permission notice shall be included in all
20 // copies or substantial portions of the Software.
21 //
22 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 // SOFTWARE.
29 
30 #ifndef ANKERL_NANOBENCH_H_INCLUDED
31 #define ANKERL_NANOBENCH_H_INCLUDED
32 
33 // see https://semver.org/
34 #define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35 #define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
36 #define ANKERL_NANOBENCH_VERSION_PATCH 4 // backwards-compatible bug fixes
37 
39 // public facing api - as minimal as possible
41 
42 #include <chrono> // high_resolution_clock
43 #include <cstring> // memcpy
44 #include <iosfwd> // for std::ostream* custom output target in Config
45 #include <string> // all names
46 #include <vector> // holds all results
47 
48 #define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
49 
50 #define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
51 #define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
52 #define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
53 #define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
54 #define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
55 
56 #if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
57 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
58 #else
59 # define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
60 #endif
61 
62 #if defined(__clang__)
63 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
64  _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
65 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
66 #else
67 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
68 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
69 #endif
70 
71 #if defined(__GNUC__)
72 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
73 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
74 #else
75 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
76 # define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
77 #endif
78 
79 #if defined(ANKERL_NANOBENCH_LOG_ENABLED)
80 # include <iostream>
81 # define ANKERL_NANOBENCH_LOG(x) \
82  do { \
83  std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
84  } while (0)
85 #else
86 # define ANKERL_NANOBENCH_LOG(x) \
87  do { \
88  } while (0)
89 #endif
90 
91 #if defined(__linux__) && defined(PERF_EVENT_IOC_ID) && defined(PERF_COUNT_HW_REF_CPU_CYCLES) && defined(PERF_FLAG_FD_CLOEXEC) && \
92  !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
93 // only enable perf counters on kernel 3.14 which seems to have all the necessary defines. The three PERF_... defines are not in
94 // kernel 2.6.32 (all others are).
95 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
96 #else
97 # define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
98 #endif
99 
100 #if defined(__clang__)
101 # define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
102 #else
103 # define ANKERL_NANOBENCH_NO_SANITIZE(...)
104 #endif
105 
106 #if defined(_MSC_VER)
107 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
108 #else
109 # define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
110 #endif
111 
112 // workaround missing "is_trivially_copyable" in g++ < 5.0
113 // See https://stackoverflow.com/a/31798726/48181
114 #if defined(__GNUC__) && __GNUC__ < 5
115 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
116 #else
117 # define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
118 #endif
119 
120 // declarations ///////////////////////////////////////////////////////////////////////////////////
121 
122 namespace ankerl {
123 namespace nanobench {
124 
125 using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
126  std::chrono::steady_clock>::type;
127 class Bench;
128 struct Config;
129 class Result;
130 class Rng;
131 class BigO;
132 
281 void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
282 void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
283 
292 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
293 void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
294 
295 // Contains mustache-like templates
296 namespace templates {
297 
307 char const* csv() noexcept;
308 
319 char const* htmlBoxplot() noexcept;
320 
327 char const* pyperf() noexcept;
328 
338 char const* json() noexcept;
339 
340 } // namespace templates
341 
342 namespace detail {
343 
344 template <typename T>
346 
347 class IterationLogic;
348 class PerformanceCounters;
349 
350 #if ANKERL_NANOBENCH(PERF_COUNTERS)
351 class LinuxPerformanceCounters;
352 #endif
353 
354 } // namespace detail
355 } // namespace nanobench
356 } // namespace ankerl
357 
358 // definitions ////////////////////////////////////////////////////////////////////////////////////
359 
360 namespace ankerl {
361 namespace nanobench {
362 namespace detail {
363 
364 template <typename T>
365 struct PerfCountSet {
372 };
373 
374 } // namespace detail
375 
376 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Plain data holder for everything that configures a benchmark run. Filled in
// through the fluent setters on Bench and carried inside each Result so renders
// can reproduce the configuration.
struct Config {
    // actual benchmark config
    std::string mBenchmarkTitle = "benchmark"; // heading for the whole result table
    std::string mBenchmarkName = "noname";     // name of the individual measurement
    std::string mUnit = "op";                  // what one iteration processes, e.g. "op" or "byte"
    double mBatch = 1.0;                       // number of units processed per iteration
    double mComplexityN = -1.0;                // problem size for big-O fitting; negative = unset
    size_t mNumEpochs = 11;                    // number of independent measurements taken
    size_t mClockResolutionMultiple = static_cast<size_t>(1000); // target epoch time as a multiple of the clock's resolution
    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // upper bound for a single epoch
    std::chrono::nanoseconds mMinEpochTime{};  // lower bound for a single epoch; 0 = no minimum
    uint64_t mMinEpochIterations{1};           // run at least this many iterations per epoch
    uint64_t mEpochIterations{0}; // If not 0, run *exactly* these number of iterations per epoch.
    uint64_t mWarmup = 0;         // warmup iterations -- see Bench::warmup()
    std::ostream* mOut = nullptr; // render target; nullptr selects the default output
    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // unit used to display measured times
    std::string mTimeUnitName = "ns";          // label matching mTimeUnit
    bool mShowPerformanceCounters = true;      // show perf counter columns when available
    bool mIsRelative = false;                  // show results relative to a baseline

    // Special members are declared here and defined out of line (in the
    // implementation part), keeping template bloat out of user code.
    Config();
    ~Config();
    Config& operator=(Config const&);
    Config& operator=(Config&&);
    Config(Config const&);
    Config(Config&&) noexcept;
};
404 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
405 
406 // Result returned after a benchmark has finished. Can be used as a baseline for relative().
407 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class Result {
public:
    // Everything that can be measured per epoch. Used as an index into the
    // per-measure data vectors.
    enum class Measure : size_t {
        elapsed,
        iterations,
        pagefaults,
        cpucycles,
        contextswitches,
        instructions,
        branchinstructions,
        branchmisses,
        _size // number of measures; not itself a measure
    };

    // Captures the configuration the benchmark ran with.
    explicit Result(Config const& benchmarkConfig);

    ~Result();
    Result& operator=(Result const&);
    Result& operator=(Result&&);
    Result(Result const&);
    Result(Result&&) noexcept;

    // adds new measurement results
    // all values are scaled by iters (except iters...)
    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);

    // configuration this result was produced with
    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;

    // statistics over all stored epochs for one measure
    ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;

    // has(): whether any data was recorded for that measure (perf counters
    // may be unavailable); get(): raw value of one epoch.
    ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
    ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;

    // Finds string, if not found, returns _size.
    static Measure fromString(std::string const& str);

private:
    Config mConfig{};
    // indexed by Measure; inner vector holds one value per epoch
    std::vector<std::vector<double>> mNameToMeasurements{};
};
456 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
457 
458 
// Extremely fast random number generator (multiply + rotate state update, see
// operator() below -- matches the RomuDuoJr design). Not cryptographically
// secure. Satisfies UniformRandomBitGenerator so it works with <random> APIs.
class Rng final {
public:
    // UniformRandomBitGenerator interface
    using result_type = uint64_t;

    // Parenthesized so a min()/max() macro (e.g. from windows.h) can't break them.
    static constexpr uint64_t(min)();
    static constexpr uint64_t(max)();

    // Copying is disabled: silently duplicating generator state is almost
    // always a bug. Use copy() when a duplicate is really intended.
    Rng(Rng const&) = delete;

    Rng& operator=(Rng const&) = delete;

    // moving is ok
    Rng(Rng&&) noexcept = default;
    Rng& operator=(Rng&&) noexcept = default;
    ~Rng() noexcept = default;

    // Default construction; seeding strategy is defined in the implementation part.
    Rng();

    // Deterministic seeding; the two-argument form sets the state words directly,
    // the vector form presumably restores a snapshot from state() -- confirm.
    explicit Rng(uint64_t seed) noexcept;
    Rng(uint64_t x, uint64_t y) noexcept;
    Rng(std::vector<uint64_t> const& data);

    // Explicit duplicate of this generator, including its current state.
    ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;

    // Next 64 random bits.
    inline uint64_t operator()() noexcept;

    // This is slightly biased. See Lemire's "Fast Random Integer Generation in
    // an Interval" (multiply-shift, no division) for the technique used.
    inline uint32_t bounded(uint32_t range) noexcept;

    // random double in range [0, 1)
    // see http://prng.di.unimi.it/
    inline double uniform01() noexcept;

    // Fisher-Yates shuffle of a random-access container.
    template <typename Container>
    void shuffle(Container& container) noexcept;

    // Snapshot of the generator state.
    std::vector<uint64_t> state() const;

private:
    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;

    // the two 64 bit words of generator state
    uint64_t mX;
    uint64_t mY;
};
598 
613 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Main benchmarking class. Configure with the fluent setters (each returns
// *this so calls can be chained), then call one of the run() overloads.
class Bench {
public:
    // Creates a benchmark with the default configuration.
    Bench();

    Bench(Bench&& other);
    Bench& operator=(Bench&& other);
    Bench(Bench const& other);
    Bench& operator=(Bench const& other);
    ~Bench() noexcept;

    // Sets the benchmark name, then repeatedly measures op(); the outcome is
    // appended to results(). NOINLINE keeps run() itself out of the caller so
    // the compiler can focus on inlining op().
    template <typename Op>
    ANKERL_NANOBENCH(NOINLINE)
    Bench& run(char const* benchmarkName, Op&& op);

    template <typename Op>
    ANKERL_NANOBENCH(NOINLINE)
    Bench& run(std::string const& benchmarkName, Op&& op);

    // Measures op() under the currently set name().
    template <typename Op>
    ANKERL_NANOBENCH(NOINLINE)
    Bench& run(Op&& op);

    // Title shown for the whole set of benchmarks.
    Bench& title(char const* benchmarkTitle);
    Bench& title(std::string const& benchmarkTitle);
    ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;

    // Name of the next benchmark run.
    Bench& name(char const* benchmarkName);
    Bench& name(std::string const& benchmarkName);
    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;

    // Number of units processed per iteration (cast to double), e.g. bytes.
    template <typename T>
    Bench& batch(T b) noexcept;
    ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;

    // Label of one processed unit, e.g. "byte".
    Bench& unit(char const* unit);
    Bench& unit(std::string const& unit);
    ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;

    // Time unit used when displaying results, with its label.
    Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
    ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;

    // Output stream for rendered results; see Config::mOut.
    Bench& output(std::ostream* outstream) noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;

    // Target epoch time expressed as a multiple of the measured clock resolution.
    Bench& clockResolutionMultiple(size_t multiple) noexcept;
    ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;

    // Number of epochs (independent measurements) per run.
    Bench& epochs(size_t numEpochs) noexcept;
    ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;

    // Upper / lower bounds on the duration of a single epoch.
    Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;

    Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;

    // Minimum / exact iteration counts per epoch (exact wins when non-zero).
    Bench& minEpochIterations(uint64_t numIters) noexcept;
    ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;

    Bench& epochIterations(uint64_t numIters) noexcept;
    ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;

    // Iterations executed before measurement starts.
    Bench& warmup(uint64_t numWarmupIters) noexcept;
    ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;

    // Show results relative to a baseline (see Config::mIsRelative).
    Bench& relative(bool isRelativeEnabled) noexcept;
    ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;

    // Toggle the hardware performance counter columns.
    Bench& performanceCounters(bool showPerformanceCounters) noexcept;
    ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;

    // All results gathered so far, one entry per run().
    ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;

    // Chainable variant of ankerl::nanobench::doNotOptimizeAway().
    template <typename Arg>
    Bench& doNotOptimizeAway(Arg&& arg);

    // Problem size of the next run, used for asymptotic-complexity fitting.
    template <typename T>
    Bench& complexityN(T b) noexcept;
    ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;

    // Fits all gathered (complexityN, time) pairs against standard complexity
    // classes; the templated overloads fit one custom class described by op.
    std::vector<BigO> complexityBigO() const;

    template <typename Op>
    BigO complexityBigO(char const* name, Op op) const;

    template <typename Op>
    BigO complexityBigO(std::string const& name, Op op) const;

    // Renders all results through a mustache-like template (see templates::).
    Bench& render(char const* templateContent, std::ostream& os);
    Bench& render(std::string const& templateContent, std::ostream& os);

    // Direct access to the full configuration.
    Bench& config(Config const& benchmarkConfig);
    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;

private:
    Config mConfig{};
    std::vector<Result> mResults{};
};
972 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
973 
974 
980 template <typename Arg>
981 void doNotOptimizeAway(Arg&& arg);
982 
983 namespace detail {
984 
985 #if defined(_MSC_VER)
986 void doNotOptimizeAwaySink(void const*);
987 
988 template <typename T>
989 void doNotOptimizeAway(T const& val);
990 
991 #else
992 
993 // These assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but
994 // this seemd to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyways.
995 // see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
// Forces the compiler to consider `val` as used: the empty asm statement takes
// it as an input ("r,m": register or memory) and declares memory clobbered, so
// the computation producing `val` cannot be eliminated as dead code.
template <typename T>
void doNotOptimizeAway(T const& val) {
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : : "r,m"(val) : "memory");
}
1001 
// Mutable-reference variant: `val` is an in/out operand ("+"), so the compiler
// must additionally assume the asm may have modified it. The only difference
// between the branches is the order of the "r"/"m" constraint alternatives,
// which clang and gcc prefer differently.
template <typename T>
void doNotOptimizeAway(T& val) {
# if defined(__clang__)
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+r,m"(val) : : "memory");
# else
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+m,r"(val) : : "memory");
# endif
}
1012 #endif
1013 
1014 // internally used, but visible because run() is templated.
1015 // Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
1016 // having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
1017 ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
1019 public:
1020  explicit IterationLogic(Bench const& config) noexcept;
1021  ~IterationLogic();
1022 
1023  ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
1024  void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
1025  void moveResultTo(std::vector<Result>& results) noexcept;
1026 
1027 private:
1028  struct Impl;
1029  Impl* mPimpl;
1030 };
1031 ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1032 
1033 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1035 public:
1036  PerformanceCounters(PerformanceCounters const&) = delete;
1037  PerformanceCounters& operator=(PerformanceCounters const&) = delete;
1038 
1041 
1042  void beginMeasure();
1043  void endMeasure();
1044  void updateResults(uint64_t numIters);
1045 
1046  ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1047  ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1048 
1049 private:
1050 #if ANKERL_NANOBENCH(PERF_COUNTERS)
1051  LinuxPerformanceCounters* mPc = nullptr;
1052 #endif
1055 };
1056 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1057 
1058 // Gets the singleton
1059 PerformanceCounters& performanceCounters();
1060 
1061 } // namespace detail
1062 
1063 class BigO {
1064 public:
1065  using RangeMeasure = std::vector<std::pair<double, double>>;
1066 
1067  template <typename Op>
1069  for (auto& rangeMeasure : data) {
1070  rangeMeasure.first = op(rangeMeasure.first);
1071  }
1072  return data;
1073  }
1074 
1075  static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1076 
1077  template <typename Op>
1078  BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1079  : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1080 
1081  template <typename Op>
1082  BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1083  : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1084 
1085  BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1086  BigO(std::string const& bigOName, RangeMeasure const& scaledRangeMeasure);
1087  ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1088  ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1089  ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1090  ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1091 
1092 private:
1093  std::string mName{};
1094  double mConstant{};
1096 };
1097 std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1098 std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1099 
1100 } // namespace nanobench
1101 } // namespace ankerl
1102 
1103 // implementation /////////////////////////////////////////////////////////////////////////////////
1104 
1105 namespace ankerl {
1106 namespace nanobench {
1107 
// Lowest value operator() can return (UniformRandomBitGenerator requirement).
// Parenthesized so a min() macro (e.g. from windows.h) cannot expand here.
constexpr uint64_t(Rng::min)() {
    return 0;
}
1111 
// Highest value operator() can return: the full 64 bit range is produced.
// NOTE(review): uses std::numeric_limits but <limits> is not among this
// header's visible includes -- relies on a transitive include; confirm.
constexpr uint64_t(Rng::max)() {
    return (std::numeric_limits<uint64_t>::max)();
}
1115 
// Computes the next pseudo-random number. The state update (one 64 bit
// multiply, one subtract, one rotate) matches the RomuDuoJr generator; the
// arithmetic wraps around intentionally, hence the NO_SANITIZE attribute.
ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
uint64_t Rng::operator()() noexcept {
    auto x = mX; // the value returned is the *previous* mX

    mX = UINT64_C(15241094284759029579) * mY;
    mY = rotl(mY - x, 27);

    return x;
}
1125 
1126 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1127 uint32_t Rng::bounded(uint32_t range) noexcept {
1128  uint64_t r32 = static_cast<uint32_t>(operator()());
1129  auto multiresult = r32 * range;
1130  return static_cast<uint32_t>(multiresult >> 32U);
1131 }
1132 
// Random double in [0, 1): builds an IEEE-754 double in [1, 2) by fixing the
// exponent bits to 0x3ff and filling the 52 bit mantissa with random bits
// (top 52 of a fresh draw), then subtracts 1.0.
double Rng::uniform01() noexcept {
    auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
    // can't use union in c++ here for type puning, it's undefined behavior.
    // std::memcpy is optimized anyways.
    double d;
    std::memcpy(&d, &i, sizeof(double));
    return d - 1.0;
}
1141 
1142 template <typename Container>
1143 void Rng::shuffle(Container& container) noexcept {
1144  auto size = static_cast<uint32_t>(container.size());
1145  for (auto i = size; i > 1U; --i) {
1146  using std::swap;
1147  auto p = bounded(i); // number in [0, i)
1148  swap(container[i - 1], container[p]);
1149  }
1150 }
1151 
// Rotates x left by k bits. NOTE(review): k == 0 (or >= 64) would shift by 64,
// which is undefined behavior -- the only call visible in this file uses
// k == 27, so this is fine in practice; confirm no other callers exist.
ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
    return (x << k) | (x >> (64U - k));
}
1156 
// Performs the actual benchmark: IterationLogic decides how many times op()
// runs in each epoch and when enough epochs have been measured.
template <typename Op>
Bench& Bench::run(Op&& op) {
    // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()
    detail::IterationLogic iterationLogic(*this);
    auto& pc = detail::performanceCounters(); // process-wide singleton

    while (auto n = iterationLogic.numIters()) { // numIters() == 0 means measurement is finished
        pc.beginMeasure();
        Clock::time_point before = Clock::now();
        while (n-- > 0) {
            op();
        }
        Clock::time_point after = Clock::now();
        pc.endMeasure();
        // numIters() is not changed until add() below, so this scales the
        // counters by the iteration count that was just executed
        pc.updateResults(iterationLogic.numIters());
        iterationLogic.add(after - before, pc);
    }
    iterationLogic.moveResultTo(mResults);
    return *this;
}
1178 
1179 // Performs all evaluations.
1180 template <typename Op>
1181 Bench& Bench::run(char const* benchmarkName, Op&& op) {
1182  name(benchmarkName);
1183  return run(std::forward<Op>(op));
1184 }
1185 
1186 template <typename Op>
1187 Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1188  name(benchmarkName);
1189  return run(std::forward<Op>(op));
1190 }
1191 
1192 template <typename Op>
1193 BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1194  return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1195 }
1196 
1197 template <typename Op>
1198 BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1199  return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1200 }
1201 
// Set the batch size, e.g. number of processed bytes, or some other metric for
// the size of the processed data in each iteration. Any argument is cast to
// double.
template <typename T>
Bench& Bench::batch(T b) noexcept {
    mConfig.mBatch = static_cast<double>(b);
    return *this;
}
1209 
// Sets the computation complexity (problem size) of the next run, used by
// complexityBigO() for fitting. Any argument is cast to double.
template <typename T>
Bench& Bench::complexityN(T n) noexcept {
    mConfig.mComplexityN = static_cast<double>(n);
    return *this;
}
1216 
1217 // Convenience: makes sure none of the given arguments are optimized away by the compiler.
1218 template <typename Arg>
1220  detail::doNotOptimizeAway(std::forward<Arg>(arg));
1221  return *this;
1222 }
1223 
// Makes sure none of the given arguments are optimized away by the compiler.
// Free-function convenience wrapper around detail::doNotOptimizeAway.
template <typename Arg>
void doNotOptimizeAway(Arg&& arg) {
    detail::doNotOptimizeAway(std::forward<Arg>(arg));
}
1229 
1230 namespace detail {
1231 
1232 #if defined(_MSC_VER)
// MSVC has no inline assembly: pass the address to an opaque sink function
// (defined in the implementation part) so the optimizer must materialize val.
template <typename T>
void doNotOptimizeAway(T const& val) {
    doNotOptimizeAwaySink(&val);
}
1237 
1238 #endif
1239 
1240 } // namespace detail
1241 } // namespace nanobench
1242 } // namespace ankerl
1243 
1244 #if defined(ANKERL_NANOBENCH_IMPLEMENT)
1245 
1247 // implementation part - only visible in .cpp
1249 
1250 # include <algorithm> // sort, reverse
1251 # include <atomic> // compare_exchange_strong in loop overhead
1252 # include <cstdlib> // getenv
1253 # include <cstring> // strstr, strncmp
1254 # include <fstream> // ifstream to parse proc files
1255 # include <iomanip> // setw, setprecision
1256 # include <iostream> // cout
1257 # include <numeric> // accumulate
1258 # include <random> // random_device
1259 # include <sstream> // to_s in Number
1260 # include <stdexcept> // throw for rendering templates
1261 # include <tuple> // std::tie
1262 # if defined(__linux__)
1263 # include <unistd.h> //sysconf
1264 # endif
1265 # if ANKERL_NANOBENCH(PERF_COUNTERS)
1266 # include <map> // map
1267 
1268 # include <linux/perf_event.h>
1269 # include <sys/ioctl.h>
1270 # include <sys/syscall.h>
1271 # include <unistd.h>
1272 # endif
1273 
1274 // declarations ///////////////////////////////////////////////////////////////////////////////////
1275 
1276 namespace ankerl {
1277 namespace nanobench {
1278 
1279 // helper stuff that is only intended to be used internally
1280 namespace detail {
1281 
1282 struct TableInfo;
1283 
1284 // formatting utilities
1285 namespace fmt {
1286 
1287 class NumSep;
1288 class StreamStateRestorer;
1289 class Number;
1290 class MarkDownColumn;
1291 class MarkDownCode;
1292 
1293 } // namespace fmt
1294 } // namespace detail
1295 } // namespace nanobench
1296 } // namespace ankerl
1297 
1298 // definitions ////////////////////////////////////////////////////////////////////////////////////
1299 
1300 namespace ankerl {
1301 namespace nanobench {
1302 
1303 uint64_t splitMix64(uint64_t& state) noexcept;
1304 
1305 namespace detail {
1306 
1307 // helpers to get double values
// Shorthand conversion to double; avoids sprinkling static_cast through the
// mixed integer/floating point statistics code below.
template <typename T>
inline double d(T t) noexcept {
    return static_cast<double>(t);
}
// Overload for durations: converts to fractional seconds
// (duration<double>'s default period is ratio<1>, i.e. seconds).
inline double d(Clock::duration duration) noexcept {
    return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
}
1315 
1316 // Calculates clock resolution once, and remembers the result
1317 inline Clock::duration clockResolution() noexcept;
1318 
1319 } // namespace detail
1320 
1321 namespace templates {
1322 
// Mustache-like template that renders all results as a semicolon separated
// CSV table: one header line plus one line per result.
char const* csv() noexcept {
    return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
{{/result}})DELIM";
}
1328 
// Template for a self-contained HTML page that renders one plotly.js box plot
// per result, using the raw per-epoch elapsed times.
// NOTE(review): the raw string below is reproduced from a whitespace-mangled
// extraction -- verify indentation against the upstream file before applying.
char const* htmlBoxplot() noexcept {
    return R"DELIM(<html>

<head>
 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>

<body>
 <div id="myDiv"></div>
 <script>
 var data = [
 {{#result}}{
 name: '{{name}}',
 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
 },
 {{/result}}
 ];
 var title = '{{title}}';

 data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
 var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
 </script>
</body>

</html>)DELIM";
}
1355 
// Template producing output in the pyperf JSON format (elapsed times only),
// so results can be post-processed with the pyperf tool suite.
// NOTE(review): raw string reproduced from a whitespace-mangled extraction --
// verify indentation against the upstream file before applying.
char const* pyperf() noexcept {
    return R"DELIM({
 "benchmarks": [
 {
 "runs": [
 {
 "values": [
{{#measurement}} {{elapsed}}{{^-last}},
{{/last}}{{/measurement}}
 ]
 }
 ]
 }
 ],
 "metadata": {
 "loops": {{sum(iterations)}},
 "inner_loops": {{batch}},
 "name": "{{title}}",
 "unit": "second"
 },
 "version": "1.0"
})DELIM";
}
1379 
// Template that dumps everything as JSON: full configuration, summary
// statistics, and every raw per-epoch measurement of every result.
// NOTE(review): raw string reproduced from a whitespace-mangled extraction --
// verify indentation against the upstream file before applying.
char const* json() noexcept {
    return R"DELIM({
 "results": [
{{#result}} {
 "title": "{{title}}",
 "name": "{{name}}",
 "unit": "{{unit}}",
 "batch": {{batch}},
 "complexityN": {{complexityN}},
 "epochs": {{epochs}},
 "clockResolution": {{clockResolution}},
 "clockResolutionMultiple": {{clockResolutionMultiple}},
 "maxEpochTime": {{maxEpochTime}},
 "minEpochTime": {{minEpochTime}},
 "minEpochIterations": {{minEpochIterations}},
 "epochIterations": {{epochIterations}},
 "warmup": {{warmup}},
 "relative": {{relative}},
 "median(elapsed)": {{median(elapsed)}},
 "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
 "median(instructions)": {{median(instructions)}},
 "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
 "median(cpucycles)": {{median(cpucycles)}},
 "median(contextswitches)": {{median(contextswitches)}},
 "median(pagefaults)": {{median(pagefaults)}},
 "median(branchinstructions)": {{median(branchinstructions)}},
 "median(branchmisses)": {{median(branchmisses)}},
 "totalTime": {{sumProduct(iterations, elapsed)}},
 "measurements": [
{{#measurement}} {
 "iterations": {{iterations}},
 "elapsed": {{elapsed}},
 "pagefaults": {{pagefaults}},
 "cpucycles": {{cpucycles}},
 "contextswitches": {{contextswitches}},
 "instructions": {{instructions}},
 "branchinstructions": {{branchinstructions}},
 "branchmisses": {{branchmisses}}
 }{{^-last}},{{/-last}}
{{/measurement}} ]
 }{{^-last}},{{/-last}}
{{/result}} ]
})DELIM";
}
1424 
1425 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// One node of the parsed mustache-like template tree. begin/end point into the
// original template string (no copies are made), so the template must outlive
// the nodes.
struct Node {
    enum class Type { tag, content, section, inverted_section };

    char const* begin;          // first character of this node's text
    char const* end;            // one past the last character
    std::vector<Node> children; // only populated for (inverted_)section nodes
    Type type;

    // Compares this node's text against a string literal, e.g. n == "-last".
    template <size_t N>
    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
    bool operator==(char const (&str)[N]) const noexcept {
        // N includes the literal's terminating '\0', so the text length must be N-1
        return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
    }
};
1440 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1441 
// Recursively parses a mustache-like template into a flat list of nodes.
// *tpl is advanced past everything consumed; {{#section}} and {{^inverted}}
// recurse, and the recursion returns at the matching {{/...}} closing tag.
static std::vector<Node> parseMustacheTemplate(char const** tpl) {
    std::vector<Node> nodes;

    while (true) {
        // locate the next "{{ ... }}" tag, if any
        auto begin = std::strstr(*tpl, "{{");
        auto end = begin;
        if (begin != nullptr) {
            begin += 2;
            end = std::strstr(begin, "}}");
        }

        if (begin == nullptr || end == nullptr) {
            // nothing found, finish node
            nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
            return nodes;
        }

        // plain text before the tag becomes a content node (possibly empty)
        nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});

        // we found a tag
        *tpl = end + 2;
        switch (*begin) {
        case '/':
            // finished! bail out
            return nodes;

        case '#':
            // section: children parsed recursively up to the closing tag
            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
            break;

        case '^':
            // inverted section: same nesting rules as a normal section
            nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
            break;

        default:
            // plain {{tag}}: no children
            nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
            break;
        }
    }
}
1482 
1483 static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1484  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1485  bool matchFirst = n == "-first";
1486  bool matchLast = n == "-last";
1487  if (!matchFirst && !matchLast) {
1488  return false;
1489  }
1490 
1491  bool doWrite = false;
1492  if (n.type == Node::Type::section) {
1493  doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1494  } else if (n.type == Node::Type::inverted_section) {
1495  doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1496  }
1497 
1498  if (doWrite) {
1499  for (auto const& child : n.children) {
1500  if (child.type == Node::Type::content) {
1501  out.write(child.begin, std::distance(child.begin, child.end));
1502  }
1503  }
1504  }
1505  return true;
1506 }
1507 
// Splits a command string like "sumProduct(iterations, elapsed)" into the
// command name followed by its comma separated arguments (whitespace
// stripped). Returns false when no closing ')' follows the '('; a bare "f()"
// yields {"f", ""}.
static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
    matchResult.clear();
    auto const openPos = str.find('(');
    auto const closePos = str.find(')', openPos); // npos when openPos is npos, too
    if (closePos == std::string::npos) {
        return false;
    }

    // everything before '(' is the command name
    matchResult.emplace_back(str.substr(0, openPos));

    // split the argument list by commas, dropping spaces and tabs
    matchResult.emplace_back(std::string{});
    for (auto i = openPos + 1; i != closePos; ++i) {
        auto const c = str[i];
        if (c == ',') {
            // start the next argument
            matchResult.emplace_back(std::string{});
        } else if (c != ' ' && c != '\t') {
            matchResult.back() += c;
        }
    }
    return true;
}
1535 
1536 static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1537  using detail::d;
1538 
1539  if (n == "title") {
1540  out << config.mBenchmarkTitle;
1541  return true;
1542  } else if (n == "name") {
1543  out << config.mBenchmarkName;
1544  return true;
1545  } else if (n == "unit") {
1546  out << config.mUnit;
1547  return true;
1548  } else if (n == "batch") {
1549  out << config.mBatch;
1550  return true;
1551  } else if (n == "complexityN") {
1552  out << config.mComplexityN;
1553  return true;
1554  } else if (n == "epochs") {
1555  out << config.mNumEpochs;
1556  return true;
1557  } else if (n == "clockResolution") {
1558  out << d(detail::clockResolution());
1559  return true;
1560  } else if (n == "clockResolutionMultiple") {
1561  out << config.mClockResolutionMultiple;
1562  return true;
1563  } else if (n == "maxEpochTime") {
1564  out << d(config.mMaxEpochTime);
1565  return true;
1566  } else if (n == "minEpochTime") {
1567  out << d(config.mMinEpochTime);
1568  return true;
1569  } else if (n == "minEpochIterations") {
1570  out << config.mMinEpochIterations;
1571  return true;
1572  } else if (n == "epochIterations") {
1573  out << config.mEpochIterations;
1574  return true;
1575  } else if (n == "warmup") {
1576  out << config.mWarmup;
1577  return true;
1578  } else if (n == "relative") {
1579  out << config.mIsRelative;
1580  return true;
1581  }
1582  return false;
1583 }
1584 
1585 static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1586  if (generateConfigTag(n, r.config(), out)) {
1587  return out;
1588  }
1589  // match e.g. "median(elapsed)"
1590  // g++ 4.8 doesn't implement std::regex :(
1591  // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$");
1592  // std::cmatch matchResult;
1593  // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1594  std::vector<std::string> matchResult;
1595  if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1596  if (matchResult.size() == 2) {
1597  auto m = Result::fromString(matchResult[1]);
1598  if (m == Result::Measure::_size) {
1599  return out << 0.0;
1600  }
1601 
1602  if (matchResult[0] == "median") {
1603  return out << r.median(m);
1604  }
1605  if (matchResult[0] == "average") {
1606  return out << r.average(m);
1607  }
1608  if (matchResult[0] == "medianAbsolutePercentError") {
1609  return out << r.medianAbsolutePercentError(m);
1610  }
1611  if (matchResult[0] == "sum") {
1612  return out << r.sum(m);
1613  }
1614  if (matchResult[0] == "minimum") {
1615  return out << r.minimum(m);
1616  }
1617  if (matchResult[0] == "maximum") {
1618  return out << r.maximum(m);
1619  }
1620  } else if (matchResult.size() == 3) {
1621  auto m1 = Result::fromString(matchResult[1]);
1622  auto m2 = Result::fromString(matchResult[2]);
1623  if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
1624  return out << 0.0;
1625  }
1626 
1627  if (matchResult[0] == "sumProduct") {
1628  return out << r.sumProduct(m1, m2);
1629  }
1630  }
1631  }
1632 
1633  // match e.g. "sumProduct(elapsed, iterations)"
1634  // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$");
1635 
1636  // nothing matches :(
1637  throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1638 }
1639 
1640 static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1641  for (auto const& n : nodes) {
1642  if (!generateFirstLast(n, idx, r.size(), out)) {
1643  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1644  switch (n.type) {
1645  case Node::Type::content:
1646  out.write(n.begin, std::distance(n.begin, n.end));
1647  break;
1648 
1649  case Node::Type::inverted_section:
1650  throw std::runtime_error("got a inverted section inside measurement");
1651 
1652  case Node::Type::section:
1653  throw std::runtime_error("got a section inside measurement");
1654 
1655  case Node::Type::tag: {
1656  auto m = Result::fromString(std::string(n.begin, n.end));
1657  if (m == Result::Measure::_size || !r.has(m)) {
1658  out << 0.0;
1659  } else {
1660  out << r.get(idx, m);
1661  }
1662  break;
1663  }
1664  }
1665  }
1666  }
1667 }
1668 
1669 static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1670  auto const& r = results[idx];
1671  for (auto const& n : nodes) {
1672  if (!generateFirstLast(n, idx, results.size(), out)) {
1673  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1674  switch (n.type) {
1675  case Node::Type::content:
1676  out.write(n.begin, std::distance(n.begin, n.end));
1677  break;
1678 
1679  case Node::Type::inverted_section:
1680  throw std::runtime_error("got a inverted section inside result");
1681 
1682  case Node::Type::section:
1683  if (n == "measurement") {
1684  for (size_t i = 0; i < r.size(); ++i) {
1685  generateResultMeasurement(n.children, i, r, out);
1686  }
1687  } else {
1688  throw std::runtime_error("got a section inside result");
1689  }
1690  break;
1691 
1692  case Node::Type::tag:
1693  generateResultTag(n, r, out);
1694  break;
1695  }
1696  }
1697  }
1698 }
1699 
1700 } // namespace templates
1701 
// helper stuff that is only intended to be used internally
namespace detail {

// Reads the environment variable `name`; nullptr when it is not set.
char const* getEnv(char const* name);
// True when the NANOBENCH_ENDLESS environment variable equals `name`.
bool isEndlessRunning(std::string const& name);
// True unless warnings are suppressed via NANOBENCH_SUPPRESS_WARNINGS.
bool isWarningsEnabled();

// Reads a single whitespace-delimited value of type T from the given file;
// returns a value-initialized T when the file cannot be read.
template <typename T>
T parseFile(std::string const& filename);

// Collects warnings about conditions that make benchmark results unstable
// (e.g. DEBUG build, CPU frequency scaling) plus matching recommendations.
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
// Prints the gathered stability warnings to *os, but only on the first call.
void printStabilityInformationOnce(std::ostream* os);

// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
uint64_t& singletonHeaderHash() noexcept;

// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
Clock::duration calcClockResolution(size_t numEvaluations) noexcept;

// formatting utilities
namespace fmt {

// adds thousands separator to numbers
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class NumSep : public std::numpunct<char> {
public:
    explicit NumSep(char sep);
    char do_thousands_sep() const override;
    std::string do_grouping() const override;

private:
    char mSep; // the separator character to insert, e.g. ','
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)

// RAII to save & restore a stream's state
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class StreamStateRestorer {
public:
    explicit StreamStateRestorer(std::ostream& s);
    ~StreamStateRestorer();

    // sets back all stream info that we remembered at construction
    void restore();

    // don't allow copying / moving
    StreamStateRestorer(StreamStateRestorer const&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
    StreamStateRestorer(StreamStateRestorer&&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;

private:
    std::ostream& mStream;                  // stream whose state is restored
    std::locale mLocale;                    // remembered locale
    std::streamsize const mPrecision;       // remembered precision
    std::streamsize const mWidth;           // remembered field width
    std::ostream::char_type const mFill;    // remembered fill character
    std::ostream::fmtflags const mFmtFlags; // remembered format flags
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)

// Number formatter: renders a value with fixed width and precision.
class Number {
public:
    Number(int width, int precision, double value);
    Number(int width, int precision, int64_t value);
    std::string to_s() const;

private:
    friend std::ostream& operator<<(std::ostream& os, Number const& n);
    std::ostream& write(std::ostream& os) const;

    int mWidth;
    int mPrecision;
    double mValue;
};

// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
std::string to_s(uint64_t s);

std::ostream& operator<<(std::ostream& os, Number const& n);

// One column of the markdown result table: knows how to render its title,
// separator row, an "invalid" placeholder, and the formatted value.
class MarkDownColumn {
public:
    MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val);
    std::string title() const;
    std::string separator() const;
    std::string invalid() const;
    std::string value() const;

private:
    int mWidth;          // column width in characters
    int mPrecision;      // digits after the decimal point
    std::string mTitle;  // column heading
    std::string mSuffix; // appended to the value, e.g. "%"
    double mValue;       // the value to display
};

// Formats any text as markdown code, escaping backticks.
class MarkDownCode {
public:
    explicit MarkDownCode(std::string const& what);

private:
    friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
    std::ostream& write(std::ostream& os) const;

    std::string mWhat{};
};

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);

} // namespace fmt
} // namespace detail
1816 } // namespace nanobench
1817 } // namespace ankerl
1818 
1819 // implementation /////////////////////////////////////////////////////////////////////////////////
1820 
1821 namespace ankerl {
1822 namespace nanobench {
1823 
1824 void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1825  detail::fmt::StreamStateRestorer restorer(out);
1826 
1827  out.precision(std::numeric_limits<double>::digits10);
1828  auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1829 
1830  for (auto const& n : nodes) {
1831  ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1832  switch (n.type) {
1833  case templates::Node::Type::content:
1834  out.write(n.begin, std::distance(n.begin, n.end));
1835  break;
1836 
1837  case templates::Node::Type::inverted_section:
1838  throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
1839 
1840  case templates::Node::Type::section:
1841  if (n == "result") {
1842  const size_t nbResults = results.size();
1843  for (size_t i = 0; i < nbResults; ++i) {
1844  generateResult(n.children, i, results, out);
1845  }
1846  } else if (n == "measurement") {
1847  if (results.size() != 1) {
1848  throw std::runtime_error(
1849  "render: can only use section 'measurement' here if there is a single result, but there are " +
1850  detail::fmt::to_s(results.size()));
1851  }
1852  // when we only have a single result, we can immediately go into its measurement.
1853  auto const& r = results.front();
1854  for (size_t i = 0; i < r.size(); ++i) {
1855  generateResultMeasurement(n.children, i, r, out);
1856  }
1857  } else {
1858  throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
1859  }
1860  break;
1861 
1862  case templates::Node::Type::tag:
1863  if (results.size() == 1) {
1864  // result & config are both supported there
1865  generateResultTag(n, results.front(), out);
1866  } else {
1867  // This just uses the last result's config.
1868  if (!generateConfigTag(n, results.back().config(), out)) {
1869  throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
1870  }
1871  }
1872  break;
1873  }
1874  }
1875 }
1876 
1877 void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1878  render(mustacheTemplate.c_str(), results, out);
1879 }
1880 
1881 void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
1882  render(mustacheTemplate, bench.results(), out);
1883 }
1884 
1885 void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
1886  render(mustacheTemplate.c_str(), bench.results(), out);
1887 }
1888 
1889 namespace detail {
1890 
// Returns the process-wide PerformanceCounters singleton, created on first use.
PerformanceCounters& performanceCounters() {
#    if defined(__clang__)
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wexit-time-destructors"
#    endif
    // function-local static: thread-safe lazy construction
    static PerformanceCounters pc;
#    if defined(__clang__)
#        pragma clang diagnostic pop
#    endif
    return pc;
}
1902 
// Windows version of doNotOptimizeAway
// see https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h#L307
// see https://github.com/facebook/folly/blob/master/folly/Benchmark.h#L280
// see https://docs.microsoft.com/en-us/cpp/preprocessor/optimize
#    if defined(_MSC_VER)
#        pragma optimize("", off)
// Empty non-optimized function: passing a pointer here forces MSVC to treat
// the pointed-to value as used, so the benchmark body cannot be elided.
void doNotOptimizeAwaySink(void const*) {}
#        pragma optimize("", on)
#    endif
1912 
// Opens `filename` and stream-extracts a single value of type T from it.
// Returns a value-initialized T when the file cannot be opened or parsed.
template <typename T>
T parseFile(std::string const& filename) {
    std::ifstream in(filename);
    T value{};
    in >> value;
    return value;
}
1920 
// Thin wrapper around std::getenv; returns nullptr when `name` is not set.
char const* getEnv(char const* name) {
#    if defined(_MSC_VER)
#        pragma warning(push)
#        pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
#    endif
    return std::getenv(name);
#    if defined(_MSC_VER)
#        pragma warning(pop)
#    endif
}
1931 
1932 bool isEndlessRunning(std::string const& name) {
1933  auto endless = getEnv("NANOBENCH_ENDLESS");
1934  return nullptr != endless && endless == name;
1935 }
1936 
1937 // True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
1938 bool isWarningsEnabled() {
1939  auto suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
1940  return nullptr == suppression || suppression == std::string("0");
1941 }
1942 
// Inspects the build configuration and (on Linux) the CPU's sysfs state and
// fills `warnings` with everything that could make benchmark results unstable,
// plus `recommendations` on how to address it. Both vectors are cleared first.
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
    warnings.clear();
    recommendations.clear();

    bool recommendCheckFlags = false;

#    if defined(DEBUG)
    warnings.emplace_back("DEBUG defined");
    recommendCheckFlags = true;
#    endif

    bool recommendPyPerf = false;
#    if defined(__linux__)
    auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
    if (nprocs <= 0) {
        warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
    } else {

        // check frequency scaling
        for (long id = 0; id < nprocs; ++id) {
            auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
            auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
            auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq");
            auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq");
            if (minFreq != maxFreq) {
                // sysfs reports kHz, so /1000 yields MHz
                auto minMHz = static_cast<double>(minFreq) / 1000.0;
                auto maxMHz = static_cast<double>(maxFreq) / 1000.0;
                warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
                                      detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
                                      " MHz");
                recommendPyPerf = true;
                // one warning about scaling is enough
                break;
            }
        }

        auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor");
        if ("performance" != currentGovernor) {
            warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
            recommendPyPerf = true;
        }

        // intel_pstate/no_turbo == 0 means turbo boost is active
        if (0 == parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo")) {
            warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
            recommendPyPerf = true;
        }
    }
#    endif

    if (recommendCheckFlags) {
        recommendations.emplace_back("Make sure you compile for Release");
    }
    if (recommendPyPerf) {
        recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
    }
}
1998 
1999 void printStabilityInformationOnce(std::ostream* outStream) {
2000  static bool shouldPrint = true;
2001  if (shouldPrint && outStream && isWarningsEnabled()) {
2002  auto& os = *outStream;
2003  shouldPrint = false;
2004  std::vector<std::string> warnings;
2005  std::vector<std::string> recommendations;
2006  gatherStabilityInformation(warnings, recommendations);
2007  if (warnings.empty()) {
2008  return;
2009  }
2010 
2011  os << "Warning, results might be unstable:" << std::endl;
2012  for (auto const& w : warnings) {
2013  os << "* " << w << std::endl;
2014  }
2015 
2016  os << std::endl << "Recommendations" << std::endl;
2017  for (auto const& r : recommendations) {
2018  os << "* " << r << std::endl;
2019  }
2020  }
2021 }
2022 
// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
uint64_t& singletonHeaderHash() noexcept {
    // zero-initialized on first use; callers read and update it in place
    static uint64_t sHeaderHash{};
    return sHeaderHash;
}
2028 
2029 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2030 inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2031  return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2032 }
2033 
2034 // determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2035 Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2036  auto bestDuration = Clock::duration::max();
2037  Clock::time_point tBegin;
2038  Clock::time_point tEnd;
2039  for (size_t i = 0; i < numEvaluations; ++i) {
2040  tBegin = Clock::now();
2041  do {
2042  tEnd = Clock::now();
2043  } while (tBegin == tEnd);
2044  bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2045  }
2046  return bestDuration;
2047 }
2048 
// Calculates clock resolution once, and remembers the result
Clock::duration clockResolution() noexcept {
    // thread-safe static init: calcClockResolution runs exactly once
    static Clock::duration sResolution = calcClockResolution(20);
    return sResolution;
}
2054 
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Decides how many iterations to run per epoch and collects the measurements.
// A small state machine: starts in warmup or upscaling mode and switches to
// measuring once an epoch's runtime is close enough to the target runtime.
struct IterationLogic::Impl {
    enum class State { warmup, upscaling_runtime, measuring, endless };

    explicit Impl(Bench const& bench)
        : mBench(bench)
        , mResult(bench.config()) {
        printStabilityInformationOnce(mBench.output());

        // determine target runtime per epoch
        mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
        if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
            mTargetRuntimePerEpoch = mBench.maxEpochTime();
        }
        if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
            mTargetRuntimePerEpoch = mBench.minEpochTime();
        }

        // choose the initial state based on the benchmark's configuration
        if (isEndlessRunning(mBench.name())) {
            std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            mState = State::endless;
        } else if (0 != mBench.warmup()) {
            mNumIters = mBench.warmup();
            mState = State::warmup;
        } else if (0 != mBench.epochIterations()) {
            // exact number of iterations
            mNumIters = mBench.epochIterations();
            mState = State::measuring;
        } else {
            mNumIters = mBench.minEpochIterations();
            mState = State::upscaling_runtime;
        }
    }

    // directly calculates new iters based on elapsed&iters, and adds a 10% noise. Makes sure we don't underflow.
    ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
        auto doubleElapsed = d(elapsed);
        auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
        auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);

        auto doubleMinEpochIters = d(mBench.minEpochIterations());
        if (doubleNewIters < doubleMinEpochIters) {
            doubleNewIters = doubleMinEpochIters;
        }
        // random noise so epochs don't all run the exact same iteration count
        doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();

        // +0.5 for correct rounding when casting
        // NOLINTNEXTLINE(bugprone-incorrect-roundings)
        return static_cast<uint64_t>(doubleNewIters + 0.5);
    }

    // Grows the iteration count after an epoch that ran below target runtime.
    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
        if (elapsed * 10 < mTargetRuntimePerEpoch) {
            // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
            if (mNumIters * 10 < mNumIters) {
                // overflow :-(
                showResult("iterations overflow. Maybe your code got optimized away?");
                mNumIters = 0;
                return;
            }
            mNumIters *= 10;
        } else {
            mNumIters = calcBestNumIters(elapsed, mNumIters);
        }
    }

    // Records one epoch's measurement and advances the state machine.
    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
#    if defined(ANKERL_NANOBENCH_LOG_ENABLED)
        auto oldIters = mNumIters;
#    endif

        switch (mState) {
        case State::warmup:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if elapsed is close enough, we can skip upscaling and go right to measurements
                // still, we don't add the result to the measurements.
                mState = State::measuring;
                mNumIters = calcBestNumIters(elapsed, mNumIters);
            } else {
                // not close enough: switch to upscaling
                mState = State::upscaling_runtime;
                upscale(elapsed);
            }
            break;

        case State::upscaling_runtime:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if we are close enough, add measurement and switch to always measuring
                mState = State::measuring;
                mTotalElapsed += elapsed;
                mTotalNumIters += mNumIters;
                mResult.add(elapsed, mNumIters, pc);
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            } else {
                upscale(elapsed);
            }
            break;

        case State::measuring:
            // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
            // that fluctuation, or else we would bias the result
            mTotalElapsed += elapsed;
            mTotalNumIters += mNumIters;
            mResult.add(elapsed, mNumIters, pc);
            if (0 != mBench.epochIterations()) {
                mNumIters = mBench.epochIterations();
            } else {
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            }
            break;

        case State::endless:
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            break;
        }

        if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
            // we got all the results that we need, finish it
            showResult("");
            // mNumIters == 0 signals the caller to stop running epochs
            mNumIters = 0;
        }

        ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, static_cast<double>(elapsed.count())) << " elapsed, "
                                           << detail::fmt::Number(20, 3, static_cast<double>(mTargetRuntimePerEpoch.count()))
                                           << " target. oldIters=" << oldIters << ", mNumIters=" << mNumIters
                                           << ", mState=" << static_cast<int>(mState));
    }

    // Renders one markdown table row for the collected result, preceded by a
    // fresh header whenever the relevant table settings changed. A non-empty
    // errorMessage marks the row as invalid.
    void showResult(std::string const& errorMessage) const {
        ANKERL_NANOBENCH_LOG(errorMessage);

        if (mBench.output() != nullptr) {
            // prepare column data ///////
            std::vector<fmt::MarkDownColumn> columns;

            auto rMedian = mResult.median(Result::Measure::elapsed);

            if (mBench.relative()) {
                // relative speed in percent, compared to the first result
                double d = 100.0;
                if (!mBench.results().empty()) {
                    d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
                }
                columns.emplace_back(11, 1, "relative", "%", d);
            }

            if (mBench.complexityN() > 0) {
                columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
            }

            // time per unit and units per second
            columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
                                 rMedian / (mBench.timeUnit().count() * mBench.batch()));
            columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);

            double rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
            columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);

            // optional performance-counter columns, shown only when measured
            double rInsMedian = -1.0;
            if (mResult.has(Result::Measure::instructions)) {
                rInsMedian = mResult.median(Result::Measure::instructions);
                columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
            }

            double rCycMedian = -1.0;
            if (mResult.has(Result::Measure::cpucycles)) {
                rCycMedian = mResult.median(Result::Measure::cpucycles);
                columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
            }
            if (rInsMedian > 0.0 && rCycMedian > 0.0) {
                columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
            }
            if (mResult.has(Result::Measure::branchinstructions)) {
                double rBraMedian = mResult.median(Result::Measure::branchinstructions);
                columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
                if (mResult.has(Result::Measure::branchmisses)) {
                    double p = 0.0;
                    if (rBraMedian >= 1e-9) {
                        p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
                    }
                    columns.emplace_back(10, 1, "miss%", "%", p);
                }
            }

            columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));

            // write everything
            auto& os = *mBench.output();

            // combine all elements that are relevant for printing the header
            uint64_t hash = 0;
            hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
            hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);

            if (hash != singletonHeaderHash()) {
                singletonHeaderHash() = hash;

                // no result yet, print header
                os << std::endl;
                for (auto const& col : columns) {
                    os << col.title();
                }
                os << "| " << mBench.title() << std::endl;

                for (auto const& col : columns) {
                    os << col.separator();
                }
                os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
            }

            if (!errorMessage.empty()) {
                // error case: render "invalid" markers in every column
                for (auto const& col : columns) {
                    os << col.invalid();
                }
                os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
            } else {
                for (auto const& col : columns) {
                    os << col.value();
                }
                os << "| ";
                // flag results whose median absolute percent error is >= 5%
                auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
                if (showUnstable) {
                    os << ":wavy_dash: ";
                }
                os << fmt::MarkDownCode(mBench.name());
                if (showUnstable) {
                    auto avgIters = static_cast<double>(mTotalNumIters) / static_cast<double>(mBench.epochs());
                    // NOLINTNEXTLINE(bugprone-incorrect-roundings)
                    auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);

                    os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
                       << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
                }
                os << std::endl;
            }
        }
    }

    // An epoch counts as "close enough" once it ran at least 2/3 of the target runtime.
    ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
        return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
    }

    uint64_t mNumIters = 1;                            // iterations for the next epoch; 0 signals "done"
    Bench const& mBench;                               // the benchmark's configuration
    std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // how long a single epoch should take
    Result mResult;                                    // collected measurements
    Rng mRng{123};                                     // deterministic noise source for iteration counts
    std::chrono::nanoseconds mTotalElapsed{};          // summed runtime of all measured epochs
    uint64_t mTotalNumIters = 0;                       // summed iterations of all measured epochs

    State mState = State::upscaling_runtime;           // current phase of the state machine
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2311 
// Creates the pimpl; the raw pointer is owned and freed in the destructor.
IterationLogic::IterationLogic(Bench const& bench) noexcept
    : mPimpl(new Impl(bench)) {}
2314 
2315 IterationLogic::~IterationLogic() {
2316  if (mPimpl) {
2317  delete mPimpl;
2318  }
2319 }
2320 
// Number of iterations the benchmark should run in the next epoch; 0 means stop.
uint64_t IterationLogic::numIters() const noexcept {
    ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
    return mPimpl->mNumIters;
}
2325 
// Forwards one epoch's elapsed time and performance counters to the pimpl.
void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
    mPimpl->add(elapsed, pc);
}
2329 
// Moves the collected result into `results`; the pimpl's result is left in a
// moved-from state afterwards.
void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
    results.emplace_back(std::move(mPimpl->mResult));
}
2333 
2334 # if ANKERL_NANOBENCH(PERF_COUNTERS)
2335 
2336 ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2337 class LinuxPerformanceCounters {
2338 public:
2339  struct Target {
2340  Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
2341  : targetValue(targetValue_)
2342  , correctMeasuringOverhead(correctMeasuringOverhead_)
2343  , correctLoopOverhead(correctLoopOverhead_) {}
2344 
2345  uint64_t* targetValue{};
2346  bool correctMeasuringOverhead{};
2347  bool correctLoopOverhead{};
2348  };
2349 
2350  ~LinuxPerformanceCounters();
2351 
2352  // quick operation
2353  inline void start() {}
2354 
2355  inline void stop() {}
2356 
2357  bool monitor(perf_sw_ids swId, Target target);
2358  bool monitor(perf_hw_id hwId, Target target);
2359 
2360  bool hasError() const noexcept {
2361  return mHasError;
2362  }
2363 
2364  // Just reading data is faster than enable & disabling.
2365  // we subtract data ourselves.
2366  inline void beginMeasure() {
2367  if (mHasError) {
2368  return;
2369  }
2370 
2371  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2372  mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2373  if (mHasError) {
2374  return;
2375  }
2376 
2377  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2378  mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2379  }
2380 
2381  inline void endMeasure() {
2382  if (mHasError) {
2383  return;
2384  }
2385 
2386  // NOLINTNEXTLINE(hicpp-signed-bitwise)
2387  mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2388  if (mHasError) {
2389  return;
2390  }
2391 
2392  auto const numBytes = sizeof(uint64_t) * mCounters.size();
2393  auto ret = read(mFd, mCounters.data(), numBytes);
2394  mHasError = ret != static_cast<ssize_t>(numBytes);
2395  }
2396 
2397  void updateResults(uint64_t numIters);
2398 
2399  // rounded integer division
2400  template <typename T>
2401  static inline T divRounded(T a, T divisor) {
2402  return (a + divisor / 2) / divisor;
2403  }
2404 
    // Determines the fixed begin/end measurement overhead and the per-iteration loop
    // overhead, so updateResults() can subtract them from raw counter readings.
    // `op` should be a cheap no-op-like callable (see PerformanceCounters' constructor).
    template <typename Op>
    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
    void calibrate(Op&& op) {
        // clear current calibration data
        for (auto& v : mCalibratedOverhead) {
            v = UINT64_C(0);
        }

        // create new calibration data, starting at max so the loop below can only lower it
        auto newCalibration = mCalibratedOverhead;
        for (auto& v : newCalibration) {
            v = (std::numeric_limits<uint64_t>::max)();
        }
        // run op 100 times; keep the *minimum* reading of each counter as the
        // unavoidable beginMeasure()/endMeasure() overhead.
        for (size_t iter = 0; iter < 100; ++iter) {
            beginMeasure();
            op();
            endMeasure();
            if (mHasError) {
                return;
            }

            for (size_t i = 0; i < newCalibration.size(); ++i) {
                auto diff = mCounters[i];
                if (newCalibration[i] > diff) {
                    newCalibration[i] = diff;
                }
            }
        }

        mCalibratedOverhead = std::move(newCalibration);

        {
            // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
            // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
            // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
            // see https://godbolt.org/z/49RVQ5
            uint64_t const numIters = 100000U + (std::random_device{}() & 3);
            uint64_t n = numIters;
            uint32_t x = 1234567;
            auto fn = [&]() {
                x ^= x << 13;
                x ^= x >> 17;
                x ^= x << 5;
            };

            // first pass: one fn() call per loop iteration
            beginMeasure();
            while (n-- > 0) {
                fn();
            }
            endMeasure();
            auto measure1 = mCounters;

            n = numIters;
            beginMeasure();
            while (n-- > 0) {
                // we now run *twice* so we can easily calculate the overhead
                fn();
                fn();
            }
            endMeasure();
            auto measure2 = mCounters;

            for (size_t i = 0; i < mCounters.size(); ++i) {
                // Solve for the loop overhead: measure1 = loop + 1*fn, measure2 = loop + 2*fn,
                // so 2*measure1 - measure2 = loop (all clamped at 0 to avoid underflow).
                auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
                auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
                auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;

                mLoopOverhead[i] = divRounded(overhead, numIters);
            }
        }
    }
2479 
private:
    bool monitor(uint32_t type, uint64_t eventid, Target target);

    // maps the kernel-assigned perf event id to the user-supplied output target
    std::map<uint64_t, Target> mIdToTarget{};

    // start with minimum size of 3 for read_format:
    // [0] number of events, [1] time enabled, [2] time running; (value, id) pairs follow
    std::vector<uint64_t> mCounters{3};
    std::vector<uint64_t> mCalibratedOverhead{3}; // fixed begin/end measurement overhead per slot
    std::vector<uint64_t> mLoopOverhead{3};       // per-iteration benchmark loop overhead per slot

    uint64_t mTimeEnabledNanos = 0; // total time the event group was enabled
    uint64_t mTimeRunningNanos = 0; // total time the event group was actually running
    int mFd = -1;                   // group leader fd from perf_event_open; -1 while unopened
    bool mHasError = false;         // sticky error flag; see hasError()
};
2495 ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2496 
2497 LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2498  if (-1 != mFd) {
2499  close(mFd);
2500  }
2501 }
2502 
// Monitors a software perf event (e.g. page faults, context switches).
bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_SOFTWARE, swId, target);
}
2506 
// Monitors a hardware perf event (e.g. CPU cycles, instructions, branches).
bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_HARDWARE, hwId, target);
}
2510 
// overflow is ok, it's checked
ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
    // clear old data so stale values never leak through on error
    for (auto& id_value : mIdToTarget) {
        *id_value.second.targetValue = UINT64_C(0);
    }

    if (mHasError) {
        return;
    }

    // mCounters layout with PERF_FORMAT_GROUP|ID|TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING:
    // [0] nr, [1] time_enabled, [2] time_running, then per event: [3+2*i] value, [3+2*i+1] id.
    mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
    mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];

    for (uint64_t i = 0; i < mCounters[0]; ++i) {
        auto idx = static_cast<size_t>(3 + i * 2 + 0);
        auto id = mCounters[idx + 1U];

        auto it = mIdToTarget.find(id);
        if (it != mIdToTarget.end()) {

            auto& tgt = it->second;
            *tgt.targetValue = mCounters[idx];
            if (tgt.correctMeasuringOverhead) {
                // subtract the calibrated begin/end measurement overhead, clamping at 0
                if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
                    *tgt.targetValue -= mCalibratedOverhead[idx];
                } else {
                    *tgt.targetValue = 0U;
                }
            }
            if (tgt.correctLoopOverhead) {
                // subtract the per-iteration benchmark loop overhead, clamping at 0
                auto correctionVal = mLoopOverhead[idx] * numIters;
                if (*tgt.targetValue >= correctionVal) {
                    *tgt.targetValue -= correctionVal;
                } else {
                    *tgt.targetValue = 0U;
                }
            }
        }
    }
}
2553 
// Opens a perf event of the given type/id and adds it to the event group.
// Returns false (and leaves *target.targetValue at max) on any failure.
bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
    *target.targetValue = (std::numeric_limits<uint64_t>::max)();
    if (mHasError) {
        return false;
    }

    auto pea = perf_event_attr();
    std::memset(&pea, 0, sizeof(perf_event_attr));
    pea.type = type;
    pea.size = sizeof(perf_event_attr);
    pea.config = eventid;
    pea.disabled = 1; // start counter as disabled
    pea.exclude_kernel = 1;
    pea.exclude_hv = 1;

    // Group reads: a single read() returns nr, time_enabled, time_running,
    // then (value, id) pairs for every event in the group.
    // NOLINTNEXTLINE(hicpp-signed-bitwise)
    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

    const int pid = 0;  // the current process
    const int cpu = -1; // all CPUs
# if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
    const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
# else
    const unsigned long flags = 0;
# endif

    // Passing mFd == -1 on the first call makes this event the group leader;
    // later events join the leader's group.
    auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
    if (-1 == fd) {
        return false;
    }
    if (-1 == mFd) {
        // first call: set to fd, and use this from now on
        mFd = fd;
    }
    uint64_t id = 0;
    // NOLINTNEXTLINE(hicpp-signed-bitwise)
    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
        // couldn't get id
        return false;
    }

    // insert into map, rely on the fact that map's references are constant.
    mIdToTarget.emplace(id, target);

    // prepare readformat with the correct size (after the insert)
    auto size = 3 + 2 * mIdToTarget.size();
    mCounters.resize(size);
    mCalibratedOverhead.resize(size);
    mLoopOverhead.resize(size);

    return true;
}
2606 
// Sets up all supported perf counters, records which ones are available in mHas,
// then calibrates measurement overhead with a near-empty operation.
PerformanceCounters::PerformanceCounters()
    : mPc(new LinuxPerformanceCounters())
    , mVal()
    , mHas() {

    // each monitor() call returns whether that event could be opened
    mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
    mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
    mHas.contextSwitches =
        mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
    mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
    mHas.branchInstructions =
        mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
    mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
    // mHas.branchMisses = false;

    mPc->start();
    // calibrate against two clock reads — the cheapest realistic measured operation
    mPc->calibrate([] {
        auto before = ankerl::nanobench::Clock::now();
        auto after = ankerl::nanobench::Clock::now();
        (void)before;
        (void)after;
    });

    if (mPc->hasError()) {
        // something failed, don't monitor anything.
        mHas = PerfCountSet<bool>{};
    }
}
2635 
2636 PerformanceCounters::~PerformanceCounters() {
2637  if (nullptr != mPc) {
2638  delete mPc;
2639  }
2640 }
2641 
// Thin forwarders to the Linux implementation.
void PerformanceCounters::beginMeasure() {
    mPc->beginMeasure();
}

void PerformanceCounters::endMeasure() {
    mPc->endMeasure();
}

void PerformanceCounters::updateResults(uint64_t numIters) {
    mPc->updateResults(numIters);
}
2653 
# else

// Performance counters are not supported on this platform: provide no-op stubs.
PerformanceCounters::PerformanceCounters() = default;
PerformanceCounters::~PerformanceCounters() = default;
void PerformanceCounters::beginMeasure() {}
void PerformanceCounters::endMeasure() {}
void PerformanceCounters::updateResults(uint64_t) {}

# endif
2663 
// Latest measured counter values (per counter; see updateResults()).
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
    return mVal;
}
// Which counters are actually available on this system.
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
    return mHas;
}
2670 
2671 // formatting utilities
2672 namespace fmt {
2673 
// numpunct facet that adds a thousands separator to numbers
NumSep::NumSep(char sep)
    : mSep(sep) {}

char NumSep::do_thousands_sep() const {
    return mSep;
}

std::string NumSep::do_grouping() const {
    // "\003" means: group digits in chunks of 3 (e.g. 1,234,567)
    return "\003";
}
2685 
// RAII to save & restore a stream's state (locale, precision, width, fill, flags)
StreamStateRestorer::StreamStateRestorer(std::ostream& s)
    : mStream(s)
    , mLocale(s.getloc())
    , mPrecision(s.precision())
    , mWidth(s.width())
    , mFill(s.fill())
    , mFmtFlags(s.flags()) {}

StreamStateRestorer::~StreamStateRestorer() {
    restore();
}
2698 
// sets back all stream info that we remembered at construction
void StreamStateRestorer::restore() {
    mStream.imbue(mLocale);
    mStream.precision(mPrecision);
    mStream.width(mWidth);
    mStream.fill(mFill);
    mStream.flags(mFmtFlags);
}
2707 
// Fixed-width, fixed-precision number for table output; integers are widened to double.
Number::Number(int width, int precision, int64_t value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(static_cast<double>(value)) {}

Number::Number(int width, int precision, double value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(value) {}
2717 
// Writes the number with thousands separators; the stream's prior state is restored on exit.
std::ostream& Number::write(std::ostream& os) const {
    StreamStateRestorer restorer(os);
    // the locale takes ownership of the NumSep facet (facets are reference counted)
    os.imbue(std::locale(os.getloc(), new NumSep(',')));
    os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
    return os;
}
2724 
// Formats the number into a string via write().
std::string Number::to_s() const {
    std::stringstream ss;
    write(ss);
    return ss.str();
}
2730 
// Converts an unsigned integer to its decimal string representation.
std::string to_s(uint64_t n) {
    // 20 digits suffice for the largest uint64_t; fill the buffer from the back.
    char buf[32];
    size_t pos = sizeof(buf);
    do {
        buf[--pos] = static_cast<char>('0' + static_cast<char>(n % 10));
        n /= 10;
    } while (n != 0);
    return std::string(buf + pos, sizeof(buf) - pos);
}
2740 
// Stream insertion for Number, delegating to write().
std::ostream& operator<<(std::ostream& os, Number const& n) {
    return n.write(os);
}
2744 
// One column of the markdown result table: fixed width, precision, title, unit suffix, value.
MarkDownColumn::MarkDownColumn(int w, int prec, std::string const& tit, std::string const& suff, double val)
    : mWidth(w)
    , mPrecision(prec)
    , mTitle(tit)
    , mSuffix(suff)
    , mValue(val) {}

// Header cell: "|  <title> " right-aligned to the column width.
std::string MarkDownColumn::title() const {
    std::stringstream ss;
    ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
    return ss.str();
}
2757 
2758 std::string MarkDownColumn::separator() const {
2759  std::string sep(static_cast<size_t>(mWidth), '-');
2760  sep.front() = '|';
2761  sep.back() = ':';
2762  return sep;
2763 }
2764 
2765 std::string MarkDownColumn::invalid() const {
2766  std::string sep(static_cast<size_t>(mWidth), ' ');
2767  sep.front() = '|';
2768  sep[sep.size() - 2] = '-';
2769  return sep;
2770 }
2771 
// Data cell: "|<number><suffix> ", with the number width reduced by the suffix length.
std::string MarkDownColumn::value() const {
    std::stringstream ss;
    auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
    ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
    return ss.str();
}
2778 
2779 // Formats any text as markdown code, escaping backticks.
2780 MarkDownCode::MarkDownCode(std::string const& what) {
2781  mWhat.reserve(what.size() + 2);
2782  mWhat.push_back('`');
2783  for (char c : what) {
2784  mWhat.push_back(c);
2785  if ('`' == c) {
2786  mWhat.push_back('`');
2787  }
2788  }
2789  mWhat.push_back('`');
2790 }
2791 
// Writes the already-escaped code span to the stream.
std::ostream& MarkDownCode::write(std::ostream& os) const {
    return os << mWhat;
}

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
    return mdCode.write(os);
}
2799 } // namespace fmt
2800 } // namespace detail
2801 
// provide implementation here so it's only generated once
Config::Config() = default;
Config::~Config() = default;
Config& Config::operator=(Config const&) = default;
Config& Config::operator=(Config&&) = default;
Config::Config(Config const&) = default;
Config::Config(Config&&) noexcept = default;
2809 
// provide implementation here so it's only generated once
Result::~Result() = default;
Result& Result::operator=(Result const&) = default;
Result& Result::operator=(Result&&) = default;
Result::Result(Result const&) = default;
Result::Result(Result&&) noexcept = default;
2816 
namespace detail {
// Casts an enum value to its underlying integral type (shorthand used for Measure indexing).
template <typename E>
inline constexpr typename std::underlying_type<E>::type u(E val) noexcept {
    return static_cast<typename std::underlying_type<E>::type>(val);
}
} // namespace detail
2823 
// Result returned after a benchmark has finished. Can be used as a baseline for relative().
// One measurement vector is allocated per Measure enumerator.
Result::Result(Config const& benchmarkConfig)
    : mConfig(benchmarkConfig)
    , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
2828 
// Records one epoch's raw measurements. All values are stored normalized per iteration.
void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
    using detail::d;
    using detail::u;

    double dIters = d(iters);
    mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);

    mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
    if (pc.has().pageFaults) {
        mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
    }
    if (pc.has().cpuCycles) {
        mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
    }
    if (pc.has().contextSwitches) {
        mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
    }
    if (pc.has().instructions) {
        mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
    }
    if (pc.has().branchInstructions) {
        double branchInstructions = 0.0;
        // correcting branches: remove branch introduced by the while (...) loop for each iteration.
        if (pc.val().branchInstructions > iters + 1U) {
            branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
        }
        mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);

        if (pc.has().branchMisses) {
            // correcting branch misses
            double branchMisses = d(pc.val().branchMisses);
            if (branchMisses > branchInstructions) {
                // can't have branch misses when there were branches...
                branchMisses = branchInstructions;
            }

            // assuming at least one missed branch for the loop
            branchMisses -= 1.0;
            if (branchMisses < 1.0) {
                branchMisses = 1.0;
            }
            mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
        }
    }
}
2874 
// The configuration this result was produced with.
Config const& Result::config() const noexcept {
    return mConfig;
}
2878 
// Sorts `data` in place and returns its median; an empty input yields 0.0.
inline double calcMedian(std::vector<double>& data) {
    if (data.empty()) {
        return 0.0;
    }
    std::sort(data.begin(), data.end());

    auto const mid = data.size() / 2U;
    bool const isOddCount = (data.size() % 2U) == 1U;
    return isOddCount ? data[mid] : (data[mid - 1U] + data[mid]) / 2.0;
}
2891 
// Median of all per-iteration values recorded for measure m.
double Result::median(Measure m) const {
    // create a copy so we can sort
    auto data = mNameToMeasurements[detail::u(m)];
    return calcMedian(data);
}
2897 
// Arithmetic mean of all per-iteration values recorded for measure m; 0.0 when empty.
double Result::average(Measure m) const {
    using detail::d;
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // mean = sum of all samples divided by sample count
    return sum(m) / d(data.size());
}
2908 
// Median absolute percentage error (MdAPE) of measure m's samples around their median.
double Result::medianAbsolutePercentError(Measure m) const {
    // create copy
    auto data = mNameToMeasurements[detail::u(m)];

    // calculates MdAPE which is the median of percentage error
    // see https://www.spiderfinancial.com/support/documentation/numxl/reference-manual/forecasting-performance/mdape
    auto med = calcMedian(data);

    // transform the data to absolute relative error: |x - median| / x
    // NOTE(review): divides by the observation x, not by the median; a zero sample
    // would produce NaN/inf here — presumably measurements are always positive.
    for (auto& x : data) {
        x = (x - med) / x;
        if (x < 0) {
            x = -x;
        }
    }
    return calcMedian(data);
}
2926 
// Sum of all per-iteration values recorded for measure m.
double Result::sum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return std::accumulate(data.begin(), data.end(), 0.0);
}
2931 
2932 double Result::sumProduct(Measure m1, Measure m2) const noexcept {
2933  auto const& data1 = mNameToMeasurements[detail::u(m1)];
2934  auto const& data2 = mNameToMeasurements[detail::u(m2)];
2935 
2936  if (data1.size() != data2.size()) {
2937  return 0.0;
2938  }
2939 
2940  double result = 0.0;
2941  for (size_t i = 0, s = data1.size(); i != s; ++i) {
2942  result += data1[i] * data2[i];
2943  }
2944  return result;
2945 }
2946 
// True when at least one sample was recorded for measure m.
bool Result::has(Measure m) const noexcept {
    return !mNameToMeasurements[detail::u(m)].empty();
}

// The idx-th recorded sample of measure m; throws std::out_of_range for a bad idx.
double Result::get(size_t idx, Measure m) const {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return data.at(idx);
}

bool Result::empty() const noexcept {
    return 0U == size();
}

// Number of epochs recorded (elapsed is measured every epoch, so its count is the size).
size_t Result::size() const noexcept {
    auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
    return data.size();
}
2964 
2965 double Result::minimum(Measure m) const noexcept {
2966  auto const& data = mNameToMeasurements[detail::u(m)];
2967  if (data.empty()) {
2968  return 0.0;
2969  }
2970 
2971  // here its save to assume that at least one element is there
2972  return *std::min_element(data.begin(), data.end());
2973 }
2974 
2975 double Result::maximum(Measure m) const noexcept {
2976  auto const& data = mNameToMeasurements[detail::u(m)];
2977  if (data.empty()) {
2978  return 0.0;
2979  }
2980 
2981  // here its save to assume that at least one element is there
2982  return *std::max_element(data.begin(), data.end());
2983 }
2984 
2985 Result::Measure Result::fromString(std::string const& str) {
2986  if (str == "elapsed") {
2987  return Measure::elapsed;
2988  } else if (str == "iterations") {
2989  return Measure::iterations;
2990  } else if (str == "pagefaults") {
2991  return Measure::pagefaults;
2992  } else if (str == "cpucycles") {
2993  return Measure::cpucycles;
2994  } else if (str == "contextswitches") {
2995  return Measure::contextswitches;
2996  } else if (str == "instructions") {
2997  return Measure::instructions;
2998  } else if (str == "branchinstructions") {
2999  return Measure::branchinstructions;
3000  } else if (str == "branchmisses") {
3001  return Measure::branchmisses;
3002  } else {
3003  // not found, return _size
3004  return Measure::_size;
3005  }
3006 }
3007 
// Configuration of a microbenchmark. Output defaults to std::cout.
Bench::Bench() {
    mConfig.mOut = &std::cout;
}

Bench::Bench(Bench&&) = default;
Bench& Bench::operator=(Bench&&) = default;
Bench::Bench(Bench const&) = default;
Bench& Bench::operator=(Bench const&) = default;
Bench::~Bench() noexcept = default;
3018 
// Currently configured batch size (number of units processed per measured call).
double Bench::batch() const noexcept {
    return mConfig.mBatch;
}

// Currently configured problem size N for Big-O complexity fitting.
double Bench::complexityN() const noexcept {
    return mConfig.mComplexityN;
}
3026 
// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100%
// means it is slower than the baseline.
Bench& Bench::relative(bool isRelativeEnabled) noexcept {
    mConfig.mIsRelative = isRelativeEnabled;
    return *this;
}
bool Bench::relative() const noexcept {
    return mConfig.mIsRelative;
}
3036 
// Enables/disables the perf counter columns in the output table.
Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
    mConfig.mShowPerformanceCounters = showPerformanceCounters;
    return *this;
}
bool Bench::performanceCounters() const noexcept {
    return mConfig.mShowPerformanceCounters;
}
3044 
// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
// If u differs from currently set unit, the stored results will be cleared.
// Use singular (byte, not bytes).
Bench& Bench::unit(char const* u) {
    if (u != mConfig.mUnit) {
        mResults.clear();
    }
    mConfig.mUnit = u;
    return *this;
}

Bench& Bench::unit(std::string const& u) {
    return unit(u.c_str());
}

std::string const& Bench::unit() const noexcept {
    return mConfig.mUnit;
}
3063 
// Sets the time unit (and its display name) used when rendering elapsed times.
Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
    mConfig.mTimeUnit = tu;
    mConfig.mTimeUnitName = tuName;
    return *this;
}

std::string const& Bench::timeUnitName() const noexcept {
    return mConfig.mTimeUnitName;
}

std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
    return mConfig.mTimeUnit;
}
3077 
// If benchmarkTitle differs from currently set title, the stored results will be cleared.
Bench& Bench::title(const char* benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}
Bench& Bench::title(std::string const& benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}

std::string const& Bench::title() const noexcept {
    return mConfig.mBenchmarkTitle;
}
3097 
// Name shown for the next benchmark run (one row in the result table).
Bench& Bench::name(const char* benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

Bench& Bench::name(std::string const& benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

std::string const& Bench::name() const noexcept {
    return mConfig.mBenchmarkName;
}
3111 
// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
Bench& Bench::epochs(size_t numEpochs) noexcept {
    mConfig.mNumEpochs = numEpochs;
    return *this;
}
size_t Bench::epochs() const noexcept {
    return mConfig.mNumEpochs;
}

// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.
Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
    mConfig.mClockResolutionMultiple = multiple;
    return *this;
}
size_t Bench::clockResolutionMultiple() const noexcept {
    return mConfig.mClockResolutionMultiple;
}
3129 
// Sets the maximum time each epoch should take. Default is 100ms.
Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMaxEpochTime = t;
    return *this;
}
std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
    return mConfig.mMaxEpochTime;
}
3138 
// Sets the minimum time each epoch should take.
Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMinEpochTime = t;
    return *this;
}
std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
    return mConfig.mMinEpochTime;
}
3147 
// Minimum number of iterations per epoch; a request of 0 is clamped to 1.
Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
    mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
    return *this;
}
uint64_t Bench::minEpochIterations() const noexcept {
    return mConfig.mMinEpochIterations;
}

// Fixed iteration count per epoch (disables automatic sizing when nonzero).
Bench& Bench::epochIterations(uint64_t numIters) noexcept {
    mConfig.mEpochIterations = numIters;
    return *this;
}
uint64_t Bench::epochIterations() const noexcept {
    return mConfig.mEpochIterations;
}
3163 
// Number of warmup iterations executed (unmeasured) before each benchmark run.
Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
    mConfig.mWarmup = numWarmupIters;
    return *this;
}
uint64_t Bench::warmup() const noexcept {
    return mConfig.mWarmup;
}

// Replaces the whole configuration at once.
Bench& Bench::config(Config const& benchmarkConfig) {
    mConfig = benchmarkConfig;
    return *this;
}
Config const& Bench::config() const noexcept {
    return mConfig;
}
3179 
// Stream the result table is written to; pass nullptr to disable output.
Bench& Bench::output(std::ostream* outstream) noexcept {
    mConfig.mOut = outstream;
    return *this;
}

ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
    return mConfig.mOut;
}
3188 
// All results collected so far (one entry per completed run).
std::vector<Result> const& Bench::results() const noexcept {
    return mResults;
}
3192 
// Renders the collected results through a mustache-like template into os.
Bench& Bench::render(char const* templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}

Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}
3202 
// Fits the collected (complexityN, elapsed) measurements against a set of standard
// complexity classes and returns them sorted by goodness of fit (best first).
std::vector<BigO> Bench::complexityBigO() const {
    std::vector<BigO> bigOs;
    auto rangeMeasure = BigO::collectRangeMeasure(mResults);
    bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
        return 1.0;
    });
    bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
        return n;
    });
    bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
        return std::log2(n);
    });
    bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
        return n * std::log2(n);
    });
    bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
        return n * n;
    });
    bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
        return n * n * n;
    });
    // BigO::operator< sorts by normalized root mean square error
    std::sort(bigOs.begin(), bigOs.end());
    return bigOs;
}
3227 
// Seeds the generator from std::random_device; the all-zero state is rejected
// because it is a fixed point of the romu update.
Rng::Rng()
    : mX(0)
    , mY(0) {
    std::random_device rd;
    std::uniform_int_distribution<uint64_t> dist;
    do {
        mX = dist(rd);
        mY = dist(rd);
    } while (mX == 0 && mY == 0);
}
3238 
3239 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
// splitmix64 step: advances `state` by the golden-gamma constant and returns a
// well-mixed 64-bit output (used to derive seeds for the main generator).
uint64_t splitMix64(uint64_t& state) noexcept {
    state += UINT64_C(0x9e3779b97f4a7c15);
    uint64_t mixed = state;
    mixed = (mixed ^ (mixed >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
    mixed = (mixed ^ (mixed >> 27U)) * UINT64_C(0x94d049bb133111eb);
    mixed ^= mixed >> 31U;
    return mixed;
}
3246 
// Seeded as described in romu paper (update april 2020)
// NOTE(review): mX and mY get consecutive splitMix64 outputs; this relies on the
// members being declared in mX-then-mY order — confirm against the class definition.
Rng::Rng(uint64_t seed) noexcept
    : mX(splitMix64(seed))
    , mY(splitMix64(seed)) {
    // warm up the generator so the state decorrelates from the seed
    for (size_t i = 0; i < 10; ++i) {
        operator()();
    }
}
3255 
// only internally used to copy the RNG.
Rng::Rng(uint64_t x, uint64_t y) noexcept
    : mX(x)
    , mY(y) {}

// Explicit copy: returns an Rng with identical state (copy ctor is intentionally not public).
Rng Rng::copy() const noexcept {
    return Rng{mX, mY};
}
3264 
// Restores an Rng from a state() snapshot; throws std::runtime_error unless
// exactly two 64-bit words are provided.
Rng::Rng(std::vector<uint64_t> const& data)
    : mX(0)
    , mY(0) {
    if (data.size() != 2) {
        throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
                                 detail::fmt::to_s(data.size()));
    }
    mX = data[0];
    mY = data[1];
}
3275 
3276 std::vector<uint64_t> Rng::state() const {
3277  std::vector<uint64_t> data(2);
3278  data[0] = mX;
3279  data[1] = mY;
3280  return data;
3281 }
3282 
// Collects (complexityN, median elapsed) pairs from all results that have a
// positive complexityN configured.
BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
    BigO::RangeMeasure rangeMeasure;
    for (auto const& result : results) {
        if (result.config().mComplexityN > 0.0) {
            rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
        }
    }
    return rangeMeasure;
}
3292 
// Least-squares fit of measure = constant * range through the origin, plus the
// normalized RMS error of that fit.
// NOTE(review): an empty rangeMeasure yields 0/0 here (NaN constant/error) —
// presumably callers always supply at least one pair; verify at call sites.
BigO::BigO(std::string const& bigOName, RangeMeasure const& rangeMeasure)
    : mName(bigOName) {

    // estimate the constant factor: minimizes sum((c*range - measure)^2) over c
    double sumRangeMeasure = 0.0;
    double sumRangeRange = 0.0;

    for (size_t i = 0; i < rangeMeasure.size(); ++i) {
        sumRangeMeasure += rangeMeasure[i].first * rangeMeasure[i].second;
        sumRangeRange += rangeMeasure[i].first * rangeMeasure[i].first;
    }
    mConstant = sumRangeMeasure / sumRangeRange;

    // calculate root mean square of the residuals, normalized by the mean measurement
    double err = 0.0;
    double sumMeasure = 0.0;
    for (size_t i = 0; i < rangeMeasure.size(); ++i) {
        auto diff = mConstant * rangeMeasure[i].first - rangeMeasure[i].second;
        err += diff * diff;

        sumMeasure += rangeMeasure[i].second;
    }

    auto n = static_cast<double>(rangeMeasure.size());
    auto mean = sumMeasure / n;
    mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
}
3320 
// Convenience overload delegating to the std::string constructor.
BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
    : BigO(std::string(bigOName), rangeMeasure) {}
3323 
// Name of the fitted complexity class, e.g. "O(n log n)".
std::string const& BigO::name() const noexcept {
    return mName;
}

// Fitted constant factor c in measure = c * range.
double BigO::constant() const noexcept {
    return mConstant;
}

// RMS of the fit residuals divided by the mean measurement (lower = better fit).
double BigO::normalizedRootMeanSquare() const noexcept {
    return mNormalizedRootMeanSquare;
}
3335 
// Orders fits by quality (normalized RMS error), with the name as tie-breaker.
bool BigO::operator<(BigO const& other) const noexcept {
    return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
}
3339 
// One-line textual form: "<constant> * <name>, rms=<error>".
std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
    return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
}
3343 
// Renders all fits as a markdown table; the stream's formatting state is restored afterwards.
std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
    detail::fmt::StreamStateRestorer restorer(os);
    os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
    for (auto const& bigO : bigOs) {
        os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
        os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
        os << "| " << bigO.name();
        os << std::endl;
    }
    return os;
}
3355 
3356 } // namespace nanobench
3357 } // namespace ankerl
3358 
3359 #endif // ANKERL_NANOBENCH_IMPLEMENT
3360 #endif // ANKERL_NANOBENCH_H_INCLUDED
ankerl::nanobench::BigO::mNormalizedRootMeanSquare
double mNormalizedRootMeanSquare
Definition: nanobench.h:1095
ankerl::nanobench::Result::Measure
Measure
Definition: nanobench.h:410
ankerl::nanobench::detail::PerfCountSet::cpuCycles
T cpuCycles
Definition: nanobench.h:367
ankerl::nanobench::render
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.
ankerl::nanobench::operator<<
std::ostream & operator<<(std::ostream &os, BigO const &bigO)
ankerl::nanobench::detail::IterationLogic::mPimpl
Impl * mPimpl
Definition: nanobench.h:1028
ankerl::nanobench::detail::performanceCounters
PerformanceCounters & performanceCounters()
ankerl::nanobench::Bench::batch
ANKERL_NANOBENCH(NODISCARD) std Bench & batch(T b) noexcept
Sets the batch size.
ankerl::nanobench::BigO::ANKERL_NANOBENCH
ANKERL_NANOBENCH(NODISCARD) std ANKERL_NANOBENCH(NODISCARD) double const ant() const noexcept
ankerl::nanobench::BigO::RangeMeasure
std::vector< std::pair< double, double > > RangeMeasure
Definition: nanobench.h:1065
ankerl::nanobench::templates::htmlBoxplot
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an exampl...
ankerl::nanobench::BigO::mConstant
double mConstant
Definition: nanobench.h:1094
fsbridge::ifstream
fs::ifstream ifstream
Definition: fs.h:101
count
static int count
Definition: tests.c:41
flags
int flags
Definition: bitcoin-tx.cpp:512
operator<
bool operator<(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:641
ANKERL_NANOBENCH_NO_SANITIZE
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
Definition: nanobench.h:103
feebumper::Result
Result
Definition: feebumper.h:19
ankerl::nanobench::detail::PerformanceCounters
Definition: nanobench.h:1034
ankerl::nanobench::render
void render(std::string const &mustacheTemplate, std::vector< Result > const &results, std::ostream &out)
ankerl::nanobench::detail::IterationLogic::moveResultTo
void moveResultTo(std::vector< Result > &results) noexcept
ankerl::nanobench::Rng::min
static constexpr uint64_t() min()
ankerl::nanobench::detail::PerfCountSet::branchMisses
T branchMisses
Definition: nanobench.h:371
ANKERL_NANOBENCH
#define ANKERL_NANOBENCH(x)
Definition: nanobench.h:48
ankerl::nanobench::Bench
Main entry point to nanobench's benchmarking facility.
Definition: nanobench.h:614
ankerl::nanobench::Result::Measure::elapsed
@ elapsed
ankerl::nanobench::Rng::uniform01
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
Definition: nanobench.h:1133
ankerl::nanobench::Rng::shuffle
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
Definition: nanobench.h:1143
ankerl::nanobench::BigO::collectRangeMeasure
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)
ankerl::nanobench::BigO::BigO
BigO(std::string const &bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1082
ankerl::nanobench::detail::PerfCountSet::instructions
T instructions
Definition: nanobench.h:369
ankerl::nanobench::detail::IterationLogic::add
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept
ankerl::nanobench::Result::Measure::branchinstructions
@ branchinstructions
ankerl::nanobench::BigO
Definition: nanobench.h:1063
ankerl::nanobench::Rng
An extremely fast random generator.
Definition: nanobench.h:475
ankerl::nanobench::Bench::complexityBigO
std::vector< BigO > complexityBigO() const
ankerl::nanobench::Result::Measure::instructions
@ instructions
ankerl::nanobench::Result
Definition: nanobench.h:408
ankerl::nanobench::Result::Measure::cpucycles
@ cpucycles
ankerl::nanobench::detail::doNotOptimizeAway
void doNotOptimizeAway(T const &val)
Definition: nanobench.h:997
ankerl::nanobench::Bench::run
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
Definition: nanobench.h:1181
ankerl::nanobench::BigO::mName
std::string mName
Definition: nanobench.h:1093
ankerl::nanobench::Result::fromString
static Measure fromString(std::string const &str)
operator==
bool operator==(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:636
ankerl::nanobench::Result::Measure::_size
@ _size
ankerl::nanobench::Rng::result_type
uint64_t result_type
This RNG provides 64bit randomness.
Definition: nanobench.h:480
ankerl::nanobench::BigO::BigO
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1078
ankerl::nanobench::detail::PerfCountSet::pageFaults
T pageFaults
Definition: nanobench.h:366
name
const char * name
Definition: rest.cpp:43
stop
static RPCHelpMan stop()
Definition: server.cpp:161
ankerl::nanobench::Bench::complexityN
Bench & complexityN(T b) noexcept
Definition: nanobench.h:1212
ankerl::nanobench::detail::PerfCountSet::branchInstructions
T branchInstructions
Definition: nanobench.h:370
ANKERL_NANOBENCH_LOG
#define ANKERL_NANOBENCH_LOG(x)
Definition: nanobench.h:86
ankerl::nanobench::Bench::doNotOptimizeAway
ANKERL_NANOBENCH(NODISCARD) std Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.
ankerl::nanobench::Rng::max
static constexpr uint64_t() max()
ankerl::nanobench::detail::PerfCountSet::contextSwitches
T contextSwitches
Definition: nanobench.h:368
ankerl::nanobench::templates::json
char const * json() noexcept
Template to generate JSON data.
ankerl::nanobench::operator<<
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)
sum
volatile double sum
Definition: examples.cpp:10
ankerl::nanobench::Clock
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
Definition: nanobench.h:126
ankerl::nanobench::Result::Measure::iterations
@ iterations
ankerl::nanobench::Config
Definition: nanobench.h:377
ankerl::nanobench::detail::PerfCountSet
Definition: nanobench.h:345
ankerl::nanobench::BigO::mapRangeMeasure
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)
Definition: nanobench.h:1068
ankerl::nanobench::templates::pyperf
char const * pyperf() noexcept
Output in pyperf compatible JSON format, which can be used for more analyzations.
ankerl::nanobench::templates::csv
char const * csv() noexcept
CSV data for the benchmark results.
ankerl::nanobench::Result::Measure::branchmisses
@ branchmisses
ankerl
Definition: nanobench.h:122
ankerl::nanobench::detail::IterationLogic
Definition: nanobench.h:1018
ankerl::nanobench::doNotOptimizeAway
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
Definition: nanobench.h:1226