nanobench.h
1// __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2// | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3// | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4//
5// Microbenchmark framework for C++11/14/17/20
6// https://github.com/martinus/nanobench
7//
8// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9// SPDX-License-Identifier: MIT
10// Copyright (c) 2019-2023 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
11//
12// Permission is hereby granted, free of charge, to any person obtaining a copy
13// of this software and associated documentation files (the "Software"), to deal
14// in the Software without restriction, including without limitation the rights
15// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16// copies of the Software, and to permit persons to whom the Software is
17// furnished to do so, subject to the following conditions:
18//
19// The above copyright notice and this permission notice shall be included in all
20// copies or substantial portions of the Software.
21//
22// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28// SOFTWARE.
29
30#ifndef ANKERL_NANOBENCH_H_INCLUDED
31#define ANKERL_NANOBENCH_H_INCLUDED
32
33// see https://semver.org/
34#define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35#define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
36#define ANKERL_NANOBENCH_VERSION_PATCH 11 // backwards-compatible bug fixes
37
39// public facing api - as minimal as possible
41
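// A minimal usage sketch based on the API declared below; it assumes ANKERL_NANOBENCH_IMPLEMENT
// is defined in exactly one translation unit before this header is included:
//
//   #include <nanobench.h>
//
//   int main() {
//       double d = 1.0;
//       ankerl::nanobench::Bench().run("some double ops", [&] {
//           d += 1.0 / d;
//           ankerl::nanobench::doNotOptimizeAway(d);
//       });
//   }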
42#include <chrono> // high_resolution_clock
43#include <cstring> // memcpy
44#include <iosfwd> // for std::ostream* custom output target in Config
45#include <string> // all names
46#include <unordered_map> // holds context information of results
47#include <vector> // holds all results
48
49#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
50
51#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
52#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
53#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
54#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
55#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
56
57#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
58# define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
59#else
60# define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
61#endif
62
63#if defined(__clang__)
64# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
65 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
66# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
67#else
68# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
69# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
70#endif
71
72#if defined(__GNUC__)
73# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
74# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
75#else
76# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
77# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
78#endif
79
80#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
81# include <iostream>
82# define ANKERL_NANOBENCH_LOG(x) \
83 do { \
84 std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
85 } while (0)
86#else
87# define ANKERL_NANOBENCH_LOG(x) \
88 do { \
89 } while (0)
90#endif
91
92#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
93#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
94# include <linux/version.h>
95# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
96// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
97// PERF_FLAG_FD_CLOEXEC since kernel 3.14
98# undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
99# define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
100# endif
101#endif
102
103#if defined(__clang__)
104# define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
105#else
106# define ANKERL_NANOBENCH_NO_SANITIZE(...)
107#endif
108
109#if defined(_MSC_VER)
110# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
111#else
112# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
113#endif
114
115// workaround missing "is_trivially_copyable" in g++ < 5.0
116// See https://stackoverflow.com/a/31798726/48181
117#if defined(__GNUC__) && __GNUC__ < 5
118# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
119#else
120# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
121#endif
122
123// noexcept may be missing for std::string.
124// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58265
125#define ANKERL_NANOBENCH_PRIVATE_NOEXCEPT_STRING_MOVE() std::is_nothrow_move_assignable<std::string>::value
126
127// declarations ///////////////////////////////////////////////////////////////////////////////////
128
129namespace ankerl {
130namespace nanobench {
131
132using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
133 std::chrono::steady_clock>::type;
134class Bench;
135struct Config;
136class Result;
137class Rng;
138class BigO;
139
290void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
291void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
292
301void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
302void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
303
304// Contains mustache-like templates
305namespace templates {
306
316char const* csv() noexcept;
317
328char const* htmlBoxplot() noexcept;
329
336char const* pyperf() noexcept;
337
347char const* json() noexcept;
348
349} // namespace templates
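// A brief sketch of how these templates are typically passed to render() (declared above);
// the output stream and the template choice are up to the caller:
//
//   ankerl::nanobench::Bench bench;
//   bench.run("my op", [&] { /* code to measure */ });
//   ankerl::nanobench::render(ankerl::nanobench::templates::csv(), bench, std::cout);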
350
351namespace detail {
352
353template <typename T>
354struct PerfCountSet;
355
356class IterationLogic;
357class PerformanceCounters;
358
359#if ANKERL_NANOBENCH(PERF_COUNTERS)
360class LinuxPerformanceCounters;
361#endif
362
363} // namespace detail
364} // namespace nanobench
365} // namespace ankerl
366
367// definitions ////////////////////////////////////////////////////////////////////////////////////
368
369namespace ankerl {
370namespace nanobench {
371namespace detail {
372
373template <typename T>
374struct PerfCountSet {
375 T pageFaults{};
376 T cpuCycles{};
377 T contextSwitches{};
378 T instructions{};
379 T branchInstructions{};
380 T branchMisses{};
381};
382
383} // namespace detail
384
385ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
386struct Config {
387 // actual benchmark config
388 std::string mBenchmarkTitle = "benchmark"; // NOLINT(misc-non-private-member-variables-in-classes)
389 std::string mBenchmarkName = "noname"; // NOLINT(misc-non-private-member-variables-in-classes)
390 std::string mUnit = "op"; // NOLINT(misc-non-private-member-variables-in-classes)
391 double mBatch = 1.0; // NOLINT(misc-non-private-member-variables-in-classes)
392 double mComplexityN = -1.0; // NOLINT(misc-non-private-member-variables-in-classes)
393 size_t mNumEpochs = 11; // NOLINT(misc-non-private-member-variables-in-classes)
394 size_t mClockResolutionMultiple = static_cast<size_t>(1000); // NOLINT(misc-non-private-member-variables-in-classes)
395 std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // NOLINT(misc-non-private-member-variables-in-classes)
396 std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1); // NOLINT(misc-non-private-member-variables-in-classes)
397 uint64_t mMinEpochIterations{1}; // NOLINT(misc-non-private-member-variables-in-classes)
398 // If not 0, run *exactly* these number of iterations per epoch.
399 uint64_t mEpochIterations{0}; // NOLINT(misc-non-private-member-variables-in-classes)
400 uint64_t mWarmup = 0; // NOLINT(misc-non-private-member-variables-in-classes)
401 std::ostream* mOut = nullptr; // NOLINT(misc-non-private-member-variables-in-classes)
402 std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // NOLINT(misc-non-private-member-variables-in-classes)
403 std::string mTimeUnitName = "ns"; // NOLINT(misc-non-private-member-variables-in-classes)
404 bool mShowPerformanceCounters = true; // NOLINT(misc-non-private-member-variables-in-classes)
405 bool mIsRelative = false; // NOLINT(misc-non-private-member-variables-in-classes)
406 std::unordered_map<std::string, std::string> mContext{}; // NOLINT(misc-non-private-member-variables-in-classes)
407
408 Config();
409 ~Config();
410 Config& operator=(Config const& other);
411 Config& operator=(Config&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
412 Config(Config const& other);
413 Config(Config&& other) noexcept;
414};
415ANKERL_NANOBENCH(IGNORE_PADDED_POP)
416
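// The fields above are usually not set directly; a sketch of the equivalent fluent Bench
// calls (the values shown are illustrative only):
//
//   ankerl::nanobench::Bench bench;
//   bench.title("my suite").unit("byte").batch(1024).minEpochIterations(100).relative(true);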
417// Result returned after a benchmark has finished. Can be used as a baseline for relative().
418ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
419class Result {
420public:
421 enum class Measure : size_t {
422 elapsed,
423 iterations,
424 pagefaults,
425 cpucycles,
426 contextswitches,
427 instructions,
428 branchinstructions,
429 branchmisses,
430 _size
431 };
432
433 explicit Result(Config benchmarkConfig);
434
435 ~Result();
436 Result& operator=(Result const& other);
437 Result& operator=(Result&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
438 Result(Result const& other);
439 Result(Result&& other) noexcept;
440
441 // adds new measurement results
442 // all values are scaled by iters (except iters...)
443 void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
444
445 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
446
447 ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
448 ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
449 ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
450 ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
451 ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
452 ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
453 ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
454 ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const* variableName) const;
455 ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const& variableName) const;
456
457 ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
458 ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
459 ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
460 ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
461
462 // Finds string, if not found, returns _size.
463 static Measure fromString(std::string const& str);
464
465private:
466 Config mConfig{};
467 std::vector<std::vector<double>> mNameToMeasurements{};
468};
469ANKERL_NANOBENCH(IGNORE_PADDED_POP)
470
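// Results can also be inspected programmatically after a run; a short sketch using the
// accessors declared above (variable names are illustrative):
//
//   for (ankerl::nanobench::Result const& result : bench.results()) {
//       double med = result.median(ankerl::nanobench::Result::Measure::elapsed);
//       double err = result.medianAbsolutePercentError(ankerl::nanobench::Result::Measure::elapsed);
//       std::cout << result.config().mBenchmarkName << ": " << med << " s (+- " << err * 100.0 << "%)\n";
//   }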
471
488class Rng final {
489public:
493 using result_type = uint64_t;
494
495 static constexpr uint64_t(min)();
496 static constexpr uint64_t(max)();
497
503 Rng(Rng const&) = delete;
504
508 Rng& operator=(Rng const&) = delete;
509
510 // moving is ok
511 Rng(Rng&&) noexcept = default;
512 Rng& operator=(Rng&&) noexcept = default;
513 ~Rng() noexcept = default;
514
522 Rng();
523
540 explicit Rng(uint64_t seed) noexcept;
541 Rng(uint64_t x, uint64_t y) noexcept;
542 explicit Rng(std::vector<uint64_t> const& data);
543
547 ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
548
556 inline uint64_t operator()() noexcept;
557
558 // This is slightly biased. See
559
574 inline uint32_t bounded(uint32_t range) noexcept;
575
576 // random double in range [0, 1)
577 // see http://prng.di.unimi.it/
578
585 inline double uniform01() noexcept;
586
594 template <typename Container>
595 void shuffle(Container& container) noexcept;
596
603 ANKERL_NANOBENCH(NODISCARD) std::vector<uint64_t> state() const;
604
605private:
606 static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
607
608 uint64_t mX;
609 uint64_t mY;
610};
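// A short sketch of typical Rng use, based on the members declared above (the seed value
// is arbitrary):
//
//   ankerl::nanobench::Rng rng(12345);
//   uint64_t r = rng();                // full 64 bit random value
//   uint32_t die = rng.bounded(6) + 1; // uniformly in [1, 6] (slightly biased, see above)
//   double u = rng.uniform01();        // uniformly in [0, 1)
//   std::vector<int> v{1, 2, 3, 4};
//   rng.shuffle(v);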
611
626ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
627class Bench {
628public:
632 Bench();
633
634 Bench(Bench&& other) noexcept;
635 Bench& operator=(Bench&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
636 Bench(Bench const& other);
637 Bench& operator=(Bench const& other);
638 ~Bench() noexcept;
639
658 template <typename Op>
659 ANKERL_NANOBENCH(NOINLINE)
660 Bench& run(char const* benchmarkName, Op&& op);
661
662 template <typename Op>
663 ANKERL_NANOBENCH(NOINLINE)
664 Bench& run(std::string const& benchmarkName, Op&& op);
665
670 template <typename Op>
671 ANKERL_NANOBENCH(NOINLINE)
672 Bench& run(Op&& op);
673
679 Bench& title(char const* benchmarkTitle);
680 Bench& title(std::string const& benchmarkTitle);
681
685 ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
686
688 Bench& name(char const* benchmarkName);
689 Bench& name(std::string const& benchmarkName);
690 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
691
704 Bench& context(char const* variableName, char const* variableValue);
705 Bench& context(std::string const& variableName, std::string const& variableValue);
706
715 Bench& clearContext();
716
726 template <typename T>
727 Bench& batch(T b) noexcept;
728 ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
729
738 Bench& unit(char const* unit);
739 Bench& unit(std::string const& unit);
740 ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
741
751 Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
752 ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
753 ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
754
762 Bench& output(std::ostream* outstream) noexcept;
763 ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
764
785 Bench& clockResolutionMultiple(size_t multiple) noexcept;
786 ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
787
803 Bench& epochs(size_t numEpochs) noexcept;
804 ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
805
816 Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
817 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
818
829 Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
830 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
831
842 Bench& minEpochIterations(uint64_t numIters) noexcept;
843 ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
844
851 Bench& epochIterations(uint64_t numIters) noexcept;
852 ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
853
863 Bench& warmup(uint64_t numWarmupIters) noexcept;
864 ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
865
883 Bench& relative(bool isRelativeEnabled) noexcept;
884 ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
885
894 Bench& performanceCounters(bool showPerformanceCounters) noexcept;
895 ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
896
905 ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
906
914 template <typename Arg>
915 Bench& doNotOptimizeAway(Arg&& arg);
916
931 template <typename T>
932 Bench& complexityN(T n) noexcept;
933 ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
934
966 std::vector<BigO> complexityBigO() const;
967
991 template <typename Op>
992 BigO complexityBigO(char const* name, Op op) const;
993
994 template <typename Op>
995 BigO complexityBigO(std::string const& name, Op op) const;
996
1004 Bench& render(char const* templateContent, std::ostream& os);
1005 Bench& render(std::string const& templateContent, std::ostream& os);
1006
1007 Bench& config(Config const& benchmarkConfig);
1008 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
1009
1010private:
1011 Config mConfig{};
1012 std::vector<Result> mResults{};
1013};
1014ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1015
1016
1022template <typename Arg>
1023void doNotOptimizeAway(Arg&& arg);
1024
1025namespace detail {
1026
1027#if defined(_MSC_VER)
1028void doNotOptimizeAwaySink(void const*);
1029
1030template <typename T>
1031void doNotOptimizeAway(T const& val);
1032
1033#else
1034
1035// This assembly magic comes directly from what Google Benchmark does. I previously used what Facebook's folly does, but
1036// that seemed to cause compilation problems in some cases. Google Benchmark seems to be the most well tested anyway.
1037// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L443-L446
1038template <typename T>
1039void doNotOptimizeAway(T const& val) {
1040 // NOLINTNEXTLINE(hicpp-no-assembler)
1041 asm volatile("" : : "r,m"(val) : "memory");
1042}
1043
1044template <typename T>
1045void doNotOptimizeAway(T& val) {
1046# if defined(__clang__)
1047 // NOLINTNEXTLINE(hicpp-no-assembler)
1048 asm volatile("" : "+r,m"(val) : : "memory");
1049# else
1050 // NOLINTNEXTLINE(hicpp-no-assembler)
1051 asm volatile("" : "+m,r"(val) : : "memory");
1052# endif
1053}
1054#endif
1055
1056// internally used, but visible because run() is templated.
1057// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
1058// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
1059ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
1060class IterationLogic {
1061public:
1062 explicit IterationLogic(Bench const& bench);
1063 IterationLogic(IterationLogic&&) = delete;
1064 IterationLogic& operator=(IterationLogic&&) = delete;
1065 IterationLogic(IterationLogic const&) = delete;
1066 IterationLogic& operator=(IterationLogic const&) = delete;
1067 ~IterationLogic();
1068
1069 ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
1070 void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
1071 void moveResultTo(std::vector<Result>& results) noexcept;
1072
1073private:
1074 struct Impl;
1075 Impl* mPimpl;
1076};
1077ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1078
1079ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1080class PerformanceCounters {
1081public:
1082 PerformanceCounters(PerformanceCounters const&) = delete;
1083 PerformanceCounters& operator=(PerformanceCounters const&) = delete;
1084
1085 PerformanceCounters();
1086 ~PerformanceCounters();
1087
1088 void beginMeasure();
1089 void endMeasure();
1092 void updateResults(uint64_t numIters);
1093
1094 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1095 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1096
1097private:
1098#if ANKERL_NANOBENCH(PERF_COUNTERS)
1099 LinuxPerformanceCounters* mPc = nullptr;
1100#endif
1101 PerfCountSet<uint64_t> mVal{};
1102 PerfCountSet<bool> mHas{};
1103};
1104ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1105
1106// Gets the singleton
1107PerformanceCounters& performanceCounters();
1108
1109} // namespace detail
1110
1111class BigO {
1112public:
1113 using RangeMeasure = std::vector<std::pair<double, double>>;
1114
1115 template <typename Op>
1116 static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {
1117 for (auto& rangeMeasure : data) {
1118 rangeMeasure.first = op(rangeMeasure.first);
1119 }
1120 return data;
1121 }
1122
1123 static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1124
1125 template <typename Op>
1126 BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1127 : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1128
1129 template <typename Op>
1130 BigO(std::string bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1131 : BigO(std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {}
1132
1133 BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1134 BigO(std::string bigOName, RangeMeasure const& scaledRangeMeasure);
1135 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1136 ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1137 ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1138 ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1139
1140private:
1141 std::string mName{};
1142 double mConstant{};
1143 double mNormalizedRootMeanSquare{};
1144};
1145std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1146std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1147
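// Sketch of how the complexity helpers above are commonly combined; the container and the
// range values are illustrative only:
//
//   ankerl::nanobench::Bench bench;
//   for (size_t n = 64; n <= 4096; n *= 2) {
//       std::set<uint64_t> s;
//       bench.complexityN(n).run("std::set insert", [&] { s.insert(s.size()); });
//   }
//   std::cout << bench.complexityBigO() << std::endl; // prints the fitted complexity guesses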
1148} // namespace nanobench
1149} // namespace ankerl
1150
1151// implementation /////////////////////////////////////////////////////////////////////////////////
1152
1153namespace ankerl {
1154namespace nanobench {
1155
1156constexpr uint64_t(Rng::min)() {
1157 return 0;
1158}
1159
1160constexpr uint64_t(Rng::max)() {
1161 return (std::numeric_limits<uint64_t>::max)();
1162}
1163
1164ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1165uint64_t Rng::operator()() noexcept {
1166 auto x = mX;
1167
1168 mX = UINT64_C(15241094284759029579) * mY;
1169 mY = rotl(mY - x, 27);
1170
1171 return x;
1172}
1173
1174ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1175uint32_t Rng::bounded(uint32_t range) noexcept {
1176 uint64_t const r32 = static_cast<uint32_t>(operator()());
1177 auto multiresult = r32 * range;
1178 return static_cast<uint32_t>(multiresult >> 32U);
1179}
1180
1181double Rng::uniform01() noexcept {
1182 auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1183 // can't use a union in C++ here for type punning, it's undefined behavior.
1184 // std::memcpy is optimized anyways.
1185 double d{};
1186 std::memcpy(&d, &i, sizeof(double));
1187 return d - 1.0;
1188}
1189
1190template <typename Container>
1191void Rng::shuffle(Container& container) noexcept {
1192 auto i = container.size();
1193 while (i > 1U) {
1194 using std::swap;
1195 auto n = operator()();
1196 // using decltype(i) instead of size_t to be compatible with containers that use a 32bit index (see #80)
1197 auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);
1198 swap(container[--i], container[b1]);
1199
1200 auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);
1201 swap(container[--i], container[b2]);
1202 }
1203}
1204
1205ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1206constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1207 return (x << k) | (x >> (64U - k));
1208}
1209
1210template <typename Op>
1212Bench& Bench::run(Op&& op) {
1213 // It is important that this method is kept short so the compiler can do better optimizations/inlining of op()
1214 detail::IterationLogic iterationLogic(*this);
1215 auto& pc = detail::performanceCounters();
1216
1217 while (auto n = iterationLogic.numIters()) {
1218 pc.beginMeasure();
1219 Clock::time_point const before = Clock::now();
1220 while (n-- > 0) {
1221 op();
1222 }
1223 Clock::time_point const after = Clock::now();
1224 pc.endMeasure();
1225 pc.updateResults(iterationLogic.numIters());
1226 iterationLogic.add(after - before, pc);
1227 }
1228 iterationLogic.moveResultTo(mResults);
1229 return *this;
1230}
1231
1232// Performs all evaluations.
1233template <typename Op>
1234Bench& Bench::run(char const* benchmarkName, Op&& op) {
1235 name(benchmarkName);
1236 return run(std::forward<Op>(op));
1237}
1238
1239template <typename Op>
1240Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1241 name(benchmarkName);
1242 return run(std::forward<Op>(op));
1243}
1244
1245template <typename Op>
1246BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1247 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1248}
1249
1250template <typename Op>
1251BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1252 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1253}
1254
1255// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
1256// Any argument is cast to double.
1257template <typename T>
1258Bench& Bench::batch(T b) noexcept {
1259 mConfig.mBatch = static_cast<double>(b);
1260 return *this;
1261}
1262
1263// Sets the computation complexity of the next run. Any argument is cast to double.
1264template <typename T>
1265Bench& Bench::complexityN(T n) noexcept {
1266 mConfig.mComplexityN = static_cast<double>(n);
1267 return *this;
1268}
1269
1270// Convenience: makes sure none of the given arguments are optimized away by the compiler.
1271template <typename Arg>
1272Bench& Bench::doNotOptimizeAway(Arg&& arg) {
1273 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1274 return *this;
1275}
1276
1277// Makes sure none of the given arguments are optimized away by the compiler.
1278template <typename Arg>
1279void doNotOptimizeAway(Arg&& arg) {
1280 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1281}
1282
1283namespace detail {
1284
1285#if defined(_MSC_VER)
1286template <typename T>
1287void doNotOptimizeAway(T const& val) {
1288 doNotOptimizeAwaySink(&val);
1289}
1290
1291#endif
1292
1293} // namespace detail
1294} // namespace nanobench
1295} // namespace ankerl
1296
1297#if defined(ANKERL_NANOBENCH_IMPLEMENT)
1298
1300// implementation part - only visible in .cpp
1302
1303# include <algorithm> // sort, reverse
1304# include <atomic> // compare_exchange_strong in loop overhead
1305# include <cstdlib> // getenv
1306# include <cstring> // strstr, strncmp
1307# include <fstream> // ifstream to parse proc files
1308# include <iomanip> // setw, setprecision
1309# include <iostream> // cout
1310# include <numeric> // accumulate
1311# include <random> // random_device
1312# include <sstream> // to_s in Number
1313# include <stdexcept> // throw for rendering templates
1314# include <tuple> // std::tie
1315# if defined(__linux__)
1316# include <unistd.h> //sysconf
1317# endif
1318# if ANKERL_NANOBENCH(PERF_COUNTERS)
1319# include <map> // map
1320
1321# include <linux/perf_event.h>
1322# include <sys/ioctl.h>
1323# include <sys/syscall.h>
1324# endif
1325
1326// declarations ///////////////////////////////////////////////////////////////////////////////////
1327
1328namespace ankerl {
1329namespace nanobench {
1330
1331// helper stuff that is only intended to be used internally
1332namespace detail {
1333
1334struct TableInfo;
1335
1336// formatting utilities
1337namespace fmt {
1338
1339class NumSep;
1340class StreamStateRestorer;
1341class Number;
1342class MarkDownColumn;
1343class MarkDownCode;
1344
1345} // namespace fmt
1346} // namespace detail
1347} // namespace nanobench
1348} // namespace ankerl
1349
1350// definitions ////////////////////////////////////////////////////////////////////////////////////
1351
1352namespace ankerl {
1353namespace nanobench {
1354
1355uint64_t splitMix64(uint64_t& state) noexcept;
1356
1357namespace detail {
1358
1359// helpers to get double values
1360template <typename T>
1361inline double d(T t) noexcept {
1362 return static_cast<double>(t);
1363}
1364inline double d(Clock::duration duration) noexcept {
1365 return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
1366}
1367
1368// Calculates clock resolution once, and remembers the result
1369inline Clock::duration clockResolution() noexcept;
1370
1371} // namespace detail
1372
1373namespace templates {
1374
1375char const* csv() noexcept {
1376 return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
1377{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
1378{{/result}})DELIM";
1379}
1380
1381char const* htmlBoxplot() noexcept {
1382 return R"DELIM(<html>
1383
1384<head>
1385 <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
1386</head>
1387
1388<body>
1389 <div id="myDiv"></div>
1390 <script>
1391 var data = [
1392 {{#result}}{
1393 name: '{{name}}',
1394 y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
1395 },
1396 {{/result}}
1397 ];
1398 var title = '{{title}}';
1399
1400 data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
1401 var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
1402 </script>
1403</body>
1404
1405</html>)DELIM";
1406}
1407
1408char const* pyperf() noexcept {
1409 return R"DELIM({
1410 "benchmarks": [
1411 {
1412 "runs": [
1413 {
1414 "values": [
1415{{#measurement}} {{elapsed}}{{^-last}},
1416{{/last}}{{/measurement}}
1417 ]
1418 }
1419 ]
1420 }
1421 ],
1422 "metadata": {
1423 "loops": {{sum(iterations)}},
1424 "inner_loops": {{batch}},
1425 "name": "{{title}}",
1426 "unit": "second"
1427 },
1428 "version": "1.0"
1429})DELIM";
1430}
1431
1432char const* json() noexcept {
1433 return R"DELIM({
1434 "results": [
1435{{#result}} {
1436 "title": "{{title}}",
1437 "name": "{{name}}",
1438 "unit": "{{unit}}",
1439 "batch": {{batch}},
1440 "complexityN": {{complexityN}},
1441 "epochs": {{epochs}},
1442 "clockResolution": {{clockResolution}},
1443 "clockResolutionMultiple": {{clockResolutionMultiple}},
1444 "maxEpochTime": {{maxEpochTime}},
1445 "minEpochTime": {{minEpochTime}},
1446 "minEpochIterations": {{minEpochIterations}},
1447 "epochIterations": {{epochIterations}},
1448 "warmup": {{warmup}},
1449 "relative": {{relative}},
1450 "median(elapsed)": {{median(elapsed)}},
1451 "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
1452 "median(instructions)": {{median(instructions)}},
1453 "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
1454 "median(cpucycles)": {{median(cpucycles)}},
1455 "median(contextswitches)": {{median(contextswitches)}},
1456 "median(pagefaults)": {{median(pagefaults)}},
1457 "median(branchinstructions)": {{median(branchinstructions)}},
1458 "median(branchmisses)": {{median(branchmisses)}},
1459 "totalTime": {{sumProduct(iterations, elapsed)}},
1460 "measurements": [
1461{{#measurement}} {
1462 "iterations": {{iterations}},
1463 "elapsed": {{elapsed}},
1464 "pagefaults": {{pagefaults}},
1465 "cpucycles": {{cpucycles}},
1466 "contextswitches": {{contextswitches}},
1467 "instructions": {{instructions}},
1468 "branchinstructions": {{branchinstructions}},
1469 "branchmisses": {{branchmisses}}
1470 }{{^-last}},{{/-last}}
1471{{/measurement}} ]
1472 }{{^-last}},{{/-last}}
1473{{/result}} ]
1474})DELIM";
1475}
1476
1477ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1478struct Node {
1479 enum class Type { tag, content, section, inverted_section };
1480
1481 char const* begin;
1482 char const* end;
1483 std::vector<Node> children;
1484 Type type;
1485
1486 template <size_t N>
1487 // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
1488 bool operator==(char const (&str)[N]) const noexcept {
1489 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
1490 return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
1491 }
1492};
1493ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1494
1495// NOLINTNEXTLINE(misc-no-recursion)
1496static std::vector<Node> parseMustacheTemplate(char const** tpl) {
1497 std::vector<Node> nodes;
1498
1499 while (true) {
1500 auto const* begin = std::strstr(*tpl, "{{");
1501 auto const* end = begin;
1502 if (begin != nullptr) {
1503 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1504 begin += 2;
1505 end = std::strstr(begin, "}}");
1506 }
1507
1508 if (begin == nullptr || end == nullptr) {
1509 // nothing found, finish node
1510 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1511 nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1512 return nodes;
1513 }
1514
1515 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1516 nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1517
1518 // we found a tag
1519 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1520 *tpl = end + 2;
1521 switch (*begin) {
1522 case '/':
1523 // finished! bail out
1524 return nodes;
1525
1526 case '#':
1527 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1528 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1529 break;
1530
1531 case '^':
1532 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1533 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1534 break;
1535
1536 default:
1537 nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1538 break;
1539 }
1540 }
1541}
1542
1543static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1544 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1545 bool const matchFirst = n == "-first";
1546 bool const matchLast = n == "-last";
1547 if (!matchFirst && !matchLast) {
1548 return false;
1549 }
1550
1551 bool doWrite = false;
1552 if (n.type == Node::Type::section) {
1553 doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1554 } else if (n.type == Node::Type::inverted_section) {
1555 doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1556 }
1557
1558 if (doWrite) {
1559 for (auto const& child : n.children) {
1560 if (child.type == Node::Type::content) {
1561 out.write(child.begin, std::distance(child.begin, child.end));
1562 }
1563 }
1564 }
1565 return true;
1566}
1567
1568static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
1569 matchResult.clear();
1570 auto idxOpen = str.find('(');
1571 auto idxClose = str.find(')', idxOpen);
1572 if (idxClose == std::string::npos) {
1573 return false;
1574 }
1575
1576 matchResult.emplace_back(str.substr(0, idxOpen));
1577
1578 // split by comma
1579 matchResult.emplace_back();
1580 for (size_t i = idxOpen + 1; i != idxClose; ++i) {
1581 if (str[i] == ' ' || str[i] == '\t') {
1582 // skip whitespace
1583 continue;
1584 }
1585 if (str[i] == ',') {
1586 // got a comma => new string
1587 matchResult.emplace_back();
1588 continue;
1589 }
1590 // no whitespace no comma, append
1591 matchResult.back() += str[i];
1592 }
1593 return true;
1594}
1595
1596static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1597 using detail::d;
1598
1599 if (n == "title") {
1600 out << config.mBenchmarkTitle;
1601 return true;
1602 }
1603 if (n == "name") {
1604 out << config.mBenchmarkName;
1605 return true;
1606 }
1607 if (n == "unit") {
1608 out << config.mUnit;
1609 return true;
1610 }
1611 if (n == "batch") {
1612 out << config.mBatch;
1613 return true;
1614 }
1615 if (n == "complexityN") {
1616 out << config.mComplexityN;
1617 return true;
1618 }
1619 if (n == "epochs") {
1620 out << config.mNumEpochs;
1621 return true;
1622 }
1623 if (n == "clockResolution") {
1624 out << d(detail::clockResolution());
1625 return true;
1626 }
1627 if (n == "clockResolutionMultiple") {
1628 out << config.mClockResolutionMultiple;
1629 return true;
1630 }
1631 if (n == "maxEpochTime") {
1632 out << d(config.mMaxEpochTime);
1633 return true;
1634 }
1635 if (n == "minEpochTime") {
1636 out << d(config.mMinEpochTime);
1637 return true;
1638 }
1639 if (n == "minEpochIterations") {
1640 out << config.mMinEpochIterations;
1641 return true;
1642 }
1643 if (n == "epochIterations") {
1644 out << config.mEpochIterations;
1645 return true;
1646 }
1647 if (n == "warmup") {
1648 out << config.mWarmup;
1649 return true;
1650 }
1651 if (n == "relative") {
1652 out << config.mIsRelative;
1653 return true;
1654 }
1655 return false;
1656}
1657
1658// NOLINTNEXTLINE(readability-function-cognitive-complexity)
1659static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1660 if (generateConfigTag(n, r.config(), out)) {
1661 return out;
1662 }
1663 // match e.g. "median(elapsed)"
1664 // g++ 4.8 doesn't implement std::regex :(
1665 // static std::regex const regOpArg1("^([a-zA-Z]+)\\(([a-zA-Z]*)\\)$");
1666 // std::cmatch matchResult;
1667 // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1668 std::vector<std::string> matchResult;
1669 if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1670 if (matchResult.size() == 2) {
1671 if (matchResult[0] == "context") {
1672 return out << r.context(matchResult[1]);
1673 }
1674
1675 auto m = Result::fromString(matchResult[1]);
1676 if (m == Result::Measure::_size) {
1677 return out << 0.0;
1678 }
1679
1680 if (matchResult[0] == "median") {
1681 return out << r.median(m);
1682 }
1683 if (matchResult[0] == "average") {
1684 return out << r.average(m);
1685 }
1686 if (matchResult[0] == "medianAbsolutePercentError") {
1687 return out << r.medianAbsolutePercentError(m);
1688 }
1689 if (matchResult[0] == "sum") {
1690 return out << r.sum(m);
1691 }
1692 if (matchResult[0] == "minimum") {
1693 return out << r.minimum(m);
1694 }
1695 if (matchResult[0] == "maximum") {
1696 return out << r.maximum(m);
1697 }
1698 } else if (matchResult.size() == 3) {
1699 auto m1 = Result::fromString(matchResult[1]);
1700 auto m2 = Result::fromString(matchResult[2]);
1701 if (m1 == Result::Measure::_size || m2 == Result::Measure::_size) {
1702 return out << 0.0;
1703 }
1704
1705 if (matchResult[0] == "sumProduct") {
1706 return out << r.sumProduct(m1, m2);
1707 }
1708 }
1709 }
1710
1711 // match e.g. "sumProduct(elapsed, iterations)"
1712 // static std::regex const regOpArg2("^([a-zA-Z]+)\\(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\)$");
1713
1714 // nothing matches :(
1715 throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1716}
1717
1718static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1719 for (auto const& n : nodes) {
1720 if (!generateFirstLast(n, idx, r.size(), out)) {
1721 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1722 switch (n.type) {
1723 case Node::Type::content:
1724 out.write(n.begin, std::distance(n.begin, n.end));
1725 break;
1726
1727 case Node::Type::inverted_section:
1728 throw std::runtime_error("got an inverted section inside measurement");
1729
1730 case Node::Type::section:
1731 throw std::runtime_error("got a section inside measurement");
1732
1733 case Node::Type::tag: {
1734 auto m = Result::fromString(std::string(n.begin, n.end));
1735 if (m == Result::Measure::_size || !r.has(m)) {
1736 out << 0.0;
1737 } else {
1738 out << r.get(idx, m);
1739 }
1740 break;
1741 }
1742 }
1743 }
1744 }
1745}
1746
1747static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1748 auto const& r = results[idx];
1749 for (auto const& n : nodes) {
1750 if (!generateFirstLast(n, idx, results.size(), out)) {
1751 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1752 switch (n.type) {
1753 case Node::Type::content:
1754 out.write(n.begin, std::distance(n.begin, n.end));
1755 break;
1756
1757 case Node::Type::inverted_section:
1758 throw std::runtime_error("got an inverted section inside result");
1759
1760 case Node::Type::section:
1761 if (n == "measurement") {
1762 for (size_t i = 0; i < r.size(); ++i) {
1763 generateResultMeasurement(n.children, i, r, out);
1764 }
1765 } else {
1766 throw std::runtime_error("got a section inside result");
1767 }
1768 break;
1769
1770 case Node::Type::tag:
1771 generateResultTag(n, r, out);
1772 break;
1773 }
1774 }
1775 }
1776}
1777
1778} // namespace templates
1779
1780// helper stuff that is only intended to be used internally
1781namespace detail {
1782
1783char const* getEnv(char const* name);
1784bool isEndlessRunning(std::string const& name);
1785bool isWarningsEnabled();
1786
1787template <typename T>
1788T parseFile(std::string const& filename, bool* fail);
1789
1790void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1791void printStabilityInformationOnce(std::ostream* outStream);
1792
1793// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
1794uint64_t& singletonHeaderHash() noexcept;
1795
1796// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
1797Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1798
1799// formatting utilities
1800namespace fmt {
1801
1802// adds thousands separator to numbers
1803ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1804class NumSep : public std::numpunct<char> {
1805public:
1806 explicit NumSep(char sep);
1807 char do_thousands_sep() const override;
1808 std::string do_grouping() const override;
1809
1810private:
1811 char mSep;
1812};
1813ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1814
1815// RAII to save & restore a stream's state
1816ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1817class StreamStateRestorer {
1818public:
1819 explicit StreamStateRestorer(std::ostream& s);
1820 ~StreamStateRestorer();
1821
1822 // sets back all stream info that we remembered at construction
1823 void restore();
1824
1825 // don't allow copying / moving
1826 StreamStateRestorer(StreamStateRestorer const&) = delete;
1827 StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
1828 StreamStateRestorer(StreamStateRestorer&&) = delete;
1829 StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;
1830
1831private:
1832 std::ostream& mStream;
1833 std::locale mLocale;
1834 std::streamsize const mPrecision;
1835 std::streamsize const mWidth;
1836 std::ostream::char_type const mFill;
1837 std::ostream::fmtflags const mFmtFlags;
1838};
1839ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1840
1841// Number formatter
1842class Number {
1843public:
1844 Number(int width, int precision, double value);
1845 Number(int width, int precision, int64_t value);
1846 ANKERL_NANOBENCH(NODISCARD) std::string to_s() const;
1847
1848private:
1849 friend std::ostream& operator<<(std::ostream& os, Number const& n);
1850 std::ostream& write(std::ostream& os) const;
1851
1852 int mWidth;
1853 int mPrecision;
1854 double mValue;
1855};
1856
1857// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
1858std::string to_s(uint64_t n);
1859
1860std::ostream& operator<<(std::ostream& os, Number const& n);
1861
1862class MarkDownColumn {
1863public:
1864 MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept;
1865 ANKERL_NANOBENCH(NODISCARD) std::string title() const;
1866 ANKERL_NANOBENCH(NODISCARD) std::string separator() const;
1867 ANKERL_NANOBENCH(NODISCARD) std::string invalid() const;
1868 ANKERL_NANOBENCH(NODISCARD) std::string value() const;
1869
1870private:
1871 int mWidth;
1872 int mPrecision;
1873 std::string mTitle;
1874 std::string mSuffix;
1875 double mValue;
1876};
1877
1878// Formats any text as markdown code, escaping backticks.
1879class MarkDownCode {
1880public:
1881 explicit MarkDownCode(std::string const& what);
1882
1883private:
1884 friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1885 std::ostream& write(std::ostream& os) const;
1886
1887 std::string mWhat{};
1888};
1889
1890std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1891
1892} // namespace fmt
1893} // namespace detail
1894} // namespace nanobench
1895} // namespace ankerl
1896
1897// implementation /////////////////////////////////////////////////////////////////////////////////
1898
1899namespace ankerl {
1900namespace nanobench {
1901
1902// NOLINTNEXTLINE(readability-function-cognitive-complexity)
1903void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1904 detail::fmt::StreamStateRestorer const restorer(out);
1905
1906 out.precision(std::numeric_limits<double>::digits10);
1907 auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1908
1909 for (auto const& n : nodes) {
1910 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1911 switch (n.type) {
1912 case templates::Node::Type::content:
1913 out.write(n.begin, std::distance(n.begin, n.end));
1914 break;
1915
1916 case templates::Node::Type::inverted_section:
1917 throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
1918
1919 case templates::Node::Type::section:
1920 if (n == "result") {
1921 const size_t nbResults = results.size();
1922 for (size_t i = 0; i < nbResults; ++i) {
1923 generateResult(n.children, i, results, out);
1924 }
1925 } else if (n == "measurement") {
1926 if (results.size() != 1) {
1927 throw std::runtime_error(
1928 "render: can only use section 'measurement' here if there is a single result, but there are " +
1929 detail::fmt::to_s(results.size()));
1930 }
1931 // when we only have a single result, we can immediately go into its measurement.
1932 auto const& r = results.front();
1933 for (size_t i = 0; i < r.size(); ++i) {
1934 generateResultMeasurement(n.children, i, r, out);
1935 }
1936 } else {
1937 throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
1938 }
1939 break;
1940
1941 case templates::Node::Type::tag:
1942 if (results.size() == 1) {
1943 // result & config are both supported there
1944 generateResultTag(n, results.front(), out);
1945 } else {
1946 // This just uses the last result's config.
1947 if (!generateConfigTag(n, results.back().config(), out)) {
1948 throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
1949 }
1950 }
1951 break;
1952 }
1953 }
1954}
1955
1956void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1957 render(mustacheTemplate.c_str(), results, out);
1958}
1959
1960void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
1961 render(mustacheTemplate, bench.results(), out);
1962}
1963
1964void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
1965 render(mustacheTemplate.c_str(), bench.results(), out);
1966}
1967
1968namespace detail {
1969
1970PerformanceCounters& performanceCounters() {
1971# if defined(__clang__)
1972# pragma clang diagnostic push
1973# pragma clang diagnostic ignored "-Wexit-time-destructors"
1974# endif
1975 static PerformanceCounters pc;
1976# if defined(__clang__)
1977# pragma clang diagnostic pop
1978# endif
1979 return pc;
1980}
1981
1982// Windows version of doNotOptimizeAway
1983// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L514
1984// see https://github.com/facebook/folly/blob/v2023.01.30.00/folly/lang/Hint-inl.h#L54-L58
1985// see https://learn.microsoft.com/en-us/cpp/preprocessor/optimize
1986# if defined(_MSC_VER)
1987# pragma optimize("", off)
1988void doNotOptimizeAwaySink(void const*) {}
1989# pragma optimize("", on)
1990# endif
1991
1992template <typename T>
1993T parseFile(std::string const& filename, bool* fail) {
1994 std::ifstream fin(filename); // NOLINT(misc-const-correctness)
1995 T num{};
1996 fin >> num;
1997 if (fail != nullptr) {
1998 *fail = fin.fail();
1999 }
2000 return num;
2001}
2002
2003char const* getEnv(char const* name) {
2004# if defined(_MSC_VER)
2005# pragma warning(push)
2006# pragma warning(disable : 4996) // 'getenv': This function or variable may be unsafe.
2007# endif
2008 return std::getenv(name); // NOLINT(concurrency-mt-unsafe)
2009# if defined(_MSC_VER)
2010# pragma warning(pop)
2011# endif
2012}
2013
2014bool isEndlessRunning(std::string const& name) {
2015 auto const* const endless = getEnv("NANOBENCH_ENDLESS");
2016 return nullptr != endless && endless == name;
2017}
2018
2019// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
2020bool isWarningsEnabled() {
2021 auto const* const suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
2022 return nullptr == suppression || suppression == std::string("0");
2023}
2024
2025void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
2026 warnings.clear();
2027 recommendations.clear();
2028
2029# if defined(DEBUG)
2030 warnings.emplace_back("DEBUG defined");
2031 bool const recommendCheckFlags = true;
2032# else
2033 bool const recommendCheckFlags = false;
2034# endif
2035
2036 bool recommendPyPerf = false;
2037# if defined(__linux__)
2038 auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
2039 if (nprocs <= 0) {
2040 warnings.emplace_back("couldn't figure out the number of processors - no governor/turbo check possible");
2041 } else {
2042 // check frequency scaling
2043 for (long id = 0; id < nprocs; ++id) {
2044 auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
2045 auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
2046 auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq", nullptr);
2047 auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq", nullptr);
2048 if (minFreq != maxFreq) {
2049 auto minMHz = d(minFreq) / 1000.0;
2050 auto maxMHz = d(maxFreq) / 1000.0;
2051 warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
2052 detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
2053 " MHz");
2054 recommendPyPerf = true;
2055 break;
2056 }
2057 }
2058
2059 auto fail = false;
2060 auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", &fail);
2061 if (!fail && "performance" != currentGovernor) {
2062 warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
2063 recommendPyPerf = true;
2064 }
2065
2066 auto noTurbo = parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo", &fail);
2067 if (!fail && noTurbo == 0) {
2068 warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
2069 recommendPyPerf = true;
2070 }
2071 }
2072# endif
2073
2074 if (recommendCheckFlags) {
2075 recommendations.emplace_back("Make sure you compile for Release");
2076 }
2077 if (recommendPyPerf) {
2078 recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
2079 }
2080}
2081
2082void printStabilityInformationOnce(std::ostream* outStream) {
2083 static bool shouldPrint = true;
2084 if (shouldPrint && (nullptr != outStream) && isWarningsEnabled()) {
2085 auto& os = *outStream;
2086 shouldPrint = false;
2087 std::vector<std::string> warnings;
2088 std::vector<std::string> recommendations;
2089 gatherStabilityInformation(warnings, recommendations);
2090 if (warnings.empty()) {
2091 return;
2092 }
2093
2094 os << "Warning, results might be unstable:" << std::endl;
2095 for (auto const& w : warnings) {
2096 os << "* " << w << std::endl;
2097 }
2098
2099 os << std::endl << "Recommendations" << std::endl;
2100 for (auto const& r : recommendations) {
2101 os << "* " << r << std::endl;
2102 }
2103 }
2104}
2105
2106// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
2107uint64_t& singletonHeaderHash() noexcept {
2108 static uint64_t sHeaderHash{};
2109 return sHeaderHash;
2110}
2111
2112ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2113inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2114 return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2115}
2116
2117// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2118Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2119 auto bestDuration = Clock::duration::max();
2120 Clock::time_point tBegin;
2121 Clock::time_point tEnd;
2122 for (size_t i = 0; i < numEvaluations; ++i) {
2123 tBegin = Clock::now();
2124 do {
2125 tEnd = Clock::now();
2126 } while (tBegin == tEnd);
2127 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2128 }
2129 return bestDuration;
2130}
2131
2132// Calculates clock resolution once, and remembers the result
2133Clock::duration clockResolution() noexcept {
2134 static Clock::duration const sResolution = calcClockResolution(20);
2135 return sResolution;
2136}
2137
2138ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2139struct IterationLogic::Impl {
2140 enum class State { warmup, upscaling_runtime, measuring, endless };
2141
2142 explicit Impl(Bench const& bench)
2143 : mBench(bench)
2144 , mResult(bench.config()) {
2145 printStabilityInformationOnce(mBench.output());
2146
2147 // determine target runtime per epoch
2148 mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
2149 if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
2150 mTargetRuntimePerEpoch = mBench.maxEpochTime();
2151 }
2152 if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
2153 mTargetRuntimePerEpoch = mBench.minEpochTime();
2154 }
2155
2156 if (isEndlessRunning(mBench.name())) {
2157 std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
2158 mNumIters = (std::numeric_limits<uint64_t>::max)();
2159 mState = State::endless;
2160 } else if (0 != mBench.warmup()) {
2161 mNumIters = mBench.warmup();
2162 mState = State::warmup;
2163 } else if (0 != mBench.epochIterations()) {
2164 // exact number of iterations
2165 mNumIters = mBench.epochIterations();
2166 mState = State::measuring;
2167 } else {
2168 mNumIters = mBench.minEpochIterations();
2169 mState = State::upscaling_runtime;
2170 }
2171 }
2172
2173 // directly calculates new iters based on elapsed & iters, and adds roughly 10% noise. Makes sure we don't underflow.
2174 ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
2175 auto doubleElapsed = d(elapsed);
2176 auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
2177 auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);
2178
2179 auto doubleMinEpochIters = d(mBench.minEpochIterations());
2180 if (doubleNewIters < doubleMinEpochIters) {
2181 doubleNewIters = doubleMinEpochIters;
2182 }
2183 doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();
2184
2185 // +0.5 for correct rounding when casting
2186 // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2187 return static_cast<uint64_t>(doubleNewIters + 0.5);
2188 }
2189
2190 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
2191 if (elapsed * 10 < mTargetRuntimePerEpoch) {
2192 // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
2193 if (mNumIters * 10 < mNumIters) {
2194 // overflow :-(
2195 showResult("iterations overflow. Maybe your code got optimized away?");
2196 mNumIters = 0;
2197 return;
2198 }
2199 mNumIters *= 10;
2200 } else {
2201 mNumIters = calcBestNumIters(elapsed, mNumIters);
2202 }
2203 }
2204
2205 void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2206# if defined(ANKERL_NANOBENCH_LOG_ENABLED)
2207 auto oldIters = mNumIters;
2208# endif
2209
2210 switch (mState) {
2211 case State::warmup:
2212 if (isCloseEnoughForMeasurements(elapsed)) {
2213 // if elapsed is close enough, we can skip upscaling and go right to measurements
2214 // still, we don't add the result to the measurements.
2215 mState = State::measuring;
2216 mNumIters = calcBestNumIters(elapsed, mNumIters);
2217 } else {
2218 // not close enough: switch to upscaling
2219 mState = State::upscaling_runtime;
2220 upscale(elapsed);
2221 }
2222 break;
2223
2224 case State::upscaling_runtime:
2225 if (isCloseEnoughForMeasurements(elapsed)) {
2226 // if we are close enough, add measurement and switch to always measuring
2227 mState = State::measuring;
2228 mTotalElapsed += elapsed;
2229 mTotalNumIters += mNumIters;
2230 mResult.add(elapsed, mNumIters, pc);
2231 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2232 } else {
2233 upscale(elapsed);
2234 }
2235 break;
2236
2237 case State::measuring:
2238 // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
2239 // that fluctuation, or else we would bias the result
2240 mTotalElapsed += elapsed;
2241 mTotalNumIters += mNumIters;
2242 mResult.add(elapsed, mNumIters, pc);
2243 if (0 != mBench.epochIterations()) {
2244 mNumIters = mBench.epochIterations();
2245 } else {
2246 mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
2247 }
2248 break;
2249
2250 case State::endless:
2251 mNumIters = (std::numeric_limits<uint64_t>::max)();
2252 break;
2253 }
2254
2255 if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
2256 // we got all the results that we need, finish it
2257 showResult("");
2258 mNumIters = 0;
2259 }
2260
2261 ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, d(elapsed.count())) << " elapsed, "
2262 << detail::fmt::Number(20, 3, d(mTargetRuntimePerEpoch.count())) << " target. oldIters="
2263 << oldIters << ", mNumIters=" << mNumIters << ", mState=" << static_cast<int>(mState));
2264 }
2265
2266 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
2267 void showResult(std::string const& errorMessage) const {
2268 ANKERL_NANOBENCH_LOG(errorMessage);
2269
2270 if (mBench.output() != nullptr) {
2271 // prepare column data ///////
2272 std::vector<fmt::MarkDownColumn> columns;
2273
2274 auto rMedian = mResult.median(Result::Measure::elapsed);
2275
2276 if (mBench.relative()) {
2277 double d = 100.0;
2278 if (!mBench.results().empty()) {
2279 d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
2280 }
2281 columns.emplace_back(11, 1, "relative", "%", d);
2282 }
2283
2284 if (mBench.complexityN() > 0) {
2285 columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
2286 }
2287
2288 columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
2289 rMedian / (mBench.timeUnit().count() * mBench.batch()));
2290 columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);
2291
2292 double const rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
2293 columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);
2294
2295 double rInsMedian = -1.0;
2296 if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {
2297 rInsMedian = mResult.median(Result::Measure::instructions);
2298 columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
2299 }
2300
2301 double rCycMedian = -1.0;
2302 if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {
2303 rCycMedian = mResult.median(Result::Measure::cpucycles);
2304 columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
2305 }
2306 if (rInsMedian > 0.0 && rCycMedian > 0.0) {
2307 columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
2308 }
2309 if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {
2310 double const rBraMedian = mResult.median(Result::Measure::branchinstructions);
2311 columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
2312 if (mResult.has(Result::Measure::branchmisses)) {
2313 double p = 0.0;
2314 if (rBraMedian >= 1e-9) {
2315 p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
2316 }
2317 columns.emplace_back(10, 1, "miss%", "%", p);
2318 }
2319 }
2320
2321 columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));
2322
2323 // write everything
2324 auto& os = *mBench.output();
2325
2326 // combine all elements that are relevant for printing the header
2327 uint64_t hash = 0;
2328 hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
2329 hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
2330 hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
2331 hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
2332 hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
2333 hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);
2334
2335 if (hash != singletonHeaderHash()) {
2336 singletonHeaderHash() = hash;
2337
2338 // header not yet printed for this configuration, print it
2339 os << std::endl;
2340 for (auto const& col : columns) {
2341 os << col.title();
2342 }
2343 os << "| " << mBench.title() << std::endl;
2344
2345 for (auto const& col : columns) {
2346 os << col.separator();
2347 }
2348 os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
2349 }
2350
2351 if (!errorMessage.empty()) {
2352 for (auto const& col : columns) {
2353 os << col.invalid();
2354 }
2355 os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
2356 } else {
2357 for (auto const& col : columns) {
2358 os << col.value();
2359 }
2360 os << "| ";
2361 auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
2362 if (showUnstable) {
2363 os << ":wavy_dash: ";
2364 }
2365 os << fmt::MarkDownCode(mBench.name());
2366 if (showUnstable) {
2367 auto avgIters = d(mTotalNumIters) / d(mBench.epochs());
2368 // NOLINTNEXTLINE(bugprone-incorrect-roundings)
2369 auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);
2370
2371 os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
2372 << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
2373 }
2374 os << std::endl;
2375 }
2376 }
2377 }
2378
2379 ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
2380 return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
2381 }
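// Equivalent to elapsed >= (2/3) * mTargetRuntimePerEpoch: an epoch counts as measurable
// once it reaches at least two thirds of the target runtime. The comparison is written
// with multiplications so it stays in integer chrono arithmetic.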
2382
2383 uint64_t mNumIters = 1; // NOLINT(misc-non-private-member-variables-in-classes)
2384 Bench const& mBench; // NOLINT(misc-non-private-member-variables-in-classes)
2385 std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // NOLINT(misc-non-private-member-variables-in-classes)
2386 Result mResult; // NOLINT(misc-non-private-member-variables-in-classes)
2387 Rng mRng{123}; // NOLINT(misc-non-private-member-variables-in-classes)
2388 std::chrono::nanoseconds mTotalElapsed{}; // NOLINT(misc-non-private-member-variables-in-classes)
2389 uint64_t mTotalNumIters = 0; // NOLINT(misc-non-private-member-variables-in-classes)
2390 State mState = State::upscaling_runtime; // NOLINT(misc-non-private-member-variables-in-classes)
2391};
2392ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2393
2394IterationLogic::IterationLogic(Bench const& bench)
2395 : mPimpl(new Impl(bench)) {}
2396
2397IterationLogic::~IterationLogic() {
2398 delete mPimpl;
2399}
2400
2401uint64_t IterationLogic::numIters() const noexcept {
2402 ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
2403 return mPimpl->mNumIters;
2404}
2405
2406void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2407 mPimpl->add(elapsed, pc);
2408}
2409
2410void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2411 results.emplace_back(std::move(mPimpl->mResult));
2412}
2413
2414# if ANKERL_NANOBENCH(PERF_COUNTERS)
2415
2416ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
2417class LinuxPerformanceCounters {
2418public:
2419 struct Target {
2420 Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
2421 : targetValue(targetValue_)
2422 , correctMeasuringOverhead(correctMeasuringOverhead_)
2423 , correctLoopOverhead(correctLoopOverhead_) {}
2424
2425 uint64_t* targetValue{}; // NOLINT(misc-non-private-member-variables-in-classes)
2426 bool correctMeasuringOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
2427 bool correctLoopOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
2428 };
2429
2430 LinuxPerformanceCounters() = default;
2431 LinuxPerformanceCounters(LinuxPerformanceCounters const&) = delete;
2432 LinuxPerformanceCounters(LinuxPerformanceCounters&&) = delete;
2433 LinuxPerformanceCounters& operator=(LinuxPerformanceCounters const&) = delete;
2434 LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) = delete;
2435 ~LinuxPerformanceCounters();
2436
2437 // quick operation
2438 inline void start() {}
2439
2440 inline void stop() {}
2441
2442 bool monitor(perf_sw_ids swId, Target target);
2443 bool monitor(perf_hw_id hwId, Target target);
2444
2445 ANKERL_NANOBENCH(NODISCARD) bool hasError() const noexcept {
2446 return mHasError;
2447 }
2448
2449 // Just reading the data is faster than enabling & disabling the counters.
2450 // We subtract the calibrated overhead ourselves.
2451 inline void beginMeasure() {
2452 if (mHasError) {
2453 return;
2454 }
2455
2456 // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2457 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
2458 if (mHasError) {
2459 return;
2460 }
2461
2462 // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2463 mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
2464 }
2465
2466 inline void endMeasure() {
2467 if (mHasError) {
2468 return;
2469 }
2470
2471 // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2472 mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
2473 if (mHasError) {
2474 return;
2475 }
2476
2477 auto const numBytes = sizeof(uint64_t) * mCounters.size();
2478 auto ret = read(mFd, mCounters.data(), numBytes);
2479 mHasError = ret != static_cast<ssize_t>(numBytes);
2480 }
2481
2482 void updateResults(uint64_t numIters);
2483
2484 // rounded integer division
2485 template <typename T>
2486 static inline T divRounded(T a, T divisor) {
2487 return (a + divisor / 2) / divisor;
2488 }
2489
2490 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2491 static inline uint32_t mix(uint32_t x) noexcept {
2492 x ^= x << 13U;
2493 x ^= x >> 17U;
2494 x ^= x << 5U;
2495 return x;
2496 }
2497
2498 template <typename Op>
2499 ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2500 void calibrate(Op&& op) {
2501 // clear the current calibration data
2502 for (auto& v : mCalibratedOverhead) {
2503 v = UINT64_C(0);
2504 }
2505
2506 // create new calibration data
2507 auto newCalibration = mCalibratedOverhead;
2508 for (auto& v : newCalibration) {
2509 v = (std::numeric_limits<uint64_t>::max)();
2510 }
2511 for (size_t iter = 0; iter < 100; ++iter) {
2512 beginMeasure();
2513 op();
2514 endMeasure();
2515 if (mHasError) {
2516 return;
2517 }
2518
2519 for (size_t i = 0; i < newCalibration.size(); ++i) {
2520 auto diff = mCounters[i];
2521 if (newCalibration[i] > diff) {
2522 newCalibration[i] = diff;
2523 }
2524 }
2525 }
2526
2527 mCalibratedOverhead = std::move(newCalibration);
2528
2529 {
2530 // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
2531 // Marsaglia's xorshift: mov, sal/shr, xor. Times 3.
2532 // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
2533 // see https://godbolt.org/z/49RVQ5
2534 uint64_t const numIters = 100000U + (std::random_device{}() & 3U);
2535 uint64_t n = numIters;
2536 uint32_t x = 1234567;
2537
2538 beginMeasure();
2539 while (n-- > 0) {
2540 x = mix(x);
2541 }
2542 endMeasure();
2543 detail::doNotOptimizeAway(x);
2544 auto measure1 = mCounters;
2545
2546 n = numIters;
2547 beginMeasure();
2548 while (n-- > 0) {
2549 // we now run *twice* so we can easily calculate the overhead
2550 x = mix(x);
2551 x = mix(x);
2552 }
2553 endMeasure();
2554 detail::doNotOptimizeAway(x);
2555 auto measure2 = mCounters;
2556
2557 for (size_t i = 0; i < mCounters.size(); ++i) {
2558 // factor 2 because the second loop runs mix() twice per iteration
2559 auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
2560 auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
2561 auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;
2562
2563 mLoopOverhead[i] = divRounded(overhead, numIters);
2564 }
2565 }
2566 }
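// Why "m1 * 2 - m2" above: with L the per-iteration loop overhead and M the cost of one
// mix() call, measure1 is roughly L + M and measure2 roughly L + 2*M per iteration, so
// 2*measure1 - measure2 ~ L. The subtractions are clamped at 0 because the counters are
// noisy and could otherwise underflow.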
2567
2568private:
2569 bool monitor(uint32_t type, uint64_t eventid, Target target);
2570
2571 std::map<uint64_t, Target> mIdToTarget{};
2572
2573 // start with minimum size of 3 for read_format
2574 std::vector<uint64_t> mCounters{3};
2575 std::vector<uint64_t> mCalibratedOverhead{3};
2576 std::vector<uint64_t> mLoopOverhead{3};
2577
2578 uint64_t mTimeEnabledNanos = 0;
2579 uint64_t mTimeRunningNanos = 0;
2580 int mFd = -1;
2581 bool mHasError = false;
2582};
2583ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2584
2585LinuxPerformanceCounters::~LinuxPerformanceCounters() {
2586 if (-1 != mFd) {
2587 close(mFd);
2588 }
2589}
2590
2591bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
2592 return monitor(PERF_TYPE_SOFTWARE, swId, target);
2593}
2594
2595bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
2596 return monitor(PERF_TYPE_HARDWARE, hwId, target);
2597}
2598
2599// overflow is ok, it's checked
2600ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2601void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
2602 // clear old data
2603 for (auto& id_value : mIdToTarget) {
2604 *id_value.second.targetValue = UINT64_C(0);
2605 }
2606
2607 if (mHasError) {
2608 return;
2609 }
2610
2611 mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
2612 mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];
2613
2614 for (uint64_t i = 0; i < mCounters[0]; ++i) {
2615 auto idx = static_cast<size_t>(3 + i * 2 + 0);
2616 auto id = mCounters[idx + 1U];
2617
2618 auto it = mIdToTarget.find(id);
2619 if (it != mIdToTarget.end()) {
2620
2621 auto& tgt = it->second;
2622 *tgt.targetValue = mCounters[idx];
2623 if (tgt.correctMeasuringOverhead) {
2624 if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
2625 *tgt.targetValue -= mCalibratedOverhead[idx];
2626 } else {
2627 *tgt.targetValue = 0U;
2628 }
2629 }
2630 if (tgt.correctLoopOverhead) {
2631 auto correctionVal = mLoopOverhead[idx] * numIters;
2632 if (*tgt.targetValue >= correctionVal) {
2633 *tgt.targetValue -= correctionVal;
2634 } else {
2635 *tgt.targetValue = 0U;
2636 }
2637 }
2638 }
2639 }
2640}
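// Layout assumed by the indexing above: with PERF_FORMAT_GROUP | PERF_FORMAT_ID |
// PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING (set in monitor() below),
// read() fills mCounters as
//   [0] number of events, [1] time_enabled, [2] time_running,
//   [3 + 2*i] value of event i, [4 + 2*i] id of event i,
// which is why the buffers are sized 3 + 2 * mIdToTarget.size().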
2641
2642bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
2643 *target.targetValue = (std::numeric_limits<uint64_t>::max)();
2644 if (mHasError) {
2645 return false;
2646 }
2647
2648 auto pea = perf_event_attr();
2649 std::memset(&pea, 0, sizeof(perf_event_attr));
2650 pea.type = type;
2651 pea.size = sizeof(perf_event_attr);
2652 pea.config = eventid;
2653 pea.disabled = 1; // start counter as disabled
2654 pea.exclude_kernel = 1;
2655 pea.exclude_hv = 1;
2656
2657 // NOLINTNEXTLINE(hicpp-signed-bitwise)
2658 pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
2659
2660 const int pid = 0; // the current process
2661 const int cpu = -1; // all CPUs
2662# if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
2663 const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
2664# else
2665 const unsigned long flags = 0;
2666# endif
2667
2668 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
2669 auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
2670 if (-1 == fd) {
2671 return false;
2672 }
2673 if (-1 == mFd) {
2674 // first call: set to fd, and use this from now on
2675 mFd = fd;
2676 }
2677 uint64_t id = 0;
2678 // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
2679 if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
2680 // couldn't get id
2681 return false;
2682 }
2683
2684 // insert into the map; std::map references stay valid across later inserts.
2685 mIdToTarget.emplace(id, target);
2686
2687 // prepare the read_format buffers with the correct size (after the insert)
2688 auto size = 3 + 2 * mIdToTarget.size();
2689 mCounters.resize(size);
2690 mCalibratedOverhead.resize(size);
2691 mLoopOverhead.resize(size);
2692
2693 return true;
2694}
2695
2696PerformanceCounters::PerformanceCounters()
2697 : mPc(new LinuxPerformanceCounters())
2698 , mVal()
2699 , mHas() {
2700
2701 // HW events
2702 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
2703 if (!mHas.cpuCycles) {
2704 // Fall back to the plain cycles counter; reference cycles are not available on many systems.
2705 mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
2706 }
2707 mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
2708 mHas.branchInstructions =
2709 mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
2710 mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
2711 // mHas.branchMisses = false;
2712
2713 // SW events
2714 mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
2715 mHas.contextSwitches =
2716 mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));
2717
2718 mPc->start();
2719 mPc->calibrate([] {
2720 auto before = ankerl::nanobench::Clock::now();
2721 auto after = ankerl::nanobench::Clock::now();
2722 (void)before;
2723 (void)after;
2724 });
2725
2726 if (mPc->hasError()) {
2727 // something failed, don't monitor anything.
2728 mHas = PerfCountSet<bool>{};
2729 }
2730}
2731
2732PerformanceCounters::~PerformanceCounters() {
2733 // no need to check for nullptr, delete nullptr has no effect
2734 delete mPc;
2735}
2736
2737void PerformanceCounters::beginMeasure() {
2738 mPc->beginMeasure();
2739}
2740
2741void PerformanceCounters::endMeasure() {
2742 mPc->endMeasure();
2743}
2744
2745void PerformanceCounters::updateResults(uint64_t numIters) {
2746 mPc->updateResults(numIters);
2747}
2748
2749# else
2750
2751PerformanceCounters::PerformanceCounters() = default;
2752PerformanceCounters::~PerformanceCounters() = default;
2753void PerformanceCounters::beginMeasure() {}
2754void PerformanceCounters::endMeasure() {}
2755void PerformanceCounters::updateResults(uint64_t) {}
2756
2757# endif
2758
2759ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
2760 return mVal;
2761}
2762ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
2763 return mHas;
2764}
2765
2766// formatting utilities
2767namespace fmt {
2768
2769// adds thousands separator to numbers
2770NumSep::NumSep(char sep)
2771 : mSep(sep) {}
2772
2773char NumSep::do_thousands_sep() const {
2774 return mSep;
2775}
2776
2777std::string NumSep::do_grouping() const {
2778 return "\003";
2779}
2780
2781// RAII to save & restore a stream's state
2782StreamStateRestorer::StreamStateRestorer(std::ostream& s)
2783 : mStream(s)
2784 , mLocale(s.getloc())
2785 , mPrecision(s.precision())
2786 , mWidth(s.width())
2787 , mFill(s.fill())
2788 , mFmtFlags(s.flags()) {}
2789
2790StreamStateRestorer::~StreamStateRestorer() {
2791 restore();
2792}
2793
2794// sets back all stream info that we remembered at construction
2795void StreamStateRestorer::restore() {
2796 mStream.imbue(mLocale);
2797 mStream.precision(mPrecision);
2798 mStream.width(mWidth);
2799 mStream.fill(mFill);
2800 mStream.flags(mFmtFlags);
2801}
2802
2803Number::Number(int width, int precision, int64_t value)
2804 : mWidth(width)
2805 , mPrecision(precision)
2806 , mValue(d(value)) {}
2807
2808Number::Number(int width, int precision, double value)
2809 : mWidth(width)
2810 , mPrecision(precision)
2811 , mValue(value) {}
2812
2813std::ostream& Number::write(std::ostream& os) const {
2814 StreamStateRestorer const restorer(os);
2815 os.imbue(std::locale(os.getloc(), new NumSep(',')));
2816 os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
2817 return os;
2818}
2819
2820std::string Number::to_s() const {
2821 std::stringstream ss;
2822 write(ss);
2823 return ss.str();
2824}
2825
2826std::string to_s(uint64_t n) {
2827 std::string str;
2828 do {
2829 str += static_cast<char>('0' + static_cast<char>(n % 10));
2830 n /= 10;
2831 } while (n != 0);
2832 std::reverse(str.begin(), str.end());
2833 return str;
2834}
2835
2836std::ostream& operator<<(std::ostream& os, Number const& n) {
2837 return n.write(os);
2838}
2839
2840MarkDownColumn::MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept
2841 : mWidth(w)
2842 , mPrecision(prec)
2843 , mTitle(std::move(tit))
2844 , mSuffix(std::move(suff))
2845 , mValue(val) {}
2846
2847std::string MarkDownColumn::title() const {
2848 std::stringstream ss;
2849 ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
2850 return ss.str();
2851}
2852
2853std::string MarkDownColumn::separator() const {
2854 std::string sep(static_cast<size_t>(mWidth), '-');
2855 sep.front() = '|';
2856 sep.back() = ':';
2857 return sep;
2858}
2859
2860std::string MarkDownColumn::invalid() const {
2861 std::string sep(static_cast<size_t>(mWidth), ' ');
2862 sep.front() = '|';
2863 sep[sep.size() - 2] = '-';
2864 return sep;
2865}
2866
2867std::string MarkDownColumn::value() const {
2868 std::stringstream ss;
2869 auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
2870 ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
2871 return ss.str();
2872}
2873
2874// Formats any text as markdown code, escaping backticks.
2875MarkDownCode::MarkDownCode(std::string const& what) {
2876 mWhat.reserve(what.size() + 2);
2877 mWhat.push_back('`');
2878 for (char const c : what) {
2879 mWhat.push_back(c);
2880 if ('`' == c) {
2881 mWhat.push_back('`');
2882 }
2883 }
2884 mWhat.push_back('`');
2885}
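// Example: MarkDownCode("a`b") stores "`a``b`": the text is wrapped in backticks and any
// backtick inside it is doubled.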
2886
2887std::ostream& MarkDownCode::write(std::ostream& os) const {
2888 return os << mWhat;
2889}
2890
2891std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
2892 return mdCode.write(os);
2893}
2894} // namespace fmt
2895} // namespace detail
2896
2897// provide implementation here so it's only generated once
2898Config::Config() = default;
2899Config::~Config() = default;
2900Config& Config::operator=(Config const&) = default;
2901Config& Config::operator=(Config&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
2902Config::Config(Config const&) = default;
2903Config::Config(Config&&) noexcept = default;
2904
2905// provide implementation here so it's only generated once
2906Result::~Result() = default;
2907Result& Result::operator=(Result const&) = default;
2908Result& Result::operator=(Result&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
2909Result::Result(Result const&) = default;
2910Result::Result(Result&&) noexcept = default;
2911
2912namespace detail {
2913template <typename T>
2914inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
2915 return static_cast<typename std::underlying_type<T>::type>(val);
2916}
2917} // namespace detail
2918
2919// Result returned after a benchmark has finished. Can be used as a baseline for relative().
2920Result::Result(Config benchmarkConfig)
2921 : mConfig(std::move(benchmarkConfig))
2922 , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
2923
2924void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
2925 using detail::d;
2926 using detail::u;
2927
2928 double const dIters = d(iters);
2929 mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);
2930
2931 mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
2932 if (pc.has().pageFaults) {
2933 mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
2934 }
2935 if (pc.has().cpuCycles) {
2936 mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
2937 }
2938 if (pc.has().contextSwitches) {
2939 mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
2940 }
2941 if (pc.has().instructions) {
2942 mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
2943 }
2944 if (pc.has().branchInstructions) {
2945 double branchInstructions = 0.0;
2946 // correcting branches: remove branch introduced by the while (...) loop for each iteration.
2947 if (pc.val().branchInstructions > iters + 1U) {
2948 branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
2949 }
2950 mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);
2951
2952 if (pc.has().branchMisses) {
2953 // correcting branch misses
2954 double branchMisses = d(pc.val().branchMisses);
2955 if (branchMisses > branchInstructions) {
2956 // can't have branch misses when there were branches...
2957 branchMisses = branchInstructions;
2958 }
2959
2960 // assuming at least one missed branch for the loop
2961 branchMisses -= 1.0;
2962 if (branchMisses < 1.0) {
2963 branchMisses = 1.0;
2964 }
2965 mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
2966 }
2967 }
2968}
2969
2970Config const& Result::config() const noexcept {
2971 return mConfig;
2972}
2973
2974inline double calcMedian(std::vector<double>& data) {
2975 if (data.empty()) {
2976 return 0.0;
2977 }
2978 std::sort(data.begin(), data.end());
2979
2980 auto midIdx = data.size() / 2U;
2981 if (1U == (data.size() & 1U)) {
2982 return data[midIdx];
2983 }
2984 return (data[midIdx - 1U] + data[midIdx]) / 2U;
2985}
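// Example: {5.0, 1.0, 3.0} sorts to {1, 3, 5} and yields 3.0; {4.0, 1.0, 3.0, 2.0} sorts
// to {1, 2, 3, 4} and yields (2 + 3) / 2 = 2.5.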
2986
2987double Result::median(Measure m) const {
2988 // create a copy so we can sort
2989 auto data = mNameToMeasurements[detail::u(m)];
2990 return calcMedian(data);
2991}
2992
2993double Result::average(Measure m) const {
2994 using detail::d;
2995 auto const& data = mNameToMeasurements[detail::u(m)];
2996 if (data.empty()) {
2997 return 0.0;
2998 }
2999
3000 // arithmetic mean over all measurements
3001 return sum(m) / d(data.size());
3002}
3003
3004double Result::medianAbsolutePercentError(Measure m) const {
3005 // create copy
3006 auto data = mNameToMeasurements[detail::u(m)];
3007
3008 // calculates MdAPE, the median of the absolute percentage errors
3009 // see https://support.numxl.com/hc/en-us/articles/115001223503-MdAPE-Median-Absolute-Percentage-Error
3010 auto med = calcMedian(data);
3011
3012 // transform the data to absolute error
3013 for (auto& x : data) {
3014 x = (x - med) / x;
3015 if (x < 0) {
3016 x = -x;
3017 }
3018 }
3019 return calcMedian(data);
3020}
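// In formula form: MdAPE = median_i( |x_i - median(x)| / x_i ). Note that each error is
// divided by the sample x_i itself, not by the median.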
3021
3022double Result::sum(Measure m) const noexcept {
3023 auto const& data = mNameToMeasurements[detail::u(m)];
3024 return std::accumulate(data.begin(), data.end(), 0.0);
3025}
3026
3027double Result::sumProduct(Measure m1, Measure m2) const noexcept {
3028 auto const& data1 = mNameToMeasurements[detail::u(m1)];
3029 auto const& data2 = mNameToMeasurements[detail::u(m2)];
3030
3031 if (data1.size() != data2.size()) {
3032 return 0.0;
3033 }
3034
3035 double result = 0.0;
3036 for (size_t i = 0, s = data1.size(); i != s; ++i) {
3037 result += data1[i] * data2[i];
3038 }
3039 return result;
3040}
3041
3042bool Result::has(Measure m) const noexcept {
3043 return !mNameToMeasurements[detail::u(m)].empty();
3044}
3045
3046double Result::get(size_t idx, Measure m) const {
3047 auto const& data = mNameToMeasurements[detail::u(m)];
3048 return data.at(idx);
3049}
3050
3051bool Result::empty() const noexcept {
3052 return 0U == size();
3053}
3054
3055size_t Result::size() const noexcept {
3056 auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
3057 return data.size();
3058}
3059
3060double Result::minimum(Measure m) const noexcept {
3061 auto const& data = mNameToMeasurements[detail::u(m)];
3062 if (data.empty()) {
3063 return 0.0;
3064 }
3065
3066 // here it's safe to assume that at least one element is there
3067 return *std::min_element(data.begin(), data.end());
3068}
3069
3070double Result::maximum(Measure m) const noexcept {
3071 auto const& data = mNameToMeasurements[detail::u(m)];
3072 if (data.empty()) {
3073 return 0.0;
3074 }
3075
3076 // here it's safe to assume that at least one element is there
3077 return *std::max_element(data.begin(), data.end());
3078}
3079
3080std::string const& Result::context(char const* variableName) const {
3081 return mConfig.mContext.at(variableName);
3082}
3083
3084std::string const& Result::context(std::string const& variableName) const {
3085 return mConfig.mContext.at(variableName);
3086}
3087
3088Result::Measure Result::fromString(std::string const& str) {
3089 if (str == "elapsed") {
3090 return Measure::elapsed;
3091 }
3092 if (str == "iterations") {
3093 return Measure::iterations;
3094 }
3095 if (str == "pagefaults") {
3096 return Measure::pagefaults;
3097 }
3098 if (str == "cpucycles") {
3099 return Measure::cpucycles;
3100 }
3101 if (str == "contextswitches") {
3102 return Measure::contextswitches;
3103 }
3104 if (str == "instructions") {
3105 return Measure::instructions;
3106 }
3107 if (str == "branchinstructions") {
3108 return Measure::branchinstructions;
3109 }
3110 if (str == "branchmisses") {
3111 return Measure::branchmisses;
3112 }
3113 // not found, return _size
3114 return Measure::_size;
3115}
3116
3117// Configuration of a microbenchmark.
3118Bench::Bench() {
3119 mConfig.mOut = &std::cout;
3120}
3121
3122Bench::Bench(Bench&&) noexcept = default;
3123Bench& Bench::operator=(Bench&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
3124Bench::Bench(Bench const&) = default;
3125Bench& Bench::operator=(Bench const&) = default;
3126Bench::~Bench() noexcept = default;
3127
3128double Bench::batch() const noexcept {
3129 return mConfig.mBatch;
3130}
3131
3132double Bench::complexityN() const noexcept {
3133 return mConfig.mComplexityN;
3134}
3135
3136 // Enables reporting relative to a baseline (the first benchmark result). 100% means it is exactly as fast as the
3137 // baseline, >100% means it is faster than the baseline, <100% means it is slower than the baseline.
3138Bench& Bench::relative(bool isRelativeEnabled) noexcept {
3139 mConfig.mIsRelative = isRelativeEnabled;
3140 return *this;
3141}
3142bool Bench::relative() const noexcept {
3143 return mConfig.mIsRelative;
3144}
3145
3146Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
3147 mConfig.mShowPerformanceCounters = showPerformanceCounters;
3148 return *this;
3149}
3150bool Bench::performanceCounters() const noexcept {
3151 return mConfig.mShowPerformanceCounters;
3152}
3153
3154// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
3155// If u differs from currently set unit, the stored results will be cleared.
3156// Use singular (byte, not bytes).
3157Bench& Bench::unit(char const* u) {
3158 if (u != mConfig.mUnit) {
3159 mResults.clear();
3160 }
3161 mConfig.mUnit = u;
3162 return *this;
3163}
3164
3165Bench& Bench::unit(std::string const& u) {
3166 return unit(u.c_str());
3167}
3168
3169std::string const& Bench::unit() const noexcept {
3170 return mConfig.mUnit;
3171}
3172
3173Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
3174 mConfig.mTimeUnit = tu;
3175 mConfig.mTimeUnitName = tuName;
3176 return *this;
3177}
3178
3179std::string const& Bench::timeUnitName() const noexcept {
3180 return mConfig.mTimeUnitName;
3181}
3182
3183std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
3184 return mConfig.mTimeUnit;
3185}
3186
3187// If benchmarkTitle differs from currently set title, the stored results will be cleared.
3188Bench& Bench::title(const char* benchmarkTitle) {
3189 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3190 mResults.clear();
3191 }
3192 mConfig.mBenchmarkTitle = benchmarkTitle;
3193 return *this;
3194}
3195Bench& Bench::title(std::string const& benchmarkTitle) {
3196 if (benchmarkTitle != mConfig.mBenchmarkTitle) {
3197 mResults.clear();
3198 }
3199 mConfig.mBenchmarkTitle = benchmarkTitle;
3200 return *this;
3201}
3202
3203std::string const& Bench::title() const noexcept {
3204 return mConfig.mBenchmarkTitle;
3205}
3206
3207Bench& Bench::name(const char* benchmarkName) {
3208 mConfig.mBenchmarkName = benchmarkName;
3209 return *this;
3210}
3211
3212Bench& Bench::name(std::string const& benchmarkName) {
3213 mConfig.mBenchmarkName = benchmarkName;
3214 return *this;
3215}
3216
3217std::string const& Bench::name() const noexcept {
3218 return mConfig.mBenchmarkName;
3219}
3220
3221Bench& Bench::context(char const* variableName, char const* variableValue) {
3222 mConfig.mContext[variableName] = variableValue;
3223 return *this;
3224}
3225
3226Bench& Bench::context(std::string const& variableName, std::string const& variableValue) {
3227 mConfig.mContext[variableName] = variableValue;
3228 return *this;
3229}
3230
3231Bench& Bench::clearContext() {
3232 mConfig.mContext.clear();
3233 return *this;
3234}
3235
3236 // Number of epochs to evaluate. The reported result is the median over all epochs.
3237Bench& Bench::epochs(size_t numEpochs) noexcept {
3238 mConfig.mNumEpochs = numEpochs;
3239 return *this;
3240}
3241size_t Bench::epochs() const noexcept {
3242 return mConfig.mNumEpochs;
3243}
3244
3245 // The desired evaluation time per epoch is a multiple of the clock resolution. The default is 1000 times the measured clock precision.
3246Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
3247 mConfig.mClockResolutionMultiple = multiple;
3248 return *this;
3249}
3250size_t Bench::clockResolutionMultiple() const noexcept {
3251 return mConfig.mClockResolutionMultiple;
3252}
3253
3254// Sets the maximum time each epoch should take. Default is 100ms.
3255Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
3256 mConfig.mMaxEpochTime = t;
3257 return *this;
3258}
3259std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
3260 return mConfig.mMaxEpochTime;
3261}
3262
3263 // Sets the minimum time each epoch should take.
3264Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
3265 mConfig.mMinEpochTime = t;
3266 return *this;
3267}
3268std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
3269 return mConfig.mMinEpochTime;
3270}
3271
3272Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
3273 mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
3274 return *this;
3275}
3276uint64_t Bench::minEpochIterations() const noexcept {
3277 return mConfig.mMinEpochIterations;
3278}
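// Usage sketch (benchmark name and iteration count are hypothetical): when a result is
// reported as unstable, the suggested value can be applied directly, e.g.
//   ankerl::nanobench::Bench().minEpochIterations(50000).run("my op", [&] { /* ... */ });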
3279
3280Bench& Bench::epochIterations(uint64_t numIters) noexcept {
3281 mConfig.mEpochIterations = numIters;
3282 return *this;
3283}
3284uint64_t Bench::epochIterations() const noexcept {
3285 return mConfig.mEpochIterations;
3286}
3287
3288Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
3289 mConfig.mWarmup = numWarmupIters;
3290 return *this;
3291}
3292uint64_t Bench::warmup() const noexcept {
3293 return mConfig.mWarmup;
3294}
3295
3296Bench& Bench::config(Config const& benchmarkConfig) {
3297 mConfig = benchmarkConfig;
3298 return *this;
3299}
3300Config const& Bench::config() const noexcept {
3301 return mConfig;
3302}
3303
3304Bench& Bench::output(std::ostream* outstream) noexcept {
3305 mConfig.mOut = outstream;
3306 return *this;
3307}
3308
3309ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
3310 return mConfig.mOut;
3311}
3312
3313std::vector<Result> const& Bench::results() const noexcept {
3314 return mResults;
3315}
3316
3317Bench& Bench::render(char const* templateContent, std::ostream& os) {
3318 ::ankerl::nanobench::render(templateContent, *this, os);
3319 return *this;
3320}
3321
3322Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
3323 ::ankerl::nanobench::render(templateContent, *this, os);
3324 return *this;
3325}
3326
3327std::vector<BigO> Bench::complexityBigO() const {
3328 std::vector<BigO> bigOs;
3329 auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3330 bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
3331 return 1.0;
3332 });
3333 bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
3334 return n;
3335 });
3336 bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
3337 return std::log2(n);
3338 });
3339 bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
3340 return n * std::log2(n);
3341 });
3342 bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
3343 return n * n;
3344 });
3345 bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
3346 return n * n * n;
3347 });
3348 std::sort(bigOs.begin(), bigOs.end());
3349 return bigOs;
3350}
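// Usage sketch (loop bounds and workload are hypothetical; requires <numeric> and <vector>):
// collect results with varying complexityN, then fit them:
//   ankerl::nanobench::Bench bench;
//   for (auto n : {100U, 200U, 400U, 800U}) {
//       std::vector<uint64_t> v(n);
//       bench.complexityN(n).run("fill", [&] { std::iota(v.begin(), v.end(), uint64_t{0}); });
//   }
//   std::cout << bench.complexityBigO() << std::endl;
// The operator<< overload for std::vector<BigO> near the end of this file renders the
// fitted candidates as a small table, best fit (lowest rms) first.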
3351
3352Rng::Rng()
3353 : mX(0)
3354 , mY(0) {
3355 std::random_device rd;
3356 std::uniform_int_distribution<uint64_t> dist;
3357 do {
3358 mX = dist(rd);
3359 mY = dist(rd);
3360 } while (mX == 0 && mY == 0);
3361}
3362
3363ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
3364uint64_t splitMix64(uint64_t& state) noexcept {
3365 uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3366 z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3367 z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3368 return z ^ (z >> 31U);
3369}
3370
3371 // Seeded as described in the Romu paper (update April 2020)
3372Rng::Rng(uint64_t seed) noexcept
3373 : mX(splitMix64(seed))
3374 , mY(splitMix64(seed)) {
3375 for (size_t i = 0; i < 10; ++i) {
3376 operator()();
3377 }
3378}
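// Note: splitMix64 takes its state by reference and advances it, so mX and mY above are
// seeded with two different values even though the same expression appears twice.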
3379
3380// only internally used to copy the RNG.
3381Rng::Rng(uint64_t x, uint64_t y) noexcept
3382 : mX(x)
3383 , mY(y) {}
3384
3385Rng Rng::copy() const noexcept {
3386 return Rng{mX, mY};
3387}
3388
3389Rng::Rng(std::vector<uint64_t> const& data)
3390 : mX(0)
3391 , mY(0) {
3392 if (data.size() != 2) {
3393 throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
3394 detail::fmt::to_s(data.size()));
3395 }
3396 mX = data[0];
3397 mY = data[1];
3398}
3399
3400std::vector<uint64_t> Rng::state() const {
3401 std::vector<uint64_t> data(2);
3402 data[0] = mX;
3403 data[1] = mY;
3404 return data;
3405}
3406
3407BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
3408 BigO::RangeMeasure rangeMeasure;
3409 for (auto const& result : results) {
3410 if (result.config().mComplexityN > 0.0) {
3411 rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
3412 }
3413 }
3414 return rangeMeasure;
3415}
3416
3417BigO::BigO(std::string bigOName, RangeMeasure const& rangeMeasure)
3418 : mName(std::move(bigOName)) {
3419
3420 // estimate the constant factor
3421 double sumRangeMeasure = 0.0;
3422 double sumRangeRange = 0.0;
3423
3424 for (const auto& rm : rangeMeasure) {
3425 sumRangeMeasure += rm.first * rm.second;
3426 sumRangeRange += rm.first * rm.first;
3427 }
3428 mConstant = sumRangeMeasure / sumRangeRange;
3429
3430 // calculate the root mean square error, normalized by the mean measurement
3431 double err = 0.0;
3432 double sumMeasure = 0.0;
3433 for (const auto& rm : rangeMeasure) {
3434 auto diff = mConstant * rm.first - rm.second;
3435 err += diff * diff;
3436
3437 sumMeasure += rm.second;
3438 }
3439
3440 auto n = detail::d(rangeMeasure.size());
3441 auto mean = sumMeasure / n;
3442 mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
3443}
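// The constructor above is a least-squares fit of time ~ constant * f(n) through the
// origin: constant = sum(f(n_i) * t_i) / sum(f(n_i)^2), where each pair in rangeMeasure
// already holds (f(n_i), t_i) because the templated constructors map the raw complexityN
// values through the candidate function first. The reported error is the root mean square
// of the residuals, divided by the mean measured time.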
3444
3445BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
3446 : BigO(std::string(bigOName), rangeMeasure) {}
3447
3448std::string const& BigO::name() const noexcept {
3449 return mName;
3450}
3451
3452double BigO::constant() const noexcept {
3453 return mConstant;
3454}
3455
3456double BigO::normalizedRootMeanSquare() const noexcept {
3457 return mNormalizedRootMeanSquare;
3458}
3459
3460bool BigO::operator<(BigO const& other) const noexcept {
3461 return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
3462}
3463
3464std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
3465 return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
3466}
3467
3468std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
3469 detail::fmt::StreamStateRestorer const restorer(os);
3470 os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
3471 for (auto const& bigO : bigOs) {
3472 os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
3473 os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
3474 os << "| " << bigO.name();
3475 os << std::endl;
3476 }
3477 return os;
3478}
3479
3480} // namespace nanobench
3481} // namespace ankerl
3482
3483#endif // ANKERL_NANOBENCH_IMPLEMENT
3484#endif // ANKERL_NANOBENCH_H_INCLUDED