Bitcoin Core 31.99.0
P2P Digital Currency
nanobench.h
Go to the documentation of this file.
1// __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2// | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3// | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4//
5// Microbenchmark framework for C++11/14/17/20
6// https://github.com/martinus/nanobench
7//
8// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9// SPDX-License-Identifier: MIT
10// Copyright (c) 2019-2023 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
11//
12// Permission is hereby granted, free of charge, to any person obtaining a copy
13// of this software and associated documentation files (the "Software"), to deal
14// in the Software without restriction, including without limitation the rights
15// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16// copies of the Software, and to permit persons to whom the Software is
17// furnished to do so, subject to the following conditions:
18//
19// The above copyright notice and this permission notice shall be included in all
20// copies or substantial portions of the Software.
21//
22// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28// SOFTWARE.
29
30#ifndef ANKERL_NANOBENCH_H_INCLUDED
31#define ANKERL_NANOBENCH_H_INCLUDED
32
33// see https://semver.org/
34#define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35#define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
36#define ANKERL_NANOBENCH_VERSION_PATCH 11 // backwards-compatible bug fixes
37
39// public facing api - as minimal as possible
41
42#include <chrono> // high_resolution_clock
43#include <cstring> // memcpy
44#include <iosfwd> // for std::ostream* custom output target in Config
45#include <string> // all names
46#include <unordered_map> // holds context information of results
47#include <vector> // holds all results
48
49#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
50
51#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
52#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
53#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
54#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
55#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
56
57#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
58# define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
59#else
60# define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
61#endif
62
63#if defined(__clang__)
64# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
65 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
66# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
67#else
68# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
69# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
70#endif
71
72#if defined(__GNUC__)
73# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
74# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
75#else
76# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
77# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
78#endif
79
80#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
81# include <iostream>
82# define ANKERL_NANOBENCH_LOG(x) \
83 do { \
84 std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
85 } while (0)
86#else
87# define ANKERL_NANOBENCH_LOG(x) \
88 do { \
89 } while (0)
90#endif
91
92#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
93#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
94# include <linux/version.h>
95# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
96// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
97// PERF_FLAG_FD_CLOEXEC since kernel 3.14
98# undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
99# define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
100# endif
101#endif
102
103#if defined(__clang__)
104# define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
105#else
106# define ANKERL_NANOBENCH_NO_SANITIZE(...)
107#endif
108
109#if defined(_MSC_VER)
110# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
111#else
112# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
113#endif
114
115// workaround missing "is_trivially_copyable" in g++ < 5.0
116// See https://stackoverflow.com/a/31798726/48181
117#if defined(__GNUC__) && __GNUC__ < 5
118# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
119#else
120# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
121#endif
122
123// noexcept may be missing for std::string.
124// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58265
125#define ANKERL_NANOBENCH_PRIVATE_NOEXCEPT_STRING_MOVE() std::is_nothrow_move_assignable<std::string>::value
126
127// declarations ///////////////////////////////////////////////////////////////////////////////////
128
129namespace ankerl {
130namespace nanobench {
131
132using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
133 std::chrono::steady_clock>::type;
134class Bench;
135struct Config;
136class Result;
137class Rng;
138class BigO;
139
140namespace detail {
141template <typename SetupOp>
142class SetupRunner;
143} // namespace detail
144
295void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
296void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
297
306void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
307void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
308
309// Contains mustache-like templates
310namespace templates {
311
321char const* csv() noexcept;
322
333char const* htmlBoxplot() noexcept;
334
341char const* pyperf() noexcept;
342
352char const* json() noexcept;
353
354} // namespace templates
355
356namespace detail {
357
358template <typename T>
359struct PerfCountSet;
360
361class IterationLogic;
362class PerformanceCounters;
363
364#if ANKERL_NANOBENCH(PERF_COUNTERS)
365class LinuxPerformanceCounters;
366#endif
367
368} // namespace detail
369} // namespace nanobench
370} // namespace ankerl
371
372// definitions ////////////////////////////////////////////////////////////////////////////////////
373
374namespace ankerl {
375namespace nanobench {
376namespace detail {
377
378template <typename T>
386};
387
388} // namespace detail
389
390ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
391struct Config {
392 // actual benchmark config
393 std::string mBenchmarkTitle = "benchmark"; // NOLINT(misc-non-private-member-variables-in-classes)
394 std::string mBenchmarkName = "noname"; // NOLINT(misc-non-private-member-variables-in-classes)
395 std::string mUnit = "op"; // NOLINT(misc-non-private-member-variables-in-classes)
396 double mBatch = 1.0; // NOLINT(misc-non-private-member-variables-in-classes)
397 double mComplexityN = -1.0; // NOLINT(misc-non-private-member-variables-in-classes)
398 size_t mNumEpochs = 11; // NOLINT(misc-non-private-member-variables-in-classes)
399 size_t mClockResolutionMultiple = static_cast<size_t>(1000); // NOLINT(misc-non-private-member-variables-in-classes)
400 std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // NOLINT(misc-non-private-member-variables-in-classes)
401 std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1); // NOLINT(misc-non-private-member-variables-in-classes)
402 uint64_t mMinEpochIterations{1}; // NOLINT(misc-non-private-member-variables-in-classes)
403 // If not 0, run *exactly* these number of iterations per epoch.
404 uint64_t mEpochIterations{0}; // NOLINT(misc-non-private-member-variables-in-classes)
405 uint64_t mWarmup = 0; // NOLINT(misc-non-private-member-variables-in-classes)
406 std::ostream* mOut = nullptr; // NOLINT(misc-non-private-member-variables-in-classes)
407 std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // NOLINT(misc-non-private-member-variables-in-classes)
408 std::string mTimeUnitName = "ns"; // NOLINT(misc-non-private-member-variables-in-classes)
409 bool mShowPerformanceCounters = true; // NOLINT(misc-non-private-member-variables-in-classes)
410 bool mIsRelative = false; // NOLINT(misc-non-private-member-variables-in-classes)
411 std::unordered_map<std::string, std::string> mContext{}; // NOLINT(misc-non-private-member-variables-in-classes)
412
415 Config& operator=(Config const& other);
416 Config& operator=(Config&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
417 Config(Config const& other);
418 Config(Config&& other) noexcept;
419};
420ANKERL_NANOBENCH(IGNORE_PADDED_POP)
421
422// Result returned after a benchmark has finished. Can be used as a baseline for relative().
423ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
424class Result {
425public:
426 enum class Measure : size_t {
427 elapsed,
428 iterations,
429 pagefaults,
430 cpucycles,
431 contextswitches,
432 instructions,
433 branchinstructions,
434 branchmisses,
435 _size
436 };
437
438 explicit Result(Config benchmarkConfig);
439
441 Result& operator=(Result const& other);
442 Result& operator=(Result&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
443 Result(Result const& other);
444 Result(Result&& other) noexcept;
445
446 // adds new measurement results
447 // all values are scaled by iters (except iters...)
448 void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);
449
450 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
451
452 ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
453 ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
454 ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
455 ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
456 ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
457 ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
458 ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
459 ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const* variableName) const;
460 ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const& variableName) const;
461
462 ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
463 ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
464 ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
465 ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;
466
467 // Finds string, if not found, returns _size.
468 static Measure fromString(std::string const& str);
469
470private:
471 Config mConfig{};
472 std::vector<std::vector<double>> mNameToMeasurements{};
473};
474ANKERL_NANOBENCH(IGNORE_PADDED_POP)
475
476
493class Rng final {
494public:
498 using result_type = uint64_t;
499
500 static constexpr uint64_t(min)();
501 static constexpr uint64_t(max)();
502
508 Rng(Rng const&) = delete;
509
513 Rng& operator=(Rng const&) = delete;
514
515 // moving is ok
516 Rng(Rng&&) noexcept = default;
517 Rng& operator=(Rng&&) noexcept = default;
518 ~Rng() noexcept = default;
519
528
545 explicit Rng(uint64_t seed) noexcept;
546 Rng(uint64_t x, uint64_t y) noexcept;
547 explicit Rng(std::vector<uint64_t> const& data);
548
552 ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;
553
561 inline uint64_t operator()() noexcept;
562
    // This is slightly biased. See
    // https://lemire.me/blog/2016/06/30/fast-random-shuffling/
579 inline uint32_t bounded(uint32_t range) noexcept;
580
    // random double in the half-open range [0, 1)
    // see http://prng.di.unimi.it/
583
590 inline double uniform01() noexcept;
591
599 template <typename Container>
600 void shuffle(Container& container) noexcept;
601
608 ANKERL_NANOBENCH(NODISCARD) std::vector<uint64_t> state() const;
609
610private:
611 static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;
612
613 uint64_t mX;
614 uint64_t mY;
615};
616
631ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
632class Bench {
633public:
638
639 Bench(Bench&& other) noexcept;
640 Bench& operator=(Bench&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
641 Bench(Bench const& other);
642 Bench& operator=(Bench const& other);
643 ~Bench() noexcept;
644
663 template <typename Op>
664 ANKERL_NANOBENCH(NOINLINE)
665 Bench& run(char const* benchmarkName, Op&& op);
666
667 template <typename Op>
668 ANKERL_NANOBENCH(NOINLINE)
669 Bench& run(std::string const& benchmarkName, Op&& op);
670
675 template <typename Op>
676 ANKERL_NANOBENCH(NOINLINE)
677 Bench& run(Op&& op);
678
684 Bench& title(char const* benchmarkTitle);
685 Bench& title(std::string const& benchmarkTitle);
686
690 ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;
691
693 Bench& name(char const* benchmarkName);
694 Bench& name(std::string const& benchmarkName);
695 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
696
709 Bench& context(char const* variableName, char const* variableValue);
710 Bench& context(std::string const& variableName, std::string const& variableValue);
711
720 Bench& clearContext();
721
731 template <typename T>
732 Bench& batch(T b) noexcept;
733 ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;
734
743 Bench& unit(char const* unit);
744 Bench& unit(std::string const& unit);
745 ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;
746
756 Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
757 ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
758 ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;
759
767 Bench& output(std::ostream* outstream) noexcept;
768 ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;
769
790 Bench& clockResolutionMultiple(size_t multiple) noexcept;
791 ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;
792
808 Bench& epochs(size_t numEpochs) noexcept;
809 ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;
810
821 Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
822 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;
823
834 Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
835 ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;
836
847 Bench& minEpochIterations(uint64_t numIters) noexcept;
848 ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;
849
856 Bench& epochIterations(uint64_t numIters) noexcept;
857 ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;
858
868 Bench& warmup(uint64_t numWarmupIters) noexcept;
869 ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;
870
888 Bench& relative(bool isRelativeEnabled) noexcept;
889 ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;
890
899 Bench& performanceCounters(bool showPerformanceCounters) noexcept;
900 ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;
901
910 ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;
911
919 template <typename Arg>
921
936 template <typename T>
937 Bench& complexityN(T n) noexcept;
938 ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;
939
971 std::vector<BigO> complexityBigO() const;
972
996 template <typename Op>
997 BigO complexityBigO(char const* name, Op op) const;
998
999 template <typename Op>
1000 BigO complexityBigO(std::string const& name, Op op) const;
1001
1009 Bench& render(char const* templateContent, std::ostream& os);
1010 Bench& render(std::string const& templateContent, std::ostream& os);
1011
1012 Bench& config(Config const& benchmarkConfig);
1013 ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
1014
1020 template <typename SetupOp>
1021 detail::SetupRunner<SetupOp> setup(SetupOp setupOp);
1022
1023private:
1024 template <typename SetupOp, typename Op>
1025 Bench& runImpl(SetupOp& setupOp, Op&& op);
1026
1027 template <typename SetupOp>
1028 friend class detail::SetupRunner;
1029
1030 Config mConfig{};
1031 std::vector<Result> mResults{};
1032};
1033ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1034
1035
1041template <typename Arg>
1042void doNotOptimizeAway(Arg&& arg);
1043
1044namespace detail {
1045
1046#if defined(_MSC_VER)
1047void doNotOptimizeAwaySink(void const*);
1048
1049template <typename T>
1050void doNotOptimizeAway(T const& val);
1051
1052#else
1053
1054// These assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but
1055// this seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyways.
1056// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L443-L446
template <typename T>
void doNotOptimizeAway(T const& val) {
    // Empty asm statement that pretends to consume val ("r,m" = any register
    // or memory location) and to clobber memory. The optimizer therefore has
    // to materialize val and cannot eliminate the computation producing it.
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : : "r,m"(val) : "memory");
}
1062
template <typename T>
void doNotOptimizeAway(T& val) {
    // Mutable overload: the "+" constraint marks val as both read and written,
    // so uses of val before *and* after this call survive optimization. The
    // constraint order differs per compiler (clang prefers register first).
# if defined(__clang__)
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+r,m"(val) : : "memory");
# else
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+m,r"(val) : : "memory");
# endif
}
1073#endif
1074
1075// internally used, but visible because run() is templated.
1076// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
1077// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
1078ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
1080public:
1081 explicit IterationLogic(Bench const& bench);
1087
1088 ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
1089 void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
1090 void moveResultTo(std::vector<Result>& results) noexcept;
1091
1092private:
1093 struct Impl;
1094 Impl* mPimpl;
1095};
1096ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1097
1098ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1100public:
1105
1108
1111 void updateResults(uint64_t numIters);
1112
1113 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1114 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1115
1116private:
1117#if ANKERL_NANOBENCH(PERF_COUNTERS)
1118 LinuxPerformanceCounters* mPc = nullptr;
1119#endif
1122};
1123ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1124
1125// Gets the singleton
1127
1128} // namespace detail
1129
1130class BigO {
1131public:
1132 using RangeMeasure = std::vector<std::pair<double, double>>;
1133
1134 template <typename Op>
1136 for (auto& rangeMeasure : data) {
1137 rangeMeasure.first = op(rangeMeasure.first);
1138 }
1139 return data;
1140 }
1141
1142 static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);
1143
1144 template <typename Op>
1145 BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1146 : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}
1147
1148 template <typename Op>
1149 BigO(std::string bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
1150 : BigO(std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {}
1151
1152 BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
1153 BigO(std::string bigOName, RangeMeasure const& scaledRangeMeasure);
1154 ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
1155 ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
1156 ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
1157 ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;
1158
1159private:
1160 std::string mName{};
1161 double mConstant{};
1162 double mNormalizedRootMeanSquare{};
1163};
1164std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1165std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1166
1167} // namespace nanobench
1168} // namespace ankerl
1169
1170// implementation /////////////////////////////////////////////////////////////////////////////////
1171
1172namespace ankerl {
1173namespace nanobench {
1174
1175constexpr uint64_t(Rng::min)() {
1176 return 0;
1177}
1178
1179constexpr uint64_t(Rng::max)() {
1180 return (std::numeric_limits<uint64_t>::max)();
1181}
1182
1183ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1184uint64_t Rng::operator()() noexcept {
1185 auto x = mX;
1186
1187 mX = UINT64_C(15241094284759029579) * mY;
1188 mY = rotl(mY - x, 27);
1189
1190 return x;
1191}
1192
1193ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1194uint32_t Rng::bounded(uint32_t range) noexcept {
1195 uint64_t const r32 = static_cast<uint32_t>(operator()());
1196 auto multiresult = r32 * range;
1197 return static_cast<uint32_t>(multiresult >> 32U);
1198}
1199
1200double Rng::uniform01() noexcept {
1201 auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1202 // can't use union in c++ here for type puning, it's undefined behavior.
1203 // std::memcpy is optimized anyways.
1204 double d{};
1205 std::memcpy(&d, &i, sizeof(double));
1206 return d - 1.0;
1207}
1208
1209template <typename Container>
1210void Rng::shuffle(Container& container) noexcept {
1211 auto i = container.size();
1212 while (i > 1U) {
1213 using std::swap;
1214 auto n = operator()();
1215 // using decltype(i) instead of size_t to be compatible to containers with 32bit index (see #80)
1216 auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);
1217 swap(container[--i], container[b1]);
1218
1219 auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);
1220 swap(container[--i], container[b2]);
1221 }
1222}
1223
1224ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1225constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1226 return (x << k) | (x >> (64U - k));
1227}
1228
1229namespace detail {
1230
1231template <typename SetupOp>
1233public:
1234 explicit SetupRunner(SetupOp setupOp, Bench& bench)
1235 : mSetupOp(std::move(setupOp))
1236 , mBench(bench) {}
1237
1238 template <typename Op>
1240 Bench& run(Op&& op) {
1241 return mBench.runImpl(mSetupOp, std::forward<Op>(op));
1242 }
1243
1244private:
1245 SetupOp mSetupOp;
1247};
1248} // namespace detail
1249
1250template <typename Op>
1252Bench& Bench::run(Op&& op) {
1253 auto setupOp = [] {};
1254 return runImpl(setupOp, std::forward<Op>(op));
1255}
1256
1257template <typename SetupOp, typename Op>
1259Bench& Bench::runImpl(SetupOp& setupOp, Op&& op) {
1260 // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()
1261 detail::IterationLogic iterationLogic(*this);
1262 auto& pc = detail::performanceCounters();
1263
1264 while (auto n = iterationLogic.numIters()) {
1265 setupOp();
1266
1267 pc.beginMeasure();
1268 Clock::time_point const before = Clock::now();
1269 while (n-- > 0) {
1270 op();
1271 }
1272 Clock::time_point const after = Clock::now();
1273 pc.endMeasure();
1274 pc.updateResults(iterationLogic.numIters());
1275 iterationLogic.add(after - before, pc);
1276 }
1277 iterationLogic.moveResultTo(mResults);
1278 return *this;
1279}
1280
1281template <typename SetupOp>
1283 return detail::SetupRunner<SetupOp>(std::move(setupOp), *this);
1284}
1285
1286// Performs all evaluations.
1287template <typename Op>
1288Bench& Bench::run(char const* benchmarkName, Op&& op) {
1289 name(benchmarkName);
1290 return run(std::forward<Op>(op));
1291}
1292
1293template <typename Op>
1294Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1295 name(benchmarkName);
1296 return run(std::forward<Op>(op));
1297}
1298
1299template <typename Op>
1300BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1301 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1302}
1303
1304template <typename Op>
1305BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1306 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1307}
1308
1309// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
1310// Any argument is cast to double.
1311template <typename T>
1312Bench& Bench::batch(T b) noexcept {
1313 mConfig.mBatch = static_cast<double>(b);
1314 return *this;
1315}
1316
1317// Sets the computation complexity of the next run. Any argument is cast to double.
1318template <typename T>
1320 mConfig.mComplexityN = static_cast<double>(n);
1321 return *this;
1322}
1323
1324// Convenience: makes sure none of the given arguments are optimized away by the compiler.
1325template <typename Arg>
1327 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1328 return *this;
1329}
1330
1331// Makes sure none of the given arguments are optimized away by the compiler.
1332template <typename Arg>
1333void doNotOptimizeAway(Arg&& arg) {
1334 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1335}
1336
1337namespace detail {
1338
1339#if defined(_MSC_VER)
template <typename T>
void doNotOptimizeAway(T const& val) {
    // MSVC has no inline asm on x64; push the address through an opaque sink
    // (defined in the implementation part) so the value must be materialized.
    doNotOptimizeAwaySink(&val);
}
1344
1345#endif
1346
1347} // namespace detail
1348} // namespace nanobench
1349} // namespace ankerl
1350
1351#if defined(ANKERL_NANOBENCH_IMPLEMENT)
1352
1354// implementation part - only visible in .cpp
1356
1357# include <algorithm> // sort, reverse
1358# include <atomic> // compare_exchange_strong in loop overhead
1359# include <cstdlib> // getenv
1360# include <cstring> // strstr, strncmp
1361# include <fstream> // ifstream to parse proc files
1362# include <iomanip> // setw, setprecision
1363# include <iostream> // cout
1364# include <numeric> // accumulate
1365# include <random> // random_device
1366# include <sstream> // to_s in Number
1367# include <stdexcept> // throw for rendering templates
1368# include <tuple> // std::tie
1369# if defined(__linux__)
1370# include <unistd.h> //sysconf
1371# endif
1372# if ANKERL_NANOBENCH(PERF_COUNTERS)
1373# include <map> // map
1374
1375# include <linux/perf_event.h>
1376# include <sys/ioctl.h>
1377# include <sys/syscall.h>
1378# endif
1379
1380// declarations ///////////////////////////////////////////////////////////////////////////////////
1381
1382namespace ankerl {
1383namespace nanobench {
1384
1385// helper stuff that is only intended to be used internally
1386namespace detail {
1387
1388struct TableInfo;
1389
1390// formatting utilities
1391namespace fmt {
1392
1393class NumSep;
1394class StreamStateRestorer;
1395class Number;
1396class MarkDownColumn;
1397class MarkDownCode;
1398
1399} // namespace fmt
1400} // namespace detail
1401} // namespace nanobench
1402} // namespace ankerl
1403
1404// definitions ////////////////////////////////////////////////////////////////////////////////////
1405
1406namespace ankerl {
1407namespace nanobench {
1408
1409uint64_t splitMix64(uint64_t& state) noexcept;
1410
1411namespace detail {
1412
1413// helpers to get double values
// Shorthand cast-to-double, used throughout the implementation.
template <typename T>
inline double d(T t) noexcept {
    return static_cast<double>(t);
}
1418inline double d(Clock::duration duration) noexcept {
1419 return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
1420}
1421
1422// Calculates clock resolution once, and remembers the result
1423inline Clock::duration clockResolution() noexcept;
1424
1425} // namespace detail
1426
1427namespace templates {
1428
// Semicolon-separated table: one header line plus one row per result.
char const* csv() noexcept {
    return R"CSV("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
{{/result}})CSV";
}
1434
// Self-contained HTML page that renders one plotly.js box plot per result.
char const* htmlBoxplot() noexcept {
    return R"HTML(<html>

<head>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>

<body>
    <div id="myDiv"></div>
    <script>
        var data = [
            {{#result}}{
                name: '{{name}}',
                y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
            },
            {{/result}}
        ];
        var title = '{{title}}';

        data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
        var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
    </script>
</body>

</html>)HTML";
}
1461
// JSON in pyperf's schema, so results can be compared with `pyperf compare_to`.
char const* pyperf() noexcept {
    return R"PYPERF({
    "benchmarks": [
        {
            "runs": [
                {
                    "values": [
{{#measurement}}                        {{elapsed}}{{^-last}},
{{/last}}{{/measurement}}                    ]
                }
            ]
        }
    ],
    "metadata": {
        "loops": {{sum(iterations)}},
        "inner_loops": {{batch}},
        "name": "{{title}}",
        "unit": "second"
    },
    "version": "1.0"
})PYPERF";
}
1485
// Full JSON dump: per-result configuration, aggregate statistics, and the raw
// per-epoch measurements.
char const* json() noexcept {
    return R"JSON({
 "results": [
{{#result}}  {
   "title": "{{title}}",
   "name": "{{name}}",
   "unit": "{{unit}}",
   "batch": {{batch}},
   "complexityN": {{complexityN}},
   "epochs": {{epochs}},
   "clockResolution": {{clockResolution}},
   "clockResolutionMultiple": {{clockResolutionMultiple}},
   "maxEpochTime": {{maxEpochTime}},
   "minEpochTime": {{minEpochTime}},
   "minEpochIterations": {{minEpochIterations}},
   "epochIterations": {{epochIterations}},
   "warmup": {{warmup}},
   "relative": {{relative}},
   "median(elapsed)": {{median(elapsed)}},
   "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
   "median(instructions)": {{median(instructions)}},
   "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
   "median(cpucycles)": {{median(cpucycles)}},
   "median(contextswitches)": {{median(contextswitches)}},
   "median(pagefaults)": {{median(pagefaults)}},
   "median(branchinstructions)": {{median(branchinstructions)}},
   "median(branchmisses)": {{median(branchmisses)}},
   "totalTime": {{sumProduct(iterations, elapsed)}},
   "measurements": [
{{#measurement}}    {
     "iterations": {{iterations}},
     "elapsed": {{elapsed}},
     "pagefaults": {{pagefaults}},
     "cpucycles": {{cpucycles}},
     "contextswitches": {{contextswitches}},
     "instructions": {{instructions}},
     "branchinstructions": {{branchinstructions}},
     "branchmisses": {{branchmisses}}
    }{{^-last}},{{/-last}}
{{/measurement}}   ]
  }{{^-last}},{{/-last}}
{{/result}} ]
})JSON";
}
1530
1531ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// One node of the parsed mustache-like template: either literal content, a
// replacement tag, or a (possibly inverted) section with child nodes.
// Member order matters: aggregate initialization relies on it.
struct Node {
    enum class Type { tag, content, section, inverted_section };

    char const* begin;          // first character of the node's text
    char const* end;            // one past the last character
    std::vector<Node> children; // nested nodes, only used for sections
    Type type;

    // Compares the [begin, end) text against a string literal of length N-1.
    template <size_t N>
    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
    bool operator==(char const (&str)[N]) const noexcept {
        auto const len = static_cast<size_t>(std::distance(begin, end));
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
        return len + 1 == N && 0 == strncmp(str, begin, N - 1);
    }
};
1547ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1548
1549// NOLINTNEXTLINE(misc-no-recursion)
1550static std::vector<Node> parseMustacheTemplate(char const** tpl) {
1551 std::vector<Node> nodes;
1552
1553 while (true) {
1554 auto const* begin = std::strstr(*tpl, "{{");
1555 auto const* end = begin;
1556 if (begin != nullptr) {
1557 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1558 begin += 2;
1559 end = std::strstr(begin, "}}");
1560 }
1561
1562 if (begin == nullptr || end == nullptr) {
1563 // nothing found, finish node
1564 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1565 nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1566 return nodes;
1567 }
1568
1569 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1570 nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1571
1572 // we found a tag
1573 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1574 *tpl = end + 2;
1575 switch (*begin) {
1576 case '/':
1577 // finished! bail out
1578 return nodes;
1579
1580 case '#':
1581 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1582 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1583 break;
1584
1585 case '^':
1586 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1587 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1588 break;
1589
1590 default:
1591 nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1592 break;
1593 }
1594 }
1595}
1596
1597static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1598 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1599 bool const matchFirst = n == "-first";
1600 bool const matchLast = n == "-last";
1601 if (!matchFirst && !matchLast) {
1602 return false;
1603 }
1604
1605 bool doWrite = false;
1606 if (n.type == Node::Type::section) {
1607 doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1608 } else if (n.type == Node::Type::inverted_section) {
1609 doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1610 }
1611
1612 if (doWrite) {
1613 for (auto const& child : n.children) {
1614 if (child.type == Node::Type::content) {
1615 out.write(child.begin, std::distance(child.begin, child.end));
1616 }
1617 }
1618 }
1619 return true;
1620}
1621
// Parses a command of the form "name(arg1, arg2, ...)". On success,
// matchResult[0] is the name and matchResult[1..] are the arguments
// (whitespace stripped); returns false when there is no "(...)" part.
static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
    matchResult.clear();
    auto const idxOpen = str.find('(');
    auto const idxClose = str.find(')', idxOpen);
    if (idxClose == std::string::npos) {
        return false;
    }

    // the command name is everything before the '('
    matchResult.emplace_back(str.substr(0, idxOpen));

    // arguments: comma separated, spaces and tabs are ignored
    matchResult.emplace_back();
    for (auto i = idxOpen + 1; i != idxClose; ++i) {
        auto const ch = str[i];
        if (ch == ',') {
            // comma starts the next argument
            matchResult.emplace_back();
        } else if (ch != ' ' && ch != '\t') {
            matchResult.back() += ch;
        }
    }
    return true;
}
1649
1650static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1651 using detail::d;
1652
1653 if (n == "title") {
1654 out << config.mBenchmarkTitle;
1655 return true;
1656 }
1657 if (n == "name") {
1658 out << config.mBenchmarkName;
1659 return true;
1660 }
1661 if (n == "unit") {
1662 out << config.mUnit;
1663 return true;
1664 }
1665 if (n == "batch") {
1666 out << config.mBatch;
1667 return true;
1668 }
1669 if (n == "complexityN") {
1670 out << config.mComplexityN;
1671 return true;
1672 }
1673 if (n == "epochs") {
1674 out << config.mNumEpochs;
1675 return true;
1676 }
1677 if (n == "clockResolution") {
1678 out << d(detail::clockResolution());
1679 return true;
1680 }
1681 if (n == "clockResolutionMultiple") {
1682 out << config.mClockResolutionMultiple;
1683 return true;
1684 }
1685 if (n == "maxEpochTime") {
1686 out << d(config.mMaxEpochTime);
1687 return true;
1688 }
1689 if (n == "minEpochTime") {
1690 out << d(config.mMinEpochTime);
1691 return true;
1692 }
1693 if (n == "minEpochIterations") {
1694 out << config.mMinEpochIterations;
1695 return true;
1696 }
1697 if (n == "epochIterations") {
1698 out << config.mEpochIterations;
1699 return true;
1700 }
1701 if (n == "warmup") {
1702 out << config.mWarmup;
1703 return true;
1704 }
1705 if (n == "relative") {
1706 out << config.mIsRelative;
1707 return true;
1708 }
1709 return false;
1710}
1711
1712// NOLINTNEXTLINE(readability-function-cognitive-complexity)
1713static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1714 if (generateConfigTag(n, r.config(), out)) {
1715 return out;
1716 }
1717 // match e.g. "median(elapsed)"
1718 // g++ 4.8 doesn't implement std::regex :(
1719 // static std::regex const regOpArg1("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\‍)$");
1720 // std::cmatch matchResult;
1721 // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1722 std::vector<std::string> matchResult;
1723 if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1724 if (matchResult.size() == 2) {
1725 if (matchResult[0] == "context") {
1726 return out << r.context(matchResult[1]);
1727 }
1728
1729 auto m = Result::fromString(matchResult[1]);
1730 if (m == Result::Measure::_size) {
1731 return out << 0.0;
1732 }
1733
1734 if (matchResult[0] == "median") {
1735 return out << r.median(m);
1736 }
1737 if (matchResult[0] == "average") {
1738 return out << r.average(m);
1739 }
1740 if (matchResult[0] == "medianAbsolutePercentError") {
1741 return out << r.medianAbsolutePercentError(m);
1742 }
1743 if (matchResult[0] == "sum") {
1744 return out << r.sum(m);
1745 }
1746 if (matchResult[0] == "minimum") {
1747 return out << r.minimum(m);
1748 }
1749 if (matchResult[0] == "maximum") {
1750 return out << r.maximum(m);
1751 }
1752 } else if (matchResult.size() == 3) {
1753 auto m1 = Result::fromString(matchResult[1]);
1754 auto m2 = Result::fromString(matchResult[2]);
1756 return out << 0.0;
1757 }
1758
1759 if (matchResult[0] == "sumProduct") {
1760 return out << r.sumProduct(m1, m2);
1761 }
1762 }
1763 }
1764
1765 // match e.g. "sumProduct(elapsed, iterations)"
1766 // static std::regex const regOpArg2("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\‍)$");
1767
1768 // nothing matches :(
1769 throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1770}
1771
1772static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1773 for (auto const& n : nodes) {
1774 if (!generateFirstLast(n, idx, r.size(), out)) {
1775 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1776 switch (n.type) {
1777 case Node::Type::content:
1778 out.write(n.begin, std::distance(n.begin, n.end));
1779 break;
1780
1781 case Node::Type::inverted_section:
1782 throw std::runtime_error("got a inverted section inside measurement");
1783
1784 case Node::Type::section:
1785 throw std::runtime_error("got a section inside measurement");
1786
1787 case Node::Type::tag: {
1788 auto m = Result::fromString(std::string(n.begin, n.end));
1789 if (m == Result::Measure::_size || !r.has(m)) {
1790 out << 0.0;
1791 } else {
1792 out << r.get(idx, m);
1793 }
1794 break;
1795 }
1796 }
1797 }
1798 }
1799}
1800
1801static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1802 auto const& r = results[idx];
1803 for (auto const& n : nodes) {
1804 if (!generateFirstLast(n, idx, results.size(), out)) {
1805 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1806 switch (n.type) {
1807 case Node::Type::content:
1808 out.write(n.begin, std::distance(n.begin, n.end));
1809 break;
1810
1811 case Node::Type::inverted_section:
1812 throw std::runtime_error("got a inverted section inside result");
1813
1814 case Node::Type::section:
1815 if (n == "measurement") {
1816 for (size_t i = 0; i < r.size(); ++i) {
1817 generateResultMeasurement(n.children, i, r, out);
1818 }
1819 } else {
1820 throw std::runtime_error("got a section inside result");
1821 }
1822 break;
1823
1824 case Node::Type::tag:
1825 generateResultTag(n, r, out);
1826 break;
1827 }
1828 }
1829 }
1830}
1831
1832} // namespace templates
1833
// helper stuff that is only intended to be used internally
1835namespace detail {
1836
1837char const* getEnv(char const* name);
1838bool isEndlessRunning(std::string const& name);
1839bool isWarningsEnabled();
1840
1841template <typename T>
1842T parseFile(std::string const& filename, bool* fail);
1843
1844void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
1845void printStabilityInformationOnce(std::ostream* outStream);
1846
1847// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
1848uint64_t& singletonHeaderHash() noexcept;
1849
1850// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
1851Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1852
1853// formatting utilities
1854namespace fmt {
1855
1856// adds thousands separator to numbers
1857ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// numpunct facet that inserts a thousands separator when numbers are printed
// through a stream imbued with it. Member definitions live elsewhere in this file.
class NumSep : public std::numpunct<char> {
public:
    // sep: the separator character returned by do_thousands_sep().
    explicit NumSep(char sep);
    char do_thousands_sep() const override;
    std::string do_grouping() const override;

private:
    char mSep; // separator character handed out by do_thousands_sep()
};
1867ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1868
1869// RAII to save & restore a stream's state
1870ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// RAII guard: captures a stream's formatting state (locale, precision, width,
// fill, flags) at construction and restores it on destruction, so formatting
// changes made while printing results don't leak to the caller's stream.
class StreamStateRestorer {
public:
    explicit StreamStateRestorer(std::ostream& s);
    ~StreamStateRestorer();

    // sets back all stream info that we remembered at construction
    void restore();

    // don't allow copying / moving
    StreamStateRestorer(StreamStateRestorer const&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
    StreamStateRestorer(StreamStateRestorer&&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;

private:
    std::ostream& mStream;                  // the stream whose state is restored
    std::locale mLocale;                    // saved locale
    std::streamsize const mPrecision;       // saved precision
    std::streamsize const mWidth;           // saved field width
    std::ostream::char_type const mFill;    // saved fill character
    std::ostream::fmtflags const mFmtFlags; // saved format flags
};
1893ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1894
1895// Number formatter
// Number formatter: renders a value with a given field width and precision
// (see the out-of-view write() definition for the exact formatting rules).
class Number {
public:
    Number(int width, int precision, double value);
    Number(int width, int precision, int64_t value);
    // Returns the formatted number as a string.
    ANKERL_NANOBENCH(NODISCARD) std::string to_s() const;

private:
    friend std::ostream& operator<<(std::ostream& os, Number const& n);
    std::ostream& write(std::ostream& os) const;

    int mWidth;     // minimum field width
    int mPrecision; // number of digits after the decimal point
    double mValue;  // value to format (int64_t ctor converts to double)
};
1910
1911// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
1912std::string to_s(uint64_t n);
1913
1914std::ostream& operator<<(std::ostream& os, Number const& n);
1915
// One column of the markdown result table: knows how to render its header
// title, the separator row, an "invalid" placeholder cell, and a value cell.
class MarkDownColumn {
public:
    MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::string title() const;
    ANKERL_NANOBENCH(NODISCARD) std::string separator() const;
    ANKERL_NANOBENCH(NODISCARD) std::string invalid() const;
    ANKERL_NANOBENCH(NODISCARD) std::string value() const;

private:
    int mWidth;          // column width in characters
    int mPrecision;      // digits after the decimal point
    std::string mTitle;  // column heading
    std::string mSuffix; // appended to each value (e.g. "%")
    double mValue;       // the value shown in this column's cell
};
1931
1932// Formats any text as markdown code, escaping backticks.
// Formats any text as markdown code, escaping backticks.
class MarkDownCode {
public:
    explicit MarkDownCode(std::string const& what);

private:
    friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
    std::ostream& write(std::ostream& os) const;

    std::string mWhat{}; // pre-escaped text, wrapped in backticks when streamed
};
1943
1944std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1945
1946} // namespace fmt
1947} // namespace detail
1948} // namespace nanobench
1949} // namespace ankerl
1950
1951// implementation /////////////////////////////////////////////////////////////////////////////////
1952
1953namespace ankerl {
1954namespace nanobench {
1955
1956// NOLINTNEXTLINE(readability-function-cognitive-complexity)
1957void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
1958 detail::fmt::StreamStateRestorer const restorer(out);
1959
1960 out.precision(std::numeric_limits<double>::digits10);
1961 auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);
1962
1963 for (auto const& n : nodes) {
1964 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1965 switch (n.type) {
1966 case templates::Node::Type::content:
1967 out.write(n.begin, std::distance(n.begin, n.end));
1968 break;
1969
1970 case templates::Node::Type::inverted_section:
1971 throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");
1972
1973 case templates::Node::Type::section:
1974 if (n == "result") {
1975 const size_t nbResults = results.size();
1976 for (size_t i = 0; i < nbResults; ++i) {
1977 generateResult(n.children, i, results, out);
1978 }
1979 } else if (n == "measurement") {
1980 if (results.size() != 1) {
1981 throw std::runtime_error(
1982 "render: can only use section 'measurement' here if there is a single result, but there are " +
1983 detail::fmt::to_s(results.size()));
1984 }
1985 // when we only have a single result, we can immediately go into its measurement.
1986 auto const& r = results.front();
1987 for (size_t i = 0; i < r.size(); ++i) {
1988 generateResultMeasurement(n.children, i, r, out);
1989 }
1990 } else {
1991 throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
1992 }
1993 break;
1994
1995 case templates::Node::Type::tag:
1996 if (results.size() == 1) {
1997 // result & config are both supported there
1998 generateResultTag(n, results.front(), out);
1999 } else {
2000 // This just uses the last result's config.
2001 if (!generateConfigTag(n, results.back().config(), out)) {
2002 throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
2003 }
2004 }
2005 break;
2006 }
2007 }
2008}
2009
2010void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
2011 render(mustacheTemplate.c_str(), results, out);
2012}
2013
2014void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
2015 render(mustacheTemplate, bench.results(), out);
2016}
2017
2018void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
2019 render(mustacheTemplate.c_str(), bench.results(), out);
2020}
2021
2022namespace detail {
2023
// Returns the process-wide PerformanceCounters singleton (constructed on
// first use; the clang pragma silences the exit-time-destructor warning for
// the function-local static).
PerformanceCounters& performanceCounters() {
#    if defined(__clang__)
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wexit-time-destructors"
#    endif
    static PerformanceCounters pc;
#    if defined(__clang__)
#        pragma clang diagnostic pop
#    endif
    return pc;
}
2035
// Windows version of doNotOptimizeAway
// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L514
// see https://github.com/facebook/folly/blob/v2023.01.30.00/folly/lang/Hint-inl.h#L54-L58
// see https://learn.microsoft.com/en-us/cpp/preprocessor/optimize
#    if defined(_MSC_VER)
#        pragma optimize("", off)
// Empty sink compiled with optimizations off: passing a pointer here forces
// MSVC to treat the pointed-to value as used, so it can't be optimized away.
void doNotOptimizeAwaySink(void const*) {}
#        pragma optimize("", on)
#    endif
2045
// Reads a single whitespace-delimited value of type T from the given file.
// When fail is non-null it is set to the stream's fail state; on any failure
// (missing file, parse error) a value-initialized T is returned.
template <typename T>
T parseFile(std::string const& filename, bool* fail) {
    std::ifstream in(filename); // NOLINT(misc-const-correctness)
    T value{};
    in >> value;
    if (fail != nullptr) {
        *fail = in.fail();
    }
    return value;
}
2056
// Thin wrapper around std::getenv that silences MSVC's C4996 deprecation
// warning. Returns nullptr when the variable is not set.
char const* getEnv(char const* name) {
#    if defined(_MSC_VER)
#        pragma warning(push)
#        pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
#    endif
    return std::getenv(name); // NOLINT(concurrency-mt-unsafe)
#    if defined(_MSC_VER)
#        pragma warning(pop)
#    endif
}
2067
2068bool isEndlessRunning(std::string const& name) {
2069 auto const* const endless = getEnv("NANOBENCH_ENDLESS");
2070 return nullptr != endless && endless == name;
2071}
2072
2073// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
2074bool isWarningsEnabled() {
2075 auto const* const suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
2076 return nullptr == suppression || suppression == std::string("0");
2077}
2078
// Collects warnings about conditions that make benchmark results unstable
// (debug build, CPU frequency scaling, non-performance governor, turbo boost)
// plus matching recommendations. Both output vectors are cleared first.
// The frequency/governor/turbo checks only run on Linux (sysfs based).
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
    warnings.clear();
    recommendations.clear();

#    if defined(DEBUG)
    warnings.emplace_back("DEBUG defined");
    bool const recommendCheckFlags = true;
#    else
    bool const recommendCheckFlags = false;
#    endif

    bool recommendPyPerf = false;
#    if defined(__linux__)
    auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
    if (nprocs <= 0) {
        warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
    } else {
        // check frequency scaling
        for (long id = 0; id < nprocs; ++id) {
            auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
            auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
            auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq", nullptr);
            auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq", nullptr);
            if (minFreq != maxFreq) {
                // sysfs reports kHz; shown as MHz
                auto minMHz = d(minFreq) / 1000.0;
                auto maxMHz = d(maxFreq) / 1000.0;
                warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
                                      detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
                                      " MHz");
                recommendPyPerf = true;
                // one warning is enough; stop scanning further CPUs
                break;
            }
        }

        auto fail = false;
        auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", &fail);
        if (!fail && "performance" != currentGovernor) {
            warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
            recommendPyPerf = true;
        }

        // no_turbo == 0 means Intel turbo boost is active
        auto noTurbo = parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo", &fail);
        if (!fail && noTurbo == 0) {
            warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
            recommendPyPerf = true;
        }
    }
#    endif

    if (recommendCheckFlags) {
        recommendations.emplace_back("Make sure you compile for Release");
    }
    if (recommendPyPerf) {
        recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
    }
}
2135
2136void printStabilityInformationOnce(std::ostream* outStream) {
2137 static bool shouldPrint = true;
2138 if (shouldPrint && (nullptr != outStream) && isWarningsEnabled()) {
2139 auto& os = *outStream;
2140 shouldPrint = false;
2141 std::vector<std::string> warnings;
2142 std::vector<std::string> recommendations;
2143 gatherStabilityInformation(warnings, recommendations);
2144 if (warnings.empty()) {
2145 return;
2146 }
2147
2148 os << "Warning, results might be unstable:" << std::endl;
2149 for (auto const& w : warnings) {
2150 os << "* " << w << std::endl;
2151 }
2152
2153 os << std::endl << "Recommendations" << std::endl;
2154 for (auto const& r : recommendations) {
2155 os << "* " << r << std::endl;
2156 }
2157 }
2158}
2159
// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
uint64_t& singletonHeaderHash() noexcept {
    // zero-initialized on first use; callers assign through the reference
    static uint64_t sStoredHash{};
    return sStoredHash;
}
2165
2166ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
// Boost-style hash mixer: folds val into seed using the 32-bit golden-ratio
// constant plus shifted seed bits (deliberately overflow-tolerant).
inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
    uint64_t const mixed = val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U);
    return seed ^ mixed;
}
2170
2171// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2172Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2173 auto bestDuration = Clock::duration::max();
2174 Clock::time_point tBegin;
2175 Clock::time_point tEnd;
2176 for (size_t i = 0; i < numEvaluations; ++i) {
2177 tBegin = Clock::now();
2178 do {
2179 tEnd = Clock::now();
2180 } while (tBegin == tEnd);
2181 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2182 }
2183 return bestDuration;
2184}
2185
2186// Calculates clock resolution once, and remembers the result
2187Clock::duration clockResolution() noexcept {
2188 static Clock::duration const sResolution = calcClockResolution(20);
2189 return sResolution;
2190}
2191
2192ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Implements the iteration-count state machine for a single benchmark run:
// optionally warms up, upscales the per-epoch iteration count until an epoch
// takes roughly mTargetRuntimePerEpoch, then collects the configured number
// of epoch measurements and finally prints a markdown result row.
struct IterationLogic::Impl {
    enum class State { warmup, upscaling_runtime, measuring, endless };

    // Determines the target runtime per epoch (clock resolution * multiple,
    // clamped to [minEpochTime, maxEpochTime]) and picks the initial state
    // and iteration count from the benchmark's configuration.
    explicit Impl(Bench const& bench)
        : mBench(bench)
        , mResult(bench.config()) {
        printStabilityInformationOnce(mBench.output());

        // determine target runtime per epoch
        mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
        if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
            mTargetRuntimePerEpoch = mBench.maxEpochTime();
        }
        if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
            mTargetRuntimePerEpoch = mBench.minEpochTime();
        }

        if (isEndlessRunning(mBench.name())) {
            std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            mState = State::endless;
        } else if (0 != mBench.warmup()) {
            mNumIters = mBench.warmup();
            mState = State::warmup;
        } else if (0 != mBench.epochIterations()) {
            // exact number of iterations
            mNumIters = mBench.epochIterations();
            mState = State::measuring;
        } else {
            mNumIters = mBench.minEpochIterations();
            mState = State::upscaling_runtime;
        }
    }

    // directly calculates new iters based on elapsed&iters, and adds up to 20% random
    // noise (factor 1.0 + 0.2 * uniform01) so epochs don't resonate with periodic
    // system activity. Makes sure we don't underflow minEpochIterations.
    ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
        auto doubleElapsed = d(elapsed);
        auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
        auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);

        auto doubleMinEpochIters = d(mBench.minEpochIterations());
        if (doubleNewIters < doubleMinEpochIters) {
            doubleNewIters = doubleMinEpochIters;
        }
        doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();

        // +0.5 for correct rounding when casting
        // NOLINTNEXTLINE(bugprone-incorrect-roundings)
        return static_cast<uint64_t>(doubleNewIters + 0.5);
    }

    // Grows mNumIters towards the target epoch runtime: multiply by 10 while
    // more than 10x below target, otherwise extrapolate directly. On integer
    // overflow the run is aborted (mNumIters = 0) with an error row.
    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
        if (elapsed * 10 < mTargetRuntimePerEpoch) {
            // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
            if (mNumIters * 10 < mNumIters) {
                // overflow :-(
                showResult("iterations overflow. Maybe your code got optimized away?");
                mNumIters = 0;
                return;
            }
            mNumIters *= 10;
        } else {
            mNumIters = calcBestNumIters(elapsed, mNumIters);
        }
    }

    // Feeds one epoch's elapsed time and performance counters into the state
    // machine, records the measurement when in a measuring-capable state, and
    // computes mNumIters for the next epoch (0 signals "run is finished").
    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
#    if defined(ANKERL_NANOBENCH_LOG_ENABLED)
        auto oldIters = mNumIters;
#    endif

        switch (mState) {
        case State::warmup:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if elapsed is close enough, we can skip upscaling and go right to measurements
                // still, we don't add the result to the measurements.
                mState = State::measuring;
                mNumIters = calcBestNumIters(elapsed, mNumIters);
            } else {
                // not close enough: switch to upscaling
                mState = State::upscaling_runtime;
                upscale(elapsed);
            }
            break;

        case State::upscaling_runtime:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if we are close enough, add measurement and switch to always measuring
                mState = State::measuring;
                mTotalElapsed += elapsed;
                mTotalNumIters += mNumIters;
                mResult.add(elapsed, mNumIters, pc);
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            } else {
                upscale(elapsed);
            }
            break;

        case State::measuring:
            // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
            // that fluctuation, or else we would bias the result
            mTotalElapsed += elapsed;
            mTotalNumIters += mNumIters;
            mResult.add(elapsed, mNumIters, pc);
            if (0 != mBench.epochIterations()) {
                mNumIters = mBench.epochIterations();
            } else {
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            }
            break;

        case State::endless:
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            break;
        }

        if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
            // we got all the results that we need, finish it
            showResult("");
            mNumIters = 0;
        }

        ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, d(elapsed.count())) << " elapsed, "
                                           << detail::fmt::Number(20, 3, d(mTargetRuntimePerEpoch.count())) << " target. oldIters="
                                           << oldIters << ", mNumIters=" << mNumIters << ", mState=" << static_cast<int>(mState));
    }

    // Prints one markdown table row for this benchmark (writing a new table
    // header first whenever the relevant settings changed since the last row).
    // A non-empty errorMessage produces an "invalid" row instead of values.
    // NOLINTNEXTLINE(readability-function-cognitive-complexity)
    void showResult(std::string const& errorMessage) const {
        ANKERL_NANOBENCH_LOG(errorMessage);

        if (mBench.output() != nullptr) {
            // prepare column data ///////
            std::vector<fmt::MarkDownColumn> columns;

            auto rMedian = mResult.median(Result::Measure::elapsed);

            if (mBench.relative()) {
                // NOTE: this local 'd' shadows the d() conversion helper in this scope
                double d = 100.0;
                if (!mBench.results().empty()) {
                    d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
                }
                columns.emplace_back(11, 1, "relative", "%", d);
            }

            if (mBench.complexityN() > 0) {
                columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
            }

            // time per unit, and units per second
            columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
                                 rMedian / (mBench.timeUnit().count() * mBench.batch()));
            columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);

            double const rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
            columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);

            // hardware counter columns, only when available
            double rInsMedian = -1.0;
            if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {
                rInsMedian = mResult.median(Result::Measure::instructions);
                columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
            }

            double rCycMedian = -1.0;
            if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {
                rCycMedian = mResult.median(Result::Measure::cpucycles);
                columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
            }
            if (rInsMedian > 0.0 && rCycMedian > 0.0) {
                columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
            }
            if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {
                double const rBraMedian = mResult.median(Result::Measure::branchinstructions);
                columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
                if (mResult.has(Result::Measure::branchmisses)) {
                    double p = 0.0;
                    if (rBraMedian >= 1e-9) {
                        p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
                    }
                    columns.emplace_back(10, 1, "miss%", "%", p);
                }
            }

            columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));

            // write everything
            auto& os = *mBench.output();

            // combine all elements that are relevant for printing the header
            uint64_t hash = 0;
            hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
            hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);

            if (hash != singletonHeaderHash()) {
                singletonHeaderHash() = hash;

                // no result yet, print header
                os << std::endl;
                for (auto const& col : columns) {
                    os << col.title();
                }
                os << "| " << mBench.title() << std::endl;

                for (auto const& col : columns) {
                    os << col.separator();
                }
                os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
            }

            if (!errorMessage.empty()) {
                // error row: placeholder cells plus the message
                for (auto const& col : columns) {
                    os << col.invalid();
                }
                os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
            } else {
                for (auto const& col : columns) {
                    os << col.value();
                }
                os << "| ";
                // flag results whose median absolute percent error is >= 5%
                auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
                if (showUnstable) {
                    os << ":wavy_dash: ";
                }
                os << fmt::MarkDownCode(mBench.name());
                if (showUnstable) {
                    auto avgIters = d(mTotalNumIters) / d(mBench.epochs());
                    // NOLINTNEXTLINE(bugprone-incorrect-roundings)
                    auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);

                    os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
                       << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
                }
                os << std::endl;
            }
        }
    }

    // An epoch is usable for measurements once it runs at least 2/3 of the
    // target runtime (elapsed * 3 >= target * 2, avoiding division).
    ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
        return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
    }

    uint64_t mNumIters = 1;                            // NOLINT(misc-non-private-member-variables-in-classes)
    Bench const& mBench;                               // NOLINT(misc-non-private-member-variables-in-classes)
    std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // NOLINT(misc-non-private-member-variables-in-classes)
    Result mResult;                                    // NOLINT(misc-non-private-member-variables-in-classes)
    Rng mRng{123};                                     // NOLINT(misc-non-private-member-variables-in-classes)
    std::chrono::nanoseconds mTotalElapsed{};          // NOLINT(misc-non-private-member-variables-in-classes)
    uint64_t mTotalNumIters = 0;                       // NOLINT(misc-non-private-member-variables-in-classes)
    State mState = State::upscaling_runtime;           // NOLINT(misc-non-private-member-variables-in-classes)
};
2446ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2447
// pimpl: construct the implementation object that drives this benchmark run.
IterationLogic::IterationLogic(Bench const& bench)
    : mPimpl(new Impl(bench)) {}
2450
// pimpl: destroy the implementation object (raw pointer owned by this class).
IterationLogic::~IterationLogic() {
    delete mPimpl;
}
2454
// Number of iterations to run for the next epoch; 0 means the run is done.
uint64_t IterationLogic::numIters() const noexcept {
    ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
    return mPimpl->mNumIters;
}
2459
// Forwards one epoch's elapsed time and performance counters to the impl.
void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
    mPimpl->add(elapsed, pc);
}
2463
// Moves the accumulated Result into the given vector (impl's result is left
// in a moved-from state; the run is expected to be finished at this point).
void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
    results.emplace_back(std::move(mPimpl->mResult));
}
2467
2468# if ANKERL_NANOBENCH(PERF_COUNTERS)
2469
2470ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Accesses Linux' perf events API (perf_event_open) to measure hardware/software
// counters around a benchmark run. All events are opened as a single group, so a
// single read() retrieves all counter values atomically.
class LinuxPerformanceCounters {
public:
    // Describes where a monitored counter's value is written back, and which
    // overhead corrections updateResults() should apply to it.
    struct Target {
        Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
            : targetValue(targetValue_)
            , correctMeasuringOverhead(correctMeasuringOverhead_)
            , correctLoopOverhead(correctLoopOverhead_) {}

        uint64_t* targetValue{};         // NOLINT(misc-non-private-member-variables-in-classes)
        bool correctMeasuringOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
        bool correctLoopOverhead{};      // NOLINT(misc-non-private-member-variables-in-classes)
    };

    LinuxPerformanceCounters() = default;
    // Non-copyable/non-movable: owns the perf event group's file descriptor.
    LinuxPerformanceCounters(LinuxPerformanceCounters const&) = delete;
    LinuxPerformanceCounters(LinuxPerformanceCounters&&) = delete;
    LinuxPerformanceCounters& operator=(LinuxPerformanceCounters const&) = delete;
    LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) = delete;
    ~LinuxPerformanceCounters();

    // quick operation
    inline void start() {}

    inline void stop() {}

    // Register a software / hardware event to monitor; returns false on failure.
    bool monitor(perf_sw_ids swId, Target target);
    bool monitor(perf_hw_id hwId, Target target);

    // True once any open/ioctl/read failed; results are then invalid and
    // begin/endMeasure become no-ops.
    ANKERL_NANOBENCH(NODISCARD) bool hasError() const noexcept {
        return mHasError;
    }

    // Just reading data is faster than enable & disabling.
    // we subtract data ourselves.
    inline void beginMeasure() {
        if (mHasError) {
            return;
        }

        // reset all counters of the group, then enable them
        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
        if (mHasError) {
            return;
        }

        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    }

    inline void endMeasure() {
        if (mHasError) {
            return;
        }

        // disable the whole group, then read all counter values in one go
        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
        mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
        if (mHasError) {
            return;
        }

        auto const numBytes = sizeof(uint64_t) * mCounters.size();
        auto ret = read(mFd, mCounters.data(), numBytes);
        mHasError = ret != static_cast<ssize_t>(numBytes);
    }

    // Writes corrected counter values into the registered Targets.
    void updateResults(uint64_t numIters);

    // rounded integer division
    template <typename T>
    static inline T divRounded(T a, T divisor) {
        return (a + divisor / 2) / divisor;
    }

    // Marsaglia xorshift step; used by calibrate() as a minimal, non-optimizable workload.
    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
    static inline uint32_t mix(uint32_t x) noexcept {
        x ^= x << 13U;
        x ^= x >> 17U;
        x ^= x << 5U;
        return x;
    }

    // Determines (a) the fixed overhead of beginMeasure()/endMeasure() around `op`,
    // and (b) the per-iteration overhead of a benchmark loop, storing both so
    // updateResults() can subtract them.
    template <typename Op>
    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
    void calibrate(Op&& op) {
        // clear current calibration data,
        for (auto& v : mCalibratedOverhead) {
            v = UINT64_C(0);
        }

        // create new calibration data
        auto newCalibration = mCalibratedOverhead;
        for (auto& v : newCalibration) {
            v = (std::numeric_limits<uint64_t>::max)();
        }
        // take the minimum over 100 runs: the least disturbed measurement of the overhead
        for (size_t iter = 0; iter < 100; ++iter) {
            beginMeasure();
            op();
            endMeasure();
            if (mHasError) {
                return;
            }

            for (size_t i = 0; i < newCalibration.size(); ++i) {
                auto diff = mCounters[i];
                if (newCalibration[i] > diff) {
                    newCalibration[i] = diff;
                }
            }
        }

        mCalibratedOverhead = std::move(newCalibration);

        {
            // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
            // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
            // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
            // see https://godbolt.org/z/49RVQ5
            // random_device jitter keeps the compiler from constant-folding the loop count
            uint64_t const numIters = 100000U + (std::random_device{}() & 3U);
            uint64_t n = numIters;
            uint32_t x = 1234567;

            beginMeasure();
            while (n-- > 0) {
                x = mix(x);
            }
            endMeasure();
            auto measure1 = mCounters;

            n = numIters;
            beginMeasure();
            while (n-- > 0) {
                // we now run *twice* so we can easily calculate the overhead
                x = mix(x);
                x = mix(x);
            }
            endMeasure();
            auto measure2 = mCounters;

            for (size_t i = 0; i < mCounters.size(); ++i) {
                // factor 2 because we have two instructions per loop
                // loop overhead = 2*measure1 - measure2 (both already corrected for measuring overhead)
                auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
                auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
                auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;

                mLoopOverhead[i] = divRounded(overhead, numIters);
            }
        }
    }

private:
    bool monitor(uint32_t type, uint64_t eventid, Target target);

    // perf event id -> where/how to store its value
    std::map<uint64_t, Target> mIdToTarget{};

    // start with minimum size of 3 for read_format
    // (layout: [0]=nr, [1]=time_enabled, [2]=time_running, then (value, id) pairs)
    std::vector<uint64_t> mCounters{3};
    std::vector<uint64_t> mCalibratedOverhead{3};
    std::vector<uint64_t> mLoopOverhead{3};

    uint64_t mTimeEnabledNanos = 0;
    uint64_t mTimeRunningNanos = 0;
    int mFd = -1;           // group leader fd; -1 until the first monitor() succeeds
    bool mHasError = false;
};
2637ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2638
// Closes the perf event group leader's file descriptor.
LinuxPerformanceCounters::~LinuxPerformanceCounters() {
    if (-1 != mFd) {
        close(mFd);
    }
}

// Monitors a software event (e.g. page faults, context switches).
bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_SOFTWARE, swId, target);
}

// Monitors a hardware event (e.g. CPU cycles, instructions, branches).
bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_HARDWARE, hwId, target);
}
2652
// overflow is ok, it's checked
ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
// Copies the values read in endMeasure() into the registered Target locations,
// subtracting the calibrated measuring overhead and, where requested, the
// per-iteration loop overhead (numIters = iterations of the last batch).
// Buffer layout (per PERF_FORMAT_GROUP|ID|TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING
// set in monitor()): [0]=nr, [1]=time_enabled, [2]=time_running, then (value, id) pairs.
void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
    // clear old data
    for (auto& id_value : mIdToTarget) {
        *id_value.second.targetValue = UINT64_C(0);
    }

    if (mHasError) {
        return;
    }

    mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
    mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];

    for (uint64_t i = 0; i < mCounters[0]; ++i) {
        auto idx = static_cast<size_t>(3 + i * 2 + 0);
        auto id = mCounters[idx + 1U];

        auto it = mIdToTarget.find(id);
        if (it != mIdToTarget.end()) {

            auto& tgt = it->second;
            *tgt.targetValue = mCounters[idx];
            if (tgt.correctMeasuringOverhead) {
                // subtract begin/endMeasure() overhead, clamping at 0
                if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
                    *tgt.targetValue -= mCalibratedOverhead[idx];
                } else {
                    *tgt.targetValue = 0U;
                }
            }
            if (tgt.correctLoopOverhead) {
                // subtract the per-iteration loop overhead, clamping at 0
                auto correctionVal = mLoopOverhead[idx] * numIters;
                if (*tgt.targetValue >= correctionVal) {
                    *tgt.targetValue -= correctionVal;
                } else {
                    *tgt.targetValue = 0U;
                }
            }
        }
    }
}
2695
// Opens one perf event and adds it to the counter group. Returns false (and
// leaves *target.targetValue at UINT64_MAX) on failure.
bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
    *target.targetValue = (std::numeric_limits<uint64_t>::max)();
    if (mHasError) {
        return false;
    }

    auto pea = perf_event_attr();
    std::memset(&pea, 0, sizeof(perf_event_attr));
    pea.type = type;
    pea.size = sizeof(perf_event_attr);
    pea.config = eventid;
    pea.disabled = 1; // start counter as disabled
    pea.exclude_kernel = 1;
    pea.exclude_hv = 1;

    // group read: one read() returns all counters, each tagged with its id
    // NOLINTNEXTLINE(hicpp-signed-bitwise)
    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

    const int pid = 0;  // the current process
    const int cpu = -1; // all CPUs
# if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
    const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
# else
    const unsigned long flags = 0;
# endif

    // mFd is the group leader fd (-1 on the first call, creating a new group)
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
    if (-1 == fd) {
        return false;
    }
    if (-1 == mFd) {
        // first call: set to fd, and use this from now on
        mFd = fd;
    }
    // NOTE(review): only the group leader fd is stored & closed in the destructor;
    // follower fds are not retained here — presumably reclaimed at process exit. Verify.
    uint64_t id = 0;
    // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
        // couldn't get id
        return false;
    }

    // insert into map, rely on the fact that map's references are constant.
    mIdToTarget.emplace(id, target);

    // prepare readformat with the correct size (after the insert)
    auto size = 3 + 2 * mIdToTarget.size();
    mCounters.resize(size);
    mCalibratedOverhead.resize(size);
    mLoopOverhead.resize(size);

    return true;
}
2749
// Sets up all supported HW & SW counters, calibrates the measuring overhead,
// and disables all monitoring if anything went wrong along the way.
PerformanceCounters::PerformanceCounters()
    : mPc(new LinuxPerformanceCounters())
    , mVal()
    , mHas() {

    // HW events
    mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
    if (!mHas.cpuCycles) {
        // Fallback to cycles counter, reference cycles not available in many systems.
        mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
    }
    mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
    mHas.branchInstructions =
        mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
    mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
    // mHas.branchMisses = false;

    // SW events
    mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
    mHas.contextSwitches =
        mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));

    // calibrate against the cheapest possible workload: two clock reads
    mPc->start();
    mPc->calibrate([] {
        auto before = ankerl::nanobench::Clock::now();
        auto after = ankerl::nanobench::Clock::now();
        (void)before;
        (void)after;
    });

    if (mPc->hasError()) {
        // something failed, don't monitor anything.
        mHas = PerfCountSet<bool>{};
    }
}
2785
PerformanceCounters::~PerformanceCounters() {
    // no need to check for nullptr, delete nullptr has no effect
    delete mPc;
}

// Thin forwarders to the Linux perf counters implementation.
void PerformanceCounters::beginMeasure() {
    mPc->beginMeasure();
}

void PerformanceCounters::endMeasure() {
    mPc->endMeasure();
}

void PerformanceCounters::updateResults(uint64_t numIters) {
    mPc->updateResults(numIters);
}
2802
2803# else
2804
// No-op fallback for platforms without Linux perf counter support:
// all has() flags stay false and val() stays zero-initialized.
PerformanceCounters::PerformanceCounters() = default;
PerformanceCounters::~PerformanceCounters() = default;
void PerformanceCounters::beginMeasure() {}
void PerformanceCounters::endMeasure() {}
void PerformanceCounters::updateResults(uint64_t) {}
2810
2811# endif
2812
// Latest counter values; only entries where has() reports true are meaningful.
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
    return mVal;
}
// Which counters are actually available & monitored on this system.
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
    return mHas;
}
2819
2820// formatting utilities
2821namespace fmt {
2822
2823// adds thousands separator to numbers
// adds thousands separator to numbers
NumSep::NumSep(char sep)
    : mSep(sep) {}

// numpunct override: the separator character to insert between groups.
char NumSep::do_thousands_sep() const {
    return mSep;
}

// numpunct override: group digits in threes.
std::string NumSep::do_grouping() const {
    return "\003";
}
2834
2835// RAII to save & restore a stream's state
// RAII to save & restore a stream's state
StreamStateRestorer::StreamStateRestorer(std::ostream& s)
    : mStream(s)
    , mLocale(s.getloc())
    , mPrecision(s.precision())
    , mWidth(s.width())
    , mFill(s.fill())
    , mFmtFlags(s.flags()) {}

// Restores the remembered state on scope exit.
StreamStateRestorer::~StreamStateRestorer() {
    restore();
}

// sets back all stream info that we remembered at construction
void StreamStateRestorer::restore() {
    mStream.imbue(mLocale);
    mStream.precision(mPrecision);
    mStream.width(mWidth);
    mStream.fill(mFill);
    mStream.flags(mFmtFlags);
}
2856
// Fixed-width/-precision number formatter; int64 values are converted to double.
Number::Number(int width, int precision, int64_t value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(d(value)) {}

Number::Number(int width, int precision, double value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(value) {}

// Writes the value in fixed notation with ',' thousands separators at the
// configured width/precision; the restorer puts the stream's state back afterwards.
std::ostream& Number::write(std::ostream& os) const {
    StreamStateRestorer const restorer(os);
    os.imbue(std::locale(os.getloc(), new NumSep(','))); // the locale takes ownership of the facet
    os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
    return os;
}

// Formats into a string via write().
std::string Number::to_s() const {
    std::stringstream ss;
    write(ss);
    return ss.str();
}
2879
// Converts an unsigned integer to its decimal string representation.
// Hand-rolled so the output is independent of locale settings.
std::string to_s(uint64_t n) {
    std::string digits;
    do {
        digits.push_back(static_cast<char>('0' + static_cast<char>(n % 10)));
        n /= 10;
    } while (n != 0);
    // digits were collected least-significant first; reverse into normal order
    return std::string{digits.rbegin(), digits.rend()};
}
2889
std::ostream& operator<<(std::ostream& os, Number const& n) {
    return n.write(os);
}

// One column of the markdown result table: total width, numeric precision,
// header title, unit suffix, and the value to show.
MarkDownColumn::MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept
    : mWidth(w)
    , mPrecision(prec)
    , mTitle(std::move(tit))
    , mSuffix(std::move(suff))
    , mValue(val) {}

// Header cell: '|' followed by the right-aligned title and a trailing space.
std::string MarkDownColumn::title() const {
    std::stringstream ss;
    ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
    return ss.str();
}

// Separator row cell: "|---...--:" marking a right-aligned markdown column.
std::string MarkDownColumn::separator() const {
    std::string sep(static_cast<size_t>(mWidth), '-');
    sep.front() = '|';
    sep.back() = ':';
    return sep;
}

// Cell for a missing value: blanks with a single '-' placeholder near the end.
std::string MarkDownColumn::invalid() const {
    std::string sep(static_cast<size_t>(mWidth), ' ');
    sep.front() = '|';
    sep[sep.size() - 2] = '-';
    return sep;
}

// Data cell: the formatted number followed by the unit suffix.
std::string MarkDownColumn::value() const {
    std::stringstream ss;
    auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
    ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
    return ss.str();
}
2927
2928// Formats any text as markdown code, escaping backticks.
2929MarkDownCode::MarkDownCode(std::string const& what) {
2930 mWhat.reserve(what.size() + 2);
2931 mWhat.push_back('`');
2932 for (char const c : what) {
2933 mWhat.push_back(c);
2934 if ('`' == c) {
2935 mWhat.push_back('`');
2936 }
2937 }
2938 mWhat.push_back('`');
2939}
2940
// Writes the pre-escaped markdown code text to the stream.
std::ostream& MarkDownCode::write(std::ostream& os) const {
    return os << mWhat;
}

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
    return mdCode.write(os);
}
2948} // namespace fmt
2949} // namespace detail
2950
2951// provide implementation here so it's only generated once
// Config's special members are defaulted; only defined here (out of line)
// so they are generated once, in this translation unit.
Config::Config() = default;
Config::~Config() = default;
Config& Config::operator=(Config const&) = default;
Config& Config::operator=(Config&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
Config::Config(Config const&) = default;
Config::Config(Config&&) noexcept = default;

// provide implementation here so it's only generated once
Result::~Result() = default;
Result& Result::operator=(Result const&) = default;
Result& Result::operator=(Result&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
Result::Result(Result const&) = default;
Result::Result(Result&&) noexcept = default;
2965
namespace detail {
// Converts an enum value to its underlying integral type (used for array indexing).
template <typename T>
inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
    return static_cast<typename std::underlying_type<T>::type>(val);
}
} // namespace detail
2972
2973// Result returned after a benchmark has finished. Can be used as a baseline for relative().
// One measurement vector per Measure enum entry (sized via _size).
Result::Result(Config benchmarkConfig)
    : mConfig(std::move(benchmarkConfig))
    , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
2977
// Records one epoch's measurements, normalized per iteration. Performance
// counter values are only stored when the corresponding counter is available.
void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
    using detail::d;
    using detail::u;

    double const dIters = d(iters);
    mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);

    mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
    if (pc.has().pageFaults) {
        mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
    }
    if (pc.has().cpuCycles) {
        mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
    }
    if (pc.has().contextSwitches) {
        mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
    }
    if (pc.has().instructions) {
        mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
    }
    if (pc.has().branchInstructions) {
        double branchInstructions = 0.0;
        // correcting branches: remove branch introduced by the while (...) loop for each iteration.
        if (pc.val().branchInstructions > iters + 1U) {
            branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
        }
        mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);

        if (pc.has().branchMisses) {
            // correcting branch misses
            double branchMisses = d(pc.val().branchMisses);
            if (branchMisses > branchInstructions) {
                // can't have branch misses when there were branches...
                branchMisses = branchInstructions;
            }

            // assuming at least one missed branch for the loop
            branchMisses -= 1.0;
            if (branchMisses < 1.0) {
                branchMisses = 1.0;
            }
            mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
        }
    }
}
3023
// The benchmark configuration this result was produced with.
Config const& Result::config() const noexcept {
    return mConfig;
}
3027
// Sorts `data` in place and returns its median; 0.0 for an empty vector.
// For an even number of elements the two middle values are averaged.
inline double calcMedian(std::vector<double>& data) {
    if (data.empty()) {
        return 0.0;
    }
    std::sort(data.begin(), data.end());

    auto const mid = data.size() / 2U;
    bool const oddCount = (data.size() % 2U) != 0U;
    return oddCount ? data[mid] : (data[mid - 1U] + data[mid]) / 2.0;
}
3040
// Median of all recorded values for measure m (0.0 if none recorded).
double Result::median(Measure m) const {
    // create a copy so we can sort
    auto data = mNameToMeasurements[detail::u(m)];
    return calcMedian(data);
}

// Arithmetic mean of all recorded values for measure m (0.0 if none recorded).
double Result::average(Measure m) const {
    using detail::d;
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // mean = sum / count; no copy needed since nothing is modified
    return sum(m) / d(data.size());
}
3057
// Median absolute percentage error of measure m: median of |x - median| / x.
double Result::medianAbsolutePercentError(Measure m) const {
    // create copy
    auto data = mNameToMeasurements[detail::u(m)];

    // calculates MdAPE which is the median of percentage error
    // see https://support.numxl.com/hc/en-us/articles/115001223503-MdAPE-Median-Absolute-Percentage-Error
    auto med = calcMedian(data);

    // transform the data to absolute error
    // NOTE(review): a measurement of exactly 0 would produce inf/NaN here — presumably
    // elapsed-time measurements are always nonzero; verify for other measures.
    for (auto& x : data) {
        x = (x - med) / x;
        if (x < 0) {
            x = -x;
        }
    }
    return calcMedian(data);
}
3075
// Sum of all recorded values for measure m.
double Result::sum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return std::accumulate(data.begin(), data.end(), 0.0);
}

// Sum of element-wise products of two measures; 0.0 if their sizes differ.
double Result::sumProduct(Measure m1, Measure m2) const noexcept {
    auto const& data1 = mNameToMeasurements[detail::u(m1)];
    auto const& data2 = mNameToMeasurements[detail::u(m2)];

    if (data1.size() != data2.size()) {
        return 0.0;
    }

    double result = 0.0;
    for (size_t i = 0, s = data1.size(); i != s; ++i) {
        result += data1[i] * data2[i];
    }
    return result;
}

// True if at least one value was recorded for measure m.
bool Result::has(Measure m) const noexcept {
    return !mNameToMeasurements[detail::u(m)].empty();
}

// Value of measure m at epoch index idx; throws std::out_of_range if idx is invalid.
double Result::get(size_t idx, Measure m) const {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return data.at(idx);
}
3104
bool Result::empty() const noexcept {
    return 0U == size();
}

// Number of recorded epochs; the always-recorded elapsed measure is representative.
size_t Result::size() const noexcept {
    auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
    return data.size();
}

// Smallest recorded value for measure m (0.0 if none recorded).
double Result::minimum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // here it's safe to assume that at least one element is present
    return *std::min_element(data.begin(), data.end());
}

// Largest recorded value for measure m (0.0 if none recorded).
double Result::maximum(Measure m) const noexcept {
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // here it's safe to assume that at least one element is present
    return *std::max_element(data.begin(), data.end());
}
3133
// Looks up a context variable by name; throws std::out_of_range if not set.
std::string const& Result::context(char const* variableName) const {
    return mConfig.mContext.at(variableName);
}

std::string const& Result::context(std::string const& variableName) const {
    return mConfig.mContext.at(variableName);
}
3141
3142Result::Measure Result::fromString(std::string const& str) {
3143 if (str == "elapsed") {
3144 return Measure::elapsed;
3145 }
3146 if (str == "iterations") {
3147 return Measure::iterations;
3148 }
3149 if (str == "pagefaults") {
3150 return Measure::pagefaults;
3151 }
3152 if (str == "cpucycles") {
3153 return Measure::cpucycles;
3154 }
3155 if (str == "contextswitches") {
3156 return Measure::contextswitches;
3157 }
3158 if (str == "instructions") {
3159 return Measure::instructions;
3160 }
3161 if (str == "branchinstructions") {
3162 return Measure::branchinstructions;
3163 }
3164 if (str == "branchmisses") {
3165 return Measure::branchmisses;
3166 }
3167 // not found, return _size
3168 return Measure::_size;
3169}
3170
3171// Configuration of a microbenchmark.
// New benchmark; output goes to std::cout by default (see Bench::output()).
Bench::Bench() {
    mConfig.mOut = &std::cout;
}

Bench::Bench(Bench&&) noexcept = default;
Bench& Bench::operator=(Bench&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
Bench::Bench(Bench const&) = default;
Bench& Bench::operator=(Bench const&) = default;
Bench::~Bench() noexcept = default;
3181
// Currently configured batch size (number of "operations" per iteration).
double Bench::batch() const noexcept {
    return mConfig.mBatch;
}

// Currently configured complexity N for asymptotic (BigO) analysis.
double Bench::complexityN() const noexcept {
    return mConfig.mComplexityN;
}

// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100%
// means it is slower than the baseline.
Bench& Bench::relative(bool isRelativeEnabled) noexcept {
    mConfig.mIsRelative = isRelativeEnabled;
    return *this;
}
bool Bench::relative() const noexcept {
    return mConfig.mIsRelative;
}

// Enables/disables display of performance counter columns in the output.
Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
    mConfig.mShowPerformanceCounters = showPerformanceCounters;
    return *this;
}
bool Bench::performanceCounters() const noexcept {
    return mConfig.mShowPerformanceCounters;
}
3207
// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
// If u differs from currently set unit, the stored results will be cleared.
// Use singular (byte, not bytes).
Bench& Bench::unit(char const* u) {
    if (u != mConfig.mUnit) {
        mResults.clear();
    }
    mConfig.mUnit = u;
    return *this;
}

Bench& Bench::unit(std::string const& u) {
    return unit(u.c_str());
}

std::string const& Bench::unit() const noexcept {
    return mConfig.mUnit;
}

// Time unit used for reporting (e.g. 1ns named "ns"); does not affect measuring.
Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
    mConfig.mTimeUnit = tu;
    mConfig.mTimeUnitName = tuName;
    return *this;
}

std::string const& Bench::timeUnitName() const noexcept {
    return mConfig.mTimeUnitName;
}

std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
    return mConfig.mTimeUnit;
}
3240
// If benchmarkTitle differs from currently set title, the stored results will be cleared.
Bench& Bench::title(const char* benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}
Bench& Bench::title(std::string const& benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}

std::string const& Bench::title() const noexcept {
    return mConfig.mBenchmarkTitle;
}

// Name of the benchmark (one row of the result table); does not clear results.
Bench& Bench::name(const char* benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

Bench& Bench::name(std::string const& benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

std::string const& Bench::name() const noexcept {
    return mConfig.mBenchmarkName;
}
3274
// Sets (or overwrites) a context key/value pair attached to subsequent results.
Bench& Bench::context(char const* variableName, char const* variableValue) {
    mConfig.mContext[variableName] = variableValue;
    return *this;
}

Bench& Bench::context(std::string const& variableName, std::string const& variableValue) {
    mConfig.mContext[variableName] = variableValue;
    return *this;
}

// Removes all context key/value pairs.
Bench& Bench::clearContext() {
    mConfig.mContext.clear();
    return *this;
}

// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
Bench& Bench::epochs(size_t numEpochs) noexcept {
    mConfig.mNumEpochs = numEpochs;
    return *this;
}
size_t Bench::epochs() const noexcept {
    return mConfig.mNumEpochs;
}
3298
// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.
Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
    mConfig.mClockResolutionMultiple = multiple;
    return *this;
}
size_t Bench::clockResolutionMultiple() const noexcept {
    return mConfig.mClockResolutionMultiple;
}

// Sets the maximum time each epoch should take. Default is 100ms.
Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMaxEpochTime = t;
    return *this;
}
std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
    return mConfig.mMaxEpochTime;
}
3316
// Sets the minimum time each epoch should take.
// (previous comment said "maximum" — copy-paste from maxEpochTime above)
Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMinEpochTime = t;
    return *this;
}
std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
    return mConfig.mMinEpochTime;
}

// Minimum number of iterations per epoch; 0 is clamped to 1.
Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
    mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
    return *this;
}
uint64_t Bench::minEpochIterations() const noexcept {
    return mConfig.mMinEpochIterations;
}
3333
// Fixed number of iterations per epoch (overrides automatic upscaling when nonzero).
Bench& Bench::epochIterations(uint64_t numIters) noexcept {
    mConfig.mEpochIterations = numIters;
    return *this;
}
uint64_t Bench::epochIterations() const noexcept {
    return mConfig.mEpochIterations;
}

// Number of unmeasured warmup iterations to run before measuring.
Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
    mConfig.mWarmup = numWarmupIters;
    return *this;
}
uint64_t Bench::warmup() const noexcept {
    return mConfig.mWarmup;
}

// Replaces the whole configuration at once.
Bench& Bench::config(Config const& benchmarkConfig) {
    mConfig = benchmarkConfig;
    return *this;
}
Config const& Bench::config() const noexcept {
    return mConfig;
}
3357
// Output stream for the result table; pass nullptr to disable output.
Bench& Bench::output(std::ostream* outstream) noexcept {
    mConfig.mOut = outstream;
    return *this;
}

ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
    return mConfig.mOut;
}

// All results collected so far by this bench object.
std::vector<Result> const& Bench::results() const noexcept {
    return mResults;
}

// Renders the collected results with the given mustache-like template.
Bench& Bench::render(char const* templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}

Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}
3380
// Fits the collected (complexityN, median elapsed) points against a set of
// candidate complexity curves and returns them sorted best fit first
// (BigO::operator< orders by normalized root-mean-square error).
std::vector<BigO> Bench::complexityBigO() const {
    std::vector<BigO> bigOs;
    auto rangeMeasure = BigO::collectRangeMeasure(mResults);
    bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
        return 1.0;
    });
    bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
        return n;
    });
    bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
        return std::log2(n);
    });
    bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
        return n * std::log2(n);
    });
    bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
        return n * n;
    });
    bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
        return n * n * n;
    });
    std::sort(bigOs.begin(), bigOs.end());
    return bigOs;
}
3405
// Default-constructs the RNG from std::random_device; the all-zero state is
// rejected because the generator could never leave it.
Rng::Rng()
    : mX(0)
    , mY(0) {
    std::random_device rd;
    std::uniform_int_distribution<uint64_t> dist;
    do {
        mX = dist(rd);
        mY = dist(rd);
    } while (mX == 0 && mY == 0);
}
3416
3417ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
3418uint64_t splitMix64(uint64_t& state) noexcept {
3419 uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3420 z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3421 z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3422 return z ^ (z >> 31U);
3423}
3424
// Seeded as described in romu paper (update april 2020)
// NOTE: both initializers advance `seed`, so mX gets the first SplitMix64 output and
// mY the second — assumes mX is declared before mY (member init order); verify in decl.
Rng::Rng(uint64_t seed) noexcept
    : mX(splitMix64(seed))
    , mY(splitMix64(seed)) {
    // warm up the generator state
    for (size_t i = 0; i < 10; ++i) {
        operator()();
    }
}
3433
// only internally used to copy the RNG.
Rng::Rng(uint64_t x, uint64_t y) noexcept
    : mX(x)
    , mY(y) {}

// Explicit copy: returns an Rng with identical state (same future output sequence).
Rng Rng::copy() const noexcept {
    return Rng{mX, mY};
}
3442
// Reconstructs an Rng from state() output; throws if data doesn't hold exactly 2 words.
Rng::Rng(std::vector<uint64_t> const& data)
    : mX(0)
    , mY(0) {
    if (data.size() != 2) {
        throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
                                 detail::fmt::to_s(data.size()));
    }
    mX = data[0];
    mY = data[1];
}
3453
3454std::vector<uint64_t> Rng::state() const {
3455 std::vector<uint64_t> data(2);
3456 data[0] = mX;
3457 data[1] = mY;
3458 return data;
3459}
3460
// Collects (complexityN, median elapsed) pairs from all results that were run
// with a positive complexityN; others are skipped.
BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
    BigO::RangeMeasure rangeMeasure;
    for (auto const& result : results) {
        if (result.config().mComplexityN > 0.0) {
            rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
        }
    }
    return rangeMeasure;
}
3470
// Least-squares fit through the origin: measure ≈ constant * range.
// Also computes the root-mean-square error normalized by the mean measurement.
BigO::BigO(std::string bigOName, RangeMeasure const& rangeMeasure)
    : mName(std::move(bigOName)) {

    // estimate the constant factor
    // (least squares through origin: c = sum(x*y) / sum(x*x))
    double sumRangeMeasure = 0.0;
    double sumRangeRange = 0.0;

    for (const auto& rm : rangeMeasure) {
        sumRangeMeasure += rm.first * rm.second;
        sumRangeRange += rm.first * rm.first;
    }
    // NOTE(review): an empty rangeMeasure yields 0/0 (NaN) here — presumably callers
    // always provide at least one measurement; verify.
    mConstant = sumRangeMeasure / sumRangeRange;

    // calculate root mean square
    double err = 0.0;
    double sumMeasure = 0.0;
    for (const auto& rm : rangeMeasure) {
        auto diff = mConstant * rm.first - rm.second;
        err += diff * diff;

        sumMeasure += rm.second;
    }

    auto n = detail::d(rangeMeasure.size());
    auto mean = sumMeasure / n;
    mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
}

BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
    : BigO(std::string(bigOName), rangeMeasure) {}
3501
// Name of the fitted curve, e.g. "O(n)".
std::string const& BigO::name() const noexcept {
    return mName;
}

// The fitted constant factor c in measure ≈ c * range.
double BigO::constant() const noexcept {
    return mConstant;
}

// RMS fit error normalized by the mean measurement; lower is a better fit.
double BigO::normalizedRootMeanSquare() const noexcept {
    return mNormalizedRootMeanSquare;
}

// Orders by fit quality (then name), so sorting puts the best fit first.
bool BigO::operator<(BigO const& other) const noexcept {
    return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
}
3517
// Single-line representation: "<constant> * <name>, rms=<error>".
std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
    return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
}

// Markdown table of all fitted curves (coefficient, error %, complexity name).
std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
    detail::fmt::StreamStateRestorer const restorer(os);
    os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
    for (auto const& bigO : bigOs) {
        os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
        os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
        os << "| " << bigO.name();
        os << std::endl;
    }
    return os;
}
3533
3534} // namespace nanobench
3535} // namespace ankerl
3536
3537#endif // ANKERL_NANOBENCH_IMPLEMENT
3538#endif // ANKERL_NANOBENCH_H_INCLUDED
int ret
int flags
Definition: bitcoin-tx.cpp:530
Main entry point to nanobench's benchmarking facility.
Definition: nanobench.h:632
Bench & operator=(Bench const &other)
ANKERL_NANOBENCH(NODISCARD) std Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
Definition: nanobench.h:1288
Bench & batch(T b) noexcept
Sets the batch size.
Definition: nanobench.h:1312
std::vector< BigO > complexityBigO() const
Bench()
Creates a new benchmark for configuration and running of benchmarks.
Bench & operator=(Bench &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
Bench & runImpl(SetupOp &setupOp, Op &&op)
Definition: nanobench.h:1259
detail::SetupRunner< SetupOp > setup(SetupOp setupOp)
Configure an untimed setup step per epoch (fluent API).
Definition: nanobench.h:1282
Bench(Bench &&other) noexcept
Bench(Bench const &other)
Bench & complexityN(T n) noexcept
Definition: nanobench.h:1319
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)
Definition: nanobench.h:1135
BigO(std::string bigOName, RangeMeasure const &scaledRangeMeasure)
std::vector< std::pair< double, double > > RangeMeasure
Definition: nanobench.h:1132
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1145
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)
BigO(std::string bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1149
BigO(char const *bigOName, RangeMeasure const &scaledRangeMeasure)
Result(Config benchmarkConfig)
static Measure fromString(std::string const &str)
void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const &pc)
Result(Result &&other) noexcept
ANKERL_NANOBENCH(NODISCARD) Config const &config() const noexcept
Result & operator=(Result const &other)
Result(Result const &other)
Result & operator=(Result &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
An extremely fast random generator.
Definition: nanobench.h:493
static constexpr uint64_t() min()
Rng(Rng const &)=delete
As a safety precaution, we don't allow copying.
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
Definition: nanobench.h:1210
Rng(Rng &&) noexcept=default
Rng & operator=(Rng const &)=delete
Same as Rng(Rng const&), we don't allow assignment.
static constexpr uint64_t() max()
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
Definition: nanobench.h:1200
uint64_t result_type
This RNG provides 64bit randomness.
Definition: nanobench.h:498
void moveResultTo(std::vector< Result > &results) noexcept
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept
IterationLogic(IterationLogic &&)=delete
IterationLogic & operator=(IterationLogic const &)=delete
ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept
IterationLogic(IterationLogic const &)=delete
IterationLogic & operator=(IterationLogic &&)=delete
PerformanceCounters(PerformanceCounters const &)=delete
PerformanceCounters & operator=(PerformanceCounters const &)=delete
PerformanceCounters & operator=(PerformanceCounters &&)=delete
ANKERL_NANOBENCH(NODISCARD) PerfCountSet< uint64_t > const &val() const noexcept
PerformanceCounters(PerformanceCounters &&)=delete
SetupRunner(SetupOp setupOp, Bench &bench)
Definition: nanobench.h:1234
volatile double sum
Definition: examples.cpp:10
#define T(expected, seed, data)
void doNotOptimizeAway(T &val)
Definition: nanobench.h:1064
PerformanceCounters & performanceCounters()
void doNotOptimizeAway(T const &val)
Definition: nanobench.h:1058
char const * json() noexcept
Template to generate JSON data.
char const * csv() noexcept
CSV data for the benchmark results.
char const * pyperf() noexcept
Output in pyperf compatible JSON format, which can be used for more analyzation.
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an exampl...
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
Definition: nanobench.h:133
void render(std::string const &mustacheTemplate, std::vector< Result > const &results, std::ostream &out)
std::ostream & operator<<(std::ostream &os, BigO const &bigO)
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
Definition: nanobench.h:1333
Definition: common.h:30
#define ANKERL_NANOBENCH_LOG(x)
Definition: nanobench.h:87
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
Definition: nanobench.h:106
#define ANKERL_NANOBENCH(x)
Definition: nanobench.h:49
bool operator==(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:603
bool operator<(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:608
const char * name
Definition: rest.cpp:49
static RPCMethod stop()
Definition: server.cpp:145
Config & operator=(Config const &other)
Config(Config const &other)
Config & operator=(Config &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
Config(Config &&other) noexcept
static SECP256K1_INLINE uint64_t rotl(const uint64_t x, int k)
Definition: testrand_impl.h:39
static int setup(void)
Definition: tests.c:7808
static int count