Bitcoin Core 31.99.0
P2P Digital Currency
nanobench.h
Go to the documentation of this file.
1// __ _ _______ __ _ _____ ______ _______ __ _ _______ _ _
2// | \ | |_____| | \ | | | |_____] |______ | \ | | |_____|
3// | \_| | | | \_| |_____| |_____] |______ | \_| |_____ | |
4//
5// Microbenchmark framework for C++11/14/17/20
6// https://github.com/martinus/nanobench
7//
8// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
9// SPDX-License-Identifier: MIT
10// Copyright (c) 2019-2023 Martin Leitner-Ankerl <martin.ankerl@gmail.com>
11//
12// Permission is hereby granted, free of charge, to any person obtaining a copy
13// of this software and associated documentation files (the "Software"), to deal
14// in the Software without restriction, including without limitation the rights
15// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16// copies of the Software, and to permit persons to whom the Software is
17// furnished to do so, subject to the following conditions:
18//
19// The above copyright notice and this permission notice shall be included in all
20// copies or substantial portions of the Software.
21//
22// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28// SOFTWARE.
29
30#ifndef ANKERL_NANOBENCH_H_INCLUDED
31#define ANKERL_NANOBENCH_H_INCLUDED
32
33// see https://semver.org/
34#define ANKERL_NANOBENCH_VERSION_MAJOR 4 // incompatible API changes
35#define ANKERL_NANOBENCH_VERSION_MINOR 3 // backwards-compatible changes
36#define ANKERL_NANOBENCH_VERSION_PATCH 11 // backwards-compatible bug fixes
37
39// public facing api - as minimal as possible
41
42#include <chrono> // high_resolution_clock
43#include <cassert> // assert
44#include <cstring> // memcpy
45#include <iosfwd> // for std::ostream* custom output target in Config
46#include <string> // all names
47#include <unordered_map> // holds context information of results
48#include <vector> // holds all results
49
50#define ANKERL_NANOBENCH(x) ANKERL_NANOBENCH_PRIVATE_##x()
51
52#define ANKERL_NANOBENCH_PRIVATE_CXX() __cplusplus
53#define ANKERL_NANOBENCH_PRIVATE_CXX98() 199711L
54#define ANKERL_NANOBENCH_PRIVATE_CXX11() 201103L
55#define ANKERL_NANOBENCH_PRIVATE_CXX14() 201402L
56#define ANKERL_NANOBENCH_PRIVATE_CXX17() 201703L
57
58#if ANKERL_NANOBENCH(CXX) >= ANKERL_NANOBENCH(CXX17)
59# define ANKERL_NANOBENCH_PRIVATE_NODISCARD() [[nodiscard]]
60#else
61# define ANKERL_NANOBENCH_PRIVATE_NODISCARD()
62#endif
63
64#if defined(__clang__)
65# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH() \
66 _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wpadded\"")
67# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP() _Pragma("clang diagnostic pop")
68#else
69# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_PUSH()
70# define ANKERL_NANOBENCH_PRIVATE_IGNORE_PADDED_POP()
71#endif
72
73#if defined(__GNUC__)
74# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH() _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Weffc++\"")
75# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP() _Pragma("GCC diagnostic pop")
76#else
77# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_PUSH()
78# define ANKERL_NANOBENCH_PRIVATE_IGNORE_EFFCPP_POP()
79#endif
80
81#if defined(ANKERL_NANOBENCH_LOG_ENABLED)
82# include <iostream>
83# define ANKERL_NANOBENCH_LOG(x) \
84 do { \
85 std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl; \
86 } while (0)
87#else
88# define ANKERL_NANOBENCH_LOG(x) \
89 do { \
90 } while (0)
91#endif
92
93#define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 0
94#if defined(__linux__) && !defined(ANKERL_NANOBENCH_DISABLE_PERF_COUNTERS)
95# include <linux/version.h>
96# if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0)
97// PERF_COUNT_HW_REF_CPU_CYCLES only available since kernel 3.3
98// PERF_FLAG_FD_CLOEXEC since kernel 3.14
99# undef ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS
100# define ANKERL_NANOBENCH_PRIVATE_PERF_COUNTERS() 1
101# endif
102#endif
103
104#if defined(__clang__)
105# define ANKERL_NANOBENCH_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__)))
106#else
107# define ANKERL_NANOBENCH_NO_SANITIZE(...)
108#endif
109
110#if defined(_MSC_VER)
111# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __declspec(noinline)
112#else
113# define ANKERL_NANOBENCH_PRIVATE_NOINLINE() __attribute__((noinline))
114#endif
115
116// workaround missing "is_trivially_copyable" in g++ < 5.0
117// See https://stackoverflow.com/a/31798726/48181
118#if defined(__GNUC__) && __GNUC__ < 5
119# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__)
120#else
121# define ANKERL_NANOBENCH_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value
122#endif
123
124// noexcept may be missing for std::string.
125// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58265
126#define ANKERL_NANOBENCH_PRIVATE_NOEXCEPT_STRING_MOVE() std::is_nothrow_move_assignable<std::string>::value
127
128// declarations ///////////////////////////////////////////////////////////////////////////////////
129
130namespace ankerl {
131namespace nanobench {
132
133using Clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock,
134 std::chrono::steady_clock>::type;
135class Bench;
136struct Config;
137class Result;
138class Rng;
139class BigO;
140
141namespace detail {
142template <typename SetupOp>
143class SetupRunner;
144} // namespace detail
145
296void render(char const* mustacheTemplate, Bench const& bench, std::ostream& out);
297void render(std::string const& mustacheTemplate, Bench const& bench, std::ostream& out);
298
307void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
308void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out);
309
310// Contains mustache-like templates
311namespace templates {
312
322char const* csv() noexcept;
323
334char const* htmlBoxplot() noexcept;
335
342char const* pyperf() noexcept;
343
353char const* json() noexcept;
354
355} // namespace templates
356
357namespace detail {
358
359template <typename T>
360struct PerfCountSet;
361
362class IterationLogic;
363class PerformanceCounters;
364
365#if ANKERL_NANOBENCH(PERF_COUNTERS)
366class LinuxPerformanceCounters;
367#endif
368
369} // namespace detail
370} // namespace nanobench
371} // namespace ankerl
372
373// definitions ////////////////////////////////////////////////////////////////////////////////////
374
375namespace ankerl {
376namespace nanobench {
377namespace detail {
378
379template <typename T>
387};
388
389} // namespace detail
390
391ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
struct Config {
    // actual benchmark config
    std::string mBenchmarkTitle = "benchmark"; // NOLINT(misc-non-private-member-variables-in-classes)
    std::string mBenchmarkName = "noname";     // NOLINT(misc-non-private-member-variables-in-classes)
    std::string mUnit = "op";                  // NOLINT(misc-non-private-member-variables-in-classes)
    // scale factor applied to each iteration's measurement (e.g. bytes processed per op)
    double mBatch = 1.0;       // NOLINT(misc-non-private-member-variables-in-classes)
    // problem size N for big-O fitting; negative means "not set"
    double mComplexityN = -1.0; // NOLINT(misc-non-private-member-variables-in-classes)
    // number of measurement epochs; the median over epochs is reported
    size_t mNumEpochs = 11; // NOLINT(misc-non-private-member-variables-in-classes)
    // target epoch time as a multiple of the measured clock resolution
    size_t mClockResolutionMultiple = static_cast<size_t>(1000); // NOLINT(misc-non-private-member-variables-in-classes)
    std::chrono::nanoseconds mMaxEpochTime = std::chrono::milliseconds(100); // NOLINT(misc-non-private-member-variables-in-classes)
    std::chrono::nanoseconds mMinEpochTime = std::chrono::milliseconds(1);   // NOLINT(misc-non-private-member-variables-in-classes)
    uint64_t mMinEpochIterations{1}; // NOLINT(misc-non-private-member-variables-in-classes)
    // If not 0, run *exactly* these number of iterations per epoch.
    uint64_t mEpochIterations{0}; // NOLINT(misc-non-private-member-variables-in-classes)
    // iterations executed (unmeasured) before the first epoch
    uint64_t mWarmup = 0; // NOLINT(misc-non-private-member-variables-in-classes)
    // output target for the markdown table; nullptr suppresses output
    std::ostream* mOut = nullptr; // NOLINT(misc-non-private-member-variables-in-classes)
    std::chrono::duration<double> mTimeUnit = std::chrono::nanoseconds{1}; // NOLINT(misc-non-private-member-variables-in-classes)
    std::string mTimeUnitName = "ns"; // NOLINT(misc-non-private-member-variables-in-classes)
    bool mShowPerformanceCounters = true; // NOLINT(misc-non-private-member-variables-in-classes)
    bool mIsRelative = false;             // NOLINT(misc-non-private-member-variables-in-classes)
    // free-form key/value pairs rendered by templates via {{context(name)}}
    std::unordered_map<std::string, std::string> mContext{}; // NOLINT(misc-non-private-member-variables-in-classes)

    // NOTE(review): the default constructor/destructor declarations appear to have
    // been dropped by the doc extraction here — confirm against upstream.
    Config& operator=(Config const& other);
    Config& operator=(Config&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
    Config(Config const& other);
    Config(Config&& other) noexcept;
};
421ANKERL_NANOBENCH(IGNORE_PADDED_POP)
422
423// Result returned after a benchmark has finished. Can be used as a baseline for relative().
424ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
class Result {
public:
    // All quantities tracked per epoch. _size is a sentinel used to dimension
    // the measurement storage and as "not found" by fromString().
    enum class Measure : size_t {
        elapsed,
        iterations,
        pagefaults,
        cpucycles,
        contextswitches,
        instructions,
        branchinstructions,
        branchmisses,
        _size
    };

    explicit Result(Config benchmarkConfig);

    Result& operator=(Result const& other);
    Result& operator=(Result&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
    Result(Result const& other);
    Result(Result&& other) noexcept;

    // adds new measurement results
    // all values are scaled by iters (except iters...)
    void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc);

    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;

    // Per-measure statistics over all recorded epochs.
    ANKERL_NANOBENCH(NODISCARD) double median(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double medianAbsolutePercentError(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double average(Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) double sum(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double sumProduct(Measure m1, Measure m2) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double minimum(Measure m) const noexcept;
    ANKERL_NANOBENCH(NODISCARD) double maximum(Measure m) const noexcept;
    // Access to the context key/value pairs set via Bench::context().
    ANKERL_NANOBENCH(NODISCARD) std::string const& context(char const* variableName) const;
    ANKERL_NANOBENCH(NODISCARD) std::string const& context(std::string const& variableName) const;

    // has(): whether any data was recorded for m (e.g. perf counters may be absent).
    ANKERL_NANOBENCH(NODISCARD) bool has(Measure m) const noexcept;
    // get(): raw value of measure m in epoch idx.
    ANKERL_NANOBENCH(NODISCARD) double get(size_t idx, Measure m) const;
    ANKERL_NANOBENCH(NODISCARD) bool empty() const noexcept;
    ANKERL_NANOBENCH(NODISCARD) size_t size() const noexcept;

    // Finds string, if not found, returns _size.
    static Measure fromString(std::string const& str);

private:
    Config mConfig{};
    // indexed by Measure (outer) and epoch (inner)
    std::vector<std::vector<double>> mNameToMeasurements{};
};
475ANKERL_NANOBENCH(IGNORE_PADDED_POP)
476
477
// Fast pseudorandom number generator (RomuDuoJr-style, see the member
// definitions below). Satisfies UniformRandomBitGenerator, so it can be
// plugged into <random> distributions.
class Rng final {
public:
    using result_type = uint64_t;

    static constexpr uint64_t(min)();
    static constexpr uint64_t(max)();

    // copying is disallowed so two benchmarks can't accidentally share a stream;
    // use copy() to make the intent explicit
    Rng(Rng const&) = delete;

    Rng& operator=(Rng const&) = delete;

    // moving is ok
    Rng(Rng&&) noexcept = default;
    Rng& operator=(Rng&&) noexcept = default;
    ~Rng() noexcept = default;

    // NOTE(review): the default constructor declaration appears to have been
    // dropped by the doc extraction here — confirm against upstream.
    explicit Rng(uint64_t seed) noexcept;
    Rng(uint64_t x, uint64_t y) noexcept;
    // restore a generator from state() output
    explicit Rng(std::vector<uint64_t> const& data);

    // explicit replacement for the deleted copy constructor
    ANKERL_NANOBENCH(NODISCARD) Rng copy() const noexcept;

    // next 64-bit random value
    inline uint64_t operator()() noexcept;

    // This is slightly biased (multiply-shift bounding, no rejection step);
    // returns a value in [0, range).
    inline uint32_t bounded(uint32_t range) noexcept;

    // random double in range [0, 1(
    // see http://prng.di.unimi.it/
    inline double uniform01() noexcept;

    // randomly reorder the container's elements using this generator
    template <typename Container>
    void shuffle(Container& container) noexcept;

    // serializable internal state, accepted by the vector constructor above
    ANKERL_NANOBENCH(NODISCARD) std::vector<uint64_t> state() const;

private:
    static constexpr uint64_t rotl(uint64_t x, unsigned k) noexcept;

    uint64_t mX;
    uint64_t mY;
};
617
632ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Main entry point of the library: configure a benchmark with the fluent
// setters (each returns *this for chaining), then call run().
class Bench {
public:
    // NOTE(review): the default constructor declaration (Bench();) appears to
    // have been dropped by the doc extraction here — confirm against upstream.
    Bench(Bench&& other) noexcept;
    Bench& operator=(Bench&& other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE));
    Bench(Bench const& other);
    Bench& operator=(Bench const& other);
    ~Bench() noexcept;

    // Runs op() repeatedly, measuring it; sets the benchmark name first.
    template <typename Op>
    ANKERL_NANOBENCH(NOINLINE)
    Bench& run(char const* benchmarkName, Op&& op);

    template <typename Op>
    ANKERL_NANOBENCH(NOINLINE)
    Bench& run(std::string const& benchmarkName, Op&& op);

    // Same, but keeps the previously-set name.
    template <typename Op>
    ANKERL_NANOBENCH(NOINLINE)
    Bench& run(Op&& op);

    // Title shown above the result table.
    Bench& title(char const* benchmarkTitle);
    Bench& title(std::string const& benchmarkTitle);

    ANKERL_NANOBENCH(NODISCARD) std::string const& title() const noexcept;

    // Name of the next benchmark run.
    Bench& name(char const* benchmarkName);
    Bench& name(std::string const& benchmarkName);
    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;

    // Free-form key/value pairs attached to subsequent results (available to
    // render templates as {{context(variableName)}}).
    Bench& context(char const* variableName, char const* variableValue);
    Bench& context(std::string const& variableName, std::string const& variableValue);

    Bench& clearContext();

    // Scale factor per iteration (e.g. number of bytes processed); any
    // arithmetic argument is cast to double.
    template <typename T>
    Bench& batch(T b) noexcept;
    ANKERL_NANOBENCH(NODISCARD) double batch() const noexcept;

    // Unit name shown in the output table (default "op").
    Bench& unit(char const* unit);
    Bench& unit(std::string const& unit);
    ANKERL_NANOBENCH(NODISCARD) std::string const& unit() const noexcept;

    // Time unit used for reporting (default: nanoseconds, "ns").
    Bench& timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName);
    ANKERL_NANOBENCH(NODISCARD) std::string const& timeUnitName() const noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::chrono::duration<double> const& timeUnit() const noexcept;

    // Output stream for the markdown result table; nullptr disables printing.
    Bench& output(std::ostream* outstream) noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::ostream* output() const noexcept;

    // Target epoch length, expressed as a multiple of the clock's resolution.
    Bench& clockResolutionMultiple(size_t multiple) noexcept;
    ANKERL_NANOBENCH(NODISCARD) size_t clockResolutionMultiple() const noexcept;

    // Number of measurement epochs (the median over epochs is reported).
    Bench& epochs(size_t numEpochs) noexcept;
    ANKERL_NANOBENCH(NODISCARD) size_t epochs() const noexcept;

    // Upper/lower bounds on the time spent per epoch.
    Bench& maxEpochTime(std::chrono::nanoseconds t) noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds maxEpochTime() const noexcept;

    Bench& minEpochTime(std::chrono::nanoseconds t) noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::chrono::nanoseconds minEpochTime() const noexcept;

    // Lower bound on iterations per epoch.
    Bench& minEpochIterations(uint64_t numIters) noexcept;
    ANKERL_NANOBENCH(NODISCARD) uint64_t minEpochIterations() const noexcept;

    // If nonzero, run *exactly* this many iterations per epoch.
    Bench& epochIterations(uint64_t numIters) noexcept;
    ANKERL_NANOBENCH(NODISCARD) uint64_t epochIterations() const noexcept;

    // Unmeasured iterations executed before measurement starts.
    Bench& warmup(uint64_t numWarmupIters) noexcept;
    ANKERL_NANOBENCH(NODISCARD) uint64_t warmup() const noexcept;

    // When enabled, results are also shown relative to the first result.
    Bench& relative(bool isRelativeEnabled) noexcept;
    ANKERL_NANOBENCH(NODISCARD) bool relative() const noexcept;

    Bench& performanceCounters(bool showPerformanceCounters) noexcept;
    ANKERL_NANOBENCH(NODISCARD) bool performanceCounters() const noexcept;

    // All results collected so far (one per run()).
    ANKERL_NANOBENCH(NODISCARD) std::vector<Result> const& results() const noexcept;

    template <typename Arg>
    // NOTE(review): the declaration following this template header (presumably
    // Bench& doNotOptimizeAway(Arg&& arg); — see the definition further down)
    // was lost in the doc extraction — confirm against upstream.

    // Problem size N for asymptotic-complexity fitting; cast to double.
    template <typename T>
    Bench& complexityN(T n) noexcept;
    ANKERL_NANOBENCH(NODISCARD) double complexityN() const noexcept;

    // Fits all standard complexity curves and returns them, best fit first.
    std::vector<BigO> complexityBigO() const;

    // Fits a custom complexity function op(n).
    template <typename Op>
    BigO complexityBigO(char const* name, Op op) const;

    template <typename Op>
    BigO complexityBigO(std::string const& name, Op op) const;

    // Renders all results through a mustache-like template (see templates::).
    Bench& render(char const* templateContent, std::ostream& os);
    Bench& render(std::string const& templateContent, std::ostream& os);

    Bench& config(Config const& benchmarkConfig);
    ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;

    // Registers a per-epoch setup callback; run() on the returned object
    // forces epochIterations(1) so setup runs before every measured iteration.
    template <typename SetupOp>
    detail::SetupRunner<SetupOp> setup(SetupOp setupOp);

private:
    template <typename SetupOp, typename Op>
    Bench& runImpl(SetupOp& setupOp, Op&& op);

    template <typename SetupOp>
    friend class detail::SetupRunner;

    Config mConfig{};
    std::vector<Result> mResults{};
};
1034ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1035
1036
1042template <typename Arg>
1043void doNotOptimizeAway(Arg&& arg);
1044
1045namespace detail {
1046
1047#if defined(_MSC_VER)
1048void doNotOptimizeAwaySink(void const*);
1049
1050template <typename T>
1051void doNotOptimizeAway(T const& val);
1052
1053#else
1054
1055// These assembly magic is directly from what Google Benchmark is doing. I have previously used what facebook's folly was doing, but
1056// this seemed to have compilation problems in some cases. Google Benchmark seemed to be the most well tested anyways.
1057// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L443-L446
// Forces the compiler to materialize val without emitting any actual code:
// the empty asm claims to read the operand ("r,m" = any register or memory)
// and clobbers memory, so val and prior stores cannot be optimized away.
template <typename T>
void doNotOptimizeAway(T const& val) {
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : : "r,m"(val) : "memory");
}
1063
// Non-const overload: "+r,m" / "+m,r" makes val a read-write operand, so the
// compiler must also assume it may be modified here. The constraint order
// differs per compiler — see the google/benchmark reference in the comment
// above for the rationale.
template <typename T>
void doNotOptimizeAway(T& val) {
#    if defined(__clang__)
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+r,m"(val) : : "memory");
#    else
    // NOLINTNEXTLINE(hicpp-no-assembler)
    asm volatile("" : "+m,r"(val) : : "memory");
#    endif
}
1074#endif
1075
1076// internally used, but visible because run() is templated.
1077// Not movable/copy-able, so we simply use a pointer instead of unique_ptr. This saves us from
1078// having to include <memory>, and the template instantiation overhead of unique_ptr which is unfortunately quite significant.
1079ANKERL_NANOBENCH(IGNORE_EFFCPP_PUSH)
1081public:
1082 explicit IterationLogic(Bench const& bench);
1088
1089 ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept;
1090 void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept;
1091 void moveResultTo(std::vector<Result>& results) noexcept;
1092
1093private:
1094 struct Impl;
1095 Impl* mPimpl;
1096};
1097ANKERL_NANOBENCH(IGNORE_EFFCPP_POP)
1098
1099ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
1101public:
1106
1109
1112 void updateResults(uint64_t numIters);
1113
1114 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& val() const noexcept;
1115 ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& has() const noexcept;
1116
1117private:
1118#if ANKERL_NANOBENCH(PERF_COUNTERS)
1119 LinuxPerformanceCounters* mPc = nullptr;
1120#endif
1123};
1124ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1125
1126// Gets the singleton
1128
1129} // namespace detail
1130
// Least-squares fit of measured (N, time) pairs against a complexity
// function, e.g. O(n log n). Results are comparable: lower normalized RMS
// means a better fit.
class BigO {
public:
    // pairs of (complexityN, measured time)
    using RangeMeasure = std::vector<std::pair<double, double>>;

    template <typename Op>
    // NOTE(review): the signature line of this static helper (presumably
    // `static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op) {`) was
    // lost in the doc extraction — confirm against upstream before editing.
        for (auto& rangeMeasure : data) {
            rangeMeasure.first = op(rangeMeasure.first);
        }
        return data;
    }

    // Extracts (complexityN, elapsed) pairs from finished benchmark results.
    static RangeMeasure collectRangeMeasure(std::vector<Result> const& results);

    // rangeToN maps the raw N to the complexity function's value, e.g. n*log(n).
    template <typename Op>
    BigO(char const* bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
        : BigO(bigOName, mapRangeMeasure(rangeMeasure, rangeToN)) {}

    template <typename Op>
    BigO(std::string bigOName, RangeMeasure const& rangeMeasure, Op rangeToN)
        : BigO(std::move(bigOName), mapRangeMeasure(rangeMeasure, rangeToN)) {}

    BigO(char const* bigOName, RangeMeasure const& scaledRangeMeasure);
    BigO(std::string bigOName, RangeMeasure const& scaledRangeMeasure);
    ANKERL_NANOBENCH(NODISCARD) std::string const& name() const noexcept;
    // fitted constant factor
    ANKERL_NANOBENCH(NODISCARD) double constant() const noexcept;
    // goodness of fit; smaller is better
    ANKERL_NANOBENCH(NODISCARD) double normalizedRootMeanSquare() const noexcept;
    // orders by fit quality, so sorting puts the best fit first
    ANKERL_NANOBENCH(NODISCARD) bool operator<(BigO const& other) const noexcept;

private:
    std::string mName{};
    double mConstant{};
    double mNormalizedRootMeanSquare{};
};
1165std::ostream& operator<<(std::ostream& os, BigO const& bigO);
1166std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs);
1167
1168} // namespace nanobench
1169} // namespace ankerl
1170
1171// implementation /////////////////////////////////////////////////////////////////////////////////
1172
1173namespace ankerl {
1174namespace nanobench {
1175
1176constexpr uint64_t(Rng::min)() {
1177 return 0;
1178}
1179
1180constexpr uint64_t(Rng::max)() {
1181 return (std::numeric_limits<uint64_t>::max)();
1182}
1183
1184ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1185uint64_t Rng::operator()() noexcept {
1186 auto x = mX;
1187
1188 mX = UINT64_C(15241094284759029579) * mY;
1189 mY = rotl(mY - x, 27);
1190
1191 return x;
1192}
1193
1194ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1195uint32_t Rng::bounded(uint32_t range) noexcept {
1196 uint64_t const r32 = static_cast<uint32_t>(operator()());
1197 auto multiresult = r32 * range;
1198 return static_cast<uint32_t>(multiresult >> 32U);
1199}
1200
1201double Rng::uniform01() noexcept {
1202 auto i = (UINT64_C(0x3ff) << 52U) | (operator()() >> 12U);
1203 // can't use union in c++ here for type puning, it's undefined behavior.
1204 // std::memcpy is optimized anyways.
1205 double d{};
1206 std::memcpy(&d, &i, sizeof(double));
1207 return d - 1.0;
1208}
1209
// Fisher-Yates-style shuffle that performs two swap steps per random draw:
// the low 32 bits of n pick the first index, the high 32 bits the second,
// each via the same multiply-shift bounding used in bounded(). The loop
// condition i > 1 guarantees both decrements stay in range.
template <typename Container>
void Rng::shuffle(Container& container) noexcept {
    auto i = container.size();
    while (i > 1U) {
        using std::swap; // enable ADL so container-specific swaps are found
        auto n = operator()();
        // using decltype(i) instead of size_t to be compatible to containers with 32bit index (see #80)
        auto b1 = static_cast<decltype(i)>((static_cast<uint32_t>(n) * static_cast<uint64_t>(i)) >> 32U);
        swap(container[--i], container[b1]);

        // second step reuses the draw's upper half; i was already decremented once
        auto b2 = static_cast<decltype(i)>(((n >> 32U) * static_cast<uint64_t>(i)) >> 32U);
        swap(container[--i], container[b2]);
    }
}
1224
1225ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
1226constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
1227 return (x << k) | (x >> (64U - k));
1228}
1229
1230namespace detail {
1231
1232template <typename SetupOp>
1234public:
1235 explicit SetupRunner(SetupOp setupOp, Bench& bench)
1236 : mSetupOp(std::move(setupOp))
1237 , mBench(bench) {}
1238
1239 template <typename Op>
1241 Bench& run(Op&& op) {
1242 assert((mBench.epochIterations() <= 1) &&
1243 "setup() runs once per epoch, not once per iteration; it requires epochIterations(1)");
1244 mBench.epochIterations(1);
1245 return mBench.runImpl(mSetupOp, std::forward<Op>(op));
1246 }
1247
1248private:
1249 SetupOp mSetupOp;
1251};
1252} // namespace detail
1253
// Runs the benchmark with no per-epoch setup work (setup hooks are only
// installed through Bench::setup() / SetupRunner).
template <typename Op>
Bench& Bench::run(Op&& op) {
    // a no-op setup keeps runImpl uniform for both code paths
    auto setupOp = [] {};
    return runImpl(setupOp, std::forward<Op>(op));
}
1260
// Core measurement loop: one pass of the outer while is one epoch.
// IterationLogic decides how many iterations the next epoch gets; a count of
// 0 ends the measurement.
template <typename SetupOp, typename Op>
Bench& Bench::runImpl(SetupOp& setupOp, Op&& op) {
    // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()
    detail::IterationLogic iterationLogic(*this);
    auto& pc = detail::performanceCounters();

    while (auto n = iterationLogic.numIters()) {
        setupOp(); // per-epoch setup runs outside the measured region

        // counters bracket the clock reads so only the op() loop is measured
        pc.beginMeasure();
        Clock::time_point const before = Clock::now();
        while (n-- > 0) {
            op();
        }
        Clock::time_point const after = Clock::now();
        pc.endMeasure();
        // numIters() still reports this epoch's count here; add() below is
        // what advances the iteration logic to the next epoch.
        pc.updateResults(iterationLogic.numIters());
        iterationLogic.add(after - before, pc);
    }
    iterationLogic.moveResultTo(mResults);
    return *this;
}
1284
1285template <typename SetupOp>
1287 return detail::SetupRunner<SetupOp>(std::move(setupOp), *this);
1288}
1289
1290// Performs all evaluations.
1291template <typename Op>
1292Bench& Bench::run(char const* benchmarkName, Op&& op) {
1293 name(benchmarkName);
1294 return run(std::forward<Op>(op));
1295}
1296
1297template <typename Op>
1298Bench& Bench::run(std::string const& benchmarkName, Op&& op) {
1299 name(benchmarkName);
1300 return run(std::forward<Op>(op));
1301}
1302
1303template <typename Op>
1304BigO Bench::complexityBigO(char const* benchmarkName, Op op) const {
1305 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1306}
1307
1308template <typename Op>
1309BigO Bench::complexityBigO(std::string const& benchmarkName, Op op) const {
1310 return BigO(benchmarkName, BigO::collectRangeMeasure(mResults), op);
1311}
1312
1313// Set the batch size, e.g. number of processed bytes, or some other metric for the size of the processed data in each iteration.
1314// Any argument is cast to double.
1315template <typename T>
1316Bench& Bench::batch(T b) noexcept {
1317 mConfig.mBatch = static_cast<double>(b);
1318 return *this;
1319}
1320
1321// Sets the computation complexity of the next run. Any argument is cast to double.
1322template <typename T>
1324 mConfig.mComplexityN = static_cast<double>(n);
1325 return *this;
1326}
1327
1328// Convenience: makes sure none of the given arguments are optimized away by the compiler.
1329template <typename Arg>
1331 detail::doNotOptimizeAway(std::forward<Arg>(arg));
1332 return *this;
1333}
1334
1335// Makes sure none of the given arguments are optimized away by the compiler.
template <typename Arg>
void doNotOptimizeAway(Arg&& arg) {
    // thin forwarder: perfect-forwards to the compiler-specific detail overload
    detail::doNotOptimizeAway(std::forward<Arg>(arg));
}
1340
1341namespace detail {
1342
1343#if defined(_MSC_VER)
// MSVC path: no GCC-style inline asm available, so pass the address to an
// opaque sink function (defined in the implementation part); the optimizer
// cannot prove the value unused and must keep it alive.
template <typename T>
void doNotOptimizeAway(T const& val) {
    doNotOptimizeAwaySink(&val);
}
1348
1349#endif
1350
1351} // namespace detail
1352} // namespace nanobench
1353} // namespace ankerl
1354
1355#if defined(ANKERL_NANOBENCH_IMPLEMENT)
1356
1358// implementation part - only visible in .cpp
1360
1361# include <algorithm> // sort, reverse
1362# include <atomic> // compare_exchange_strong in loop overhead
1363# include <cstdlib> // getenv
1364# include <cstring> // strstr, strncmp
1365# include <fstream> // ifstream to parse proc files
1366# include <iomanip> // setw, setprecision
1367# include <iostream> // cout
1368# include <numeric> // accumulate
1369# include <random> // random_device
1370# include <sstream> // to_s in Number
1371# include <stdexcept> // throw for rendering templates
1372# include <tuple> // std::tie
1373# if defined(__linux__)
1374# include <unistd.h> //sysconf
1375# endif
1376# if ANKERL_NANOBENCH(PERF_COUNTERS)
1377# include <map> // map
1378
1379# include <linux/perf_event.h>
1380# include <sys/ioctl.h>
1381# include <sys/syscall.h>
1382# endif
1383
1384// declarations ///////////////////////////////////////////////////////////////////////////////////
1385
1386namespace ankerl {
1387namespace nanobench {
1388
1389// helper stuff that is only intended to be used internally
1390namespace detail {
1391
1392struct TableInfo;
1393
1394// formatting utilities
1395namespace fmt {
1396
1397class NumSep;
1398class StreamStateRestorer;
1399class Number;
1400class MarkDownColumn;
1401class MarkDownCode;
1402
1403} // namespace fmt
1404} // namespace detail
1405} // namespace nanobench
1406} // namespace ankerl
1407
1408// definitions ////////////////////////////////////////////////////////////////////////////////////
1409
1410namespace ankerl {
1411namespace nanobench {
1412
1413uint64_t splitMix64(uint64_t& state) noexcept;
1414
1415namespace detail {
1416
1417// helpers to get double values
// Tersely-named conversion helper: turns any arithmetic value into a double,
// so the statistics code can treat all measures uniformly.
template <typename T>
inline double d(T value) noexcept {
    return static_cast<double>(value);
}
1422inline double d(Clock::duration duration) noexcept {
1423 return std::chrono::duration_cast<std::chrono::duration<double>>(duration).count();
1424}
1425
1426// Calculates clock resolution once, and remembers the result
1427inline Clock::duration clockResolution() noexcept;
1428
1429} // namespace detail
1430
1431namespace templates {
1432
// Semicolon-separated CSV template: a fixed header line, then one row per
// benchmark result. Field order in the rows must stay in sync with the header.
char const* csv() noexcept {
    return R"DELIM("title";"name";"unit";"batch";"elapsed";"error %";"instructions";"branches";"branch misses";"total"
{{#result}}"{{title}}";"{{name}}";"{{unit}}";{{batch}};{{median(elapsed)}};{{medianAbsolutePercentError(elapsed)}};{{median(instructions)}};{{median(branchinstructions)}};{{median(branchmisses)}};{{sumProduct(iterations, elapsed)}}
{{/result}})DELIM";
}
1438
// Self-contained HTML page template rendering one plotly.js box plot per
// benchmark result (one box per result, one point per measurement). Note the
// template parser treats any {{/...}} as a section close, so {{/last}}
// terminates the {{^-last}} inverted section.
char const* htmlBoxplot() noexcept {
    return R"DELIM(<html>

<head>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>

<body>
    <div id="myDiv"></div>
    <script>
        var data = [
            {{#result}}{
                name: '{{name}}',
                y: [{{#measurement}}{{elapsed}}{{^-last}}, {{/last}}{{/measurement}}],
            },
            {{/result}}
        ];
        var title = '{{title}}';

        data = data.map(a => Object.assign(a, { boxpoints: 'all', pointpos: 0, type: 'box' }));
        var layout = { title: { text: title }, showlegend: false, yaxis: { title: 'time per unit', rangemode: 'tozero', autorange: true } }; Plotly.newPlot('myDiv', data, layout, {responsive: true});
    </script>
</body>

</html>)DELIM";
}
1465
// JSON template in the pyperf format, so results can be post-processed with
// the pyperf tool suite (e.g. `python -m pyperf stats`). One "values" entry
// per measurement; elapsed times are in seconds, matching "unit": "second".
char const* pyperf() noexcept {
    return R"DELIM({
    "benchmarks": [
        {
            "runs": [
                {
                    "values": [
{{#measurement}}                        {{elapsed}}{{^-last}},
{{/last}}{{/measurement}}
                    ]
                }
            ]
        }
    ],
    "metadata": {
        "loops": {{sum(iterations)}},
        "inner_loops": {{batch}},
        "name": "{{title}}",
        "unit": "second"
    },
    "version": "1.0"
})DELIM";
}
1489
// Full JSON dump template: emits every configuration value, the median-based
// summary statistics, and the raw per-epoch measurements for each result.
char const* json() noexcept {
    return R"DELIM({
 "results": [
{{#result}}  {
   "title": "{{title}}",
   "name": "{{name}}",
   "unit": "{{unit}}",
   "batch": {{batch}},
   "complexityN": {{complexityN}},
   "epochs": {{epochs}},
   "clockResolution": {{clockResolution}},
   "clockResolutionMultiple": {{clockResolutionMultiple}},
   "maxEpochTime": {{maxEpochTime}},
   "minEpochTime": {{minEpochTime}},
   "minEpochIterations": {{minEpochIterations}},
   "epochIterations": {{epochIterations}},
   "warmup": {{warmup}},
   "relative": {{relative}},
   "median(elapsed)": {{median(elapsed)}},
   "medianAbsolutePercentError(elapsed)": {{medianAbsolutePercentError(elapsed)}},
   "median(instructions)": {{median(instructions)}},
   "medianAbsolutePercentError(instructions)": {{medianAbsolutePercentError(instructions)}},
   "median(cpucycles)": {{median(cpucycles)}},
   "median(contextswitches)": {{median(contextswitches)}},
   "median(pagefaults)": {{median(pagefaults)}},
   "median(branchinstructions)": {{median(branchinstructions)}},
   "median(branchmisses)": {{median(branchmisses)}},
   "totalTime": {{sumProduct(iterations, elapsed)}},
   "measurements": [
{{#measurement}}    {
     "iterations": {{iterations}},
     "elapsed": {{elapsed}},
     "pagefaults": {{pagefaults}},
     "cpucycles": {{cpucycles}},
     "contextswitches": {{contextswitches}},
     "instructions": {{instructions}},
     "branchinstructions": {{branchinstructions}},
     "branchmisses": {{branchmisses}}
    }{{^-last}},{{/-last}}
{{/measurement}}   ]
  }{{^-last}},{{/-last}}
{{/result}} ]
})DELIM";
}
1534
1535ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// One node of a parsed mustache-like template.
struct Node {
    enum class Type { tag, content, section, inverted_section };

    // [begin, end) is a non-owning view into the template string.
    char const* begin;
    char const* end;
    std::vector<Node> children; // nested nodes of a (inverted_)section
    Type type;

    // Compares the node's text against a string literal. N includes the
    // terminating '\0', hence the comparisons against N - 1.
    template <size_t N>
    // NOLINTNEXTLINE(hicpp-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
    bool operator==(char const (&str)[N]) const noexcept {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
        return static_cast<size_t>(std::distance(begin, end) + 1) == N && 0 == strncmp(str, begin, N - 1);
    }
};
1551ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1552
1553// NOLINTNEXTLINE(misc-no-recursion)
1554static std::vector<Node> parseMustacheTemplate(char const** tpl) {
1555 std::vector<Node> nodes;
1556
1557 while (true) {
1558 auto const* begin = std::strstr(*tpl, "{{");
1559 auto const* end = begin;
1560 if (begin != nullptr) {
1561 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1562 begin += 2;
1563 end = std::strstr(begin, "}}");
1564 }
1565
1566 if (begin == nullptr || end == nullptr) {
1567 // nothing found, finish node
1568 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1569 nodes.emplace_back(Node{*tpl, *tpl + std::strlen(*tpl), std::vector<Node>{}, Node::Type::content});
1570 return nodes;
1571 }
1572
1573 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1574 nodes.emplace_back(Node{*tpl, begin - 2, std::vector<Node>{}, Node::Type::content});
1575
1576 // we found a tag
1577 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1578 *tpl = end + 2;
1579 switch (*begin) {
1580 case '/':
1581 // finished! bail out
1582 return nodes;
1583
1584 case '#':
1585 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1586 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::section});
1587 break;
1588
1589 case '^':
1590 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1591 nodes.emplace_back(Node{begin + 1, end, parseMustacheTemplate(tpl), Node::Type::inverted_section});
1592 break;
1593
1594 default:
1595 nodes.emplace_back(Node{begin, end, std::vector<Node>{}, Node::Type::tag});
1596 break;
1597 }
1598 }
1599}
1600
1601static bool generateFirstLast(Node const& n, size_t idx, size_t size, std::ostream& out) {
1602 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1603 bool const matchFirst = n == "-first";
1604 bool const matchLast = n == "-last";
1605 if (!matchFirst && !matchLast) {
1606 return false;
1607 }
1608
1609 bool doWrite = false;
1610 if (n.type == Node::Type::section) {
1611 doWrite = (matchFirst && idx == 0) || (matchLast && idx == size - 1);
1612 } else if (n.type == Node::Type::inverted_section) {
1613 doWrite = (matchFirst && idx != 0) || (matchLast && idx != size - 1);
1614 }
1615
1616 if (doWrite) {
1617 for (auto const& child : n.children) {
1618 if (child.type == Node::Type::content) {
1619 out.write(child.begin, std::distance(child.begin, child.end));
1620 }
1621 }
1622 }
1623 return true;
1624}
1625
// Poor-man's replacement for std::regex (g++ 4.8 lacks it): splits a string
// of the form "cmd(arg1, arg2, ...)" into matchResult[0]="cmd" followed by
// one entry per comma-separated argument (whitespace stripped). Returns
// false when there is no closing ')' after the '('.
static bool matchCmdArgs(std::string const& str, std::vector<std::string>& matchResult) {
    matchResult.clear();
    auto const posOpen = str.find('(');
    auto const posClose = str.find(')', posOpen);
    if (posClose == std::string::npos) {
        return false;
    }

    // everything before the '(' is the command name
    matchResult.emplace_back(str.substr(0, posOpen));

    // arguments: comma separated, spaces and tabs dropped
    matchResult.emplace_back();
    for (auto i = posOpen + 1; i != posClose; ++i) {
        auto const c = str[i];
        if (c == ',') {
            // comma starts the next argument
            matchResult.emplace_back();
        } else if (c != ' ' && c != '\t') {
            matchResult.back() += c;
        }
    }
    return true;
}
1653
1654static bool generateConfigTag(Node const& n, Config const& config, std::ostream& out) {
1655 using detail::d;
1656
1657 if (n == "title") {
1658 out << config.mBenchmarkTitle;
1659 return true;
1660 }
1661 if (n == "name") {
1662 out << config.mBenchmarkName;
1663 return true;
1664 }
1665 if (n == "unit") {
1666 out << config.mUnit;
1667 return true;
1668 }
1669 if (n == "batch") {
1670 out << config.mBatch;
1671 return true;
1672 }
1673 if (n == "complexityN") {
1674 out << config.mComplexityN;
1675 return true;
1676 }
1677 if (n == "epochs") {
1678 out << config.mNumEpochs;
1679 return true;
1680 }
1681 if (n == "clockResolution") {
1682 out << d(detail::clockResolution());
1683 return true;
1684 }
1685 if (n == "clockResolutionMultiple") {
1686 out << config.mClockResolutionMultiple;
1687 return true;
1688 }
1689 if (n == "maxEpochTime") {
1690 out << d(config.mMaxEpochTime);
1691 return true;
1692 }
1693 if (n == "minEpochTime") {
1694 out << d(config.mMinEpochTime);
1695 return true;
1696 }
1697 if (n == "minEpochIterations") {
1698 out << config.mMinEpochIterations;
1699 return true;
1700 }
1701 if (n == "epochIterations") {
1702 out << config.mEpochIterations;
1703 return true;
1704 }
1705 if (n == "warmup") {
1706 out << config.mWarmup;
1707 return true;
1708 }
1709 if (n == "relative") {
1710 out << config.mIsRelative;
1711 return true;
1712 }
1713 return false;
1714}
1715
1716// NOLINTNEXTLINE(readability-function-cognitive-complexity)
1717static std::ostream& generateResultTag(Node const& n, Result const& r, std::ostream& out) {
1718 if (generateConfigTag(n, r.config(), out)) {
1719 return out;
1720 }
1721 // match e.g. "median(elapsed)"
1722 // g++ 4.8 doesn't implement std::regex :(
1723 // static std::regex const regOpArg1("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\‍)$");
1724 // std::cmatch matchResult;
1725 // if (std::regex_match(n.begin, n.end, matchResult, regOpArg1)) {
1726 std::vector<std::string> matchResult;
1727 if (matchCmdArgs(std::string(n.begin, n.end), matchResult)) {
1728 if (matchResult.size() == 2) {
1729 if (matchResult[0] == "context") {
1730 return out << r.context(matchResult[1]);
1731 }
1732
1733 auto m = Result::fromString(matchResult[1]);
1734 if (m == Result::Measure::_size) {
1735 return out << 0.0;
1736 }
1737
1738 if (matchResult[0] == "median") {
1739 return out << r.median(m);
1740 }
1741 if (matchResult[0] == "average") {
1742 return out << r.average(m);
1743 }
1744 if (matchResult[0] == "medianAbsolutePercentError") {
1745 return out << r.medianAbsolutePercentError(m);
1746 }
1747 if (matchResult[0] == "sum") {
1748 return out << r.sum(m);
1749 }
1750 if (matchResult[0] == "minimum") {
1751 return out << r.minimum(m);
1752 }
1753 if (matchResult[0] == "maximum") {
1754 return out << r.maximum(m);
1755 }
1756 } else if (matchResult.size() == 3) {
1757 auto m1 = Result::fromString(matchResult[1]);
1758 auto m2 = Result::fromString(matchResult[2]);
1760 return out << 0.0;
1761 }
1762
1763 if (matchResult[0] == "sumProduct") {
1764 return out << r.sumProduct(m1, m2);
1765 }
1766 }
1767 }
1768
1769 // match e.g. "sumProduct(elapsed, iterations)"
1770 // static std::regex const regOpArg2("^([a-zA-Z]+)\\‍(([a-zA-Z]*)\\s*,\\s+([a-zA-Z]*)\\‍)$");
1771
1772 // nothing matches :(
1773 throw std::runtime_error("command '" + std::string(n.begin, n.end) + "' not understood");
1774}
1775
1776static void generateResultMeasurement(std::vector<Node> const& nodes, size_t idx, Result const& r, std::ostream& out) {
1777 for (auto const& n : nodes) {
1778 if (!generateFirstLast(n, idx, r.size(), out)) {
1779 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1780 switch (n.type) {
1781 case Node::Type::content:
1782 out.write(n.begin, std::distance(n.begin, n.end));
1783 break;
1784
1785 case Node::Type::inverted_section:
1786 throw std::runtime_error("got a inverted section inside measurement");
1787
1788 case Node::Type::section:
1789 throw std::runtime_error("got a section inside measurement");
1790
1791 case Node::Type::tag: {
1792 auto m = Result::fromString(std::string(n.begin, n.end));
1793 if (m == Result::Measure::_size || !r.has(m)) {
1794 out << 0.0;
1795 } else {
1796 out << r.get(idx, m);
1797 }
1798 break;
1799 }
1800 }
1801 }
1802 }
1803}
1804
1805static void generateResult(std::vector<Node> const& nodes, size_t idx, std::vector<Result> const& results, std::ostream& out) {
1806 auto const& r = results[idx];
1807 for (auto const& n : nodes) {
1808 if (!generateFirstLast(n, idx, results.size(), out)) {
1809 ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
1810 switch (n.type) {
1811 case Node::Type::content:
1812 out.write(n.begin, std::distance(n.begin, n.end));
1813 break;
1814
1815 case Node::Type::inverted_section:
1816 throw std::runtime_error("got a inverted section inside result");
1817
1818 case Node::Type::section:
1819 if (n == "measurement") {
1820 for (size_t i = 0; i < r.size(); ++i) {
1821 generateResultMeasurement(n.children, i, r, out);
1822 }
1823 } else {
1824 throw std::runtime_error("got a section inside result");
1825 }
1826 break;
1827
1828 case Node::Type::tag:
1829 generateResultTag(n, r, out);
1830 break;
1831 }
1832 }
1833 }
1834}
1835
1836} // namespace templates
1837
1838// helper stuff that only intended to be used internally
1839namespace detail {
1840
// wraps std::getenv; nullptr when the variable is not set
char const* getEnv(char const* name);
// true when env var NANOBENCH_ENDLESS names exactly this benchmark
bool isEndlessRunning(std::string const& name);
// true when NANOBENCH_SUPPRESS_WARNINGS is unset or set to "0"
bool isWarningsEnabled();

// reads a single value of type T from the given file; *fail (if non-null) reports stream failure
template <typename T>
T parseFile(std::string const& filename, bool* fail);

// fills warnings/recommendations about the host's benchmarking stability (frequency scaling, governor, turbo, DEBUG build)
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations);
// prints the stability warnings once per process, if outStream is set and warnings are enabled
void printStabilityInformationOnce(std::ostream* outStream);

// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
uint64_t& singletonHeaderHash() noexcept;

// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
Clock::duration calcClockResolution(size_t numEvaluations) noexcept;
1856
1857// formatting utilities
1858namespace fmt {
1859
// adds thousands separator to numbers
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// std::numpunct facet imbued into the output stream so numbers are printed
// with a thousands separator; methods are defined elsewhere in this file.
class NumSep : public std::numpunct<char> {
public:
    explicit NumSep(char sep);
    char do_thousands_sep() const override;
    std::string do_grouping() const override;

private:
    char mSep; // the separator character to insert between digit groups
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1872
// RAII to save & restore a stream's state
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Captures precision, width, fill, flags and locale of a stream at
// construction and restores them on destruction (or on restore()).
class StreamStateRestorer {
public:
    explicit StreamStateRestorer(std::ostream& s);
    ~StreamStateRestorer();

    // sets back all stream info that we remembered at construction
    void restore();

    // don't allow copying / moving
    StreamStateRestorer(StreamStateRestorer const&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer const&) = delete;
    StreamStateRestorer(StreamStateRestorer&&) = delete;
    StreamStateRestorer& operator=(StreamStateRestorer&&) = delete;

private:
    std::ostream& mStream;                    // the stream whose state is restored
    std::locale mLocale;                      // saved locale
    std::streamsize const mPrecision;         // saved precision
    std::streamsize const mWidth;             // saved width
    std::ostream::char_type const mFill;      // saved fill character
    std::ostream::fmtflags const mFmtFlags;   // saved format flags
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
1898
// Number formatter: renders a value right-aligned to a given width and
// precision (implementation elsewhere in this file).
class Number {
public:
    Number(int width, int precision, double value);
    Number(int width, int precision, int64_t value);
    ANKERL_NANOBENCH(NODISCARD) std::string to_s() const;

private:
    friend std::ostream& operator<<(std::ostream& os, Number const& n);
    std::ostream& write(std::ostream& os) const;

    int mWidth;      // minimum field width
    int mPrecision;  // number of digits after the decimal point
    double mValue;   // the value to format (int64_t ctor presumably converts - see its definition)
};
1914
// helper replacement for std::to_string of signed/unsigned numbers so we are locale independent
std::string to_s(uint64_t n);

// stream output for Number, forwards to Number::write
std::ostream& operator<<(std::ostream& os, Number const& n);
1919
// One column of the markdown result table. title()/separator()/invalid()/
// value() each render this column's cell for the respective table row.
class MarkDownColumn {
public:
    MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept;
    ANKERL_NANOBENCH(NODISCARD) std::string title() const;
    ANKERL_NANOBENCH(NODISCARD) std::string separator() const;
    ANKERL_NANOBENCH(NODISCARD) std::string invalid() const;
    ANKERL_NANOBENCH(NODISCARD) std::string value() const;

private:
    int mWidth;          // column width in characters
    int mPrecision;      // digits after the decimal point
    std::string mTitle;  // column heading
    std::string mSuffix; // appended to the value, e.g. "%"
    double mValue;       // the cell's numeric value
};
1935
// Formats any text as markdown code, escaping backticks.
class MarkDownCode {
public:
    explicit MarkDownCode(std::string const& what);

private:
    friend std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
    std::ostream& write(std::ostream& os) const;

    std::string mWhat{}; // the pre-escaped text, rendered inside backticks by write()
};

// stream output for MarkDownCode, forwards to MarkDownCode::write
std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode);
1949
1950} // namespace fmt
1951} // namespace detail
1952} // namespace nanobench
1953} // namespace ankerl
1954
1955// implementation /////////////////////////////////////////////////////////////////////////////////
1956
1957namespace ankerl {
1958namespace nanobench {
1959
// Renders the given mustache template with the benchmark results into out.
// Top level understands plain content, a {{#result}} section (expanded once
// per result), a {{#measurement}} section (only valid with exactly one
// result), and tags (resolved against the single result, or against the last
// result's config when there are several).
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
void render(char const* mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
    // restore the stream's formatting state when done
    detail::fmt::StreamStateRestorer const restorer(out);

    out.precision(std::numeric_limits<double>::digits10);
    auto nodes = templates::parseMustacheTemplate(&mustacheTemplate);

    for (auto const& n : nodes) {
        ANKERL_NANOBENCH_LOG("n.type=" << static_cast<int>(n.type));
        switch (n.type) {
        case templates::Node::Type::content:
            out.write(n.begin, std::distance(n.begin, n.end));
            break;

        case templates::Node::Type::inverted_section:
            // inverted sections have no meaning at the top level
            throw std::runtime_error("unknown list '" + std::string(n.begin, n.end) + "'");

        case templates::Node::Type::section:
            if (n == "result") {
                const size_t nbResults = results.size();
                for (size_t i = 0; i < nbResults; ++i) {
                    generateResult(n.children, i, results, out);
                }
            } else if (n == "measurement") {
                if (results.size() != 1) {
                    throw std::runtime_error(
                        "render: can only use section 'measurement' here if there is a single result, but there are " +
                        detail::fmt::to_s(results.size()));
                }
                // when we only have a single result, we can immediately go into its measurement.
                auto const& r = results.front();
                for (size_t i = 0; i < r.size(); ++i) {
                    generateResultMeasurement(n.children, i, r, out);
                }
            } else {
                throw std::runtime_error("render: unknown section '" + std::string(n.begin, n.end) + "'");
            }
            break;

        case templates::Node::Type::tag:
            if (results.size() == 1) {
                // result & config are both supported there
                generateResultTag(n, results.front(), out);
            } else {
                // This just uses the last result's config.
                if (!generateConfigTag(n, results.back().config(), out)) {
                    throw std::runtime_error("unknown tag '" + std::string(n.begin, n.end) + "'");
                }
            }
            break;
        }
    }
}
2013
// convenience overload: std::string template
void render(std::string const& mustacheTemplate, std::vector<Result> const& results, std::ostream& out) {
    render(mustacheTemplate.c_str(), results, out);
}
2017
// convenience overload: renders all results collected by a Bench
void render(char const* mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate, bench.results(), out);
}
2021
// convenience overload: std::string template + Bench results
void render(std::string const& mustacheTemplate, const Bench& bench, std::ostream& out) {
    render(mustacheTemplate.c_str(), bench.results(), out);
}
2025
2026namespace detail {
2027
// Process-wide PerformanceCounters singleton (function-local static).
// The clang pragmas silence -Wexit-time-destructors for the static.
PerformanceCounters& performanceCounters() {
#    if defined(__clang__)
#        pragma clang diagnostic push
#        pragma clang diagnostic ignored "-Wexit-time-destructors"
#    endif
    static PerformanceCounters pc;
#    if defined(__clang__)
#        pragma clang diagnostic pop
#    endif
    return pc;
}
2039
// Windows version of doNotOptimizeAway
// see https://github.com/google/benchmark/blob/v1.7.1/include/benchmark/benchmark.h#L514
// see https://github.com/facebook/folly/blob/v2023.01.30.00/folly/lang/Hint-inl.h#L54-L58
// see https://learn.microsoft.com/en-us/cpp/preprocessor/optimize
// MSVC has no inline-asm escape hatch, so the sink is compiled with
// optimizations disabled; calling it forces the argument to be materialized.
#    if defined(_MSC_VER)
#        pragma optimize("", off)
void doNotOptimizeAwaySink(void const*) {}
#        pragma optimize("", on)
#    endif
2049
// Reads a single whitespace-delimited value of type T from the given file.
// On any failure the returned value stays default-constructed ("T{}"), and
// *fail (when non-null) reports whether the stream failed.
template <typename T>
T parseFile(std::string const& filename, bool* fail) {
    T value{};
    std::ifstream in(filename); // NOLINT(misc-const-correctness)
    in >> value;
    if (fail != nullptr) {
        *fail = in.fail();
    }
    return value;
}
2060
// Thin wrapper around std::getenv; returns nullptr when the variable is not
// set. The pragmas suppress MSVC's C4996 "getenv may be unsafe" warning.
char const* getEnv(char const* name) {
#    if defined(_MSC_VER)
#        pragma warning(push)
#        pragma warning(disable : 4996) // getenv': This function or variable may be unsafe.
#    endif
    return std::getenv(name); // NOLINT(concurrency-mt-unsafe)
#    if defined(_MSC_VER)
#        pragma warning(pop)
#    endif
}
2071
2072bool isEndlessRunning(std::string const& name) {
2073 auto const* const endless = getEnv("NANOBENCH_ENDLESS");
2074 return nullptr != endless && endless == name;
2075}
2076
2077// True when environment variable NANOBENCH_SUPPRESS_WARNINGS is either not set at all, or set to "0"
2078bool isWarningsEnabled() {
2079 auto const* const suppression = getEnv("NANOBENCH_SUPPRESS_WARNINGS");
2080 return nullptr == suppression || suppression == std::string("0");
2081}
2082
// Fills warnings/recommendations about the host's suitability for stable
// benchmarking: DEBUG builds, and on Linux, CPU frequency scaling, the
// cpufreq governor, and intel_pstate turbo (read from /sys).
void gatherStabilityInformation(std::vector<std::string>& warnings, std::vector<std::string>& recommendations) {
    warnings.clear();
    recommendations.clear();

#    if defined(DEBUG)
    warnings.emplace_back("DEBUG defined");
    bool const recommendCheckFlags = true;
#    else
    bool const recommendCheckFlags = false;
#    endif

    bool recommendPyPerf = false;
#    if defined(__linux__)
    auto nprocs = sysconf(_SC_NPROCESSORS_CONF);
    if (nprocs <= 0) {
        warnings.emplace_back("couldn't figure out number of processors - no governor, turbo check possible");
    } else {
        // check frequency scaling
        for (long id = 0; id < nprocs; ++id) {
            auto idStr = detail::fmt::to_s(static_cast<uint64_t>(id));
            auto sysCpu = "/sys/devices/system/cpu/cpu" + idStr;
            auto minFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_min_freq", nullptr);
            auto maxFreq = parseFile<int64_t>(sysCpu + "/cpufreq/scaling_max_freq", nullptr);
            if (minFreq != maxFreq) {
                // sysfs reports kHz; divide by 1000 for MHz
                auto minMHz = d(minFreq) / 1000.0;
                auto maxMHz = d(maxFreq) / 1000.0;
                warnings.emplace_back("CPU frequency scaling enabled: CPU " + idStr + " between " +
                                      detail::fmt::Number(1, 1, minMHz).to_s() + " and " + detail::fmt::Number(1, 1, maxMHz).to_s() +
                                      " MHz");
                recommendPyPerf = true;
                // one warning about scaling is enough
                break;
            }
        }

        auto fail = false;
        auto currentGovernor = parseFile<std::string>("/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor", &fail);
        if (!fail && "performance" != currentGovernor) {
            warnings.emplace_back("CPU governor is '" + currentGovernor + "' but should be 'performance'");
            recommendPyPerf = true;
        }

        // intel_pstate: no_turbo == 0 means turbo boost is active
        auto noTurbo = parseFile<int>("/sys/devices/system/cpu/intel_pstate/no_turbo", &fail);
        if (!fail && noTurbo == 0) {
            warnings.emplace_back("Turbo is enabled, CPU frequency will fluctuate");
            recommendPyPerf = true;
        }
    }
#    endif

    if (recommendCheckFlags) {
        recommendations.emplace_back("Make sure you compile for Release");
    }
    if (recommendPyPerf) {
        recommendations.emplace_back("Use 'pyperf system tune' before benchmarking. See https://github.com/psf/pyperf");
    }
}
2139
2140void printStabilityInformationOnce(std::ostream* outStream) {
2141 static bool shouldPrint = true;
2142 if (shouldPrint && (nullptr != outStream) && isWarningsEnabled()) {
2143 auto& os = *outStream;
2144 shouldPrint = false;
2145 std::vector<std::string> warnings;
2146 std::vector<std::string> recommendations;
2147 gatherStabilityInformation(warnings, recommendations);
2148 if (warnings.empty()) {
2149 return;
2150 }
2151
2152 os << "Warning, results might be unstable:" << std::endl;
2153 for (auto const& w : warnings) {
2154 os << "* " << w << std::endl;
2155 }
2156
2157 os << std::endl << "Recommendations" << std::endl;
2158 for (auto const& r : recommendations) {
2159 os << "* " << r << std::endl;
2160 }
2161 }
2162}
2163
// remembers the last table settings used. When it changes, a new table header is automatically written for the new entry.
uint64_t& singletonHeaderHash() noexcept {
    // function-local static: one hash shared across all benchmarks in the process
    static uint64_t sHeaderHash{};
    return sHeaderHash;
}
2169
2170ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
2171inline uint64_t hash_combine(uint64_t seed, uint64_t val) {
2172 return seed ^ (val + UINT64_C(0x9e3779b9) + (seed << 6U) + (seed >> 2U));
2173}
2174
2175// determines resolution of the given clock. This is done by measuring multiple times and returning the minimum time difference.
2176Clock::duration calcClockResolution(size_t numEvaluations) noexcept {
2177 auto bestDuration = Clock::duration::max();
2178 Clock::time_point tBegin;
2179 Clock::time_point tEnd;
2180 for (size_t i = 0; i < numEvaluations; ++i) {
2181 tBegin = Clock::now();
2182 do {
2183 tEnd = Clock::now();
2184 } while (tBegin == tEnd);
2185 bestDuration = (std::min)(bestDuration, tEnd - tBegin);
2186 }
2187 return bestDuration;
2188}
2189
// Calculates clock resolution once, and remembers the result
Clock::duration clockResolution() noexcept {
    // 20 evaluations, computed lazily on first call and cached
    static Clock::duration const sResolution = calcClockResolution(20);
    return sResolution;
}
2195
ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Pimpl of IterationLogic: a state machine that decides how many iterations
// the next epoch should run, accumulates measurements into mResult, and
// renders the markdown result table once all epochs are collected.
struct IterationLogic::Impl {
    // warmup: running the configured warmup iterations (not recorded)
    // upscaling_runtime: growing mNumIters until an epoch is long enough
    // measuring: epochs are recorded into mResult
    // endless: NANOBENCH_ENDLESS mode, runs forever
    enum class State { warmup, upscaling_runtime, measuring, endless };

    explicit Impl(Bench const& bench)
        : mBench(bench)
        , mResult(bench.config()) {
        printStabilityInformationOnce(mBench.output());

        // determine target runtime per epoch: a multiple of the clock
        // resolution, clamped to [minEpochTime, maxEpochTime]
        mTargetRuntimePerEpoch = detail::clockResolution() * mBench.clockResolutionMultiple();
        if (mTargetRuntimePerEpoch > mBench.maxEpochTime()) {
            mTargetRuntimePerEpoch = mBench.maxEpochTime();
        }
        if (mTargetRuntimePerEpoch < mBench.minEpochTime()) {
            mTargetRuntimePerEpoch = mBench.minEpochTime();
        }

        // pick the initial state & iteration count from the configuration
        if (isEndlessRunning(mBench.name())) {
            std::cerr << "NANOBENCH_ENDLESS set: running '" << mBench.name() << "' endlessly" << std::endl;
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            mState = State::endless;
        } else if (0 != mBench.warmup()) {
            mNumIters = mBench.warmup();
            mState = State::warmup;
        } else if (0 != mBench.epochIterations()) {
            // exact number of iterations
            mNumIters = mBench.epochIterations();
            mState = State::measuring;
        } else {
            mNumIters = mBench.minEpochIterations();
            mState = State::upscaling_runtime;
        }
    }

    // directly calculates new iters based on elapsed&iters, and adds up to 20% noise. Makes sure we don't underflow.
    ANKERL_NANOBENCH(NODISCARD) uint64_t calcBestNumIters(std::chrono::nanoseconds elapsed, uint64_t iters) noexcept {
        auto doubleElapsed = d(elapsed);
        auto doubleTargetRuntimePerEpoch = d(mTargetRuntimePerEpoch);
        // scale iterations so that the next epoch hits the target runtime
        auto doubleNewIters = doubleTargetRuntimePerEpoch / doubleElapsed * d(iters);

        auto doubleMinEpochIters = d(mBench.minEpochIterations());
        if (doubleNewIters < doubleMinEpochIters) {
            doubleNewIters = doubleMinEpochIters;
        }
        // random noise de-correlates epochs from periodic system activity
        doubleNewIters *= 1.0 + 0.2 * mRng.uniform01();

        // +0.5 for correct rounding when casting
        // NOLINTNEXTLINE(bugprone-incorrect-roundings)
        return static_cast<uint64_t>(doubleNewIters + 0.5);
    }

    // Grows mNumIters towards the target epoch runtime: x10 while far below
    // target, otherwise a direct proportional estimate.
    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined") void upscale(std::chrono::nanoseconds elapsed) {
        if (elapsed * 10 < mTargetRuntimePerEpoch) {
            // we are far below the target runtime. Multiply iterations by 10 (with overflow check)
            if (mNumIters * 10 < mNumIters) {
                // overflow :-(
                showResult("iterations overflow. Maybe your code got optimized away?");
                mNumIters = 0;
                return;
            }
            mNumIters *= 10;
        } else {
            mNumIters = calcBestNumIters(elapsed, mNumIters);
        }
    }

    // Feeds one finished epoch into the state machine; updates mNumIters for
    // the next epoch, and finishes (mNumIters = 0) once all epochs are done.
    void add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
#    if defined(ANKERL_NANOBENCH_LOG_ENABLED)
        auto oldIters = mNumIters;
#    endif

        switch (mState) {
        case State::warmup:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if elapsed is close enough, we can skip upscaling and go right to measurements
                // still, we don't add the result to the measurements.
                mState = State::measuring;
                mNumIters = calcBestNumIters(elapsed, mNumIters);
            } else {
                // not close enough: switch to upscaling
                mState = State::upscaling_runtime;
                upscale(elapsed);
            }
            break;

        case State::upscaling_runtime:
            if (isCloseEnoughForMeasurements(elapsed)) {
                // if we are close enough, add measurement and switch to always measuring
                mState = State::measuring;
                mTotalElapsed += elapsed;
                mTotalNumIters += mNumIters;
                mResult.add(elapsed, mNumIters, pc);
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            } else {
                upscale(elapsed);
            }
            break;

        case State::measuring:
            // just add measurements - no questions asked. Even when runtime is low. But we can't ignore
            // that fluctuation, or else we would bias the result
            mTotalElapsed += elapsed;
            mTotalNumIters += mNumIters;
            mResult.add(elapsed, mNumIters, pc);
            if (0 != mBench.epochIterations()) {
                mNumIters = mBench.epochIterations();
            } else {
                mNumIters = calcBestNumIters(mTotalElapsed, mTotalNumIters);
            }
            break;

        case State::endless:
            mNumIters = (std::numeric_limits<uint64_t>::max)();
            break;
        }

        if (static_cast<uint64_t>(mResult.size()) == mBench.epochs()) {
            // we got all the results that we need, finish it
            showResult("");
            mNumIters = 0;
        }

        ANKERL_NANOBENCH_LOG(mBench.name() << ": " << detail::fmt::Number(20, 3, d(elapsed.count())) << " elapsed, "
                                           << detail::fmt::Number(20, 3, d(mTargetRuntimePerEpoch.count())) << " target. oldIters="
                                           << oldIters << ", mNumIters=" << mNumIters << ", mState=" << static_cast<int>(mState));
    }

    // Renders one row of the markdown result table (plus the header when the
    // table layout changed); with a non-empty errorMessage, renders an
    // invalid row instead. No-op when the Bench has no output stream.
    // NOLINTNEXTLINE(readability-function-cognitive-complexity)
    void showResult(std::string const& errorMessage) const {
        ANKERL_NANOBENCH_LOG(errorMessage);

        if (mBench.output() != nullptr) {
            // prepare column data ///////
            std::vector<fmt::MarkDownColumn> columns;

            auto rMedian = mResult.median(Result::Measure::elapsed);

            if (mBench.relative()) {
                // percentage relative to the first result of this Bench
                double d = 100.0;
                if (!mBench.results().empty()) {
                    d = rMedian <= 0.0 ? 0.0 : mBench.results().front().median(Result::Measure::elapsed) / rMedian * 100.0;
                }
                columns.emplace_back(11, 1, "relative", "%", d);
            }

            if (mBench.complexityN() > 0) {
                columns.emplace_back(14, 0, "complexityN", "", mBench.complexityN());
            }

            columns.emplace_back(22, 2, mBench.timeUnitName() + "/" + mBench.unit(), "",
                                 rMedian / (mBench.timeUnit().count() * mBench.batch()));
            columns.emplace_back(22, 2, mBench.unit() + "/s", "", rMedian <= 0.0 ? 0.0 : mBench.batch() / rMedian);

            double const rErrorMedian = mResult.medianAbsolutePercentError(Result::Measure::elapsed);
            columns.emplace_back(10, 1, "err%", "%", rErrorMedian * 100.0);

            // hardware counter columns, only when available
            double rInsMedian = -1.0;
            if (mBench.performanceCounters() && mResult.has(Result::Measure::instructions)) {
                rInsMedian = mResult.median(Result::Measure::instructions);
                columns.emplace_back(18, 2, "ins/" + mBench.unit(), "", rInsMedian / mBench.batch());
            }

            double rCycMedian = -1.0;
            if (mBench.performanceCounters() && mResult.has(Result::Measure::cpucycles)) {
                rCycMedian = mResult.median(Result::Measure::cpucycles);
                columns.emplace_back(18, 2, "cyc/" + mBench.unit(), "", rCycMedian / mBench.batch());
            }
            if (rInsMedian > 0.0 && rCycMedian > 0.0) {
                columns.emplace_back(9, 3, "IPC", "", rCycMedian <= 0.0 ? 0.0 : rInsMedian / rCycMedian);
            }
            if (mBench.performanceCounters() && mResult.has(Result::Measure::branchinstructions)) {
                double const rBraMedian = mResult.median(Result::Measure::branchinstructions);
                columns.emplace_back(17, 2, "bra/" + mBench.unit(), "", rBraMedian / mBench.batch());
                if (mResult.has(Result::Measure::branchmisses)) {
                    double p = 0.0;
                    if (rBraMedian >= 1e-9) {
                        p = 100.0 * mResult.median(Result::Measure::branchmisses) / rBraMedian;
                    }
                    columns.emplace_back(10, 1, "miss%", "%", p);
                }
            }

            columns.emplace_back(12, 2, "total", "", mResult.sumProduct(Result::Measure::iterations, Result::Measure::elapsed));

            // write everything
            auto& os = *mBench.output();

            // combine all elements that are relevant for printing the header
            uint64_t hash = 0;
            hash = hash_combine(std::hash<std::string>{}(mBench.unit()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.title()), hash);
            hash = hash_combine(std::hash<std::string>{}(mBench.timeUnitName()), hash);
            hash = hash_combine(std::hash<double>{}(mBench.timeUnit().count()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.relative()), hash);
            hash = hash_combine(std::hash<bool>{}(mBench.performanceCounters()), hash);

            if (hash != singletonHeaderHash()) {
                singletonHeaderHash() = hash;

                // no result yet, print header
                os << std::endl;
                for (auto const& col : columns) {
                    os << col.title();
                }
                os << "| " << mBench.title() << std::endl;

                for (auto const& col : columns) {
                    os << col.separator();
                }
                os << "|:" << std::string(mBench.title().size() + 1U, '-') << std::endl;
            }

            if (!errorMessage.empty()) {
                // invalid row: placeholder cells plus the error message
                for (auto const& col : columns) {
                    os << col.invalid();
                }
                os << "| :boom: " << fmt::MarkDownCode(mBench.name()) << " (" << errorMessage << ')' << std::endl;
            } else {
                for (auto const& col : columns) {
                    os << col.value();
                }
                os << "| ";
                // flag results whose median absolute percent error is >= 5%
                auto showUnstable = isWarningsEnabled() && rErrorMedian >= 0.05;
                if (showUnstable) {
                    os << ":wavy_dash: ";
                }
                os << fmt::MarkDownCode(mBench.name());
                if (showUnstable) {
                    auto avgIters = d(mTotalNumIters) / d(mBench.epochs());
                    // NOLINTNEXTLINE(bugprone-incorrect-roundings)
                    auto suggestedIters = static_cast<uint64_t>(avgIters * 10 + 0.5);

                    os << " (Unstable with ~" << detail::fmt::Number(1, 1, avgIters)
                       << " iters. Increase `minEpochIterations` to e.g. " << suggestedIters << ")";
                }
                os << std::endl;
            }
        }
    }

    // an epoch is measurable once it reaches at least 2/3 of the target runtime
    ANKERL_NANOBENCH(NODISCARD) bool isCloseEnoughForMeasurements(std::chrono::nanoseconds elapsed) const noexcept {
        return elapsed * 3 >= mTargetRuntimePerEpoch * 2;
    }

    uint64_t mNumIters = 1;                            // NOLINT(misc-non-private-member-variables-in-classes)
    Bench const& mBench;                               // NOLINT(misc-non-private-member-variables-in-classes)
    std::chrono::nanoseconds mTargetRuntimePerEpoch{}; // NOLINT(misc-non-private-member-variables-in-classes)
    Result mResult;                                    // NOLINT(misc-non-private-member-variables-in-classes)
    Rng mRng{123};                                     // NOLINT(misc-non-private-member-variables-in-classes)
    std::chrono::nanoseconds mTotalElapsed{};          // NOLINT(misc-non-private-member-variables-in-classes)
    uint64_t mTotalNumIters = 0;                       // NOLINT(misc-non-private-member-variables-in-classes)
    State mState = State::upscaling_runtime;           // NOLINT(misc-non-private-member-variables-in-classes)
};
ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2451
// pimpl: raw owning pointer, released in ~IterationLogic
IterationLogic::IterationLogic(Bench const& bench)
    : mPimpl(new Impl(bench)) {}
2454
IterationLogic::~IterationLogic() {
    // destroys the pimpl allocated in the constructor
    delete mPimpl;
}
2458
// number of iterations the next epoch should run; 0 means "done"
uint64_t IterationLogic::numIters() const noexcept {
    ANKERL_NANOBENCH_LOG(mPimpl->mBench.name() << ": mNumIters=" << mPimpl->mNumIters);
    return mPimpl->mNumIters;
}
2463
2464void IterationLogic::add(std::chrono::nanoseconds elapsed, PerformanceCounters const& pc) noexcept {
2465 mPimpl->add(elapsed, pc);
2466}
2467
2468void IterationLogic::moveResultTo(std::vector<Result>& results) noexcept {
2469 results.emplace_back(std::move(mPimpl->mResult));
2470}
2471
2472# if ANKERL_NANOBENCH(PERF_COUNTERS)
2473
2474ANKERL_NANOBENCH(IGNORE_PADDED_PUSH)
// Wraps Linux perf_event_open counters. All monitored events are attached to a
// single group leader fd (mFd), so one read() retrieves every counter at once
// (PERF_FORMAT_GROUP). Any syscall failure sets mHasError, which disables all
// subsequent measurements and result updates.
class LinuxPerformanceCounters {
public:
    // Describes where a monitored counter's value is written and which
    // overhead corrections updateResults() should subtract from it.
    struct Target {
        Target(uint64_t* targetValue_, bool correctMeasuringOverhead_, bool correctLoopOverhead_)
            : targetValue(targetValue_)
            , correctMeasuringOverhead(correctMeasuringOverhead_)
            , correctLoopOverhead(correctLoopOverhead_) {}

        uint64_t* targetValue{}; // NOLINT(misc-non-private-member-variables-in-classes)
        bool correctMeasuringOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
        bool correctLoopOverhead{}; // NOLINT(misc-non-private-member-variables-in-classes)
    };

    LinuxPerformanceCounters() = default;
    LinuxPerformanceCounters(LinuxPerformanceCounters const&) = delete;
    LinuxPerformanceCounters(LinuxPerformanceCounters&&) = delete;
    LinuxPerformanceCounters& operator=(LinuxPerformanceCounters const&) = delete;
    LinuxPerformanceCounters& operator=(LinuxPerformanceCounters&&) = delete;
    ~LinuxPerformanceCounters();

    // quick operation
    inline void start() {}

    inline void stop() {}

    // Registers a software / hardware event to monitor; returns false on failure.
    bool monitor(perf_sw_ids swId, Target target);
    bool monitor(perf_hw_id hwId, Target target);

    ANKERL_NANOBENCH(NODISCARD) bool hasError() const noexcept {
        return mHasError;
    }

    // Just reading data is faster than enable & disabling.
    // we subtract data ourselves.
    inline void beginMeasure() {
        if (mHasError) {
            return;
        }

        // reset all counters in the group, then enable them
        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
        if (mHasError) {
            return;
        }

        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
        mHasError = -1 == ioctl(mFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    }

    inline void endMeasure() {
        if (mHasError) {
            return;
        }

        // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
        mHasError = (-1 == ioctl(mFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP));
        if (mHasError) {
            return;
        }

        // one read() fetches the whole counter group into mCounters
        auto const numBytes = sizeof(uint64_t) * mCounters.size();
        auto ret = read(mFd, mCounters.data(), numBytes);
        mHasError = ret != static_cast<ssize_t>(numBytes);
    }

    // Writes corrected counter values into the registered Target locations.
    void updateResults(uint64_t numIters);

    // rounded integer division
    template <typename T>
    static inline T divRounded(T a, T divisor) {
        return (a + divisor / 2) / divisor;
    }

    // Marsaglia xorshift step; used as cheap, unoptimizable work in calibrate().
    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
    static inline uint32_t mix(uint32_t x) noexcept {
        x ^= x << 13U;
        x ^= x >> 17U;
        x ^= x << 5U;
        return x;
    }

    // Determines both the fixed measuring overhead (begin/endMeasure pair) and
    // the per-iteration loop overhead, stored for later correction.
    template <typename Op>
    ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
    void calibrate(Op&& op) {
        // clear current calibration data,
        for (auto& v : mCalibratedOverhead) {
            v = UINT64_C(0);
        }

        // create new calibration data
        auto newCalibration = mCalibratedOverhead;
        for (auto& v : newCalibration) {
            v = (std::numeric_limits<uint64_t>::max)();
        }
        // take the minimum over many runs as the measuring overhead estimate
        for (size_t iter = 0; iter < 100; ++iter) {
            beginMeasure();
            op();
            endMeasure();
            if (mHasError) {
                return;
            }

            for (size_t i = 0; i < newCalibration.size(); ++i) {
                auto diff = mCounters[i];
                if (newCalibration[i] > diff) {
                    newCalibration[i] = diff;
                }
            }
        }

        mCalibratedOverhead = std::move(newCalibration);

        {
            // calibrate loop overhead. For branches & instructions this makes sense, not so much for everything else like cycles.
            // marsaglia's xorshift: mov, sal/shr, xor. Times 3.
            // This has the nice property that the compiler doesn't seem to be able to optimize multiple calls any further.
            // see https://godbolt.org/z/49RVQ5
            // random low bits keep the iteration count from being a compile-time constant
            uint64_t const numIters = 100000U + (std::random_device{}() & 3U);
            uint64_t n = numIters;
            uint32_t x = 1234567;

            beginMeasure();
            while (n-- > 0) {
                x = mix(x);
            }
            endMeasure();
            auto measure1 = mCounters;

            n = numIters;
            beginMeasure();
            while (n-- > 0) {
                // we now run *twice* so we can easily calculate the overhead
                x = mix(x);
                x = mix(x);
            }
            endMeasure();
            auto measure2 = mCounters;

            // loop overhead = 2*(1 mix/iter run) - (2 mix/iter run), per iteration
            for (size_t i = 0; i < mCounters.size(); ++i) {
                // factor 2 because we have two instructions per loop
                auto m1 = measure1[i] > mCalibratedOverhead[i] ? measure1[i] - mCalibratedOverhead[i] : 0;
                auto m2 = measure2[i] > mCalibratedOverhead[i] ? measure2[i] - mCalibratedOverhead[i] : 0;
                auto overhead = m1 * 2 > m2 ? m1 * 2 - m2 : 0;

                mLoopOverhead[i] = divRounded(overhead, numIters);
            }
        }
    }

private:
    bool monitor(uint32_t type, uint64_t eventid, Target target);

    // maps perf event id -> where/how to store that counter's value
    std::map<uint64_t, Target> mIdToTarget{};

    // start with minimum size of 3 for read_format
    // layout (PERF_FORMAT_GROUP): [0]=nr, [1]=time_enabled, [2]=time_running,
    // then one (value, id) pair per monitored counter.
    std::vector<uint64_t> mCounters{3};
    std::vector<uint64_t> mCalibratedOverhead{3};
    std::vector<uint64_t> mLoopOverhead{3};

    uint64_t mTimeEnabledNanos = 0;
    uint64_t mTimeRunningNanos = 0;
    int mFd = -1; // group leader fd; -1 until the first monitor() succeeds
    bool mHasError = false;
};
2641ANKERL_NANOBENCH(IGNORE_PADDED_POP)
2642
LinuxPerformanceCounters::~LinuxPerformanceCounters() {
    // closing the group leader fd releases all counters attached to it
    if (-1 != mFd) {
        close(mFd);
    }
}

// Convenience overload for software events (page faults, context switches, ...).
bool LinuxPerformanceCounters::monitor(perf_sw_ids swId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_SOFTWARE, swId, target);
}

// Convenience overload for hardware events (cycles, instructions, branches, ...).
bool LinuxPerformanceCounters::monitor(perf_hw_id hwId, LinuxPerformanceCounters::Target target) {
    return monitor(PERF_TYPE_HARDWARE, hwId, target);
}
2656
// overflow is ok, it's checked
ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
void LinuxPerformanceCounters::updateResults(uint64_t numIters) {
    // clear old data
    for (auto& id_value : mIdToTarget) {
        *id_value.second.targetValue = UINT64_C(0);
    }

    if (mHasError) {
        return;
    }

    // PERF_FORMAT_GROUP read layout: [0]=nr, [1]=time_enabled, [2]=time_running,
    // then (value, id) pairs for each counter in the group.
    mTimeEnabledNanos = mCounters[1] - mCalibratedOverhead[1];
    mTimeRunningNanos = mCounters[2] - mCalibratedOverhead[2];

    for (uint64_t i = 0; i < mCounters[0]; ++i) {
        auto idx = static_cast<size_t>(3 + i * 2 + 0);
        auto id = mCounters[idx + 1U];

        auto it = mIdToTarget.find(id);
        if (it != mIdToTarget.end()) {

            auto& tgt = it->second;
            *tgt.targetValue = mCounters[idx];
            if (tgt.correctMeasuringOverhead) {
                // subtract calibrated begin/endMeasure overhead, clamped at 0
                if (*tgt.targetValue >= mCalibratedOverhead[idx]) {
                    *tgt.targetValue -= mCalibratedOverhead[idx];
                } else {
                    *tgt.targetValue = 0U;
                }
            }
            if (tgt.correctLoopOverhead) {
                // subtract per-iteration benchmark loop overhead, clamped at 0
                auto correctionVal = mLoopOverhead[idx] * numIters;
                if (*tgt.targetValue >= correctionVal) {
                    *tgt.targetValue -= correctionVal;
                } else {
                    *tgt.targetValue = 0U;
                }
            }
        }
    }
}
2699
// Opens a perf event of the given type/id and attaches it to the counter group.
// Returns false (leaving *target.targetValue at max) when the event can't be
// opened or identified; the caller then marks that metric as unavailable.
bool LinuxPerformanceCounters::monitor(uint32_t type, uint64_t eventid, Target target) {
    *target.targetValue = (std::numeric_limits<uint64_t>::max)();
    if (mHasError) {
        return false;
    }

    auto pea = perf_event_attr();
    std::memset(&pea, 0, sizeof(perf_event_attr));
    pea.type = type;
    pea.size = sizeof(perf_event_attr);
    pea.config = eventid;
    pea.disabled = 1; // start counter as disabled
    pea.exclude_kernel = 1;
    pea.exclude_hv = 1;

    // request group-wide reads plus ids and enabled/running times
    // NOLINTNEXTLINE(hicpp-signed-bitwise)
    pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

    const int pid = 0;  // the current process
    const int cpu = -1; // all CPUs
#    if defined(PERF_FLAG_FD_CLOEXEC) // since Linux 3.14
    const unsigned long flags = PERF_FLAG_FD_CLOEXEC;
#    else
    const unsigned long flags = 0;
#    endif

    // group_fd == -1 on the first call creates a new group; later calls join it
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    auto fd = static_cast<int>(syscall(__NR_perf_event_open, &pea, pid, cpu, mFd, flags));
    if (-1 == fd) {
        return false;
    }
    if (-1 == mFd) {
        // first call: set to fd, and use this from now on
        mFd = fd;
    }
    uint64_t id = 0;
    // NOLINTNEXTLINE(hicpp-signed-bitwise,cppcoreguidelines-pro-type-vararg)
    if (-1 == ioctl(fd, PERF_EVENT_IOC_ID, &id)) {
        // couldn't get id
        return false;
    }

    // insert into map, rely on the fact that map's references are constant.
    mIdToTarget.emplace(id, target);

    // prepare readformat with the correct size (after the insert)
    auto size = 3 + 2 * mIdToTarget.size();
    mCounters.resize(size);
    mCalibratedOverhead.resize(size);
    mLoopOverhead.resize(size);

    return true;
}
2753
// Sets up all supported hardware & software counters; each mHas flag records
// whether the corresponding event could actually be monitored on this system.
PerformanceCounters::PerformanceCounters()
    : mPc(new LinuxPerformanceCounters())
    , mVal()
    , mHas() {

    // HW events
    mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_REF_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
    if (!mHas.cpuCycles) {
        // Fallback to cycles counter, reference cycles not available in many systems.
        mHas.cpuCycles = mPc->monitor(PERF_COUNT_HW_CPU_CYCLES, LinuxPerformanceCounters::Target(&mVal.cpuCycles, true, false));
    }
    mHas.instructions = mPc->monitor(PERF_COUNT_HW_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.instructions, true, true));
    mHas.branchInstructions =
        mPc->monitor(PERF_COUNT_HW_BRANCH_INSTRUCTIONS, LinuxPerformanceCounters::Target(&mVal.branchInstructions, true, false));
    mHas.branchMisses = mPc->monitor(PERF_COUNT_HW_BRANCH_MISSES, LinuxPerformanceCounters::Target(&mVal.branchMisses, true, false));
    // mHas.branchMisses = false;

    // SW events
    mHas.pageFaults = mPc->monitor(PERF_COUNT_SW_PAGE_FAULTS, LinuxPerformanceCounters::Target(&mVal.pageFaults, true, false));
    mHas.contextSwitches =
        mPc->monitor(PERF_COUNT_SW_CONTEXT_SWITCHES, LinuxPerformanceCounters::Target(&mVal.contextSwitches, true, false));

    mPc->start();
    // calibrate against two clock reads, the minimal work done per measurement
    mPc->calibrate([] {
        auto before = ankerl::nanobench::Clock::now();
        auto after = ankerl::nanobench::Clock::now();
        (void)before;
        (void)after;
    });

    if (mPc->hasError()) {
        // something failed, don't monitor anything.
        mHas = PerfCountSet<bool>{};
    }
}
2789
PerformanceCounters::~PerformanceCounters() {
    // no need to check for nullptr, delete nullptr has no effect
    delete mPc;
}

// Thin forwarders to the platform implementation.
void PerformanceCounters::beginMeasure() {
    mPc->beginMeasure();
}

void PerformanceCounters::endMeasure() {
    mPc->endMeasure();
}

void PerformanceCounters::updateResults(uint64_t numIters) {
    mPc->updateResults(numIters);
}
2806
2807# else
2808
// Fallback when perf counters are unavailable: everything is a no-op, so the
// PerfCountSet values keep their defaults and has() reports all false.
PerformanceCounters::PerformanceCounters() = default;
PerformanceCounters::~PerformanceCounters() = default;
void PerformanceCounters::beginMeasure() {}
void PerformanceCounters::endMeasure() {}
void PerformanceCounters::updateResults(uint64_t) {}
2814
2815# endif
2816
// Latest measured counter values (valid only where has() is true).
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<uint64_t> const& PerformanceCounters::val() const noexcept {
    return mVal;
}
// Which counters are actually available on this system.
ANKERL_NANOBENCH(NODISCARD) PerfCountSet<bool> const& PerformanceCounters::has() const noexcept {
    return mHas;
}
2823
2824// formatting utilities
2825namespace fmt {
2826
// adds thousands separator to numbers
NumSep::NumSep(char sep)
    : mSep(sep) {}

char NumSep::do_thousands_sep() const {
    return mSep;
}

std::string NumSep::do_grouping() const {
    // "\003" means: group digits in blocks of three (e.g. 1,234,567)
    return "\003";
}
2838
// RAII to save & restore a stream's state
StreamStateRestorer::StreamStateRestorer(std::ostream& s)
    : mStream(s)
    , mLocale(s.getloc())
    , mPrecision(s.precision())
    , mWidth(s.width())
    , mFill(s.fill())
    , mFmtFlags(s.flags()) {}

StreamStateRestorer::~StreamStateRestorer() {
    restore();
}

// sets back all stream info that we remembered at construction
void StreamStateRestorer::restore() {
    mStream.imbue(mLocale);
    mStream.precision(mPrecision);
    mStream.width(mWidth);
    mStream.fill(mFill);
    mStream.flags(mFmtFlags);
}
2860
// Fixed-width, fixed-precision number for table output; integers are widened
// to double for uniform formatting.
Number::Number(int width, int precision, int64_t value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(d(value)) {}

Number::Number(int width, int precision, double value)
    : mWidth(width)
    , mPrecision(precision)
    , mValue(value) {}
2870
// Writes the value with thousands separators; stream state is restored on exit.
std::ostream& Number::write(std::ostream& os) const {
    StreamStateRestorer const restorer(os);
    // the locale takes ownership of the NumSep facet and deletes it
    os.imbue(std::locale(os.getloc(), new NumSep(',')));
    os << std::setw(mWidth) << std::setprecision(mPrecision) << std::fixed << mValue;
    return os;
}
2877
// Formats the number into a string using the same rules as write().
std::string Number::to_s() const {
    std::stringstream ss;
    write(ss);
    return ss.str();
}
2883
// Converts an unsigned 64 bit value into its plain decimal representation
// (no separators, no locale dependence).
std::string to_s(uint64_t n) {
    // std::to_string produces exactly the same digits as the previous
    // hand-rolled divide-by-10 loop, including "0" for n == 0, and is
    // locale-independent per the standard.
    return std::to_string(n);
}
2893
// Stream insertion for Number; delegates to write().
std::ostream& operator<<(std::ostream& os, Number const& n) {
    return n.write(os);
}
2897
// One column of the markdown result table: width, precision, header title,
// value suffix (e.g. "%"), and the value itself.
MarkDownColumn::MarkDownColumn(int w, int prec, std::string tit, std::string suff, double val) noexcept
    : mWidth(w)
    , mPrecision(prec)
    , mTitle(std::move(tit))
    , mSuffix(std::move(suff))
    , mValue(val) {}
2904
// Header cell: '|' followed by the right-aligned title and a trailing space.
std::string MarkDownColumn::title() const {
    std::stringstream ss;
    ss << '|' << std::setw(mWidth - 2) << std::right << mTitle << ' ';
    return ss.str();
}
2910
2911std::string MarkDownColumn::separator() const {
2912 std::string sep(static_cast<size_t>(mWidth), '-');
2913 sep.front() = '|';
2914 sep.back() = ':';
2915 return sep;
2916}
2917
2918std::string MarkDownColumn::invalid() const {
2919 std::string sep(static_cast<size_t>(mWidth), ' ');
2920 sep.front() = '|';
2921 sep[sep.size() - 2] = '-';
2922 return sep;
2923}
2924
// Value cell: '|', the number right-aligned into the remaining width, then the
// suffix and a trailing space.
std::string MarkDownColumn::value() const {
    std::stringstream ss;
    auto width = mWidth - 2 - static_cast<int>(mSuffix.size());
    ss << '|' << Number(width, mPrecision, mValue) << mSuffix << ' ';
    return ss.str();
}
2931
2932// Formats any text as markdown code, escaping backticks.
2933MarkDownCode::MarkDownCode(std::string const& what) {
2934 mWhat.reserve(what.size() + 2);
2935 mWhat.push_back('`');
2936 for (char const c : what) {
2937 mWhat.push_back(c);
2938 if ('`' == c) {
2939 mWhat.push_back('`');
2940 }
2941 }
2942 mWhat.push_back('`');
2943}
2944
// Writes the pre-escaped code span built in the constructor.
std::ostream& MarkDownCode::write(std::ostream& os) const {
    return os << mWhat;
}

std::ostream& operator<<(std::ostream& os, MarkDownCode const& mdCode) {
    return mdCode.write(os);
}
2952} // namespace fmt
2953} // namespace detail
2954
// provide implementation here so it's only generated once
Config::Config() = default;
Config::~Config() = default;
Config& Config::operator=(Config const&) = default;
Config& Config::operator=(Config&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
Config::Config(Config const&) = default;
Config::Config(Config&&) noexcept = default;
2962
// provide implementation here so it's only generated once
Result::~Result() = default;
Result& Result::operator=(Result const&) = default;
Result& Result::operator=(Result&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
Result::Result(Result const&) = default;
Result::Result(Result&&) noexcept = default;
2969
namespace detail {
// Shorthand: converts an enum value to its underlying integer type, used to
// index mNameToMeasurements by Result::Measure.
template <typename T>
inline constexpr typename std::underlying_type<T>::type u(T val) noexcept {
    return static_cast<typename std::underlying_type<T>::type>(val);
}
} // namespace detail
2976
// Result returned after a benchmark has finished. Can be used as a baseline for relative().
// One measurement vector is allocated per Measure enum entry.
Result::Result(Config benchmarkConfig)
    : mConfig(std::move(benchmarkConfig))
    , mNameToMeasurements{detail::u(Result::Measure::_size)} {}
2981
// Records one epoch: stores per-iteration values (elapsed time and each
// available performance counter divided by the iteration count).
void Result::add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const& pc) {
    using detail::d;
    using detail::u;

    double const dIters = d(iters);
    mNameToMeasurements[u(Result::Measure::iterations)].push_back(dIters);

    mNameToMeasurements[u(Result::Measure::elapsed)].push_back(d(totalElapsed) / dIters);
    if (pc.has().pageFaults) {
        mNameToMeasurements[u(Result::Measure::pagefaults)].push_back(d(pc.val().pageFaults) / dIters);
    }
    if (pc.has().cpuCycles) {
        mNameToMeasurements[u(Result::Measure::cpucycles)].push_back(d(pc.val().cpuCycles) / dIters);
    }
    if (pc.has().contextSwitches) {
        mNameToMeasurements[u(Result::Measure::contextswitches)].push_back(d(pc.val().contextSwitches) / dIters);
    }
    if (pc.has().instructions) {
        mNameToMeasurements[u(Result::Measure::instructions)].push_back(d(pc.val().instructions) / dIters);
    }
    if (pc.has().branchInstructions) {
        double branchInstructions = 0.0;
        // correcting branches: remove branch introduced by the while (...) loop for each iteration.
        if (pc.val().branchInstructions > iters + 1U) {
            branchInstructions = d(pc.val().branchInstructions - (iters + 1U));
        }
        mNameToMeasurements[u(Result::Measure::branchinstructions)].push_back(branchInstructions / dIters);

        // branch misses are only meaningful relative to the corrected branch count
        if (pc.has().branchMisses) {
            // correcting branch misses
            double branchMisses = d(pc.val().branchMisses);
            if (branchMisses > branchInstructions) {
                // can't have branch misses when there were branches...
                branchMisses = branchInstructions;
            }

            // assuming at least one missed branch for the loop
            branchMisses -= 1.0;
            if (branchMisses < 1.0) {
                branchMisses = 1.0;
            }
            mNameToMeasurements[u(Result::Measure::branchmisses)].push_back(branchMisses / dIters);
        }
    }
}
3027
// The configuration this result was collected with.
Config const& Result::config() const noexcept {
    return mConfig;
}
3031
// Median of a data set; 0.0 for empty input. NOTE: sorts `data` in place, so
// callers pass a copy of their measurements.
inline double calcMedian(std::vector<double>& data) {
    if (data.empty()) {
        return 0.0;
    }
    std::sort(data.begin(), data.end());

    auto const mid = data.size() / 2U;
    // odd count: middle element; even count: mean of the two middle elements
    return (data.size() % 2U != 0U) ? data[mid] : (data[mid - 1U] + data[mid]) * 0.5;
}
3044
// Median of all epoch measurements for the given measure.
double Result::median(Measure m) const {
    // create a copy so we can sort
    auto data = mNameToMeasurements[detail::u(m)];
    return calcMedian(data);
}
3050
// Arithmetic mean of all epoch measurements; 0.0 when nothing was measured.
double Result::average(Measure m) const {
    using detail::d;
    auto const& data = mNameToMeasurements[detail::u(m)];
    if (data.empty()) {
        return 0.0;
    }

    // mean = sum of all measurements divided by their count
    return sum(m) / d(data.size());
}
3061
// Median absolute percentage error (MdAPE) of the measurements; used as the
// stability indicator (e.g. the "err%" column).
double Result::medianAbsolutePercentError(Measure m) const {
    // create copy
    auto data = mNameToMeasurements[detail::u(m)];

    // calculates MdAPE which is the median of percentage error
    // see https://support.numxl.com/hc/en-us/articles/115001223503-MdAPE-Median-Absolute-Percentage-Error
    auto med = calcMedian(data);

    // transform the data to absolute error
    // NOTE(review): a measurement of exactly 0 would divide by zero here,
    // producing NaN/inf in the median — presumably measurements are > 0.
    for (auto& x : data) {
        x = (x - med) / x;
        if (x < 0) {
            x = -x;
        }
    }
    return calcMedian(data);
}
3079
3080double Result::sum(Measure m) const noexcept {
3081 auto const& data = mNameToMeasurements[detail::u(m)];
3082 return std::accumulate(data.begin(), data.end(), 0.0);
3083}
3084
3085double Result::sumProduct(Measure m1, Measure m2) const noexcept {
3086 auto const& data1 = mNameToMeasurements[detail::u(m1)];
3087 auto const& data2 = mNameToMeasurements[detail::u(m2)];
3088
3089 if (data1.size() != data2.size()) {
3090 return 0.0;
3091 }
3092
3093 double result = 0.0;
3094 for (size_t i = 0, s = data1.size(); i != s; ++i) {
3095 result += data1[i] * data2[i];
3096 }
3097 return result;
3098}
3099
// True when at least one value was recorded for that measure.
bool Result::has(Measure m) const noexcept {
    return !mNameToMeasurements[detail::u(m)].empty();
}

// Measurement of epoch `idx`; throws std::out_of_range for an invalid index.
double Result::get(size_t idx, Measure m) const {
    auto const& data = mNameToMeasurements[detail::u(m)];
    return data.at(idx);
}

bool Result::empty() const noexcept {
    return 0U == size();
}

// Number of recorded epochs (elapsed time is recorded for every epoch).
size_t Result::size() const noexcept {
    auto const& data = mNameToMeasurements[detail::u(Measure::elapsed)];
    return data.size();
}
3117
3118double Result::minimum(Measure m) const noexcept {
3119 auto const& data = mNameToMeasurements[detail::u(m)];
3120 if (data.empty()) {
3121 return 0.0;
3122 }
3123
3124 // here its save to assume that at least one element is there
3125 return *std::min_element(data.begin(), data.end());
3126}
3127
3128double Result::maximum(Measure m) const noexcept {
3129 auto const& data = mNameToMeasurements[detail::u(m)];
3130 if (data.empty()) {
3131 return 0.0;
3132 }
3133
3134 // here its save to assume that at least one element is there
3135 return *std::max_element(data.begin(), data.end());
3136}
3137
// Looks up a context variable set via Bench::context(); throws
// std::out_of_range when the variable was never set.
std::string const& Result::context(char const* variableName) const {
    return mConfig.mContext.at(variableName);
}

std::string const& Result::context(std::string const& variableName) const {
    return mConfig.mContext.at(variableName);
}
3145
3146Result::Measure Result::fromString(std::string const& str) {
3147 if (str == "elapsed") {
3148 return Measure::elapsed;
3149 }
3150 if (str == "iterations") {
3151 return Measure::iterations;
3152 }
3153 if (str == "pagefaults") {
3154 return Measure::pagefaults;
3155 }
3156 if (str == "cpucycles") {
3157 return Measure::cpucycles;
3158 }
3159 if (str == "contextswitches") {
3160 return Measure::contextswitches;
3161 }
3162 if (str == "instructions") {
3163 return Measure::instructions;
3164 }
3165 if (str == "branchinstructions") {
3166 return Measure::branchinstructions;
3167 }
3168 if (str == "branchmisses") {
3169 return Measure::branchmisses;
3170 }
3171 // not found, return _size
3172 return Measure::_size;
3173}
3174
// Configuration of a microbenchmark.
// By default results are printed to std::cout.
Bench::Bench() {
    mConfig.mOut = &std::cout;
}
3179
// defaulted special members; defined here so they are generated only once
Bench::Bench(Bench&&) noexcept = default;
Bench& Bench::operator=(Bench&&) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE)) = default;
Bench::Bench(Bench const&) = default;
Bench& Bench::operator=(Bench const&) = default;
Bench::~Bench() noexcept = default;
3185
// Currently configured batch size (number of items processed per iteration).
double Bench::batch() const noexcept {
    return mConfig.mBatch;
}

// Currently configured complexity N, used by complexityBigO().
double Bench::complexityN() const noexcept {
    return mConfig.mComplexityN;
}
3193
// Set a baseline to compare it to. 100% it is exactly as fast as the baseline, >100% means it is faster than the baseline, <100%
// means it is slower than the baseline.
Bench& Bench::relative(bool isRelativeEnabled) noexcept {
    mConfig.mIsRelative = isRelativeEnabled;
    return *this;
}
bool Bench::relative() const noexcept {
    return mConfig.mIsRelative;
}

// Enables/disables the performance counter columns in the output.
Bench& Bench::performanceCounters(bool showPerformanceCounters) noexcept {
    mConfig.mShowPerformanceCounters = showPerformanceCounters;
    return *this;
}
bool Bench::performanceCounters() const noexcept {
    return mConfig.mShowPerformanceCounters;
}
3211
// Operation unit. Defaults to "op", could be e.g. "byte" for string processing.
// If u differs from currently set unit, the stored results will be cleared.
// Use singular (byte, not bytes).
Bench& Bench::unit(char const* u) {
    // std::string comparison: compares content, not pointers
    if (u != mConfig.mUnit) {
        mResults.clear();
    }
    mConfig.mUnit = u;
    return *this;
}

Bench& Bench::unit(std::string const& u) {
    return unit(u.c_str());
}

std::string const& Bench::unit() const noexcept {
    return mConfig.mUnit;
}
3230
// Sets the time unit used in the output (e.g. 1ms with name "ms").
Bench& Bench::timeUnit(std::chrono::duration<double> const& tu, std::string const& tuName) {
    mConfig.mTimeUnit = tu;
    mConfig.mTimeUnitName = tuName;
    return *this;
}

std::string const& Bench::timeUnitName() const noexcept {
    return mConfig.mTimeUnitName;
}

std::chrono::duration<double> const& Bench::timeUnit() const noexcept {
    return mConfig.mTimeUnit;
}
3244
// If benchmarkTitle differs from currently set title, the stored results will be cleared.
Bench& Bench::title(const char* benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}
Bench& Bench::title(std::string const& benchmarkTitle) {
    if (benchmarkTitle != mConfig.mBenchmarkTitle) {
        mResults.clear();
    }
    mConfig.mBenchmarkTitle = benchmarkTitle;
    return *this;
}

std::string const& Bench::title() const noexcept {
    return mConfig.mBenchmarkTitle;
}
3264
// Name of the next benchmark run; shown per row in the output table.
Bench& Bench::name(const char* benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

Bench& Bench::name(std::string const& benchmarkName) {
    mConfig.mBenchmarkName = benchmarkName;
    return *this;
}

std::string const& Bench::name() const noexcept {
    return mConfig.mBenchmarkName;
}
3278
// Stores a key/value context variable that is attached to subsequent results.
Bench& Bench::context(char const* variableName, char const* variableValue) {
    mConfig.mContext[variableName] = variableValue;
    return *this;
}

Bench& Bench::context(std::string const& variableName, std::string const& variableValue) {
    mConfig.mContext[variableName] = variableValue;
    return *this;
}

// Removes all previously set context variables.
Bench& Bench::clearContext() {
    mConfig.mContext.clear();
    return *this;
}
3293
// Number of epochs to evaluate. The reported result will be the median of evaluation of each epoch.
Bench& Bench::epochs(size_t numEpochs) noexcept {
    mConfig.mNumEpochs = numEpochs;
    return *this;
}
size_t Bench::epochs() const noexcept {
    return mConfig.mNumEpochs;
}

// Desired evaluation time is a multiple of clock resolution. Default is to be 1000 times above this measurement precision.
Bench& Bench::clockResolutionMultiple(size_t multiple) noexcept {
    mConfig.mClockResolutionMultiple = multiple;
    return *this;
}
size_t Bench::clockResolutionMultiple() const noexcept {
    return mConfig.mClockResolutionMultiple;
}

// Sets the maximum time each epoch should take. Default is 100ms.
Bench& Bench::maxEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMaxEpochTime = t;
    return *this;
}
std::chrono::nanoseconds Bench::maxEpochTime() const noexcept {
    return mConfig.mMaxEpochTime;
}

// Sets the minimum time each epoch should take.
Bench& Bench::minEpochTime(std::chrono::nanoseconds t) noexcept {
    mConfig.mMinEpochTime = t;
    return *this;
}
std::chrono::nanoseconds Bench::minEpochTime() const noexcept {
    return mConfig.mMinEpochTime;
}

// Minimum number of iterations per epoch; 0 is clamped to 1.
Bench& Bench::minEpochIterations(uint64_t numIters) noexcept {
    mConfig.mMinEpochIterations = (numIters == 0) ? 1 : numIters;
    return *this;
}
uint64_t Bench::minEpochIterations() const noexcept {
    return mConfig.mMinEpochIterations;
}

// Fixed iteration count per epoch; overrides the automatic upscaling.
Bench& Bench::epochIterations(uint64_t numIters) noexcept {
    mConfig.mEpochIterations = numIters;
    return *this;
}
uint64_t Bench::epochIterations() const noexcept {
    return mConfig.mEpochIterations;
}

// Number of untimed warmup iterations run before measuring.
Bench& Bench::warmup(uint64_t numWarmupIters) noexcept {
    mConfig.mWarmup = numWarmupIters;
    return *this;
}
uint64_t Bench::warmup() const noexcept {
    return mConfig.mWarmup;
}
3353
// Replaces the whole configuration at once.
Bench& Bench::config(Config const& benchmarkConfig) {
    mConfig = benchmarkConfig;
    return *this;
}
Config const& Bench::config() const noexcept {
    return mConfig;
}

// Output stream for the result table; pass nullptr to disable output.
Bench& Bench::output(std::ostream* outstream) noexcept {
    mConfig.mOut = outstream;
    return *this;
}

ANKERL_NANOBENCH(NODISCARD) std::ostream* Bench::output() const noexcept {
    return mConfig.mOut;
}
3370
// All results collected by this bench object so far.
std::vector<Result> const& Bench::results() const noexcept {
    return mResults;
}

// Renders the collected results through a mustache-like template into os.
Bench& Bench::render(char const* templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}

Bench& Bench::render(std::string const& templateContent, std::ostream& os) {
    ::ankerl::nanobench::render(templateContent, *this, os);
    return *this;
}
3384
3385std::vector<BigO> Bench::complexityBigO() const {
3386 std::vector<BigO> bigOs;
3387 auto rangeMeasure = BigO::collectRangeMeasure(mResults);
3388 bigOs.emplace_back("O(1)", rangeMeasure, [](double) {
3389 return 1.0;
3390 });
3391 bigOs.emplace_back("O(n)", rangeMeasure, [](double n) {
3392 return n;
3393 });
3394 bigOs.emplace_back("O(log n)", rangeMeasure, [](double n) {
3395 return std::log2(n);
3396 });
3397 bigOs.emplace_back("O(n log n)", rangeMeasure, [](double n) {
3398 return n * std::log2(n);
3399 });
3400 bigOs.emplace_back("O(n^2)", rangeMeasure, [](double n) {
3401 return n * n;
3402 });
3403 bigOs.emplace_back("O(n^3)", rangeMeasure, [](double n) {
3404 return n * n * n;
3405 });
3406 std::sort(bigOs.begin(), bigOs.end());
3407 return bigOs;
3408}
3409
// Random-seeded constructor; retries until the state is not all zero, which
// the generator could never leave.
Rng::Rng()
    : mX(0)
    , mY(0) {
    std::random_device rd;
    std::uniform_int_distribution<uint64_t> dist;
    do {
        mX = dist(rd);
        mY = dist(rd);
    } while (mX == 0 && mY == 0);
}
3420
3421ANKERL_NANOBENCH_NO_SANITIZE("integer", "undefined")
3422uint64_t splitMix64(uint64_t& state) noexcept {
3423 uint64_t z = (state += UINT64_C(0x9e3779b97f4a7c15));
3424 z = (z ^ (z >> 30U)) * UINT64_C(0xbf58476d1ce4e5b9);
3425 z = (z ^ (z >> 27U)) * UINT64_C(0x94d049bb133111eb);
3426 return z ^ (z >> 31U);
3427}
3428
// Seeded as described in romu paper (update april 2020)
// NOTE: relies on member initialization order (mX declared before mY), so the
// two splitMix64 calls see successive states.
Rng::Rng(uint64_t seed) noexcept
    : mX(splitMix64(seed))
    , mY(splitMix64(seed)) {
    // discard the first outputs to decorrelate from the seed
    for (size_t i = 0; i < 10; ++i) {
        operator()();
    }
}
3437
// only internally used to copy the RNG.
Rng::Rng(uint64_t x, uint64_t y) noexcept
    : mX(x)
    , mY(y) {}

// Explicit copy (the copy constructor is intentionally unavailable).
Rng Rng::copy() const noexcept {
    return Rng{mX, mY};
}
3446
// Restores an RNG from the two-word state produced by state(); throws when the
// vector doesn't contain exactly two entries.
Rng::Rng(std::vector<uint64_t> const& data)
    : mX(0)
    , mY(0) {
    if (data.size() != 2) {
        throw std::runtime_error("ankerl::nanobench::Rng::Rng: needed exactly 2 entries in data, but got " +
                                 detail::fmt::to_s(data.size()));
    }
    mX = data[0];
    mY = data[1];
}
3457
3458std::vector<uint64_t> Rng::state() const {
3459 std::vector<uint64_t> data(2);
3460 data[0] = mX;
3461 data[1] = mY;
3462 return data;
3463}
3464
// Gathers (complexityN, median elapsed) pairs from all results that were run
// with a positive complexityN; others carry no complexity information.
BigO::RangeMeasure BigO::collectRangeMeasure(std::vector<Result> const& results) {
    BigO::RangeMeasure rangeMeasure;
    for (auto const& result : results) {
        if (result.config().mComplexityN > 0.0) {
            rangeMeasure.emplace_back(result.config().mComplexityN, result.median(Result::Measure::elapsed));
        }
    }
    return rangeMeasure;
}
3474
// Least-squares fit of measure = constant * range through the origin, plus the
// normalized root-mean-square error of that fit.
// NOTE(review): an empty rangeMeasure yields 0/0 = NaN for both values.
BigO::BigO(std::string bigOName, RangeMeasure const& rangeMeasure)
    : mName(std::move(bigOName)) {

    // estimate the constant factor
    double sumRangeMeasure = 0.0;
    double sumRangeRange = 0.0;

    for (const auto& rm : rangeMeasure) {
        sumRangeMeasure += rm.first * rm.second;
        sumRangeRange += rm.first * rm.first;
    }
    // closed-form least squares solution for a line through the origin
    mConstant = sumRangeMeasure / sumRangeRange;

    // calculate root mean square
    double err = 0.0;
    double sumMeasure = 0.0;
    for (const auto& rm : rangeMeasure) {
        auto diff = mConstant * rm.first - rm.second;
        err += diff * diff;

        sumMeasure += rm.second;
    }

    // normalize the RMS error by the mean measurement so fits are comparable
    auto n = detail::d(rangeMeasure.size());
    auto mean = sumMeasure / n;
    mNormalizedRootMeanSquare = std::sqrt(err / n) / mean;
}
3502
BigO::BigO(const char* bigOName, RangeMeasure const& rangeMeasure)
    : BigO(std::string(bigOName), rangeMeasure) {}

// Display name of the complexity curve, e.g. "O(n log n)".
std::string const& BigO::name() const noexcept {
    return mName;
}

// Fitted constant factor of measure = constant * rangeToN(range).
double BigO::constant() const noexcept {
    return mConstant;
}

// RMS error of the fit, normalized by the mean measurement (lower is better).
double BigO::normalizedRootMeanSquare() const noexcept {
    return mNormalizedRootMeanSquare;
}

// Orders by fit quality first, name as tie breaker — best fit sorts first.
bool BigO::operator<(BigO const& other) const noexcept {
    return std::tie(mNormalizedRootMeanSquare, mName) < std::tie(other.mNormalizedRootMeanSquare, other.mName);
}
3521
std::ostream& operator<<(std::ostream& os, BigO const& bigO) {
    return os << bigO.constant() << " * " << bigO.name() << ", rms=" << bigO.normalizedRootMeanSquare();
}

// Renders all fitted curves as a markdown table, best fit first; stream
// formatting state is restored afterwards.
std::ostream& operator<<(std::ostream& os, std::vector<ankerl::nanobench::BigO> const& bigOs) {
    detail::fmt::StreamStateRestorer const restorer(os);
    os << std::endl << "| coefficient | err% | complexity" << std::endl << "|--------------:|-------:|------------" << std::endl;
    for (auto const& bigO : bigOs) {
        os << "|" << std::setw(14) << std::setprecision(7) << std::scientific << bigO.constant() << " ";
        os << "|" << detail::fmt::Number(6, 1, bigO.normalizedRootMeanSquare() * 100.0) << "% ";
        os << "| " << bigO.name();
        os << std::endl;
    }
    return os;
}
3537
3538} // namespace nanobench
3539} // namespace ankerl
3540
3541#endif // ANKERL_NANOBENCH_IMPLEMENT
3542#endif // ANKERL_NANOBENCH_H_INCLUDED
int ret
int flags
Definition: bitcoin-tx.cpp:530
Main entry point to nanobench's benchmarking facility.
Definition: nanobench.h:633
Bench & operator=(Bench const &other)
ANKERL_NANOBENCH(NODISCARD) Bench & doNotOptimizeAway(Arg &&arg)
Retrieves all benchmark results collected by the bench object so far.
Bench & run(char const *benchmarkName, Op &&op)
Repeatedly calls op() based on the configuration, and performs measurements.
Definition: nanobench.h:1292
Bench & batch(T b) noexcept
Sets the batch size.
Definition: nanobench.h:1316
std::vector< BigO > complexityBigO() const
Bench()
Creates a new benchmark for configuration and running of benchmarks.
Bench & operator=(Bench &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
detail::SetupRunner< SetupOp > setup(SetupOp setupOp)
Configure an untimed setup step per epoch (forces single-iteration epochs).
Definition: nanobench.h:1286
Bench(Bench &&other) noexcept
Bench(Bench const &other)
Bench & complexityN(T n) noexcept
Definition: nanobench.h:1323
static RangeMeasure mapRangeMeasure(RangeMeasure data, Op op)
Definition: nanobench.h:1136
BigO(std::string bigOName, RangeMeasure const &scaledRangeMeasure)
std::vector< std::pair< double, double > > RangeMeasure
Definition: nanobench.h:1133
BigO(char const *bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1146
static RangeMeasure collectRangeMeasure(std::vector< Result > const &results)
BigO(std::string bigOName, RangeMeasure const &rangeMeasure, Op rangeToN)
Definition: nanobench.h:1150
BigO(char const *bigOName, RangeMeasure const &scaledRangeMeasure)
Result(Config benchmarkConfig)
static Measure fromString(std::string const &str)
void add(Clock::duration totalElapsed, uint64_t iters, detail::PerformanceCounters const &pc)
Result(Result &&other) noexcept
ANKERL_NANOBENCH(NODISCARD) Config const &config() const noexcept
Result & operator=(Result const &other)
Result(Result const &other)
Result & operator=(Result &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
An extremely fast random generator.
Definition: nanobench.h:494
static constexpr uint64_t() min()
Rng(Rng const &)=delete
As a safety precaution, we don't allow copying.
void shuffle(Container &container) noexcept
Shuffles all entries in the given container.
Definition: nanobench.h:1211
Rng(Rng &&) noexcept=default
Rng & operator=(Rng const &)=delete
Same as Rng(Rng const&), we don't allow assignment.
static constexpr uint64_t() max()
double uniform01() noexcept
Provides a random uniform double value between 0 and 1.
Definition: nanobench.h:1201
uint64_t result_type
This RNG provides 64bit randomness.
Definition: nanobench.h:499
void moveResultTo(std::vector< Result > &results) noexcept
void add(std::chrono::nanoseconds elapsed, PerformanceCounters const &pc) noexcept
IterationLogic(IterationLogic &&)=delete
IterationLogic & operator=(IterationLogic const &)=delete
ANKERL_NANOBENCH(NODISCARD) uint64_t numIters() const noexcept
IterationLogic(IterationLogic const &)=delete
IterationLogic & operator=(IterationLogic &&)=delete
PerformanceCounters(PerformanceCounters const &)=delete
PerformanceCounters & operator=(PerformanceCounters const &)=delete
PerformanceCounters & operator=(PerformanceCounters &&)=delete
ANKERL_NANOBENCH(NODISCARD) PerfCountSet< uint64_t > const &val() const noexcept
PerformanceCounters(PerformanceCounters &&)=delete
SetupRunner(SetupOp setupOp, Bench &bench)
Definition: nanobench.h:1235
volatile double sum
Definition: examples.cpp:10
#define T(expected, seed, data)
void doNotOptimizeAway(T &val)
Definition: nanobench.h:1065
PerformanceCounters & performanceCounters()
void doNotOptimizeAway(T const &val)
Definition: nanobench.h:1059
char const * json() noexcept
Template to generate JSON data.
char const * csv() noexcept
CSV data for the benchmark results.
char const * pyperf() noexcept
Output in pyperf compatible JSON format, which can be used for further analysis.
char const * htmlBoxplot() noexcept
HTML output that uses plotly to generate an interactive boxplot chart. See the tutorial for an example.
void render(char const *mustacheTemplate, Bench const &bench, std::ostream &out)
Renders output from a mustache-like template and benchmark results.
std::conditional< std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock >::type Clock
Definition: nanobench.h:134
void render(std::string const &mustacheTemplate, std::vector< Result > const &results, std::ostream &out)
std::ostream & operator<<(std::ostream &os, BigO const &bigO)
std::ostream & operator<<(std::ostream &os, std::vector< ankerl::nanobench::BigO > const &bigOs)
void doNotOptimizeAway(Arg &&arg)
Makes sure none of the given arguments are optimized away by the compiler.
Definition: nanobench.h:1337
Definition: common.h:30
#define ANKERL_NANOBENCH_LOG(x)
Definition: nanobench.h:88
#define ANKERL_NANOBENCH_NO_SANITIZE(...)
Definition: nanobench.h:107
#define ANKERL_NANOBENCH(x)
Definition: nanobench.h:50
bool operator==(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:603
bool operator<(const CNetAddr &a, const CNetAddr &b)
Definition: netaddress.cpp:608
const char * name
Definition: rest.cpp:49
static RPCMethod stop()
Definition: server.cpp:145
Config & operator=(Config const &other)
Config(Config const &other)
Config & operator=(Config &&other) noexcept(ANKERL_NANOBENCH(NOEXCEPT_STRING_MOVE))
Config(Config &&other) noexcept
static SECP256K1_INLINE uint64_t rotl(const uint64_t x, int k)
Definition: testrand_impl.h:40
static int setup(void)
Definition: tests.c:8040
static int count
assert(!tx.IsCoinBase())